From jussi.kivilinna at iki.fi Fri Apr 5 19:25:49 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Fri, 5 Apr 2019 20:25:49 +0300 Subject: [PATCH 2/7] Burn stack in transform functions for SHA1 AMD64 implementations In-Reply-To: <155448514419.14473.7900660968388649981.stgit@localhost.localdomain> References: <155448514419.14473.7900660968388649981.stgit@localhost.localdomain> Message-ID: <155448514938.14473.7912761314620953735.stgit@localhost.localdomain> * cipher/sha1-avx-amd64.S: Burn stack inside transform functions. * cipher/sha1-avx-bmi2-amd64.S: Ditto. * cipher/sha1-avx2-bmi2-amd64.S: Ditto. * cipher/sha1-ssse3-amd64.S: Ditto. -- This change reduces per call overhead for SHA1. Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/cipher/sha1-avx-amd64.S b/cipher/sha1-avx-amd64.S index 143e4066d..5d674c151 100644 --- a/cipher/sha1-avx-amd64.S +++ b/cipher/sha1-avx-amd64.S @@ -380,7 +380,7 @@ _gcry_sha1_transform_amd64_avx: .Lend: vzeroall; - /* Transform 64-79. */ + /* Transform 64-79 + burn stack */ R( b, c, d, e, a, F4, 64 ); R( a, b, c, d, e, F4, 65 ); R( e, a, b, c, d, F4, 66 ); @@ -393,12 +393,15 @@ _gcry_sha1_transform_amd64_avx: R( c, d, e, a, b, F4, 73 ); R( b, c, d, e, a, F4, 74 ); R( a, b, c, d, e, F4, 75 ); - R( e, a, b, c, d, F4, 76 ); - R( d, e, a, b, c, F4, 77 ); - R( c, d, e, a, b, F4, 78 ); + R( e, a, b, c, d, F4, 76 ); vmovdqa %xmm0, (0*16)(%rsp); + R( d, e, a, b, c, F4, 77 ); vmovdqa %xmm0, (1*16)(%rsp); + R( c, d, e, a, b, F4, 78 ); vmovdqa %xmm0, (2*16)(%rsp); addl state_h0(RSTATE), a; R( b, c, d, e, a, F4, 79 ); + /* 16*4/16-1 = 3 */ + vmovdqa %xmm0, (3*16)(%rsp); + /* Update the chaining variables. */ addl state_h3(RSTATE), d; addl state_h2(RSTATE), c; @@ -416,8 +419,8 @@ _gcry_sha1_transform_amd64_avx: popq %rbp; popq %rbx; - /* burn_stack */ - movl $(16*4 + 2*8 + 31), %eax; + /* stack already burned */ + xorl %eax, %eax; .Lret: ret; diff --git a/cipher/sha1-avx-bmi2-amd64.S b/cipher/sha1-avx-bmi2-amd64.S index 79ea24ef9..fe8901eff 100644 --- a/cipher/sha1-avx-bmi2-amd64.S +++ b/cipher/sha1-avx-bmi2-amd64.S @@ -387,7 +387,7 @@ _gcry_sha1_transform_amd64_avx_bmi2: .Lend: vzeroall; - /* Transform 64-79. */ + /* Transform 64-79 + burn stack */ R( b, c, d, e, a, F4, 64 ); R( a, b, c, d, e, F4, 65 ); R( e, a, b, c, d, F4, 66 ); @@ -400,14 +400,17 @@ _gcry_sha1_transform_amd64_avx_bmi2: R( c, d, e, a, b, F4, 73 ); R( b, c, d, e, a, F4, 74 ); R( a, b, c, d, e, F4, 75 ); - R( e, a, b, c, d, F4, 76 ); - R( d, e, a, b, c, F4, 77 ); - R( c, d, e, a, b, F4, 78 ); + R( e, a, b, c, d, F4, 76 ); vmovdqa %xmm0, (0*16)(%rsp); + R( d, e, a, b, c, F4, 77 ); vmovdqa %xmm0, (1*16)(%rsp); + R( c, d, e, a, b, F4, 78 ); vmovdqa %xmm0, (2*16)(%rsp); addl state_h0(RSTATE), a; R( b, c, d, e, a, F4, 79 ); addl ne, a; xorl ne, ne; + /* 16*4/16-1 = 3 */ + vmovdqa %xmm0, (3*16)(%rsp); + /* Update the chaining variables. */ addl state_h3(RSTATE), d; addl state_h2(RSTATE), c; @@ -426,8 +429,8 @@ _gcry_sha1_transform_amd64_avx_bmi2: popq %rbp; popq %rbx; - /* burn_stack */ - movl $(16*4 + 3*8 + 31), %eax; + /* stack already burned */ + xorl %eax, %eax; .Lret: ret; diff --git a/cipher/sha1-avx2-bmi2-amd64.S b/cipher/sha1-avx2-bmi2-amd64.S index c666290f2..2a2f21a56 100644 --- a/cipher/sha1-avx2-bmi2-amd64.S +++ b/cipher/sha1-avx2-bmi2-amd64.S @@ -504,7 +504,7 @@ _gcry_sha1_transform_amd64_avx2_bmi2: .Lend: vzeroall; - /* Transform 48-79 for block 2. 
*/ + /* Transform 48-79 for block 2 + burn stack */ R( c, d, e, a, b, F3, 48, 1 ); R( b, c, d, e, a, F3, 49, 1 ); R( a, b, c, d, e, F3, 50, 1 ); @@ -517,30 +517,33 @@ _gcry_sha1_transform_amd64_avx2_bmi2: R( d, e, a, b, c, F3, 57, 1 ); R( c, d, e, a, b, F3, 58, 1 ); R( b, c, d, e, a, F3, 59, 1 ); - R( a, b, c, d, e, F4, 60, 1 ); - R( e, a, b, c, d, F4, 61, 1 ); - R( d, e, a, b, c, F4, 62, 1 ); - R( c, d, e, a, b, F4, 63, 1 ); - R( b, c, d, e, a, F4, 64, 1 ); - R( a, b, c, d, e, F4, 65, 1 ); - R( e, a, b, c, d, F4, 66, 1 ); - R( d, e, a, b, c, F4, 67, 1 ); - R( c, d, e, a, b, F4, 68, 1 ); - R( b, c, d, e, a, F4, 69, 1 ); - R( a, b, c, d, e, F4, 70, 1 ); - R( e, a, b, c, d, F4, 71, 1 ); - R( d, e, a, b, c, F4, 72, 1 ); - R( c, d, e, a, b, F4, 73, 1 ); - R( b, c, d, e, a, F4, 74, 1 ); - R( a, b, c, d, e, F4, 75, 1 ); - R( e, a, b, c, d, F4, 76, 1 ); - R( d, e, a, b, c, F4, 77, 1 ); - R( c, d, e, a, b, F4, 78, 1 ); + R( a, b, c, d, e, F4, 60, 1 ); vmovdqa %ymm0, (0*32)(%rsp); + R( e, a, b, c, d, F4, 61, 1 ); vmovdqa %ymm0, (1*32)(%rsp); + R( d, e, a, b, c, F4, 62, 1 ); vmovdqa %ymm0, (2*32)(%rsp); + R( c, d, e, a, b, F4, 63, 1 ); vmovdqa %ymm0, (3*32)(%rsp); + R( b, c, d, e, a, F4, 64, 1 ); vmovdqa %ymm0, (4*32)(%rsp); + R( a, b, c, d, e, F4, 65, 1 ); vmovdqa %ymm0, (5*32)(%rsp); + R( e, a, b, c, d, F4, 66, 1 ); vmovdqa %ymm0, (6*32)(%rsp); + R( d, e, a, b, c, F4, 67, 1 ); vmovdqa %ymm0, (7*32)(%rsp); + R( c, d, e, a, b, F4, 68, 1 ); vmovdqa %ymm0, (8*32)(%rsp); + R( b, c, d, e, a, F4, 69, 1 ); vmovdqa %ymm0, (9*32)(%rsp); + R( a, b, c, d, e, F4, 70, 1 ); vmovdqa %ymm0, (10*32)(%rsp); + R( e, a, b, c, d, F4, 71, 1 ); vmovdqa %ymm0, (11*32)(%rsp); + R( d, e, a, b, c, F4, 72, 1 ); vmovdqa %ymm0, (12*32)(%rsp); + R( c, d, e, a, b, F4, 73, 1 ); vmovdqa %ymm0, (13*32)(%rsp); + R( b, c, d, e, a, F4, 74, 1 ); vmovdqa %ymm0, (14*32)(%rsp); + R( a, b, c, d, e, F4, 75, 1 ); vmovdqa %ymm0, (15*32)(%rsp); + R( e, a, b, c, d, F4, 76, 1 ); vmovdqa %ymm0, (16*32)(%rsp); + R( d, e, a, b, c, F4, 77, 1 ); vmovdqa %ymm0, (17*32)(%rsp); + R( c, d, e, a, b, F4, 78, 1 ); vmovdqa %ymm0, (18*32)(%rsp); addl state_h0(RSTATE), a; R( b, c, d, e, a, F4, 79, 1 ); addl ne, a; xorl ne, ne; + /* WK_STACK_WORDS*4/32-1 = 19 */ + vmovdqa %ymm0, (19*32)(%rsp); + /* Update the chaining variables. */ addl state_h3(RSTATE), d; addl state_h2(RSTATE), c; @@ -559,8 +562,8 @@ _gcry_sha1_transform_amd64_avx2_bmi2: popq %rbp; popq %rbx; - /* burn_stack */ - movl $((WK_STACK_WORDS)*4 + 3*8 + 31), %eax; + /* stack already burned */ + xorl %eax, %eax; ret; ELF(.size _gcry_sha1_transform_amd64_avx2_bmi2, diff --git a/cipher/sha1-ssse3-amd64.S b/cipher/sha1-ssse3-amd64.S index 421bebeca..fff140345 100644 --- a/cipher/sha1-ssse3-amd64.S +++ b/cipher/sha1-ssse3-amd64.S @@ -388,7 +388,7 @@ _gcry_sha1_transform_amd64_ssse3: .align 16 .Lend: - /* Transform 64-79 + Clear XMM registers. */ + /* Transform 64-79 + Clear XMM registers + Burn stack. 
*/ R( b, c, d, e, a, F4, 64 ); CLEAR_REG(BSWAP_REG); R( a, b, c, d, e, F4, 65 ); CLEAR_REG(Wtmp0); R( e, a, b, c, d, F4, 66 ); CLEAR_REG(Wtmp1); @@ -401,12 +401,15 @@ _gcry_sha1_transform_amd64_ssse3: R( c, d, e, a, b, F4, 73 ); CLEAR_REG(W6); R( b, c, d, e, a, F4, 74 ); CLEAR_REG(W7); R( a, b, c, d, e, F4, 75 ); - R( e, a, b, c, d, F4, 76 ); - R( d, e, a, b, c, F4, 77 ); - R( c, d, e, a, b, F4, 78 ); + R( e, a, b, c, d, F4, 76 ); movdqa Wtmp0, (0*16)(%rsp); + R( d, e, a, b, c, F4, 77 ); movdqa Wtmp0, (1*16)(%rsp); + R( c, d, e, a, b, F4, 78 ); movdqa Wtmp0, (2*16)(%rsp); addl state_h0(RSTATE), a; R( b, c, d, e, a, F4, 79 ); + /* 16*4/16-1 = 3 */ + vmovdqa Wtmp0, (3*16)(%rsp); + /* Update the chaining variables. */ addl state_h3(RSTATE), d; addl state_h2(RSTATE), c; @@ -424,8 +427,8 @@ _gcry_sha1_transform_amd64_ssse3: popq %rbp; popq %rbx; - /* burn_stack */ - movl $(16*4 + 2*8 + 31), %eax; + /* stack already burned */ + xorl %eax, %eax; .Lret: ret; From jussi.kivilinna at iki.fi Fri Apr 5 19:25:59 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Fri, 5 Apr 2019 20:25:59 +0300 Subject: [PATCH 4/7] tests/basic: add hash test for small block sizes In-Reply-To: <155448514419.14473.7900660968388649981.stgit@localhost.localdomain> References: <155448514419.14473.7900660968388649981.stgit@localhost.localdomain> Message-ID: <155448515972.14473.1366753321398503046.stgit@localhost.localdomain> Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/tests/basic.c b/tests/basic.c index 190b0060b..a28dc6997 100644 --- a/tests/basic.c +++ b/tests/basic.c @@ -7951,7 +7951,7 @@ check_one_md (int algo, const char *data, int len, const char *expect, int elen, gcry_md_hd_t hd, hd2; unsigned char *p; int mdlen; - int i; + int i, j; int xof = 0; gcry_error_t err = 0; @@ -7988,6 +7988,66 @@ check_one_md (int algo, const char *data, int len, const char *expect, int elen, } } + if (*data == '!' && !data[1] && !xof) + { + unsigned char *p1, *p2; + char buf[129]; + + /* Test hashing small input sizes first as full block, then byte-by-byte + * and check that resulting digests are the same. */ + + err = gcry_md_open (&hd2, algo, 0); + if (err) + { + gcry_md_close (hd); + fail ("algo %d, gcry_md_open failed: %s\n", algo, gpg_strerror (err)); + return; + } + + if (key && klen) + { + err = gcry_md_setkey (hd2, key, klen); + if (err) + { + gcry_md_close (hd); + gcry_md_close (hd2); + fail ("algo %d, gcry_md_setkey failed: %s\n", algo, gpg_strerror (err)); + return; + } + } + + for (i = 0; i < sizeof(buf); i++) + buf[i] = i; + + for (i = 1; i < sizeof(buf); i++) + { + gcry_md_reset (hd); + gcry_md_reset (hd2); + + gcry_md_write (hd, buf, i); + for (j = 0; j < i; j++) + gcry_md_write (hd2, &buf[j], 1); + + p1 = gcry_md_read (hd, algo); + p2 = gcry_md_read (hd2, algo); + if (memcmp (p1, p2, mdlen)) + { + printf ("full block (input length %d): ", i); + for (i = 0; i < mdlen; i++) + printf ("%02x ", p1[i] & 0xFF); + printf ("\nbyte-by-byte: "); + for (i = 0; i < mdlen; i++) + printf ("%02x ", p2[i] & 0xFF); + printf ("\n"); + + fail ("algo %d, digest mismatch\n", algo); + } + } + + gcry_md_close (hd2); + gcry_md_reset (hd); + } + if ((*data == '!' && !data[1]) || /* hash one million times a "a" */ (*data == '?' && !data[1])) /* hash million byte data-set with byte pattern 0x00,0x01,0x02,... 
*/ { From jussi.kivilinna at iki.fi Fri Apr 5 19:25:54 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Fri, 5 Apr 2019 20:25:54 +0300 Subject: [PATCH 3/7] Burn stack in transform functions for SHA2 AMD64 implementations In-Reply-To: <155448514419.14473.7900660968388649981.stgit@localhost.localdomain> References: <155448514419.14473.7900660968388649981.stgit@localhost.localdomain> Message-ID: <155448515455.14473.5548704477061402190.stgit@localhost.localdomain> * cipher/sha256-avx-amd64.S: Burn stack inside transform functions. * cipher/sha256-avx2-bmi2-amd64.S: Ditto. * cipher/sha256-ssse3-amd64.S: Ditto. * cipher/sha512-avx-amd64.S: Ditto. * cipher/sha512-avx2-bmi2-amd64.S: Ditto. * cipher/sha512-ssse3-amd64.S: Ditto. -- This change reduces per call overhead for SHA256 & SHA512. Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/cipher/sha256-avx-amd64.S b/cipher/sha256-avx-amd64.S index 6953855bb..b8b01b15b 100644 --- a/cipher/sha256-avx-amd64.S +++ b/cipher/sha256-avx-amd64.S @@ -480,9 +480,12 @@ _gcry_sha256_transform_amd64_avx: cmp INP, [rsp + _INP_END] jne .Loop0 +.Ldone_hash: vzeroall -.Ldone_hash: + vmovdqa [rsp + _XFER], XFER + xor eax, eax + add rsp, STACK_SIZE pop r15 @@ -491,8 +494,6 @@ _gcry_sha256_transform_amd64_avx: pop rbp pop rbx - mov eax, STACK_SIZE + 5*8 - ret diff --git a/cipher/sha256-avx2-bmi2-amd64.S b/cipher/sha256-avx2-bmi2-amd64.S index 85e663fef..598f93821 100644 --- a/cipher/sha256-avx2-bmi2-amd64.S +++ b/cipher/sha256-avx2-bmi2-amd64.S @@ -747,10 +747,29 @@ _gcry_sha256_transform_amd64_avx2: jmp .Ldo_last_block .Ldone_hash: - mov rsp, [rsp + _RSP] - vzeroall + /* burn stack */ + vmovdqa [rsp + _XFER + 0 * 32], ymm0 + vmovdqa [rsp + _XFER + 1 * 32], ymm0 + vmovdqa [rsp + _XFER + 2 * 32], ymm0 + vmovdqa [rsp + _XFER + 3 * 32], ymm0 + vmovdqa [rsp + _XFER + 4 * 32], ymm0 + vmovdqa [rsp + _XFER + 5 * 32], ymm0 + vmovdqa [rsp + _XFER + 6 * 32], ymm0 + vmovdqa [rsp + _XFER + 7 * 32], ymm0 + vmovdqa [rsp + _XFER + 8 * 32], ymm0 + vmovdqa [rsp + _XFER + 9 * 32], ymm0 + vmovdqa [rsp + _XFER + 10 * 32], ymm0 + vmovdqa [rsp + _XFER + 11 * 32], ymm0 + vmovdqa [rsp + _XFER + 12 * 32], ymm0 + vmovdqa [rsp + _XFER + 13 * 32], ymm0 + vmovdqa [rsp + _XFER + 14 * 32], ymm0 + vmovdqa [rsp + _XFER + 15 * 32], ymm0 + xor eax, eax + + mov rsp, [rsp + _RSP] + pop r15 pop r14 pop r13 @@ -758,9 +777,6 @@ _gcry_sha256_transform_amd64_avx2: pop rbp pop rbx - /* stack burn depth */ - mov eax, STACK_SIZE + 6*8 + 31 - ret .align 64 diff --git a/cipher/sha256-ssse3-amd64.S b/cipher/sha256-ssse3-amd64.S index a9213e419..ca5c9fd1d 100644 --- a/cipher/sha256-ssse3-amd64.S +++ b/cipher/sha256-ssse3-amd64.S @@ -503,6 +503,10 @@ _gcry_sha256_transform_amd64_ssse3: pxor xmm12, xmm12 .Ldone_hash: + pxor XFER, XFER + movdqa [rsp + _XFER], XFER + xor eax, eax + add rsp, STACK_SIZE pop r15 @@ -511,8 +515,6 @@ _gcry_sha256_transform_amd64_ssse3: pop rbp pop rbx - mov eax, STACK_SIZE + 5*8 - ret diff --git a/cipher/sha512-avx-amd64.S b/cipher/sha512-avx-amd64.S index 446a8b4e5..534351e44 100644 --- a/cipher/sha512-avx-amd64.S +++ b/cipher/sha512-avx-amd64.S @@ -352,13 +352,19 @@ _gcry_sha512_transform_amd64_avx: mov r14, [rsp + frame_GPRSAVE + 8 * 3] mov r15, [rsp + frame_GPRSAVE + 8 * 4] - /* Restore Stack Pointer */ - add rsp, frame_size - vzeroall - /* Return stack burn depth */ - mov rax, frame_size + /* Burn stack */ + t = 0 + .rept frame_W_size / 32 + vmovups [rsp + frame_W + (t) * 32], ymm0 + t = ((t)+1) + .endr + vmovdqu [rsp + frame_WK], xmm0 + xor eax, eax + + /* 
Restore Stack Pointer */ + add rsp, frame_size .Lnowork: ret diff --git a/cipher/sha512-avx2-bmi2-amd64.S b/cipher/sha512-avx2-bmi2-amd64.S index 05bef64cf..914f920af 100644 --- a/cipher/sha512-avx2-bmi2-amd64.S +++ b/cipher/sha512-avx2-bmi2-amd64.S @@ -714,6 +714,7 @@ _gcry_sha512_transform_amd64_avx2: jne .Loop0 .Ldone_hash: + vzeroall /* Restore GPRs */ mov rbp, [rsp + frame_GPRSAVE + 8 * 0] @@ -723,12 +724,12 @@ _gcry_sha512_transform_amd64_avx2: mov r14, [rsp + frame_GPRSAVE + 8 * 4] mov r15, [rsp + frame_GPRSAVE + 8 * 5] + /* Burn stack */ + vmovdqa [rsp + frame_XFER], XFER + xor eax, eax + /* Restore Stack Pointer */ mov rsp, [rsp + frame_RSPSAVE] - - vzeroall - - mov eax, frame_size + 31 .Lnowork: ret diff --git a/cipher/sha512-ssse3-amd64.S b/cipher/sha512-ssse3-amd64.S index 51193b361..8e950e0e4 100644 --- a/cipher/sha512-ssse3-amd64.S +++ b/cipher/sha512-ssse3-amd64.S @@ -352,9 +352,6 @@ _gcry_sha512_transform_amd64_ssse3: mov r14, [rsp + frame_GPRSAVE + 8 * 3] mov r15, [rsp + frame_GPRSAVE + 8 * 4] - /* Restore Stack Pointer */ - add rsp, frame_size - pxor xmm0, xmm0 pxor xmm1, xmm1 pxor xmm2, xmm2 @@ -362,8 +359,17 @@ _gcry_sha512_transform_amd64_ssse3: pxor xmm4, xmm4 pxor xmm5, xmm5 - /* Return stack burn depth */ - mov rax, frame_size + /* Burn stack */ + t = 0 + .rept frame_W_size / 16 + movdqu [rsp + frame_W + (t) * 16], xmm0 + t = ((t)+1) + .endr + movdqu [rsp + frame_WK], xmm0 + xor eax, eax + + /* Restore Stack Pointer */ + add rsp, frame_size .Lnowork: ret From jussi.kivilinna at iki.fi Fri Apr 5 19:26:05 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Fri, 5 Apr 2019 20:26:05 +0300 Subject: [PATCH 5/7] Optimizations for digest final functions In-Reply-To: <155448514419.14473.7900660968388649981.stgit@localhost.localdomain> References: <155448514419.14473.7900660968388649981.stgit@localhost.localdomain> Message-ID: <155448516489.14473.7380936271683534052.stgit@localhost.localdomain> * cipher/md4.c (md4_final): Avoid byte-by-byte buffer setting when padding; Merge extra and last block processing. * cipher/md5.c (md5_final): Ditto. * cipher/rmd160.c (rmd160_final): Ditto. * cipher/sha1.c (sha1_final): Ditto. * cipher/sha256.c (sha256_final): Ditto. * cipher/sm3.c (sm3_final): Ditto. * cipher/tiger.c (tiger_final): Ditto. * cipher/sha512.c (sha512_final): Avoid byte-by-byte buffer setting when padding. * cipher/stribog.c (stribog_final): Ditto. * cipher/whirlpool.c (whirlpool_final): Ditto. 
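
In rough outline, the merged path now looks like this (a condensed sketch
only; byte order, offsets and the final state read-out differ per algorithm,
see the individual hunks below). The pad bytes are written with a single
memset and, when the 0x80 marker does not fit into the current block, the
buffered block and the padding block go through the transform in one call:

  else /* need one extra block */
    {
      hd->bctx.buf[hd->bctx.count++] = 0x80;   /* pad marker */
      /* zero the rest of this block and the pad area of the next one */
      memset (&hd->bctx.buf[hd->bctx.count], 0, 64 - hd->bctx.count + 56);
      hd->bctx.count = 64 + 56;
      /* 64-bit message length at the very end of the second block */
      buf_put_le32 (hd->bctx.buf + 64 + 56, lsb);
      buf_put_le32 (hd->bctx.buf + 64 + 60, msb);
      burn = transform (hd, hd->bctx.buf, 2);  /* two blocks, one call */
    }

Compared with the old code this avoids the byte-by-byte padding loop and
the separate flush of the buffered block before the padded block.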
-- Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/cipher/md4.c b/cipher/md4.c index 098380801..997dbe0ce 100644 --- a/cipher/md4.c +++ b/cipher/md4.c @@ -234,25 +234,30 @@ md4_final( void *context ) msb <<= 3; msb |= t >> 29; - if( hd->bctx.count < 56 ) /* enough room */ + if (hd->bctx.count < 56) /* enough room */ { hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad */ - while( hd->bctx.count < 56 ) - hd->bctx.buf[hd->bctx.count++] = 0; /* pad */ + if (hd->bctx.count < 56) + memset (&hd->bctx.buf[hd->bctx.count], 0, 56 - hd->bctx.count); + hd->bctx.count = 56; + + /* append the 64 bit count */ + buf_put_le32(hd->bctx.buf + 56, lsb); + buf_put_le32(hd->bctx.buf + 60, msb); + burn = transform (hd, hd->bctx.buf, 1); } else /* need one extra block */ { hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad character */ - while( hd->bctx.count < 64 ) - hd->bctx.buf[hd->bctx.count++] = 0; - _gcry_md_block_write(hd, NULL, 0); /* flush */; - memset(hd->bctx.buf, 0, 56 ); /* fill next block with zeroes */ + /* fill pad and next block with zeroes */ + memset (&hd->bctx.buf[hd->bctx.count], 0, 64 - hd->bctx.count + 56); + hd->bctx.count = 64 + 56; + + /* append the 64 bit count */ + buf_put_le32(hd->bctx.buf + 56, lsb); + buf_put_le32(hd->bctx.buf + 60, msb); + burn = transform (hd, hd->bctx.buf, 2); } - /* append the 64 bit count */ - buf_put_le32(hd->bctx.buf + 56, lsb); - buf_put_le32(hd->bctx.buf + 60, msb); - burn = transform ( hd, hd->bctx.buf, 1 ); - _gcry_burn_stack (burn); p = hd->bctx.buf; #define X(a) do { buf_put_le32(p, hd->a); p += 4; } while(0) @@ -262,6 +267,7 @@ md4_final( void *context ) X(D); #undef X + _gcry_burn_stack (burn); } static byte * diff --git a/cipher/md5.c b/cipher/md5.c index e35a500c4..c432502ff 100644 --- a/cipher/md5.c +++ b/cipher/md5.c @@ -258,25 +258,30 @@ md5_final( void *context) msb <<= 3; msb |= t >> 29; - if( hd->bctx.count < 56 ) /* enough room */ + if (hd->bctx.count < 56) /* enough room */ { hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad */ - while( hd->bctx.count < 56 ) - hd->bctx.buf[hd->bctx.count++] = 0; /* pad */ + if (hd->bctx.count < 56) + memset (&hd->bctx.buf[hd->bctx.count], 0, 56 - hd->bctx.count); + hd->bctx.count = 56; + + /* append the 64 bit count */ + buf_put_le32(hd->bctx.buf + 56, lsb); + buf_put_le32(hd->bctx.buf + 60, msb); + burn = transform (hd, hd->bctx.buf, 1); } - else /* need one extra block */ + else /* need one extra block */ { hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad character */ - while( hd->bctx.count < 64 ) - hd->bctx.buf[hd->bctx.count++] = 0; - _gcry_md_block_write(hd, NULL, 0); /* flush */; - memset(hd->bctx.buf, 0, 56 ); /* fill next block with zeroes */ + /* fill pad and next block with zeroes */ + memset (&hd->bctx.buf[hd->bctx.count], 0, 64 - hd->bctx.count + 56); + hd->bctx.count = 64 + 56; + + /* append the 64 bit count */ + buf_put_le32(hd->bctx.buf + 56, lsb); + buf_put_le32(hd->bctx.buf + 60, msb); + burn = transform (hd, hd->bctx.buf, 2); } - /* append the 64 bit count */ - buf_put_le32(hd->bctx.buf + 56, lsb); - buf_put_le32(hd->bctx.buf + 60, msb); - burn = transform ( hd, hd->bctx.buf, 1 ); - _gcry_burn_stack (burn); p = hd->bctx.buf; #define X(a) do { buf_put_le32(p, hd->a); p += 4; } while(0) @@ -286,6 +291,7 @@ md5_final( void *context) X(D); #undef X + _gcry_burn_stack (burn); } static byte * diff --git a/cipher/rmd160.c b/cipher/rmd160.c index 2d2fae916..231640d27 100644 --- a/cipher/rmd160.c +++ b/cipher/rmd160.c @@ -431,25 +431,30 @@ rmd160_final( void *context ) msb <<= 3; msb |= t >> 29; - 
if( hd->bctx.count < 56 ) /* enough room */ + if (hd->bctx.count < 56) /* enough room */ { hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad */ - while( hd->bctx.count < 56 ) - hd->bctx.buf[hd->bctx.count++] = 0; /* pad */ + if (hd->bctx.count < 56) + memset (&hd->bctx.buf[hd->bctx.count], 0, 56 - hd->bctx.count); + hd->bctx.count = 56; + + /* append the 64 bit count */ + buf_put_le32(hd->bctx.buf + 56, lsb); + buf_put_le32(hd->bctx.buf + 60, msb); + burn = transform (hd, hd->bctx.buf, 1); } - else /* need one extra block */ + else /* need one extra block */ { hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad character */ - while( hd->bctx.count < 64 ) - hd->bctx.buf[hd->bctx.count++] = 0; - _gcry_md_block_write(hd, NULL, 0); /* flush */; - memset(hd->bctx.buf, 0, 56 ); /* fill next block with zeroes */ + /* fill pad and next block with zeroes */ + memset (&hd->bctx.buf[hd->bctx.count], 0, 64 - hd->bctx.count + 56); + hd->bctx.count = 64 + 56; + + /* append the 64 bit count */ + buf_put_le32(hd->bctx.buf + 56, lsb); + buf_put_le32(hd->bctx.buf + 60, msb); + burn = transform (hd, hd->bctx.buf, 2); } - /* append the 64 bit count */ - buf_put_le32(hd->bctx.buf + 56, lsb); - buf_put_le32(hd->bctx.buf + 60, msb); - burn = transform ( hd, hd->bctx.buf, 1 ); - _gcry_burn_stack (burn); p = hd->bctx.buf; #define X(a) do { buf_put_le32(p, hd->h##a); p += 4; } while(0) @@ -459,6 +464,8 @@ rmd160_final( void *context ) X(3); X(4); #undef X + + _gcry_burn_stack (burn); } static byte * diff --git a/cipher/sha256.c b/cipher/sha256.c index e82a9d902..327e1029f 100644 --- a/cipher/sha256.c +++ b/cipher/sha256.c @@ -498,25 +498,30 @@ sha256_final(void *context) msb <<= 3; msb |= t >> 29; - if (hd->bctx.count < 56) - { /* enough room */ + if (hd->bctx.count < 56) /* enough room */ + { hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad */ - while (hd->bctx.count < 56) - hd->bctx.buf[hd->bctx.count++] = 0; /* pad */ + if (hd->bctx.count < 56) + memset (&hd->bctx.buf[hd->bctx.count], 0, 56 - hd->bctx.count); + hd->bctx.count = 56; + + /* append the 64 bit count */ + buf_put_be32(hd->bctx.buf + 56, msb); + buf_put_be32(hd->bctx.buf + 60, lsb); + burn = (*hd->bctx.bwrite) (hd, hd->bctx.buf, 1); } - else - { /* need one extra block */ + else /* need one extra block */ + { hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad character */ - while (hd->bctx.count < 64) - hd->bctx.buf[hd->bctx.count++] = 0; - _gcry_md_block_write (hd, NULL, 0); /* flush */; - memset (hd->bctx.buf, 0, 56 ); /* fill next block with zeroes */ + /* fill pad and next block with zeroes */ + memset (&hd->bctx.buf[hd->bctx.count], 0, 64 - hd->bctx.count + 56); + hd->bctx.count = 64 + 56; + + /* append the 64 bit count */ + buf_put_be32(hd->bctx.buf + 64 + 56, msb); + buf_put_be32(hd->bctx.buf + 64 + 60, lsb); + burn = (*hd->bctx.bwrite) (hd, hd->bctx.buf, 2); } - /* append the 64 bit count */ - buf_put_be32(hd->bctx.buf + 56, msb); - buf_put_be32(hd->bctx.buf + 60, lsb); - burn = (*hd->bctx.bwrite) (hd, hd->bctx.buf, 1); - _gcry_burn_stack (burn); p = hd->bctx.buf; #define X(a) do { buf_put_be32(p, hd->h##a); p += 4; } while(0) @@ -529,6 +534,8 @@ sha256_final(void *context) X(6); X(7); #undef X + + _gcry_burn_stack (burn); } static byte * diff --git a/cipher/sha512.c b/cipher/sha512.c index 721f34054..615b55357 100644 --- a/cipher/sha512.c +++ b/cipher/sha512.c @@ -659,14 +659,16 @@ sha512_final (void *context) if (hd->bctx.count < 112) { /* enough room */ hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad */ - while (hd->bctx.count < 112) - 
hd->bctx.buf[hd->bctx.count++] = 0; /* pad */ + if (hd->bctx.count < 112) + memset (&hd->bctx.buf[hd->bctx.count], 0, 112 - hd->bctx.count); + hd->bctx.count = 112; } else { /* need one extra block */ hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad character */ - while (hd->bctx.count < 128) - hd->bctx.buf[hd->bctx.count++] = 0; + if (hd->bctx.count < 128) + memset (&hd->bctx.buf[hd->bctx.count], 0, 128 - hd->bctx.count); + hd->bctx.count = 128; _gcry_md_block_write (context, NULL, 0); /* flush */ ; memset (hd->bctx.buf, 0, 112); /* fill next block with zeroes */ } diff --git a/cipher/sm3.c b/cipher/sm3.c index c6f1a091d..7bfb37b95 100644 --- a/cipher/sm3.c +++ b/cipher/sm3.c @@ -291,25 +291,30 @@ sm3_final(void *context) msb <<= 3; msb |= t >> 29; - if (hd->bctx.count < 56) - { /* enough room */ + if (hd->bctx.count < 56) /* enough room */ + { hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad */ - while (hd->bctx.count < 56) - hd->bctx.buf[hd->bctx.count++] = 0; /* pad */ + if (hd->bctx.count < 56) + memset (&hd->bctx.buf[hd->bctx.count], 0, 56 - hd->bctx.count); + hd->bctx.count = 56; + + /* append the 64 bit count */ + buf_put_be32(hd->bctx.buf + 56, msb); + buf_put_be32(hd->bctx.buf + 60, lsb); + burn = (*hd->bctx.bwrite) ( hd, hd->bctx.buf, 1 ); } - else - { /* need one extra block */ + else /* need one extra block */ + { hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad character */ - while (hd->bctx.count < 64) - hd->bctx.buf[hd->bctx.count++] = 0; - _gcry_md_block_write (hd, NULL, 0); /* flush */; - memset (hd->bctx.buf, 0, 56 ); /* fill next block with zeroes */ + /* fill pad and next block with zeroes */ + memset (&hd->bctx.buf[hd->bctx.count], 0, 64 - hd->bctx.count + 56); + hd->bctx.count = 64 + 56; + + /* append the 64 bit count */ + buf_put_be32(hd->bctx.buf + 64 + 56, msb); + buf_put_be32(hd->bctx.buf + 64 + 60, lsb); + burn = (*hd->bctx.bwrite) ( hd, hd->bctx.buf, 2 ); } - /* append the 64 bit count */ - buf_put_be32(hd->bctx.buf + 56, msb); - buf_put_be32(hd->bctx.buf + 60, lsb); - burn = transform (hd, hd->bctx.buf, 1); - _gcry_burn_stack (burn); p = hd->bctx.buf; #define X(a) do { buf_put_be32(p, hd->h##a); p += 4; } while(0) @@ -322,6 +327,8 @@ sm3_final(void *context) X(6); X(7); #undef X + + _gcry_burn_stack (burn); } static byte * diff --git a/cipher/stribog.c b/cipher/stribog.c index 459e4db99..d31dddd37 100644 --- a/cipher/stribog.c +++ b/cipher/stribog.c @@ -1292,8 +1292,9 @@ stribog_final (void *context) i = hd->bctx.count; /* After flush we have at least one byte free) */ hd->bctx.buf[i++] = 1; - while (i < 64) - hd->bctx.buf[i++] = 0; + if (i < 64) + memset (&hd->bctx.buf[i], 0, 64 - i); + i = 64; transform_bits (hd, hd->bctx.buf, hd->bctx.count * 8); g (hd->h, hd->N, Z); diff --git a/cipher/tiger.c b/cipher/tiger.c index d24d1603b..0319b7115 100644 --- a/cipher/tiger.c +++ b/cipher/tiger.c @@ -760,22 +760,26 @@ tiger_final( void *context ) if( hd->bctx.count < 56 ) /* enough room */ { hd->bctx.buf[hd->bctx.count++] = pad; - while( hd->bctx.count < 56 ) - hd->bctx.buf[hd->bctx.count++] = 0; /* pad */ + if (hd->bctx.count < 56) + memset (&hd->bctx.buf[hd->bctx.count], 0, 56 - hd->bctx.count); + hd->bctx.count = 56; + /* append the 64 bit count */ + buf_put_le32(hd->bctx.buf + 56, lsb); + buf_put_le32(hd->bctx.buf + 60, msb); + burn = transform( hd, hd->bctx.buf, 1 ); } else /* need one extra block */ { hd->bctx.buf[hd->bctx.count++] = pad; /* pad character */ - while( hd->bctx.count < 64 ) - hd->bctx.buf[hd->bctx.count++] = 0; - _gcry_md_block_write(hd, NULL, 0); /* flush 
*/; - memset(hd->bctx.buf, 0, 56 ); /* fill next block with zeroes */ + /* fill pad and next block with zeroes */ + memset (&hd->bctx.buf[hd->bctx.count], 0, 64 - hd->bctx.count + 56); + hd->bctx.count = 64 + 56; + + /* append the 64 bit count */ + buf_put_le32(hd->bctx.buf + 64 + 56, lsb); + buf_put_le32(hd->bctx.buf + 64 + 60, msb); + burn = transform( hd, hd->bctx.buf, 2 ); } - /* append the 64 bit count */ - buf_put_le32(hd->bctx.buf + 56, lsb); - buf_put_le32(hd->bctx.buf + 60, msb); - burn = transform( hd, hd->bctx.buf, 1 ); - _gcry_burn_stack (burn); p = hd->bctx.buf; #define X(a) do { buf_put_be64(p, hd->a); p += 8; } while(0) @@ -794,6 +798,8 @@ tiger_final( void *context ) } #undef X #undef Y + + _gcry_burn_stack (burn); } static byte * diff --git a/cipher/whirlpool.c b/cipher/whirlpool.c index d52375ada..d9b79cf1a 100644 --- a/cipher/whirlpool.c +++ b/cipher/whirlpool.c @@ -1494,12 +1494,16 @@ whirlpool_final (void *ctx) if (context->bctx.count > 32) { /* An extra block is necessary. */ - while (context->bctx.count < 64) - context->bctx.buf[context->bctx.count++] = 0; + if (context->bctx.count < 64) + memset (&context->bctx.buf[context->bctx.count], 0, + 64 - context->bctx.count); + context->bctx.count = 64; whirlpool_write (context, NULL, 0); } - while (context->bctx.count < 32) - context->bctx.buf[context->bctx.count++] = 0; + if (context->bctx.count < 32) + memset (&context->bctx.buf[context->bctx.count], 0, + 32 - context->bctx.count); + context->bctx.count = 32; /* Add length of message. */ length = context->bctx.buf + context->bctx.count; From jussi.kivilinna at iki.fi Fri Apr 5 19:26:10 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Fri, 5 Apr 2019 20:26:10 +0300 Subject: [PATCH 6/7] Remove extra buffer flush at begining of digest final functions In-Reply-To: <155448514419.14473.7900660968388649981.stgit@localhost.localdomain> References: <155448514419.14473.7900660968388649981.stgit@localhost.localdomain> Message-ID: <155448517006.14473.15729038755403086657.stgit@localhost.localdomain> * cipher/md2.c (md2_final): Remove _gcry_md_block_write flush call from entry. * cipher/md4.c (md4_final): Ditto. * cipher/md5.c (md5_final): Ditto. * cipher/rmd160.c (rmd160_final): Ditto. * cipher/sha1.c (sha1_final): Ditto. * cipher/sha256.c (sha256_final): Ditto. * cipher/sha512.c (sha512_final): Ditto. * cipher/sm3.c (sm3_final): Ditto. * cipher/stribog.c (stribog_final): Ditto. * cipher/tiger.c (tiger_final): Ditto. 
-- Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/cipher/md2.c b/cipher/md2.c index b6f7e94f4..bf2fbee4c 100644 --- a/cipher/md2.c +++ b/cipher/md2.c @@ -146,8 +146,6 @@ md2_final (void *context) MD2_CONTEXT *hd = context; unsigned int burn; - _gcry_md_block_write(hd, NULL, 0); /* flush */; - /* pad */ memset (hd->bctx.buf + hd->bctx.count, 16 - hd->bctx.count, 16 - hd->bctx.count); diff --git a/cipher/md4.c b/cipher/md4.c index 997dbe0ce..f6258893e 100644 --- a/cipher/md4.c +++ b/cipher/md4.c @@ -213,8 +213,6 @@ md4_final( void *context ) byte *p; unsigned int burn; - _gcry_md_block_write(hd, NULL, 0); /* flush */; - t = hd->bctx.nblocks; if (sizeof t == sizeof hd->bctx.nblocks) th = hd->bctx.nblocks_high; diff --git a/cipher/md5.c b/cipher/md5.c index c432502ff..67511ba01 100644 --- a/cipher/md5.c +++ b/cipher/md5.c @@ -237,8 +237,6 @@ md5_final( void *context) byte *p; unsigned int burn; - _gcry_md_block_write(hd, NULL, 0); /* flush */; - t = hd->bctx.nblocks; if (sizeof t == sizeof hd->bctx.nblocks) th = hd->bctx.nblocks_high; diff --git a/cipher/rmd160.c b/cipher/rmd160.c index 231640d27..f15eec225 100644 --- a/cipher/rmd160.c +++ b/cipher/rmd160.c @@ -410,8 +410,6 @@ rmd160_final( void *context ) byte *p; unsigned int burn; - _gcry_md_block_write(hd, NULL, 0); /* flush */; - t = hd->bctx.nblocks; if (sizeof t == sizeof hd->bctx.nblocks) th = hd->bctx.nblocks_high; diff --git a/cipher/sha1.c b/cipher/sha1.c index affabfb07..23aceef32 100644 --- a/cipher/sha1.c +++ b/cipher/sha1.c @@ -511,8 +511,6 @@ sha1_final(void *context) unsigned char *p; unsigned int burn; - _gcry_md_block_write (hd, NULL, 0); /* flush */; - t = hd->bctx.nblocks; if (sizeof t == sizeof hd->bctx.nblocks) th = hd->bctx.nblocks_high; @@ -532,7 +530,7 @@ sha1_final(void *context) msb <<= 3; msb |= t >> 29; - if( hd->bctx.count < 56 ) /* enough room */ + if (hd->bctx.count < 56) /* enough room */ { hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad */ if (hd->bctx.count < 56) diff --git a/cipher/sha256.c b/cipher/sha256.c index 327e1029f..6c6833482 100644 --- a/cipher/sha256.c +++ b/cipher/sha256.c @@ -477,8 +477,6 @@ sha256_final(void *context) byte *p; unsigned int burn; - _gcry_md_block_write (hd, NULL, 0); /* flush */; - t = hd->bctx.nblocks; if (sizeof t == sizeof hd->bctx.nblocks) th = hd->bctx.nblocks_high; diff --git a/cipher/sha512.c b/cipher/sha512.c index 615b55357..59e65f07a 100644 --- a/cipher/sha512.c +++ b/cipher/sha512.c @@ -635,8 +635,6 @@ sha512_final (void *context) u64 t, th, msb, lsb; byte *p; - _gcry_md_block_write (context, NULL, 0); /* flush */ ; - t = hd->bctx.nblocks; /* if (sizeof t == sizeof hd->bctx.nblocks) */ th = hd->bctx.nblocks_high; diff --git a/cipher/sm3.c b/cipher/sm3.c index 7bfb37b95..e76f32297 100644 --- a/cipher/sm3.c +++ b/cipher/sm3.c @@ -270,8 +270,6 @@ sm3_final(void *context) byte *p; unsigned int burn; - _gcry_md_block_write (hd, NULL, 0); /* flush */; - t = hd->bctx.nblocks; if (sizeof t == sizeof hd->bctx.nblocks) th = hd->bctx.nblocks_high; diff --git a/cipher/stribog.c b/cipher/stribog.c index d31dddd37..3eb077356 100644 --- a/cipher/stribog.c +++ b/cipher/stribog.c @@ -1287,7 +1287,6 @@ stribog_final (void *context) u64 Z[8] = {}; int i; - _gcry_md_block_write (context, NULL, 0); /* flush */ ; /* PAD. 
It does not count towards message length */ i = hd->bctx.count; /* After flush we have at least one byte free) */ diff --git a/cipher/tiger.c b/cipher/tiger.c index 0319b7115..c78e3ac35 100644 --- a/cipher/tiger.c +++ b/cipher/tiger.c @@ -736,8 +736,6 @@ tiger_final( void *context ) unsigned int burn; byte pad = hd->variant == 2? 0x80 : 0x01; - _gcry_md_block_write(hd, NULL, 0); /* flush */; - t = hd->bctx.nblocks; if (sizeof t == sizeof hd->bctx.nblocks) th = hd->bctx.nblocks_high; From jussi.kivilinna at iki.fi Fri Apr 5 19:26:15 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Fri, 5 Apr 2019 20:26:15 +0300 Subject: [PATCH 7/7] Add SHA512/224 and SHA512/256 algorithms In-Reply-To: <155448514419.14473.7900660968388649981.stgit@localhost.localdomain> References: <155448514419.14473.7900660968388649981.stgit@localhost.localdomain> Message-ID: <155448517522.14473.14391969797660348419.stgit@localhost.localdomain> * cipher/mac-hmac.c (map_mac_algo_to_md): Add mapping for SHA512/224 and SHA512/256. (_gcry_mac_type_spec_hmac_sha512_256) (_gcry_mac_type_spec_hmac_sha512_224): New. * cipher/mac-internal.h (_gcry_mac_type_spec_hmac_sha512_256) (_gcry_mac_type_spec_hmac_sha512_224): New. * cipher/mac.c (mac_list, mac_list_algo101): Add SHA512/224 and SHA512/256. * cipher/md.c (digest_list, digest_list_algo301) (prepare_macpads): Ditto. * cipher/sha512.c (run_selftests): Ditto. (sha512_init_common): Move common initialization here. (sha512_init, sha384_init): Use common initialization function. (sha512_224_init, sha512_256_init, _gcry_sha512_224_hash_buffer) (_gcry_sha512_224_hash_buffers, _gcry_sha512_256_hash_buffer) (_gcry_sha512_256_hash_buffers, selftests_sha512_224) (selftests_sha512_256, sha512_224_asn, oid_spec_sha512_224) (_gcry_digest_spec_sha512_224, sha512_256_asn, oid_spec_sha512_256) (_gcry_digest_spec_sha512_256): New. * doc/gcrypt.texi: Add SHA512/224 and SHA512/256; Add missing HMAC-BLAKE2s and HMAC-BLAKE2b. * src/cipher.h (_gcry_digest_spec_sha512_224) (_gcry_digest_spec_sha512_256): New. * src/gcrypt.h.in (GCRY_MD_SHA512_256, GCRY_MD_SHA512_224): New. (GCRY_MAC_HMAC_SHA512_256, GCRY_MAC_HMAC_SHA512_224): New. * tests/basic.c (check_digests): Add SHA512/224 and SHA512/256 test vectors. -- This change adds truncated SHA512/224 and SHA512/256 algorithms specified in FIPS 180-4. 
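
For illustration, the new algorithms are then used through the regular
digest and MAC interfaces like any other hash (a minimal usage sketch;
libgcrypt initialization and error handling omitted):

  #include <gcrypt.h>

  unsigned char digest[32];  /* SHA-512/256 yields 32 bytes, SHA-512/224 yields 28 */

  /* One-shot hashing via the new algorithm identifier. */
  gcry_md_hash_buffer (GCRY_MD_SHA512_256, digest, "abc", 3);

  /* The corresponding HMACs are available through the gcry_mac_*
   * interface as GCRY_MAC_HMAC_SHA512_224 and GCRY_MAC_HMAC_SHA512_256. */
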
Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/cipher/mac-hmac.c b/cipher/mac-hmac.c index 86281acdf..e488d03aa 100644 --- a/cipher/mac-hmac.c +++ b/cipher/mac-hmac.c @@ -51,6 +51,10 @@ map_mac_algo_to_md (int mac_algo) return GCRY_MD_SHA384; case GCRY_MAC_HMAC_SHA512: return GCRY_MD_SHA512; + case GCRY_MAC_HMAC_SHA512_256: + return GCRY_MD_SHA512_256; + case GCRY_MAC_HMAC_SHA512_224: + return GCRY_MD_SHA512_224; case GCRY_MAC_HMAC_SHA3_224: return GCRY_MD_SHA3_224; case GCRY_MAC_HMAC_SHA3_256: @@ -260,6 +264,17 @@ gcry_mac_spec_t _gcry_mac_type_spec_hmac_sha384 = { GCRY_MAC_HMAC_SHA384, {0, 1}, "HMAC_SHA384", &hmac_ops }; + +gcry_mac_spec_t _gcry_mac_type_spec_hmac_sha512_256 = { + GCRY_MAC_HMAC_SHA512_256, {0, 1}, "HMAC_SHA512_256", + &hmac_ops +}; + +gcry_mac_spec_t _gcry_mac_type_spec_hmac_sha512_224 = { + GCRY_MAC_HMAC_SHA512_224, {0, 1}, "HMAC_SHA512_224", + &hmac_ops +}; + #endif #if USE_SHA3 gcry_mac_spec_t _gcry_mac_type_spec_hmac_sha3_224 = { diff --git a/cipher/mac-internal.h b/cipher/mac-internal.h index eb5467380..03f5b8da8 100644 --- a/cipher/mac-internal.h +++ b/cipher/mac-internal.h @@ -133,6 +133,8 @@ extern gcry_mac_spec_t _gcry_mac_type_spec_hmac_sha224; #if USE_SHA512 extern gcry_mac_spec_t _gcry_mac_type_spec_hmac_sha512; extern gcry_mac_spec_t _gcry_mac_type_spec_hmac_sha384; +extern gcry_mac_spec_t _gcry_mac_type_spec_hmac_sha512_224; +extern gcry_mac_spec_t _gcry_mac_type_spec_hmac_sha512_256; #endif #if USE_SHA3 extern gcry_mac_spec_t _gcry_mac_type_spec_hmac_sha3_224; diff --git a/cipher/mac.c b/cipher/mac.c index 1b79bf315..0bbac3e41 100644 --- a/cipher/mac.c +++ b/cipher/mac.c @@ -40,6 +40,8 @@ static gcry_mac_spec_t * const mac_list[] = { #if USE_SHA512 &_gcry_mac_type_spec_hmac_sha512, &_gcry_mac_type_spec_hmac_sha384, + &_gcry_mac_type_spec_hmac_sha512_256, + &_gcry_mac_type_spec_hmac_sha512_224, #endif #if USE_SHA3 &_gcry_mac_type_spec_hmac_sha3_224, @@ -230,9 +232,16 @@ static gcry_mac_spec_t * const mac_list_algo101[] = NULL, #endif #if USE_SM3 - &_gcry_mac_type_spec_hmac_sm3 + &_gcry_mac_type_spec_hmac_sm3, #else - NULL + NULL, +#endif +#if USE_SHA512 + &_gcry_mac_type_spec_hmac_sha512_256, + &_gcry_mac_type_spec_hmac_sha512_224, +#else + NULL, + NULL, #endif }; diff --git a/cipher/md.c b/cipher/md.c index 15e19a95f..6ca390ff6 100644 --- a/cipher/md.c +++ b/cipher/md.c @@ -48,6 +48,8 @@ static gcry_md_spec_t * const digest_list[] = #if USE_SHA512 &_gcry_digest_spec_sha512, &_gcry_digest_spec_sha384, + &_gcry_digest_spec_sha512_256, + &_gcry_digest_spec_sha512_224, #endif #if USE_SHA3 &_gcry_digest_spec_sha3_224, @@ -232,9 +234,16 @@ static gcry_md_spec_t * const digest_list_algo301[] = NULL, #endif #if USE_SM3 - &_gcry_digest_spec_sm3 + &_gcry_digest_spec_sm3, #else - NULL + NULL, +#endif +#if USE_SHA512 + &_gcry_digest_spec_sha512_256, + &_gcry_digest_spec_sha512_224, +#else + NULL, + NULL, #endif }; @@ -928,6 +937,8 @@ prepare_macpads (gcry_md_hd_t a, const unsigned char *key, size_t keylen) break; case GCRY_MD_SHA384: case GCRY_MD_SHA512: + case GCRY_MD_SHA512_256: + case GCRY_MD_SHA512_224: case GCRY_MD_BLAKE2B_512: case GCRY_MD_BLAKE2B_384: case GCRY_MD_BLAKE2B_256: diff --git a/cipher/sha512.c b/cipher/sha512.c index 59e65f07a..1a808f884 100644 --- a/cipher/sha512.c +++ b/cipher/sha512.c @@ -254,24 +254,13 @@ do_transform_generic (void *context, const unsigned char *data, size_t nblks); static void -sha512_init (void *context, unsigned int flags) +sha512_init_common (SHA512_CONTEXT *ctx, unsigned int flags) { - SHA512_CONTEXT *ctx = 
context; - SHA512_STATE *hd = &ctx->state; unsigned int features = _gcry_get_hw_features (); (void)flags; (void)k; - hd->h0 = U64_C(0x6a09e667f3bcc908); - hd->h1 = U64_C(0xbb67ae8584caa73b); - hd->h2 = U64_C(0x3c6ef372fe94f82b); - hd->h3 = U64_C(0xa54ff53a5f1d36f1); - hd->h4 = U64_C(0x510e527fade682d1); - hd->h5 = U64_C(0x9b05688c2b3e6c1f); - hd->h6 = U64_C(0x1f83d9abfb41bd6b); - hd->h7 = U64_C(0x5be0cd19137e2179); - ctx->bctx.nblocks = 0; ctx->bctx.nblocks_high = 0; ctx->bctx.count = 0; @@ -300,14 +289,30 @@ sha512_init (void *context, unsigned int flags) (void)features; } + static void -sha384_init (void *context, unsigned int flags) +sha512_init (void *context, unsigned int flags) { SHA512_CONTEXT *ctx = context; SHA512_STATE *hd = &ctx->state; - unsigned int features = _gcry_get_hw_features (); - (void)flags; + hd->h0 = U64_C(0x6a09e667f3bcc908); + hd->h1 = U64_C(0xbb67ae8584caa73b); + hd->h2 = U64_C(0x3c6ef372fe94f82b); + hd->h3 = U64_C(0xa54ff53a5f1d36f1); + hd->h4 = U64_C(0x510e527fade682d1); + hd->h5 = U64_C(0x9b05688c2b3e6c1f); + hd->h6 = U64_C(0x1f83d9abfb41bd6b); + hd->h7 = U64_C(0x5be0cd19137e2179); + + sha512_init_common (ctx, flags); +} + +static void +sha384_init (void *context, unsigned int flags) +{ + SHA512_CONTEXT *ctx = context; + SHA512_STATE *hd = &ctx->state; hd->h0 = U64_C(0xcbbb9d5dc1059ed8); hd->h1 = U64_C(0x629a292a367cd507); @@ -318,35 +323,49 @@ sha384_init (void *context, unsigned int flags) hd->h6 = U64_C(0xdb0c2e0d64f98fa7); hd->h7 = U64_C(0x47b5481dbefa4fa4); - ctx->bctx.nblocks = 0; - ctx->bctx.nblocks_high = 0; - ctx->bctx.count = 0; - ctx->bctx.blocksize = 128; + sha512_init_common (ctx, flags); +} - /* Order of feature checks is important here; last match will be - * selected. Keep slower implementations at the top and faster at - * the bottom. 
*/ - ctx->bctx.bwrite = do_transform_generic; -#ifdef USE_ARM_NEON_ASM - if ((features & HWF_ARM_NEON) != 0) - ctx->bctx.bwrite = do_sha512_transform_armv7_neon; -#endif -#ifdef USE_SSSE3 - if ((features & HWF_INTEL_SSSE3) != 0) - ctx->bctx.bwrite = do_sha512_transform_amd64_ssse3; -#endif -#ifdef USE_AVX - if ((features & HWF_INTEL_AVX) && (features & HWF_INTEL_FAST_SHLD)) - ctx->bctx.bwrite = do_sha512_transform_amd64_avx; -#endif -#ifdef USE_AVX2 - if ((features & HWF_INTEL_AVX2) && (features & HWF_INTEL_BMI2)) - ctx->bctx.bwrite = do_sha512_transform_amd64_avx2; -#endif - (void)features; + +static void +sha512_256_init (void *context, unsigned int flags) +{ + SHA512_CONTEXT *ctx = context; + SHA512_STATE *hd = &ctx->state; + + hd->h0 = U64_C(0x22312194fc2bf72c); + hd->h1 = U64_C(0x9f555fa3c84c64c2); + hd->h2 = U64_C(0x2393b86b6f53b151); + hd->h3 = U64_C(0x963877195940eabd); + hd->h4 = U64_C(0x96283ee2a88effe3); + hd->h5 = U64_C(0xbe5e1e2553863992); + hd->h6 = U64_C(0x2b0199fc2c85b8aa); + hd->h7 = U64_C(0x0eb72ddc81c52ca2); + + sha512_init_common (ctx, flags); } +static void +sha512_224_init (void *context, unsigned int flags) +{ + SHA512_CONTEXT *ctx = context; + SHA512_STATE *hd = &ctx->state; + + hd->h0 = U64_C(0x8c3d37c819544da2); + hd->h1 = U64_C(0x73e1996689dcd4d6); + hd->h2 = U64_C(0x1dfab7ae32ff9c82); + hd->h3 = U64_C(0x679dd514582f9fcf); + hd->h4 = U64_C(0x0f6d2b697bd44da8); + hd->h5 = U64_C(0x77e36f7304c48942); + hd->h6 = U64_C(0x3f9d85a86a1d36c8); + hd->h7 = U64_C(0x1112e6ad91d692a1); + + sha512_init_common (ctx, flags); +} + + + #ifndef USE_ARM_ASM static inline u64 @@ -758,6 +777,68 @@ _gcry_sha384_hash_buffers (void *outbuf, const gcry_buffer_t *iov, int iovcnt) } + +/* Shortcut functions which puts the hash value of the supplied buffer + * into outbuf which must have a size of 32 bytes. */ +static void +_gcry_sha512_256_hash_buffer (void *outbuf, const void *buffer, size_t length) +{ + SHA512_CONTEXT hd; + + sha512_256_init (&hd, 0); + _gcry_md_block_write (&hd, buffer, length); + sha512_final (&hd); + memcpy (outbuf, hd.bctx.buf, 32); +} + + +/* Variant of the above shortcut function using multiple buffers. */ +static void +_gcry_sha512_256_hash_buffers (void *outbuf, const gcry_buffer_t *iov, + int iovcnt) +{ + SHA512_CONTEXT hd; + + sha512_256_init (&hd, 0); + for (;iovcnt > 0; iov++, iovcnt--) + _gcry_md_block_write (&hd, + (const char*)iov[0].data + iov[0].off, iov[0].len); + sha512_final (&hd); + memcpy (outbuf, hd.bctx.buf, 32); +} + + + +/* Shortcut functions which puts the hash value of the supplied buffer + * into outbuf which must have a size of 28 bytes. */ +static void +_gcry_sha512_224_hash_buffer (void *outbuf, const void *buffer, size_t length) +{ + SHA512_CONTEXT hd; + + sha512_224_init (&hd, 0); + _gcry_md_block_write (&hd, buffer, length); + sha512_final (&hd); + memcpy (outbuf, hd.bctx.buf, 28); +} + + +/* Variant of the above shortcut function using multiple buffers. */ +static void +_gcry_sha512_224_hash_buffers (void *outbuf, const gcry_buffer_t *iov, + int iovcnt) +{ + SHA512_CONTEXT hd; + + sha512_224_init (&hd, 0); + for (;iovcnt > 0; iov++, iovcnt--) + _gcry_md_block_write (&hd, + (const char*)iov[0].data + iov[0].off, iov[0].len); + sha512_final (&hd); + memcpy (outbuf, hd.bctx.buf, 28); +} + + /* Self-test section. 
@@ -867,6 +948,102 @@ selftests_sha512 (int extended, selftest_report_func_t report) return GPG_ERR_SELFTEST_FAILED; } +static gpg_err_code_t +selftests_sha512_224 (int extended, selftest_report_func_t report) +{ + const char *what; + const char *errtxt; + + what = "short string"; + errtxt = _gcry_hash_selftest_check_one + (GCRY_MD_SHA512_224, 0, + "abc", 3, + "\x46\x34\x27\x0F\x70\x7B\x6A\x54\xDA\xAE\x75\x30\x46\x08\x42\xE2" + "\x0E\x37\xED\x26\x5C\xEE\xE9\xA4\x3E\x89\x24\xAA", + 28); + if (errtxt) + goto failed; + + if (extended) + { + what = "long string"; + errtxt = _gcry_hash_selftest_check_one + (GCRY_MD_SHA512_224, 0, + "abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghijklmn" + "hijklmnoijklmnopjklmnopqklmnopqrlmnopqrsmnopqrstnopqrstu", 112, + "\x23\xFE\xC5\xBB\x94\xD6\x0B\x23\x30\x81\x92\x64\x0B\x0C\x45\x33" + "\x35\xD6\x64\x73\x4F\xE4\x0E\x72\x68\x67\x4A\xF9", + 28); + if (errtxt) + goto failed; + + what = "one million \"a\""; + errtxt = _gcry_hash_selftest_check_one + (GCRY_MD_SHA512_224, 1, + NULL, 0, + "\x37\xab\x33\x1d\x76\xf0\xd3\x6d\xe4\x22\xbd\x0e\xde\xb2\x2a\x28" + "\xac\xcd\x48\x7b\x7a\x84\x53\xae\x96\x5d\xd2\x87", + 28); + if (errtxt) + goto failed; + } + + return 0; /* Succeeded. */ + + failed: + if (report) + report ("digest", GCRY_MD_SHA512_224, what, errtxt); + return GPG_ERR_SELFTEST_FAILED; +} + +static gpg_err_code_t +selftests_sha512_256 (int extended, selftest_report_func_t report) +{ + const char *what; + const char *errtxt; + + what = "short string"; + errtxt = _gcry_hash_selftest_check_one + (GCRY_MD_SHA512_256, 0, + "abc", 3, + "\x53\x04\x8E\x26\x81\x94\x1E\xF9\x9B\x2E\x29\xB7\x6B\x4C\x7D\xAB" + "\xE4\xC2\xD0\xC6\x34\xFC\x6D\x46\xE0\xE2\xF1\x31\x07\xE7\xAF\x23", + 32); + if (errtxt) + goto failed; + + if (extended) + { + what = "long string"; + errtxt = _gcry_hash_selftest_check_one + (GCRY_MD_SHA512_256, 0, + "abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghijklmn" + "hijklmnoijklmnopjklmnopqklmnopqrlmnopqrsmnopqrstnopqrstu", 112, + "\x39\x28\xE1\x84\xFB\x86\x90\xF8\x40\xDA\x39\x88\x12\x1D\x31\xBE" + "\x65\xCB\x9D\x3E\xF8\x3E\xE6\x14\x6F\xEA\xC8\x61\xE1\x9B\x56\x3A", + 32); + if (errtxt) + goto failed; + + what = "one million \"a\""; + errtxt = _gcry_hash_selftest_check_one + (GCRY_MD_SHA512_256, 1, + NULL, 0, + "\x9a\x59\xa0\x52\x93\x01\x87\xa9\x70\x38\xca\xe6\x92\xf3\x07\x08" + "\xaa\x64\x91\x92\x3e\xf5\x19\x43\x94\xdc\x68\xd5\x6c\x74\xfb\x21", + 32); + if (errtxt) + goto failed; + } + + return 0; /* Succeeded. */ + + failed: + if (report) + report ("digest", GCRY_MD_SHA512_256, what, errtxt); + return GPG_ERR_SELFTEST_FAILED; +} + /* Run a full self-test for ALGO and return 0 on success. 
*/ static gpg_err_code_t @@ -882,6 +1059,12 @@ run_selftests (int algo, int extended, selftest_report_func_t report) case GCRY_MD_SHA512: ec = selftests_sha512 (extended, report); break; + case GCRY_MD_SHA512_224: + ec = selftests_sha512_224 (extended, report); + break; + case GCRY_MD_SHA512_256: + ec = selftests_sha512_256 (extended, report); + break; default: ec = GPG_ERR_DIGEST_ALGO; break; @@ -949,3 +1132,41 @@ gcry_md_spec_t _gcry_digest_spec_sha384 = sizeof (SHA512_CONTEXT), run_selftests }; + +static byte sha512_256_asn[] = { 0x30 }; + +static gcry_md_oid_spec_t oid_spec_sha512_256[] = + { + { "2.16.840.1.101.3.4.2.6" }, + + { NULL }, + }; + +gcry_md_spec_t _gcry_digest_spec_sha512_256 = + { + GCRY_MD_SHA512_256, {0, 1}, + "SHA512_256", sha512_256_asn, DIM (sha512_256_asn), oid_spec_sha512_256, 32, + sha512_256_init, _gcry_md_block_write, sha512_final, sha512_read, NULL, + _gcry_sha512_256_hash_buffer, _gcry_sha512_256_hash_buffers, + sizeof (SHA512_CONTEXT), + run_selftests + }; + +static byte sha512_224_asn[] = { 0x30 }; + +static gcry_md_oid_spec_t oid_spec_sha512_224[] = + { + { "2.16.840.1.101.3.4.2.5" }, + + { NULL }, + }; + +gcry_md_spec_t _gcry_digest_spec_sha512_224 = + { + GCRY_MD_SHA512_224, {0, 1}, + "SHA512_224", sha512_224_asn, DIM (sha512_224_asn), oid_spec_sha512_224, 28, + sha512_224_init, _gcry_md_block_write, sha512_final, sha512_read, NULL, + _gcry_sha512_224_hash_buffer, _gcry_sha512_224_hash_buffers, + sizeof (SHA512_CONTEXT), + run_selftests + }; diff --git a/doc/gcrypt.texi b/doc/gcrypt.texi index 8adf3a355..8b765ba80 100644 --- a/doc/gcrypt.texi +++ b/doc/gcrypt.texi @@ -3141,7 +3141,7 @@ are also supported. @c begin table of hash algorithms @cindex SHA-1 - at cindex SHA-224, SHA-256, SHA-384, SHA-512 + at cindex SHA-224, SHA-256, SHA-384, SHA-512, SHA-512/224, SHA-512/256 @cindex SHA3-224, SHA3-256, SHA3-384, SHA3-512, SHAKE128, SHAKE256 @cindex RIPE-MD-160 @cindex MD2, MD4, MD5 @@ -3214,9 +3214,17 @@ This is the SHA-384 algorithm which yields a message digest of 48 bytes. See FIPS 180-2 for the specification. @item GCRY_MD_SHA512 -This is the SHA-384 algorithm which yields a message digest of 64 bytes. +This is the SHA-512 algorithm which yields a message digest of 64 bytes. See FIPS 180-2 for the specification. + at item GCRY_MD_SHA512_224 +This is the SHA-512/224 algorithm which yields a message digest of 28 bytes. +See FIPS 180-4 for the specification. + + at item GCRY_MD_SHA512_256 +This is the SHA-512/256 algorithm which yields a message digest of 32 bytes. +See FIPS 180-4 for the specification. + @item GCRY_MD_SHA3_224 This is the SHA3-224 algorithm which yields a message digest of 28 bytes. See FIPS 202 for the specification. @@ -3680,6 +3688,7 @@ provided by Libgcrypt. @c begin table of MAC algorithms @cindex HMAC-SHA-1 @cindex HMAC-SHA-224, HMAC-SHA-256, HMAC-SHA-384, HMAC-SHA-512 + at cindex HMAC-SHA-512/224, HMAC-SHA-512/256 @cindex HMAC-SHA3-224, HMAC-SHA3-256, HMAC-SHA3-384, HMAC-SHA3-512 @cindex HMAC-RIPE-MD-160 @cindex HMAC-MD2, HMAC-MD4, HMAC-MD5 @@ -3687,6 +3696,7 @@ provided by Libgcrypt. @cindex HMAC-Whirlpool @cindex HMAC-Stribog-256, HMAC-Stribog-512 @cindex HMAC-GOSTR-3411-94 + at cindex HMAC-BLAKE2s, HMAC-BLAKE2b @table @code @item GCRY_MAC_NONE This is not a real algorithm but used by some functions as an error @@ -3724,6 +3734,14 @@ algorithm. This is HMAC message authentication algorithm based on the SHA3-384 hash algorithm. 
+ at item GCRY_MAC_HMAC_SHA512_224 +This is HMAC message authentication algorithm based on the SHA-512/224 hash +algorithm. + + at item GCRY_MAC_HMAC_SHA512_256 +This is HMAC message authentication algorithm based on the SHA-512/256 hash +algorithm. + @item GCRY_MAC_HMAC_SHA1 This is HMAC message authentication algorithm based on the SHA-1 hash algorithm. @@ -3756,6 +3774,38 @@ algorithm described in GOST R 34.11-2012. This is HMAC message authentication algorithm based on the 512-bit hash algorithm described in GOST R 34.11-2012. + at item GCRY_MAC_HMAC_BLAKE2B_512 +This is HMAC message authentication algorithm based on the BLAKE2b-512 hash +algorithm. + + at item GCRY_MAC_HMAC_BLAKE2B_384 +This is HMAC message authentication algorithm based on the BLAKE2b-384 hash +algorithm. + + at item GCRY_MAC_HMAC_BLAKE2B_256 +This is HMAC message authentication algorithm based on the BLAKE2b-256 hash +algorithm. + + at item GCRY_MAC_HMAC_BLAKE2B_160 +This is HMAC message authentication algorithm based on the BLAKE2b-160 hash +algorithm. + + at item GCRY_MAC_HMAC_BLAKE2S_256 +This is HMAC message authentication algorithm based on the BLAKE2s-256 hash +algorithm. + + at item GCRY_MAC_HMAC_BLAKE2S_224 +This is HMAC message authentication algorithm based on the BLAKE2s-224 hash +algorithm. + + at item GCRY_MAC_HMAC_BLAKE2S_160 +This is HMAC message authentication algorithm based on the BLAKE2s-160 hash +algorithm. + + at item GCRY_MAC_HMAC_BLAKE2S_128 +This is HMAC message authentication algorithm based on the BLAKE2s-128 hash +algorithm. + @item GCRY_MAC_CMAC_AES This is CMAC (Cipher-based MAC) message authentication algorithm based on the AES block cipher algorithm. diff --git a/src/cipher.h b/src/cipher.h index 6e89be3da..5aac19f17 100644 --- a/src/cipher.h +++ b/src/cipher.h @@ -318,6 +318,8 @@ extern gcry_md_spec_t _gcry_digest_spec_sha224; extern gcry_md_spec_t _gcry_digest_spec_sha256; extern gcry_md_spec_t _gcry_digest_spec_sha384; extern gcry_md_spec_t _gcry_digest_spec_sha512; +extern gcry_md_spec_t _gcry_digest_spec_sha512_224; +extern gcry_md_spec_t _gcry_digest_spec_sha512_256; extern gcry_md_spec_t _gcry_digest_spec_sha3_224; extern gcry_md_spec_t _gcry_digest_spec_sha3_256; extern gcry_md_spec_t _gcry_digest_spec_sha3_512; diff --git a/src/gcrypt.h.in b/src/gcrypt.h.in index 36bbf200c..8346ce151 100644 --- a/src/gcrypt.h.in +++ b/src/gcrypt.h.in @@ -1248,6 +1248,8 @@ enum gcry_md_algos GCRY_MD_BLAKE2S_160 = 324, GCRY_MD_BLAKE2S_128 = 325, GCRY_MD_SM3 = 326, + GCRY_MD_SHA512_256 = 327, + GCRY_MD_SHA512_224 = 328, }; /* Flags used with the open function. 
*/ @@ -1443,6 +1445,8 @@ enum gcry_mac_algos GCRY_MAC_HMAC_BLAKE2S_160 = 126, GCRY_MAC_HMAC_BLAKE2S_128 = 127, GCRY_MAC_HMAC_SM3 = 128, + GCRY_MAC_HMAC_SHA512_256 = 129, + GCRY_MAC_HMAC_SHA512_224 = 130, GCRY_MAC_CMAC_AES = 201, GCRY_MAC_CMAC_3DES = 202, diff --git a/tests/basic.c b/tests/basic.c index a28dc6997..3d6e8fc1e 100644 --- a/tests/basic.c +++ b/tests/basic.c @@ -8536,6 +8536,18 @@ check_digests (void) "\x74\xee\x78\xeb\x79\x1f\x94\x38\x5b\x73\xef\xf8\xfd\x5d\x74\xd8" "\x51\x36\xfe\x63\x52\xde\x07\x70\x95\xd6\x78\x2b\x7b\x46\x8a\x2c" "\x30\x0f\x48\x0c\x74\x43\x06\xdb\xa3\x8d\x64\x3d\xe9\xa1\xa7\x72" }, + { GCRY_MD_SHA512_256, "abc", + "\x53\x04\x8E\x26\x81\x94\x1E\xF9\x9B\x2E\x29\xB7\x6B\x4C\x7D\xAB" + "\xE4\xC2\xD0\xC6\x34\xFC\x6D\x46\xE0\xE2\xF1\x31\x07\xE7\xAF\x23" }, + { GCRY_MD_SHA512_256, "!", + "\x9a\x59\xa0\x52\x93\x01\x87\xa9\x70\x38\xca\xe6\x92\xf3\x07\x08" + "\xaa\x64\x91\x92\x3e\xf5\x19\x43\x94\xdc\x68\xd5\x6c\x74\xfb\x21" }, + { GCRY_MD_SHA512_224, "abc", + "\x46\x34\x27\x0F\x70\x7B\x6A\x54\xDA\xAE\x75\x30\x46\x08\x42\xE2" + "\x0E\x37\xED\x26\x5C\xEE\xE9\xA4\x3E\x89\x24\xAA" }, + { GCRY_MD_SHA512_224, "!", + "\x37\xab\x33\x1d\x76\xf0\xd3\x6d\xe4\x22\xbd\x0e\xde\xb2\x2a\x28" + "\xac\xcd\x48\x7b\x7a\x84\x53\xae\x96\x5d\xd2\x87" }, { GCRY_MD_SHA3_224, "abc", "\xe6\x42\x82\x4c\x3f\x8c\xf2\x4a\xd0\x92\x34\xee\x7d\x3c\x76\x6f" "\xc9\xa3\xa5\x16\x8d\x0c\x94\xad\x73\xb4\x6f\xdf" }, From jussi.kivilinna at iki.fi Fri Apr 5 19:25:44 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Fri, 5 Apr 2019 20:25:44 +0300 Subject: [PATCH 1/7] Add AVX2/BMI2 implementation of SHA1 Message-ID: <155448514419.14473.7900660968388649981.stgit@localhost.localdomain> * cipher/Makefile.am: Add 'sha1-avx2-bmi2-amd64.S'. * cipher/hash-common.h (MD_BLOCK_CTX_BUFFER_SIZE): New. (gcry_md_block_ctx): Change buffer length to MD_BLOCK_CTX_BUFFER_SIZE. * cipher/sha1-avx-amd64.S: Add missing .size for transform function. * cipher/sha1-ssse3-amd64.S: Add missing .size for transform function. * cipher/sha1-avx-bmi2-amd64.S: Add missing .size for transform function; Tweak implementation for small ~1% speed increase. * cipher/sha1-avx2-bmi2-amd64.S: New. * cipher/sha1.c (USE_AVX2, _gcry_sha1_transform_amd64_avx2_bmi2) (do_sha1_transform_amd64_avx2_bmi2): New. (sha1_init) [USE_AVX2]: Enable AVX2 implementation if supported by HW features. (sha1_final): Merge processing of two last blocks when extra block is needed. 
-- Benchmarks on Intel Haswell (4.0 Ghz): Before (AVX/BMI2): | nanosecs/byte mebibytes/sec cycles/byte SHA1 | 0.970 ns/B 983.2 MiB/s 3.88 c/B After (AVX/BMI2, ~1% faster): | nanosecs/byte mebibytes/sec cycles/byte SHA1 | 0.960 ns/B 993.1 MiB/s 3.84 c/B After (AVX2/BMI2, ~9% faster): | nanosecs/byte mebibytes/sec cycles/byte SHA1 | 0.890 ns/B 1071 MiB/s 3.56 c/B Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/cipher/Makefile.am b/cipher/Makefile.am index 1e67771e5..3f00ed4a8 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -103,8 +103,8 @@ EXTRA_libcipher_la_SOURCES = \ serpent.c serpent-sse2-amd64.S \ serpent-avx2-amd64.S serpent-armv7-neon.S \ sha1.c sha1-ssse3-amd64.S sha1-avx-amd64.S sha1-avx-bmi2-amd64.S \ - sha1-armv7-neon.S sha1-armv8-aarch32-ce.S sha1-armv8-aarch64-ce.S \ - sha1-intel-shaext.c \ + sha1-avx2-bmi2-amd64.S sha1-armv7-neon.S sha1-armv8-aarch32-ce.S \ + sha1-armv8-aarch64-ce.S sha1-intel-shaext.c \ sha256.c sha256-ssse3-amd64.S sha256-avx-amd64.S \ sha256-avx2-bmi2-amd64.S \ sha256-armv8-aarch32-ce.S sha256-armv8-aarch64-ce.S \ diff --git a/cipher/hash-common.h b/cipher/hash-common.h index 23f81ed71..0b3ade11e 100644 --- a/cipher/hash-common.h +++ b/cipher/hash-common.h @@ -42,9 +42,12 @@ typedef unsigned int (*_gcry_md_block_write_t) (void *c, # define MD_NBLOCKS_TYPE u32 #endif +/* SHA1 needs 2x64 bytes and SHA-512 needs 128 bytes. */ +#define MD_BLOCK_CTX_BUFFER_SIZE 128 + typedef struct gcry_md_block_ctx { - byte buf[MD_BLOCK_MAX_BLOCKSIZE]; + byte buf[MD_BLOCK_CTX_BUFFER_SIZE]; MD_NBLOCKS_TYPE nblocks; MD_NBLOCKS_TYPE nblocks_high; int count; diff --git a/cipher/sha1-avx-amd64.S b/cipher/sha1-avx-amd64.S index 5f5b9c0e4..143e4066d 100644 --- a/cipher/sha1-avx-amd64.S +++ b/cipher/sha1-avx-amd64.S @@ -421,6 +421,8 @@ _gcry_sha1_transform_amd64_avx: .Lret: ret; +ELF(.size _gcry_sha1_transform_amd64_avx, + .-_gcry_sha1_transform_amd64_avx;) #endif #endif diff --git a/cipher/sha1-avx-bmi2-amd64.S b/cipher/sha1-avx-bmi2-amd64.S index 8292c3afb..79ea24ef9 100644 --- a/cipher/sha1-avx-bmi2-amd64.S +++ b/cipher/sha1-avx-bmi2-amd64.S @@ -60,20 +60,15 @@ /* Constants */ .text -#define K1 0x5A827999 -#define K2 0x6ED9EBA1 -#define K3 0x8F1BBCDC -#define K4 0xCA62C1D6 .align 16 -.LK_XMM: -.LK1: .long K1, K1, K1, K1 -.LK2: .long K2, K2, K2, K2 -.LK3: .long K3, K3, K3, K3 -.LK4: .long K4, K4, K4, K4 - .Lbswap_shufb_ctl: .long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f +.LK1: .long 0x5A827999 +.LK2: .long 0x6ED9EBA1 +.LK3: .long 0x8F1BBCDC +.LK4: .long 0xCA62C1D6 + /* Register macros */ @@ -82,14 +77,15 @@ #define ROLDSTACK %r10 #define RNBLKS %r11 -#define a %eax -#define b %ebx -#define c %ecx +#define a %esi +#define b %edi +#define c %ebp #define d %edx -#define e %edi +#define e %ecx +#define ne %ebx -#define RT0 %esi -#define RT1 %ebp +#define RT0 %eax +#define RT1 %r12d #define Wtmp0 %xmm0 #define Wtmp1 %xmm1 @@ -105,6 +101,11 @@ #define BSWAP_REG %xmm10 +#define K1 %xmm11 +#define K2 %xmm12 +#define K3 %xmm13 +#define K4 %xmm14 + /* Round function macros. 
*/ @@ -117,9 +118,9 @@ andl b, RT0; \ rorxl $2, b, b; \ addl RT1, e; \ - leal (RT0,e), e; \ - rorxl $27, a, RT1; \ - addl RT1, e; + addl ne, a; \ + leal (RT0,e), ne; \ + rorxl $27, a, e; #define R_F2(a,b,c,d,e,i) \ movl c, RT0; \ @@ -127,22 +128,22 @@ xorl b, RT0; \ rorxl $2, b, b; \ xorl d, RT0; \ - leal (RT0,e), e; \ - rorxl $27, a, RT1; \ - addl RT1, e; + addl ne, a; \ + leal (RT0,e), ne; \ + rorxl $27, a, e; #define R_F3(a,b,c,d,e,i) \ movl c, RT0; \ movl b, RT1; \ + addl WK(i), e; \ xorl b, RT0; \ andl c, RT1; \ andl d, RT0; \ addl RT1, e; \ - addl WK(i), e; \ rorxl $2, b, b; \ - leal (RT0,e), e; \ - rorxl $27, a, RT1; \ - addl RT1, e; + addl ne, a; \ + leal (RT0,e), ne; \ + rorxl $27, a, e; #define R_F4(a,b,c,d,e,i) R_F2(a,b,c,d,e,i) @@ -158,8 +159,8 @@ #define W_PRECALC_00_15_1(i, W, tmp0) \ vpshufb BSWAP_REG, tmp0, W; -#define W_PRECALC_00_15_2(i, W, tmp0) \ - vpaddd (.LK_XMM + ((i)/20)*16) RIP, W, tmp0; +#define W_PRECALC_00_15_2(i, W, tmp0, K) \ + vpaddd K, W, tmp0; #define W_PRECALC_00_15_3(i, W, tmp0) \ vmovdqa tmp0, WK(i&~3); @@ -181,10 +182,10 @@ vpsrld $30, tmp1, W; \ vpslld $2, tmp1, tmp1; -#define W_PRECALC_16_31_3(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \ +#define W_PRECALC_16_31_3(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1, K) \ vpxor W, tmp0, tmp0; \ vpxor tmp1, tmp0, W; \ - vpaddd (.LK_XMM + ((i)/20)*16) RIP, W, tmp0; \ + vpaddd K, W, tmp0; \ vmovdqa tmp0, WK((i)&~3); #define W_PRECALC_32_79_0(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ @@ -199,9 +200,9 @@ vpsrld $30, W, tmp0; \ vpslld $2, W, W; -#define W_PRECALC_32_79_3(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ +#define W_PRECALC_32_79_3(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0, K) \ vpor W, tmp0, W; \ - vpaddd (.LK_XMM + ((i)/20)*16) RIP, W, tmp0; \ + vpaddd K, W, tmp0; \ vmovdqa tmp0, WK((i)&~3); @@ -233,6 +234,7 @@ _gcry_sha1_transform_amd64_avx_bmi2: movq %rsi, RDATA; pushq %rbx; pushq %rbp; + pushq %r12; movq %rsp, ROLDSTACK; @@ -245,25 +247,30 @@ _gcry_sha1_transform_amd64_avx_bmi2: movl state_h2(RSTATE), c; movl state_h3(RSTATE), d; movl state_h4(RSTATE), e; + xorl ne, ne; vmovdqa .Lbswap_shufb_ctl RIP, BSWAP_REG; + vpbroadcastd .LK1 RIP, K1; + vpbroadcastd .LK2 RIP, K2; + vpbroadcastd .LK3 RIP, K3; + vpbroadcastd .LK4 RIP, K4; /* Precalc 0-15. 
*/ W_PRECALC_00_15_0(0, W0, Wtmp0); W_PRECALC_00_15_1(1, W0, Wtmp0); - W_PRECALC_00_15_2(2, W0, Wtmp0); + W_PRECALC_00_15_2(2, W0, Wtmp0, K1); W_PRECALC_00_15_3(3, W0, Wtmp0); W_PRECALC_00_15_0(4, W7, Wtmp0); W_PRECALC_00_15_1(5, W7, Wtmp0); - W_PRECALC_00_15_2(6, W7, Wtmp0); + W_PRECALC_00_15_2(6, W7, Wtmp0, K1); W_PRECALC_00_15_3(7, W7, Wtmp0); W_PRECALC_00_15_0(8, W6, Wtmp0); W_PRECALC_00_15_1(9, W6, Wtmp0); - W_PRECALC_00_15_2(10, W6, Wtmp0); + W_PRECALC_00_15_2(10, W6, Wtmp0, K1); W_PRECALC_00_15_3(11, W6, Wtmp0); W_PRECALC_00_15_0(12, W5, Wtmp0); W_PRECALC_00_15_1(13, W5, Wtmp0); - W_PRECALC_00_15_2(14, W5, Wtmp0); + W_PRECALC_00_15_2(14, W5, Wtmp0, K1); W_PRECALC_00_15_3(15, W5, Wtmp0); .align 8 @@ -274,69 +281,69 @@ _gcry_sha1_transform_amd64_avx_bmi2: R( a, b, c, d, e, F1, 0 ); W_PRECALC_16_31_0(16, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); R( e, a, b, c, d, F1, 1 ); W_PRECALC_16_31_1(17, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); R( d, e, a, b, c, F1, 2 ); W_PRECALC_16_31_2(18, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); - R( c, d, e, a, b, F1, 3 ); W_PRECALC_16_31_3(19, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); + R( c, d, e, a, b, F1, 3 ); W_PRECALC_16_31_3(19, W4, W5, W6, W7, W0, Wtmp0, Wtmp1, K1); R( b, c, d, e, a, F1, 4 ); W_PRECALC_16_31_0(20, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); R( a, b, c, d, e, F1, 5 ); W_PRECALC_16_31_1(21, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); R( e, a, b, c, d, F1, 6 ); W_PRECALC_16_31_2(22, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); - R( d, e, a, b, c, F1, 7 ); W_PRECALC_16_31_3(23, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); + R( d, e, a, b, c, F1, 7 ); W_PRECALC_16_31_3(23, W3, W4, W5, W6, W7, Wtmp0, Wtmp1, K2); R( c, d, e, a, b, F1, 8 ); W_PRECALC_16_31_0(24, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); R( b, c, d, e, a, F1, 9 ); W_PRECALC_16_31_1(25, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); R( a, b, c, d, e, F1, 10 ); W_PRECALC_16_31_2(26, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); - R( e, a, b, c, d, F1, 11 ); W_PRECALC_16_31_3(27, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); + R( e, a, b, c, d, F1, 11 ); W_PRECALC_16_31_3(27, W2, W3, W4, W5, W6, Wtmp0, Wtmp1, K2); R( d, e, a, b, c, F1, 12 ); W_PRECALC_16_31_0(28, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); R( c, d, e, a, b, F1, 13 ); W_PRECALC_16_31_1(29, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); R( b, c, d, e, a, F1, 14 ); W_PRECALC_16_31_2(30, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); - R( a, b, c, d, e, F1, 15 ); W_PRECALC_16_31_3(31, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); + R( a, b, c, d, e, F1, 15 ); W_PRECALC_16_31_3(31, W1, W2, W3, W4, W5, Wtmp0, Wtmp1, K2); /* Transform 16-63 + Precalc 32-79. 
*/ R( e, a, b, c, d, F1, 16 ); W_PRECALC_32_79_0(32, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); R( d, e, a, b, c, F1, 17 ); W_PRECALC_32_79_1(33, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); R( c, d, e, a, b, F1, 18 ); W_PRECALC_32_79_2(34, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); - R( b, c, d, e, a, F1, 19 ); W_PRECALC_32_79_3(35, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); + R( b, c, d, e, a, F1, 19 ); W_PRECALC_32_79_3(35, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0, K2); R( a, b, c, d, e, F2, 20 ); W_PRECALC_32_79_0(36, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); R( e, a, b, c, d, F2, 21 ); W_PRECALC_32_79_1(37, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); R( d, e, a, b, c, F2, 22 ); W_PRECALC_32_79_2(38, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); - R( c, d, e, a, b, F2, 23 ); W_PRECALC_32_79_3(39, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); + R( c, d, e, a, b, F2, 23 ); W_PRECALC_32_79_3(39, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0, K2); R( b, c, d, e, a, F2, 24 ); W_PRECALC_32_79_0(40, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); R( a, b, c, d, e, F2, 25 ); W_PRECALC_32_79_1(41, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); R( e, a, b, c, d, F2, 26 ); W_PRECALC_32_79_2(42, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); - R( d, e, a, b, c, F2, 27 ); W_PRECALC_32_79_3(43, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); + R( d, e, a, b, c, F2, 27 ); W_PRECALC_32_79_3(43, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0, K3); R( c, d, e, a, b, F2, 28 ); W_PRECALC_32_79_0(44, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); R( b, c, d, e, a, F2, 29 ); W_PRECALC_32_79_1(45, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); R( a, b, c, d, e, F2, 30 ); W_PRECALC_32_79_2(46, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); - R( e, a, b, c, d, F2, 31 ); W_PRECALC_32_79_3(47, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); + R( e, a, b, c, d, F2, 31 ); W_PRECALC_32_79_3(47, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0, K3); R( d, e, a, b, c, F2, 32 ); W_PRECALC_32_79_0(48, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); R( c, d, e, a, b, F2, 33 ); W_PRECALC_32_79_1(49, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); R( b, c, d, e, a, F2, 34 ); W_PRECALC_32_79_2(50, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); - R( a, b, c, d, e, F2, 35 ); W_PRECALC_32_79_3(51, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); + R( a, b, c, d, e, F2, 35 ); W_PRECALC_32_79_3(51, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0, K3); R( e, a, b, c, d, F2, 36 ); W_PRECALC_32_79_0(52, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0); R( d, e, a, b, c, F2, 37 ); W_PRECALC_32_79_1(53, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0); R( c, d, e, a, b, F2, 38 ); W_PRECALC_32_79_2(54, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0); - R( b, c, d, e, a, F2, 39 ); W_PRECALC_32_79_3(55, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0); + R( b, c, d, e, a, F2, 39 ); W_PRECALC_32_79_3(55, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0, K3); R( a, b, c, d, e, F3, 40 ); W_PRECALC_32_79_0(56, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); R( e, a, b, c, d, F3, 41 ); W_PRECALC_32_79_1(57, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); R( d, e, a, b, c, F3, 42 ); W_PRECALC_32_79_2(58, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); - R( c, d, e, a, b, F3, 43 ); W_PRECALC_32_79_3(59, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); + R( c, d, e, a, b, F3, 43 ); W_PRECALC_32_79_3(59, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0, K3); R( b, c, d, e, a, F3, 44 ); W_PRECALC_32_79_0(60, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0); R( a, b, c, d, e, F3, 45 ); W_PRECALC_32_79_1(61, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0); R( e, a, b, c, d, F3, 46 ); W_PRECALC_32_79_2(62, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0); - R( d, e, a, b, c, F3, 47 ); W_PRECALC_32_79_3(63, W1, W2, W3, W4, W5, W6, 
W7, W0, Wtmp0); + R( d, e, a, b, c, F3, 47 ); W_PRECALC_32_79_3(63, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0, K4); R( c, d, e, a, b, F3, 48 ); W_PRECALC_32_79_0(64, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); R( b, c, d, e, a, F3, 49 ); W_PRECALC_32_79_1(65, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); R( a, b, c, d, e, F3, 50 ); W_PRECALC_32_79_2(66, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); - R( e, a, b, c, d, F3, 51 ); W_PRECALC_32_79_3(67, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); + R( e, a, b, c, d, F3, 51 ); W_PRECALC_32_79_3(67, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0, K4); R( d, e, a, b, c, F3, 52 ); W_PRECALC_32_79_0(68, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); R( c, d, e, a, b, F3, 53 ); W_PRECALC_32_79_1(69, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); R( b, c, d, e, a, F3, 54 ); W_PRECALC_32_79_2(70, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); - R( a, b, c, d, e, F3, 55 ); W_PRECALC_32_79_3(71, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); + R( a, b, c, d, e, F3, 55 ); W_PRECALC_32_79_3(71, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0, K4); R( e, a, b, c, d, F3, 56 ); W_PRECALC_32_79_0(72, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); R( d, e, a, b, c, F3, 57 ); W_PRECALC_32_79_1(73, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); R( c, d, e, a, b, F3, 58 ); W_PRECALC_32_79_2(74, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); - R( b, c, d, e, a, F3, 59 ); W_PRECALC_32_79_3(75, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); + R( b, c, d, e, a, F3, 59 ); W_PRECALC_32_79_3(75, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0, K4); R( a, b, c, d, e, F4, 60 ); W_PRECALC_32_79_0(76, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); R( e, a, b, c, d, F4, 61 ); W_PRECALC_32_79_1(77, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); R( d, e, a, b, c, F4, 62 ); W_PRECALC_32_79_2(78, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); - R( c, d, e, a, b, F4, 63 ); W_PRECALC_32_79_3(79, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); + R( c, d, e, a, b, F4, 63 ); W_PRECALC_32_79_3(79, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0, K4); decq RNBLKS; jz .Lend; @@ -344,21 +351,23 @@ _gcry_sha1_transform_amd64_avx_bmi2: /* Transform 64-79 + Precalc 0-15 of next block. */ R( b, c, d, e, a, F4, 64 ); W_PRECALC_00_15_0(0, W0, Wtmp0); R( a, b, c, d, e, F4, 65 ); W_PRECALC_00_15_1(1, W0, Wtmp0); - R( e, a, b, c, d, F4, 66 ); W_PRECALC_00_15_2(2, W0, Wtmp0); + R( e, a, b, c, d, F4, 66 ); W_PRECALC_00_15_2(2, W0, Wtmp0, K1); R( d, e, a, b, c, F4, 67 ); W_PRECALC_00_15_3(3, W0, Wtmp0); R( c, d, e, a, b, F4, 68 ); W_PRECALC_00_15_0(4, W7, Wtmp0); R( b, c, d, e, a, F4, 69 ); W_PRECALC_00_15_1(5, W7, Wtmp0); - R( a, b, c, d, e, F4, 70 ); W_PRECALC_00_15_2(6, W7, Wtmp0); + R( a, b, c, d, e, F4, 70 ); W_PRECALC_00_15_2(6, W7, Wtmp0, K1); R( e, a, b, c, d, F4, 71 ); W_PRECALC_00_15_3(7, W7, Wtmp0); R( d, e, a, b, c, F4, 72 ); W_PRECALC_00_15_0(8, W6, Wtmp0); R( c, d, e, a, b, F4, 73 ); W_PRECALC_00_15_1(9, W6, Wtmp0); - R( b, c, d, e, a, F4, 74 ); W_PRECALC_00_15_2(10, W6, Wtmp0); + R( b, c, d, e, a, F4, 74 ); W_PRECALC_00_15_2(10, W6, Wtmp0, K1); R( a, b, c, d, e, F4, 75 ); W_PRECALC_00_15_3(11, W6, Wtmp0); R( e, a, b, c, d, F4, 76 ); W_PRECALC_00_15_0(12, W5, Wtmp0); R( d, e, a, b, c, F4, 77 ); W_PRECALC_00_15_1(13, W5, Wtmp0); R( c, d, e, a, b, F4, 78 ); - addl state_h0(RSTATE), a; W_PRECALC_00_15_2(14, W5, Wtmp0); + addl state_h0(RSTATE), a; W_PRECALC_00_15_2(14, W5, Wtmp0, K1); R( b, c, d, e, a, F4, 79 ); W_PRECALC_00_15_3(15, W5, Wtmp0); + addl ne, a; + xorl ne, ne; /* Update the chaining variables. 
*/ addl state_h3(RSTATE), d; @@ -396,6 +405,8 @@ _gcry_sha1_transform_amd64_avx_bmi2: R( c, d, e, a, b, F4, 78 ); addl state_h0(RSTATE), a; R( b, c, d, e, a, F4, 79 ); + addl ne, a; + xorl ne, ne; /* Update the chaining variables. */ addl state_h3(RSTATE), d; @@ -411,14 +422,17 @@ _gcry_sha1_transform_amd64_avx_bmi2: movq ROLDSTACK, %rsp; + popq %r12; popq %rbp; popq %rbx; /* burn_stack */ - movl $(16*4 + 2*8 + 31), %eax; + movl $(16*4 + 3*8 + 31), %eax; .Lret: ret; +ELF(.size _gcry_sha1_transform_amd64_avx_bmi2, + .-_gcry_sha1_transform_amd64_avx_bmi2;) #endif #endif diff --git a/cipher/sha1-avx2-bmi2-amd64.S b/cipher/sha1-avx2-bmi2-amd64.S new file mode 100644 index 000000000..c666290f2 --- /dev/null +++ b/cipher/sha1-avx2-bmi2-amd64.S @@ -0,0 +1,570 @@ +/* sha1-avx2-bmi2-amd64.S - Intel AVX2/BMI2 accelerated SHA-1 transform function + * Copyright (C) 2019 Jussi Kivilinna + * + * Based on sha1.c: + * Copyright (C) 1998, 2001, 2002, 2003, 2008 Free Software Foundation, Inc. + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . + */ + +/* + * Intel SSSE3 accelerated SHA-1 implementation based on white paper: + * "Improving the Performance of the Secure Hash Algorithm (SHA-1)" + * http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1 + */ + +#ifdef __x86_64__ +#include + +#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ + defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ + defined(HAVE_GCC_INLINE_ASM_BMI2) && defined(HAVE_GCC_INLINE_ASM_AVX) && \ + defined(HAVE_GCC_INLINE_ASM_AVX2) && defined(USE_SHA1) + +#ifdef __PIC__ +# define RIP (%rip) +#else +# define RIP +#endif + + +#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS +# define ELF(...) __VA_ARGS__ +#else +# define ELF(...) /*_*/ +#endif + + +/* Context structure */ + +#define state_h0 0 +#define state_h1 4 +#define state_h2 8 +#define state_h3 12 +#define state_h4 16 + + +/* Constants */ + +#define WK_STACK_WORDS (80 * 2) + +.text +.align 16 +.Lbswap_shufb_ctl: + .long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f + +.LK1: .long 0x5A827999 +.LK2: .long 0x6ED9EBA1 +.LK3: .long 0x8F1BBCDC +.LK4: .long 0xCA62C1D6 + + +/* Register macros */ + +#define RSTATE %r8 +#define RDATA %r9 +#define ROLDSTACK %r10 +#define RNBLKS %r11 + +#define a %eax +#define b %ebx +#define c %ecx +#define d %edx +#define e %edi +#define ne %r12d + +#define RT0 %esi +#define RT1 %ebp + +#define Wtmp0 %ymm0 +#define Wtmp1 %ymm1 +#define Wtmp0x %xmm0 +#define Wtmp1x %xmm1 + +#define W0 %ymm2 +#define W1 %ymm3 +#define W2 %ymm4 +#define W3 %ymm5 +#define W4 %ymm6 +#define W5 %ymm7 +#define W6 %ymm8 +#define W7 %ymm9 + +#define BSWAP_REG %ymm10 + +#define K1 %ymm11 +#define K2 %ymm12 +#define K3 %ymm13 +#define K4 %ymm14 + + +/* Round function macros. 
*/ + +#define WK(i,block) ((block) * 16 + ((i) / 4) * 32 + ((i) % 4) * 4)(%rsp) +#define PRE_WK(i) ((i) * 4 * 2)(%rsp) + +#define R_F1(a,b,c,d,e,i,block) \ + movl c, RT0; \ + andn d, b, RT1; \ + addl WK(i,block), e; \ + andl b, RT0; \ + leal (a,ne), a; \ + rorxl $2, b, b; \ + addl RT1, e; \ + rorxl $27, a, ne; \ + addl RT0, e; + +#define R_F2(a,b,c,d,e,i,block) \ + addl WK(i,block), e; \ + movl c, RT0; \ + xorl b, RT0; \ + leal (a,ne), a; \ + rorxl $2, b, b; \ + xorl d, RT0; \ + addl RT0, e; \ + rorxl $27, a, ne; + +#define R_F3(a,b,c,d,e,i,block) \ + movl c, RT0; \ + addl WK(i,block), e; \ + movl b, RT1; \ + xorl b, RT0; \ + leal (a,ne), a; \ + rorxl $2, b, b; \ + andl c, RT1; \ + addl RT1, e; \ + andl d, RT0; \ + rorxl $27, a, ne; \ + addl RT0, e; + +#define R_F4(a,b,c,d,e,i,block) R_F2(a,b,c,d,e,i,block) + +#define R(a,b,c,d,e,f,i,block) \ + R_##f(a,b,c,d,e,i,block) + + +/* Input expansion macros. */ + +#define W_PRECALC_00_15_0(i, W, tmp0) \ + vmovdqu (4*(i))(RDATA), tmp0##x; \ + vinserti128 $1, (4*(i) + 64)(RDATA), tmp0, tmp0; + +#define W_PRECALC_00_15_1(i, W, tmp0) \ + vpshufb BSWAP_REG, tmp0, W; + +#define W_PRECALC_00_15_2(i, W, tmp0, K) \ + vpaddd K, W, tmp0; + +#define W_PRECALC_00_15_3(i, W, tmp0) \ + vmovdqa tmp0, PRE_WK((i)&~3); + +#define W_PRECALC_16_31_0(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \ + vpalignr $8, W_m16, W_m12, W; \ + vpsrldq $4, W_m04, tmp0; \ + vpxor W_m08, W, W; + +#define W_PRECALC_16_31_1(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \ + vpxor W_m16, tmp0, tmp0; \ + vpxor tmp0, W, W; \ + vpslld $1, W, tmp0; \ + vpslldq $12, W, tmp1; \ + vpsrld $31, W, W; + +#define W_PRECALC_16_31_2(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \ + vpor W, tmp0, tmp0; \ + vpsrld $30, tmp1, W; \ + vpslld $2, tmp1, tmp1; + +#define W_PRECALC_16_31_3(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1, K) \ + vpxor W, tmp0, tmp0; \ + vpxor tmp1, tmp0, W; \ + vpaddd K, W, tmp0; \ + vmovdqa tmp0, PRE_WK((i)&~3); + +#define W_PRECALC_32_79_0(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ + vpxor W_m28, W, W; \ + vpalignr $8, W_m08, W_m04, tmp0; + +#define W_PRECALC_32_79_1(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ + vpxor W_m16, W, W; \ + vpxor tmp0, W, W; + +#define W_PRECALC_32_79_2(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ + vpsrld $30, W, tmp0; \ + vpslld $2, W, W; + +#define W_PRECALC_32_79_3(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0, K) \ + vpor W, tmp0, W; \ + vpaddd K, W, tmp0; \ + vmovdqa tmp0, PRE_WK((i)&~3); + + +/* + * Transform 2*nblks*64 bytes (2*nblks*16 32-bit words) at DATA. + * + * unsigned int + * _gcry_sha1_transform_amd64_avx2_bmi2 (void *ctx, const unsigned char *data, + * size_t nblks) + */ +.globl _gcry_sha1_transform_amd64_avx2_bmi2 +ELF(.type _gcry_sha1_transform_amd64_avx2_bmi2, at function) +.align 16 +_gcry_sha1_transform_amd64_avx2_bmi2: + /* input: + * %rdi: ctx, CTX + * %rsi: data (64*nblks bytes) + * %rdx: nblks (multiple of 2, larger than 0) + */ + + vzeroupper; + + movq %rdx, RNBLKS; + movq %rdi, RSTATE; + movq %rsi, RDATA; + pushq %rbx; + pushq %rbp; + pushq %r12; + + movq %rsp, ROLDSTACK; + + subq $(WK_STACK_WORDS*4), %rsp; + andq $(~63), %rsp; + + /* Get the values of the chaining variables. 
*/ + movl state_h0(RSTATE), a; + movl state_h1(RSTATE), b; + movl state_h2(RSTATE), c; + movl state_h3(RSTATE), d; + movl state_h4(RSTATE), e; + xorl ne, ne; + + vbroadcasti128 .Lbswap_shufb_ctl RIP, BSWAP_REG; + vpbroadcastd .LK1 RIP, K1; + vpbroadcastd .LK2 RIP, K2; + vpbroadcastd .LK3 RIP, K3; + vpbroadcastd .LK4 RIP, K4; + + /* Precalc 0-31 for block 1 & 2. */ + W_PRECALC_00_15_0(0, W0, Wtmp0); + W_PRECALC_00_15_1(1, W0, Wtmp0); + W_PRECALC_00_15_2(2, W0, Wtmp0, K1); + W_PRECALC_00_15_3(3, W0, Wtmp0); + W_PRECALC_00_15_0(4, W7, Wtmp0); + W_PRECALC_00_15_1(5, W7, Wtmp0); + W_PRECALC_00_15_2(6, W7, Wtmp0, K1); + W_PRECALC_00_15_3(7, W7, Wtmp0); + W_PRECALC_00_15_0(8, W6, Wtmp0); + W_PRECALC_00_15_1(9, W6, Wtmp0); + W_PRECALC_00_15_2(10, W6, Wtmp0, K1); + W_PRECALC_00_15_3(11, W6, Wtmp0); + W_PRECALC_00_15_0(12, W5, Wtmp0); + W_PRECALC_00_15_1(13, W5, Wtmp0); + W_PRECALC_00_15_2(14, W5, Wtmp0, K1); + W_PRECALC_00_15_3(15, W5, Wtmp0); + W_PRECALC_16_31_0(16, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); + W_PRECALC_16_31_1(17, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); + W_PRECALC_16_31_2(18, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); + W_PRECALC_16_31_3(19, W4, W5, W6, W7, W0, Wtmp0, Wtmp1, K1); + W_PRECALC_16_31_0(20, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); + W_PRECALC_16_31_1(21, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); + W_PRECALC_16_31_2(22, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); + W_PRECALC_16_31_3(23, W3, W4, W5, W6, W7, Wtmp0, Wtmp1, K2); + W_PRECALC_16_31_0(24, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); + W_PRECALC_16_31_1(25, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); + W_PRECALC_16_31_2(26, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); + W_PRECALC_16_31_3(27, W2, W3, W4, W5, W6, Wtmp0, Wtmp1, K2); + W_PRECALC_16_31_0(28, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); + W_PRECALC_16_31_1(29, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); + W_PRECALC_16_31_2(30, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); + W_PRECALC_16_31_3(31, W1, W2, W3, W4, W5, Wtmp0, Wtmp1, K2); + +.align 8 +.Loop: + addq $(2 * 64), RDATA; + + /* Transform 0-15 for block 1 + Precalc 32-47 for block 1 & 2. 
*/ + R( a, b, c, d, e, F1, 0, 0 ); W_PRECALC_32_79_0(32, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); + R( e, a, b, c, d, F1, 1, 0 ); W_PRECALC_32_79_1(33, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); + R( d, e, a, b, c, F1, 2, 0 ); W_PRECALC_32_79_2(34, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); + R( c, d, e, a, b, F1, 3, 0 ); W_PRECALC_32_79_3(35, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0, K2); + R( b, c, d, e, a, F1, 4, 0 ); W_PRECALC_32_79_0(36, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); + R( a, b, c, d, e, F1, 5, 0 ); W_PRECALC_32_79_1(37, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); + R( e, a, b, c, d, F1, 6, 0 ); W_PRECALC_32_79_2(38, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); + R( d, e, a, b, c, F1, 7, 0 ); W_PRECALC_32_79_3(39, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0, K2); + R( c, d, e, a, b, F1, 8, 0 ); W_PRECALC_32_79_0(40, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); + R( b, c, d, e, a, F1, 9, 0 ); W_PRECALC_32_79_1(41, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); + R( a, b, c, d, e, F1, 10, 0 ); W_PRECALC_32_79_2(42, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); + R( e, a, b, c, d, F1, 11, 0 ); W_PRECALC_32_79_3(43, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0, K3); + R( d, e, a, b, c, F1, 12, 0 ); W_PRECALC_32_79_0(44, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); + R( c, d, e, a, b, F1, 13, 0 ); W_PRECALC_32_79_1(45, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); + R( b, c, d, e, a, F1, 14, 0 ); W_PRECALC_32_79_2(46, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); + R( a, b, c, d, e, F1, 15, 0 ); W_PRECALC_32_79_3(47, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0, K3); + + /* Transform 16-47 for block 1 + Precalc 48-79 for block 1 & 2. */ + R( e, a, b, c, d, F1, 16, 0 ); W_PRECALC_32_79_0(48, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); + R( d, e, a, b, c, F1, 17, 0 ); W_PRECALC_32_79_1(49, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); + R( c, d, e, a, b, F1, 18, 0 ); W_PRECALC_32_79_2(50, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); + R( b, c, d, e, a, F1, 19, 0 ); W_PRECALC_32_79_3(51, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0, K3); + R( a, b, c, d, e, F2, 20, 0 ); W_PRECALC_32_79_0(52, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0); + R( e, a, b, c, d, F2, 21, 0 ); W_PRECALC_32_79_1(53, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0); + R( d, e, a, b, c, F2, 22, 0 ); W_PRECALC_32_79_2(54, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0); + R( c, d, e, a, b, F2, 23, 0 ); W_PRECALC_32_79_3(55, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0, K3); + R( b, c, d, e, a, F2, 24, 0 ); W_PRECALC_32_79_0(56, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); + R( a, b, c, d, e, F2, 25, 0 ); W_PRECALC_32_79_1(57, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); + R( e, a, b, c, d, F2, 26, 0 ); W_PRECALC_32_79_2(58, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); + R( d, e, a, b, c, F2, 27, 0 ); W_PRECALC_32_79_3(59, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0, K3); + R( c, d, e, a, b, F2, 28, 0 ); W_PRECALC_32_79_0(60, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0); + R( b, c, d, e, a, F2, 29, 0 ); W_PRECALC_32_79_1(61, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0); + R( a, b, c, d, e, F2, 30, 0 ); W_PRECALC_32_79_2(62, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0); + R( e, a, b, c, d, F2, 31, 0 ); W_PRECALC_32_79_3(63, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0, K4); + R( d, e, a, b, c, F2, 32, 0 ); W_PRECALC_32_79_0(64, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); + R( c, d, e, a, b, F2, 33, 0 ); W_PRECALC_32_79_1(65, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); + R( b, c, d, e, a, F2, 34, 0 ); W_PRECALC_32_79_2(66, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); + R( a, b, c, d, e, F2, 35, 0 ); W_PRECALC_32_79_3(67, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0, K4); + R( e, a, b, c, d, F2, 36, 0 ); 
W_PRECALC_32_79_0(68, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); + R( d, e, a, b, c, F2, 37, 0 ); W_PRECALC_32_79_1(69, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); + R( c, d, e, a, b, F2, 38, 0 ); W_PRECALC_32_79_2(70, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); + R( b, c, d, e, a, F2, 39, 0 ); W_PRECALC_32_79_3(71, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0, K4); + R( a, b, c, d, e, F3, 40, 0 ); W_PRECALC_32_79_0(72, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); + R( e, a, b, c, d, F3, 41, 0 ); W_PRECALC_32_79_1(73, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); + R( d, e, a, b, c, F3, 42, 0 ); W_PRECALC_32_79_2(74, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); + R( c, d, e, a, b, F3, 43, 0 ); W_PRECALC_32_79_3(75, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0, K4); + R( b, c, d, e, a, F3, 44, 0 ); W_PRECALC_32_79_0(76, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); + R( a, b, c, d, e, F3, 45, 0 ); W_PRECALC_32_79_1(77, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); + R( e, a, b, c, d, F3, 46, 0 ); W_PRECALC_32_79_2(78, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); + R( d, e, a, b, c, F3, 47, 0 ); W_PRECALC_32_79_3(79, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0, K4); + + /* Transform 48-79 for block 1. */ + R( c, d, e, a, b, F3, 48, 0 ); + R( b, c, d, e, a, F3, 49, 0 ); + R( a, b, c, d, e, F3, 50, 0 ); + R( e, a, b, c, d, F3, 51, 0 ); + R( d, e, a, b, c, F3, 52, 0 ); + R( c, d, e, a, b, F3, 53, 0 ); + R( b, c, d, e, a, F3, 54, 0 ); + R( a, b, c, d, e, F3, 55, 0 ); + R( e, a, b, c, d, F3, 56, 0 ); + R( d, e, a, b, c, F3, 57, 0 ); + R( c, d, e, a, b, F3, 58, 0 ); + R( b, c, d, e, a, F3, 59, 0 ); + R( a, b, c, d, e, F4, 60, 0 ); + R( e, a, b, c, d, F4, 61, 0 ); + R( d, e, a, b, c, F4, 62, 0 ); + R( c, d, e, a, b, F4, 63, 0 ); + R( b, c, d, e, a, F4, 64, 0 ); + R( a, b, c, d, e, F4, 65, 0 ); + R( e, a, b, c, d, F4, 66, 0 ); + R( d, e, a, b, c, F4, 67, 0 ); + R( c, d, e, a, b, F4, 68, 0 ); + R( b, c, d, e, a, F4, 69, 0 ); + R( a, b, c, d, e, F4, 70, 0 ); + R( e, a, b, c, d, F4, 71, 0 ); + R( d, e, a, b, c, F4, 72, 0 ); + R( c, d, e, a, b, F4, 73, 0 ); + R( b, c, d, e, a, F4, 74, 0 ); + R( a, b, c, d, e, F4, 75, 0 ); + R( e, a, b, c, d, F4, 76, 0 ); + R( d, e, a, b, c, F4, 77, 0 ); + R( c, d, e, a, b, F4, 78, 0 ); + addl state_h0(RSTATE), a; + R( b, c, d, e, a, F4, 79, 0 ); + addl ne, a; + xorl ne, ne; + + /* Update the chaining variables. */ + addl state_h3(RSTATE), d; + addl state_h2(RSTATE), c; + addl state_h1(RSTATE), b; + addl state_h4(RSTATE), e; + + movl d, state_h3(RSTATE); + movl c, state_h2(RSTATE); + movl b, state_h1(RSTATE); + movl a, state_h0(RSTATE); + movl e, state_h4(RSTATE); + + /* Transform 0-47 for block 2. 
*/ + R( a, b, c, d, e, F1, 0, 1 ); + R( e, a, b, c, d, F1, 1, 1 ); + R( d, e, a, b, c, F1, 2, 1 ); + R( c, d, e, a, b, F1, 3, 1 ); + R( b, c, d, e, a, F1, 4, 1 ); + R( a, b, c, d, e, F1, 5, 1 ); + R( e, a, b, c, d, F1, 6, 1 ); + R( d, e, a, b, c, F1, 7, 1 ); + R( c, d, e, a, b, F1, 8, 1 ); + R( b, c, d, e, a, F1, 9, 1 ); + R( a, b, c, d, e, F1, 10, 1 ); + R( e, a, b, c, d, F1, 11, 1 ); + R( d, e, a, b, c, F1, 12, 1 ); + R( c, d, e, a, b, F1, 13, 1 ); + R( b, c, d, e, a, F1, 14, 1 ); + R( a, b, c, d, e, F1, 15, 1 ); + R( e, a, b, c, d, F1, 16, 1 ); + R( d, e, a, b, c, F1, 17, 1 ); + R( c, d, e, a, b, F1, 18, 1 ); + R( b, c, d, e, a, F1, 19, 1 ); + R( a, b, c, d, e, F2, 20, 1 ); + R( e, a, b, c, d, F2, 21, 1 ); + R( d, e, a, b, c, F2, 22, 1 ); + R( c, d, e, a, b, F2, 23, 1 ); + R( b, c, d, e, a, F2, 24, 1 ); + R( a, b, c, d, e, F2, 25, 1 ); + R( e, a, b, c, d, F2, 26, 1 ); + R( d, e, a, b, c, F2, 27, 1 ); + R( c, d, e, a, b, F2, 28, 1 ); + R( b, c, d, e, a, F2, 29, 1 ); + R( a, b, c, d, e, F2, 30, 1 ); + R( e, a, b, c, d, F2, 31, 1 ); + R( d, e, a, b, c, F2, 32, 1 ); + R( c, d, e, a, b, F2, 33, 1 ); + R( b, c, d, e, a, F2, 34, 1 ); + R( a, b, c, d, e, F2, 35, 1 ); + R( e, a, b, c, d, F2, 36, 1 ); + R( d, e, a, b, c, F2, 37, 1 ); + R( c, d, e, a, b, F2, 38, 1 ); + R( b, c, d, e, a, F2, 39, 1 ); + R( a, b, c, d, e, F3, 40, 1 ); + R( e, a, b, c, d, F3, 41, 1 ); + R( d, e, a, b, c, F3, 42, 1 ); + R( c, d, e, a, b, F3, 43, 1 ); + R( b, c, d, e, a, F3, 44, 1 ); + R( a, b, c, d, e, F3, 45, 1 ); + R( e, a, b, c, d, F3, 46, 1 ); + R( d, e, a, b, c, F3, 47, 1 ); + + addq $-2, RNBLKS; + jz .Lend; + + /* Transform 48-79 for block 2 + Precalc 0-31 for next two blocks. */ + R( c, d, e, a, b, F3, 48, 1 ); W_PRECALC_00_15_0(0, W0, Wtmp0); + R( b, c, d, e, a, F3, 49, 1 ); W_PRECALC_00_15_1(1, W0, Wtmp0); + R( a, b, c, d, e, F3, 50, 1 ); W_PRECALC_00_15_2(2, W0, Wtmp0, K1); + R( e, a, b, c, d, F3, 51, 1 ); W_PRECALC_00_15_3(3, W0, Wtmp0); + R( d, e, a, b, c, F3, 52, 1 ); W_PRECALC_00_15_0(4, W7, Wtmp0); + R( c, d, e, a, b, F3, 53, 1 ); W_PRECALC_00_15_1(5, W7, Wtmp0); + R( b, c, d, e, a, F3, 54, 1 ); W_PRECALC_00_15_2(6, W7, Wtmp0, K1); + R( a, b, c, d, e, F3, 55, 1 ); W_PRECALC_00_15_3(7, W7, Wtmp0); + R( e, a, b, c, d, F3, 56, 1 ); W_PRECALC_00_15_0(8, W6, Wtmp0); + R( d, e, a, b, c, F3, 57, 1 ); W_PRECALC_00_15_1(9, W6, Wtmp0); + R( c, d, e, a, b, F3, 58, 1 ); W_PRECALC_00_15_2(10, W6, Wtmp0, K1); + R( b, c, d, e, a, F3, 59, 1 ); W_PRECALC_00_15_3(11, W6, Wtmp0); + R( a, b, c, d, e, F4, 60, 1 ); W_PRECALC_00_15_0(12, W5, Wtmp0); + R( e, a, b, c, d, F4, 61, 1 ); W_PRECALC_00_15_1(13, W5, Wtmp0); + R( d, e, a, b, c, F4, 62, 1 ); W_PRECALC_00_15_2(14, W5, Wtmp0, K1); + R( c, d, e, a, b, F4, 63, 1 ); W_PRECALC_00_15_3(15, W5, Wtmp0); + R( b, c, d, e, a, F4, 64, 1 ); W_PRECALC_16_31_0(16, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); + R( a, b, c, d, e, F4, 65, 1 ); W_PRECALC_16_31_1(17, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); + R( e, a, b, c, d, F4, 66, 1 ); W_PRECALC_16_31_2(18, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); + R( d, e, a, b, c, F4, 67, 1 ); W_PRECALC_16_31_3(19, W4, W5, W6, W7, W0, Wtmp0, Wtmp1, K1); + R( c, d, e, a, b, F4, 68, 1 ); W_PRECALC_16_31_0(20, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); + R( b, c, d, e, a, F4, 69, 1 ); W_PRECALC_16_31_1(21, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); + R( a, b, c, d, e, F4, 70, 1 ); W_PRECALC_16_31_2(22, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); + R( e, a, b, c, d, F4, 71, 1 ); W_PRECALC_16_31_3(23, W3, W4, W5, W6, W7, Wtmp0, Wtmp1, K2); + R( d, e, a, b, c, F4, 72, 1 ); W_PRECALC_16_31_0(24, W2, W3, 
W4, W5, W6, Wtmp0, Wtmp1); + R( c, d, e, a, b, F4, 73, 1 ); W_PRECALC_16_31_1(25, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); + R( b, c, d, e, a, F4, 74, 1 ); W_PRECALC_16_31_2(26, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); + R( a, b, c, d, e, F4, 75, 1 ); W_PRECALC_16_31_3(27, W2, W3, W4, W5, W6, Wtmp0, Wtmp1, K2); + R( e, a, b, c, d, F4, 76, 1 ); W_PRECALC_16_31_0(28, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); + R( d, e, a, b, c, F4, 77, 1 ); W_PRECALC_16_31_1(29, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); + R( c, d, e, a, b, F4, 78, 1 ); W_PRECALC_16_31_2(30, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); + addl state_h0(RSTATE), a; W_PRECALC_16_31_3(31, W1, W2, W3, W4, W5, Wtmp0, Wtmp1, K2); + R( b, c, d, e, a, F4, 79, 1 ); + addl ne, a; + xorl ne, ne; + + /* Update the chaining variables. */ + addl state_h3(RSTATE), d; + addl state_h2(RSTATE), c; + addl state_h1(RSTATE), b; + addl state_h4(RSTATE), e; + + movl d, state_h3(RSTATE); + movl c, state_h2(RSTATE); + movl b, state_h1(RSTATE); + movl a, state_h0(RSTATE); + movl e, state_h4(RSTATE); + + jmp .Loop; + +.align 16 +.Lend: + vzeroall; + + /* Transform 48-79 for block 2. */ + R( c, d, e, a, b, F3, 48, 1 ); + R( b, c, d, e, a, F3, 49, 1 ); + R( a, b, c, d, e, F3, 50, 1 ); + R( e, a, b, c, d, F3, 51, 1 ); + R( d, e, a, b, c, F3, 52, 1 ); + R( c, d, e, a, b, F3, 53, 1 ); + R( b, c, d, e, a, F3, 54, 1 ); + R( a, b, c, d, e, F3, 55, 1 ); + R( e, a, b, c, d, F3, 56, 1 ); + R( d, e, a, b, c, F3, 57, 1 ); + R( c, d, e, a, b, F3, 58, 1 ); + R( b, c, d, e, a, F3, 59, 1 ); + R( a, b, c, d, e, F4, 60, 1 ); + R( e, a, b, c, d, F4, 61, 1 ); + R( d, e, a, b, c, F4, 62, 1 ); + R( c, d, e, a, b, F4, 63, 1 ); + R( b, c, d, e, a, F4, 64, 1 ); + R( a, b, c, d, e, F4, 65, 1 ); + R( e, a, b, c, d, F4, 66, 1 ); + R( d, e, a, b, c, F4, 67, 1 ); + R( c, d, e, a, b, F4, 68, 1 ); + R( b, c, d, e, a, F4, 69, 1 ); + R( a, b, c, d, e, F4, 70, 1 ); + R( e, a, b, c, d, F4, 71, 1 ); + R( d, e, a, b, c, F4, 72, 1 ); + R( c, d, e, a, b, F4, 73, 1 ); + R( b, c, d, e, a, F4, 74, 1 ); + R( a, b, c, d, e, F4, 75, 1 ); + R( e, a, b, c, d, F4, 76, 1 ); + R( d, e, a, b, c, F4, 77, 1 ); + R( c, d, e, a, b, F4, 78, 1 ); + addl state_h0(RSTATE), a; + R( b, c, d, e, a, F4, 79, 1 ); + addl ne, a; + xorl ne, ne; + + /* Update the chaining variables. */ + addl state_h3(RSTATE), d; + addl state_h2(RSTATE), c; + addl state_h1(RSTATE), b; + addl state_h4(RSTATE), e; + + movl d, state_h3(RSTATE); + movl c, state_h2(RSTATE); + movl b, state_h1(RSTATE); + movl a, state_h0(RSTATE); + movl e, state_h4(RSTATE); + + movq ROLDSTACK, %rsp; + + popq %r12; + popq %rbp; + popq %rbx; + + /* burn_stack */ + movl $((WK_STACK_WORDS)*4 + 3*8 + 31), %eax; + + ret; +ELF(.size _gcry_sha1_transform_amd64_avx2_bmi2, + .-_gcry_sha1_transform_amd64_avx2_bmi2;) + +#endif +#endif diff --git a/cipher/sha1-ssse3-amd64.S b/cipher/sha1-ssse3-amd64.S index 2b4394765..421bebeca 100644 --- a/cipher/sha1-ssse3-amd64.S +++ b/cipher/sha1-ssse3-amd64.S @@ -429,6 +429,8 @@ _gcry_sha1_transform_amd64_ssse3: .Lret: ret; +ELF(.size _gcry_sha1_transform_amd64_ssse3, + .-_gcry_sha1_transform_amd64_ssse3;) #endif #endif diff --git a/cipher/sha1.c b/cipher/sha1.c index 76c486c7e..affabfb07 100644 --- a/cipher/sha1.c +++ b/cipher/sha1.c @@ -68,6 +68,12 @@ # define USE_BMI2 1 #endif +/* USE_AVX2 indicates whether to compile with Intel AVX2/BMI2 code. */ +#undef USE_AVX2 +#if defined(USE_BMI2) && defined(HAVE_GCC_INLINE_ASM_AVX2) +# define USE_AVX2 1 +#endif + /* USE_SHAEXT indicates whether to compile with Intel SHA Extension code. 
*/ #undef USE_SHAEXT #if defined(HAVE_GCC_INLINE_ASM_SHAEXT) && \ @@ -171,7 +177,37 @@ do_sha1_transform_amd64_avx_bmi2 (void *ctx, const unsigned char *data, return _gcry_sha1_transform_amd64_avx_bmi2 (&hd->h0, data, nblks) + ASM_EXTRA_STACK; } -#endif + +#ifdef USE_AVX2 +unsigned int +_gcry_sha1_transform_amd64_avx2_bmi2 (void *state, const unsigned char *data, + size_t nblks) ASM_FUNC_ABI; + +static unsigned int +do_sha1_transform_amd64_avx2_bmi2 (void *ctx, const unsigned char *data, + size_t nblks) +{ + SHA1_CONTEXT *hd = ctx; + + /* AVX2/BMI2 function only handles pair of blocks so nblks needs to be + * multiple of 2 and function does not handle zero nblks. Use AVX/BMI2 + * code to handle these cases. */ + + if (nblks <= 1) + return do_sha1_transform_amd64_avx_bmi2 (ctx, data, nblks); + + if (nblks & 1) + { + (void)_gcry_sha1_transform_amd64_avx_bmi2 (&hd->h0, data, 1); + nblks--; + data += 64; + } + + return _gcry_sha1_transform_amd64_avx2_bmi2 (&hd->h0, data, nblks) + + ASM_EXTRA_STACK; +} +#endif /* USE_AVX2 */ +#endif /* USE_BMI2 */ #ifdef USE_SHAEXT /* Does not need ASM_FUNC_ABI */ @@ -258,6 +294,11 @@ sha1_init (void *context, unsigned int flags) if ((features & HWF_INTEL_AVX) && (features & HWF_INTEL_BMI2)) hd->bctx.bwrite = do_sha1_transform_amd64_avx_bmi2; #endif +#ifdef USE_AVX2 + if ((features & HWF_INTEL_AVX2) && (features & HWF_INTEL_AVX) && + (features & HWF_INTEL_BMI2)) + hd->bctx.bwrite = do_sha1_transform_amd64_avx2_bmi2; +#endif #ifdef USE_SHAEXT if ((features & HWF_INTEL_SHAEXT) && (features & HWF_INTEL_SSE4_1)) hd->bctx.bwrite = do_sha1_transform_intel_shaext; @@ -494,22 +535,27 @@ sha1_final(void *context) if( hd->bctx.count < 56 ) /* enough room */ { hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad */ - while( hd->bctx.count < 56 ) - hd->bctx.buf[hd->bctx.count++] = 0; /* pad */ + if (hd->bctx.count < 56) + memset (&hd->bctx.buf[hd->bctx.count], 0, 56 - hd->bctx.count); + hd->bctx.count = 56; + + /* append the 64 bit count */ + buf_put_be32(hd->bctx.buf + 56, msb); + buf_put_be32(hd->bctx.buf + 60, lsb); + burn = (*hd->bctx.bwrite) ( hd, hd->bctx.buf, 1 ); } else /* need one extra block */ { hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad character */ - while( hd->bctx.count < 64 ) - hd->bctx.buf[hd->bctx.count++] = 0; - _gcry_md_block_write(hd, NULL, 0); /* flush */; - memset(hd->bctx.buf, 0, 56 ); /* fill next block with zeroes */ + /* fill pad and next block with zeroes */ + memset (&hd->bctx.buf[hd->bctx.count], 0, 64 - hd->bctx.count + 56); + hd->bctx.count = 64 + 56; + + /* append the 64 bit count */ + buf_put_be32(hd->bctx.buf + 64 + 56, msb); + buf_put_be32(hd->bctx.buf + 64 + 60, lsb); + burn = (*hd->bctx.bwrite) ( hd, hd->bctx.buf, 2 ); } - /* append the 64 bit count */ - buf_put_be32(hd->bctx.buf + 56, msb); - buf_put_be32(hd->bctx.buf + 60, lsb); - burn = (*hd->bctx.bwrite) ( hd, hd->bctx.buf, 1 ); - _gcry_burn_stack (burn); p = hd->bctx.buf; #define X(a) do { buf_put_be32(p, hd->h##a); p += 4; } while(0) @@ -520,6 +566,7 @@ sha1_final(void *context) X(4); #undef X + _gcry_burn_stack (burn); } static unsigned char * diff --git a/configure.ac b/configure.ac index bb3c666f4..0a931f952 100644 --- a/configure.ac +++ b/configure.ac @@ -2541,6 +2541,7 @@ case "${host}" in GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1-ssse3-amd64.lo" GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1-avx-amd64.lo" GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1-avx-bmi2-amd64.lo" + GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1-avx2-bmi2-amd64.lo" ;; arm*-*-*) # Build with the assembly implementation From jussi.kivilinna 
at iki.fi Sun Apr 7 22:07:28 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sun, 7 Apr 2019 23:07:28 +0300 Subject: [PATCH] Tune SHA-512/AVX2 and SHA-256/AVX2 implementations Message-ID: <155466764826.17126.15898822326526391088.stgit@localhost.localdomain> * cipher/sha256-avx2-bmi2-amd64.S (ONE_ROUND_PART1, ONE_ROUND_PART2) (ONE_ROUND): New round function. (FOUR_ROUNDS_AND_SCHED, FOUR_ROUNDS): Use new round function. (_gcry_sha256_transform_amd64_avx2): Exit early if number of blocks is zero; Writing XFER to stack earlier and handle XREF writing in FOUR_ROUNDS_AND_SCHED. * cipher/sha512-avx2-bmi2-amd64.S (MASK_YMM_LO, MASK_YMM_LOx): New. (ONE_ROUND_PART1, ONE_ROUND_PART2, ONE_ROUND): New round function. (FOUR_ROUNDS_AND_SCHED, FOUR_ROUNDS): Use new round function. (_gcry_sha512_transform_amd64_avx2): Writing XFER to stack earlier and handle XREF writing in FOUR_ROUNDS_AND_SCHED. -- Benchmark on Intel Haswell (4.0Ghz): Before: | nanosecs/byte mebibytes/sec cycles/byte SHA256 | 2.17 ns/B 439.0 MiB/s 8.68 c/B SHA512 | 1.56 ns/B 612.5 MiB/s 6.23 c/B After (~4-6% faster): | nanosecs/byte mebibytes/sec cycles/byte SHA256 | 2.05 ns/B 465.9 MiB/s 8.18 c/B SHA512 | 1.49 ns/B 640.3 MiB/s 5.95 c/B Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/cipher/sha256-avx2-bmi2-amd64.S b/cipher/sha256-avx2-bmi2-amd64.S index 598f93821..5fc402cd1 100644 --- a/cipher/sha256-avx2-bmi2-amd64.S +++ b/cipher/sha256-avx2-bmi2-amd64.S @@ -176,379 +176,128 @@ b = a a = TMP_ .endm -.macro FOUR_ROUNDS_AND_SCHED XFER -/* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ - - mov y3, a /* y3 = a ; MAJA */ - rorx y0, e, 25 /* y0 = e >> 25 ; S1A */ - rorx y1, e, 11 /* y1 = e >> 11 ; S1B */ - - add h, [\XFER+0*4] /* h = k + w + h ; -- */ - or y3, c /* y3 = a|c ; MAJA */ - vpalignr XTMP0, X3, X2, 4 /* XTMP0 = W[-7] */ - mov y2, f /* y2 = f ; CH */ - rorx T1, a, 13 /* T1 = a >> 13 ; S0B */ +.macro ONE_ROUND_PART1 XFER + /* h += Sum1 (e) + Ch (e, f, g) + (k[t] + w[0]); + * d += h; + * h += Sum0 (a) + Maj (a, b, c); + * + * Ch(x, y, z) => ((x & y) + (~x & z)) + * Maj(x, y, z) => ((x & y) + (z & (x ^ y))) + */ + + mov y3, e + add h, [\XFER] + and y3, f + rorx y0, e, 25 + rorx y1, e, 11 + lea h, [h + y3] + andn y3, e, g + rorx T1, a, 13 + xor y0, y1 + lea h, [h + y3] +.endm +.macro ONE_ROUND_PART2 + rorx y2, a, 22 + rorx y1, e, 6 + mov y3, a + xor T1, y2 + xor y0, y1 + xor y3, b + lea h, [h + y0] + mov y0, a + rorx y2, a, 2 + add d, h + and y3, c + xor T1, y2 + lea h, [h + y3] + lea h, [h + T1] + and y0, b + lea h, [h + y0] +.endm - xor y0, y1 /* y0 = (e>>25) ^ (e>>11) ; S1 */ - xor y2, g /* y2 = f^g ; CH */ - vpaddd XTMP0, XTMP0, X0 /* XTMP0 = W[-7] + W[-16]; y1 = (e >> 6) ; S1 */ - rorx y1, e, 6 /* y1 = (e >> 6) ; S1 */ +.macro ONE_ROUND XFER + ONE_ROUND_PART1 \XFER + ONE_ROUND_PART2 +.endm - and y2, e /* y2 = (f^g)&e ; CH */ - xor y0, y1 /* y0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1 */ - rorx y1, a, 22 /* y1 = a >> 22 ; S0A */ - add d, h /* d = k + w + h + d ; -- */ +.macro FOUR_ROUNDS_AND_SCHED XFER, XFEROUT +/* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ - and y3, b /* y3 = (a|c)&b ; MAJA */ + vpalignr XTMP0, X3, X2, 4 /* XTMP0 = W[-7] */ + vpaddd XTMP0, XTMP0, X0 /* XTMP0 = W[-7] + W[-16]; y1 = (e >> 6); S1 */ vpalignr XTMP1, X1, X0, 4 /* XTMP1 = W[-15] */ - xor y1, T1 /* y1 = (a>>22) ^ (a>>13) ; S0 */ - rorx T1, a, 2 /* T1 = (a >> 2) ; S0 */ - - xor y2, g /* y2 = CH = ((f^g)&e)^g ; CH */ vpsrld XTMP2, XTMP1, 7 - xor y1, T1 /* y1 = (a>>22) ^ 
(a>>13) ^ (a>>2) ; S0 */ - mov T1, a /* T1 = a ; MAJB */ - and T1, c /* T1 = a&c ; MAJB */ - - add y2, y0 /* y2 = S1 + CH ; -- */ vpslld XTMP3, XTMP1, (32-7) - or y3, T1 /* y3 = MAJ = (a|c)&b)|(a&c) ; MAJ */ - add h, y1 /* h = k + w + h + S0 ; -- */ - - add d, y2 /* d = k + w + h + d + S1 + CH = d + t1 ; -- */ vpor XTMP3, XTMP3, XTMP2 /* XTMP3 = W[-15] ror 7 */ - vpsrld XTMP2, XTMP1,18 - add h, y2 /* h = k + w + h + S0 + S1 + CH = t1 + S0; -- */ - lea h, [h + y3] /* h = t1 + S0 + MAJ ; -- */ - -ROTATE_ARGS + ONE_ROUND 0*4+\XFER + ROTATE_ARGS /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ - - mov y3, a /* y3 = a ; MAJA */ - rorx y0, e, 25 /* y0 = e >> 25 ; S1A */ - rorx y1, e, 11 /* y1 = e >> 11 ; S1B */ - add h, [\XFER+1*4] /* h = k + w + h ; -- */ - or y3, c /* y3 = a|c ; MAJA */ - - vpsrld XTMP4, XTMP1, 3 /* XTMP4 = W[-15] >> 3 */ - mov y2, f /* y2 = f ; CH */ - rorx T1, a, 13 /* T1 = a >> 13 ; S0B */ - xor y0, y1 /* y0 = (e>>25) ^ (e>>11) ; S1 */ - xor y2, g /* y2 = f^g ; CH */ - - - rorx y1, e, 6 /* y1 = (e >> 6) ; S1 */ - xor y0, y1 /* y0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1 */ - rorx y1, a, 22 /* y1 = a >> 22 ; S0A */ - and y2, e /* y2 = (f^g)&e ; CH */ - add d, h /* d = k + w + h + d ; -- */ - vpslld XTMP1, XTMP1, (32-18) - and y3, b /* y3 = (a|c)&b ; MAJA */ - xor y1, T1 /* y1 = (a>>22) ^ (a>>13) ; S0 */ - vpxor XTMP3, XTMP3, XTMP1 - rorx T1, a, 2 /* T1 = (a >> 2) ; S0 */ - xor y2, g /* y2 = CH = ((f^g)&e)^g ; CH */ - vpxor XTMP3, XTMP3, XTMP2 /* XTMP3 = W[-15] ror 7 ^ W[-15] ror 18 */ - xor y1, T1 /* y1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0 */ - mov T1, a /* T1 = a ; MAJB */ - and T1, c /* T1 = a&c ; MAJB */ - add y2, y0 /* y2 = S1 + CH ; -- */ - vpxor XTMP1, XTMP3, XTMP4 /* XTMP1 = s0 */ vpshufd XTMP2, X3, 0b11111010 /* XTMP2 = W[-2] {BBAA} */ - or y3, T1 /* y3 = MAJ = (a|c)&b)|(a&c) ; MAJ */ - add h, y1 /* h = k + w + h + S0 ; -- */ - vpaddd XTMP0, XTMP0, XTMP1 /* XTMP0 = W[-16] + W[-7] + s0 */ - add d, y2 /* d = k + w + h + d + S1 + CH = d + t1 ; -- */ - add h, y2 /* h = k + w + h + S0 + S1 + CH = t1 + S0; -- */ - lea h, [h + y3] /* h = t1 + S0 + MAJ ; -- */ - vpsrld XTMP4, XTMP2, 10 /* XTMP4 = W[-2] >> 10 {BBAA} */ - -ROTATE_ARGS + ONE_ROUND 1*4+\XFER + ROTATE_ARGS /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ - mov y3, a /* y3 = a ; MAJA */ - rorx y0, e, 25 /* y0 = e >> 25 ; S1A */ - add h, [\XFER+2*4] /* h = k + w + h ; -- */ - vpsrlq XTMP3, XTMP2, 19 /* XTMP3 = W[-2] ror 19 {xBxA} */ - rorx y1, e, 11 /* y1 = e >> 11 ; S1B */ - or y3, c /* y3 = a|c ; MAJA */ - mov y2, f /* y2 = f ; CH */ - xor y2, g /* y2 = f^g ; CH */ - - rorx T1, a, 13 /* T1 = a >> 13 ; S0B */ - xor y0, y1 /* y0 = (e>>25) ^ (e>>11) ; S1 */ vpsrlq XTMP2, XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xBxA} */ - and y2, e /* y2 = (f^g)&e ; CH */ - - rorx y1, e, 6 /* y1 = (e >> 6) ; S1 */ vpxor XTMP2, XTMP2, XTMP3 - add d, h /* d = k + w + h + d ; -- */ - and y3, b /* y3 = (a|c)&b ; MAJA */ - - xor y0, y1 /* y0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1 */ - rorx y1, a, 22 /* y1 = a >> 22 ; S0A */ vpxor XTMP4, XTMP4, XTMP2 /* XTMP4 = s1 {xBxA} */ - xor y2, g /* y2 = CH = ((f^g)&e)^g ; CH */ - vpshufb XTMP4, XTMP4, SHUF_00BA /* XTMP4 = s1 {00BA} */ - xor y1, T1 /* y1 = (a>>22) ^ (a>>13) ; S0 */ - rorx T1, a, 2 /* T1 = (a >> 2) ; S0 */ vpaddd XTMP0, XTMP0, XTMP4 /* XTMP0 = {..., ..., W[1], W[0]} */ - - xor y1, T1 /* y1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0 */ - mov T1, a /* T1 = a ; MAJB */ - and T1, c /* T1 = a&c ; MAJB */ - add y2, y0 /* y2 = S1 + CH ; -- */ vpshufd XTMP2, 
XTMP0, 0b1010000 /* XTMP2 = W[-2] {DDCC} */ - or y3, T1 /* y3 = MAJ = (a|c)&b)|(a&c) ; MAJ */ - add h, y1 /* h = k + w + h + S0 ; -- */ - add d, y2 /* d = k + w + h + d + S1 + CH = d + t1 ; -- */ - add h, y2 /* h = k + w + h + S0 + S1 + CH = t1 + S0; -- */ - - lea h, [h + y3] /* h = t1 + S0 + MAJ ; -- */ - - -ROTATE_ARGS + ONE_ROUND 2*4+\XFER + ROTATE_ARGS /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ - mov y3, a /* y3 = a ; MAJA */ - rorx y0, e, 25 /* y0 = e >> 25 ; S1A */ - rorx y1, e, 11 /* y1 = e >> 11 ; S1B */ - add h, [\XFER+3*4] /* h = k + w + h ; -- */ - or y3, c /* y3 = a|c ; MAJA */ - - vpsrld XTMP5, XTMP2, 10 /* XTMP5 = W[-2] >> 10 {DDCC} */ - mov y2, f /* y2 = f ; CH */ - rorx T1, a, 13 /* T1 = a >> 13 ; S0B */ - xor y0, y1 /* y0 = (e>>25) ^ (e>>11) ; S1 */ - xor y2, g /* y2 = f^g ; CH */ - - vpsrlq XTMP3, XTMP2, 19 /* XTMP3 = W[-2] ror 19 {xDxC} */ - rorx y1, e, 6 /* y1 = (e >> 6) ; S1 */ - and y2, e /* y2 = (f^g)&e ; CH */ - add d, h /* d = k + w + h + d ; -- */ - and y3, b /* y3 = (a|c)&b ; MAJA */ - vpsrlq XTMP2, XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xDxC} */ - xor y0, y1 /* y0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1 */ - xor y2, g /* y2 = CH = ((f^g)&e)^g ; CH */ - vpxor XTMP2, XTMP2, XTMP3 - rorx y1, a, 22 /* y1 = a >> 22 ; S0A */ - add y2, y0 /* y2 = S1 + CH ; -- */ - vpxor XTMP5, XTMP5, XTMP2 /* XTMP5 = s1 {xDxC} */ - xor y1, T1 /* y1 = (a>>22) ^ (a>>13) ; S0 */ - add d, y2 /* d = k + w + h + d + S1 + CH = d + t1 ; -- */ - - rorx T1, a, 2 /* T1 = (a >> 2) ; S0 */ vpshufb XTMP5, XTMP5, SHUF_DC00 /* XTMP5 = s1 {DC00} */ - vpaddd X0, XTMP5, XTMP0 /* X0 = {W[3], W[2], W[1], W[0]} */ - xor y1, T1 /* y1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0 */ - mov T1, a /* T1 = a ; MAJB */ - and T1, c /* T1 = a&c ; MAJB */ - or y3, T1 /* y3 = MAJ = (a|c)&b)|(a&c) ; MAJ */ - - add h, y1 /* h = k + w + h + S0 ; -- */ - add h, y2 /* h = k + w + h + S0 + S1 + CH = t1 + S0; -- */ - lea h, [h + y3] /* h = t1 + S0 + MAJ ; -- */ + vpaddd XFER, X0, [TBL + \XFEROUT] -ROTATE_ARGS -rotate_Xs + ONE_ROUND_PART1 3*4+\XFER + vmovdqa [rsp + _XFER + \XFEROUT], XFER + ONE_ROUND_PART2 + ROTATE_ARGS + rotate_Xs .endm .macro DO_4ROUNDS XFER /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;; */ - mov y2, f /* y2 = f ; CH */ - rorx y0, e, 25 /* y0 = e >> 25 ; S1A */ - rorx y1, e, 11 /* y1 = e >> 11 ; S1B */ - xor y2, g /* y2 = f^g ; CH */ - - xor y0, y1 /* y0 = (e>>25) ^ (e>>11) ; S1 */ - rorx y1, e, 6 /* y1 = (e >> 6) ; S1 */ - and y2, e /* y2 = (f^g)&e ; CH */ - - xor y0, y1 /* y0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1 */ - rorx T1, a, 13 /* T1 = a >> 13 ; S0B */ - xor y2, g /* y2 = CH = ((f^g)&e)^g ; CH */ - rorx y1, a, 22 /* y1 = a >> 22 ; S0A */ - mov y3, a /* y3 = a ; MAJA */ - - xor y1, T1 /* y1 = (a>>22) ^ (a>>13) ; S0 */ - rorx T1, a, 2 /* T1 = (a >> 2) ; S0 */ - add h, [\XFER + 4*0] /* h = k + w + h ; -- */ - or y3, c /* y3 = a|c ; MAJA */ - - xor y1, T1 /* y1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0 */ - mov T1, a /* T1 = a ; MAJB */ - and y3, b /* y3 = (a|c)&b ; MAJA */ - and T1, c /* T1 = a&c ; MAJB */ - add y2, y0 /* y2 = S1 + CH ; -- */ - - - add d, h /* d = k + w + h + d ; -- */ - or y3, T1 /* y3 = MAJ = (a|c)&b)|(a&c) ; MAJ */ - add h, y1 /* h = k + w + h + S0 ; -- */ - - add d, y2 /* d = k + w + h + d + S1 + CH = d + t1 ; -- */ - - - /* add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0; -- */ - - /* lea h, [h + y3] ; h = t1 + S0 + MAJ ; -- */ - + ONE_ROUND 0*4+\XFER ROTATE_ARGS /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;; */ - add 
old_h, y2 /* h = k + w + h + S0 + S1 + CH = t1 + S0; -- */ - mov y2, f /* y2 = f ; CH */ - rorx y0, e, 25 /* y0 = e >> 25 ; S1A */ - rorx y1, e, 11 /* y1 = e >> 11 ; S1B */ - xor y2, g /* y2 = f^g ; CH */ - - xor y0, y1 /* y0 = (e>>25) ^ (e>>11) ; S1 */ - rorx y1, e, 6 /* y1 = (e >> 6) ; S1 */ - and y2, e /* y2 = (f^g)&e ; CH */ - add old_h, y3 /* h = t1 + S0 + MAJ ; -- */ - - xor y0, y1 /* y0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1 */ - rorx T1, a, 13 /* T1 = a >> 13 ; S0B */ - xor y2, g /* y2 = CH = ((f^g)&e)^g ; CH */ - rorx y1, a, 22 /* y1 = a >> 22 ; S0A */ - mov y3, a /* y3 = a ; MAJA */ - - xor y1, T1 /* y1 = (a>>22) ^ (a>>13) ; S0 */ - rorx T1, a, 2 /* T1 = (a >> 2) ; S0 */ - add h, [\XFER + 4*1] /* h = k + w + h ; -- */ - or y3, c /* y3 = a|c ; MAJA */ - - xor y1, T1 /* y1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0 */ - mov T1, a /* T1 = a ; MAJB */ - and y3, b /* y3 = (a|c)&b ; MAJA */ - and T1, c /* T1 = a&c ; MAJB */ - add y2, y0 /* y2 = S1 + CH ; -- */ - - - add d, h /* d = k + w + h + d ; -- */ - or y3, T1 /* y3 = MAJ = (a|c)&b)|(a&c) ; MAJ */ - add h, y1 /* h = k + w + h + S0 ; -- */ - - add d, y2 /* d = k + w + h + d + S1 + CH = d + t1 ; -- */ - - - /* add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0; -- */ - - /* lea h, [h + y3] ; h = t1 + S0 + MAJ ; -- */ - + ONE_ROUND 1*4+\XFER ROTATE_ARGS /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ - add old_h, y2 /* h = k + w + h + S0 + S1 + CH = t1 + S0; -- */ - mov y2, f /* y2 = f ; CH */ - rorx y0, e, 25 /* y0 = e >> 25 ; S1A */ - rorx y1, e, 11 /* y1 = e >> 11 ; S1B */ - xor y2, g /* y2 = f^g ; CH */ - - xor y0, y1 /* y0 = (e>>25) ^ (e>>11) ; S1 */ - rorx y1, e, 6 /* y1 = (e >> 6) ; S1 */ - and y2, e /* y2 = (f^g)&e ; CH */ - add old_h, y3 /* h = t1 + S0 + MAJ ; -- */ - - xor y0, y1 /* y0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1 */ - rorx T1, a, 13 /* T1 = a >> 13 ; S0B */ - xor y2, g /* y2 = CH = ((f^g)&e)^g ; CH */ - rorx y1, a, 22 /* y1 = a >> 22 ; S0A */ - mov y3, a /* y3 = a ; MAJA */ - - xor y1, T1 /* y1 = (a>>22) ^ (a>>13) ; S0 */ - rorx T1, a, 2 /* T1 = (a >> 2) ; S0 */ - add h, [\XFER + 4*2] /* h = k + w + h ; -- */ - or y3, c /* y3 = a|c ; MAJA */ - - xor y1, T1 /* y1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0 */ - mov T1, a /* T1 = a ; MAJB */ - and y3, b /* y3 = (a|c)&b ; MAJA */ - and T1, c /* T1 = a&c ; MAJB */ - add y2, y0 /* y2 = S1 + CH ; -- */ - - - add d, h /* d = k + w + h + d ; -- */ - or y3, T1 /* y3 = MAJ = (a|c)&b)|(a&c) ; MAJ */ - add h, y1 /* h = k + w + h + S0 ; -- */ - - add d, y2 /* d = k + w + h + d + S1 + CH = d + t1 ; -- */ - - - /* add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0; -- */ - - /* lea h, [h + y3] ; h = t1 + S0 + MAJ ; -- */ - + ONE_ROUND 2*4+\XFER ROTATE_ARGS /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;; */ - add old_h, y2 /* h = k + w + h + S0 + S1 + CH = t1 + S0; -- */ - mov y2, f /* y2 = f ; CH */ - rorx y0, e, 25 /* y0 = e >> 25 ; S1A */ - rorx y1, e, 11 /* y1 = e >> 11 ; S1B */ - xor y2, g /* y2 = f^g ; CH */ - - xor y0, y1 /* y0 = (e>>25) ^ (e>>11) ; S1 */ - rorx y1, e, 6 /* y1 = (e >> 6) ; S1 */ - and y2, e /* y2 = (f^g)&e ; CH */ - add old_h, y3 /* h = t1 + S0 + MAJ ; -- */ - - xor y0, y1 /* y0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1 */ - rorx T1, a, 13 /* T1 = a >> 13 ; S0B */ - xor y2, g /* y2 = CH = ((f^g)&e)^g ; CH */ - rorx y1, a, 22 /* y1 = a >> 22 ; S0A */ - mov y3, a /* y3 = a ; MAJA */ - - xor y1, T1 /* y1 = (a>>22) ^ (a>>13) ; S0 */ - rorx T1, a, 2 /* T1 = (a >> 2) ; S0 */ - add h, [\XFER + 4*3] /* h = k + w + h ; -- */ - or y3, c /* 
y3 = a|c ; MAJA */ - - xor y1, T1 /* y1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0 */ - mov T1, a /* T1 = a ; MAJB */ - and y3, b /* y3 = (a|c)&b ; MAJA */ - and T1, c /* T1 = a&c ; MAJB */ - add y2, y0 /* y2 = S1 + CH ; -- */ - - - add d, h /* d = k + w + h + d ; -- */ - or y3, T1 /* y3 = MAJ = (a|c)&b)|(a&c) ; MAJ */ - add h, y1 /* h = k + w + h + S0 ; -- */ - - add d, y2 /* d = k + w + h + d + S1 + CH = d + t1 ; -- */ - - - add h, y2 /* h = k + w + h + S0 + S1 + CH = t1 + S0; -- */ - - lea h, [h + y3] /* h = t1 + S0 + MAJ ; -- */ - + ONE_ROUND 3*4+\XFER ROTATE_ARGS .endm @@ -565,6 +314,11 @@ rotate_Xs ELF(.type _gcry_sha256_transform_amd64_avx2, at function) .align 32 _gcry_sha256_transform_amd64_avx2: + xor eax, eax + + cmp rdx, 0 + je .Lnowork + push rbx push rbp push r12 @@ -574,19 +328,19 @@ _gcry_sha256_transform_amd64_avx2: vzeroupper + vmovdqa BYTE_FLIP_MASK, [.LPSHUFFLE_BYTE_FLIP_MASK ADD_RIP] + vmovdqa SHUF_00BA, [.L_SHUF_00BA ADD_RIP] + vmovdqa SHUF_DC00, [.L_SHUF_DC00 ADD_RIP] + mov rax, rsp sub rsp, STACK_SIZE - and rsp, -32 + and rsp, ~63 mov [rsp + _RSP], rax shl NUM_BLKS, 6 /* convert to bytes */ - jz .Ldone_hash lea NUM_BLKS, [NUM_BLKS + INP - 64] /* pointer to last block */ mov [rsp + _INP_END], NUM_BLKS - cmp INP, NUM_BLKS - je .Lonly_one_block - /* ; load initial digest */ mov a,[4*0 + CTX] mov b,[4*1 + CTX] @@ -597,10 +351,6 @@ _gcry_sha256_transform_amd64_avx2: mov g,[4*6 + CTX] mov h,[4*7 + CTX] - vmovdqa BYTE_FLIP_MASK, [.LPSHUFFLE_BYTE_FLIP_MASK ADD_RIP] - vmovdqa SHUF_00BA, [.L_SHUF_00BA ADD_RIP] - vmovdqa SHUF_DC00, [.L_SHUF_DC00 ADD_RIP] - mov [rsp + _CTX], CTX .Loop0: @@ -631,43 +381,31 @@ _gcry_sha256_transform_amd64_avx2: /* ; schedule 48 input dwords, by doing 3 rounds of 12 each */ xor SRND, SRND + vpaddd XFER, X0, [TBL + 0*32] + vmovdqa [rsp + _XFER + 0*32], XFER + vpaddd XFER, X1, [TBL + 1*32] + vmovdqa [rsp + _XFER + 1*32], XFER + vpaddd XFER, X2, [TBL + 2*32] + vmovdqa [rsp + _XFER + 2*32], XFER + vpaddd XFER, X3, [TBL + 3*32] + vmovdqa [rsp + _XFER + 3*32], XFER + .align 16 .Loop1: - vpaddd XFER, X0, [TBL + SRND + 0*32] - vmovdqa [rsp + _XFER + SRND + 0*32], XFER - FOUR_ROUNDS_AND_SCHED rsp + _XFER + SRND + 0*32 - - vpaddd XFER, X0, [TBL + SRND + 1*32] - vmovdqa [rsp + _XFER + SRND + 1*32], XFER - FOUR_ROUNDS_AND_SCHED rsp + _XFER + SRND + 1*32 - - vpaddd XFER, X0, [TBL + SRND + 2*32] - vmovdqa [rsp + _XFER + SRND + 2*32], XFER - FOUR_ROUNDS_AND_SCHED rsp + _XFER + SRND + 2*32 - - vpaddd XFER, X0, [TBL + SRND + 3*32] - vmovdqa [rsp + _XFER + SRND + 3*32], XFER - FOUR_ROUNDS_AND_SCHED rsp + _XFER + SRND + 3*32 + FOUR_ROUNDS_AND_SCHED rsp + _XFER + SRND + 0*32, SRND + 4*32 + FOUR_ROUNDS_AND_SCHED rsp + _XFER + SRND + 1*32, SRND + 5*32 + FOUR_ROUNDS_AND_SCHED rsp + _XFER + SRND + 2*32, SRND + 6*32 + FOUR_ROUNDS_AND_SCHED rsp + _XFER + SRND + 3*32, SRND + 7*32 add SRND, 4*32 cmp SRND, 3 * 4*32 jb .Loop1 -.Loop2: /* ; Do last 16 rounds with no scheduling */ - vpaddd XFER, X0, [TBL + SRND + 0*32] - vmovdqa [rsp + _XFER + SRND + 0*32], XFER - DO_4ROUNDS rsp + _XFER + SRND + 0*32 - vpaddd XFER, X1, [TBL + SRND + 1*32] - vmovdqa [rsp + _XFER + SRND + 1*32], XFER - DO_4ROUNDS rsp + _XFER + SRND + 1*32 - add SRND, 2*32 - - vmovdqa X0, X2 - vmovdqa X1, X3 - - cmp SRND, 4 * 4*32 - jb .Loop2 + DO_4ROUNDS rsp + _XFER + (3*4*32 + 0*32) + DO_4ROUNDS rsp + _XFER + (3*4*32 + 1*32) + DO_4ROUNDS rsp + _XFER + (3*4*32 + 2*32) + DO_4ROUNDS rsp + _XFER + (3*4*32 + 3*32) mov CTX, [rsp + _CTX] mov INP, [rsp + _INP] @@ -777,6 +515,7 @@ _gcry_sha256_transform_amd64_avx2: pop rbp pop rbx 
+.Lnowork: ret .align 64 diff --git a/cipher/sha512-avx2-bmi2-amd64.S b/cipher/sha512-avx2-bmi2-amd64.S index 914f920af..32cfceb0b 100644 --- a/cipher/sha512-avx2-bmi2-amd64.S +++ b/cipher/sha512-avx2-bmi2-amd64.S @@ -79,6 +79,8 @@ YTMP4 = ymm8 XFER = YTMP0 BYTE_FLIP_MASK = ymm9 +MASK_YMM_LO = ymm10 +MASK_YMM_LOx = xmm10 INP = rdi /* 1st arg */ CTX = rsi /* 2nd arg */ @@ -96,7 +98,7 @@ b = rbx f = r9 g = r10 h = r11 -old_h = r11 +old_h = rax T1 = r12 y0 = r13 @@ -107,14 +109,14 @@ y4 = r12 /* Local variables (stack frame) */ #define frame_XFER 0 -#define frame_XFER_size (4*8) +#define frame_XFER_size (4*4*8) #define frame_SRND (frame_XFER + frame_XFER_size) #define frame_SRND_size (1*8) #define frame_INP (frame_SRND + frame_SRND_size) #define frame_INP_size (1*8) -#define frame_INPEND (frame_INP + frame_INP_size) -#define frame_INPEND_size (1*8) -#define frame_RSPSAVE (frame_INPEND + frame_INPEND_size) +#define frame_NBLKS (frame_INP + frame_INP_size) +#define frame_NBLKS_size (1*8) +#define frame_RSPSAVE (frame_NBLKS + frame_NBLKS_size) #define frame_RSPSAVE_size (1*8) #define frame_GPRSAVE (frame_RSPSAVE + frame_RSPSAVE_size) #define frame_GPRSAVE_size (6*8) @@ -168,7 +170,51 @@ y4 = r12 vpalignr \YDST, \YDST, \YSRC2, \RVAL /* YDST = {YDS1, YS2} >> RVAL*8 */ .endm -.macro FOUR_ROUNDS_AND_SCHED +.macro ONE_ROUND_PART1 XFER + /* h += Sum1 (e) + Ch (e, f, g) + (k[t] + w[0]); + * d += h; + * h += Sum0 (a) + Maj (a, b, c); + * + * Ch(x, y, z) => ((x & y) + (~x & z)) + * Maj(x, y, z) => ((x & y) + (z & (x ^ y))) + */ + + mov y3, e + add h, [\XFER] + and y3, f + rorx y0, e, 41 + rorx y1, e, 18 + lea h, [h + y3] + andn y3, e, g + rorx T1, a, 34 + xor y0, y1 + lea h, [h + y3] +.endm +.macro ONE_ROUND_PART2 + rorx y2, a, 39 + rorx y1, e, 14 + mov y3, a + xor T1, y2 + xor y0, y1 + xor y3, b + lea h, [h + y0] + mov y0, a + rorx y2, a, 28 + add d, h + and y3, c + xor T1, y2 + lea h, [h + y3] + lea h, [h + T1] + and y0, b + lea h, [h + y0] +.endm + +.macro ONE_ROUND XFER + ONE_ROUND_PART1 \XFER + ONE_ROUND_PART2 +.endm + +.macro FOUR_ROUNDS_AND_SCHED X /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ /* Extract w[t-7] */ @@ -187,43 +233,8 @@ y4 = r12 /* Calculate w[t-15] shr 7 */ vpsrlq YTMP4, YTMP1, 7 /* YTMP4 = W[-15] >> 7 */ - mov y3, a /* y3 = a ; MAJA */ - rorx y0, e, 41 /* y0 = e >> 41 ; S1A */ - rorx y1, e, 18 /* y1 = e >> 18 ; S1B */ - - add h, [rsp+frame_XFER+0*8] /* h = k + w + h ; -- */ - or y3, c /* y3 = a|c ; MAJA */ - mov y2, f /* y2 = f ; CH */ - rorx T1, a, 34 /* T1 = a >> 34 ; S0B */ - - xor y0, y1 /* y0 = (e>>41) ^ (e>>18) ; S1 */ - xor y2, g /* y2 = f^g ; CH */ - rorx y1, e, 14 /* y1 = (e >> 14) ; S1 */ - - and y2, e /* y2 = (f^g)&e ; CH */ - xor y0, y1 /* y0 = (e>>41) ^ (e>>18) ^ (e>>14) ; S1 */ - rorx y1, a, 39 /* y1 = a >> 39 ; S0A */ - add d, h /* d = k + w + h + d ; -- */ - - and y3, b /* y3 = (a|c)&b ; MAJA */ - xor y1, T1 /* y1 = (a>>39) ^ (a>>34) ; S0 */ - rorx T1, a, 28 /* T1 = (a >> 28) ; S0 */ - - xor y2, g /* y2 = CH = ((f^g)&e)^g ; CH */ - xor y1, T1 /* y1 = (a>>39) ^ (a>>34) ^ (a>>28) ; S0 */ - mov T1, a /* T1 = a ; MAJB */ - and T1, c /* T1 = a&c ; MAJB */ - - add y2, y0 /* y2 = S1 + CH ; -- */ - or y3, T1 /* y3 = MAJ = (a|c)&b)|(a&c) ; MAJ */ - add h, y1 /* h = k + w + h + S0 ; -- */ - - add d, y2 /* d = k + w + h + d + S1 + CH = d + t1 ; -- */ - - add h, y2 /* h = k + w + h + S0 + S1 + CH = t1 + S0 ; -- */ - add h, y3 /* h = t1 + S0 + MAJ ; -- */ - -RotateState + ONE_ROUND rsp+frame_XFER+0*8+\X*32 + RotateState 
/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ @@ -243,7 +254,7 @@ RotateState /* Move to appropriate lanes for calculating w[16] and w[17] */ vperm2f128 Y_0, YTMP0, YTMP0, 0x0 /* Y_0 = W[-16] + W[-7] + s0 {BABA} */ /* Move to appropriate lanes for calculating w[18] and w[19] */ - vpand YTMP0, YTMP0, [.LMASK_YMM_LO ADD_RIP] /* YTMP0 = W[-16] + W[-7] + s0 {DC00} */ + vpand YTMP0, YTMP0, MASK_YMM_LO /* YTMP0 = W[-16] + W[-7] + s0 {DC00} */ /* Calculate w[16] and w[17] in both 128 bit lanes */ @@ -251,48 +262,8 @@ RotateState vperm2f128 YTMP2, Y_3, Y_3, 0x11 /* YTMP2 = W[-2] {BABA} */ vpsrlq YTMP4, YTMP2, 6 /* YTMP4 = W[-2] >> 6 {BABA} */ - - mov y3, a /* y3 = a ; MAJA */ - rorx y0, e, 41 /* y0 = e >> 41 ; S1A */ - rorx y1, e, 18 /* y1 = e >> 18 ; S1B */ - add h, [rsp+frame_XFER+1*8] /* h = k + w + h ; -- */ - or y3, c /* y3 = a|c ; MAJA */ - - - mov y2, f /* y2 = f ; CH */ - rorx T1, a, 34 /* T1 = a >> 34 ; S0B */ - xor y0, y1 /* y0 = (e>>41) ^ (e>>18) ; S1 */ - xor y2, g /* y2 = f^g ; CH */ - - - rorx y1, e, 14 /* y1 = (e >> 14) ; S1 */ - xor y0, y1 /* y0 = (e>>41) ^ (e>>18) ^ (e>>14) ; S1 */ - rorx y1, a, 39 /* y1 = a >> 39 ; S0A */ - and y2, e /* y2 = (f^g)&e ; CH */ - add d, h /* d = k + w + h + d ; -- */ - - and y3, b /* y3 = (a|c)&b ; MAJA */ - xor y1, T1 /* y1 = (a>>39) ^ (a>>34) ; S0 */ - - rorx T1, a, 28 /* T1 = (a >> 28) ; S0 */ - xor y2, g /* y2 = CH = ((f^g)&e)^g ; CH */ - - xor y1, T1 /* y1 = (a>>39) ^ (a>>34) ^ (a>>28) ; S0 */ - mov T1, a /* T1 = a ; MAJB */ - and T1, c /* T1 = a&c ; MAJB */ - add y2, y0 /* y2 = S1 + CH ; -- */ - - or y3, T1 /* y3 = MAJ = (a|c)&b)|(a&c) ; MAJ */ - add h, y1 /* h = k + w + h + S0 ; -- */ - - add d, y2 /* d = k + w + h + d + S1 + CH = d + t1 ; -- */ - add h, y2 /* h = k + w + h + S0 + S1 + CH = t1 + S0 ; -- */ - add h, y3 /* h = t1 + S0 + MAJ ; -- */ - -RotateState - - - + ONE_ROUND rsp+frame_XFER+1*8+\X*32 + RotateState /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ @@ -314,43 +285,8 @@ RotateState /* Calculate sigma1 for w[18] and w[19] for upper 128 bit lane */ vpsrlq YTMP4, Y_0, 6 /* YTMP4 = W[-2] >> 6 {DC--} */ - mov y3, a /* y3 = a ; MAJA */ - rorx y0, e, 41 /* y0 = e >> 41 ; S1A */ - add h, [rsp+frame_XFER+2*8] /* h = k + w + h ; -- */ - - rorx y1, e, 18 /* y1 = e >> 18 ; S1B */ - or y3, c /* y3 = a|c ; MAJA */ - mov y2, f /* y2 = f ; CH */ - xor y2, g /* y2 = f^g ; CH */ - - rorx T1, a, 34 /* T1 = a >> 34 ; S0B */ - xor y0, y1 /* y0 = (e>>41) ^ (e>>18) ; S1 */ - and y2, e /* y2 = (f^g)&e ; CH */ - - rorx y1, e, 14 /* y1 = (e >> 14) ; S1 */ - add d, h /* d = k + w + h + d ; -- */ - and y3, b /* y3 = (a|c)&b ; MAJA */ - - xor y0, y1 /* y0 = (e>>41) ^ (e>>18) ^ (e>>14) ; S1 */ - rorx y1, a, 39 /* y1 = a >> 39 ; S0A */ - xor y2, g /* y2 = CH = ((f^g)&e)^g ; CH */ - - xor y1, T1 /* y1 = (a>>39) ^ (a>>34) ; S0 */ - rorx T1, a, 28 /* T1 = (a >> 28) ; S0 */ - - xor y1, T1 /* y1 = (a>>39) ^ (a>>34) ^ (a>>28) ; S0 */ - mov T1, a /* T1 = a ; MAJB */ - and T1, c /* T1 = a&c ; MAJB */ - add y2, y0 /* y2 = S1 + CH ; -- */ - - or y3, T1 /* y3 = MAJ = (a|c)&b)|(a&c) ; MAJ */ - add h, y1 /* h = k + w + h + S0 ; -- */ - add d, y2 /* d = k + w + h + d + S1 + CH = d + t1 ; -- */ - add h, y2 /* h = k + w + h + S0 + S1 + CH = t1 + S0 ; -- */ - - add h, y3 /* h = t1 + S0 + MAJ ; -- */ - -RotateState + ONE_ROUND rsp+frame_XFER+2*8+\X*32 + RotateState /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ @@ -370,225 +306,35 @@ 
RotateState /* Form w[19, w[18], w17], w[16] */ vpblendd Y_0, Y_0, YTMP2, 0xF0 /* Y_0 = {W[3], W[2], W[1], W[0]} */ -/* vperm2f128 Y_0, Y_0, YTMP2, 0x30 */ - - mov y3, a /* y3 = a ; MAJA */ - rorx y0, e, 41 /* y0 = e >> 41 ; S1A */ - rorx y1, e, 18 /* y1 = e >> 18 ; S1B */ - add h, [rsp+frame_XFER+3*8] /* h = k + w + h ; -- */ - or y3, c /* y3 = a|c ; MAJA */ - - - mov y2, f /* y2 = f ; CH */ - rorx T1, a, 34 /* T1 = a >> 34 ; S0B */ - xor y0, y1 /* y0 = (e>>41) ^ (e>>18) ; S1 */ - xor y2, g /* y2 = f^g ; CH */ - - rorx y1, e, 14 /* y1 = (e >> 14) ; S1 */ - and y2, e /* y2 = (f^g)&e ; CH */ - add d, h /* d = k + w + h + d ; -- */ - and y3, b /* y3 = (a|c)&b ; MAJA */ - - xor y0, y1 /* y0 = (e>>41) ^ (e>>18) ^ (e>>14) ; S1 */ - xor y2, g /* y2 = CH = ((f^g)&e)^g ; CH */ - - rorx y1, a, 39 /* y1 = a >> 39 ; S0A */ - add y2, y0 /* y2 = S1 + CH ; -- */ - - xor y1, T1 /* y1 = (a>>39) ^ (a>>34) ; S0 */ - add d, y2 /* d = k + w + h + d + S1 + CH = d + t1 ; -- */ - - rorx T1, a, 28 /* T1 = (a >> 28) ; S0 */ - - xor y1, T1 /* y1 = (a>>39) ^ (a>>34) ^ (a>>28) ; S0 */ - mov T1, a /* T1 = a ; MAJB */ - and T1, c /* T1 = a&c ; MAJB */ - or y3, T1 /* y3 = MAJ = (a|c)&b)|(a&c) ; MAJ */ - - add h, y1 /* h = k + w + h + S0 ; -- */ - add h, y2 /* h = k + w + h + S0 + S1 + CH = t1 + S0 ; -- */ - add h, y3 /* h = t1 + S0 + MAJ ; -- */ - -RotateState - -rotate_Ys + ONE_ROUND_PART1 rsp+frame_XFER+3*8+\X*32 + vpaddq XFER, Y_0, [TBL + (4+\X)*32] + vmovdqa [rsp + frame_XFER + \X*32], XFER + ONE_ROUND_PART2 + RotateState + rotate_Ys .endm -.macro DO_4ROUNDS +.macro DO_4ROUNDS X /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ - mov y2, f /* y2 = f ; CH */ - rorx y0, e, 41 /* y0 = e >> 41 ; S1A */ - rorx y1, e, 18 /* y1 = e >> 18 ; S1B */ - xor y2, g /* y2 = f^g ; CH */ - - xor y0, y1 /* y0 = (e>>41) ^ (e>>18) ; S1 */ - rorx y1, e, 14 /* y1 = (e >> 14) ; S1 */ - and y2, e /* y2 = (f^g)&e ; CH */ - - xor y0, y1 /* y0 = (e>>41) ^ (e>>18) ^ (e>>14) ; S1 */ - rorx T1, a, 34 /* T1 = a >> 34 ; S0B */ - xor y2, g /* y2 = CH = ((f^g)&e)^g ; CH */ - rorx y1, a, 39 /* y1 = a >> 39 ; S0A */ - mov y3, a /* y3 = a ; MAJA */ - - xor y1, T1 /* y1 = (a>>39) ^ (a>>34) ; S0 */ - rorx T1, a, 28 /* T1 = (a >> 28) ; S0 */ - add h, [rsp + frame_XFER + 8*0] /* h = k + w + h ; -- */ - or y3, c /* y3 = a|c ; MAJA */ - - xor y1, T1 /* y1 = (a>>39) ^ (a>>34) ^ (a>>28) ; S0 */ - mov T1, a /* T1 = a ; MAJB */ - and y3, b /* y3 = (a|c)&b ; MAJA */ - and T1, c /* T1 = a&c ; MAJB */ - add y2, y0 /* y2 = S1 + CH ; -- */ - - - add d, h /* d = k + w + h + d ; -- */ - or y3, T1 /* y3 = MAJ = (a|c)&b)|(a&c) ; MAJ */ - add h, y1 /* h = k + w + h + S0 ; -- */ - - add d, y2 /* d = k + w + h + d + S1 + CH = d + t1 ; -- */ - - - /*add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0 ; -- */ - - /*add h, y3 ; h = t1 + S0 + MAJ ; -- */ - + ONE_ROUND rsp+frame_XFER+0*8+\X*32 RotateState /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ - add old_h, y2 /* h = k + w + h + S0 + S1 + CH = t1 + S0 ; -- */ - mov y2, f /* y2 = f ; CH */ - rorx y0, e, 41 /* y0 = e >> 41 ; S1A */ - rorx y1, e, 18 /* y1 = e >> 18 ; S1B */ - xor y2, g /* y2 = f^g ; CH */ - - xor y0, y1 /* y0 = (e>>41) ^ (e>>18) ; S1 */ - rorx y1, e, 14 /* y1 = (e >> 14) ; S1 */ - and y2, e /* y2 = (f^g)&e ; CH */ - add old_h, y3 /* h = t1 + S0 + MAJ ; -- */ - - xor y0, y1 /* y0 = (e>>41) ^ (e>>18) ^ (e>>14) ; S1 */ - rorx T1, a, 34 /* T1 = a >> 34 ; S0B */ - xor y2, g /* y2 = CH = ((f^g)&e)^g ; CH */ - rorx y1, a, 39 /* y1 
= a >> 39 ; S0A */ - mov y3, a /* y3 = a ; MAJA */ - - xor y1, T1 /* y1 = (a>>39) ^ (a>>34) ; S0 */ - rorx T1, a, 28 /* T1 = (a >> 28) ; S0 */ - add h, [rsp + frame_XFER + 8*1] /* h = k + w + h ; -- */ - or y3, c /* y3 = a|c ; MAJA */ - - xor y1, T1 /* y1 = (a>>39) ^ (a>>34) ^ (a>>28) ; S0 */ - mov T1, a /* T1 = a ; MAJB */ - and y3, b /* y3 = (a|c)&b ; MAJA */ - and T1, c /* T1 = a&c ; MAJB */ - add y2, y0 /* y2 = S1 + CH ; -- */ - - - add d, h /* d = k + w + h + d ; -- */ - or y3, T1 /* y3 = MAJ = (a|c)&b)|(a&c) ; MAJ */ - add h, y1 /* h = k + w + h + S0 ; -- */ - - add d, y2 /* d = k + w + h + d + S1 + CH = d + t1 ; -- */ - - - /*add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0 ; -- */ - - /*add h, y3 ; h = t1 + S0 + MAJ ; -- */ - + ONE_ROUND rsp+frame_XFER+1*8+\X*32 RotateState /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ - add old_h, y2 /* h = k + w + h + S0 + S1 + CH = t1 + S0 ; -- */ - mov y2, f /* y2 = f ; CH */ - rorx y0, e, 41 /* y0 = e >> 41 ; S1A */ - rorx y1, e, 18 /* y1 = e >> 18 ; S1B */ - xor y2, g /* y2 = f^g ; CH */ - - xor y0, y1 /* y0 = (e>>41) ^ (e>>18) ; S1 */ - rorx y1, e, 14 /* y1 = (e >> 14) ; S1 */ - and y2, e /* y2 = (f^g)&e ; CH */ - add old_h, y3 /* h = t1 + S0 + MAJ ; -- */ - - xor y0, y1 /* y0 = (e>>41) ^ (e>>18) ^ (e>>14) ; S1 */ - rorx T1, a, 34 /* T1 = a >> 34 ; S0B */ - xor y2, g /* y2 = CH = ((f^g)&e)^g ; CH */ - rorx y1, a, 39 /* y1 = a >> 39 ; S0A */ - mov y3, a /* y3 = a ; MAJA */ - - xor y1, T1 /* y1 = (a>>39) ^ (a>>34) ; S0 */ - rorx T1, a, 28 /* T1 = (a >> 28) ; S0 */ - add h, [rsp + frame_XFER + 8*2] /* h = k + w + h ; -- */ - or y3, c /* y3 = a|c ; MAJA */ - - xor y1, T1 /* y1 = (a>>39) ^ (a>>34) ^ (a>>28) ; S0 */ - mov T1, a /* T1 = a ; MAJB */ - and y3, b /* y3 = (a|c)&b ; MAJA */ - and T1, c /* T1 = a&c ; MAJB */ - add y2, y0 /* y2 = S1 + CH ; -- */ - - - add d, h /* d = k + w + h + d ; -- */ - or y3, T1 /* y3 = MAJ = (a|c)&b)|(a&c) ; MAJ */ - add h, y1 /* h = k + w + h + S0 ; -- */ - - add d, y2 /* d = k + w + h + d + S1 + CH = d + t1 ; -- */ - - - /*add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0 ; -- */ - - /*add h, y3 ; h = t1 + S0 + MAJ ; -- */ - + ONE_ROUND rsp+frame_XFER+2*8+\X*32 RotateState /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ - add old_h, y2 /* h = k + w + h + S0 + S1 + CH = t1 + S0 ; -- */ - mov y2, f /* y2 = f ; CH */ - rorx y0, e, 41 /* y0 = e >> 41 ; S1A */ - rorx y1, e, 18 /* y1 = e >> 18 ; S1B */ - xor y2, g /* y2 = f^g ; CH */ - - xor y0, y1 /* y0 = (e>>41) ^ (e>>18) ; S1 */ - rorx y1, e, 14 /* y1 = (e >> 14) ; S1 */ - and y2, e /* y2 = (f^g)&e ; CH */ - add old_h, y3 /* h = t1 + S0 + MAJ ; -- */ - - xor y0, y1 /* y0 = (e>>41) ^ (e>>18) ^ (e>>14) ; S1 */ - rorx T1, a, 34 /* T1 = a >> 34 ; S0B */ - xor y2, g /* y2 = CH = ((f^g)&e)^g ; CH */ - rorx y1, a, 39 /* y1 = a >> 39 ; S0A */ - mov y3, a /* y3 = a ; MAJA */ - - xor y1, T1 /* y1 = (a>>39) ^ (a>>34) ; S0 */ - rorx T1, a, 28 /* T1 = (a >> 28) ; S0 */ - add h, [rsp + frame_XFER + 8*3] /* h = k + w + h ; -- */ - or y3, c /* y3 = a|c ; MAJA */ - - xor y1, T1 /* y1 = (a>>39) ^ (a>>34) ^ (a>>28) ; S0 */ - mov T1, a /* T1 = a ; MAJB */ - and y3, b /* y3 = (a|c)&b ; MAJA */ - and T1, c /* T1 = a&c ; MAJB */ - add y2, y0 /* y2 = S1 + CH ; -- */ - - - add d, h /* d = k + w + h + d ; -- */ - or y3, T1 /* y3 = MAJ = (a|c)&b)|(a&c) ; MAJ */ - add h, y1 /* h = k + w + h + S0 ; -- */ - - add d, y2 /* d = k + w + h + d + S1 + CH = d + t1 ; -- */ - - - add h, y2 /* h = k + w + h + S0 + 
S1 + CH = t1 + S0 ; -- */ - - add h, y3 /* h = t1 + S0 + MAJ ; -- */ - + ONE_ROUND rsp+frame_XFER+3*8+\X*32 RotateState .endm @@ -616,7 +362,7 @@ _gcry_sha512_transform_amd64_avx2: /* Allocate Stack Space */ mov rax, rsp sub rsp, frame_size - and rsp, ~(0x20 - 1) + and rsp, ~(0x40 - 1) mov [rsp + frame_RSPSAVE], rax /* Save GPRs */ @@ -627,13 +373,7 @@ _gcry_sha512_transform_amd64_avx2: mov [rsp + frame_GPRSAVE + 8 * 4], r14 mov [rsp + frame_GPRSAVE + 8 * 5], r15 - vpblendd xmm0, xmm0, xmm1, 0xf0 - vpblendd ymm0, ymm0, ymm1, 0xf0 - - shl NUM_BLKS, 7 /* convert to bytes */ - jz .Ldone_hash - add NUM_BLKS, INP /* pointer to end of data */ - mov [rsp + frame_INPEND], NUM_BLKS + mov [rsp + frame_NBLKS], NUM_BLKS /*; load initial digest */ mov a,[8*0 + CTX] @@ -646,8 +386,8 @@ _gcry_sha512_transform_amd64_avx2: mov h,[8*7 + CTX] vmovdqa BYTE_FLIP_MASK, [.LPSHUFFLE_BYTE_FLIP_MASK ADD_RIP] + vmovdqa MASK_YMM_LO, [.LMASK_YMM_LO ADD_RIP] -.Loop0: lea TBL,[.LK512 ADD_RIP] /*; byte swap first 16 dwords */ @@ -656,48 +396,60 @@ _gcry_sha512_transform_amd64_avx2: COPY_YMM_AND_BSWAP Y_2, [INP + 2*32], BYTE_FLIP_MASK COPY_YMM_AND_BSWAP Y_3, [INP + 3*32], BYTE_FLIP_MASK + add INP, 128 mov [rsp + frame_INP], INP + vpaddq XFER, Y_0, [TBL + 0*32] + vmovdqa [rsp + frame_XFER + 0*32], XFER + vpaddq XFER, Y_1, [TBL + 1*32] + vmovdqa [rsp + frame_XFER + 1*32], XFER + vpaddq XFER, Y_2, [TBL + 2*32] + vmovdqa [rsp + frame_XFER + 2*32], XFER + vpaddq XFER, Y_3, [TBL + 3*32] + vmovdqa [rsp + frame_XFER + 3*32], XFER + /*; schedule 64 input dwords, by doing 12 rounds of 4 each */ movq [rsp + frame_SRND],4 .align 16 -.Loop1: - vpaddq XFER, Y_0, [TBL + 0*32] - vmovdqa [rsp + frame_XFER], XFER - FOUR_ROUNDS_AND_SCHED +.Loop0: + FOUR_ROUNDS_AND_SCHED 0 + FOUR_ROUNDS_AND_SCHED 1 + FOUR_ROUNDS_AND_SCHED 2 + FOUR_ROUNDS_AND_SCHED 3 + add TBL, 4*32 - vpaddq XFER, Y_0, [TBL + 1*32] - vmovdqa [rsp + frame_XFER], XFER - FOUR_ROUNDS_AND_SCHED + subq [rsp + frame_SRND], 1 + jne .Loop0 - vpaddq XFER, Y_0, [TBL + 2*32] - vmovdqa [rsp + frame_XFER], XFER - FOUR_ROUNDS_AND_SCHED + subq [rsp + frame_NBLKS], 1 + je .Ldone_hash - vpaddq XFER, Y_0, [TBL + 3*32] - vmovdqa [rsp + frame_XFER], XFER - add TBL, 4*32 - FOUR_ROUNDS_AND_SCHED + mov INP, [rsp + frame_INP] - subq [rsp + frame_SRND], 1 - jne .Loop1 + lea TBL,[.LK512 ADD_RIP] - movq [rsp + frame_SRND], 2 -.Loop2: - vpaddq XFER, Y_0, [TBL + 0*32] - vmovdqa [rsp + frame_XFER], XFER - DO_4ROUNDS - vpaddq XFER, Y_1, [TBL + 1*32] - vmovdqa [rsp + frame_XFER], XFER - add TBL, 2*32 - DO_4ROUNDS + /* load next block and byte swap */ + COPY_YMM_AND_BSWAP Y_0, [INP + 0*32], BYTE_FLIP_MASK + COPY_YMM_AND_BSWAP Y_1, [INP + 1*32], BYTE_FLIP_MASK + COPY_YMM_AND_BSWAP Y_2, [INP + 2*32], BYTE_FLIP_MASK + COPY_YMM_AND_BSWAP Y_3, [INP + 3*32], BYTE_FLIP_MASK - vmovdqa Y_0, Y_2 - vmovdqa Y_1, Y_3 + add INP, 128 + mov [rsp + frame_INP], INP - subq [rsp + frame_SRND], 1 - jne .Loop2 + DO_4ROUNDS 0 + vpaddq XFER, Y_0, [TBL + 0*32] + vmovdqa [rsp + frame_XFER + 0*32], XFER + DO_4ROUNDS 1 + vpaddq XFER, Y_1, [TBL + 1*32] + vmovdqa [rsp + frame_XFER + 1*32], XFER + DO_4ROUNDS 2 + vpaddq XFER, Y_2, [TBL + 2*32] + vmovdqa [rsp + frame_XFER + 2*32], XFER + DO_4ROUNDS 3 + vpaddq XFER, Y_3, [TBL + 3*32] + vmovdqa [rsp + frame_XFER + 3*32], XFER addm [8*0 + CTX],a addm [8*1 + CTX],b @@ -708,14 +460,33 @@ _gcry_sha512_transform_amd64_avx2: addm [8*6 + CTX],g addm [8*7 + CTX],h - mov INP, [rsp + frame_INP] - add INP, 128 - cmp INP, [rsp + frame_INPEND] - jne .Loop0 + /*; schedule 64 input dwords, by doing 12 rounds 
of 4 each */ + movq [rsp + frame_SRND],4 + + jmp .Loop0 .Ldone_hash: vzeroall + DO_4ROUNDS 0 + vmovdqa [rsp + frame_XFER + 0*32], ymm0 /* burn stack */ + DO_4ROUNDS 1 + vmovdqa [rsp + frame_XFER + 1*32], ymm0 /* burn stack */ + DO_4ROUNDS 2 + vmovdqa [rsp + frame_XFER + 2*32], ymm0 /* burn stack */ + DO_4ROUNDS 3 + vmovdqa [rsp + frame_XFER + 3*32], ymm0 /* burn stack */ + + addm [8*0 + CTX],a + xor eax, eax /* burn stack */ + addm [8*1 + CTX],b + addm [8*2 + CTX],c + addm [8*3 + CTX],d + addm [8*4 + CTX],e + addm [8*5 + CTX],f + addm [8*6 + CTX],g + addm [8*7 + CTX],h + /* Restore GPRs */ mov rbp, [rsp + frame_GPRSAVE + 8 * 0] mov rbx, [rsp + frame_GPRSAVE + 8 * 1] @@ -724,10 +495,6 @@ _gcry_sha512_transform_amd64_avx2: mov r14, [rsp + frame_GPRSAVE + 8 * 4] mov r15, [rsp + frame_GPRSAVE + 8 * 5] - /* Burn stack */ - vmovdqa [rsp + frame_XFER], XFER - xor eax, eax - /* Restore Stack Pointer */ mov rsp, [rsp + frame_RSPSAVE] .Lnowork: From jussi.kivilinna at iki.fi Mon Apr 8 18:07:00 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Mon, 8 Apr 2019 19:07:00 +0300 Subject: [PATCH] Disable SM3 in FIPS mode Message-ID: <155473962034.28973.13794332078146246445.stgit@localhost.localdomain> * cipher/sm3.h (_gcry_digest_spec_sm3): Set flags.fips to zero. -- Signed-off-by: Jussi Kivilinna --- cipher/sm3.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cipher/sm3.c b/cipher/sm3.c index e76f32297..b6f0ab28c 100644 --- a/cipher/sm3.c +++ b/cipher/sm3.c @@ -464,7 +464,7 @@ static gcry_md_oid_spec_t oid_spec_sm3[] = gcry_md_spec_t _gcry_digest_spec_sm3 = { - GCRY_MD_SM3, {0, 1}, + GCRY_MD_SM3, {0, 0}, "SM3", asn_sm3, DIM (asn_sm3), oid_spec_sm3, 32, sm3_init, _gcry_md_block_write, sm3_final, sm3_read, NULL, _gcry_sm3_hash_buffer, _gcry_sm3_hash_buffers, From jussi.kivilinna at iki.fi Mon Apr 8 20:01:15 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Mon, 8 Apr 2019 21:01:15 +0300 Subject: [PATCH] Use getauxval system function for detecting ARM HW features Message-ID: <155474647569.15904.17215999863429401661.stgit@localhost.localdomain> * configure.ac: Add header check for 'sys/auxv.h'. * src/hwf-arm.c [HAVE_SYS_AUXV_H]: Include 'sys/auxv.h'. (AT_HWCAP, AT_HWCAP2, HWCAP_NEON, HWCAP2_AES, HWCAP2_PMULL) (HWCAP2_SHA1, HWCAP2_SHA2, HWCAP_ASIMD, HWCAP_AES) (HWCAP_PMULL, HWCAP_SHA1, HWCAP_SHA2): Define these macros only if not already defined. (get_hwcap) [HAVE_SYS_AUXV_H]: Use 'getauxval' to fetch HW capability flags. 
-- Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/configure.ac b/configure.ac index 0a931f952..63a275079 100644 --- a/configure.ac +++ b/configure.ac @@ -805,7 +805,7 @@ AC_SEARCH_LIBS(setsockopt, [nsl]) ################################## AC_HEADER_STDC -AC_CHECK_HEADERS(unistd.h sys/select.h sys/msg.h) +AC_CHECK_HEADERS(unistd.h sys/select.h sys/msg.h sys/auxv.h) INSERT_SYS_SELECT_H= if test x"$ac_cv_header_sys_select_h" = xyes; then INSERT_SYS_SELECT_H=" include " diff --git a/src/hwf-arm.c b/src/hwf-arm.c index a762b5eab..efbbd0c2d 100644 --- a/src/hwf-arm.c +++ b/src/hwf-arm.c @@ -23,6 +23,10 @@ #include #include #include +#include +#ifdef HAVE_SYS_AUXV_H +#include +#endif #include "g10lib.h" #include "hwf-common.h" @@ -47,15 +51,29 @@ struct feature_map_s { #ifdef __arm__ -#define AT_HWCAP 16 -#define AT_HWCAP2 26 +#ifndef AT_HWCAP +# define AT_HWCAP 16 +#endif +#ifndef AT_HWCAP2 +# define AT_HWCAP2 26 +#endif -#define HWCAP_NEON 4096 +#ifndef HWCAP_NEON +# define HWCAP_NEON 4096 +#endif -#define HWCAP2_AES 1 -#define HWCAP2_PMULL 2 -#define HWCAP2_SHA1 3 -#define HWCAP2_SHA2 4 +#ifndef HWCAP2_AES +# define HWCAP2_AES 1 +#endif +#ifndef HWCAP2_PMULL +# define HWCAP2_PMULL 2 +#endif +#ifndef HWCAP2_SHA1 +# define HWCAP2_SHA1 3 +#endif +#ifndef HWCAP2_SHA2 +# define HWCAP2_SHA2 4 +#endif static const struct feature_map_s arm_features[] = { @@ -72,14 +90,28 @@ static const struct feature_map_s arm_features[] = #elif defined(__aarch64__) -#define AT_HWCAP 16 -#define AT_HWCAP2 -1 +#ifndef AT_HWCAP +# define AT_HWCAP 16 +#endif +#ifndef AT_HWCAP2 +# define AT_HWCAP2 -1 +#endif -#define HWCAP_ASIMD 2 -#define HWCAP_AES 8 -#define HWCAP_PMULL 16 -#define HWCAP_SHA1 32 -#define HWCAP_SHA2 64 +#ifndef HWCAP_ASIMD +# define HWCAP_ASIMD 2 +#endif +#ifndef HWCAP_AES +# define HWCAP_AES 8 +#endif +#ifndef HWCAP_PMULL +# define HWCAP_PMULL 16 +#endif +#ifndef HWCAP_SHA1 +# define HWCAP_SHA1 32 +#endif +#ifndef HWCAP_SHA2 +# define HWCAP_SHA2 64 +#endif static const struct feature_map_s arm_features[] = { @@ -113,6 +145,34 @@ get_hwcap(unsigned int *hwcap, unsigned int *hwcap2) return 0; } +#ifdef HAVE_SYS_AUXV_H + errno = 0; + auxv.a_val = getauxval (AT_HWCAP); + if (errno == 0) + { + stored_hwcap |= auxv.a_val; + hwcap_initialized = 1; + } + + if (AT_HWCAP2 >= 0) + { + errno = 0; + auxv.a_val = getauxval (AT_HWCAP2); + if (errno == 0) + { + stored_hwcap2 |= auxv.a_val; + hwcap_initialized = 1; + } + } + + if (hwcap_initialized && (stored_hwcap || stored_hwcap2)) + { + *hwcap = stored_hwcap; + *hwcap2 = stored_hwcap2; + return 0; + } +#endif + f = fopen("/proc/self/auxv", "r"); if (!f) { @@ -125,13 +185,13 @@ get_hwcap(unsigned int *hwcap, unsigned int *hwcap2) { if (auxv.a_type == AT_HWCAP) { - stored_hwcap = auxv.a_val; + stored_hwcap |= auxv.a_val; hwcap_initialized = 1; } if (auxv.a_type == AT_HWCAP2) { - stored_hwcap2 = auxv.a_val; + stored_hwcap2 |= auxv.a_val; hwcap_initialized = 1; } } From jussi.kivilinna at iki.fi Tue Apr 9 19:11:04 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Tue, 9 Apr 2019 20:11:04 +0300 Subject: [PATCH] Use FreeBSD's elf_aux_info for detecting ARM HW features Message-ID: <155482986392.14628.10419192865532577551.stgit@localhost.localdomain> * configure.ac: Add function check for 'elf_aux_info'. * src/hwf-arm.c [HAVE_ELF_AUX_INFO]: Include 'sys/auxv.h'. [HAVE_ELF_AUX_INFO && !HAVE_GETAUXVAL] (HAVE_GETAUXVAL) (getauxval): New. 
-- Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/configure.ac b/configure.ac index b0d7f8903..b54b212b3 100644 --- a/configure.ac +++ b/configure.ac @@ -1806,7 +1806,7 @@ AC_CHECK_FUNCS(strtoul memmove stricmp atexit raise) # Other checks AC_CHECK_FUNCS(strerror rand mmap getpagesize sysconf waitpid wait4) AC_CHECK_FUNCS(gettimeofday getrusage gethrtime clock_gettime syslog) -AC_CHECK_FUNCS(syscall fcntl ftruncate flockfile getauxval) +AC_CHECK_FUNCS(syscall fcntl ftruncate flockfile getauxval elf_aux_info) AC_CHECK_FUNCS(explicit_bzero getentropy) GNUPG_CHECK_MLOCK diff --git a/src/hwf-arm.c b/src/hwf-arm.c index 0f8f83f61..1d19ea86b 100644 --- a/src/hwf-arm.c +++ b/src/hwf-arm.c @@ -24,7 +24,8 @@ #include #include #include -#if defined(HAVE_SYS_AUXV_H) && defined(HAVE_GETAUXVAL) +#if defined(HAVE_SYS_AUXV_H) && (defined(HAVE_GETAUXVAL) || \ + defined(HAVE_ELF_AUX_INFO)) #include #endif @@ -35,6 +36,30 @@ # error Module build for wrong CPU. #endif + +#if defined(HAVE_SYS_AUXV_H) && defined(HAVE_ELF_AUX_INFO) && \ + !defined(HAVE_GETAUXVAL) && defined(AT_HWCAP) +#define HAVE_GETAUXVAL +static unsigned long getauxval(unsigned long type) +{ + unsigned long auxval = 0; + int err; + + /* FreeBSD provides 'elf_aux_info' function that does the same as + * 'getauxval' on Linux. */ + + err = elf_aux_info (type, &auxval, sizeof(auxval)); + if (err) + { + errno = err; + auxval = 0; + } + + return auxval; +} +#endif + + #undef HAS_SYS_AT_HWCAP #if defined(__linux__) || \ (defined(HAVE_SYS_AUXV_H) && defined(HAVE_GETAUXVAL)) @@ -49,6 +74,7 @@ struct feature_map_s { #ifdef __arm__ +/* Note: These macros have same values on Linux and FreeBSD. */ #ifndef AT_HWCAP # define AT_HWCAP 16 #endif @@ -88,6 +114,7 @@ static const struct feature_map_s arm_features[] = #elif defined(__aarch64__) +/* Note: These macros have same values on Linux and FreeBSD. */ #ifndef AT_HWCAP # define AT_HWCAP 16 #endif From peter at lekensteyn.nl Wed Apr 10 00:18:10 2019 From: peter at lekensteyn.nl (Peter Wu) Date: Tue, 9 Apr 2019 23:18:10 +0100 Subject: Blowfish actually supports more than 128-bit keys Message-ID: <20190409221810.GA10760@al> Hi, The current cipher documentation[1] reports that GCRY_CIPHER_BLOWFISH only supports 128-bit keys. In the long past, a "BLOWFISH160" variant seems to have existed which supported 160-bit keys as reported through gcry_cipher_get_algo_keylen. One of our users would like to use Blowfish with 576 bit keys (don't ask). Based on the documentation it would not be possible. The source code (cipher/blowfish.c) however shows that do_bf_setkey function does not limit the key size. In fact it is designed to support any number of bytes (up to 72 bytes / 576 bits). Could this be documented such that we can rely on it? Attached are two test programs: - One using test vectors from OpenSSL (key lengths 8..200 bits). - One using Eric Young's test vector as linked by Schneier[2] (key lengths 8..192 bits). Rejecting key lengths above 576 bits (72 bytes) might be a good idea. Rejecting 0 bytes would also be good to avoid a buffer overrun by one byte. I have no idea why someone would like to use a very short key though... 
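To make the suggestion concrete, here is a rough sketch of the kind of bounds
check I have in mind for do_bf_setkey (the constant names are made up for
illustration; the byte limits correspond to 8 and 576 bits):

  #define BLOWFISH_KEY_MIN_BYTES  1   /* 8 bits */
  #define BLOWFISH_KEY_MAX_BYTES 72   /* 576 bits */

  /* At the start of do_bf_setkey(), before the key schedule is run: */
  if (keylen < BLOWFISH_KEY_MIN_BYTES || keylen > BLOWFISH_KEY_MAX_BYTES)
    return GPG_ERR_INV_KEYLEN;
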
-- Kind regards, Peter Wu https://lekensteyn.nl [1]: https://gnupg.org/documentation/manuals/gcrypt/Available-ciphers.html [2]: https://www.schneier.com/academic/blowfish/ -------------- next part -------------- /* cc gcry-bf-test.c -lgcrypt && ./a.out */ #include // from openssl test/bftest.c # define KEY_TEST_NUM 25 static unsigned char key_test[KEY_TEST_NUM] = { 0xf0, 0xe1, 0xd2, 0xc3, 0xb4, 0xa5, 0x96, 0x87, 0x78, 0x69, 0x5a, 0x4b, 0x3c, 0x2d, 0x1e, 0x0f, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88 }; static unsigned char key_data[8] = { 0xFE, 0xDC, 0xBA, 0x98, 0x76, 0x54, 0x32, 0x10 }; static unsigned char key_out[KEY_TEST_NUM][8] = { {0xF9, 0xAD, 0x59, 0x7C, 0x49, 0xDB, 0x00, 0x5E}, {0xE9, 0x1D, 0x21, 0xC1, 0xD9, 0x61, 0xA6, 0xD6}, {0xE9, 0xC2, 0xB7, 0x0A, 0x1B, 0xC6, 0x5C, 0xF3}, {0xBE, 0x1E, 0x63, 0x94, 0x08, 0x64, 0x0F, 0x05}, {0xB3, 0x9E, 0x44, 0x48, 0x1B, 0xDB, 0x1E, 0x6E}, {0x94, 0x57, 0xAA, 0x83, 0xB1, 0x92, 0x8C, 0x0D}, {0x8B, 0xB7, 0x70, 0x32, 0xF9, 0x60, 0x62, 0x9D}, {0xE8, 0x7A, 0x24, 0x4E, 0x2C, 0xC8, 0x5E, 0x82}, {0x15, 0x75, 0x0E, 0x7A, 0x4F, 0x4E, 0xC5, 0x77}, {0x12, 0x2B, 0xA7, 0x0B, 0x3A, 0xB6, 0x4A, 0xE0}, {0x3A, 0x83, 0x3C, 0x9A, 0xFF, 0xC5, 0x37, 0xF6}, {0x94, 0x09, 0xDA, 0x87, 0xA9, 0x0F, 0x6B, 0xF2}, {0x88, 0x4F, 0x80, 0x62, 0x50, 0x60, 0xB8, 0xB4}, {0x1F, 0x85, 0x03, 0x1C, 0x19, 0xE1, 0x19, 0x68}, {0x79, 0xD9, 0x37, 0x3A, 0x71, 0x4C, 0xA3, 0x4F}, {0x93, 0x14, 0x28, 0x87, 0xEE, 0x3B, 0xE1, 0x5C}, {0x03, 0x42, 0x9E, 0x83, 0x8C, 0xE2, 0xD1, 0x4B}, {0xA4, 0x29, 0x9E, 0x27, 0x46, 0x9F, 0xF6, 0x7B}, {0xAF, 0xD5, 0xAE, 0xD1, 0xC1, 0xBC, 0x96, 0xA8}, {0x10, 0x85, 0x1C, 0x0E, 0x38, 0x58, 0xDA, 0x9F}, {0xE6, 0xF5, 0x1E, 0xD7, 0x9B, 0x9D, 0xB2, 0x1F}, {0x64, 0xA6, 0xE1, 0x4A, 0xFD, 0x36, 0xB4, 0x6F}, {0x80, 0xC7, 0xD7, 0xD4, 0x5A, 0x54, 0x79, 0xAD}, {0x05, 0x04, 0x4B, 0x62, 0xFA, 0x52, 0xD0, 0x80}, }; int main(void) { gcry_cipher_hd_t hd; gcry_error_t err; err = gcry_cipher_open(&hd, GCRY_CIPHER_BLOWFISH, GCRY_CIPHER_MODE_ECB, 0); if (err != 0) { printf("open: %s\n", gcry_strerror(err)); return 1; } for (unsigned i = 0; i < KEY_TEST_NUM - 1; i++) { err = gcry_cipher_setkey(hd, key_test, i + 1); if (err != 0) { printf("setkey %d: %s\n", i, gcry_strerror(err)); goto end; } unsigned char out[8]; err = gcry_cipher_encrypt(hd, out, sizeof(out), key_data, sizeof(key_data)); if (err != 0) { printf("decrypt %d: %s\n", i, gcry_strerror(err)); goto end; } if (memcmp(out, key_out[i], 8) != 0) { printf("Test failure: %d\n", i); goto end; } } puts("Passed."); end: gcry_cipher_close(hd); return err != 0; } -------------- next part -------------- /* cc gcry-bf-test.c -lgcrypt && ./a.out */ #include // test vectors from https://www.schneier.com/code/vectors.txt # define KEY_TEST_NUM 24 static unsigned char key_test[KEY_TEST_NUM] = { 0xF0, 0xE1, 0xD2, 0xC3, 0xB4, 0xA5, 0x96, 0x87, 0x78, 0x69, 0x5A, 0x4B, 0x3C, 0x2D, 0x1E, 0x0F, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77 }; static unsigned char key_data[8] = { 0xFE, 0xDC, 0xBA, 0x98, 0x76, 0x54, 0x32, 0x10 }; static unsigned char key_out[KEY_TEST_NUM][8] = { { 0xF9, 0xAD, 0x59, 0x7C, 0x49, 0xDB, 0x00, 0x5E }, { 0xE9, 0x1D, 0x21, 0xC1, 0xD9, 0x61, 0xA6, 0xD6 }, { 0xE9, 0xC2, 0xB7, 0x0A, 0x1B, 0xC6, 0x5C, 0xF3 }, { 0xBE, 0x1E, 0x63, 0x94, 0x08, 0x64, 0x0F, 0x05 }, { 0xB3, 0x9E, 0x44, 0x48, 0x1B, 0xDB, 0x1E, 0x6E }, { 0x94, 0x57, 0xAA, 0x83, 0xB1, 0x92, 0x8C, 0x0D }, { 0x8B, 0xB7, 0x70, 0x32, 0xF9, 0x60, 0x62, 0x9D }, { 0xE8, 0x7A, 0x24, 0x4E, 0x2C, 0xC8, 0x5E, 0x82 }, { 0x15, 0x75, 0x0E, 0x7A, 0x4F, 0x4E, 0xC5, 0x77 }, { 0x12, 
0x2B, 0xA7, 0x0B, 0x3A, 0xB6, 0x4A, 0xE0 }, { 0x3A, 0x83, 0x3C, 0x9A, 0xFF, 0xC5, 0x37, 0xF6 }, { 0x94, 0x09, 0xDA, 0x87, 0xA9, 0x0F, 0x6B, 0xF2 }, { 0x88, 0x4F, 0x80, 0x62, 0x50, 0x60, 0xB8, 0xB4 }, { 0x1F, 0x85, 0x03, 0x1C, 0x19, 0xE1, 0x19, 0x68 }, { 0x79, 0xD9, 0x37, 0x3A, 0x71, 0x4C, 0xA3, 0x4F }, { 0x93, 0x14, 0x28, 0x87, 0xEE, 0x3B, 0xE1, 0x5C }, { 0x03, 0x42, 0x9E, 0x83, 0x8C, 0xE2, 0xD1, 0x4B }, { 0xA4, 0x29, 0x9E, 0x27, 0x46, 0x9F, 0xF6, 0x7B }, { 0xAF, 0xD5, 0xAE, 0xD1, 0xC1, 0xBC, 0x96, 0xA8 }, { 0x10, 0x85, 0x1C, 0x0E, 0x38, 0x58, 0xDA, 0x9F }, { 0xE6, 0xF5, 0x1E, 0xD7, 0x9B, 0x9D, 0xB2, 0x1F }, { 0x64, 0xA6, 0xE1, 0x4A, 0xFD, 0x36, 0xB4, 0x6F }, { 0x80, 0xC7, 0xD7, 0xD4, 0x5A, 0x54, 0x79, 0xAD }, { 0x05, 0x04, 0x4B, 0x62, 0xFA, 0x52, 0xD0, 0x80 } }; int main(void) { gcry_cipher_hd_t hd; gcry_error_t err; err = gcry_cipher_open(&hd, GCRY_CIPHER_BLOWFISH, GCRY_CIPHER_MODE_ECB, 0); if (err != 0) { printf("open: %s\n", gcry_strerror(err)); return 1; } for (unsigned i = 0; i < KEY_TEST_NUM - 1; i++) { err = gcry_cipher_setkey(hd, key_test, i + 1); if (err != 0) { printf("setkey %d: %s\n", i, gcry_strerror(err)); goto end; } unsigned char out[8]; err = gcry_cipher_encrypt(hd, out, sizeof(out), key_data, sizeof(key_data)); if (err != 0) { printf("decrypt %d: %s\n", i, gcry_strerror(err)); goto end; } if (memcmp(out, key_out[i], 8) != 0) { printf("Test failure: %d\n", i); goto end; } } puts("Passed."); end: gcry_cipher_close(hd); return err != 0; } From jussi.kivilinna at iki.fi Tue Apr 16 22:03:33 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Tue, 16 Apr 2019 23:03:33 +0300 Subject: [PATCH] twofish-amd64: do not use xchg instruction Message-ID: <155544501358.19493.14951648802162799382.stgit@localhost.localdomain> * cipher/twofish-amd64.S (g1g2_3): Swap ab and cd registers using 'movq' instructions instead of 'xchgq'. -- Avoiding xchg instruction improves three block parallel performance by ~3% on Intel Haswell. Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/cipher/twofish-amd64.S b/cipher/twofish-amd64.S index 7a836463c..134d6401e 100644 --- a/cipher/twofish-amd64.S +++ b/cipher/twofish-amd64.S @@ -368,15 +368,21 @@ ELF(.size _gcry_twofish_amd64_encrypt_block,.-_gcry_twofish_amd64_encrypt_block; /* G1,2 && G2,2 */ \ do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 0, x ## 0); \ do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 0, y ## 0); \ - xchgq cd ## 0, ab ## 0; \ + movq ab ## 0, RT0; \ + movq cd ## 0, ab ## 0; \ + movq RT0, cd ## 0; \ \ do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 1, x ## 1); \ do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 1, y ## 1); \ - xchgq cd ## 1, ab ## 1; \ + movq ab ## 1, RT0; \ + movq cd ## 1, ab ## 1; \ + movq RT0, cd ## 1; \ \ do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 2, x ## 2); \ do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 2, y ## 2); \ - xchgq cd ## 2, ab ## 2; + movq ab ## 2, RT0; \ + movq cd ## 2, ab ## 2; \ + movq RT0, cd ## 2; #define enc_round_end(ab, x, y, n) \ addl y ## d, x ## d; \ From jussi.kivilinna at iki.fi Wed Apr 17 18:43:24 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Wed, 17 Apr 2019 19:43:24 +0300 Subject: Blowfish actually supports more than 128-bit keys In-Reply-To: <20190409221810.GA10760@al> References: <20190409221810.GA10760@al> Message-ID: <42183823-d2f4-8750-8727-19d8d9c87cc9@iki.fi> Hello, On 10.4.2019 1.18, Peter Wu wrote: > Hi, > > The current cipher documentation[1] reports that GCRY_CIPHER_BLOWFISH > only supports 128-bit keys. 
In the long past, a "BLOWFISH160" variant > seems to have existed which supported 160-bit keys as reported through > gcry_cipher_get_algo_keylen. > > One of our users would like to use Blowfish with 576 bit keys (don't > ask). Based on the documentation it would not be possible. The source > code (cipher/blowfish.c) however shows that do_bf_setkey function does > not limit the key size. In fact it is designed to support any number of > bytes (up to 72 bytes / 576 bits). > > Could this be documented such that we can rely on it? Attached are two > test programs: > - One using test vectors from OpenSSL (key lengths 8..200 bits). > - One using Eric Young's test vector as linked by Schneier[2] (key > lengths 8..192 bits). I guess it would make sense to update documentation to match existing implementation. It would be good to have test vector for maximum key size. I've tried to search for existing test vectors for key lengths of 448-bit and 576-bit, but have not yet found one for the latter. > > Rejecting key lengths above 576 bits (72 bytes) might be a good idea. > Rejecting 0 bytes would also be good to avoid a buffer overrun by one > byte. I have no idea why someone would like to use a very short key > though... > Yes, limiting to supported key length would be a good thing. -Jussi From jussi.kivilinna at iki.fi Wed Apr 17 22:16:17 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Wed, 17 Apr 2019 23:16:17 +0300 Subject: [PATCH] Limit and document Blowfish key lengths to 8-576 bits Message-ID: <155553217686.31071.12317723959985459255.stgit@localhost.localdomain> * cipher/blowfish.c (BLOWFISH_KEY_MIN_BITS) (BLOWFISH_KEY_MAX_BITS): New. (do_bf_setkey): Check input key length to MIN_BITS and MAX_BITS. * doc/gcrypt.texi: Update supported Blowfish key lengths. * tests/basic.c (check_ecb_cipher): New, with Blowfish test vectors for different key lengths. (check_cipher_modes): Call 'check_ecb_cipher'. -- As noted by Peter Wu, Blowfish cipher implementation already supports key lengths 8 to 576 bits [1]. This change updates documentation to reflect that and adds new test vectors to check handling of different key lengths. [1] https://lists.gnupg.org/pipermail/gcrypt-devel/2019-April/004680.html Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/cipher/blowfish.c b/cipher/blowfish.c index ea6e64a7b..a1d81d310 100644 --- a/cipher/blowfish.c +++ b/cipher/blowfish.c @@ -41,6 +41,8 @@ #include "cipher-selftest.h" #define BLOWFISH_BLOCKSIZE 8 +#define BLOWFISH_KEY_MIN_BITS 8 +#define BLOWFISH_KEY_MAX_BITS 576 /* USE_AMD64_ASM indicates whether to use AMD64 assembly code. */ @@ -1018,6 +1020,10 @@ do_bf_setkey (BLOWFISH_context *c, const byte *key, unsigned keylen) if( selftest_failed ) return GPG_ERR_SELFTEST_FAILED; + if (keylen < BLOWFISH_KEY_MIN_BITS / 8 || + keylen > BLOWFISH_KEY_MAX_BITS / 8) + return GPG_ERR_INV_KEYLEN; + memset(hset, 0, sizeof(hset)); for(i=0; i < 16+2; i++ ) diff --git a/doc/gcrypt.texi b/doc/gcrypt.texi index 8b765ba80..d7bfa4c27 100644 --- a/doc/gcrypt.texi +++ b/doc/gcrypt.texi @@ -1538,7 +1538,7 @@ This is the IDEA algorithm. @cindex Triple-DES @cindex DES-EDE @cindex Digital Encryption Standard -Triple-DES with 3 Keys as EDE. The key size of this algorithm is 168 but +Triple-DES with 3 Keys as EDE. The key size of this algorithm is 168 bits but you have to pass 192 bits because the most significant bits of each byte are ignored. @@ -1548,8 +1548,8 @@ CAST128-5 block cipher algorithm. The key size is 128 bits. 
@item GCRY_CIPHER_BLOWFISH @cindex Blowfish -The blowfish algorithm. The current implementation allows only for a key -size of 128 bits. +The blowfish algorithm. The supported key sizes are 8 to 576 bits in +8 bit increments. @item GCRY_CIPHER_SAFER_SK128 Reserved and not currently implemented. diff --git a/tests/basic.c b/tests/basic.c index 3d6e8fc1e..792b7737b 100644 --- a/tests/basic.c +++ b/tests/basic.c @@ -446,6 +446,239 @@ check_aes128_cbc_cts_cipher (void) fprintf (stderr, " Completed AES128 CBC CTS checks.\n"); } +static void +check_ecb_cipher (void) +{ + /* ECB cipher check. Mainly for testing underlying block cipher. */ + static const struct tv + { + int algo; + const char *key; + struct + { + const char *plaintext; + int keylen; + int inlen; + const char *out; + } data[MAX_DATA_LEN]; + } tv[] = + { + /* Test vectors from OpenSSL for key lengths of 8 to 200 bits */ + { GCRY_CIPHER_BLOWFISH, + "\xf0\xe1\xd2\xc3\xb4\xa5\x96\x87\x78\x69\x5a\x4b\x3c\x2d\x1e\x0f" + "\x00\x11\x22\x33\x44\x55\x66\x77\x88", + { { "\xfe\xdc\xba\x98\x76\x54\x32\x10", + 1, + 8, + "\xf9\xad\x59\x7c\x49\xdb\x00\x5e" }, + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", + 2, + 8, + "\xe9\x1d\x21\xc1\xd9\x61\xa6\xd6" }, + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", + 3, + 8, + "\xe9\xc2\xb7\x0a\x1b\xc6\x5c\xf3" }, + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", + 4, + 8, + "\xbe\x1e\x63\x94\x08\x64\x0f\x05" }, + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", + 5, + 8, + "\xb3\x9e\x44\x48\x1b\xdb\x1e\x6e" }, + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", + 6, + 8, + "\x94\x57\xaa\x83\xb1\x92\x8c\x0d" }, + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", + 7, + 8, + "\x8b\xb7\x70\x32\xf9\x60\x62\x9d" }, + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", + 8, + 8, + "\xe8\x7a\x24\x4e\x2c\xc8\x5e\x82" }, + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", + 9, + 8, + "\x15\x75\x0e\x7a\x4f\x4e\xc5\x77" }, + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", + 10, + 8, + "\x12\x2b\xa7\x0b\x3a\xb6\x4a\xe0" }, + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", + 11, + 8, + "\x3a\x83\x3c\x9a\xff\xc5\x37\xf6" }, + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", + 12, + 8, + "\x94\x09\xda\x87\xa9\x0f\x6b\xf2" }, + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", + 13, + 8, + "\x88\x4f\x80\x62\x50\x60\xb8\xb4" }, + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", + 14, + 8, + "\x1f\x85\x03\x1c\x19\xe1\x19\x68" }, + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", + 15, + 8, + "\x79\xd9\x37\x3a\x71\x4c\xa3\x4f" }, + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", + 16, + 8, + "\x93\x14\x28\x87\xee\x3b\xe1\x5c" }, + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", + 17, + 8, + "\x03\x42\x9e\x83\x8c\xe2\xd1\x4b" }, + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", + 18, + 8, + "\xa4\x29\x9e\x27\x46\x9f\xf6\x7b" }, + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", + 19, + 8, + "\xaf\xd5\xae\xd1\xc1\xbc\x96\xa8" }, + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", + 20, + 8, + "\x10\x85\x1c\x0e\x38\x58\xda\x9f" }, + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", + 21, + 8, + "\xe6\xf5\x1e\xd7\x9b\x9d\xb2\x1f" }, + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", + 22, + 8, + "\x64\xa6\xe1\x4a\xfd\x36\xb4\x6f" }, + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", + 23, + 8, + "\x80\xc7\xd7\xd4\x5a\x54\x79\xad" }, + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", + 24, + 8, + "\x05\x04\x4b\x62\xfa\x52\xd0\x80" } + } + }, + /* Test vector from Linux kernel for key length of 448 bits */ + { GCRY_CIPHER_BLOWFISH, + "\xf0\xe1\xd2\xc3\xb4\xa5\x96\x87\x78\x69\x5a\x4b\x3c\x2d\x1e\x0f" + "\x00\x11\x22\x33\x44\x55\x66\x77\x04\x68\x91\x04\xc2\xfd\x3b\x2f" + "\x58\x40\x23\x64\x1a\xba\x61\x76\x1f\x1f\x1f\x1f\x0e\x0e\x0e\x0e" + "\xff\xff\xff\xff\xff\xff\xff\xff", + { { 
"\xfe\xdc\xba\x98\x76\x54\x32\x10", + 56, + 8, + "\xc0\x45\x04\x01\x2e\x4e\x1f\x53" } } + }, + }; + gcry_cipher_hd_t hde, hdd; + unsigned char out[MAX_DATA_LEN]; + int i, j, keylen, algo; + gcry_error_t err = 0; + + if (verbose) + fprintf (stderr, " Starting ECB checks.\n"); + + for (i = 0; i < sizeof (tv) / sizeof (tv[0]); i++) + { + algo = tv[i].algo; + + if (gcry_cipher_test_algo (algo) && in_fips_mode) + { + if (verbose) + fprintf (stderr, " algorithm %d not available in fips mode\n", + algo); + continue; + } + + if (verbose) + fprintf (stderr, " checking ECB mode for %s [%i]\n", + gcry_cipher_algo_name (algo), + algo); + err = gcry_cipher_open (&hde, algo, GCRY_CIPHER_MODE_ECB, 0); + if (!err) + err = gcry_cipher_open (&hdd, algo, GCRY_CIPHER_MODE_ECB, 0); + if (err) + { + fail ("ecb-algo:%d-tv:%d, gcry_cipher_open failed: %s\n", algo, i, + gpg_strerror (err)); + return; + } + + for (j = 0; tv[i].data[j].inlen; j++) + { + keylen = tv[i].data[j].keylen; + if (!keylen) + { + keylen = gcry_cipher_get_algo_keylen(algo); + if (!keylen) + { + fail ("ecb-algo:%d-tv:%d-data:%d, gcry_cipher_get_algo_keylen failed\n", + algo, i, j); + return; + } + } + + err = gcry_cipher_setkey (hde, tv[i].key, keylen); + if (!err) + err = gcry_cipher_setkey (hdd, tv[i].key, keylen); + if (err) + { + fail ("ecb-algo:%d-tv:%d-data:%d, gcry_cipher_setkey failed: %s\n", + algo, i, j, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + + err = gcry_cipher_encrypt (hde, out, MAX_DATA_LEN, + tv[i].data[j].plaintext, + tv[i].data[j].inlen); + if (err) + { + fail ("ecb-algo:%d-tv:%d-data:%d, gcry_cipher_encrypt failed: %s\n", + algo, i, j, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + + if (memcmp (tv[i].data[j].out, out, tv[i].data[j].inlen)) + { + fail ("ecb-algo:%d-tv:%d-data:%d, encrypt mismatch entry\n", + algo, i, j); + } + + err = gcry_cipher_decrypt (hdd, out, tv[i].data[j].inlen, NULL, 0); + if (err) + { + fail ("ecb-algo:%d-tv:%d-data:%d, gcry_cipher_decrypt failed: %s\n", + algo, i, j, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + + if (memcmp (tv[i].data[j].plaintext, out, tv[i].data[j].inlen)) + { + fail ("ecb-algo:%d-tv:%d-data:%d, decrypt mismatch entry\n", + algo, i, j); + } + } + + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + } + if (verbose) + fprintf (stderr, " Completed ECB checks.\n"); +} + static void check_ctr_cipher (void) { @@ -7916,6 +8149,7 @@ check_cipher_modes(void) if (verbose) fprintf (stderr, "Starting Cipher Mode checks.\n"); + check_ecb_cipher (); check_aes128_cbc_cts_cipher (); check_cbc_mac_cipher (); check_ctr_cipher (); From peter at lekensteyn.nl Wed Apr 17 23:50:36 2019 From: peter at lekensteyn.nl (Peter Wu) Date: Wed, 17 Apr 2019 22:50:36 +0100 Subject: [PATCH] Limit and document Blowfish key lengths to 8-576 bits In-Reply-To: <155553217686.31071.12317723959985459255.stgit@localhost.localdomain> References: <155553217686.31071.12317723959985459255.stgit@localhost.localdomain> Message-ID: <20190417215036.GA20903@al> Hi Jussi, Just some notes below on the tests, documentation looks good to me. Additionally, indentation in this file is a bit of a mess with mixed tabs and spaces. Should gcry_cipher_get_algo_keylen be modified as well to return "the maximum supported key length"? Hopefully it does not break stuff that assumed this to be fixed. 
On Wed, Apr 17, 2019 at 11:16:17PM +0300, Jussi Kivilinna wrote: > * cipher/blowfish.c (BLOWFISH_KEY_MIN_BITS) > (BLOWFISH_KEY_MAX_BITS): New. > (do_bf_setkey): Check input key length to MIN_BITS and MAX_BITS. > * doc/gcrypt.texi: Update supported Blowfish key lengths. > * tests/basic.c (check_ecb_cipher): New, with Blowfish test vectors > for different key lengths. > (check_cipher_modes): Call 'check_ecb_cipher'. > -- > > As noted by Peter Wu, Blowfish cipher implementation already supports key > lengths 8 to 576 bits [1]. This change updates documentation to reflect > that and adds new test vectors to check handling of different key lengths. > > [1] https://lists.gnupg.org/pipermail/gcrypt-devel/2019-April/004680.html > > Signed-off-by: Jussi Kivilinna > --- > 0 files changed > > diff --git a/cipher/blowfish.c b/cipher/blowfish.c > index ea6e64a7b..a1d81d310 100644 > --- a/cipher/blowfish.c > +++ b/cipher/blowfish.c > @@ -41,6 +41,8 @@ > #include "cipher-selftest.h" > > #define BLOWFISH_BLOCKSIZE 8 > +#define BLOWFISH_KEY_MIN_BITS 8 > +#define BLOWFISH_KEY_MAX_BITS 576 > > > /* USE_AMD64_ASM indicates whether to use AMD64 assembly code. */ > @@ -1018,6 +1020,10 @@ do_bf_setkey (BLOWFISH_context *c, const byte *key, unsigned keylen) > if( selftest_failed ) > return GPG_ERR_SELFTEST_FAILED; > > + if (keylen < BLOWFISH_KEY_MIN_BITS / 8 || > + keylen > BLOWFISH_KEY_MAX_BITS / 8) > + return GPG_ERR_INV_KEYLEN; > + > memset(hset, 0, sizeof(hset)); > > for(i=0; i < 16+2; i++ ) > diff --git a/doc/gcrypt.texi b/doc/gcrypt.texi > index 8b765ba80..d7bfa4c27 100644 > --- a/doc/gcrypt.texi > +++ b/doc/gcrypt.texi > @@ -1538,7 +1538,7 @@ This is the IDEA algorithm. > @cindex Triple-DES > @cindex DES-EDE > @cindex Digital Encryption Standard > -Triple-DES with 3 Keys as EDE. The key size of this algorithm is 168 but > +Triple-DES with 3 Keys as EDE. The key size of this algorithm is 168 bits but > you have to pass 192 bits because the most significant bits of each byte > are ignored. > > @@ -1548,8 +1548,8 @@ CAST128-5 block cipher algorithm. The key size is 128 bits. > > @item GCRY_CIPHER_BLOWFISH > @cindex Blowfish > -The blowfish algorithm. The current implementation allows only for a key > -size of 128 bits. > +The blowfish algorithm. The supported key sizes are 8 to 576 bits in > +8 bit increments. > > @item GCRY_CIPHER_SAFER_SK128 > Reserved and not currently implemented. > diff --git a/tests/basic.c b/tests/basic.c > index 3d6e8fc1e..792b7737b 100644 > --- a/tests/basic.c > +++ b/tests/basic.c > @@ -446,6 +446,239 @@ check_aes128_cbc_cts_cipher (void) > fprintf (stderr, " Completed AES128 CBC CTS checks.\n"); > } > > +static void > +check_ecb_cipher (void) > +{ > + /* ECB cipher check. Mainly for testing underlying block cipher. 
*/ > + static const struct tv > + { > + int algo; > + const char *key; > + struct > + { > + const char *plaintext; > + int keylen; > + int inlen; > + const char *out; > + } data[MAX_DATA_LEN]; > + } tv[] = > + { > + /* Test vectors from OpenSSL for key lengths of 8 to 200 bits */ > + { GCRY_CIPHER_BLOWFISH, > + "\xf0\xe1\xd2\xc3\xb4\xa5\x96\x87\x78\x69\x5a\x4b\x3c\x2d\x1e\x0f" > + "\x00\x11\x22\x33\x44\x55\x66\x77\x88", > + { { "\xfe\xdc\xba\x98\x76\x54\x32\x10", > + 1, > + 8, > + "\xf9\xad\x59\x7c\x49\xdb\x00\x5e" }, > + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", > + 2, > + 8, > + "\xe9\x1d\x21\xc1\xd9\x61\xa6\xd6" }, > + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", > + 3, > + 8, > + "\xe9\xc2\xb7\x0a\x1b\xc6\x5c\xf3" }, > + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", > + 4, > + 8, > + "\xbe\x1e\x63\x94\x08\x64\x0f\x05" }, > + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", > + 5, > + 8, > + "\xb3\x9e\x44\x48\x1b\xdb\x1e\x6e" }, > + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", > + 6, > + 8, > + "\x94\x57\xaa\x83\xb1\x92\x8c\x0d" }, > + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", > + 7, > + 8, > + "\x8b\xb7\x70\x32\xf9\x60\x62\x9d" }, > + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", > + 8, > + 8, > + "\xe8\x7a\x24\x4e\x2c\xc8\x5e\x82" }, > + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", > + 9, > + 8, > + "\x15\x75\x0e\x7a\x4f\x4e\xc5\x77" }, > + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", > + 10, > + 8, > + "\x12\x2b\xa7\x0b\x3a\xb6\x4a\xe0" }, > + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", > + 11, > + 8, > + "\x3a\x83\x3c\x9a\xff\xc5\x37\xf6" }, > + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", > + 12, > + 8, > + "\x94\x09\xda\x87\xa9\x0f\x6b\xf2" }, > + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", > + 13, > + 8, > + "\x88\x4f\x80\x62\x50\x60\xb8\xb4" }, > + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", > + 14, > + 8, > + "\x1f\x85\x03\x1c\x19\xe1\x19\x68" }, > + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", > + 15, > + 8, > + "\x79\xd9\x37\x3a\x71\x4c\xa3\x4f" }, > + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", > + 16, > + 8, > + "\x93\x14\x28\x87\xee\x3b\xe1\x5c" }, > + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", > + 17, > + 8, > + "\x03\x42\x9e\x83\x8c\xe2\xd1\x4b" }, > + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", > + 18, > + 8, > + "\xa4\x29\x9e\x27\x46\x9f\xf6\x7b" }, > + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", > + 19, > + 8, > + "\xaf\xd5\xae\xd1\xc1\xbc\x96\xa8" }, > + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", > + 20, > + 8, > + "\x10\x85\x1c\x0e\x38\x58\xda\x9f" }, > + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", > + 21, > + 8, > + "\xe6\xf5\x1e\xd7\x9b\x9d\xb2\x1f" }, > + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", > + 22, > + 8, > + "\x64\xa6\xe1\x4a\xfd\x36\xb4\x6f" }, > + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", > + 23, > + 8, > + "\x80\xc7\xd7\xd4\x5a\x54\x79\xad" }, > + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", > + 24, > + 8, > + "\x05\x04\x4b\x62\xfa\x52\xd0\x80" } > + } > + }, > + /* Test vector from Linux kernel for key length of 448 bits */ > + { GCRY_CIPHER_BLOWFISH, > + "\xf0\xe1\xd2\xc3\xb4\xa5\x96\x87\x78\x69\x5a\x4b\x3c\x2d\x1e\x0f" > + "\x00\x11\x22\x33\x44\x55\x66\x77\x04\x68\x91\x04\xc2\xfd\x3b\x2f" > + "\x58\x40\x23\x64\x1a\xba\x61\x76\x1f\x1f\x1f\x1f\x0e\x0e\x0e\x0e" > + "\xff\xff\xff\xff\xff\xff\xff\xff", > + { { "\xfe\xdc\xba\x98\x76\x54\x32\x10", > + 56, > + 8, > + "\xc0\x45\x04\x01\x2e\x4e\x1f\x53" } } > + }, > + }; > + gcry_cipher_hd_t hde, hdd; > + unsigned char out[MAX_DATA_LEN]; > + int i, j, keylen, algo; > + gcry_error_t err = 0; > + > + if (verbose) > + fprintf (stderr, " Starting ECB checks.\n"); > + > + for (i = 0; i < sizeof (tv) / sizeof (tv[0]); i++) > + { > + algo = tv[i].algo; > + > 
+ if (gcry_cipher_test_algo (algo) && in_fips_mode) > + { > + if (verbose) > + fprintf (stderr, " algorithm %d not available in fips mode\n", > + algo); > + continue; > + } > + > + if (verbose) > + fprintf (stderr, " checking ECB mode for %s [%i]\n", > + gcry_cipher_algo_name (algo), > + algo); > + err = gcry_cipher_open (&hde, algo, GCRY_CIPHER_MODE_ECB, 0); > + if (!err) > + err = gcry_cipher_open (&hdd, algo, GCRY_CIPHER_MODE_ECB, 0); > + if (err) > + { > + fail ("ecb-algo:%d-tv:%d, gcry_cipher_open failed: %s\n", algo, i, > + gpg_strerror (err)); You do close the cipher handle below in the error case. For consistency, should you do it here (and below) as well? > + return; > + } > + > + for (j = 0; tv[i].data[j].inlen; j++) The arrays are not terminated with an empty element, this probably trips over a buffer overflow error if you run it with AddressSanitizer. > + { > + keylen = tv[i].data[j].keylen; > + if (!keylen) > + { > + keylen = gcry_cipher_get_algo_keylen(algo); > + if (!keylen) > + { > + fail ("ecb-algo:%d-tv:%d-data:%d, gcry_cipher_get_algo_keylen failed\n", > + algo, i, j); > + return; > + } > + } This check is dead code, the key length is always specified here. > + > + err = gcry_cipher_setkey (hde, tv[i].key, keylen); > + if (!err) > + err = gcry_cipher_setkey (hdd, tv[i].key, keylen); > + if (err) > + { > + fail ("ecb-algo:%d-tv:%d-data:%d, gcry_cipher_setkey failed: %s\n", > + algo, i, j, gpg_strerror (err)); > + gcry_cipher_close (hde); > + gcry_cipher_close (hdd); > + return; > + } > + > + err = gcry_cipher_encrypt (hde, out, MAX_DATA_LEN, > + tv[i].data[j].plaintext, > + tv[i].data[j].inlen); > + if (err) > + { > + fail ("ecb-algo:%d-tv:%d-data:%d, gcry_cipher_encrypt failed: %s\n", > + algo, i, j, gpg_strerror (err)); > + gcry_cipher_close (hde); > + gcry_cipher_close (hdd); > + return; > + } > + > + if (memcmp (tv[i].data[j].out, out, tv[i].data[j].inlen)) > + { > + fail ("ecb-algo:%d-tv:%d-data:%d, encrypt mismatch entry\n", > + algo, i, j); > + } > + > + err = gcry_cipher_decrypt (hdd, out, tv[i].data[j].inlen, NULL, 0); > + if (err) > + { > + fail ("ecb-algo:%d-tv:%d-data:%d, gcry_cipher_decrypt failed: %s\n", > + algo, i, j, gpg_strerror (err)); > + gcry_cipher_close (hde); > + gcry_cipher_close (hdd); > + return; > + } > + > + if (memcmp (tv[i].data[j].plaintext, out, tv[i].data[j].inlen)) > + { > + fail ("ecb-algo:%d-tv:%d-data:%d, decrypt mismatch entry\n", > + algo, i, j); > + } > + } > + > + gcry_cipher_close (hde); > + gcry_cipher_close (hdd); > + } > + if (verbose) > + fprintf (stderr, " Completed ECB checks.\n"); > +} > + > static void > check_ctr_cipher (void) > { > @@ -7916,6 +8149,7 @@ check_cipher_modes(void) > if (verbose) > fprintf (stderr, "Starting Cipher Mode checks.\n"); > > + check_ecb_cipher (); > check_aes128_cbc_cts_cipher (); > check_cbc_mac_cipher (); > check_ctr_cipher (); > > From jussi.kivilinna at iki.fi Thu Apr 18 17:38:48 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Thu, 18 Apr 2019 18:38:48 +0300 Subject: [PATCH] Limit and document Blowfish key lengths to 8-576 bits In-Reply-To: <20190417215036.GA20903@al> References: <155553217686.31071.12317723959985459255.stgit@localhost.localdomain> <20190417215036.GA20903@al> Message-ID: <6fea47bc-24af-cad3-282b-9947accd464e@iki.fi> Hello, On 18.4.2019 0.50, Peter Wu wrote: > Hi Jussi, > > Just some notes below on the tests, documentation looks good to me. > Additionally, indentation in this file is a bit of a mess with mixed > tabs and spaces. 
> > Should gcry_cipher_get_algo_keylen be modified as well to return "the > maximum supported key length"? Hopefully it does not break stuff that > assumed this to be fixed. I think it's better not to change the return value for gcry_cipher_get_algo_keylen as existing users might depend it to stay fixed to 128bits. > > On Wed, Apr 17, 2019 at 11:16:17PM +0300, Jussi Kivilinna wrote: >> * cipher/blowfish.c (BLOWFISH_KEY_MIN_BITS) >> (BLOWFISH_KEY_MAX_BITS): New. >> (do_bf_setkey): Check input key length to MIN_BITS and MAX_BITS. >> * doc/gcrypt.texi: Update supported Blowfish key lengths. >> * tests/basic.c (check_ecb_cipher): New, with Blowfish test vectors >> for different key lengths. >> (check_cipher_modes): Call 'check_ecb_cipher'. >> -- >> >> As noted by Peter Wu, Blowfish cipher implementation already supports key >> lengths 8 to 576 bits [1]. This change updates documentation to reflect >> that and adds new test vectors to check handling of different key lengths. >> >> [1] https://lists.gnupg.org/pipermail/gcrypt-devel/2019-April/004680.html >> >> Signed-off-by: Jussi Kivilinna >> --- >> 0 files changed >> >> diff --git a/cipher/blowfish.c b/cipher/blowfish.c >> index ea6e64a7b..a1d81d310 100644 >> --- a/cipher/blowfish.c >> +++ b/cipher/blowfish.c >> @@ -41,6 +41,8 @@ >> #include "cipher-selftest.h" >> >> #define BLOWFISH_BLOCKSIZE 8 >> +#define BLOWFISH_KEY_MIN_BITS 8 >> +#define BLOWFISH_KEY_MAX_BITS 576 >> >> >> /* USE_AMD64_ASM indicates whether to use AMD64 assembly code. */ >> @@ -1018,6 +1020,10 @@ do_bf_setkey (BLOWFISH_context *c, const byte *key, unsigned keylen) >> if( selftest_failed ) >> return GPG_ERR_SELFTEST_FAILED; >> >> + if (keylen < BLOWFISH_KEY_MIN_BITS / 8 || >> + keylen > BLOWFISH_KEY_MAX_BITS / 8) >> + return GPG_ERR_INV_KEYLEN; >> + >> memset(hset, 0, sizeof(hset)); >> >> for(i=0; i < 16+2; i++ ) >> diff --git a/doc/gcrypt.texi b/doc/gcrypt.texi >> index 8b765ba80..d7bfa4c27 100644 >> --- a/doc/gcrypt.texi >> +++ b/doc/gcrypt.texi >> @@ -1538,7 +1538,7 @@ This is the IDEA algorithm. >> @cindex Triple-DES >> @cindex DES-EDE >> @cindex Digital Encryption Standard >> -Triple-DES with 3 Keys as EDE. The key size of this algorithm is 168 but >> +Triple-DES with 3 Keys as EDE. The key size of this algorithm is 168 bits but >> you have to pass 192 bits because the most significant bits of each byte >> are ignored. >> >> @@ -1548,8 +1548,8 @@ CAST128-5 block cipher algorithm. The key size is 128 bits. >> >> @item GCRY_CIPHER_BLOWFISH >> @cindex Blowfish >> -The blowfish algorithm. The current implementation allows only for a key >> -size of 128 bits. >> +The blowfish algorithm. The supported key sizes are 8 to 576 bits in >> +8 bit increments. >> >> @item GCRY_CIPHER_SAFER_SK128 >> Reserved and not currently implemented. >> diff --git a/tests/basic.c b/tests/basic.c >> index 3d6e8fc1e..792b7737b 100644 >> --- a/tests/basic.c >> +++ b/tests/basic.c >> @@ -446,6 +446,239 @@ check_aes128_cbc_cts_cipher (void) >> fprintf (stderr, " Completed AES128 CBC CTS checks.\n"); >> } >> >> +static void >> +check_ecb_cipher (void) >> +{ >> + /* ECB cipher check. Mainly for testing underlying block cipher. 
*/ >> + static const struct tv >> + { >> + int algo; >> + const char *key; >> + struct >> + { >> + const char *plaintext; >> + int keylen; >> + int inlen; >> + const char *out; >> + } data[MAX_DATA_LEN]; >> + } tv[] = >> + { >> + /* Test vectors from OpenSSL for key lengths of 8 to 200 bits */ >> + { GCRY_CIPHER_BLOWFISH, >> + "\xf0\xe1\xd2\xc3\xb4\xa5\x96\x87\x78\x69\x5a\x4b\x3c\x2d\x1e\x0f" >> + "\x00\x11\x22\x33\x44\x55\x66\x77\x88", >> + { { "\xfe\xdc\xba\x98\x76\x54\x32\x10", >> + 1, >> + 8, >> + "\xf9\xad\x59\x7c\x49\xdb\x00\x5e" }, >> + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", >> + 2, >> + 8, >> + "\xe9\x1d\x21\xc1\xd9\x61\xa6\xd6" }, >> + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", >> + 3, >> + 8, >> + "\xe9\xc2\xb7\x0a\x1b\xc6\x5c\xf3" }, >> + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", >> + 4, >> + 8, >> + "\xbe\x1e\x63\x94\x08\x64\x0f\x05" }, >> + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", >> + 5, >> + 8, >> + "\xb3\x9e\x44\x48\x1b\xdb\x1e\x6e" }, >> + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", >> + 6, >> + 8, >> + "\x94\x57\xaa\x83\xb1\x92\x8c\x0d" }, >> + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", >> + 7, >> + 8, >> + "\x8b\xb7\x70\x32\xf9\x60\x62\x9d" }, >> + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", >> + 8, >> + 8, >> + "\xe8\x7a\x24\x4e\x2c\xc8\x5e\x82" }, >> + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", >> + 9, >> + 8, >> + "\x15\x75\x0e\x7a\x4f\x4e\xc5\x77" }, >> + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", >> + 10, >> + 8, >> + "\x12\x2b\xa7\x0b\x3a\xb6\x4a\xe0" }, >> + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", >> + 11, >> + 8, >> + "\x3a\x83\x3c\x9a\xff\xc5\x37\xf6" }, >> + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", >> + 12, >> + 8, >> + "\x94\x09\xda\x87\xa9\x0f\x6b\xf2" }, >> + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", >> + 13, >> + 8, >> + "\x88\x4f\x80\x62\x50\x60\xb8\xb4" }, >> + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", >> + 14, >> + 8, >> + "\x1f\x85\x03\x1c\x19\xe1\x19\x68" }, >> + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", >> + 15, >> + 8, >> + "\x79\xd9\x37\x3a\x71\x4c\xa3\x4f" }, >> + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", >> + 16, >> + 8, >> + "\x93\x14\x28\x87\xee\x3b\xe1\x5c" }, >> + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", >> + 17, >> + 8, >> + "\x03\x42\x9e\x83\x8c\xe2\xd1\x4b" }, >> + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", >> + 18, >> + 8, >> + "\xa4\x29\x9e\x27\x46\x9f\xf6\x7b" }, >> + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", >> + 19, >> + 8, >> + "\xaf\xd5\xae\xd1\xc1\xbc\x96\xa8" }, >> + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", >> + 20, >> + 8, >> + "\x10\x85\x1c\x0e\x38\x58\xda\x9f" }, >> + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", >> + 21, >> + 8, >> + "\xe6\xf5\x1e\xd7\x9b\x9d\xb2\x1f" }, >> + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", >> + 22, >> + 8, >> + "\x64\xa6\xe1\x4a\xfd\x36\xb4\x6f" }, >> + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", >> + 23, >> + 8, >> + "\x80\xc7\xd7\xd4\x5a\x54\x79\xad" }, >> + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", >> + 24, >> + 8, >> + "\x05\x04\x4b\x62\xfa\x52\xd0\x80" } >> + } >> + }, >> + /* Test vector from Linux kernel for key length of 448 bits */ >> + { GCRY_CIPHER_BLOWFISH, >> + "\xf0\xe1\xd2\xc3\xb4\xa5\x96\x87\x78\x69\x5a\x4b\x3c\x2d\x1e\x0f" >> + "\x00\x11\x22\x33\x44\x55\x66\x77\x04\x68\x91\x04\xc2\xfd\x3b\x2f" >> + "\x58\x40\x23\x64\x1a\xba\x61\x76\x1f\x1f\x1f\x1f\x0e\x0e\x0e\x0e" >> + "\xff\xff\xff\xff\xff\xff\xff\xff", >> + { { "\xfe\xdc\xba\x98\x76\x54\x32\x10", >> + 56, >> + 8, >> + "\xc0\x45\x04\x01\x2e\x4e\x1f\x53" } } >> + }, >> + }; >> + gcry_cipher_hd_t hde, hdd; >> + unsigned char out[MAX_DATA_LEN]; >> + int i, j, keylen, algo; >> + gcry_error_t err = 0; >> + >> + if (verbose) >> + 
fprintf (stderr, " Starting ECB checks.\n"); >> + >> + for (i = 0; i < sizeof (tv) / sizeof (tv[0]); i++) >> + { >> + algo = tv[i].algo; >> + >> + if (gcry_cipher_test_algo (algo) && in_fips_mode) >> + { >> + if (verbose) >> + fprintf (stderr, " algorithm %d not available in fips mode\n", >> + algo); >> + continue; >> + } >> + >> + if (verbose) >> + fprintf (stderr, " checking ECB mode for %s [%i]\n", >> + gcry_cipher_algo_name (algo), >> + algo); >> + err = gcry_cipher_open (&hde, algo, GCRY_CIPHER_MODE_ECB, 0); >> + if (!err) >> + err = gcry_cipher_open (&hdd, algo, GCRY_CIPHER_MODE_ECB, 0); >> + if (err) >> + { >> + fail ("ecb-algo:%d-tv:%d, gcry_cipher_open failed: %s\n", algo, i, >> + gpg_strerror (err)); > > You do close the cipher handle below in the error case. For consistency, > should you do it here (and below) as well? Yes, handles should be closed here too. > >> + return; >> + } >> + >> + for (j = 0; tv[i].data[j].inlen; j++) > > The arrays are not terminated with an empty element, this probably trips > over a buffer overflow error if you run it with AddressSanitizer. Need to add terminating last entry. > >> + { >> + keylen = tv[i].data[j].keylen; > >> + if (!keylen) >> + { >> + keylen = gcry_cipher_get_algo_keylen(algo); >> + if (!keylen) >> + { >> + fail ("ecb-algo:%d-tv:%d-data:%d, gcry_cipher_get_algo_keylen failed\n", >> + algo, i, j); >> + return; >> + } >> + } > > This check is dead code, the key length is always specified here. For now all test vectors specify key length, but if new vectors are add they could use default key length (and also test that gcry_cipher_get_algo_keylen returns expected value). -Jussi From jussi.kivilinna at iki.fi Thu Apr 18 18:30:12 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Thu, 18 Apr 2019 19:30:12 +0300 Subject: [PATCH 1/2] hwf-x86: make stack unwinding work at i386 cpuid functions Message-ID: <155560501207.19038.8971332154736111401.stgit@localhost.localdomain> * src/hwf-x86.c (FORCE_FUNC_FRAME_POINTER): New. [__i386__] (is_cpuid_available): Force use of stack frame pointer as inline assembly modifies stack register; Add 'memory' constraint for inline assembly. [__i386__] (get_cpuid): Avoid push/pop instruction when preserving %ebx register over cpuid. -- Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/src/hwf-x86.c b/src/hwf-x86.c index b644eda1f..796e874f0 100644 --- a/src/hwf-x86.c +++ b/src/hwf-x86.c @@ -39,7 +39,14 @@ #if defined (__i386__) && SIZEOF_UNSIGNED_LONG == 4 && defined (__GNUC__) # define HAS_X86_CPUID 1 -static int +#if _GCRY_GCC_VERSION >= 40700 /* 4.7 */ +# define FORCE_FUNC_FRAME_POINTER \ + __attribute__ ((optimize("no-omit-frame-pointer"))) +#else +# define FORCE_FUNC_FRAME_POINTER +#endif + +static FORCE_FUNC_FRAME_POINTER int is_cpuid_available(void) { int has_cpuid = 0; @@ -63,7 +70,7 @@ is_cpuid_available(void) ".Lno_cpuid%=:\n\t" : "+r" (has_cpuid) : - : "%eax", "%ecx", "cc" + : "%eax", "%ecx", "cc", "memory" ); return has_cpuid; @@ -76,14 +83,14 @@ get_cpuid(unsigned int in, unsigned int *eax, unsigned int *ebx, unsigned int regs[4]; asm volatile - ("pushl %%ebx\n\t" /* Save GOT register. */ - "movl %1, %%ebx\n\t" + ("movl %%ebx, %%edi\n\t" /* Save GOT register. */ + "xorl %%ebx, %%ebx\n\t" "cpuid\n\t" "movl %%ebx, %1\n\t" - "popl %%ebx\n\t" /* Restore GOT register. */ - : "=a" (regs[0]), "=D" (regs[1]), "=c" (regs[2]), "=d" (regs[3]) - : "0" (in), "1" (0), "2" (0), "3" (0) - : "cc" + "movl %%edi, %%ebx\n\t" /* Restore GOT register. 
*/ + : "=a" (regs[0]), "=g" (regs[1]), "=c" (regs[2]), "=d" (regs[3]) + : "0" (in), "2" (0), "3" (0) + : "cc", "edi" ); if (eax) From jussi.kivilinna at iki.fi Thu Apr 18 18:30:17 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Thu, 18 Apr 2019 19:30:17 +0300 Subject: [PATCH 2/2] mpi: make stack unwinding work at i386 mpi functions In-Reply-To: <155560501207.19038.8971332154736111401.stgit@localhost.localdomain> References: <155560501207.19038.8971332154736111401.stgit@localhost.localdomain> Message-ID: <155560501731.19038.16204103468981344016.stgit@localhost.localdomain> * mpi/i386/syntax.h: Include 'config.h'. (CFI_STARTPROC, CFI_ENDPROC, CFI_ADJUST_CFA_OFFSET, CFI_REL_OFFSET) (CFI_RESTORE, CFI_PUSH, CFI_POP): New. * mpi/i386/mpih-add1.S: Add CFI directives. * mpi/i386/mpih-lshift.S: Add CFI directives. * mpi/i386/mpih-mul1.S: Add CFI directives. * mpi/i386/mpih-mul2.S: Add CFI directives. * mpi/i386/mpih-mul3.S: Add CFI directives. * mpi/i386/mpih-rshift.S: Add CFI directives. * mpi/i386/mpih-sub1.S: Add CFI directives. -- Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/mpi/i386/mpih-add1.S b/mpi/i386/mpih-add1.S index 652b23218..32091f340 100644 --- a/mpi/i386/mpih-add1.S +++ b/mpi/i386/mpih-add1.S @@ -44,8 +44,11 @@ ALIGN (3) .globl C_SYMBOL_NAME(_gcry_mpih_add_n) C_SYMBOL_NAME(_gcry_mpih_add_n:) + CFI_STARTPROC() pushl %edi + CFI_PUSH(%edi) pushl %esi + CFI_PUSH(%esi) movl 12(%esp),%edi /* res_ptr */ movl 16(%esp),%esi /* s1_ptr */ @@ -111,6 +114,9 @@ Loop: movl (%esi),%eax negl %eax popl %esi + CFI_POP(%esi) popl %edi + CFI_POP(%edi) ret + CFI_ENDPROC() diff --git a/mpi/i386/mpih-lshift.S b/mpi/i386/mpih-lshift.S index bf8ed9d4c..55da0678d 100644 --- a/mpi/i386/mpih-lshift.S +++ b/mpi/i386/mpih-lshift.S @@ -42,9 +42,13 @@ ALIGN (3) .globl C_SYMBOL_NAME(_gcry_mpih_lshift) C_SYMBOL_NAME(_gcry_mpih_lshift:) + CFI_STARTPROC() pushl %edi + CFI_PUSH(%edi) pushl %esi + CFI_PUSH(%esi) pushl %ebx + CFI_PUSH(%ebx) movl 16(%esp),%edi /* res_ptr */ movl 20(%esp),%esi /* s_ptr */ @@ -88,7 +92,11 @@ Lend: shll %cl,%ebx /* compute least significant limb */ movl %ebx,(%edi) /* store it */ popl %ebx + CFI_POP(%ebx) popl %esi + CFI_POP(%esi) popl %edi + CFI_POP(%edi) ret + CFI_ENDPROC() diff --git a/mpi/i386/mpih-mul1.S b/mpi/i386/mpih-mul1.S index c9760ef92..9679ea622 100644 --- a/mpi/i386/mpih-mul1.S +++ b/mpi/i386/mpih-mul1.S @@ -49,10 +49,15 @@ GLOBL C_SYMBOL_NAME(_gcry_mpih_mul_1) C_SYMBOL_NAME(_gcry_mpih_mul_1:) + CFI_STARTPROC() INSN1(push,l ,R(edi)) + CFI_PUSH(%edi) INSN1(push,l ,R(esi)) + CFI_PUSH(%esi) INSN1(push,l ,R(ebx)) + CFI_PUSH(%ebx) INSN1(push,l ,R(ebp)) + CFI_PUSH(%ebp) INSN2(mov,l ,R(res_ptr),MEM_DISP(esp,20)) INSN2(mov,l ,R(s1_ptr),MEM_DISP(esp,24)) @@ -77,8 +82,13 @@ Loop: INSN2(mov,l ,R(eax),R(ebx)) INSN1(pop,l ,R(ebp)) + CFI_POP(%ebp) INSN1(pop,l ,R(ebx)) + CFI_POP(%ebx) INSN1(pop,l ,R(esi)) + CFI_POP(%esi) INSN1(pop,l ,R(edi)) + CFI_POP(%edi) ret + CFI_ENDPROC() diff --git a/mpi/i386/mpih-mul2.S b/mpi/i386/mpih-mul2.S index 9794e1108..fe4129c43 100644 --- a/mpi/i386/mpih-mul2.S +++ b/mpi/i386/mpih-mul2.S @@ -50,10 +50,15 @@ GLOBL C_SYMBOL_NAME(_gcry_mpih_addmul_1) C_SYMBOL_NAME(_gcry_mpih_addmul_1:) + CFI_STARTPROC() INSN1(push,l ,R(edi)) + CFI_PUSH(%edi) INSN1(push,l ,R(esi)) + CFI_PUSH(%esi) INSN1(push,l ,R(ebx)) + CFI_PUSH(%ebx) INSN1(push,l ,R(ebp)) + CFI_PUSH(%ebp) INSN2(mov,l ,R(res_ptr),MEM_DISP(esp,20)) INSN2(mov,l ,R(s1_ptr),MEM_DISP(esp,24)) @@ -79,8 +84,13 @@ Loop: INSN2(mov,l ,R(eax),R(ebx)) INSN1(pop,l ,R(ebp)) + CFI_POP(%ebp) 
INSN1(pop,l ,R(ebx)) + CFI_POP(%ebx) INSN1(pop,l ,R(esi)) + CFI_POP(%esi) INSN1(pop,l ,R(edi)) + CFI_POP(%edi) ret + CFI_ENDPROC() diff --git a/mpi/i386/mpih-mul3.S b/mpi/i386/mpih-mul3.S index 6df201763..87577d54c 100644 --- a/mpi/i386/mpih-mul3.S +++ b/mpi/i386/mpih-mul3.S @@ -50,10 +50,15 @@ GLOBL C_SYMBOL_NAME(_gcry_mpih_submul_1) C_SYMBOL_NAME(_gcry_mpih_submul_1:) + CFI_STARTPROC() INSN1(push,l ,R(edi)) + CFI_PUSH(%edi) INSN1(push,l ,R(esi)) + CFI_PUSH(%esi) INSN1(push,l ,R(ebx)) + CFI_PUSH(%ebx) INSN1(push,l ,R(ebp)) + CFI_PUSH(%ebp) INSN2(mov,l ,R(res_ptr),MEM_DISP(esp,20)) INSN2(mov,l ,R(s1_ptr),MEM_DISP(esp,24)) @@ -79,8 +84,13 @@ Loop: INSN2(mov,l ,R(eax),R(ebx)) INSN1(pop,l ,R(ebp)) + CFI_POP(%ebp) INSN1(pop,l ,R(ebx)) + CFI_POP(%ebx) INSN1(pop,l ,R(esi)) + CFI_POP(%esi) INSN1(pop,l ,R(edi)) + CFI_POP(%edi) ret + CFI_ENDPROC() diff --git a/mpi/i386/mpih-rshift.S b/mpi/i386/mpih-rshift.S index 2920e55d8..35a8201f3 100644 --- a/mpi/i386/mpih-rshift.S +++ b/mpi/i386/mpih-rshift.S @@ -43,9 +43,13 @@ ALIGN (3) .globl C_SYMBOL_NAME(_gcry_mpih_rshift) C_SYMBOL_NAME(_gcry_mpih_rshift:) + CFI_STARTPROC() pushl %edi + CFI_PUSH(%edi) pushl %esi + CFI_PUSH(%esi) pushl %ebx + CFI_PUSH(%ebx) movl 16(%esp),%edi /* wp */ movl 20(%esp),%esi /* up */ @@ -67,7 +71,7 @@ C_SYMBOL_NAME(_gcry_mpih_rshift:) movl %ebx,%eax ALIGN (3) -Loop2: movl (%esi,%edx,4),%ebx /* load next higher limb */ +Loop2: movl (%esi,%edx,4),%ebx /* load next higher limb */ shrdl %cl,%ebx,%eax /* compute result limb */ movl %eax,(%edi,%edx,4) /* store it */ incl %edx @@ -91,7 +95,11 @@ Lend2: shrl %cl,%ebx /* compute most significant limb */ movl %ebx,(%edi) /* store it */ popl %ebx + CFI_POP(%ebx) popl %esi + CFI_POP(%esi) popl %edi + CFI_POP(%edi) ret + CFI_ENDPROC() diff --git a/mpi/i386/mpih-sub1.S b/mpi/i386/mpih-sub1.S index f447f7a66..501c4a9fd 100644 --- a/mpi/i386/mpih-sub1.S +++ b/mpi/i386/mpih-sub1.S @@ -45,8 +45,11 @@ ALIGN (3) .globl C_SYMBOL_NAME(_gcry_mpih_sub_n) C_SYMBOL_NAME(_gcry_mpih_sub_n:) + CFI_STARTPROC() pushl %edi + CFI_PUSH(%edi) pushl %esi + CFI_PUSH(%esi) movl 12(%esp),%edi /* res_ptr */ movl 16(%esp),%esi /* s1_ptr */ @@ -112,6 +115,9 @@ Loop: movl (%esi),%eax negl %eax popl %esi + CFI_POP(%esi) popl %edi + CFI_POP(%edi) ret + CFI_ENDPROC() diff --git a/mpi/i386/syntax.h b/mpi/i386/syntax.h index 39ede988f..9101585a8 100644 --- a/mpi/i386/syntax.h +++ b/mpi/i386/syntax.h @@ -26,6 +26,30 @@ * to avoid revealing of sensitive data due to paging etc. */ +#include + +#ifdef HAVE_GCC_ASM_CFI_DIRECTIVES +# define CFI_STARTPROC() .cfi_startproc +# define CFI_ENDPROC() .cfi_endproc +# define CFI_ADJUST_CFA_OFFSET(off) .cfi_adjust_cfa_offset off +# define CFI_REL_OFFSET(reg,off) .cfi_rel_offset reg, off +# define CFI_RESTORE(reg) .cfi_restore reg + +# define CFI_PUSH(reg) \ + CFI_ADJUST_CFA_OFFSET(4); CFI_REL_OFFSET(reg, 0) +# define CFI_POP(reg) \ + CFI_ADJUST_CFA_OFFSET(-4); CFI_RESTORE(reg) +#else +# define CFI_STARTPROC() +# define CFI_ENDPROC() +# define CFI_ADJUST_CFA_OFFSET(off) +# define CFI_REL_OFFSET(reg,off) +# define CFI_RESTORE(reg) + +# define CFI_PUSH(reg) +# define CFI_POP(reg) +#endif + #undef ALIGN #if defined (BSD_SYNTAX) || defined (ELF_SYNTAX) From jussi.kivilinna at iki.fi Tue Apr 16 22:04:23 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Tue, 16 Apr 2019 23:04:23 +0300 Subject: [PATCH] Add CFI unwind assembly directives for AMD64 Message-ID: <155544506311.19850.12838764214519531613.stgit@localhost.localdomain> * configure.ac (gcry_cv_gcc_asm_cfi_directives): New. 
* cipher/asm-common-amd64.h (ADD_RIP, CFI_STARTPROC, CFI_ENDPROC) (CFI_REMEMBER_STATE, CFI_RESTORE_STATE, CFI_ADJUST_CFA_OFFSET) (CFI_REL_OFFSET, CFI_DEF_CFA_REGISTER, CFI_REGISTER, CFI_RESTORE) (CFI_PUSH, CFI_POP, CFI_POP_TMP_REG, CFI_LEAVE, DW_REGNO) (DW_SLEB128_7BIT, DW_SLEB128_28BIT, CFI_CFA_ON_STACK) (CFI_REG_ON_STACK): New. (ENTER_SYSV_FUNCPARAMS_0_4, EXIT_SYSV_FUNC): Add CFI directives. * cipher/arcfour-amd64.S: Add CFI directives. * cipher/blake2b-amd64-avx2.S: Add CFI directives. * cipher/blake2s-amd64-avx.S: Add CFI directives. * cipher/blowfish-amd64.S: Add CFI directives. * cipher/camellia-aesni-avx-amd64.S: Add CFI directives; Use 'asm-common-amd64.h'. * cipher/camellia-aesni-avx2-amd64.S: Add CFI directives; Use 'asm-common-amd64.h'. * cipher/cast5-amd64.S: Add CFI directives. * cipher/chacha20-amd64-avx2.S: Add CFI directives. * cipher/chacha20-amd64-ssse3.S: Add CFI directives. * cipher/des-amd64.S: Add CFI directives. * cipher/rijndael-amd64.S: Add CFI directives. * cipher/rijndael-ssse3-amd64-asm.S: Add CFI directives. * cipher/salsa20-amd64.S: Add CFI directives; Use 'asm-common-amd64.h'. * cipher/serpent-avx2-amd64.S: Add CFI directives; Use 'asm-common-amd64.h'. * cipher/serpent-sse2-amd64.S: Add CFI directives; Use 'asm-common-amd64.h'. * cipher/sha1-avx-amd64.S: Add CFI directives; Use 'asm-common-amd64.h'. * cipher/sha1-avx-bmi2-amd64.S: Add CFI directives; Use 'asm-common-amd64.h'. * cipher/sha1-avx2-bmi2-amd64.S: Add CFI directives; Use 'asm-common-amd64.h'. * cipher/sha1-ssse3-amd64.S: Add CFI directives; Use 'asm-common-amd64.h'. * cipher/sha256-avx-amd64.S: Add CFI directives; Use 'asm-common-amd64.h'. * cipher/sha256-avx2-bmi2-amd64.S: Add CFI directives; Use 'asm-common-amd64.h'. * cipher/sha256-ssse3-amd64.S: Add CFI directives; Use 'asm-common-amd64.h'. * cipher/sha512-avx-amd64.S: Add CFI directives; Use 'asm-common-amd64.h'. * cipher/sha512-avx2-bmi2-amd64.S: Add CFI directives; Use 'asm-common-amd64.h'. * cipher/sha512-ssse3-amd64.S: Add CFI directives; Use 'asm-common-amd64.h'. * cipher/twofish-amd64.S: Add CFI directives. * cipher/twofish-avx2-amd64.S: Add CFI directives; Use 'asm-common-amd64.h'. * cipher/whirlpool-sse2-amd64.S: Add CFI directives; Use 'asm-common-amd64.h'. * mpi/amd64/func_abi.h: Include 'config.h'. (CFI_STARTPROC, CFI_ENDPROC, CFI_ADJUST_CFA_OFFSET, CFI_REL_OFFSET) (CFI_RESTORE, CFI_PUSH, CFI_POP): New. (FUNC_ENTRY, FUNC_EXIT): Add CFI directives. -- This commit adds CFI directives that add DWARF unwinding information for debugger to backtrace when executing code from AMD64 assembly files. 
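To make the effect of these macros concrete, here is a minimal sketch of what CFI_PUSH()/CFI_POP() expand to around a single callee-saved register when HAVE_GCC_ASM_CFI_DIRECTIVES is defined. The function name and body below are made up purely for illustration; only the .cfi_* directives themselves are the standard GNU as ones used by the macros:

	.text
	.globl	example_fn
example_fn:
	.cfi_startproc
	pushq	%rbx
	.cfi_adjust_cfa_offset 8	/* the push moved %rsp down by 8, so the
					   CFA is now 16 bytes above %rsp */
	.cfi_rel_offset %rbx, 0		/* caller's %rbx is saved at (%rsp) */

	xorl	%eax, %eax		/* dummy function body */

	popq	%rbx
	.cfi_adjust_cfa_offset -8	/* CFA is back to 8 bytes above %rsp */
	.cfi_restore %rbx		/* %rbx holds the caller's value again */
	ret
	.cfi_endproc

When HAVE_GCC_ASM_CFI_DIRECTIVES is not defined, all of the macros expand to nothing, so assemblers without .cfi support still produce the same machine code, just without the DWARF unwind data.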
Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/cipher/arcfour-amd64.S b/cipher/arcfour-amd64.S index c08f3453b..221dfeff7 100644 --- a/cipher/arcfour-amd64.S +++ b/cipher/arcfour-amd64.S @@ -25,9 +25,12 @@ .globl _gcry_arcfour_amd64 ELF(.type _gcry_arcfour_amd64, at function) _gcry_arcfour_amd64: + CFI_STARTPROC() ENTER_SYSV_FUNC_PARAMS_0_4 push %rbp + CFI_PUSH(%rbp) push %rbx + CFI_PUSH(%rbx) mov %rdi, %rbp # key = ARG(key) mov %rsi, %rbx # rbx = ARG(len) mov %rdx, %rsi # in = ARG(in) @@ -92,9 +95,12 @@ _gcry_arcfour_amd64: movb %cl, (4*256)(%rbp) # key->y = y movb %dl, (4*256+4)(%rbp) # key->x = x pop %rbx + CFI_POP(%rbx) pop %rbp + CFI_POP(%rbp) EXIT_SYSV_FUNC ret + CFI_ENDPROC() .L__gcry_arcfour_amd64_end: ELF(.size _gcry_arcfour_amd64,.L__gcry_arcfour_amd64_end-_gcry_arcfour_amd64) diff --git a/cipher/asm-common-amd64.h b/cipher/asm-common-amd64.h index 7eb426495..9d4a028a0 100644 --- a/cipher/asm-common-amd64.h +++ b/cipher/asm-common-amd64.h @@ -41,6 +41,12 @@ # define RIP #endif +#ifdef __PIC__ +# define ADD_RIP +rip +#else +# define ADD_RIP +#endif + #if defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS) || !defined(__PIC__) # define GET_EXTERN_POINTER(name, reg) movabsq $name, reg #else @@ -60,10 +66,101 @@ # endif #endif +#ifdef HAVE_GCC_ASM_CFI_DIRECTIVES +/* CFI directives to emit DWARF stack unwinding information. */ +# define CFI_STARTPROC() .cfi_startproc +# define CFI_ENDPROC() .cfi_endproc +# define CFI_REMEMBER_STATE() .cfi_remember_state +# define CFI_RESTORE_STATE() .cfi_restore_state +# define CFI_ADJUST_CFA_OFFSET(off) .cfi_adjust_cfa_offset off +# define CFI_REL_OFFSET(reg,off) .cfi_rel_offset reg, off +# define CFI_DEF_CFA_REGISTER(reg) .cfi_def_cfa_register reg +# define CFI_REGISTER(ro,rn) .cfi_register ro, rn +# define CFI_RESTORE(reg) .cfi_restore reg + +# define CFI_PUSH(reg) \ + CFI_ADJUST_CFA_OFFSET(8); CFI_REL_OFFSET(reg, 0) +# define CFI_POP(reg) \ + CFI_ADJUST_CFA_OFFSET(-8); CFI_RESTORE(reg) +# define CFI_POP_TMP_REG() \ + CFI_ADJUST_CFA_OFFSET(-8); +# define CFI_LEAVE() \ + CFI_ADJUST_CFA_OFFSET(-8); CFI_DEF_CFA_REGISTER(%rsp) + +/* CFA expressions are used for pointing CFA and registers to + * %rsp relative offsets. */ +# define DW_REGNO_rax 0 +# define DW_REGNO_rdx 1 +# define DW_REGNO_rcx 2 +# define DW_REGNO_rbx 3 +# define DW_REGNO_rsi 4 +# define DW_REGNO_rdi 5 +# define DW_REGNO_rbp 6 +# define DW_REGNO_rsp 7 +# define DW_REGNO_r8 8 +# define DW_REGNO_r9 9 +# define DW_REGNO_r10 10 +# define DW_REGNO_r11 11 +# define DW_REGNO_r12 12 +# define DW_REGNO_r13 13 +# define DW_REGNO_r14 14 +# define DW_REGNO_r15 15 + +# define DW_REGNO(reg) DW_REGNO_ ## reg + +/* Fixed length encoding used for integers for now. 
*/ +# define DW_SLEB128_7BIT(value) \ + 0x00|((value) & 0x7f) +# define DW_SLEB128_28BIT(value) \ + 0x80|((value)&0x7f), \ + 0x80|(((value)>>7)&0x7f), \ + 0x80|(((value)>>14)&0x7f), \ + 0x00|(((value)>>21)&0x7f) + +# define CFI_CFA_ON_STACK(rsp_offs,cfa_depth) \ + .cfi_escape \ + 0x0f, /* DW_CFA_def_cfa_expression */ \ + DW_SLEB128_7BIT(11), /* length */ \ + 0x77, /* DW_OP_breg7, rsp + constant */ \ + DW_SLEB128_28BIT(rsp_offs), \ + 0x06, /* DW_OP_deref */ \ + 0x23, /* DW_OP_plus_constu */ \ + DW_SLEB128_28BIT((cfa_depth)+8) + +# define CFI_REG_ON_STACK(reg,rsp_offs) \ + .cfi_escape \ + 0x10, /* DW_CFA_expression */ \ + DW_SLEB128_7BIT(DW_REGNO(reg)), \ + DW_SLEB128_7BIT(5), /* length */ \ + 0x77, /* DW_OP_breg7, rsp + constant */ \ + DW_SLEB128_28BIT(rsp_offs) + +#else +# define CFI_STARTPROC() +# define CFI_ENDPROC() +# define CFI_REMEMBER_STATE() +# define CFI_RESTORE_STATE() +# define CFI_ADJUST_CFA_OFFSET(off) +# define CFI_REL_OFFSET(reg,off) +# define CFI_DEF_CFA_REGISTER(reg) +# define CFI_REGISTER(ro,rn) +# define CFI_RESTORE(reg) + +# define CFI_PUSH(reg) +# define CFI_POP(reg) +# define CFI_POP_TMP_REG() +# define CFI_LEAVE() + +# define CFI_CFA_ON_STACK(rsp_offs,cfa_depth) +# define CFI_REG_ON_STACK(reg,rsp_offs) +#endif + #ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS # define ENTER_SYSV_FUNC_PARAMS_0_4 \ pushq %rdi; \ + CFI_PUSH(%rdi); \ pushq %rsi; \ + CFI_PUSH(%rsi); \ movq %rcx, %rdi; \ movq %rdx, %rsi; \ movq %r8, %rdx; \ @@ -79,7 +176,9 @@ # define EXIT_SYSV_FUNC \ popq %rsi; \ - popq %rdi; + CFI_POP(%rsi); \ + popq %rdi; \ + CFI_POP(%rdi); #else # define ENTER_SYSV_FUNC_PARAMS_0_4 # define ENTER_SYSV_FUNC_PARAMS_5 diff --git a/cipher/blake2b-amd64-avx2.S b/cipher/blake2b-amd64-avx2.S index 6bcc5652d..08c816cdf 100644 --- a/cipher/blake2b-amd64-avx2.S +++ b/cipher/blake2b-amd64-avx2.S @@ -207,6 +207,7 @@ _gcry_blake2b_transform_amd64_avx2: * %rsi: blks * %rdx: num_blks */ + CFI_STARTPROC(); vzeroupper; @@ -291,6 +292,7 @@ _gcry_blake2b_transform_amd64_avx2: xor %eax, %eax; vzeroall; ret; + CFI_ENDPROC(); ELF(.size _gcry_blake2b_transform_amd64_avx2, .-_gcry_blake2b_transform_amd64_avx2;) diff --git a/cipher/blake2s-amd64-avx.S b/cipher/blake2s-amd64-avx.S index f7312dbd0..198373262 100644 --- a/cipher/blake2s-amd64-avx.S +++ b/cipher/blake2s-amd64-avx.S @@ -191,6 +191,7 @@ _gcry_blake2s_transform_amd64_avx: * %rsi: blks * %rdx: num_blks */ + CFI_STARTPROC(); vzeroupper; @@ -269,6 +270,7 @@ _gcry_blake2s_transform_amd64_avx: xor %eax, %eax; vzeroall; ret; + CFI_ENDPROC(); ELF(.size _gcry_blake2s_transform_amd64_avx, .-_gcry_blake2s_transform_amd64_avx;) diff --git a/cipher/blowfish-amd64.S b/cipher/blowfish-amd64.S index 02d3b7102..bdb361d7e 100644 --- a/cipher/blowfish-amd64.S +++ b/cipher/blowfish-amd64.S @@ -133,7 +133,9 @@ __blowfish_enc_blk1: * output: * RX0: output plaintext block */ + CFI_STARTPROC(); movq %rbp, %r11; + CFI_REGISTER(%rbp, %r11); load_roundkey_enc(0); round_enc(2); @@ -147,8 +149,10 @@ __blowfish_enc_blk1: add_roundkey_enc(); movq %r11, %rbp; + CFI_RESTORE(%rbp) ret; + CFI_ENDPROC(); ELF(.size __blowfish_enc_blk1,.-__blowfish_enc_blk1;) .align 8 @@ -161,6 +165,7 @@ _gcry_blowfish_amd64_do_encrypt: * %rsi: u32 *ret_xl * %rdx: u32 *ret_xr */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 movl (%rdx), RX0d; @@ -178,6 +183,7 @@ _gcry_blowfish_amd64_do_encrypt: EXIT_SYSV_FUNC ret; + CFI_ENDPROC(); ELF(.size _gcry_blowfish_amd64_do_encrypt,.-_gcry_blowfish_amd64_do_encrypt;) .align 8 @@ -190,6 +196,7 @@ _gcry_blowfish_amd64_encrypt_block: * %rsi: dst * %rdx: 
src */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 movq %rsi, %r10; @@ -204,6 +211,7 @@ _gcry_blowfish_amd64_encrypt_block: EXIT_SYSV_FUNC ret; + CFI_ENDPROC(); ELF(.size _gcry_blowfish_amd64_encrypt_block,.-_gcry_blowfish_amd64_encrypt_block;) .align 8 @@ -216,9 +224,11 @@ _gcry_blowfish_amd64_decrypt_block: * %rsi: dst * %rdx: src */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 movq %rbp, %r11; + CFI_REGISTER(%rbp, %r11); movq %rsi, %r10; movq %rdx, RIO; @@ -240,9 +250,11 @@ _gcry_blowfish_amd64_decrypt_block: write_block(); movq %r11, %rbp; + CFI_RESTORE(%rbp); EXIT_SYSV_FUNC ret; + CFI_ENDPROC(); ELF(.size _gcry_blowfish_amd64_decrypt_block,.-_gcry_blowfish_amd64_decrypt_block;) /********************************************************************** @@ -340,6 +352,7 @@ __blowfish_enc_blk4: * output: * RX0,RX1,RX2,RX3: four output ciphertext blocks */ + CFI_STARTPROC(); preload_roundkey_enc(0); round_enc4(0); @@ -355,6 +368,7 @@ __blowfish_enc_blk4: outbswap_block4(); ret; + CFI_ENDPROC(); ELF(.size __blowfish_enc_blk4,.-__blowfish_enc_blk4;) .align 8 @@ -367,6 +381,7 @@ __blowfish_dec_blk4: * output: * RX0,RX1,RX2,RX3: four output plaintext blocks */ + CFI_STARTPROC(); preload_roundkey_dec(17); inbswap_block4(); @@ -384,6 +399,7 @@ __blowfish_dec_blk4: outbswap_block4(); ret; + CFI_ENDPROC(); ELF(.size __blowfish_dec_blk4,.-__blowfish_dec_blk4;) .align 8 @@ -396,12 +412,17 @@ _gcry_blowfish_amd64_ctr_enc: * %rdx: src (4 blocks) * %rcx: iv (big endian, 64bit) */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 pushq %rbp; + CFI_PUSH(%rbp); pushq %rbx; + CFI_PUSH(%rbx); pushq %r12; + CFI_PUSH(%r12); pushq %r13; + CFI_PUSH(%r13); /* %r11-%r13 are not used by __blowfish_enc_blk4 */ movq %rcx, %r13; /*iv*/ @@ -438,12 +459,17 @@ _gcry_blowfish_amd64_ctr_enc: movq RX3, 3 * 8(%r11); popq %r13; + CFI_POP(%r13); popq %r12; + CFI_POP(%r12); popq %rbx; + CFI_POP(%rbx); popq %rbp; + CFI_POP(%rbp); EXIT_SYSV_FUNC ret; + CFI_ENDPROC(); ELF(.size _gcry_blowfish_amd64_ctr_enc,.-_gcry_blowfish_amd64_ctr_enc;) .align 8 @@ -456,12 +482,17 @@ _gcry_blowfish_amd64_cbc_dec: * %rdx: src (4 blocks) * %rcx: iv (64bit) */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 pushq %rbp; + CFI_PUSH(%rbp); pushq %rbx; + CFI_PUSH(%rbx); pushq %r12; + CFI_PUSH(%r12); pushq %r13; + CFI_PUSH(%r13); /* %r11-%r13 are not used by __blowfish_dec_blk4 */ movq %rsi, %r11; /*dst*/ @@ -489,12 +520,17 @@ _gcry_blowfish_amd64_cbc_dec: movq RX3, 3 * 8(%r11); popq %r13; + CFI_POP(%r13); popq %r12; + CFI_POP(%r12); popq %rbx; + CFI_POP(%rbx); popq %rbp; + CFI_POP(%rbp); EXIT_SYSV_FUNC ret; + CFI_ENDPROC(); ELF(.size _gcry_blowfish_amd64_cbc_dec,.-_gcry_blowfish_amd64_cbc_dec;) .align 8 @@ -507,12 +543,17 @@ _gcry_blowfish_amd64_cfb_dec: * %rdx: src (4 blocks) * %rcx: iv (64bit) */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 pushq %rbp; + CFI_PUSH(%rbp); pushq %rbx; + CFI_PUSH(%rbx); pushq %r12; + CFI_PUSH(%r12); pushq %r13; + CFI_PUSH(%r13); /* %r11-%r13 are not used by __blowfish_enc_blk4 */ movq %rcx, %r13; /*iv*/ @@ -543,12 +584,17 @@ _gcry_blowfish_amd64_cfb_dec: movq RX3, 3 * 8(%r11); popq %r13; + CFI_POP(%r13); popq %r12; + CFI_POP(%r12); popq %rbx; + CFI_POP(%rbx); popq %rbp; + CFI_POP(%rbp); EXIT_SYSV_FUNC ret; + CFI_ENDPROC(); ELF(.size _gcry_blowfish_amd64_cfb_dec,.-_gcry_blowfish_amd64_cfb_dec;) #endif /*defined(USE_BLOWFISH)*/ diff --git a/cipher/camellia-aesni-avx-amd64.S b/cipher/camellia-aesni-avx-amd64.S index 8022934fb..e16d4f613 100644 --- a/cipher/camellia-aesni-avx-amd64.S +++ b/cipher/camellia-aesni-avx-amd64.S @@ 
-24,17 +24,7 @@ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX_SUPPORT) -#ifdef __PIC__ -# define RIP (%rip) -#else -# define RIP -#endif - -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) /*_*/ -#endif +#include "asm-common-amd64.h" #define CAMELLIA_TABLE_BYTE_LEN 272 @@ -75,10 +65,10 @@ /* \ * S-function with AES subbytes \ */ \ - vmovdqa .Linv_shift_row RIP, t4; \ - vbroadcastss .L0f0f0f0f RIP, t7; \ - vmovdqa .Lpre_tf_lo_s1 RIP, t0; \ - vmovdqa .Lpre_tf_hi_s1 RIP, t1; \ + vmovdqa .Linv_shift_row rRIP, t4; \ + vbroadcastss .L0f0f0f0f rRIP, t7; \ + vmovdqa .Lpre_tf_lo_s1 rRIP, t0; \ + vmovdqa .Lpre_tf_hi_s1 rRIP, t1; \ \ /* AES inverse shift rows */ \ vpshufb t4, x0, x0; \ @@ -91,8 +81,8 @@ vpshufb t4, x6, x6; \ \ /* prefilter sboxes 1, 2 and 3 */ \ - vmovdqa .Lpre_tf_lo_s4 RIP, t2; \ - vmovdqa .Lpre_tf_hi_s4 RIP, t3; \ + vmovdqa .Lpre_tf_lo_s4 rRIP, t2; \ + vmovdqa .Lpre_tf_hi_s4 rRIP, t3; \ filter_8bit(x0, t0, t1, t7, t6); \ filter_8bit(x7, t0, t1, t7, t6); \ filter_8bit(x1, t0, t1, t7, t6); \ @@ -106,8 +96,8 @@ filter_8bit(x6, t2, t3, t7, t6); \ \ /* AES subbytes + AES shift rows */ \ - vmovdqa .Lpost_tf_lo_s1 RIP, t0; \ - vmovdqa .Lpost_tf_hi_s1 RIP, t1; \ + vmovdqa .Lpost_tf_lo_s1 rRIP, t0; \ + vmovdqa .Lpost_tf_hi_s1 rRIP, t1; \ vaesenclast t4, x0, x0; \ vaesenclast t4, x7, x7; \ vaesenclast t4, x1, x1; \ @@ -118,16 +108,16 @@ vaesenclast t4, x6, x6; \ \ /* postfilter sboxes 1 and 4 */ \ - vmovdqa .Lpost_tf_lo_s3 RIP, t2; \ - vmovdqa .Lpost_tf_hi_s3 RIP, t3; \ + vmovdqa .Lpost_tf_lo_s3 rRIP, t2; \ + vmovdqa .Lpost_tf_hi_s3 rRIP, t3; \ filter_8bit(x0, t0, t1, t7, t6); \ filter_8bit(x7, t0, t1, t7, t6); \ filter_8bit(x3, t0, t1, t7, t6); \ filter_8bit(x6, t0, t1, t7, t6); \ \ /* postfilter sbox 3 */ \ - vmovdqa .Lpost_tf_lo_s2 RIP, t4; \ - vmovdqa .Lpost_tf_hi_s2 RIP, t5; \ + vmovdqa .Lpost_tf_lo_s2 rRIP, t4; \ + vmovdqa .Lpost_tf_hi_s2 rRIP, t5; \ filter_8bit(x2, t2, t3, t7, t6); \ filter_8bit(x5, t2, t3, t7, t6); \ \ @@ -442,7 +432,7 @@ transpose_4x4(c0, c1, c2, c3, a0, a1); \ transpose_4x4(d0, d1, d2, d3, a0, a1); \ \ - vmovdqu .Lshufb_16x16b RIP, a0; \ + vmovdqu .Lshufb_16x16b rRIP, a0; \ vmovdqu st1, a1; \ vpshufb a0, a2, a2; \ vpshufb a0, a3, a3; \ @@ -508,7 +498,7 @@ vpunpcklwd t1, t3, e; \ vpunpckhwd t1, t3, f; \ \ - vmovdqa .Ltranspose_8x8_shuf RIP, t3; \ + vmovdqa .Ltranspose_8x8_shuf rRIP, t3; \ \ vpunpcklwd g, c, d; \ vpunpckhwd g, c, c; \ @@ -540,7 +530,7 @@ #define inpack16_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, rio, key) \ vmovq key, x0; \ - vpshufb .Lpack_bswap RIP, x0, x0; \ + vpshufb .Lpack_bswap rRIP, x0, x0; \ \ vpxor 0 * 16(rio), x0, y7; \ vpxor 1 * 16(rio), x0, y6; \ @@ -591,7 +581,7 @@ vmovdqu x0, stack_tmp0; \ \ vmovq key, x0; \ - vpshufb .Lpack_bswap RIP, x0, x0; \ + vpshufb .Lpack_bswap rRIP, x0, x0; \ \ vpxor x0, y7, y7; \ vpxor x0, y6, y6; \ @@ -786,6 +776,7 @@ __camellia_enc_blk16: * %xmm0..%xmm15: 16 encrypted blocks, order swapped: * 7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 */ + CFI_STARTPROC(); leaq 8 * 16(%rax), %rcx; @@ -859,6 +850,7 @@ __camellia_enc_blk16: %xmm15, %rax, %rcx, 24); jmp .Lenc_done; + CFI_ENDPROC(); ELF(.size __camellia_enc_blk16,.-__camellia_enc_blk16;) .align 8 @@ -874,6 +866,7 @@ __camellia_dec_blk16: * %xmm0..%xmm15: 16 plaintext blocks, order swapped: * 7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 */ + CFI_STARTPROC(); leaq 8 * 16(%rax), %rcx; @@ -944,6 +937,7 @@ 
__camellia_dec_blk16: ((key_table + (24) * 8) + 4)(CTX)); jmp .Ldec_max24; + CFI_ENDPROC(); ELF(.size __camellia_dec_blk16,.-__camellia_dec_blk16;) #define inc_le128(x, minus_one, tmp) \ @@ -963,9 +957,12 @@ _gcry_camellia_aesni_avx_ctr_enc: * %rdx: src (16 blocks) * %rcx: iv (big endian, 128bit) */ + CFI_STARTPROC(); pushq %rbp; + CFI_PUSH(%rbp); movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); vzeroupper; @@ -973,7 +970,7 @@ _gcry_camellia_aesni_avx_ctr_enc: andq $~31, %rsp; movq %rsp, %rax; - vmovdqa .Lbswap128_mask RIP, %xmm14; + vmovdqa .Lbswap128_mask rRIP, %xmm14; /* load IV and byteswap */ vmovdqu (%rcx), %xmm15; @@ -1018,12 +1015,12 @@ _gcry_camellia_aesni_avx_ctr_enc: vmovdqa %xmm0, %xmm13; vpshufb %xmm14, %xmm0, %xmm0; inc_le128(%xmm13, %xmm15, %xmm14); - vpshufb .Lbswap128_mask RIP, %xmm13, %xmm13; /* le => be */ + vpshufb .Lbswap128_mask rRIP, %xmm13, %xmm13; /* le => be */ vmovdqu %xmm13, (%rcx); /* inpack16_pre: */ vmovq (key_table)(CTX), %xmm15; - vpshufb .Lpack_bswap RIP, %xmm15, %xmm15; + vpshufb .Lpack_bswap rRIP, %xmm15, %xmm15; vpxor %xmm0, %xmm15, %xmm0; vpxor %xmm1, %xmm15, %xmm1; vpxor %xmm2, %xmm15, %xmm2; @@ -1067,7 +1064,9 @@ _gcry_camellia_aesni_avx_ctr_enc: vzeroall; leave; + CFI_LEAVE(); ret; + CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx_ctr_enc,.-_gcry_camellia_aesni_avx_ctr_enc;) .align 8 @@ -1081,9 +1080,12 @@ _gcry_camellia_aesni_avx_cbc_dec: * %rdx: src (16 blocks) * %rcx: iv */ + CFI_STARTPROC(); pushq %rbp; + CFI_PUSH(%rbp); movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); vzeroupper; @@ -1135,7 +1137,9 @@ _gcry_camellia_aesni_avx_cbc_dec: vzeroall; leave; + CFI_LEAVE(); ret; + CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx_cbc_dec,.-_gcry_camellia_aesni_avx_cbc_dec;) .align 8 @@ -1149,9 +1153,12 @@ _gcry_camellia_aesni_avx_cfb_dec: * %rdx: src (16 blocks) * %rcx: iv */ + CFI_STARTPROC(); pushq %rbp; + CFI_PUSH(%rbp); movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); vzeroupper; @@ -1161,7 +1168,7 @@ _gcry_camellia_aesni_avx_cfb_dec: /* inpack16_pre: */ vmovq (key_table)(CTX), %xmm0; - vpshufb .Lpack_bswap RIP, %xmm0, %xmm0; + vpshufb .Lpack_bswap rRIP, %xmm0, %xmm0; vpxor (%rcx), %xmm0, %xmm15; vmovdqu 15 * 16(%rdx), %xmm1; vmovdqu %xmm1, (%rcx); /* store new IV */ @@ -1207,7 +1214,9 @@ _gcry_camellia_aesni_avx_cfb_dec: vzeroall; leave; + CFI_LEAVE(); ret; + CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx_cfb_dec,.-_gcry_camellia_aesni_avx_cfb_dec;) .align 8 @@ -1223,9 +1232,12 @@ _gcry_camellia_aesni_avx_ocb_enc: * %r8 : checksum * %r9 : L pointers (void *L[16]) */ + CFI_STARTPROC(); pushq %rbp; + CFI_PUSH(%rbp); movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); vzeroupper; @@ -1233,10 +1245,14 @@ _gcry_camellia_aesni_avx_ocb_enc: andq $~31, %rsp; movq %rsp, %rax; - movq %r10, (16 * 16 + 0 * 8)(%rax); - movq %r11, (16 * 16 + 1 * 8)(%rax); - movq %r12, (16 * 16 + 2 * 8)(%rax); - movq %r13, (16 * 16 + 3 * 8)(%rax); + movq %r10, (16 * 16 + 0 * 8)(%rsp); + movq %r11, (16 * 16 + 1 * 8)(%rsp); + movq %r12, (16 * 16 + 2 * 8)(%rsp); + movq %r13, (16 * 16 + 3 * 8)(%rsp); + CFI_REG_ON_STACK(r10, 16 * 16 + 0 * 8); + CFI_REG_ON_STACK(r11, 16 * 16 + 1 * 8); + CFI_REG_ON_STACK(r12, 16 * 16 + 2 * 8); + CFI_REG_ON_STACK(r13, 16 * 16 + 3 * 8); vmovdqu (%rcx), %xmm14; vmovdqu (%r8), %xmm15; @@ -1292,7 +1308,7 @@ _gcry_camellia_aesni_avx_ocb_enc: /* inpack16_pre: */ vmovq (key_table)(CTX), %xmm15; - vpshufb .Lpack_bswap RIP, %xmm15, %xmm15; + vpshufb .Lpack_bswap rRIP, %xmm15, %xmm15; vpxor %xmm0, %xmm15, %xmm0; vpxor %xmm1, %xmm15, %xmm1; vpxor %xmm2, %xmm15, %xmm2; 
@@ -1335,13 +1351,19 @@ _gcry_camellia_aesni_avx_ocb_enc: vzeroall; - movq (16 * 16 + 0 * 8)(%rax), %r10; - movq (16 * 16 + 1 * 8)(%rax), %r11; - movq (16 * 16 + 2 * 8)(%rax), %r12; - movq (16 * 16 + 3 * 8)(%rax), %r13; + movq (16 * 16 + 0 * 8)(%rsp), %r10; + movq (16 * 16 + 1 * 8)(%rsp), %r11; + movq (16 * 16 + 2 * 8)(%rsp), %r12; + movq (16 * 16 + 3 * 8)(%rsp), %r13; + CFI_RESTORE(%r10); + CFI_RESTORE(%r11); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); leave; + CFI_LEAVE(); ret; + CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx_ocb_enc,.-_gcry_camellia_aesni_avx_ocb_enc;) .align 8 @@ -1357,9 +1379,12 @@ _gcry_camellia_aesni_avx_ocb_dec: * %r8 : checksum * %r9 : L pointers (void *L[16]) */ + CFI_STARTPROC(); pushq %rbp; + CFI_PUSH(%rbp); movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); vzeroupper; @@ -1367,10 +1392,14 @@ _gcry_camellia_aesni_avx_ocb_dec: andq $~31, %rsp; movq %rsp, %rax; - movq %r10, (16 * 16 + 0 * 8)(%rax); - movq %r11, (16 * 16 + 1 * 8)(%rax); - movq %r12, (16 * 16 + 2 * 8)(%rax); - movq %r13, (16 * 16 + 3 * 8)(%rax); + movq %r10, (16 * 16 + 0 * 8)(%rsp); + movq %r11, (16 * 16 + 1 * 8)(%rsp); + movq %r12, (16 * 16 + 2 * 8)(%rsp); + movq %r13, (16 * 16 + 3 * 8)(%rsp); + CFI_REG_ON_STACK(r10, 16 * 16 + 0 * 8); + CFI_REG_ON_STACK(r11, 16 * 16 + 1 * 8); + CFI_REG_ON_STACK(r12, 16 * 16 + 2 * 8); + CFI_REG_ON_STACK(r13, 16 * 16 + 3 * 8); vmovdqu (%rcx), %xmm15; @@ -1428,7 +1457,7 @@ _gcry_camellia_aesni_avx_ocb_dec: /* inpack16_pre: */ vmovq (key_table)(CTX, %r8, 8), %xmm15; - vpshufb .Lpack_bswap RIP, %xmm15, %xmm15; + vpshufb .Lpack_bswap rRIP, %xmm15, %xmm15; vpxor %xmm0, %xmm15, %xmm0; vpxor %xmm1, %xmm15, %xmm1; vpxor %xmm2, %xmm15, %xmm2; @@ -1493,13 +1522,19 @@ _gcry_camellia_aesni_avx_ocb_dec: vzeroall; - movq (16 * 16 + 0 * 8)(%rax), %r10; - movq (16 * 16 + 1 * 8)(%rax), %r11; - movq (16 * 16 + 2 * 8)(%rax), %r12; - movq (16 * 16 + 3 * 8)(%rax), %r13; + movq (16 * 16 + 0 * 8)(%rsp), %r10; + movq (16 * 16 + 1 * 8)(%rsp), %r11; + movq (16 * 16 + 2 * 8)(%rsp), %r12; + movq (16 * 16 + 3 * 8)(%rsp), %r13; + CFI_RESTORE(%r10); + CFI_RESTORE(%r11); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); leave; + CFI_LEAVE(); ret; + CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx_ocb_dec,.-_gcry_camellia_aesni_avx_ocb_dec;) .align 8 @@ -1514,9 +1549,12 @@ _gcry_camellia_aesni_avx_ocb_auth: * %rcx: checksum * %r8 : L pointers (void *L[16]) */ + CFI_STARTPROC(); pushq %rbp; + CFI_PUSH(%rbp); movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); vzeroupper; @@ -1524,10 +1562,14 @@ _gcry_camellia_aesni_avx_ocb_auth: andq $~31, %rsp; movq %rsp, %rax; - movq %r10, (16 * 16 + 0 * 8)(%rax); - movq %r11, (16 * 16 + 1 * 8)(%rax); - movq %r12, (16 * 16 + 2 * 8)(%rax); - movq %r13, (16 * 16 + 3 * 8)(%rax); + movq %r10, (16 * 16 + 0 * 8)(%rsp); + movq %r11, (16 * 16 + 1 * 8)(%rsp); + movq %r12, (16 * 16 + 2 * 8)(%rsp); + movq %r13, (16 * 16 + 3 * 8)(%rsp); + CFI_REG_ON_STACK(r10, 16 * 16 + 0 * 8); + CFI_REG_ON_STACK(r11, 16 * 16 + 1 * 8); + CFI_REG_ON_STACK(r12, 16 * 16 + 2 * 8); + CFI_REG_ON_STACK(r13, 16 * 16 + 3 * 8); vmovdqu (%rdx), %xmm15; @@ -1580,7 +1622,7 @@ _gcry_camellia_aesni_avx_ocb_auth: /* inpack16_pre: */ vmovq (key_table)(CTX), %xmm15; - vpshufb .Lpack_bswap RIP, %xmm15, %xmm15; + vpshufb .Lpack_bswap rRIP, %xmm15, %xmm15; vpxor %xmm0, %xmm15, %xmm0; vpxor %xmm1, %xmm15, %xmm1; vpxor %xmm2, %xmm15, %xmm2; @@ -1623,13 +1665,19 @@ _gcry_camellia_aesni_avx_ocb_auth: vzeroall; - movq (16 * 16 + 0 * 8)(%rax), %r10; - movq (16 * 16 + 1 * 8)(%rax), %r11; - movq (16 * 16 + 2 * 8)(%rax), %r12; - movq 
(16 * 16 + 3 * 8)(%rax), %r13; + movq (16 * 16 + 0 * 8)(%rsp), %r10; + movq (16 * 16 + 1 * 8)(%rsp), %r11; + movq (16 * 16 + 2 * 8)(%rsp), %r12; + movq (16 * 16 + 3 * 8)(%rsp), %r13; + CFI_RESTORE(%r10); + CFI_RESTORE(%r11); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); leave; + CFI_LEAVE(); ret; + CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx_ocb_auth,.-_gcry_camellia_aesni_avx_ocb_auth;) /* @@ -1657,8 +1705,8 @@ ELF(.size _gcry_camellia_aesni_avx_ocb_auth,.-_gcry_camellia_aesni_avx_ocb_auth; vpand sbox4mask, t0, t0; \ vpor t0, x, x; \ \ - vmovdqa .Lpost_tf_lo_s1 RIP, t0; \ - vmovdqa .Lpost_tf_hi_s1 RIP, t1; \ + vmovdqa .Lpost_tf_lo_s1 rRIP, t0; \ + vmovdqa .Lpost_tf_hi_s1 rRIP, t1; \ \ /* prefilter sboxes */ \ filter_8bit(x, pre_s1lo_mask, pre_s1hi_mask, _0f0f0f0fmask, t2); \ @@ -1672,18 +1720,18 @@ ELF(.size _gcry_camellia_aesni_avx_ocb_auth,.-_gcry_camellia_aesni_avx_ocb_auth; /* output rotation for sbox2 (<<< 1) */ \ /* output rotation for sbox3 (>>> 1) */ \ vpshufb inv_shift_row, x, t1; \ - vpshufb .Lsp0044440444044404mask RIP, x, t4; \ - vpshufb .Lsp1110111010011110mask RIP, x, x; \ + vpshufb .Lsp0044440444044404mask rRIP, x, t4; \ + vpshufb .Lsp1110111010011110mask rRIP, x, x; \ vpaddb t1, t1, t2; \ vpsrlw $7, t1, t0; \ vpsllw $7, t1, t3; \ vpor t0, t2, t0; \ vpsrlw $1, t1, t1; \ - vpshufb .Lsp0222022222000222mask RIP, t0, t0; \ + vpshufb .Lsp0222022222000222mask rRIP, t0, t0; \ vpor t1, t3, t1; \ \ vpxor x, t4, t4; \ - vpshufb .Lsp3033303303303033mask RIP, t1, t1; \ + vpshufb .Lsp3033303303303033mask rRIP, t1, t1; \ vpxor t4, t0, t0; \ vpxor t1, t0, t0; \ vpsrldq $8, t0, x; \ @@ -1741,17 +1789,19 @@ __camellia_avx_setup128: * %rdi: ctx, CTX; subkey storage at key_table(CTX) * %xmm0: key */ + CFI_STARTPROC(); + #define cmll_sub(n, ctx) (key_table+((n)*8))(ctx) #define KL128 %xmm0 #define KA128 %xmm2 - vpshufb .Lbswap128_mask RIP, KL128, KL128; + vpshufb .Lbswap128_mask rRIP, KL128, KL128; - vmovdqa .Linv_shift_row_and_unpcklbw RIP, %xmm11; - vmovq .Lsbox4_input_mask RIP, %xmm12; - vbroadcastss .L0f0f0f0f RIP, %xmm13; - vmovdqa .Lpre_tf_lo_s1 RIP, %xmm14; - vmovdqa .Lpre_tf_hi_s1 RIP, %xmm15; + vmovdqa .Linv_shift_row_and_unpcklbw rRIP, %xmm11; + vmovq .Lsbox4_input_mask rRIP, %xmm12; + vbroadcastss .L0f0f0f0f rRIP, %xmm13; + vmovdqa .Lpre_tf_lo_s1 rRIP, %xmm14; + vmovdqa .Lpre_tf_hi_s1 rRIP, %xmm15; /* * Generate KA @@ -1763,18 +1813,18 @@ __camellia_avx_setup128: camellia_f(%xmm2, %xmm4, %xmm1, %xmm5, %xmm6, %xmm7, %xmm8, - %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma1 RIP); + %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma1 rRIP); vpxor %xmm4, %xmm3, %xmm3; camellia_f(%xmm3, %xmm2, %xmm1, %xmm5, %xmm6, %xmm7, %xmm8, - %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma2 RIP); + %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma2 rRIP); camellia_f(%xmm2, %xmm3, %xmm1, %xmm5, %xmm6, %xmm7, %xmm8, - %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma3 RIP); + %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma3 rRIP); vpxor %xmm4, %xmm3, %xmm3; camellia_f(%xmm3, %xmm4, %xmm1, %xmm5, %xmm6, %xmm7, %xmm8, - %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma4 RIP); + %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma4 rRIP); vpslldq $8, %xmm3, %xmm3; vpxor %xmm4, %xmm2, %xmm2; @@ -2076,6 +2126,7 @@ __camellia_avx_setup128: vzeroall; ret; + CFI_ENDPROC(); ELF(.size __camellia_avx_setup128,.-__camellia_avx_setup128;) .align 8 @@ -2086,19 +2137,21 @@ __camellia_avx_setup256: * %rdi: ctx, CTX; subkey storage at key_table(CTX) * %xmm0 & %xmm1: key */ + CFI_STARTPROC(); + #define KL128 %xmm0 #define KR128 
%xmm1 #define KA128 %xmm2 #define KB128 %xmm3 - vpshufb .Lbswap128_mask RIP, KL128, KL128; - vpshufb .Lbswap128_mask RIP, KR128, KR128; + vpshufb .Lbswap128_mask rRIP, KL128, KL128; + vpshufb .Lbswap128_mask rRIP, KR128, KR128; - vmovdqa .Linv_shift_row_and_unpcklbw RIP, %xmm11; - vmovq .Lsbox4_input_mask RIP, %xmm12; - vbroadcastss .L0f0f0f0f RIP, %xmm13; - vmovdqa .Lpre_tf_lo_s1 RIP, %xmm14; - vmovdqa .Lpre_tf_hi_s1 RIP, %xmm15; + vmovdqa .Linv_shift_row_and_unpcklbw rRIP, %xmm11; + vmovq .Lsbox4_input_mask rRIP, %xmm12; + vbroadcastss .L0f0f0f0f rRIP, %xmm13; + vmovdqa .Lpre_tf_lo_s1 rRIP, %xmm14; + vmovdqa .Lpre_tf_hi_s1 rRIP, %xmm15; /* * Generate KA @@ -2111,20 +2164,20 @@ __camellia_avx_setup256: camellia_f(%xmm2, %xmm4, %xmm5, %xmm7, %xmm8, %xmm9, %xmm10, - %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma1 RIP); + %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma1 rRIP); vpxor %xmm4, %xmm3, %xmm3; camellia_f(%xmm3, %xmm2, %xmm5, %xmm7, %xmm8, %xmm9, %xmm10, - %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma2 RIP); + %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma2 rRIP); vpxor %xmm6, %xmm2, %xmm2; camellia_f(%xmm2, %xmm3, %xmm5, %xmm7, %xmm8, %xmm9, %xmm10, - %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma3 RIP); + %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma3 rRIP); vpxor %xmm4, %xmm3, %xmm3; vpxor KR128, %xmm3, %xmm3; camellia_f(%xmm3, %xmm4, %xmm5, %xmm7, %xmm8, %xmm9, %xmm10, - %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma4 RIP); + %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma4 rRIP); vpslldq $8, %xmm3, %xmm3; vpxor %xmm4, %xmm2, %xmm2; @@ -2142,12 +2195,12 @@ __camellia_avx_setup256: camellia_f(%xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9, %xmm10, - %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma5 RIP); + %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma5 rRIP); vpxor %xmm5, %xmm3, %xmm3; camellia_f(%xmm3, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9, %xmm10, - %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma6 RIP); + %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma6 rRIP); vpslldq $8, %xmm3, %xmm3; vpxor %xmm5, %xmm4, %xmm4; vpsrldq $8, %xmm3, %xmm3; @@ -2553,6 +2606,7 @@ __camellia_avx_setup256: vzeroall; ret; + CFI_ENDPROC(); ELF(.size __camellia_avx_setup256,.-__camellia_avx_setup256;) .align 8 @@ -2565,6 +2619,7 @@ _gcry_camellia_aesni_avx_keygen: * %rsi: key * %rdx: keylen */ + CFI_STARTPROC(); vzeroupper; @@ -2585,6 +2640,7 @@ _gcry_camellia_aesni_avx_keygen: vpor %xmm2, %xmm1, %xmm1; jmp __camellia_avx_setup256; + CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx_keygen,.-_gcry_camellia_aesni_avx_keygen;) #endif /*defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX_SUPPORT)*/ diff --git a/cipher/camellia-aesni-avx2-amd64.S b/cipher/camellia-aesni-avx2-amd64.S index 897e4aeec..cc01c7743 100644 --- a/cipher/camellia-aesni-avx2-amd64.S +++ b/cipher/camellia-aesni-avx2-amd64.S @@ -24,17 +24,7 @@ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT) -#ifdef __PIC__ -# define RIP (%rip) -#else -# define RIP -#endif - -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) 
/*_*/ -#endif +#include "asm-common-amd64.h" #define CAMELLIA_TABLE_BYTE_LEN 272 @@ -92,12 +82,12 @@ /* \ * S-function with AES subbytes \ */ \ - vbroadcasti128 .Linv_shift_row RIP, t4; \ - vpbroadcastd .L0f0f0f0f RIP, t7; \ - vbroadcasti128 .Lpre_tf_lo_s1 RIP, t5; \ - vbroadcasti128 .Lpre_tf_hi_s1 RIP, t6; \ - vbroadcasti128 .Lpre_tf_lo_s4 RIP, t2; \ - vbroadcasti128 .Lpre_tf_hi_s4 RIP, t3; \ + vbroadcasti128 .Linv_shift_row rRIP, t4; \ + vpbroadcastd .L0f0f0f0f rRIP, t7; \ + vbroadcasti128 .Lpre_tf_lo_s1 rRIP, t5; \ + vbroadcasti128 .Lpre_tf_hi_s1 rRIP, t6; \ + vbroadcasti128 .Lpre_tf_lo_s4 rRIP, t2; \ + vbroadcasti128 .Lpre_tf_hi_s4 rRIP, t3; \ \ /* AES inverse shift rows */ \ vpshufb t4, x0, x0; \ @@ -143,8 +133,8 @@ vinserti128 $1, t2##_x, x6, x6; \ vextracti128 $1, x1, t3##_x; \ vextracti128 $1, x4, t2##_x; \ - vbroadcasti128 .Lpost_tf_lo_s1 RIP, t0; \ - vbroadcasti128 .Lpost_tf_hi_s1 RIP, t1; \ + vbroadcasti128 .Lpost_tf_lo_s1 rRIP, t0; \ + vbroadcasti128 .Lpost_tf_hi_s1 rRIP, t1; \ vaesenclast t4##_x, x2##_x, x2##_x; \ vaesenclast t4##_x, t6##_x, t6##_x; \ vaesenclast t4##_x, x5##_x, x5##_x; \ @@ -159,16 +149,16 @@ vinserti128 $1, t2##_x, x4, x4; \ \ /* postfilter sboxes 1 and 4 */ \ - vbroadcasti128 .Lpost_tf_lo_s3 RIP, t2; \ - vbroadcasti128 .Lpost_tf_hi_s3 RIP, t3; \ + vbroadcasti128 .Lpost_tf_lo_s3 rRIP, t2; \ + vbroadcasti128 .Lpost_tf_hi_s3 rRIP, t3; \ filter_8bit(x0, t0, t1, t7, t4); \ filter_8bit(x7, t0, t1, t7, t4); \ filter_8bit(x3, t0, t1, t7, t6); \ filter_8bit(x6, t0, t1, t7, t6); \ \ /* postfilter sbox 3 */ \ - vbroadcasti128 .Lpost_tf_lo_s2 RIP, t4; \ - vbroadcasti128 .Lpost_tf_hi_s2 RIP, t5; \ + vbroadcasti128 .Lpost_tf_lo_s2 rRIP, t4; \ + vbroadcasti128 .Lpost_tf_hi_s2 rRIP, t5; \ filter_8bit(x2, t2, t3, t7, t6); \ filter_8bit(x5, t2, t3, t7, t6); \ \ @@ -485,7 +475,7 @@ transpose_4x4(c0, c1, c2, c3, a0, a1); \ transpose_4x4(d0, d1, d2, d3, a0, a1); \ \ - vbroadcasti128 .Lshufb_16x16b RIP, a0; \ + vbroadcasti128 .Lshufb_16x16b rRIP, a0; \ vmovdqu st1, a1; \ vpshufb a0, a2, a2; \ vpshufb a0, a3, a3; \ @@ -524,7 +514,7 @@ #define inpack32_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, rio, key) \ vpbroadcastq key, x0; \ - vpshufb .Lpack_bswap RIP, x0, x0; \ + vpshufb .Lpack_bswap rRIP, x0, x0; \ \ vpxor 0 * 32(rio), x0, y7; \ vpxor 1 * 32(rio), x0, y6; \ @@ -575,7 +565,7 @@ vmovdqu x0, stack_tmp0; \ \ vpbroadcastq key, x0; \ - vpshufb .Lpack_bswap RIP, x0, x0; \ + vpshufb .Lpack_bswap rRIP, x0, x0; \ \ vpxor x0, y7, y7; \ vpxor x0, y6, y6; \ @@ -765,6 +755,7 @@ __camellia_enc_blk32: * %ymm0..%ymm15: 32 encrypted blocks, order swapped: * 7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 */ + CFI_STARTPROC(); leaq 8 * 32(%rax), %rcx; @@ -838,6 +829,7 @@ __camellia_enc_blk32: %ymm15, %rax, %rcx, 24); jmp .Lenc_done; + CFI_ENDPROC(); ELF(.size __camellia_enc_blk32,.-__camellia_enc_blk32;) .align 8 @@ -853,6 +845,7 @@ __camellia_dec_blk32: * %ymm0..%ymm15: 16 plaintext blocks, order swapped: * 7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 */ + CFI_STARTPROC(); leaq 8 * 32(%rax), %rcx; @@ -923,6 +916,7 @@ __camellia_dec_blk32: ((key_table + (24) * 8) + 4)(CTX)); jmp .Ldec_max24; + CFI_ENDPROC(); ELF(.size __camellia_dec_blk32,.-__camellia_dec_blk32;) #define inc_le128(x, minus_one, tmp) \ @@ -942,9 +936,12 @@ _gcry_camellia_aesni_avx2_ctr_enc: * %rdx: src (32 blocks) * %rcx: iv (big endian, 128bit) */ + CFI_STARTPROC(); pushq %rbp; + CFI_PUSH(%rbp); movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); movq 8(%rcx), %r11; bswapq %r11; @@ 
-960,10 +957,10 @@ _gcry_camellia_aesni_avx2_ctr_enc: /* load IV and byteswap */ vmovdqu (%rcx), %xmm0; - vpshufb .Lbswap128_mask RIP, %xmm0, %xmm0; + vpshufb .Lbswap128_mask rRIP, %xmm0, %xmm0; vmovdqa %xmm0, %xmm1; inc_le128(%xmm0, %xmm15, %xmm14); - vbroadcasti128 .Lbswap128_mask RIP, %ymm14; + vbroadcasti128 .Lbswap128_mask rRIP, %ymm14; vinserti128 $1, %xmm0, %ymm1, %ymm0; vpshufb %ymm14, %ymm0, %ymm13; vmovdqu %ymm13, 15 * 32(%rax); @@ -1064,14 +1061,14 @@ _gcry_camellia_aesni_avx2_ctr_enc: vextracti128 $1, %ymm0, %xmm13; vpshufb %ymm14, %ymm0, %ymm0; inc_le128(%xmm13, %xmm15, %xmm14); - vpshufb .Lbswap128_mask RIP, %xmm13, %xmm13; + vpshufb .Lbswap128_mask rRIP, %xmm13, %xmm13; vmovdqu %xmm13, (%rcx); .align 4 .Lload_ctr_done: /* inpack16_pre: */ vpbroadcastq (key_table)(CTX), %ymm15; - vpshufb .Lpack_bswap RIP, %ymm15, %ymm15; + vpshufb .Lpack_bswap rRIP, %ymm15, %ymm15; vpxor %ymm0, %ymm15, %ymm0; vpxor %ymm1, %ymm15, %ymm1; vpxor %ymm2, %ymm15, %ymm2; @@ -1116,7 +1113,9 @@ _gcry_camellia_aesni_avx2_ctr_enc: vzeroall; leave; + CFI_LEAVE(); ret; + CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx2_ctr_enc,.-_gcry_camellia_aesni_avx2_ctr_enc;) .align 8 @@ -1130,9 +1129,12 @@ _gcry_camellia_aesni_avx2_cbc_dec: * %rdx: src (32 blocks) * %rcx: iv */ + CFI_STARTPROC(); pushq %rbp; + CFI_PUSH(%rbp); movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); vzeroupper; @@ -1188,7 +1190,9 @@ _gcry_camellia_aesni_avx2_cbc_dec: vzeroall; leave; + CFI_LEAVE(); ret; + CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx2_cbc_dec,.-_gcry_camellia_aesni_avx2_cbc_dec;) .align 8 @@ -1202,9 +1206,12 @@ _gcry_camellia_aesni_avx2_cfb_dec: * %rdx: src (32 blocks) * %rcx: iv */ + CFI_STARTPROC(); pushq %rbp; + CFI_PUSH(%rbp); movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); vzeroupper; @@ -1214,7 +1221,7 @@ _gcry_camellia_aesni_avx2_cfb_dec: /* inpack16_pre: */ vpbroadcastq (key_table)(CTX), %ymm0; - vpshufb .Lpack_bswap RIP, %ymm0, %ymm0; + vpshufb .Lpack_bswap rRIP, %ymm0, %ymm0; vmovdqu (%rcx), %xmm15; vinserti128 $1, (%rdx), %ymm15, %ymm15; vpxor %ymm15, %ymm0, %ymm15; @@ -1262,7 +1269,9 @@ _gcry_camellia_aesni_avx2_cfb_dec: vzeroall; leave; + CFI_LEAVE(); ret; + CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx2_cfb_dec,.-_gcry_camellia_aesni_avx2_cfb_dec;) .align 8 @@ -1278,9 +1287,12 @@ _gcry_camellia_aesni_avx2_ocb_enc: * %r8 : checksum * %r9 : L pointers (void *L[32]) */ + CFI_STARTPROC(); pushq %rbp; + CFI_PUSH(%rbp); movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); vzeroupper; @@ -1288,10 +1300,14 @@ _gcry_camellia_aesni_avx2_ocb_enc: andq $~63, %rsp; movq %rsp, %rax; - movq %r10, (16 * 32 + 0 * 8)(%rax); - movq %r11, (16 * 32 + 1 * 8)(%rax); - movq %r12, (16 * 32 + 2 * 8)(%rax); - movq %r13, (16 * 32 + 3 * 8)(%rax); + movq %r10, (16 * 32 + 0 * 8)(%rsp); + movq %r11, (16 * 32 + 1 * 8)(%rsp); + movq %r12, (16 * 32 + 2 * 8)(%rsp); + movq %r13, (16 * 32 + 3 * 8)(%rsp); + CFI_REG_ON_STACK(r10, 16 * 32 + 0 * 8); + CFI_REG_ON_STACK(r11, 16 * 32 + 1 * 8); + CFI_REG_ON_STACK(r12, 16 * 32 + 2 * 8); + CFI_REG_ON_STACK(r13, 16 * 32 + 3 * 8); vmovdqu (%rcx), %xmm14; vmovdqu (%r8), %xmm13; @@ -1369,7 +1385,7 @@ _gcry_camellia_aesni_avx2_ocb_enc: /* inpack16_pre: */ vpbroadcastq (key_table)(CTX), %ymm15; - vpshufb .Lpack_bswap RIP, %ymm15, %ymm15; + vpshufb .Lpack_bswap rRIP, %ymm15, %ymm15; vpxor %ymm0, %ymm15, %ymm0; vpxor %ymm1, %ymm15, %ymm1; vpxor %ymm2, %ymm15, %ymm2; @@ -1412,13 +1428,19 @@ _gcry_camellia_aesni_avx2_ocb_enc: vzeroall; - movq (16 * 32 + 0 * 8)(%rax), %r10; - movq (16 * 32 + 1 * 8)(%rax), %r11; - movq 
(16 * 32 + 2 * 8)(%rax), %r12; - movq (16 * 32 + 3 * 8)(%rax), %r13; + movq (16 * 32 + 0 * 8)(%rsp), %r10; + movq (16 * 32 + 1 * 8)(%rsp), %r11; + movq (16 * 32 + 2 * 8)(%rsp), %r12; + movq (16 * 32 + 3 * 8)(%rsp), %r13; + CFI_RESTORE(%r10); + CFI_RESTORE(%r11); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); leave; + CFI_LEAVE(); ret; + CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx2_ocb_enc,.-_gcry_camellia_aesni_avx2_ocb_enc;) .align 8 @@ -1434,9 +1456,12 @@ _gcry_camellia_aesni_avx2_ocb_dec: * %r8 : checksum * %r9 : L pointers (void *L[32]) */ + CFI_STARTPROC(); pushq %rbp; + CFI_PUSH(%rbp); movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); vzeroupper; @@ -1444,10 +1469,14 @@ _gcry_camellia_aesni_avx2_ocb_dec: andq $~63, %rsp; movq %rsp, %rax; - movq %r10, (16 * 32 + 0 * 8)(%rax); - movq %r11, (16 * 32 + 1 * 8)(%rax); - movq %r12, (16 * 32 + 2 * 8)(%rax); - movq %r13, (16 * 32 + 3 * 8)(%rax); + movq %r10, (16 * 32 + 0 * 8)(%rsp); + movq %r11, (16 * 32 + 1 * 8)(%rsp); + movq %r12, (16 * 32 + 2 * 8)(%rsp); + movq %r13, (16 * 32 + 3 * 8)(%rsp); + CFI_REG_ON_STACK(r10, 16 * 32 + 0 * 8); + CFI_REG_ON_STACK(r11, 16 * 32 + 1 * 8); + CFI_REG_ON_STACK(r12, 16 * 32 + 2 * 8); + CFI_REG_ON_STACK(r13, 16 * 32 + 3 * 8); vmovdqu (%rcx), %xmm14; @@ -1525,7 +1554,7 @@ _gcry_camellia_aesni_avx2_ocb_dec: /* inpack16_pre: */ vpbroadcastq (key_table)(CTX, %r8, 8), %ymm15; - vpshufb .Lpack_bswap RIP, %ymm15, %ymm15; + vpshufb .Lpack_bswap rRIP, %ymm15, %ymm15; vpxor %ymm0, %ymm15, %ymm0; vpxor %ymm1, %ymm15, %ymm1; vpxor %ymm2, %ymm15, %ymm2; @@ -1596,13 +1625,19 @@ _gcry_camellia_aesni_avx2_ocb_dec: vzeroall; - movq (16 * 32 + 0 * 8)(%rax), %r10; - movq (16 * 32 + 1 * 8)(%rax), %r11; - movq (16 * 32 + 2 * 8)(%rax), %r12; - movq (16 * 32 + 3 * 8)(%rax), %r13; + movq (16 * 32 + 0 * 8)(%rsp), %r10; + movq (16 * 32 + 1 * 8)(%rsp), %r11; + movq (16 * 32 + 2 * 8)(%rsp), %r12; + movq (16 * 32 + 3 * 8)(%rsp), %r13; + CFI_RESTORE(%r10); + CFI_RESTORE(%r11); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); leave; + CFI_LEAVE(); ret; + CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx2_ocb_dec,.-_gcry_camellia_aesni_avx2_ocb_dec;) .align 8 @@ -1617,9 +1652,12 @@ _gcry_camellia_aesni_avx2_ocb_auth: * %rcx: checksum * %r8 : L pointers (void *L[16]) */ + CFI_STARTPROC(); pushq %rbp; + CFI_PUSH(%rbp); movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); vzeroupper; @@ -1627,10 +1665,14 @@ _gcry_camellia_aesni_avx2_ocb_auth: andq $~63, %rsp; movq %rsp, %rax; - movq %r10, (16 * 32 + 0 * 8)(%rax); - movq %r11, (16 * 32 + 1 * 8)(%rax); - movq %r12, (16 * 32 + 2 * 8)(%rax); - movq %r13, (16 * 32 + 3 * 8)(%rax); + movq %r10, (16 * 32 + 0 * 8)(%rsp); + movq %r11, (16 * 32 + 1 * 8)(%rsp); + movq %r12, (16 * 32 + 2 * 8)(%rsp); + movq %r13, (16 * 32 + 3 * 8)(%rsp); + CFI_REG_ON_STACK(r10, 16 * 32 + 0 * 8); + CFI_REG_ON_STACK(r11, 16 * 32 + 1 * 8); + CFI_REG_ON_STACK(r12, 16 * 32 + 2 * 8); + CFI_REG_ON_STACK(r13, 16 * 32 + 3 * 8); vmovdqu (%rdx), %xmm14; @@ -1703,7 +1745,7 @@ _gcry_camellia_aesni_avx2_ocb_auth: /* inpack16_pre: */ vpbroadcastq (key_table)(CTX), %ymm15; - vpshufb .Lpack_bswap RIP, %ymm15, %ymm15; + vpshufb .Lpack_bswap rRIP, %ymm15, %ymm15; vpxor %ymm0, %ymm15, %ymm0; vpxor %ymm1, %ymm15, %ymm1; vpxor %ymm2, %ymm15, %ymm2; @@ -1749,13 +1791,19 @@ _gcry_camellia_aesni_avx2_ocb_auth: vzeroall; - movq (16 * 32 + 0 * 8)(%rax), %r10; - movq (16 * 32 + 1 * 8)(%rax), %r11; - movq (16 * 32 + 2 * 8)(%rax), %r12; - movq (16 * 32 + 3 * 8)(%rax), %r13; + movq (16 * 32 + 0 * 8)(%rsp), %r10; + movq (16 * 32 + 1 * 8)(%rsp), %r11; + movq (16 * 32 + 2 
* 8)(%rsp), %r12; + movq (16 * 32 + 3 * 8)(%rsp), %r13; + CFI_RESTORE(%r10); + CFI_RESTORE(%r11); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); leave; + CFI_LEAVE(); ret; + CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx2_ocb_auth,.-_gcry_camellia_aesni_avx2_ocb_auth;) #endif /*defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT)*/ diff --git a/cipher/cast5-amd64.S b/cipher/cast5-amd64.S index 1a1d43fd5..82f678901 100644 --- a/cipher/cast5-amd64.S +++ b/cipher/cast5-amd64.S @@ -183,10 +183,13 @@ _gcry_cast5_amd64_encrypt_block: * %rsi: dst * %rdx: src */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 pushq %rbp; + CFI_PUSH(%rbp); pushq %rbx; + CFI_PUSH(%rbx); movq %rsi, %r10; @@ -211,10 +214,13 @@ _gcry_cast5_amd64_encrypt_block: write_block(); popq %rbx; + CFI_POP(%rbx); popq %rbp; + CFI_POP(%rbp); EXIT_SYSV_FUNC ret; + CFI_ENDPROC(); ELF(.size _gcry_cast5_amd64_encrypt_block,.-_gcry_cast5_amd64_encrypt_block;) .align 8 @@ -227,10 +233,13 @@ _gcry_cast5_amd64_decrypt_block: * %rsi: dst * %rdx: src */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 pushq %rbp; + CFI_PUSH(%rbp); pushq %rbx; + CFI_PUSH(%rbx); movq %rsi, %r10; @@ -255,10 +264,13 @@ _gcry_cast5_amd64_decrypt_block: write_block(); popq %rbx; + CFI_POP(%rbx); popq %rbp; + CFI_POP(%rbp); EXIT_SYSV_FUNC ret; + CFI_ENDPROC(); ELF(.size _gcry_cast5_amd64_decrypt_block,.-_gcry_cast5_amd64_decrypt_block;) /********************************************************************** @@ -371,6 +383,7 @@ __cast5_enc_blk4: * output: * RLR0,RLR1,RLR2,RLR3: four output ciphertext blocks */ + CFI_STARTPROC(); GET_EXTERN_POINTER(_gcry_cast5_s1to4, RTAB); get_round_km(0, RKM0d); @@ -387,6 +400,7 @@ __cast5_enc_blk4: outbswap_block4(RLR0, RLR1, RLR2, RLR3); ret; + CFI_ENDPROC(); ELF(.size __cast5_enc_blk4,.-__cast5_enc_blk4;) .align 8 @@ -399,6 +413,7 @@ __cast5_dec_blk4: * output: * RLR0,RLR1,RLR2,RLR3: four output plaintext blocks */ + CFI_STARTPROC(); GET_EXTERN_POINTER(_gcry_cast5_s1to4, RTAB); inbswap_block4(RLR0, RLR1, RLR2, RLR3); @@ -416,6 +431,7 @@ __cast5_dec_blk4: round_dec_last4(1, F4_2, F4_1); outbswap_block4(RLR0, RLR1, RLR2, RLR3); + CFI_ENDPROC(); ret; ELF(.size __cast5_dec_blk4,.-__cast5_dec_blk4;) @@ -425,20 +441,28 @@ ELF(.type _gcry_cast5_amd64_ctr_enc, at function;) _gcry_cast5_amd64_ctr_enc: /* input: * %rdi: ctx, CTX - * %rsi: dst (8 blocks) - * %rdx: src (8 blocks) + * %rsi: dst (4 blocks) + * %rdx: src (4 blocks) * %rcx: iv (big endian, 64bit) */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 pushq %rbp; + CFI_PUSH(%rbp); pushq %rbx; + CFI_PUSH(%rbx); pushq %r12; + CFI_PUSH(%r12); pushq %r13; + CFI_PUSH(%r13); pushq %r14; + CFI_PUSH(%r14); pushq %rsi; + CFI_PUSH(%rsi); pushq %rdx; + CFI_PUSH(%rdx); /* load IV and byteswap */ movq (%rcx), RX0; @@ -458,7 +482,9 @@ _gcry_cast5_amd64_ctr_enc: call __cast5_enc_blk4; popq %r14; /*src*/ + CFI_POP_TMP_REG(); popq %r13; /*dst*/ + CFI_POP_TMP_REG(); /* XOR key-stream with plaintext */ xorq 0 * 8(%r14), RLR0; @@ -471,13 +497,19 @@ _gcry_cast5_amd64_ctr_enc: movq RLR3, 3 * 8(%r13); popq %r14; + CFI_POP(%r14); popq %r13; + CFI_POP(%r13); popq %r12; + CFI_POP(%r12); popq %rbx; + CFI_POP(%rbx); popq %rbp; + CFI_POP(%rbp); EXIT_SYSV_FUNC ret + CFI_ENDPROC(); ELF(.size _gcry_cast5_amd64_ctr_enc,.-_gcry_cast5_amd64_ctr_enc;) .align 8 @@ -486,21 +518,30 @@ ELF(.type _gcry_cast5_amd64_cbc_dec, at function;) _gcry_cast5_amd64_cbc_dec: /* input: * %rdi: ctx, CTX - * %rsi: dst (8 blocks) - * %rdx: src (8 blocks) + * %rsi: dst (4 blocks) + * %rdx: src (4 blocks) * %rcx: iv (64bit) */ + 
CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 pushq %rbp; + CFI_PUSH(%rbp); pushq %rbx; + CFI_PUSH(%rbx); pushq %r12; + CFI_PUSH(%r12); pushq %r13; + CFI_PUSH(%r13); pushq %r14; + CFI_PUSH(%r14); pushq %rcx; + CFI_PUSH(%rcx); pushq %rsi; + CFI_PUSH(%rsi); pushq %rdx; + CFI_PUSH(%rdx); /* load input */ movq 0 * 8(%rdx), RLR0; @@ -511,8 +552,11 @@ _gcry_cast5_amd64_cbc_dec: call __cast5_dec_blk4; popq RX0; /*src*/ + CFI_POP_TMP_REG(); popq RX1; /*dst*/ + CFI_POP_TMP_REG(); popq RX2; /*iv*/ + CFI_POP_TMP_REG(); movq 3 * 8(RX0), %r14; xorq (RX2), RLR0; @@ -527,14 +571,19 @@ _gcry_cast5_amd64_cbc_dec: movq RLR3, 3 * 8(RX1); popq %r14; + CFI_POP(%r14); popq %r13; + CFI_POP(%r13); popq %r12; + CFI_POP(%r12); popq %rbx; + CFI_POP(%rbx); popq %rbp; + CFI_POP(%rbp); EXIT_SYSV_FUNC ret; - + CFI_ENDPROC(); ELF(.size _gcry_cast5_amd64_cbc_dec,.-_gcry_cast5_amd64_cbc_dec;) .align 8 @@ -543,20 +592,28 @@ ELF(.type _gcry_cast5_amd64_cfb_dec, at function;) _gcry_cast5_amd64_cfb_dec: /* input: * %rdi: ctx, CTX - * %rsi: dst (8 blocks) - * %rdx: src (8 blocks) + * %rsi: dst (4 blocks) + * %rdx: src (4 blocks) * %rcx: iv (64bit) */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 pushq %rbp; + CFI_PUSH(%rbp); pushq %rbx; + CFI_PUSH(%rbx); pushq %r12; + CFI_PUSH(%r12); pushq %r13; + CFI_PUSH(%r13); pushq %r14; + CFI_PUSH(%r14); pushq %rsi; + CFI_PUSH(%rsi); pushq %rdx; + CFI_PUSH(%rdx); /* Load input */ movq (%rcx), RLR0; @@ -573,7 +630,9 @@ _gcry_cast5_amd64_cfb_dec: call __cast5_enc_blk4; popq %rdx; /*src*/ + CFI_POP_TMP_REG(); popq %rcx; /*dst*/ + CFI_POP_TMP_REG(); xorq 0 * 8(%rdx), RLR0; xorq 1 * 8(%rdx), RLR1; @@ -585,14 +644,19 @@ _gcry_cast5_amd64_cfb_dec: movq RLR3, 3 * 8(%rcx); popq %r14; + CFI_POP(%r14); popq %r13; + CFI_POP(%r13); popq %r12; + CFI_POP(%r12); popq %rbx; + CFI_POP(%rbx); popq %rbp; + CFI_POP(%rbp); EXIT_SYSV_FUNC ret; - + CFI_ENDPROC(); ELF(.size _gcry_cast5_amd64_cfb_dec,.-_gcry_cast5_amd64_cfb_dec;) #endif /*defined(USE_CAST5)*/ diff --git a/cipher/chacha20-amd64-avx2.S b/cipher/chacha20-amd64-avx2.S index 94c8e8cf7..de6263b69 100644 --- a/cipher/chacha20-amd64-avx2.S +++ b/cipher/chacha20-amd64-avx2.S @@ -179,11 +179,14 @@ _gcry_chacha20_amd64_avx2_blocks8: * %rdx: src * %rcx: nblks (multiple of 8) */ + CFI_STARTPROC(); vzeroupper; pushq %rbp; + CFI_PUSH(%rbp); movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); subq $STACK_MAX, %rsp; andq $~31, %rsp; @@ -318,7 +321,9 @@ _gcry_chacha20_amd64_avx2_blocks8: /* eax zeroed by round loop. 
*/ leave; + CFI_LEAVE(); ret; + CFI_ENDPROC(); ELF(.size _gcry_chacha20_amd64_avx2_blocks8, .-_gcry_chacha20_amd64_avx2_blocks8;) @@ -339,9 +344,12 @@ _gcry_chacha20_poly1305_amd64_avx2_blocks8: * %r9: poly1305-state * %r8: poly1305-src */ + CFI_STARTPROC(); pushq %rbp; + CFI_PUSH(%rbp); movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); vzeroupper; @@ -353,6 +361,11 @@ _gcry_chacha20_poly1305_amd64_avx2_blocks8: movq %r13, (STACK_MAX + 2 * 8)(%rsp); movq %r14, (STACK_MAX + 3 * 8)(%rsp); movq %r15, (STACK_MAX + 4 * 8)(%rsp); + CFI_REG_ON_STACK(rbx, STACK_MAX + 0 * 8); + CFI_REG_ON_STACK(r12, STACK_MAX + 1 * 8); + CFI_REG_ON_STACK(r13, STACK_MAX + 2 * 8); + CFI_REG_ON_STACK(r14, STACK_MAX + 3 * 8); + CFI_REG_ON_STACK(r15, STACK_MAX + 4 * 8); movq %rdx, (STACK_MAX + 5 * 8)(%rsp); # SRC movq %rsi, (STACK_MAX + 6 * 8)(%rsp); # DST @@ -752,10 +765,17 @@ _gcry_chacha20_poly1305_amd64_avx2_blocks8: movq (STACK_MAX + 2 * 8)(%rsp), %r13; movq (STACK_MAX + 3 * 8)(%rsp), %r14; movq (STACK_MAX + 4 * 8)(%rsp), %r15; + CFI_RESTORE(%rbx); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); + CFI_RESTORE(%r14); + CFI_RESTORE(%r15); xorl %eax, %eax; leave; + CFI_LEAVE(); ret; + CFI_ENDPROC(); ELF(.size _gcry_chacha20_poly1305_amd64_avx2_blocks8, .-_gcry_chacha20_poly1305_amd64_avx2_blocks8;) diff --git a/cipher/chacha20-amd64-ssse3.S b/cipher/chacha20-amd64-ssse3.S index 1657f7712..6bbf12fc1 100644 --- a/cipher/chacha20-amd64-ssse3.S +++ b/cipher/chacha20-amd64-ssse3.S @@ -175,9 +175,12 @@ _gcry_chacha20_amd64_ssse3_blocks4: * %rdx: src * %rcx: nblks (multiple of 4) */ + CFI_STARTPROC(); pushq %rbp; + CFI_PUSH(%rbp); movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); subq $STACK_MAX, %rsp; andq $~15, %rsp; @@ -329,7 +332,9 @@ _gcry_chacha20_amd64_ssse3_blocks4: /* eax zeroed by round loop. */ leave; + CFI_LEAVE(); ret; + CFI_ENDPROC(); ELF(.size _gcry_chacha20_amd64_ssse3_blocks4, .-_gcry_chacha20_amd64_ssse3_blocks4;) @@ -372,6 +377,7 @@ _gcry_chacha20_amd64_ssse3_blocks1: * %rdx: src * %rcx: nblks */ + CFI_STARTPROC(); /* Load constants */ movdqa .Lcounter1 rRIP, X4; @@ -497,6 +503,7 @@ _gcry_chacha20_amd64_ssse3_blocks1: /* eax zeroed by round loop. 
*/ ret; + CFI_ENDPROC(); ELF(.size _gcry_chacha20_amd64_ssse3_blocks1, .-_gcry_chacha20_amd64_ssse3_blocks1;) @@ -517,9 +524,12 @@ _gcry_chacha20_poly1305_amd64_ssse3_blocks4: * %r9: poly1305-state * %r8: poly1305-src */ + CFI_STARTPROC(); pushq %rbp; + CFI_PUSH(%rbp); movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); subq $(8 * 8) + STACK_MAX + 16, %rsp; andq $~15, %rsp; @@ -529,6 +539,11 @@ _gcry_chacha20_poly1305_amd64_ssse3_blocks4: movq %r13, (STACK_MAX + 2 * 8)(%rsp); movq %r14, (STACK_MAX + 3 * 8)(%rsp); movq %r15, (STACK_MAX + 4 * 8)(%rsp); + CFI_REG_ON_STACK(rbx, STACK_MAX + 0 * 8); + CFI_REG_ON_STACK(r12, STACK_MAX + 1 * 8); + CFI_REG_ON_STACK(r13, STACK_MAX + 2 * 8); + CFI_REG_ON_STACK(r14, STACK_MAX + 3 * 8); + CFI_REG_ON_STACK(r15, STACK_MAX + 4 * 8); movq %rdx, (STACK_MAX + 5 * 8)(%rsp); # SRC movq %rsi, (STACK_MAX + 6 * 8)(%rsp); # DST @@ -901,10 +916,17 @@ _gcry_chacha20_poly1305_amd64_ssse3_blocks4: movq (STACK_MAX + 2 * 8)(%rsp), %r13; movq (STACK_MAX + 3 * 8)(%rsp), %r14; movq (STACK_MAX + 4 * 8)(%rsp), %r15; + CFI_RESTORE(%rbx); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); + CFI_RESTORE(%r14); + CFI_RESTORE(%r15); xorl %eax, %eax; leave; + CFI_LEAVE(); ret; + CFI_ENDPROC(); ELF(.size _gcry_chacha20_poly1305_amd64_ssse3_blocks4, .-_gcry_chacha20_poly1305_amd64_ssse3_blocks4;) @@ -925,8 +947,12 @@ _gcry_chacha20_poly1305_amd64_ssse3_blocks1: * %r9: poly1305-state * %r8: poly1305-src */ + CFI_STARTPROC(); + pushq %rbp; + CFI_PUSH(%rbp); movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); subq $(8 * 8), %rsp; movq %rbx, (0 * 8)(%rsp); @@ -934,6 +960,11 @@ _gcry_chacha20_poly1305_amd64_ssse3_blocks1: movq %r13, (2 * 8)(%rsp); movq %r14, (3 * 8)(%rsp); movq %r15, (4 * 8)(%rsp); + CFI_REG_ON_STACK(rbx, 0 * 8); + CFI_REG_ON_STACK(r12, 1 * 8); + CFI_REG_ON_STACK(r13, 2 * 8); + CFI_REG_ON_STACK(r14, 3 * 8); + CFI_REG_ON_STACK(r15, 4 * 8); movq %rdx, (5 * 8)(%rsp); # SRC movq %rsi, (6 * 8)(%rsp); # DST @@ -1206,10 +1237,17 @@ _gcry_chacha20_poly1305_amd64_ssse3_blocks1: movq (2 * 8)(%rsp), %r13; movq (3 * 8)(%rsp), %r14; movq (4 * 8)(%rsp), %r15; + CFI_RESTORE(%rbx); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); + CFI_RESTORE(%r14); + CFI_RESTORE(%r15); xorl %eax, %eax; leave; + CFI_LEAVE(); ret; + CFI_ENDPROC(); ELF(.size _gcry_chacha20_poly1305_amd64_ssse3_blocks1, .-_gcry_chacha20_poly1305_amd64_ssse3_blocks1;) diff --git a/cipher/des-amd64.S b/cipher/des-amd64.S index f25573d99..a211dac38 100644 --- a/cipher/des-amd64.S +++ b/cipher/des-amd64.S @@ -190,15 +190,23 @@ _gcry_3des_amd64_crypt_block: * %rsi: dst * %rdx: src */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 pushq %rbp; + CFI_PUSH(%rbp); pushq %rbx; + CFI_PUSH(%rbx); pushq %r12; + CFI_PUSH(%r12); pushq %r13; + CFI_PUSH(%r13); pushq %r14; + CFI_PUSH(%r14); pushq %r15; + CFI_PUSH(%r15); pushq %rsi; /*dst*/ + CFI_PUSH(%rsi); leaq .L_s1 rRIP, SBOXES; @@ -259,18 +267,26 @@ _gcry_3des_amd64_crypt_block: round1(32+15, RL0, RR0, dummy2); popq RW2; /*dst*/ + CFI_POP_TMP_REG(); final_permutation(RR0, RL0); write_block(RW2, RR0, RL0); popq %r15; + CFI_POP(%r15); popq %r14; + CFI_POP(%r14); popq %r13; + CFI_POP(%r13); popq %r12; + CFI_POP(%r12); popq %rbx; + CFI_POP(%rbx); popq %rbp; + CFI_POP(%rbp); EXIT_SYSV_FUNC ret; + CFI_ENDPROC(); ELF(.size _gcry_3des_amd64_crypt_block,.-_gcry_3des_amd64_crypt_block;) /*********************************************************************** @@ -465,6 +481,7 @@ _gcry_3des_amd64_crypt_blk3: * RL0d, RR0d, RL1d, RR1d, RL2d, RR2d: 3 input blocks * RR0d, RL0d, RR1d, RL1d, RR2d, RL2d: 3 output blocks */ + 
CFI_STARTPROC(); leaq .L_s1 rRIP, SBOXES; @@ -528,6 +545,7 @@ _gcry_3des_amd64_crypt_blk3: final_permutation3(RR, RL); ret; + CFI_ENDPROC(); ELF(.size _gcry_3des_amd64_crypt_blk3,.-_gcry_3des_amd64_crypt_blk3;) .align 8 @@ -540,18 +558,28 @@ _gcry_3des_amd64_cbc_dec: * %rdx: src (3 blocks) * %rcx: iv (64bit) */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 pushq %rbp; + CFI_PUSH(%rbp); pushq %rbx; + CFI_PUSH(%rbx); pushq %r12; + CFI_PUSH(%r12); pushq %r13; + CFI_PUSH(%r13); pushq %r14; + CFI_PUSH(%r14); pushq %r15; + CFI_PUSH(%r15); pushq %rsi; /*dst*/ + CFI_PUSH(%rsi); pushq %rdx; /*src*/ + CFI_PUSH(%rdx); pushq %rcx; /*iv*/ + CFI_PUSH(%rcx); /* load input */ movl 0 * 4(%rdx), RL0d; @@ -571,8 +599,11 @@ _gcry_3des_amd64_cbc_dec: call _gcry_3des_amd64_crypt_blk3; popq %rcx; /*iv*/ + CFI_POP_TMP_REG(); popq %rdx; /*src*/ + CFI_POP_TMP_REG(); popq %rsi; /*dst*/ + CFI_POP_TMP_REG(); bswapl RR0d; bswapl RL0d; @@ -598,14 +629,21 @@ _gcry_3des_amd64_cbc_dec: movl RL2d, 5 * 4(%rsi); popq %r15; + CFI_POP(%r15); popq %r14; + CFI_POP(%r14); popq %r13; + CFI_POP(%r13); popq %r12; + CFI_POP(%r12); popq %rbx; + CFI_POP(%rbx); popq %rbp; + CFI_POP(%rbp); EXIT_SYSV_FUNC ret; + CFI_ENDPROC(); ELF(.size _gcry_3des_amd64_cbc_dec,.-_gcry_3des_amd64_cbc_dec;) .align 8 @@ -618,17 +656,26 @@ _gcry_3des_amd64_ctr_enc: * %rdx: src (3 blocks) * %rcx: iv (64bit) */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 pushq %rbp; + CFI_PUSH(%rbp); pushq %rbx; + CFI_PUSH(%rbx); pushq %r12; + CFI_PUSH(%r12); pushq %r13; + CFI_PUSH(%r13); pushq %r14; + CFI_PUSH(%r14); pushq %r15; + CFI_PUSH(%r15); pushq %rsi; /*dst*/ + CFI_PUSH(%rsi); pushq %rdx; /*src*/ + CFI_PUSH(%rdx); movq %rcx, RW2; /* load IV and byteswap */ @@ -654,7 +701,9 @@ _gcry_3des_amd64_ctr_enc: call _gcry_3des_amd64_crypt_blk3; popq %rdx; /*src*/ + CFI_POP_TMP_REG(); popq %rsi; /*dst*/ + CFI_POP_TMP_REG(); bswapl RR0d; bswapl RL0d; @@ -678,14 +727,21 @@ _gcry_3des_amd64_ctr_enc: movl RL2d, 5 * 4(%rsi); popq %r15; + CFI_POP(%r15); popq %r14; + CFI_POP(%r14); popq %r13; + CFI_POP(%r13); popq %r12; + CFI_POP(%r12); popq %rbx; + CFI_POP(%rbx); popq %rbp; + CFI_POP(%rbp); EXIT_SYSV_FUNC ret; + CFI_ENDPROC(); ELF(.size _gcry_3des_amd64_cbc_dec,.-_gcry_3des_amd64_cbc_dec;) .align 8 @@ -698,17 +754,26 @@ _gcry_3des_amd64_cfb_dec: * %rdx: src (3 blocks) * %rcx: iv (64bit) */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 pushq %rbp; + CFI_PUSH(%rbp); pushq %rbx; + CFI_PUSH(%rbx); pushq %r12; + CFI_PUSH(%r12); pushq %r13; + CFI_PUSH(%r13); pushq %r14; + CFI_PUSH(%r14); pushq %r15; + CFI_PUSH(%r15); pushq %rsi; /*dst*/ + CFI_PUSH(%rsi); pushq %rdx; /*src*/ + CFI_PUSH(%rdx); movq %rcx, RW2; /* Load input */ @@ -733,7 +798,9 @@ _gcry_3des_amd64_cfb_dec: call _gcry_3des_amd64_crypt_blk3; popq %rdx; /*src*/ + CFI_POP_TMP_REG(); popq %rsi; /*dst*/ + CFI_POP_TMP_REG(); bswapl RR0d; bswapl RL0d; @@ -757,14 +824,21 @@ _gcry_3des_amd64_cfb_dec: movl RL2d, 5 * 4(%rsi); popq %r15; + CFI_POP(%r15); popq %r14; + CFI_POP(%r14); popq %r13; + CFI_POP(%r13); popq %r12; + CFI_POP(%r12); popq %rbx; + CFI_POP(%rbx); popq %rbp; + CFI_POP(%rbp); EXIT_SYSV_FUNC ret; + CFI_ENDPROC(); ELF(.size _gcry_3des_amd64_cfb_dec,.-_gcry_3des_amd64_cfb_dec;) .align 16 diff --git a/cipher/rijndael-amd64.S b/cipher/rijndael-amd64.S index 798ff51af..3dcaa856b 100644 --- a/cipher/rijndael-amd64.S +++ b/cipher/rijndael-amd64.S @@ -212,14 +212,19 @@ _gcry_aes_amd64_encrypt_block: * %ecx: number of rounds.. 
10, 12 or 14 * %r8: encryption tables */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_5 subq $(5 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(5 * 8); movq %rsi, (0 * 8)(%rsp); movl %ecx, (1 * 8)(%rsp); movq %rbp, (2 * 8)(%rsp); movq %rbx, (3 * 8)(%rsp); movq %r12, (4 * 8)(%rsp); + CFI_REL_OFFSET(%rbp, 2 * 8); + CFI_REL_OFFSET(%rbx, 3 * 8); + CFI_REL_OFFSET(%r12, 4 * 8); leaq (%r8), RTAB; @@ -251,16 +256,23 @@ _gcry_aes_amd64_encrypt_block: movl RCd, 2 * 4(%rsi); movl RDd, 3 * 4(%rsi); + CFI_REMEMBER_STATE(); + movq (4 * 8)(%rsp), %r12; movq (3 * 8)(%rsp), %rbx; movq (2 * 8)(%rsp), %rbp; + CFI_RESTORE(%r12); + CFI_RESTORE(%rbx); + CFI_RESTORE(%rbp); addq $(5 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(-5 * 8); movl $(6 * 8), %eax; EXIT_SYSV_FUNC ret; + CFI_RESTORE_STATE(); .align 4 .Lenc_not_128: je .Lenc_192 @@ -280,6 +292,7 @@ _gcry_aes_amd64_encrypt_block: lastencround(11); jmp .Lenc_done; + CFI_ENDPROC(); ELF(.size _gcry_aes_amd64_encrypt_block,.-_gcry_aes_amd64_encrypt_block;) #define do_decround(next_r) \ @@ -376,14 +389,19 @@ _gcry_aes_amd64_decrypt_block: * %ecx: number of rounds.. 10, 12 or 14 * %r8: decryption tables */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_5 subq $(5 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(5 * 8); movq %rsi, (0 * 8)(%rsp); movl %ecx, (1 * 8)(%rsp); movq %rbp, (2 * 8)(%rsp); movq %rbx, (3 * 8)(%rsp); movq %r12, (4 * 8)(%rsp); + CFI_REL_OFFSET(%rbp, 2 * 8); + CFI_REL_OFFSET(%rbx, 3 * 8); + CFI_REL_OFFSET(%r12, 4 * 8); leaq (%r8), RTAB; @@ -416,16 +434,23 @@ _gcry_aes_amd64_decrypt_block: movl RCd, 2 * 4(%rsi); movl RDd, 3 * 4(%rsi); + CFI_REMEMBER_STATE(); + movq (4 * 8)(%rsp), %r12; movq (3 * 8)(%rsp), %rbx; movq (2 * 8)(%rsp), %rbp; + CFI_RESTORE(%r12); + CFI_RESTORE(%rbx); + CFI_RESTORE(%rbp); addq $(5 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(-5 * 8); movl $(6 * 8), %eax; EXIT_SYSV_FUNC ret; + CFI_RESTORE_STATE(); .align 4 .Ldec_256: je .Ldec_192; @@ -445,6 +470,7 @@ _gcry_aes_amd64_decrypt_block: decround(9); jmp .Ldec_tail; + CFI_ENDPROC(); ELF(.size _gcry_aes_amd64_decrypt_block,.-_gcry_aes_amd64_decrypt_block;) #endif /*USE_AES*/ diff --git a/cipher/rijndael-ssse3-amd64-asm.S b/cipher/rijndael-ssse3-amd64-asm.S index ffce5df2f..8124eb219 100644 --- a/cipher/rijndael-ssse3-amd64-asm.S +++ b/cipher/rijndael-ssse3-amd64-asm.S @@ -50,6 +50,7 @@ ELF(.type _gcry_aes_ssse3_enc_preload, at function) .globl _gcry_aes_ssse3_enc_preload _gcry_aes_ssse3_enc_preload: + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 lea .Laes_consts(%rip), %rax movdqa (%rax), %xmm9 # 0F @@ -61,6 +62,7 @@ _gcry_aes_ssse3_enc_preload: movdqa .Lk_sb2+16(%rax), %xmm14 # sb2t EXIT_SYSV_FUNC ret + CFI_ENDPROC(); ELF(.size _gcry_aes_ssse3_enc_preload,.-_gcry_aes_ssse3_enc_preload) ## @@ -69,6 +71,7 @@ ELF(.size _gcry_aes_ssse3_enc_preload,.-_gcry_aes_ssse3_enc_preload) ELF(.type _gcry_aes_ssse3_dec_preload, at function) .globl _gcry_aes_ssse3_dec_preload _gcry_aes_ssse3_dec_preload: + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 lea .Laes_consts(%rip), %rax movdqa (%rax), %xmm9 # 0F @@ -81,6 +84,7 @@ _gcry_aes_ssse3_dec_preload: movdqa .Lk_dsbe (%rax), %xmm8 # sbeu EXIT_SYSV_FUNC ret + CFI_ENDPROC(); ELF(.size _gcry_aes_ssse3_dec_preload,.-_gcry_aes_ssse3_dec_preload) ## @@ -111,6 +115,7 @@ ELF(.type _gcry_aes_ssse3_encrypt_core, at function) .globl _gcry_aes_ssse3_encrypt_core _gcry_aes_ssse3_encrypt_core: _aes_encrypt_core: + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 mov %rdi, %rdx leaq -1(%rsi), %rax @@ -190,6 +195,7 @@ _aes_encrypt_core: pshufb .Lk_sr(%rsi,%rcx), %xmm0 EXIT_SYSV_FUNC ret + CFI_ENDPROC(); ELF(.size 
_aes_encrypt_core,.-_aes_encrypt_core) ## @@ -202,6 +208,7 @@ ELF(.size _aes_encrypt_core,.-_aes_encrypt_core) ELF(.type _gcry_aes_ssse3_decrypt_core, at function) _gcry_aes_ssse3_decrypt_core: _aes_decrypt_core: + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 mov %rdi, %rdx lea .Laes_consts(%rip), %rcx @@ -297,6 +304,7 @@ _aes_decrypt_core: pshufb .Lk_sr(%rsi,%rcx), %xmm0 EXIT_SYSV_FUNC ret + CFI_ENDPROC(); ELF(.size _aes_decrypt_core,.-_aes_decrypt_core) ######################################################## @@ -315,6 +323,7 @@ _aes_schedule_core: # rdx = buffer # rcx = direction. 0=encrypt, 1=decrypt # r8 = rotoffs + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_5 # load the tables @@ -671,6 +680,7 @@ _aes_schedule_core: pxor %xmm8, %xmm8 EXIT_SYSV_FUNC ret + CFI_ENDPROC(); ELF(.size _gcry_aes_ssse3_schedule_core,.-_gcry_aes_ssse3_schedule_core) ######################################################## diff --git a/cipher/salsa20-amd64.S b/cipher/salsa20-amd64.S index 470c32aad..ae8f27155 100644 --- a/cipher/salsa20-amd64.S +++ b/cipher/salsa20-amd64.S @@ -28,11 +28,7 @@ #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && defined(USE_SALSA20) -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) /*_*/ -#endif +#include "asm-common-amd64.h" .text @@ -40,6 +36,7 @@ .globl _gcry_salsa20_amd64_keysetup ELF(.type _gcry_salsa20_amd64_keysetup, at function;) _gcry_salsa20_amd64_keysetup: + CFI_STARTPROC(); movl 0(%rsi),%r8d movl 4(%rsi),%r9d movl 8(%rsi),%eax @@ -87,11 +84,13 @@ _gcry_salsa20_amd64_keysetup: movl %r8d,12(%rdi) .L_keysetupdone: ret + CFI_ENDPROC(); .align 8 .globl _gcry_salsa20_amd64_ivsetup ELF(.type _gcry_salsa20_amd64_ivsetup, at function;) _gcry_salsa20_amd64_ivsetup: + CFI_STARTPROC(); movl 0(%rsi),%r8d movl 4(%rsi),%esi mov $0,%r9 @@ -101,6 +100,7 @@ _gcry_salsa20_amd64_ivsetup: movl %r9d,32(%rdi) movl %eax,52(%rdi) ret + CFI_ENDPROC(); .align 8 .globl _gcry_salsa20_amd64_encrypt_blocks @@ -112,13 +112,15 @@ _gcry_salsa20_amd64_encrypt_blocks: * - Length is input as number of blocks, so don't handle tail bytes * (this is done in salsa20.c). */ + CFI_STARTPROC(); push %rbx + CFI_PUSH(%rbx); shlq $6, %rcx /* blocks to bytes */ mov %r8, %rbx mov %rsp,%r11 - and $31,%r11 - add $384,%r11 - sub %r11,%rsp + CFI_DEF_CFA_REGISTER(%r11); + sub $384,%rsp + and $~31,%rsp mov %rdi,%r8 mov %rsi,%rsi mov %rdx,%rdi @@ -916,15 +918,22 @@ _gcry_salsa20_amd64_encrypt_blocks: cmp $64,%rdx ja .L_bytes_are_128_or_192 .L_done: - add %r11,%rsp + CFI_REMEMBER_STATE(); mov %r11,%rax + sub %rsp,%rax + mov %r11,%rsp + CFI_REGISTER(%r11, %rsp) + CFI_DEF_CFA_REGISTER(%rsp) pop %rbx + CFI_POP(%rbx) ret + CFI_RESTORE_STATE(); .L_bytes_are_128_or_192: sub $64,%rdx add $64,%rdi add $64,%rsi jmp .L_bytes_are_64_128_or_192 + CFI_ENDPROC(); ELF(.size _gcry_salsa20_amd64_encrypt_blocks,.-_gcry_salsa20_amd64_encrypt_blocks;) #endif /*defined(USE_SALSA20)*/ diff --git a/cipher/serpent-avx2-amd64.S b/cipher/serpent-avx2-amd64.S index 8d60a159e..9b17c2bd1 100644 --- a/cipher/serpent-avx2-amd64.S +++ b/cipher/serpent-avx2-amd64.S @@ -24,17 +24,7 @@ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && defined(USE_SERPENT) && \ defined(ENABLE_AVX2_SUPPORT) -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) 
/*_*/ -#endif - -#ifdef __PIC__ -# define RIP (%rip) -#else -# define RIP -#endif +#include "asm-common-amd64.h" /* struct serpent_context: */ #define ctx_keys 0 @@ -421,6 +411,7 @@ __serpent_enc_blk16: * RA4, RA1, RA2, RA0, RB4, RB1, RB2, RB0: sixteen parallel * ciphertext blocks */ + CFI_STARTPROC(); vpcmpeqd RNOT, RNOT, RNOT; @@ -496,6 +487,7 @@ __serpent_enc_blk16: transpose_4x4(RB4, RB1, RB2, RB0, RB3, RTMP0, RTMP1); ret; + CFI_ENDPROC(); ELF(.size __serpent_enc_blk16,.-__serpent_enc_blk16;) .align 8 @@ -509,6 +501,7 @@ __serpent_dec_blk16: * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: sixteen parallel * plaintext blocks */ + CFI_STARTPROC(); vpcmpeqd RNOT, RNOT, RNOT; @@ -586,6 +579,7 @@ __serpent_dec_blk16: transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1); ret; + CFI_ENDPROC(); ELF(.size __serpent_dec_blk16,.-__serpent_dec_blk16;) #define inc_le128(x, minus_one, tmp) \ @@ -604,13 +598,14 @@ _gcry_serpent_avx2_ctr_enc: * %rdx: src (16 blocks) * %rcx: iv (big endian, 128bit) */ + CFI_STARTPROC(); movq 8(%rcx), %rax; bswapq %rax; vzeroupper; - vbroadcasti128 .Lbswap128_mask RIP, RTMP3; + vbroadcasti128 .Lbswap128_mask rRIP, RTMP3; vpcmpeqd RNOT, RNOT, RNOT; vpsrldq $8, RNOT, RNOT; /* ab: -1:0 ; cd: -1:0 */ vpaddq RNOT, RNOT, RTMP2; /* ab: -2:0 ; cd: -2:0 */ @@ -701,7 +696,8 @@ _gcry_serpent_avx2_ctr_enc: vzeroall; - ret + ret; + CFI_ENDPROC(); ELF(.size _gcry_serpent_avx2_ctr_enc,.-_gcry_serpent_avx2_ctr_enc;) .align 8 @@ -714,6 +710,7 @@ _gcry_serpent_avx2_cbc_dec: * %rdx: src (16 blocks) * %rcx: iv */ + CFI_STARTPROC(); vzeroupper; @@ -752,7 +749,8 @@ _gcry_serpent_avx2_cbc_dec: vzeroall; - ret + ret; + CFI_ENDPROC(); ELF(.size _gcry_serpent_avx2_cbc_dec,.-_gcry_serpent_avx2_cbc_dec;) .align 8 @@ -765,6 +763,7 @@ _gcry_serpent_avx2_cfb_dec: * %rdx: src (16 blocks) * %rcx: iv */ + CFI_STARTPROC(); vzeroupper; @@ -805,7 +804,8 @@ _gcry_serpent_avx2_cfb_dec: vzeroall; - ret + ret; + CFI_ENDPROC(); ELF(.size _gcry_serpent_avx2_cfb_dec,.-_gcry_serpent_avx2_cfb_dec;) .align 8 @@ -821,15 +821,21 @@ _gcry_serpent_avx2_ocb_enc: * %r8 : checksum * %r9 : L pointers (void *L[16]) */ + CFI_STARTPROC(); vzeroupper; subq $(4 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(4 * 8); movq %r10, (0 * 8)(%rsp); movq %r11, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); + CFI_REL_OFFSET(%r10, 0 * 8); + CFI_REL_OFFSET(%r11, 1 * 8); + CFI_REL_OFFSET(%r12, 2 * 8); + CFI_REL_OFFSET(%r13, 3 * 8); vmovdqu (%rcx), RTMP0x; vmovdqu (%r8), RTMP1x; @@ -882,10 +888,15 @@ _gcry_serpent_avx2_ocb_enc: movq (1 * 8)(%rsp), %r11; movq (2 * 8)(%rsp), %r12; movq (3 * 8)(%rsp), %r13; + CFI_RESTORE(%r10); + CFI_RESTORE(%r11); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); call __serpent_enc_blk16; addq $(4 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(-4 * 8); vpxor (0 * 32)(%rsi), RA4, RA4; vpxor (1 * 32)(%rsi), RA1, RA1; @@ -908,6 +919,7 @@ _gcry_serpent_avx2_ocb_enc: vzeroall; ret; + CFI_ENDPROC(); ELF(.size _gcry_serpent_avx2_ocb_enc,.-_gcry_serpent_avx2_ocb_enc;) .align 8 @@ -923,15 +935,21 @@ _gcry_serpent_avx2_ocb_dec: * %r8 : checksum * %r9 : L pointers (void *L[16]) */ + CFI_STARTPROC(); vzeroupper; subq $(4 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(4 * 8); movq %r10, (0 * 8)(%rsp); movq %r11, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); + CFI_REL_OFFSET(%r10, 0 * 8); + CFI_REL_OFFSET(%r11, 1 * 8); + CFI_REL_OFFSET(%r12, 2 * 8); + CFI_REL_OFFSET(%r13, 3 * 8); vmovdqu (%rcx), RTMP0x; @@ -978,10 +996,15 @@ _gcry_serpent_avx2_ocb_dec: movq (1 * 8)(%rsp), %r11; movq (2 * 8)(%rsp), %r12; movq (3 * 8)(%rsp), %r13; + 
CFI_RESTORE(%r10); + CFI_RESTORE(%r11); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); call __serpent_dec_blk16; addq $(4 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(-4 * 8); vmovdqu (%r8), RTMP1x; @@ -1020,6 +1043,7 @@ _gcry_serpent_avx2_ocb_dec: vzeroall; ret; + CFI_ENDPROC(); ELF(.size _gcry_serpent_avx2_ocb_dec,.-_gcry_serpent_avx2_ocb_dec;) .align 8 @@ -1034,15 +1058,21 @@ _gcry_serpent_avx2_ocb_auth: * %rcx: checksum * %r8 : L pointers (void *L[16]) */ + CFI_STARTPROC(); vzeroupper; subq $(4 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(4 * 8); movq %r10, (0 * 8)(%rsp); movq %r11, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); + CFI_REL_OFFSET(%r10, 0 * 8); + CFI_REL_OFFSET(%r11, 1 * 8); + CFI_REL_OFFSET(%r12, 2 * 8); + CFI_REL_OFFSET(%r13, 3 * 8); vmovdqu (%rdx), RTMP0x; @@ -1088,10 +1118,15 @@ _gcry_serpent_avx2_ocb_auth: movq (1 * 8)(%rsp), %r11; movq (2 * 8)(%rsp), %r12; movq (3 * 8)(%rsp), %r13; + CFI_RESTORE(%r10); + CFI_RESTORE(%r11); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); call __serpent_enc_blk16; addq $(4 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(-4 * 8); vpxor RA4, RB4, RA4; vpxor RA1, RB1, RA1; @@ -1111,6 +1146,7 @@ _gcry_serpent_avx2_ocb_auth: vzeroall; ret; + CFI_ENDPROC(); ELF(.size _gcry_serpent_avx2_ocb_auth,.-_gcry_serpent_avx2_ocb_auth;) .align 16 diff --git a/cipher/serpent-sse2-amd64.S b/cipher/serpent-sse2-amd64.S index b149af24e..39cba0029 100644 --- a/cipher/serpent-sse2-amd64.S +++ b/cipher/serpent-sse2-amd64.S @@ -23,17 +23,7 @@ #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && defined(USE_SERPENT) -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) /*_*/ -#endif - -#ifdef __PIC__ -# define RIP (%rip) -#else -# define RIP -#endif +#include "asm-common-amd64.h" /* struct serpent_context: */ #define ctx_keys 0 @@ -444,6 +434,7 @@ __serpent_enc_blk8: * RA4, RA1, RA2, RA0, RB4, RB1, RB2, RB0: eight parallel * ciphertext blocks */ + CFI_STARTPROC(); pcmpeqd RNOT, RNOT; @@ -519,6 +510,7 @@ __serpent_enc_blk8: transpose_4x4(RB4, RB1, RB2, RB0, RB3, RTMP0, RTMP1); ret; + CFI_ENDPROC(); ELF(.size __serpent_enc_blk8,.-__serpent_enc_blk8;) .align 8 @@ -532,6 +524,7 @@ __serpent_dec_blk8: * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel plaintext * blocks */ + CFI_STARTPROC(); pcmpeqd RNOT, RNOT; @@ -609,6 +602,7 @@ __serpent_dec_blk8: transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1); ret; + CFI_ENDPROC(); ELF(.size __serpent_dec_blk8,.-__serpent_dec_blk8;) .align 8 @@ -621,6 +615,7 @@ _gcry_serpent_sse2_ctr_enc: * %rdx: src (8 blocks) * %rcx: iv (big endian, 128bit) */ + CFI_STARTPROC(); /* load IV and byteswap */ movdqu (%rcx), RA0; @@ -738,7 +733,8 @@ _gcry_serpent_sse2_ctr_enc: pxor RTMP2, RTMP2; pxor RNOT, RNOT; - ret + ret; + CFI_ENDPROC(); ELF(.size _gcry_serpent_sse2_ctr_enc,.-_gcry_serpent_sse2_ctr_enc;) .align 8 @@ -751,6 +747,7 @@ _gcry_serpent_sse2_cbc_dec: * %rdx: src (8 blocks) * %rcx: iv */ + CFI_STARTPROC(); movdqu (0 * 16)(%rdx), RA0; movdqu (1 * 16)(%rdx), RA1; @@ -799,7 +796,8 @@ _gcry_serpent_sse2_cbc_dec: pxor RTMP2, RTMP2; pxor RNOT, RNOT; - ret + ret; + CFI_ENDPROC(); ELF(.size _gcry_serpent_sse2_cbc_dec,.-_gcry_serpent_sse2_cbc_dec;) .align 8 @@ -812,6 +810,7 @@ _gcry_serpent_sse2_cfb_dec: * %rdx: src (8 blocks) * %rcx: iv */ + CFI_STARTPROC(); /* Load input */ movdqu (%rcx), RA0; @@ -863,7 +862,8 @@ _gcry_serpent_sse2_cfb_dec: pxor RTMP2, RTMP2; pxor RNOT, RNOT; - ret + ret; + CFI_ENDPROC(); ELF(.size 
_gcry_serpent_sse2_cfb_dec,.-_gcry_serpent_sse2_cfb_dec;) .align 8 @@ -879,13 +879,19 @@ _gcry_serpent_sse2_ocb_enc: * %r8 : checksum * %r9 : L pointers (void *L[8]) */ + CFI_STARTPROC(); subq $(4 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(4 * 8); movq %r10, (0 * 8)(%rsp); movq %r11, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); + CFI_REL_OFFSET(%r10, 0 * 8); + CFI_REL_OFFSET(%r11, 1 * 8); + CFI_REL_OFFSET(%r12, 2 * 8); + CFI_REL_OFFSET(%r13, 3 * 8); movdqu (%rcx), RTMP0; movdqu (%r8), RTMP1; @@ -926,10 +932,15 @@ _gcry_serpent_sse2_ocb_enc: movq (1 * 8)(%rsp), %r11; movq (2 * 8)(%rsp), %r12; movq (3 * 8)(%rsp), %r13; + CFI_RESTORE(%r10); + CFI_RESTORE(%r11); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); call __serpent_enc_blk8; addq $(4 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(-4 * 8); pxor_u((0 * 16)(%rsi), RA4, RTMP0); pxor_u((1 * 16)(%rsi), RA1, RTMP0); @@ -966,6 +977,7 @@ _gcry_serpent_sse2_ocb_enc: pxor RNOT, RNOT; ret; + CFI_ENDPROC(); ELF(.size _gcry_serpent_sse2_ocb_enc,.-_gcry_serpent_sse2_ocb_enc;) .align 8 @@ -981,13 +993,19 @@ _gcry_serpent_sse2_ocb_dec: * %r8 : checksum * %r9 : L pointers (void *L[8]) */ + CFI_STARTPROC(); subq $(4 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(4 * 8); movq %r10, (0 * 8)(%rsp); movq %r11, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); + CFI_REL_OFFSET(%r10, 0 * 8); + CFI_REL_OFFSET(%r11, 1 * 8); + CFI_REL_OFFSET(%r12, 2 * 8); + CFI_REL_OFFSET(%r13, 3 * 8); movdqu (%rcx), RTMP0; @@ -1024,10 +1042,15 @@ _gcry_serpent_sse2_ocb_dec: movq (1 * 8)(%rsp), %r11; movq (2 * 8)(%rsp), %r12; movq (3 * 8)(%rsp), %r13; + CFI_RESTORE(%r10); + CFI_RESTORE(%r11); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); call __serpent_dec_blk8; addq $(4 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(-4 * 8); movdqu (%r8), RTMP0; @@ -1078,6 +1101,7 @@ _gcry_serpent_sse2_ocb_dec: pxor RNOT, RNOT; ret; + CFI_ENDPROC(); ELF(.size _gcry_serpent_sse2_ocb_dec,.-_gcry_serpent_sse2_ocb_dec;) .align 8 @@ -1092,13 +1116,19 @@ _gcry_serpent_sse2_ocb_auth: * %rcx: checksum * %r8 : L pointers (void *L[8]) */ + CFI_STARTPROC(); subq $(4 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(4 * 8); movq %r10, (0 * 8)(%rsp); movq %r11, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); + CFI_REL_OFFSET(%r10, 0 * 8); + CFI_REL_OFFSET(%r11, 1 * 8); + CFI_REL_OFFSET(%r12, 2 * 8); + CFI_REL_OFFSET(%r13, 3 * 8); movdqu (%rdx), RTMP0; @@ -1134,10 +1164,15 @@ _gcry_serpent_sse2_ocb_auth: movq (1 * 8)(%rsp), %r11; movq (2 * 8)(%rsp), %r12; movq (3 * 8)(%rsp), %r13; + CFI_RESTORE(%r10); + CFI_RESTORE(%r11); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); call __serpent_enc_blk8; addq $(4 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(-4 * 8); movdqu (%rcx), RTMP0; pxor RB4, RA4; @@ -1169,6 +1204,7 @@ _gcry_serpent_sse2_ocb_auth: pxor RNOT, RNOT; ret; + CFI_ENDPROC(); ELF(.size _gcry_serpent_sse2_ocb_auth,.-_gcry_serpent_sse2_ocb_auth;) #endif /*defined(USE_SERPENT)*/ diff --git a/cipher/sha1-avx-amd64.S b/cipher/sha1-avx-amd64.S index 5d674c151..85876ad41 100644 --- a/cipher/sha1-avx-amd64.S +++ b/cipher/sha1-avx-amd64.S @@ -33,18 +33,7 @@ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ defined(HAVE_GCC_INLINE_ASM_AVX) && defined(USE_SHA1) -#ifdef __PIC__ -# define RIP (%rip) -#else -# define RIP -#endif - - -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) 
/*_*/ -#endif +#include "asm-common-amd64.h" /* Context structure */ @@ -161,7 +150,7 @@ vpshufb BSWAP_REG, tmp0, W; #define W_PRECALC_00_15_2(i, W, tmp0) \ - vpaddd (.LK_XMM + ((i)/20)*16) RIP, W, tmp0; + vpaddd (.LK_XMM + ((i)/20)*16) rRIP, W, tmp0; #define W_PRECALC_00_15_3(i, W, tmp0) \ vmovdqa tmp0, WK(i&~3); @@ -186,7 +175,7 @@ #define W_PRECALC_16_31_3(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \ vpxor W, tmp0, tmp0; \ vpxor tmp1, tmp0, W; \ - vpaddd (.LK_XMM + ((i)/20)*16) RIP, W, tmp0; \ + vpaddd (.LK_XMM + ((i)/20)*16) rRIP, W, tmp0; \ vmovdqa tmp0, WK((i)&~3); #define W_PRECALC_32_79_0(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ @@ -203,7 +192,7 @@ #define W_PRECALC_32_79_3(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ vpor W, tmp0, W; \ - vpaddd (.LK_XMM + ((i)/20)*16) RIP, W, tmp0; \ + vpaddd (.LK_XMM + ((i)/20)*16) rRIP, W, tmp0; \ vmovdqa tmp0, WK((i)&~3); @@ -223,6 +212,7 @@ _gcry_sha1_transform_amd64_avx: * %rsi: data (64*nblks bytes) * %rdx: nblks */ + CFI_STARTPROC(); xorl %eax, %eax; cmpq $0, %rdx; @@ -234,9 +224,12 @@ _gcry_sha1_transform_amd64_avx: movq %rdi, RSTATE; movq %rsi, RDATA; pushq %rbx; + CFI_PUSH(%rbx); pushq %rbp; + CFI_PUSH(%rbp); movq %rsp, ROLDSTACK; + CFI_DEF_CFA_REGISTER(ROLDSTACK); subq $(16*4), %rsp; andq $(~31), %rsp; @@ -248,7 +241,7 @@ _gcry_sha1_transform_amd64_avx: movl state_h3(RSTATE), d; movl state_h4(RSTATE), e; - vmovdqa .Lbswap_shufb_ctl RIP, BSWAP_REG; + vmovdqa .Lbswap_shufb_ctl rRIP, BSWAP_REG; /* Precalc 0-15. */ W_PRECALC_00_15_0(0, W0, Wtmp0); @@ -415,15 +408,20 @@ _gcry_sha1_transform_amd64_avx: movl e, state_h4(RSTATE); movq ROLDSTACK, %rsp; + CFI_REGISTER(ROLDSTACK, %rsp); + CFI_DEF_CFA_REGISTER(%rsp); popq %rbp; + CFI_POP(%rbp); popq %rbx; + CFI_POP(%rbx); /* stack already burned */ xorl %eax, %eax; .Lret: ret; + CFI_ENDPROC(); ELF(.size _gcry_sha1_transform_amd64_avx, .-_gcry_sha1_transform_amd64_avx;) diff --git a/cipher/sha1-avx-bmi2-amd64.S b/cipher/sha1-avx-bmi2-amd64.S index fe8901eff..5dfcdca97 100644 --- a/cipher/sha1-avx-bmi2-amd64.S +++ b/cipher/sha1-avx-bmi2-amd64.S @@ -34,18 +34,7 @@ defined(HAVE_GCC_INLINE_ASM_BMI2) && \ defined(HAVE_GCC_INLINE_ASM_AVX) && defined(USE_SHA1) -#ifdef __PIC__ -# define RIP (%rip) -#else -# define RIP -#endif - - -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) /*_*/ -#endif +#include "asm-common-amd64.h" /* Context structure */ @@ -222,6 +211,7 @@ _gcry_sha1_transform_amd64_avx_bmi2: * %rsi: data (64*nblks bytes) * %rdx: nblks */ + CFI_STARTPROC(); xorl %eax, %eax; cmpq $0, %rdx; @@ -233,10 +223,14 @@ _gcry_sha1_transform_amd64_avx_bmi2: movq %rdi, RSTATE; movq %rsi, RDATA; pushq %rbx; + CFI_PUSH(%rbx); pushq %rbp; + CFI_PUSH(%rbp); pushq %r12; + CFI_PUSH(%r12); movq %rsp, ROLDSTACK; + CFI_DEF_CFA_REGISTER(ROLDSTACK); subq $(16*4), %rsp; andq $(~31), %rsp; @@ -249,11 +243,11 @@ _gcry_sha1_transform_amd64_avx_bmi2: movl state_h4(RSTATE), e; xorl ne, ne; - vmovdqa .Lbswap_shufb_ctl RIP, BSWAP_REG; - vpbroadcastd .LK1 RIP, K1; - vpbroadcastd .LK2 RIP, K2; - vpbroadcastd .LK3 RIP, K3; - vpbroadcastd .LK4 RIP, K4; + vmovdqa .Lbswap_shufb_ctl rRIP, BSWAP_REG; + vpbroadcastd .LK1 rRIP, K1; + vpbroadcastd .LK2 rRIP, K2; + vpbroadcastd .LK3 rRIP, K3; + vpbroadcastd .LK4 rRIP, K4; /* Precalc 0-15. 
*/ W_PRECALC_00_15_0(0, W0, Wtmp0); @@ -424,16 +418,22 @@ _gcry_sha1_transform_amd64_avx_bmi2: movl e, state_h4(RSTATE); movq ROLDSTACK, %rsp; + CFI_REGISTER(ROLDSTACK, %rsp); + CFI_DEF_CFA_REGISTER(%rsp); popq %r12; + CFI_POP(%r12); popq %rbp; + CFI_POP(%rbp); popq %rbx; + CFI_POP(%rbx); /* stack already burned */ xorl %eax, %eax; .Lret: ret; + CFI_ENDPROC(); ELF(.size _gcry_sha1_transform_amd64_avx_bmi2, .-_gcry_sha1_transform_amd64_avx_bmi2;) diff --git a/cipher/sha1-avx2-bmi2-amd64.S b/cipher/sha1-avx2-bmi2-amd64.S index 2a2f21a56..938632305 100644 --- a/cipher/sha1-avx2-bmi2-amd64.S +++ b/cipher/sha1-avx2-bmi2-amd64.S @@ -34,18 +34,7 @@ defined(HAVE_GCC_INLINE_ASM_BMI2) && defined(HAVE_GCC_INLINE_ASM_AVX) && \ defined(HAVE_GCC_INLINE_ASM_AVX2) && defined(USE_SHA1) -#ifdef __PIC__ -# define RIP (%rip) -#else -# define RIP -#endif - - -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) /*_*/ -#endif +#include "asm-common-amd64.h" /* Context structure */ @@ -228,6 +217,7 @@ _gcry_sha1_transform_amd64_avx2_bmi2: * %rsi: data (64*nblks bytes) * %rdx: nblks (multiple of 2, larger than 0) */ + CFI_STARTPROC(); vzeroupper; @@ -235,10 +225,14 @@ _gcry_sha1_transform_amd64_avx2_bmi2: movq %rdi, RSTATE; movq %rsi, RDATA; pushq %rbx; + CFI_PUSH(%rbx); pushq %rbp; + CFI_PUSH(%rbp); pushq %r12; + CFI_PUSH(%r12); movq %rsp, ROLDSTACK; + CFI_DEF_CFA_REGISTER(ROLDSTACK); subq $(WK_STACK_WORDS*4), %rsp; andq $(~63), %rsp; @@ -251,11 +245,11 @@ _gcry_sha1_transform_amd64_avx2_bmi2: movl state_h4(RSTATE), e; xorl ne, ne; - vbroadcasti128 .Lbswap_shufb_ctl RIP, BSWAP_REG; - vpbroadcastd .LK1 RIP, K1; - vpbroadcastd .LK2 RIP, K2; - vpbroadcastd .LK3 RIP, K3; - vpbroadcastd .LK4 RIP, K4; + vbroadcasti128 .Lbswap_shufb_ctl rRIP, BSWAP_REG; + vpbroadcastd .LK1 rRIP, K1; + vpbroadcastd .LK2 rRIP, K2; + vpbroadcastd .LK3 rRIP, K3; + vpbroadcastd .LK4 rRIP, K4; /* Precalc 0-31 for block 1 & 2. */ W_PRECALC_00_15_0(0, W0, Wtmp0); @@ -557,15 +551,21 @@ _gcry_sha1_transform_amd64_avx2_bmi2: movl e, state_h4(RSTATE); movq ROLDSTACK, %rsp; + CFI_REGISTER(ROLDSTACK, %rsp); + CFI_DEF_CFA_REGISTER(%rsp); popq %r12; + CFI_POP(%r12); popq %rbp; + CFI_POP(%rbp); popq %rbx; + CFI_POP(%rbx); /* stack already burned */ xorl %eax, %eax; ret; + CFI_ENDPROC(); ELF(.size _gcry_sha1_transform_amd64_avx2_bmi2, .-_gcry_sha1_transform_amd64_avx2_bmi2;) diff --git a/cipher/sha1-ssse3-amd64.S b/cipher/sha1-ssse3-amd64.S index fff140345..7e32b0f4b 100644 --- a/cipher/sha1-ssse3-amd64.S +++ b/cipher/sha1-ssse3-amd64.S @@ -33,18 +33,7 @@ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ defined(HAVE_GCC_INLINE_ASM_SSSE3) && defined(USE_SHA1) -#ifdef __PIC__ -# define RIP (%rip) -#else -# define RIP -#endif - - -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) 
/*_*/ -#endif +#include "asm-common-amd64.h" /* Context structure */ @@ -162,7 +151,7 @@ movdqa tmp0, W; #define W_PRECALC_00_15_2(i, W, tmp0) \ - paddd (.LK_XMM + ((i)/20)*16) RIP, tmp0; + paddd (.LK_XMM + ((i)/20)*16) rRIP, tmp0; #define W_PRECALC_00_15_3(i, W, tmp0) \ movdqa tmp0, WK(i&~3); @@ -193,7 +182,7 @@ pxor W, tmp0; \ pxor tmp1, tmp0; \ movdqa tmp0, W; \ - paddd (.LK_XMM + ((i)/20)*16) RIP, tmp0; \ + paddd (.LK_XMM + ((i)/20)*16) rRIP, tmp0; \ movdqa tmp0, WK((i)&~3); #define W_PRECALC_32_79_0(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ @@ -213,7 +202,7 @@ #define W_PRECALC_32_79_3(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ movdqa tmp0, W; \ - paddd (.LK_XMM + ((i)/20)*16) RIP, tmp0; \ + paddd (.LK_XMM + ((i)/20)*16) rRIP, tmp0; \ movdqa tmp0, WK((i)&~3); #define CLEAR_REG(reg) pxor reg, reg; @@ -235,6 +224,7 @@ _gcry_sha1_transform_amd64_ssse3: * %rsi: data (64*nblks bytes) * %rdx: nblks */ + CFI_STARTPROC(); xorl %eax, %eax; cmpq $0, %rdx; @@ -244,9 +234,12 @@ _gcry_sha1_transform_amd64_ssse3: movq %rdi, RSTATE; movq %rsi, RDATA; pushq %rbx; + CFI_PUSH(%rbx); pushq %rbp; + CFI_PUSH(%rbp); movq %rsp, ROLDSTACK; + CFI_DEF_CFA_REGISTER(ROLDSTACK); subq $(16*4), %rsp; andq $(~31), %rsp; @@ -258,7 +251,7 @@ _gcry_sha1_transform_amd64_ssse3: movl state_h3(RSTATE), d; movl state_h4(RSTATE), e; - movdqa .Lbswap_shufb_ctl RIP, BSWAP_REG; + movdqa .Lbswap_shufb_ctl rRIP, BSWAP_REG; /* Precalc 0-15. */ W_PRECALC_00_15_0(0, W0, Wtmp0); @@ -423,15 +416,20 @@ _gcry_sha1_transform_amd64_ssse3: movl e, state_h4(RSTATE); movq ROLDSTACK, %rsp; + CFI_REGISTER(ROLDSTACK, %rsp); + CFI_DEF_CFA_REGISTER(%rsp); popq %rbp; + CFI_POP(%rbp); popq %rbx; + CFI_POP(%rbx); /* stack already burned */ xorl %eax, %eax; .Lret: ret; + CFI_ENDPROC(); ELF(.size _gcry_sha1_transform_amd64_ssse3, .-_gcry_sha1_transform_amd64_ssse3;) diff --git a/cipher/sha256-avx-amd64.S b/cipher/sha256-avx-amd64.S index b8b01b15b..77143ff0e 100644 --- a/cipher/sha256-avx-amd64.S +++ b/cipher/sha256-avx-amd64.S @@ -59,17 +59,7 @@ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \ defined(HAVE_GCC_INLINE_ASM_AVX) && defined(USE_SHA256) -#ifdef __PIC__ -# define ADD_RIP +rip -#else -# define ADD_RIP -#endif - -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) /*_*/ -#endif +#include "asm-common-amd64.h" .intel_syntax noprefix @@ -380,15 +370,22 @@ rotate_Xs ELF(.type _gcry_sha256_transform_amd64_avx, at function;) .align 16 _gcry_sha256_transform_amd64_avx: + CFI_STARTPROC() vzeroupper push rbx + CFI_PUSH(rbx) push rbp + CFI_PUSH(rbp) push r13 + CFI_PUSH(r13) push r14 + CFI_PUSH(r14) push r15 + CFI_PUSH(r15) sub rsp, STACK_SIZE + CFI_ADJUST_CFA_OFFSET(STACK_SIZE); shl NUM_BLKS, 6 /* convert to bytes */ jz .Ldone_hash @@ -487,14 +484,21 @@ _gcry_sha256_transform_amd64_avx: xor eax, eax add rsp, STACK_SIZE + CFI_ADJUST_CFA_OFFSET(-STACK_SIZE); pop r15 + CFI_POP(r15) pop r14 + CFI_POP(r14) pop r13 + CFI_POP(r13) pop rbp + CFI_POP(rbp) pop rbx + CFI_POP(rbx) ret + CFI_ENDPROC() .align 16 diff --git a/cipher/sha256-avx2-bmi2-amd64.S b/cipher/sha256-avx2-bmi2-amd64.S index 5fc402cd1..52be1a07b 100644 --- a/cipher/sha256-avx2-bmi2-amd64.S +++ b/cipher/sha256-avx2-bmi2-amd64.S @@ -60,17 +60,7 @@ defined(HAVE_GCC_INLINE_ASM_AVX2) && defined(HAVE_GCC_INLINE_ASM_BMI2) && \ defined(USE_SHA256) -#ifdef __PIC__ -# define ADD_RIP +rip -#else -# define ADD_RIP -#endif - -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) 
__VA_ARGS__ -#else -# define ELF(...) /*_*/ -#endif +#include "asm-common-amd64.h" .intel_syntax noprefix @@ -314,17 +304,24 @@ a = TMP_ ELF(.type _gcry_sha256_transform_amd64_avx2, at function) .align 32 _gcry_sha256_transform_amd64_avx2: + CFI_STARTPROC() xor eax, eax cmp rdx, 0 je .Lnowork push rbx + CFI_PUSH(rbx) push rbp + CFI_PUSH(rbp) push r12 + CFI_PUSH(r12) push r13 + CFI_PUSH(r13) push r14 + CFI_PUSH(r14) push r15 + CFI_PUSH(r15) vzeroupper @@ -333,9 +330,11 @@ _gcry_sha256_transform_amd64_avx2: vmovdqa SHUF_DC00, [.L_SHUF_DC00 ADD_RIP] mov rax, rsp + CFI_DEF_CFA_REGISTER(rax); sub rsp, STACK_SIZE and rsp, ~63 mov [rsp + _RSP], rax + CFI_CFA_ON_STACK(_RSP, 6 * 8) shl NUM_BLKS, 6 /* convert to bytes */ lea NUM_BLKS, [NUM_BLKS + INP - 64] /* pointer to last block */ @@ -507,16 +506,24 @@ _gcry_sha256_transform_amd64_avx2: xor eax, eax mov rsp, [rsp + _RSP] + CFI_DEF_CFA_REGISTER(rsp) pop r15 + CFI_POP(r15) pop r14 + CFI_POP(r14) pop r13 + CFI_POP(r13) pop r12 + CFI_POP(r12) pop rbp + CFI_POP(rbp) pop rbx + CFI_POP(rbx) .Lnowork: ret + CFI_ENDPROC() .align 64 .LK256: diff --git a/cipher/sha256-ssse3-amd64.S b/cipher/sha256-ssse3-amd64.S index ca5c9fd1d..0fb94c1b3 100644 --- a/cipher/sha256-ssse3-amd64.S +++ b/cipher/sha256-ssse3-amd64.S @@ -60,17 +60,7 @@ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \ defined(HAVE_GCC_INLINE_ASM_SSSE3) && defined(USE_SHA256) -#ifdef __PIC__ -# define ADD_RIP +rip -#else -# define ADD_RIP -#endif - -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) /*_*/ -#endif +#include "asm-common-amd64.h" .intel_syntax noprefix @@ -386,13 +376,20 @@ rotate_Xs ELF(.type _gcry_sha256_transform_amd64_ssse3, at function;) .align 16 _gcry_sha256_transform_amd64_ssse3: + CFI_STARTPROC() push rbx + CFI_PUSH(rbx) push rbp + CFI_PUSH(rbp) push r13 + CFI_PUSH(r13) push r14 + CFI_PUSH(r14) push r15 + CFI_PUSH(r15) sub rsp, STACK_SIZE + CFI_ADJUST_CFA_OFFSET(STACK_SIZE); shl NUM_BLKS, 6 /* convert to bytes */ jz .Ldone_hash @@ -508,14 +505,21 @@ _gcry_sha256_transform_amd64_ssse3: xor eax, eax add rsp, STACK_SIZE + CFI_ADJUST_CFA_OFFSET(-STACK_SIZE); pop r15 + CFI_POP(r15) pop r14 + CFI_POP(r14) pop r13 + CFI_POP(r13) pop rbp + CFI_POP(rbp) pop rbx + CFI_POP(rbx) ret + CFI_ENDPROC() .align 16 diff --git a/cipher/sha512-avx-amd64.S b/cipher/sha512-avx-amd64.S index 534351e44..991fd6395 100644 --- a/cipher/sha512-avx-amd64.S +++ b/cipher/sha512-avx-amd64.S @@ -46,17 +46,7 @@ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \ defined(HAVE_GCC_INLINE_ASM_AVX) && defined(USE_SHA512) -#ifdef __PIC__ -# define ADD_RIP +rip -#else -# define ADD_RIP -#endif - -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) 
/*_*/ -#endif +#include "asm-common-amd64.h" .intel_syntax noprefix @@ -269,6 +259,7 @@ frame_size = ((frame_GPRSAVE) + (frame_GPRSAVE_size)) ELF(.type _gcry_sha512_transform_amd64_avx, at function;) .align 16 _gcry_sha512_transform_amd64_avx: + CFI_STARTPROC() xor eax, eax cmp msglen, 0 @@ -278,6 +269,7 @@ _gcry_sha512_transform_amd64_avx: /* Allocate Stack Space */ sub rsp, frame_size + CFI_ADJUST_CFA_OFFSET(frame_size); /* Save GPRs */ mov [rsp + frame_GPRSAVE + 8 * 0], rbx @@ -285,6 +277,11 @@ _gcry_sha512_transform_amd64_avx: mov [rsp + frame_GPRSAVE + 8 * 2], r13 mov [rsp + frame_GPRSAVE + 8 * 3], r14 mov [rsp + frame_GPRSAVE + 8 * 4], r15 + CFI_REL_OFFSET(rbx, frame_GPRSAVE + 8 * 0); + CFI_REL_OFFSET(r12, frame_GPRSAVE + 8 * 1); + CFI_REL_OFFSET(r13, frame_GPRSAVE + 8 * 2); + CFI_REL_OFFSET(r14, frame_GPRSAVE + 8 * 3); + CFI_REL_OFFSET(r15, frame_GPRSAVE + 8 * 4); .Lupdateblock: @@ -351,6 +348,11 @@ _gcry_sha512_transform_amd64_avx: mov r13, [rsp + frame_GPRSAVE + 8 * 2] mov r14, [rsp + frame_GPRSAVE + 8 * 3] mov r15, [rsp + frame_GPRSAVE + 8 * 4] + CFI_RESTORE(rbx) + CFI_RESTORE(r12) + CFI_RESTORE(r13) + CFI_RESTORE(r14) + CFI_RESTORE(r15) vzeroall @@ -365,9 +367,11 @@ _gcry_sha512_transform_amd64_avx: /* Restore Stack Pointer */ add rsp, frame_size + CFI_ADJUST_CFA_OFFSET(-frame_size); .Lnowork: ret + CFI_ENDPROC() /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/cipher/sha512-avx2-bmi2-amd64.S b/cipher/sha512-avx2-bmi2-amd64.S index 32cfceb0b..3b28ab6c6 100644 --- a/cipher/sha512-avx2-bmi2-amd64.S +++ b/cipher/sha512-avx2-bmi2-amd64.S @@ -49,17 +49,7 @@ defined(HAVE_GCC_INLINE_ASM_AVX2) && defined(HAVE_GCC_INLINE_ASM_BMI2) && \ defined(USE_SHA512) -#ifdef __PIC__ -# define ADD_RIP +rip -#else -# define ADD_RIP -#endif - -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) 
/*_*/ -#endif +#include "asm-common-amd64.h" .intel_syntax noprefix @@ -352,6 +342,7 @@ y4 = r12 ELF(.type _gcry_sha512_transform_amd64_avx2, at function;) .align 16 _gcry_sha512_transform_amd64_avx2: + CFI_STARTPROC() xor eax, eax cmp rdx, 0 @@ -361,9 +352,11 @@ _gcry_sha512_transform_amd64_avx2: /* Allocate Stack Space */ mov rax, rsp + CFI_DEF_CFA_REGISTER(rax); sub rsp, frame_size and rsp, ~(0x40 - 1) mov [rsp + frame_RSPSAVE], rax + CFI_CFA_ON_STACK(frame_RSPSAVE, 0) /* Save GPRs */ mov [rsp + frame_GPRSAVE + 8 * 0], rbp @@ -372,6 +365,12 @@ _gcry_sha512_transform_amd64_avx2: mov [rsp + frame_GPRSAVE + 8 * 3], r13 mov [rsp + frame_GPRSAVE + 8 * 4], r14 mov [rsp + frame_GPRSAVE + 8 * 5], r15 + CFI_REG_ON_STACK(rbp, frame_GPRSAVE + 8 * 0) + CFI_REG_ON_STACK(rbx, frame_GPRSAVE + 8 * 1) + CFI_REG_ON_STACK(r12, frame_GPRSAVE + 8 * 2) + CFI_REG_ON_STACK(r13, frame_GPRSAVE + 8 * 3) + CFI_REG_ON_STACK(r14, frame_GPRSAVE + 8 * 4) + CFI_REG_ON_STACK(r15, frame_GPRSAVE + 8 * 5) mov [rsp + frame_NBLKS], NUM_BLKS @@ -494,11 +493,20 @@ _gcry_sha512_transform_amd64_avx2: mov r13, [rsp + frame_GPRSAVE + 8 * 3] mov r14, [rsp + frame_GPRSAVE + 8 * 4] mov r15, [rsp + frame_GPRSAVE + 8 * 5] + CFI_RESTORE(rbp) + CFI_RESTORE(rbx) + CFI_RESTORE(r12) + CFI_RESTORE(r13) + CFI_RESTORE(r14) + CFI_RESTORE(r15) /* Restore Stack Pointer */ mov rsp, [rsp + frame_RSPSAVE] + CFI_DEF_CFA_REGISTER(rsp) + .Lnowork: ret + CFI_ENDPROC() /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ /*;; Binary Data */ diff --git a/cipher/sha512-ssse3-amd64.S b/cipher/sha512-ssse3-amd64.S index 8e950e0e4..39bfe3625 100644 --- a/cipher/sha512-ssse3-amd64.S +++ b/cipher/sha512-ssse3-amd64.S @@ -49,17 +49,7 @@ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \ defined(HAVE_GCC_INLINE_ASM_SSSE3) && defined(USE_SHA512) -#ifdef __PIC__ -# define ADD_RIP +rip -#else -# define ADD_RIP -#endif - -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) 
/*_*/ -#endif +#include "asm-common-amd64.h" .intel_syntax noprefix @@ -271,6 +261,7 @@ frame_size = ((frame_GPRSAVE) + (frame_GPRSAVE_size)) ELF(.type _gcry_sha512_transform_amd64_ssse3, at function;) .align 16 _gcry_sha512_transform_amd64_ssse3: + CFI_STARTPROC() xor eax, eax cmp msglen, 0 @@ -278,6 +269,7 @@ _gcry_sha512_transform_amd64_ssse3: /* Allocate Stack Space */ sub rsp, frame_size + CFI_ADJUST_CFA_OFFSET(frame_size); /* Save GPRs */ mov [rsp + frame_GPRSAVE + 8 * 0], rbx @@ -285,6 +277,11 @@ _gcry_sha512_transform_amd64_ssse3: mov [rsp + frame_GPRSAVE + 8 * 2], r13 mov [rsp + frame_GPRSAVE + 8 * 3], r14 mov [rsp + frame_GPRSAVE + 8 * 4], r15 + CFI_REL_OFFSET(rbx, frame_GPRSAVE + 8 * 0); + CFI_REL_OFFSET(r12, frame_GPRSAVE + 8 * 1); + CFI_REL_OFFSET(r13, frame_GPRSAVE + 8 * 2); + CFI_REL_OFFSET(r14, frame_GPRSAVE + 8 * 3); + CFI_REL_OFFSET(r15, frame_GPRSAVE + 8 * 4); .Lupdateblock: @@ -351,6 +348,11 @@ _gcry_sha512_transform_amd64_ssse3: mov r13, [rsp + frame_GPRSAVE + 8 * 2] mov r14, [rsp + frame_GPRSAVE + 8 * 3] mov r15, [rsp + frame_GPRSAVE + 8 * 4] + CFI_RESTORE(rbx) + CFI_RESTORE(r12) + CFI_RESTORE(r13) + CFI_RESTORE(r14) + CFI_RESTORE(r15) pxor xmm0, xmm0 pxor xmm1, xmm1 @@ -370,9 +372,11 @@ _gcry_sha512_transform_amd64_ssse3: /* Restore Stack Pointer */ add rsp, frame_size + CFI_ADJUST_CFA_OFFSET(-frame_size); .Lnowork: ret + CFI_ENDPROC() /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/cipher/twofish-amd64.S b/cipher/twofish-amd64.S index 134d6401e..3cb734317 100644 --- a/cipher/twofish-amd64.S +++ b/cipher/twofish-amd64.S @@ -171,12 +171,16 @@ _gcry_twofish_amd64_encrypt_block: * %rsi: dst * %rdx: src */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 subq $(3 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(3 * 8); movq %rsi, (0 * 8)(%rsp); movq %rbp, (1 * 8)(%rsp); movq %rbx, (2 * 8)(%rsp); + CFI_REL_OFFSET(%rbp, 1 * 8); + CFI_REL_OFFSET(%rbx, 2 * 8); movq %rdx, RX; inpack(RX, 0, RAd, 0); @@ -201,10 +205,14 @@ _gcry_twofish_amd64_encrypt_block: movq (2 * 8)(%rsp), %rbx; movq (1 * 8)(%rsp), %rbp; + CFI_RESTORE(%rbx); + CFI_RESTORE(%rbp); addq $(3 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(-3 * 8); EXIT_SYSV_FUNC ret; + CFI_ENDPROC(); ELF(.size _gcry_twofish_amd64_encrypt_block,.-_gcry_twofish_amd64_encrypt_block;) .align 8 @@ -217,12 +225,16 @@ _gcry_twofish_amd64_decrypt_block: * %rsi: dst * %rdx: src */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 subq $(3 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(3 * 8); movq %rsi, (0 * 8)(%rsp); movq %rbp, (1 * 8)(%rsp); movq %rbx, (2 * 8)(%rsp); + CFI_REL_OFFSET(%rbp, 1 * 8); + CFI_REL_OFFSET(%rbx, 2 * 8); movq %rdx, RX; inpack(RX, 0, RCd, 4); @@ -247,10 +259,14 @@ _gcry_twofish_amd64_decrypt_block: movq (2 * 8)(%rsp), %rbx; movq (1 * 8)(%rsp), %rbp; + CFI_RESTORE(%rbx); + CFI_RESTORE(%rbp); addq $(3 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(-3 * 8); EXIT_SYSV_FUNC ret; + CFI_ENDPROC(); ELF(.size _gcry_twofish_amd64_encrypt_block,.-_gcry_twofish_amd64_encrypt_block;) #undef CTX @@ -480,6 +496,8 @@ __twofish_enc_blk3: * output: * RCD0,RAB0,RCD1,RAB1,RCD2,RAB2: three ciphertext blocks */ + CFI_STARTPROC(); + inpack_enc3(); encrypt_cycle3(RAB, RCD, 0); @@ -494,6 +512,7 @@ __twofish_enc_blk3: outunpack_enc3(); ret; + CFI_ENDPROC(); ELF(.size __twofish_enc_blk3,.-__twofish_enc_blk3;) .align 8 @@ -506,6 +525,8 @@ __twofish_dec_blk3: * output: * RCD0,RAB0,RCD1,RAB1,RCD2,RAB2: three plaintext blocks */ + CFI_STARTPROC(); + inpack_dec3(); decrypt_cycle3(RAB, RCD, 7); @@ -520,6 +541,7 @@ __twofish_dec_blk3: outunpack_dec3(); ret; + 
CFI_ENDPROC(); ELF(.size __twofish_dec_blk3,.-__twofish_dec_blk3;) .align 8 @@ -532,15 +554,23 @@ _gcry_twofish_amd64_ctr_enc: * %rdx: src (3 blocks) * %rcx: iv (big endian, 128bit) */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 subq $(8 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(8 * 8); movq %rbp, (0 * 8)(%rsp); movq %rbx, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); movq %r14, (4 * 8)(%rsp); movq %r15, (5 * 8)(%rsp); + CFI_REL_OFFSET(%rbp, 0 * 8); + CFI_REL_OFFSET(%rbx, 1 * 8); + CFI_REL_OFFSET(%r12, 2 * 8); + CFI_REL_OFFSET(%r13, 3 * 8); + CFI_REL_OFFSET(%r14, 4 * 8); + CFI_REL_OFFSET(%r15, 5 * 8); movq %rsi, (6 * 8)(%rsp); movq %rdx, (7 * 8)(%rsp); @@ -601,10 +631,18 @@ _gcry_twofish_amd64_ctr_enc: movq (3 * 8)(%rsp), %r13; movq (4 * 8)(%rsp), %r14; movq (5 * 8)(%rsp), %r15; + CFI_RESTORE(%rbp); + CFI_RESTORE(%rbx); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); + CFI_RESTORE(%r14); + CFI_RESTORE(%r15); addq $(8 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(-8 * 8); EXIT_SYSV_FUNC ret; + CFI_ENDPROC(); ELF(.size _gcry_twofish_amd64_ctr_enc,.-_gcry_twofish_amd64_ctr_enc;) .align 8 @@ -617,15 +655,23 @@ _gcry_twofish_amd64_cbc_dec: * %rdx: src (3 blocks) * %rcx: iv (128bit) */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 subq $(9 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(9 * 8); movq %rbp, (0 * 8)(%rsp); movq %rbx, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); movq %r14, (4 * 8)(%rsp); movq %r15, (5 * 8)(%rsp); + CFI_REL_OFFSET(%rbp, 0 * 8); + CFI_REL_OFFSET(%rbx, 1 * 8); + CFI_REL_OFFSET(%r12, 2 * 8); + CFI_REL_OFFSET(%r13, 3 * 8); + CFI_REL_OFFSET(%r14, 4 * 8); + CFI_REL_OFFSET(%r15, 5 * 8); movq %rsi, (6 * 8)(%rsp); movq %rdx, (7 * 8)(%rsp); @@ -670,10 +716,18 @@ _gcry_twofish_amd64_cbc_dec: movq (3 * 8)(%rsp), %r13; movq (4 * 8)(%rsp), %r14; movq (5 * 8)(%rsp), %r15; + CFI_RESTORE(%rbp); + CFI_RESTORE(%rbx); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); + CFI_RESTORE(%r14); + CFI_RESTORE(%r15); addq $(9 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(-9 * 8); EXIT_SYSV_FUNC ret; + CFI_ENDPROC(); ELF(.size _gcry_twofish_amd64_cbc_dec,.-_gcry_twofish_amd64_cbc_dec;) .align 8 @@ -686,15 +740,23 @@ _gcry_twofish_amd64_cfb_dec: * %rdx: src (3 blocks) * %rcx: iv (128bit) */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 subq $(8 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(8 * 8); movq %rbp, (0 * 8)(%rsp); movq %rbx, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); movq %r14, (4 * 8)(%rsp); movq %r15, (5 * 8)(%rsp); + CFI_REL_OFFSET(%rbp, 0 * 8); + CFI_REL_OFFSET(%rbx, 1 * 8); + CFI_REL_OFFSET(%r12, 2 * 8); + CFI_REL_OFFSET(%r13, 3 * 8); + CFI_REL_OFFSET(%r14, 4 * 8); + CFI_REL_OFFSET(%r15, 5 * 8); movq %rsi, (6 * 8)(%rsp); movq %rdx, (7 * 8)(%rsp); @@ -739,10 +801,18 @@ _gcry_twofish_amd64_cfb_dec: movq (3 * 8)(%rsp), %r13; movq (4 * 8)(%rsp), %r14; movq (5 * 8)(%rsp), %r15; + CFI_RESTORE(%rbp); + CFI_RESTORE(%rbx); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); + CFI_RESTORE(%r14); + CFI_RESTORE(%r15); addq $(8 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(-8 * 8); EXIT_SYSV_FUNC ret; + CFI_ENDPROC(); ELF(.size _gcry_twofish_amd64_cfb_dec,.-_gcry_twofish_amd64_cfb_dec;) .align 8 @@ -757,15 +827,23 @@ _gcry_twofish_amd64_ocb_enc: * %r8 : checksum * %r9 : L pointers (void *L[3]) */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_6 subq $(8 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(8 * 8); movq %rbp, (0 * 8)(%rsp); movq %rbx, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); movq %r14, (4 * 8)(%rsp); movq %r15, (5 * 8)(%rsp); + CFI_REL_OFFSET(%rbp, 0 * 8); + CFI_REL_OFFSET(%rbx, 1 * 8); + 
CFI_REL_OFFSET(%r12, 2 * 8); + CFI_REL_OFFSET(%r13, 3 * 8); + CFI_REL_OFFSET(%r14, 4 * 8); + CFI_REL_OFFSET(%r15, 5 * 8); movq %rsi, (6 * 8)(%rsp); movq %rdx, RX0; @@ -849,10 +927,18 @@ _gcry_twofish_amd64_ocb_enc: movq (3 * 8)(%rsp), %r13; movq (4 * 8)(%rsp), %r14; movq (5 * 8)(%rsp), %r15; + CFI_RESTORE(%rbp); + CFI_RESTORE(%rbx); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); + CFI_RESTORE(%r14); + CFI_RESTORE(%r15); addq $(8 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(-8 * 8); EXIT_SYSV_FUNC ret; + CFI_ENDPROC(); ELF(.size _gcry_twofish_amd64_ocb_enc,.-_gcry_twofish_amd64_ocb_enc;) .align 8 @@ -867,15 +953,23 @@ _gcry_twofish_amd64_ocb_dec: * %r8 : checksum * %r9 : L pointers (void *L[3]) */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_6 subq $(8 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(8 * 8); movq %rbp, (0 * 8)(%rsp); movq %rbx, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); movq %r14, (4 * 8)(%rsp); movq %r15, (5 * 8)(%rsp); + CFI_REL_OFFSET(%rbp, 0 * 8); + CFI_REL_OFFSET(%rbx, 1 * 8); + CFI_REL_OFFSET(%r12, 2 * 8); + CFI_REL_OFFSET(%r13, 3 * 8); + CFI_REL_OFFSET(%r14, 4 * 8); + CFI_REL_OFFSET(%r15, 5 * 8); movq %rsi, (6 * 8)(%rsp); movq %r8, (7 * 8)(%rsp); @@ -967,10 +1061,18 @@ _gcry_twofish_amd64_ocb_dec: movq (3 * 8)(%rsp), %r13; movq (4 * 8)(%rsp), %r14; movq (5 * 8)(%rsp), %r15; + CFI_RESTORE(%rbp); + CFI_RESTORE(%rbx); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); + CFI_RESTORE(%r14); + CFI_RESTORE(%r15); addq $(8 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(-8 * 8); EXIT_SYSV_FUNC ret; + CFI_ENDPROC(); ELF(.size _gcry_twofish_amd64_ocb_dec,.-_gcry_twofish_amd64_ocb_dec;) .align 8 @@ -984,15 +1086,23 @@ _gcry_twofish_amd64_ocb_auth: * %rcx: checksum * %r8 : L pointers (void *L[3]) */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_5 subq $(8 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(8 * 8); movq %rbp, (0 * 8)(%rsp); movq %rbx, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); movq %r14, (4 * 8)(%rsp); movq %r15, (5 * 8)(%rsp); + CFI_REL_OFFSET(%rbp, 0 * 8); + CFI_REL_OFFSET(%rbx, 1 * 8); + CFI_REL_OFFSET(%r12, 2 * 8); + CFI_REL_OFFSET(%r13, 3 * 8); + CFI_REL_OFFSET(%r14, 4 * 8); + CFI_REL_OFFSET(%r15, 5 * 8); movq %rcx, (6 * 8)(%rsp); movq %rsi, RX0; @@ -1056,10 +1166,18 @@ _gcry_twofish_amd64_ocb_auth: movq (3 * 8)(%rsp), %r13; movq (4 * 8)(%rsp), %r14; movq (5 * 8)(%rsp), %r15; + CFI_RESTORE(%rbp); + CFI_RESTORE(%rbx); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); + CFI_RESTORE(%r14); + CFI_RESTORE(%r15); addq $(8 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(-8 * 8); EXIT_SYSV_FUNC ret; + CFI_ENDPROC(); ELF(.size _gcry_twofish_amd64_ocb_auth,.-_gcry_twofish_amd64_ocb_auth;) #endif /*USE_TWOFISH*/ diff --git a/cipher/twofish-avx2-amd64.S b/cipher/twofish-avx2-amd64.S index db6e21826..74cad3558 100644 --- a/cipher/twofish-avx2-amd64.S +++ b/cipher/twofish-avx2-amd64.S @@ -24,17 +24,7 @@ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && defined(USE_TWOFISH) && \ defined(ENABLE_AVX2_SUPPORT) -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) 
/*_*/ -#endif - -#ifdef __PIC__ -# define RIP (%rip) -#else -# define RIP -#endif +#include "asm-common-amd64.h" .text @@ -423,6 +413,7 @@ __twofish_enc_blk16: * RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: sixteen parallel * ciphertext blocks */ + CFI_STARTPROC(); init_round_constants(); transpose4x4_16(RA, RB, RC, RD); @@ -441,6 +432,7 @@ __twofish_enc_blk16: transpose4x4_16(RA, RB, RC, RD); ret; + CFI_ENDPROC(); ELF(.size __twofish_enc_blk16,.-__twofish_enc_blk16;) .align 8 @@ -454,6 +446,7 @@ __twofish_dec_blk16: * RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: sixteen parallel * ciphertext blocks */ + CFI_STARTPROC(); init_round_constants(); transpose4x4_16(RA, RB, RC, RD); @@ -472,6 +465,7 @@ __twofish_dec_blk16: transpose4x4_16(RA, RB, RC, RD); ret; + CFI_ENDPROC(); ELF(.size __twofish_dec_blk16,.-__twofish_dec_blk16;) #define inc_le128(x, minus_one, tmp) \ @@ -490,13 +484,14 @@ _gcry_twofish_avx2_ctr_enc: * %rdx: src (16 blocks) * %rcx: iv (big endian, 128bit) */ + CFI_STARTPROC(); movq 8(%rcx), %rax; bswapq %rax; vzeroupper; - vbroadcasti128 .Lbswap128_mask RIP, RTMP3; + vbroadcasti128 .Lbswap128_mask rRIP, RTMP3; vpcmpeqd RNOT, RNOT, RNOT; vpsrldq $8, RNOT, RNOT; /* ab: -1:0 ; cd: -1:0 */ vpaddq RNOT, RNOT, RTMP2; /* ab: -2:0 ; cd: -2:0 */ @@ -587,7 +582,8 @@ _gcry_twofish_avx2_ctr_enc: vzeroall; - ret + ret; + CFI_ENDPROC(); ELF(.size _gcry_twofish_avx2_ctr_enc,.-_gcry_twofish_avx2_ctr_enc;) .align 8 @@ -600,6 +596,7 @@ _gcry_twofish_avx2_cbc_dec: * %rdx: src (16 blocks) * %rcx: iv */ + CFI_STARTPROC(); vzeroupper; @@ -638,7 +635,8 @@ _gcry_twofish_avx2_cbc_dec: vzeroall; - ret + ret; + CFI_ENDPROC(); ELF(.size _gcry_twofish_avx2_cbc_dec,.-_gcry_twofish_avx2_cbc_dec;) .align 8 @@ -651,6 +649,7 @@ _gcry_twofish_avx2_cfb_dec: * %rdx: src (16 blocks) * %rcx: iv */ + CFI_STARTPROC(); vzeroupper; @@ -691,7 +690,8 @@ _gcry_twofish_avx2_cfb_dec: vzeroall; - ret + ret; + CFI_ENDPROC(); ELF(.size _gcry_twofish_avx2_cfb_dec,.-_gcry_twofish_avx2_cfb_dec;) .align 8 @@ -707,15 +707,21 @@ _gcry_twofish_avx2_ocb_enc: * %r8 : checksum * %r9 : L pointers (void *L[16]) */ + CFI_STARTPROC(); vzeroupper; subq $(4 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(4 * 8); movq %r10, (0 * 8)(%rsp); movq %r11, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); + CFI_REL_OFFSET(%r10, 0 * 8); + CFI_REL_OFFSET(%r11, 1 * 8); + CFI_REL_OFFSET(%r12, 2 * 8); + CFI_REL_OFFSET(%r13, 3 * 8); vmovdqu (%rcx), RTMP0x; vmovdqu (%r8), RTMP1x; @@ -768,10 +774,15 @@ _gcry_twofish_avx2_ocb_enc: movq (1 * 8)(%rsp), %r11; movq (2 * 8)(%rsp), %r12; movq (3 * 8)(%rsp), %r13; + CFI_RESTORE(%r10); + CFI_RESTORE(%r11); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); call __twofish_enc_blk16; addq $(4 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(-4 * 8); vpxor (0 * 32)(%rsi), RA0, RA0; vpxor (1 * 32)(%rsi), RB0, RB0; @@ -794,6 +805,7 @@ _gcry_twofish_avx2_ocb_enc: vzeroall; ret; + CFI_ENDPROC(); ELF(.size _gcry_twofish_avx2_ocb_enc,.-_gcry_twofish_avx2_ocb_enc;) .align 8 @@ -809,15 +821,21 @@ _gcry_twofish_avx2_ocb_dec: * %r8 : checksum * %r9 : L pointers (void *L[16]) */ + CFI_STARTPROC(); vzeroupper; subq $(4 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(4 * 8); movq %r10, (0 * 8)(%rsp); movq %r11, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); + CFI_REL_OFFSET(%r10, 0 * 8); + CFI_REL_OFFSET(%r11, 1 * 8); + CFI_REL_OFFSET(%r12, 2 * 8); + CFI_REL_OFFSET(%r13, 3 * 8); vmovdqu (%rcx), RTMP0x; @@ -865,6 +883,10 @@ _gcry_twofish_avx2_ocb_dec: movq (1 * 8)(%rsp), %r11; movq (2 * 8)(%rsp), %r12; movq (3 * 8)(%rsp), %r13; + CFI_RESTORE(%r10); + 
CFI_RESTORE(%r11); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); call __twofish_dec_blk16; @@ -880,6 +902,7 @@ _gcry_twofish_avx2_ocb_dec: vpxor (7 * 32)(%rsi), RD1, RD1; addq $(4 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(-4 * 8); /* Checksum_i = Checksum_{i-1} xor P_i */ @@ -907,6 +930,7 @@ _gcry_twofish_avx2_ocb_dec: vzeroall; ret; + CFI_ENDPROC(); ELF(.size _gcry_twofish_avx2_ocb_dec,.-_gcry_twofish_avx2_ocb_dec;) .align 8 @@ -921,15 +945,21 @@ _gcry_twofish_avx2_ocb_auth: * %rcx: checksum * %r8 : L pointers (void *L[16]) */ + CFI_STARTPROC(); vzeroupper; subq $(4 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(4 * 8); movq %r10, (0 * 8)(%rsp); movq %r11, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); + CFI_REL_OFFSET(%r10, 0 * 8); + CFI_REL_OFFSET(%r11, 1 * 8); + CFI_REL_OFFSET(%r12, 2 * 8); + CFI_REL_OFFSET(%r13, 3 * 8); vmovdqu (%rdx), RTMP0x; @@ -975,6 +1005,10 @@ _gcry_twofish_avx2_ocb_auth: movq (1 * 8)(%rsp), %r11; movq (2 * 8)(%rsp), %r12; movq (3 * 8)(%rsp), %r13; + CFI_RESTORE(%r10); + CFI_RESTORE(%r11); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); call __twofish_enc_blk16; @@ -987,6 +1021,7 @@ _gcry_twofish_avx2_ocb_auth: vpxor RA1, RC1, RA1; addq $(4 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(-4 * 8); vpxor RA1, RA0, RTMP1; @@ -998,6 +1033,7 @@ _gcry_twofish_avx2_ocb_auth: vzeroall; ret; + CFI_ENDPROC(); ELF(.size _gcry_twofish_avx2_ocb_auth,.-_gcry_twofish_avx2_ocb_auth;) .align 16 diff --git a/cipher/whirlpool-sse2-amd64.S b/cipher/whirlpool-sse2-amd64.S index e98b831c0..5631dc567 100644 --- a/cipher/whirlpool-sse2-amd64.S +++ b/cipher/whirlpool-sse2-amd64.S @@ -23,17 +23,7 @@ #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && defined(USE_WHIRLPOOL) -#ifdef __PIC__ -# define RIP %rip -#else -# define RIP -#endif - -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) /*_*/ -#endif +#include "asm-common-amd64.h" .text @@ -173,16 +163,24 @@ _gcry_whirlpool_transform_amd64: * %rdx: nblks * %rcx: look-up tables */ + CFI_STARTPROC(); cmp $0, %rdx; je .Lskip; subq $STACK_MAX, %rsp; + CFI_ADJUST_CFA_OFFSET(STACK_MAX); movq %rbp, STACK_RBP(%rsp); movq %rbx, STACK_RBX(%rsp); movq %r12, STACK_R12(%rsp); movq %r13, STACK_R13(%rsp); movq %r14, STACK_R14(%rsp); movq %r15, STACK_R15(%rsp); + CFI_REL_OFFSET(%rbp, STACK_RBP); + CFI_REL_OFFSET(%rbx, STACK_RBX); + CFI_REL_OFFSET(%r12, STACK_R12); + CFI_REL_OFFSET(%r13, STACK_R13); + CFI_REL_OFFSET(%r14, STACK_R14); + CFI_REL_OFFSET(%r15, STACK_R15); movq %rdx, STACK_NBLKS(%rsp); movq %rdi, STACK_STATEP(%rsp); @@ -332,10 +330,18 @@ _gcry_whirlpool_transform_amd64: movq STACK_R13(%rsp), %r13; movq STACK_R14(%rsp), %r14; movq STACK_R15(%rsp), %r15; + CFI_RESTORE(%rbp); + CFI_RESTORE(%rbx); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); + CFI_RESTORE(%r14); + CFI_RESTORE(%r15); addq $STACK_MAX, %rsp; + CFI_ADJUST_CFA_OFFSET(-STACK_MAX); .Lskip: movl $(STACK_MAX + 8), %eax; ret; + CFI_ENDPROC(); ELF(.size _gcry_whirlpool_transform_amd64,.-_gcry_whirlpool_transform_amd64;) #endif diff --git a/configure.ac b/configure.ac index b54b212b3..75949f942 100644 --- a/configure.ac +++ b/configure.ac @@ -1171,6 +1171,32 @@ if test "$gcry_cv_gcc_aarch64_platform_as_ok" = "yes" ; then fi +# +# Check whether GCC assembler supports for CFI directives. 
+# +AC_CACHE_CHECK([whether GCC assembler supports for CFI directives], + [gcry_cv_gcc_asm_cfi_directives], + [gcry_cv_gcc_asm_cfi_directives=no + AC_COMPILE_IFELSE([AC_LANG_SOURCE( + [[__asm__( + ".cfi_startproc\n\t" + ".cfi_remember_state\n\t" + ".cfi_adjust_cfa_offset 8\n\t" + ".cfi_rel_offset 0, 8\n\t" + ".cfi_def_cfa_register 1\n\t" + ".cfi_register 2, 3\n\t" + ".cfi_restore 2\n\t" + ".cfi_escape 0x0f, 0x02, 0x11, 0x00\n\t" + ".cfi_restore_state\n\t" + ".cfi_endproc\n\t" + );]])], + [gcry_cv_gcc_asm_cfi_directives=yes])]) +if test "$gcry_cv_gcc_asm_cfi_directives" = "yes" ; then + AC_DEFINE(HAVE_GCC_ASM_CFI_DIRECTIVES,1, + [Defined if underlying assembler supports for CFI directives]) +fi + + # # Check whether underscores in symbols are required. This needs to be # done before setting up the assembler stuff. @@ -1617,7 +1643,6 @@ if test "$gcry_cv_gcc_platform_as_ok_for_intel_syntax" = "yes" ; then [Defined if underlying assembler is compatible with Intel syntax assembly implementations]) fi - # # Check whether compiler is configured for ARMv6 or newer architecture # diff --git a/mpi/amd64/func_abi.h b/mpi/amd64/func_abi.h index ce4467441..37d5722af 100644 --- a/mpi/amd64/func_abi.h +++ b/mpi/amd64/func_abi.h @@ -1,9 +1,36 @@ +#include + +#ifdef HAVE_GCC_ASM_CFI_DIRECTIVES +# define CFI_STARTPROC() .cfi_startproc +# define CFI_ENDPROC() .cfi_endproc +# define CFI_ADJUST_CFA_OFFSET(off) .cfi_adjust_cfa_offset off +# define CFI_REL_OFFSET(reg,off) .cfi_rel_offset reg, off +# define CFI_RESTORE(reg) .cfi_restore reg + +# define CFI_PUSH(reg) \ + CFI_ADJUST_CFA_OFFSET(8); CFI_REL_OFFSET(reg, 0) +# define CFI_POP(reg) \ + CFI_ADJUST_CFA_OFFSET(-8); CFI_RESTORE(reg) +#else +# define CFI_STARTPROC() +# define CFI_ENDPROC() +# define CFI_ADJUST_CFA_OFFSET(off) +# define CFI_REL_OFFSET(reg,off) +# define CFI_RESTORE(reg) + +# define CFI_PUSH(reg) +# define CFI_POP(reg) +#endif + #ifdef USE_MS_ABI /* Store registers and move four first input arguments from MS ABI to * SYSV ABI. */ #define FUNC_ENTRY() \ + CFI_STARTPROC(); \ pushq %rsi; \ + CFI_PUSH(%rsi); \ pushq %rdi; \ + CFI_PUSH(%rdi); \ movq %rdx, %rsi; \ movq %rcx, %rdi; \ movq %r8, %rdx; \ @@ -12,8 +39,16 @@ /* Restore registers. */ #define FUNC_EXIT() \ popq %rdi; \ - popq %rsi; + CFI_POP(%rdi); \ + popq %rsi; \ + CFI_POP(%rsi); \ + ret; \ + CFI_ENDPROC(); #else - #define FUNC_ENTRY() /**/ - #define FUNC_EXIT() /**/ + #define FUNC_ENTRY() \ + CFI_STARTPROC(); + + #define FUNC_EXIT() \ + ret; \ + CFI_ENDPROC(); #endif diff --git a/mpi/amd64/mpih-add1.S b/mpi/amd64/mpih-add1.S index 6a9026219..157e5f1e0 100644 --- a/mpi/amd64/mpih-add1.S +++ b/mpi/amd64/mpih-add1.S @@ -62,4 +62,3 @@ C_SYMBOL_NAME(_gcry_mpih_add_n:) adcq %rax, %rax FUNC_EXIT() ret - \ No newline at end of file From devnexen at gmail.com Wed Apr 17 21:08:33 2019 From: devnexen at gmail.com (devnexen at gmail.com) Date: Wed, 17 Apr 2019 15:08:33 -0400 Subject: PATCH: NetBSD - explicit_memset support Message-ID: -----BEGIN This message is encrypted: Open Message Alternatively copy and paste the following link: https://flowcrypt.com/qSJwWL0c2h -------------- next part -------------- An HTML attachment was scrubbed... 
URL: 

From wk at gnupg.org  Tue Apr 23 17:16:43 2019
From: wk at gnupg.org (Werner Koch)
Date: Tue, 23 Apr 2019 17:16:43 +0200
Subject: [PATCH] Limit and document Blowfish key lengths to 8-576 bits
In-Reply-To: <6fea47bc-24af-cad3-282b-9947accd464e@iki.fi> (Jussi Kivilinna's message of "Thu, 18 Apr 2019 18:38:48 +0300")
References: <155553217686.31071.12317723959985459255.stgit@localhost.localdomain> <20190417215036.GA20903@al> <6fea47bc-24af-cad3-282b-9947accd464e@iki.fi>
Message-ID: <871s1syco4.fsf@wheatstone.g10code.de>

On Thu, 18 Apr 2019 18:38, jussi.kivilinna at iki.fi said:

> gcry_cipher_get_algo_keylen as existing users might depend it to stay
> fixed to 128bits.

Yes, this is the case.  We can't change that.  The variable keylength
is anyway very specific to Blowfish and does not justify a new
interface.

Salam-Shalom,

   Werner

-- 
Thoughts are free.  Exceptions are regulated by a federal law.

From jussi.kivilinna at iki.fi  Fri Apr 26 18:33:31 2019
From: jussi.kivilinna at iki.fi (Jussi Kivilinna)
Date: Fri, 26 Apr 2019 19:33:31 +0300
Subject: [PATCH 1/4] Add 64-bit ARMv8/CE PMULL implementation of CRC
Message-ID: <155629641122.14985.2140559336495629831.stgit@localhost.localdomain>

* cipher/Makefile.am: Add 'crc-armv8-ce.c' and 'crc-armv8-aarch64-ce.S'.
* cipher/asm-common-aarch64.h [HAVE_GCC_ASM_CFI_DIRECTIVES]: Add CFI
helper macros.
* cipher/crc-armv8-aarch64-ce.S: New.
* cipher/crc-armv8-ce.c: New.
* cipher/crc.c (USE_ARM_PMULL): New.
(CRC_CONTEXT) [USE_ARM_PMULL]: Add 'use_pmull'.
[USE_ARM_PMULL] (_gcry_crc32_armv8_ce_pmull)
(_gcry_crc24rfc2440_armv8_ce_pmull): New prototypes.
(crc32_init, crc32rfc1510_init, crc24rfc2440_init): Enable ARM PMULL
implementations if supported by HW features.
(crc32_write, crc24rfc2440_write) [USE_ARM_PMULL]: Use ARM PMULL
implementations if enabled.
* configure.ac: Add 'crc-armv8-ce.lo' and 'crc-armv8-aarch64-ce.lo'.
--

Benchmark on Cortex-A53 (at 1104 Mhz):

Before:
                |  nanosecs/byte   mebibytes/sec   cycles/byte
 CRC32          |      2.89 ns/B     330.2 MiB/s      3.19 c/B
 CRC32RFC1510   |      2.89 ns/B     330.2 MiB/s      3.19 c/B
 CRC24RFC2440   |      2.72 ns/B     350.8 MiB/s      3.00 c/B

After (crc32 ~8.4x faster, crc24 ~6.8x faster):
                |  nanosecs/byte   mebibytes/sec   cycles/byte
 CRC32          |     0.341 ns/B      2796 MiB/s     0.377 c/B
 CRC32RFC1510   |     0.342 ns/B      2792 MiB/s     0.377 c/B
 CRC24RFC2440   |     0.398 ns/B      2396 MiB/s     0.439 c/B

Signed-off-by: Jussi Kivilinna
---
 0 files changed

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 3f00ed4a8..2acd7cb38 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -78,7 +78,8 @@ EXTRA_libcipher_la_SOURCES = \
 	cast5.c cast5-amd64.S cast5-arm.S \
 	chacha20.c chacha20-amd64-ssse3.S chacha20-amd64-avx2.S \
 	chacha20-armv7-neon.S chacha20-aarch64.S \
-	crc.c crc-intel-pclmul.c \
+	crc.c crc-intel-pclmul.c crc-armv8-ce.c \
+	crc-armv8-aarch64-ce.S \
 	des.c des-amd64.S \
 	dsa.c \
 	elgamal.c \
diff --git a/cipher/crc-armv8-aarch64-ce.S b/cipher/crc-armv8-aarch64-ce.S
new file mode 100644
index 000000000..497d00551
--- /dev/null
+++ b/cipher/crc-armv8-aarch64-ce.S
@@ -0,0 +1,492 @@
+/* crc-armv8-aarch64-ce.S - ARMv8/CE PMULL accelerated CRC implementation
+ * Copyright (C) 2019 Jussi Kivilinna
+ *
+ * This file is part of Libgcrypt.
+ * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . + */ + +#include "asm-common-aarch64.h" + +#if defined(__AARCH64EL__) && \ + defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO) + +.cpu generic+simd+crypto + +.text + +#define GET_DATA_POINTER(reg, name) \ + adrp reg, :got:name ; \ + ldr reg, [reg, #:got_lo12:name] ; + +/* Structure of crc32_consts_s */ + +#define consts_k(idx) ((idx) * 8) +#define consts_my_p(idx) (consts_k(6) + (idx) * 8) + +/* Constants */ + +.align 6 +.Lcrc32_constants: +.Lcrc32_partial_fold_input_mask: + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +.Lcrc32_refl_shuf_shift: + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 + .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f +.Lcrc32_shuf_shift: + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff +.Lcrc32_bswap_shuf: + .byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08 + .byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00 + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + + +/* + * void _gcry_crc32r_armv8_ce_bulk (u32 *pcrc, const byte *inbuf, size_t inlen, + * const struct crc32_consts_s *consts); + */ +.align 3 +.globl _gcry_crc32r_armv8_ce_bulk +ELF(.type _gcry_crc32r_armv8_ce_bulk,%function;) +_gcry_crc32r_armv8_ce_bulk: + /* input: + * x0: pcrc + * x1: inbuf + * x2: inlen + * x3: consts + */ + + GET_DATA_POINTER(x7, .Lcrc32_constants) + add x9, x3, #consts_k(5 - 1) + cmp x2, #128 + + b.lo .Lcrc32r_fold_by_one_setup + + eor v4.16b, v4.16b, v4.16b + add x4, x3, #consts_k(1 - 1) + ld1 {v4.s}[0], [x0] /* load pcrc */ + ld1 {v0.16b-v3.16b}, [x1], #64 /* load 64 bytes of input */ + sub x2, x2, #64 + ld1 {v6.16b}, [x4] + eor v0.16b, v0.16b, v4.16b + + add x4, x3, #consts_k(3 - 1) + add x5, x3, #consts_my_p(0) + +.Lcrc32r_fold_by_four: + + /* Fold by 4. */ + ld1 {v16.16b-v19.16b}, [x1], #64 /* load 64 bytes of input */ + sub x2, x2, #64 + pmull v20.1q, v0.1d, v6.1d + pmull v21.1q, v1.1d, v6.1d + pmull v22.1q, v2.1d, v6.1d + pmull v23.1q, v3.1d, v6.1d + cmp x2, #64 + pmull2 v24.1q, v0.2d, v6.2d + pmull2 v25.1q, v1.2d, v6.2d + pmull2 v26.1q, v2.2d, v6.2d + pmull2 v27.1q, v3.2d, v6.2d + eor v0.16b, v20.16b, v16.16b + eor v1.16b, v21.16b, v17.16b + eor v2.16b, v22.16b, v18.16b + eor v3.16b, v23.16b, v19.16b + eor v0.16b, v0.16b, v24.16b + eor v1.16b, v1.16b, v25.16b + eor v2.16b, v2.16b, v26.16b + eor v3.16b, v3.16b, v27.16b + b.hs .Lcrc32r_fold_by_four + + ld1 {v6.16b}, [x4] + ld1 {v5.16b}, [x5] + + cmp x2, #16 + + /* Fold 4 to 1. 
*/ + + pmull v16.1q, v0.1d, v6.1d + pmull2 v4.1q, v0.2d, v6.2d + eor v0.16b, v16.16b, v1.16b + eor v0.16b, v0.16b, v4.16b + + pmull v16.1q, v0.1d, v6.1d + pmull2 v4.1q, v0.2d, v6.2d + eor v0.16b, v16.16b, v2.16b + eor v0.16b, v0.16b, v4.16b + + pmull v16.1q, v0.1d, v6.1d + pmull2 v4.1q, v0.2d, v6.2d + eor v0.16b, v16.16b, v3.16b + eor v0.16b, v0.16b, v4.16b + + b.lo .Lcrc32r_fold_by_one_done + b .Lcrc32r_fold_by_one + +.Lcrc32r_fold_by_one_setup: + + eor v1.16b, v1.16b, v1.16b + add x4, x3, #consts_k(3 - 1) + add x5, x3, #consts_my_p(0) + sub x2, x2, #16 + ld1 {v1.s}[0], [x0] /* load pcrc */ + ld1 {v0.16b}, [x1], #16 /* load 16 bytes of input */ + cmp x2, #16 + ld1 {v6.16b}, [x4] /* load k3k4 */ + ld1 {v5.16b}, [x5] /* load my_p */ + eor v0.16b, v0.16b, v1.16b + b.lo .Lcrc32r_fold_by_one_done + +.Lcrc32r_fold_by_one: + sub x2, x2, #16 + ld1 {v2.16b}, [x1], #16 /* load 16 bytes of input */ + pmull v3.1q, v0.1d, v6.1d + pmull2 v1.1q, v0.2d, v6.2d + cmp x2, #16 + eor v0.16b, v3.16b, v2.16b + eor v0.16b, v0.16b, v1.16b + + b.hs .Lcrc32r_fold_by_one + +.Lcrc32r_fold_by_one_done: + + cmp x2, #0 + b.eq .Lcrc32r_final_fold + + /* Partial fold. */ + + add x4, x7, #.Lcrc32_refl_shuf_shift - .Lcrc32_constants + add x5, x7, #.Lcrc32_refl_shuf_shift - .Lcrc32_constants + 16 + add x6, x7, #.Lcrc32_partial_fold_input_mask - .Lcrc32_constants + sub x8, x2, #16 + add x4, x4, x2 + add x5, x5, x2 + add x6, x6, x2 + add x8, x1, x8 + + /* Load last input and add padding zeros. */ + ld1 {v4.16b}, [x4] + eor x2, x2, x2 + ld1 {v3.16b}, [x5] + ld1 {v2.16b}, [x6] + tbl v30.16b, {v0.16b}, v4.16b + ld1 {v4.16b}, [x8] + tbl v1.16b, {v0.16b}, v3.16b + + pmull v0.1q, v30.1d, v6.1d + and v2.16b, v2.16b, v4.16b + pmull2 v31.1q, v30.2d, v6.2d + orr v2.16b, v2.16b, v1.16b + eor v0.16b, v0.16b, v31.16b + eor v0.16b, v0.16b, v2.16b + +.Lcrc32r_final_fold: + + /* Final fold. 
*/ + + eor v2.16b, v2.16b, v2.16b /* zero reg */ + ld1 {v7.16b}, [x9] + + /* reduce 128-bits to 96-bits */ + ext v6.16b, v6.16b, v6.16b, #8 /* swap high and low parts */ + mov v1.16b, v0.16b + pmull v0.1q, v0.1d, v6.1d + ext v6.16b, v5.16b, v5.16b, #8 /* swap high and low parts */ + ext v1.16b, v1.16b, v2.16b, #8 /* high to low, high zeroed */ + eor v3.16b, v0.16b, v1.16b + + /* reduce 96-bits to 64-bits */ + eor v1.16b, v1.16b, v1.16b + ext v0.16b, v3.16b, v2.16b, #4 /* [00][00][x2][x1] */ + mov v1.s[0], v3.s[0] /* [00][00][00][x0] */ + eor v3.16b, v3.16b, v3.16b + pmull v1.1q, v1.1d, v7.1d /* [00][00][xx][xx] */ + eor v0.16b, v0.16b, v1.16b /* top 64-bit are zero */ + + /* barrett reduction */ + mov v3.s[1], v0.s[0] /* [00][00][x1][00] */ + ext v0.16b, v2.16b, v0.16b, #12 /* [??][x1][??][00] */ + pmull v1.1q, v3.1d, v5.1d /* [00][xx][xx][00] */ + pmull v1.1q, v1.1d, v6.1d /* [00][xx][xx][00] */ + eor v0.16b, v0.16b, v1.16b + + /* store CRC */ + st1 {v0.s}[2], [x0] + + ret +ELF(.size _gcry_crc32r_armv8_ce_bulk,.-_gcry_crc32r_armv8_ce_bulk;) + +/* + * void _gcry_crc32r_armv8_ce_reduction_4 (u32 *pcrc, u32 data, u32 crc, + * const struct crc32_consts_s *consts); + */ +.align 3 +.globl _gcry_crc32r_armv8_ce_reduction_4 +ELF(.type _gcry_crc32r_armv8_ce_reduction_4,%function;) +_gcry_crc32r_armv8_ce_reduction_4: + /* input: + * w0: data + * w1: crc + * x2: crc32 constants + */ + + eor v0.16b, v0.16b, v0.16b + add x2, x2, #consts_my_p(0) + eor v1.16b, v1.16b, v1.16b + ld1 {v5.16b}, [x2] + + mov v0.s[0], w0 + pmull v0.1q, v0.1d, v5.1d /* [00][00][xx][xx] */ + mov v1.s[1], w1 + mov v0.s[2], v0.s[0] /* [00][x0][x1][x0] */ + pmull2 v0.1q, v0.2d, v5.2d /* [00][00][xx][xx] */ + eor v0.16b, v0.16b, v1.16b + + mov w0, v0.s[1] + + ret +ELF(.size _gcry_crc32r_armv8_ce_reduction_4,.-_gcry_crc32r_armv8_ce_reduction_4;) + +/* + * void _gcry_crc32_armv8_ce_bulk (u32 *pcrc, const byte *inbuf, size_t inlen, + * const struct crc32_consts_s *consts); + */ +.align 3 +.globl _gcry_crc32_armv8_ce_bulk +ELF(.type _gcry_crc32_armv8_ce_bulk,%function;) +_gcry_crc32_armv8_ce_bulk: + /* input: + * x0: pcrc + * x1: inbuf + * x2: inlen + * x3: consts + */ + + GET_DATA_POINTER(x7, .Lcrc32_constants) + add x4, x7, #.Lcrc32_bswap_shuf - .Lcrc32_constants + cmp x2, #128 + ld1 {v7.16b}, [x4] + + b.lo .Lcrc32_fold_by_one_setup + + eor v4.16b, v4.16b, v4.16b + add x4, x3, #consts_k(1 - 1) + ld1 {v4.s}[0], [x0] /* load pcrc */ + ld1 {v0.16b-v3.16b}, [x1], #64 /* load 64 bytes of input */ + sub x2, x2, #64 + ld1 {v6.16b}, [x4] + eor v0.16b, v0.16b, v4.16b + ext v4.16b, v6.16b, v6.16b, #8 + tbl v0.16b, { v0.16b }, v7.16b /* byte swap */ + tbl v1.16b, { v1.16b }, v7.16b /* byte swap */ + tbl v2.16b, { v2.16b }, v7.16b /* byte swap */ + tbl v3.16b, { v3.16b }, v7.16b /* byte swap */ + + add x4, x3, #consts_k(3 - 1) + add x5, x3, #consts_my_p(0) + +.Lcrc32_fold_by_four: + + /* Fold by 4. 
*/ + ld1 {v16.16b-v19.16b}, [x1], #64 /* load 64 bytes of input */ + sub x2, x2, #64 + tbl v16.16b, { v16.16b }, v7.16b /* byte swap */ + tbl v17.16b, { v17.16b }, v7.16b /* byte swap */ + tbl v18.16b, { v18.16b }, v7.16b /* byte swap */ + tbl v19.16b, { v19.16b }, v7.16b /* byte swap */ + cmp x2, #64 + pmull2 v20.1q, v0.2d, v4.2d + pmull2 v21.1q, v1.2d, v4.2d + pmull2 v22.1q, v2.2d, v4.2d + pmull2 v23.1q, v3.2d, v4.2d + pmull v24.1q, v0.1d, v4.1d + pmull v25.1q, v1.1d, v4.1d + pmull v26.1q, v2.1d, v4.1d + pmull v27.1q, v3.1d, v4.1d + eor v0.16b, v20.16b, v16.16b + eor v1.16b, v21.16b, v17.16b + eor v2.16b, v22.16b, v18.16b + eor v3.16b, v23.16b, v19.16b + eor v0.16b, v0.16b, v24.16b + eor v1.16b, v1.16b, v25.16b + eor v2.16b, v2.16b, v26.16b + eor v3.16b, v3.16b, v27.16b + b.hs .Lcrc32_fold_by_four + + ld1 {v6.16b}, [x4] + ld1 {v5.16b}, [x5] + ext v6.16b, v6.16b, v6.16b, #8 + ext v5.16b, v5.16b, v5.16b, #8 + + cmp x2, #16 + + /* Fold 4 to 1. */ + + pmull2 v16.1q, v0.2d, v6.2d + pmull v4.1q, v0.1d, v6.1d + eor v0.16b, v16.16b, v1.16b + eor v0.16b, v0.16b, v4.16b + + pmull2 v16.1q, v0.2d, v6.2d + pmull v4.1q, v0.1d, v6.1d + eor v0.16b, v16.16b, v2.16b + eor v0.16b, v0.16b, v4.16b + + pmull2 v16.1q, v0.2d, v6.2d + pmull v4.1q, v0.1d, v6.1d + eor v0.16b, v16.16b, v3.16b + eor v0.16b, v0.16b, v4.16b + + b.lo .Lcrc32_fold_by_one_done + b .Lcrc32_fold_by_one + +.Lcrc32_fold_by_one_setup: + + eor v1.16b, v1.16b, v1.16b + add x4, x3, #consts_k(3 - 1) + add x5, x3, #consts_my_p(0) + ld1 {v1.s}[0], [x0] /* load pcrc */ + sub x2, x2, #16 + ld1 {v0.16b}, [x1], #16 /* load 16 bytes of input */ + ld1 {v6.16b}, [x4] /* load k3k4 */ + ld1 {v5.16b}, [x5] /* load my_p */ + eor v0.16b, v0.16b, v1.16b + cmp x2, #16 + ext v6.16b, v6.16b, v6.16b, #8 /* swap high and low parts */ + ext v5.16b, v5.16b, v5.16b, #8 /* swap high and low parts */ + tbl v0.16b, { v0.16b }, v7.16b /* byte swap */ + b.lo .Lcrc32_fold_by_one_done + +.Lcrc32_fold_by_one: + sub x2, x2, #16 + ld1 {v2.16b}, [x1], #16 /* load 16 bytes of input */ + pmull2 v3.1q, v0.2d, v6.2d + tbl v2.16b, { v2.16b }, v7.16b /* byte swap */ + pmull v1.1q, v0.1d, v6.1d + cmp x2, #16 + eor v0.16b, v3.16b, v2.16b + eor v0.16b, v0.16b, v1.16b + + b.hs .Lcrc32_fold_by_one + +.Lcrc32_fold_by_one_done: + + cmp x2, #0 + b.eq .Lcrc32_final_fold + + /* Partial fold. */ + + add x4, x7, #.Lcrc32_refl_shuf_shift - .Lcrc32_constants + 32 + add x5, x7, #.Lcrc32_shuf_shift - .Lcrc32_constants + 16 + add x6, x7, #.Lcrc32_partial_fold_input_mask - .Lcrc32_constants + sub x8, x2, #16 + sub x4, x4, x2 + add x5, x5, x2 + add x6, x6, x2 + add x8, x1, x8 + + /* Load last input and add padding zeros. */ + ld1 {v4.16b}, [x4] + eor x2, x2, x2 + ld1 {v3.16b}, [x5] + ld1 {v2.16b}, [x6] + tbl v30.16b, {v0.16b}, v4.16b + ld1 {v4.16b}, [x8] + tbl v1.16b, {v0.16b}, v3.16b + and v2.16b, v2.16b, v4.16b + + pmull2 v0.1q, v30.2d, v6.2d + orr v2.16b, v2.16b, v1.16b + pmull v1.1q, v30.1d, v6.1d + tbl v2.16b, {v2.16b}, v7.16b /* byte swap */ + eor v0.16b, v0.16b, v1.16b + eor v0.16b, v0.16b, v2.16b + +.Lcrc32_final_fold: + + /* Final fold. 
*/ + + eor v2.16b, v2.16b, v2.16b /* zero reg */ + + /* reduce 128-bits to 96-bits */ + add x4, x3, #consts_k(4) + ext v3.16b, v6.16b, v6.16b, #8 /* swap high and low parts */ + eor v6.16b, v6.16b, v6.16b + mov v1.16b, v0.16b + pmull2 v0.1q, v0.2d, v3.2d + ld1 {v6.d}[1], [x4] /* load k4 */ + ext v1.16b, v2.16b, v1.16b, #8 /* low to high, low zeroed */ + eor v3.16b, v0.16b, v1.16b /* bottom 32-bit are zero */ + + /* reduce 96-bits to 64-bits */ + eor v0.16b, v0.16b, v0.16b + eor v1.16b, v1.16b, v1.16b + mov v0.s[1], v3.s[1] /* [00][00][x1][00] */ + mov v1.s[2], v3.s[3] /* [00][x3][00][00] */ + mov v0.s[2], v3.s[2] /* [00][x2][x1][00] */ + eor v3.16b, v3.16b, v3.16b + pmull2 v1.1q, v1.2d, v6.2d /* [00][xx][xx][00] */ + eor v0.16b, v0.16b, v1.16b /* top and bottom 32-bit are zero */ + + /* barrett reduction */ + mov v3.s[0], v0.s[1] /* [00][00][00][x1] */ + pmull2 v0.1q, v0.2d, v5.2d /* [00][xx][xx][xx] */ + ext v0.16b, v0.16b, v2.16b, #4 /* [00][00][xx][xx] */ + pmull v0.1q, v0.1d, v5.1d + eor v0.16b, v0.16b, v3.16b + + /* store CRC in input endian */ + rev32 v0.8b, v0.8b /* byte swap */ + st1 {v0.s}[0], [x0] + + ret +ELF(.size _gcry_crc32_armv8_ce_bulk,.-_gcry_crc32_armv8_ce_bulk;) + +/* + * void _gcry_crc32_armv8_ce_reduction_4 (u32 *pcrc, u32 data, u32 crc, + * const struct crc32_consts_s *consts); + */ +.align 3 +.globl _gcry_crc32_armv8_ce_reduction_4 +ELF(.type _gcry_crc32_armv8_ce_reduction_4,%function;) +_gcry_crc32_armv8_ce_reduction_4: + /* input: + * w0: data + * w1: crc + * x2: crc32 constants + */ + + eor v0.16b, v0.16b, v0.16b + add x2, x2, #consts_my_p(0) + eor v1.16b, v1.16b, v1.16b + ld1 {v5.16b}, [x2] + + mov v0.s[1], w0 + pmull v0.1q, v0.1d, v5.1d /* [00][xx][xx][00] */ + mov v1.s[0], w1 + pmull2 v0.1q, v0.2d, v5.2d /* [00][00][xx][xx] */ + eor v0.16b, v0.16b, v1.16b + + rev32 v0.8b, v0.8b /* Return in input endian */ + mov w0, v0.s[0] + + ret +ELF(.size _gcry_crc32_armv8_ce_reduction_4,.-_gcry_crc32_armv8_ce_reduction_4;) + +#endif diff --git a/cipher/crc-armv8-ce.c b/cipher/crc-armv8-ce.c new file mode 100644 index 000000000..8dd07cce6 --- /dev/null +++ b/cipher/crc-armv8-ce.c @@ -0,0 +1,229 @@ +/* crc-armv8-ce.c - ARMv8-CE PMULL accelerated CRC implementation + * Copyright (C) 2019 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA + * + */ + +#include +#include +#include +#include + +#include "g10lib.h" + +#include "bithelp.h" +#include "bufhelp.h" + + +#if defined(ENABLE_ARM_CRYPTO_SUPPORT) && defined(__AARCH64EL__) && \ + defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO) + + +#define ALIGNED_16 __attribute__ ((aligned (16))) + + +struct u16_unaligned_s +{ + u16 a; +} __attribute__((packed, aligned (1), may_alias)); + +struct u32_unaligned_s +{ + u32 a; +} __attribute__((packed, aligned (1), may_alias)); + + +/* Constants structure for generic reflected/non-reflected CRC32 PMULL + * functions. */ +struct crc32_consts_s +{ + /* k: { x^(32*17), x^(32*15), x^(32*5), x^(32*3), x^(32*2), 0 } mod P(x) */ + u64 k[6]; + /* my_p: { floor(x^64 / P(x)), P(x) } */ + u64 my_p[2]; +}; + +/* PMULL constants for CRC32 and CRC32RFC1510. */ +static const struct crc32_consts_s crc32_consts ALIGNED_16 = +{ + { /* k[6] = reverse_33bits( x^(32*y) mod P(x) ) */ + U64_C(0x154442bd4), U64_C(0x1c6e41596), /* y = { 17, 15 } */ + U64_C(0x1751997d0), U64_C(0x0ccaa009e), /* y = { 5, 3 } */ + U64_C(0x163cd6124), 0 /* y = 2 */ + }, + { /* my_p[2] = reverse_33bits ( { floor(x^64 / P(x)), P(x) } ) */ + U64_C(0x1f7011641), U64_C(0x1db710641) + } +}; + +/* PMULL constants for CRC24RFC2440 (polynomial multiplied with x?). */ +static const struct crc32_consts_s crc24rfc2440_consts ALIGNED_16 = +{ + { /* k[6] = x^(32*y) mod P(x) << 32*/ + U64_C(0x08289a00) << 32, U64_C(0x74b44a00) << 32, /* y = { 17, 15 } */ + U64_C(0xc4b14d00) << 32, U64_C(0xfd7e0c00) << 32, /* y = { 5, 3 } */ + U64_C(0xd9fe8c00) << 32, 0 /* y = 2 */ + }, + { /* my_p[2] = { floor(x^64 / P(x)), P(x) } */ + U64_C(0x1f845fe24), U64_C(0x1864cfb00) + } +}; + + +u32 _gcry_crc32r_armv8_ce_reduction_4 (u32 data, u32 crc, + const struct crc32_consts_s *consts); +void _gcry_crc32r_armv8_ce_bulk (u32 *pcrc, const byte *inbuf, size_t inlen, + const struct crc32_consts_s *consts); + +u32 _gcry_crc32_armv8_ce_reduction_4 (u32 data, u32 crc, + const struct crc32_consts_s *consts); +void _gcry_crc32_armv8_ce_bulk (u32 *pcrc, const byte *inbuf, size_t inlen, + const struct crc32_consts_s *consts); + + +static inline void +crc32r_less_than_16 (u32 *pcrc, const byte *inbuf, size_t inlen, + const struct crc32_consts_s *consts) +{ + u32 crc = *pcrc; + u32 data; + + while (inlen >= 4) + { + data = ((const struct u32_unaligned_s *)inbuf)->a; + data ^= crc; + + inlen -= 4; + inbuf += 4; + + crc = _gcry_crc32r_armv8_ce_reduction_4 (data, 0, consts); + } + + switch (inlen) + { + case 0: + break; + case 1: + data = inbuf[0]; + data ^= crc; + data <<= 24; + crc >>= 8; + crc = _gcry_crc32r_armv8_ce_reduction_4 (data, crc, consts); + break; + case 2: + data = ((const struct u16_unaligned_s *)inbuf)->a; + data ^= crc; + data <<= 16; + crc >>= 16; + crc = _gcry_crc32r_armv8_ce_reduction_4 (data, crc, consts); + break; + case 3: + data = ((const struct u16_unaligned_s *)inbuf)->a; + data |= inbuf[2] << 16; + data ^= crc; + data <<= 8; + crc >>= 24; + crc = _gcry_crc32r_armv8_ce_reduction_4 (data, crc, consts); + break; + } + + *pcrc = crc; +} + +static inline void +crc32_less_than_16 (u32 *pcrc, const byte *inbuf, size_t inlen, + const struct crc32_consts_s *consts) +{ + u32 crc = *pcrc; + u32 data; + + while (inlen >= 4) + { + data = ((const struct 
u32_unaligned_s *)inbuf)->a; + data ^= crc; + data = _gcry_bswap32(data); + + inlen -= 4; + inbuf += 4; + + crc = _gcry_crc32_armv8_ce_reduction_4 (data, 0, consts); + } + + switch (inlen) + { + case 0: + break; + case 1: + data = inbuf[0]; + data ^= crc; + data = data & 0xffU; + crc = _gcry_bswap32(crc >> 8); + crc = _gcry_crc32_armv8_ce_reduction_4 (data, crc, consts); + break; + case 2: + data = ((const struct u16_unaligned_s *)inbuf)->a; + data ^= crc; + data = _gcry_bswap32(data << 16); + crc = _gcry_bswap32(crc >> 16); + crc = _gcry_crc32_armv8_ce_reduction_4 (data, crc, consts); + break; + case 3: + data = ((const struct u16_unaligned_s *)inbuf)->a; + data |= inbuf[2] << 16; + data ^= crc; + data = _gcry_bswap32(data << 8); + crc = crc & 0xff000000U; + crc = _gcry_crc32_armv8_ce_reduction_4 (data, crc, consts); + break; + } + + *pcrc = crc; +} + +void +_gcry_crc32_armv8_ce_pmull (u32 *pcrc, const byte *inbuf, size_t inlen) +{ + const struct crc32_consts_s *consts = &crc32_consts; + + if (!inlen) + return; + + if (inlen >= 16) + _gcry_crc32r_armv8_ce_bulk (pcrc, inbuf, inlen, consts); + else + crc32r_less_than_16 (pcrc, inbuf, inlen, consts); +} + +void +_gcry_crc24rfc2440_armv8_ce_pmull (u32 *pcrc, const byte *inbuf, size_t inlen) +{ + const struct crc32_consts_s *consts = &crc24rfc2440_consts; + + if (!inlen) + return; + + /* Note: *pcrc in input endian. */ + + if (inlen >= 16) + _gcry_crc32_armv8_ce_bulk (pcrc, inbuf, inlen, consts); + else + crc32_less_than_16 (pcrc, inbuf, inlen, consts); +} + +#endif /* USE_INTEL_PCLMUL */ diff --git a/cipher/crc.c b/cipher/crc.c index 4457ff62f..2abbab288 100644 --- a/cipher/crc.c +++ b/cipher/crc.c @@ -42,12 +42,24 @@ # endif #endif /* USE_INTEL_PCLMUL */ +/* USE_ARM_PMULL indicates whether to compile GCM with ARMv8 PMULL code. */ +#undef USE_ARM_PMULL +#if defined(ENABLE_ARM_CRYPTO_SUPPORT) +# if defined(__AARCH64EL__) && \ + defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO) +# define USE_ARM_PMULL 1 +# endif +#endif /* USE_ARM_PMULL */ typedef struct { u32 CRC; #ifdef USE_INTEL_PCLMUL unsigned int use_pclmul:1; /* Intel PCLMUL shall be used. */ +#endif +#ifdef USE_ARM_PMULL + unsigned int use_pmull:1; /* ARMv8 PMULL shall be used. 
*/ #endif byte buf[4]; } @@ -61,6 +73,13 @@ void _gcry_crc24rfc2440_intel_pclmul (u32 *pcrc, const byte *inbuf, size_t inlen); #endif +#ifdef USE_ARM_PMULL +/*-- crc-armv8-ce.c --*/ +void _gcry_crc32_armv8_ce_pmull (u32 *pcrc, const byte *inbuf, size_t inlen); +void _gcry_crc24rfc2440_armv8_ce_pmull (u32 *pcrc, const byte *inbuf, + size_t inlen); +#endif + /* * Code generated by universal_crc by Danjel McGougan @@ -361,13 +380,17 @@ static void crc32_init (void *context, unsigned int flags) { CRC_CONTEXT *ctx = (CRC_CONTEXT *) context; -#ifdef USE_INTEL_PCLMUL u32 hwf = _gcry_get_hw_features (); +#ifdef USE_INTEL_PCLMUL ctx->use_pclmul = (hwf & HWF_INTEL_SSE4_1) && (hwf & HWF_INTEL_PCLMUL); #endif +#ifdef USE_ARM_PMULL + ctx->use_pmull = (hwf & HWF_ARM_NEON) && (hwf & HWF_ARM_PMULL); +#endif (void)flags; + (void)hwf; ctx->CRC = 0 ^ 0xffffffffL; } @@ -386,6 +409,13 @@ crc32_write (void *context, const void *inbuf_arg, size_t inlen) return; } #endif +#ifdef USE_ARM_PMULL + if (ctx->use_pmull) + { + _gcry_crc32_armv8_ce_pmull(&ctx->CRC, inbuf, inlen); + return; + } +#endif if (!inbuf || !inlen) return; @@ -439,13 +469,17 @@ static void crc32rfc1510_init (void *context, unsigned int flags) { CRC_CONTEXT *ctx = (CRC_CONTEXT *) context; -#ifdef USE_INTEL_PCLMUL u32 hwf = _gcry_get_hw_features (); +#ifdef USE_INTEL_PCLMUL ctx->use_pclmul = (hwf & HWF_INTEL_SSE4_1) && (hwf & HWF_INTEL_PCLMUL); #endif +#ifdef USE_ARM_PMULL + ctx->use_pmull = (hwf & HWF_ARM_NEON) && (hwf & HWF_ARM_PMULL); +#endif (void)flags; + (void)hwf; ctx->CRC = 0; } @@ -769,12 +803,16 @@ static void crc24rfc2440_init (void *context, unsigned int flags) { CRC_CONTEXT *ctx = (CRC_CONTEXT *) context; -#ifdef USE_INTEL_PCLMUL u32 hwf = _gcry_get_hw_features (); +#ifdef USE_INTEL_PCLMUL ctx->use_pclmul = (hwf & HWF_INTEL_SSE4_1) && (hwf & HWF_INTEL_PCLMUL); #endif +#ifdef USE_ARM_PMULL + ctx->use_pmull = (hwf & HWF_ARM_NEON) && (hwf & HWF_ARM_PMULL); +#endif + (void)hwf; (void)flags; ctx->CRC = crc24_init(); @@ -794,6 +832,13 @@ crc24rfc2440_write (void *context, const void *inbuf_arg, size_t inlen) return; } #endif +#ifdef USE_ARM_PMULL + if (ctx->use_pmull) + { + _gcry_crc24rfc2440_armv8_ce_pmull(&ctx->CRC, inbuf, inlen); + return; + } +#endif if (!inbuf || !inlen) return; diff --git a/configure.ac b/configure.ac index 1aafc320a..aa23a5010 100644 --- a/configure.ac +++ b/configure.ac @@ -2409,6 +2409,11 @@ if test "$found" = "1" ; then # Build with the assembly implementation GCRYPT_DIGESTS="$GCRYPT_DIGESTS crc-intel-pclmul.lo" ;; + aarch64-*-*) + # Build with the assembly implementation + GCRYPT_CIPHERS="$GCRYPT_CIPHERS crc-armv8-ce.lo" + GCRYPT_CIPHERS="$GCRYPT_CIPHERS crc-armv8-aarch64-ce.lo" + ;; esac fi From jussi.kivilinna at iki.fi Fri Apr 26 18:33:41 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Fri, 26 Apr 2019 19:33:41 +0300 Subject: [PATCH 3/4] Move data pointer macro for 64-bit ARM assembly to common header In-Reply-To: <155629641122.14985.2140559336495629831.stgit@localhost.localdomain> References: <155629641122.14985.2140559336495629831.stgit@localhost.localdomain> Message-ID: <155629642157.14985.4393579564393301810.stgit@localhost.localdomain> * cipher/asm-common-aarch64.h (GET_DATA_POINTER): New. * cipher/chacha20-aarch64.S (GET_DATA_POINTER): Remove. * cipher/cipher-gcm-armv8-aarch64-ce.S (GET_DATA_POINTER): Remove. * cipher/crc-armv8-aarch64-ce.S (GET_DATA_POINTER): Remove. * cipher/rijndael-armv8-aarch64-ce.S (GET_DATA_POINTER): Remove. * cipher/sha1-armv8-aarch64-ce.S (GET_DATA_POINTER): Remove. 
* cipher/sha256-armv8-aarch64-ce.S (GET_DATA_POINTER): Remove. -- Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/cipher/asm-common-aarch64.h b/cipher/asm-common-aarch64.h index 502c35aeb..4ffc1b711 100644 --- a/cipher/asm-common-aarch64.h +++ b/cipher/asm-common-aarch64.h @@ -29,6 +29,16 @@ # define ELF(...) /*_*/ #endif +#ifdef _WIN32 +#define GET_DATA_POINTER(reg, name) \ + adrp reg, name ; \ + add reg, reg, #:lo12:name ; +#else +#define GET_DATA_POINTER(reg, name) \ + adrp reg, :got:name ; \ + ldr reg, [reg, #:got_lo12:name] ; +#endif + #ifdef HAVE_GCC_ASM_CFI_DIRECTIVES /* CFI directives to emit DWARF stack unwinding information. */ # define CFI_STARTPROC() .cfi_startproc diff --git a/cipher/chacha20-aarch64.S b/cipher/chacha20-aarch64.S index adb9b1f29..07b4bb5c0 100644 --- a/cipher/chacha20-aarch64.S +++ b/cipher/chacha20-aarch64.S @@ -38,15 +38,6 @@ .text -#ifdef _WIN32 -#define GET_DATA_POINTER(reg, name) \ - adrp reg, name ; \ - add reg, reg, #:lo12:name ; -#else -#define GET_DATA_POINTER(reg, name) \ - adrp reg, :got:name ; \ - ldr reg, [reg, #:got_lo12:name] ; -#endif /* register macros */ #define INPUT x0 diff --git a/cipher/cipher-gcm-armv8-aarch64-ce.S b/cipher/cipher-gcm-armv8-aarch64-ce.S index 7c6be94ed..b0c2cccc6 100644 --- a/cipher/cipher-gcm-armv8-aarch64-ce.S +++ b/cipher/cipher-gcm-armv8-aarch64-ce.S @@ -27,10 +27,6 @@ .text -#define GET_DATA_POINTER(reg, name) \ - adrp reg, :got:name ; \ - ldr reg, [reg, #:got_lo12:name] ; - /* Constants */ diff --git a/cipher/crc-armv8-aarch64-ce.S b/cipher/crc-armv8-aarch64-ce.S index f269b74a3..060abdfe9 100644 --- a/cipher/crc-armv8-aarch64-ce.S +++ b/cipher/crc-armv8-aarch64-ce.S @@ -27,9 +27,6 @@ .text -#define GET_DATA_POINTER(reg, name) \ - adrp reg, :got:name ; \ - ldr reg, [reg, #:got_lo12:name] ; /* Structure of crc32_consts_s */ diff --git a/cipher/rijndael-armv8-aarch64-ce.S b/cipher/rijndael-armv8-aarch64-ce.S index 71b45b856..3af29e0d0 100644 --- a/cipher/rijndael-armv8-aarch64-ce.S +++ b/cipher/rijndael-armv8-aarch64-ce.S @@ -28,11 +28,6 @@ .text -#define GET_DATA_POINTER(reg, name) \ - adrp reg, :got:name ; \ - ldr reg, [reg, #:got_lo12:name] ; - - /* Register macros */ #define vk0 v17 diff --git a/cipher/sha1-armv8-aarch64-ce.S b/cipher/sha1-armv8-aarch64-ce.S index 7dc26c0f1..223268cad 100644 --- a/cipher/sha1-armv8-aarch64-ce.S +++ b/cipher/sha1-armv8-aarch64-ce.S @@ -28,11 +28,6 @@ .text -#define GET_DATA_POINTER(reg, name) \ - adrp reg, :got:name ; \ - ldr reg, [reg, #:got_lo12:name] ; - - /* Constants */ #define K1 0x5A827999 diff --git a/cipher/sha256-armv8-aarch64-ce.S b/cipher/sha256-armv8-aarch64-ce.S index 706e0dfd9..f57cae290 100644 --- a/cipher/sha256-armv8-aarch64-ce.S +++ b/cipher/sha256-armv8-aarch64-ce.S @@ -28,11 +28,6 @@ .text -#define GET_DATA_POINTER(reg, name) \ - adrp reg, :got:name ; \ - ldr reg, [reg, #:got_lo12:name] ; - - /* Constants */ .align 4 From jussi.kivilinna at iki.fi Fri Apr 26 18:33:36 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Fri, 26 Apr 2019 19:33:36 +0300 Subject: [PATCH 2/4] Add CFI unwind assembly directives for 64-bit ARM assembly In-Reply-To: <155629641122.14985.2140559336495629831.stgit@localhost.localdomain> References: <155629641122.14985.2140559336495629831.stgit@localhost.localdomain> Message-ID: <155629641640.14985.5996707259227955897.stgit@localhost.localdomain> * cipher/asm-common-aarch64.h (CFI_STARTPROC, CFI_ENDPROC) (CFI_REMEMBER_STATE, CFI_RESTORE_STATE, CFI_ADJUST_CFA_OFFSET) (CFI_REL_OFFSET, CFI_DEF_CFA_REGISTER, 
CFI_REGISTER, CFI_RESTORE) (DW_REGNO_SP, DW_SLEB128_7BIT, DW_SLEB128_28BIT, CFI_CFA_ON_STACK) (CFI_REG_ON_STACK): New. * cipher/camellia-aarch64.S: Add CFI directives. * cipher/chacha20-aarch64.S: Add CFI directives. * cipher/cipher-gcm-armv8-aarch64-ce.S: Add CFI directives. * cipher/crc-armv8-aarch64-ce.S: Add CFI directives. * cipher/rijndael-aarch64.S: Add CFI directives. * cipher/rijndael-armv8-aarch64-ce.S: Add CFI directives. * cipher/sha1-armv8-aarch64-ce.S: Add CFI directives. * cipher/sha256-armv8-aarch64-ce.S: Add CFI directives. * cipher/twofish-aarch64.S: Add CFI directives. * mpi/aarch64/mpih-add1.S: Add CFI directives. * mpi/aarch64/mpih-mul1.S: Add CFI directives. * mpi/aarch64/mpih-mul2.S: Add CFI directives. * mpi/aarch64/mpih-mul3.S: Add CFI directives. * mpi/aarch64/mpih-sub1.S: Add CFI directives. * mpi/asm-common-aarch64.h: Include "../cipher/asm-common-aarch64.h". (ELF): Remove. -- This commit adds CFI directives that add DWARF unwinding information for debugger to backtrace when executing code from 64-bit ARM assembly files. Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/cipher/asm-common-aarch64.h b/cipher/asm-common-aarch64.h index 814b7ad16..502c35aeb 100644 --- a/cipher/asm-common-aarch64.h +++ b/cipher/asm-common-aarch64.h @@ -29,4 +29,62 @@ # define ELF(...) /*_*/ #endif +#ifdef HAVE_GCC_ASM_CFI_DIRECTIVES +/* CFI directives to emit DWARF stack unwinding information. */ +# define CFI_STARTPROC() .cfi_startproc +# define CFI_ENDPROC() .cfi_endproc +# define CFI_REMEMBER_STATE() .cfi_remember_state +# define CFI_RESTORE_STATE() .cfi_restore_state +# define CFI_ADJUST_CFA_OFFSET(off) .cfi_adjust_cfa_offset off +# define CFI_REL_OFFSET(reg,off) .cfi_rel_offset reg, off +# define CFI_DEF_CFA_REGISTER(reg) .cfi_def_cfa_register reg +# define CFI_REGISTER(ro,rn) .cfi_register ro, rn +# define CFI_RESTORE(reg) .cfi_restore reg + +/* CFA expressions are used for pointing CFA and registers to + * SP relative offsets. */ +# define DW_REGNO_SP 31 + +/* Fixed length encoding used for integers for now. 
*/ +# define DW_SLEB128_7BIT(value) \ + 0x00|((value) & 0x7f) +# define DW_SLEB128_28BIT(value) \ + 0x80|((value)&0x7f), \ + 0x80|(((value)>>7)&0x7f), \ + 0x80|(((value)>>14)&0x7f), \ + 0x00|(((value)>>21)&0x7f) + +# define CFI_CFA_ON_STACK(rsp_offs,cfa_depth) \ + .cfi_escape \ + 0x0f, /* DW_CFA_def_cfa_expression */ \ + DW_SLEB128_7BIT(11), /* length */ \ + 0x8f, /* DW_OP_breg31, rsp + constant */ \ + DW_SLEB128_28BIT(rsp_offs), \ + 0x06, /* DW_OP_deref */ \ + 0x23, /* DW_OP_plus_constu */ \ + DW_SLEB128_28BIT((cfa_depth)+8) + +# define CFI_REG_ON_STACK(regno,rsp_offs) \ + .cfi_escape \ + 0x10, /* DW_CFA_expression */ \ + DW_SLEB128_7BIT(regno), \ + DW_SLEB128_7BIT(5), /* length */ \ + 0x8f, /* DW_OP_breg31, rsp + constant */ \ + DW_SLEB128_28BIT(rsp_offs) + +#else +# define CFI_STARTPROC() +# define CFI_ENDPROC() +# define CFI_REMEMBER_STATE() +# define CFI_RESTORE_STATE() +# define CFI_ADJUST_CFA_OFFSET(off) +# define CFI_REL_OFFSET(reg,off) +# define CFI_DEF_CFA_REGISTER(reg) +# define CFI_REGISTER(ro,rn) +# define CFI_RESTORE(reg) + +# define CFI_CFA_ON_STACK(rsp_offs,cfa_depth) +# define CFI_REG_ON_STACK(reg,rsp_offs) +#endif + #endif /* GCRY_ASM_COMMON_AARCH64_H */ diff --git a/cipher/camellia-aarch64.S b/cipher/camellia-aarch64.S index 5c6ab020a..f49808621 100644 --- a/cipher/camellia-aarch64.S +++ b/cipher/camellia-aarch64.S @@ -201,7 +201,12 @@ ELF(.type _gcry_camellia_arm_encrypt_block, at function;) _gcry_camellia_arm_encrypt_block: + CFI_STARTPROC() stp x19, x30, [sp, #-16]! + CFI_ADJUST_CFA_OFFSET(16) + CFI_REG_ON_STACK(19, 0) + CFI_REG_ON_STACK(30, 8) + /* input: * x0: keytable * x1: dst @@ -228,8 +233,13 @@ _gcry_camellia_arm_encrypt_block: outunpack(24); + CFI_REMEMBER_STATE() ldp x19, x30, [sp], #16 + CFI_ADJUST_CFA_OFFSET(-16) + CFI_RESTORE(x19) + CFI_RESTORE(x30) ret; + CFI_RESTORE_STATE() .ltorg .Lenc_256: @@ -239,7 +249,11 @@ _gcry_camellia_arm_encrypt_block: outunpack(32); ldp x19, x30, [sp], #16 + CFI_ADJUST_CFA_OFFSET(-16) + CFI_RESTORE(x19) + CFI_RESTORE(x30) ret; + CFI_ENDPROC() .ltorg ELF(.size _gcry_camellia_arm_encrypt_block,.-_gcry_camellia_arm_encrypt_block;) @@ -247,7 +261,12 @@ ELF(.size _gcry_camellia_arm_encrypt_block,.-_gcry_camellia_arm_encrypt_block;) ELF(.type _gcry_camellia_arm_decrypt_block, at function;) _gcry_camellia_arm_decrypt_block: + CFI_STARTPROC() stp x19, x30, [sp, #-16]! 
+ CFI_ADJUST_CFA_OFFSET(16) + CFI_REG_ON_STACK(19, 0) + CFI_REG_ON_STACK(30, 8) + /* input: * x0: keytable * x1: dst @@ -275,8 +294,13 @@ _gcry_camellia_arm_decrypt_block: outunpack(0); + CFI_REMEMBER_STATE() ldp x19, x30, [sp], #16 + CFI_ADJUST_CFA_OFFSET(-16) + CFI_RESTORE(x19) + CFI_RESTORE(x30) ret; + CFI_RESTORE_STATE() .ltorg .Ldec_256: @@ -285,6 +309,7 @@ _gcry_camellia_arm_decrypt_block: dec_fls(24); b .Ldec_128; + CFI_ENDPROC() .ltorg ELF(.size _gcry_camellia_arm_decrypt_block,.-_gcry_camellia_arm_decrypt_block;) diff --git a/cipher/chacha20-aarch64.S b/cipher/chacha20-aarch64.S index 3844d4e10..adb9b1f29 100644 --- a/cipher/chacha20-aarch64.S +++ b/cipher/chacha20-aarch64.S @@ -163,6 +163,7 @@ _gcry_chacha20_aarch64_blocks4: * x2: src * x3: nblks (multiple of 4) */ + CFI_STARTPROC() GET_DATA_POINTER(CTR, .Linc_counter); add INPUT_CTR, INPUT, #(12*4); @@ -309,6 +310,7 @@ _gcry_chacha20_aarch64_blocks4: eor x0, x0, x0 ret + CFI_ENDPROC() ELF(.size _gcry_chacha20_aarch64_blocks4, .-_gcry_chacha20_aarch64_blocks4;) #endif diff --git a/cipher/cipher-gcm-armv8-aarch64-ce.S b/cipher/cipher-gcm-armv8-aarch64-ce.S index b6c4f59d3..7c6be94ed 100644 --- a/cipher/cipher-gcm-armv8-aarch64-ce.S +++ b/cipher/cipher-gcm-armv8-aarch64-ce.S @@ -157,15 +157,23 @@ gcry_gcm_reduction_constant: #define VPUSH_ABI \ stp d8, d9, [sp, #-16]!; \ + CFI_ADJUST_CFA_OFFSET(16); \ stp d10, d11, [sp, #-16]!; \ + CFI_ADJUST_CFA_OFFSET(16); \ stp d12, d13, [sp, #-16]!; \ - stp d14, d15, [sp, #-16]!; + CFI_ADJUST_CFA_OFFSET(16); \ + stp d14, d15, [sp, #-16]!; \ + CFI_ADJUST_CFA_OFFSET(16); #define VPOP_ABI \ ldp d14, d15, [sp], #16; \ + CFI_ADJUST_CFA_OFFSET(-16); \ ldp d12, d13, [sp], #16; \ + CFI_ADJUST_CFA_OFFSET(-16); \ ldp d10, d11, [sp], #16; \ - ldp d8, d9, [sp], #16; + CFI_ADJUST_CFA_OFFSET(-16); \ + ldp d8, d9, [sp], #16; \ + CFI_ADJUST_CFA_OFFSET(-16); /* * unsigned int _gcry_ghash_armv8_ce_pmull (void *gcm_key, byte *result, @@ -183,6 +191,8 @@ _gcry_ghash_armv8_ce_pmull: * x3: nblocks * x4: gcm_table */ + CFI_STARTPROC(); + cbz x3, .Ldo_nothing; GET_DATA_POINTER(x5, .Lrconst) @@ -360,6 +370,7 @@ _gcry_ghash_armv8_ce_pmull: .Ldo_nothing: mov x0, #0 ret + CFI_ENDPROC() ELF(.size _gcry_ghash_armv8_ce_pmull,.-_gcry_ghash_armv8_ce_pmull;) @@ -374,6 +385,7 @@ _gcry_ghash_setup_armv8_ce_pmull: * x0: gcm_key * x1: gcm_table */ + CFI_STARTPROC() GET_DATA_POINTER(x2, .Lrconst) @@ -408,6 +420,7 @@ _gcry_ghash_setup_armv8_ce_pmull: st1 {rh5.16b-rh6.16b}, [x1] ret + CFI_ENDPROC() ELF(.size _gcry_ghash_setup_armv8_ce_pmull,.-_gcry_ghash_setup_armv8_ce_pmull;) #endif diff --git a/cipher/crc-armv8-aarch64-ce.S b/cipher/crc-armv8-aarch64-ce.S index 497d00551..f269b74a3 100644 --- a/cipher/crc-armv8-aarch64-ce.S +++ b/cipher/crc-armv8-aarch64-ce.S @@ -72,6 +72,7 @@ _gcry_crc32r_armv8_ce_bulk: * x2: inlen * x3: consts */ + CFI_STARTPROC() GET_DATA_POINTER(x7, .Lcrc32_constants) add x9, x3, #consts_k(5 - 1) @@ -230,6 +231,7 @@ _gcry_crc32r_armv8_ce_bulk: st1 {v0.s}[2], [x0] ret + CFI_ENDPROC() ELF(.size _gcry_crc32r_armv8_ce_bulk,.-_gcry_crc32r_armv8_ce_bulk;) /* @@ -245,6 +247,7 @@ _gcry_crc32r_armv8_ce_reduction_4: * w1: crc * x2: crc32 constants */ + CFI_STARTPROC() eor v0.16b, v0.16b, v0.16b add x2, x2, #consts_my_p(0) @@ -261,6 +264,7 @@ _gcry_crc32r_armv8_ce_reduction_4: mov w0, v0.s[1] ret + CFI_ENDPROC() ELF(.size _gcry_crc32r_armv8_ce_reduction_4,.-_gcry_crc32r_armv8_ce_reduction_4;) /* @@ -277,6 +281,7 @@ _gcry_crc32_armv8_ce_bulk: * x2: inlen * x3: consts */ + CFI_STARTPROC() GET_DATA_POINTER(x7, 
.Lcrc32_constants) add x4, x7, #.Lcrc32_bswap_shuf - .Lcrc32_constants @@ -456,6 +461,7 @@ _gcry_crc32_armv8_ce_bulk: st1 {v0.s}[0], [x0] ret + CFI_ENDPROC() ELF(.size _gcry_crc32_armv8_ce_bulk,.-_gcry_crc32_armv8_ce_bulk;) /* @@ -471,6 +477,7 @@ _gcry_crc32_armv8_ce_reduction_4: * w1: crc * x2: crc32 constants */ + CFI_STARTPROC() eor v0.16b, v0.16b, v0.16b add x2, x2, #consts_my_p(0) @@ -487,6 +494,7 @@ _gcry_crc32_armv8_ce_reduction_4: mov w0, v0.s[0] ret + CFI_ENDPROC() ELF(.size _gcry_crc32_armv8_ce_reduction_4,.-_gcry_crc32_armv8_ce_reduction_4;) #endif diff --git a/cipher/rijndael-aarch64.S b/cipher/rijndael-aarch64.S index aad748753..e77dd4e0b 100644 --- a/cipher/rijndael-aarch64.S +++ b/cipher/rijndael-aarch64.S @@ -216,6 +216,7 @@ _gcry_aes_arm_encrypt_block: * %w3: number of rounds.. 10, 12 or 14 * %x4: encryption table */ + CFI_STARTPROC(); /* read input block */ @@ -285,6 +286,7 @@ _gcry_aes_arm_encrypt_block: lastencround(11, RNA, RNB, RNC, RND, RA, RB, RC, RD); b .Lenc_done; + CFI_ENDPROC(); ELF(.size _gcry_aes_arm_encrypt_block,.-_gcry_aes_arm_encrypt_block;) #define addroundkey_dec(round, ra, rb, rc, rd, rna, rnb, rnc, rnd) \ @@ -439,6 +441,7 @@ _gcry_aes_arm_decrypt_block: * %w3: number of rounds.. 10, 12 or 14 * %x4: decryption table */ + CFI_STARTPROC(); /* read input block */ @@ -504,6 +507,7 @@ _gcry_aes_arm_decrypt_block: decround(9, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key); b .Ldec_tail; + CFI_ENDPROC(); ELF(.size _gcry_aes_arm_decrypt_block,.-_gcry_aes_arm_decrypt_block;) #endif /*HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS*/ diff --git a/cipher/rijndael-armv8-aarch64-ce.S b/cipher/rijndael-armv8-aarch64-ce.S index f0012c20a..71b45b856 100644 --- a/cipher/rijndael-armv8-aarch64-ce.S +++ b/cipher/rijndael-armv8-aarch64-ce.S @@ -247,6 +247,7 @@ _gcry_aes_enc_armv8_ce: * x2: src * w3: nrounds */ + CFI_STARTPROC(); aes_preload_keys(x0, w3); @@ -291,6 +292,7 @@ _gcry_aes_enc_armv8_ce: CLEAR_REG(vk13) CLEAR_REG(vk14) b .Lenc1_tail + CFI_ENDPROC(); ELF(.size _gcry_aes_enc_armv8_ce,.-_gcry_aes_enc_armv8_ce;) @@ -309,6 +311,7 @@ _gcry_aes_dec_armv8_ce: * x2: src * w3: nrounds */ + CFI_STARTPROC(); aes_preload_keys(x0, w3); @@ -353,6 +356,7 @@ _gcry_aes_dec_armv8_ce: CLEAR_REG(vk13) CLEAR_REG(vk14) b .Ldec1_tail + CFI_ENDPROC(); ELF(.size _gcry_aes_dec_armv8_ce,.-_gcry_aes_dec_armv8_ce;) @@ -377,6 +381,7 @@ _gcry_aes_cbc_enc_armv8_ce: * w5: cbc_mac * w6: nrounds */ + CFI_STARTPROC(); cbz x4, .Lcbc_enc_skip @@ -419,6 +424,7 @@ _gcry_aes_cbc_enc_armv8_ce: .Lcbc_enc_skip: ret + CFI_ENDPROC(); ELF(.size _gcry_aes_cbc_enc_armv8_ce,.-_gcry_aes_cbc_enc_armv8_ce;) /* @@ -440,6 +446,7 @@ _gcry_aes_cbc_dec_armv8_ce: * x4: nblocks * w5: nrounds */ + CFI_STARTPROC(); cbz x4, .Lcbc_dec_skip @@ -515,6 +522,7 @@ _gcry_aes_cbc_dec_armv8_ce: .Lcbc_dec_skip: ret + CFI_ENDPROC(); ELF(.size _gcry_aes_cbc_dec_armv8_ce,.-_gcry_aes_cbc_dec_armv8_ce;) @@ -537,6 +545,7 @@ _gcry_aes_ctr_enc_armv8_ce: * x4: nblocks * w5: nrounds */ + CFI_STARTPROC(); cbz x4, .Lctr_enc_skip @@ -668,7 +677,7 @@ _gcry_aes_ctr_enc_armv8_ce: .Lctr_enc_skip: ret - + CFI_ENDPROC(); ELF(.size _gcry_aes_ctr_enc_armv8_ce,.-_gcry_aes_ctr_enc_armv8_ce;) @@ -691,6 +700,7 @@ _gcry_aes_cfb_enc_armv8_ce: * x4: nblocks * w5: nrounds */ + CFI_STARTPROC(); cbz x4, .Lcfb_enc_skip @@ -732,6 +742,7 @@ _gcry_aes_cfb_enc_armv8_ce: .Lcfb_enc_skip: ret + CFI_ENDPROC(); ELF(.size _gcry_aes_cfb_enc_armv8_ce,.-_gcry_aes_cfb_enc_armv8_ce;) @@ -754,6 +765,7 @@ _gcry_aes_cfb_dec_armv8_ce: * x4: nblocks * w5: nrounds */ + CFI_STARTPROC(); 
cbz x4, .Lcfb_dec_skip @@ -829,6 +841,7 @@ _gcry_aes_cfb_dec_armv8_ce: .Lcfb_dec_skip: ret + CFI_ENDPROC(); ELF(.size _gcry_aes_cfb_dec_armv8_ce,.-_gcry_aes_cfb_dec_armv8_ce;) @@ -859,6 +872,7 @@ _gcry_aes_ocb_enc_armv8_ce: * w7: nrounds * %st+0: blkn => w12 */ + CFI_STARTPROC(); ldr w12, [sp] ld1 {v0.16b}, [x3] /* load offset */ @@ -979,6 +993,7 @@ _gcry_aes_ocb_enc_armv8_ce: CLEAR_REG(v16) ret + CFI_ENDPROC(); ELF(.size _gcry_aes_ocb_enc_armv8_ce,.-_gcry_aes_ocb_enc_armv8_ce;) @@ -1009,6 +1024,7 @@ _gcry_aes_ocb_dec_armv8_ce: * w7: nrounds * %st+0: blkn => w12 */ + CFI_STARTPROC(); ldr w12, [sp] ld1 {v0.16b}, [x3] /* load offset */ @@ -1129,6 +1145,7 @@ _gcry_aes_ocb_dec_armv8_ce: CLEAR_REG(v16) ret + CFI_ENDPROC(); ELF(.size _gcry_aes_ocb_dec_armv8_ce,.-_gcry_aes_ocb_dec_armv8_ce;) @@ -1157,6 +1174,8 @@ _gcry_aes_ocb_auth_armv8_ce: * w6: nrounds => w7 * w7: blkn => w12 */ + CFI_STARTPROC(); + mov w12, w7 mov w7, w6 mov x6, x5 @@ -1273,6 +1292,7 @@ _gcry_aes_ocb_auth_armv8_ce: CLEAR_REG(v16) ret + CFI_ENDPROC(); ELF(.size _gcry_aes_ocb_auth_armv8_ce,.-_gcry_aes_ocb_auth_armv8_ce;) @@ -1297,6 +1317,7 @@ _gcry_aes_xts_enc_armv8_ce: * x4: nblocks * w5: nrounds */ + CFI_STARTPROC(); cbz x4, .Lxts_enc_skip @@ -1411,7 +1432,7 @@ _gcry_aes_xts_enc_armv8_ce: .Lxts_enc_skip: ret - + CFI_ENDPROC(); ELF(.size _gcry_aes_xts_enc_armv8_ce,.-_gcry_aes_xts_enc_armv8_ce;) @@ -1436,6 +1457,7 @@ _gcry_aes_xts_dec_armv8_ce: * x4: nblocks * w5: nrounds */ + CFI_STARTPROC(); cbz x4, .Lxts_dec_skip @@ -1550,7 +1572,7 @@ _gcry_aes_xts_dec_armv8_ce: .Lxts_dec_skip: ret - + CFI_ENDPROC(); ELF(.size _gcry_aes_xts_dec_armv8_ce,.-_gcry_aes_xts_dec_armv8_ce;) @@ -1564,6 +1586,7 @@ _gcry_aes_sbox4_armv8_ce: /* See "Gouv?a, C. P. L. & L?pez, J. Implementing GCM on ARMv8. Topics in * Cryptology ? CT-RSA 2015" for details. 
*/ + CFI_STARTPROC(); movi v0.16b, #0x52 movi v1.16b, #0 mov v0.S[0], w0 @@ -1572,6 +1595,7 @@ _gcry_aes_sbox4_armv8_ce: mov w0, v0.S[0] CLEAR_REG(v0) ret + CFI_ENDPROC(); ELF(.size _gcry_aes_sbox4_armv8_ce,.-_gcry_aes_sbox4_armv8_ce;) @@ -1582,11 +1606,13 @@ ELF(.size _gcry_aes_sbox4_armv8_ce,.-_gcry_aes_sbox4_armv8_ce;) .globl _gcry_aes_invmixcol_armv8_ce ELF(.type _gcry_aes_invmixcol_armv8_ce,%function;) _gcry_aes_invmixcol_armv8_ce: + CFI_STARTPROC(); ld1 {v0.16b}, [x1] aesimc v0.16b, v0.16b st1 {v0.16b}, [x0] CLEAR_REG(v0) ret + CFI_ENDPROC(); ELF(.size _gcry_aes_invmixcol_armv8_ce,.-_gcry_aes_invmixcol_armv8_ce;) #endif diff --git a/cipher/sha1-armv8-aarch64-ce.S b/cipher/sha1-armv8-aarch64-ce.S index aeb67a128..7dc26c0f1 100644 --- a/cipher/sha1-armv8-aarch64-ce.S +++ b/cipher/sha1-armv8-aarch64-ce.S @@ -110,6 +110,7 @@ _gcry_sha1_transform_armv8_ce: * x1: data (64*nblks bytes) * x2: nblks */ + CFI_STARTPROC(); cbz x2, .Ldo_nothing; @@ -199,6 +200,7 @@ _gcry_sha1_transform_armv8_ce: .Ldo_nothing: mov x0, #0 ret + CFI_ENDPROC(); ELF(.size _gcry_sha1_transform_armv8_ce,.-_gcry_sha1_transform_armv8_ce;) #endif diff --git a/cipher/sha256-armv8-aarch64-ce.S b/cipher/sha256-armv8-aarch64-ce.S index 6b3ad32d8..706e0dfd9 100644 --- a/cipher/sha256-armv8-aarch64-ce.S +++ b/cipher/sha256-armv8-aarch64-ce.S @@ -120,6 +120,7 @@ _gcry_sha256_transform_armv8_ce: * r1: data (64*nblks bytes) * r2: nblks */ + CFI_STARTPROC(); cbz x2, .Ldo_nothing; @@ -213,6 +214,7 @@ _gcry_sha256_transform_armv8_ce: .Ldo_nothing: mov x0, #0 ret + CFI_ENDPROC(); ELF(.size _gcry_sha256_transform_armv8_ce,.-_gcry_sha256_transform_armv8_ce;) #endif diff --git a/cipher/twofish-aarch64.S b/cipher/twofish-aarch64.S index adee412d7..9f35b5cde 100644 --- a/cipher/twofish-aarch64.S +++ b/cipher/twofish-aarch64.S @@ -225,6 +225,7 @@ _gcry_twofish_arm_encrypt_block: * x1: dst * x2: src */ + CFI_STARTPROC(); add CTXw, CTX, #(w); @@ -262,6 +263,7 @@ _gcry_twofish_arm_encrypt_block: str_output_le(RDST, RC, RD, RA, RB, RT0, RT1); ret; + CFI_ENDPROC(); .ltorg ELF(.size _gcry_twofish_arm_encrypt_block,.-_gcry_twofish_arm_encrypt_block;) @@ -274,6 +276,7 @@ _gcry_twofish_arm_decrypt_block: * %r1: dst * %r2: src */ + CFI_STARTPROC(); add CTXw, CTX, #(w); @@ -311,6 +314,7 @@ _gcry_twofish_arm_decrypt_block: str_output_le(RDST, RA, RB, RC, RD, RT0, RT1); ret; + CFI_ENDPROC(); ELF(.size _gcry_twofish_arm_decrypt_block,.-_gcry_twofish_arm_decrypt_block;) #endif /*HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS*/ diff --git a/mpi/aarch64/mpih-add1.S b/mpi/aarch64/mpih-add1.S index 3370320e0..bc62cf987 100644 --- a/mpi/aarch64/mpih-add1.S +++ b/mpi/aarch64/mpih-add1.S @@ -37,6 +37,7 @@ .globl _gcry_mpih_add_n ELF(.type _gcry_mpih_add_n,%function) _gcry_mpih_add_n: + CFI_STARTPROC() and w5, w3, #3; adds xzr, xzr, xzr; /* clear carry flag */ @@ -69,4 +70,5 @@ _gcry_mpih_add_n: .Lend: adc x0, xzr, xzr; ret; + CFI_ENDPROC() ELF(.size _gcry_mpih_add_n,.-_gcry_mpih_add_n;) diff --git a/mpi/aarch64/mpih-mul1.S b/mpi/aarch64/mpih-mul1.S index 8830845a7..92fcd141b 100644 --- a/mpi/aarch64/mpih-mul1.S +++ b/mpi/aarch64/mpih-mul1.S @@ -37,6 +37,7 @@ .globl _gcry_mpih_mul_1 ELF(.type _gcry_mpih_mul_1,%function) _gcry_mpih_mul_1: + CFI_STARTPROC() and w5, w2, #3; mov x4, xzr; @@ -94,4 +95,5 @@ _gcry_mpih_mul_1: .Lend: mov x0, x4; ret; + CFI_ENDPROC() ELF(.size _gcry_mpih_mul_1,.-_gcry_mpih_mul_1;) diff --git a/mpi/aarch64/mpih-mul2.S b/mpi/aarch64/mpih-mul2.S index 5d736990e..aa0e5a2d5 100644 --- a/mpi/aarch64/mpih-mul2.S +++ b/mpi/aarch64/mpih-mul2.S @@ -37,6 
+37,7 @@ .globl _gcry_mpih_addmul_1 ELF(.type _gcry_mpih_addmul_1,%function) _gcry_mpih_addmul_1: + CFI_STARTPROC() and w5, w2, #3; mov x6, xzr; mov x7, xzr; @@ -106,4 +107,5 @@ _gcry_mpih_addmul_1: .Lend: mov x0, x6; ret; + CFI_ENDPROC() ELF(.size _gcry_mpih_addmul_1,.-_gcry_mpih_addmul_1;) diff --git a/mpi/aarch64/mpih-mul3.S b/mpi/aarch64/mpih-mul3.S index f785e5e42..5a40b354c 100644 --- a/mpi/aarch64/mpih-mul3.S +++ b/mpi/aarch64/mpih-mul3.S @@ -37,6 +37,7 @@ .globl _gcry_mpih_submul_1 ELF(.type _gcry_mpih_submul_1,%function) _gcry_mpih_submul_1: + CFI_STARTPROC() and w5, w2, #3; mov x7, xzr; cbz w5, .Large_loop; @@ -119,4 +120,5 @@ _gcry_mpih_submul_1: .Loop_end: cinc x0, x7, cc; ret; + CFI_ENDPROC() ELF(.size _gcry_mpih_submul_1,.-_gcry_mpih_submul_1;) diff --git a/mpi/aarch64/mpih-sub1.S b/mpi/aarch64/mpih-sub1.S index 45a7b0417..4f279a123 100644 --- a/mpi/aarch64/mpih-sub1.S +++ b/mpi/aarch64/mpih-sub1.S @@ -37,6 +37,7 @@ .globl _gcry_mpih_sub_n ELF(.type _gcry_mpih_sub_n,%function) _gcry_mpih_sub_n: + CFI_STARTPROC() and w5, w3, #3; subs xzr, xzr, xzr; /* prepare carry flag for sub */ @@ -69,4 +70,5 @@ _gcry_mpih_sub_n: .Lend: cset x0, cc; ret; + CFI_ENDPROC() ELF(.size _gcry_mpih_sub_n,.-_gcry_mpih_sub_n;) diff --git a/mpi/asm-common-aarch64.h b/mpi/asm-common-aarch64.h index 126941307..cf4bdb852 100644 --- a/mpi/asm-common-aarch64.h +++ b/mpi/asm-common-aarch64.h @@ -21,10 +21,6 @@ #ifndef MPI_ASM_COMMON_AARCH64_H #define MPI_ASM_COMMON_AARCH64_H -#ifdef __ELF__ -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) /*_*/ -#endif +#include "../cipher/asm-common-aarch64.h" #endif /* MPI_ASM_COMMON_AARCH64_H */ From jussi.kivilinna at iki.fi Fri Apr 26 18:33:46 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Fri, 26 Apr 2019 19:33:46 +0300 Subject: [PATCH 4/4] Optimizations for GCM Intel/PCLMUL implementation In-Reply-To: <155629641122.14985.2140559336495629831.stgit@localhost.localdomain> References: <155629641122.14985.2140559336495629831.stgit@localhost.localdomain> Message-ID: <155629642673.14985.4669553340338086876.stgit@localhost.localdomain> * cipher/cipher-gcm-intel-pclmul.c (reduction): New. (glmul_pclmul): Include shifting to left into pclmul operations; Use 'reduction' helper function. (gfmul_pclmul_aggr4): Reorder instructions and adjust register usage to free up registers; Use 'reduction' helper function; Include shifting to left into pclmul operations. (gcm_lsh): New. (_gcry_ghash_setup_intel_pclmul): Left shift H values to left by one. (_gcry_ghash_intel_pclmul) [__x86_64__]: Preload H values to unused registers. -- Benchmark on Intel Haswell (amd64): Before: | nanosecs/byte mebibytes/sec cycles/byte auto Mhz GMAC_AES | 0.206 ns/B 4624 MiB/s 0.825 c/B 3998 After (+12% faster): | nanosecs/byte mebibytes/sec cycles/byte auto Mhz GMAC_AES | 0.184 ns/B 5195 MiB/s 0.734 c/B 3998 Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/cipher/cipher-gcm-intel-pclmul.c b/cipher/cipher-gcm-intel-pclmul.c index 60ae7aa9a..da309aead 100644 --- a/cipher/cipher-gcm-intel-pclmul.c +++ b/cipher/cipher-gcm-intel-pclmul.c @@ -1,6 +1,6 @@ /* cipher-gcm-intel-pclmul.c - Intel PCLMUL accelerated Galois Counter Mode * implementation - * Copyright (C) 2013-2014 Jussi Kivilinna + * Copyright (C) 2013-2014,2019 Jussi Kivilinna * * This file is part of Libgcrypt. * @@ -47,6 +47,35 @@ "Intel? Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode - Rev 2.01"; Shay Gueron, Michael E. Kounavis. 
*/ +static inline void reduction(void) +{ + /* input: */ + + asm volatile (/* first phase of the reduction */ + "movdqa %%xmm3, %%xmm6\n\t" + "movdqa %%xmm3, %%xmm7\n\t" + "psllq $1, %%xmm6\n\t" /* packed right shifting << 63 */ + "pxor %%xmm3, %%xmm6\n\t" + "psllq $57, %%xmm7\n\t" /* packed right shifting << 57 */ + "psllq $62, %%xmm6\n\t" /* packed right shifting << 62 */ + "pxor %%xmm7, %%xmm6\n\t" /* xor the shifted versions */ + "pshufd $0x6a, %%xmm6, %%xmm7\n\t" + "pshufd $0xae, %%xmm6, %%xmm6\n\t" + "pxor %%xmm7, %%xmm3\n\t" /* first phase of the reduction + complete */ + + /* second phase of the reduction */ + "pxor %%xmm3, %%xmm1\n\t" /* xor the shifted versions */ + "psrlq $1, %%xmm3\n\t" /* packed left shifting >> 1 */ + "pxor %%xmm3, %%xmm6\n\t" + "psrlq $1, %%xmm3\n\t" /* packed left shifting >> 2 */ + "pxor %%xmm3, %%xmm1\n\t" + "psrlq $5, %%xmm3\n\t" /* packed left shifting >> 7 */ + "pxor %%xmm3, %%xmm6\n\t" + "pxor %%xmm6, %%xmm1\n\t" /* the result is in xmm1 */ + ::: "memory" ); +} + static inline void gfmul_pclmul(void) { /* Input: XMM0 and XMM1, Output: XMM1. Input XMM0 stays unmodified. @@ -60,65 +89,22 @@ static inline void gfmul_pclmul(void) "movdqa %%xmm0, %%xmm3\n\t" "pclmulqdq $0, %%xmm1, %%xmm3\n\t" /* xmm3 holds a0*b0 */ - "movdqa %%xmm0, %%xmm6\n\t" - "pclmulqdq $17, %%xmm1, %%xmm6\n\t" /* xmm6 holds a1*b1 */ + "pclmulqdq $17, %%xmm0, %%xmm1\n\t" /* xmm6 holds a1*b1 */ "movdqa %%xmm3, %%xmm5\n\t" "pclmulqdq $0, %%xmm2, %%xmm4\n\t" /* xmm4 holds (a0+a1)*(b0+b1) */ - "pxor %%xmm6, %%xmm5\n\t" /* xmm5 holds a0*b0+a1*b1 */ + "pxor %%xmm1, %%xmm5\n\t" /* xmm5 holds a0*b0+a1*b1 */ "pxor %%xmm5, %%xmm4\n\t" /* xmm4 holds a0*b0+a1*b1+(a0+a1)*(b0+b1) */ "movdqa %%xmm4, %%xmm5\n\t" "psrldq $8, %%xmm4\n\t" "pslldq $8, %%xmm5\n\t" "pxor %%xmm5, %%xmm3\n\t" - "pxor %%xmm4, %%xmm6\n\t" /* holds the result of the + "pxor %%xmm4, %%xmm1\n\t" /* holds the result of the carry-less multiplication of xmm0 by xmm1 */ + ::: "memory" ); - /* shift the result by one bit position to the left cope for - the fact that bits are reversed */ - "movdqa %%xmm3, %%xmm4\n\t" - "movdqa %%xmm6, %%xmm5\n\t" - "pslld $1, %%xmm3\n\t" - "pslld $1, %%xmm6\n\t" - "psrld $31, %%xmm4\n\t" - "psrld $31, %%xmm5\n\t" - "movdqa %%xmm4, %%xmm1\n\t" - "pslldq $4, %%xmm5\n\t" - "pslldq $4, %%xmm4\n\t" - "psrldq $12, %%xmm1\n\t" - "por %%xmm4, %%xmm3\n\t" - "por %%xmm5, %%xmm6\n\t" - "por %%xmm6, %%xmm1\n\t" - - /* first phase of the reduction */ - "movdqa %%xmm3, %%xmm6\n\t" - "movdqa %%xmm3, %%xmm7\n\t" - "pslld $31, %%xmm6\n\t" /* packed right shifting << 31 */ - "movdqa %%xmm3, %%xmm5\n\t" - "pslld $30, %%xmm7\n\t" /* packed right shifting shift << 30 */ - "pslld $25, %%xmm5\n\t" /* packed right shifting shift << 25 */ - "pxor %%xmm7, %%xmm6\n\t" /* xor the shifted versions */ - "pxor %%xmm5, %%xmm6\n\t" - "movdqa %%xmm6, %%xmm7\n\t" - "pslldq $12, %%xmm6\n\t" - "psrldq $4, %%xmm7\n\t" - "pxor %%xmm6, %%xmm3\n\t" /* first phase of the reduction - complete */ - - /* second phase of the reduction */ - "movdqa %%xmm3, %%xmm2\n\t" - "movdqa %%xmm3, %%xmm4\n\t" - "psrld $1, %%xmm2\n\t" /* packed left shifting >> 1 */ - "movdqa %%xmm3, %%xmm5\n\t" - "psrld $2, %%xmm4\n\t" /* packed left shifting >> 2 */ - "psrld $7, %%xmm5\n\t" /* packed left shifting >> 7 */ - "pxor %%xmm4, %%xmm2\n\t" /* xor the shifted versions */ - "pxor %%xmm5, %%xmm2\n\t" - "pxor %%xmm7, %%xmm2\n\t" - "pxor %%xmm2, %%xmm3\n\t" - "pxor %%xmm3, %%xmm1\n\t" /* the result is in xmm1 */ - ::: "cc" ); + reduction(); } @@ -136,117 +122,92 @@ static 
inline void gfmul_pclmul_aggr4(void) Input must be converted to little-endian. */ asm volatile (/* perform clmul and merge results... */ - "pshufd $78, %%xmm10, %%xmm11\n\t" + "pshufd $78, %%xmm10, %%xmm5\n\t" "pshufd $78, %%xmm1, %%xmm12\n\t" - "pxor %%xmm10, %%xmm11\n\t" /* xmm11 holds 4:a0+a1 */ + "pxor %%xmm10, %%xmm5\n\t" /* xmm5 holds 4:a0+a1 */ "pxor %%xmm1, %%xmm12\n\t" /* xmm12 holds 4:b0+b1 */ + "movdqa %%xmm10, %%xmm4\n\t" + "pclmulqdq $0, %%xmm1, %%xmm4\n\t" /* xmm4 holds 4:a0*b0 */ + "pclmulqdq $17, %%xmm10, %%xmm1\n\t" /* xmm1 holds 4:a1*b1 */ + "pclmulqdq $0, %%xmm5, %%xmm12\n\t" /* xmm12 holds 4:(a0+a1)*(b0+b1) */ "pshufd $78, %%xmm9, %%xmm13\n\t" - "pshufd $78, %%xmm2, %%xmm14\n\t" + "pshufd $78, %%xmm2, %%xmm5\n\t" "pxor %%xmm9, %%xmm13\n\t" /* xmm13 holds 3:a0+a1 */ - "pxor %%xmm2, %%xmm14\n\t" /* xmm14 holds 3:b0+b1 */ - - "pshufd $78, %%xmm8, %%xmm5\n\t" - "pshufd $78, %%xmm3, %%xmm15\n\t" - "pxor %%xmm8, %%xmm5\n\t" /* xmm1 holds 2:a0+a1 */ - "pxor %%xmm3, %%xmm15\n\t" /* xmm2 holds 2:b0+b1 */ - - "movdqa %%xmm10, %%xmm4\n\t" + "pxor %%xmm2, %%xmm5\n\t" /* xmm5 holds 3:b0+b1 */ "movdqa %%xmm9, %%xmm7\n\t" - "pclmulqdq $0, %%xmm1, %%xmm4\n\t" /* xmm4 holds 4:a0*b0 */ "pclmulqdq $0, %%xmm2, %%xmm7\n\t" /* xmm7 holds 3:a0*b0 */ - "pclmulqdq $17, %%xmm10, %%xmm1\n\t" /* xmm1 holds 4:a1*b1 */ "pclmulqdq $17, %%xmm9, %%xmm2\n\t" /* xmm9 holds 3:a1*b1 */ - "pclmulqdq $0, %%xmm11, %%xmm12\n\t" /* xmm12 holds 4:(a0+a1)*(b0+b1) */ - "pclmulqdq $0, %%xmm13, %%xmm14\n\t" /* xmm14 holds 3:(a0+a1)*(b0+b1) */ - - "pshufd $78, %%xmm0, %%xmm10\n\t" - "pshufd $78, %%xmm6, %%xmm11\n\t" - "pxor %%xmm0, %%xmm10\n\t" /* xmm10 holds 1:a0+a1 */ - "pxor %%xmm6, %%xmm11\n\t" /* xmm11 holds 1:b0+b1 */ + "pclmulqdq $0, %%xmm13, %%xmm5\n\t" /* xmm5 holds 3:(a0+a1)*(b0+b1) */ "pxor %%xmm4, %%xmm7\n\t" /* xmm7 holds 3+4:a0*b0 */ "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 3+4:a1*b1 */ - "pxor %%xmm14, %%xmm12\n\t" /* xmm12 holds 3+4:(a0+a1)*(b0+b1) */ + "pxor %%xmm5, %%xmm12\n\t" /* xmm12 holds 3+4:(a0+a1)*(b0+b1) */ - "movdqa %%xmm8, %%xmm13\n\t" - "pclmulqdq $0, %%xmm3, %%xmm13\n\t" /* xmm13 holds 2:a0*b0 */ + "pshufd $78, %%xmm8, %%xmm5\n\t" + "pshufd $78, %%xmm3, %%xmm2\n\t" + "pxor %%xmm8, %%xmm5\n\t" /* xmm5 holds 2:a0+a1 */ + "pxor %%xmm3, %%xmm2\n\t" /* xmm2 holds 2:b0+b1 */ + "movdqa %%xmm8, %%xmm4\n\t" + "pclmulqdq $0, %%xmm3, %%xmm4\n\t" /* xmm4 holds 2:a0*b0 */ "pclmulqdq $17, %%xmm8, %%xmm3\n\t" /* xmm3 holds 2:a1*b1 */ - "pclmulqdq $0, %%xmm5, %%xmm15\n\t" /* xmm15 holds 2:(a0+a1)*(b0+b1) */ + "pclmulqdq $0, %%xmm5, %%xmm2\n\t" /* xmm2 holds 2:(a0+a1)*(b0+b1) */ - "pxor %%xmm13, %%xmm7\n\t" /* xmm7 holds 2+3+4:a0*b0 */ + "pxor %%xmm4, %%xmm7\n\t" /* xmm7 holds 2+3+4:a0*b0 */ "pxor %%xmm3, %%xmm1\n\t" /* xmm1 holds 2+3+4:a1*b1 */ - "pxor %%xmm15, %%xmm12\n\t" /* xmm12 holds 2+3+4:(a0+a1)*(b0+b1) */ + "pxor %%xmm12, %%xmm2\n\t" /* xmm2 holds 2+3+4:(a0+a1)*(b0+b1) */ + "pshufd $78, %%xmm0, %%xmm11\n\t" + "pshufd $78, %%xmm6, %%xmm4\n\t" + "pxor %%xmm0, %%xmm11\n\t" /* xmm11 holds 1:a0+a1 */ + "pxor %%xmm6, %%xmm4\n\t" /* xmm4 holds 1:b0+b1 */ "movdqa %%xmm0, %%xmm3\n\t" "pclmulqdq $0, %%xmm6, %%xmm3\n\t" /* xmm3 holds 1:a0*b0 */ "pclmulqdq $17, %%xmm0, %%xmm6\n\t" /* xmm6 holds 1:a1*b1 */ - "movdqa %%xmm11, %%xmm4\n\t" - "pclmulqdq $0, %%xmm10, %%xmm4\n\t" /* xmm4 holds 1:(a0+a1)*(b0+b1) */ + "pclmulqdq $0, %%xmm11, %%xmm4\n\t" /* xmm4 holds 1:(a0+a1)*(b0+b1) */ "pxor %%xmm7, %%xmm3\n\t" /* xmm3 holds 1+2+3+4:a0*b0 */ - "pxor %%xmm1, %%xmm6\n\t" /* xmm6 holds 1+2+3+4:a1*b1 */ - "pxor 
%%xmm12, %%xmm4\n\t" /* xmm4 holds 1+2+3+4:(a0+a1)*(b0+b1) */ + "pxor %%xmm6, %%xmm1\n\t" /* xmm1 holds 1+2+3+4:a1*b1 */ + "pxor %%xmm2, %%xmm4\n\t" /* xmm4 holds 1+2+3+4:(a0+a1)*(b0+b1) */ /* aggregated reduction... */ "movdqa %%xmm3, %%xmm5\n\t" - "pxor %%xmm6, %%xmm5\n\t" /* xmm5 holds a0*b0+a1*b1 */ + "pxor %%xmm1, %%xmm5\n\t" /* xmm5 holds a0*b0+a1*b1 */ "pxor %%xmm5, %%xmm4\n\t" /* xmm4 holds a0*b0+a1*b1+(a0+a1)*(b0+b1) */ "movdqa %%xmm4, %%xmm5\n\t" "psrldq $8, %%xmm4\n\t" "pslldq $8, %%xmm5\n\t" "pxor %%xmm5, %%xmm3\n\t" - "pxor %%xmm4, %%xmm6\n\t" /* holds the result of the + "pxor %%xmm4, %%xmm1\n\t" /* holds the result of the carry-less multiplication of xmm0 by xmm1 */ + :::"memory"); - /* shift the result by one bit position to the left cope for - the fact that bits are reversed */ - "movdqa %%xmm3, %%xmm4\n\t" - "movdqa %%xmm6, %%xmm5\n\t" - "pslld $1, %%xmm3\n\t" - "pslld $1, %%xmm6\n\t" - "psrld $31, %%xmm4\n\t" - "psrld $31, %%xmm5\n\t" - "movdqa %%xmm4, %%xmm1\n\t" - "pslldq $4, %%xmm5\n\t" - "pslldq $4, %%xmm4\n\t" - "psrldq $12, %%xmm1\n\t" - "por %%xmm4, %%xmm3\n\t" - "por %%xmm5, %%xmm6\n\t" - "por %%xmm6, %%xmm1\n\t" - - /* first phase of the reduction */ - "movdqa %%xmm3, %%xmm6\n\t" - "movdqa %%xmm3, %%xmm7\n\t" - "pslld $31, %%xmm6\n\t" /* packed right shifting << 31 */ - "movdqa %%xmm3, %%xmm5\n\t" - "pslld $30, %%xmm7\n\t" /* packed right shifting shift << 30 */ - "pslld $25, %%xmm5\n\t" /* packed right shifting shift << 25 */ - "pxor %%xmm7, %%xmm6\n\t" /* xor the shifted versions */ - "pxor %%xmm5, %%xmm6\n\t" - "movdqa %%xmm6, %%xmm7\n\t" - "pslldq $12, %%xmm6\n\t" - "psrldq $4, %%xmm7\n\t" - "pxor %%xmm6, %%xmm3\n\t" /* first phase of the reduction - complete */ - - /* second phase of the reduction */ - "movdqa %%xmm3, %%xmm2\n\t" - "movdqa %%xmm3, %%xmm4\n\t" - "psrld $1, %%xmm2\n\t" /* packed left shifting >> 1 */ - "movdqa %%xmm3, %%xmm5\n\t" - "psrld $2, %%xmm4\n\t" /* packed left shifting >> 2 */ - "psrld $7, %%xmm5\n\t" /* packed left shifting >> 7 */ - "pxor %%xmm4, %%xmm2\n\t" /* xor the shifted versions */ - "pxor %%xmm5, %%xmm2\n\t" - "pxor %%xmm7, %%xmm2\n\t" - "pxor %%xmm2, %%xmm3\n\t" - "pxor %%xmm3, %%xmm1\n\t" /* the result is in xmm1 */ - :::"cc"); + reduction(); } #endif +static inline void gcm_lsh(void *h, unsigned int hoffs) +{ + static const u64 pconst[2] __attribute__ ((aligned (16))) = + { U64_C(0x0000000000000001), U64_C(0xc200000000000000) }; + + asm volatile ("movdqu (%[h]), %%xmm2\n\t" + "pshufd $0xff, %%xmm2, %%xmm3\n\t" + "movdqa %%xmm2, %%xmm4\n\t" + "psrad $31, %%xmm3\n\t" + "pslldq $8, %%xmm4\n\t" + "pand %[pconst], %%xmm3\n\t" + "paddq %%xmm2, %%xmm2\n\t" + "psrlq $63, %%xmm4\n\t" + "pxor %%xmm3, %%xmm2\n\t" + "pxor %%xmm4, %%xmm2\n\t" + "movdqu %%xmm2, (%[h])\n\t" + : + : [pconst] "m" (pconst), + [h] "r" ((byte *)h + hoffs) + : "memory" ); +} void _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c) @@ -274,13 +235,16 @@ _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c) [be_mask] "m" (*be_mask) : "memory"); + gcm_lsh(c->u_mode.gcm.u_ghash_key.key, 0); /* H <<< 1 */ + #ifdef __x86_64__ asm volatile ("movdqa %%xmm0, %%xmm1\n\t" + "movdqu (%[key]), %%xmm0\n\t" /* load H <<< 1 */ : - : + : [key] "r" (c->u_mode.gcm.u_ghash_key.key) : "memory"); - gfmul_pclmul (); /* H?H => H? */ + gfmul_pclmul (); /* H<<<1?H => H? 
*/ asm volatile ("movdqu %%xmm1, 0*16(%[h_234])\n\t" "movdqa %%xmm1, %%xmm8\n\t" @@ -288,22 +252,26 @@ _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c) : [h_234] "r" (c->u_mode.gcm.gcm_table) : "memory"); - gfmul_pclmul (); /* H?H? => H? */ + gcm_lsh(c->u_mode.gcm.gcm_table, 0); /* H? <<< 1 */ + gfmul_pclmul (); /* H<<<1?H? => H? */ asm volatile ("movdqa %%xmm8, %%xmm0\n\t" "movdqu %%xmm1, 1*16(%[h_234])\n\t" - "movdqa %%xmm8, %%xmm1\n\t" + "movdqu 0*16(%[h_234]), %%xmm1\n\t" /* load H? <<< 1 */ : : [h_234] "r" (c->u_mode.gcm.gcm_table) : "memory"); - gfmul_pclmul (); /* H??H? => H? */ + gfmul_pclmul (); /* H?<<<1?H? => H? */ asm volatile ("movdqu %%xmm1, 2*16(%[h_234])\n\t" : : [h_234] "r" (c->u_mode.gcm.gcm_table) : "memory"); + gcm_lsh(c->u_mode.gcm.gcm_table, 16); /* H? <<< 1 */ + gcm_lsh(c->u_mode.gcm.gcm_table, 32); /* H? <<< 1 */ + #ifdef __WIN64__ /* Clear/restore used registers. */ asm volatile( "pxor %%xmm0, %%xmm0\n\t" @@ -329,7 +297,7 @@ _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c) "pxor %%xmm6, %%xmm6\n\t" "pxor %%xmm7, %%xmm7\n\t" "pxor %%xmm8, %%xmm8\n\t" - ::: "cc" ); + ::: "memory" ); #endif #endif } @@ -372,32 +340,36 @@ _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf, "pshufb %[be_mask], %%xmm1\n\t" /* be => le */ : : [hash] "m" (*result), [be_mask] "m" (*be_mask), - [hsub] "m" (*c->u_mode.gcm.u_ghash_key.key)); + [hsub] "m" (*c->u_mode.gcm.u_ghash_key.key) + : "memory" ); #ifdef __x86_64__ if (nblocks >= 4) { + asm volatile (/* Load H2, H3, H4, be_mask. */ + "movdqu 2*16(%[h_234]), %%xmm10\n\t" + "movdqu 1*16(%[h_234]), %%xmm9\n\t" + "movdqu 0*16(%[h_234]), %%xmm8\n\t" + "movdqa %[be_mask], %%xmm14\n\t" + : + : [h_234] "r" (c->u_mode.gcm.gcm_table), + [be_mask] "m" (*be_mask) + : "memory" ); + do { - asm volatile ("movdqa %[be_mask], %%xmm4\n\t" - "movdqu 0*16(%[buf]), %%xmm5\n\t" + asm volatile ("movdqu 0*16(%[buf]), %%xmm5\n\t" "movdqu 1*16(%[buf]), %%xmm2\n\t" "movdqu 2*16(%[buf]), %%xmm3\n\t" "movdqu 3*16(%[buf]), %%xmm6\n\t" - "pshufb %%xmm4, %%xmm5\n\t" /* be => le */ - - /* Load H2, H3, H4. */ - "movdqu 2*16(%[h_234]), %%xmm10\n\t" - "movdqu 1*16(%[h_234]), %%xmm9\n\t" - "movdqu 0*16(%[h_234]), %%xmm8\n\t" - + "pshufb %%xmm14, %%xmm5\n\t" /* be => le */ + "pshufb %%xmm14, %%xmm2\n\t" /* be => le */ + "pshufb %%xmm14, %%xmm3\n\t" /* be => le */ "pxor %%xmm5, %%xmm1\n\t" - "pshufb %%xmm4, %%xmm2\n\t" /* be => le */ - "pshufb %%xmm4, %%xmm3\n\t" /* be => le */ - "pshufb %%xmm4, %%xmm6\n\t" /* be => le */ + "pshufb %%xmm14, %%xmm6\n\t" /* be => le */ : - : [buf] "r" (buf), [be_mask] "m" (*be_mask), - [h_234] "r" (c->u_mode.gcm.gcm_table)); + : [buf] "r" (buf) + : "memory" ); gfmul_pclmul_aggr4 (); @@ -416,29 +388,32 @@ _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf, "pxor %%xmm13, %%xmm13\n\t" "pxor %%xmm14, %%xmm14\n\t" "pxor %%xmm15, %%xmm15\n\t" - ::: "cc" ); + ::: "memory" ); #endif } #endif - while (nblocks--) + while (nblocks) { asm volatile ("movdqu %[buf], %%xmm2\n\t" "pshufb %[be_mask], %%xmm2\n\t" /* be => le */ "pxor %%xmm2, %%xmm1\n\t" : - : [buf] "m" (*buf), [be_mask] "m" (*be_mask)); + : [buf] "m" (*buf), [be_mask] "m" (*be_mask) + : "memory" ); gfmul_pclmul (); buf += blocksize; + nblocks--; } /* Store hash. */ asm volatile ("pshufb %[be_mask], %%xmm1\n\t" /* be => le */ "movdqu %%xmm1, %[hash]\n\t" : [hash] "=m" (*result) - : [be_mask] "m" (*be_mask)); + : [be_mask] "m" (*be_mask) + : "memory" ); #ifdef __WIN64__ /* Clear/restore used registers. 
*/ @@ -471,7 +446,7 @@ _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf, "pxor %%xmm5, %%xmm5\n\t" "pxor %%xmm6, %%xmm6\n\t" "pxor %%xmm7, %%xmm7\n\t" - ::: "cc" ); + ::: "memory" ); #endif return 0; From jussi.kivilinna at iki.fi Sat Apr 27 16:37:33 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 27 Apr 2019 17:37:33 +0300 Subject: [PATCH v2] Optimizations for GCM Intel/PCLMUL implementation Message-ID: <155637585351.6430.2160747288549673210.stgit@localhost.localdomain> * cipher/cipher-gcm-intel-pclmul.c (reduction): New. (glmul_pclmul): Include shifting to left into pclmul operations; Use 'reduction' helper function. [__x86_64__] (gfmul_pclmul_aggr4): Reorder instructions and adjust register usage to free up registers; Use 'reduction' helper function; Include shifting to left into pclmul operations; Moving load H values and input from caller into this function. [__x86_64__] (gfmul_pclmul_aggr8): New. (gcm_lsh): New. (_gcry_ghash_setup_intel_pclmul): Left shift H values to left by one; Preserve XMM6-XMM15 registers on WIN64. (_gcry_ghash_intel_pclmul) [__x86_64__]: Use 8 block aggregated reduction function. -- Benchmark on Intel Haswell (amd64): Before: | nanosecs/byte mebibytes/sec cycles/byte auto Mhz GMAC_AES | 0.206 ns/B 4624 MiB/s 0.825 c/B 3998 After (+50% faster): | nanosecs/byte mebibytes/sec cycles/byte auto Mhz GMAC_AES | 0.137 ns/B 6953 MiB/s 0.548 c/B 3998 Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/cipher/cipher-gcm-intel-pclmul.c b/cipher/cipher-gcm-intel-pclmul.c index 60ae7aa9a..46af77eac 100644 --- a/cipher/cipher-gcm-intel-pclmul.c +++ b/cipher/cipher-gcm-intel-pclmul.c @@ -1,6 +1,6 @@ /* cipher-gcm-intel-pclmul.c - Intel PCLMUL accelerated Galois Counter Mode * implementation - * Copyright (C) 2013-2014 Jussi Kivilinna + * Copyright (C) 2013-2014,2019 Jussi Kivilinna * * This file is part of Libgcrypt. * @@ -47,6 +47,35 @@ "Intel? Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode - Rev 2.01"; Shay Gueron, Michael E. Kounavis. */ +static inline void reduction(void) +{ + /* input: */ + + asm volatile (/* first phase of the reduction */ + "movdqa %%xmm3, %%xmm6\n\t" + "movdqa %%xmm3, %%xmm7\n\t" + "psllq $1, %%xmm6\n\t" /* packed right shifting << 63 */ + "pxor %%xmm3, %%xmm6\n\t" + "psllq $57, %%xmm7\n\t" /* packed right shifting << 57 */ + "psllq $62, %%xmm6\n\t" /* packed right shifting << 62 */ + "pxor %%xmm7, %%xmm6\n\t" /* xor the shifted versions */ + "pshufd $0x6a, %%xmm6, %%xmm7\n\t" + "pshufd $0xae, %%xmm6, %%xmm6\n\t" + "pxor %%xmm7, %%xmm3\n\t" /* first phase of the reduction + complete */ + + /* second phase of the reduction */ + "pxor %%xmm3, %%xmm1\n\t" /* xor the shifted versions */ + "psrlq $1, %%xmm3\n\t" /* packed left shifting >> 1 */ + "pxor %%xmm3, %%xmm6\n\t" + "psrlq $1, %%xmm3\n\t" /* packed left shifting >> 2 */ + "pxor %%xmm3, %%xmm1\n\t" + "psrlq $5, %%xmm3\n\t" /* packed left shifting >> 7 */ + "pxor %%xmm3, %%xmm6\n\t" + "pxor %%xmm6, %%xmm1\n\t" /* the result is in xmm1 */ + ::: "memory" ); +} + static inline void gfmul_pclmul(void) { /* Input: XMM0 and XMM1, Output: XMM1. Input XMM0 stays unmodified. 
@@ -60,193 +89,304 @@ static inline void gfmul_pclmul(void) "movdqa %%xmm0, %%xmm3\n\t" "pclmulqdq $0, %%xmm1, %%xmm3\n\t" /* xmm3 holds a0*b0 */ - "movdqa %%xmm0, %%xmm6\n\t" - "pclmulqdq $17, %%xmm1, %%xmm6\n\t" /* xmm6 holds a1*b1 */ + "pclmulqdq $17, %%xmm0, %%xmm1\n\t" /* xmm6 holds a1*b1 */ "movdqa %%xmm3, %%xmm5\n\t" "pclmulqdq $0, %%xmm2, %%xmm4\n\t" /* xmm4 holds (a0+a1)*(b0+b1) */ - "pxor %%xmm6, %%xmm5\n\t" /* xmm5 holds a0*b0+a1*b1 */ + "pxor %%xmm1, %%xmm5\n\t" /* xmm5 holds a0*b0+a1*b1 */ "pxor %%xmm5, %%xmm4\n\t" /* xmm4 holds a0*b0+a1*b1+(a0+a1)*(b0+b1) */ "movdqa %%xmm4, %%xmm5\n\t" "psrldq $8, %%xmm4\n\t" "pslldq $8, %%xmm5\n\t" "pxor %%xmm5, %%xmm3\n\t" - "pxor %%xmm4, %%xmm6\n\t" /* holds the result of the + "pxor %%xmm4, %%xmm1\n\t" /* holds the result of the carry-less multiplication of xmm0 by xmm1 */ + ::: "memory" ); - /* shift the result by one bit position to the left cope for - the fact that bits are reversed */ - "movdqa %%xmm3, %%xmm4\n\t" - "movdqa %%xmm6, %%xmm5\n\t" - "pslld $1, %%xmm3\n\t" - "pslld $1, %%xmm6\n\t" - "psrld $31, %%xmm4\n\t" - "psrld $31, %%xmm5\n\t" - "movdqa %%xmm4, %%xmm1\n\t" - "pslldq $4, %%xmm5\n\t" - "pslldq $4, %%xmm4\n\t" - "psrldq $12, %%xmm1\n\t" - "por %%xmm4, %%xmm3\n\t" - "por %%xmm5, %%xmm6\n\t" - "por %%xmm6, %%xmm1\n\t" - - /* first phase of the reduction */ - "movdqa %%xmm3, %%xmm6\n\t" - "movdqa %%xmm3, %%xmm7\n\t" - "pslld $31, %%xmm6\n\t" /* packed right shifting << 31 */ - "movdqa %%xmm3, %%xmm5\n\t" - "pslld $30, %%xmm7\n\t" /* packed right shifting shift << 30 */ - "pslld $25, %%xmm5\n\t" /* packed right shifting shift << 25 */ - "pxor %%xmm7, %%xmm6\n\t" /* xor the shifted versions */ - "pxor %%xmm5, %%xmm6\n\t" - "movdqa %%xmm6, %%xmm7\n\t" - "pslldq $12, %%xmm6\n\t" - "psrldq $4, %%xmm7\n\t" - "pxor %%xmm6, %%xmm3\n\t" /* first phase of the reduction - complete */ - - /* second phase of the reduction */ - "movdqa %%xmm3, %%xmm2\n\t" - "movdqa %%xmm3, %%xmm4\n\t" - "psrld $1, %%xmm2\n\t" /* packed left shifting >> 1 */ - "movdqa %%xmm3, %%xmm5\n\t" - "psrld $2, %%xmm4\n\t" /* packed left shifting >> 2 */ - "psrld $7, %%xmm5\n\t" /* packed left shifting >> 7 */ - "pxor %%xmm4, %%xmm2\n\t" /* xor the shifted versions */ - "pxor %%xmm5, %%xmm2\n\t" - "pxor %%xmm7, %%xmm2\n\t" - "pxor %%xmm2, %%xmm3\n\t" - "pxor %%xmm3, %%xmm1\n\t" /* the result is in xmm1 */ - ::: "cc" ); + reduction(); } - #ifdef __x86_64__ -static inline void gfmul_pclmul_aggr4(void) +static inline void gfmul_pclmul_aggr4(const void *buf, const void *h_table) { /* Input: - H?: XMM0 X_i : XMM6 - H?: XMM8 X_(i-1) : XMM3 - H?: XMM9 X_(i-2) : XMM2 - H?: XMM10 X_(i-3)?Y_(i-4): XMM1 + H?: XMM0 + bemask: XMM15 + Hash: XMM1 Output: - Y_i: XMM1 - Inputs XMM0 stays unmodified. - Input must be converted to little-endian. + Hash: XMM1 + Inputs XMM0 and XMM14 stays unmodified. */ - asm volatile (/* perform clmul and merge results... 
*/ - "pshufd $78, %%xmm10, %%xmm11\n\t" - "pshufd $78, %%xmm1, %%xmm12\n\t" - "pxor %%xmm10, %%xmm11\n\t" /* xmm11 holds 4:a0+a1 */ - "pxor %%xmm1, %%xmm12\n\t" /* xmm12 holds 4:b0+b1 */ - - "pshufd $78, %%xmm9, %%xmm13\n\t" - "pshufd $78, %%xmm2, %%xmm14\n\t" - "pxor %%xmm9, %%xmm13\n\t" /* xmm13 holds 3:a0+a1 */ - "pxor %%xmm2, %%xmm14\n\t" /* xmm14 holds 3:b0+b1 */ - - "pshufd $78, %%xmm8, %%xmm5\n\t" - "pshufd $78, %%xmm3, %%xmm15\n\t" - "pxor %%xmm8, %%xmm5\n\t" /* xmm1 holds 2:a0+a1 */ - "pxor %%xmm3, %%xmm15\n\t" /* xmm2 holds 2:b0+b1 */ - - "movdqa %%xmm10, %%xmm4\n\t" - "movdqa %%xmm9, %%xmm7\n\t" - "pclmulqdq $0, %%xmm1, %%xmm4\n\t" /* xmm4 holds 4:a0*b0 */ - "pclmulqdq $0, %%xmm2, %%xmm7\n\t" /* xmm7 holds 3:a0*b0 */ + asm volatile (/* Load H2, H3, H4. */ + "movdqu 2*16(%[h_table]), %%xmm10\n\t" + "movdqu 1*16(%[h_table]), %%xmm9\n\t" + "movdqu 0*16(%[h_table]), %%xmm8\n\t" + + /* perform clmul and merge results... */ + "movdqu 0*16(%[buf]), %%xmm5\n\t" + "movdqu 1*16(%[buf]), %%xmm2\n\t" + "pshufb %%xmm15, %%xmm5\n\t" /* be => le */ + "pshufb %%xmm15, %%xmm2\n\t" /* be => le */ + "pxor %%xmm5, %%xmm1\n\t" + + "pshufd $78, %%xmm10, %%xmm5\n\t" + "pshufd $78, %%xmm1, %%xmm4\n\t" + "pxor %%xmm10, %%xmm5\n\t" /* xmm5 holds 4:a0+a1 */ + "pxor %%xmm1, %%xmm4\n\t" /* xmm4 holds 4:b0+b1 */ + "movdqa %%xmm10, %%xmm3\n\t" + "pclmulqdq $0, %%xmm1, %%xmm3\n\t" /* xmm3 holds 4:a0*b0 */ "pclmulqdq $17, %%xmm10, %%xmm1\n\t" /* xmm1 holds 4:a1*b1 */ - "pclmulqdq $17, %%xmm9, %%xmm2\n\t" /* xmm9 holds 3:a1*b1 */ - "pclmulqdq $0, %%xmm11, %%xmm12\n\t" /* xmm12 holds 4:(a0+a1)*(b0+b1) */ - "pclmulqdq $0, %%xmm13, %%xmm14\n\t" /* xmm14 holds 3:(a0+a1)*(b0+b1) */ + "pclmulqdq $0, %%xmm5, %%xmm4\n\t" /* xmm4 holds 4:(a0+a1)*(b0+b1) */ + + "pshufd $78, %%xmm9, %%xmm11\n\t" + "pshufd $78, %%xmm2, %%xmm7\n\t" + "pxor %%xmm9, %%xmm11\n\t" /* xmm11 holds 3:a0+a1 */ + "pxor %%xmm2, %%xmm7\n\t" /* xmm7 holds 3:b0+b1 */ + "movdqa %%xmm9, %%xmm6\n\t" + "pclmulqdq $0, %%xmm2, %%xmm6\n\t" /* xmm6 holds 3:a0*b0 */ + "pclmulqdq $17, %%xmm9, %%xmm2\n\t" /* xmm2 holds 3:a1*b1 */ + "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 3:(a0+a1)*(b0+b1) */ + + "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 3+4:a0*b0 */ + "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 3+4:a1*b1 */ + "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 3+4:(a0+a1)*(b0+b1) */ + + "movdqu 2*16(%[buf]), %%xmm5\n\t" + "movdqu 3*16(%[buf]), %%xmm2\n\t" + "pshufb %%xmm15, %%xmm5\n\t" /* be => le */ + "pshufb %%xmm15, %%xmm2\n\t" /* be => le */ + + "pshufd $78, %%xmm8, %%xmm11\n\t" + "pshufd $78, %%xmm5, %%xmm7\n\t" + "pxor %%xmm8, %%xmm11\n\t" /* xmm11 holds 2:a0+a1 */ + "pxor %%xmm5, %%xmm7\n\t" /* xmm7 holds 2:b0+b1 */ + "movdqa %%xmm8, %%xmm6\n\t" + "pclmulqdq $0, %%xmm5, %%xmm6\n\t" /* xmm6 holds 2:a0*b0 */ + "pclmulqdq $17, %%xmm8, %%xmm5\n\t" /* xmm5 holds 2:a1*b1 */ + "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 2:(a0+a1)*(b0+b1) */ + + "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 2+3+4:a0*b0 */ + "pxor %%xmm5, %%xmm1\n\t" /* xmm1 holds 2+3+4:a1*b1 */ + "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 2+3+4:(a0+a1)*(b0+b1) */ + + "pshufd $78, %%xmm0, %%xmm11\n\t" + "pshufd $78, %%xmm2, %%xmm7\n\t" + "pxor %%xmm0, %%xmm11\n\t" /* xmm11 holds 1:a0+a1 */ + "pxor %%xmm2, %%xmm7\n\t" /* xmm7 holds 1:b0+b1 */ + "movdqa %%xmm0, %%xmm6\n\t" + "pclmulqdq $0, %%xmm2, %%xmm6\n\t" /* xmm6 holds 1:a0*b0 */ + "pclmulqdq $17, %%xmm0, %%xmm2\n\t" /* xmm2 holds 1:a1*b1 */ + "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 1:(a0+a1)*(b0+b1) */ - "pshufd $78, %%xmm0, %%xmm10\n\t" - "pshufd 
$78, %%xmm6, %%xmm11\n\t" - "pxor %%xmm0, %%xmm10\n\t" /* xmm10 holds 1:a0+a1 */ - "pxor %%xmm6, %%xmm11\n\t" /* xmm11 holds 1:b0+b1 */ + "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 1+2+3+4:a0*b0 */ + "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 1+2+3+4:a1*b1 */ + "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 1+2+3+4:(a0+a1)*(b0+b1) */ - "pxor %%xmm4, %%xmm7\n\t" /* xmm7 holds 3+4:a0*b0 */ - "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 3+4:a1*b1 */ - "pxor %%xmm14, %%xmm12\n\t" /* xmm12 holds 3+4:(a0+a1)*(b0+b1) */ + /* aggregated reduction... */ + "movdqa %%xmm3, %%xmm5\n\t" + "pxor %%xmm1, %%xmm5\n\t" /* xmm5 holds a0*b0+a1*b1 */ + "pxor %%xmm5, %%xmm4\n\t" /* xmm4 holds a0*b0+a1*b1+(a0+a1)*(b0+b1) */ + "movdqa %%xmm4, %%xmm5\n\t" + "psrldq $8, %%xmm4\n\t" + "pslldq $8, %%xmm5\n\t" + "pxor %%xmm5, %%xmm3\n\t" + "pxor %%xmm4, %%xmm1\n\t" /* holds the result of the + carry-less multiplication of xmm0 + by xmm1 */ + : + : [buf] "r" (buf), + [h_table] "r" (h_table) + : "memory" ); - "movdqa %%xmm8, %%xmm13\n\t" - "pclmulqdq $0, %%xmm3, %%xmm13\n\t" /* xmm13 holds 2:a0*b0 */ - "pclmulqdq $17, %%xmm8, %%xmm3\n\t" /* xmm3 holds 2:a1*b1 */ - "pclmulqdq $0, %%xmm5, %%xmm15\n\t" /* xmm15 holds 2:(a0+a1)*(b0+b1) */ + reduction(); +} - "pxor %%xmm13, %%xmm7\n\t" /* xmm7 holds 2+3+4:a0*b0 */ - "pxor %%xmm3, %%xmm1\n\t" /* xmm1 holds 2+3+4:a1*b1 */ - "pxor %%xmm15, %%xmm12\n\t" /* xmm12 holds 2+3+4:(a0+a1)*(b0+b1) */ +static inline void gfmul_pclmul_aggr8(const void *buf, const void *h_table) +{ + /* Input: + H?: XMM0 + bemask: XMM15 + Hash: XMM1 + Output: + Hash: XMM1 + Inputs XMM0 and XMM14 stays unmodified. + */ + asm volatile (/* Load H6, H7, H8. */ + "movdqu 6*16(%[h_table]), %%xmm10\n\t" + "movdqu 5*16(%[h_table]), %%xmm9\n\t" + "movdqu 4*16(%[h_table]), %%xmm8\n\t" + + /* perform clmul and merge results... 
*/ + "movdqu 0*16(%[buf]), %%xmm5\n\t" + "movdqu 1*16(%[buf]), %%xmm2\n\t" + "pshufb %%xmm15, %%xmm5\n\t" /* be => le */ + "pshufb %%xmm15, %%xmm2\n\t" /* be => le */ + "pxor %%xmm5, %%xmm1\n\t" + + "pshufd $78, %%xmm10, %%xmm5\n\t" + "pshufd $78, %%xmm1, %%xmm4\n\t" + "pxor %%xmm10, %%xmm5\n\t" /* xmm5 holds 8:a0+a1 */ + "pxor %%xmm1, %%xmm4\n\t" /* xmm4 holds 8:b0+b1 */ + "movdqa %%xmm10, %%xmm3\n\t" + "pclmulqdq $0, %%xmm1, %%xmm3\n\t" /* xmm3 holds 8:a0*b0 */ + "pclmulqdq $17, %%xmm10, %%xmm1\n\t" /* xmm1 holds 8:a1*b1 */ + "pclmulqdq $0, %%xmm5, %%xmm4\n\t" /* xmm4 holds 8:(a0+a1)*(b0+b1) */ + + "pshufd $78, %%xmm9, %%xmm11\n\t" + "pshufd $78, %%xmm2, %%xmm7\n\t" + "pxor %%xmm9, %%xmm11\n\t" /* xmm11 holds 7:a0+a1 */ + "pxor %%xmm2, %%xmm7\n\t" /* xmm7 holds 7:b0+b1 */ + "movdqa %%xmm9, %%xmm6\n\t" + "pclmulqdq $0, %%xmm2, %%xmm6\n\t" /* xmm6 holds 7:a0*b0 */ + "pclmulqdq $17, %%xmm9, %%xmm2\n\t" /* xmm2 holds 7:a1*b1 */ + "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 7:(a0+a1)*(b0+b1) */ + + "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 7+8:a0*b0 */ + "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 7+8:a1*b1 */ + "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 7+8:(a0+a1)*(b0+b1) */ + + "movdqu 2*16(%[buf]), %%xmm5\n\t" + "movdqu 3*16(%[buf]), %%xmm2\n\t" + "pshufb %%xmm15, %%xmm5\n\t" /* be => le */ + "pshufb %%xmm15, %%xmm2\n\t" /* be => le */ + + "pshufd $78, %%xmm8, %%xmm11\n\t" + "pshufd $78, %%xmm5, %%xmm7\n\t" + "pxor %%xmm8, %%xmm11\n\t" /* xmm11 holds 6:a0+a1 */ + "pxor %%xmm5, %%xmm7\n\t" /* xmm7 holds 6:b0+b1 */ + "movdqa %%xmm8, %%xmm6\n\t" + "pclmulqdq $0, %%xmm5, %%xmm6\n\t" /* xmm6 holds 6:a0*b0 */ + "pclmulqdq $17, %%xmm8, %%xmm5\n\t" /* xmm5 holds 6:a1*b1 */ + "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 6:(a0+a1)*(b0+b1) */ + + /* Load H3, H4, H5. 
*/ + "movdqu 3*16(%[h_table]), %%xmm10\n\t" + "movdqu 2*16(%[h_table]), %%xmm9\n\t" + "movdqu 1*16(%[h_table]), %%xmm8\n\t" + + "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 6+7+8:a0*b0 */ + "pxor %%xmm5, %%xmm1\n\t" /* xmm1 holds 6+7+8:a1*b1 */ + "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 6+7+8:(a0+a1)*(b0+b1) */ - "movdqa %%xmm0, %%xmm3\n\t" - "pclmulqdq $0, %%xmm6, %%xmm3\n\t" /* xmm3 holds 1:a0*b0 */ - "pclmulqdq $17, %%xmm0, %%xmm6\n\t" /* xmm6 holds 1:a1*b1 */ - "movdqa %%xmm11, %%xmm4\n\t" - "pclmulqdq $0, %%xmm10, %%xmm4\n\t" /* xmm4 holds 1:(a0+a1)*(b0+b1) */ + "pshufd $78, %%xmm10, %%xmm11\n\t" + "pshufd $78, %%xmm2, %%xmm7\n\t" + "pxor %%xmm10, %%xmm11\n\t" /* xmm11 holds 5:a0+a1 */ + "pxor %%xmm2, %%xmm7\n\t" /* xmm7 holds 5:b0+b1 */ + "movdqa %%xmm10, %%xmm6\n\t" + "pclmulqdq $0, %%xmm2, %%xmm6\n\t" /* xmm6 holds 5:a0*b0 */ + "pclmulqdq $17, %%xmm10, %%xmm2\n\t" /* xmm2 holds 5:a1*b1 */ + "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 5:(a0+a1)*(b0+b1) */ + + "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 5+6+7+8:a0*b0 */ + "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 5+6+7+8:a1*b1 */ + "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 5+6+7+8:(a0+a1)*(b0+b1) */ + + "movdqu 4*16(%[buf]), %%xmm5\n\t" + "movdqu 5*16(%[buf]), %%xmm2\n\t" + "pshufb %%xmm15, %%xmm5\n\t" /* be => le */ + "pshufb %%xmm15, %%xmm2\n\t" /* be => le */ + + "pshufd $78, %%xmm9, %%xmm11\n\t" + "pshufd $78, %%xmm5, %%xmm7\n\t" + "pxor %%xmm9, %%xmm11\n\t" /* xmm11 holds 4:a0+a1 */ + "pxor %%xmm5, %%xmm7\n\t" /* xmm7 holds 4:b0+b1 */ + "movdqa %%xmm9, %%xmm6\n\t" + "pclmulqdq $0, %%xmm5, %%xmm6\n\t" /* xmm6 holds 4:a0*b0 */ + "pclmulqdq $17, %%xmm9, %%xmm5\n\t" /* xmm5 holds 4:a1*b1 */ + "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 4:(a0+a1)*(b0+b1) */ + + "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 4+5+6+7+8:a0*b0 */ + "pxor %%xmm5, %%xmm1\n\t" /* xmm1 holds 4+5+6+7+8:a1*b1 */ + "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 4+5+6+7+8:(a0+a1)*(b0+b1) */ + + "pshufd $78, %%xmm8, %%xmm11\n\t" + "pshufd $78, %%xmm2, %%xmm7\n\t" + "pxor %%xmm8, %%xmm11\n\t" /* xmm11 holds 3:a0+a1 */ + "pxor %%xmm2, %%xmm7\n\t" /* xmm7 holds 3:b0+b1 */ + "movdqa %%xmm8, %%xmm6\n\t" + "pclmulqdq $0, %%xmm2, %%xmm6\n\t" /* xmm6 holds 3:a0*b0 */ + "pclmulqdq $17, %%xmm8, %%xmm2\n\t" /* xmm2 holds 3:a1*b1 */ + "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 3:(a0+a1)*(b0+b1) */ + + "movdqu 0*16(%[h_table]), %%xmm8\n\t" /* Load H2 */ + + "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 3+4+5+6+7+8:a0*b0 */ + "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 3+4+5+6+7+8:a1*b1 */ + "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 3+4+5+6+7+8:(a0+a1)*(b0+b1) */ + + "movdqu 6*16(%[buf]), %%xmm5\n\t" + "movdqu 7*16(%[buf]), %%xmm2\n\t" + "pshufb %%xmm15, %%xmm5\n\t" /* be => le */ + "pshufb %%xmm15, %%xmm2\n\t" /* be => le */ + + "pshufd $78, %%xmm8, %%xmm11\n\t" + "pshufd $78, %%xmm5, %%xmm7\n\t" + "pxor %%xmm8, %%xmm11\n\t" /* xmm11 holds 4:a0+a1 */ + "pxor %%xmm5, %%xmm7\n\t" /* xmm7 holds 4:b0+b1 */ + "movdqa %%xmm8, %%xmm6\n\t" + "pclmulqdq $0, %%xmm5, %%xmm6\n\t" /* xmm6 holds 4:a0*b0 */ + "pclmulqdq $17, %%xmm8, %%xmm5\n\t" /* xmm5 holds 4:a1*b1 */ + "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 4:(a0+a1)*(b0+b1) */ + + "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 2+3+4+5+6+7+8:a0*b0 */ + "pxor %%xmm5, %%xmm1\n\t" /* xmm1 holds 2+3+4+5+6+7+8:a1*b1 */ + "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 2+3+4+5+6+7+8:(a0+a1)*(b0+b1) */ + + "pshufd $78, %%xmm0, %%xmm11\n\t" + "pshufd $78, %%xmm2, %%xmm7\n\t" + "pxor %%xmm0, %%xmm11\n\t" /* xmm11 holds 3:a0+a1 */ + "pxor %%xmm2, %%xmm7\n\t" /* xmm7 holds 
3:b0+b1 */ + "movdqa %%xmm0, %%xmm6\n\t" + "pclmulqdq $0, %%xmm2, %%xmm6\n\t" /* xmm6 holds 3:a0*b0 */ + "pclmulqdq $17, %%xmm0, %%xmm2\n\t" /* xmm2 holds 3:a1*b1 */ + "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 3:(a0+a1)*(b0+b1) */ - "pxor %%xmm7, %%xmm3\n\t" /* xmm3 holds 1+2+3+4:a0*b0 */ - "pxor %%xmm1, %%xmm6\n\t" /* xmm6 holds 1+2+3+4:a1*b1 */ - "pxor %%xmm12, %%xmm4\n\t" /* xmm4 holds 1+2+3+4:(a0+a1)*(b0+b1) */ + "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 1+2+3+3+4+5+6+7+8:a0*b0 */ + "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 1+2+3+3+4+5+6+7+8:a1*b1 */ + "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 1+2+3+3+4+5+6+7+8:(a0+a1)*(b0+b1) */ /* aggregated reduction... */ "movdqa %%xmm3, %%xmm5\n\t" - "pxor %%xmm6, %%xmm5\n\t" /* xmm5 holds a0*b0+a1*b1 */ + "pxor %%xmm1, %%xmm5\n\t" /* xmm5 holds a0*b0+a1*b1 */ "pxor %%xmm5, %%xmm4\n\t" /* xmm4 holds a0*b0+a1*b1+(a0+a1)*(b0+b1) */ "movdqa %%xmm4, %%xmm5\n\t" "psrldq $8, %%xmm4\n\t" "pslldq $8, %%xmm5\n\t" "pxor %%xmm5, %%xmm3\n\t" - "pxor %%xmm4, %%xmm6\n\t" /* holds the result of the + "pxor %%xmm4, %%xmm1\n\t" /* holds the result of the carry-less multiplication of xmm0 by xmm1 */ + : + : [buf] "r" (buf), + [h_table] "r" (h_table) + : "memory" ); - /* shift the result by one bit position to the left cope for - the fact that bits are reversed */ - "movdqa %%xmm3, %%xmm4\n\t" - "movdqa %%xmm6, %%xmm5\n\t" - "pslld $1, %%xmm3\n\t" - "pslld $1, %%xmm6\n\t" - "psrld $31, %%xmm4\n\t" - "psrld $31, %%xmm5\n\t" - "movdqa %%xmm4, %%xmm1\n\t" - "pslldq $4, %%xmm5\n\t" - "pslldq $4, %%xmm4\n\t" - "psrldq $12, %%xmm1\n\t" - "por %%xmm4, %%xmm3\n\t" - "por %%xmm5, %%xmm6\n\t" - "por %%xmm6, %%xmm1\n\t" - - /* first phase of the reduction */ - "movdqa %%xmm3, %%xmm6\n\t" - "movdqa %%xmm3, %%xmm7\n\t" - "pslld $31, %%xmm6\n\t" /* packed right shifting << 31 */ - "movdqa %%xmm3, %%xmm5\n\t" - "pslld $30, %%xmm7\n\t" /* packed right shifting shift << 30 */ - "pslld $25, %%xmm5\n\t" /* packed right shifting shift << 25 */ - "pxor %%xmm7, %%xmm6\n\t" /* xor the shifted versions */ - "pxor %%xmm5, %%xmm6\n\t" - "movdqa %%xmm6, %%xmm7\n\t" - "pslldq $12, %%xmm6\n\t" - "psrldq $4, %%xmm7\n\t" - "pxor %%xmm6, %%xmm3\n\t" /* first phase of the reduction - complete */ - - /* second phase of the reduction */ - "movdqa %%xmm3, %%xmm2\n\t" - "movdqa %%xmm3, %%xmm4\n\t" - "psrld $1, %%xmm2\n\t" /* packed left shifting >> 1 */ - "movdqa %%xmm3, %%xmm5\n\t" - "psrld $2, %%xmm4\n\t" /* packed left shifting >> 2 */ - "psrld $7, %%xmm5\n\t" /* packed left shifting >> 7 */ - "pxor %%xmm4, %%xmm2\n\t" /* xor the shifted versions */ - "pxor %%xmm5, %%xmm2\n\t" - "pxor %%xmm7, %%xmm2\n\t" - "pxor %%xmm2, %%xmm3\n\t" - "pxor %%xmm3, %%xmm1\n\t" /* the result is in xmm1 */ - :::"cc"); + reduction(); } #endif +static inline void gcm_lsh(void *h, unsigned int hoffs) +{ + static const u64 pconst[2] __attribute__ ((aligned (16))) = + { U64_C(0x0000000000000001), U64_C(0xc200000000000000) }; + + asm volatile ("movdqu (%[h]), %%xmm2\n\t" + "pshufd $0xff, %%xmm2, %%xmm3\n\t" + "movdqa %%xmm2, %%xmm4\n\t" + "psrad $31, %%xmm3\n\t" + "pslldq $8, %%xmm4\n\t" + "pand %[pconst], %%xmm3\n\t" + "paddq %%xmm2, %%xmm2\n\t" + "psrlq $63, %%xmm4\n\t" + "pxor %%xmm3, %%xmm2\n\t" + "pxor %%xmm4, %%xmm2\n\t" + "movdqu %%xmm2, (%[h])\n\t" + : + : [pconst] "m" (pconst), + [h] "r" ((byte *)h + hoffs) + : "memory" ); +} void _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c) @@ -254,15 +394,22 @@ _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c) static const unsigned char be_mask[16] __attribute__ 
((aligned (16))) = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; #if defined(__x86_64__) && defined(__WIN64__) - char win64tmp[3 * 16]; + char win64tmp[10 * 16]; - /* XMM6-XMM8 need to be restored after use. */ - asm volatile ("movdqu %%xmm6, 0*16(%0)\n\t" - "movdqu %%xmm7, 1*16(%0)\n\t" - "movdqu %%xmm8, 2*16(%0)\n\t" + /* XMM6-XMM15 need to be restored after use. */ + asm volatile ("movdqu %%xmm6, 0*16(%0)\n\t" + "movdqu %%xmm7, 1*16(%0)\n\t" + "movdqu %%xmm8, 2*16(%0)\n\t" + "movdqu %%xmm9, 3*16(%0)\n\t" + "movdqu %%xmm10, 4*16(%0)\n\t" + "movdqu %%xmm11, 5*16(%0)\n\t" + "movdqu %%xmm12, 6*16(%0)\n\t" + "movdqu %%xmm13, 7*16(%0)\n\t" + "movdqu %%xmm14, 8*16(%0)\n\t" + "movdqu %%xmm15, 9*16(%0)\n\t" : : "r" (win64tmp) - : "memory"); + : "memory" ); #endif /* Swap endianness of hsub. */ @@ -274,36 +421,82 @@ _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c) [be_mask] "m" (*be_mask) : "memory"); + gcm_lsh(c->u_mode.gcm.u_ghash_key.key, 0); /* H <<< 1 */ + #ifdef __x86_64__ asm volatile ("movdqa %%xmm0, %%xmm1\n\t" + "movdqu (%[key]), %%xmm0\n\t" /* load H <<< 1 */ : - : + : [key] "r" (c->u_mode.gcm.u_ghash_key.key) : "memory"); - gfmul_pclmul (); /* H?H => H? */ + gfmul_pclmul (); /* H<<<1?H => H? */ - asm volatile ("movdqu %%xmm1, 0*16(%[h_234])\n\t" + asm volatile ("movdqu %%xmm1, 0*16(%[h_table])\n\t" "movdqa %%xmm1, %%xmm8\n\t" : - : [h_234] "r" (c->u_mode.gcm.gcm_table) + : [h_table] "r" (c->u_mode.gcm.gcm_table) : "memory"); - gfmul_pclmul (); /* H?H? => H? */ + gcm_lsh(c->u_mode.gcm.gcm_table, 0 * 16); /* H? <<< 1 */ + gfmul_pclmul (); /* H<<<1?H? => H? */ asm volatile ("movdqa %%xmm8, %%xmm0\n\t" - "movdqu %%xmm1, 1*16(%[h_234])\n\t" - "movdqa %%xmm8, %%xmm1\n\t" + "movdqu %%xmm1, 1*16(%[h_table])\n\t" + "movdqu 0*16(%[h_table]), %%xmm1\n\t" /* load H? <<< 1 */ : - : [h_234] "r" (c->u_mode.gcm.gcm_table) + : [h_table] "r" (c->u_mode.gcm.gcm_table) : "memory"); - gfmul_pclmul (); /* H??H? => H? */ + gfmul_pclmul (); /* H?<<<1?H? => H? */ - asm volatile ("movdqu %%xmm1, 2*16(%[h_234])\n\t" + asm volatile ("movdqu %%xmm1, 2*16(%[h_table])\n\t" + "movdqa %%xmm1, %%xmm0\n\t" + "movdqu (%[key]), %%xmm1\n\t" /* load H <<< 1 */ : - : [h_234] "r" (c->u_mode.gcm.gcm_table) + : [h_table] "r" (c->u_mode.gcm.gcm_table), + [key] "r" (c->u_mode.gcm.u_ghash_key.key) : "memory"); + gcm_lsh(c->u_mode.gcm.gcm_table, 1 * 16); /* H? <<< 1 */ + gcm_lsh(c->u_mode.gcm.gcm_table, 2 * 16); /* H? <<< 1 */ + + gfmul_pclmul (); /* H<<<1?H? => H? */ + + asm volatile ("movdqu %%xmm1, 3*16(%[h_table])\n\t" + "movdqu 0*16(%[h_table]), %%xmm1\n\t" /* load H? <<< 1 */ + : + : [h_table] "r" (c->u_mode.gcm.gcm_table) + : "memory"); + + gfmul_pclmul (); /* H?<<<1?H? => H? */ + + asm volatile ("movdqu %%xmm1, 4*16(%[h_table])\n\t" + "movdqu 1*16(%[h_table]), %%xmm1\n\t" /* load H? <<< 1 */ + : + : [h_table] "r" (c->u_mode.gcm.gcm_table) + : "memory"); + + gfmul_pclmul (); /* H?<<<1?H? => H? */ + + asm volatile ("movdqu %%xmm1, 5*16(%[h_table])\n\t" + "movdqu 2*16(%[h_table]), %%xmm1\n\t" /* load H? <<< 1 */ + : + : [h_table] "r" (c->u_mode.gcm.gcm_table) + : "memory"); + + gfmul_pclmul (); /* H?<<<1?H? => H? */ + + asm volatile ("movdqu %%xmm1, 6*16(%[h_table])\n\t" + : + : [h_table] "r" (c->u_mode.gcm.gcm_table) + : "memory"); + + gcm_lsh(c->u_mode.gcm.gcm_table, 3 * 16); /* H? <<< 1 */ + gcm_lsh(c->u_mode.gcm.gcm_table, 4 * 16); /* H? <<< 1 */ + gcm_lsh(c->u_mode.gcm.gcm_table, 5 * 16); /* H? <<< 1 */ + gcm_lsh(c->u_mode.gcm.gcm_table, 6 * 16); /* H? 
<<< 1 */ + #ifdef __WIN64__ /* Clear/restore used registers. */ asm volatile( "pxor %%xmm0, %%xmm0\n\t" @@ -315,9 +508,16 @@ _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c) "movdqu 0*16(%0), %%xmm6\n\t" "movdqu 1*16(%0), %%xmm7\n\t" "movdqu 2*16(%0), %%xmm8\n\t" + "movdqu 3*16(%0), %%xmm9\n\t" + "movdqu 4*16(%0), %%xmm10\n\t" + "movdqu 5*16(%0), %%xmm11\n\t" + "movdqu 6*16(%0), %%xmm12\n\t" + "movdqu 7*16(%0), %%xmm13\n\t" + "movdqu 8*16(%0), %%xmm14\n\t" + "movdqu 9*16(%0), %%xmm15\n\t" : : "r" (win64tmp) - : "memory"); + : "memory" ); #else /* Clear used registers. */ asm volatile( "pxor %%xmm0, %%xmm0\n\t" @@ -329,7 +529,14 @@ _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c) "pxor %%xmm6, %%xmm6\n\t" "pxor %%xmm7, %%xmm7\n\t" "pxor %%xmm8, %%xmm8\n\t" - ::: "cc" ); + "pxor %%xmm9, %%xmm9\n\t" + "pxor %%xmm10, %%xmm10\n\t" + "pxor %%xmm11, %%xmm11\n\t" + "pxor %%xmm12, %%xmm12\n\t" + "pxor %%xmm13, %%xmm13\n\t" + "pxor %%xmm14, %%xmm14\n\t" + "pxor %%xmm15, %%xmm15\n\t" + ::: "memory" ); #endif #endif } @@ -342,15 +549,15 @@ _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf, static const unsigned char be_mask[16] __attribute__ ((aligned (16))) = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; const unsigned int blocksize = GCRY_GCM_BLOCK_LEN; -#ifdef __WIN64__ +#if defined(__x86_64__) && defined(__WIN64__) char win64tmp[10 * 16]; #endif if (nblocks == 0) return 0; -#ifdef __WIN64__ - /* XMM8-XMM15 need to be restored after use. */ +#if defined(__x86_64__) && defined(__WIN64__) + /* XMM6-XMM15 need to be restored after use. */ asm volatile ("movdqu %%xmm6, 0*16(%0)\n\t" "movdqu %%xmm7, 1*16(%0)\n\t" "movdqu %%xmm8, 2*16(%0)\n\t" @@ -367,44 +574,39 @@ _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf, #endif /* Preload hash and H1. */ - asm volatile ("movdqu %[hash], %%xmm1\n\t" + asm volatile ("movdqa %[be_mask], %%xmm7\n\t" + "movdqu %[hash], %%xmm1\n\t" "movdqa %[hsub], %%xmm0\n\t" - "pshufb %[be_mask], %%xmm1\n\t" /* be => le */ + "pshufb %%xmm7, %%xmm1\n\t" /* be => le */ : - : [hash] "m" (*result), [be_mask] "m" (*be_mask), - [hsub] "m" (*c->u_mode.gcm.u_ghash_key.key)); + : [hash] "m" (*result), + [be_mask] "m" (*be_mask), + [hsub] "m" (*c->u_mode.gcm.u_ghash_key.key) + : "memory" ); #ifdef __x86_64__ if (nblocks >= 4) { - do + asm volatile ("movdqa %%xmm7, %%xmm15\n\t" + : + : + : "memory" ); + + while (nblocks >= 8) { - asm volatile ("movdqa %[be_mask], %%xmm4\n\t" - "movdqu 0*16(%[buf]), %%xmm5\n\t" - "movdqu 1*16(%[buf]), %%xmm2\n\t" - "movdqu 2*16(%[buf]), %%xmm3\n\t" - "movdqu 3*16(%[buf]), %%xmm6\n\t" - "pshufb %%xmm4, %%xmm5\n\t" /* be => le */ - - /* Load H2, H3, H4. */ - "movdqu 2*16(%[h_234]), %%xmm10\n\t" - "movdqu 1*16(%[h_234]), %%xmm9\n\t" - "movdqu 0*16(%[h_234]), %%xmm8\n\t" - - "pxor %%xmm5, %%xmm1\n\t" - "pshufb %%xmm4, %%xmm2\n\t" /* be => le */ - "pshufb %%xmm4, %%xmm3\n\t" /* be => le */ - "pshufb %%xmm4, %%xmm6\n\t" /* be => le */ - : - : [buf] "r" (buf), [be_mask] "m" (*be_mask), - [h_234] "r" (c->u_mode.gcm.gcm_table)); - - gfmul_pclmul_aggr4 (); + gfmul_pclmul_aggr8 (buf, c->u_mode.gcm.gcm_table); + + buf += 8 * blocksize; + nblocks -= 8; + } + + if (nblocks >= 4) + { + gfmul_pclmul_aggr4 (buf, c->u_mode.gcm.gcm_table); buf += 4 * blocksize; nblocks -= 4; } - while (nblocks >= 4); #ifndef __WIN64__ /* Clear used x86-64/XMM registers. 
*/ @@ -416,31 +618,34 @@ _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf, "pxor %%xmm13, %%xmm13\n\t" "pxor %%xmm14, %%xmm14\n\t" "pxor %%xmm15, %%xmm15\n\t" - ::: "cc" ); + ::: "memory" ); #endif } #endif - while (nblocks--) + while (nblocks) { asm volatile ("movdqu %[buf], %%xmm2\n\t" "pshufb %[be_mask], %%xmm2\n\t" /* be => le */ "pxor %%xmm2, %%xmm1\n\t" : - : [buf] "m" (*buf), [be_mask] "m" (*be_mask)); + : [buf] "m" (*buf), [be_mask] "m" (*be_mask) + : "memory" ); gfmul_pclmul (); buf += blocksize; + nblocks--; } /* Store hash. */ asm volatile ("pshufb %[be_mask], %%xmm1\n\t" /* be => le */ "movdqu %%xmm1, %[hash]\n\t" : [hash] "=m" (*result) - : [be_mask] "m" (*be_mask)); + : [be_mask] "m" (*be_mask) + : "memory" ); -#ifdef __WIN64__ +#if defined(__x86_64__) && defined(__WIN64__) /* Clear/restore used registers. */ asm volatile( "pxor %%xmm0, %%xmm0\n\t" "pxor %%xmm1, %%xmm1\n\t" @@ -471,7 +676,7 @@ _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf, "pxor %%xmm5, %%xmm5\n\t" "pxor %%xmm6, %%xmm6\n\t" "pxor %%xmm7, %%xmm7\n\t" - ::: "cc" ); + ::: "memory" ); #endif return 0; diff --git a/cipher/cipher-internal.h b/cipher/cipher-internal.h index 970aa9860..fdec0a1bd 100644 --- a/cipher/cipher-internal.h +++ b/cipher/cipher-internal.h @@ -313,7 +313,10 @@ struct gcry_cipher_handle /* Pre-calculated table for GCM. */ #ifdef GCM_USE_TABLES - #if (SIZEOF_UNSIGNED_LONG == 8 || defined(__x86_64__)) + #if defined(__x86_64__) && defined(GCM_USE_INTEL_PCLMUL) + #define GCM_TABLES_USE_U64 1 + u64 gcm_table[7 * 16]; /* Extra table space for PCLMUL aggr8 */ + #elif (SIZEOF_UNSIGNED_LONG == 8) || defined(__x86_64__) #define GCM_TABLES_USE_U64 1 u64 gcm_table[2 * 16]; #else From jussi.kivilinna at iki.fi Sat Apr 27 22:03:04 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 27 Apr 2019 23:03:04 +0300 Subject: [PATCH 2/4] Prefetch GCM look-up tables In-Reply-To: <155639537849.3345.2780323880296165039.stgit@localhost.localdomain> References: <155639537849.3345.2780323880296165039.stgit@localhost.localdomain> Message-ID: <155639538390.3345.9222163520460838914.stgit@localhost.localdomain> * cipher/cipher-gcm.c (prefetch_table, do_prefetch_tables) (prefetch_tables): New. (ghash_internal): Call prefetch_tables. 
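For readers skimming the change: the prefetch helpers added below walk the per-key
gcm_table and the shared gcmR table once before hashing, reading one byte out of every
32-byte chunk (plus the last byte), so the tables are pulled into cache before the
per-block loop runs. A minimal standalone sketch of that access pattern follows; the
name warm_table is illustrative only and not from the patch, and the patch's
prefetch_table additionally unrolls the loop by eight:

    #include <stddef.h>

    /* Touch one byte per 32-byte chunk; the volatile-qualified pointer
       keeps the compiler from dropping the otherwise unused reads. */
    static void warm_table (const void *tab, size_t len)
    {
      const volatile unsigned char *vtab = tab;
      size_t i;

      for (i = 0; i < len; i += 32)
        (void)vtab[i];

      if (len)
        (void)vtab[len - 1];
    }

In the patch itself, ghash_internal() calls prefetch_tables(c) a single time before its
block loop, so the table walk is paid once per GHASH invocation rather than once per block.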
-- Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/cipher/cipher-gcm.c b/cipher/cipher-gcm.c index c19f09f27..11f119aa7 100644 --- a/cipher/cipher-gcm.c +++ b/cipher/cipher-gcm.c @@ -118,6 +118,34 @@ static const u16 gcmR[256] = { 0xbbf0, 0xba32, 0xb874, 0xb9b6, 0xbcf8, 0xbd3a, 0xbf7c, 0xbebe, }; +static inline +void prefetch_table(const void *tab, size_t len) +{ + const volatile byte *vtab = tab; + size_t i; + + for (i = 0; i < len; i += 8 * 32) + { + (void)vtab[i + 0 * 32]; + (void)vtab[i + 1 * 32]; + (void)vtab[i + 2 * 32]; + (void)vtab[i + 3 * 32]; + (void)vtab[i + 4 * 32]; + (void)vtab[i + 5 * 32]; + (void)vtab[i + 6 * 32]; + (void)vtab[i + 7 * 32]; + } + + (void)vtab[len - 1]; +} + +static inline void +do_prefetch_tables (const void *gcmM, size_t gcmM_size) +{ + prefetch_table(gcmM, gcmM_size); + prefetch_table(gcmR, sizeof(gcmR)); +} + #ifdef GCM_TABLES_USE_U64 static void bshift (u64 * b0, u64 * b1) @@ -365,6 +393,8 @@ do_ghash (unsigned char *result, const unsigned char *buf, const u32 *gcmM) #define fillM(c) \ do_fillM (c->u_mode.gcm.u_ghash_key.key, c->u_mode.gcm.gcm_table) #define GHASH(c, result, buf) do_ghash (result, buf, c->u_mode.gcm.gcm_table) +#define prefetch_tables(c) \ + do_prefetch_tables(c->u_mode.gcm.gcm_table, sizeof(c->u_mode.gcm.gcm_table)) #else @@ -430,6 +460,7 @@ do_ghash (unsigned char *hsub, unsigned char *result, const unsigned char *buf) #define fillM(c) do { } while (0) #define GHASH(c, result, buf) do_ghash (c->u_mode.gcm.u_ghash_key.key, result, buf) +#define prefetch_tables(c) do {} while (0) #endif /* !GCM_USE_TABLES */ @@ -441,6 +472,8 @@ ghash_internal (gcry_cipher_hd_t c, byte *result, const byte *buf, const unsigned int blocksize = GCRY_GCM_BLOCK_LEN; unsigned int burn = 0; + prefetch_tables (c); + while (nblocks) { burn = GHASH (c, result, buf); From jussi.kivilinna at iki.fi Sat Apr 27 22:02:58 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 27 Apr 2019 23:02:58 +0300 Subject: [PATCH 1/4] Optimizations for generic table-based GCM implementations Message-ID: <155639537849.3345.2780323880296165039.stgit@localhost.localdomain> * cipher/cipher-gcm.c [GCM_TABLES_USE_U64] (do_fillM): Precalculate M[32..63] values. [GCM_TABLES_USE_U64] (do_ghash): Split processing of two 64-bit halfs of the input to two separate loops; Use precalculated M[] values. [GCM_USE_TABLES && !GCM_TABLES_USE_U64] (do_fillM): Precalculate M[64..127] values. [GCM_USE_TABLES && !GCM_TABLES_USE_U64] (do_ghash): Use precalculated M[] values. [GCM_USE_TABLES] (bshift): Avoid conditional execution for mask calculation. * cipher/cipher-internal.h (gcry_cipher_handle): Double gcm_table size. 
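One small detail that is easy to miss in the bshift() change noted above: the mask that
selects the 0xe1 reduction constant is now computed without conditional execution. The
diff below replaces

    mask = t[1] & 1 ? 0xe1 : 0;

with

    mask = -(t[1] & 1) & 0xe1;

and does the same for the u32 variant. A minimal illustration of why the two forms agree;
the helper name is made up for the example and is not part of the patch:

    #include <stdint.h>

    /* (t & 1) is 0 or 1.  Negating it in unsigned arithmetic gives
       either 0 or an all-ones word, so ANDing with 0xe1 yields the
       same 0 or 0xe1 as the conditional expression, but without a
       conditional. */
    static uint64_t reduction_mask (uint64_t t)
    {
      return -(t & 1) & 0xe1;
    }

As in the existing code, the caller then shifts the mask into the top byte (mask <<= 56)
before folding it into the shifted halves.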
-- Benchmark on Intel Haswell (amd64, --disable-hwf all): Before: | nanosecs/byte mebibytes/sec cycles/byte auto Mhz GMAC_AES | 2.79 ns/B 341.3 MiB/s 11.17 c/B 3998 After (~36% faster): | nanosecs/byte mebibytes/sec cycles/byte auto Mhz GMAC_AES | 2.05 ns/B 464.7 MiB/s 8.20 c/B 3998 Benchmark on Intel Haswell (win32, --disable-hwf all): Before: | nanosecs/byte mebibytes/sec cycles/byte auto Mhz GMAC_AES | 4.90 ns/B 194.8 MiB/s 19.57 c/B 3997 After (~36% faster): | nanosecs/byte mebibytes/sec cycles/byte auto Mhz GMAC_AES | 3.58 ns/B 266.4 MiB/s 14.31 c/B 3999 Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/cipher/cipher-gcm.c b/cipher/cipher-gcm.c index cbda87be2..c19f09f27 100644 --- a/cipher/cipher-gcm.c +++ b/cipher/cipher-gcm.c @@ -1,6 +1,6 @@ /* cipher-gcm.c - Generic Galois Counter Mode implementation * Copyright (C) 2013 Dmitry Eremin-Solenikov - * Copyright (C) 2013, 2018 Jussi Kivilinna + * Copyright (C) 2013, 2018-2019 Jussi Kivilinna * * This file is part of Libgcrypt. * @@ -126,7 +126,7 @@ bshift (u64 * b0, u64 * b1) t[0] = *b0; t[1] = *b1; - mask = t[1] & 1 ? 0xe1 : 0; + mask = -(t[1] & 1) & 0xe1; mask <<= 56; *b1 = (t[1] >> 1) ^ (t[0] << 63); @@ -158,6 +158,12 @@ do_fillM (unsigned char *h, u64 *M) M[(i + j) + 0] = M[i + 0] ^ M[j + 0]; M[(i + j) + 16] = M[i + 16] ^ M[j + 16]; } + + for (i = 0; i < 16; i++) + { + M[i + 32] = (M[i + 0] >> 4) ^ ((u64) gcmR[(M[i + 16] & 0xf) << 4] << 48); + M[i + 48] = (M[i + 16] >> 4) ^ (M[i + 0] << 60); + } } static inline unsigned int @@ -175,20 +181,18 @@ do_ghash (unsigned char *result, const unsigned char *buf, const u64 *gcmM) V[1] = be_bswap64 (V[1]); /* First round can be manually tweaked based on fact that 'tmp' is zero. */ - i = 15; - - M = &gcmM[(V[1] & 0xf)]; + M = &gcmM[(V[1] & 0xf) + 32]; V[1] >>= 4; - tmp[0] = (M[0] >> 4) ^ ((u64) gcmR[(M[16] & 0xf) << 4] << 48); - tmp[1] = (M[16] >> 4) ^ (M[0] << 60); + tmp[0] = M[0]; + tmp[1] = M[16]; tmp[0] ^= gcmM[(V[1] & 0xf) + 0]; tmp[1] ^= gcmM[(V[1] & 0xf) + 16]; V[1] >>= 4; - --i; + i = 6; while (1) { - M = &gcmM[(V[1] & 0xf)]; + M = &gcmM[(V[1] & 0xf) + 32]; V[1] >>= 4; A = tmp[1] & 0xff; @@ -196,15 +200,34 @@ do_ghash (unsigned char *result, const unsigned char *buf, const u64 *gcmM) tmp[0] = (T >> 8) ^ ((u64) gcmR[A] << 48) ^ gcmM[(V[1] & 0xf) + 0]; tmp[1] = (T << 56) ^ (tmp[1] >> 8) ^ gcmM[(V[1] & 0xf) + 16]; - tmp[0] ^= (M[0] >> 4) ^ ((u64) gcmR[(M[16] & 0xf) << 4] << 48); - tmp[1] ^= (M[16] >> 4) ^ (M[0] << 60); + tmp[0] ^= M[0]; + tmp[1] ^= M[16]; + + if (i == 0) + break; + + V[1] >>= 4; + --i; + } + + i = 7; + while (1) + { + M = &gcmM[(V[0] & 0xf) + 32]; + V[0] >>= 4; + + A = tmp[1] & 0xff; + T = tmp[0]; + tmp[0] = (T >> 8) ^ ((u64) gcmR[A] << 48) ^ gcmM[(V[0] & 0xf) + 0]; + tmp[1] = (T << 56) ^ (tmp[1] >> 8) ^ gcmM[(V[0] & 0xf) + 16]; + + tmp[0] ^= M[0]; + tmp[1] ^= M[16]; if (i == 0) break; - else if (i == 8) - V[1] = V[0]; - else - V[1] >>= 4; + + V[0] >>= 4; --i; } @@ -226,7 +249,7 @@ bshift (u32 * M, int i) t[1] = M[i * 4 + 1]; t[2] = M[i * 4 + 2]; t[3] = M[i * 4 + 3]; - mask = t[3] & 1 ? 
0xe1 : 0; + mask = -(t[3] & 1) & 0xe1; M[i * 4 + 3] = (t[3] >> 1) ^ (t[2] << 31); M[i * 4 + 2] = (t[2] >> 1) ^ (t[1] << 31); @@ -267,6 +290,15 @@ do_fillM (unsigned char *h, u32 *M) M[(i + j) * 4 + 2] = M[i * 4 + 2] ^ M[j * 4 + 2]; M[(i + j) * 4 + 3] = M[i * 4 + 3] ^ M[j * 4 + 3]; } + + for (i = 0; i < 4 * 16; i += 4) + { + M[i + 0 + 64] = (M[i + 0] >> 4) + ^ ((u64) gcmR[(M[i + 3] << 4) & 0xf0] << 16); + M[i + 1 + 64] = (M[i + 1] >> 4) ^ (M[i + 0] << 28); + M[i + 2 + 64] = (M[i + 2] >> 4) ^ (M[i + 1] << 28); + M[i + 3 + 64] = (M[i + 3] >> 4) ^ (M[i + 2] << 28); + } } static inline unsigned int @@ -285,19 +317,19 @@ do_ghash (unsigned char *result, const unsigned char *buf, const u32 *gcmM) i = 15; v = V[i]; - M = &gcmM[(v & 0xf) * 4]; + M = &gcmM[(v & 0xf) * 4 + 64]; v = (v & 0xf0) >> 4; m = &gcmM[v * 4]; v = V[--i]; - tmp[0] = (M[0] >> 4) ^ ((u64) gcmR[(M[3] << 4) & 0xf0] << 16) ^ m[0]; - tmp[1] = (M[1] >> 4) ^ (M[0] << 28) ^ m[1]; - tmp[2] = (M[2] >> 4) ^ (M[1] << 28) ^ m[2]; - tmp[3] = (M[3] >> 4) ^ (M[2] << 28) ^ m[3]; + tmp[0] = M[0] ^ m[0]; + tmp[1] = M[1] ^ m[1]; + tmp[2] = M[2] ^ m[2]; + tmp[3] = M[3] ^ m[3]; while (1) { - M = &gcmM[(v & 0xf) * 4]; + M = &gcmM[(v & 0xf) * 4 + 64]; v = (v & 0xf0) >> 4; m = &gcmM[v * 4]; @@ -309,10 +341,10 @@ do_ghash (unsigned char *result, const unsigned char *buf, const u32 *gcmM) tmp[2] = (T[1] << 24) ^ (tmp[2] >> 8) ^ m[2]; tmp[3] = (T[2] << 24) ^ (tmp[3] >> 8) ^ m[3]; - tmp[0] ^= (M[0] >> 4) ^ ((u64) gcmR[(M[3] << 4) & 0xf0] << 16); - tmp[1] ^= (M[1] >> 4) ^ (M[0] << 28); - tmp[2] ^= (M[2] >> 4) ^ (M[1] << 28); - tmp[3] ^= (M[3] >> 4) ^ (M[2] << 28); + tmp[0] ^= M[0]; + tmp[1] ^= M[1]; + tmp[2] ^= M[2]; + tmp[3] ^= M[3]; if (i == 0) break; diff --git a/cipher/cipher-internal.h b/cipher/cipher-internal.h index 970aa9860..47b7b6f9e 100644 --- a/cipher/cipher-internal.h +++ b/cipher/cipher-internal.h @@ -315,10 +315,10 @@ struct gcry_cipher_handle #ifdef GCM_USE_TABLES #if (SIZEOF_UNSIGNED_LONG == 8 || defined(__x86_64__)) #define GCM_TABLES_USE_U64 1 - u64 gcm_table[2 * 16]; + u64 gcm_table[4 * 16]; #else #undef GCM_TABLES_USE_U64 - u32 gcm_table[4 * 16]; + u32 gcm_table[8 * 16]; #endif #endif } gcm; From jussi.kivilinna at iki.fi Sat Apr 27 22:03:09 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 27 Apr 2019 23:03:09 +0300 Subject: [PATCH 3/4] Enable four block aggregated GCM Intel PCLMUL implementation on i386 In-Reply-To: <155639537849.3345.2780323880296165039.stgit@localhost.localdomain> References: <155639537849.3345.2780323880296165039.stgit@localhost.localdomain> Message-ID: <155639538930.3345.2103897558871858741.stgit@localhost.localdomain> * cipher/cipher-gcm-intel-pclmul.c (reduction): Change "%%xmm7" to "%%xmm5". (gfmul_pclmul_aggr4): Move outside [__x86_64__] block; Remove usage of XMM8-XMM15 registers; Do not preload H-values and be_mask to reduce register usage for i386. (_gcry_ghash_setup_intel_pclmul): Enable calculation of H2, H3 and H4 on i386. (_gcry_ghash_intel_pclmul): Adjust to above gfmul_pclmul_aggr4 changes; Move 'aggr4' code path outside [__x86_64__] block. 
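For readers following the register reshuffling described above, it may help to restate
what the aggregated path computes. gfmul_pclmul_aggr4() relies on the usual GHASH
aggregation identity: with xor standing for GF(2) addition and * for the carry-less
GF(2^128) multiply, four input blocks X1..X4 are folded into the running hash Y as

    Y <- (Y xor X1)*H^4  xor  X2*H^3  xor  X3*H^2  xor  X4*H

which gives the same result as four serial steps Y <- (Y xor Xi)*H. Because the final
reduction is linear, the code accumulates the partial products of all four
multiplications and performs a single reduction at the end (the reduction() helper).
This patch only changes which XMM registers carry the intermediate values and where
H1..H4 and the be_mask are loaded from, not the formula itself.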
-- Benchmark on Intel Haswell (win32): Before: | nanosecs/byte mebibytes/sec cycles/byte auto Mhz GMAC_AES | 0.446 ns/B 2140 MiB/s 1.78 c/B 3998 After (~2.38x faster): | nanosecs/byte mebibytes/sec cycles/byte auto Mhz GMAC_AES | 0.187 ns/B 5107 MiB/s 0.747 c/B 3998 Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/cipher/cipher-gcm-intel-pclmul.c b/cipher/cipher-gcm-intel-pclmul.c index 46af77eac..8e109ba3c 100644 --- a/cipher/cipher-gcm-intel-pclmul.c +++ b/cipher/cipher-gcm-intel-pclmul.c @@ -53,15 +53,15 @@ static inline void reduction(void) asm volatile (/* first phase of the reduction */ "movdqa %%xmm3, %%xmm6\n\t" - "movdqa %%xmm3, %%xmm7\n\t" + "movdqa %%xmm3, %%xmm5\n\t" "psllq $1, %%xmm6\n\t" /* packed right shifting << 63 */ "pxor %%xmm3, %%xmm6\n\t" - "psllq $57, %%xmm7\n\t" /* packed right shifting << 57 */ + "psllq $57, %%xmm5\n\t" /* packed right shifting << 57 */ "psllq $62, %%xmm6\n\t" /* packed right shifting << 62 */ - "pxor %%xmm7, %%xmm6\n\t" /* xor the shifted versions */ - "pshufd $0x6a, %%xmm6, %%xmm7\n\t" + "pxor %%xmm5, %%xmm6\n\t" /* xor the shifted versions */ + "pshufd $0x6a, %%xmm6, %%xmm5\n\t" "pshufd $0xae, %%xmm6, %%xmm6\n\t" - "pxor %%xmm7, %%xmm3\n\t" /* first phase of the reduction + "pxor %%xmm5, %%xmm3\n\t" /* first phase of the reduction complete */ /* second phase of the reduction */ @@ -107,77 +107,83 @@ static inline void gfmul_pclmul(void) reduction(); } -#ifdef __x86_64__ -static inline void gfmul_pclmul_aggr4(const void *buf, const void *h_table) +static inline void gfmul_pclmul_aggr4(const void *buf, const void *h_1, + const void *h_table, + const unsigned char *be_mask) { /* Input: - H?: XMM0 - bemask: XMM15 Hash: XMM1 Output: Hash: XMM1 - Inputs XMM0 and XMM14 stays unmodified. */ - asm volatile (/* Load H2, H3, H4. */ - "movdqu 2*16(%[h_table]), %%xmm10\n\t" - "movdqu 1*16(%[h_table]), %%xmm9\n\t" - "movdqu 0*16(%[h_table]), %%xmm8\n\t" - - /* perform clmul and merge results... */ + asm volatile (/* perform clmul and merge results... 
*/ + "movdqu 2*16(%[h_table]), %%xmm2\n\t" /* Load H4 */ "movdqu 0*16(%[buf]), %%xmm5\n\t" - "movdqu 1*16(%[buf]), %%xmm2\n\t" - "pshufb %%xmm15, %%xmm5\n\t" /* be => le */ - "pshufb %%xmm15, %%xmm2\n\t" /* be => le */ + "pshufb %[be_mask], %%xmm5\n\t" /* be => le */ "pxor %%xmm5, %%xmm1\n\t" - "pshufd $78, %%xmm10, %%xmm5\n\t" + "pshufd $78, %%xmm2, %%xmm5\n\t" "pshufd $78, %%xmm1, %%xmm4\n\t" - "pxor %%xmm10, %%xmm5\n\t" /* xmm5 holds 4:a0+a1 */ - "pxor %%xmm1, %%xmm4\n\t" /* xmm4 holds 4:b0+b1 */ - "movdqa %%xmm10, %%xmm3\n\t" + "pxor %%xmm2, %%xmm5\n\t" /* xmm5 holds 4:a0+a1 */ + "pxor %%xmm1, %%xmm4\n\t" /* xmm4 holds 4:b0+b1 */ + "movdqa %%xmm2, %%xmm3\n\t" "pclmulqdq $0, %%xmm1, %%xmm3\n\t" /* xmm3 holds 4:a0*b0 */ - "pclmulqdq $17, %%xmm10, %%xmm1\n\t" /* xmm1 holds 4:a1*b1 */ + "pclmulqdq $17, %%xmm2, %%xmm1\n\t" /* xmm1 holds 4:a1*b1 */ "pclmulqdq $0, %%xmm5, %%xmm4\n\t" /* xmm4 holds 4:(a0+a1)*(b0+b1) */ - "pshufd $78, %%xmm9, %%xmm11\n\t" + "movdqu 1*16(%[h_table]), %%xmm5\n\t" /* Load H3 */ + "movdqu 1*16(%[buf]), %%xmm2\n\t" + "pshufb %[be_mask], %%xmm2\n\t" /* be => le */ + + "pshufd $78, %%xmm5, %%xmm0\n\t" "pshufd $78, %%xmm2, %%xmm7\n\t" - "pxor %%xmm9, %%xmm11\n\t" /* xmm11 holds 3:a0+a1 */ - "pxor %%xmm2, %%xmm7\n\t" /* xmm7 holds 3:b0+b1 */ - "movdqa %%xmm9, %%xmm6\n\t" + "pxor %%xmm5, %%xmm0\n\t" /* xmm0 holds 3:a0+a1 */ + "pxor %%xmm2, %%xmm7\n\t" /* xmm7 holds 3:b0+b1 */ + "movdqa %%xmm5, %%xmm6\n\t" "pclmulqdq $0, %%xmm2, %%xmm6\n\t" /* xmm6 holds 3:a0*b0 */ - "pclmulqdq $17, %%xmm9, %%xmm2\n\t" /* xmm2 holds 3:a1*b1 */ - "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 3:(a0+a1)*(b0+b1) */ + "pclmulqdq $17, %%xmm5, %%xmm2\n\t" /* xmm2 holds 3:a1*b1 */ + "pclmulqdq $0, %%xmm0, %%xmm7\n\t" /* xmm7 holds 3:(a0+a1)*(b0+b1) */ + + "movdqu 2*16(%[buf]), %%xmm5\n\t" + "pshufb %[be_mask], %%xmm5\n\t" /* be => le */ "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 3+4:a0*b0 */ "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 3+4:a1*b1 */ "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 3+4:(a0+a1)*(b0+b1) */ - "movdqu 2*16(%[buf]), %%xmm5\n\t" - "movdqu 3*16(%[buf]), %%xmm2\n\t" - "pshufb %%xmm15, %%xmm5\n\t" /* be => le */ - "pshufb %%xmm15, %%xmm2\n\t" /* be => le */ + "movdqu 0*16(%[h_table]), %%xmm2\n\t" /* Load H2 */ - "pshufd $78, %%xmm8, %%xmm11\n\t" + "pshufd $78, %%xmm2, %%xmm0\n\t" "pshufd $78, %%xmm5, %%xmm7\n\t" - "pxor %%xmm8, %%xmm11\n\t" /* xmm11 holds 2:a0+a1 */ - "pxor %%xmm5, %%xmm7\n\t" /* xmm7 holds 2:b0+b1 */ - "movdqa %%xmm8, %%xmm6\n\t" + "pxor %%xmm2, %%xmm0\n\t" /* xmm0 holds 2:a0+a1 */ + "pxor %%xmm5, %%xmm7\n\t" /* xmm7 holds 2:b0+b1 */ + "movdqa %%xmm2, %%xmm6\n\t" "pclmulqdq $0, %%xmm5, %%xmm6\n\t" /* xmm6 holds 2:a0*b0 */ - "pclmulqdq $17, %%xmm8, %%xmm5\n\t" /* xmm5 holds 2:a1*b1 */ - "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 2:(a0+a1)*(b0+b1) */ + "pclmulqdq $17, %%xmm2, %%xmm5\n\t" /* xmm5 holds 2:a1*b1 */ + "pclmulqdq $0, %%xmm0, %%xmm7\n\t" /* xmm7 holds 2:(a0+a1)*(b0+b1) */ - "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 2+3+4:a0*b0 */ + "movdqu 3*16(%[buf]), %%xmm2\n\t" + "pshufb %[be_mask], %%xmm2\n\t" /* be => le */ + : + : [buf] "r" (buf), + [h_table] "r" (h_table), + [be_mask] "m" (*be_mask) + : "memory" ); + + asm volatile ("pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 2+3+4:a0*b0 */ "pxor %%xmm5, %%xmm1\n\t" /* xmm1 holds 2+3+4:a1*b1 */ "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 2+3+4:(a0+a1)*(b0+b1) */ - "pshufd $78, %%xmm0, %%xmm11\n\t" + "movdqu %[h_1], %%xmm5\n\t" /* Load H1 */ + + "pshufd $78, %%xmm5, %%xmm0\n\t" "pshufd $78, %%xmm2, %%xmm7\n\t" - "pxor 
%%xmm0, %%xmm11\n\t" /* xmm11 holds 1:a0+a1 */ - "pxor %%xmm2, %%xmm7\n\t" /* xmm7 holds 1:b0+b1 */ - "movdqa %%xmm0, %%xmm6\n\t" + "pxor %%xmm5, %%xmm0\n\t" /* xmm0 holds 1:a0+a1 */ + "pxor %%xmm2, %%xmm7\n\t" /* xmm7 holds 1:b0+b1 */ + "movdqa %%xmm5, %%xmm6\n\t" "pclmulqdq $0, %%xmm2, %%xmm6\n\t" /* xmm6 holds 1:a0*b0 */ - "pclmulqdq $17, %%xmm0, %%xmm2\n\t" /* xmm2 holds 1:a1*b1 */ - "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 1:(a0+a1)*(b0+b1) */ + "pclmulqdq $17, %%xmm5, %%xmm2\n\t" /* xmm2 holds 1:a1*b1 */ + "pclmulqdq $0, %%xmm0, %%xmm7\n\t" /* xmm7 holds 1:(a0+a1)*(b0+b1) */ "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 1+2+3+4:a0*b0 */ "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 1+2+3+4:a1*b1 */ @@ -195,13 +201,13 @@ static inline void gfmul_pclmul_aggr4(const void *buf, const void *h_table) carry-less multiplication of xmm0 by xmm1 */ : - : [buf] "r" (buf), - [h_table] "r" (h_table) + : [h_1] "m" (*(const unsigned char *)h_1) : "memory" ); reduction(); } +#ifdef __x86_64__ static inline void gfmul_pclmul_aggr8(const void *buf, const void *h_table) { /* Input: @@ -210,7 +216,7 @@ static inline void gfmul_pclmul_aggr8(const void *buf, const void *h_table) Hash: XMM1 Output: Hash: XMM1 - Inputs XMM0 and XMM14 stays unmodified. + Inputs XMM0 and XMM15 stays unmodified. */ asm volatile (/* Load H6, H7, H8. */ "movdqu 6*16(%[h_table]), %%xmm10\n\t" @@ -423,7 +429,6 @@ _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c) gcm_lsh(c->u_mode.gcm.u_ghash_key.key, 0); /* H <<< 1 */ -#ifdef __x86_64__ asm volatile ("movdqa %%xmm0, %%xmm1\n\t" "movdqu (%[key]), %%xmm0\n\t" /* load H <<< 1 */ : @@ -433,7 +438,7 @@ _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c) gfmul_pclmul (); /* H<<<1?H => H? */ asm volatile ("movdqu %%xmm1, 0*16(%[h_table])\n\t" - "movdqa %%xmm1, %%xmm8\n\t" + "movdqa %%xmm1, %%xmm7\n\t" : : [h_table] "r" (c->u_mode.gcm.gcm_table) : "memory"); @@ -441,7 +446,7 @@ _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c) gcm_lsh(c->u_mode.gcm.gcm_table, 0 * 16); /* H? <<< 1 */ gfmul_pclmul (); /* H<<<1?H? => H? */ - asm volatile ("movdqa %%xmm8, %%xmm0\n\t" + asm volatile ("movdqa %%xmm7, %%xmm0\n\t" "movdqu %%xmm1, 1*16(%[h_table])\n\t" "movdqu 0*16(%[h_table]), %%xmm1\n\t" /* load H? <<< 1 */ : @@ -461,6 +466,7 @@ _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c) gcm_lsh(c->u_mode.gcm.gcm_table, 1 * 16); /* H? <<< 1 */ gcm_lsh(c->u_mode.gcm.gcm_table, 2 * 16); /* H? <<< 1 */ +#ifdef __x86_64__ gfmul_pclmul (); /* H<<<1?H? => H? */ asm volatile ("movdqu %%xmm1, 3*16(%[h_table])\n\t" @@ -573,23 +579,23 @@ _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf, : "memory" ); #endif - /* Preload hash and H1. */ + /* Preload hash. */ asm volatile ("movdqa %[be_mask], %%xmm7\n\t" "movdqu %[hash], %%xmm1\n\t" - "movdqa %[hsub], %%xmm0\n\t" "pshufb %%xmm7, %%xmm1\n\t" /* be => le */ : : [hash] "m" (*result), - [be_mask] "m" (*be_mask), - [hsub] "m" (*c->u_mode.gcm.u_ghash_key.key) + [be_mask] "m" (*be_mask) : "memory" ); #ifdef __x86_64__ - if (nblocks >= 4) + if (nblocks >= 8) { + /* Preload H1. 
*/ asm volatile ("movdqa %%xmm7, %%xmm15\n\t" + "movdqa %[h_1], %%xmm0\n\t" : - : + : [h_1] "m" (*c->u_mode.gcm.u_ghash_key.key) : "memory" ); while (nblocks >= 8) @@ -599,15 +605,6 @@ _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf, buf += 8 * blocksize; nblocks -= 8; } - - if (nblocks >= 4) - { - gfmul_pclmul_aggr4 (buf, c->u_mode.gcm.gcm_table); - - buf += 4 * blocksize; - nblocks -= 4; - } - #ifndef __WIN64__ /* Clear used x86-64/XMM registers. */ asm volatile( "pxor %%xmm8, %%xmm8\n\t" @@ -623,19 +620,37 @@ _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf, } #endif - while (nblocks) + while (nblocks >= 4) { - asm volatile ("movdqu %[buf], %%xmm2\n\t" - "pshufb %[be_mask], %%xmm2\n\t" /* be => le */ - "pxor %%xmm2, %%xmm1\n\t" + gfmul_pclmul_aggr4 (buf, c->u_mode.gcm.u_ghash_key.key, + c->u_mode.gcm.gcm_table, be_mask); + + buf += 4 * blocksize; + nblocks -= 4; + } + + if (nblocks) + { + /* Preload H1. */ + asm volatile ("movdqa %[h_1], %%xmm0\n\t" : - : [buf] "m" (*buf), [be_mask] "m" (*be_mask) + : [h_1] "m" (*c->u_mode.gcm.u_ghash_key.key) : "memory" ); - gfmul_pclmul (); + while (nblocks) + { + asm volatile ("movdqu %[buf], %%xmm2\n\t" + "pshufb %[be_mask], %%xmm2\n\t" /* be => le */ + "pxor %%xmm2, %%xmm1\n\t" + : + : [buf] "m" (*buf), [be_mask] "m" (*be_mask) + : "memory" ); + + gfmul_pclmul (); - buf += blocksize; - nblocks--; + buf += blocksize; + nblocks--; + } } /* Store hash. */ From jussi.kivilinna at iki.fi Sat Apr 27 22:03:15 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 27 Apr 2019 23:03:15 +0300 Subject: [PATCH 4/4] Fix CFI_PUSH/CFI_POP redefine build warning with AMD64 MPI In-Reply-To: <155639537849.3345.2780323880296165039.stgit@localhost.localdomain> References: <155639537849.3345.2780323880296165039.stgit@localhost.localdomain> Message-ID: <155639539470.3345.14491782800257755337.stgit@localhost.localdomain> * mpi/amd64/func_abi.h: Move CFI macros into [__x86_64__] block. * mpi/i386/syntax.h: Move CFI macros into [__i386__] block. -- Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/mpi/amd64/func_abi.h b/mpi/amd64/func_abi.h index 37d5722af..a60363e4e 100644 --- a/mpi/amd64/func_abi.h +++ b/mpi/amd64/func_abi.h @@ -1,5 +1,6 @@ #include +#ifdef __x86_64__ #ifdef HAVE_GCC_ASM_CFI_DIRECTIVES # define CFI_STARTPROC() .cfi_startproc # define CFI_ENDPROC() .cfi_endproc @@ -21,6 +22,7 @@ # define CFI_PUSH(reg) # define CFI_POP(reg) #endif +#endif #ifdef USE_MS_ABI /* Store registers and move four first input arguments from MS ABI to diff --git a/mpi/i386/syntax.h b/mpi/i386/syntax.h index 9101585a8..dd3003199 100644 --- a/mpi/i386/syntax.h +++ b/mpi/i386/syntax.h @@ -28,6 +28,7 @@ #include +#ifdef __i386__ #ifdef HAVE_GCC_ASM_CFI_DIRECTIVES # define CFI_STARTPROC() .cfi_startproc # define CFI_ENDPROC() .cfi_endproc @@ -49,6 +50,7 @@ # define CFI_PUSH(reg) # define CFI_POP(reg) #endif +#endif #undef ALIGN