From jussi.kivilinna at iki.fi Sat Jan 8 12:06:11 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 8 Jan 2022 13:06:11 +0200 Subject: [PATCH 2/3] Add armv8/pmull accelerated POLYVAL for GCM-SIV In-Reply-To: <20220108110612.141943-1-jussi.kivilinna@iki.fi> References: <20220108110612.141943-1-jussi.kivilinna@iki.fi> Message-ID: <20220108110612.141943-2-jussi.kivilinna@iki.fi> * cipher/cipher-gcm-armv8-aarch32-ce.S (_gcry_polyval_armv8_ce_pmull): New. * cipher/cipher-gcm-armv8-aarch64-ce.S (_gcry_polyval_armv8_ce_pmull): New. * cipher/cipher-gcm.c (_gcry_polyval_armv8_ce_pmull) (polyval_armv8_ce_pmull): New. (setupM) [GCM_USE_ARM_PMULL]: Setup 'polyval_armv8_ce_pmull' as POLYVAL function. -- Benchmark on Cortex-A53 (aarch64): Before: AES | nanosecs/byte mebibytes/sec cycles/byte auto Mhz GCM-SIV auth | 1.74 ns/B 547.6 MiB/s 2.01 c/B 1152 After (76% faster): AES | nanosecs/byte mebibytes/sec cycles/byte auto Mhz GCM-SIV auth | 0.990 ns/B 963.2 MiB/s 1.14 c/B 1152 Signed-off-by: Jussi Kivilinna --- cipher/cipher-gcm-armv8-aarch32-ce.S | 155 ++++++++++++++++++ cipher/cipher-gcm-armv8-aarch64-ce.S | 228 +++++++++++++++++++++++++++ cipher/cipher-gcm.c | 14 ++ 3 files changed, 397 insertions(+) diff --git a/cipher/cipher-gcm-armv8-aarch32-ce.S b/cipher/cipher-gcm-armv8-aarch32-ce.S index fb51b339..00c547de 100644 --- a/cipher/cipher-gcm-armv8-aarch32-ce.S +++ b/cipher/cipher-gcm-armv8-aarch32-ce.S @@ -358,6 +358,161 @@ _gcry_ghash_armv8_ce_pmull: .size _gcry_ghash_armv8_ce_pmull,.-_gcry_ghash_armv8_ce_pmull; +/* + * unsigned int _gcry_polyval_armv8_ce_pmull (void *gcm_key, byte *result, + * const byte *buf, size_t nblocks, + * void *gcm_table); + */ +.align 3 +.globl _gcry_polyval_armv8_ce_pmull +.type _gcry_polyval_armv8_ce_pmull,%function; +_gcry_polyval_armv8_ce_pmull: + /* input: + * r0: gcm_key + * r1: result/hash + * r2: buf + * r3: nblocks + * %st+0: gcm_table + */ + push {r4-r6, lr} + + cmp r3, #0 + beq .Lpolyval_do_nothing + + GET_DATA_POINTER(r4, .Lrconst64, lr) + + vld1.64 {rhash}, [r1] + vld1.64 {rh1}, [r0] + + vrev64.8 rhash, rhash /* byte-swap */ + vld1.64 {rrconst_h}, [r4] + vext.8 rhash, rhash, rhash, #8 + + cmp r3, #4 + blo .Lpolyval_less_than_4 + + /* Bulk processing of 4 blocks per loop iteration. */ + + ldr r5, [sp, #(4*4)]; + add r6, r5, #32 + + vpush {q4-q7} + + vld1.64 {rh2-rh3}, [r5] + vld1.64 {rh4}, [r6] + + vld1.64 {rbuf-rbuf1}, [r2]! + sub r3, r3, #4 + vld1.64 {rbuf2-rbuf3}, [r2]! + + cmp r3, #4 + veor rhash, rhash, rbuf /* in0 ^ hash */ + + blo .Lpolyval_end_4 + +.Lpolyval_loop_4: + /* (in0 ^ hash) * H? => rr2:rr3 */ + /* (in1) * H? => rr0:rr1 */ + PMUL_128x128_2(rr0, rr1, rbuf1, rh3, rr2, rr3, rhash, rh4, rt1, rt0, __) + + vld1.64 {rbuf-rbuf1}, [r2]! + sub r3, r3, #4 + veor rr0, rr0, rr2 + veor rr1, rr1, rr3 + + /* (in2) * H? => rr2:rr3 */ + /* (in3) * H? => rhash:rbuf3 */ + PMUL_128x128_2(rr2, rr3, rbuf2, rh2, rhash, rbuf3, rbuf3, rh1, rt0, rt1, __) + + vld1.64 {rbuf2}, [r2]! + + veor rr0, rr0, rr2 + veor rr1, rr1, rr3 + + cmp r3, #4 + + veor rr0, rr0, rhash + veor rr1, rr1, rbuf3 + + vld1.64 {rbuf3}, [r2]! + + REDUCTION(rhash, rr0, rr1, rrconst_h, rt1, __) + + veor rhash, rhash, rbuf /* in0 ^ hash */ + + bhs .Lpolyval_loop_4 + +.Lpolyval_end_4: + /* (in0 ^ hash) * H? => rr2:rr3 */ + /* (in1) * H? => rr0:rr1 */ + PMUL_128x128_2(rr0, rr1, rbuf1, rh3, rr2, rr3, rhash, rh4, rt1, rt0, __) + + /* (in2) * H? => rhash:rbuf */ + /* (in3) * H? 
=> rbuf1:rbuf2 */ + PMUL_128x128_2(rhash, rbuf, rbuf2, rh2, rbuf1, rbuf2, rbuf3, rh1, rt0, rt1, + _(veor rr0, rr0, rr2; + veor rr1, rr1, rr3)) + + veor rr0, rr0, rhash + veor rr1, rr1, rbuf + + veor rr0, rr0, rbuf1 + veor rr1, rr1, rbuf2 + + REDUCTION(rhash, rr0, rr1, rrconst_h, rt1, + _(CLEAR_REG(rr2); + CLEAR_REG(rr3); + CLEAR_REG(rbuf1); + CLEAR_REG(rbuf2); + CLEAR_REG(rbuf3); + CLEAR_REG(rh2); + CLEAR_REG(rh3); + CLEAR_REG(rh4))) + + vpop {q4-q7} + + cmp r3, #0 + beq .Lpolyval_done + +.Lpolyval_less_than_4: + /* Handle remaining blocks. */ + + vld1.64 {rbuf}, [r2]! + subs r3, r3, #1 + + veor rhash, rhash, rbuf + + beq .Lpolyval_end + +.Lpolyval_loop: + vld1.64 {rbuf}, [r2]! + subs r3, r3, #1 + PMUL_128x128(rr0, rr1, rhash, rh1, rt0, __) + REDUCTION(rhash, rr0, rr1, rrconst_h, rt0, __) + veor rhash, rhash, rbuf + + bne .Lpolyval_loop + +.Lpolyval_end: + PMUL_128x128(rr0, rr1, rhash, rh1, rt0, _(CLEAR_REG(rbuf))) + REDUCTION(rhash, rr0, rr1, rrconst_h, rt0, _(CLEAR_REG(rh1))) + +.Lpolyval_done: + CLEAR_REG(rr1) + vrev64.8 rhash, rhash /* byte-swap */ + CLEAR_REG(rt0) + CLEAR_REG(rr0) + vext.8 rhash, rhash, rhash, #8 + CLEAR_REG(rt1) + vst1.64 {rhash}, [r1] + CLEAR_REG(rhash) + +.Lpolyval_do_nothing: + mov r0, #0 + pop {r4-r6, pc} +.size _gcry_polyval_armv8_ce_pmull,.-_gcry_polyval_armv8_ce_pmull; + + /* * void _gcry_ghash_setup_armv8_ce_pmull (void *gcm_key, void *gcm_table); */ diff --git a/cipher/cipher-gcm-armv8-aarch64-ce.S b/cipher/cipher-gcm-armv8-aarch64-ce.S index 13ee83ed..2c619f9b 100644 --- a/cipher/cipher-gcm-armv8-aarch64-ce.S +++ b/cipher/cipher-gcm-armv8-aarch64-ce.S @@ -370,6 +370,234 @@ _gcry_ghash_armv8_ce_pmull: ELF(.size _gcry_ghash_armv8_ce_pmull,.-_gcry_ghash_armv8_ce_pmull;) +/* + * unsigned int _gcry_polyval_armv8_ce_pmull (void *gcm_key, byte *result, + * const byte *buf, size_t nblocks, + * void *gcm_table); + */ +.align 3 +.globl _gcry_polyval_armv8_ce_pmull +ELF(.type _gcry_polyval_armv8_ce_pmull,%function;) +_gcry_polyval_armv8_ce_pmull: + /* input: + * x0: gcm_key + * x1: result/hash + * x2: buf + * x3: nblocks + * x4: gcm_table + */ + CFI_STARTPROC(); + + cbz x3, .Lpolyval_do_nothing; + + GET_DATA_POINTER(x5, .Lrconst) + + eor vZZ.16b, vZZ.16b, vZZ.16b + ld1 {rhash.16b}, [x1] + ld1 {rh1.16b}, [x0] + + rbit rhash.16b, rhash.16b /* bit-swap */ + ld1r {rrconst.2d}, [x5] + + cmp x3, #6 + b.lo .Lpolyval_less_than_6 + + add x6, x4, #64 + VPUSH_ABI + + ld1 {rh2.16b-rh5.16b}, [x4] + ld1 {rh6.16b}, [x6] + + sub x3, x3, #6 + + ld1 {rbuf.16b-rbuf2.16b}, [x2], #(3*16) + ld1 {rbuf3.16b-rbuf5.16b}, [x2], #(3*16) + rev64 rbuf.16b, rbuf.16b /* byte-swap */ + rev64 rbuf1.16b, rbuf1.16b /* byte-swap */ + rev64 rbuf2.16b, rbuf2.16b /* byte-swap */ + rev64 rbuf3.16b, rbuf3.16b /* byte-swap */ + rev64 rbuf4.16b, rbuf4.16b /* byte-swap */ + rev64 rbuf5.16b, rbuf5.16b /* byte-swap */ + ext rbuf.16b, rbuf.16b, rbuf.16b, #8 /* byte-swap */ + ext rbuf1.16b, rbuf1.16b, rbuf1.16b, #8 /* byte-swap */ + ext rbuf2.16b, rbuf2.16b, rbuf2.16b, #8 /* byte-swap */ + ext rbuf3.16b, rbuf3.16b, rbuf3.16b, #8 /* byte-swap */ + ext rbuf4.16b, rbuf4.16b, rbuf4.16b, #8 /* byte-swap */ + ext rbuf5.16b, rbuf5.16b, rbuf5.16b, #8 /* byte-swap */ + rbit rbuf.16b, rbuf.16b /* bit-swap */ + rbit rbuf1.16b, rbuf1.16b /* bit-swap */ + rbit rbuf2.16b, rbuf2.16b /* bit-swap */ + rbit rbuf3.16b, rbuf3.16b /* bit-swap */ + rbit rbuf4.16b, rbuf4.16b /* bit-swap */ + rbit rbuf5.16b, rbuf5.16b /* bit-swap */ + eor rhash.16b, rhash.16b, rbuf.16b + + cmp x3, #6 + b.lo .Lpolyval_end_6 + +.Lpolyval_loop_6: + + /* (in1) 
* H? => rr0:rr1 */ + /* (in2) * H? => rr2:rr3 */ + /* (in0 ^ hash) * H? => rr4:rr5 */ + PMUL_128x128_3(rr0, rr1, rbuf1, rh5, t0, t1, + rr2, rr3, rbuf2, rh4, t2, t3, + rr4, rr5, rhash, rh6, t4, t5, + _(sub x3, x3, #6)) + + ld1 {rbuf.16b-rbuf2.16b}, [x2], #(3*16) + cmp x3, #6 + + eor rr0.16b, rr0.16b, rr2.16b + eor rr1.16b, rr1.16b, rr3.16b + + /* (in3) * H? => rr2:rr3 */ + /* (in4) * H? => rr6:rr7 */ + /* (in5) * H? => rr8:rr9 */ + PMUL_128x128_3(rr2, rr3, rbuf3, rh3, t0, t1, + rr6, rr7, rbuf4, rh2, t2, t3, + rr8, rr9, rbuf5, rh1, t4, t5, + _(eor rr0.16b, rr0.16b, rr4.16b; + eor rr1.16b, rr1.16b, rr5.16b)) + + rev64 rbuf.16b, rbuf.16b /* byte-swap */ + rev64 rbuf1.16b, rbuf1.16b /* byte-swap */ + rev64 rbuf2.16b, rbuf2.16b /* byte-swap */ + ext rbuf.16b, rbuf.16b, rbuf.16b, #8 /* byte-swap */ + ext rbuf1.16b, rbuf1.16b, rbuf1.16b, #8 /* byte-swap */ + ext rbuf2.16b, rbuf2.16b, rbuf2.16b, #8 /* byte-swap */ + + eor rr0.16b, rr0.16b, rr2.16b + eor rr1.16b, rr1.16b, rr3.16b + rbit rbuf.16b, rbuf.16b /* bit-swap */ + eor rr0.16b, rr0.16b, rr6.16b + eor rr1.16b, rr1.16b, rr7.16b + rbit rbuf1.16b, rbuf1.16b /* bit-swap */ + eor rr0.16b, rr0.16b, rr8.16b + eor rr1.16b, rr1.16b, rr9.16b + ld1 {rbuf3.16b-rbuf5.16b}, [x2], #(3*16) + + REDUCTION(rhash, rr0, rr1, rrconst, t0, t1, + _(rbit rbuf2.16b, rbuf2.16b), /* bit-swap */ + _(rev64 rbuf3.16b, rbuf3.16b), /* byte-swap */ + _(rev64 rbuf4.16b, rbuf4.16b)) /* byte-swap */ + + rev64 rbuf5.16b, rbuf5.16b /* byte-swap */ + ext rbuf3.16b, rbuf3.16b, rbuf3.16b, #8 /* byte-swap */ + + eor rhash.16b, rhash.16b, rbuf.16b + + ext rbuf4.16b, rbuf4.16b, rbuf4.16b, #8 /* byte-swap */ + ext rbuf5.16b, rbuf5.16b, rbuf5.16b, #8 /* byte-swap */ + rbit rbuf3.16b, rbuf3.16b /* bit-swap */ + rbit rbuf4.16b, rbuf4.16b /* bit-swap */ + rbit rbuf5.16b, rbuf5.16b /* bit-swap */ + + b.hs .Lpolyval_loop_6 + +.Lpolyval_end_6: + + /* (in1) * H? => rr0:rr1 */ + /* (in0 ^ hash) * H? => rr2:rr3 */ + /* (in2) * H? => rr4:rr5 */ + PMUL_128x128_3(rr0, rr1, rbuf1, rh5, t0, t1, + rr2, rr3, rhash, rh6, t2, t3, + rr4, rr5, rbuf2, rh4, t4, t5, + __) + eor rr0.16b, rr0.16b, rr2.16b + eor rr1.16b, rr1.16b, rr3.16b + eor rr0.16b, rr0.16b, rr4.16b + eor rr1.16b, rr1.16b, rr5.16b + + /* (in3) * H? => rhash:rbuf */ + /* (in4) * H? => rr6:rr7 */ + /* (in5) * H? => rr8:rr9 */ + PMUL_128x128_3(rhash, rbuf, rbuf3, rh3, t0, t1, + rr6, rr7, rbuf4, rh2, t2, t3, + rr8, rr9, rbuf5, rh1, t4, t5, + _(CLEAR_REG(rh4); + CLEAR_REG(rh5); + CLEAR_REG(rh6))) + eor rr0.16b, rr0.16b, rhash.16b + eor rr1.16b, rr1.16b, rbuf.16b + eor rr0.16b, rr0.16b, rr6.16b + eor rr1.16b, rr1.16b, rr7.16b + eor rr0.16b, rr0.16b, rr8.16b + eor rr1.16b, rr1.16b, rr9.16b + + REDUCTION(rhash, rr0, rr1, rrconst, t0, t1, + _(CLEAR_REG(rh2); + CLEAR_REG(rh3); + CLEAR_REG(rr2); + CLEAR_REG(rbuf2); + CLEAR_REG(rbuf3)), + _(CLEAR_REG(rr3); + CLEAR_REG(rr4); + CLEAR_REG(rr5); + CLEAR_REG(rr6); + CLEAR_REG(rr7)), + _(CLEAR_REG(rr8); + CLEAR_REG(rr9); + CLEAR_REG(rbuf1); + CLEAR_REG(rbuf2))) + + CLEAR_REG(rbuf4) + CLEAR_REG(rbuf5) + CLEAR_REG(t2) + CLEAR_REG(t3) + CLEAR_REG(t4) + CLEAR_REG(t5) + + VPOP_ABI + + cbz x3, .Lpolyval_done + +.Lpolyval_less_than_6: + /* Handle remaining blocks. 
*/ + + ld1 {rbuf.16b}, [x2], #16 + sub x3, x3, #1 + + rev64 rbuf.16b, rbuf.16b /* byte-swap */ + ext rbuf.16b, rbuf.16b, rbuf.16b, #8 /* byte-swap */ + rbit rbuf.16b, rbuf.16b /* bit-swap */ + + eor rhash.16b, rhash.16b, rbuf.16b + + cbz x3, .Lpolyval_end + +.Lpolyval_loop: + PMUL_128x128(rr0, rr1, rh1, rhash, t0, t1, _(ld1 {rbuf.16b}, [x2], #16)) + REDUCTION(rhash, rr0, rr1, rrconst, t0, t1, + _(sub x3, x3, #1; + rev64 rbuf.16b, rbuf.16b), /* byte-swap */ + _(ext rbuf.16b, rbuf.16b, rbuf.16b, #8), /* byte-swap */ + _(rbit rbuf.16b, rbuf.16b)) /* bit-swap */ + eor rhash.16b, rhash.16b, rbuf.16b + + cbnz x3, .Lpolyval_loop + +.Lpolyval_end: + PMUL_128x128(rr0, rr1, rh1, rhash, t0, t1, _(CLEAR_REG(rbuf))) + REDUCTION(rhash, rr0, rr1, rrconst, t0, t1, __, _(CLEAR_REG(rh1)), __) + +.Lpolyval_done: + CLEAR_REG(rr1) + CLEAR_REG(rr0) + rbit rhash.16b, rhash.16b /* bit-swap */ + CLEAR_REG(t0) + CLEAR_REG(t1) + + st1 {rhash.2d}, [x1] + CLEAR_REG(rhash) + +.Lpolyval_do_nothing: + mov x0, #0 + ret + CFI_ENDPROC() +ELF(.size _gcry_polyval_armv8_ce_pmull,.-_gcry_polyval_armv8_ce_pmull;) + + /* * void _gcry_ghash_setup_armv8_ce_pmull (void *gcm_key, void *gcm_table); */ diff --git a/cipher/cipher-gcm.c b/cipher/cipher-gcm.c index d3ed9cf6..a039c5e9 100644 --- a/cipher/cipher-gcm.c +++ b/cipher/cipher-gcm.c @@ -57,6 +57,11 @@ extern unsigned int _gcry_ghash_armv8_ce_pmull (void *gcm_key, byte *result, const byte *buf, size_t nblocks, void *gcm_table); +extern unsigned int _gcry_polyval_armv8_ce_pmull (void *gcm_key, byte *result, + const byte *buf, + size_t nblocks, + void *gcm_table); + static void ghash_setup_armv8_ce_pmull (gcry_cipher_hd_t c) { @@ -71,6 +76,14 @@ ghash_armv8_ce_pmull (gcry_cipher_hd_t c, byte *result, const byte *buf, return _gcry_ghash_armv8_ce_pmull(c->u_mode.gcm.u_ghash_key.key, result, buf, nblocks, c->u_mode.gcm.gcm_table); } + +static unsigned int +polyval_armv8_ce_pmull (gcry_cipher_hd_t c, byte *result, const byte *buf, + size_t nblocks) +{ + return _gcry_polyval_armv8_ce_pmull(c->u_mode.gcm.u_ghash_key.key, result, + buf, nblocks, c->u_mode.gcm.gcm_table); +} #endif /* GCM_USE_ARM_PMULL */ #ifdef GCM_USE_ARM_NEON @@ -591,6 +604,7 @@ setupM (gcry_cipher_hd_t c) else if (features & HWF_ARM_PMULL) { c->u_mode.gcm.ghash_fn = ghash_armv8_ce_pmull; + c->u_mode.gcm.polyval_fn = polyval_armv8_ce_pmull; ghash_setup_armv8_ce_pmull (c); } #endif -- 2.32.0 From jussi.kivilinna at iki.fi Sat Jan 8 12:06:10 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 8 Jan 2022 13:06:10 +0200 Subject: [PATCH 1/3] Use 'vmov' and 'movi' for vector register clearing in ARM assembly Message-ID: <20220108110612.141943-1-jussi.kivilinna@iki.fi> * cipher/chacha20-aarch64.S (clear): Use 'movi'. * cipher/chacha20-armv7-neon.S (clear): Use 'vmov'. * cipher/cipher-gcm-armv7-neon.S (clear): Use 'vmov'. * cipher/cipher-gcm-armv8-aarch32-ce.S (CLEAR_REG): Use 'vmov'. * cipher/cipher-gcm-armv8-aarch64-ce.S (CLEAR_REG): Use 'movi'. * cipher/rijndael-armv8-aarch32-ce.S (CLEAR_REG): Use 'vmov'. * cipher/sha1-armv7-neon.S (clear): Use 'vmov'. * cipher/sha1-armv8-aarch32-ce.S (CLEAR_REG): Use 'vmov'. * cipher/sha1-armv8-aarch64-ce.S (CLEAR_REG): Use 'movi'. * cipher/sha256-armv8-aarch32-ce.S (CLEAR_REG): Use 'vmov'. * cipher/sha256-armv8-aarch64-ce.S (CLEAR_REG): Use 'movi'. * cipher/sha512-armv7-neon.S (CLEAR_REG): New using 'vmov'. (_gcry_sha512_transform_armv7_neon): Use CLEAR_REG for clearing registers. 
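
As a stand-alone illustration of the idiom change listed above (rationale
below), here is a minimal AArch64 sketch. The 'clear_demo' symbol and the
CLEAR_REG_EOR/CLEAR_REG_MOVI names are made up for this example only; the
patch itself simply redefines each file's existing CLEAR_REG/clear macro,
with 'vmov.i8 reg, #0' as the 32-bit counterpart:

/* Old idiom: the eor reads the register it clears, so on cores that do
 * not special-case this zeroing pattern the instruction depends on
 * whatever last wrote that register. */
#define CLEAR_REG_EOR(reg)   eor reg.16b, reg.16b, reg.16b;

/* New idiom: immediate move with no source operand, hence no dependency. */
#define CLEAR_REG_MOVI(reg)  movi reg.16b, #0;

	.text
	.globl clear_demo
	.type clear_demo, %function
clear_demo:
	CLEAR_REG_MOVI(v0)
	CLEAR_REG_MOVI(v1)
	ret
	.size clear_demo, .-clear_demo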
-- Use 'vmov reg, #0' on 32-bit and 'movi reg.16b, #0' instead of self-xoring register to break false register dependency. Signed-off-by: Jussi Kivilinna --- cipher/chacha20-aarch64.S | 2 +- cipher/chacha20-armv7-neon.S | 2 +- cipher/cipher-gcm-armv7-neon.S | 2 +- cipher/cipher-gcm-armv8-aarch32-ce.S | 2 +- cipher/cipher-gcm-armv8-aarch64-ce.S | 2 +- cipher/rijndael-armv8-aarch32-ce.S | 2 +- cipher/sha1-armv7-neon.S | 2 +- cipher/sha1-armv8-aarch32-ce.S | 2 +- cipher/sha1-armv8-aarch64-ce.S | 2 +- cipher/sha256-armv8-aarch32-ce.S | 2 +- cipher/sha256-armv8-aarch64-ce.S | 2 +- cipher/sha512-armv7-neon.S | 26 ++++++++++++++------------ 12 files changed, 25 insertions(+), 23 deletions(-) diff --git a/cipher/chacha20-aarch64.S b/cipher/chacha20-aarch64.S index b8f9724a..4f76834b 100644 --- a/cipher/chacha20-aarch64.S +++ b/cipher/chacha20-aarch64.S @@ -110,7 +110,7 @@ vpunpcklqdq(x2, t2, x2); #define clear(x) \ - eor x.16b, x.16b, x.16b; + movi x.16b, #0; /********************************************************************** 4-way chacha20 diff --git a/cipher/chacha20-armv7-neon.S b/cipher/chacha20-armv7-neon.S index 33a43df1..a862be4e 100644 --- a/cipher/chacha20-armv7-neon.S +++ b/cipher/chacha20-armv7-neon.S @@ -132,7 +132,7 @@ vswp _q0##h, _q2##l; \ vswp _q1##h, _q3##l; -#define clear(x) veor x,x,x; +#define clear(x) vmov.i8 x, #0; /********************************************************************** 4-way chacha20 diff --git a/cipher/cipher-gcm-armv7-neon.S b/cipher/cipher-gcm-armv7-neon.S index a801a5e5..16502b4a 100644 --- a/cipher/cipher-gcm-armv7-neon.S +++ b/cipher/cipher-gcm-armv7-neon.S @@ -210,7 +210,7 @@ gcry_gcm_reduction_constant: /* Other functional macros */ -#define CLEAR_REG(reg) veor reg, reg; +#define CLEAR_REG(reg) vmov.i8 reg, #0; /* diff --git a/cipher/cipher-gcm-armv8-aarch32-ce.S b/cipher/cipher-gcm-armv8-aarch32-ce.S index 1de66a16..fb51b339 100644 --- a/cipher/cipher-gcm-armv8-aarch32-ce.S +++ b/cipher/cipher-gcm-armv8-aarch32-ce.S @@ -180,7 +180,7 @@ gcry_gcm_reduction_constant: /* Other functional macros */ -#define CLEAR_REG(reg) veor reg, reg; +#define CLEAR_REG(reg) vmov.i8 reg, #0; /* diff --git a/cipher/cipher-gcm-armv8-aarch64-ce.S b/cipher/cipher-gcm-armv8-aarch64-ce.S index 877207d3..13ee83ed 100644 --- a/cipher/cipher-gcm-armv8-aarch64-ce.S +++ b/cipher/cipher-gcm-armv8-aarch64-ce.S @@ -149,7 +149,7 @@ gcry_gcm_reduction_constant: #define _(...) 
__VA_ARGS__ #define __ _() -#define CLEAR_REG(reg) eor reg.16b, reg.16b, reg.16b; +#define CLEAR_REG(reg) movi reg.16b, #0; #define VPUSH_ABI \ stp d8, d9, [sp, #-16]!; \ diff --git a/cipher/rijndael-armv8-aarch32-ce.S b/cipher/rijndael-armv8-aarch32-ce.S index 6d78af0a..1eafa93e 100644 --- a/cipher/rijndael-armv8-aarch32-ce.S +++ b/cipher/rijndael-armv8-aarch32-ce.S @@ -249,7 +249,7 @@ /* Other functional macros */ -#define CLEAR_REG(reg) veor reg, reg; +#define CLEAR_REG(reg) vmov.i8 reg, #0; /* diff --git a/cipher/sha1-armv7-neon.S b/cipher/sha1-armv7-neon.S index 61cc541c..2de678b8 100644 --- a/cipher/sha1-armv7-neon.S +++ b/cipher/sha1-armv7-neon.S @@ -303,7 +303,7 @@ gcry_sha1_armv7_neon_K_VEC: /* Other functional macros */ -#define CLEAR_REG(reg) veor reg, reg; +#define CLEAR_REG(reg) vmov.i8 reg, #0; /* diff --git a/cipher/sha1-armv8-aarch32-ce.S b/cipher/sha1-armv8-aarch32-ce.S index bf2b233b..059b9a85 100644 --- a/cipher/sha1-armv8-aarch32-ce.S +++ b/cipher/sha1-armv8-aarch32-ce.S @@ -100,7 +100,7 @@ gcry_sha1_aarch32_ce_K_VEC: /* Other functional macros */ -#define CLEAR_REG(reg) veor reg, reg; +#define CLEAR_REG(reg) vmov.i8 reg, #0; /* diff --git a/cipher/sha1-armv8-aarch64-ce.S b/cipher/sha1-armv8-aarch64-ce.S index 223268ca..8ea1486b 100644 --- a/cipher/sha1-armv8-aarch64-ce.S +++ b/cipher/sha1-armv8-aarch64-ce.S @@ -88,7 +88,7 @@ gcry_sha1_aarch64_ce_K_VEC: /* Other functional macros */ -#define CLEAR_REG(reg) eor reg.16b, reg.16b, reg.16b; +#define CLEAR_REG(reg) movi reg.16b, #0; /* diff --git a/cipher/sha256-armv8-aarch32-ce.S b/cipher/sha256-armv8-aarch32-ce.S index 2b17ab1b..95778b40 100644 --- a/cipher/sha256-armv8-aarch32-ce.S +++ b/cipher/sha256-armv8-aarch32-ce.S @@ -111,7 +111,7 @@ gcry_sha256_aarch32_ce_K: /* Other functional macros */ -#define CLEAR_REG(reg) veor reg, reg; +#define CLEAR_REG(reg) vmov.i8 reg, #0; /* diff --git a/cipher/sha256-armv8-aarch64-ce.S b/cipher/sha256-armv8-aarch64-ce.S index f57cae29..5c39e83e 100644 --- a/cipher/sha256-armv8-aarch64-ce.S +++ b/cipher/sha256-armv8-aarch64-ce.S @@ -98,7 +98,7 @@ gcry_sha256_aarch64_ce_K: /* Other functional macros */ -#define CLEAR_REG(reg) eor reg.16b, reg.16b, reg.16b; +#define CLEAR_REG(reg) movi reg.16b, #0; /* diff --git a/cipher/sha512-armv7-neon.S b/cipher/sha512-armv7-neon.S index 6596f2cd..2b186b47 100644 --- a/cipher/sha512-armv7-neon.S +++ b/cipher/sha512-armv7-neon.S @@ -91,6 +91,8 @@ #define RW1213q q14 #define RW1415q q15 +#define CLEAR_REG(reg) vmov.i8 reg, #0; + /*********************************************************************** * ARM assembly implementation of sha512 transform ***********************************************************************/ @@ -426,22 +428,22 @@ _gcry_sha512_transform_armv7_neon: /* Clear used registers */ /* d16-d31 */ - veor.u64 RW01q, RW01q; - veor.u64 RW23q, RW23q; - veor.u64 RW45q, RW45q; - veor.u64 RW67q, RW67q; + CLEAR_REG(RW01q); + CLEAR_REG(RW23q); + CLEAR_REG(RW45q); + CLEAR_REG(RW67q); vst1.64 {RE-RH}, [%r0]; /* Store the last half of context */ - veor.u64 RW89q, RW89q; - veor.u64 RW1011q, RW1011q; - veor.u64 RW1213q, RW1213q; - veor.u64 RW1415q, RW1415q; + CLEAR_REG(RW89q); + CLEAR_REG(RW1011q); + CLEAR_REG(RW1213q); + CLEAR_REG(RW1415q); /* d8-d15 */ vpop {RT0-RT7}; /* d0-d7 (q0-q3) */ - veor.u64 %q0, %q0; - veor.u64 %q1, %q1; - veor.u64 %q2, %q2; - veor.u64 %q3, %q3; + CLEAR_REG(%q0); + CLEAR_REG(%q1); + CLEAR_REG(%q2); + CLEAR_REG(%q3); eor %r0, %r0; pop {%pc}; -- 2.32.0 From jussi.kivilinna at iki.fi Sat Jan 8 12:06:12 2022 From: 
jussi.kivilinna at iki.fi (Jussi Kivilinna)
Date: Sat, 8 Jan 2022 13:06:12 +0200
Subject: [PATCH 3/3] Optimizations for AES aarch64-ce assembly implementation
In-Reply-To: <20220108110612.141943-1-jussi.kivilinna@iki.fi>
References: <20220108110612.141943-1-jussi.kivilinna@iki.fi>
Message-ID: <20220108110612.141943-3-jussi.kivilinna@iki.fi>

* cipher/rijndael-armv8-aarch64-ce.S (vk14): Remove.
(vklast, __, _): New.
(aes_preload_keys): Setup vklast.
(do_aes_one128/192/256): Split to ...
(do_aes_one_part1, do_aes_one_part2_128/192/256): ... these and add
interleave ops.
(do_aes_one128/192/256): New using above part1 and part2 macros.
(aes_round_4): Rename to ...
(aes_round_4_multikey): ... this and allow a different key to be used
for each parallel block.
(aes_round_4): New using above multikey macro.
(aes_lastround_4): Reorder AES round and xor instructions, allow a
different last key for each parallel block.
(do_aes_4_128/192/256): Split to ...
(do_aes_4_part1_multikey, do_aes_4_part1)
(do_aes_4_part2_128/192/256): ... these.
(do_aes_4_128/192/256): New using above part1 and part2 macros.
(CLEAR_REG): Use movi for clearing registers.
(aes_clear_keys): Remove branching and clear all key registers.
(_gcry_aes_enc_armv8_ce, _gcry_aes_dec_armv8_ce): Adjust to macro
changes.
(_gcry_aes_cbc_enc_armv8_ce, _gcry_aes_cbc_dec_armv8_ce)
(_gcry_aes_cfb_enc_armv8_ce, _gcry_aes_cfb_dec_armv8_ce)
(_gcry_aes_ctr32le_enc_armv8_ce): Apply entry/loop-body/exit
optimization for better interleaving of input/output processing;
First/last round key and input/output xoring optimization to reduce
critical path length.
(_gcry_aes_ctr_enc_armv8_ce): Add fast path for counter incrementing
without byte-swaps when the counter does not overflow its low 8 bits;
Apply entry/loop-body/exit optimization for better interleaving of
input/output processing; First/last round key and input/output xoring
optimization to reduce critical path length.
(_gcry_aes_ocb_enc_armv8_ce, _gcry_aes_ocb_dec_armv8_ce): Add aligned
processing for nblk and OCB offsets; Apply entry/loop-body/exit
optimization for better interleaving of input/output processing;
First/last round key and input/output xoring optimization to reduce
critical path length; Change to use same function body macro for both
encryption and decryption.
(_gcry_aes_xts_enc_armv8_ce, _gcry_aes_xts_dec_armv8_ce): Apply
entry/loop-body/exit optimization for better interleaving of
input/output processing; First/last round key and input/output xoring
optimization to reduce critical path length; Change to use same
function body macro for both encryption and decryption.
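
The CTR fast path listed above hinges on one carry check: as long as adding
the block count cannot overflow the low 8 bits of the big-endian counter, the
per-iteration counters can be produced by plain vector adds, with no
'rev64'/'ext' byte-swapping. A minimal stand-alone sketch of that test follows;
the helper name and its C-style signature are made up for illustration only,
the patch performs the same check inline with 'adds x11, x11, x12' / 'b.cs':

/* int ctr_low8_would_wrap (uint64_t ctr_lo64, uint64_t nblocks);
 * Returns 1 when adding nblocks (assumed < 256) carries out of the low
 * 8 bits of the big-endian counter, i.e. when the slow path that
 * byte-swaps and propagates the carry through all 128 bits is needed. */
	.text
	.globl ctr_low8_would_wrap
	.type ctr_low8_would_wrap, %function
ctr_low8_would_wrap:
	lsl	x2, x0, #56		/* move counter LSB into bits 63:56 */
	adds	x2, x2, x1, lsl #56	/* add block count at the same position */
	cset	w0, cs			/* carry set <=> low byte wrapped */
	ret
	.size ctr_low8_would_wrap, .-ctr_low8_would_wrap

In the loop itself the shifted counter stays live in a general register and is
bumped by '4 << 56' per iteration, so the common no-wrap case increments the
counter blocks directly in the vector lanes.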
-- Benchmark on AWS Graviton2 (2500Mhz): Before: AES | nanosecs/byte mebibytes/sec cycles/byte CBC enc | 0.663 ns/B 1439 MiB/s 1.66 c/B CBC dec | 0.288 ns/B 3310 MiB/s 0.720 c/B CFB enc | 0.657 ns/B 1453 MiB/s 1.64 c/B CFB dec | 0.288 ns/B 3313 MiB/s 0.720 c/B CTR dec | 0.314 ns/B 3039 MiB/s 0.785 c/B XTS enc | 0.357 ns/B 2674 MiB/s 0.891 c/B XTS dec | 0.358 ns/B 2666 MiB/s 0.894 c/B OCB enc | 0.343 ns/B 2784 MiB/s 0.856 c/B OCB dec | 0.341 ns/B 2795 MiB/s 0.853 c/B GCM-SIV enc | 0.526 ns/B 1813 MiB/s 1.31 c/B After: AES | nanosecs/byte mebibytes/sec cycles/byte perf increase CBC enc | 0.500 ns/B 1906 MiB/s 1.25 c/B +33% CBC dec | 0.263 ns/B 3622 MiB/s 0.658 c/B +9% CFB enc | 0.500 ns/B 1906 MiB/s 1.25 c/B +31% CFB dec | 0.263 ns/B 3620 MiB/s 0.658 c/B +9% CTR enc | 0.264 ns/B 3618 MiB/s 0.659 c/B +19% XTS enc | 0.350 ns/B 2722 MiB/s 0.876 c/B +2% OCB enc | 0.275 ns/B 3468 MiB/s 0.687 c/B +25% OCB dec | 0.276 ns/B 3459 MiB/s 0.689 c/B +24% GCM-SIV enc | 0.494 ns/B 1929 MiB/s 1.24 c/B +6% Benchmark on Cortex-A53 (1152Mhz): Before: AES | nanosecs/byte mebibytes/sec cycles/byte CBC enc | 1.41 ns/B 675.9 MiB/s 1.63 c/B CBC dec | 0.910 ns/B 1048 MiB/s 1.05 c/B CFB enc | 1.30 ns/B 732.2 MiB/s 1.50 c/B CFB dec | 0.910 ns/B 1048 MiB/s 1.05 c/B CTR enc | 1.03 ns/B 924.4 MiB/s 1.19 c/B XTS enc | 1.25 ns/B 763.0 MiB/s 1.44 c/B OCB enc | 1.21 ns/B 789.5 MiB/s 1.39 c/B OCB dec | 1.21 ns/B 788.9 MiB/s 1.39 c/B GCM-SIV enc | 1.92 ns/B 496.6 MiB/s 2.21 c/B After: AES | nanosecs/byte mebibytes/sec cycles/byte perf increase CBC enc | 1.14 ns/B 836.6 MiB/s 1.31 c/B +24% CBC dec | 0.843 ns/B 1132 MiB/s 0.971 c/B +8% CFB enc | 1.19 ns/B 798.8 MiB/s 1.38 c/B +9% CFB dec | 0.842 ns/B 1132 MiB/s 0.970 c/B +8% CTR enc | 0.898 ns/B 1062 MiB/s 1.03 c/B +16% XTS enc | 1.22 ns/B 779.9 MiB/s 1.41 c/B +2% OCB enc | 0.992 ns/B 961.0 MiB/s 1.14 c/B +22% OCB dec | 0.993 ns/B 960.5 MiB/s 1.14 c/B +22% GCM-SIV enc | 1.88 ns/B 507.3 MiB/s 2.17 c/B +2% Signed-off-by: Jussi Kivilinna --- cipher/rijndael-armv8-aarch64-ce.S | 1227 ++++++++++++++++------------ 1 file changed, 713 insertions(+), 514 deletions(-) diff --git a/cipher/rijndael-armv8-aarch64-ce.S b/cipher/rijndael-armv8-aarch64-ce.S index a87d2ca5..9f8d9d49 100644 --- a/cipher/rijndael-armv8-aarch64-ce.S +++ b/cipher/rijndael-armv8-aarch64-ce.S @@ -44,7 +44,13 @@ #define vk11 v28 #define vk12 v29 #define vk13 v30 -#define vk14 v31 +#define vklast v31 + + +/* Helper macros */ + +#define __ /*_*/ +#define _(...) 
__VA_ARGS__ /* AES macros */ @@ -54,39 +60,40 @@ ld1 {vk0.16b-vk3.16b}, [keysched], #64; \ ld1 {vk4.16b-vk7.16b}, [keysched], #64; \ ld1 {vk8.16b-vk10.16b}, [keysched], #48; \ + mov vklast.16b, vk10.16b; \ b.lo 1f; \ ld1 {vk11.16b-vk12.16b}, [keysched], #32; \ + mov vklast.16b, vk12.16b; \ b.eq 1f; \ - ld1 {vk13.16b-vk14.16b}, [keysched]; \ + ld1 {vk13.16b-vklast.16b}, [keysched]; \ 1: ; -#define do_aes_one128(ed, mcimc, vo, vb) \ - aes##ed vb.16b, vk0.16b; \ +#define do_aes_one_part1(ed, mcimc, vb, vkfirst) \ + aes##ed vb.16b, vkfirst.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk1.16b; \ - aes##mcimc vb.16b, vb.16b; \ + aes##mcimc vb.16b, vb.16b; + +#define do_aes_one_part2_128(ed, mcimc, vb, iop1, iop2) \ aes##ed vb.16b, vk2.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk3.16b; \ aes##mcimc vb.16b, vb.16b; \ + iop1; \ aes##ed vb.16b, vk4.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk5.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk6.16b; \ aes##mcimc vb.16b, vb.16b; \ + iop2; \ aes##ed vb.16b, vk7.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk8.16b; \ aes##mcimc vb.16b, vb.16b; \ - aes##ed vb.16b, vk9.16b; \ - eor vo.16b, vb.16b, vk10.16b; + aes##ed vb.16b, vk9.16b; -#define do_aes_one192(ed, mcimc, vo, vb) \ - aes##ed vb.16b, vk0.16b; \ - aes##mcimc vb.16b, vb.16b; \ - aes##ed vb.16b, vk1.16b; \ - aes##mcimc vb.16b, vb.16b; \ +#define do_aes_one_part2_192(ed, mcimc, vb, iop1, iop2) \ aes##ed vb.16b, vk2.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk3.16b; \ @@ -95,24 +102,21 @@ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk5.16b; \ aes##mcimc vb.16b, vb.16b; \ + iop1; \ aes##ed vb.16b, vk6.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk7.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk8.16b; \ aes##mcimc vb.16b, vb.16b; \ + iop2; \ aes##ed vb.16b, vk9.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk10.16b; \ aes##mcimc vb.16b, vb.16b; \ - aes##ed vb.16b, vk11.16b; \ - eor vo.16b, vb.16b, vk12.16b; + aes##ed vb.16b, vk11.16b; -#define do_aes_one256(ed, mcimc, vo, vb) \ - aes##ed vb.16b, vk0.16b; \ - aes##mcimc vb.16b, vb.16b; \ - aes##ed vb.16b, vk1.16b; \ - aes##mcimc vb.16b, vb.16b; \ +#define do_aes_one_part2_256(ed, mcimc, vb, iop1, iop2) \ aes##ed vb.16b, vk2.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk3.16b; \ @@ -125,56 +129,78 @@ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk7.16b; \ aes##mcimc vb.16b, vb.16b; \ + iop1; \ aes##ed vb.16b, vk8.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk9.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk10.16b; \ aes##mcimc vb.16b, vb.16b; \ + iop2; \ aes##ed vb.16b, vk11.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk12.16b; \ aes##mcimc vb.16b, vb.16b; \ - aes##ed vb.16b, vk13.16b; \ - eor vo.16b, vb.16b, vk14.16b; + aes##ed vb.16b, vk13.16b; -#define aes_round_4(ed, mcimc, b0, b1, b2, b3, key) \ - aes##ed b0.16b, key.16b; \ +#define do_aes_one128(ed, mcimc, vo, vb, vkfirst) \ + do_aes_one_part1(ed, mcimc, vb, vkfirst); \ + do_aes_one_part2_128(ed, mcimc, vb, __, __); \ + eor vo.16b, vb.16b, vklast.16b; + +#define do_aes_one192(ed, mcimc, vo, vb, vkfirst) \ + do_aes_one_part1(ed, mcimc, vb, vkfirst); \ + do_aes_one_part2_192(ed, mcimc, vb, __, __); \ + eor vo.16b, vb.16b, vklast.16b; + +#define do_aes_one256(ed, mcimc, vo, vb, vkfirst) \ + do_aes_one_part1(ed, mcimc, vb, vkfirst); \ + do_aes_one_part2_256(ed, mcimc, vb, __, __); \ + eor vo.16b, vb.16b, vklast.16b; + +#define aes_round_4_multikey(ed, mcimc, b0, b1, b2, b3, key0, key1, key2, key3) \ 
+ aes##ed b0.16b, key0.16b; \ aes##mcimc b0.16b, b0.16b; \ - aes##ed b1.16b, key.16b; \ + aes##ed b1.16b, key1.16b; \ aes##mcimc b1.16b, b1.16b; \ - aes##ed b2.16b, key.16b; \ + aes##ed b2.16b, key2.16b; \ aes##mcimc b2.16b, b2.16b; \ - aes##ed b3.16b, key.16b; \ + aes##ed b3.16b, key3.16b; \ aes##mcimc b3.16b, b3.16b; -#define aes_lastround_4(ed, b0, b1, b2, b3, key1, key2) \ +#define aes_round_4(ed, mcimc, b0, b1, b2, b3, key) \ + aes_round_4_multikey(ed, mcimc, b0, b1, b2, b3, key, key, key, key); + +#define aes_lastround_4(ed, o0, o1, o2, o3, b0, b1, b2, b3, key1, b0_key2, b1_key2, b2_key2, b3_key2) \ aes##ed b0.16b, key1.16b; \ - eor b0.16b, b0.16b, key2.16b; \ aes##ed b1.16b, key1.16b; \ - eor b1.16b, b1.16b, key2.16b; \ aes##ed b2.16b, key1.16b; \ - eor b2.16b, b2.16b, key2.16b; \ aes##ed b3.16b, key1.16b; \ - eor b3.16b, b3.16b, key2.16b; + eor o0.16b, b0.16b, b0_key2.16b; \ + eor o1.16b, b1.16b, b1_key2.16b; \ + eor o2.16b, b2.16b, b2_key2.16b; \ + eor o3.16b, b3.16b, b3_key2.16b; -#define do_aes_4_128(ed, mcimc, b0, b1, b2, b3) \ - aes_round_4(ed, mcimc, b0, b1, b2, b3, vk0); \ +#define do_aes_4_part1_multikey(ed, mcimc, b0, b1, b2, b3, key0, key1, key2, key3) \ + aes_round_4_multikey(ed, mcimc, b0, b1, b2, b3, key0, key1, key2, key3); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk1); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk2); \ - aes_round_4(ed, mcimc, b0, b1, b2, b3, vk3); \ + aes_round_4(ed, mcimc, b0, b1, b2, b3, vk3); + +#define do_aes_4_part1(ed, mcimc, b0, b1, b2, b3, vkfirst) \ + do_aes_4_part1_multikey(ed, mcimc, b0, b1, b2, b3, vkfirst, vkfirst, vkfirst, vkfirst); + +#define do_aes_4_part2_128(ed, mcimc, o0, o1, o2, o3, b0, b1, b2, b3, \ + b0_key, b1_key, b2_key, b3_key) \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk4); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk5); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk6); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk7); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk8); \ - aes_lastround_4(ed, b0, b1, b2, b3, vk9, vk10); + aes_lastround_4(ed, o0, o1, o2, o3, b0, b1, b2, b3, vk9, b0_key, b1_key, b2_key, b3_key); -#define do_aes_4_192(ed, mcimc, b0, b1, b2, b3) \ - aes_round_4(ed, mcimc, b0, b1, b2, b3, vk0); \ - aes_round_4(ed, mcimc, b0, b1, b2, b3, vk1); \ - aes_round_4(ed, mcimc, b0, b1, b2, b3, vk2); \ - aes_round_4(ed, mcimc, b0, b1, b2, b3, vk3); \ +#define do_aes_4_part2_192(ed, mcimc, o0, o1, o2, o3, b0, b1, b2, b3, \ + b0_key, b1_key, b2_key, b3_key) \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk4); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk5); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk6); \ @@ -182,13 +208,10 @@ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk8); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk9); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk10); \ - aes_lastround_4(ed, b0, b1, b2, b3, vk11, vk12); + aes_lastround_4(ed, o0, o1, o2, o3, b0, b1, b2, b3, vk11, b0_key, b1_key, b2_key, b3_key); -#define do_aes_4_256(ed, mcimc, b0, b1, b2, b3) \ - aes_round_4(ed, mcimc, b0, b1, b2, b3, vk0); \ - aes_round_4(ed, mcimc, b0, b1, b2, b3, vk1); \ - aes_round_4(ed, mcimc, b0, b1, b2, b3, vk2); \ - aes_round_4(ed, mcimc, b0, b1, b2, b3, vk3); \ +#define do_aes_4_part2_256(ed, mcimc, o0, o1, o2, o3, b0, b1, b2, b3, \ + b0_key, b1_key, b2_key, b3_key) \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk4); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk5); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk6); \ @@ -198,15 +221,25 @@ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk10); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk11); \ aes_round_4(ed, mcimc, b0, b1, b2, 
b3, vk12); \ - aes_lastround_4(ed, b0, b1, b2, b3, vk13, vk14); + aes_lastround_4(ed, o0, o1, o2, o3, b0, b1, b2, b3, vk13, b0_key, b1_key, b2_key, b3_key); +#define do_aes_4_128(ed, mcimc, b0, b1, b2, b3) \ + do_aes_4_part1(ed, mcimc, b0, b1, b2, b3, vk0); \ + do_aes_4_part2_128(ed, mcimc, b0, b1, b2, b3, b0, b1, b2, b3, vklast, vklast, vklast, vklast); + +#define do_aes_4_192(ed, mcimc, b0, b1, b2, b3) \ + do_aes_4_part1(ed, mcimc, b0, b1, b2, b3, vk0); \ + do_aes_4_part2_192(ed, mcimc, b0, b1, b2, b3, b0, b1, b2, b3, vklast, vklast, vklast, vklast); + +#define do_aes_4_256(ed, mcimc, b0, b1, b2, b3) \ + do_aes_4_part1(ed, mcimc, b0, b1, b2, b3, vk0); \ + do_aes_4_part2_256(ed, mcimc, b0, b1, b2, b3, b0, b1, b2, b3, vklast, vklast, vklast, vklast); /* Other functional macros */ -#define CLEAR_REG(reg) eor reg.16b, reg.16b, reg.16b; +#define CLEAR_REG(reg) movi reg.16b, #0; #define aes_clear_keys(nrounds) \ - cmp nrounds, #12; \ CLEAR_REG(vk0); \ CLEAR_REG(vk1); \ CLEAR_REG(vk2); \ @@ -218,13 +251,10 @@ CLEAR_REG(vk9); \ CLEAR_REG(vk8); \ CLEAR_REG(vk10); \ - b.lo 1f; \ CLEAR_REG(vk11); \ CLEAR_REG(vk12); \ - b.eq 1f; \ CLEAR_REG(vk13); \ - CLEAR_REG(vk14); \ -1: ; + CLEAR_REG(vklast); /* @@ -252,7 +282,7 @@ _gcry_aes_enc_armv8_ce: b.eq .Lenc1_192 .Lenc1_128: - do_aes_one128(e, mc, v0, v0); + do_aes_one128(e, mc, v0, v0, vk0); .Lenc1_tail: CLEAR_REG(vk0) @@ -266,6 +296,7 @@ _gcry_aes_enc_armv8_ce: CLEAR_REG(vk8) CLEAR_REG(vk9) CLEAR_REG(vk10) + CLEAR_REG(vklast) st1 {v0.16b}, [x1] CLEAR_REG(v0) @@ -273,19 +304,18 @@ _gcry_aes_enc_armv8_ce: ret .Lenc1_192: - do_aes_one192(e, mc, v0, v0); + do_aes_one192(e, mc, v0, v0, vk0); CLEAR_REG(vk11) CLEAR_REG(vk12) b .Lenc1_tail .Lenc1_256: - do_aes_one256(e, mc, v0, v0); + do_aes_one256(e, mc, v0, v0, vk0); CLEAR_REG(vk11) CLEAR_REG(vk12) CLEAR_REG(vk13) - CLEAR_REG(vk14) b .Lenc1_tail CFI_ENDPROC(); ELF(.size _gcry_aes_enc_armv8_ce,.-_gcry_aes_enc_armv8_ce;) @@ -316,7 +346,7 @@ _gcry_aes_dec_armv8_ce: b.eq .Ldec1_192 .Ldec1_128: - do_aes_one128(d, imc, v0, v0); + do_aes_one128(d, imc, v0, v0, vk0); .Ldec1_tail: CLEAR_REG(vk0) @@ -330,6 +360,7 @@ _gcry_aes_dec_armv8_ce: CLEAR_REG(vk8) CLEAR_REG(vk9) CLEAR_REG(vk10) + CLEAR_REG(vklast) st1 {v0.16b}, [x1] CLEAR_REG(v0) @@ -337,19 +368,18 @@ _gcry_aes_dec_armv8_ce: ret .Ldec1_192: - do_aes_one192(d, imc, v0, v0); + do_aes_one192(d, imc, v0, v0, vk0); CLEAR_REG(vk11) CLEAR_REG(vk12) b .Ldec1_tail .Ldec1_256: - do_aes_one256(d, imc, v0, v0); + do_aes_one256(d, imc, v0, v0, vk0); CLEAR_REG(vk11) CLEAR_REG(vk12) CLEAR_REG(vk13) - CLEAR_REG(vk14) b .Ldec1_tail CFI_ENDPROC(); ELF(.size _gcry_aes_dec_armv8_ce,.-_gcry_aes_dec_armv8_ce;) @@ -381,26 +411,38 @@ _gcry_aes_cbc_enc_armv8_ce: cbz x4, .Lcbc_enc_skip cmp w5, #0 - ld1 {v1.16b}, [x3] /* load IV */ - cset x5, eq + ld1 {v4.16b}, [x3] /* load IV */ + csetm x5, eq aes_preload_keys(x0, w6); - lsl x5, x5, #4 + and x5, x5, #16 + + ld1 {v3.16b}, [x2], #16; /* load plaintext */ + mov v0.16b, vk0.16b; + sub x4, x4, #1; + eor v16.16b, vk0.16b, vklast.16b; + eor v4.16b, v4.16b, v3.16b; + do_aes_one_part1(e, mc, v4, v0); - b.eq .Lcbc_enc_loop192 - b.hi .Lcbc_enc_loop256 + b.eq .Lcbc_enc_entry_192 + b.hi .Lcbc_enc_entry_256 #define CBC_ENC(bits) \ - .Lcbc_enc_loop##bits: \ - ld1 {v0.16b}, [x2], #16; /* load plaintext */ \ - eor v1.16b, v0.16b, v1.16b; \ - sub x4, x4, #1; \ - \ - do_aes_one##bits(e, mc, v1, v1); \ + .Lcbc_enc_entry_##bits: \ + cbz x4, .Lcbc_enc_done_##bits; \ \ - st1 {v1.16b}, [x1], x5; /* store ciphertext */ \ + .Lcbc_enc_loop_##bits: \ + 
do_aes_one_part2_##bits(e, mc, v4, \ + _(ld1 {v0.16b}, [x2], #16 /* load plaintext */), \ + _(eor v0.16b, v0.16b, v16.16b)); \ + sub x4, x4, #1; \ + eor v3.16b, v4.16b, vklast.16b; \ + do_aes_one_part1(e, mc, v4, v0); \ + st1 {v3.16b}, [x1], x5; /* store ciphertext */ \ + cbnz x4, .Lcbc_enc_loop_##bits; \ \ - cbnz x4, .Lcbc_enc_loop##bits; \ + .Lcbc_enc_done_##bits: \ + do_aes_one_part2_##bits(e, mc, v4, __, __); \ b .Lcbc_enc_done; CBC_ENC(128) @@ -410,11 +452,14 @@ _gcry_aes_cbc_enc_armv8_ce: #undef CBC_ENC .Lcbc_enc_done: + eor v3.16b, v4.16b, vklast.16b; + st1 {v3.16b}, [x1]; /* store ciphertext */ aes_clear_keys(w6) + st1 {v3.16b}, [x3] /* store IV */ - st1 {v1.16b}, [x3] /* store IV */ - - CLEAR_REG(v1) + CLEAR_REG(v16) + CLEAR_REG(v4) + CLEAR_REG(v3) CLEAR_REG(v0) .Lcbc_enc_skip: @@ -445,7 +490,10 @@ _gcry_aes_cbc_dec_armv8_ce: cbz x4, .Lcbc_dec_skip - ld1 {v0.16b}, [x3] /* load IV */ + add sp, sp, #-64; + CFI_ADJUST_CFA_OFFSET(64); + + ld1 {v16.16b}, [x3] /* load IV */ aes_preload_keys(x0, w5); @@ -457,44 +505,61 @@ _gcry_aes_cbc_dec_armv8_ce: cmp x4, #4; \ b.lo .Lcbc_dec_loop_##bits; \ \ - .Lcbc_dec_loop4_##bits: \ - \ - ld1 {v1.16b-v4.16b}, [x2], #64; /* load ciphertext */ \ + ld1 {v0.16b-v3.16b}, [x2], #64; /* load ciphertext */ \ + cmp x4, #8; \ sub x4, x4, #4; \ - mov v5.16b, v1.16b; \ - mov v6.16b, v2.16b; \ - mov v7.16b, v3.16b; \ - mov v16.16b, v4.16b; \ - cmp x4, #4; \ + eor v4.16b, v16.16b, vklast.16b; \ + eor v5.16b, v0.16b, vklast.16b; \ + eor v6.16b, v1.16b, vklast.16b; \ + eor v7.16b, v2.16b, vklast.16b; \ + mov v16.16b, v3.16b; /* next IV */ \ \ - do_aes_4_##bits(d, imc, v1, v2, v3, v4); \ + do_aes_4_part1(d, imc, v0, v1, v2, v3, vk0); \ + b.lo .Lcbc_dec_done4_##bits; \ \ - eor v1.16b, v1.16b, v0.16b; \ - eor v2.16b, v2.16b, v5.16b; \ - st1 {v1.16b-v2.16b}, [x1], #32; /* store plaintext */ \ - eor v3.16b, v3.16b, v6.16b; \ - eor v4.16b, v4.16b, v7.16b; \ - mov v0.16b, v16.16b; /* next IV */ \ - st1 {v3.16b-v4.16b}, [x1], #32; /* store plaintext */ \ + st1 {v8.16b-v11.16b}, [sp]; /* store callee saved registers */ \ + \ + .Lcbc_dec_loop4_##bits: \ + do_aes_4_part2_##bits(d, imc, v8, v9, v10, v11, v0, v1, v2, v3, v4, v5, v6, v7); \ + ld1 {v0.16b-v3.16b}, [x2], #64; /* load ciphertext */ \ + cmp x4, #8; \ + sub x4, x4, #4; \ + eor v4.16b, v16.16b, vklast.16b; \ + eor v5.16b, v0.16b, vklast.16b; \ + eor v6.16b, v1.16b, vklast.16b; \ + eor v7.16b, v2.16b, vklast.16b; \ + mov v16.16b, v3.16b; /* next IV */ \ + \ + do_aes_4_part1(d, imc, v0, v1, v2, v3, vk0); \ + st1 {v8.16b-v11.16b}, [x1], #64; /* store plaintext */ \ \ b.hs .Lcbc_dec_loop4_##bits; \ - CLEAR_REG(v3); \ + \ + ld1 {v8.16b-v11.16b}, [sp]; /* restore callee saved registers */ \ + \ + .Lcbc_dec_done4_##bits: \ + do_aes_4_part2_##bits(d, imc, v0, v1, v2, v3, v0, v1, v2, v3, v4, v5, v6, v7); \ + \ CLEAR_REG(v4); \ CLEAR_REG(v5); \ CLEAR_REG(v6); \ CLEAR_REG(v7); \ - CLEAR_REG(v16); \ + st1 {v0.16b-v3.16b}, [x1], #64; /* store plaintext */ \ + CLEAR_REG(v0); \ + CLEAR_REG(v3); \ cbz x4, .Lcbc_dec_done; \ \ .Lcbc_dec_loop_##bits: \ ld1 {v1.16b}, [x2], #16; /* load ciphertext */ \ sub x4, x4, #1; \ + eor v16.16b, v16.16b, vklast.16b; \ mov v2.16b, v1.16b; \ \ - do_aes_one##bits(d, imc, v1, v1); \ + do_aes_one_part1(d, imc, v1, vk0); \ + do_aes_one_part2_##bits(d, imc, v1, __, __); \ + eor v1.16b, v1.16b, v16.16b; \ \ - eor v1.16b, v1.16b, v0.16b; \ - mov v0.16b, v2.16b; \ + mov v16.16b, v2.16b; \ st1 {v1.16b}, [x1], #16; /* store plaintext */ \ \ cbnz x4, .Lcbc_dec_loop_##bits; \ @@ -509,12 +574,15 @@ 
_gcry_aes_cbc_dec_armv8_ce: .Lcbc_dec_done: aes_clear_keys(w5) - st1 {v0.16b}, [x3] /* store IV */ + st1 {v16.16b}, [x3] /* store IV */ - CLEAR_REG(v0) + CLEAR_REG(v16) CLEAR_REG(v1) CLEAR_REG(v2) + add sp, sp, #64; + CFI_ADJUST_CFA_OFFSET(-64); + .Lcbc_dec_skip: ret CFI_ENDPROC(); @@ -544,9 +612,13 @@ _gcry_aes_ctr_enc_armv8_ce: cbz x4, .Lctr_enc_skip - mov x6, #1 + add x8, sp, #-64 + add sp, sp, #-128; + CFI_ADJUST_CFA_OFFSET(128); + + mov w6, #(1 << 24) movi v16.16b, #0 - mov v16.D[1], x6 + mov v16.S[3], w6 /* 1 */ /* load IV */ ldp x9, x10, [x3] @@ -554,6 +626,9 @@ _gcry_aes_ctr_enc_armv8_ce: rev x9, x9 rev x10, x10 + mov x12, #(4 << 56) + lsl x11, x10, #56 + aes_preload_keys(x0, w5); b.eq .Lctr_enc_entry_192 @@ -564,73 +639,71 @@ _gcry_aes_ctr_enc_armv8_ce: cmp x4, #4; \ b.lo .Lctr_enc_loop_##bits; \ \ - .Lctr_enc_loop4_##bits: \ - cmp x10, #0xfffffffffffffffc; \ + st1 {v8.16b-v11.16b}, [sp]; /* store callee saved registers */ \ + \ + adds x11, x11, x12; \ + add v9.4s, v16.4s, v16.4s; /* 2 */ \ + add v10.4s, v16.4s, v9.4s; /* 3 */ \ + add v11.4s, v9.4s, v9.4s; /* 4 */ \ + mov x7, #1; \ sub x4, x4, #4; \ - b.lo .Lctr_enc_loop4_##bits##_nocarry; \ + ld1 {v5.16b-v8.16b}, [x2], #64; /* preload ciphertext */ \ + b.cs .Lctr_enc_carry4_##bits; \ \ - adds x10, x10, #1; \ mov v1.16b, v0.16b; \ - adc x9, x9, xzr; \ - mov v2.D[1], x10; \ - mov v2.D[0], x9; \ - \ - adds x10, x10, #1; \ - rev64 v2.16b, v2.16b; \ - adc x9, x9, xzr; \ - mov v3.D[1], x10; \ - mov v3.D[0], x9; \ - \ - adds x10, x10, #1; \ - rev64 v3.16b, v3.16b; \ - adc x9, x9, xzr; \ - mov v4.D[1], x10; \ - mov v4.D[0], x9; \ + add x10, x10, #4; \ + add v2.16b, v0.16b, v16.16b; \ + add v3.8h, v0.8h, v9.8h; \ + add v4.4s, v0.4s, v10.4s; \ + add v0.2d, v0.2d, v11.2d; \ \ - adds x10, x10, #1; \ - rev64 v4.16b, v4.16b; \ - adc x9, x9, xzr; \ - mov v0.D[1], x10; \ - mov v0.D[0], x9; \ - rev64 v0.16b, v0.16b; \ + .Lctr_enc_entry4_##bits##_carry_done: \ + mov x7, #0; \ + cmp x4, #4; \ + do_aes_4_part1(e, mc, v1, v2, v3, v4, vk0); \ + b.lo .Lctr_enc_done4_##bits; \ \ - b .Lctr_enc_loop4_##bits##_store_ctr; \ + st1 {v12.16b-v15.16b}, [x8]; /* store callee saved registers */ \ \ - .Lctr_enc_loop4_##bits##_nocarry: \ + .Lctr_enc_loop4_##bits: \ + eor v5.16b, v5.16b, vklast.16b; \ + eor v6.16b, v6.16b, vklast.16b; \ + eor v7.16b, v7.16b, vklast.16b; \ + eor v8.16b, v8.16b, vklast.16b; \ + do_aes_4_part2_##bits(e, mc, v12, v13, v14, v15, v1, v2, v3, v4, v5, v6, v7, v8); \ + ld1 {v5.16b-v8.16b}, [x2], #64; /* preload ciphertext */ \ + adds x11, x11, x12; \ + sub x4, x4, #4; \ + b.cs .Lctr_enc_carry4_##bits; \ \ - add v3.2d, v16.2d, v16.2d; /* 2 */ \ - rev64 v6.16b, v0.16b; \ + mov v1.16b, v0.16b; \ add x10, x10, #4; \ - add v4.2d, v3.2d, v16.2d; /* 3 */ \ - add v0.2d, v3.2d, v3.2d; /* 4 */ \ - rev64 v1.16b, v6.16b; \ - add v2.2d, v6.2d, v16.2d; \ - add v3.2d, v6.2d, v3.2d; \ - add v4.2d, v6.2d, v4.2d; \ - add v0.2d, v6.2d, v0.2d; \ - rev64 v2.16b, v2.16b; \ - rev64 v3.16b, v3.16b; \ - rev64 v0.16b, v0.16b; \ - rev64 v4.16b, v4.16b; \ + add v2.16b, v0.16b, v16.16b; \ + add v3.8h, v0.8h, v9.8h; \ + add v4.4s, v0.4s, v10.4s; \ + add v0.2d, v0.2d, v11.2d; \ \ - .Lctr_enc_loop4_##bits##_store_ctr: \ - \ - st1 {v0.16b}, [x3]; \ + .Lctr_enc_loop4_##bits##_carry_done: \ cmp x4, #4; \ - ld1 {v5.16b-v7.16b}, [x2], #48; /* preload ciphertext */ \ + do_aes_4_part1(e, mc, v1, v2, v3, v4, vk0); \ + st1 {v12.16b-v15.16b}, [x1], #64; /* store plaintext */ \ \ - do_aes_4_##bits(e, mc, v1, v2, v3, v4); \ + b.hs .Lctr_enc_loop4_##bits; \ \ - eor v1.16b, v1.16b, 
v5.16b; \ - ld1 {v5.16b}, [x2], #16; /* load ciphertext */ \ - eor v2.16b, v2.16b, v6.16b; \ - eor v3.16b, v3.16b, v7.16b; \ - eor v4.16b, v4.16b, v5.16b; \ - st1 {v1.16b-v4.16b}, [x1], #64; /* store plaintext */ \ + ld1 {v12.16b-v15.16b}, [x8]; /* restore callee saved registers */ \ + \ + .Lctr_enc_done4_##bits: \ + eor v5.16b, v5.16b, vklast.16b; \ + eor v6.16b, v6.16b, vklast.16b; \ + eor v7.16b, v7.16b, vklast.16b; \ + eor v8.16b, v8.16b, vklast.16b; \ + do_aes_4_part2_##bits(e, mc, v5, v6, v7, v8, v1, v2, v3, v4, v5, v6, v7, v8); \ + \ + st1 {v5.16b-v8.16b}, [x1], #64; /* store plaintext */ \ \ - b.hs .Lctr_enc_loop4_##bits; \ CLEAR_REG(v3); \ CLEAR_REG(v4); \ + ld1 {v8.16b-v11.16b}, [sp]; /* restore callee saved registers */ \ CLEAR_REG(v5); \ CLEAR_REG(v6); \ CLEAR_REG(v7); \ @@ -641,19 +714,48 @@ _gcry_aes_ctr_enc_armv8_ce: adds x10, x10, #1; \ mov v1.16b, v0.16b; \ adc x9, x9, xzr; \ - mov v0.D[1], x10; \ - mov v0.D[0], x9; \ + dup v0.2d, x10; \ sub x4, x4, #1; \ + ins v0.D[0], x9; \ ld1 {v2.16b}, [x2], #16; /* load ciphertext */ \ rev64 v0.16b, v0.16b; \ \ - do_aes_one##bits(e, mc, v1, v1); \ + do_aes_one_part1(e, mc, v1, vk0); \ + eor v2.16b, v2.16b, vklast.16b; \ + do_aes_one_part2_##bits(e, mc, v1, __, __); \ \ - eor v1.16b, v2.16b, v1.16b; \ + eor v1.16b, v1.16b, v2.16b; \ st1 {v1.16b}, [x1], #16; /* store plaintext */ \ \ cbnz x4, .Lctr_enc_loop_##bits; \ - b .Lctr_enc_done; + b .Lctr_enc_done; \ + \ + .Lctr_enc_carry4_##bits: \ + \ + adds x13, x10, #1; \ + mov v1.16b, v0.16b; \ + adc x14, x9, xzr; \ + dup v2.2d, x13; \ + adds x13, x10, #2; \ + ins v2.D[0], x14; \ + adc x14, x9, xzr; \ + rev64 v2.16b, v2.16b; \ + dup v3.2d, x13; \ + adds x13, x10, #3; \ + ins v3.D[0], x14; \ + adc x14, x9, xzr; \ + rev64 v3.16b, v3.16b; \ + dup v4.2d, x13; \ + adds x10, x10, #4; \ + ins v4.D[0], x14; \ + adc x9, x9, xzr; \ + rev64 v4.16b, v4.16b; \ + dup v0.2d, x10; \ + ins v0.D[0], x9; \ + rev64 v0.16b, v0.16b; \ + \ + cbz x7, .Lctr_enc_loop4_##bits##_carry_done; \ + b .Lctr_enc_entry4_##bits##_carry_done; CTR_ENC(128) CTR_ENC(192) @@ -669,6 +771,10 @@ _gcry_aes_ctr_enc_armv8_ce: CLEAR_REG(v0) CLEAR_REG(v1) CLEAR_REG(v2) + CLEAR_REG(v16) + + add sp, sp, #128; + CFI_ADJUST_CFA_OFFSET(-128); .Lctr_enc_skip: ret @@ -700,6 +806,10 @@ _gcry_aes_ctr32le_enc_armv8_ce: cbz x4, .Lctr32le_enc_skip + add x8, sp, #-64 + add sp, sp, #-128; + CFI_ADJUST_CFA_OFFSET(128); + mov w6, #1 movi v16.16b, #0 mov v16.S[0], w6 @@ -712,38 +822,66 @@ _gcry_aes_ctr32le_enc_armv8_ce: b.eq .Lctr32le_enc_entry_192 b.hi .Lctr32le_enc_entry_256 -#define CTR_ENC(bits) \ +#define CTR32LE_ENC(bits) \ .Lctr32le_enc_entry_##bits: \ cmp x4, #4; \ b.lo .Lctr32le_enc_loop_##bits; \ \ - .Lctr32le_enc_loop4_##bits: \ + st1 {v8.16b-v11.16b}, [sp]; /* store callee saved registers */ \ + add v9.4s, v16.4s, v16.4s; /* 2 */ \ + cmp x4, #8; \ + add v10.4s, v9.4s, v16.4s; /* 3 */ \ sub x4, x4, #4; \ + add v11.4s, v9.4s, v9.4s; /* 4 */ \ + \ + ld1 {v5.16b-v8.16b}, [x2], #64; /* preload ciphertext */ \ \ - add v3.4s, v16.4s, v16.4s; /* 2 */ \ mov v1.16b, v0.16b; \ add v2.4s, v0.4s, v16.4s; \ - add v4.4s, v3.4s, v16.4s; /* 3 */ \ - add v6.4s, v3.4s, v3.4s; /* 4 */ \ - add v3.4s, v0.4s, v3.4s; \ - add v4.4s, v0.4s, v4.4s; \ - add v0.4s, v0.4s, v6.4s; \ + add v3.4s, v0.4s, v9.4s; \ + add v4.4s, v0.4s, v10.4s; \ + add v0.4s, v0.4s, v11.4s; \ \ - cmp x4, #4; \ - ld1 {v5.16b-v7.16b}, [x2], #48; /* preload ciphertext */ \ + do_aes_4_part1(e, mc, v1, v2, v3, v4, vk0); \ + b.lo .Lctr32le_enc_done4_##bits; \ \ - do_aes_4_##bits(e, mc, v1, v2, v3, v4); 
\ + st1 {v12.16b-v15.16b}, [x8]; /* store callee saved registers */ \ \ - eor v1.16b, v1.16b, v5.16b; \ - ld1 {v5.16b}, [x2], #16; /* load ciphertext */ \ - eor v2.16b, v2.16b, v6.16b; \ - eor v3.16b, v3.16b, v7.16b; \ - eor v4.16b, v4.16b, v5.16b; \ - st1 {v1.16b-v4.16b}, [x1], #64; /* store plaintext */ \ + .Lctr32le_enc_loop4_##bits: \ + eor v5.16b, v5.16b, vklast.16b; \ + eor v6.16b, v6.16b, vklast.16b; \ + eor v7.16b, v7.16b, vklast.16b; \ + eor v8.16b, v8.16b, vklast.16b; \ + do_aes_4_part2_##bits(e, mc, v12, v13, v14, v15, v1, v2, v3, v4, v5, v6, v7, v8); \ + ld1 {v5.16b-v8.16b}, [x2], #64; /* preload ciphertext */ \ + \ + cmp x4, #8; \ + sub x4, x4, #4; \ + \ + mov v1.16b, v0.16b; \ + add v2.4s, v0.4s, v16.4s; \ + add v3.4s, v0.4s, v9.4s; \ + add v4.4s, v0.4s, v10.4s; \ + add v0.4s, v0.4s, v11.4s; \ + \ + do_aes_4_part1(e, mc, v1, v2, v3, v4, vk0); \ + st1 {v12.16b-v15.16b}, [x1], #64; /* store plaintext */ \ \ b.hs .Lctr32le_enc_loop4_##bits; \ + \ + ld1 {v12.16b-v15.16b}, [x8]; /* restore callee saved registers */ \ + \ + .Lctr32le_enc_done4_##bits: \ + eor v5.16b, v5.16b, vklast.16b; \ + eor v6.16b, v6.16b, vklast.16b; \ + eor v7.16b, v7.16b, vklast.16b; \ + eor v8.16b, v8.16b, vklast.16b; \ + do_aes_4_part2_##bits(e, mc, v5, v6, v7, v8, v1, v2, v3, v4, v5, v6, v7, v8); \ + \ + st1 {v5.16b-v8.16b}, [x1], #64; /* store plaintext */ \ CLEAR_REG(v3); \ CLEAR_REG(v4); \ + ld1 {v8.16b-v11.16b}, [sp]; /* restore callee saved registers */ \ CLEAR_REG(v5); \ CLEAR_REG(v6); \ CLEAR_REG(v7); \ @@ -756,19 +894,21 @@ _gcry_aes_ctr32le_enc_armv8_ce: sub x4, x4, #1; \ add v0.4s, v0.4s, v16.4s; \ \ - do_aes_one##bits(e, mc, v1, v1); \ + do_aes_one_part1(e, mc, v1, vk0); \ + eor v2.16b, v2.16b, vklast.16b; \ + do_aes_one_part2_##bits(e, mc, v1, __, __); \ \ - eor v1.16b, v2.16b, v1.16b; \ + eor v1.16b, v1.16b, v2.16b; \ st1 {v1.16b}, [x1], #16; /* store plaintext */ \ \ cbnz x4, .Lctr32le_enc_loop_##bits; \ b .Lctr32le_enc_done; - CTR_ENC(128) - CTR_ENC(192) - CTR_ENC(256) + CTR32LE_ENC(128) + CTR32LE_ENC(192) + CTR32LE_ENC(256) -#undef CTR_ENC +#undef CTR32LE_ENC .Lctr32le_enc_done: aes_clear_keys(w5) @@ -778,6 +918,10 @@ _gcry_aes_ctr32le_enc_armv8_ce: CLEAR_REG(v0) CLEAR_REG(v1) CLEAR_REG(v2) + CLEAR_REG(v16) + + add sp, sp, #128; + CFI_ADJUST_CFA_OFFSET(-128); .Lctr32le_enc_skip: ret @@ -813,21 +957,34 @@ _gcry_aes_cfb_enc_armv8_ce: aes_preload_keys(x0, w5); + ld1 {v1.16b}, [x2], #16; /* load plaintext */ + eor v3.16b, vk0.16b, vklast.16b; + eor v0.16b, v0.16b, vklast.16b; + sub x4, x4, #1; + mov v4.16b, v3.16b; + do_aes_one_part1(e, mc, v0, v4); + b.eq .Lcfb_enc_entry_192 b.hi .Lcfb_enc_entry_256 #define CFB_ENC(bits) \ .Lcfb_enc_entry_##bits: \ + cbz x4, .Lcfb_enc_done_##bits; \ + \ .Lcfb_enc_loop_##bits: \ - ld1 {v1.16b}, [x2], #16; /* load plaintext */ \ + eor v2.16b, v1.16b, vklast.16b; \ + do_aes_one_part2_##bits(e, mc, v0, \ + _(eor v4.16b, v3.16b, v1.16b), \ + _(ld1 {v1.16b}, [x2], #16 /* load plaintext */)); \ sub x4, x4, #1; \ - \ - do_aes_one##bits(e, mc, v0, v0); \ - \ - eor v0.16b, v1.16b, v0.16b; \ - st1 {v0.16b}, [x1], #16; /* store ciphertext */ \ - \ + eor v2.16b, v2.16b, v0.16b; \ + do_aes_one_part1(e, mc, v0, v4); \ + st1 {v2.16b}, [x1], #16; /* store ciphertext */ \ cbnz x4, .Lcfb_enc_loop_##bits; \ + \ + .Lcfb_enc_done_##bits: \ + eor v2.16b, v1.16b, vklast.16b; \ + do_aes_one_part2_##bits(e, mc, v0, __, __); \ b .Lcfb_enc_done; CFB_ENC(128) @@ -837,12 +994,16 @@ _gcry_aes_cfb_enc_armv8_ce: #undef CFB_ENC .Lcfb_enc_done: + eor v2.16b, v2.16b, v0.16b; + st1 {v2.16b}, 
[x1]; /* store ciphertext */ aes_clear_keys(w5) - - st1 {v0.16b}, [x3] /* store IV */ + st1 {v2.16b}, [x3] /* store IV */ CLEAR_REG(v0) CLEAR_REG(v1) + CLEAR_REG(v2) + CLEAR_REG(v3) + CLEAR_REG(v4) .Lcfb_enc_skip: ret @@ -873,6 +1034,9 @@ _gcry_aes_cfb_dec_armv8_ce: cbz x4, .Lcfb_dec_skip + add sp, sp, #-64; + CFI_ADJUST_CFA_OFFSET(64); + /* load IV */ ld1 {v0.16b}, [x3] @@ -886,42 +1050,60 @@ _gcry_aes_cfb_dec_armv8_ce: cmp x4, #4; \ b.lo .Lcfb_dec_loop_##bits; \ \ - .Lcfb_dec_loop4_##bits: \ - \ - ld1 {v2.16b-v4.16b}, [x2], #48; /* load ciphertext */ \ + ld1 {v2.16b-v5.16b}, [x2], #64; /* load ciphertext */ \ + cmp x4, #8; \ mov v1.16b, v0.16b; \ sub x4, x4, #4; \ - cmp x4, #4; \ - mov v5.16b, v2.16b; \ - mov v6.16b, v3.16b; \ - mov v7.16b, v4.16b; \ - ld1 {v0.16b}, [x2], #16; /* load next IV / ciphertext */ \ + eor v6.16b, v2.16b, vklast.16b; \ + eor v7.16b, v3.16b, vklast.16b; \ + eor v16.16b, v4.16b, vklast.16b; \ + mov v0.16b, v5.16b; /* next IV */ \ + eor v5.16b, v5.16b, vklast.16b; \ \ - do_aes_4_##bits(e, mc, v1, v2, v3, v4); \ + do_aes_4_part1(e, mc, v1, v2, v3, v4, vk0); \ + b.lo .Lcfb_dec_done4_##bits; \ \ - eor v1.16b, v1.16b, v5.16b; \ - eor v2.16b, v2.16b, v6.16b; \ - eor v3.16b, v3.16b, v7.16b; \ - eor v4.16b, v4.16b, v0.16b; \ - st1 {v1.16b-v4.16b}, [x1], #64; /* store plaintext */ \ + st1 {v8.16b-v11.16b}, [sp]; /* store callee saved registers */ \ + \ + .Lcfb_dec_loop4_##bits: \ + do_aes_4_part2_##bits(e, mc, v8, v9, v10, v11, v1, v2, v3, v4, v6, v7, v16, v5); \ + ld1 {v2.16b-v5.16b}, [x2], #64; /* load ciphertext */ \ + cmp x4, #8; \ + mov v1.16b, v0.16b; \ + sub x4, x4, #4; \ + eor v6.16b, v2.16b, vklast.16b; \ + eor v7.16b, v3.16b, vklast.16b; \ + eor v16.16b, v4.16b, vklast.16b; \ + mov v0.16b, v5.16b; /* next IV */ \ + eor v5.16b, v5.16b, vklast.16b; \ + \ + do_aes_4_part1(e, mc, v1, v2, v3, v4, vk0); \ + st1 {v8.16b-v11.16b}, [x1], #64; /* store plaintext */ \ \ b.hs .Lcfb_dec_loop4_##bits; \ - CLEAR_REG(v3); \ - CLEAR_REG(v4); \ + \ + ld1 {v8.16b-v11.16b}, [sp]; /* restore callee saved registers */ \ + \ + .Lcfb_dec_done4_##bits: \ + do_aes_4_part2_##bits(e, mc, v1, v2, v3, v4, v1, v2, v3, v4, v6, v7, v16, v5); \ + \ CLEAR_REG(v5); \ CLEAR_REG(v6); \ CLEAR_REG(v7); \ + st1 {v1.16b-v4.16b}, [x1], #64; /* store plaintext */ \ + CLEAR_REG(v3); \ + CLEAR_REG(v4); \ cbz x4, .Lcfb_dec_done; \ \ .Lcfb_dec_loop_##bits: \ - \ ld1 {v1.16b}, [x2], #16; /* load ciphertext */ \ - \ sub x4, x4, #1; \ \ - do_aes_one##bits(e, mc, v0, v0); \ + do_aes_one_part1(e, mc, v0, vk0); \ + eor v2.16b, v1.16b, vklast.16b; \ + do_aes_one_part2_##bits(e, mc, v0, __, __); \ + eor v2.16b, v2.16b, v0.16b; \ \ - eor v2.16b, v1.16b, v0.16b; \ mov v0.16b, v1.16b; \ st1 {v2.16b}, [x1], #16; /* store plaintext */ \ \ @@ -942,6 +1124,10 @@ _gcry_aes_cfb_dec_armv8_ce: CLEAR_REG(v0) CLEAR_REG(v1) CLEAR_REG(v2) + CLEAR_REG(v16) + + add sp, sp, #64; + CFI_ADJUST_CFA_OFFSET(-64); .Lcfb_dec_skip: ret @@ -972,7 +1158,7 @@ _gcry_aes_ocb_enc_armv8_ce: * x3: offset * x4: checksum * x5: Ltable - * x6: nblocks (0 < nblocks <= 32) + * x6: nblocks (0 < nblocks) * w7: nrounds * %st+0: blkn => w12 */ @@ -982,110 +1168,203 @@ _gcry_aes_ocb_enc_armv8_ce: ld1 {v0.16b}, [x3] /* load offset */ ld1 {v16.16b}, [x4] /* load checksum */ + add x16, sp, #-64; + add sp, sp, #-128; + CFI_ADJUST_CFA_OFFSET(128); + aes_preload_keys(x0, w7); - b.eq .Locb_enc_entry_192 - b.hi .Locb_enc_entry_256 + st1 {v8.16b-v11.16b}, [sp]; /* store callee saved registers */ -#define OCB_ENC(bits, ...) 
\ - .Locb_enc_entry_##bits: \ - cmp x6, #4; \ - add x12, x12, #1; \ - b.lo .Locb_enc_loop_##bits; \ + eor v0.16b, v0.16b, vk0.16b; /* offset ^ first key */ + eor v9.16b, vk0.16b, vklast.16b; /* first key ^ last key */ + + b.eq .Locb_ecry_entry_192 + b.hi .Locb_ecry_entry_256 + +#define OCB_CRYPT(bits, ed, mcimc) \ + .Locb_##ed##cry_entry_##bits: \ + /* Get number of blocks to align nblk to 4. */ \ + neg x13, x12; \ + add x12, x12, #1; /* Pre-increment nblk for ntz calculation */ \ + and x13, x13, #(4-1); \ + cmp x13, x6; \ + csel x13, x6, x13, hi; \ + cbz x13, .Locb_##ed##cry_alignment_ok_##bits; \ + \ + /* Number of blocks after alignment. */ \ + sub x14, x6, x13; \ \ - .Locb_enc_loop4_##bits: \ + /* If number after alignment is less than 4, skip aligned handling \ + * completely. */ \ + cmp x14, #4; \ + csel x13, x6, x13, lo; \ + \ + .Locb_##ed##cry_unaligned_entry_##bits: \ + cmp x13, #4; \ + \ + .Locb_##ed##cry_loop1_##bits: \ \ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \ /* Checksum_i = Checksum_{i-1} xor P_i */ \ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ \ \ - add w9, w12, #1; \ - add w10, w12, #2; \ - add w11, w12, #3; \ - rbit w8, w12; \ - add w12, w12, #4; \ - rbit w9, w9; \ - rbit w10, w10; \ - rbit w11, w11; \ - clz w8, w8; /* ntz(i+0) */ \ - clz w9, w9; /* ntz(i+1) */ \ - clz w10, w10; /* ntz(i+2) */ \ - clz w11, w11; /* ntz(i+3) */ \ + rbit x8, x12; \ + add x12, x12, #1; \ + clz x8, x8; /* ntz(i) */ \ add x8, x5, x8, lsl #4; \ - ld1 {v1.16b-v4.16b}, [x2], #64; /* load P_i+<0-3> */ \ - add x9, x5, x9, lsl #4; \ - add x10, x5, x10, lsl #4; \ - add x11, x5, x11, lsl #4; \ \ - sub x6, x6, #4; \ + ld1 {v1.16b}, [x2], #16; /* load plaintext */ \ + ld1 {v2.16b}, [x8]; /* load L_{ntz(i)} */ \ + eor v0.16b, v0.16b, v2.16b; \ + sub x13, x13, #1; \ + ENC(eor v16.16b, v16.16b, v1.16b); \ + sub x6, x6, #1; \ \ - ld1 {v5.16b}, [x8]; /* load L_{ntz(i+0)} */ \ - eor v16.16b, v16.16b, v1.16b; /* Checksum_i+0 */ \ - ld1 {v6.16b}, [x9]; /* load L_{ntz(i+1)} */ \ - eor v16.16b, v16.16b, v2.16b; /* Checksum_i+1 */ \ - ld1 {v7.16b}, [x10]; /* load L_{ntz(i+2)} */ \ - eor v16.16b, v16.16b, v3.16b; /* Checksum_i+2 */ \ - eor v5.16b, v5.16b, v0.16b; /* Offset_i+0 */ \ - ld1 {v0.16b}, [x11]; /* load L_{ntz(i+3)} */ \ - eor v16.16b, v16.16b, v4.16b; /* Checksum_i+3 */ \ - eor v6.16b, v6.16b, v5.16b; /* Offset_i+1 */ \ - eor v1.16b, v1.16b, v5.16b; /* P_i+0 xor Offset_i+0 */ \ - eor v7.16b, v7.16b, v6.16b; /* Offset_i+2 */ \ - eor v2.16b, v2.16b, v6.16b; /* P_i+1 xor Offset_i+1 */ \ - eor v0.16b, v0.16b, v7.16b; /* Offset_i+3 */ \ - cmp x6, #4; \ - eor v3.16b, v3.16b, v7.16b; /* P_i+2 xor Offset_i+2 */ \ - eor v4.16b, v4.16b, v0.16b; /* P_i+3 xor Offset_i+3 */ \ + do_aes_one_part1(ed, mcimc, v1, v0); \ + eor v2.16b, v0.16b, v9.16b; \ + do_aes_one_part2_##bits(ed, mcimc, v1, __, __); \ + eor v1.16b, v1.16b, v2.16b; \ + st1 {v1.16b}, [x1], #16; /* store ciphertext */ \ + DEC(eor v16.16b, v16.16b, v1.16b); \ \ - do_aes_4_##bits(e, mc, v1, v2, v3, v4); \ + cbnz x13, .Locb_##ed##cry_loop1_##bits; \ \ - eor v1.16b, v1.16b, v5.16b; /* xor Offset_i+0 */ \ - eor v2.16b, v2.16b, v6.16b; /* xor Offset_i+1 */ \ - eor v3.16b, v3.16b, v7.16b; /* xor Offset_i+2 */ \ - eor v4.16b, v4.16b, v0.16b; /* xor Offset_i+3 */ \ - st1 {v1.16b-v4.16b}, [x1], #64; \ + cbz x6, .Locb_##ed##cry_done; \ \ - b.hs .Locb_enc_loop4_##bits; \ - CLEAR_REG(v3); \ - CLEAR_REG(v4); \ - CLEAR_REG(v5); \ - CLEAR_REG(v6); \ - CLEAR_REG(v7); \ - cbz x6, .Locb_enc_done; \ + /* nblk is now aligned and we have 4 or more blocks. 
So jump directly to \ + * aligned processing. */ \ + b .Locb_##ed##cry_aligned_entry_##bits; \ \ - .Locb_enc_loop_##bits: \ + .Locb_##ed##cry_alignment_ok_##bits: \ + cbz x6, .Locb_##ed##cry_done; \ + \ + /* Short buffers do not benefit from L-array optimization. */ \ + cmp x6, #4; \ + mov x13, x6; \ + b.lo .Locb_##ed##cry_unaligned_entry_##bits; \ + \ + .Locb_##ed##cry_aligned_entry_##bits: \ + /* Prepare L-array optimization. \ + * Since nblk is aligned to 4, offsets will have following construction: \ + * - block1 = ntz{0} = offset ^ L[0] \ + * - block2 = ntz{1} = offset ^ L[0] ^ L[1] \ + * - block3 = ntz{0} = offset ^ L[1] \ + * - block4 = ntz{x} = offset ^ L[1] ^ L[ntz{x}] \ + */ \ + ld1 {v10.16b-v11.16b}, [x5]; /* preload L[0] && L[1] */ \ + mov x15, #4; \ + \ + st1 {v12.16b-v15.16b}, [x16]; /* store callee saved registers */ \ \ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \ /* Checksum_i = Checksum_{i-1} xor P_i */ \ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ \ \ - rbit x8, x12; \ - add x12, x12, #1; \ - clz x8, x8; /* ntz(i) */ \ - add x8, x5, x8, lsl #4; \ + add x11, x12, #3; \ + ld1 {v1.16b-v4.16b}, [x2], #64; /* load P_i+<0-3> */ \ + rbit x11, x11; \ + eor v6.16b, v10.16b, v11.16b; /* L[0] ^ L[1] */ \ + ENC(eor v16.16b, v16.16b, v1.16b); /* Checksum_i+0 */ \ + add x12, x12, #4; \ + clz x11, x11; /* ntz(i+3) */ \ + add x15, x15, #4; \ + add x11, x5, x11, lsl #4; \ \ - ld1 {v1.16b}, [x2], #16; /* load plaintext */ \ - ld1 {v2.16b}, [x8]; /* load L_{ntz(i)} */ \ - sub x6, x6, #1; \ - eor v0.16b, v0.16b, v2.16b; \ - eor v16.16b, v16.16b, v1.16b; \ - eor v1.16b, v1.16b, v0.16b; \ + eor v5.16b, v0.16b, v10.16b; /* Offset_i+0 */ \ + ENC(eor v16.16b, v16.16b, v2.16b); /* Checksum_i+1 */ \ + ld1 {v8.16b}, [x11]; /* load L_{ntz(i+3)} */ \ + ENC(eor v16.16b, v16.16b, v3.16b); /* Checksum_i+2 */ \ + eor v6.16b, v0.16b, v6.16b; /* Offset_i+1 */ \ + ENC(eor v16.16b, v16.16b, v4.16b); /* Checksum_i+3 */ \ + eor v7.16b, v0.16b, v11.16b; /* Offset_i+2 */ \ + eor v8.16b, v8.16b, v11.16b; /* L[1] ^ L[ntz{x}] */ \ + cmp x15, x13; \ + eor v0.16b, v0.16b, v8.16b; /* Offset_i+3 */ \ + \ + do_aes_4_part1_multikey(ed, mcimc, v1, v2, v3, v4, v5, v6, v7, v0); /* P_i+j xor Offset_i+j */ \ + b.hi .Locb_##ed##cry_aligned_done4_##bits; \ + \ + .Locb_##ed##cry_aligned_loop4_##bits: \ + add x11, x12, #3; \ + eor v5.16b, v5.16b, v9.16b; \ + eor v6.16b, v6.16b, v9.16b; \ + rbit x11, x11; \ + eor v7.16b, v7.16b, v9.16b; \ + eor v8.16b, v0.16b, v9.16b; \ + clz x11, x11; /* ntz(i+3) */ \ + do_aes_4_part2_##bits(ed, mcimc, v12, v13, v14, v15, v1, v2, v3, v4, v5, v6, v7, v8); /* xor Offset_i+j */ \ \ - do_aes_one##bits(e, mc, v1, v1); \ + /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \ + /* Checksum_i = Checksum_{i-1} xor P_i */ \ + /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ \ \ - eor v1.16b, v1.16b, v0.16b; \ - st1 {v1.16b}, [x1], #16; /* store ciphertext */ \ + add x12, x12, #4; \ + ld1 {v1.16b-v4.16b}, [x2], #64; /* load P_i+<0-3> */ \ + eor v6.16b, v10.16b, v11.16b; /* L[0] ^ L[1] */ \ + add x15, x15, #4; \ + DEC(eor v16.16b, v16.16b, v12.16b); /* Checksum_i+0 */ \ + ENC(eor v16.16b, v16.16b, v1.16b); /* Checksum_i+0 */ \ + add x11, x5, x11, lsl #4; \ + \ + eor v5.16b, v0.16b, v10.16b; /* Offset_i+0 */ \ + ENC(eor v16.16b, v16.16b, v2.16b); /* Checksum_i+1 */ \ + DEC(eor v16.16b, v16.16b, v13.16b); /* Checksum_1+2 */ \ + ld1 {v8.16b}, [x11]; /* load L_{ntz(i+3)} */ \ + ENC(eor v16.16b, v16.16b, v3.16b); /* Checksum_i+2 */ \ + DEC(eor v16.16b, v16.16b, v14.16b); /* Checksum_i+0+3 */ \ + eor 
v6.16b, v0.16b, v6.16b; /* Offset_i+1 */ \ + ENC(eor v16.16b, v16.16b, v4.16b); /* Checksum_i+3 */ \ + DEC(eor v16.16b, v16.16b, v15.16b); /* Checksum_i+0+1+2 */ \ + eor v7.16b, v0.16b, v11.16b; /* Offset_i+2 */ \ + eor v8.16b, v8.16b, v11.16b; /* L[1] ^ L[ntz{x}] */ \ + cmp x15, x13; \ + eor v0.16b, v0.16b, v8.16b; /* Offset_i+3 */ \ + \ + do_aes_4_part1_multikey(ed, mcimc, v1, v2, v3, v4, v5, v6, v7, v0); /* P_i+j xor Offset_i+j */ \ + st1 {v12.16b-v15.16b}, [x1], #64; \ + \ + b.ls .Locb_##ed##cry_aligned_loop4_##bits; \ + \ + .Locb_##ed##cry_aligned_done4_##bits: \ + eor v5.16b, v5.16b, v9.16b; \ + eor v6.16b, v6.16b, v9.16b; \ + eor v7.16b, v7.16b, v9.16b; \ + eor v8.16b, v0.16b, v9.16b; \ + do_aes_4_part2_##bits(ed, mcimc, v1, v2, v3, v4, v1, v2, v3, v4, v5, v6, v7, v8); /* xor Offset_i+j */ \ + DEC(eor v16.16b, v16.16b, v1.16b); /* Checksum_i+0 */ \ + DEC(eor v5.16b, v2.16b, v3.16b); /* Checksum_1+2 */ \ + DEC(eor v16.16b, v16.16b, v4.16b); /* Checksum_i+0+3 */ \ + st1 {v1.16b-v4.16b}, [x1], #64; \ + DEC(eor v16.16b, v16.16b, v5.16b); /* Checksum_i+0+1+2 */ \ \ - cbnz x6, .Locb_enc_loop_##bits; \ - b .Locb_enc_done; + sub x15, x15, #4; \ + CLEAR_REG(v3); \ + CLEAR_REG(v4); \ + ld1 {v12.16b-v15.16b}, [x16]; /* restore callee saved registers */ \ + sub x13, x13, x15; \ + sub x6, x6, x15; \ + CLEAR_REG(v5); \ + CLEAR_REG(v6); \ + \ + /* Handle tailing 1?3 blocks in unaligned loop. */ \ + mov x13, x6; \ + cbnz x6, .Locb_##ed##cry_unaligned_entry_##bits; \ + \ + b .Locb_##ed##cry_done; - OCB_ENC(128) - OCB_ENC(192) - OCB_ENC(256) +#define ENC(...) __VA_ARGS__ +#define DEC(...) /*_*/ + OCB_CRYPT(128, e, mc) + OCB_CRYPT(192, e, mc) + OCB_CRYPT(256, e, mc) +#undef ENC +#undef DEC -#undef OCB_ENC +.Locb_ecry_done: + eor v0.16b, v0.16b, vk0.16b; /* restore offset */ -.Locb_enc_done: + ld1 {v8.16b-v11.16b}, [sp]; /* restore callee saved registers */ aes_clear_keys(w7) st1 {v16.16b}, [x4] /* store checksum */ @@ -1094,8 +1373,12 @@ _gcry_aes_ocb_enc_armv8_ce: CLEAR_REG(v0) CLEAR_REG(v1) CLEAR_REG(v2) + CLEAR_REG(v7) CLEAR_REG(v16) + add sp, sp, #128; + CFI_ADJUST_CFA_OFFSET(-128); + ret CFI_ENDPROC(); ELF(.size _gcry_aes_ocb_enc_armv8_ce,.-_gcry_aes_ocb_enc_armv8_ce;) @@ -1124,7 +1407,7 @@ _gcry_aes_ocb_dec_armv8_ce: * x3: offset * x4: checksum * x5: Ltable - * x6: nblocks (0 < nblocks <= 32) + * x6: nblocks (0 < nblocks) * w7: nrounds * %st+0: blkn => w12 */ @@ -1134,110 +1417,34 @@ _gcry_aes_ocb_dec_armv8_ce: ld1 {v0.16b}, [x3] /* load offset */ ld1 {v16.16b}, [x4] /* load checksum */ + add x16, sp, #-64; + add sp, sp, #-128; + CFI_ADJUST_CFA_OFFSET(128); + aes_preload_keys(x0, w7); - b.eq .Locb_dec_entry_192 - b.hi .Locb_dec_entry_256 + st1 {v8.16b-v11.16b}, [sp]; /* store callee saved registers */ -#define OCB_DEC(bits) \ - .Locb_dec_entry_##bits: \ - cmp x6, #4; \ - add w12, w12, #1; \ - b.lo .Locb_dec_loop_##bits; \ - \ - .Locb_dec_loop4_##bits: \ - \ - /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \ - /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */ \ - /* Checksum_i = Checksum_{i-1} xor P_i */ \ - \ - add w9, w12, #1; \ - add w10, w12, #2; \ - add w11, w12, #3; \ - rbit w8, w12; \ - add w12, w12, #4; \ - rbit w9, w9; \ - rbit w10, w10; \ - rbit w11, w11; \ - clz w8, w8; /* ntz(i+0) */ \ - clz w9, w9; /* ntz(i+1) */ \ - clz w10, w10; /* ntz(i+2) */ \ - clz w11, w11; /* ntz(i+3) */ \ - add x8, x5, x8, lsl #4; \ - ld1 {v1.16b-v4.16b}, [x2], #64; /* load C_i+<0-3> */ \ - add x9, x5, x9, lsl #4; \ - add x10, x5, x10, lsl #4; \ - add x11, x5, x11, lsl #4; \ - \ - sub x6, x6, #4; \ - \ - 
ld1 {v5.16b}, [x8]; /* load L_{ntz(i+0)} */ \ - ld1 {v6.16b}, [x9]; /* load L_{ntz(i+1)} */ \ - ld1 {v7.16b}, [x10]; /* load L_{ntz(i+2)} */ \ - eor v5.16b, v5.16b, v0.16b; /* Offset_i+0 */ \ - ld1 {v0.16b}, [x11]; /* load L_{ntz(i+3)} */ \ - eor v6.16b, v6.16b, v5.16b; /* Offset_i+1 */ \ - eor v1.16b, v1.16b, v5.16b; /* C_i+0 xor Offset_i+0 */ \ - eor v7.16b, v7.16b, v6.16b; /* Offset_i+2 */ \ - eor v2.16b, v2.16b, v6.16b; /* C_i+1 xor Offset_i+1 */ \ - eor v0.16b, v0.16b, v7.16b; /* Offset_i+3 */ \ - cmp x6, #4; \ - eor v3.16b, v3.16b, v7.16b; /* C_i+2 xor Offset_i+2 */ \ - eor v4.16b, v4.16b, v0.16b; /* C_i+3 xor Offset_i+3 */ \ - \ - do_aes_4_##bits(d, imc, v1, v2, v3, v4); \ - \ - eor v1.16b, v1.16b, v5.16b; /* xor Offset_i+0 */ \ - eor v2.16b, v2.16b, v6.16b; /* xor Offset_i+1 */ \ - eor v16.16b, v16.16b, v1.16b; /* Checksum_i+0 */ \ - eor v3.16b, v3.16b, v7.16b; /* xor Offset_i+2 */ \ - eor v16.16b, v16.16b, v2.16b; /* Checksum_i+1 */ \ - eor v4.16b, v4.16b, v0.16b; /* xor Offset_i+3 */ \ - eor v16.16b, v16.16b, v3.16b; /* Checksum_i+2 */ \ - eor v16.16b, v16.16b, v4.16b; /* Checksum_i+3 */ \ - st1 {v1.16b-v4.16b}, [x1], #64; \ - \ - b.hs .Locb_dec_loop4_##bits; \ - CLEAR_REG(v3); \ - CLEAR_REG(v4); \ - CLEAR_REG(v5); \ - CLEAR_REG(v6); \ - CLEAR_REG(v7); \ - cbz x6, .Locb_dec_done; \ - \ - .Locb_dec_loop_##bits: \ - \ - /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \ - /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */ \ - /* Checksum_i = Checksum_{i-1} xor P_i */ \ - \ - rbit w8, w12; \ - add w12, w12, #1; \ - clz w8, w8; /* ntz(i) */ \ - add x8, x5, x8, lsl #4; \ - \ - ld1 {v1.16b}, [x2], #16; /* load ciphertext */ \ - ld1 {v2.16b}, [x8]; /* load L_{ntz(i)} */ \ - sub x6, x6, #1; \ - eor v0.16b, v0.16b, v2.16b; \ - eor v1.16b, v1.16b, v0.16b; \ - \ - do_aes_one##bits(d, imc, v1, v1) \ - \ - eor v1.16b, v1.16b, v0.16b; \ - st1 {v1.16b}, [x1], #16; /* store plaintext */ \ - eor v16.16b, v16.16b, v1.16b; \ - \ - cbnz x6, .Locb_dec_loop_##bits; \ - b .Locb_dec_done; + eor v0.16b, v0.16b, vk0.16b; /* offset ^ first key */ + eor v9.16b, vk0.16b, vklast.16b; /* first key ^ last key */ + + b.eq .Locb_dcry_entry_192 + b.hi .Locb_dcry_entry_256 + +#define ENC(...) /*_*/ +#define DEC(...) 
__VA_ARGS__ + OCB_CRYPT(128, d, imc) + OCB_CRYPT(192, d, imc) + OCB_CRYPT(256, d, imc) +#undef ENC +#undef DEC - OCB_DEC(128) - OCB_DEC(192) - OCB_DEC(256) +#undef OCB_CRYPT -#undef OCB_DEC +.Locb_dcry_done: + eor v0.16b, v0.16b, vk0.16b; /* restore offset */ -.Locb_dec_done: + ld1 {v8.16b-v11.16b}, [sp]; /* restore callee saved registers */ aes_clear_keys(w7) st1 {v16.16b}, [x4] /* store checksum */ @@ -1248,6 +1455,9 @@ _gcry_aes_ocb_dec_armv8_ce: CLEAR_REG(v2) CLEAR_REG(v16) + add sp, sp, #128; + CFI_ADJUST_CFA_OFFSET(-128); + ret CFI_ENDPROC(); ELF(.size _gcry_aes_ocb_dec_armv8_ce,.-_gcry_aes_ocb_dec_armv8_ce;) @@ -1371,7 +1581,7 @@ _gcry_aes_ocb_auth_armv8_ce: eor v0.16b, v0.16b, v2.16b; \ eor v1.16b, v1.16b, v0.16b; \ \ - do_aes_one##bits(e, mc, v1, v1) \ + do_aes_one##bits(e, mc, v1, v1, vk0) \ \ eor v16.16b, v16.16b, v1.16b; \ \ @@ -1425,6 +1635,10 @@ _gcry_aes_xts_enc_armv8_ce: cbz x4, .Lxts_enc_skip + add x16, sp, #-64; + add sp, sp, #-128; + CFI_ADJUST_CFA_OFFSET(128); + /* load tweak */ ld1 {v0.16b}, [x3] @@ -1435,18 +1649,66 @@ _gcry_aes_xts_enc_armv8_ce: mov v16.D[1], x7 aes_preload_keys(x0, w5); + eor vklast.16b, vklast.16b, vk0.16b; - b.eq .Lxts_enc_entry_192 - b.hi .Lxts_enc_entry_256 + b.eq .Lxts_ecry_entry_192 + b.hi .Lxts_ecry_entry_256 -#define XTS_ENC(bits) \ - .Lxts_enc_entry_##bits: \ +#define XTS_CRYPT(bits, ed, mcimc) \ + .Lxts_##ed##cry_entry_##bits: \ cmp x4, #4; \ - b.lo .Lxts_enc_loop_##bits; \ + b.lo .Lxts_##ed##cry_loop_##bits; \ \ - .Lxts_enc_loop4_##bits: \ + st1 {v8.16b}, [sp]; /* store callee saved registers */ \ + ext v4.16b, v0.16b, v0.16b, #8; \ + mov v8.16b, v0.16b; \ + \ + sshr v2.2d, v4.2d, #63; \ + add v5.2d, v0.2d, v0.2d; \ + and v2.16b, v2.16b, v16.16b; \ + add v4.2d, v4.2d, v4.2d; \ + eor v5.16b, v5.16b, v2.16b; \ + \ + sshr v2.2d, v4.2d, #63; \ + add v6.2d, v5.2d, v5.2d; \ + and v2.16b, v2.16b, v16.16b; \ + add v4.2d, v4.2d, v4.2d; \ + eor v6.16b, v6.16b, v2.16b; \ + \ + sshr v2.2d, v4.2d, #63; \ + add v7.2d, v6.2d, v6.2d; \ + and v2.16b, v2.16b, v16.16b; \ + add v4.2d, v4.2d, v4.2d; \ + eor v7.16b, v7.16b, v2.16b; \ + \ + sshr v2.2d, v4.2d, #63; \ + add v3.2d, v7.2d, v7.2d; \ + and v2.16b, v2.16b, v16.16b; \ + add v4.2d, v4.2d, v4.2d; \ + eor v0.16b, v3.16b, v2.16b; \ + ld1 {v1.16b-v4.16b}, [x2], #64; /* load plaintext */ \ + cmp x4, #8; \ + sub x4, x4, #4; \ + \ + eor v8.16b, v8.16b, vk0.16b; \ + eor v5.16b, v5.16b, vk0.16b; \ + eor v6.16b, v6.16b, vk0.16b; \ + eor v7.16b, v7.16b, vk0.16b; \ + \ + do_aes_4_part1_multikey(ed, mcimc, v1, v2, v3, v4, v8, v5, v6, v7); \ + b.lo .Lxts_##ed##cry_done4_##bits; \ + \ + st1 {v9.16b-v12.16b}, [x16]; /* store callee saved registers */ \ + \ + .Lxts_##ed##cry_loop4_##bits: \ + eor v8.16b, v8.16b, vklast.16b; \ + eor v5.16b, v5.16b, vklast.16b; \ + eor v6.16b, v6.16b, vklast.16b; \ + eor v7.16b, v7.16b, vklast.16b; \ + do_aes_4_part2_##bits(ed, mcimc, v9, v10, v11, v12, v1, v2, v3, v4, v8, v5, v6, v7); \ \ ext v4.16b, v0.16b, v0.16b, #8; \ + mov v8.16b, v0.16b; \ \ sshr v2.2d, v4.2d, #63; \ add v5.2d, v0.2d, v0.2d; \ @@ -1470,62 +1732,66 @@ _gcry_aes_xts_enc_armv8_ce: add v3.2d, v7.2d, v7.2d; \ and v2.16b, v2.16b, v16.16b; \ add v4.2d, v4.2d, v4.2d; \ - eor v3.16b, v3.16b, v2.16b; \ - ld1 {v1.16b-v2.16b}, [x2], #32; /* load plaintext */ \ - st1 {v3.16b}, [x3]; \ + eor v0.16b, v3.16b, v2.16b; \ + ld1 {v1.16b-v4.16b}, [x2], #64; /* load plaintext */ \ + cmp x4, #8; \ sub x4, x4, #4; \ - eor v1.16b, v1.16b, v0.16b; \ \ - ld1 {v3.16b-v4.16b}, [x2], #32; /* load plaintext */ \ - cmp x4, #4; \ - eor v2.16b, 
v2.16b, v5.16b; \ - eor v3.16b, v3.16b, v6.16b; \ - eor v4.16b, v4.16b, v7.16b; \ + eor v8.16b, v8.16b, vk0.16b; \ + eor v5.16b, v5.16b, vk0.16b; \ + eor v6.16b, v6.16b, vk0.16b; \ + eor v7.16b, v7.16b, vk0.16b; \ \ - do_aes_4_##bits(e, mc, v1, v2, v3, v4); \ + do_aes_4_part1_multikey(ed, mcimc, v1, v2, v3, v4, v8, v5, v6, v7); \ + \ + st1 {v9.16b-v12.16b}, [x1], #64; /* store plaintext */ \ + \ + b.hs .Lxts_##ed##cry_loop4_##bits; \ + \ + ld1 {v9.16b-v12.16b}, [x16]; /* restore callee saved registers */ \ + \ + .Lxts_##ed##cry_done4_##bits: \ + eor v8.16b, v8.16b, vklast.16b; \ + eor v5.16b, v5.16b, vklast.16b; \ + eor v6.16b, v6.16b, vklast.16b; \ + eor v7.16b, v7.16b, vklast.16b; \ + do_aes_4_part2_##bits(ed, mcimc, v1, v2, v3, v4, v1, v2, v3, v4, v8, v5, v6, v7); \ \ - eor v1.16b, v1.16b, v0.16b; \ - ld1 {v0.16b}, [x3]; \ - eor v2.16b, v2.16b, v5.16b; \ - eor v3.16b, v3.16b, v6.16b; \ - eor v4.16b, v4.16b, v7.16b; \ st1 {v1.16b-v4.16b}, [x1], #64; /* store plaintext */ \ \ - b.hs .Lxts_enc_loop4_##bits; \ - CLEAR_REG(v3); \ CLEAR_REG(v4); \ + ld1 {v8.16b}, [sp]; /* restore callee saved registers */ \ CLEAR_REG(v5); \ CLEAR_REG(v6); \ CLEAR_REG(v7); \ - cbz x4, .Lxts_enc_done; \ + cbz x4, .Lxts_##ed##cry_done; \ \ - .Lxts_enc_loop_##bits: \ + .Lxts_##ed##cry_loop_##bits: \ \ ld1 {v1.16b}, [x2], #16; /* load plaintext */ \ ext v3.16b, v0.16b, v0.16b, #8; \ - mov v2.16b, v0.16b; \ + eor v2.16b, v0.16b, vk0.16b; \ sshr v3.2d, v3.2d, #63; \ add v0.2d, v0.2d, v0.2d; \ and v3.16b, v3.16b, v16.16b; \ - eor v1.16b, v1.16b, v2.16b; \ - eor v0.16b, v0.16b, v3.16b; \ sub x4, x4, #1; \ + eor v0.16b, v0.16b, v3.16b; \ \ - do_aes_one##bits(e, mc, v1, v1); \ - \ + do_aes_one_part1(ed, mcimc, v1, v2); \ + eor v2.16b, v2.16b, vklast.16b; \ + do_aes_one_part2_##bits(ed, mcimc, v1, __, __); \ eor v1.16b, v1.16b, v2.16b; \ + \ st1 {v1.16b}, [x1], #16; /* store ciphertext */ \ \ - cbnz x4, .Lxts_enc_loop_##bits; \ - b .Lxts_enc_done; + cbnz x4, .Lxts_##ed##cry_loop_##bits; \ + b .Lxts_##ed##cry_done; - XTS_ENC(128) - XTS_ENC(192) - XTS_ENC(256) + XTS_CRYPT(128, e, mc) + XTS_CRYPT(192, e, mc) + XTS_CRYPT(256, e, mc) -#undef XTS_ENC - -.Lxts_enc_done: +.Lxts_ecry_done: aes_clear_keys(w5) st1 {v0.16b}, [x3] /* store tweak */ @@ -1533,6 +1799,11 @@ _gcry_aes_xts_enc_armv8_ce: CLEAR_REG(v0) CLEAR_REG(v1) CLEAR_REG(v2) + CLEAR_REG(v3) + CLEAR_REG(v16) + + add sp, sp, 128; + CFI_ADJUST_CFA_OFFSET(-128); .Lxts_enc_skip: ret @@ -1565,6 +1836,10 @@ _gcry_aes_xts_dec_armv8_ce: cbz x4, .Lxts_dec_skip + add x16, sp, #-64; + add sp, sp, #-128; + CFI_ADJUST_CFA_OFFSET(128); + /* load tweak */ ld1 {v0.16b}, [x3] @@ -1575,97 +1850,18 @@ _gcry_aes_xts_dec_armv8_ce: mov v16.D[1], x7 aes_preload_keys(x0, w5); + eor vklast.16b, vklast.16b, vk0.16b; - b.eq .Lxts_dec_entry_192 - b.hi .Lxts_dec_entry_256 + b.eq .Lxts_dcry_entry_192 + b.hi .Lxts_dcry_entry_256 -#define XTS_DEC(bits) \ - .Lxts_dec_entry_##bits: \ - cmp x4, #4; \ - b.lo .Lxts_dec_loop_##bits; \ - \ - .Lxts_dec_loop4_##bits: \ - \ - ext v4.16b, v0.16b, v0.16b, #8; \ - \ - sshr v2.2d, v4.2d, #63; \ - add v5.2d, v0.2d, v0.2d; \ - and v2.16b, v2.16b, v16.16b; \ - add v4.2d, v4.2d, v4.2d; \ - eor v5.16b, v5.16b, v2.16b; \ - \ - sshr v2.2d, v4.2d, #63; \ - add v6.2d, v5.2d, v5.2d; \ - and v2.16b, v2.16b, v16.16b; \ - add v4.2d, v4.2d, v4.2d; \ - eor v6.16b, v6.16b, v2.16b; \ - \ - sshr v2.2d, v4.2d, #63; \ - add v7.2d, v6.2d, v6.2d; \ - and v2.16b, v2.16b, v16.16b; \ - add v4.2d, v4.2d, v4.2d; \ - eor v7.16b, v7.16b, v2.16b; \ - \ - sshr v2.2d, v4.2d, #63; \ - add v3.2d, 
v7.2d, v7.2d; \ - and v2.16b, v2.16b, v16.16b; \ - add v4.2d, v4.2d, v4.2d; \ - eor v3.16b, v3.16b, v2.16b; \ - ld1 {v1.16b-v2.16b}, [x2], #32; /* load plaintext */ \ - st1 {v3.16b}, [x3]; \ - sub x4, x4, #4; \ - eor v1.16b, v1.16b, v0.16b; \ - \ - ld1 {v3.16b-v4.16b}, [x2], #32; /* load plaintext */ \ - cmp x4, #4; \ - eor v2.16b, v2.16b, v5.16b; \ - eor v3.16b, v3.16b, v6.16b; \ - eor v4.16b, v4.16b, v7.16b; \ - \ - do_aes_4_##bits(d, imc, v1, v2, v3, v4); \ - \ - eor v1.16b, v1.16b, v0.16b; \ - ld1 {v0.16b}, [x3]; \ - eor v2.16b, v2.16b, v5.16b; \ - eor v3.16b, v3.16b, v6.16b; \ - eor v4.16b, v4.16b, v7.16b; \ - st1 {v1.16b-v4.16b}, [x1], #64; /* store plaintext */ \ - \ - b.hs .Lxts_dec_loop4_##bits; \ - CLEAR_REG(v3); \ - CLEAR_REG(v4); \ - CLEAR_REG(v5); \ - CLEAR_REG(v6); \ - CLEAR_REG(v7); \ - cbz x4, .Lxts_dec_done; \ - \ - .Lxts_dec_loop_##bits: \ - \ - ld1 {v1.16b}, [x2], #16; /* load plaintext */ \ - ext v3.16b, v0.16b, v0.16b, #8; \ - mov v2.16b, v0.16b; \ - sshr v3.2d, v3.2d, #63; \ - add v0.2d, v0.2d, v0.2d; \ - and v3.16b, v3.16b, v16.16b; \ - eor v1.16b, v1.16b, v2.16b; \ - eor v0.16b, v0.16b, v3.16b; \ - sub x4, x4, #1; \ - \ - do_aes_one##bits(d, imc, v1, v1); \ - \ - eor v1.16b, v1.16b, v2.16b; \ - st1 {v1.16b}, [x1], #16; /* store ciphertext */ \ - \ - cbnz x4, .Lxts_dec_loop_##bits; \ - b .Lxts_dec_done; - - XTS_DEC(128) - XTS_DEC(192) - XTS_DEC(256) + XTS_CRYPT(128, d, imc) + XTS_CRYPT(192, d, imc) + XTS_CRYPT(256, d, imc) -#undef XTS_DEC +#undef XTS_CRYPT -.Lxts_dec_done: +.Lxts_dcry_done: aes_clear_keys(w5) st1 {v0.16b}, [x3] /* store tweak */ @@ -1674,6 +1870,9 @@ _gcry_aes_xts_dec_armv8_ce: CLEAR_REG(v1) CLEAR_REG(v2) + add sp, sp, 128; + CFI_ADJUST_CFA_OFFSET(-128); + .Lxts_dec_skip: ret CFI_ENDPROC(); -- 2.32.0 From jussi.kivilinna at iki.fi Sat Jan 8 21:13:37 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 8 Jan 2022 22:13:37 +0200 Subject: [PATCH 2/4] Add straight-line speculation hardening for aarch64 assembly In-Reply-To: <20220108201339.360118-1-jussi.kivilinna@iki.fi> References: <20220108201339.360118-1-jussi.kivilinna@iki.fi> Message-ID: <20220108201339.360118-2-jussi.kivilinna@iki.fi> * cipher/asm-common-aarch64.h (ret_spec_stop): New. * cipher/asm-poly1305-aarch64.h: Use 'ret_spec_stop' for 'ret' instruction. * cipher/camellia-aarch64.S: Likewise. * cipher/chacha20-aarch64.S: Likewise. * cipher/cipher-gcm-armv8-aarch64-ce.S: Likewise. * cipher/crc-armv8-aarch64-ce.S: Likewise. * cipher/rijndael-aarch64.S: Likewise. * cipher/rijndael-armv8-aarch64-ce.S: Likewise. * cipher/sha1-armv8-aarch64-ce.S: Likewise. * cipher/sha256-armv8-aarch64-ce.S: Likewise. * cipher/sm3-aarch64.S: Likewise. * cipher/twofish-aarch64.S: Likewise. * mpi/aarch64/mpih-add1.S: Likewise. * mpi/aarch64/mpih-mul1.S: Likewise. * mpi/aarch64/mpih-mul2.S: Likewise. * mpi/aarch64/mpih-mul3.S: Likewise. * mpi/aarch64/mpih-sub1.S: Likewise. 
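
Background on the mitigation: some cores may speculatively execute the
instructions that sit after an unconditional control-flow change such as
'ret' before the return target resolves. The 'ret_spec_stop' macro added
to asm-common-aarch64.h below therefore pads every return with a
'b .; dsb sy; isb' sequence that is never reached architecturally, so any
speculation that falls straight through the 'ret' cannot run ahead into
unrelated code. As a rough sketch only (the function name here is made up
for illustration), an epilogue now expands to something like:

	example_fn:
		mov	x0, #0
		ret			/* architectural return */
		b	.		/* never reached; stops straight-line speculation */
		dsb	sy		/* full barrier for anything that gets past the branch */
		isb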
-- Signed-off-by: Jussi Kivilinna --- cipher/asm-common-aarch64.h | 4 ++++ cipher/asm-poly1305-aarch64.h | 2 +- cipher/camellia-aarch64.S | 6 +++--- cipher/chacha20-aarch64.S | 4 ++-- cipher/cipher-gcm-armv8-aarch64-ce.S | 6 +++--- cipher/crc-armv8-aarch64-ce.S | 8 ++++---- cipher/rijndael-aarch64.S | 4 ++-- cipher/rijndael-armv8-aarch64-ce.S | 30 ++++++++++++++-------------- cipher/sha1-armv8-aarch64-ce.S | 2 +- cipher/sha256-armv8-aarch64-ce.S | 2 +- cipher/sm3-aarch64.S | 2 +- cipher/twofish-aarch64.S | 4 ++-- mpi/aarch64/mpih-add1.S | 2 +- mpi/aarch64/mpih-mul1.S | 2 +- mpi/aarch64/mpih-mul2.S | 2 +- mpi/aarch64/mpih-mul3.S | 4 ++-- mpi/aarch64/mpih-sub1.S | 2 +- 17 files changed, 45 insertions(+), 41 deletions(-) diff --git a/cipher/asm-common-aarch64.h b/cipher/asm-common-aarch64.h index cf0afe1f..6ce773f2 100644 --- a/cipher/asm-common-aarch64.h +++ b/cipher/asm-common-aarch64.h @@ -101,4 +101,8 @@ # define CFI_REG_ON_STACK(reg,rsp_offs) #endif +/* 'ret' instruction replacement for straight-line speculation mitigation */ +#define ret_spec_stop \ + ret; b .; dsb sy; isb; + #endif /* GCRY_ASM_COMMON_AARCH64_H */ diff --git a/cipher/asm-poly1305-aarch64.h b/cipher/asm-poly1305-aarch64.h index 90092709..2f05aae2 100644 --- a/cipher/asm-poly1305-aarch64.h +++ b/cipher/asm-poly1305-aarch64.h @@ -237,7 +237,7 @@ _gcry_poly1305_aarch64_blocks1: mov x0, #0; POLY1305_POP_REGS(); - ret; + ret_spec_stop; CFI_ENDPROC() ELF(.size _gcry_poly1305_aarch64_blocks1, .-_gcry_poly1305_aarch64_blocks1;) #endif diff --git a/cipher/camellia-aarch64.S b/cipher/camellia-aarch64.S index f4980862..30b568d3 100644 --- a/cipher/camellia-aarch64.S +++ b/cipher/camellia-aarch64.S @@ -238,7 +238,7 @@ _gcry_camellia_arm_encrypt_block: CFI_ADJUST_CFA_OFFSET(-16) CFI_RESTORE(x19) CFI_RESTORE(x30) - ret; + ret_spec_stop; CFI_RESTORE_STATE() .ltorg @@ -252,7 +252,7 @@ _gcry_camellia_arm_encrypt_block: CFI_ADJUST_CFA_OFFSET(-16) CFI_RESTORE(x19) CFI_RESTORE(x30) - ret; + ret_spec_stop; CFI_ENDPROC() .ltorg ELF(.size _gcry_camellia_arm_encrypt_block,.-_gcry_camellia_arm_encrypt_block;) @@ -299,7 +299,7 @@ _gcry_camellia_arm_decrypt_block: CFI_ADJUST_CFA_OFFSET(-16) CFI_RESTORE(x19) CFI_RESTORE(x30) - ret; + ret_spec_stop; CFI_RESTORE_STATE() .ltorg diff --git a/cipher/chacha20-aarch64.S b/cipher/chacha20-aarch64.S index 4f76834b..2a980b95 100644 --- a/cipher/chacha20-aarch64.S +++ b/cipher/chacha20-aarch64.S @@ -356,7 +356,7 @@ _gcry_chacha20_aarch64_blocks4: clear(X15); eor x0, x0, x0 - ret + ret_spec_stop CFI_ENDPROC() ELF(.size _gcry_chacha20_aarch64_blocks4, .-_gcry_chacha20_aarch64_blocks4;) @@ -641,7 +641,7 @@ _gcry_chacha20_poly1305_aarch64_blocks4: eor x0, x0, x0 POLY1305_POP_REGS() - ret + ret_spec_stop CFI_ENDPROC() ELF(.size _gcry_chacha20_poly1305_aarch64_blocks4, .-_gcry_chacha20_poly1305_aarch64_blocks4;) diff --git a/cipher/cipher-gcm-armv8-aarch64-ce.S b/cipher/cipher-gcm-armv8-aarch64-ce.S index 2c619f9b..e6714249 100644 --- a/cipher/cipher-gcm-armv8-aarch64-ce.S +++ b/cipher/cipher-gcm-armv8-aarch64-ce.S @@ -365,7 +365,7 @@ _gcry_ghash_armv8_ce_pmull: .Ldo_nothing: mov x0, #0 - ret + ret_spec_stop CFI_ENDPROC() ELF(.size _gcry_ghash_armv8_ce_pmull,.-_gcry_ghash_armv8_ce_pmull;) @@ -593,7 +593,7 @@ _gcry_polyval_armv8_ce_pmull: .Lpolyval_do_nothing: mov x0, #0 - ret + ret_spec_stop CFI_ENDPROC() ELF(.size _gcry_polyval_armv8_ce_pmull,.-_gcry_polyval_armv8_ce_pmull;) @@ -645,7 +645,7 @@ _gcry_ghash_setup_armv8_ce_pmull: st1 {rh2.16b-rh4.16b}, [x1], #(3*16) st1 {rh5.16b-rh6.16b}, [x1] - ret + ret_spec_stop 
CFI_ENDPROC() ELF(.size _gcry_ghash_setup_armv8_ce_pmull,.-_gcry_ghash_setup_armv8_ce_pmull;) diff --git a/cipher/crc-armv8-aarch64-ce.S b/cipher/crc-armv8-aarch64-ce.S index 060abdfe..7ac884af 100644 --- a/cipher/crc-armv8-aarch64-ce.S +++ b/cipher/crc-armv8-aarch64-ce.S @@ -227,7 +227,7 @@ _gcry_crc32r_armv8_ce_bulk: /* store CRC */ st1 {v0.s}[2], [x0] - ret + ret_spec_stop CFI_ENDPROC() ELF(.size _gcry_crc32r_armv8_ce_bulk,.-_gcry_crc32r_armv8_ce_bulk;) @@ -260,7 +260,7 @@ _gcry_crc32r_armv8_ce_reduction_4: mov w0, v0.s[1] - ret + ret_spec_stop CFI_ENDPROC() ELF(.size _gcry_crc32r_armv8_ce_reduction_4,.-_gcry_crc32r_armv8_ce_reduction_4;) @@ -457,7 +457,7 @@ _gcry_crc32_armv8_ce_bulk: rev32 v0.8b, v0.8b /* byte swap */ st1 {v0.s}[0], [x0] - ret + ret_spec_stop CFI_ENDPROC() ELF(.size _gcry_crc32_armv8_ce_bulk,.-_gcry_crc32_armv8_ce_bulk;) @@ -490,7 +490,7 @@ _gcry_crc32_armv8_ce_reduction_4: rev32 v0.8b, v0.8b /* Return in input endian */ mov w0, v0.s[0] - ret + ret_spec_stop CFI_ENDPROC() ELF(.size _gcry_crc32_armv8_ce_reduction_4,.-_gcry_crc32_armv8_ce_reduction_4;) diff --git a/cipher/rijndael-aarch64.S b/cipher/rijndael-aarch64.S index e77dd4e0..184fcd20 100644 --- a/cipher/rijndael-aarch64.S +++ b/cipher/rijndael-aarch64.S @@ -263,7 +263,7 @@ _gcry_aes_arm_encrypt_block: stp RC, RD, [RDST, #8]; mov x0, #(0); - ret; + ret_spec_stop; .ltorg .Lenc_not_128: @@ -486,7 +486,7 @@ _gcry_aes_arm_decrypt_block: stp RC, RD, [RDST, #8]; mov x0, #(0); - ret; + ret_spec_stop; .ltorg .Ldec_256: diff --git a/cipher/rijndael-armv8-aarch64-ce.S b/cipher/rijndael-armv8-aarch64-ce.S index 9f8d9d49..4fef0345 100644 --- a/cipher/rijndael-armv8-aarch64-ce.S +++ b/cipher/rijndael-armv8-aarch64-ce.S @@ -301,7 +301,7 @@ _gcry_aes_enc_armv8_ce: CLEAR_REG(v0) mov x0, #0 - ret + ret_spec_stop .Lenc1_192: do_aes_one192(e, mc, v0, v0, vk0); @@ -365,7 +365,7 @@ _gcry_aes_dec_armv8_ce: CLEAR_REG(v0) mov x0, #0 - ret + ret_spec_stop .Ldec1_192: do_aes_one192(d, imc, v0, v0, vk0); @@ -463,7 +463,7 @@ _gcry_aes_cbc_enc_armv8_ce: CLEAR_REG(v0) .Lcbc_enc_skip: - ret + ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_aes_cbc_enc_armv8_ce,.-_gcry_aes_cbc_enc_armv8_ce;) @@ -584,7 +584,7 @@ _gcry_aes_cbc_dec_armv8_ce: CFI_ADJUST_CFA_OFFSET(-64); .Lcbc_dec_skip: - ret + ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_aes_cbc_dec_armv8_ce,.-_gcry_aes_cbc_dec_armv8_ce;) @@ -777,7 +777,7 @@ _gcry_aes_ctr_enc_armv8_ce: CFI_ADJUST_CFA_OFFSET(-128); .Lctr_enc_skip: - ret + ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_aes_ctr_enc_armv8_ce,.-_gcry_aes_ctr_enc_armv8_ce;) @@ -924,7 +924,7 @@ _gcry_aes_ctr32le_enc_armv8_ce: CFI_ADJUST_CFA_OFFSET(-128); .Lctr32le_enc_skip: - ret + ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_aes_ctr32le_enc_armv8_ce,.-_gcry_aes_ctr32le_enc_armv8_ce;) @@ -1006,7 +1006,7 @@ _gcry_aes_cfb_enc_armv8_ce: CLEAR_REG(v4) .Lcfb_enc_skip: - ret + ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_aes_cfb_enc_armv8_ce,.-_gcry_aes_cfb_enc_armv8_ce;) @@ -1130,7 +1130,7 @@ _gcry_aes_cfb_dec_armv8_ce: CFI_ADJUST_CFA_OFFSET(-64); .Lcfb_dec_skip: - ret + ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_aes_cfb_dec_armv8_ce,.-_gcry_aes_cfb_dec_armv8_ce;) @@ -1379,7 +1379,7 @@ _gcry_aes_ocb_enc_armv8_ce: add sp, sp, #128; CFI_ADJUST_CFA_OFFSET(-128); - ret + ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_aes_ocb_enc_armv8_ce,.-_gcry_aes_ocb_enc_armv8_ce;) @@ -1458,7 +1458,7 @@ _gcry_aes_ocb_dec_armv8_ce: add sp, sp, #128; CFI_ADJUST_CFA_OFFSET(-128); - ret + ret_spec_stop CFI_ENDPROC(); ELF(.size 
_gcry_aes_ocb_dec_armv8_ce,.-_gcry_aes_ocb_dec_armv8_ce;) @@ -1605,7 +1605,7 @@ _gcry_aes_ocb_auth_armv8_ce: CLEAR_REG(v2) CLEAR_REG(v16) - ret + ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_aes_ocb_auth_armv8_ce,.-_gcry_aes_ocb_auth_armv8_ce;) @@ -1806,7 +1806,7 @@ _gcry_aes_xts_enc_armv8_ce: CFI_ADJUST_CFA_OFFSET(-128); .Lxts_enc_skip: - ret + ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_aes_xts_enc_armv8_ce,.-_gcry_aes_xts_enc_armv8_ce;) @@ -1874,7 +1874,7 @@ _gcry_aes_xts_dec_armv8_ce: CFI_ADJUST_CFA_OFFSET(-128); .Lxts_dec_skip: - ret + ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_aes_xts_dec_armv8_ce,.-_gcry_aes_xts_dec_armv8_ce;) @@ -1897,7 +1897,7 @@ _gcry_aes_sbox4_armv8_ce: addv s0, v0.4s mov w0, v0.S[0] CLEAR_REG(v0) - ret + ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_aes_sbox4_armv8_ce,.-_gcry_aes_sbox4_armv8_ce;) @@ -1914,7 +1914,7 @@ _gcry_aes_invmixcol_armv8_ce: aesimc v0.16b, v0.16b st1 {v0.16b}, [x0] CLEAR_REG(v0) - ret + ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_aes_invmixcol_armv8_ce,.-_gcry_aes_invmixcol_armv8_ce;) diff --git a/cipher/sha1-armv8-aarch64-ce.S b/cipher/sha1-armv8-aarch64-ce.S index 8ea1486b..ea26564b 100644 --- a/cipher/sha1-armv8-aarch64-ce.S +++ b/cipher/sha1-armv8-aarch64-ce.S @@ -194,7 +194,7 @@ _gcry_sha1_transform_armv8_ce: .Ldo_nothing: mov x0, #0 - ret + ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_sha1_transform_armv8_ce,.-_gcry_sha1_transform_armv8_ce;) diff --git a/cipher/sha256-armv8-aarch64-ce.S b/cipher/sha256-armv8-aarch64-ce.S index 5c39e83e..d0fa6285 100644 --- a/cipher/sha256-armv8-aarch64-ce.S +++ b/cipher/sha256-armv8-aarch64-ce.S @@ -208,7 +208,7 @@ _gcry_sha256_transform_armv8_ce: .Ldo_nothing: mov x0, #0 - ret + ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_sha256_transform_armv8_ce,.-_gcry_sha256_transform_armv8_ce;) diff --git a/cipher/sm3-aarch64.S b/cipher/sm3-aarch64.S index 77dba2ba..3fb89006 100644 --- a/cipher/sm3-aarch64.S +++ b/cipher/sm3-aarch64.S @@ -650,7 +650,7 @@ _gcry_sm3_transform_aarch64: CFI_ADJUST_CFA_OFFSET(-16); CFI_RESTORE(x28); CFI_RESTORE(x29); - ret + ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_sm3_transform_aarch64, .-_gcry_sm3_transform_aarch64;) diff --git a/cipher/twofish-aarch64.S b/cipher/twofish-aarch64.S index 9f35b5cd..7941fe3a 100644 --- a/cipher/twofish-aarch64.S +++ b/cipher/twofish-aarch64.S @@ -262,7 +262,7 @@ _gcry_twofish_arm_encrypt_block: str_output_le(RDST, RC, RD, RA, RB, RT0, RT1); - ret; + ret_spec_stop; CFI_ENDPROC(); .ltorg ELF(.size _gcry_twofish_arm_encrypt_block,.-_gcry_twofish_arm_encrypt_block;) @@ -313,7 +313,7 @@ _gcry_twofish_arm_decrypt_block: str_output_le(RDST, RA, RB, RC, RD, RT0, RT1); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_twofish_arm_decrypt_block,.-_gcry_twofish_arm_decrypt_block;) diff --git a/mpi/aarch64/mpih-add1.S b/mpi/aarch64/mpih-add1.S index cc356bce..24859b17 100644 --- a/mpi/aarch64/mpih-add1.S +++ b/mpi/aarch64/mpih-add1.S @@ -69,6 +69,6 @@ C_SYMBOL_NAME(_gcry_mpih_add_n): .Lend: adc x0, xzr, xzr; - ret; + ret_spec_stop; CFI_ENDPROC() ELF(.size C_SYMBOL_NAME(_gcry_mpih_add_n),.-C_SYMBOL_NAME(_gcry_mpih_add_n);) diff --git a/mpi/aarch64/mpih-mul1.S b/mpi/aarch64/mpih-mul1.S index 0db54444..f34c13c5 100644 --- a/mpi/aarch64/mpih-mul1.S +++ b/mpi/aarch64/mpih-mul1.S @@ -94,6 +94,6 @@ C_SYMBOL_NAME(_gcry_mpih_mul_1): .Lend: mov x0, x4; - ret; + ret_spec_stop; CFI_ENDPROC() ELF(.size C_SYMBOL_NAME(_gcry_mpih_mul_1),.-C_SYMBOL_NAME(_gcry_mpih_mul_1);) diff --git a/mpi/aarch64/mpih-mul2.S b/mpi/aarch64/mpih-mul2.S index b4cc6eeb..1880999d 
100644 --- a/mpi/aarch64/mpih-mul2.S +++ b/mpi/aarch64/mpih-mul2.S @@ -106,6 +106,6 @@ C_SYMBOL_NAME(_gcry_mpih_addmul_1): .Lend: mov x0, x6; - ret; + ret_spec_stop; CFI_ENDPROC() ELF(.size C_SYMBOL_NAME(_gcry_mpih_addmul_1),.-C_SYMBOL_NAME(_gcry_mpih_addmul_1);) diff --git a/mpi/aarch64/mpih-mul3.S b/mpi/aarch64/mpih-mul3.S index 47a189b6..e5faeddc 100644 --- a/mpi/aarch64/mpih-mul3.S +++ b/mpi/aarch64/mpih-mul3.S @@ -115,10 +115,10 @@ C_SYMBOL_NAME(_gcry_mpih_submul_1): cbnz w2, .Large_loop; mov x0, x7; - ret; + ret_spec_stop; .Loop_end: cinc x0, x7, cc; - ret; + ret_spec_stop; CFI_ENDPROC() ELF(.size C_SYMBOL_NAME(_gcry_mpih_submul_1),.-C_SYMBOL_NAME(_gcry_mpih_submul_1);) diff --git a/mpi/aarch64/mpih-sub1.S b/mpi/aarch64/mpih-sub1.S index 16b6c004..46908286 100644 --- a/mpi/aarch64/mpih-sub1.S +++ b/mpi/aarch64/mpih-sub1.S @@ -69,6 +69,6 @@ C_SYMBOL_NAME(_gcry_mpih_sub_n): .Lend: cset x0, cc; - ret; + ret_spec_stop; CFI_ENDPROC() ELF(.size C_SYMBOL_NAME(_gcry_mpih_sub_n),.-C_SYMBOL_NAME(_gcry_mpih_sub_n);) -- 2.32.0 From jussi.kivilinna at iki.fi Sat Jan 8 21:13:39 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 8 Jan 2022 22:13:39 +0200 Subject: [PATCH 4/4] mpi/config.links: merge i586 targets with rest i*86 targets In-Reply-To: <20220108201339.360118-1-jussi.kivilinna@iki.fi> References: <20220108201339.360118-1-jussi.kivilinna@iki.fi> Message-ID: <20220108201339.360118-4-jussi.kivilinna@iki.fi> * mpi/config.links: Merge i586 targets with rest i[3467]86 targets. -- Signed-off-by: Jussi Kivilinna --- mpi/config.links | 64 ++++++++++++------------------------------------ 1 file changed, 15 insertions(+), 49 deletions(-) diff --git a/mpi/config.links b/mpi/config.links index deb98bf0..8cd6657e 100644 --- a/mpi/config.links +++ b/mpi/config.links @@ -50,26 +50,14 @@ case "${host}" in path="" mpi_cpu_arch="x86" ;; - i[3467]86*-*-openbsd* | \ - i[3467]86*-*-freebsd*-elf | \ - i[3467]86*-*-freebsd[3-9]* | \ - i[3467]86*-*-freebsd[12][0-9]*| \ - i[3467]86*-*-freebsdelf* | \ - i[3467]86*-*-netbsd* | \ - i[3467]86*-*-k*bsd*) - echo '#define ELF_SYNTAX' >>./mpi/asm-syntax.h - cat $srcdir/mpi/i386/syntax.h >>./mpi/asm-syntax.h - path="i386" - mpi_cpu_arch="x86" - ;; - i586*-*-openbsd* | \ - i586*-*-freebsd*-elf | \ - i586*-*-freebsd[3-9]* | \ - i586*-*-freebsd[12][0-9]*| \ - i586*-*-freebsdelf* | \ - i586*-*-netbsd* | \ - i586*-*-k*bsd* | \ - pentium-*-netbsd* | \ + i[34567]86*-*-openbsd* | \ + i[34567]86*-*-freebsd*-elf | \ + i[34567]86*-*-freebsd[3-9]* | \ + i[34567]86*-*-freebsd[12][0-9]*| \ + i[34567]86*-*-freebsdelf* | \ + i[34567]86*-*-netbsd* | \ + i[34567]86*-*-k*bsd* | \ + pentium-*-netbsd* | \ pentiumpro-*-netbsd*) echo '#define ELF_SYNTAX' >>./mpi/asm-syntax.h cat $srcdir/mpi/i386/syntax.h >>./mpi/asm-syntax.h @@ -82,46 +70,24 @@ case "${host}" in path="i386" mpi_cpu_arch="x86" ;; - i[3467]86*-*-linuxaout* | \ - i[3467]86*-*-linuxoldld* | \ - i[3467]86*-*-*bsd*) - echo '#define BSD_SYNTAX' >>./mpi/asm-syntax.h - echo '#define X86_BROKEN_ALIGN' >>./mpi/asm-syntax.h - cat $srcdir/mpi/i386/syntax.h >>./mpi/asm-syntax.h - path="i386" - mpi_cpu_arch="x86" - ;; - i586*-*-linuxaout* | \ - i586*-*-linuxoldld* | \ - i586*-*-*bsd*) + i[34567]86*-*-linuxaout* | \ + i[34567]86*-*-linuxoldld* | \ + i[34567]86*-*-*bsd*) echo '#define BSD_SYNTAX' >>./mpi/asm-syntax.h echo '#define X86_BROKEN_ALIGN' >>./mpi/asm-syntax.h cat $srcdir/mpi/i386/syntax.h >>./mpi/asm-syntax.h path="i386" mpi_cpu_arch="x86" ;; - i[3467]86*-msdosdjgpp* | \ - i[34]86*-apple-darwin*) + 
i[34567]86*-msdosdjgpp* | \ + i[34567]86*-apple-darwin*) echo '#define BSD_SYNTAX' >>./mpi/asm-syntax.h cat $srcdir/mpi/i386/syntax.h >>./mpi/asm-syntax.h path="i386" mpi_cpu_arch="x86" ;; - i586*-msdosdjgpp* | \ - i[567]86*-apple-darwin*) - echo '#define BSD_SYNTAX' >>./mpi/asm-syntax.h - cat $srcdir/mpi/i386/syntax.h >>./mpi/asm-syntax.h - path="i386" - mpi_cpu_arch="x86" - ;; - i[3467]86*-*-*) - echo '#define ELF_SYNTAX' >>./mpi/asm-syntax.h - cat $srcdir/mpi/i386/syntax.h >>./mpi/asm-syntax.h - path="i386" - mpi_cpu_arch="x86" - ;; - i586*-*-* | \ - pentium-*-* | \ + i[34567]86*-*-* | \ + pentium-*-* | \ pentiumpro-*-*) echo '#define ELF_SYNTAX' >>./mpi/asm-syntax.h cat $srcdir/mpi/i386/syntax.h >>./mpi/asm-syntax.h -- 2.32.0 From jussi.kivilinna at iki.fi Sat Jan 8 21:13:36 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 8 Jan 2022 22:13:36 +0200 Subject: [PATCH 1/4] Add straight-line speculation hardening for amd64 and i386 assembly Message-ID: <20220108201339.360118-1-jussi.kivilinna@iki.fi> * cipher/asm-common-amd64.h (ret_spec_stop): New. * cipher/arcfour-amd64.S: Use 'ret_spec_stop' for 'ret' instruction. * cipher/blake2b-amd64-avx2.S: Likewise. * cipher/blake2s-amd64-avx.S: Likewise. * cipher/blowfish-amd64.S: Likewise. * cipher/camellia-aesni-avx-amd64.S: Likewise. * cipher/camellia-aesni-avx2-amd64.h: Likewise. * cipher/cast5-amd64.S: Likewise. * cipher/chacha20-amd64-avx2.S: Likewise. * cipher/chacha20-amd64-ssse3.S: Likewise. * cipher/des-amd64.S: Likewise. * cipher/rijndael-aarch64.S: Likewise. * cipher/rijndael-amd64.S: Likewise. * cipher/rijndael-ssse3-amd64-asm.S: Likewise. * cipher/rijndael-vaes-avx2-amd64.S: Likewise. * cipher/salsa20-amd64.S: Likewise. * cipher/serpent-avx2-amd64.S: Likewise. * cipher/serpent-sse2-amd64.S: Likewise. * cipher/sha1-avx-amd64.S: Likewise. * cipher/sha1-avx-bmi2-amd64.S: Likewise. * cipher/sha1-avx2-bmi2-amd64.S: Likewise. * cipher/sha1-ssse3-amd64.S: Likewise. * cipher/sha256-avx-amd64.S: Likewise. * cipher/sha256-avx2-bmi2-amd64.S: Likewise. * cipher/sha256-ssse3-amd64.S: Likewise. * cipher/sha512-avx-amd64.S: Likewise. * cipher/sha512-avx2-bmi2-amd64.S: Likewise. * cipher/sha512-ssse3-amd64.S: Likewise. * cipher/sm3-avx-bmi2-amd64.S: Likewise. * cipher/sm4-aesni-avx-amd64.S: Likewise. * cipher/sm4-aesni-avx2-amd64.S: Likewise. * cipher/twofish-amd64.S: Likewise. * cipher/twofish-avx2-amd64.S: Likewise. * cipher/whirlpool-sse2-amd64.S: Likewise. * mpi/amd64/func_abi.h (CFI_*): Remove, include from "asm-common-amd64.h" instead. (FUNC_EXIT): Use 'ret_spec_stop' for 'ret' instruction. * mpi/asm-common-amd64.h: New. * mpi/i386/mpih-add1.S: Use 'ret_spec_stop' for 'ret' instruction. * mpi/i386/mpih-lshift.S: Likewise. * mpi/i386/mpih-mul1.S: Likewise. * mpi/i386/mpih-mul2.S: Likewise. * mpi/i386/mpih-mul3.S: Likewise. * mpi/i386/mpih-rshift.S: Likewise. * mpi/i386/mpih-sub1.S: Likewise. * mpi/i386/syntax.h (ret_spec_stop): New. 
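
Same idea as in the aarch64 patch, adapted to x86: the 'ret_spec_stop'
macro added to asm-common-amd64.h below follows every 'ret' with a
'jmp .; int3' pair that normal execution never reaches (mpi/i386/syntax.h
gains an equivalent for the i386 code), so straight-line speculation past
a return either spins on the self-branch or hits the trap instead of
running into whatever code happens to follow. A minimal sketch, again
with a made-up function name, of an epilogue after the change:

	example_fn:
		xorl	%eax, %eax
		ret			/* architectural return */
		jmp	.		/* unreachable; confines straight-line speculation */
		int3			/* ...or traps it outright */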
-- Signed-off-by: Jussi Kivilinna --- cipher/arcfour-amd64.S | 2 +- cipher/asm-common-amd64.h | 4 ++++ cipher/blake2b-amd64-avx2.S | 2 +- cipher/blake2s-amd64-avx.S | 2 +- cipher/blowfish-amd64.S | 18 +++++++++--------- cipher/camellia-aesni-avx-amd64.S | 20 ++++++++++---------- cipher/camellia-aesni-avx2-amd64.h | 16 ++++++++-------- cipher/cast5-amd64.S | 14 +++++++------- cipher/chacha20-amd64-avx2.S | 4 ++-- cipher/chacha20-amd64-ssse3.S | 8 ++++---- cipher/des-amd64.S | 10 +++++----- cipher/rijndael-amd64.S | 4 ++-- cipher/rijndael-ssse3-amd64-asm.S | 18 +++++++++--------- cipher/rijndael-vaes-avx2-amd64.S | 14 +++++++------- cipher/salsa20-amd64.S | 6 +++--- cipher/serpent-avx2-amd64.S | 16 ++++++++-------- cipher/serpent-sse2-amd64.S | 16 ++++++++-------- cipher/sha1-avx-amd64.S | 2 +- cipher/sha1-avx-bmi2-amd64.S | 2 +- cipher/sha1-avx2-bmi2-amd64.S | 2 +- cipher/sha1-ssse3-amd64.S | 2 +- cipher/sha256-avx-amd64.S | 2 +- cipher/sha256-avx2-bmi2-amd64.S | 2 +- cipher/sha256-ssse3-amd64.S | 2 +- cipher/sha512-avx-amd64.S | 2 +- cipher/sha512-avx2-bmi2-amd64.S | 2 +- cipher/sha512-ssse3-amd64.S | 2 +- cipher/sm3-avx-bmi2-amd64.S | 2 +- cipher/sm4-aesni-avx-amd64.S | 20 ++++++++++---------- cipher/sm4-aesni-avx2-amd64.S | 14 +++++++------- cipher/twofish-amd64.S | 20 ++++++++++---------- cipher/twofish-avx2-amd64.S | 16 ++++++++-------- cipher/whirlpool-sse2-amd64.S | 2 +- mpi/amd64/func_abi.h | 28 +++------------------------- mpi/asm-common-amd64.h | 26 ++++++++++++++++++++++++++ mpi/i386/mpih-add1.S | 2 +- mpi/i386/mpih-lshift.S | 4 ++-- mpi/i386/mpih-mul1.S | 2 +- mpi/i386/mpih-mul2.S | 2 +- mpi/i386/mpih-mul3.S | 2 +- mpi/i386/mpih-rshift.S | 4 ++-- mpi/i386/mpih-sub1.S | 2 +- mpi/i386/syntax.h | 6 ++++++ 43 files changed, 180 insertions(+), 166 deletions(-) create mode 100644 mpi/asm-common-amd64.h diff --git a/cipher/arcfour-amd64.S b/cipher/arcfour-amd64.S index 221dfeff..2abd90a7 100644 --- a/cipher/arcfour-amd64.S +++ b/cipher/arcfour-amd64.S @@ -99,7 +99,7 @@ _gcry_arcfour_amd64: pop %rbp CFI_POP(%rbp) EXIT_SYSV_FUNC - ret + ret_spec_stop CFI_ENDPROC() .L__gcry_arcfour_amd64_end: ELF(.size _gcry_arcfour_amd64,.L__gcry_arcfour_amd64_end-_gcry_arcfour_amd64) diff --git a/cipher/asm-common-amd64.h b/cipher/asm-common-amd64.h index 9d4a028a..8ee9d9e7 100644 --- a/cipher/asm-common-amd64.h +++ b/cipher/asm-common-amd64.h @@ -186,4 +186,8 @@ # define EXIT_SYSV_FUNC #endif +/* 'ret' instruction replacement for straight-line speculation mitigation */ +#define ret_spec_stop \ + ret; jmp .; int3; + #endif /* GCRY_ASM_COMMON_AMD64_H */ diff --git a/cipher/blake2b-amd64-avx2.S b/cipher/blake2b-amd64-avx2.S index 357e8a51..3601b65f 100644 --- a/cipher/blake2b-amd64-avx2.S +++ b/cipher/blake2b-amd64-avx2.S @@ -291,7 +291,7 @@ _gcry_blake2b_transform_amd64_avx2: xor %eax, %eax; vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_blake2b_transform_amd64_avx2, .-_gcry_blake2b_transform_amd64_avx2;) diff --git a/cipher/blake2s-amd64-avx.S b/cipher/blake2s-amd64-avx.S index 5b936758..5094b4c1 100644 --- a/cipher/blake2s-amd64-avx.S +++ b/cipher/blake2s-amd64-avx.S @@ -269,7 +269,7 @@ _gcry_blake2s_transform_amd64_avx: xor %eax, %eax; vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_blake2s_transform_amd64_avx, .-_gcry_blake2s_transform_amd64_avx;) diff --git a/cipher/blowfish-amd64.S b/cipher/blowfish-amd64.S index bdb361d7..2b4ffa1a 100644 --- a/cipher/blowfish-amd64.S +++ b/cipher/blowfish-amd64.S @@ -151,7 +151,7 @@ __blowfish_enc_blk1: movq %r11, %rbp; 
CFI_RESTORE(%rbp) - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size __blowfish_enc_blk1,.-__blowfish_enc_blk1;) @@ -182,7 +182,7 @@ _gcry_blowfish_amd64_do_encrypt: movl RX0d, (RX2); EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_blowfish_amd64_do_encrypt,.-_gcry_blowfish_amd64_do_encrypt;) @@ -210,7 +210,7 @@ _gcry_blowfish_amd64_encrypt_block: write_block(); EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_blowfish_amd64_encrypt_block,.-_gcry_blowfish_amd64_encrypt_block;) @@ -253,7 +253,7 @@ _gcry_blowfish_amd64_decrypt_block: CFI_RESTORE(%rbp); EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_blowfish_amd64_decrypt_block,.-_gcry_blowfish_amd64_decrypt_block;) @@ -367,7 +367,7 @@ __blowfish_enc_blk4: outbswap_block4(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size __blowfish_enc_blk4,.-__blowfish_enc_blk4;) @@ -398,7 +398,7 @@ __blowfish_dec_blk4: outbswap_block4(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size __blowfish_dec_blk4,.-__blowfish_dec_blk4;) @@ -468,7 +468,7 @@ _gcry_blowfish_amd64_ctr_enc: CFI_POP(%rbp); EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_blowfish_amd64_ctr_enc,.-_gcry_blowfish_amd64_ctr_enc;) @@ -529,7 +529,7 @@ _gcry_blowfish_amd64_cbc_dec: CFI_POP(%rbp); EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_blowfish_amd64_cbc_dec,.-_gcry_blowfish_amd64_cbc_dec;) @@ -593,7 +593,7 @@ _gcry_blowfish_amd64_cfb_dec: CFI_POP(%rbp); EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_blowfish_amd64_cfb_dec,.-_gcry_blowfish_amd64_cfb_dec;) diff --git a/cipher/camellia-aesni-avx-amd64.S b/cipher/camellia-aesni-avx-amd64.S index 64cabaa5..5c304e57 100644 --- a/cipher/camellia-aesni-avx-amd64.S +++ b/cipher/camellia-aesni-avx-amd64.S @@ -822,7 +822,7 @@ __camellia_enc_blk16: %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, ((key_table) + 8 * 8)(%r8), (%rax), 1 * 16(%rax)); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size __camellia_enc_blk16,.-__camellia_enc_blk16;) @@ -887,7 +887,7 @@ __camellia_dec_blk16: %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, (key_table)(CTX), (%rax), 1 * 16(%rax)); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size __camellia_dec_blk16,.-__camellia_dec_blk16;) @@ -1021,7 +1021,7 @@ _gcry_camellia_aesni_avx_ctr_enc: leave; CFI_LEAVE(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx_ctr_enc,.-_gcry_camellia_aesni_avx_ctr_enc;) @@ -1094,7 +1094,7 @@ _gcry_camellia_aesni_avx_cbc_dec: leave; CFI_LEAVE(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx_cbc_dec,.-_gcry_camellia_aesni_avx_cbc_dec;) @@ -1176,7 +1176,7 @@ _gcry_camellia_aesni_avx_cfb_dec: leave; CFI_LEAVE(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx_cfb_dec,.-_gcry_camellia_aesni_avx_cfb_dec;) @@ -1328,7 +1328,7 @@ _gcry_camellia_aesni_avx_ocb_enc: leave; CFI_LEAVE(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx_ocb_enc,.-_gcry_camellia_aesni_avx_ocb_enc;) @@ -1499,7 +1499,7 @@ _gcry_camellia_aesni_avx_ocb_dec: leave; CFI_LEAVE(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx_ocb_dec,.-_gcry_camellia_aesni_avx_ocb_dec;) @@ -1647,7 +1647,7 @@ _gcry_camellia_aesni_avx_ocb_auth: leave; CFI_LEAVE(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx_ocb_auth,.-_gcry_camellia_aesni_avx_ocb_auth;) @@ -2096,7 +2096,7 @@ __camellia_avx_setup128: vzeroall; - ret; + ret_spec_stop; 
CFI_ENDPROC(); ELF(.size __camellia_avx_setup128,.-__camellia_avx_setup128;) @@ -2576,7 +2576,7 @@ __camellia_avx_setup256: vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size __camellia_avx_setup256,.-__camellia_avx_setup256;) diff --git a/cipher/camellia-aesni-avx2-amd64.h b/cipher/camellia-aesni-avx2-amd64.h index be7bb0aa..e93c40b8 100644 --- a/cipher/camellia-aesni-avx2-amd64.h +++ b/cipher/camellia-aesni-avx2-amd64.h @@ -815,7 +815,7 @@ __camellia_enc_blk32: %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, ((key_table) + 8 * 8)(%r8), (%rax), 1 * 32(%rax)); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size __camellia_enc_blk32,.-__camellia_enc_blk32;) @@ -880,7 +880,7 @@ __camellia_dec_blk32: %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, (key_table)(CTX), (%rax), 1 * 32(%rax)); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size __camellia_dec_blk32,.-__camellia_dec_blk32;) @@ -1084,7 +1084,7 @@ FUNC_NAME(ctr_enc): leave; CFI_LEAVE(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size FUNC_NAME(ctr_enc),.-FUNC_NAME(ctr_enc);) @@ -1161,7 +1161,7 @@ FUNC_NAME(cbc_dec): leave; CFI_LEAVE(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size FUNC_NAME(cbc_dec),.-FUNC_NAME(cbc_dec);) @@ -1245,7 +1245,7 @@ FUNC_NAME(cfb_dec): leave; CFI_LEAVE(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size FUNC_NAME(cfb_dec),.-FUNC_NAME(cfb_dec);) @@ -1419,7 +1419,7 @@ FUNC_NAME(ocb_enc): leave; CFI_LEAVE(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size FUNC_NAME(ocb_enc),.-FUNC_NAME(ocb_enc);) @@ -1616,7 +1616,7 @@ FUNC_NAME(ocb_dec): leave; CFI_LEAVE(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size FUNC_NAME(ocb_dec),.-FUNC_NAME(ocb_dec);) @@ -1787,7 +1787,7 @@ FUNC_NAME(ocb_auth): leave; CFI_LEAVE(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size FUNC_NAME(ocb_auth),.-FUNC_NAME(ocb_auth);) diff --git a/cipher/cast5-amd64.S b/cipher/cast5-amd64.S index 82f67890..a804654c 100644 --- a/cipher/cast5-amd64.S +++ b/cipher/cast5-amd64.S @@ -219,7 +219,7 @@ _gcry_cast5_amd64_encrypt_block: CFI_POP(%rbp); EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_cast5_amd64_encrypt_block,.-_gcry_cast5_amd64_encrypt_block;) @@ -269,7 +269,7 @@ _gcry_cast5_amd64_decrypt_block: CFI_POP(%rbp); EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_cast5_amd64_decrypt_block,.-_gcry_cast5_amd64_decrypt_block;) @@ -399,7 +399,7 @@ __cast5_enc_blk4: round_enc_last4(14, F4_3, F4_1); outbswap_block4(RLR0, RLR1, RLR2, RLR3); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size __cast5_enc_blk4,.-__cast5_enc_blk4;) @@ -432,7 +432,7 @@ __cast5_dec_blk4: outbswap_block4(RLR0, RLR1, RLR2, RLR3); CFI_ENDPROC(); - ret; + ret_spec_stop; ELF(.size __cast5_dec_blk4,.-__cast5_dec_blk4;) .align 8 @@ -508,7 +508,7 @@ _gcry_cast5_amd64_ctr_enc: CFI_POP(%rbp); EXIT_SYSV_FUNC - ret + ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_cast5_amd64_ctr_enc,.-_gcry_cast5_amd64_ctr_enc;) @@ -582,7 +582,7 @@ _gcry_cast5_amd64_cbc_dec: CFI_POP(%rbp); EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_cast5_amd64_cbc_dec,.-_gcry_cast5_amd64_cbc_dec;) @@ -655,7 +655,7 @@ _gcry_cast5_amd64_cfb_dec: CFI_POP(%rbp); EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_cast5_amd64_cfb_dec,.-_gcry_cast5_amd64_cfb_dec;) diff --git a/cipher/chacha20-amd64-avx2.S b/cipher/chacha20-amd64-avx2.S index 51e107be..9f2a036a 100644 --- a/cipher/chacha20-amd64-avx2.S +++ b/cipher/chacha20-amd64-avx2.S @@ -322,7 +322,7 @@ _gcry_chacha20_amd64_avx2_blocks8: /* eax zeroed by 
round loop. */ leave; CFI_LEAVE(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_chacha20_amd64_avx2_blocks8, .-_gcry_chacha20_amd64_avx2_blocks8;) @@ -592,7 +592,7 @@ _gcry_chacha20_poly1305_amd64_avx2_blocks8: xorl %eax, %eax; leave; CFI_LEAVE(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_chacha20_poly1305_amd64_avx2_blocks8, .-_gcry_chacha20_poly1305_amd64_avx2_blocks8;) diff --git a/cipher/chacha20-amd64-ssse3.S b/cipher/chacha20-amd64-ssse3.S index 9cdb69ae..6c737978 100644 --- a/cipher/chacha20-amd64-ssse3.S +++ b/cipher/chacha20-amd64-ssse3.S @@ -333,7 +333,7 @@ _gcry_chacha20_amd64_ssse3_blocks4: /* eax zeroed by round loop. */ leave; CFI_LEAVE(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_chacha20_amd64_ssse3_blocks4, .-_gcry_chacha20_amd64_ssse3_blocks4;) @@ -502,7 +502,7 @@ _gcry_chacha20_amd64_ssse3_blocks1: clear(X13); /* eax zeroed by round loop. */ - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_chacha20_amd64_ssse3_blocks1, .-_gcry_chacha20_amd64_ssse3_blocks1;) @@ -772,7 +772,7 @@ _gcry_chacha20_poly1305_amd64_ssse3_blocks4: xorl %eax, %eax; leave; CFI_LEAVE(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_chacha20_poly1305_amd64_ssse3_blocks4, .-_gcry_chacha20_poly1305_amd64_ssse3_blocks4;) @@ -1003,7 +1003,7 @@ _gcry_chacha20_poly1305_amd64_ssse3_blocks1: xorl %eax, %eax; leave; CFI_LEAVE(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_chacha20_poly1305_amd64_ssse3_blocks1, .-_gcry_chacha20_poly1305_amd64_ssse3_blocks1;) diff --git a/cipher/des-amd64.S b/cipher/des-amd64.S index a211dac3..c1bf9f29 100644 --- a/cipher/des-amd64.S +++ b/cipher/des-amd64.S @@ -285,7 +285,7 @@ _gcry_3des_amd64_crypt_block: CFI_POP(%rbp); EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_3des_amd64_crypt_block,.-_gcry_3des_amd64_crypt_block;) @@ -544,7 +544,7 @@ _gcry_3des_amd64_crypt_blk3: final_permutation3(RR, RL); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_3des_amd64_crypt_blk3,.-_gcry_3des_amd64_crypt_blk3;) @@ -642,7 +642,7 @@ _gcry_3des_amd64_cbc_dec: CFI_POP(%rbp); EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_3des_amd64_cbc_dec,.-_gcry_3des_amd64_cbc_dec;) @@ -740,7 +740,7 @@ _gcry_3des_amd64_ctr_enc: CFI_POP(%rbp); EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_3des_amd64_cbc_dec,.-_gcry_3des_amd64_cbc_dec;) @@ -837,7 +837,7 @@ _gcry_3des_amd64_cfb_dec: CFI_POP(%rbp); EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_3des_amd64_cfb_dec,.-_gcry_3des_amd64_cfb_dec;) diff --git a/cipher/rijndael-amd64.S b/cipher/rijndael-amd64.S index 3dcaa856..6e3cc819 100644 --- a/cipher/rijndael-amd64.S +++ b/cipher/rijndael-amd64.S @@ -270,7 +270,7 @@ _gcry_aes_amd64_encrypt_block: movl $(6 * 8), %eax; EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_RESTORE_STATE(); .align 4 @@ -448,7 +448,7 @@ _gcry_aes_amd64_decrypt_block: movl $(6 * 8), %eax; EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_RESTORE_STATE(); .align 4 diff --git a/cipher/rijndael-ssse3-amd64-asm.S b/cipher/rijndael-ssse3-amd64-asm.S index 8124eb21..b98dca26 100644 --- a/cipher/rijndael-ssse3-amd64-asm.S +++ b/cipher/rijndael-ssse3-amd64-asm.S @@ -61,7 +61,7 @@ _gcry_aes_ssse3_enc_preload: movdqa .Lk_sb2 (%rax), %xmm15 # sb2u movdqa .Lk_sb2+16(%rax), %xmm14 # sb2t EXIT_SYSV_FUNC - ret + ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_aes_ssse3_enc_preload,.-_gcry_aes_ssse3_enc_preload) @@ -83,7 +83,7 @@ _gcry_aes_ssse3_dec_preload: movdqa .Lk_dsbb (%rax), %xmm14 # sbbu movdqa .Lk_dsbe 
(%rax), %xmm8 # sbeu EXIT_SYSV_FUNC - ret + ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_aes_ssse3_dec_preload,.-_gcry_aes_ssse3_dec_preload) @@ -194,7 +194,7 @@ _aes_encrypt_core: pxor %xmm4, %xmm0 # 0 = A pshufb .Lk_sr(%rsi,%rcx), %xmm0 EXIT_SYSV_FUNC - ret + ret_spec_stop CFI_ENDPROC(); ELF(.size _aes_encrypt_core,.-_aes_encrypt_core) @@ -303,7 +303,7 @@ _aes_decrypt_core: pxor %xmm4, %xmm0 # 0 = A pshufb .Lk_sr(%rsi,%rcx), %xmm0 EXIT_SYSV_FUNC - ret + ret_spec_stop CFI_ENDPROC(); ELF(.size _aes_decrypt_core,.-_aes_decrypt_core) @@ -439,7 +439,7 @@ _aes_schedule_core: pxor %xmm6, %xmm0 # -> b+c+d b+c b a pshufd $0x0E, %xmm0, %xmm6 pslldq $8, %xmm6 # clobber low side with zeros - ret + ret_spec_stop ## ## .Laes_schedule_256 @@ -546,7 +546,7 @@ _aes_schedule_core: # add in smeared stuff pxor %xmm7, %xmm0 movdqa %xmm0, %xmm7 - ret + ret_spec_stop ## ## .Laes_schedule_transform @@ -567,7 +567,7 @@ _aes_schedule_core: movdqa 16(%r11), %xmm0 # hi pshufb %xmm1, %xmm0 pxor %xmm2, %xmm0 - ret + ret_spec_stop ## ## .Laes_schedule_mangle @@ -639,7 +639,7 @@ _aes_schedule_core: add $-16, %r8 and $48, %r8 movdqa %xmm3, (%rdx) - ret + ret_spec_stop ## ## .Laes_schedule_mangle_last @@ -679,7 +679,7 @@ _aes_schedule_core: pxor %xmm7, %xmm7 pxor %xmm8, %xmm8 EXIT_SYSV_FUNC - ret + ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_aes_ssse3_schedule_core,.-_gcry_aes_ssse3_schedule_core) diff --git a/cipher/rijndael-vaes-avx2-amd64.S b/cipher/rijndael-vaes-avx2-amd64.S index d4ecf59f..f94b58db 100644 --- a/cipher/rijndael-vaes-avx2-amd64.S +++ b/cipher/rijndael-vaes-avx2-amd64.S @@ -383,7 +383,7 @@ _gcry_vaes_avx2_cbc_dec_amd64: vmovdqu %xmm15, (%rsi); vzeroall; - ret + ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_vaes_avx2_cbc_dec_amd64,.-_gcry_vaes_avx2_cbc_dec_amd64) @@ -691,7 +691,7 @@ _gcry_vaes_avx2_cfb_dec_amd64: vmovdqu %xmm15, (%rsi); vzeroall; - ret + ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_vaes_avx2_cfb_dec_amd64,.-_gcry_vaes_avx2_cfb_dec_amd64) @@ -1103,7 +1103,7 @@ _gcry_vaes_avx2_ctr_enc_amd64: vzeroall; xorl %r10d, %r10d; xorl %r11d, %r11d; - ret + ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_vaes_avx2_ctr_enc_amd64,.-_gcry_vaes_avx2_ctr_enc_amd64) @@ -1387,7 +1387,7 @@ _gcry_vaes_avx2_ctr32le_enc_amd64: .Ldone_ctr32le_enc: vmovdqu %xmm15, (%rsi); vzeroall; - ret + ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_vaes_avx2_ctr32le_enc_amd64,.-_gcry_vaes_avx2_ctr32le_enc_amd64) @@ -1535,7 +1535,7 @@ _gcry_vaes_avx2_ocb_checksum: .Locb_checksum_done: vpxor (%rax), %xmm0, %xmm0; vmovdqu %xmm0, (%rax); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_vaes_avx2_ocb_checksum,.-_gcry_vaes_avx2_ocb_checksum) @@ -2398,7 +2398,7 @@ _gcry_vaes_avx2_ocb_crypt_amd64: leave; CFI_LEAVE(); - ret + ret_spec_stop #undef STACK_REGS_POS #undef STACK_ALLOC @@ -2919,7 +2919,7 @@ _gcry_vaes_avx2_xts_crypt_amd64: vzeroall; xorl %eax, %eax - ret + ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_vaes_avx2_xts_crypt_amd64,.-_gcry_vaes_avx2_xts_crypt_amd64) diff --git a/cipher/salsa20-amd64.S b/cipher/salsa20-amd64.S index ae8f2715..64626063 100644 --- a/cipher/salsa20-amd64.S +++ b/cipher/salsa20-amd64.S @@ -83,7 +83,7 @@ _gcry_salsa20_amd64_keysetup: movl %ecx,8(%rdi) movl %r8d,12(%rdi) .L_keysetupdone: - ret + ret_spec_stop CFI_ENDPROC(); .align 8 @@ -99,7 +99,7 @@ _gcry_salsa20_amd64_ivsetup: movl %esi,44(%rdi) movl %r9d,32(%rdi) movl %eax,52(%rdi) - ret + ret_spec_stop CFI_ENDPROC(); .align 8 @@ -926,7 +926,7 @@ _gcry_salsa20_amd64_encrypt_blocks: CFI_DEF_CFA_REGISTER(%rsp) pop %rbx CFI_POP(%rbx) - ret + 
ret_spec_stop CFI_RESTORE_STATE(); .L_bytes_are_128_or_192: sub $64,%rdx diff --git a/cipher/serpent-avx2-amd64.S b/cipher/serpent-avx2-amd64.S index dcee9b62..d3515a21 100644 --- a/cipher/serpent-avx2-amd64.S +++ b/cipher/serpent-avx2-amd64.S @@ -487,7 +487,7 @@ __serpent_enc_blk16: transpose_4x4(RA4, RA1, RA2, RA0, RA3, RTMP0, RTMP1); transpose_4x4(RB4, RB1, RB2, RB0, RB3, RTMP0, RTMP1); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size __serpent_enc_blk16,.-__serpent_enc_blk16;) @@ -579,7 +579,7 @@ __serpent_dec_blk16: transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1); transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size __serpent_dec_blk16,.-__serpent_dec_blk16;) @@ -697,7 +697,7 @@ _gcry_serpent_avx2_ctr_enc: vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_serpent_avx2_ctr_enc,.-_gcry_serpent_avx2_ctr_enc;) @@ -750,7 +750,7 @@ _gcry_serpent_avx2_cbc_dec: vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_serpent_avx2_cbc_dec,.-_gcry_serpent_avx2_cbc_dec;) @@ -805,7 +805,7 @@ _gcry_serpent_avx2_cfb_dec: vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_serpent_avx2_cfb_dec,.-_gcry_serpent_avx2_cfb_dec;) @@ -919,7 +919,7 @@ _gcry_serpent_avx2_ocb_enc: vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_serpent_avx2_ocb_enc,.-_gcry_serpent_avx2_ocb_enc;) @@ -1043,7 +1043,7 @@ _gcry_serpent_avx2_ocb_dec: vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_serpent_avx2_ocb_dec,.-_gcry_serpent_avx2_ocb_dec;) @@ -1146,7 +1146,7 @@ _gcry_serpent_avx2_ocb_auth: vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_serpent_avx2_ocb_auth,.-_gcry_serpent_avx2_ocb_auth;) diff --git a/cipher/serpent-sse2-amd64.S b/cipher/serpent-sse2-amd64.S index 39cba002..b5935095 100644 --- a/cipher/serpent-sse2-amd64.S +++ b/cipher/serpent-sse2-amd64.S @@ -509,7 +509,7 @@ __serpent_enc_blk8: transpose_4x4(RA4, RA1, RA2, RA0, RA3, RTMP0, RTMP1); transpose_4x4(RB4, RB1, RB2, RB0, RB3, RTMP0, RTMP1); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size __serpent_enc_blk8,.-__serpent_enc_blk8;) @@ -601,7 +601,7 @@ __serpent_dec_blk8: transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1); transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size __serpent_dec_blk8,.-__serpent_dec_blk8;) @@ -733,7 +733,7 @@ _gcry_serpent_sse2_ctr_enc: pxor RTMP2, RTMP2; pxor RNOT, RNOT; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_serpent_sse2_ctr_enc,.-_gcry_serpent_sse2_ctr_enc;) @@ -796,7 +796,7 @@ _gcry_serpent_sse2_cbc_dec: pxor RTMP2, RTMP2; pxor RNOT, RNOT; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_serpent_sse2_cbc_dec,.-_gcry_serpent_sse2_cbc_dec;) @@ -862,7 +862,7 @@ _gcry_serpent_sse2_cfb_dec: pxor RTMP2, RTMP2; pxor RNOT, RNOT; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_serpent_sse2_cfb_dec,.-_gcry_serpent_sse2_cfb_dec;) @@ -976,7 +976,7 @@ _gcry_serpent_sse2_ocb_enc: pxor RTMP2, RTMP2; pxor RNOT, RNOT; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_serpent_sse2_ocb_enc,.-_gcry_serpent_sse2_ocb_enc;) @@ -1100,7 +1100,7 @@ _gcry_serpent_sse2_ocb_dec: pxor RTMP2, RTMP2; pxor RNOT, RNOT; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_serpent_sse2_ocb_dec,.-_gcry_serpent_sse2_ocb_dec;) @@ -1203,7 +1203,7 @@ _gcry_serpent_sse2_ocb_auth: pxor RTMP2, RTMP2; pxor RNOT, RNOT; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_serpent_sse2_ocb_auth,.-_gcry_serpent_sse2_ocb_auth;) diff --git 
a/cipher/sha1-avx-amd64.S b/cipher/sha1-avx-amd64.S index 85876ad4..acada960 100644 --- a/cipher/sha1-avx-amd64.S +++ b/cipher/sha1-avx-amd64.S @@ -420,7 +420,7 @@ _gcry_sha1_transform_amd64_avx: xorl %eax, %eax; .Lret: - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_sha1_transform_amd64_avx, .-_gcry_sha1_transform_amd64_avx;) diff --git a/cipher/sha1-avx-bmi2-amd64.S b/cipher/sha1-avx-bmi2-amd64.S index 5dfcdca9..5f4b9e69 100644 --- a/cipher/sha1-avx-bmi2-amd64.S +++ b/cipher/sha1-avx-bmi2-amd64.S @@ -432,7 +432,7 @@ _gcry_sha1_transform_amd64_avx_bmi2: xorl %eax, %eax; .Lret: - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_sha1_transform_amd64_avx_bmi2, .-_gcry_sha1_transform_amd64_avx_bmi2;) diff --git a/cipher/sha1-avx2-bmi2-amd64.S b/cipher/sha1-avx2-bmi2-amd64.S index 93863230..ed52761b 100644 --- a/cipher/sha1-avx2-bmi2-amd64.S +++ b/cipher/sha1-avx2-bmi2-amd64.S @@ -564,7 +564,7 @@ _gcry_sha1_transform_amd64_avx2_bmi2: /* stack already burned */ xorl %eax, %eax; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_sha1_transform_amd64_avx2_bmi2, .-_gcry_sha1_transform_amd64_avx2_bmi2;) diff --git a/cipher/sha1-ssse3-amd64.S b/cipher/sha1-ssse3-amd64.S index db62928a..f09b1de1 100644 --- a/cipher/sha1-ssse3-amd64.S +++ b/cipher/sha1-ssse3-amd64.S @@ -428,7 +428,7 @@ _gcry_sha1_transform_amd64_ssse3: xorl %eax, %eax; .Lret: - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_sha1_transform_amd64_ssse3, .-_gcry_sha1_transform_amd64_ssse3;) diff --git a/cipher/sha256-avx-amd64.S b/cipher/sha256-avx-amd64.S index ec945f84..be8a799d 100644 --- a/cipher/sha256-avx-amd64.S +++ b/cipher/sha256-avx-amd64.S @@ -471,7 +471,7 @@ _gcry_sha256_transform_amd64_avx: pop rbx CFI_POP(rbx) - ret + ret_spec_stop CFI_ENDPROC() diff --git a/cipher/sha256-avx2-bmi2-amd64.S b/cipher/sha256-avx2-bmi2-amd64.S index d130dd4a..60ad442c 100644 --- a/cipher/sha256-avx2-bmi2-amd64.S +++ b/cipher/sha256-avx2-bmi2-amd64.S @@ -474,7 +474,7 @@ _gcry_sha256_transform_amd64_avx2: CFI_POP(rbx) .Lnowork: - ret + ret_spec_stop CFI_ENDPROC() .align 64 diff --git a/cipher/sha256-ssse3-amd64.S b/cipher/sha256-ssse3-amd64.S index 098b0eb6..401ff6f4 100644 --- a/cipher/sha256-ssse3-amd64.S +++ b/cipher/sha256-ssse3-amd64.S @@ -493,7 +493,7 @@ _gcry_sha256_transform_amd64_ssse3: pop rbx CFI_POP(rbx) - ret + ret_spec_stop CFI_ENDPROC() diff --git a/cipher/sha512-avx-amd64.S b/cipher/sha512-avx-amd64.S index 75f7b070..bfc4435d 100644 --- a/cipher/sha512-avx-amd64.S +++ b/cipher/sha512-avx-amd64.S @@ -400,7 +400,7 @@ _gcry_sha512_transform_amd64_avx: CFI_ADJUST_CFA_OFFSET(-frame_size); .Lnowork: - ret + ret_spec_stop CFI_ENDPROC() /* diff --git a/cipher/sha512-avx2-bmi2-amd64.S b/cipher/sha512-avx2-bmi2-amd64.S index 7f119e6c..a431e196 100644 --- a/cipher/sha512-avx2-bmi2-amd64.S +++ b/cipher/sha512-avx2-bmi2-amd64.S @@ -439,7 +439,7 @@ _gcry_sha512_transform_amd64_avx2: CFI_DEF_CFA_REGISTER(rsp) .Lnowork: - ret + ret_spec_stop CFI_ENDPROC() /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ diff --git a/cipher/sha512-ssse3-amd64.S b/cipher/sha512-ssse3-amd64.S index 6a1328a6..9cc30892 100644 --- a/cipher/sha512-ssse3-amd64.S +++ b/cipher/sha512-ssse3-amd64.S @@ -406,7 +406,7 @@ _gcry_sha512_transform_amd64_ssse3: CFI_ADJUST_CFA_OFFSET(-frame_size); .Lnowork: - ret + ret_spec_stop CFI_ENDPROC() /* diff --git a/cipher/sm3-avx-bmi2-amd64.S b/cipher/sm3-avx-bmi2-amd64.S index 46226ae6..d9b6206a 100644 --- a/cipher/sm3-avx-bmi2-amd64.S +++ b/cipher/sm3-avx-bmi2-amd64.S @@ -544,7 +544,7 @@ 
_gcry_sm3_transform_amd64_avx_bmi2: leave; CFI_LEAVE(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_sm3_transform_amd64_avx_bmi2, .-_gcry_sm3_transform_amd64_avx_bmi2;) diff --git a/cipher/sm4-aesni-avx-amd64.S b/cipher/sm4-aesni-avx-amd64.S index 3610b98c..7a99e070 100644 --- a/cipher/sm4-aesni-avx-amd64.S +++ b/cipher/sm4-aesni-avx-amd64.S @@ -240,7 +240,7 @@ _gcry_sm4_aesni_avx_expand_key: #undef ROUND vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_sm4_aesni_avx_expand_key,.-_gcry_sm4_aesni_avx_expand_key;) @@ -345,7 +345,7 @@ sm4_aesni_avx_crypt_blk1_4: .Lblk4_store_output_done: vzeroall; xorl %eax, %eax; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size sm4_aesni_avx_crypt_blk1_4,.-sm4_aesni_avx_crypt_blk1_4;) @@ -454,7 +454,7 @@ __sm4_crypt_blk8: vpshufb RTMP2, RB2, RB2; vpshufb RTMP2, RB3, RB3; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size __sm4_crypt_blk8,.-__sm4_crypt_blk8;) @@ -508,7 +508,7 @@ _gcry_sm4_aesni_avx_crypt_blk1_8: .Lblk8_store_output_done: vzeroall; xorl %eax, %eax; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_sm4_aesni_avx_crypt_blk1_8,.-_gcry_sm4_aesni_avx_crypt_blk1_8;) @@ -582,7 +582,7 @@ _gcry_sm4_aesni_avx_ctr_enc: vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_sm4_aesni_avx_ctr_enc,.-_gcry_sm4_aesni_avx_ctr_enc;) @@ -631,7 +631,7 @@ _gcry_sm4_aesni_avx_cbc_dec: vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_sm4_aesni_avx_cbc_dec,.-_gcry_sm4_aesni_avx_cbc_dec;) @@ -683,7 +683,7 @@ _gcry_sm4_aesni_avx_cfb_dec: vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_sm4_aesni_avx_cfb_dec,.-_gcry_sm4_aesni_avx_cfb_dec;) @@ -782,7 +782,7 @@ _gcry_sm4_aesni_avx_ocb_enc: vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_sm4_aesni_avx_ocb_enc,.-_gcry_sm4_aesni_avx_ocb_enc;) @@ -891,7 +891,7 @@ _gcry_sm4_aesni_avx_ocb_dec: vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_sm4_aesni_avx_ocb_dec,.-_gcry_sm4_aesni_avx_ocb_dec;) @@ -979,7 +979,7 @@ _gcry_sm4_aesni_avx_ocb_auth: vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_sm4_aesni_avx_ocb_auth,.-_gcry_sm4_aesni_avx_ocb_auth;) diff --git a/cipher/sm4-aesni-avx2-amd64.S b/cipher/sm4-aesni-avx2-amd64.S index 6e46c0dc..7a8b9558 100644 --- a/cipher/sm4-aesni-avx2-amd64.S +++ b/cipher/sm4-aesni-avx2-amd64.S @@ -276,7 +276,7 @@ __sm4_crypt_blk16: vpshufb RTMP2, RB2, RB2; vpshufb RTMP2, RB3, RB3; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size __sm4_crypt_blk16,.-__sm4_crypt_blk16;) @@ -394,7 +394,7 @@ _gcry_sm4_aesni_avx2_ctr_enc: vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_sm4_aesni_avx2_ctr_enc,.-_gcry_sm4_aesni_avx2_ctr_enc;) @@ -447,7 +447,7 @@ _gcry_sm4_aesni_avx2_cbc_dec: vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_sm4_aesni_avx2_cbc_dec,.-_gcry_sm4_aesni_avx2_cbc_dec;) @@ -502,7 +502,7 @@ _gcry_sm4_aesni_avx2_cfb_dec: vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_sm4_aesni_avx2_cfb_dec,.-_gcry_sm4_aesni_avx2_cfb_dec;) @@ -616,7 +616,7 @@ _gcry_sm4_aesni_avx2_ocb_enc: vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_sm4_aesni_avx2_ocb_enc,.-_gcry_sm4_aesni_avx2_ocb_enc;) @@ -740,7 +740,7 @@ _gcry_sm4_aesni_avx2_ocb_dec: vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_sm4_aesni_avx2_ocb_dec,.-_gcry_sm4_aesni_avx2_ocb_dec;) @@ -843,7 +843,7 @@ _gcry_sm4_aesni_avx2_ocb_auth: vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_sm4_aesni_avx2_ocb_auth,.-_gcry_sm4_aesni_avx2_ocb_auth;) 
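The hunks in this patch all apply the same mechanical change: every function return in the
hand-written assembly goes through the ret_spec_stop macro instead of a bare ret, so a CPU
that speculates straight past the return executes a harmless speculation stop rather than
whatever bytes happen to follow the function. As a minimal illustrative sketch only, reusing
the i386 expansion added in the mpi/i386/syntax.h hunk later in this patch and a hypothetical
function name (the amd64 code takes its ret_spec_stop definition from cipher/asm-common-amd64.h,
whose expansion is not shown in these hunks):

    /* Expansion copied from the mpi/i386/syntax.h hunk later in this patch. */
    #define ret_spec_stop \
            ret; \
            jmp .; \
            int3;

            .text
            .globl  example_return          /* hypothetical function, for illustration only */
    example_return:
            xorl    %eax, %eax              /* return 0 */
            ret_spec_stop                   /* was: ret; the jmp-to-self and int3 are never
                                               reached architecturally, they only catch
                                               straight-line speculation past the ret */
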
diff --git a/cipher/twofish-amd64.S b/cipher/twofish-amd64.S index 3cb73431..a7a60553 100644 --- a/cipher/twofish-amd64.S +++ b/cipher/twofish-amd64.S @@ -211,7 +211,7 @@ _gcry_twofish_amd64_encrypt_block: CFI_ADJUST_CFA_OFFSET(-3 * 8); EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_twofish_amd64_encrypt_block,.-_gcry_twofish_amd64_encrypt_block;) @@ -265,7 +265,7 @@ _gcry_twofish_amd64_decrypt_block: CFI_ADJUST_CFA_OFFSET(-3 * 8); EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_twofish_amd64_encrypt_block,.-_gcry_twofish_amd64_encrypt_block;) @@ -511,7 +511,7 @@ __twofish_enc_blk3: outunpack_enc3(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size __twofish_enc_blk3,.-__twofish_enc_blk3;) @@ -540,7 +540,7 @@ __twofish_dec_blk3: outunpack_dec3(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size __twofish_dec_blk3,.-__twofish_dec_blk3;) @@ -641,7 +641,7 @@ _gcry_twofish_amd64_ctr_enc: CFI_ADJUST_CFA_OFFSET(-8 * 8); EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_twofish_amd64_ctr_enc,.-_gcry_twofish_amd64_ctr_enc;) @@ -726,7 +726,7 @@ _gcry_twofish_amd64_cbc_dec: CFI_ADJUST_CFA_OFFSET(-9 * 8); EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_twofish_amd64_cbc_dec,.-_gcry_twofish_amd64_cbc_dec;) @@ -811,7 +811,7 @@ _gcry_twofish_amd64_cfb_dec: CFI_ADJUST_CFA_OFFSET(-8 * 8); EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_twofish_amd64_cfb_dec,.-_gcry_twofish_amd64_cfb_dec;) @@ -937,7 +937,7 @@ _gcry_twofish_amd64_ocb_enc: CFI_ADJUST_CFA_OFFSET(-8 * 8); EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_twofish_amd64_ocb_enc,.-_gcry_twofish_amd64_ocb_enc;) @@ -1071,7 +1071,7 @@ _gcry_twofish_amd64_ocb_dec: CFI_ADJUST_CFA_OFFSET(-8 * 8); EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_twofish_amd64_ocb_dec,.-_gcry_twofish_amd64_ocb_dec;) @@ -1176,7 +1176,7 @@ _gcry_twofish_amd64_ocb_auth: CFI_ADJUST_CFA_OFFSET(-8 * 8); EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_twofish_amd64_ocb_auth,.-_gcry_twofish_amd64_ocb_auth;) diff --git a/cipher/twofish-avx2-amd64.S b/cipher/twofish-avx2-amd64.S index 74cad355..930ac792 100644 --- a/cipher/twofish-avx2-amd64.S +++ b/cipher/twofish-avx2-amd64.S @@ -431,7 +431,7 @@ __twofish_enc_blk16: outunpack_enc16(RA, RB, RC, RD); transpose4x4_16(RA, RB, RC, RD); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size __twofish_enc_blk16,.-__twofish_enc_blk16;) @@ -464,7 +464,7 @@ __twofish_dec_blk16: outunpack_dec16(RA, RB, RC, RD); transpose4x4_16(RA, RB, RC, RD); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size __twofish_dec_blk16,.-__twofish_dec_blk16;) @@ -582,7 +582,7 @@ _gcry_twofish_avx2_ctr_enc: vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_twofish_avx2_ctr_enc,.-_gcry_twofish_avx2_ctr_enc;) @@ -635,7 +635,7 @@ _gcry_twofish_avx2_cbc_dec: vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_twofish_avx2_cbc_dec,.-_gcry_twofish_avx2_cbc_dec;) @@ -690,7 +690,7 @@ _gcry_twofish_avx2_cfb_dec: vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_twofish_avx2_cfb_dec,.-_gcry_twofish_avx2_cfb_dec;) @@ -804,7 +804,7 @@ _gcry_twofish_avx2_ocb_enc: vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_twofish_avx2_ocb_enc,.-_gcry_twofish_avx2_ocb_enc;) @@ -929,7 +929,7 @@ _gcry_twofish_avx2_ocb_dec: vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_twofish_avx2_ocb_dec,.-_gcry_twofish_avx2_ocb_dec;) @@ -1032,7 +1032,7 @@ 
_gcry_twofish_avx2_ocb_auth: vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_twofish_avx2_ocb_auth,.-_gcry_twofish_avx2_ocb_auth;) diff --git a/cipher/whirlpool-sse2-amd64.S b/cipher/whirlpool-sse2-amd64.S index 5631dc56..37648faa 100644 --- a/cipher/whirlpool-sse2-amd64.S +++ b/cipher/whirlpool-sse2-amd64.S @@ -340,7 +340,7 @@ _gcry_whirlpool_transform_amd64: CFI_ADJUST_CFA_OFFSET(-STACK_MAX); .Lskip: movl $(STACK_MAX + 8), %eax; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_whirlpool_transform_amd64,.-_gcry_whirlpool_transform_amd64;) diff --git a/mpi/amd64/func_abi.h b/mpi/amd64/func_abi.h index a60363e4..c3f2d026 100644 --- a/mpi/amd64/func_abi.h +++ b/mpi/amd64/func_abi.h @@ -1,28 +1,6 @@ #include -#ifdef __x86_64__ -#ifdef HAVE_GCC_ASM_CFI_DIRECTIVES -# define CFI_STARTPROC() .cfi_startproc -# define CFI_ENDPROC() .cfi_endproc -# define CFI_ADJUST_CFA_OFFSET(off) .cfi_adjust_cfa_offset off -# define CFI_REL_OFFSET(reg,off) .cfi_rel_offset reg, off -# define CFI_RESTORE(reg) .cfi_restore reg - -# define CFI_PUSH(reg) \ - CFI_ADJUST_CFA_OFFSET(8); CFI_REL_OFFSET(reg, 0) -# define CFI_POP(reg) \ - CFI_ADJUST_CFA_OFFSET(-8); CFI_RESTORE(reg) -#else -# define CFI_STARTPROC() -# define CFI_ENDPROC() -# define CFI_ADJUST_CFA_OFFSET(off) -# define CFI_REL_OFFSET(reg,off) -# define CFI_RESTORE(reg) - -# define CFI_PUSH(reg) -# define CFI_POP(reg) -#endif -#endif +#include "asm-common-amd64.h" #ifdef USE_MS_ABI /* Store registers and move four first input arguments from MS ABI to @@ -44,13 +22,13 @@ CFI_POP(%rdi); \ popq %rsi; \ CFI_POP(%rsi); \ - ret; \ + ret_spec_stop; \ CFI_ENDPROC(); #else #define FUNC_ENTRY() \ CFI_STARTPROC(); #define FUNC_EXIT() \ - ret; \ + ret_spec_stop; \ CFI_ENDPROC(); #endif diff --git a/mpi/asm-common-amd64.h b/mpi/asm-common-amd64.h new file mode 100644 index 00000000..ad0e8e62 --- /dev/null +++ b/mpi/asm-common-amd64.h @@ -0,0 +1,26 @@ +/* asm-common-amd64.h - Common macros for AMD64 assembly + * + * Copyright (C) 2022 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . 
+ */ + +#ifndef MPI_ASM_COMMON_AMD64_H +#define MPI_ASM_COMMON_AMD64_H + +#include "../cipher/asm-common-amd64.h" + +#endif /* MPI_ASM_COMMON_AMD64_H */ diff --git a/mpi/i386/mpih-add1.S b/mpi/i386/mpih-add1.S index de78a0cb..95a75890 100644 --- a/mpi/i386/mpih-add1.S +++ b/mpi/i386/mpih-add1.S @@ -156,6 +156,6 @@ Loop: movl (%esi),%eax CFI_POP(%esi) popl %edi CFI_POP(%edi) - ret + ret_spec_stop CFI_ENDPROC() diff --git a/mpi/i386/mpih-lshift.S b/mpi/i386/mpih-lshift.S index 55da0678..3404cf55 100644 --- a/mpi/i386/mpih-lshift.S +++ b/mpi/i386/mpih-lshift.S @@ -86,7 +86,7 @@ L1: movl (%esi,%edx,4),%eax popl %ebx popl %esi popl %edi - ret + ret_spec_stop Lend: shll %cl,%ebx /* compute least significant limb */ movl %ebx,(%edi) /* store it */ @@ -97,6 +97,6 @@ Lend: shll %cl,%ebx /* compute least significant limb */ CFI_POP(%esi) popl %edi CFI_POP(%edi) - ret + ret_spec_stop CFI_ENDPROC() diff --git a/mpi/i386/mpih-mul1.S b/mpi/i386/mpih-mul1.S index 9679ea62..a672d052 100644 --- a/mpi/i386/mpih-mul1.S +++ b/mpi/i386/mpih-mul1.S @@ -89,6 +89,6 @@ Loop: CFI_POP(%esi) INSN1(pop,l ,R(edi)) CFI_POP(%edi) - ret + ret_spec_stop CFI_ENDPROC() diff --git a/mpi/i386/mpih-mul2.S b/mpi/i386/mpih-mul2.S index fe4129c4..e09c3f7c 100644 --- a/mpi/i386/mpih-mul2.S +++ b/mpi/i386/mpih-mul2.S @@ -91,6 +91,6 @@ Loop: CFI_POP(%esi) INSN1(pop,l ,R(edi)) CFI_POP(%edi) - ret + ret_spec_stop CFI_ENDPROC() diff --git a/mpi/i386/mpih-mul3.S b/mpi/i386/mpih-mul3.S index 87577d54..4112c699 100644 --- a/mpi/i386/mpih-mul3.S +++ b/mpi/i386/mpih-mul3.S @@ -91,6 +91,6 @@ Loop: CFI_POP(%esi) INSN1(pop,l ,R(edi)) CFI_POP(%edi) - ret + ret_spec_stop CFI_ENDPROC() diff --git a/mpi/i386/mpih-rshift.S b/mpi/i386/mpih-rshift.S index 35a8201f..5d34696c 100644 --- a/mpi/i386/mpih-rshift.S +++ b/mpi/i386/mpih-rshift.S @@ -89,7 +89,7 @@ L2: movl (%esi,%edx,4),%eax popl %ebx popl %esi popl %edi - ret + ret_spec_stop Lend2: shrl %cl,%ebx /* compute most significant limb */ movl %ebx,(%edi) /* store it */ @@ -100,6 +100,6 @@ Lend2: shrl %cl,%ebx /* compute most significant limb */ CFI_POP(%esi) popl %edi CFI_POP(%edi) - ret + ret_spec_stop CFI_ENDPROC() diff --git a/mpi/i386/mpih-sub1.S b/mpi/i386/mpih-sub1.S index 2bdc1438..49477ae3 100644 --- a/mpi/i386/mpih-sub1.S +++ b/mpi/i386/mpih-sub1.S @@ -157,6 +157,6 @@ Loop: movl (%esi),%eax CFI_POP(%esi) popl %edi CFI_POP(%edi) - ret + ret_spec_stop CFI_ENDPROC() diff --git a/mpi/i386/syntax.h b/mpi/i386/syntax.h index dd300319..bab2d4a6 100644 --- a/mpi/i386/syntax.h +++ b/mpi/i386/syntax.h @@ -92,3 +92,9 @@ #undef ALIGN #define ALIGN(log) .align log,0x90 #endif + +/* 'ret' instruction replacement for straight-line speculation mitigation */ +#define ret_spec_stop \ + ret; \ + jmp .; \ + int3; -- 2.32.0 From jussi.kivilinna at iki.fi Sat Jan 8 21:13:38 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 8 Jan 2022 22:13:38 +0200 Subject: [PATCH 3/4] mpi: remove unused i586 and pentium4 assembly In-Reply-To: <20220108201339.360118-1-jussi.kivilinna@iki.fi> References: <20220108201339.360118-1-jussi.kivilinna@iki.fi> Message-ID: <20220108201339.360118-3-jussi.kivilinna@iki.fi> * mpi/config.links: Remove 'i586' from paths. * mpi/i586*: Remove. * mpi/pentium4/*: Remove. -- Current x86 targets (i686) have been defaulting on mpi/i386 assembly for quite some time now. Remove mpi/i586 as it is no longer used. While at it, remove mpi/pentium4 assembly also as obsolete. 
Signed-off-by: Jussi Kivilinna --- mpi/config.links | 8 +- mpi/i586/README | 26 -- mpi/i586/distfiles | 9 - mpi/i586/mpih-add1.S | 135 ---------- mpi/i586/mpih-lshift.S | 229 ----------------- mpi/i586/mpih-mul1.S | 89 ------- mpi/i586/mpih-mul2.S | 93 ------- mpi/i586/mpih-mul3.S | 93 ------- mpi/i586/mpih-rshift.S | 228 ---------------- mpi/i586/mpih-sub1.S | 142 ---------- mpi/pentium4/README | 115 --------- mpi/pentium4/distfiles | 3 - mpi/pentium4/mmx/distfiles | 2 - mpi/pentium4/mmx/mpih-lshift.S | 457 --------------------------------- mpi/pentium4/mmx/mpih-rshift.S | 453 -------------------------------- mpi/pentium4/sse2/distfiles | 5 - mpi/pentium4/sse2/mpih-add1.S | 91 ------- mpi/pentium4/sse2/mpih-mul1.S | 96 ------- mpi/pentium4/sse2/mpih-mul2.S | 136 ---------- mpi/pentium4/sse2/mpih-mul3.S | 127 --------- mpi/pentium4/sse2/mpih-sub1.S | 112 -------- 21 files changed, 4 insertions(+), 2645 deletions(-) delete mode 100644 mpi/i586/README delete mode 100644 mpi/i586/distfiles delete mode 100644 mpi/i586/mpih-add1.S delete mode 100644 mpi/i586/mpih-lshift.S delete mode 100644 mpi/i586/mpih-mul1.S delete mode 100644 mpi/i586/mpih-mul2.S delete mode 100644 mpi/i586/mpih-mul3.S delete mode 100644 mpi/i586/mpih-rshift.S delete mode 100644 mpi/i586/mpih-sub1.S delete mode 100644 mpi/pentium4/README delete mode 100644 mpi/pentium4/distfiles delete mode 100644 mpi/pentium4/mmx/distfiles delete mode 100644 mpi/pentium4/mmx/mpih-lshift.S delete mode 100644 mpi/pentium4/mmx/mpih-rshift.S delete mode 100644 mpi/pentium4/sse2/distfiles delete mode 100644 mpi/pentium4/sse2/mpih-add1.S delete mode 100644 mpi/pentium4/sse2/mpih-mul1.S delete mode 100644 mpi/pentium4/sse2/mpih-mul2.S delete mode 100644 mpi/pentium4/sse2/mpih-mul3.S delete mode 100644 mpi/pentium4/sse2/mpih-sub1.S diff --git a/mpi/config.links b/mpi/config.links index e4fc4fc4..deb98bf0 100644 --- a/mpi/config.links +++ b/mpi/config.links @@ -73,7 +73,7 @@ case "${host}" in pentiumpro-*-netbsd*) echo '#define ELF_SYNTAX' >>./mpi/asm-syntax.h cat $srcdir/mpi/i386/syntax.h >>./mpi/asm-syntax.h - path="i586 i386" + path="i386" mpi_cpu_arch="x86" ;; i[34]86*-*-bsdi4*) @@ -97,7 +97,7 @@ case "${host}" in echo '#define BSD_SYNTAX' >>./mpi/asm-syntax.h echo '#define X86_BROKEN_ALIGN' >>./mpi/asm-syntax.h cat $srcdir/mpi/i386/syntax.h >>./mpi/asm-syntax.h - path="i586 i386" + path="i386" mpi_cpu_arch="x86" ;; i[3467]86*-msdosdjgpp* | \ @@ -111,7 +111,7 @@ case "${host}" in i[567]86*-apple-darwin*) echo '#define BSD_SYNTAX' >>./mpi/asm-syntax.h cat $srcdir/mpi/i386/syntax.h >>./mpi/asm-syntax.h - path="i586 i386" + path="i386" mpi_cpu_arch="x86" ;; i[3467]86*-*-*) @@ -125,7 +125,7 @@ case "${host}" in pentiumpro-*-*) echo '#define ELF_SYNTAX' >>./mpi/asm-syntax.h cat $srcdir/mpi/i386/syntax.h >>./mpi/asm-syntax.h - path="i586 i386" + path="i386" mpi_cpu_arch="x86" ;; x86_64-apple-darwin*) diff --git a/mpi/i586/README b/mpi/i586/README deleted file mode 100644 index d73b0826..00000000 --- a/mpi/i586/README +++ /dev/null @@ -1,26 +0,0 @@ -This directory contains mpn functions optimized for Intel Pentium -processors. - -RELEVANT OPTIMIZATION ISSUES - -1. Pentium doesn't allocate cache lines on writes, unlike most other modern -processors. Since the functions in the mpn class do array writes, we have to -handle allocating the destination cache lines by reading a word from it in the -loops, to achieve the best performance. - -2. Pairing of memory operations requires that the two issued operations refer -to different cache banks. 
The simplest way to insure this is to read/write -two words from the same object. If we make operations on different objects, -they might or might not be to the same cache bank. - -STATUS - -1. mpn_lshift and mpn_rshift run at about 6 cycles/limb, but the Pentium -documentation indicates that they should take only 43/8 = 5.375 cycles/limb, -or 5 cycles/limb asymptotically. - -2. mpn_add_n and mpn_sub_n run at asymptotically 2 cycles/limb. Due to loop -overhead and other delays (cache refill?), they run at or near 2.5 cycles/limb. - -3. mpn_mul_1, mpn_addmul_1, mpn_submul_1 all run 1 cycle faster than they -should... diff --git a/mpi/i586/distfiles b/mpi/i586/distfiles deleted file mode 100644 index 8f821fbf..00000000 --- a/mpi/i586/distfiles +++ /dev/null @@ -1,9 +0,0 @@ -mpih-add1.S -mpih-mul1.S -mpih-mul2.S -mpih-mul3.S -mpih-lshift.S -mpih-rshift.S -mpih-sub1.S -README - diff --git a/mpi/i586/mpih-add1.S b/mpi/i586/mpih-add1.S deleted file mode 100644 index 7436d592..00000000 --- a/mpi/i586/mpih-add1.S +++ /dev/null @@ -1,135 +0,0 @@ -/* i80586 add_n -- Add two limb vectors of the same length > 0 and store - * sum in a third limb vector. - * - * Copyright (C) 1992, 1994, 1995, 1996, 1998, - * 2001, 2002 Free Software Foundation, Inc. - * - * This file is part of Libgcrypt. - * - * Libgcrypt is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as - * published by the Free Software Foundation; either version 2.1 of - * the License, or (at your option) any later version. - * - * Libgcrypt is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA - */ - - -#include "sysdep.h" -#include "asm-syntax.h" - - -/******************* - * mpi_limb_t - * _gcry_mpih_add_n( mpi_ptr_t res_ptr, (sp + 4) - * mpi_ptr_t s1_ptr, (sp + 8) - * mpi_ptr_t s2_ptr, (sp + 12) - * mpi_size_t size) (sp + 16) - */ - -.text - ALIGN (3) - .globl C_SYMBOL_NAME(_gcry_mpih_add_n) -C_SYMBOL_NAME(_gcry_mpih_add_n:) - pushl %edi - pushl %esi - pushl %ebx - pushl %ebp - - movl 20(%esp),%edi /* res_ptr */ - movl 24(%esp),%esi /* s1_ptr */ - movl 28(%esp),%ebp /* s2_ptr */ - movl 32(%esp),%ecx /* size */ - - movl (%ebp),%ebx - - decl %ecx - movl %ecx,%edx - shrl $3,%ecx - andl $7,%edx - testl %ecx,%ecx /* zero carry flag */ - jz Lend - pushl %edx - - ALIGN (3) -Loop: movl 28(%edi),%eax /* fetch destination cache line */ - leal 32(%edi),%edi - -L1: movl (%esi),%eax - movl 4(%esi),%edx - adcl %ebx,%eax - movl 4(%ebp),%ebx - adcl %ebx,%edx - movl 8(%ebp),%ebx - movl %eax,-32(%edi) - movl %edx,-28(%edi) - -L2: movl 8(%esi),%eax - movl 12(%esi),%edx - adcl %ebx,%eax - movl 12(%ebp),%ebx - adcl %ebx,%edx - movl 16(%ebp),%ebx - movl %eax,-24(%edi) - movl %edx,-20(%edi) - -L3: movl 16(%esi),%eax - movl 20(%esi),%edx - adcl %ebx,%eax - movl 20(%ebp),%ebx - adcl %ebx,%edx - movl 24(%ebp),%ebx - movl %eax,-16(%edi) - movl %edx,-12(%edi) - -L4: movl 24(%esi),%eax - movl 28(%esi),%edx - adcl %ebx,%eax - movl 28(%ebp),%ebx - adcl %ebx,%edx - movl 32(%ebp),%ebx - movl %eax,-8(%edi) - movl %edx,-4(%edi) - - leal 32(%esi),%esi - leal 32(%ebp),%ebp - decl %ecx - jnz Loop - - popl %edx -Lend: - decl %edx /* test %edx w/o clobbering carry */ - js Lend2 - incl %edx -Loop2: - leal 4(%edi),%edi - movl (%esi),%eax - adcl %ebx,%eax - movl 4(%ebp),%ebx - movl %eax,-4(%edi) - leal 4(%esi),%esi - leal 4(%ebp),%ebp - decl %edx - jnz Loop2 -Lend2: - movl (%esi),%eax - adcl %ebx,%eax - movl %eax,(%edi) - - sbbl %eax,%eax - negl %eax - - popl %ebp - popl %ebx - popl %esi - popl %edi - ret - - diff --git a/mpi/i586/mpih-lshift.S b/mpi/i586/mpih-lshift.S deleted file mode 100644 index 9d25fe9d..00000000 --- a/mpi/i586/mpih-lshift.S +++ /dev/null @@ -1,229 +0,0 @@ -/* i80586 lshift - * - * Copyright (C) 1992, 1994, 1998, - * 2001, 2002 Free Software Foundation, Inc. - * - * This file is part of Libgcrypt. - * - * Libgcrypt is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as - * published by the Free Software Foundation; either version 2.1 of - * the License, or (at your option) any later version. - * - * Libgcrypt is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA - * - * Note: This code is heavily based on the GNU MP Library. - * Actually it's the same code with only minor changes in the - * way the data is stored; this is to support the abstraction - * of an optional secure memory allocation which may be used - * to avoid revealing of sensitive data due to paging etc. 
- */ - - -#include "sysdep.h" -#include "asm-syntax.h" - - -/******************* - * mpi_limb_t - * _gcry_mpih_lshift( mpi_ptr_t wp, (sp + 4) - * mpi_ptr_t up, (sp + 8) - * mpi_size_t usize, (sp + 12) - * unsigned cnt) (sp + 16) - */ - -.text - ALIGN (3) - .globl C_SYMBOL_NAME(_gcry_mpih_lshift) -C_SYMBOL_NAME(_gcry_mpih_lshift:) - - pushl %edi - pushl %esi - pushl %ebx - pushl %ebp - - movl 20(%esp),%edi /* res_ptr */ - movl 24(%esp),%esi /* s_ptr */ - movl 28(%esp),%ebp /* size */ - movl 32(%esp),%ecx /* cnt */ - -/* We can use faster code for shift-by-1 under certain conditions. */ - cmp $1,%ecx - jne Lnormal - leal 4(%esi),%eax - cmpl %edi,%eax - jnc Lspecial /* jump if s_ptr + 1 >= res_ptr */ - leal (%esi,%ebp,4),%eax - cmpl %eax,%edi - jnc Lspecial /* jump if res_ptr >= s_ptr + size */ - -Lnormal: - leal -4(%edi,%ebp,4),%edi - leal -4(%esi,%ebp,4),%esi - - movl (%esi),%edx - subl $4,%esi - xorl %eax,%eax - shldl %cl,%edx,%eax /* compute carry limb */ - pushl %eax /* push carry limb onto stack */ - - decl %ebp - pushl %ebp - shrl $3,%ebp - jz Lend - - movl (%edi),%eax /* fetch destination cache line */ - - ALIGN (2) -Loop: movl -28(%edi),%eax /* fetch destination cache line */ - movl %edx,%ebx - - movl (%esi),%eax - movl -4(%esi),%edx - shldl %cl,%eax,%ebx - shldl %cl,%edx,%eax - movl %ebx,(%edi) - movl %eax,-4(%edi) - - movl -8(%esi),%ebx - movl -12(%esi),%eax - shldl %cl,%ebx,%edx - shldl %cl,%eax,%ebx - movl %edx,-8(%edi) - movl %ebx,-12(%edi) - - movl -16(%esi),%edx - movl -20(%esi),%ebx - shldl %cl,%edx,%eax - shldl %cl,%ebx,%edx - movl %eax,-16(%edi) - movl %edx,-20(%edi) - - movl -24(%esi),%eax - movl -28(%esi),%edx - shldl %cl,%eax,%ebx - shldl %cl,%edx,%eax - movl %ebx,-24(%edi) - movl %eax,-28(%edi) - - subl $32,%esi - subl $32,%edi - decl %ebp - jnz Loop - -Lend: popl %ebp - andl $7,%ebp - jz Lend2 -Loop2: movl (%esi),%eax - shldl %cl,%eax,%edx - movl %edx,(%edi) - movl %eax,%edx - subl $4,%esi - subl $4,%edi - decl %ebp - jnz Loop2 - -Lend2: shll %cl,%edx /* compute least significant limb */ - movl %edx,(%edi) /* store it */ - - popl %eax /* pop carry limb */ - - popl %ebp - popl %ebx - popl %esi - popl %edi - ret - -/* We loop from least significant end of the arrays, which is only - permissable if the source and destination don't overlap, since the - function is documented to work for overlapping source and destination. 
-*/ - -Lspecial: - movl (%esi),%edx - addl $4,%esi - - decl %ebp - pushl %ebp - shrl $3,%ebp - - addl %edx,%edx - incl %ebp - decl %ebp - jz LLend - - movl (%edi),%eax /* fetch destination cache line */ - - ALIGN (2) -LLoop: movl 28(%edi),%eax /* fetch destination cache line */ - movl %edx,%ebx - - movl (%esi),%eax - movl 4(%esi),%edx - adcl %eax,%eax - movl %ebx,(%edi) - adcl %edx,%edx - movl %eax,4(%edi) - - movl 8(%esi),%ebx - movl 12(%esi),%eax - adcl %ebx,%ebx - movl %edx,8(%edi) - adcl %eax,%eax - movl %ebx,12(%edi) - - movl 16(%esi),%edx - movl 20(%esi),%ebx - adcl %edx,%edx - movl %eax,16(%edi) - adcl %ebx,%ebx - movl %edx,20(%edi) - - movl 24(%esi),%eax - movl 28(%esi),%edx - adcl %eax,%eax - movl %ebx,24(%edi) - adcl %edx,%edx - movl %eax,28(%edi) - - leal 32(%esi),%esi /* use leal not to clobber carry */ - leal 32(%edi),%edi - decl %ebp - jnz LLoop - -LLend: popl %ebp - sbbl %eax,%eax /* save carry in %eax */ - andl $7,%ebp - jz LLend2 - addl %eax,%eax /* restore carry from eax */ -LLoop2: movl %edx,%ebx - movl (%esi),%edx - adcl %edx,%edx - movl %ebx,(%edi) - - leal 4(%esi),%esi /* use leal not to clobber carry */ - leal 4(%edi),%edi - decl %ebp - jnz LLoop2 - - jmp LL1 -LLend2: addl %eax,%eax /* restore carry from eax */ -LL1: movl %edx,(%edi) /* store last limb */ - - sbbl %eax,%eax - negl %eax - - popl %ebp - popl %ebx - popl %esi - popl %edi - ret - - diff --git a/mpi/i586/mpih-mul1.S b/mpi/i586/mpih-mul1.S deleted file mode 100644 index 3601d968..00000000 --- a/mpi/i586/mpih-mul1.S +++ /dev/null @@ -1,89 +0,0 @@ -/* i80586 mul_1 -- Multiply a limb vector with a limb and store - * the result in a second limb vector. - * - * Copyright (C) 1992, 1994, 1996, 1998, - * 2001, 2002 Free Software Foundation, Inc. - * - * This file is part of Libgcrypt. - * - * Libgcrypt is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as - * published by the Free Software Foundation; either version 2.1 of - * the License, or (at your option) any later version. - * - * Libgcrypt is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA - * - * Note: This code is heavily based on the GNU MP Library. - * Actually it's the same code with only minor changes in the - * way the data is stored; this is to support the abstraction - * of an optional secure memory allocation which may be used - * to avoid revealing of sensitive data due to paging etc. 
- */ - - -#include "sysdep.h" -#include "asm-syntax.h" - - -/******************* - * mpi_limb_t - * _gcry_mpih_mul_1( mpi_ptr_t res_ptr, (sp + 4) - * mpi_ptr_t s1_ptr, (sp + 8) - * mpi_size_t s1_size, (sp + 12) - * mpi_limb_t s2_limb) (sp + 16) - */ - -#define res_ptr edi -#define s1_ptr esi -#define size ecx -#define s2_limb ebp - - TEXT - ALIGN (3) - GLOBL C_SYMBOL_NAME(_gcry_mpih_mul_1) -C_SYMBOL_NAME(_gcry_mpih_mul_1:) - - INSN1(push,l ,R(edi)) - INSN1(push,l ,R(esi)) - INSN1(push,l ,R(ebx)) - INSN1(push,l ,R(ebp)) - - INSN2(mov,l ,R(res_ptr),MEM_DISP(esp,20)) - INSN2(mov,l ,R(s1_ptr),MEM_DISP(esp,24)) - INSN2(mov,l ,R(size),MEM_DISP(esp,28)) - INSN2(mov,l ,R(s2_limb),MEM_DISP(esp,32)) - - INSN2(lea,l ,R(res_ptr),MEM_INDEX(res_ptr,size,4)) - INSN2(lea,l ,R(s1_ptr),MEM_INDEX(s1_ptr,size,4)) - INSN1(neg,l ,R(size)) - INSN2(xor,l ,R(ebx),R(ebx)) - ALIGN (3) - -Loop: INSN2(adc,l ,R(ebx),$0) - INSN2(mov,l ,R(eax),MEM_INDEX(s1_ptr,size,4)) - - INSN1(mul,l ,R(s2_limb)) - - INSN2(add,l ,R(ebx),R(eax)) - - INSN2(mov,l ,MEM_INDEX(res_ptr,size,4),R(ebx)) - INSN1(inc,l ,R(size)) - - INSN2(mov,l ,R(ebx),R(edx)) - INSN1(jnz, ,Loop) - - INSN2(adc,l ,R(ebx),$0) - INSN2(mov,l ,R(eax),R(ebx)) - INSN1(pop,l ,R(ebp)) - INSN1(pop,l ,R(ebx)) - INSN1(pop,l ,R(esi)) - INSN1(pop,l ,R(edi)) - ret - diff --git a/mpi/i586/mpih-mul2.S b/mpi/i586/mpih-mul2.S deleted file mode 100644 index f32d363a..00000000 --- a/mpi/i586/mpih-mul2.S +++ /dev/null @@ -1,93 +0,0 @@ -/* i80586 addmul_1 -- Multiply a limb vector with a limb and add - * the result to a second limb vector. - * - * Copyright (C) 1992, 1994, 1998, - * 2001, 2002 Free Software Foundation, Inc. - * - * This file is part of Libgcrypt. - * - * Libgcrypt is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as - * published by the Free Software Foundation; either version 2.1 of - * the License, or (at your option) any later version. - * - * Libgcrypt is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA - * - * Note: This code is heavily based on the GNU MP Library. - * Actually it's the same code with only minor changes in the - * way the data is stored; this is to support the abstraction - * of an optional secure memory allocation which may be used - * to avoid revealing of sensitive data due to paging etc. 
- */ - - -#include "sysdep.h" -#include "asm-syntax.h" - - -/******************* - * mpi_limb_t - * _gcry_mpih_addmul_1( mpi_ptr_t res_ptr, (sp + 4) - * mpi_ptr_t s1_ptr, (sp + 8) - * mpi_size_t s1_size, (sp + 12) - * mpi_limb_t s2_limb) (sp + 16) - */ - -#define res_ptr edi -#define s1_ptr esi -#define size ecx -#define s2_limb ebp - - TEXT - ALIGN (3) - GLOBL C_SYMBOL_NAME(_gcry_mpih_addmul_1) -C_SYMBOL_NAME(_gcry_mpih_addmul_1:) - - INSN1(push,l ,R(edi)) - INSN1(push,l ,R(esi)) - INSN1(push,l ,R(ebx)) - INSN1(push,l ,R(ebp)) - - INSN2(mov,l ,R(res_ptr),MEM_DISP(esp,20)) - INSN2(mov,l ,R(s1_ptr),MEM_DISP(esp,24)) - INSN2(mov,l ,R(size),MEM_DISP(esp,28)) - INSN2(mov,l ,R(s2_limb),MEM_DISP(esp,32)) - - INSN2(lea,l ,R(res_ptr),MEM_INDEX(res_ptr,size,4)) - INSN2(lea,l ,R(s1_ptr),MEM_INDEX(s1_ptr,size,4)) - INSN1(neg,l ,R(size)) - INSN2(xor,l ,R(ebx),R(ebx)) - ALIGN (3) - -Loop: INSN2(adc,l ,R(ebx),$0) - INSN2(mov,l ,R(eax),MEM_INDEX(s1_ptr,size,4)) - - INSN1(mul,l ,R(s2_limb)) - - INSN2(add,l ,R(eax),R(ebx)) - INSN2(mov,l ,R(ebx),MEM_INDEX(res_ptr,size,4)) - - INSN2(adc,l ,R(edx),$0) - INSN2(add,l ,R(ebx),R(eax)) - - INSN2(mov,l ,MEM_INDEX(res_ptr,size,4),R(ebx)) - INSN1(inc,l ,R(size)) - - INSN2(mov,l ,R(ebx),R(edx)) - INSN1(jnz, ,Loop) - - INSN2(adc,l ,R(ebx),$0) - INSN2(mov,l ,R(eax),R(ebx)) - INSN1(pop,l ,R(ebp)) - INSN1(pop,l ,R(ebx)) - INSN1(pop,l ,R(esi)) - INSN1(pop,l ,R(edi)) - ret - diff --git a/mpi/i586/mpih-mul3.S b/mpi/i586/mpih-mul3.S deleted file mode 100644 index fa27d4e1..00000000 --- a/mpi/i586/mpih-mul3.S +++ /dev/null @@ -1,93 +0,0 @@ -/* i80586 submul_1 -- Multiply a limb vector with a limb and add - * the result to a second limb vector. - * - * Copyright (C) 1992, 1994, 1998, - * 2001, 2002 Free Software Foundation, Inc. - * - * This file is part of Libgcrypt. - * - * Libgcrypt is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as - * published by the Free Software Foundation; either version 2.1 of - * the License, or (at your option) any later version. - * - * Libgcrypt is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA - * - * Note: This code is heavily based on the GNU MP Library. - * Actually it's the same code with only minor changes in the - * way the data is stored; this is to support the abstraction - * of an optional secure memory allocation which may be used - * to avoid revealing of sensitive data due to paging etc. 
- */ - - -#include "sysdep.h" -#include "asm-syntax.h" - - -/******************* - * mpi_limb_t - * _gcry_mpih_submul_1( mpi_ptr_t res_ptr, (sp + 4) - * mpi_ptr_t s1_ptr, (sp + 8) - * mpi_size_t s1_size, (sp + 12) - * mpi_limb_t s2_limb) (sp + 16) - */ - -#define res_ptr edi -#define s1_ptr esi -#define size ecx -#define s2_limb ebp - - TEXT - ALIGN (3) - GLOBL C_SYMBOL_NAME(_gcry_mpih_submul_1) -C_SYMBOL_NAME(_gcry_mpih_submul_1:) - - INSN1(push,l ,R(edi)) - INSN1(push,l ,R(esi)) - INSN1(push,l ,R(ebx)) - INSN1(push,l ,R(ebp)) - - INSN2(mov,l ,R(res_ptr),MEM_DISP(esp,20)) - INSN2(mov,l ,R(s1_ptr),MEM_DISP(esp,24)) - INSN2(mov,l ,R(size),MEM_DISP(esp,28)) - INSN2(mov,l ,R(s2_limb),MEM_DISP(esp,32)) - - INSN2(lea,l ,R(res_ptr),MEM_INDEX(res_ptr,size,4)) - INSN2(lea,l ,R(s1_ptr),MEM_INDEX(s1_ptr,size,4)) - INSN1(neg,l ,R(size)) - INSN2(xor,l ,R(ebx),R(ebx)) - ALIGN (3) - -Loop: INSN2(adc,l ,R(ebx),$0) - INSN2(mov,l ,R(eax),MEM_INDEX(s1_ptr,size,4)) - - INSN1(mul,l ,R(s2_limb)) - - INSN2(add,l ,R(eax),R(ebx)) - INSN2(mov,l ,R(ebx),MEM_INDEX(res_ptr,size,4)) - - INSN2(adc,l ,R(edx),$0) - INSN2(sub,l ,R(ebx),R(eax)) - - INSN2(mov,l ,MEM_INDEX(res_ptr,size,4),R(ebx)) - INSN1(inc,l ,R(size)) - - INSN2(mov,l ,R(ebx),R(edx)) - INSN1(jnz, ,Loop) - - INSN2(adc,l ,R(ebx),$0) - INSN2(mov,l ,R(eax),R(ebx)) - INSN1(pop,l ,R(ebp)) - INSN1(pop,l ,R(ebx)) - INSN1(pop,l ,R(esi)) - INSN1(pop,l ,R(edi)) - ret - diff --git a/mpi/i586/mpih-rshift.S b/mpi/i586/mpih-rshift.S deleted file mode 100644 index c661e3d3..00000000 --- a/mpi/i586/mpih-rshift.S +++ /dev/null @@ -1,228 +0,0 @@ -/* i80586 rshift - * - * Copyright (C) 1992, 1994, 1998, - * 2001, 2002 Free Software Foundation, Inc. - * - * This file is part of Libgcrypt. - * - * Libgcrypt is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as - * published by the Free Software Foundation; either version 2.1 of - * the License, or (at your option) any later version. - * - * Libgcrypt is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA - * - * Note: This code is heavily based on the GNU MP Library. - * Actually it's the same code with only minor changes in the - * way the data is stored; this is to support the abstraction - * of an optional secure memory allocation which may be used - * to avoid revealing of sensitive data due to paging etc. - */ - - -#include "sysdep.h" -#include "asm-syntax.h" - - - -/******************* - * mpi_limb_t - * _gcry_mpih_rshift( mpi_ptr_t wp, (sp + 4) - * mpi_ptr_t up, (sp + 8) - * mpi_size_t usize, (sp + 12) - * unsigned cnt) (sp + 16) - */ - -.text - ALIGN (3) - .globl C_SYMBOL_NAME(_gcry_mpih_rshift) -C_SYMBOL_NAME(_gcry_mpih_rshift:) - pushl %edi - pushl %esi - pushl %ebx - pushl %ebp - - movl 20(%esp),%edi /* res_ptr */ - movl 24(%esp),%esi /* s_ptr */ - movl 28(%esp),%ebp /* size */ - movl 32(%esp),%ecx /* cnt */ - -/* We can use faster code for shift-by-1 under certain conditions. 
*/ - cmp $1,%ecx - jne Rnormal - leal 4(%edi),%eax - cmpl %esi,%eax - jnc Rspecial /* jump if res_ptr + 1 >= s_ptr */ - leal (%edi,%ebp,4),%eax - cmpl %eax,%esi - jnc Rspecial /* jump if s_ptr >= res_ptr + size */ - -Rnormal: - movl (%esi),%edx - addl $4,%esi - xorl %eax,%eax - shrdl %cl,%edx,%eax /* compute carry limb */ - pushl %eax /* push carry limb onto stack */ - - decl %ebp - pushl %ebp - shrl $3,%ebp - jz Rend - - movl (%edi),%eax /* fetch destination cache line */ - - ALIGN (2) -Roop: movl 28(%edi),%eax /* fetch destination cache line */ - movl %edx,%ebx - - movl (%esi),%eax - movl 4(%esi),%edx - shrdl %cl,%eax,%ebx - shrdl %cl,%edx,%eax - movl %ebx,(%edi) - movl %eax,4(%edi) - - movl 8(%esi),%ebx - movl 12(%esi),%eax - shrdl %cl,%ebx,%edx - shrdl %cl,%eax,%ebx - movl %edx,8(%edi) - movl %ebx,12(%edi) - - movl 16(%esi),%edx - movl 20(%esi),%ebx - shrdl %cl,%edx,%eax - shrdl %cl,%ebx,%edx - movl %eax,16(%edi) - movl %edx,20(%edi) - - movl 24(%esi),%eax - movl 28(%esi),%edx - shrdl %cl,%eax,%ebx - shrdl %cl,%edx,%eax - movl %ebx,24(%edi) - movl %eax,28(%edi) - - addl $32,%esi - addl $32,%edi - decl %ebp - jnz Roop - -Rend: popl %ebp - andl $7,%ebp - jz Rend2 -Roop2: movl (%esi),%eax - shrdl %cl,%eax,%edx /* compute result limb */ - movl %edx,(%edi) - movl %eax,%edx - addl $4,%esi - addl $4,%edi - decl %ebp - jnz Roop2 - -Rend2: shrl %cl,%edx /* compute most significant limb */ - movl %edx,(%edi) /* store it */ - - popl %eax /* pop carry limb */ - - popl %ebp - popl %ebx - popl %esi - popl %edi - ret - -/* We loop from least significant end of the arrays, which is only - permissable if the source and destination don't overlap, since the - function is documented to work for overlapping source and destination. -*/ - -Rspecial: - leal -4(%edi,%ebp,4),%edi - leal -4(%esi,%ebp,4),%esi - - movl (%esi),%edx - subl $4,%esi - - decl %ebp - pushl %ebp - shrl $3,%ebp - - shrl $1,%edx - incl %ebp - decl %ebp - jz RLend - - movl (%edi),%eax /* fetch destination cache line */ - - ALIGN (2) -RLoop: movl -28(%edi),%eax /* fetch destination cache line */ - movl %edx,%ebx - - movl (%esi),%eax - movl -4(%esi),%edx - rcrl $1,%eax - movl %ebx,(%edi) - rcrl $1,%edx - movl %eax,-4(%edi) - - movl -8(%esi),%ebx - movl -12(%esi),%eax - rcrl $1,%ebx - movl %edx,-8(%edi) - rcrl $1,%eax - movl %ebx,-12(%edi) - - movl -16(%esi),%edx - movl -20(%esi),%ebx - rcrl $1,%edx - movl %eax,-16(%edi) - rcrl $1,%ebx - movl %edx,-20(%edi) - - movl -24(%esi),%eax - movl -28(%esi),%edx - rcrl $1,%eax - movl %ebx,-24(%edi) - rcrl $1,%edx - movl %eax,-28(%edi) - - leal -32(%esi),%esi /* use leal not to clobber carry */ - leal -32(%edi),%edi - decl %ebp - jnz RLoop - -RLend: popl %ebp - sbbl %eax,%eax /* save carry in %eax */ - andl $7,%ebp - jz RLend2 - addl %eax,%eax /* restore carry from eax */ -RLoop2: movl %edx,%ebx - movl (%esi),%edx - rcrl $1,%edx - movl %ebx,(%edi) - - leal -4(%esi),%esi /* use leal not to clobber carry */ - leal -4(%edi),%edi - decl %ebp - jnz RLoop2 - - jmp RL1 -RLend2: addl %eax,%eax /* restore carry from eax */ -RL1: movl %edx,(%edi) /* store last limb */ - - movl $0,%eax - rcrl $1,%eax - - popl %ebp - popl %ebx - popl %esi - popl %edi - ret - diff --git a/mpi/i586/mpih-sub1.S b/mpi/i586/mpih-sub1.S deleted file mode 100644 index ef2d5807..00000000 --- a/mpi/i586/mpih-sub1.S +++ /dev/null @@ -1,142 +0,0 @@ -/* i80586 sub_n -- Sub two limb vectors of the same length > 0 and store - * sum in a third limb vector. - * - * Copyright (C) 1992, 1994, 1995, 1998, - * 2001, 2002 Free Software Foundation, Inc. 
- * - * This file is part of Libgcrypt. - * - * Libgcrypt is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as - * published by the Free Software Foundation; either version 2.1 of - * the License, or (at your option) any later version. - * - * Libgcrypt is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA - * - * Note: This code is heavily based on the GNU MP Library. - * Actually it's the same code with only minor changes in the - * way the data is stored; this is to support the abstraction - * of an optional secure memory allocation which may be used - * to avoid revealing of sensitive data due to paging etc. - */ - - -#include "sysdep.h" -#include "asm-syntax.h" - - -/******************* - * mpi_limb_t - * _gcry_mpih_sub_n( mpi_ptr_t res_ptr, (sp + 4) - * mpi_ptr_t s1_ptr, (sp + 8) - * mpi_ptr_t s2_ptr, (sp + 12) - * mpi_size_t size) (sp + 16) - */ - - -.text - ALIGN (3) - .globl C_SYMBOL_NAME(_gcry_mpih_sub_n) -C_SYMBOL_NAME(_gcry_mpih_sub_n:) - - pushl %edi - pushl %esi - pushl %ebx - pushl %ebp - - movl 20(%esp),%edi /* res_ptr */ - movl 24(%esp),%esi /* s1_ptr */ - movl 28(%esp),%ebp /* s2_ptr */ - movl 32(%esp),%ecx /* size */ - - movl (%ebp),%ebx - - decl %ecx - movl %ecx,%edx - shrl $3,%ecx - andl $7,%edx - testl %ecx,%ecx /* zero carry flag */ - jz Lend - pushl %edx - - ALIGN (3) -Loop: movl 28(%edi),%eax /* fetch destination cache line */ - leal 32(%edi),%edi - -L1: movl (%esi),%eax - movl 4(%esi),%edx - sbbl %ebx,%eax - movl 4(%ebp),%ebx - sbbl %ebx,%edx - movl 8(%ebp),%ebx - movl %eax,-32(%edi) - movl %edx,-28(%edi) - -L2: movl 8(%esi),%eax - movl 12(%esi),%edx - sbbl %ebx,%eax - movl 12(%ebp),%ebx - sbbl %ebx,%edx - movl 16(%ebp),%ebx - movl %eax,-24(%edi) - movl %edx,-20(%edi) - -L3: movl 16(%esi),%eax - movl 20(%esi),%edx - sbbl %ebx,%eax - movl 20(%ebp),%ebx - sbbl %ebx,%edx - movl 24(%ebp),%ebx - movl %eax,-16(%edi) - movl %edx,-12(%edi) - -L4: movl 24(%esi),%eax - movl 28(%esi),%edx - sbbl %ebx,%eax - movl 28(%ebp),%ebx - sbbl %ebx,%edx - movl 32(%ebp),%ebx - movl %eax,-8(%edi) - movl %edx,-4(%edi) - - leal 32(%esi),%esi - leal 32(%ebp),%ebp - decl %ecx - jnz Loop - - popl %edx -Lend: - decl %edx /* test %edx w/o clobbering carry */ - js Lend2 - incl %edx -Loop2: - leal 4(%edi),%edi - movl (%esi),%eax - sbbl %ebx,%eax - movl 4(%ebp),%ebx - movl %eax,-4(%edi) - leal 4(%esi),%esi - leal 4(%ebp),%ebp - decl %edx - jnz Loop2 -Lend2: - movl (%esi),%eax - sbbl %ebx,%eax - movl %eax,(%edi) - - sbbl %eax,%eax - negl %eax - - popl %ebp - popl %ebx - popl %esi - popl %edi - ret - diff --git a/mpi/pentium4/README b/mpi/pentium4/README deleted file mode 100644 index 215fc7f8..00000000 --- a/mpi/pentium4/README +++ /dev/null @@ -1,115 +0,0 @@ -Copyright 2001 Free Software Foundation, Inc. - -This file is part of the GNU MP Library. - -The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Lesser General Public License as published by -the Free Software Foundation; either version 2.1 of the License, or (at your -option) any later version. 
- -The GNU MP Library is distributed in the hope that it will be useful, but -WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public -License for more details. - -You should have received a copy of the GNU Lesser General Public License -along with the GNU MP Library; see the file COPYING.LIB. If not, write to -the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA -02110-1301, USA. - - - - - INTEL PENTIUM-4 MPN SUBROUTINES - - -This directory contains mpn functions optimized for Intel Pentium-4. - -The mmx subdirectory has routines using MMX instructions, the sse2 -subdirectory has routines using SSE2 instructions. All P4s have these, the -separate directories are just so configure can omit that code if the -assembler doesn't support it. - - -STATUS - - cycles/limb - - mpn_add_n/sub_n 4 normal, 6 in-place - - mpn_mul_1 4 normal, 6 in-place - mpn_addmul_1 6 - mpn_submul_1 7 - - mpn_mul_basecase 6 cycles/crossproduct (approx) - - mpn_sqr_basecase 3.5 cycles/crossproduct (approx) - or 7.0 cycles/triangleproduct (approx) - - mpn_l/rshift 1.75 - - - -The shifts ought to be able to go at 1.5 c/l, but not much effort has been -applied to them yet. - -In-place operations, and all addmul, submul, mul_basecase and sqr_basecase -calls, suffer from pipeline anomalies associated with write combining and -movd reads and writes to the same or nearby locations. The movq -instructions do not trigger the same hardware problems. Unfortunately, -using movq and splitting/combining seems to require too many extra -instructions to help. Perhaps future chip steppings will be better. - - - -NOTES - -The Pentium-4 pipeline "Netburst", provides for quite a number of surprises. -Many traditional x86 instructions run very slowly, requiring use of -alterative instructions for acceptable performance. - -adcl and sbbl are quite slow at 8 cycles for reg->reg. paddq of 32-bits -within a 64-bit mmx register seems better, though the combination -paddq/psrlq when propagating a carry is still a 4 cycle latency. - -incl and decl should be avoided, instead use add $1 and sub $1. Apparently -the carry flag is not separately renamed, so incl and decl depend on all -previous flags-setting instructions. - -shll and shrl have a 4 cycle latency, or 8 times the latency of the fastest -integer instructions (addl, subl, orl, andl, and some more). shldl and -shrdl seem to have 13 and 15 cycles latency, respectively. Bizarre. - -movq mmx -> mmx does have 6 cycle latency, as noted in the documentation. -pxor/por or similar combination at 2 cycles latency can be used instead. -The movq however executes in the float unit, thereby saving MMX execution -resources. With the right juggling, data moves shouldn't be on a dependent -chain. - -L1 is write-through, but the write-combining sounds like it does enough to -not require explicit destination prefetching. - -xmm registers so far haven't found a use, but not much effort has been -expended. A configure test for whether the operating system knows -fxsave/fxrestor will be needed if they're used. - - - -REFERENCES - -Intel Pentium-4 processor manuals, - - http://developer.intel.com/design/pentium4/manuals - -"Intel Pentium 4 Processor Optimization Reference Manual", Intel, 2001, -order number 248966. 
Available on-line: - - http://developer.intel.com/design/pentium4/manuals/248966.htm - - - ----------------- -Local variables: -mode: text -fill-column: 76 -End: diff --git a/mpi/pentium4/distfiles b/mpi/pentium4/distfiles deleted file mode 100644 index b419f85a..00000000 --- a/mpi/pentium4/distfiles +++ /dev/null @@ -1,3 +0,0 @@ -README - - diff --git a/mpi/pentium4/mmx/distfiles b/mpi/pentium4/mmx/distfiles deleted file mode 100644 index 8f0ea426..00000000 --- a/mpi/pentium4/mmx/distfiles +++ /dev/null @@ -1,2 +0,0 @@ -mpih-lshift.S -mpih-rshift.S diff --git a/mpi/pentium4/mmx/mpih-lshift.S b/mpi/pentium4/mmx/mpih-lshift.S deleted file mode 100644 index e2dd184b..00000000 --- a/mpi/pentium4/mmx/mpih-lshift.S +++ /dev/null @@ -1,457 +0,0 @@ -/* Intel Pentium-4 mpn_lshift -- left shift. - * - * Copyright 2001, 2002 Free Software Foundation, Inc. - * - * This file is part of Libgcrypt. - * - * Libgcrypt is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as - * published by the Free Software Foundation; either version 2.1 of - * the License, or (at your option) any later version. - * - * Libgcrypt is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA - * - * Note: This code is heavily based on the GNU MP Library. - * Actually it's the same code with only minor changes in the - * way the data is stored; this is to support the abstraction - * of an optional secure memory allocation which may be used - * to avoid revealing of sensitive data due to paging etc. 
- */ - - -#include "sysdep.h" -#include "asm-syntax.h" - - -/******************* - * mpi_limb_t - * _gcry_mpih_lshift( mpi_ptr_t wp, (sp + 4) - * mpi_ptr_t up, (sp + 8) - * mpi_size_t usize, (sp + 12) - * unsigned cnt) (sp + 16) - * - * P4 Willamette, Northwood: 1.75 cycles/limb - * P4 Prescott: 2.0 cycles/limb - */ - -.text - ALIGN (3) - .globl C_SYMBOL_NAME(_gcry_mpih_lshift) -C_SYMBOL_NAME(_gcry_mpih_lshift:) - - - pushl %ebx - pushl %edi - - - movl 20(%esp), %eax - movl 12(%esp), %edx - - movl 16(%esp), %ebx - movl 24(%esp), %ecx - - cmp $5, %eax - jae .Lunroll - - movl -4(%ebx,%eax,4), %edi - decl %eax - - jnz .Lsimple - - shldl %cl, %edi, %eax - - shll %cl, %edi - - movl %edi, (%edx) - popl %edi - - popl %ebx - - ret - - - - - -.Lsimple: - - - - - - - - - - movd (%ebx,%eax,4), %mm5 - - movd %ecx, %mm6 - negl %ecx - - psllq %mm6, %mm5 - addl $32, %ecx - - movd %ecx, %mm7 - psrlq $32, %mm5 - - -.Lsimple_top: - - - - - - - - - - - - - movq -4(%ebx,%eax,4), %mm0 - decl %eax - - psrlq %mm7, %mm0 - - - - movd %mm0, 4(%edx,%eax,4) - jnz .Lsimple_top - - - movd (%ebx), %mm0 - - movd %mm5, %eax - psllq %mm6, %mm0 - - popl %edi - popl %ebx - - movd %mm0, (%edx) - - emms - - ret - - - - - - .align 8, 0x90 -.Lunroll: - - - - - - - - - - movd -4(%ebx,%eax,4), %mm5 - leal (%ebx,%eax,4), %edi - - movd %ecx, %mm6 - andl $4, %edi - - psllq %mm6, %mm5 - jz .Lstart_src_aligned - - - - - - - - - - - - - - - - - - - - movq -8(%ebx,%eax,4), %mm0 - - psllq %mm6, %mm0 - decl %eax - - psrlq $32, %mm0 - - - - movd %mm0, (%edx,%eax,4) -.Lstart_src_aligned: - - movq -8(%ebx,%eax,4), %mm1 - leal (%edx,%eax,4), %edi - - andl $4, %edi - psrlq $32, %mm5 - - movq -16(%ebx,%eax,4), %mm3 - jz .Lstart_dst_aligned - - - - - - - - - - - - - - - - - - - - - movq %mm1, %mm0 - addl $32, %ecx - - psllq %mm6, %mm0 - - movd %ecx, %mm6 - psrlq $32, %mm0 - - - - movd %mm0, -4(%edx,%eax,4) - subl $4, %edx -.Lstart_dst_aligned: - - - psllq %mm6, %mm1 - negl %ecx - - addl $64, %ecx - movq %mm3, %mm2 - - movd %ecx, %mm7 - subl $8, %eax - - psrlq %mm7, %mm3 - - por %mm1, %mm3 - jc .Lfinish - - - - - .align 8, 0x90 -.Lunroll_loop: - - - - - - - - - - - - - - - - - movq 8(%ebx,%eax,4), %mm0 - psllq %mm6, %mm2 - - movq %mm0, %mm1 - psrlq %mm7, %mm0 - - movq %mm3, 24(%edx,%eax,4) - por %mm2, %mm0 - - movq (%ebx,%eax,4), %mm3 - psllq %mm6, %mm1 - - movq %mm0, 16(%edx,%eax,4) - movq %mm3, %mm2 - - psrlq %mm7, %mm3 - subl $4, %eax - - por %mm1, %mm3 - jnc .Lunroll_loop - - - -.Lfinish: - - - testb $2, %al - - jz .Lfinish_no_two - - movq 8(%ebx,%eax,4), %mm0 - psllq %mm6, %mm2 - - movq %mm0, %mm1 - psrlq %mm7, %mm0 - - movq %mm3, 24(%edx,%eax,4) - por %mm2, %mm0 - - movq %mm1, %mm2 - movq %mm0, %mm3 - - subl $2, %eax -.Lfinish_no_two: - - - - - - - - testb $1, %al - movd %mm5, %eax - - popl %edi - jz .Lfinish_zero - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - movd (%ebx), %mm0 - psllq %mm6, %mm2 - - movq %mm3, 12(%edx) - psllq $32, %mm0 - - movq %mm0, %mm1 - psrlq %mm7, %mm0 - - por %mm2, %mm0 - psllq %mm6, %mm1 - - movq %mm0, 4(%edx) - psrlq $32, %mm1 - - andl $32, %ecx - popl %ebx - - jz .Lfinish_one_unaligned - - movd %mm1, (%edx) -.Lfinish_one_unaligned: - - emms - - ret - - - - -.Lfinish_zero: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - movq %mm3, 8(%edx) - andl $32, %ecx - - psllq %mm6, %mm2 - jz .Lfinish_zero_unaligned - - movq %mm2, (%edx) -.Lfinish_zero_unaligned: - - psrlq $32, %mm2 - popl %ebx - - movd %mm5, %eax - - movd %mm2, 4(%edx) - - emms - - ret diff --git 
a/mpi/pentium4/mmx/mpih-rshift.S b/mpi/pentium4/mmx/mpih-rshift.S deleted file mode 100644 index e3374e3b..00000000 --- a/mpi/pentium4/mmx/mpih-rshift.S +++ /dev/null @@ -1,453 +0,0 @@ -/* Intel Pentium-4 mpn_rshift -- right shift. - * - * Copyright 2001, 2002 Free Software Foundation, Inc. - * - * This file is part of Libgcrypt. - * - * Libgcrypt is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as - * published by the Free Software Foundation; either version 2.1 of - * the License, or (at your option) any later version. - * - * Libgcrypt is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA - * - * Note: This code is heavily based on the GNU MP Library. - * Actually it's the same code with only minor changes in the - * way the data is stored; this is to support the abstraction - * of an optional secure memory allocation which may be used - * to avoid revealing of sensitive data due to paging etc. - */ - - -#include "sysdep.h" -#include "asm-syntax.h" - - -/******************* - * mpi_limb_t - * _gcry_mpih_rshift( mpi_ptr_t wp, (sp + 4) - * mpi_ptr_t up, (sp + 8) - * mpi_size_t usize, (sp + 12) - * unsigned cnt) (sp + 16) - * - * P4 Willamette, Northwood: 1.75 cycles/limb - * P4 Prescott: 2.0 cycles/limb - */ - -.text - ALIGN (3) - .globl C_SYMBOL_NAME(_gcry_mpih_rshift) -C_SYMBOL_NAME(_gcry_mpih_rshift:) - pushl %ebx - pushl %edi - - - movl 20(%esp), %eax - movl 12(%esp), %edx - - movl 16(%esp), %ebx - movl 24(%esp), %ecx - - cmp $5, %eax - jae .Lunroll - - decl %eax - movl (%ebx), %edi - - jnz .Lsimple - - shrdl %cl, %edi, %eax - - shrl %cl, %edi - - movl %edi, (%edx) - popl %edi - - popl %ebx - - ret - - - - - - .align 8, 0x90 -.Lsimple: - - - - - - - - - - movd (%ebx), %mm5 - leal (%ebx,%eax,4), %ebx - - movd %ecx, %mm6 - leal -4(%edx,%eax,4), %edx - - psllq $32, %mm5 - negl %eax - - - - - - - -.Lsimple_top: - - - - - - - - - - movq (%ebx,%eax,4), %mm0 - incl %eax - - psrlq %mm6, %mm0 - - movd %mm0, (%edx,%eax,4) - jnz .Lsimple_top - - - movd (%ebx), %mm0 - psrlq %mm6, %mm5 - - psrlq %mm6, %mm0 - popl %edi - - movd %mm5, %eax - popl %ebx - - movd %mm0, 4(%edx) - - emms - - ret - - - - - - .align 8, 0x90 -.Lunroll: - - - - - - - - - - movd (%ebx), %mm5 - movl $4, %edi - - movd %ecx, %mm6 - testl %edi, %ebx - - psllq $32, %mm5 - jz .Lstart_src_aligned - - - - - - - - - - - - - - - - - movq (%ebx), %mm0 - - psrlq %mm6, %mm0 - addl $4, %ebx - - decl %eax - - movd %mm0, (%edx) - addl $4, %edx -.Lstart_src_aligned: - - - movq (%ebx), %mm1 - testl %edi, %edx - - psrlq %mm6, %mm5 - jz .Lstart_dst_aligned - - - - - - - - - - - - - - - - - - movq %mm1, %mm0 - addl $32, %ecx - - psrlq %mm6, %mm0 - - movd %ecx, %mm6 - - movd %mm0, (%edx) - addl $4, %edx -.Lstart_dst_aligned: - - - movq 8(%ebx), %mm3 - negl %ecx - - movq %mm3, %mm2 - addl $64, %ecx - - movd %ecx, %mm7 - psrlq %mm6, %mm1 - - leal -12(%ebx,%eax,4), %ebx - leal -20(%edx,%eax,4), %edx - - psllq %mm7, %mm3 - subl $7, %eax - - por %mm1, %mm3 - negl %eax - - jns .Lfinish - - - - - - - - - - - - - - - - .align 8, 0x90 -.Lunroll_loop: - - - - - - - - - - - - - - - - - 
movq (%ebx,%eax,4), %mm0 - psrlq %mm6, %mm2 - - movq %mm0, %mm1 - psllq %mm7, %mm0 - - movq %mm3, -8(%edx,%eax,4) - por %mm2, %mm0 - - movq 8(%ebx,%eax,4), %mm3 - psrlq %mm6, %mm1 - - movq %mm0, (%edx,%eax,4) - movq %mm3, %mm2 - - psllq %mm7, %mm3 - addl $4, %eax - - por %mm1, %mm3 - js .Lunroll_loop - - -.Lfinish: - - - testb $2, %al - - jnz .Lfinish_no_two - - movq (%ebx,%eax,4), %mm0 - psrlq %mm6, %mm2 - - movq %mm0, %mm1 - psllq %mm7, %mm0 - - movq %mm3, -8(%edx,%eax,4) - por %mm2, %mm0 - - movq %mm1, %mm2 - movq %mm0, %mm3 - - addl $2, %eax -.Lfinish_no_two: - - - - - - - - testb $1, %al - popl %edi - - movd %mm5, %eax - jnz .Lfinish_zero - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - movd 8(%ebx), %mm0 - psrlq %mm6, %mm2 - - movq %mm0, %mm1 - psllq %mm7, %mm0 - - movq %mm3, (%edx) - por %mm2, %mm0 - - psrlq %mm6, %mm1 - andl $32, %ecx - - popl %ebx - jz .Lfinish_one_unaligned - - - movd %mm1, 16(%edx) -.Lfinish_one_unaligned: - - movq %mm0, 8(%edx) - - emms - - ret - - - - -.Lfinish_zero: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - movq %mm3, 4(%edx) - psrlq %mm6, %mm2 - - movd %mm2, 12(%edx) - andl $32, %ecx - - popl %ebx - jz .Lfinish_zero_unaligned - - movq %mm2, 12(%edx) -.Lfinish_zero_unaligned: - - emms - - ret diff --git a/mpi/pentium4/sse2/distfiles b/mpi/pentium4/sse2/distfiles deleted file mode 100644 index 7252cd7e..00000000 --- a/mpi/pentium4/sse2/distfiles +++ /dev/null @@ -1,5 +0,0 @@ -mpih-add1.S -mpih-mul1.S -mpih-mul2.S -mpih-mul3.S -mpih-sub1.S diff --git a/mpi/pentium4/sse2/mpih-add1.S b/mpi/pentium4/sse2/mpih-add1.S deleted file mode 100644 index 55ed6630..00000000 --- a/mpi/pentium4/sse2/mpih-add1.S +++ /dev/null @@ -1,91 +0,0 @@ -/* Intel Pentium-4 mpn_add_n -- mpn addition. - * - * Copyright 2001, 2002 Free Software Foundation, Inc. - * - * This file is part of Libgcrypt. - * - * Libgcrypt is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as - * published by the Free Software Foundation; either version 2.1 of - * the License, or (at your option) any later version. - * - * Libgcrypt is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA - * - * Note: This code is heavily based on the GNU MP Library. - * Actually it's the same code with only minor changes in the - * way the data is stored; this is to support the abstraction - * of an optional secure memory allocation which may be used - * to avoid revealing of sensitive data due to paging etc. - */ - - -#include "sysdep.h" -#include "asm-syntax.h" - - - /******************* - * mpi_limb_t - * _gcry_mpih_add_n( mpi_ptr_t res_ptr, (sp + 4) - * mpi_ptr_t s1_ptr, (sp + 8) - * mpi_ptr_t s2_ptr, (sp + 12) - * mpi_size_t size) (sp + 16) - * - * P4 Willamette, Northwood: 4.0 cycles/limb if dst!=src1 and dst!=src2 - * 6.0 cycles/limb if dst==src1 or dst==src2 - * P4 Prescott: >= 5 cycles/limb - * - * The 4 c/l achieved here isn't particularly good, but is better than 9 c/l - * for a basic adc loop. 
- */ - - TEXT - ALIGN (3) - GLOBL C_SYMBOL_NAME(_gcry_mpih_add_n) -C_SYMBOL_NAME(_gcry_mpih_add_n:) - - pxor %mm0, %mm0 - - movl 8(%esp), %eax /* s1_ptr */ - movl %ebx, 8(%esp) /* re-use parameter space */ - movl 12(%esp), %ebx /* res_ptr */ - movl 4(%esp), %edx /* s2_ptr */ - movl 16(%esp), %ecx /* size */ - - leal (%eax,%ecx,4), %eax /* src1 end */ - leal (%ebx,%ecx,4), %ebx /* src2 end */ - leal (%edx,%ecx,4), %edx /* dst end */ - negl %ecx /* -size */ - -Ltop: -/* - C eax src1 end - C ebx src2 end - C ecx counter, limbs, negative - C edx dst end - C mm0 carry bit -*/ - - movd (%eax,%ecx,4), %mm1 - movd (%ebx,%ecx,4), %mm2 - paddq %mm2, %mm1 - - paddq %mm1, %mm0 - movd %mm0, (%edx,%ecx,4) - - psrlq $32, %mm0 - - addl $1, %ecx - jnz Ltop - - - movd %mm0, %eax - movl 8(%esp), %ebx /* restore saved EBX */ - emms - ret diff --git a/mpi/pentium4/sse2/mpih-mul1.S b/mpi/pentium4/sse2/mpih-mul1.S deleted file mode 100644 index a0c98fb4..00000000 --- a/mpi/pentium4/sse2/mpih-mul1.S +++ /dev/null @@ -1,96 +0,0 @@ -/* Intel Pentium-4 mpn_mul_1 -- Multiply a limb vector with a limb and store - * the result in a second limb vector. - * - * Copyright 2001, 2002, 2003, 2005 Free Software Foundation, Inc. - * - * This file is part of Libgcrypt. - * - * Libgcrypt is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as - * published by the Free Software Foundation; either version 2.1 of - * the License, or (at your option) any later version. - * - * Libgcrypt is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA - * - * Note: This code is heavily based on the GNU MP Library. - * Actually it's the same code with only minor changes in the - * way the data is stored; this is to support the abstraction - * of an optional secure memory allocation which may be used - * to avoid revealing of sensitive data due to paging etc. - */ - - -#include "sysdep.h" -#include "asm-syntax.h" - - -/******************* - * mpi_limb_t - * _gcry_mpih_mul_1( mpi_ptr_t res_ptr, (sp + 4) - * mpi_ptr_t s1_ptr, (sp + 8) - * mpi_size_t s1_size, (sp + 12) - * mpi_limb_t s2_limb) (sp + 16) - * - * src != dst src == dst - * P6 model 9 (Banias) ?.? - * P6 model 13 (Dothan) 4.75 4.75 - * P4 model 0 (Willamette) 4.0 6.0 - * P4 model 1 (?) 4.0 6.0 - * P4 model 2 (Northwood) 4.0 6.0 - * P4 model 3 (Prescott) ?.? ?.? - * P4 model 4 (Nocona) ?.? ?.? - * Unfortunately when src==dst the write-combining described in - * pentium4/README takes us up to 6 c/l. 
- * - */ - - TEXT - ALIGN (3) - GLOBL C_SYMBOL_NAME(_gcry_mpih_mul_1) -C_SYMBOL_NAME(_gcry_mpih_mul_1:); - - pxor %mm0, %mm0 - -.Lstart_1c: - movl 8(%esp), %eax - movd 16(%esp), %mm7 - movl 4(%esp), %edx - movl 12(%esp), %ecx - -.Ltop: - -/* - C eax src, incrementing - C ebx - C ecx counter, size iterations - C edx dst, incrementing - C - C mm0 carry limb - C mm7 multiplier -*/ - - movd (%eax), %mm1 - addl $4, %eax - pmuludq %mm7, %mm1 - - paddq %mm1, %mm0 - movd %mm0, (%edx) - addl $4, %edx - - psrlq $32, %mm0 - - subl $1, %ecx - jnz .Ltop - - - movd %mm0, %eax - emms - ret - diff --git a/mpi/pentium4/sse2/mpih-mul2.S b/mpi/pentium4/sse2/mpih-mul2.S deleted file mode 100644 index f975adfc..00000000 --- a/mpi/pentium4/sse2/mpih-mul2.S +++ /dev/null @@ -1,136 +0,0 @@ -/* Intel Pentium-4 mpn_addmul_1 -- Multiply a limb vector with a limb and add - * the result to a second limb vector. - * - * Copyright 2001, 2002, 2004, 2005 Free Software Foundation, Inc. - * - * This file is part of Libgcrypt. - * - * Libgcrypt is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as - * published by the Free Software Foundation; either version 2.1 of - * the License, or (at your option) any later version. - * - * Libgcrypt is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA - * - * Note: This code is heavily based on the GNU MP Library. - * Actually it's the same code with only minor changes in the - * way the data is stored; this is to support the abstraction - * of an optional secure memory allocation which may be used - * to avoid revealing of sensitive data due to paging etc. - */ - - -#include "sysdep.h" -#include "asm-syntax.h" - - -/******************* - * mpi_limb_t - * _gcry_mpih_addmul_1( mpi_ptr_t res_ptr, (sp + 4) - * mpi_ptr_t s1_ptr, (sp + 8) - * mpi_size_t s1_size, (sp + 12) - * mpi_limb_t s2_limb) (sp + 16) - * - * P3 model 9 (Banias) ?.? - * P3 model 13 (Dothan) 5.8 - * P4 model 0 (Willamette) 5.5 - * P4 model 1 (?) 5.5 - * P4 model 2 (Northwood) 5.5 - * P4 model 3 (Prescott) 6.0 - * P4 model 4 (Nocona) - * - * Only the carry limb propagation is on the dependent chain, but some other - * Pentium4 pipeline magic brings down performance to 6 cycles/l from the - * ideal 4 cycles/l. 
- */ - - - TEXT - ALIGN (4) - GLOBL C_SYMBOL_NAME(_gcry_mpih_addmul_1) -C_SYMBOL_NAME(_gcry_mpih_addmul_1:) - - pxor %mm4, %mm4 -.Lstart_1c: - movl 8(%esp), %eax - movl 12(%esp), %ecx - movl 4(%esp), %edx - movd 16(%esp), %mm7 - -/* - C eax src, incrementing ; 5B - C ecx loop counter, decrementing - C edx dst, incrementing - C - C mm4 carry, low 32-bits - C mm7 multiplier -*/ - - movd (%eax), %mm2 - pmuludq %mm7, %mm2 - - shrl $1, %ecx - jnc .Leven - - leal 4(%eax), %eax - movd (%edx), %mm1 - paddq %mm2, %mm1 - paddq %mm1, %mm4 - movd %mm4, (%edx) - psrlq $32, %mm4 - - testl %ecx, %ecx - jz .Lrtn - leal 4(%edx), %edx - - movd (%eax), %mm2 - pmuludq %mm7, %mm2 -.Leven: - movd 4(%eax), %mm0 - movd (%edx), %mm1 - pmuludq %mm7, %mm0 - - subl $1, %ecx - jz .Lend -.Lloop: - paddq %mm2, %mm1 - movd 8(%eax), %mm2 - paddq %mm1, %mm4 - movd 4(%edx), %mm3 - pmuludq %mm7, %mm2 - movd %mm4, (%edx) - psrlq $32, %mm4 - - paddq %mm0, %mm3 - movd 12(%eax), %mm0 - paddq %mm3, %mm4 - movd 8(%edx), %mm1 - pmuludq %mm7, %mm0 - movd %mm4, 4(%edx) - psrlq $32, %mm4 - - leal 8(%eax), %eax - leal 8(%edx), %edx - subl $1, %ecx - jnz .Lloop -.Lend: - paddq %mm2, %mm1 - paddq %mm1, %mm4 - movd 4(%edx), %mm3 - movd %mm4, (%edx) - psrlq $32, %mm4 - paddq %mm0, %mm3 - paddq %mm3, %mm4 - movd %mm4, 4(%edx) - psrlq $32, %mm4 -.Lrtn: - movd %mm4, %eax - emms - ret diff --git a/mpi/pentium4/sse2/mpih-mul3.S b/mpi/pentium4/sse2/mpih-mul3.S deleted file mode 100644 index ebcd2a68..00000000 --- a/mpi/pentium4/sse2/mpih-mul3.S +++ /dev/null @@ -1,127 +0,0 @@ -/* Intel Pentium-4 mpn_submul_1 -- Multiply a limb vector with a limb and - * subtract the result from a second limb vector. - * - * Copyright 2001, 2002 Free Software Foundation, Inc. - * - * This file is part of Libgcrypt. - * - * Libgcrypt is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as - * published by the Free Software Foundation; either version 2.1 of - * the License, or (at your option) any later version. - * - * Libgcrypt is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA - * - * Note: This code is heavily based on the GNU MP Library. - * Actually it's the same code with only minor changes in the - * way the data is stored; this is to support the abstraction - * of an optional secure memory allocation which may be used - * to avoid revealing of sensitive data due to paging etc. - */ - - -#include "sysdep.h" -#include "asm-syntax.h" - - -/******************* - * mpi_limb_t - * _gcry_mpih_submul_1( mpi_ptr_t res_ptr, (sp + 4) - * mpi_ptr_t s1_ptr, (sp + 8) - * mpi_size_t s1_size, (sp + 12) - * mpi_limb_t s2_limb) (sp + 16) - * - * P4: 7 cycles/limb, unstable timing, at least on early Pentium4 silicon - * (stepping 10). - * - * This code is not particularly good at 7 c/l. The dependent chain is only - * 4 c/l and there's only 4 MMX unit instructions, so it's not clear why that - * speed isn't achieved. - * - * The arrangements made here to get a two instruction dependent chain are - * slightly subtle. 
In the loop the carry (or borrow rather) is a negative - * so that a paddq can be used to give a low limb ready to store, and a high - * limb ready to become the new carry after a psrlq. - * - * If the carry was a simple twos complement negative then the psrlq shift - * would need to bring in 0 bits or 1 bits according to whether the high was - * zero or non-zero, since a non-zero value would represent a negative - * needing sign extension. That wouldn't be particularly easy to arrange and - * certainly would add an instruction to the dependent chain, so instead an - * offset is applied so that the high limb will be 0xFFFFFFFF+c. With c in - * the range -0xFFFFFFFF to 0, the value 0xFFFFFFFF+c is in the range 0 to - * 0xFFFFFFFF and is therefore always positive and can always have 0 bits - * shifted in, which is what psrlq does. - * - * The extra 0xFFFFFFFF must be subtracted before c is used, but that can be - * done off the dependent chain. The total adjustment then is to add - * 0xFFFFFFFF00000000 to offset the new carry, and subtract - * 0x00000000FFFFFFFF to remove the offset from the current carry, for a net - * add of 0xFFFFFFFE00000001. In the code this is applied to the destination - * limb when fetched. - * - * It's also possible to view the 0xFFFFFFFF adjustment as a ones-complement - * negative, which is how it's undone for the return value, but that doesn't - * seem as clear. -*/ - - TEXT - ALIGN (4) - GLOBL C_SYMBOL_NAME(_gcry_mpih_submul_1) -C_SYMBOL_NAME(_gcry_mpih_submul_1:) - - pxor %mm1, %mm1 - -.Lstart_1c: - movl 8(%esp), %eax - pcmpeqd %mm0, %mm0 - - movd 16(%esp), %mm7 - pcmpeqd %mm6, %mm6 - - movl 4(%esp), %edx - psrlq $32, %mm0 - - movl 12(%esp), %ecx - psllq $32, %mm6 - - psubq %mm0, %mm6 - - psubq %mm1, %mm0 - -/* - C eax src, incrementing - C ebx - C ecx loop counter, decrementing - C edx dst, incrementing - C - C mm0 0xFFFFFFFF - borrow - C mm6 0xFFFFFFFE00000001 - C mm7 multiplier -*/ - -.Lloop: - movd (%eax), %mm1 - leal 4(%eax), %eax - movd (%edx), %mm2 - paddq %mm6, %mm2 - pmuludq %mm7, %mm1 - psubq %mm1, %mm2 - paddq %mm2, %mm0 - subl $1, %ecx - movd %mm0, (%edx) - psrlq $32, %mm0 - leal 4(%edx), %edx - jnz .Lloop - - movd %mm0, %eax - notl %eax - emms - ret diff --git a/mpi/pentium4/sse2/mpih-sub1.S b/mpi/pentium4/sse2/mpih-sub1.S deleted file mode 100644 index 33900c74..00000000 --- a/mpi/pentium4/sse2/mpih-sub1.S +++ /dev/null @@ -1,112 +0,0 @@ -/* Intel Pentium-4 mpn_sub_n -- mpn subtraction. - * - * Copyright 2001, 2002 Free Software Foundation, Inc. - * - * This file is part of Libgcrypt. - * - * Libgcrypt is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as - * published by the Free Software Foundation; either version 2.1 of - * the License, or (at your option) any later version. - * - * Libgcrypt is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA - * - * Note: This code is heavily based on the GNU MP Library. 
- * Actually it's the same code with only minor changes in the - * way the data is stored; this is to support the abstraction - * of an optional secure memory allocation which may be used - * to avoid revealing of sensitive data due to paging etc. - */ - - -#include "sysdep.h" -#include "asm-syntax.h" - - -/******************* - * mpi_limb_t - * _gcry_mpih_sub_n( mpi_ptr_t res_ptr, (sp + 4) - * mpi_ptr_t s1_ptr, (sp + 8) - * mpi_ptr_t s2_ptr, (sp + 12) - * mpi_size_t size) (sp + 16) - * - * P4 Willamette, Northwood: 4.0 cycles/limb if dst!=src1 and dst!=src2 - * 6.0 cycles/limb if dst==src1 or dst==src2 - * P4 Prescott: >= 5 cycles/limb - * - * The main loop code is 2x unrolled so that the carry bit can alternate - * between mm0 and mm1. - */ - - -.text - ALIGN (3) - .globl C_SYMBOL_NAME(_gcry_mpih_sub_n) -C_SYMBOL_NAME(_gcry_mpih_sub_n:) - - pxor %mm0, %mm0 -.Lstart_nc: - movl 8(%esp), %eax - movl %ebx, 8(%esp) - movl 12(%esp), %ebx - movl 4(%esp), %edx - movl 16(%esp), %ecx - - leal (%eax,%ecx,4), %eax - leal (%ebx,%ecx,4), %ebx - leal (%edx,%ecx,4), %edx - negl %ecx - -.Ltop: -/* - C eax src1 end - C ebx src2 end - C ecx counter, limbs, negative - C edx dst end - C mm0 carry bit -*/ - - movd (%eax,%ecx,4), %mm1 - movd (%ebx,%ecx,4), %mm2 - psubq %mm2, %mm1 - - psubq %mm0, %mm1 - movd %mm1, (%edx,%ecx,4) - - psrlq $63, %mm1 - - addl $1, %ecx - jz .Ldone_mm1 - - movd (%eax,%ecx,4), %mm0 - movd (%ebx,%ecx,4), %mm2 - psubq %mm2, %mm0 - - psubq %mm1, %mm0 - movd %mm0, (%edx,%ecx,4) - - psrlq $63, %mm0 - - addl $1, %ecx - jnz .Ltop - - - movd %mm0, %eax - movl 8(%esp), %ebx - emms - ret - - - -.Ldone_mm1: - movd %mm1, %eax - movl 8(%esp), %ebx - emms - ret -- 2.32.0 From jussi.kivilinna at iki.fi Sat Jan 8 22:43:43 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 8 Jan 2022 23:43:43 +0200 Subject: [PATCH] mpi/amd64: remove extra 'ret' from assembly functions Message-ID: <20220108214343.486437-1-jussi.kivilinna@iki.fi> * mpi/amd64/mpih-add1.S: Remove 'ret' as it is already included by FUNC_EXIT macro. * mpi/amd64/mpih-lshift.S: Likewise. * mpi/amd64/mpih-mul1.S: Likewise. * mpi/amd64/mpih-mul2.S: Likewise. * mpi/amd64/mpih-mul3.S: Likewise. * mpi/amd64/mpih-rshift.S: Likewise. * mpi/amd64/mpih-sub1.S: Likewise. 
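To make the redundancy concrete: each of these functions ends with the
FUNC_EXIT() epilogue macro, whose expansion already finishes with a return,
so the trailing instruction is unreachable. A tiny stand-alone illustration
(the '#define' below is a hypothetical stand-in, not libgcrypt's actual
macro, whose exact expansion is not shown here):

    /* demo.S -- assemble with 'gcc -c demo.S' on x86-64 */
    #define FUNC_EXIT()  ret

        .text
        .globl  demo_return_42
    demo_return_42:
        movl    $42, %eax
        FUNC_EXIT()          /* the function already returns here */
        ret                  /* dead code -- the pattern this patch removes */
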
-- Signed-off-by: Jussi Kivilinna --- mpi/amd64/mpih-add1.S | 1 - mpi/amd64/mpih-lshift.S | 1 - mpi/amd64/mpih-mul1.S | 1 - mpi/amd64/mpih-mul2.S | 1 - mpi/amd64/mpih-mul3.S | 1 - mpi/amd64/mpih-rshift.S | 1 - mpi/amd64/mpih-sub1.S | 1 - 7 files changed, 7 deletions(-) diff --git a/mpi/amd64/mpih-add1.S b/mpi/amd64/mpih-add1.S index 157e5f1e..39c00c52 100644 --- a/mpi/amd64/mpih-add1.S +++ b/mpi/amd64/mpih-add1.S @@ -61,4 +61,3 @@ C_SYMBOL_NAME(_gcry_mpih_add_n:) movq %rcx, %rax /* zero %rax */ adcq %rax, %rax FUNC_EXIT() - ret diff --git a/mpi/amd64/mpih-lshift.S b/mpi/amd64/mpih-lshift.S index 76e9408f..a9c7d7e1 100644 --- a/mpi/amd64/mpih-lshift.S +++ b/mpi/amd64/mpih-lshift.S @@ -76,4 +76,3 @@ C_SYMBOL_NAME(_gcry_mpih_lshift:) .Lende: psllq %xmm1, %xmm2 movq %xmm2, (%rdi) FUNC_EXIT() - ret diff --git a/mpi/amd64/mpih-mul1.S b/mpi/amd64/mpih-mul1.S index 67ab47ea..dacb9d87 100644 --- a/mpi/amd64/mpih-mul1.S +++ b/mpi/amd64/mpih-mul1.S @@ -64,4 +64,3 @@ C_SYMBOL_NAME(_gcry_mpih_mul_1:) movq %r8, %rax FUNC_EXIT() - ret diff --git a/mpi/amd64/mpih-mul2.S b/mpi/amd64/mpih-mul2.S index 1aa4fa0a..07913586 100644 --- a/mpi/amd64/mpih-mul2.S +++ b/mpi/amd64/mpih-mul2.S @@ -63,4 +63,3 @@ C_SYMBOL_NAME(_gcry_mpih_addmul_1:) movq %r8, %rax FUNC_EXIT() - ret diff --git a/mpi/amd64/mpih-mul3.S b/mpi/amd64/mpih-mul3.S index bc41c4eb..f8889eb2 100644 --- a/mpi/amd64/mpih-mul3.S +++ b/mpi/amd64/mpih-mul3.S @@ -64,4 +64,3 @@ C_SYMBOL_NAME(_gcry_mpih_submul_1:) movq %r8, %rax FUNC_EXIT() - ret diff --git a/mpi/amd64/mpih-rshift.S b/mpi/amd64/mpih-rshift.S index d5e27974..8ecf155f 100644 --- a/mpi/amd64/mpih-rshift.S +++ b/mpi/amd64/mpih-rshift.S @@ -79,4 +79,3 @@ C_SYMBOL_NAME(_gcry_mpih_rshift:) .Lende: psrlq %xmm1, %xmm2 movq %xmm2, -8(%rdi) FUNC_EXIT() - ret diff --git a/mpi/amd64/mpih-sub1.S b/mpi/amd64/mpih-sub1.S index ccf64963..d60b58a5 100644 --- a/mpi/amd64/mpih-sub1.S +++ b/mpi/amd64/mpih-sub1.S @@ -60,4 +60,3 @@ C_SYMBOL_NAME(_gcry_mpih_sub_n:) movq %rcx, %rax /* zero %rax */ adcq %rax, %rax FUNC_EXIT() - ret -- 2.32.0 From jussi.kivilinna at iki.fi Tue Jan 11 20:00:08 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Tue, 11 Jan 2022 21:00:08 +0200 Subject: [PATCH] rijndael-aesni: small optimization for cbc-enc and cfb-enc Message-ID: <20220111190008.1220151-1-jussi.kivilinna@iki.fi> * cipher/rijndael-aesni.c (_gcry_aes_aesni_cfb_enc) (_gcry_aes_aesni_cbc_enc): Copy contents of 'do_aesni_enc' here and merge input/output and first/last round key xoring to shorten critical path. 
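The algebra behind the merge, for review: AESENCLAST only XORs its key
operand into the (SubBytes/ShiftRows-transformed) state, so the last round
key, the plaintext XOR of the mode and the key[0] whitening of the *next*
block can all be folded into one value prepared off the critical path. A
rough stand-alone sketch of the CFB-enc case with C intrinsics (a sketch
only, not the patch, which stays in inline assembly; 'cfb_enc_sketch' and
'rk' are made-up names, rk[0..rounds] being a caller-provided expanded key
schedule; build with AES-NI enabled, e.g. gcc -maes):

    #include <stddef.h>
    #include <wmmintrin.h>   /* AES-NI intrinsics */

    static void
    cfb_enc_sketch (const __m128i *rk, int rounds, __m128i *iv,
                    const __m128i *in, __m128i *out, size_t nblocks)
    {
      __m128i k0_klast = _mm_xor_si128 (rk[0], rk[rounds]);        /* key[0]^key[last] */
      __m128i state = _mm_xor_si128 (_mm_loadu_si128 (iv), rk[0]); /* IV ^ key[0] */
      size_t i;
      int r;

      for (i = 0; i < nblocks; i++)
        {
          /* Prepared outside the AES dependency chain:
             last = pt ^ key[0] ^ key[last].  */
          __m128i last = _mm_xor_si128 (_mm_loadu_si128 (&in[i]), k0_klast);

          for (r = 1; r < rounds; r++)
            state = _mm_aesenc_si128 (state, rk[r]);

          /* aesenclast(state, pt^key[0]^key[last]) = E(IV) ^ pt ^ key[0]
             = ct ^ key[0], which is already the whitened state for the next
             block, so no separate "IV ^ key[0]" step is left on the
             critical path.  */
          state = _mm_aesenclast_si128 (state, last);
          _mm_storeu_si128 (&out[i], _mm_xor_si128 (state, rk[0]));  /* ct */
        }

      _mm_storeu_si128 (iv, _mm_xor_si128 (state, rk[0]));  /* new IV = last ct */
    }

The CBC-enc path in the patch uses the same folding, there with the following
block's plaintext, so that aesenclast directly yields the whitened AES input
of the next block.
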
-- Benchmark on AMD Ryzen 7 5800X: Before: AES | nanosecs/byte mebibytes/sec cycles/byte auto Mhz CBC enc | 0.541 ns/B 1762 MiB/s 2.62 c/B 4850 CFB enc | 0.541 ns/B 1762 MiB/s 2.63 c/B 4850 After (5% faster): AES | nanosecs/byte mebibytes/sec cycles/byte auto Mhz CBC enc | 0.515 ns/B 1850 MiB/s 2.50 c/B 4850 CFB enc | 0.515 ns/B 1851 MiB/s 2.50 c/B 4850 Signed-off-by: Jussi Kivilinna --- cipher/rijndael-aesni.c | 201 +++++++++++++++++++++++++++++++++------- 1 file changed, 165 insertions(+), 36 deletions(-) diff --git a/cipher/rijndael-aesni.c b/cipher/rijndael-aesni.c index 34a4a447..ff6b0b26 100644 --- a/cipher/rijndael-aesni.c +++ b/cipher/rijndael-aesni.c @@ -1723,34 +1723,97 @@ _gcry_aes_aesni_cfb_enc (RIJNDAEL_context *ctx, unsigned char *iv, unsigned char *outbuf, const unsigned char *inbuf, size_t nblocks) { + unsigned int rounds = ctx->rounds; + aesni_prepare_2_7_variable; + aesni_prepare (); + aesni_prepare_2_7(); asm volatile ("movdqu %[iv], %%xmm0\n\t" - : /* No output */ - : [iv] "m" (*iv) - : "memory" ); + : /* No output */ + : [iv] "m" (*iv) + : "memory" ); + + asm volatile ("movdqa %[key0], %%xmm2\n\t" /* xmm2 = key[0] */ + "movdqa %[keylast], %%xmm4\n\t" /* xmm4 = key[last] */ + "movdqa %%xmm0, %%xmm3\n" + "pxor %%xmm2, %%xmm4\n\t" /* xmm4 = key[0] ^ key[last] */ + "pxor %%xmm2, %%xmm0\n\t" /* xmm0 = IV ^ key[0] */ + : /* No output */ + : [key0] "m" (ctx->keyschenc[0][0][0]), + [keylast] "m" (ctx->keyschenc[rounds][0][0]) + : "memory" ); for ( ;nblocks; nblocks-- ) { - do_aesni_enc (ctx); + asm volatile ("movdqu %[inbuf], %%xmm5\n\t" + "movdqa %%xmm2, %%xmm3\n\t" + "pxor %%xmm4, %%xmm5\n\t" /* xmm5 = input ^ key[last] ^ key[0] */ + : + : [inbuf] "m" (*inbuf) + : "memory" ); - asm volatile ("movdqu %[inbuf], %%xmm1\n\t" - "pxor %%xmm1, %%xmm0\n\t" - "movdqu %%xmm0, %[outbuf]\n\t" - : [outbuf] "=m" (*outbuf) - : [inbuf] "m" (*inbuf) - : "memory" ); +#define aesenc_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc1\n\t" +#define aesenclast_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc1\n\t" +#define aesenclast_xmm5_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc5\n\t" + asm volatile ("movdqa 0x10(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x20(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x30(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x40(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x50(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x60(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x70(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x80(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x90(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "cmpl $10, %[rounds]\n\t" + "jz .Lenclast%=\n\t" + "movdqa 0xa0(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0xb0(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "cmpl $12, %[rounds]\n\t" + "jz .Lenclast%=\n\t" + "movdqa 0xc0(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0xd0(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + + ".Lenclast%=:\n\t" + aesenclast_xmm5_xmm0 + : + : [key] "r" (ctx->keyschenc), + [rounds] "r" (rounds) + : "cc", "memory"); +#undef aesenc_xmm1_xmm0 +#undef aesenclast_xmm1_xmm0 +#undef aesenclast_xmm5_xmm0 + + asm volatile ("pxor %%xmm0, %%xmm3\n\t" + "movdqu %%xmm3, %[outbuf]\n\t" + : [outbuf] "=m" (*outbuf) + : [inbuf] "m" (*inbuf) + : "memory" ); outbuf += BLOCKSIZE; inbuf += BLOCKSIZE; } - asm volatile ("movdqu %%xmm0, %[iv]\n\t" - : [iv] "=m" (*iv) - : - : "memory" ); + asm volatile ("movdqu %%xmm3, %[iv]\n\t" + : [iv] "=m" (*iv) + : + : "memory" ); aesni_cleanup (); + aesni_cleanup_2_7 (); } @@ 
-1759,41 +1822,107 @@ _gcry_aes_aesni_cbc_enc (RIJNDAEL_context *ctx, unsigned char *iv, unsigned char *outbuf, const unsigned char *inbuf, size_t nblocks, int cbc_mac) { + unsigned int rounds = ctx->rounds; aesni_prepare_2_7_variable; + if (nblocks == 0) /* CMAC may call with nblocks 0. */ + return; + aesni_prepare (); aesni_prepare_2_7(); - asm volatile ("movdqu %[iv], %%xmm5\n\t" - : /* No output */ - : [iv] "m" (*iv) - : "memory" ); + asm volatile ("movdqu %[iv], %%xmm0\n\t" + : /* No output */ + : [iv] "m" (*iv) + : "memory" ); - for ( ;nblocks; nblocks-- ) + asm volatile ("movdqa %[key0], %%xmm2\n\t" /* xmm2 = key[0] */ + "movdqa %[keylast], %%xmm3\n\t" /* xmm3 = key[last] */ + "pxor %%xmm2, %%xmm0\n\t" /* xmm0 = IV ^ key[0] */ + "pxor %%xmm3, %%xmm2\n\t" /* xmm2 = key[0] ^ key[last] */ + : /* No output */ + : [key0] "m" (ctx->keyschenc[0][0][0]), + [keylast] "m" (ctx->keyschenc[rounds][0][0]) + : "memory" ); + + asm volatile ("movdqu %[inbuf], %%xmm4\n\t" + "pxor %%xmm4, %%xmm0\n\t" /* xmm0 = IV ^ key[0] ^ input */ + : + : [inbuf] "m" (*inbuf) + : "memory" ); + inbuf += BLOCKSIZE; + + for ( ;nblocks; ) { - asm volatile ("movdqu %[inbuf], %%xmm0\n\t" - "pxor %%xmm5, %%xmm0\n\t" - : /* No output */ - : [inbuf] "m" (*inbuf) - : "memory" ); + if (--nblocks) + { + asm volatile ("movdqu %[inbuf], %%xmm4\n\t" + /* xmm4 = IV ^ key[0] ^ key[last] ^ input: */ + "pxor %%xmm2, %%xmm4\n\t" + : + : [inbuf] "m" (*inbuf) + : "memory" ); + inbuf += BLOCKSIZE; + } - do_aesni_enc (ctx); +#define aesenc_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc1\n\t" +#define aesenclast_xmm4_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc4\n\t" +#define aesenclast_xmm3_xmm5 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xeb\n\t" + asm volatile ("movdqa 0x10(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x20(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x30(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x40(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x50(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x60(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x70(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x80(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x90(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "cmpl $10, %[rounds]\n\t" + "jz .Lenclast%=\n\t" + "movdqa 0xa0(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0xb0(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "cmpl $12, %[rounds]\n\t" + "jz .Lenclast%=\n\t" + "movdqa 0xc0(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0xd0(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + + ".Lenclast%=:\n\t" + "movdqa %%xmm0, %%xmm5\n" + aesenclast_xmm4_xmm0 /* xmm0 = IV ^ key[0] */ + aesenclast_xmm3_xmm5 /* xmm5 = IV */ + : + : [key] "r" (ctx->keyschenc), + [rounds] "r" (rounds) + : "cc", "memory"); +#undef aesenc_xmm1_xmm0 +#undef aesenclast_xmm4_xmm0 +#undef aesenclast_xmm3_xmm5 - asm volatile ("movdqa %%xmm0, %%xmm5\n\t" - "movdqu %%xmm0, %[outbuf]\n\t" - : [outbuf] "=m" (*outbuf) - : - : "memory" ); + asm volatile ("movdqu %%xmm5, %[outbuf]\n\t" + : [outbuf] "=m" (*outbuf) + : + : "memory" ); - inbuf += BLOCKSIZE; - if (!cbc_mac) - outbuf += BLOCKSIZE; + outbuf += -(!cbc_mac) & BLOCKSIZE; } asm volatile ("movdqu %%xmm5, %[iv]\n\t" - : [iv] "=m" (*iv) - : - : "memory" ); + : [iv] "=m" (*iv) + : + : "memory" ); aesni_cleanup (); aesni_cleanup_2_7 (); -- 2.32.0 From sam at gentoo.org Tue Jan 18 16:44:15 2022 From: sam at gentoo.org (Sam James) Date: Tue, 18 Jan 2022 15:44:15 +0000 Subject: [PATCH] cipher/cipher-gcm: fix build failure on ARM NEON Message-ID: 
<20220118154415.953374-1-sam@gentoo.org> 'features' is only defined when relevant CPU features are found, but one of the uses below its definition checked for GCM_USE_ARM_NEON which wasn't in the guard above it. i.e. We used to only define 'features' when: - GCM_USE_INTEL_PCLMUL - GCM_USE_ARM_PMULL - GCM_USE_S390X_CRYPTO - GCM_USE_PPC_VPMSUM - GCM_USE_S390X_CRYPTO - GCM_USE_PPC_VPMSUM is set. We were missing GCM_USE_ARM_NEON so when we check for GCM_USE_ARM_NEON below, it'd fail as features wasn't defined. Bug: https://bugs.gentoo.org/831397 --- cipher/cipher-gcm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cipher/cipher-gcm.c b/cipher/cipher-gcm.c index a039c5e9..22834f35 100644 --- a/cipher/cipher-gcm.c +++ b/cipher/cipher-gcm.c @@ -583,7 +583,8 @@ static void setupM (gcry_cipher_hd_t c) { #if defined(GCM_USE_INTEL_PCLMUL) || defined(GCM_USE_ARM_PMULL) || \ - defined(GCM_USE_S390X_CRYPTO) || defined(GCM_USE_PPC_VPMSUM) + defined(GCM_USE_ARM_NEON) || defined(GCM_USE_S390X_CRYPTO) || \ + defined(GCM_USE_PPC_VPMSUM) unsigned int features = _gcry_get_hw_features (); #endif -- 2.34.1 From sam at gentoo.org Tue Jan 18 13:52:52 2022 From: sam at gentoo.org (Sam James) Date: Tue, 18 Jan 2022 12:52:52 +0000 Subject: [PATCH] cipher/cipher-gcm: fix build failure on ARM NEON Message-ID: <20220118125252.809049-1-sam@gentoo.org> 'features' is only defined when relevant CPU features are found, but one of the uses below its definition checked for GCM_USE_ARM_NEON which wasn't in the guard above it. i.e. We used to only define 'features' when: - GCM_USE_INTEL_PCLMUL - GCM_USE_ARM_PMULL - GCM_USE_S390X_CRYPTO - GCM_USE_PPC_VPMSUM - GCM_USE_S390X_CRYPTO - GCM_USE_PPC_VPMSUM is set. We were missing GCM_USE_ARM_NEON so when we check for GCM_USE_ARM_NEON below, it'd fail as features wasn't defined. Bug: https://bugs.gentoo.org/831397 --- cipher/cipher-gcm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cipher/cipher-gcm.c b/cipher/cipher-gcm.c index a039c5e9..22834f35 100644 --- a/cipher/cipher-gcm.c +++ b/cipher/cipher-gcm.c @@ -583,7 +583,8 @@ static void setupM (gcry_cipher_hd_t c) { #if defined(GCM_USE_INTEL_PCLMUL) || defined(GCM_USE_ARM_PMULL) || \ - defined(GCM_USE_S390X_CRYPTO) || defined(GCM_USE_PPC_VPMSUM) + defined(GCM_USE_ARM_NEON) || defined(GCM_USE_S390X_CRYPTO) || \ + defined(GCM_USE_PPC_VPMSUM) unsigned int features = _gcry_get_hw_features (); #endif -- 2.34.1 From sam at gentoo.org Tue Jan 18 16:38:51 2022 From: sam at gentoo.org (Sam James) Date: Tue, 18 Jan 2022 15:38:51 +0000 Subject: [PATCH] cipher/cipher-gcm: fix build failure on ARM NEON Message-ID: <20220118153851.950749-1-sam@gentoo.org> 'features' is only defined when relevant CPU features are found, but one of the uses below its definition checked for GCM_USE_ARM_NEON which wasn't in the guard above it. i.e. We used to only define 'features' when: - GCM_USE_INTEL_PCLMUL - GCM_USE_ARM_PMULL - GCM_USE_S390X_CRYPTO - GCM_USE_PPC_VPMSUM - GCM_USE_S390X_CRYPTO - GCM_USE_PPC_VPMSUM is set. We were missing GCM_USE_ARM_NEON so when we check for GCM_USE_ARM_NEON below, it'd fail as features wasn't defined. 
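For anyone reproducing it without an ARM-only configuration, here is a
stripped-down stand-in showing the same guard mismatch (FEATURE_A/B/C,
get_hw_features and setup_demo are hypothetical names, not the real
cipher-gcm.c symbols):

    /* Compile with only -DFEATURE_C to reproduce the error; with
       -DFEATURE_A, or after adding FEATURE_C to the first guard
       (as this patch does for GCM_USE_ARM_NEON), it builds.  */
    static unsigned int
    get_hw_features (void)
    {
      return 1;
    }

    static void
    setup_demo (void)
    {
    #if defined(FEATURE_A) || defined(FEATURE_B)   /* FEATURE_C missing here */
      unsigned int features = get_hw_features ();
    #endif

    #ifdef FEATURE_C
      if (features & 1)      /* error: 'features' undeclared */
        {
          /* NEON-style setup would go here */
        }
    #endif
    }
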
Bug: https://bugs.gentoo.org/831397 --- cipher/cipher-gcm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cipher/cipher-gcm.c b/cipher/cipher-gcm.c index a039c5e9..22834f35 100644 --- a/cipher/cipher-gcm.c +++ b/cipher/cipher-gcm.c @@ -583,7 +583,8 @@ static void setupM (gcry_cipher_hd_t c) { #if defined(GCM_USE_INTEL_PCLMUL) || defined(GCM_USE_ARM_PMULL) || \ - defined(GCM_USE_S390X_CRYPTO) || defined(GCM_USE_PPC_VPMSUM) + defined(GCM_USE_ARM_NEON) || defined(GCM_USE_S390X_CRYPTO) || \ + defined(GCM_USE_PPC_VPMSUM) unsigned int features = _gcry_get_hw_features (); #endif -- 2.34.1 From gniibe at fsij.org Thu Jan 20 07:40:15 2022 From: gniibe at fsij.org (NIIBE Yutaka) Date: Thu, 20 Jan 2022 15:40:15 +0900 Subject: Balloon hashing (was: Argon2) In-Reply-To: References: <87v91yiu99.fsf@akagi.fsij.org> <584e3784-609d-b56a-1a00-ed8f244e0b42@iki.fi> <87v91pngwb.fsf@wheatstone.g10code.de> Message-ID: <87k0euoqpc.fsf@akagi.fsij.org> Hello, Last October, I wrote about possible addition of Argon2 to libgcrypt. Today, I am considering adding Balloon instead (or as well as Argon2). Background: These days, we try to prepare FIPS mode for coming libgcrypt 1.10. In this context of FIPS compliant things, I'm afraid Argon2 won't be approved algo by FIPS (in future). This week, I read this document of NIST: https://pages.nist.gov/800-63-3/sp800-63b.html#sec5 and it addresses Balloon [0] as one of examples. And I found that Balloon is more FIPS friendly, as it can use FIPS approved hash function. Just like Argon2, it has three parameters (parallelism, space cost, and time cost). Thus, it has same problem with the gcry_kdf_derive API (which only has "iterations"). For parallelism, Balloon approach is straight forward: tweaking salt for each worker thread, parallel computation by threads, and merging results by XOR. (It's a bit simpler than Argon2 where its H0 includes parallelism parameter.) So, I think that we could only offer single-thread version of Balloon by libgcrypt and assume use of parallelism by an application. This way, we may avoid introducing thread dependency in libgcrypt. [0] https://crypto.stanford.edu/balloon/ -- From bad at bsd.de Mon Jan 24 18:38:55 2022 From: bad at bsd.de (Christoph Badura) Date: Mon, 24 Jan 2022 18:38:55 +0100 Subject: PATCH random/rndgetentropy.c: fix build failure on macOS Message-ID: <20220124173855.GD23126@irregular-apocalypse.k.bsd.de> Before the weekend I did a speedo.mk build of gnupg off the master branches on an Intel MacBook running Big Sur with Xcode 13.2.1 using the MacOS SDK 12.1. libgcrypt fails in rndgetentropy.c because the prototype for getentropy() is missing. The prototype is provided by sys/random.h per the man pages. The following patch fixes this for me. --chris 1 file changed, 3 insertions(+) random/rndgetentropy.c | 3 +++ modified random/rndgetentropy.c @@ -23,6 +23,9 @@ #include #include #include +#ifdef __APPLE__ +#include /* getentropy(2) lives here */ +#endif #include #include #include From jussi.kivilinna at iki.fi Fri Jan 28 20:06:13 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Fri, 28 Jan 2022 21:06:13 +0200 Subject: [PATCH 1/4] tests/t-kdf: few changes to pthread example and fix win32/win64 builds Message-ID: <20220128190616.884237-1-jussi.kivilinna@iki.fi> * src/gcrypt.h.in (gcry_kdf_thread_ops_t): New based on 'struct gcry_kdf_thread_ops'. (gcry_kdf_compute): Use 'gcry_kdf_thread_ops_t' instead of 'struct gcry_kdf_thread_ops'. 
* tests/Makefile.am: Define 't_kdf_LDADD' and 't_kdf_CFLAGS' on win32/win64 target too. * tests/t-kdf.c (pthread_jobs_launch_job): Set 'oldest_thread_idx' on first thread creation. (wait_all_jobs_completion): Reset 'oldest_thread_idx' to -1. (my_kdf_derive): Merge HAVE_PTHREAD ifdefs; Initialize 'oldest_thread_idx' to -1. -- Windows build was not working because of missing HAVE_PTHREAD in 't-kdf.c' and LDADD/CFLAGS issue in 'Makefile.am'. Signed-off-by: Jussi Kivilinna --- src/gcrypt.h.in | 7 ++++--- tests/Makefile.am | 2 ++ tests/t-kdf.c | 46 +++++++++++++++++++++++----------------------- 3 files changed, 29 insertions(+), 26 deletions(-) diff --git a/src/gcrypt.h.in b/src/gcrypt.h.in index 5e016932..680f634f 100644 --- a/src/gcrypt.h.in +++ b/src/gcrypt.h.in @@ -1620,11 +1620,12 @@ typedef int (*gcry_kdf_lauch_job_t) (void *jobs_context, typedef int (*gcry_kdf_wait_all_jobs_completion_t) (void *jobs_context); /* Exposed structure for KDF computation to decouple thread functionality. */ -struct gcry_kdf_thread_ops { +typedef struct gcry_kdf_thread_ops +{ void *jobs_context; gcry_kdf_lauch_job_t launch_job; gcry_kdf_wait_all_jobs_completion_t wait_all_jobs_completion; -}; +} gcry_kdf_thread_ops_t; gcry_error_t gcry_kdf_open (gcry_kdf_hd_t *hd, int algo, int subalgo, const unsigned long *param, unsigned int paramlen, @@ -1633,7 +1634,7 @@ gcry_error_t gcry_kdf_open (gcry_kdf_hd_t *hd, int algo, int subalgo, const void *key, size_t keylen, const void *ad, size_t adlen); gcry_error_t gcry_kdf_compute (gcry_kdf_hd_t h, - const struct gcry_kdf_thread_ops *ops); + const gcry_kdf_thread_ops_t *ops); gcry_error_t gcry_kdf_final (gcry_kdf_hd_t h, size_t resultlen, void *result); void gcry_kdf_close (gcry_kdf_hd_t h); diff --git a/tests/Makefile.am b/tests/Makefile.am index b42156f0..e6953fd3 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -89,6 +89,8 @@ if HAVE_W32_SYSTEM xtestsuite_libs = ../src/.libs/libgcrypt-20.dll \ $(prefix)/bin/libgpg-error*-0.dll xtestsuite_driver = .libs/testdrv.exe +t_kdf_LDADD = $(standard_ldadd) $(GPG_ERROR_LIBS) @LDADD_FOR_TESTS_KLUDGE@ +t_kdf_CFLAGS = $(GPG_ERROR_CFLAGS) else xtestsuite_libs = ../src/.libs/libgcrypt.so* xtestsuite_driver = testdrv diff --git a/tests/t-kdf.c b/tests/t-kdf.c index 59559a4c..d61159e3 100644 --- a/tests/t-kdf.c +++ b/tests/t-kdf.c @@ -1255,7 +1255,8 @@ struct user_defined_threads_ctx int num_threads_running; pthread_attr_t attr; pthread_t thread[MAX_THREADS]; - struct job_thread_param { + struct job_thread_param + { void (*job) (void *work_priv); void *priv; } work[MAX_THREADS]; @@ -1275,8 +1276,7 @@ pthread_jobs_launch_job (void *jobs_context, { struct user_defined_threads_ctx *ctx = jobs_context; - if (ctx->num_threads_running - && ctx->next_thread_idx == ctx->oldest_thread_idx) + if (ctx->next_thread_idx == ctx->oldest_thread_idx) { assert (ctx->num_threads_running == MAX_THREADS); /* thread limit reached, join a thread */ @@ -1289,6 +1289,8 @@ pthread_jobs_launch_job (void *jobs_context, ctx->work[ctx->next_thread_idx].priv = work_priv; pthread_create (&ctx->thread[ctx->next_thread_idx], &ctx->attr, job_thread, &ctx->work[ctx->next_thread_idx]); + if (ctx->oldest_thread_idx < 0) + ctx->oldest_thread_idx = ctx->next_thread_idx; ctx->next_thread_idx = (ctx->next_thread_idx + 1) % MAX_THREADS; ctx->num_threads_running++; return 0; @@ -1308,7 +1310,7 @@ wait_all_jobs_completion (void *jobs_context) /* reset context for next round of parallel work */ ctx->num_threads_running = 0; - ctx->oldest_thread_idx = 0; + 
ctx->oldest_thread_idx = -1; ctx->next_thread_idx = 0; return 0; @@ -1327,9 +1329,8 @@ my_kdf_derive (int parallel, { gcry_error_t err; gcry_kdf_hd_t hd; -#ifdef HAVE_PTHREAD - struct user_defined_threads_ctx jobs_context; -#endif + + (void)parallel; err = gcry_kdf_open (&hd, algo, subalgo, params, paramslen, pass, passlen, salt, saltlen, key, keylen, @@ -1340,7 +1341,16 @@ my_kdf_derive (int parallel, #ifdef HAVE_PTHREAD if (parallel) { + struct user_defined_threads_ctx jobs_context; + const gcry_kdf_thread_ops_t ops = + { + &jobs_context, + pthread_jobs_launch_job, + wait_all_jobs_completion + }; + memset (&jobs_context, 0, sizeof (struct user_defined_threads_ctx)); + jobs_context.oldest_thread_idx = -1; if (pthread_attr_init (&jobs_context.attr)) { @@ -1357,26 +1367,16 @@ my_kdf_derive (int parallel, gcry_kdf_close (hd); return err; } - } -#endif - - if (!parallel) - err = gcry_kdf_compute (hd, NULL); - else - { - struct gcry_kdf_thread_ops ops = { - &jobs_context, - pthread_jobs_launch_job, - wait_all_jobs_completion - }; err = gcry_kdf_compute (hd, &ops); - } -#ifdef HAVE_PTHREAD - if (parallel) - pthread_attr_destroy (&jobs_context. attr); + pthread_attr_destroy (&jobs_context. attr); + } + else #endif + { + err = gcry_kdf_compute (hd, NULL); + } if (!err) err = gcry_kdf_final (hd, outlen, out); -- 2.32.0 From jussi.kivilinna at iki.fi Fri Jan 28 20:06:14 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Fri, 28 Jan 2022 21:06:14 +0200 Subject: [PATCH 2/4] Rename KDF job functions and function types In-Reply-To: <20220128190616.884237-1-jussi.kivilinna@iki.fi> References: <20220128190616.884237-1-jussi.kivilinna@iki.fi> Message-ID: <20220128190616.884237-2-jussi.kivilinna@iki.fi> * src/gcrypt.h.in (gcry_kdf_job_fn_t): New. (gcry_kdf_dispatch_job_fn_t): Renamed from 'gcry_kdf_lauch_job_t'; Use 'gcry_kdf_job_fn_t' for function pointer parameter. (gcry_kdf_wait_all_jobs_fn_t): Renamed from 'gcry_kdf_wait_all_jobs_completion_t'. (gcry_kdf_thread_ops_t): Rename functions to 'dispatch_job' and 'wait_all_jobs'. * cipher/kdf.c (argon2_compute): Change to use 'dispatch_job' and 'wait_all_jobs'. * tests/t-kdf.c (job_thread_param, pthread_jobs_launch_job): Use 'gcry_kdf_job_fn_t' type for 'job'. -- Rename 'launch_job' to 'dispatch_job', dispatch feels better word to describe the action here. Also remove '_completion' from wait_all function name as it makes name unnecessary long. Signed-off-by: Jussi Kivilinna --- cipher/kdf.c | 6 +++--- src/gcrypt.h.in | 14 +++++++------- tests/t-kdf.c | 8 ++++---- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/cipher/kdf.c b/cipher/kdf.c index 94cd064f..d426b608 100644 --- a/cipher/kdf.c +++ b/cipher/kdf.c @@ -719,14 +719,14 @@ argon2_compute (argon2_ctx_t a, const struct gcry_kdf_thread_ops *ops) thread_data->lane = l; if (ops) - ops->launch_job (ops->jobs_context, - argon2_compute_segment, thread_data); + ops->dispatch_job (ops->jobs_context, + argon2_compute_segment, thread_data); else argon2_compute_segment (thread_data); } if (ops) - ops->wait_all_jobs_completion (ops->jobs_context); + ops->wait_all_jobs (ops->jobs_context); } return 0; diff --git a/src/gcrypt.h.in b/src/gcrypt.h.in index 680f634f..2fd47292 100644 --- a/src/gcrypt.h.in +++ b/src/gcrypt.h.in @@ -1613,18 +1613,18 @@ gpg_error_t gcry_kdf_derive (const void *passphrase, size_t passphraselen, /* Another API to derive a key from a passphrase. 
*/ typedef struct gcry_kdf_handle *gcry_kdf_hd_t; - -typedef int (*gcry_kdf_lauch_job_t) (void *jobs_context, - void (*job) (void *work_priv), - void *work_priv); -typedef int (*gcry_kdf_wait_all_jobs_completion_t) (void *jobs_context); +typedef void (*gcry_kdf_job_fn_t) (void *priv); +typedef int (*gcry_kdf_dispatch_job_fn_t) (void *jobs_context, + gcry_kdf_job_fn_t job_fn, + void *job_priv); +typedef int (*gcry_kdf_wait_all_jobs_fn_t) (void *jobs_context); /* Exposed structure for KDF computation to decouple thread functionality. */ typedef struct gcry_kdf_thread_ops { void *jobs_context; - gcry_kdf_lauch_job_t launch_job; - gcry_kdf_wait_all_jobs_completion_t wait_all_jobs_completion; + gcry_kdf_dispatch_job_fn_t dispatch_job; + gcry_kdf_wait_all_jobs_fn_t wait_all_jobs; } gcry_kdf_thread_ops_t; gcry_error_t gcry_kdf_open (gcry_kdf_hd_t *hd, int algo, int subalgo, diff --git a/tests/t-kdf.c b/tests/t-kdf.c index d61159e3..8844e111 100644 --- a/tests/t-kdf.c +++ b/tests/t-kdf.c @@ -1257,7 +1257,7 @@ struct user_defined_threads_ctx pthread_t thread[MAX_THREADS]; struct job_thread_param { - void (*job) (void *work_priv); + gcry_kdf_job_fn_t job; void *priv; } work[MAX_THREADS]; }; @@ -1271,8 +1271,8 @@ job_thread (void *p) } static int -pthread_jobs_launch_job (void *jobs_context, - void (*job) (void *work_priv), void *work_priv) +pthread_jobs_launch_job (void *jobs_context, gcry_kdf_job_fn_t job, + void *job_priv) { struct user_defined_threads_ctx *ctx = jobs_context; @@ -1286,7 +1286,7 @@ pthread_jobs_launch_job (void *jobs_context, } ctx->work[ctx->next_thread_idx].job = job; - ctx->work[ctx->next_thread_idx].priv = work_priv; + ctx->work[ctx->next_thread_idx].priv = job_priv; pthread_create (&ctx->thread[ctx->next_thread_idx], &ctx->attr, job_thread, &ctx->work[ctx->next_thread_idx]); if (ctx->oldest_thread_idx < 0) -- 2.32.0 From jussi.kivilinna at iki.fi Fri Jan 28 20:06:15 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Fri, 28 Jan 2022 21:06:15 +0200 Subject: [PATCH 3/4] kdf/argon2: use BLAKE2b hash_buffers function instead of _gcry_md_* In-Reply-To: <20220128190616.884237-1-jussi.kivilinna@iki.fi> References: <20220128190616.884237-1-jussi.kivilinna@iki.fi> Message-ID: <20220128190616.884237-3-jussi.kivilinna@iki.fi> * cipher/kdf.c (argon2_fill_first_blocks): Convert to use iov hash_buffers API instead of _gcry_md_*. -- More direct use of BLAKE2b avoids overhead from md object creation and cleanup. Signed-off-by: Jussi Kivilinna --- cipher/kdf.c | 97 ++++++++++++++++++++++++++++------------------------ 1 file changed, 52 insertions(+), 45 deletions(-) diff --git a/cipher/kdf.c b/cipher/kdf.c index d426b608..74c5b753 100644 --- a/cipher/kdf.c +++ b/cipher/kdf.c @@ -367,59 +367,66 @@ xor_block (u64 *dst, const u64 *src) static gpg_err_code_t argon2_fill_first_blocks (argon2_ctx_t a) { - gpg_err_code_t ec; unsigned char h0_01_i[72]; - const unsigned char *digest; - unsigned char buf[4]; + unsigned char buf[10][4]; + gcry_buffer_t iov[8]; + unsigned int iov_count = 0; int i; - gcry_md_hd_t hd; - - ec = _gcry_md_open (&hd, GCRY_MD_BLAKE2B_512, 0); - if (ec) - return ec; /* Generate H0. 
*/ - buf_put_le32 (buf, a->lanes); - _gcry_md_write (hd, buf, 4); - - buf_put_le32 (buf, a->outlen); - _gcry_md_write (hd, buf, 4); - - buf_put_le32 (buf, a->m_cost); - _gcry_md_write (hd, buf, 4); - - buf_put_le32 (buf, a->passes); - _gcry_md_write (hd, buf, 4); - - buf_put_le32 (buf, ARGON2_VERSION); - _gcry_md_write (hd, buf, 4); - - buf_put_le32 (buf, a->hash_type); - _gcry_md_write (hd, buf, 4); - - buf_put_le32 (buf, a->passwordlen); - _gcry_md_write (hd, buf, 4); - _gcry_md_write (hd, a->password, a->passwordlen); - - buf_put_le32 (buf, a->saltlen); - _gcry_md_write (hd, buf, 4); - _gcry_md_write (hd, a->salt, a->saltlen); - - buf_put_le32 (buf, a->keylen); - _gcry_md_write (hd, buf, 4); + buf_put_le32 (buf[0], a->lanes); + buf_put_le32 (buf[1], a->outlen); + buf_put_le32 (buf[2], a->m_cost); + buf_put_le32 (buf[3], a->passes); + buf_put_le32 (buf[4], ARGON2_VERSION); + buf_put_le32 (buf[5], a->hash_type); + buf_put_le32 (buf[6], a->passwordlen); + iov[iov_count].data = buf[0]; + iov[iov_count].len = 4 * 7; + iov[iov_count].off = 0; + iov_count++; + iov[iov_count].data = (void *)a->password; + iov[iov_count].len = a->passwordlen; + iov[iov_count].off = 0; + iov_count++; + + buf_put_le32 (buf[7], a->saltlen); + iov[iov_count].data = buf[7]; + iov[iov_count].len = 4; + iov[iov_count].off = 0; + iov_count++; + iov[iov_count].data = (void *)a->salt; + iov[iov_count].len = a->saltlen; + iov[iov_count].off = 0; + iov_count++; + + buf_put_le32 (buf[8], a->keylen); + iov[iov_count].data = buf[8]; + iov[iov_count].len = 4; + iov[iov_count].off = 0; + iov_count++; if (a->key) - _gcry_md_write (hd, a->key, a->keylen); + { + iov[iov_count].data = (void *)a->key; + iov[iov_count].len = a->keylen; + iov[iov_count].off = 0; + iov_count++; + } - buf_put_le32 (buf, a->adlen); - _gcry_md_write (hd, buf, 4); + buf_put_le32 (buf[9], a->adlen); + iov[iov_count].data = buf[9]; + iov[iov_count].len = 4; + iov[iov_count].off = 0; + iov_count++; if (a->ad) - _gcry_md_write (hd, a->ad, a->adlen); - - digest = _gcry_md_read (hd, GCRY_MD_BLAKE2B_512); - - memcpy (h0_01_i, digest, 64); + { + iov[iov_count].data = (void *)a->ad; + iov[iov_count].len = a->adlen; + iov[iov_count].off = 0; + iov_count++; + } - _gcry_md_close (hd); + _gcry_digest_spec_blake2b_512.hash_buffers (h0_01_i, 64, iov, iov_count); for (i = 0; i < a->lanes; i++) { -- 2.32.0 From jussi.kivilinna at iki.fi Fri Jan 28 20:06:16 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Fri, 28 Jan 2022 21:06:16 +0200 Subject: [PATCH 4/4] kdf: handle errors from thread dispatch/wait functions In-Reply-To: <20220128190616.884237-1-jussi.kivilinna@iki.fi> References: <20220128190616.884237-1-jussi.kivilinna@iki.fi> Message-ID: <20220128190616.884237-4-jussi.kivilinna@iki.fi> * cipher/kdf.c (argon2_compute): Handle failed job dispatch/wait. * tests/t-kdf.c (pthread_jobs_launch_job) (wait_all_jobs_completion): Handle errors returned from pthread functions. -- This allows thread helpers to return error code, which causes KDF processing to stop. 
Signed-off-by: Jussi Kivilinna --- cipher/kdf.c | 15 ++++++++++++--- tests/t-kdf.c | 24 ++++++++++++++++++++---- 2 files changed, 32 insertions(+), 7 deletions(-) diff --git a/cipher/kdf.c b/cipher/kdf.c index 74c5b753..79dc6cd8 100644 --- a/cipher/kdf.c +++ b/cipher/kdf.c @@ -706,6 +706,7 @@ argon2_compute (argon2_ctx_t a, const struct gcry_kdf_thread_ops *ops) unsigned int r; unsigned int s; unsigned int l; + int ret; ec = argon2_fill_first_blocks (a); if (ec) @@ -726,14 +727,22 @@ argon2_compute (argon2_ctx_t a, const struct gcry_kdf_thread_ops *ops) thread_data->lane = l; if (ops) - ops->dispatch_job (ops->jobs_context, - argon2_compute_segment, thread_data); + { + ret = ops->dispatch_job (ops->jobs_context, + argon2_compute_segment, thread_data); + if (ret < 0) + return GPG_ERR_CANCELED; + } else argon2_compute_segment (thread_data); } if (ops) - ops->wait_all_jobs (ops->jobs_context); + { + ret = ops->wait_all_jobs (ops->jobs_context); + if (ret < 0) + return GPG_ERR_CANCELED; + } } return 0; diff --git a/tests/t-kdf.c b/tests/t-kdf.c index 8844e111..4c82fed8 100644 --- a/tests/t-kdf.c +++ b/tests/t-kdf.c @@ -1270,25 +1270,38 @@ job_thread (void *p) pthread_exit (NULL); } +static int +wait_all_jobs_completion (void *jobs_context); + static int pthread_jobs_launch_job (void *jobs_context, gcry_kdf_job_fn_t job, void *job_priv) { struct user_defined_threads_ctx *ctx = jobs_context; + int ret; if (ctx->next_thread_idx == ctx->oldest_thread_idx) { assert (ctx->num_threads_running == MAX_THREADS); /* thread limit reached, join a thread */ - pthread_join (ctx->thread[ctx->oldest_thread_idx], NULL); + ret = pthread_join (ctx->thread[ctx->oldest_thread_idx], NULL); + if (ret) + return -1; ctx->oldest_thread_idx = (ctx->oldest_thread_idx + 1) % MAX_THREADS; ctx->num_threads_running--; } ctx->work[ctx->next_thread_idx].job = job; ctx->work[ctx->next_thread_idx].priv = job_priv; - pthread_create (&ctx->thread[ctx->next_thread_idx], &ctx->attr, - job_thread, &ctx->work[ctx->next_thread_idx]); + ret = pthread_create (&ctx->thread[ctx->next_thread_idx], &ctx->attr, + job_thread, &ctx->work[ctx->next_thread_idx]); + if (ret) + { + /* could not create new thread. */ + (void)wait_all_jobs_completion (jobs_context); + return -1; + } + if (ctx->oldest_thread_idx < 0) ctx->oldest_thread_idx = ctx->next_thread_idx; ctx->next_thread_idx = (ctx->next_thread_idx + 1) % MAX_THREADS; @@ -1301,11 +1314,14 @@ wait_all_jobs_completion (void *jobs_context) { struct user_defined_threads_ctx *ctx = jobs_context; int i, idx; + int ret; for (i = 0; i < ctx->num_threads_running; i++) { idx = (ctx->oldest_thread_idx + i) % MAX_THREADS; - pthread_join (ctx->thread[idx], NULL); + ret = pthread_join (ctx->thread[idx], NULL); + if (ret) + return -1; } /* reset context for next round of parallel work */ -- 2.32.0