From fweimer at redhat.com Sat Dec 3 12:15:24 2022 From: fweimer at redhat.com (Florian Weimer) Date: Sat, 03 Dec 2022 12:15:24 +0100 Subject: Port configure script to C99 Message-ID: <87tu2chh6b.fsf@oldenburg.str.redhat.com> We are working to switch GCC to stricter defaults, catching up to C99 for a start. In support of that, we are trying to build Fedora with such a compiler. We noticed that libgcrypt fails to build because it uses implicit function declarations in the configure script. The patch below should fix that. Thanks, Florian diff --git a/configure.ac b/configure.ac index 6ea38f53b8548ee8..2baf25bc7d9481e6 100644 --- a/configure.ac +++ b/configure.ac @@ -1211,7 +1211,8 @@ AC_CACHE_CHECK([whether GCC assembler is compatible for ARM assembly implementat /* Test if '.type' and '.size' are supported. */ ".size asmfunc,.-asmfunc;\n\t" ".type asmfunc,%function;\n\t" - );]], [ asmfunc(); ] )], + ); + void asmfunc(void);]], [ asmfunc(); ] )], [gcry_cv_gcc_arm_platform_as_ok=yes]) fi]) if test "$gcry_cv_gcc_arm_platform_as_ok" = "yes" ; then @@ -1238,7 +1239,8 @@ AC_CACHE_CHECK([whether GCC assembler is compatible for ARMv8/Aarch64 assembly i "eor x0, x0, x30, ror #12;\n\t" "add x0, x0, x30, asr #12;\n\t" "eor v0.16b, v0.16b, v31.16b;\n\t" - );]], [ asmfunc(); ] )], + ); + void asmfunc(void);]], [ asmfunc(); ] )], [gcry_cv_gcc_aarch64_platform_as_ok=yes]) fi]) if test "$gcry_cv_gcc_aarch64_platform_as_ok" = "yes" ; then @@ -1267,7 +1269,8 @@ AC_CACHE_CHECK([whether GCC assembler supports for CFI directives], ".cfi_restore_state\n\t" ".long 0\n\t" ".cfi_endproc\n\t" - );]])], + ); + void asmfunc(void)]])], [gcry_cv_gcc_asm_cfi_directives=yes])]) if test "$gcry_cv_gcc_asm_cfi_directives" = "yes" ; then AC_DEFINE(HAVE_GCC_ASM_CFI_DIRECTIVES,1, @@ -1666,7 +1669,8 @@ if test $amd64_as_feature_detection = yes; then [gcry_cv_gcc_as_const_division_ok], [gcry_cv_gcc_as_const_division_ok=no AC_LINK_IFELSE([AC_LANG_PROGRAM( - [[__asm__(".text\n\tfn:\n\t xorl \$(123456789/12345678), %ebp;\n\t");]], + [[__asm__(".text\n\tfn:\n\t xorl \$(123456789/12345678), %ebp;\n\t"); + void fn(void);]], [fn();])], [gcry_cv_gcc_as_const_division_ok=yes])]) if test "$gcry_cv_gcc_as_const_division_ok" = "no" ; then @@ -1679,7 +1683,8 @@ if test $amd64_as_feature_detection = yes; then [gcry_cv_gcc_as_const_division_with_wadivide_ok], [gcry_cv_gcc_as_const_division_with_wadivide_ok=no AC_LINK_IFELSE([AC_LANG_PROGRAM( - [[__asm__(".text\n\tfn:\n\t xorl \$(123456789/12345678), %ebp;\n\t");]], + [[__asm__(".text\n\tfn:\n\t xorl \$(123456789/12345678), %ebp;\n\t"); + void fn(void);]], [fn();])], [gcry_cv_gcc_as_const_division_with_wadivide_ok=yes])]) if test "$gcry_cv_gcc_as_const_division_with_wadivide_ok" = "no" ; then @@ -1715,7 +1720,8 @@ if test $amd64_as_feature_detection = yes; then * and "-Wa,--divide" workaround failed, this causes assembly * to be disable on this machine. 
*/ "xorl \$(123456789/12345678), %ebp;\n\t" - );]], [ asmfunc(); ])], + ); + void asmfunc(void);]], [ asmfunc(); ])], [gcry_cv_gcc_amd64_platform_as_ok=yes]) fi]) if test "$gcry_cv_gcc_amd64_platform_as_ok" = "yes" ; then @@ -1734,7 +1740,8 @@ if test $amd64_as_feature_detection = yes; then ".globl asmfunc\n\t" "asmfunc:\n\t" "xorq \$(1234), %rbp;\n\t" - );]], [ asmfunc(); ])], + ); + void asmfunc(void);]], [ asmfunc(); ])], [gcry_cv_gcc_win64_platform_as_ok=yes])]) if test "$gcry_cv_gcc_win64_platform_as_ok" = "yes" ; then AC_DEFINE(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS,1, @@ -1767,7 +1774,8 @@ AC_CACHE_CHECK([whether GCC assembler is compatible for Intel syntax assembly im "sub eax, [esp + 4];\n\t" "add dword ptr [esp + eax], 0b10101;\n\t" ".att_syntax prefix\n\t" - );]], [ actest(); ])], + ); + void actest(void);]], [ actest(); ])], [gcry_cv_gcc_platform_as_ok_for_intel_syntax=yes]) fi]) if test "$gcry_cv_gcc_platform_as_ok_for_intel_syntax" = "yes" ; then @@ -1832,6 +1840,7 @@ AC_CACHE_CHECK([whether GCC inline assembler supports NEON instructions], "vadd.u64 %q0, %q1;\n\t" "vadd.s64 %d3, %d2, %d3;\n\t" ); + void testfn(void); ]], [ testfn(); ])], [gcry_cv_gcc_inline_asm_neon=yes]) fi]) @@ -1879,6 +1888,7 @@ AC_CACHE_CHECK([whether GCC inline assembler supports AArch32 Crypto Extension i "vmull.p64 q0, d0, d0;\n\t" ); + void testfn(void); ]], [ testfn(); ])], [gcry_cv_gcc_inline_asm_aarch32_crypto=yes]) fi]) @@ -1907,6 +1917,7 @@ AC_CACHE_CHECK([whether GCC inline assembler supports AArch64 NEON instructions] "dup v0.8b, w0;\n\t" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b},[x0],\#32;\n\t" ); + void testfn(void); ]], [ testfn(); ])], [gcry_cv_gcc_inline_asm_aarch64_neon=yes]) fi]) @@ -1955,6 +1966,7 @@ AC_CACHE_CHECK([whether GCC inline assembler supports AArch64 Crypto Extension i "pmull v0.1q, v0.1d, v31.1d;\n\t" "pmull2 v0.1q, v0.2d, v31.2d;\n\t" ); + void testfn(void); ]], [ testfn(); ])], [gcry_cv_gcc_inline_asm_aarch64_crypto=yes]) fi]) @@ -2050,6 +2062,7 @@ AC_CACHE_CHECK([whether GCC inline assembler supports PowerPC AltiVec/VSX/crypto "vshasigmad %v0, %v1, 0, 15;\n" "vpmsumd %v11, %v11, %v11;\n" ); + void testfn(void); ]], [ testfn(); ] )], [gcry_cv_gcc_inline_asm_ppc_altivec=yes]) fi]) @@ -2075,6 +2088,7 @@ AC_CACHE_CHECK([whether GCC inline assembler supports PowerISA 3.00 instructions "testfn:\n" "stxvb16x %r1,%v12,%v30;\n" ); + void testfn(void); ]], [ testfn(); ])], [gcry_cv_gcc_inline_asm_ppc_arch_3_00=yes]) fi]) From jussi.kivilinna at iki.fi Mon Dec 5 21:17:55 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Mon, 5 Dec 2022 22:17:55 +0200 Subject: [PATCH] chacha20-avx512: add handling for any input block count and tweak 16 block code a bit Message-ID: <20221205201755.355987-1-jussi.kivilinna@iki.fi> * cipher/chacha20-amd64-avx512.S: Add tail handling for 8/4/2/1 blocks; Rename `_gcry_chacha20_amd64_avx512_blocks16` to `_gcry_chacha20_amd64_avx512_blocks`; Tweak 16 parallel block processing for small speed improvement. * cipher/chacha20.c (_gcry_chacha20_amd64_avx512_blocks16): Rename to ... (_gcry_chacha20_amd64_avx512_blocks): ... this. (chacha20_blocks) [USE_AVX512]: Add AVX512 code-path. (do_chacha20_encrypt_stream_tail) [USE_AVX512]: Change to handle any number of full input blocks instead of multiples of 16. -- Patch improves performance of ChaCha20-AVX512 implementation on small input buffer sizes (less than 64*16B = 1024B). 
=== Benchmark on AMD Ryzen 9 7900X: Before: CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz STREAM enc | 0.130 ns/B 7330 MiB/s 0.716 c/B 5500 STREAM dec | 0.128 ns/B 7426 MiB/s 0.713 c/B 5555 POLY1305 enc | 0.175 ns/B 5444 MiB/s 0.964 c/B 5500 POLY1305 dec | 0.175 ns/B 5455 MiB/s 0.962 c/B 5500 After: CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz STREAM enc | 0.124 ns/B 7675 MiB/s 0.699 c/B 5625 STREAM dec | 0.126 ns/B 7544 MiB/s 0.695 c/B 5500 POLY1305 enc | 0.170 ns/B 5626 MiB/s 0.954 c/B 5625 POLY1305 dec | 0.169 ns/B 5639 MiB/s 0.945 c/B 5587 === Benchmark on Intel Core i3-1115G4: Before: CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz STREAM enc | 0.161 ns/B 5934 MiB/s 0.658 c/B 4097?3 STREAM dec | 0.160 ns/B 5951 MiB/s 0.656 c/B 4097?4 POLY1305 enc | 0.220 ns/B 4333 MiB/s 0.902 c/B 4096?3 POLY1305 dec | 0.220 ns/B 4325 MiB/s 0.903 c/B 4096?3 After: CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz STREAM enc | 0.154 ns/B 6186 MiB/s 0.631 c/B 4096?3 STREAM dec | 0.153 ns/B 6215 MiB/s 0.629 c/B 4096?3 POLY1305 enc | 0.216 ns/B 4407 MiB/s 0.886 c/B 4096?3 POLY1305 dec | 0.216 ns/B 4419 MiB/s 0.884 c/B 4096?3 Signed-off-by: Jussi Kivilinna --- cipher/chacha20-amd64-avx512.S | 463 ++++++++++++++++++++++++++++++--- cipher/chacha20.c | 24 +- 2 files changed, 447 insertions(+), 40 deletions(-) diff --git a/cipher/chacha20-amd64-avx512.S b/cipher/chacha20-amd64-avx512.S index 8b4d7499..b48b1bf7 100644 --- a/cipher/chacha20-amd64-avx512.S +++ b/cipher/chacha20-amd64-avx512.S @@ -61,14 +61,56 @@ #define X13 %zmm13 #define X14 %zmm14 #define X15 %zmm15 +#define X0y %ymm0 +#define X1y %ymm1 +#define X2y %ymm2 +#define X3y %ymm3 +#define X4y %ymm4 +#define X5y %ymm5 +#define X6y %ymm6 +#define X7y %ymm7 +#define X8y %ymm8 +#define X9y %ymm9 +#define X10y %ymm10 +#define X11y %ymm11 +#define X12y %ymm12 +#define X13y %ymm13 +#define X14y %ymm14 +#define X15y %ymm15 +#define X0x %xmm0 +#define X1x %xmm1 +#define X2x %xmm2 +#define X3x %xmm3 +#define X4x %xmm4 +#define X5x %xmm5 +#define X6x %xmm6 +#define X7x %xmm7 +#define X8x %xmm8 +#define X9x %xmm9 +#define X10x %xmm10 +#define X11x %xmm11 +#define X12x %xmm12 +#define X13x %xmm13 +#define X14x %xmm14 +#define X15x %xmm15 #define TMP0 %zmm16 #define TMP1 %zmm17 +#define TMP0y %ymm16 +#define TMP1y %ymm17 +#define TMP0x %xmm16 +#define TMP1x %xmm17 #define COUNTER_ADD %zmm18 +#define COUNTER_ADDy %ymm18 +#define COUNTER_ADDx %xmm18 #define X12_SAVE %zmm19 +#define X12_SAVEy %ymm19 +#define X12_SAVEx %xmm19 #define X13_SAVE %zmm20 +#define X13_SAVEy %ymm20 +#define X13_SAVEx %xmm20 #define S0 %zmm21 #define S1 %zmm22 @@ -81,6 +123,28 @@ #define S8 %zmm29 #define S14 %zmm30 #define S15 %zmm31 +#define S0y %ymm21 +#define S1y %ymm22 +#define S2y %ymm23 +#define S3y %ymm24 +#define S4y %ymm25 +#define S5y %ymm26 +#define S6y %ymm27 +#define S7y %ymm28 +#define S8y %ymm29 +#define S14y %ymm30 +#define S15y %ymm31 +#define S0x %xmm21 +#define S1x %xmm22 +#define S2x %xmm23 +#define S3x %xmm24 +#define S4x %xmm25 +#define S5x %xmm26 +#define S6x %xmm27 +#define S7x %xmm28 +#define S8x %xmm29 +#define S14x %xmm30 +#define S15x %xmm31 /********************************************************************** helper macros @@ -114,6 +178,12 @@ vshufi32x4 $0xdd, x2, t2, x3; \ vshufi32x4 $0x88, x2, t2, x2; +/* 2x2 128-bit matrix transpose */ +#define transpose_16byte_2x2(x0,x1,t1) \ + vmovdqa32 x0, t1; \ + vshufi32x4 $0x0, x1, x0, x0; \ + vshufi32x4 $0x3, x1, t1, x1; + #define xor_src_dst_4x4(dst, src, offset, 
add, x0, x4, x8, x12) \ vpxord (offset + 0 * (add))(src), x0, x0; \ vpxord (offset + 1 * (add))(src), x4, x4; \ @@ -141,7 +211,7 @@ clear_vec4(%xmm19, %xmm23, %xmm27, %xmm31); /********************************************************************** - 16-way chacha20 + 16-way (zmm), 8-way (ymm), 4-way (xmm) chacha20 **********************************************************************/ #define ROTATE2(v1,v2,c) \ @@ -154,7 +224,7 @@ #define PLUS(ds,s) \ vpaddd s, ds, ds; -#define QUARTERROUND2(a1,b1,c1,d1,a2,b2,c2,d2) \ +#define QUARTERROUND2V(a1,b1,c1,d1,a2,b2,c2,d2) \ PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2); \ ROTATE2(d1, d2, 16); \ PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \ @@ -164,33 +234,99 @@ PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \ ROTATE2(b1, b2, 7); +/********************************************************************** + 1-way/2-way (xmm) chacha20 + **********************************************************************/ + +#define ROTATE(v1,c) \ + vprold $(c), v1, v1; \ + +#define WORD_SHUF(v1,shuf) \ + vpshufd $shuf, v1, v1; + +#define QUARTERROUND1H(x0,x1,x2,x3,shuf_x1,shuf_x2,shuf_x3) \ + PLUS(x0, x1); XOR(x3, x0); ROTATE(x3, 16); \ + PLUS(x2, x3); XOR(x1, x2); ROTATE(x1, 12); \ + PLUS(x0, x1); XOR(x3, x0); ROTATE(x3, 8); \ + PLUS(x2, x3); \ + WORD_SHUF(x3, shuf_x3); \ + XOR(x1, x2); \ + WORD_SHUF(x2, shuf_x2); \ + ROTATE(x1, 7); \ + WORD_SHUF(x1, shuf_x1); + +#define QUARTERROUND2H(x0,x1,x2,x3,y0,y1,y2,y3,shuf_x1,shuf_x2,shuf_x3) \ + PLUS(x0, x1); PLUS(y0, y1); XOR(x3, x0); XOR(y3, y0); \ + ROTATE(x3, 16); ROTATE(y3, 16); \ + PLUS(x2, x3); PLUS(y2, y3); XOR(x1, x2); XOR(y1, y2); \ + ROTATE(x1, 12); ROTATE(y1, 12); \ + PLUS(x0, x1); PLUS(y0, y1); XOR(x3, x0); XOR(y3, y0); \ + ROTATE(x3, 8); ROTATE(y3, 8); \ + PLUS(x2, x3); PLUS(y2, y3); \ + WORD_SHUF(x3, shuf_x3); WORD_SHUF(y3, shuf_x3); \ + XOR(x1, x2); XOR(y1, y2); \ + WORD_SHUF(x2, shuf_x2); WORD_SHUF(y2, shuf_x2); \ + ROTATE(x1, 7); ROTATE(y1, 7); \ + WORD_SHUF(x1, shuf_x1); WORD_SHUF(y1, shuf_x1); + .align 64 ELF(.type _gcry_chacha20_amd64_avx512_data, at object;) _gcry_chacha20_amd64_avx512_data: -.Linc_counter: - .byte 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 +.Lcounter_0_1_2_3: +.Lcounter_0_1: + .long 0,0,0,0 .Lone: .long 1,0,0,0 +.Lcounter_2_3: +.Ltwo: + .long 2,0,0,0 +.Lthree: + .long 3,0,0,0 +.Linc_counter: + .byte 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 ELF(.size _gcry_chacha20_amd64_avx512_data,.-_gcry_chacha20_amd64_avx512_data) .align 16 -.globl _gcry_chacha20_amd64_avx512_blocks16 -ELF(.type _gcry_chacha20_amd64_avx512_blocks16, at function;) -_gcry_chacha20_amd64_avx512_blocks16: +.globl _gcry_chacha20_amd64_avx512_blocks +ELF(.type _gcry_chacha20_amd64_avx512_blocks, at function;) +_gcry_chacha20_amd64_avx512_blocks: /* input: * %rdi: input * %rsi: dst * %rdx: src - * %rcx: nblks (multiple of 16) + * %rcx: nblks */ CFI_STARTPROC(); vpxord %xmm16, %xmm16, %xmm16; - vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */ + vpopcntb %ymm16, %ymm16; /* spec stop for old AVX512 CPUs */ + + cmpq $4, NBLKS; + jb .Lskip_vertical_handling; + /* Load constants */ vpmovzxbd .Linc_counter rRIP, COUNTER_ADD; - /* Preload state */ + cmpq $16, NBLKS; + jae .Lload_zmm_state; + + /* Preload state to YMM registers */ + vpbroadcastd (0 * 4)(INPUT), S0y; + vpbroadcastd (1 * 4)(INPUT), S1y; + vpbroadcastd (2 * 4)(INPUT), S2y; + vpbroadcastd (3 * 4)(INPUT), S3y; + vpbroadcastd (4 * 4)(INPUT), S4y; + vpbroadcastd (5 * 4)(INPUT), S5y; + vpbroadcastd (6 * 4)(INPUT), S6y; + vpbroadcastd (7 * 4)(INPUT), S7y; 
+ vpbroadcastd (8 * 4)(INPUT), S8y; + vpbroadcastd (14 * 4)(INPUT), S14y; + vpbroadcastd (15 * 4)(INPUT), S15y; + jmp .Lskip16v; + +.align 16 +.Lload_zmm_state: + /* Preload state to ZMM registers */ vpbroadcastd (0 * 4)(INPUT), S0; vpbroadcastd (1 * 4)(INPUT), S1; vpbroadcastd (2 * 4)(INPUT), S2; @@ -204,13 +340,14 @@ _gcry_chacha20_amd64_avx512_blocks16: vpbroadcastd (15 * 4)(INPUT), S15; .align 16 -.Loop16: +.Loop16v: + /* Process 16 ChaCha20 blocks */ movl $20, ROUND; + subq $16, NBLKS; /* Construct counter vectors X12 and X13 */ - vpbroadcastd (12 * 4)(INPUT), X12; + vpaddd (12 * 4)(INPUT){1to16}, COUNTER_ADD, X12; vpbroadcastd (13 * 4)(INPUT), X13; - vpaddd COUNTER_ADD, X12, X12; vpcmpud $6, X12, COUNTER_ADD, %k2; vpaddd .Lone rRIP {1to16}, X13, X13{%k2}; vmovdqa32 X12, X12_SAVE; @@ -223,7 +360,7 @@ _gcry_chacha20_amd64_avx512_blocks16: vmovdqa32 S1, X1; vmovdqa32 S5, X5; vpbroadcastd (9 * 4)(INPUT), X9; - QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13) + QUARTERROUND2V(X0, X4, X8, X12, X1, X5, X9, X13) vmovdqa32 S2, X2; vmovdqa32 S6, X6; vpbroadcastd (10 * 4)(INPUT), X10; @@ -235,19 +372,18 @@ _gcry_chacha20_amd64_avx512_blocks16: /* Update counter */ addq $16, (12 * 4)(INPUT); - jmp .Lround2_entry; + jmp .Lround2_entry_16v; .align 16 -.Lround2: - QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14) - QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13) -.Lround2_entry: +.Lround2_16v: + QUARTERROUND2V(X2, X7, X8, X13, X3, X4, X9, X14) + QUARTERROUND2V(X0, X4, X8, X12, X1, X5, X9, X13) +.Lround2_entry_16v: + QUARTERROUND2V(X2, X6, X10, X14, X3, X7, X11, X15) + QUARTERROUND2V(X0, X5, X10, X15, X1, X6, X11, X12) subl $2, ROUND; - QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15) - QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12) - jnz .Lround2; + jnz .Lround2_16v; -.Lround2_end: PLUS(X0, S0); PLUS(X1, S1); PLUS(X5, S5); @@ -256,7 +392,7 @@ _gcry_chacha20_amd64_avx512_blocks16: PLUS(X11, (11 * 4)(INPUT){1to16}); PLUS(X15, S15); PLUS(X12, X12_SAVE); - QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14) + QUARTERROUND2V(X2, X7, X8, X13, X3, X4, X9, X14) PLUS(X2, S2); PLUS(X3, S3); @@ -280,21 +416,286 @@ _gcry_chacha20_amd64_avx512_blocks16: transpose_16byte_4x4(X3, X7, X11, X15, TMP0, TMP1); xor_src_dst_4x4(DST, SRC, (64 * 3), (64 * 4), X3, X7, X11, X15); - subq $16, NBLKS; leaq (16 * 64)(SRC), SRC; leaq (16 * 64)(DST), DST; - jnz .Loop16; + cmpq $16, NBLKS; + jae .Loop16v; + +.align 8 +.Lskip16v: + cmpq $8, NBLKS; + jb .Lskip8v; + + /* Process 8 ChaCha20 blocks */ + movl $20, ROUND; + subq $8, NBLKS; + + /* Construct counter vectors X12 and X13 */ + vpaddd (12 * 4)(INPUT){1to8}, COUNTER_ADDy, X12y; + vpbroadcastd (13 * 4)(INPUT), X13y; + vpcmpud $6, X12y, COUNTER_ADDy, %k2; + vpaddd .Lone rRIP {1to8}, X13y, X13y{%k2}; + vmovdqa32 X12y, X12_SAVEy; + vmovdqa32 X13y, X13_SAVEy; + + /* Load vectors */ + vmovdqa32 S0y, X0y; + vmovdqa32 S4y, X4y; + vmovdqa32 S8y, X8y; + vmovdqa32 S1y, X1y; + vmovdqa32 S5y, X5y; + vpbroadcastd (9 * 4)(INPUT), X9y; + QUARTERROUND2V(X0y, X4y, X8y, X12y, X1y, X5y, X9y, X13y) + vmovdqa32 S2y, X2y; + vmovdqa32 S6y, X6y; + vpbroadcastd (10 * 4)(INPUT), X10y; + vmovdqa32 S14y, X14y; + vmovdqa32 S3y, X3y; + vmovdqa32 S7y, X7y; + vpbroadcastd (11 * 4)(INPUT), X11y; + vmovdqa32 S15y, X15y; + + /* Update counter */ + addq $8, (12 * 4)(INPUT); + jmp .Lround2_entry_8v; + +.align 16 +.Lround2_8v: + QUARTERROUND2V(X2y, X7y, X8y, X13y, X3y, X4y, X9y, X14y) + QUARTERROUND2V(X0y, X4y, X8y, X12y, X1y, X5y, X9y, X13y) +.Lround2_entry_8v: + QUARTERROUND2V(X2y, X6y, X10y, X14y, X3y, X7y, X11y, 
X15y) + QUARTERROUND2V(X0y, X5y, X10y, X15y, X1y, X6y, X11y, X12y) + subl $2, ROUND; + jnz .Lround2_8v; + + PLUS(X0y, S0y); + PLUS(X1y, S1y); + PLUS(X5y, S5y); + PLUS(X6y, S6y); + PLUS(X10y, (10 * 4)(INPUT){1to8}); + PLUS(X11y, (11 * 4)(INPUT){1to8}); + PLUS(X15y, S15y); + PLUS(X12y, X12_SAVEy); + QUARTERROUND2V(X2y, X7y, X8y, X13y, X3y, X4y, X9y, X14y) + + PLUS(X2y, S2y); + PLUS(X3y, S3y); + PLUS(X4y, S4y); + PLUS(X7y, S7y); + transpose_4x4(X0y, X1y, X2y, X3y, TMP0y, TMP1y); + transpose_4x4(X4y, X5y, X6y, X7y, TMP0y, TMP1y); + PLUS(X8y, S8y); + PLUS(X9y, (9 * 4)(INPUT){1to8}); + PLUS(X13y, X13_SAVEy); + PLUS(X14y, S14y); + transpose_16byte_2x2(X0y, X4y, TMP0y); + transpose_16byte_2x2(X1y, X5y, TMP0y); + transpose_16byte_2x2(X2y, X6y, TMP0y); + transpose_16byte_2x2(X3y, X7y, TMP0y); + transpose_4x4(X8y, X9y, X10y, X11y, TMP0y, TMP1y); + transpose_4x4(X12y, X13y, X14y, X15y, TMP0y, TMP1y); + xor_src_dst_4x4(DST, SRC, (16 * 0), (64 * 1), X0y, X1y, X2y, X3y); + xor_src_dst_4x4(DST, SRC, (16 * 16), (64 * 1), X4y, X5y, X6y, X7y); + transpose_16byte_2x2(X8y, X12y, TMP0y); + transpose_16byte_2x2(X9y, X13y, TMP0y); + transpose_16byte_2x2(X10y, X14y, TMP0y); + transpose_16byte_2x2(X11y, X15y, TMP0y); + xor_src_dst_4x4(DST, SRC, (16 * 2), (64 * 1), X8y, X9y, X10y, X11y); + xor_src_dst_4x4(DST, SRC, (16 * 18), (64 * 1), X12y, X13y, X14y, X15y); + + leaq (8 * 64)(SRC), SRC; + leaq (8 * 64)(DST), DST; + +.align 8 +.Lskip8v: + cmpq $4, NBLKS; + jb .Lskip4v; + + /* Process 4 ChaCha20 blocks */ + movl $20, ROUND; + subq $4, NBLKS; + + /* Construct counter vectors X12 and X13 */ + vpaddd (12 * 4)(INPUT){1to4}, COUNTER_ADDx, X12x; + vpbroadcastd (13 * 4)(INPUT), X13x; + vpcmpud $6, X12x, COUNTER_ADDx, %k2; + vpaddd .Lone rRIP {1to4}, X13x, X13x{%k2}; + vmovdqa32 X12x, X12_SAVEx; + vmovdqa32 X13x, X13_SAVEx; + + /* Load vectors */ + vmovdqa32 S0x, X0x; + vmovdqa32 S4x, X4x; + vmovdqa32 S8x, X8x; + vmovdqa32 S1x, X1x; + vmovdqa32 S5x, X5x; + vpbroadcastd (9 * 4)(INPUT), X9x; + QUARTERROUND2V(X0x, X4x, X8x, X12x, X1x, X5x, X9x, X13x) + vmovdqa32 S2x, X2x; + vmovdqa32 S6x, X6x; + vpbroadcastd (10 * 4)(INPUT), X10x; + vmovdqa32 S14x, X14x; + vmovdqa32 S3x, X3x; + vmovdqa32 S7x, X7x; + vpbroadcastd (11 * 4)(INPUT), X11x; + vmovdqa32 S15x, X15x; - /* clear the used vector registers */ + /* Update counter */ + addq $4, (12 * 4)(INPUT); + jmp .Lround2_entry_4v; + +.align 16 +.Lround2_4v: + QUARTERROUND2V(X2x, X7x, X8x, X13x, X3x, X4x, X9x, X14x) + QUARTERROUND2V(X0x, X4x, X8x, X12x, X1x, X5x, X9x, X13x) +.Lround2_entry_4v: + QUARTERROUND2V(X2x, X6x, X10x, X14x, X3x, X7x, X11x, X15x) + QUARTERROUND2V(X0x, X5x, X10x, X15x, X1x, X6x, X11x, X12x) + subl $2, ROUND; + jnz .Lround2_4v; + + PLUS(X0x, S0x); + PLUS(X1x, S1x); + PLUS(X5x, S5x); + PLUS(X6x, S6x); + PLUS(X10x, (10 * 4)(INPUT){1to4}); + PLUS(X11x, (11 * 4)(INPUT){1to4}); + PLUS(X15x, S15x); + PLUS(X12x, X12_SAVEx); + QUARTERROUND2V(X2x, X7x, X8x, X13x, X3x, X4x, X9x, X14x) + + PLUS(X2x, S2x); + PLUS(X3x, S3x); + PLUS(X4x, S4x); + PLUS(X7x, S7x); + transpose_4x4(X0x, X1x, X2x, X3x, TMP0x, TMP1x); + transpose_4x4(X4x, X5x, X6x, X7x, TMP0x, TMP1x); + xor_src_dst_4x4(DST, SRC, (16 * 0), (64 * 1), X0x, X1x, X2x, X3x); + PLUS(X8x, S8x); + PLUS(X9x, (9 * 4)(INPUT){1to4}); + xor_src_dst_4x4(DST, SRC, (16 * 1), (64 * 1), X4x, X5x, X6x, X7x); + PLUS(X13x, X13_SAVEx); + PLUS(X14x, S14x); + transpose_4x4(X8x, X9x, X10x, X11x, TMP0x, TMP1x); + transpose_4x4(X12x, X13x, X14x, X15x, TMP0x, TMP1x); + xor_src_dst_4x4(DST, SRC, (16 * 2), (64 * 1), X8x, X9x, X10x, X11x); + 
xor_src_dst_4x4(DST, SRC, (16 * 3), (64 * 1), X12x, X13x, X14x, X15x); + + leaq (4 * 64)(SRC), SRC; + leaq (4 * 64)(DST), DST; + +.align 8 +.Lskip4v: + /* clear AVX512 registers */ + kxorq %k2, %k2, %k2; + vzeroupper; clear_zmm16_zmm31(); - kxord %k2, %k2, %k2; + +.align 8 +.Lskip_vertical_handling: + cmpq $0, NBLKS; + je .Ldone; + + /* Load state */ + vmovdqu (0 * 4)(INPUT), X10x; + vmovdqu (4 * 4)(INPUT), X11x; + vmovdqu (8 * 4)(INPUT), X12x; + vmovdqu (12 * 4)(INPUT), X13x; + + /* Load constant */ + vmovdqa .Lone rRIP, X4x; + + cmpq $1, NBLKS; + je .Lhandle1; + + /* Process two ChaCha20 blocks (XMM) */ + movl $20, ROUND; + subq $2, NBLKS; + + vmovdqa X10x, X0x; + vmovdqa X11x, X1x; + vmovdqa X12x, X2x; + vmovdqa X13x, X3x; + + vmovdqa X10x, X8x; + vmovdqa X11x, X9x; + vmovdqa X12x, X14x; + vpaddq X4x, X13x, X15x; + vmovdqa X15x, X7x; + +.align 16 +.Lround2_2: + QUARTERROUND2H(X0x, X1x, X2x, X3x, X8x, X9x, X14x, X15x, + 0x39, 0x4e, 0x93); + QUARTERROUND2H(X0x, X1x, X2x, X3x, X8x, X9x, X14x, X15x, + 0x93, 0x4e, 0x39); + subl $2, ROUND; + jnz .Lround2_2; + + PLUS(X0x, X10x); + PLUS(X1x, X11x); + PLUS(X2x, X12x); + PLUS(X3x, X13x); + + vpaddq .Ltwo rRIP, X13x, X13x; /* Update counter */ + + PLUS(X8x, X10x); + PLUS(X9x, X11x); + PLUS(X14x, X12x); + PLUS(X15x, X7x); + + xor_src_dst_4x4(DST, SRC, 0 * 4, 4 * 4, X0x, X1x, X2x, X3x); + xor_src_dst_4x4(DST, SRC, 16 * 4, 4 * 4, X8x, X9x, X14x, X15x); + lea (2 * 64)(DST), DST; + lea (2 * 64)(SRC), SRC; + + cmpq $0, NBLKS; + je .Lskip1; + +.align 8 +.Lhandle1: + /* Process one ChaCha20 block (XMM) */ + movl $20, ROUND; + subq $1, NBLKS; + + vmovdqa X10x, X0x; + vmovdqa X11x, X1x; + vmovdqa X12x, X2x; + vmovdqa X13x, X3x; + +.align 16 +.Lround2_1: + QUARTERROUND1H(X0x, X1x, X2x, X3x, 0x39, 0x4e, 0x93); + QUARTERROUND1H(X0x, X1x, X2x, X3x, 0x93, 0x4e, 0x39); + subl $2, ROUND; + jnz .Lround2_1; + + PLUS(X0x, X10x); + PLUS(X1x, X11x); + PLUS(X2x, X12x); + PLUS(X3x, X13x); + + vpaddq X4x, X13x, X13x; /* Update counter */ + + xor_src_dst_4x4(DST, SRC, 0 * 4, 4 * 4, X0x, X1x, X2x, X3x); + /*lea (1 * 64)(DST), DST;*/ + /*lea (1 * 64)(SRC), SRC;*/ + +.align 8 +.Lskip1: + /* Store counter */ + vmovdqu X13x, (12 * 4)(INPUT); + +.align 8 +.Ldone: vzeroall; /* clears ZMM0-ZMM15 */ - /* eax zeroed by round loop. 
*/ + xorl %eax, %eax; ret_spec_stop; CFI_ENDPROC(); -ELF(.size _gcry_chacha20_amd64_avx512_blocks16, - .-_gcry_chacha20_amd64_avx512_blocks16;) +ELF(.size _gcry_chacha20_amd64_avx512_blocks, + .-_gcry_chacha20_amd64_avx512_blocks;) #endif /*defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)*/ #endif /*__x86_64*/ diff --git a/cipher/chacha20.c b/cipher/chacha20.c index f0cb8721..a7e0dd63 100644 --- a/cipher/chacha20.c +++ b/cipher/chacha20.c @@ -173,9 +173,9 @@ unsigned int _gcry_chacha20_poly1305_amd64_avx2_blocks8( #ifdef USE_AVX512 -unsigned int _gcry_chacha20_amd64_avx512_blocks16(u32 *state, byte *dst, - const byte *src, - size_t nblks) ASM_FUNC_ABI; +unsigned int _gcry_chacha20_amd64_avx512_blocks(u32 *state, byte *dst, + const byte *src, + size_t nblks) ASM_FUNC_ABI; #endif /* USE_AVX2 */ @@ -352,6 +352,13 @@ static unsigned int chacha20_blocks (CHACHA20_context_t *ctx, byte *dst, const byte *src, size_t nblks) { +#ifdef USE_AVX512 + if (ctx->use_avx512) + { + return _gcry_chacha20_amd64_avx512_blocks(ctx->input, dst, src, nblks); + } +#endif + #ifdef USE_SSSE3 if (ctx->use_ssse3) { @@ -546,14 +553,13 @@ do_chacha20_encrypt_stream_tail (CHACHA20_context_t *ctx, byte *outbuf, unsigned int nburn, burn = 0; #ifdef USE_AVX512 - if (ctx->use_avx512 && length >= CHACHA20_BLOCK_SIZE * 16) + if (ctx->use_avx512 && length >= CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; - nblocks -= nblocks % 16; - nburn = _gcry_chacha20_amd64_avx512_blocks16(ctx->input, outbuf, inbuf, - nblocks); + nburn = _gcry_chacha20_amd64_avx512_blocks(ctx->input, outbuf, inbuf, + nblocks); burn = nburn > burn ? nburn : burn; - length -= nblocks * CHACHA20_BLOCK_SIZE; + length %= CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; } @@ -662,7 +668,7 @@ do_chacha20_encrypt_stream_tail (CHACHA20_context_t *ctx, byte *outbuf, size_t nblocks = length / CHACHA20_BLOCK_SIZE; nburn = chacha20_blocks(ctx, outbuf, inbuf, nblocks); burn = nburn > burn ? nburn : burn; - length -= nblocks * CHACHA20_BLOCK_SIZE; + length %= CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; } -- 2.37.2 From jussi.kivilinna at iki.fi Sun Dec 11 14:26:41 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sun, 11 Dec 2022 15:26:41 +0200 Subject: [PATCH] avx512: tweak AVX512 spec stop, use common macro in assembly Message-ID: <20221211132641.2371386-1-jussi.kivilinna@iki.fi> * cipher/cipher-gcm-intel-pclmul.c: Use xmm registers for AVX512 spec stop. * cipher/asm-common-amd64.h (spec_stop_avx512): New. * cipher/blake2b-amd64-avx512.S: Use spec_stop_avx512. * cipher/blake2s-amd64-avx512.S: Likewise. * cipher/camellia-gfni-avx512-amd64.S: Likewise. * cipher/chacha20-avx512-amd64.S: Likewise. * cipher/keccak-amd64-avx512.S: Likewise. * cipher/poly1305-amd64-avx512.S: Likewise. * cipher/sha512-avx512-amd64.S: Likewise. * cipher/sm4-gfni-avx512-amd64.S: Likewise. 
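As background for the macro introduced here (its definition is in the asm-common-amd64.h hunk below): the stop is just vpxord plus vpopcntb on an xmm register. vpopcntb exists only on newer AVX512 CPUs, so an older AVX512 CPU that merely speculates into one of these functions stops at the unimplemented instruction instead of running ahead into 512-bit code, while the CPUs that actually reach these paths do implement it. A C-level sketch of the same two instructions, assuming GCC/Clang extended asm on x86-64; the function name is illustrative, the real users are the .S files and the GCM inline assembly:

  /* Minimal sketch. The assembly callers place the sequence at function
   * entry where xmm16 holds no live value; a hypothetical C caller would
   * have to account for the clobbered register itself. */
  static inline void spec_stop_avx512_sketch (void)
  {
    __asm__ volatile ("vpxord %%xmm16, %%xmm16, %%xmm16\n\t"
                      "vpopcntb %%xmm16, %%xmm16\n\t" /* newer AVX512 CPUs only */
                      : : : "memory");
  }

Using xmm rather than zmm operands for the stop itself is the other half of the tweak, applied both in the new macro and in cipher-gcm-intel-pclmul.c.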
--- Signed-off-by: Jussi Kivilinna --- cipher/asm-common-amd64.h | 10 +++++++++- cipher/blake2b-amd64-avx512.S | 2 ++ cipher/blake2s-amd64-avx512.S | 2 ++ cipher/camellia-gfni-avx512-amd64.S | 14 +++++++------- cipher/chacha20-amd64-avx512.S | 3 +-- cipher/cipher-gcm-intel-pclmul.c | 4 ++-- cipher/keccak-amd64-avx512.S | 4 ++++ cipher/poly1305-amd64-avx512.S | 3 +-- cipher/sha512-avx512-amd64.S | 2 ++ cipher/sm4-gfni-avx512-amd64.S | 20 ++++++++++++++------ 10 files changed, 44 insertions(+), 20 deletions(-) diff --git a/cipher/asm-common-amd64.h b/cipher/asm-common-amd64.h index 97912b1b..dc2c4d2f 100644 --- a/cipher/asm-common-amd64.h +++ b/cipher/asm-common-amd64.h @@ -186,8 +186,16 @@ # define EXIT_SYSV_FUNC #endif -/* 'ret' instruction replacement for straight-line speculation mitigation */ +/* 'ret' instruction replacement for straight-line speculation mitigation. */ #define ret_spec_stop \ ret; int3; +/* This prevents speculative execution on old AVX512 CPUs, to prevent + * speculative execution to AVX512 code. The vpopcntb instruction is + * available on newer CPUs that do not suffer from significant frequency + * drop when 512-bit vectors are utilized. */ +#define spec_stop_avx512 \ + vpxord %xmm16, %xmm16, %xmm16; \ + vpopcntb %xmm16, %xmm16; /* Supported only by newer AVX512 CPUs. */ + #endif /* GCRY_ASM_COMMON_AMD64_H */ diff --git a/cipher/blake2b-amd64-avx512.S b/cipher/blake2b-amd64-avx512.S index db53474d..18b0c3ad 100644 --- a/cipher/blake2b-amd64-avx512.S +++ b/cipher/blake2b-amd64-avx512.S @@ -221,6 +221,8 @@ _gcry_blake2b_transform_amd64_avx512: */ CFI_STARTPROC(); + spec_stop_avx512; + movl $0xf, %eax; kmovw %eax, %k0; xorl %eax, %eax; diff --git a/cipher/blake2s-amd64-avx512.S b/cipher/blake2s-amd64-avx512.S index 4457ca99..ddcdfd67 100644 --- a/cipher/blake2s-amd64-avx512.S +++ b/cipher/blake2s-amd64-avx512.S @@ -183,6 +183,8 @@ _gcry_blake2s_transform_amd64_avx512: */ CFI_STARTPROC(); + spec_stop_avx512; + addq $64, (STATE_T + 0)(RSTATE); vmovdqa .Liv+(0 * 4) rRIP, ROW3; diff --git a/cipher/camellia-gfni-avx512-amd64.S b/cipher/camellia-gfni-avx512-amd64.S index 15b2dc90..bddad804 100644 --- a/cipher/camellia-gfni-avx512-amd64.S +++ b/cipher/camellia-gfni-avx512-amd64.S @@ -832,7 +832,7 @@ _gcry_camellia_gfni_avx512_ctr_enc: * %rcx: iv (big endian, 128bit) */ CFI_STARTPROC(); - vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */ + spec_stop_avx512; vbroadcasti64x2 .Lbswap128_mask rRIP, %zmm19; vmovdqa64 .Lcounter0123_lo rRIP, %zmm21; @@ -985,7 +985,7 @@ _gcry_camellia_gfni_avx512_cbc_dec: * %rcx: iv */ CFI_STARTPROC(); - vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */ + spec_stop_avx512; movq %rcx, %r9; @@ -1047,7 +1047,7 @@ _gcry_camellia_gfni_avx512_cfb_dec: * %rcx: iv */ CFI_STARTPROC(); - vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */ + spec_stop_avx512; cmpl $128, key_bitlength(CTX); movl $32, %r8d; @@ -1122,7 +1122,7 @@ _gcry_camellia_gfni_avx512_ocb_enc: * %r9 : L pointers (void *L[64]) */ CFI_STARTPROC(); - vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */ + spec_stop_avx512; pushq %r12; CFI_PUSH(%r12); @@ -1285,7 +1285,7 @@ _gcry_camellia_gfni_avx512_ocb_dec: * %r9 : L pointers (void *L[64]) */ CFI_STARTPROC(); - vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */ + spec_stop_avx512; pushq %r12; CFI_PUSH(%r12); @@ -1451,7 +1451,7 @@ _gcry_camellia_gfni_avx512_enc_blk64: * %rdx: src (64 blocks) */ CFI_STARTPROC(); - vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */ + spec_stop_avx512; cmpl 
$128, key_bitlength(CTX); movl $32, %r8d; @@ -1515,7 +1515,7 @@ _gcry_camellia_gfni_avx512_dec_blk64: * %rdx: src (64 blocks) */ CFI_STARTPROC(); - vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */ + spec_stop_avx512; cmpl $128, key_bitlength(CTX); movl $32, %r8d; diff --git a/cipher/chacha20-amd64-avx512.S b/cipher/chacha20-amd64-avx512.S index 682798fe..544e7cdc 100644 --- a/cipher/chacha20-amd64-avx512.S +++ b/cipher/chacha20-amd64-avx512.S @@ -298,8 +298,7 @@ _gcry_chacha20_amd64_avx512_blocks: */ CFI_STARTPROC(); - vpxord %xmm16, %xmm16, %xmm16; - vpopcntb %xmm16, %xmm16; /* spec stop for old AVX512 CPUs */ + spec_stop_avx512; cmpq $4, NBLKS; jb .Lskip_vertical_handling; diff --git a/cipher/cipher-gcm-intel-pclmul.c b/cipher/cipher-gcm-intel-pclmul.c index 78a9e338..ec00df09 100644 --- a/cipher/cipher-gcm-intel-pclmul.c +++ b/cipher/cipher-gcm-intel-pclmul.c @@ -1513,7 +1513,7 @@ _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf, if (nblocks >= 32 && (c->u_mode.gcm.hw_impl_flags & GCM_INTEL_USE_VPCLMUL_AVX512)) { - asm volatile ("vpopcntb %%zmm7, %%zmm15\n\t" /* spec stop for old AVX512 CPUs */ + asm volatile ("vpopcntb %%xmm7, %%xmm16\n\t" /* spec stop for old AVX512 CPUs */ "vshufi64x2 $0, %%zmm7, %%zmm7, %%zmm15\n\t" "vmovdqa %%xmm1, %%xmm8\n\t" "vmovdqu64 %[swapperm], %%zmm14\n\t" @@ -1792,7 +1792,7 @@ _gcry_polyval_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf, if (nblocks >= 32 && (c->u_mode.gcm.hw_impl_flags & GCM_INTEL_USE_VPCLMUL_AVX512)) { - asm volatile ("vpopcntb %%zmm7, %%zmm15\n\t" /* spec stop for old AVX512 CPUs */ + asm volatile ("vpopcntb %%xmm1, %%xmm16\n\t" /* spec stop for old AVX512 CPUs */ "vmovdqa %%xmm1, %%xmm8\n\t" "vmovdqu64 %[swapperm], %%zmm14\n\t" : diff --git a/cipher/keccak-amd64-avx512.S b/cipher/keccak-amd64-avx512.S index f44e0285..58b4150f 100644 --- a/cipher/keccak-amd64-avx512.S +++ b/cipher/keccak-amd64-avx512.S @@ -282,6 +282,8 @@ _gcry_keccak_f1600_state_permute64_avx512: */ CFI_STARTPROC() + spec_stop_avx512; + leaq 12*8(%rdi), %rax leaq (24-1)*8(%rsi), %r11 @@ -362,6 +364,8 @@ _gcry_keccak_absorb_blocks_avx512: */ CFI_STARTPROC() + spec_stop_avx512; + leaq 12*8(%rdi), %rax leaq (24-1)*8(%rsi), %r11 diff --git a/cipher/poly1305-amd64-avx512.S b/cipher/poly1305-amd64-avx512.S index 72303e1e..5c8f838f 100644 --- a/cipher/poly1305-amd64-avx512.S +++ b/cipher/poly1305-amd64-avx512.S @@ -1580,8 +1580,7 @@ ELF(.size _gcry_poly1305_avx512_consts,.-_gcry_poly1305_avx512_consts) ELF(.type _gcry_poly1305_amd64_avx512_blocks, at function;) _gcry_poly1305_amd64_avx512_blocks: CFI_STARTPROC() - vpxord xmm16, xmm16, xmm16; - vpopcntb zmm16, zmm16; /* spec stop for old AVX512 CPUs */ + spec_stop_avx512; FUNC_ENTRY() #define _a0 gp3 diff --git a/cipher/sha512-avx512-amd64.S b/cipher/sha512-avx512-amd64.S index 0e3f44ab..145c8667 100644 --- a/cipher/sha512-avx512-amd64.S +++ b/cipher/sha512-avx512-amd64.S @@ -264,6 +264,8 @@ _gcry_sha512_transform_amd64_avx512: cmp rdx, 0 je .Lnowork + spec_stop_avx512; + /* Setup mask register for DC:BA merging. 
*/ mov eax, 0b1100 kmovd MASK_DC_00, eax diff --git a/cipher/sm4-gfni-avx512-amd64.S b/cipher/sm4-gfni-avx512-amd64.S index 1d5e9a48..0f9899d4 100644 --- a/cipher/sm4-gfni-avx512-amd64.S +++ b/cipher/sm4-gfni-avx512-amd64.S @@ -158,6 +158,7 @@ _gcry_sm4_gfni_avx512_expand_key: * %r8: ck array */ CFI_STARTPROC(); + spec_stop_avx512; vmovd 0*4(%rdi), RA0x; vmovd 1*4(%rdi), RA1x; @@ -553,6 +554,7 @@ _gcry_sm4_gfni_avx512_crypt_blk1_16: * %rcx: num blocks (1..16) */ CFI_STARTPROC(); + spec_stop_avx512; #define LOAD_INPUT(offset, yreg) \ cmpq $(1 + 2 * (offset)), %rcx; \ @@ -621,6 +623,7 @@ _gcry_sm4_gfni_avx512_ctr_enc: * %rcx: iv (big endian, 128bit) */ CFI_STARTPROC(); + spec_stop_avx512; vbroadcasti128 .Lbswap128_mask rRIP, RTMP0; vmovdqa .Lcounter0123_lo rRIP, RTMP1; @@ -728,6 +731,7 @@ _gcry_sm4_gfni_avx512_cbc_dec: * %rcx: iv */ CFI_STARTPROC(); + spec_stop_avx512; vmovdqu (0 * 32)(%rdx), RA0; vmovdqu (1 * 32)(%rdx), RA1; @@ -779,6 +783,7 @@ _gcry_sm4_gfni_avx512_cfb_dec: * %rcx: iv */ CFI_STARTPROC(); + spec_stop_avx512; /* Load input */ vmovdqu (%rcx), RNOTx; @@ -835,6 +840,7 @@ _gcry_sm4_gfni_avx512_ocb_enc: * %r9 : L pointers (void *L[16]) */ CFI_STARTPROC(); + spec_stop_avx512; subq $(4 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(4 * 8); @@ -950,6 +956,7 @@ _gcry_sm4_gfni_avx512_ocb_dec: * %r9 : L pointers (void *L[16]) */ CFI_STARTPROC(); + spec_stop_avx512; subq $(4 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(4 * 8); @@ -1066,6 +1073,7 @@ _gcry_sm4_gfni_avx512_ocb_auth: * %r8 : L pointers (void *L[16]) */ CFI_STARTPROC(); + spec_stop_avx512; subq $(4 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(4 * 8); @@ -1251,7 +1259,7 @@ _gcry_sm4_gfni_avx512_crypt_blk32: * %rdx: src (32 blocks) */ CFI_STARTPROC(); - vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */ + spec_stop_avx512; /* Load input */ vmovdqu32 (0 * 64)(%rdx), RA0z; @@ -1292,7 +1300,7 @@ _gcry_sm4_gfni_avx512_ctr_enc_blk32: * %rcx: iv (big endian, 128bit) */ CFI_STARTPROC(); - vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */ + spec_stop_avx512; vbroadcasti64x2 .Lbswap128_mask rRIP, RTMP0z; vmovdqa32 .Lcounter0123_lo rRIP, RTMP1z; @@ -1400,7 +1408,7 @@ _gcry_sm4_gfni_avx512_cbc_dec_blk32: * %rcx: iv */ CFI_STARTPROC(); - vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */ + spec_stop_avx512; vmovdqu32 (0 * 64)(%rdx), RA0z; vmovdqu32 (1 * 64)(%rdx), RA1z; @@ -1453,7 +1461,7 @@ _gcry_sm4_gfni_avx512_cfb_dec_blk32: * %rcx: iv */ CFI_STARTPROC(); - vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */ + spec_stop_avx512; /* Load input */ vmovdqu (%rcx), RA0x; @@ -1510,7 +1518,7 @@ _gcry_sm4_gfni_avx512_ocb_enc_blk32: * %r9 : L pointers (void *L[32]) */ CFI_STARTPROC(); - vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */ + spec_stop_avx512; subq $(5 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(5 * 8); @@ -1634,7 +1642,7 @@ _gcry_sm4_gfni_avx512_ocb_dec_blk32: * %r9 : L pointers (void *L[32]) */ CFI_STARTPROC(); - vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */ + spec_stop_avx512; subq $(5 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(5 * 8); -- 2.37.2 From gniibe at fsij.org Tue Dec 13 09:09:23 2022 From: gniibe at fsij.org (NIIBE Yutaka) Date: Tue, 13 Dec 2022 17:09:23 +0900 Subject: Port configure script to C99 In-Reply-To: <87tu2chh6b.fsf@oldenburg.str.redhat.com> References: <87tu2chh6b.fsf@oldenburg.str.redhat.com> Message-ID: <871qp3g1xo.fsf@akagi.fsij.org> Florian Weimer wrote: > We noticed that libgcrypt fails to build because it uses implicit > function declarations in the configure script. 
The patch below should > fix that. Thank you. Applied and pushed the change (so that it has consistent tab&space) to master. Will apply to 1.10 branch, too. For libgcrypt, I found m4/ax_cc_for_build.m4 and m4/noexecstack.m4 have similar problems. I'll fix. For the source distribution (*.tar.bz2) which has generated configure script, I think that we need to fix more. IIUC, Autoconf and Libtool need to be updated. -- From jussi.kivilinna at iki.fi Wed Dec 14 18:49:09 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Wed, 14 Dec 2022 19:49:09 +0200 Subject: [PATCH 2/2] rijndael-ppc: fix wrong inline assembly constraint In-Reply-To: <20221214174909.569097-1-jussi.kivilinna@iki.fi> References: <20221214174909.569097-1-jussi.kivilinna@iki.fi> Message-ID: <20221214174909.569097-2-jussi.kivilinna@iki.fi> * cipher/rijndael-ppc-function.h (CBC_ENC_FUNC): Fix outiv constraint. -- Noticed when trying to compile with powerpc64le clang. GCC accepted the buggy constraint without complaints. Signed-off-by: Jussi Kivilinna --- cipher/rijndael-ppc-functions.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cipher/rijndael-ppc-functions.h b/cipher/rijndael-ppc-functions.h index 23fa4206..063c5358 100644 --- a/cipher/rijndael-ppc-functions.h +++ b/cipher/rijndael-ppc-functions.h @@ -373,7 +373,7 @@ void CBC_ENC_FUNC (void *context, unsigned char *iv_arg, * last one. */ __asm__ volatile ("vcipherlast %0, %0, %2\n\t" "vcipherlast %1, %1, %3\n\t" - : "+v" (iv), "+outiv" (outiv) + : "+v" (iv), "+v" (outiv) : "v" (nextiv), "v" (rkeylast)); VEC_STORE_BE ((u128_t *)out, 0, outiv, bige_const); -- 2.37.2 From jussi.kivilinna at iki.fi Wed Dec 14 18:49:08 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Wed, 14 Dec 2022 19:49:08 +0200 Subject: [PATCH 1/2] Fix building AVX512 Intel-syntax assembly with x86-64 clang Message-ID: <20221214174909.569097-1-jussi.kivilinna@iki.fi> * cipher/asm-common-amd64.h (spec_stop_avx512_intel_syntax): New. * cipher/poly1305-amd64-avx512.S: Use spec_stop_avx512_intel_syntax instead of spec_stop_avx512. * cipher/sha512-avx512-amd64.S: Likewise. -- Reported-by: Clemens Lang Signed-off-by: Jussi Kivilinna --- cipher/asm-common-amd64.h | 4 ++++ cipher/poly1305-amd64-avx512.S | 2 +- cipher/sha512-avx512-amd64.S | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/cipher/asm-common-amd64.h b/cipher/asm-common-amd64.h index dc2c4d2f..cd93abc3 100644 --- a/cipher/asm-common-amd64.h +++ b/cipher/asm-common-amd64.h @@ -198,4 +198,8 @@ vpxord %xmm16, %xmm16, %xmm16; \ vpopcntb %xmm16, %xmm16; /* Supported only by newer AVX512 CPUs. */ +#define spec_stop_avx512_intel_syntax \ + vpxord xmm16, xmm16, xmm16; \ + vpopcntb xmm16, xmm16; /* Supported only by newer AVX512 CPUs. 
*/ + #endif /* GCRY_ASM_COMMON_AMD64_H */ diff --git a/cipher/poly1305-amd64-avx512.S b/cipher/poly1305-amd64-avx512.S index 5c8f838f..6622861f 100644 --- a/cipher/poly1305-amd64-avx512.S +++ b/cipher/poly1305-amd64-avx512.S @@ -1580,7 +1580,7 @@ ELF(.size _gcry_poly1305_avx512_consts,.-_gcry_poly1305_avx512_consts) ELF(.type _gcry_poly1305_amd64_avx512_blocks, at function;) _gcry_poly1305_amd64_avx512_blocks: CFI_STARTPROC() - spec_stop_avx512; + spec_stop_avx512_intel_syntax; FUNC_ENTRY() #define _a0 gp3 diff --git a/cipher/sha512-avx512-amd64.S b/cipher/sha512-avx512-amd64.S index 145c8667..65475422 100644 --- a/cipher/sha512-avx512-amd64.S +++ b/cipher/sha512-avx512-amd64.S @@ -264,7 +264,7 @@ _gcry_sha512_transform_amd64_avx512: cmp rdx, 0 je .Lnowork - spec_stop_avx512; + spec_stop_avx512_intel_syntax; /* Setup mask register for DC:BA merging. */ mov eax, 0b1100 -- 2.37.2 From jussi.kivilinna at iki.fi Wed Dec 14 18:53:36 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Wed, 14 Dec 2022 19:53:36 +0200 Subject: [PATCH] Add clang support for ARM 32-bit assembly Message-ID: <20221214175336.604891-1-jussi.kivilinna@iki.fi> * configure.ac (gcry_cv_gcc_arm_platform_as_ok) (gcry_cv_gcc_inline_asm_neon): Remove % prefix from register names. * cipher/cipher-gcm-armv7-neon.S (vmull_p64): Prefix constant values with # character instead of $. * cipher/blowfish-arm.S: Remove % prefix from all register names. * cipher/camellia-arm.S: Likewise. * cipher/cast5-arm.S: Likewise. * cipher/rijndael-arm.S: Likewise. * cipher/rijndael-armv8-aarch32-ce.S: Likewise. * cipher/sha512-arm.S: Likewise. * cipher/sha512-armv7-neon.S: Likewise. * cipher/twofish-arm.S: Likewise. * mpi/arm/mpih-add1.S: Likewise. * mpi/arm/mpih-mul1.S: Likewise. * mpi/arm/mpih-mul2.S: Likewise. * mpi/arm/mpih-mul3.S: Likewise. * mpi/arm/mpih-sub1.S: Likewise. 
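To make the incompatibility concrete: GNU as tolerates an optional '%' prefix on ARM register names and a '$' prefix on some immediates, while clang's integrated assembler rejects both, which is why the register prefixes are dropped throughout and the vmull_p64 constants switch from '$' to '#'. A minimal stand-alone probe in the spirit of the configure checks, shown here purely as an illustration (it is not the configure.ac hunk and assumes a 32-bit ARM target):

  /* Hypothetical probe; the spelling in the comment is what clang's
   * integrated assembler rejects. */
  __asm__ (
      ".syntax unified\n\t"
      ".arm\n\t"
      ".text\n\t"
      ".globl asmfunc\n\t"
      ".type asmfunc, %function;\n\t"
      "asmfunc:\n\t"
      "add r1, r1, r2, ror #12;\n\t"  /* was: add %r1, %r1, %r2, ror #12 */
      "bx lr;\n\t"
      );
  void asmfunc(void);

  int main(void)
  {
    asmfunc();
    return 0;
  }

The gcry_cv_gcc_arm_platform_as_ok and gcry_cv_gcc_inline_asm_neon probes contained the same '%'-prefixed names, so they are updated together with the .S files.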
-- Reported-by: Dmytro Kovalov Signed-off-by: Jussi Kivilinna --- cipher/blowfish-arm.S | 216 ++++++++++++++--------------- cipher/camellia-arm.S | 68 ++++----- cipher/cast5-arm.S | 204 +++++++++++++-------------- cipher/cipher-gcm-armv7-neon.S | 24 ++-- cipher/rijndael-arm.S | 106 +++++++------- cipher/rijndael-armv8-aarch32-ce.S | 66 ++++----- cipher/sha512-arm.S | 204 +++++++++++++-------------- cipher/sha512-armv7-neon.S | 78 +++++------ cipher/twofish-arm.S | 62 ++++----- configure.ac | 10 +- mpi/arm/mpih-add1.S | 50 +++---- mpi/arm/mpih-mul1.S | 58 ++++---- mpi/arm/mpih-mul2.S | 78 +++++------ mpi/arm/mpih-mul3.S | 88 ++++++------ mpi/arm/mpih-sub1.S | 52 +++---- 15 files changed, 682 insertions(+), 682 deletions(-) diff --git a/cipher/blowfish-arm.S b/cipher/blowfish-arm.S index b30aa31f..a5101b5c 100644 --- a/cipher/blowfish-arm.S +++ b/cipher/blowfish-arm.S @@ -36,24 +36,24 @@ #define p (s3 + (1 * 256) * 4) /* register macros */ -#define CTXs0 %r0 -#define CTXs1 %r9 -#define CTXs2 %r8 -#define CTXs3 %r10 -#define RMASK %lr -#define RKEYL %r2 -#define RKEYR %ip +#define CTXs0 r0 +#define CTXs1 r9 +#define CTXs2 r8 +#define CTXs3 r10 +#define RMASK lr +#define RKEYL r2 +#define RKEYR ip -#define RL0 %r3 -#define RR0 %r4 +#define RL0 r3 +#define RR0 r4 -#define RL1 %r9 -#define RR1 %r10 +#define RL1 r9 +#define RR1 r10 -#define RT0 %r11 -#define RT1 %r7 -#define RT2 %r5 -#define RT3 %r6 +#define RT0 r11 +#define RT1 r7 +#define RT2 r5 +#define RT3 r6 /* helper macros */ #define ldr_unaligned_le(rout, rsrc, offs, rtmp) \ @@ -250,7 +250,7 @@ __blowfish_enc_blk1: * output: * [RR0, RL0]: dst */ - push {%lr}; + push {lr}; add CTXs1, CTXs0, #(s1 - s0); add CTXs2, CTXs0, #(s2 - s0); @@ -268,7 +268,7 @@ __blowfish_enc_blk1: round_enc(16); add_roundkey_enc(); - pop {%pc}; + pop {pc}; .size __blowfish_enc_blk1,.-__blowfish_enc_blk1; .align 8 @@ -277,22 +277,22 @@ __blowfish_enc_blk1: _gcry_blowfish_arm_do_encrypt: /* input: - * %r0: ctx, CTX - * %r1: u32 *ret_xl - * %r2: u32 *ret_xr + * r0: ctx, CTX + * r1: u32 *ret_xl + * r2: u32 *ret_xr */ - push {%r2, %r4-%r11, %ip, %lr}; + push {r2, r4-r11, ip, lr}; - ldr RL0, [%r1]; - ldr RR0, [%r2]; + ldr RL0, [r1]; + ldr RR0, [r2]; bl __blowfish_enc_blk1; - pop {%r2}; - str RR0, [%r1]; - str RL0, [%r2]; + pop {r2}; + str RR0, [r1]; + str RL0, [r2]; - pop {%r4-%r11, %ip, %pc}; + pop {r4-r11, ip, pc}; .size _gcry_blowfish_arm_do_encrypt,.-_gcry_blowfish_arm_do_encrypt; .align 3 @@ -301,19 +301,19 @@ _gcry_blowfish_arm_do_encrypt: _gcry_blowfish_arm_encrypt_block: /* input: - * %r0: ctx, CTX - * %r1: dst - * %r2: src + * r0: ctx, CTX + * r1: dst + * r2: src */ - push {%r4-%r11, %ip, %lr}; + push {r4-r11, ip, lr}; - read_block(%r2, 0, RL0, RR0, RT0); + read_block(r2, 0, RL0, RR0, RT0); bl __blowfish_enc_blk1; - write_block(%r1, 0, RR0, RL0, RT0, RT1); + write_block(r1, 0, RR0, RL0, RT0, RT1); - pop {%r4-%r11, %ip, %pc}; + pop {r4-r11, ip, pc}; .size _gcry_blowfish_arm_encrypt_block,.-_gcry_blowfish_arm_encrypt_block; .align 3 @@ -322,18 +322,18 @@ _gcry_blowfish_arm_encrypt_block: _gcry_blowfish_arm_decrypt_block: /* input: - * %r0: ctx, CTX - * %r1: dst - * %r2: src + * r0: ctx, CTX + * r1: dst + * r2: src */ - push {%r4-%r11, %ip, %lr}; + push {r4-r11, ip, lr}; add CTXs1, CTXs0, #(s1 - s0); add CTXs2, CTXs0, #(s2 - s0); mov RMASK, #(0xff << 2); /* byte mask */ add CTXs3, CTXs1, #(s3 - s1); - read_block(%r2, 0, RL0, RR0, RT0); + read_block(r2, 0, RL0, RR0, RT0); load_roundkey_dec(17); round_dec(15); @@ -346,9 +346,9 @@ _gcry_blowfish_arm_decrypt_block: 
round_dec(1); add_roundkey_dec(); - write_block(%r1, 0, RR0, RL0, RT0, RT1); + write_block(r1, 0, RR0, RL0, RT0, RT1); - pop {%r4-%r11, %ip, %pc}; + pop {r4-r11, ip, pc}; .size _gcry_blowfish_arm_decrypt_block,.-_gcry_blowfish_arm_decrypt_block; /*********************************************************************** @@ -548,7 +548,7 @@ _gcry_blowfish_arm_enc_blk2: * output: * [RR0, RL0], [RR1, RL1]: dst */ - push {RT0,%lr}; + push {RT0,lr}; add CTXs2, CTXs0, #(s2 - s0); mov RMASK, #(0xff << 2); /* byte mask */ @@ -568,7 +568,7 @@ _gcry_blowfish_arm_enc_blk2: host_to_be(RR1, RT0); host_to_be(RL1, RT0); - pop {RT0,%pc}; + pop {RT0,pc}; .size _gcry_blowfish_arm_enc_blk2,.-_gcry_blowfish_arm_enc_blk2; .align 3 @@ -577,40 +577,40 @@ _gcry_blowfish_arm_enc_blk2: _gcry_blowfish_arm_cfb_dec: /* input: - * %r0: CTX - * %r1: dst (2 blocks) - * %r2: src (2 blocks) - * %r3: iv (64bit) + * r0: CTX + * r1: dst (2 blocks) + * r2: src (2 blocks) + * r3: iv (64bit) */ - push {%r2, %r4-%r11, %ip, %lr}; + push {r2, r4-r11, ip, lr}; - mov %lr, %r3; + mov lr, r3; - /* Load input (iv/%r3 is aligned, src/%r2 might not be) */ - ldm %r3, {RL0, RR0}; + /* Load input (iv/r3 is aligned, src/r2 might not be) */ + ldm r3, {RL0, RR0}; host_to_be(RL0, RT0); host_to_be(RR0, RT0); - read_block(%r2, 0, RL1, RR1, RT0); + read_block(r2, 0, RL1, RR1, RT0); /* Update IV, load src[1] and save to iv[0] */ - read_block_host(%r2, 8, %r5, %r6, RT0); - stm %lr, {%r5, %r6}; + read_block_host(r2, 8, r5, r6, RT0); + stm lr, {r5, r6}; bl _gcry_blowfish_arm_enc_blk2; - /* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */ + /* result in RR0:RL0, RR1:RL1 = r4:r3, r10:r9 */ - /* %r1: dst, %r0: %src */ - pop {%r0}; + /* r1: dst, r0: src */ + pop {r0}; /* dst = src ^ result */ - read_block2_host(%r0, %r5, %r6, %r7, %r8, %lr); - eor %r5, %r4; - eor %r6, %r3; - eor %r7, %r10; - eor %r8, %r9; - write_block2_host(%r1, %r5, %r6, %r7, %r8, %r9, %r10); - - pop {%r4-%r11, %ip, %pc}; + read_block2_host(r0, r5, r6, r7, r8, lr); + eor r5, r4; + eor r6, r3; + eor r7, r10; + eor r8, r9; + write_block2_host(r1, r5, r6, r7, r8, r9, r10); + + pop {r4-r11, ip, pc}; .ltorg .size _gcry_blowfish_arm_cfb_dec,.-_gcry_blowfish_arm_cfb_dec; @@ -620,42 +620,42 @@ _gcry_blowfish_arm_cfb_dec: _gcry_blowfish_arm_ctr_enc: /* input: - * %r0: CTX - * %r1: dst (2 blocks) - * %r2: src (2 blocks) - * %r3: iv (64bit, big-endian) + * r0: CTX + * r1: dst (2 blocks) + * r2: src (2 blocks) + * r3: iv (64bit, big-endian) */ - push {%r2, %r4-%r11, %ip, %lr}; + push {r2, r4-r11, ip, lr}; - mov %lr, %r3; + mov lr, r3; /* Load IV (big => host endian) */ - read_block_aligned(%lr, 0, RL0, RR0, be_to_host, RT0); + read_block_aligned(lr, 0, RL0, RR0, be_to_host, RT0); /* Construct IVs */ adds RR1, RR0, #1; /* +1 */ adc RL1, RL0, #0; - adds %r6, RR1, #1; /* +2 */ - adc %r5, RL1, #0; + adds r6, RR1, #1; /* +2 */ + adc r5, RL1, #0; /* Store new IV (host => big-endian) */ - write_block_aligned(%lr, 0, %r5, %r6, host_to_be, RT0); + write_block_aligned(lr, 0, r5, r6, host_to_be, RT0); bl _gcry_blowfish_arm_enc_blk2; - /* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */ + /* result in RR0:RL0, RR1:RL1 = r4:r3, r10:r9 */ - /* %r1: dst, %r0: %src */ - pop {%r0}; + /* r1: dst, r0: src */ + pop {r0}; /* XOR key-stream with plaintext */ - read_block2_host(%r0, %r5, %r6, %r7, %r8, %lr); - eor %r5, %r4; - eor %r6, %r3; - eor %r7, %r10; - eor %r8, %r9; - write_block2_host(%r1, %r5, %r6, %r7, %r8, %r9, %r10); - - pop {%r4-%r11, %ip, %pc}; + read_block2_host(r0, r5, r6, r7, r8, lr); + eor r5, r4; + eor 
r6, r3; + eor r7, r10; + eor r8, r9; + write_block2_host(r1, r5, r6, r7, r8, r9, r10); + + pop {r4-r11, ip, pc}; .ltorg .size _gcry_blowfish_arm_ctr_enc,.-_gcry_blowfish_arm_ctr_enc; @@ -697,45 +697,45 @@ _gcry_blowfish_arm_dec_blk2: _gcry_blowfish_arm_cbc_dec: /* input: - * %r0: CTX - * %r1: dst (2 blocks) - * %r2: src (2 blocks) - * %r3: iv (64bit) + * r0: CTX + * r1: dst (2 blocks) + * r2: src (2 blocks) + * r3: iv (64bit) */ - push {%r2-%r11, %ip, %lr}; + push {r2-r11, ip, lr}; - read_block2(%r2, RL0, RR0, RL1, RR1, RT0); + read_block2(r2, RL0, RR0, RL1, RR1, RT0); /* dec_blk2 is only used by cbc_dec, jump directly in/out instead * of function call. */ b _gcry_blowfish_arm_dec_blk2; .Ldec_cbc_tail: - /* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */ + /* result in RR0:RL0, RR1:RL1 = r4:r3, r10:r9 */ - /* %r0: %src, %r1: dst, %r2: iv */ - pop {%r0, %r2}; + /* r0: src, r1: dst, r2: iv */ + pop {r0, r2}; - /* load IV+1 (src[0]) to %r7:%r8. Might be unaligned. */ - read_block_host(%r0, 0, %r7, %r8, %r5); - /* load IV (iv[0]) to %r5:%r6. 'iv' is aligned. */ - ldm %r2, {%r5, %r6}; + /* load IV+1 (src[0]) to r7:r8. Might be unaligned. */ + read_block_host(r0, 0, r7, r8, r5); + /* load IV (iv[0]) to r5:r6. 'iv' is aligned. */ + ldm r2, {r5, r6}; /* out[1] ^= IV+1 */ - eor %r10, %r7; - eor %r9, %r8; + eor r10, r7; + eor r9, r8; /* out[0] ^= IV */ - eor %r4, %r5; - eor %r3, %r6; + eor r4, r5; + eor r3, r6; - /* load IV+2 (src[1]) to %r7:%r8. Might be unaligned. */ - read_block_host(%r0, 8, %r7, %r8, %r5); + /* load IV+2 (src[1]) to r7:r8. Might be unaligned. */ + read_block_host(r0, 8, r7, r8, r5); /* store IV+2 to iv[0] (aligned). */ - stm %r2, {%r7, %r8}; + stm r2, {r7, r8}; /* store result to dst[0-3]. Might be unaligned. */ - write_block2_host(%r1, %r4, %r3, %r10, %r9, %r5, %r6); + write_block2_host(r1, r4, r3, r10, r9, r5, r6); - pop {%r4-%r11, %ip, %pc}; + pop {r4-r11, ip, pc}; .ltorg .size _gcry_blowfish_arm_cbc_dec,.-_gcry_blowfish_arm_cbc_dec; diff --git a/cipher/camellia-arm.S b/cipher/camellia-arm.S index a3d87d11..decd40c2 100644 --- a/cipher/camellia-arm.S +++ b/cipher/camellia-arm.S @@ -45,23 +45,23 @@ #define key_table 0 /* register macros */ -#define CTX %r0 -#define RTAB1 %ip -#define RTAB3 %r1 -#define RMASK %lr +#define CTX r0 +#define RTAB1 ip +#define RTAB3 r1 +#define RMASK lr -#define IL %r2 -#define IR %r3 +#define IL r2 +#define IR r3 -#define XL %r4 -#define XR %r5 -#define YL %r6 -#define YR %r7 +#define XL r4 +#define XR r5 +#define YL r6 +#define YR r7 -#define RT0 %r8 -#define RT1 %r9 -#define RT2 %r10 -#define RT3 %r11 +#define RT0 r8 +#define RT1 r9 +#define RT2 r10 +#define RT3 r11 /* helper macros */ #define ldr_unaligned_be(rout, rsrc, offs, rtmp) \ @@ -248,7 +248,7 @@ (n) * 2 + 0, (n) * 2 + 1); #define inpack(n) \ - ldr_input_be(%r2, XL, XR, YL, YR, RT0); \ + ldr_input_be(r2, XL, XR, YL, YR, RT0); \ ldr RT0, [CTX, #(key_table + ((n) * 8) + 0)]; \ ldr RT1, [CTX, #(key_table + ((n) * 8) + 4)]; \ eor XL, RT0; \ @@ -259,7 +259,7 @@ ldr RT1, [CTX, #(key_table + ((n) * 8) + 4)]; \ eor YL, RT0; \ eor YR, RT1; \ - str_output_be(%r1, YL, YR, XL, XR, RT0, RT1); + str_output_be(r1, YL, YR, XL, XR, RT0, RT1); .align 3 .globl _gcry_camellia_arm_encrypt_block @@ -267,17 +267,17 @@ _gcry_camellia_arm_encrypt_block: /* input: - * %r0: keytable - * %r1: dst - * %r2: src - * %r3: keybitlen + * r0: keytable + * r1: dst + * r2: src + * r3: keybitlen */ - push {%r1, %r4-%r11, %ip, %lr}; + push {r1, r4-r11, ip, lr}; GET_DATA_POINTER(RTAB1, .Lcamellia_sp1110, RTAB3); mov RMASK, 
#0xff; add RTAB3, RTAB1, #(2 * 4); - push {%r3}; + push {r3}; mov RMASK, RMASK, lsl#4 /* byte mask */ inpack(0); @@ -292,20 +292,20 @@ _gcry_camellia_arm_encrypt_block: cmp RT0, #(16 * 8); bne .Lenc_256; - pop {%r1}; + pop {r1}; outunpack(24); - pop {%r4-%r11, %ip, %pc}; + pop {r4-r11, ip, pc}; .ltorg .Lenc_256: enc_fls(24); enc_rounds(24); - pop {%r1}; + pop {r1}; outunpack(32); - pop {%r4-%r11, %ip, %pc}; + pop {r4-r11, ip, pc}; .ltorg .size _gcry_camellia_arm_encrypt_block,.-_gcry_camellia_arm_encrypt_block; @@ -315,19 +315,19 @@ _gcry_camellia_arm_encrypt_block: _gcry_camellia_arm_decrypt_block: /* input: - * %r0: keytable - * %r1: dst - * %r2: src - * %r3: keybitlen + * r0: keytable + * r1: dst + * r2: src + * r3: keybitlen */ - push {%r1, %r4-%r11, %ip, %lr}; + push {r1, r4-r11, ip, lr}; GET_DATA_POINTER(RTAB1, .Lcamellia_sp1110, RTAB3); mov RMASK, #0xff; add RTAB3, RTAB1, #(2 * 4); mov RMASK, RMASK, lsl#4 /* byte mask */ - cmp %r3, #(16 * 8); + cmp r3, #(16 * 8); bne .Ldec_256; inpack(24); @@ -339,10 +339,10 @@ _gcry_camellia_arm_decrypt_block: dec_fls(8); dec_rounds(0); - pop {%r1}; + pop {r1}; outunpack(0); - pop {%r4-%r11, %ip, %pc}; + pop {r4-r11, ip, pc}; .ltorg .Ldec_256: diff --git a/cipher/cast5-arm.S b/cipher/cast5-arm.S index 76ddd2e3..ae53e6b4 100644 --- a/cipher/cast5-arm.S +++ b/cipher/cast5-arm.S @@ -50,25 +50,25 @@ #define Kr_arm_dec (Kr_arm_enc + (16)) /* register macros */ -#define CTX %r0 -#define Rs1 %r7 -#define Rs2 %r8 -#define Rs3 %r9 -#define Rs4 %r10 -#define RMASK %r11 -#define RKM %r1 -#define RKR %r2 - -#define RL0 %r3 -#define RR0 %r4 - -#define RL1 %r9 -#define RR1 %r10 - -#define RT0 %lr -#define RT1 %ip -#define RT2 %r5 -#define RT3 %r6 +#define CTX r0 +#define Rs1 r7 +#define Rs2 r8 +#define Rs3 r9 +#define Rs4 r10 +#define RMASK r11 +#define RKM r1 +#define RKR r2 + +#define RL0 r3 +#define RR0 r4 + +#define RL1 r9 +#define RR1 r10 + +#define RT0 lr +#define RT1 ip +#define RT2 r5 +#define RT3 r6 /* helper macros */ #define ldr_unaligned_le(rout, rsrc, offs, rtmp) \ @@ -267,11 +267,11 @@ _gcry_cast5_arm_encrypt_block: /* input: - * %r0: CTX - * %r1: dst - * %r2: src + * r0: CTX + * r1: dst + * r2: src */ - push {%r1, %r4-%r11, %ip, %lr}; + push {r1, r4-r11, ip, lr}; GET_DATA_POINTER(Rs1, _gcry_cast5_s1to4, Rs2); mov RMASK, #(0xff << 2); @@ -279,7 +279,7 @@ _gcry_cast5_arm_encrypt_block: add Rs3, Rs1, #(0x100*4*2); add Rs4, Rs1, #(0x100*4*3); - read_block(%r2, 0, RL0, RR0, RT0); + read_block(r2, 0, RL0, RR0, RT0); load_km(0); load_kr(0); @@ -300,10 +300,10 @@ _gcry_cast5_arm_encrypt_block: enc_round(14, F3, RL0, RR0, load_km, shift_kr, dummy); enc_round(15, F1, RR0, RL0, dummy, dummy, dummy); - ldr %r1, [%sp], #4; - write_block(%r1, 0, RR0, RL0, RT0, RT1); + ldr r1, [sp], #4; + write_block(r1, 0, RR0, RL0, RT0, RT1); - pop {%r4-%r11, %ip, %pc}; + pop {r4-r11, ip, pc}; .ltorg .size _gcry_cast5_arm_encrypt_block,.-_gcry_cast5_arm_encrypt_block; @@ -313,11 +313,11 @@ _gcry_cast5_arm_encrypt_block: _gcry_cast5_arm_decrypt_block: /* input: - * %r0: CTX - * %r1: dst - * %r2: src + * r0: CTX + * r1: dst + * r2: src */ - push {%r1, %r4-%r11, %ip, %lr}; + push {r1, r4-r11, ip, lr}; GET_DATA_POINTER(Rs1, _gcry_cast5_s1to4, Rs2); mov RMASK, #(0xff << 2); @@ -325,7 +325,7 @@ _gcry_cast5_arm_decrypt_block: add Rs3, Rs1, #(0x100 * 4 * 2); add Rs4, Rs1, #(0x100 * 4 * 3); - read_block(%r2, 0, RL0, RR0, RT0); + read_block(r2, 0, RL0, RR0, RT0); load_km(15); load_dec_kr(15); @@ -346,10 +346,10 @@ _gcry_cast5_arm_decrypt_block: dec_round(1, F2, RL0, RR0, load_km, 
shift_kr, dummy); dec_round(0, F1, RR0, RL0, dummy, dummy, dummy); - ldr %r1, [%sp], #4; - write_block(%r1, 0, RR0, RL0, RT0, RT1); + ldr r1, [sp], #4; + write_block(r1, 0, RR0, RL0, RT0, RT1); - pop {%r4-%r11, %ip, %pc}; + pop {r4-r11, ip, pc}; .ltorg .size _gcry_cast5_arm_decrypt_block,.-_gcry_cast5_arm_decrypt_block; @@ -511,7 +511,7 @@ _gcry_cast5_arm_enc_blk2: * output: * [RR0, RL0], [RR1, RL1]: dst */ - push {%lr}; + push {lr}; GET_DATA_POINTER(Rs1, _gcry_cast5_s1to4, Rs2); mov RMASK, #(0xff << 2); @@ -541,7 +541,7 @@ _gcry_cast5_arm_enc_blk2: host_to_be(RR1, RT0); host_to_be(RL1, RT0); - pop {%pc}; + pop {pc}; .ltorg .size _gcry_cast5_arm_enc_blk2,.-_gcry_cast5_arm_enc_blk2; @@ -551,40 +551,40 @@ _gcry_cast5_arm_enc_blk2: _gcry_cast5_arm_cfb_dec: /* input: - * %r0: CTX - * %r1: dst (2 blocks) - * %r2: src (2 blocks) - * %r3: iv (64bit) + * r0: CTX + * r1: dst (2 blocks) + * r2: src (2 blocks) + * r3: iv (64bit) */ - push {%r1, %r2, %r4-%r11, %ip, %lr}; + push {r1, r2, r4-r11, ip, lr}; - mov %lr, %r3; + mov lr, r3; - /* Load input (iv/%r3 is aligned, src/%r2 might not be) */ - ldm %r3, {RL0, RR0}; + /* Load input (iv/r3 is aligned, src/r2 might not be) */ + ldm r3, {RL0, RR0}; host_to_be(RL0, RT1); host_to_be(RR0, RT1); - read_block(%r2, 0, RL1, RR1, %ip); + read_block(r2, 0, RL1, RR1, ip); /* Update IV, load src[1] and save to iv[0] */ - read_block_host(%r2, 8, %r5, %r6, %r7); - stm %lr, {%r5, %r6}; + read_block_host(r2, 8, r5, r6, r7); + stm lr, {r5, r6}; bl _gcry_cast5_arm_enc_blk2; - /* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */ + /* result in RR0:RL0, RR1:RL1 = r4:r3, r10:r9 */ - /* %r0: dst, %r1: %src */ - pop {%r0, %r1}; + /* r0: dst, r1: src */ + pop {r0, r1}; /* dst = src ^ result */ - read_block2_host(%r1, %r5, %r6, %r7, %r8, %lr); - eor %r5, %r4; - eor %r6, %r3; - eor %r7, %r10; - eor %r8, %r9; - write_block2_host(%r0, %r5, %r6, %r7, %r8, %r1, %r2); - - pop {%r4-%r11, %ip, %pc}; + read_block2_host(r1, r5, r6, r7, r8, lr); + eor r5, r4; + eor r6, r3; + eor r7, r10; + eor r8, r9; + write_block2_host(r0, r5, r6, r7, r8, r1, r2); + + pop {r4-r11, ip, pc}; .ltorg .size _gcry_cast5_arm_cfb_dec,.-_gcry_cast5_arm_cfb_dec; @@ -594,42 +594,42 @@ _gcry_cast5_arm_cfb_dec: _gcry_cast5_arm_ctr_enc: /* input: - * %r0: CTX - * %r1: dst (2 blocks) - * %r2: src (2 blocks) - * %r3: iv (64bit, big-endian) + * r0: CTX + * r1: dst (2 blocks) + * r2: src (2 blocks) + * r3: iv (64bit, big-endian) */ - push {%r1, %r2, %r4-%r11, %ip, %lr}; + push {r1, r2, r4-r11, ip, lr}; - mov %lr, %r3; + mov lr, r3; /* Load IV (big => host endian) */ - read_block_aligned(%lr, 0, RL0, RR0, be_to_host, RT1); + read_block_aligned(lr, 0, RL0, RR0, be_to_host, RT1); /* Construct IVs */ adds RR1, RR0, #1; /* +1 */ adc RL1, RL0, #0; - adds %r6, RR1, #1; /* +2 */ - adc %r5, RL1, #0; + adds r6, RR1, #1; /* +2 */ + adc r5, RL1, #0; /* Store new IV (host => big-endian) */ - write_block_aligned(%lr, 0, %r5, %r6, host_to_be, RT1); + write_block_aligned(lr, 0, r5, r6, host_to_be, RT1); bl _gcry_cast5_arm_enc_blk2; - /* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */ + /* result in RR0:RL0, RR1:RL1 = r4:r3, r10:r9 */ - /* %r0: dst, %r1: %src */ - pop {%r0, %r1}; + /* r0: dst, r1: src */ + pop {r0, r1}; /* XOR key-stream with plaintext */ - read_block2_host(%r1, %r5, %r6, %r7, %r8, %lr); - eor %r5, %r4; - eor %r6, %r3; - eor %r7, %r10; - eor %r8, %r9; - write_block2_host(%r0, %r5, %r6, %r7, %r8, %r1, %r2); - - pop {%r4-%r11, %ip, %pc}; + read_block2_host(r1, r5, r6, r7, r8, lr); + eor r5, r4; + eor r6, r3; + eor r7, 
r10; + eor r8, r9; + write_block2_host(r0, r5, r6, r7, r8, r1, r2); + + pop {r4-r11, ip, pc}; .ltorg .size _gcry_cast5_arm_ctr_enc,.-_gcry_cast5_arm_ctr_enc; @@ -682,45 +682,45 @@ _gcry_cast5_arm_dec_blk2: _gcry_cast5_arm_cbc_dec: /* input: - * %r0: CTX - * %r1: dst (2 blocks) - * %r2: src (2 blocks) - * %r3: iv (64bit) + * r0: CTX + * r1: dst (2 blocks) + * r2: src (2 blocks) + * r3: iv (64bit) */ - push {%r1-%r11, %ip, %lr}; + push {r1-r11, ip, lr}; - read_block2(%r2, RL0, RR0, RL1, RR1, RT0); + read_block2(r2, RL0, RR0, RL1, RR1, RT0); /* dec_blk2 is only used by cbc_dec, jump directly in/out instead * of function call. */ b _gcry_cast5_arm_dec_blk2; .Ldec_cbc_tail: - /* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */ + /* result in RR0:RL0, RR1:RL1 = r4:r3, r10:r9 */ - /* %r0: dst, %r1: %src, %r2: iv */ - pop {%r0-%r2}; + /* r0: dst, r1: src, r2: iv */ + pop {r0-r2}; - /* load IV+1 (src[0]) to %r7:%r8. Might be unaligned. */ - read_block_host(%r1, 0, %r7, %r8, %r5); - /* load IV (iv[0]) to %r5:%r6. 'iv' is aligned. */ - ldm %r2, {%r5, %r6}; + /* load IV+1 (src[0]) to r7:r8. Might be unaligned. */ + read_block_host(r1, 0, r7, r8, r5); + /* load IV (iv[0]) to r5:r6. 'iv' is aligned. */ + ldm r2, {r5, r6}; /* out[1] ^= IV+1 */ - eor %r10, %r7; - eor %r9, %r8; + eor r10, r7; + eor r9, r8; /* out[0] ^= IV */ - eor %r4, %r5; - eor %r3, %r6; + eor r4, r5; + eor r3, r6; - /* load IV+2 (src[1]) to %r7:%r8. Might be unaligned. */ - read_block_host(%r1, 8, %r7, %r8, %r5); + /* load IV+2 (src[1]) to r7:r8. Might be unaligned. */ + read_block_host(r1, 8, r7, r8, r5); /* store IV+2 to iv[0] (aligned). */ - stm %r2, {%r7, %r8}; + stm r2, {r7, r8}; /* store result to dst[0-3]. Might be unaligned. */ - write_block2_host(%r0, %r4, %r3, %r10, %r9, %r5, %r6); + write_block2_host(r0, r4, r3, r10, r9, r5, r6); - pop {%r4-%r11, %ip, %pc}; + pop {r4-r11, ip, pc}; .ltorg .size _gcry_cast5_arm_cbc_dec,.-_gcry_cast5_arm_cbc_dec; diff --git a/cipher/cipher-gcm-armv7-neon.S b/cipher/cipher-gcm-armv7-neon.S index 16502b4a..c7027af3 100644 --- a/cipher/cipher-gcm-armv7-neon.S +++ b/cipher/cipher-gcm-armv7-neon.S @@ -121,21 +121,21 @@ gcry_gcm_reduction_constant: * Engineering ? MoCrySEn, 2013". 
*/ #define vmull_p64(rq, rl, rh, ad, bd) \ - vext.8 t0l, ad, ad, $1; \ + vext.8 t0l, ad, ad, #1; \ vmull.p8 t0q, t0l, bd; \ - vext.8 rl, bd, bd, $1; \ + vext.8 rl, bd, bd, #1; \ vmull.p8 rq, ad, rl; \ - vext.8 t1l, ad, ad, $2; \ + vext.8 t1l, ad, ad, #2; \ vmull.p8 t1q, t1l, bd; \ - vext.8 t3l, bd, bd, $2; \ + vext.8 t3l, bd, bd, #2; \ vmull.p8 t3q, ad, t3l; \ - vext.8 t2l, ad, ad, $3; \ + vext.8 t2l, ad, ad, #3; \ vmull.p8 t2q, t2l, bd; \ veor t0q, t0q, rq; \ - vext.8 rl, bd, bd, $3; \ + vext.8 rl, bd, bd, #3; \ vmull.p8 rq, ad, rl; \ veor t1q, t1q, t3q; \ - vext.8 t3l, bd, bd, $4; \ + vext.8 t3l, bd, bd, #4; \ vmull.p8 t3q, ad, t3l; \ veor t0l, t0l, t0h; \ vand t0h, t0h, k48; \ @@ -147,13 +147,13 @@ gcry_gcm_reduction_constant: veor t2l, t2l, t2h; \ vand t2h, t2h, k16; \ veor t3l, t3l, t3h; \ - vmov.i64 t3h, $0; \ - vext.8 t0q, t0q, t0q, $15; \ + vmov.i64 t3h, #0; \ + vext.8 t0q, t0q, t0q, #15; \ veor t2l, t2l, t2h; \ - vext.8 t1q, t1q, t1q, $14; \ + vext.8 t1q, t1q, t1q, #14; \ vmull.p8 rq, ad, bd; \ - vext.8 t2q, t2q, t2q, $13; \ - vext.8 t3q, t3q, t3q, $12; \ + vext.8 t2q, t2q, t2q, #13; \ + vext.8 t3q, t3q, t3q, #12; \ veor t0q, t0q, t1q; \ veor t2q, t2q, t3q; \ veor rq, rq, t0q; \ diff --git a/cipher/rijndael-arm.S b/cipher/rijndael-arm.S index e680c817..632daac2 100644 --- a/cipher/rijndael-arm.S +++ b/cipher/rijndael-arm.S @@ -29,23 +29,23 @@ .arm /* register macros */ -#define CTX %r0 -#define RTAB %lr -#define RMASK %ip +#define CTX r0 +#define RTAB lr +#define RMASK ip -#define RA %r4 -#define RB %r5 -#define RC %r6 -#define RD %r7 +#define RA r4 +#define RB r5 +#define RC r6 +#define RD r7 -#define RNA %r8 -#define RNB %r9 -#define RNC %r10 -#define RND %r11 +#define RNA r8 +#define RNB r9 +#define RNC r10 +#define RND r11 -#define RT0 %r1 -#define RT1 %r2 -#define RT2 %r3 +#define RT0 r1 +#define RT1 r2 +#define RT2 r3 /* helper macros */ #define ldr_unaligned_le(rout, rsrc, offs, rtmp) \ @@ -216,30 +216,30 @@ _gcry_aes_arm_encrypt_block: /* input: - * %r0: keysched, CTX - * %r1: dst - * %r2: src - * %r3: number of rounds.. 10, 12 or 14 - * %st+0: encryption table + * r0: keysched, CTX + * r1: dst + * r2: src + * r3: number of rounds.. 
10, 12 or 14 + * st+0: encryption table */ - push {%r4-%r11, %ip, %lr}; + push {r4-r11, ip, lr}; /* read input block */ /* test if src is unaligned */ - tst %r2, #3; + tst r2, #3; beq 1f; /* unaligned load */ - ldr_unaligned_le(RA, %r2, 0, RNA); - ldr_unaligned_le(RB, %r2, 4, RNB); - ldr_unaligned_le(RC, %r2, 8, RNA); - ldr_unaligned_le(RD, %r2, 12, RNB); + ldr_unaligned_le(RA, r2, 0, RNA); + ldr_unaligned_le(RB, r2, 4, RNB); + ldr_unaligned_le(RC, r2, 8, RNA); + ldr_unaligned_le(RD, r2, 12, RNB); b 2f; .ltorg 1: /* aligned load */ - ldm %r2, {RA, RB, RC, RD}; + ldm r2, {RA, RB, RC, RD}; #ifndef __ARMEL__ rev RA, RA; rev RB, RB; @@ -247,12 +247,12 @@ _gcry_aes_arm_encrypt_block: rev RD, RD; #endif 2: - ldr RTAB, [%sp, #40]; - sub %sp, #16; + ldr RTAB, [sp, #40]; + sub sp, #16; - str %r1, [%sp, #4]; /* dst */ + str r1, [sp, #4]; /* dst */ mov RMASK, #0xff; - str %r3, [%sp, #8]; /* nrounds */ + str r3, [sp, #8]; /* nrounds */ mov RMASK, RMASK, lsl#2; /* byte mask */ firstencround(0, RA, RB, RC, RD, RNA, RNB, RNC, RND); @@ -264,7 +264,7 @@ _gcry_aes_arm_encrypt_block: encround(6, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key); encround(7, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key); - ldr RT0, [%sp, #8]; /* nrounds */ + ldr RT0, [sp, #8]; /* nrounds */ cmp RT0, #12; bge .Lenc_not_128; @@ -272,8 +272,8 @@ _gcry_aes_arm_encrypt_block: lastencround(9, RNA, RNB, RNC, RND, RA, RB, RC, RD); .Lenc_done: - ldr RT0, [%sp, #4]; /* dst */ - add %sp, #16; + ldr RT0, [sp, #4]; /* dst */ + add sp, #16; /* store output block */ @@ -301,7 +301,7 @@ _gcry_aes_arm_encrypt_block: 2: mov r0, #(10 * 4); - pop {%r4-%r11, %ip, %pc}; + pop {r4-r11, ip, pc}; .ltorg .Lenc_not_128: @@ -473,30 +473,30 @@ _gcry_aes_arm_encrypt_block: _gcry_aes_arm_decrypt_block: /* input: - * %r0: keysched, CTX - * %r1: dst - * %r2: src - * %r3: number of rounds.. 10, 12 or 14 - * %st+0: decryption table + * r0: keysched, CTX + * r1: dst + * r2: src + * r3: number of rounds.. 
10, 12 or 14 + * st+0: decryption table */ - push {%r4-%r11, %ip, %lr}; + push {r4-r11, ip, lr}; /* read input block */ /* test if src is unaligned */ - tst %r2, #3; + tst r2, #3; beq 1f; /* unaligned load */ - ldr_unaligned_le(RA, %r2, 0, RNA); - ldr_unaligned_le(RB, %r2, 4, RNB); - ldr_unaligned_le(RC, %r2, 8, RNA); - ldr_unaligned_le(RD, %r2, 12, RNB); + ldr_unaligned_le(RA, r2, 0, RNA); + ldr_unaligned_le(RB, r2, 4, RNB); + ldr_unaligned_le(RC, r2, 8, RNA); + ldr_unaligned_le(RD, r2, 12, RNB); b 2f; .ltorg 1: /* aligned load */ - ldm %r2, {RA, RB, RC, RD}; + ldm r2, {RA, RB, RC, RD}; #ifndef __ARMEL__ rev RA, RA; rev RB, RB; @@ -504,14 +504,14 @@ _gcry_aes_arm_decrypt_block: rev RD, RD; #endif 2: - ldr RTAB, [%sp, #40]; - sub %sp, #16; + ldr RTAB, [sp, #40]; + sub sp, #16; mov RMASK, #0xff; - str %r1, [%sp, #4]; /* dst */ + str r1, [sp, #4]; /* dst */ mov RMASK, RMASK, lsl#2; /* byte mask */ - cmp %r3, #12; + cmp r3, #12; bge .Ldec_256; firstdecround(9, RA, RB, RC, RD, RNA, RNB, RNC, RND); @@ -526,8 +526,8 @@ _gcry_aes_arm_decrypt_block: decround(1, RA, RB, RC, RD, RNA, RNB, RNC, RND, set_last_round_rmask); lastdecround(0, RNA, RNB, RNC, RND, RA, RB, RC, RD); - ldr RT0, [%sp, #4]; /* dst */ - add %sp, #16; + ldr RT0, [sp, #4]; /* dst */ + add sp, #16; /* store output block */ @@ -554,7 +554,7 @@ _gcry_aes_arm_decrypt_block: stm RT0, {RA, RB, RC, RD}; 2: mov r0, #(10 * 4); - pop {%r4-%r11, %ip, %pc}; + pop {r4-r11, ip, pc}; .ltorg .Ldec_256: diff --git a/cipher/rijndael-armv8-aarch32-ce.S b/cipher/rijndael-armv8-aarch32-ce.S index 6208652b..3c4149b3 100644 --- a/cipher/rijndael-armv8-aarch32-ce.S +++ b/cipher/rijndael-armv8-aarch32-ce.S @@ -483,9 +483,9 @@ _gcry_aes_cbc_enc_armv8_ce: * r1: outbuf * r2: inbuf * r3: iv - * %st+0: nblocks => r4 - * %st+4: cbc_mac => r5 - * %st+8: nrounds => r6 + * st+0: nblocks => r4 + * st+4: cbc_mac => r5 + * st+8: nrounds => r6 */ push {r4-r6,lr} /* 4*4 = 16b */ @@ -563,8 +563,8 @@ _gcry_aes_cbc_dec_armv8_ce: * r1: outbuf * r2: inbuf * r3: iv - * %st+0: nblocks => r4 - * %st+4: nrounds => r5 + * st+0: nblocks => r4 + * st+4: nrounds => r5 */ push {r4-r6,lr} /* 4*4 = 16b */ @@ -670,7 +670,7 @@ _gcry_aes_ecb_enc_armv8_ce: * r1: outbuf * r2: inbuf * r3: nblocks - * %st+0: nrounds => r4 + * st+0: nrounds => r4 */ push {r4-r6,lr} /* 4*4 = 16b */ @@ -755,7 +755,7 @@ _gcry_aes_ecb_dec_armv8_ce: * r1: outbuf * r2: inbuf * r3: nblocks - * %st+0: nrounds => r4 + * st+0: nrounds => r4 */ push {r4-r6,lr} /* 4*4 = 16b */ @@ -812,8 +812,8 @@ _gcry_aes_cfb_enc_armv8_ce: * r1: outbuf * r2: inbuf * r3: iv - * %st+0: nblocks => r4 - * %st+4: nrounds => r5 + * st+0: nblocks => r4 + * st+4: nrounds => r5 */ push {r4-r6,lr} /* 4*4 = 16b */ @@ -888,8 +888,8 @@ _gcry_aes_cfb_dec_armv8_ce: * r1: outbuf * r2: inbuf * r3: iv - * %st+0: nblocks => r4 - * %st+4: nrounds => r5 + * st+0: nblocks => r4 + * st+4: nrounds => r5 */ push {r4-r6,lr} /* 4*4 = 16b */ @@ -996,8 +996,8 @@ _gcry_aes_ctr_enc_armv8_ce: * r1: outbuf * r2: inbuf * r3: iv - * %st+0: nblocks => r4 - * %st+4: nrounds => r5 + * st+0: nblocks => r4 + * st+4: nrounds => r5 */ vpush {q4-q7} @@ -1176,8 +1176,8 @@ _gcry_aes_ctr32le_enc_armv8_ce: * r1: outbuf * r2: inbuf * r3: iv - * %st+0: nblocks => r4 - * %st+4: nrounds => r5 + * st+0: nblocks => r4 + * st+4: nrounds => r5 */ vpush {q4-q7} @@ -1301,11 +1301,11 @@ _gcry_aes_ocb_enc_armv8_ce: * r1: outbuf * r2: inbuf * r3: offset - * %st+0: checksum => r4 - * %st+4: Ls => r5 - * %st+8: nblocks => r6 (0 < nblocks <= 32) - * %st+12: nrounds => r7 - * %st+16: blkn => lr + * 
st+0: checksum => r4 + * st+4: Ls => r5 + * st+8: nblocks => r6 (0 < nblocks <= 32) + * st+12: nrounds => r7 + * st+16: blkn => lr */ vpush {q4-q7} @@ -1476,11 +1476,11 @@ _gcry_aes_ocb_dec_armv8_ce: * r1: outbuf * r2: inbuf * r3: offset - * %st+0: checksum => r4 - * %st+4: Ls => r5 - * %st+8: nblocks => r6 (0 < nblocks <= 32) - * %st+12: nrounds => r7 - * %st+16: blkn => lr + * st+0: checksum => r4 + * st+4: Ls => r5 + * st+8: nblocks => r6 (0 < nblocks <= 32) + * st+12: nrounds => r7 + * st+16: blkn => lr */ vpush {q4-q7} @@ -1650,10 +1650,10 @@ _gcry_aes_ocb_auth_armv8_ce: * r1: abuf * r2: offset * r3: checksum - * %st+0: Ls => r5 - * %st+4: nblocks => r6 (0 < nblocks <= 32) - * %st+8: nrounds => r7 - * %st+12: blkn => lr + * st+0: Ls => r5 + * st+4: nblocks => r6 (0 < nblocks <= 32) + * st+8: nrounds => r7 + * st+12: blkn => lr */ vpush {q4-q7} @@ -1801,8 +1801,8 @@ _gcry_aes_xts_enc_armv8_ce: * r1: outbuf * r2: inbuf * r3: iv - * %st+0: nblocks => r4 - * %st+4: nrounds => r5 + * st+0: nblocks => r4 + * st+4: nrounds => r5 */ vpush {q4-q7} @@ -1956,8 +1956,8 @@ _gcry_aes_xts_dec_armv8_ce: * r1: outbuf * r2: inbuf * r3: iv - * %st+0: nblocks => r4 - * %st+4: nrounds => r5 + * st+0: nblocks => r4 + * st+4: nrounds => r5 */ vpush {q4-q7} diff --git a/cipher/sha512-arm.S b/cipher/sha512-arm.S index 94ec0141..1e1d296f 100644 --- a/cipher/sha512-arm.S +++ b/cipher/sha512-arm.S @@ -38,23 +38,23 @@ #define hd_h ((hd_g) + 8) /* register macros */ -#define RK %r2 +#define RK r2 -#define RElo %r0 -#define REhi %r1 +#define RElo r0 +#define REhi r1 -#define RT1lo %r3 -#define RT1hi %r4 -#define RT2lo %r5 -#define RT2hi %r6 -#define RWlo %r7 -#define RWhi %r8 -#define RT3lo %r9 -#define RT3hi %r10 -#define RT4lo %r11 -#define RT4hi %ip +#define RT1lo r3 +#define RT1hi r4 +#define RT2lo r5 +#define RT2hi r6 +#define RWlo r7 +#define RWhi r8 +#define RT3lo r9 +#define RT3hi r10 +#define RT4lo r11 +#define RT4hi ip -#define RRND %lr +#define RRND lr /* variable offsets in stack */ #define ctx (0) @@ -150,13 +150,13 @@ mov RWhi, REhi, lsr#14; \ eor RWlo, RWlo, RElo, lsr#18; \ eor RWhi, RWhi, REhi, lsr#18; \ - ldr RT3lo, [%sp, #(_f)]; \ + ldr RT3lo, [sp, #(_f)]; \ adds RT1lo, RT2lo; /* t1 += K */ \ - ldr RT3hi, [%sp, #(_f) + 4]; \ + ldr RT3hi, [sp, #(_f) + 4]; \ adc RT1hi, RT2hi; \ - ldr RT4lo, [%sp, #(_g)]; \ + ldr RT4lo, [sp, #(_g)]; \ eor RWlo, RWlo, RElo, lsl#23; \ - ldr RT4hi, [%sp, #(_g) + 4]; \ + ldr RT4hi, [sp, #(_g) + 4]; \ eor RWhi, RWhi, REhi, lsl#23; \ eor RWlo, RWlo, REhi, lsl#18; \ eor RWhi, RWhi, RElo, lsl#18; \ @@ -177,29 +177,29 @@ \ /* Load D */ \ /* t1 += Cho(_e,_f,_g) */ \ - ldr RElo, [%sp, #(_d)]; \ + ldr RElo, [sp, #(_d)]; \ adds RT1lo, RT3lo; \ - ldr REhi, [%sp, #(_d) + 4]; \ + ldr REhi, [sp, #(_d) + 4]; \ adc RT1hi, RT3hi; \ \ /* Load A */ \ - ldr RT3lo, [%sp, #(_a)]; \ + ldr RT3lo, [sp, #(_a)]; \ \ /* _d += t1 */ \ adds RElo, RT1lo; \ - ldr RT3hi, [%sp, #(_a) + 4]; \ + ldr RT3hi, [sp, #(_a) + 4]; \ adc REhi, RT1hi; \ \ /* Store D */ \ - str RElo, [%sp, #(_d)]; \ + str RElo, [sp, #(_d)]; \ \ /* t2 = Sum0(_a) */ \ mov RT2lo, RT3lo, lsr#28; \ - str REhi, [%sp, #(_d) + 4]; \ + str REhi, [sp, #(_d) + 4]; \ mov RT2hi, RT3hi, lsr#28; \ - ldr RWlo, [%sp, #(_b)]; \ + ldr RWlo, [sp, #(_b)]; \ eor RT2lo, RT2lo, RT3lo, lsl#30; \ - ldr RWhi, [%sp, #(_b) + 4]; \ + ldr RWhi, [sp, #(_b) + 4]; \ eor RT2hi, RT2hi, RT3hi, lsl#30; \ eor RT2lo, RT2lo, RT3lo, lsl#25; \ eor RT2hi, RT2hi, RT3hi, lsl#25; \ @@ -212,11 +212,11 @@ \ /* t2 += t1 */ \ adds RT2lo, RT1lo; \ - ldr RT1lo, [%sp, #(_c)]; \ + ldr 
RT1lo, [sp, #(_c)]; \ adc RT2hi, RT1hi; \ \ /* Maj(_a,_b,_c) => ((_a & _b) ^ (_c & (_a ^ _b))) */ \ - ldr RT1hi, [%sp, #(_c) + 4]; \ + ldr RT1hi, [sp, #(_c) + 4]; \ and RT4lo, RWlo, RT3lo; \ and RT4hi, RWhi, RT3hi; \ eor RWlo, RWlo, RT3lo; \ @@ -229,36 +229,36 @@ /* Message expansion */ #define W_0_63(_a,_h,i) \ - ldr RT3lo, [%sp, #(w(i-2))]; \ + ldr RT3lo, [sp, #(w(i-2))]; \ adds RT2lo, RWlo; /* _h = t2 + Maj(_a,_b,_c) */ \ - ldr RT3hi, [%sp, #(w(i-2)) + 4]; \ + ldr RT3hi, [sp, #(w(i-2)) + 4]; \ adc RT2hi, RWhi; \ /* nw = S1(w[i-2]) */ \ - ldr RT1lo, [%sp, #(_h)]; /* Load H */ \ + ldr RT1lo, [sp, #(_h)]; /* Load H */ \ mov RWlo, RT3lo, lsr#19; \ - str RT2lo, [%sp, #(_a)]; \ + str RT2lo, [sp, #(_a)]; \ eor RWlo, RWlo, RT3lo, lsl#3; \ - ldr RT1hi, [%sp, #(_h) + 4]; \ + ldr RT1hi, [sp, #(_h) + 4]; \ mov RWhi, RT3hi, lsr#19; \ - ldr RT2lo, [%sp, #(w(i-7))]; \ + ldr RT2lo, [sp, #(w(i-7))]; \ eor RWhi, RWhi, RT3hi, lsl#3; \ - str RT2hi, [%sp, #(_a) + 4]; \ + str RT2hi, [sp, #(_a) + 4]; \ eor RWlo, RWlo, RT3lo, lsr#6; \ - ldr RT2hi, [%sp, #(w(i-7)) + 4]; \ + ldr RT2hi, [sp, #(w(i-7)) + 4]; \ eor RWhi, RWhi, RT3hi, lsr#6; \ eor RWlo, RWlo, RT3hi, lsl#13; \ eor RWhi, RWhi, RT3lo, lsl#13; \ eor RWlo, RWlo, RT3hi, lsr#29; \ eor RWhi, RWhi, RT3lo, lsr#29; \ - ldr RT3lo, [%sp, #(w(i-15))]; \ + ldr RT3lo, [sp, #(w(i-15))]; \ eor RWlo, RWlo, RT3hi, lsl#26; \ - ldr RT3hi, [%sp, #(w(i-15)) + 4]; \ + ldr RT3hi, [sp, #(w(i-15)) + 4]; \ \ adds RT2lo, RWlo; /* nw += w[i-7] */ \ - ldr RWlo, [%sp, #(w(i-16))]; \ + ldr RWlo, [sp, #(w(i-16))]; \ adc RT2hi, RWhi; \ mov RT4lo, RT3lo, lsr#1; /* S0(w[i-15]) */ \ - ldr RWhi, [%sp, #(w(i-16)) + 4]; \ + ldr RWhi, [sp, #(w(i-16)) + 4]; \ mov RT4hi, RT3hi, lsr#1; \ adds RT2lo, RWlo; /* nw += w[i-16] */ \ eor RT4lo, RT4lo, RT3lo, lsr#8; \ @@ -277,20 +277,20 @@ adc RT2hi, RT4hi; \ \ /* w[0] = nw */ \ - str RT2lo, [%sp, #(w(i))]; \ + str RT2lo, [sp, #(w(i))]; \ adds RT1lo, RWlo; \ - str RT2hi, [%sp, #(w(i)) + 4]; \ + str RT2hi, [sp, #(w(i)) + 4]; \ adc RT1hi, RWhi; #define W_64_79(_a,_h,i) \ adds RT2lo, RWlo; /* _h = t2 + Maj(_a,_b,_c) */ \ - ldr RWlo, [%sp, #(w(i-16))]; \ + ldr RWlo, [sp, #(w(i-16))]; \ adc RT2hi, RWhi; \ - ldr RWhi, [%sp, #(w(i-16)) + 4]; \ - ldr RT1lo, [%sp, #(_h)]; /* Load H */ \ - ldr RT1hi, [%sp, #(_h) + 4]; \ - str RT2lo, [%sp, #(_a)]; \ - str RT2hi, [%sp, #(_a) + 4]; \ + ldr RWhi, [sp, #(w(i-16)) + 4]; \ + ldr RT1lo, [sp, #(_h)]; /* Load H */ \ + ldr RT1hi, [sp, #(_h) + 4]; \ + str RT2lo, [sp, #(_a)]; \ + str RT2hi, [sp, #(_a) + 4]; \ adds RT1lo, RWlo; \ adc RT1hi, RWhi; @@ -300,72 +300,72 @@ _gcry_sha512_transform_arm: /* Input: - * %r0: SHA512_CONTEXT - * %r1: data - * %r2: u64 k[] constants - * %r3: nblks + * r0: SHA512_CONTEXT + * r1: data + * r2: u64 k[] constants + * r3: nblks */ - push {%r4-%r11, %ip, %lr}; - sub %sp, %sp, #STACK_MAX; - movs RWlo, %r3; - str %r0, [%sp, #(ctx)]; + push {r4-r11, ip, lr}; + sub sp, sp, #STACK_MAX; + movs RWlo, r3; + str r0, [sp, #(ctx)]; beq .Ldone; .Loop_blocks: - str RWlo, [%sp, #nblks]; + str RWlo, [sp, #nblks]; /* Load context to stack */ - add RWhi, %sp, #(_a); - ldm %r0!, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi} + add RWhi, sp, #(_a); + ldm r0!, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi} stm RWhi!, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi} - ldm %r0, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi} + ldm r0, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi} stm RWhi, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi} /* Load input to w[16] */ /* test if data is unaligned */ - tst 
%r1, #3; + tst r1, #3; beq 1f; /* unaligned load */ - add RWhi, %sp, #(w(0)); - read_be64_unaligned_4(%r1, 0 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo); + add RWhi, sp, #(w(0)); + read_be64_unaligned_4(r1, 0 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo); stm RWhi!, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi} - read_be64_unaligned_4(%r1, 4 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo); + read_be64_unaligned_4(r1, 4 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo); stm RWhi!, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi} - read_be64_unaligned_4(%r1, 8 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo); + read_be64_unaligned_4(r1, 8 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo); stm RWhi!, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi} - read_be64_unaligned_4(%r1, 12 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo); + read_be64_unaligned_4(r1, 12 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo); b 2f; 1: /* aligned load */ - add RWhi, %sp, #(w(0)); - read_be64_aligned_4(%r1, 0 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo); + add RWhi, sp, #(w(0)); + read_be64_aligned_4(r1, 0 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo); stm RWhi!, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi} - read_be64_aligned_4(%r1, 4 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo); + read_be64_aligned_4(r1, 4 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo); stm RWhi!, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi} - read_be64_aligned_4(%r1, 8 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo); + read_be64_aligned_4(r1, 8 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo); stm RWhi!, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi} - read_be64_aligned_4(%r1, 12 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo); + read_be64_aligned_4(r1, 12 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo); 2: - add %r1, #(16 * 8); + add r1, #(16 * 8); stm RWhi, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi} - str %r1, [%sp, #(data)]; + str r1, [sp, #(data)]; /* preload E & A */ - ldr RElo, [%sp, #(_e)]; - ldr REhi, [%sp, #(_e) + 4]; + ldr RElo, [sp, #(_e)]; + ldr REhi, [sp, #(_e) + 4]; mov RWlo, #0; - ldr RT2lo, [%sp, #(_a)]; + ldr RT2lo, [sp, #(_a)]; mov RRND, #(80-16); - ldr RT2hi, [%sp, #(_a) + 4]; + ldr RT2hi, [sp, #(_a) + 4]; mov RWhi, #0; .Loop_rounds: @@ -406,58 +406,58 @@ _gcry_sha512_transform_arm: R(_c, _d, _e, _f, _g, _h, _a, _b, W_64_79, 30); R(_b, _c, _d, _e, _f, _g, _h, _a, W_64_79, 31); - ldr %r0, [%sp, #(ctx)]; + ldr r0, [sp, #(ctx)]; adds RT2lo, RWlo; /* _h = t2 + Maj(_a,_b,_c) */ - ldr %r1, [%sp, #(data)]; + ldr r1, [sp, #(data)]; adc RT2hi, RWhi; - ldm %r0, {RT1lo,RT1hi,RWlo,RWhi,RT3lo,RT3hi,RT4lo,RT4hi} + ldm r0, {RT1lo,RT1hi,RWlo,RWhi,RT3lo,RT3hi,RT4lo,RT4hi} adds RT1lo, RT2lo; - ldr RT2lo, [%sp, #(_b + 0)]; + ldr RT2lo, [sp, #(_b + 0)]; adc RT1hi, RT2hi; - ldr RT2hi, [%sp, #(_b + 4)]; + ldr RT2hi, [sp, #(_b + 4)]; adds RWlo, RT2lo; - ldr RT2lo, [%sp, #(_c + 0)]; + ldr RT2lo, [sp, #(_c + 0)]; adc RWhi, RT2hi; - ldr RT2hi, [%sp, #(_c + 4)]; + ldr RT2hi, [sp, #(_c + 4)]; adds RT3lo, RT2lo; - ldr RT2lo, [%sp, #(_d + 0)]; + ldr RT2lo, [sp, #(_d + 0)]; adc RT3hi, RT2hi; - ldr RT2hi, [%sp, #(_d + 4)]; + ldr RT2hi, [sp, #(_d + 4)]; adds RT4lo, RT2lo; - ldr RT2lo, [%sp, #(_e + 0)]; + ldr RT2lo, [sp, #(_e 
+ 0)]; adc RT4hi, RT2hi; - stm %r0!, {RT1lo,RT1hi,RWlo,RWhi,RT3lo,RT3hi,RT4lo,RT4hi} + stm r0!, {RT1lo,RT1hi,RWlo,RWhi,RT3lo,RT3hi,RT4lo,RT4hi} - ldr RT2hi, [%sp, #(_e + 4)]; - ldm %r0, {RT1lo,RT1hi,RWlo,RWhi,RT3lo,RT3hi,RT4lo,RT4hi} + ldr RT2hi, [sp, #(_e + 4)]; + ldm r0, {RT1lo,RT1hi,RWlo,RWhi,RT3lo,RT3hi,RT4lo,RT4hi} adds RT1lo, RT2lo; - ldr RT2lo, [%sp, #(_f + 0)]; + ldr RT2lo, [sp, #(_f + 0)]; adc RT1hi, RT2hi; - ldr RT2hi, [%sp, #(_f + 4)]; + ldr RT2hi, [sp, #(_f + 4)]; adds RWlo, RT2lo; - ldr RT2lo, [%sp, #(_g + 0)]; + ldr RT2lo, [sp, #(_g + 0)]; adc RWhi, RT2hi; - ldr RT2hi, [%sp, #(_g + 4)]; + ldr RT2hi, [sp, #(_g + 4)]; adds RT3lo, RT2lo; - ldr RT2lo, [%sp, #(_h + 0)]; + ldr RT2lo, [sp, #(_h + 0)]; adc RT3hi, RT2hi; - ldr RT2hi, [%sp, #(_h + 4)]; + ldr RT2hi, [sp, #(_h + 4)]; adds RT4lo, RT2lo; adc RT4hi, RT2hi; - stm %r0, {RT1lo,RT1hi,RWlo,RWhi,RT3lo,RT3hi,RT4lo,RT4hi} - sub %r0, %r0, #(4 * 8); - ldr RWlo, [%sp, #nblks]; + stm r0, {RT1lo,RT1hi,RWlo,RWhi,RT3lo,RT3hi,RT4lo,RT4hi} + sub r0, r0, #(4 * 8); + ldr RWlo, [sp, #nblks]; sub RK, #(80 * 8); subs RWlo, #1; bne .Loop_blocks; .Ldone: - mov %r0, #STACK_MAX; + mov r0, #STACK_MAX; __out: - add %sp, %sp, #STACK_MAX; - pop {%r4-%r11, %ip, %pc}; + add sp, sp, #STACK_MAX; + pop {r4-r11, ip, pc}; .size _gcry_sha512_transform_arm,.-_gcry_sha512_transform_arm; #endif diff --git a/cipher/sha512-armv7-neon.S b/cipher/sha512-armv7-neon.S index 2b186b47..a1df73b8 100644 --- a/cipher/sha512-armv7-neon.S +++ b/cipher/sha512-armv7-neon.S @@ -40,7 +40,7 @@ #define hd_g ((hd_f) + 8) /* register macros */ -#define RK %r2 +#define RK r2 #define RA d0 #define RB d1 @@ -287,26 +287,26 @@ _gcry_sha512_transform_armv7_neon: /* Input: - * %r0: SHA512_CONTEXT - * %r1: data - * %r2: u64 k[] constants - * %r3: nblks + * r0: SHA512_CONTEXT + * r1: data + * r2: u64 k[] constants + * r3: nblks */ - push {%lr}; + push {lr}; - mov %lr, #0; + mov lr, #0; /* Load context to d0-d7 */ - vld1.64 {RA-RD}, [%r0]!; - vld1.64 {RE-RH}, [%r0]; - sub %r0, #(4*8); + vld1.64 {RA-RD}, [r0]!; + vld1.64 {RE-RH}, [r0]; + sub r0, #(4*8); /* Load input to w[16], d16-d31 */ /* NOTE: Assumes that on ARMv7 unaligned accesses are always allowed. 
*/ - vld1.64 {RW0-RW3}, [%r1]!; - vld1.64 {RW4-RW7}, [%r1]!; - vld1.64 {RW8-RW11}, [%r1]!; - vld1.64 {RW12-RW15}, [%r1]!; + vld1.64 {RW0-RW3}, [r1]!; + vld1.64 {RW4-RW7}, [r1]!; + vld1.64 {RW8-RW11}, [r1]!; + vld1.64 {RW12-RW15}, [r1]!; #ifdef __ARMEL__ /* byteswap */ vrev64.8 RW01q, RW01q; @@ -334,46 +334,46 @@ _gcry_sha512_transform_armv7_neon: rounds2_0_63(RC, RD, RE, RF, RG, RH, RA, RB, RW6, RW7, RW67q, RW8, RW89q, RW45q, RW15, RW0, vadd_RT01q, RW45q); rounds2_0_63(RA, RB, RC, RD, RE, RF, RG, RH, RW8, RW9, RW89q, RW10, RW1011q, RW67q, RW1, RW2, vadd_RT01q, RW67q); rounds2_0_63(RG, RH, RA, RB, RC, RD, RE, RF, RW10, RW11, RW1011q, RW12, RW1213q, RW89q, RW3, RW4, vadd_RT01q, RW89q); - add %lr, #16; + add lr, #16; rounds2_0_63(RE, RF, RG, RH, RA, RB, RC, RD, RW12, RW13, RW1213q, RW14, RW1415q, RW1011q, RW5, RW6, vadd_RT01q, RW1011q); - cmp %lr, #64; + cmp lr, #64; rounds2_0_63(RC, RD, RE, RF, RG, RH, RA, RB, RW14, RW15, RW1415q, RW0, RW01q, RW1213q, RW7, RW8, vadd_RT01q, RW1213q); bne .Loop_rounds; - subs %r3, #1; + subs r3, #1; rounds2_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW0, RW1, vadd_RT01q, RW1415q, dummy, _); rounds2_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW2, RW3, vadd_rg_RT0, RG, vadd_rg_RT1, RG); beq .Lhandle_tail; - vld1.64 {RW0-RW3}, [%r1]!; + vld1.64 {RW0-RW3}, [r1]!; rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW4, RW5, vadd_rg_RT0, RE, vadd_rg_RT1, RE); rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW6, RW7, vadd_rg_RT0, RC, vadd_rg_RT1, RC); #ifdef __ARMEL__ vrev64.8 RW01q, RW01q; vrev64.8 RW23q, RW23q; #endif - vld1.64 {RW4-RW7}, [%r1]!; + vld1.64 {RW4-RW7}, [r1]!; rounds2_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW8, RW9, vadd_rg_RT0, RA, vadd_rg_RT1, RA); rounds2_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW10, RW11, vadd_rg_RT0, RG, vadd_rg_RT1, RG); #ifdef __ARMEL__ vrev64.8 RW45q, RW45q; vrev64.8 RW67q, RW67q; #endif - vld1.64 {RW8-RW11}, [%r1]!; + vld1.64 {RW8-RW11}, [r1]!; rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW12, RW13, vadd_rg_RT0, RE, vadd_rg_RT1, RE); rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW14, RW15, vadd_rg_RT0, RC, vadd_rg_RT1, RC); #ifdef __ARMEL__ vrev64.8 RW89q, RW89q; vrev64.8 RW1011q, RW1011q; #endif - vld1.64 {RW12-RW15}, [%r1]!; + vld1.64 {RW12-RW15}, [r1]!; vadd_rg_RT0(RA); vadd_rg_RT1(RA); /* Load context */ - vld1.64 {RT0-RT3}, [%r0]!; - vld1.64 {RT4-RT7}, [%r0]; - sub %r0, #(4*8); + vld1.64 {RT0-RT3}, [r0]!; + vld1.64 {RT4-RT7}, [r0]; + sub r0, #(4*8); #ifdef __ARMEL__ vrev64.8 RW1213q, RW1213q; @@ -390,11 +390,11 @@ _gcry_sha512_transform_armv7_neon: vadd.u64 RH, RT7; /* Store the first half of context */ - vst1.64 {RA-RD}, [%r0]!; + vst1.64 {RA-RD}, [r0]!; sub RK, $(8*80); - vst1.64 {RE-RH}, [%r0]; /* Store the last half of context */ - mov %lr, #0; - sub %r0, #(4*8); + vst1.64 {RE-RH}, [r0]; /* Store the last half of context */ + mov lr, #0; + sub r0, #(4*8); b .Loop; .ltorg @@ -408,11 +408,11 @@ _gcry_sha512_transform_armv7_neon: rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW14, RW15, vadd_rg_RT0, RC, vadd_rg_RT1, RC); /* Load context to d16-d23 */ - vld1.64 {RW0-RW3}, [%r0]!; + vld1.64 {RW0-RW3}, [r0]!; vadd_rg_RT0(RA); - vld1.64 {RW4-RW7}, [%r0]; + vld1.64 {RW4-RW7}, [r0]; vadd_rg_RT1(RA); - sub %r0, #(4*8); + sub r0, #(4*8); vadd.u64 RA, RW0; vadd.u64 RB, RW1; @@ -424,7 +424,7 @@ _gcry_sha512_transform_armv7_neon: vadd.u64 RH, RW7; /* Store the first half of context */ - vst1.64 {RA-RD}, [%r0]!; + vst1.64 {RA-RD}, [r0]!; /* Clear used registers */ /* d16-d31 */ @@ -432,7 +432,7 @@ _gcry_sha512_transform_armv7_neon: 
CLEAR_REG(RW23q); CLEAR_REG(RW45q); CLEAR_REG(RW67q); - vst1.64 {RE-RH}, [%r0]; /* Store the last half of context */ + vst1.64 {RE-RH}, [r0]; /* Store the last half of context */ CLEAR_REG(RW89q); CLEAR_REG(RW1011q); CLEAR_REG(RW1213q); @@ -440,13 +440,13 @@ _gcry_sha512_transform_armv7_neon: /* d8-d15 */ vpop {RT0-RT7}; /* d0-d7 (q0-q3) */ - CLEAR_REG(%q0); - CLEAR_REG(%q1); - CLEAR_REG(%q2); - CLEAR_REG(%q3); + CLEAR_REG(q0); + CLEAR_REG(q1); + CLEAR_REG(q2); + CLEAR_REG(q3); - eor %r0, %r0; - pop {%pc}; + eor r0, r0; + pop {pc}; .size _gcry_sha512_transform_armv7_neon,.-_gcry_sha512_transform_armv7_neon; #endif diff --git a/cipher/twofish-arm.S b/cipher/twofish-arm.S index 2e1da6cd..b381e546 100644 --- a/cipher/twofish-arm.S +++ b/cipher/twofish-arm.S @@ -37,25 +37,25 @@ #define k ((w) + 4 * 8) /* register macros */ -#define CTX %r0 -#define CTXs0 %r0 -#define CTXs1 %r1 -#define CTXs3 %r7 +#define CTX r0 +#define CTXs0 r0 +#define CTXs1 r1 +#define CTXs3 r7 -#define RA %r3 -#define RB %r4 -#define RC %r5 -#define RD %r6 +#define RA r3 +#define RB r4 +#define RC r5 +#define RD r6 -#define RX %r2 -#define RY %ip +#define RX r2 +#define RY ip -#define RMASK %lr +#define RMASK lr -#define RT0 %r8 -#define RT1 %r9 -#define RT2 %r10 -#define RT3 %r11 +#define RT0 r8 +#define RT1 r9 +#define RT2 r10 +#define RT3 r11 /* helper macros */ #define ldr_unaligned_le(rout, rsrc, offs, rtmp) \ @@ -262,15 +262,15 @@ _gcry_twofish_arm_encrypt_block: /* input: - * %r0: ctx - * %r1: dst - * %r2: src + * r0: ctx + * r1: dst + * r2: src */ - push {%r1, %r4-%r11, %ip, %lr}; + push {r1, r4-r11, ip, lr}; add RY, CTXs0, #w; - ldr_input_le(%r2, RA, RB, RC, RD, RT0); + ldr_input_le(r2, RA, RB, RC, RD, RT0); /* Input whitening */ ldm RY, {RT0, RT1, RT2, RT3}; @@ -292,7 +292,7 @@ _gcry_twofish_arm_encrypt_block: last_encrypt_cycle(7); add RY, CTXs3, #(w + 4*4 - s3); - pop {%r1}; /* dst */ + pop {r1}; /* dst */ /* Output whitening */ ldm RY, {RT0, RT1, RT2, RT3}; @@ -301,9 +301,9 @@ _gcry_twofish_arm_encrypt_block: eor RA, RA, RT2; eor RB, RB, RT3; - str_output_le(%r1, RC, RD, RA, RB, RT0, RT1); + str_output_le(r1, RC, RD, RA, RB, RT0, RT1); - pop {%r4-%r11, %ip, %pc}; + pop {r4-r11, ip, pc}; .ltorg .size _gcry_twofish_arm_encrypt_block,.-_gcry_twofish_arm_encrypt_block; @@ -313,15 +313,15 @@ _gcry_twofish_arm_encrypt_block: _gcry_twofish_arm_decrypt_block: /* input: - * %r0: ctx - * %r1: dst - * %r2: src + * r0: ctx + * r1: dst + * r2: src */ - push {%r1, %r4-%r11, %ip, %lr}; + push {r1, r4-r11, ip, lr}; add CTXs3, CTXs0, #(s3 - s0); - ldr_input_le(%r2, RC, RD, RA, RB, RT0); + ldr_input_le(r2, RC, RD, RA, RB, RT0); add RY, CTXs3, #(w + 4*4 - s3); add CTXs3, CTXs0, #(s3 - s0); @@ -345,7 +345,7 @@ _gcry_twofish_arm_decrypt_block: last_decrypt_cycle(0); add RY, CTXs0, #w; - pop {%r1}; /* dst */ + pop {r1}; /* dst */ /* Output whitening */ ldm RY, {RT0, RT1, RT2, RT3}; @@ -354,9 +354,9 @@ _gcry_twofish_arm_decrypt_block: eor RC, RC, RT2; eor RD, RD, RT3; - str_output_le(%r1, RA, RB, RC, RD, RT0, RT1); + str_output_le(r1, RA, RB, RC, RD, RT0, RT1); - pop {%r4-%r11, %ip, %pc}; + pop {r4-r11, ip, pc}; .size _gcry_twofish_arm_decrypt_block,.-_gcry_twofish_arm_decrypt_block; #endif /*HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS*/ diff --git a/configure.ac b/configure.ac index cd804305..cc1104ca 100644 --- a/configure.ac +++ b/configure.ac @@ -1181,7 +1181,7 @@ AC_CACHE_CHECK([whether GCC assembler is compatible for ARM assembly implementat ".text\n\t" /* Following causes error if assembler ignored '.syntax unified'. 
*/ "asmfunc:\n\t" - "add %r0, %r0, %r4, ror #12;\n\t" + "add r0, r0, r4, ror #12;\n\t" /* Test if '.type' and '.size' are supported. */ ".size asmfunc,.-asmfunc;\n\t" @@ -1864,10 +1864,10 @@ AC_CACHE_CHECK([whether GCC inline assembler supports NEON instructions], ".fpu neon\n\t" ".text\n\t" "testfn:\n\t" - "vld1.64 {%q0-%q1}, [%r0]!;\n\t" - "vrev64.8 %q0, %q3;\n\t" - "vadd.u64 %q0, %q1;\n\t" - "vadd.s64 %d3, %d2, %d3;\n\t" + "vld1.64 {q0-q1}, [r0]!;\n\t" + "vrev64.8 q0, q3;\n\t" + "vadd.u64 q0, q1;\n\t" + "vadd.s64 d3, d2, d3;\n\t" ); void testfn(void); ]], [ testfn(); ])], diff --git a/mpi/arm/mpih-add1.S b/mpi/arm/mpih-add1.S index 09e8b3b2..d59d3f3d 100644 --- a/mpi/arm/mpih-add1.S +++ b/mpi/arm/mpih-add1.S @@ -29,10 +29,10 @@ /******************* * mpi_limb_t - * _gcry_mpih_add_n( mpi_ptr_t res_ptr, %r0 - * mpi_ptr_t s1_ptr, %r1 - * mpi_ptr_t s2_ptr, %r2 - * mpi_size_t size) %r3 + * _gcry_mpih_add_n( mpi_ptr_t res_ptr, r0 + * mpi_ptr_t s1_ptr, r1 + * mpi_ptr_t s2_ptr, r2 + * mpi_size_t size) r3 */ .text @@ -40,37 +40,37 @@ .globl _gcry_mpih_add_n .type _gcry_mpih_add_n,%function _gcry_mpih_add_n: - push {%r4, %r5, %r6, %r7, %r8, %r9, %r10, %lr}; - cmn %r0, #0; /* clear carry flag */ + push {r4, r5, r6, r7, r8, r9, r10, lr}; + cmn r0, #0; /* clear carry flag */ - tst %r3, #3; + tst r3, #3; beq .Large_loop; .Loop: - ldr %r4, [%r1], #4; - sub %r3, #1; - ldr %lr, [%r2], #4; - adcs %r4, %lr; - tst %r3, #3; - str %r4, [%r0], #4; + ldr r4, [r1], #4; + sub r3, #1; + ldr lr, [r2], #4; + adcs r4, lr; + tst r3, #3; + str r4, [r0], #4; bne .Loop; - teq %r3, #0; + teq r3, #0; beq .Lend; .Large_loop: - ldm %r1!, {%r4, %r6, %r8, %r10}; - ldm %r2!, {%r5, %r7, %r9, %lr}; - sub %r3, #4; - adcs %r4, %r5; - adcs %r6, %r7; - adcs %r8, %r9; - adcs %r10, %lr; - teq %r3, #0; - stm %r0!, {%r4, %r6, %r8, %r10}; + ldm r1!, {r4, r6, r8, r10}; + ldm r2!, {r5, r7, r9, lr}; + sub r3, #4; + adcs r4, r5; + adcs r6, r7; + adcs r8, r9; + adcs r10, lr; + teq r3, #0; + stm r0!, {r4, r6, r8, r10}; bne .Large_loop; .Lend: - adc %r0, %r3, #0; - pop {%r4, %r5, %r6, %r7, %r8, %r9, %r10, %pc}; + adc r0, r3, #0; + pop {r4, r5, r6, r7, r8, r9, r10, pc}; .size _gcry_mpih_add_n,.-_gcry_mpih_add_n; diff --git a/mpi/arm/mpih-mul1.S b/mpi/arm/mpih-mul1.S index c2e2854b..ea196e8b 100644 --- a/mpi/arm/mpih-mul1.S +++ b/mpi/arm/mpih-mul1.S @@ -29,10 +29,10 @@ /******************* * mpi_limb_t - * _gcry_mpih_mul_1( mpi_ptr_t res_ptr, %r0 - * mpi_ptr_t s1_ptr, %r1 - * mpi_size_t s1_size, %r2 - * mpi_limb_t s2_limb) %r3 + * _gcry_mpih_mul_1( mpi_ptr_t res_ptr, r0 + * mpi_ptr_t s1_ptr, r1 + * mpi_size_t s1_size, r2 + * mpi_limb_t s2_limb) r3 */ .text @@ -40,41 +40,41 @@ .globl _gcry_mpih_mul_1 .type _gcry_mpih_mul_1,%function _gcry_mpih_mul_1: - push {%r4, %r5, %r6, %r7, %r8, %r9, %r10, %r11, %lr}; - mov %r4, #0; + push {r4, r5, r6, r7, r8, r9, r10, r11, lr}; + mov r4, #0; - tst %r2, #3; + tst r2, #3; beq .Large_loop; .Loop: - ldr %r5, [%r1], #4; - mov %lr, #0; - umlal %r4, %lr, %r5, %r3; - sub %r2, #1; - str %r4, [%r0], #4; - tst %r2, #3; - mov %r4, %lr; + ldr r5, [r1], #4; + mov lr, #0; + umlal r4, lr, r5, r3; + sub r2, #1; + str r4, [r0], #4; + tst r2, #3; + mov r4, lr; bne .Loop; - teq %r2, #0; + teq r2, #0; beq .Lend; .Large_loop: - ldm %r1!, {%r5, %r6, %r7, %r8}; - mov %r9, #0; - mov %r10, #0; - umlal %r4, %r9, %r5, %r3; - mov %r11, #0; - umlal %r9, %r10, %r6, %r3; - str %r4, [%r0], #4; - mov %r4, #0; - umlal %r10, %r11, %r7, %r3; - subs %r2, #4; - umlal %r11, %r4, %r8, %r3; - stm %r0!, {%r9, %r10, %r11}; + ldm r1!, {r5, r6, r7, r8}; 
+ mov r9, #0; + mov r10, #0; + umlal r4, r9, r5, r3; + mov r11, #0; + umlal r9, r10, r6, r3; + str r4, [r0], #4; + mov r4, #0; + umlal r10, r11, r7, r3; + subs r2, #4; + umlal r11, r4, r8, r3; + stm r0!, {r9, r10, r11}; bne .Large_loop; .Lend: - mov %r0, %r4; - pop {%r4, %r5, %r6, %r7, %r8, %r9, %r10, %r11, %pc}; + mov r0, r4; + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}; .size _gcry_mpih_mul_1,.-_gcry_mpih_mul_1; diff --git a/mpi/arm/mpih-mul2.S b/mpi/arm/mpih-mul2.S index bce932e9..8793b20f 100644 --- a/mpi/arm/mpih-mul2.S +++ b/mpi/arm/mpih-mul2.S @@ -29,10 +29,10 @@ /******************* * mpi_limb_t - * _gcry_mpih_addmul_1( mpi_ptr_t res_ptr, %r0 - * mpi_ptr_t s1_ptr, %r1 - * mpi_size_t s1_size, %r2 - * mpi_limb_t s2_limb) %r3 + * _gcry_mpih_addmul_1( mpi_ptr_t res_ptr, r0 + * mpi_ptr_t s1_ptr, r1 + * mpi_size_t s1_size, r2 + * mpi_limb_t s2_limb) r3 */ .text @@ -40,55 +40,55 @@ .globl _gcry_mpih_addmul_1 .type _gcry_mpih_addmul_1,%function _gcry_mpih_addmul_1: - push {%r4, %r5, %r6, %r8, %r10, %lr}; - mov %lr, #0; - cmn %r0, #0; /* clear carry flag */ + push {r4, r5, r6, r8, r10, lr}; + mov lr, #0; + cmn r0, #0; /* clear carry flag */ - tst %r2, #3; + tst r2, #3; beq .Large_loop; .Loop: - ldr %r5, [%r1], #4; - ldr %r4, [%r0]; - sub %r2, #1; - adcs %r4, %lr; - mov %lr, #0; - umlal %r4, %lr, %r5, %r3; - tst %r2, #3; - str %r4, [%r0], #4; + ldr r5, [r1], #4; + ldr r4, [r0]; + sub r2, #1; + adcs r4, lr; + mov lr, #0; + umlal r4, lr, r5, r3; + tst r2, #3; + str r4, [r0], #4; bne .Loop; - teq %r2, #0; + teq r2, #0; beq .Lend; .Large_loop: - ldr %r5, [%r1], #4; - ldm %r0, {%r4, %r6, %r8, %r10}; + ldr r5, [r1], #4; + ldm r0, {r4, r6, r8, r10}; - sub %r2, #4; - adcs %r4, %lr; - mov %lr, #0; - umlal %r4, %lr, %r5, %r3; + sub r2, #4; + adcs r4, lr; + mov lr, #0; + umlal r4, lr, r5, r3; - ldr %r5, [%r1], #4; - adcs %r6, %lr; - mov %lr, #0; - umlal %r6, %lr, %r5, %r3; + ldr r5, [r1], #4; + adcs r6, lr; + mov lr, #0; + umlal r6, lr, r5, r3; - ldr %r5, [%r1], #4; - adcs %r8, %lr; - mov %lr, #0; - umlal %r8, %lr, %r5, %r3; + ldr r5, [r1], #4; + adcs r8, lr; + mov lr, #0; + umlal r8, lr, r5, r3; - ldr %r5, [%r1], #4; - adcs %r10, %lr; - mov %lr, #0; - umlal %r10, %lr, %r5, %r3; + ldr r5, [r1], #4; + adcs r10, lr; + mov lr, #0; + umlal r10, lr, r5, r3; - teq %r2, #0; - stm %r0!, {%r4, %r6, %r8, %r10}; + teq r2, #0; + stm r0!, {r4, r6, r8, r10}; bne .Large_loop; .Lend: - adc %r0, %lr, #0; - pop {%r4, %r5, %r6, %r8, %r10, %pc}; + adc r0, lr, #0; + pop {r4, r5, r6, r8, r10, pc}; .size _gcry_mpih_addmul_1,.-_gcry_mpih_addmul_1; diff --git a/mpi/arm/mpih-mul3.S b/mpi/arm/mpih-mul3.S index 33326c78..2477c089 100644 --- a/mpi/arm/mpih-mul3.S +++ b/mpi/arm/mpih-mul3.S @@ -29,10 +29,10 @@ /******************* * mpi_limb_t - * _gcry_mpih_submul_1( mpi_ptr_t res_ptr, %r0 - * mpi_ptr_t s1_ptr, %r1 - * mpi_size_t s1_size, %r2 - * mpi_limb_t s2_limb) %r3 + * _gcry_mpih_submul_1( mpi_ptr_t res_ptr, r0 + * mpi_ptr_t s1_ptr, r1 + * mpi_size_t s1_size, r2 + * mpi_limb_t s2_limb) r3 */ .text @@ -40,61 +40,61 @@ .globl _gcry_mpih_submul_1 .type _gcry_mpih_submul_1,%function _gcry_mpih_submul_1: - push {%r4, %r5, %r6, %r8, %r9, %r10, %lr}; - mov %lr, #0; - cmp %r0, #0; /* prepare carry flag for sbc */ + push {r4, r5, r6, r8, r9, r10, lr}; + mov lr, #0; + cmp r0, #0; /* prepare carry flag for sbc */ - tst %r2, #3; + tst r2, #3; beq .Large_loop; .Loop: - ldr %r5, [%r1], #4; - mov %r4, %lr; - mov %lr, #0; - ldr %r6, [%r0]; - umlal %r4, %lr, %r5, %r3; - sub %r2, #1; - sbcs %r4, %r6, %r4; - tst %r2, #3; - str %r4, [%r0], #4; + 
ldr r5, [r1], #4; + mov r4, lr; + mov lr, #0; + ldr r6, [r0]; + umlal r4, lr, r5, r3; + sub r2, #1; + sbcs r4, r6, r4; + tst r2, #3; + str r4, [r0], #4; bne .Loop; - teq %r2, #0; + teq r2, #0; beq .Lend; .Large_loop: - ldr %r5, [%r1], #4; - mov %r9, #0; - ldr %r4, [%r0, #0]; + ldr r5, [r1], #4; + mov r9, #0; + ldr r4, [r0, #0]; - umlal %lr, %r9, %r5, %r3; - ldr %r6, [%r0, #4]; - ldr %r5, [%r1], #4; - sbcs %r4, %r4, %lr; + umlal lr, r9, r5, r3; + ldr r6, [r0, #4]; + ldr r5, [r1], #4; + sbcs r4, r4, lr; - mov %lr, #0; - umlal %r9, %lr, %r5, %r3; - ldr %r8, [%r0, #8]; - ldr %r5, [%r1], #4; - sbcs %r6, %r6, %r9; + mov lr, #0; + umlal r9, lr, r5, r3; + ldr r8, [r0, #8]; + ldr r5, [r1], #4; + sbcs r6, r6, r9; - mov %r9, #0; - umlal %lr, %r9, %r5, %r3; - ldr %r10, [%r0, #12]; - ldr %r5, [%r1], #4; - sbcs %r8, %r8, %lr; + mov r9, #0; + umlal lr, r9, r5, r3; + ldr r10, [r0, #12]; + ldr r5, [r1], #4; + sbcs r8, r8, lr; - mov %lr, #0; - umlal %r9, %lr, %r5, %r3; - sub %r2, #4; - sbcs %r10, %r10, %r9; + mov lr, #0; + umlal r9, lr, r5, r3; + sub r2, #4; + sbcs r10, r10, r9; - teq %r2, #0; - stm %r0!, {%r4, %r6, %r8, %r10}; + teq r2, #0; + stm r0!, {r4, r6, r8, r10}; bne .Large_loop; .Lend: it cc - movcc %r2, #1; - add %r0, %lr, %r2; - pop {%r4, %r5, %r6, %r8, %r9, %r10, %pc}; + movcc r2, #1; + add r0, lr, r2; + pop {r4, r5, r6, r8, r9, r10, pc}; .size _gcry_mpih_submul_1,.-_gcry_mpih_submul_1; diff --git a/mpi/arm/mpih-sub1.S b/mpi/arm/mpih-sub1.S index 593e3cde..476d8a33 100644 --- a/mpi/arm/mpih-sub1.S +++ b/mpi/arm/mpih-sub1.S @@ -29,10 +29,10 @@ /******************* * mpi_limb_t - * _gcry_mpih_sub_n( mpi_ptr_t res_ptr, %r0 - * mpi_ptr_t s1_ptr, %r1 - * mpi_ptr_t s2_ptr, %r2 - * mpi_size_t size) %r3 + * _gcry_mpih_sub_n( mpi_ptr_t res_ptr, r0 + * mpi_ptr_t s1_ptr, r1 + * mpi_ptr_t s2_ptr, r2 + * mpi_size_t size) r3 */ .text @@ -40,38 +40,38 @@ .globl _gcry_mpih_sub_n .type _gcry_mpih_sub_n,%function _gcry_mpih_sub_n: - push {%r4, %r5, %r6, %r7, %r8, %r9, %r10, %lr}; - cmp %r0, #0; /* prepare carry flag for sub */ + push {r4, r5, r6, r7, r8, r9, r10, lr}; + cmp r0, #0; /* prepare carry flag for sub */ - tst %r3, #3; + tst r3, #3; beq .Large_loop; .Loop: - ldr %r4, [%r1], #4; - sub %r3, #1; - ldr %lr, [%r2], #4; - sbcs %r4, %lr; - tst %r3, #3; - str %r4, [%r0], #4; + ldr r4, [r1], #4; + sub r3, #1; + ldr lr, [r2], #4; + sbcs r4, lr; + tst r3, #3; + str r4, [r0], #4; bne .Loop; - teq %r3, #0; + teq r3, #0; beq .Lend; .Large_loop: - ldm %r1!, {%r4, %r6, %r8, %r10}; - sub %r3, #4; - ldm %r2!, {%r5, %r7, %r9, %lr}; - sbcs %r4, %r5; - sbcs %r6, %r7; - sbcs %r8, %r9; - sbcs %r10, %lr; - teq %r3, #0; - stm %r0!, {%r4, %r6, %r8, %r10}; + ldm r1!, {r4, r6, r8, r10}; + sub r3, #4; + ldm r2!, {r5, r7, r9, lr}; + sbcs r4, r5; + sbcs r6, r7; + sbcs r8, r9; + sbcs r10, lr; + teq r3, #0; + stm r0!, {r4, r6, r8, r10}; bne .Large_loop; .Lend: - sbc %r0, %r3, #0; - neg %r0, %r0; - pop {%r4, %r5, %r6, %r7, %r8, %r9, %r10, %pc}; + sbc r0, r3, #0; + neg r0, r0; + pop {r4, r5, r6, r7, r8, r9, r10, pc}; .size _gcry_mpih_sub_n,.-_gcry_mpih_sub_n; -- 2.37.2 From mohammed.kraydiye at gmail.com Thu Dec 15 10:59:59 2022 From: mohammed.kraydiye at gmail.com (Mohammed Mohammad) Date: Thu, 15 Dec 2022 10:59:59 +0100 Subject: libgcrypt fips.c review and questions Message-ID: Hello, I'm interested in the concept of fips. I have taken a look at the source code for managing fips in libgcrypt. I have found some code that I hardly understand and in some places I feel that I'm able to contribute to improve some parts of it. 
Is it okay if I can ask some
questions about some of the code blocks and is it okay if I have
suggestions on improving the code?

Best regards!
Mohammed.

From jjelen at redhat.com Fri Dec 16 11:56:56 2022
From: jjelen at redhat.com (Jakub Jelen)
Date: Fri, 16 Dec 2022 11:56:56 +0100
Subject: libgcrypt fips.c review and questions
In-Reply-To: 
References: 
Message-ID: 

On 12/15/22 10:59, Mohammed Mohammad via Gcrypt-devel wrote:
> Hello, I'm interested in the concept of fips. I have taken a look at the
> source code for managing fips in libgcrypt. I have found some code that
> I hardly understand and in some places I feel that I'm able to
> contribute to improve some parts of it. Is it okay if I can ask some
> questions about some of the code blocks and is it okay if I have
> suggestions on improving the code?

Sure, go ahead and ask. I think contributions are always welcome as long
as they do not go against the FIPS requirements.

Regards,
--
Jakub Jelen
Crypto Team, Security Engineering
Red Hat, Inc.