From fweimer at redhat.com Sat Dec 3 12:15:24 2022 From: fweimer at redhat.com (Florian Weimer) Date: Sat, 03 Dec 2022 12:15:24 +0100 Subject: Port configure script to C99 Message-ID: <87tu2chh6b.fsf@oldenburg.str.redhat.com> We are working to switch GCC to stricter defaults, catching up to C99 for a start. In support of that, we are trying to build Fedora with such a compiler. We noticed that libgcrypt fails to build because it uses implicit function declarations in the configure script. The patch below should fix that. Thanks, Florian diff --git a/configure.ac b/configure.ac index 6ea38f53b8548ee8..2baf25bc7d9481e6 100644 --- a/configure.ac +++ b/configure.ac @@ -1211,7 +1211,8 @@ AC_CACHE_CHECK([whether GCC assembler is compatible for ARM assembly implementat /* Test if '.type' and '.size' are supported. */ ".size asmfunc,.-asmfunc;\n\t" ".type asmfunc,%function;\n\t" - );]], [ asmfunc(); ] )], + ); + void asmfunc(void);]], [ asmfunc(); ] )], [gcry_cv_gcc_arm_platform_as_ok=yes]) fi]) if test "$gcry_cv_gcc_arm_platform_as_ok" = "yes" ; then @@ -1238,7 +1239,8 @@ AC_CACHE_CHECK([whether GCC assembler is compatible for ARMv8/Aarch64 assembly i "eor x0, x0, x30, ror #12;\n\t" "add x0, x0, x30, asr #12;\n\t" "eor v0.16b, v0.16b, v31.16b;\n\t" - );]], [ asmfunc(); ] )], + ); + void asmfunc(void);]], [ asmfunc(); ] )], [gcry_cv_gcc_aarch64_platform_as_ok=yes]) fi]) if test "$gcry_cv_gcc_aarch64_platform_as_ok" = "yes" ; then @@ -1267,7 +1269,8 @@ AC_CACHE_CHECK([whether GCC assembler supports for CFI directives], ".cfi_restore_state\n\t" ".long 0\n\t" ".cfi_endproc\n\t" - );]])], + ); + void asmfunc(void)]])], [gcry_cv_gcc_asm_cfi_directives=yes])]) if test "$gcry_cv_gcc_asm_cfi_directives" = "yes" ; then AC_DEFINE(HAVE_GCC_ASM_CFI_DIRECTIVES,1, @@ -1666,7 +1669,8 @@ if test $amd64_as_feature_detection = yes; then [gcry_cv_gcc_as_const_division_ok], [gcry_cv_gcc_as_const_division_ok=no AC_LINK_IFELSE([AC_LANG_PROGRAM( - [[__asm__(".text\n\tfn:\n\t xorl \$(123456789/12345678), %ebp;\n\t");]], + [[__asm__(".text\n\tfn:\n\t xorl \$(123456789/12345678), %ebp;\n\t"); + void fn(void);]], [fn();])], [gcry_cv_gcc_as_const_division_ok=yes])]) if test "$gcry_cv_gcc_as_const_division_ok" = "no" ; then @@ -1679,7 +1683,8 @@ if test $amd64_as_feature_detection = yes; then [gcry_cv_gcc_as_const_division_with_wadivide_ok], [gcry_cv_gcc_as_const_division_with_wadivide_ok=no AC_LINK_IFELSE([AC_LANG_PROGRAM( - [[__asm__(".text\n\tfn:\n\t xorl \$(123456789/12345678), %ebp;\n\t");]], + [[__asm__(".text\n\tfn:\n\t xorl \$(123456789/12345678), %ebp;\n\t"); + void fn(void);]], [fn();])], [gcry_cv_gcc_as_const_division_with_wadivide_ok=yes])]) if test "$gcry_cv_gcc_as_const_division_with_wadivide_ok" = "no" ; then @@ -1715,7 +1720,8 @@ if test $amd64_as_feature_detection = yes; then * and "-Wa,--divide" workaround failed, this causes assembly * to be disable on this machine. 
*/ "xorl \$(123456789/12345678), %ebp;\n\t" - );]], [ asmfunc(); ])], + ); + void asmfunc(void);]], [ asmfunc(); ])], [gcry_cv_gcc_amd64_platform_as_ok=yes]) fi]) if test "$gcry_cv_gcc_amd64_platform_as_ok" = "yes" ; then @@ -1734,7 +1740,8 @@ if test $amd64_as_feature_detection = yes; then ".globl asmfunc\n\t" "asmfunc:\n\t" "xorq \$(1234), %rbp;\n\t" - );]], [ asmfunc(); ])], + ); + void asmfunc(void);]], [ asmfunc(); ])], [gcry_cv_gcc_win64_platform_as_ok=yes])]) if test "$gcry_cv_gcc_win64_platform_as_ok" = "yes" ; then AC_DEFINE(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS,1, @@ -1767,7 +1774,8 @@ AC_CACHE_CHECK([whether GCC assembler is compatible for Intel syntax assembly im "sub eax, [esp + 4];\n\t" "add dword ptr [esp + eax], 0b10101;\n\t" ".att_syntax prefix\n\t" - );]], [ actest(); ])], + ); + void actest(void);]], [ actest(); ])], [gcry_cv_gcc_platform_as_ok_for_intel_syntax=yes]) fi]) if test "$gcry_cv_gcc_platform_as_ok_for_intel_syntax" = "yes" ; then @@ -1832,6 +1840,7 @@ AC_CACHE_CHECK([whether GCC inline assembler supports NEON instructions], "vadd.u64 %q0, %q1;\n\t" "vadd.s64 %d3, %d2, %d3;\n\t" ); + void testfn(void); ]], [ testfn(); ])], [gcry_cv_gcc_inline_asm_neon=yes]) fi]) @@ -1879,6 +1888,7 @@ AC_CACHE_CHECK([whether GCC inline assembler supports AArch32 Crypto Extension i "vmull.p64 q0, d0, d0;\n\t" ); + void testfn(void); ]], [ testfn(); ])], [gcry_cv_gcc_inline_asm_aarch32_crypto=yes]) fi]) @@ -1907,6 +1917,7 @@ AC_CACHE_CHECK([whether GCC inline assembler supports AArch64 NEON instructions] "dup v0.8b, w0;\n\t" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b},[x0],\#32;\n\t" ); + void testfn(void); ]], [ testfn(); ])], [gcry_cv_gcc_inline_asm_aarch64_neon=yes]) fi]) @@ -1955,6 +1966,7 @@ AC_CACHE_CHECK([whether GCC inline assembler supports AArch64 Crypto Extension i "pmull v0.1q, v0.1d, v31.1d;\n\t" "pmull2 v0.1q, v0.2d, v31.2d;\n\t" ); + void testfn(void); ]], [ testfn(); ])], [gcry_cv_gcc_inline_asm_aarch64_crypto=yes]) fi]) @@ -2050,6 +2062,7 @@ AC_CACHE_CHECK([whether GCC inline assembler supports PowerPC AltiVec/VSX/crypto "vshasigmad %v0, %v1, 0, 15;\n" "vpmsumd %v11, %v11, %v11;\n" ); + void testfn(void); ]], [ testfn(); ] )], [gcry_cv_gcc_inline_asm_ppc_altivec=yes]) fi]) @@ -2075,6 +2088,7 @@ AC_CACHE_CHECK([whether GCC inline assembler supports PowerISA 3.00 instructions "testfn:\n" "stxvb16x %r1,%v12,%v30;\n" ); + void testfn(void); ]], [ testfn(); ])], [gcry_cv_gcc_inline_asm_ppc_arch_3_00=yes]) fi]) From jussi.kivilinna at iki.fi Mon Dec 5 21:17:55 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Mon, 5 Dec 2022 22:17:55 +0200 Subject: [PATCH] chacha20-avx512: add handling for any input block count and tweak 16 block code a bit Message-ID: <20221205201755.355987-1-jussi.kivilinna@iki.fi> * cipher/chacha20-amd64-avx512.S: Add tail handling for 8/4/2/1 blocks; Rename `_gcry_chacha20_amd64_avx512_blocks16` to `_gcry_chacha20_amd64_avx512_blocks`; Tweak 16 parallel block processing for small speed improvement. * cipher/chacha20.c (_gcry_chacha20_amd64_avx512_blocks16): Rename to ... (_gcry_chacha20_amd64_avx512_blocks): ... this. (chacha20_blocks) [USE_AVX512]: Add AVX512 code-path. (do_chacha20_encrypt_stream_tail) [USE_AVX512]: Change to handle any number of full input blocks instead of multiples of 16. -- Patch improves performance of ChaCha20-AVX512 implementation on small input buffer sizes (less than 64*16B = 1024B). 
=== Benchmark on AMD Ryzen 9 7900X: Before: CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz STREAM enc | 0.130 ns/B 7330 MiB/s 0.716 c/B 5500 STREAM dec | 0.128 ns/B 7426 MiB/s 0.713 c/B 5555 POLY1305 enc | 0.175 ns/B 5444 MiB/s 0.964 c/B 5500 POLY1305 dec | 0.175 ns/B 5455 MiB/s 0.962 c/B 5500 After: CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz STREAM enc | 0.124 ns/B 7675 MiB/s 0.699 c/B 5625 STREAM dec | 0.126 ns/B 7544 MiB/s 0.695 c/B 5500 POLY1305 enc | 0.170 ns/B 5626 MiB/s 0.954 c/B 5625 POLY1305 dec | 0.169 ns/B 5639 MiB/s 0.945 c/B 5587 === Benchmark on Intel Core i3-1115G4: Before: CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz STREAM enc | 0.161 ns/B 5934 MiB/s 0.658 c/B 4097?3 STREAM dec | 0.160 ns/B 5951 MiB/s 0.656 c/B 4097?4 POLY1305 enc | 0.220 ns/B 4333 MiB/s 0.902 c/B 4096?3 POLY1305 dec | 0.220 ns/B 4325 MiB/s 0.903 c/B 4096?3 After: CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz STREAM enc | 0.154 ns/B 6186 MiB/s 0.631 c/B 4096?3 STREAM dec | 0.153 ns/B 6215 MiB/s 0.629 c/B 4096?3 POLY1305 enc | 0.216 ns/B 4407 MiB/s 0.886 c/B 4096?3 POLY1305 dec | 0.216 ns/B 4419 MiB/s 0.884 c/B 4096?3 Signed-off-by: Jussi Kivilinna --- cipher/chacha20-amd64-avx512.S | 463 ++++++++++++++++++++++++++++++--- cipher/chacha20.c | 24 +- 2 files changed, 447 insertions(+), 40 deletions(-) diff --git a/cipher/chacha20-amd64-avx512.S b/cipher/chacha20-amd64-avx512.S index 8b4d7499..b48b1bf7 100644 --- a/cipher/chacha20-amd64-avx512.S +++ b/cipher/chacha20-amd64-avx512.S @@ -61,14 +61,56 @@ #define X13 %zmm13 #define X14 %zmm14 #define X15 %zmm15 +#define X0y %ymm0 +#define X1y %ymm1 +#define X2y %ymm2 +#define X3y %ymm3 +#define X4y %ymm4 +#define X5y %ymm5 +#define X6y %ymm6 +#define X7y %ymm7 +#define X8y %ymm8 +#define X9y %ymm9 +#define X10y %ymm10 +#define X11y %ymm11 +#define X12y %ymm12 +#define X13y %ymm13 +#define X14y %ymm14 +#define X15y %ymm15 +#define X0x %xmm0 +#define X1x %xmm1 +#define X2x %xmm2 +#define X3x %xmm3 +#define X4x %xmm4 +#define X5x %xmm5 +#define X6x %xmm6 +#define X7x %xmm7 +#define X8x %xmm8 +#define X9x %xmm9 +#define X10x %xmm10 +#define X11x %xmm11 +#define X12x %xmm12 +#define X13x %xmm13 +#define X14x %xmm14 +#define X15x %xmm15 #define TMP0 %zmm16 #define TMP1 %zmm17 +#define TMP0y %ymm16 +#define TMP1y %ymm17 +#define TMP0x %xmm16 +#define TMP1x %xmm17 #define COUNTER_ADD %zmm18 +#define COUNTER_ADDy %ymm18 +#define COUNTER_ADDx %xmm18 #define X12_SAVE %zmm19 +#define X12_SAVEy %ymm19 +#define X12_SAVEx %xmm19 #define X13_SAVE %zmm20 +#define X13_SAVEy %ymm20 +#define X13_SAVEx %xmm20 #define S0 %zmm21 #define S1 %zmm22 @@ -81,6 +123,28 @@ #define S8 %zmm29 #define S14 %zmm30 #define S15 %zmm31 +#define S0y %ymm21 +#define S1y %ymm22 +#define S2y %ymm23 +#define S3y %ymm24 +#define S4y %ymm25 +#define S5y %ymm26 +#define S6y %ymm27 +#define S7y %ymm28 +#define S8y %ymm29 +#define S14y %ymm30 +#define S15y %ymm31 +#define S0x %xmm21 +#define S1x %xmm22 +#define S2x %xmm23 +#define S3x %xmm24 +#define S4x %xmm25 +#define S5x %xmm26 +#define S6x %xmm27 +#define S7x %xmm28 +#define S8x %xmm29 +#define S14x %xmm30 +#define S15x %xmm31 /********************************************************************** helper macros @@ -114,6 +178,12 @@ vshufi32x4 $0xdd, x2, t2, x3; \ vshufi32x4 $0x88, x2, t2, x2; +/* 2x2 128-bit matrix transpose */ +#define transpose_16byte_2x2(x0,x1,t1) \ + vmovdqa32 x0, t1; \ + vshufi32x4 $0x0, x1, x0, x0; \ + vshufi32x4 $0x3, x1, t1, x1; + #define xor_src_dst_4x4(dst, src, offset, 
add, x0, x4, x8, x12) \ vpxord (offset + 0 * (add))(src), x0, x0; \ vpxord (offset + 1 * (add))(src), x4, x4; \ @@ -141,7 +211,7 @@ clear_vec4(%xmm19, %xmm23, %xmm27, %xmm31); /********************************************************************** - 16-way chacha20 + 16-way (zmm), 8-way (ymm), 4-way (xmm) chacha20 **********************************************************************/ #define ROTATE2(v1,v2,c) \ @@ -154,7 +224,7 @@ #define PLUS(ds,s) \ vpaddd s, ds, ds; -#define QUARTERROUND2(a1,b1,c1,d1,a2,b2,c2,d2) \ +#define QUARTERROUND2V(a1,b1,c1,d1,a2,b2,c2,d2) \ PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2); \ ROTATE2(d1, d2, 16); \ PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \ @@ -164,33 +234,99 @@ PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \ ROTATE2(b1, b2, 7); +/********************************************************************** + 1-way/2-way (xmm) chacha20 + **********************************************************************/ + +#define ROTATE(v1,c) \ + vprold $(c), v1, v1; \ + +#define WORD_SHUF(v1,shuf) \ + vpshufd $shuf, v1, v1; + +#define QUARTERROUND1H(x0,x1,x2,x3,shuf_x1,shuf_x2,shuf_x3) \ + PLUS(x0, x1); XOR(x3, x0); ROTATE(x3, 16); \ + PLUS(x2, x3); XOR(x1, x2); ROTATE(x1, 12); \ + PLUS(x0, x1); XOR(x3, x0); ROTATE(x3, 8); \ + PLUS(x2, x3); \ + WORD_SHUF(x3, shuf_x3); \ + XOR(x1, x2); \ + WORD_SHUF(x2, shuf_x2); \ + ROTATE(x1, 7); \ + WORD_SHUF(x1, shuf_x1); + +#define QUARTERROUND2H(x0,x1,x2,x3,y0,y1,y2,y3,shuf_x1,shuf_x2,shuf_x3) \ + PLUS(x0, x1); PLUS(y0, y1); XOR(x3, x0); XOR(y3, y0); \ + ROTATE(x3, 16); ROTATE(y3, 16); \ + PLUS(x2, x3); PLUS(y2, y3); XOR(x1, x2); XOR(y1, y2); \ + ROTATE(x1, 12); ROTATE(y1, 12); \ + PLUS(x0, x1); PLUS(y0, y1); XOR(x3, x0); XOR(y3, y0); \ + ROTATE(x3, 8); ROTATE(y3, 8); \ + PLUS(x2, x3); PLUS(y2, y3); \ + WORD_SHUF(x3, shuf_x3); WORD_SHUF(y3, shuf_x3); \ + XOR(x1, x2); XOR(y1, y2); \ + WORD_SHUF(x2, shuf_x2); WORD_SHUF(y2, shuf_x2); \ + ROTATE(x1, 7); ROTATE(y1, 7); \ + WORD_SHUF(x1, shuf_x1); WORD_SHUF(y1, shuf_x1); + .align 64 ELF(.type _gcry_chacha20_amd64_avx512_data, at object;) _gcry_chacha20_amd64_avx512_data: -.Linc_counter: - .byte 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 +.Lcounter_0_1_2_3: +.Lcounter_0_1: + .long 0,0,0,0 .Lone: .long 1,0,0,0 +.Lcounter_2_3: +.Ltwo: + .long 2,0,0,0 +.Lthree: + .long 3,0,0,0 +.Linc_counter: + .byte 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 ELF(.size _gcry_chacha20_amd64_avx512_data,.-_gcry_chacha20_amd64_avx512_data) .align 16 -.globl _gcry_chacha20_amd64_avx512_blocks16 -ELF(.type _gcry_chacha20_amd64_avx512_blocks16, at function;) -_gcry_chacha20_amd64_avx512_blocks16: +.globl _gcry_chacha20_amd64_avx512_blocks +ELF(.type _gcry_chacha20_amd64_avx512_blocks, at function;) +_gcry_chacha20_amd64_avx512_blocks: /* input: * %rdi: input * %rsi: dst * %rdx: src - * %rcx: nblks (multiple of 16) + * %rcx: nblks */ CFI_STARTPROC(); vpxord %xmm16, %xmm16, %xmm16; - vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */ + vpopcntb %ymm16, %ymm16; /* spec stop for old AVX512 CPUs */ + + cmpq $4, NBLKS; + jb .Lskip_vertical_handling; + /* Load constants */ vpmovzxbd .Linc_counter rRIP, COUNTER_ADD; - /* Preload state */ + cmpq $16, NBLKS; + jae .Lload_zmm_state; + + /* Preload state to YMM registers */ + vpbroadcastd (0 * 4)(INPUT), S0y; + vpbroadcastd (1 * 4)(INPUT), S1y; + vpbroadcastd (2 * 4)(INPUT), S2y; + vpbroadcastd (3 * 4)(INPUT), S3y; + vpbroadcastd (4 * 4)(INPUT), S4y; + vpbroadcastd (5 * 4)(INPUT), S5y; + vpbroadcastd (6 * 4)(INPUT), S6y; + vpbroadcastd (7 * 4)(INPUT), S7y; 
+ vpbroadcastd (8 * 4)(INPUT), S8y; + vpbroadcastd (14 * 4)(INPUT), S14y; + vpbroadcastd (15 * 4)(INPUT), S15y; + jmp .Lskip16v; + +.align 16 +.Lload_zmm_state: + /* Preload state to ZMM registers */ vpbroadcastd (0 * 4)(INPUT), S0; vpbroadcastd (1 * 4)(INPUT), S1; vpbroadcastd (2 * 4)(INPUT), S2; @@ -204,13 +340,14 @@ _gcry_chacha20_amd64_avx512_blocks16: vpbroadcastd (15 * 4)(INPUT), S15; .align 16 -.Loop16: +.Loop16v: + /* Process 16 ChaCha20 blocks */ movl $20, ROUND; + subq $16, NBLKS; /* Construct counter vectors X12 and X13 */ - vpbroadcastd (12 * 4)(INPUT), X12; + vpaddd (12 * 4)(INPUT){1to16}, COUNTER_ADD, X12; vpbroadcastd (13 * 4)(INPUT), X13; - vpaddd COUNTER_ADD, X12, X12; vpcmpud $6, X12, COUNTER_ADD, %k2; vpaddd .Lone rRIP {1to16}, X13, X13{%k2}; vmovdqa32 X12, X12_SAVE; @@ -223,7 +360,7 @@ _gcry_chacha20_amd64_avx512_blocks16: vmovdqa32 S1, X1; vmovdqa32 S5, X5; vpbroadcastd (9 * 4)(INPUT), X9; - QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13) + QUARTERROUND2V(X0, X4, X8, X12, X1, X5, X9, X13) vmovdqa32 S2, X2; vmovdqa32 S6, X6; vpbroadcastd (10 * 4)(INPUT), X10; @@ -235,19 +372,18 @@ _gcry_chacha20_amd64_avx512_blocks16: /* Update counter */ addq $16, (12 * 4)(INPUT); - jmp .Lround2_entry; + jmp .Lround2_entry_16v; .align 16 -.Lround2: - QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14) - QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13) -.Lround2_entry: +.Lround2_16v: + QUARTERROUND2V(X2, X7, X8, X13, X3, X4, X9, X14) + QUARTERROUND2V(X0, X4, X8, X12, X1, X5, X9, X13) +.Lround2_entry_16v: + QUARTERROUND2V(X2, X6, X10, X14, X3, X7, X11, X15) + QUARTERROUND2V(X0, X5, X10, X15, X1, X6, X11, X12) subl $2, ROUND; - QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15) - QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12) - jnz .Lround2; + jnz .Lround2_16v; -.Lround2_end: PLUS(X0, S0); PLUS(X1, S1); PLUS(X5, S5); @@ -256,7 +392,7 @@ _gcry_chacha20_amd64_avx512_blocks16: PLUS(X11, (11 * 4)(INPUT){1to16}); PLUS(X15, S15); PLUS(X12, X12_SAVE); - QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14) + QUARTERROUND2V(X2, X7, X8, X13, X3, X4, X9, X14) PLUS(X2, S2); PLUS(X3, S3); @@ -280,21 +416,286 @@ _gcry_chacha20_amd64_avx512_blocks16: transpose_16byte_4x4(X3, X7, X11, X15, TMP0, TMP1); xor_src_dst_4x4(DST, SRC, (64 * 3), (64 * 4), X3, X7, X11, X15); - subq $16, NBLKS; leaq (16 * 64)(SRC), SRC; leaq (16 * 64)(DST), DST; - jnz .Loop16; + cmpq $16, NBLKS; + jae .Loop16v; + +.align 8 +.Lskip16v: + cmpq $8, NBLKS; + jb .Lskip8v; + + /* Process 8 ChaCha20 blocks */ + movl $20, ROUND; + subq $8, NBLKS; + + /* Construct counter vectors X12 and X13 */ + vpaddd (12 * 4)(INPUT){1to8}, COUNTER_ADDy, X12y; + vpbroadcastd (13 * 4)(INPUT), X13y; + vpcmpud $6, X12y, COUNTER_ADDy, %k2; + vpaddd .Lone rRIP {1to8}, X13y, X13y{%k2}; + vmovdqa32 X12y, X12_SAVEy; + vmovdqa32 X13y, X13_SAVEy; + + /* Load vectors */ + vmovdqa32 S0y, X0y; + vmovdqa32 S4y, X4y; + vmovdqa32 S8y, X8y; + vmovdqa32 S1y, X1y; + vmovdqa32 S5y, X5y; + vpbroadcastd (9 * 4)(INPUT), X9y; + QUARTERROUND2V(X0y, X4y, X8y, X12y, X1y, X5y, X9y, X13y) + vmovdqa32 S2y, X2y; + vmovdqa32 S6y, X6y; + vpbroadcastd (10 * 4)(INPUT), X10y; + vmovdqa32 S14y, X14y; + vmovdqa32 S3y, X3y; + vmovdqa32 S7y, X7y; + vpbroadcastd (11 * 4)(INPUT), X11y; + vmovdqa32 S15y, X15y; + + /* Update counter */ + addq $8, (12 * 4)(INPUT); + jmp .Lround2_entry_8v; + +.align 16 +.Lround2_8v: + QUARTERROUND2V(X2y, X7y, X8y, X13y, X3y, X4y, X9y, X14y) + QUARTERROUND2V(X0y, X4y, X8y, X12y, X1y, X5y, X9y, X13y) +.Lround2_entry_8v: + QUARTERROUND2V(X2y, X6y, X10y, X14y, X3y, X7y, X11y, 
X15y) + QUARTERROUND2V(X0y, X5y, X10y, X15y, X1y, X6y, X11y, X12y) + subl $2, ROUND; + jnz .Lround2_8v; + + PLUS(X0y, S0y); + PLUS(X1y, S1y); + PLUS(X5y, S5y); + PLUS(X6y, S6y); + PLUS(X10y, (10 * 4)(INPUT){1to8}); + PLUS(X11y, (11 * 4)(INPUT){1to8}); + PLUS(X15y, S15y); + PLUS(X12y, X12_SAVEy); + QUARTERROUND2V(X2y, X7y, X8y, X13y, X3y, X4y, X9y, X14y) + + PLUS(X2y, S2y); + PLUS(X3y, S3y); + PLUS(X4y, S4y); + PLUS(X7y, S7y); + transpose_4x4(X0y, X1y, X2y, X3y, TMP0y, TMP1y); + transpose_4x4(X4y, X5y, X6y, X7y, TMP0y, TMP1y); + PLUS(X8y, S8y); + PLUS(X9y, (9 * 4)(INPUT){1to8}); + PLUS(X13y, X13_SAVEy); + PLUS(X14y, S14y); + transpose_16byte_2x2(X0y, X4y, TMP0y); + transpose_16byte_2x2(X1y, X5y, TMP0y); + transpose_16byte_2x2(X2y, X6y, TMP0y); + transpose_16byte_2x2(X3y, X7y, TMP0y); + transpose_4x4(X8y, X9y, X10y, X11y, TMP0y, TMP1y); + transpose_4x4(X12y, X13y, X14y, X15y, TMP0y, TMP1y); + xor_src_dst_4x4(DST, SRC, (16 * 0), (64 * 1), X0y, X1y, X2y, X3y); + xor_src_dst_4x4(DST, SRC, (16 * 16), (64 * 1), X4y, X5y, X6y, X7y); + transpose_16byte_2x2(X8y, X12y, TMP0y); + transpose_16byte_2x2(X9y, X13y, TMP0y); + transpose_16byte_2x2(X10y, X14y, TMP0y); + transpose_16byte_2x2(X11y, X15y, TMP0y); + xor_src_dst_4x4(DST, SRC, (16 * 2), (64 * 1), X8y, X9y, X10y, X11y); + xor_src_dst_4x4(DST, SRC, (16 * 18), (64 * 1), X12y, X13y, X14y, X15y); + + leaq (8 * 64)(SRC), SRC; + leaq (8 * 64)(DST), DST; + +.align 8 +.Lskip8v: + cmpq $4, NBLKS; + jb .Lskip4v; + + /* Process 4 ChaCha20 blocks */ + movl $20, ROUND; + subq $4, NBLKS; + + /* Construct counter vectors X12 and X13 */ + vpaddd (12 * 4)(INPUT){1to4}, COUNTER_ADDx, X12x; + vpbroadcastd (13 * 4)(INPUT), X13x; + vpcmpud $6, X12x, COUNTER_ADDx, %k2; + vpaddd .Lone rRIP {1to4}, X13x, X13x{%k2}; + vmovdqa32 X12x, X12_SAVEx; + vmovdqa32 X13x, X13_SAVEx; + + /* Load vectors */ + vmovdqa32 S0x, X0x; + vmovdqa32 S4x, X4x; + vmovdqa32 S8x, X8x; + vmovdqa32 S1x, X1x; + vmovdqa32 S5x, X5x; + vpbroadcastd (9 * 4)(INPUT), X9x; + QUARTERROUND2V(X0x, X4x, X8x, X12x, X1x, X5x, X9x, X13x) + vmovdqa32 S2x, X2x; + vmovdqa32 S6x, X6x; + vpbroadcastd (10 * 4)(INPUT), X10x; + vmovdqa32 S14x, X14x; + vmovdqa32 S3x, X3x; + vmovdqa32 S7x, X7x; + vpbroadcastd (11 * 4)(INPUT), X11x; + vmovdqa32 S15x, X15x; - /* clear the used vector registers */ + /* Update counter */ + addq $4, (12 * 4)(INPUT); + jmp .Lround2_entry_4v; + +.align 16 +.Lround2_4v: + QUARTERROUND2V(X2x, X7x, X8x, X13x, X3x, X4x, X9x, X14x) + QUARTERROUND2V(X0x, X4x, X8x, X12x, X1x, X5x, X9x, X13x) +.Lround2_entry_4v: + QUARTERROUND2V(X2x, X6x, X10x, X14x, X3x, X7x, X11x, X15x) + QUARTERROUND2V(X0x, X5x, X10x, X15x, X1x, X6x, X11x, X12x) + subl $2, ROUND; + jnz .Lround2_4v; + + PLUS(X0x, S0x); + PLUS(X1x, S1x); + PLUS(X5x, S5x); + PLUS(X6x, S6x); + PLUS(X10x, (10 * 4)(INPUT){1to4}); + PLUS(X11x, (11 * 4)(INPUT){1to4}); + PLUS(X15x, S15x); + PLUS(X12x, X12_SAVEx); + QUARTERROUND2V(X2x, X7x, X8x, X13x, X3x, X4x, X9x, X14x) + + PLUS(X2x, S2x); + PLUS(X3x, S3x); + PLUS(X4x, S4x); + PLUS(X7x, S7x); + transpose_4x4(X0x, X1x, X2x, X3x, TMP0x, TMP1x); + transpose_4x4(X4x, X5x, X6x, X7x, TMP0x, TMP1x); + xor_src_dst_4x4(DST, SRC, (16 * 0), (64 * 1), X0x, X1x, X2x, X3x); + PLUS(X8x, S8x); + PLUS(X9x, (9 * 4)(INPUT){1to4}); + xor_src_dst_4x4(DST, SRC, (16 * 1), (64 * 1), X4x, X5x, X6x, X7x); + PLUS(X13x, X13_SAVEx); + PLUS(X14x, S14x); + transpose_4x4(X8x, X9x, X10x, X11x, TMP0x, TMP1x); + transpose_4x4(X12x, X13x, X14x, X15x, TMP0x, TMP1x); + xor_src_dst_4x4(DST, SRC, (16 * 2), (64 * 1), X8x, X9x, X10x, X11x); + 
xor_src_dst_4x4(DST, SRC, (16 * 3), (64 * 1), X12x, X13x, X14x, X15x); + + leaq (4 * 64)(SRC), SRC; + leaq (4 * 64)(DST), DST; + +.align 8 +.Lskip4v: + /* clear AVX512 registers */ + kxorq %k2, %k2, %k2; + vzeroupper; clear_zmm16_zmm31(); - kxord %k2, %k2, %k2; + +.align 8 +.Lskip_vertical_handling: + cmpq $0, NBLKS; + je .Ldone; + + /* Load state */ + vmovdqu (0 * 4)(INPUT), X10x; + vmovdqu (4 * 4)(INPUT), X11x; + vmovdqu (8 * 4)(INPUT), X12x; + vmovdqu (12 * 4)(INPUT), X13x; + + /* Load constant */ + vmovdqa .Lone rRIP, X4x; + + cmpq $1, NBLKS; + je .Lhandle1; + + /* Process two ChaCha20 blocks (XMM) */ + movl $20, ROUND; + subq $2, NBLKS; + + vmovdqa X10x, X0x; + vmovdqa X11x, X1x; + vmovdqa X12x, X2x; + vmovdqa X13x, X3x; + + vmovdqa X10x, X8x; + vmovdqa X11x, X9x; + vmovdqa X12x, X14x; + vpaddq X4x, X13x, X15x; + vmovdqa X15x, X7x; + +.align 16 +.Lround2_2: + QUARTERROUND2H(X0x, X1x, X2x, X3x, X8x, X9x, X14x, X15x, + 0x39, 0x4e, 0x93); + QUARTERROUND2H(X0x, X1x, X2x, X3x, X8x, X9x, X14x, X15x, + 0x93, 0x4e, 0x39); + subl $2, ROUND; + jnz .Lround2_2; + + PLUS(X0x, X10x); + PLUS(X1x, X11x); + PLUS(X2x, X12x); + PLUS(X3x, X13x); + + vpaddq .Ltwo rRIP, X13x, X13x; /* Update counter */ + + PLUS(X8x, X10x); + PLUS(X9x, X11x); + PLUS(X14x, X12x); + PLUS(X15x, X7x); + + xor_src_dst_4x4(DST, SRC, 0 * 4, 4 * 4, X0x, X1x, X2x, X3x); + xor_src_dst_4x4(DST, SRC, 16 * 4, 4 * 4, X8x, X9x, X14x, X15x); + lea (2 * 64)(DST), DST; + lea (2 * 64)(SRC), SRC; + + cmpq $0, NBLKS; + je .Lskip1; + +.align 8 +.Lhandle1: + /* Process one ChaCha20 block (XMM) */ + movl $20, ROUND; + subq $1, NBLKS; + + vmovdqa X10x, X0x; + vmovdqa X11x, X1x; + vmovdqa X12x, X2x; + vmovdqa X13x, X3x; + +.align 16 +.Lround2_1: + QUARTERROUND1H(X0x, X1x, X2x, X3x, 0x39, 0x4e, 0x93); + QUARTERROUND1H(X0x, X1x, X2x, X3x, 0x93, 0x4e, 0x39); + subl $2, ROUND; + jnz .Lround2_1; + + PLUS(X0x, X10x); + PLUS(X1x, X11x); + PLUS(X2x, X12x); + PLUS(X3x, X13x); + + vpaddq X4x, X13x, X13x; /* Update counter */ + + xor_src_dst_4x4(DST, SRC, 0 * 4, 4 * 4, X0x, X1x, X2x, X3x); + /*lea (1 * 64)(DST), DST;*/ + /*lea (1 * 64)(SRC), SRC;*/ + +.align 8 +.Lskip1: + /* Store counter */ + vmovdqu X13x, (12 * 4)(INPUT); + +.align 8 +.Ldone: vzeroall; /* clears ZMM0-ZMM15 */ - /* eax zeroed by round loop. 
*/ + xorl %eax, %eax; ret_spec_stop; CFI_ENDPROC(); -ELF(.size _gcry_chacha20_amd64_avx512_blocks16, - .-_gcry_chacha20_amd64_avx512_blocks16;) +ELF(.size _gcry_chacha20_amd64_avx512_blocks, + .-_gcry_chacha20_amd64_avx512_blocks;) #endif /*defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)*/ #endif /*__x86_64*/ diff --git a/cipher/chacha20.c b/cipher/chacha20.c index f0cb8721..a7e0dd63 100644 --- a/cipher/chacha20.c +++ b/cipher/chacha20.c @@ -173,9 +173,9 @@ unsigned int _gcry_chacha20_poly1305_amd64_avx2_blocks8( #ifdef USE_AVX512 -unsigned int _gcry_chacha20_amd64_avx512_blocks16(u32 *state, byte *dst, - const byte *src, - size_t nblks) ASM_FUNC_ABI; +unsigned int _gcry_chacha20_amd64_avx512_blocks(u32 *state, byte *dst, + const byte *src, + size_t nblks) ASM_FUNC_ABI; #endif /* USE_AVX2 */ @@ -352,6 +352,13 @@ static unsigned int chacha20_blocks (CHACHA20_context_t *ctx, byte *dst, const byte *src, size_t nblks) { +#ifdef USE_AVX512 + if (ctx->use_avx512) + { + return _gcry_chacha20_amd64_avx512_blocks(ctx->input, dst, src, nblks); + } +#endif + #ifdef USE_SSSE3 if (ctx->use_ssse3) { @@ -546,14 +553,13 @@ do_chacha20_encrypt_stream_tail (CHACHA20_context_t *ctx, byte *outbuf, unsigned int nburn, burn = 0; #ifdef USE_AVX512 - if (ctx->use_avx512 && length >= CHACHA20_BLOCK_SIZE * 16) + if (ctx->use_avx512 && length >= CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; - nblocks -= nblocks % 16; - nburn = _gcry_chacha20_amd64_avx512_blocks16(ctx->input, outbuf, inbuf, - nblocks); + nburn = _gcry_chacha20_amd64_avx512_blocks(ctx->input, outbuf, inbuf, + nblocks); burn = nburn > burn ? nburn : burn; - length -= nblocks * CHACHA20_BLOCK_SIZE; + length %= CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; } @@ -662,7 +668,7 @@ do_chacha20_encrypt_stream_tail (CHACHA20_context_t *ctx, byte *outbuf, size_t nblocks = length / CHACHA20_BLOCK_SIZE; nburn = chacha20_blocks(ctx, outbuf, inbuf, nblocks); burn = nburn > burn ? nburn : burn; - length -= nblocks * CHACHA20_BLOCK_SIZE; + length %= CHACHA20_BLOCK_SIZE; outbuf += nblocks * CHACHA20_BLOCK_SIZE; inbuf += nblocks * CHACHA20_BLOCK_SIZE; } -- 2.37.2 From jussi.kivilinna at iki.fi Sun Dec 11 14:26:41 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sun, 11 Dec 2022 15:26:41 +0200 Subject: [PATCH] avx512: tweak AVX512 spec stop, use common macro in assembly Message-ID: <20221211132641.2371386-1-jussi.kivilinna@iki.fi> * cipher/cipher-gcm-intel-pclmul.c: Use xmm registers for AVX512 spec stop. * cipher/asm-common-amd64.h (spec_stop_avx512): New. * cipher/blake2b-amd64-avx512.S: Use spec_stop_avx512. * cipher/blake2s-amd64-avx512.S: Likewise. * cipher/camellia-gfni-avx512-amd64.S: Likewise. * cipher/chacha20-avx512-amd64.S: Likewise. * cipher/keccak-amd64-avx512.S: Likewise. * cipher/poly1305-amd64-avx512.S: Likewise. * cipher/sha512-avx512-amd64.S: Likewise. * cipher/sm4-gfni-avx512-amd64.S: Likewise. 
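As background for the macro introduced here (its definition is in the asm-common-amd64.h hunk below): the stop is just vpxord plus vpopcntb on an xmm register. vpopcntb exists only on newer AVX512 CPUs, so an older AVX512 CPU that merely speculates into one of these functions stops at the unimplemented instruction instead of running ahead into 512-bit code, while the CPUs that actually reach these paths do implement it. A C-level sketch of the same two instructions, assuming GCC/Clang extended asm on x86-64; the function name is illustrative, the real users are the .S files and the GCM inline assembly:

  /* Minimal sketch. The assembly callers place the sequence at function
   * entry where xmm16 holds no live value; a hypothetical C caller would
   * have to account for the clobbered register itself. */
  static inline void spec_stop_avx512_sketch (void)
  {
    __asm__ volatile ("vpxord %%xmm16, %%xmm16, %%xmm16\n\t"
                      "vpopcntb %%xmm16, %%xmm16\n\t" /* newer AVX512 CPUs only */
                      : : : "memory");
  }

Using xmm rather than zmm operands for the stop itself is the other half of the tweak, applied both in the new macro and in cipher-gcm-intel-pclmul.c.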
--- Signed-off-by: Jussi Kivilinna --- cipher/asm-common-amd64.h | 10 +++++++++- cipher/blake2b-amd64-avx512.S | 2 ++ cipher/blake2s-amd64-avx512.S | 2 ++ cipher/camellia-gfni-avx512-amd64.S | 14 +++++++------- cipher/chacha20-amd64-avx512.S | 3 +-- cipher/cipher-gcm-intel-pclmul.c | 4 ++-- cipher/keccak-amd64-avx512.S | 4 ++++ cipher/poly1305-amd64-avx512.S | 3 +-- cipher/sha512-avx512-amd64.S | 2 ++ cipher/sm4-gfni-avx512-amd64.S | 20 ++++++++++++++------ 10 files changed, 44 insertions(+), 20 deletions(-) diff --git a/cipher/asm-common-amd64.h b/cipher/asm-common-amd64.h index 97912b1b..dc2c4d2f 100644 --- a/cipher/asm-common-amd64.h +++ b/cipher/asm-common-amd64.h @@ -186,8 +186,16 @@ # define EXIT_SYSV_FUNC #endif -/* 'ret' instruction replacement for straight-line speculation mitigation */ +/* 'ret' instruction replacement for straight-line speculation mitigation. */ #define ret_spec_stop \ ret; int3; +/* This prevents speculative execution on old AVX512 CPUs, to prevent + * speculative execution to AVX512 code. The vpopcntb instruction is + * available on newer CPUs that do not suffer from significant frequency + * drop when 512-bit vectors are utilized. */ +#define spec_stop_avx512 \ + vpxord %xmm16, %xmm16, %xmm16; \ + vpopcntb %xmm16, %xmm16; /* Supported only by newer AVX512 CPUs. */ + #endif /* GCRY_ASM_COMMON_AMD64_H */ diff --git a/cipher/blake2b-amd64-avx512.S b/cipher/blake2b-amd64-avx512.S index db53474d..18b0c3ad 100644 --- a/cipher/blake2b-amd64-avx512.S +++ b/cipher/blake2b-amd64-avx512.S @@ -221,6 +221,8 @@ _gcry_blake2b_transform_amd64_avx512: */ CFI_STARTPROC(); + spec_stop_avx512; + movl $0xf, %eax; kmovw %eax, %k0; xorl %eax, %eax; diff --git a/cipher/blake2s-amd64-avx512.S b/cipher/blake2s-amd64-avx512.S index 4457ca99..ddcdfd67 100644 --- a/cipher/blake2s-amd64-avx512.S +++ b/cipher/blake2s-amd64-avx512.S @@ -183,6 +183,8 @@ _gcry_blake2s_transform_amd64_avx512: */ CFI_STARTPROC(); + spec_stop_avx512; + addq $64, (STATE_T + 0)(RSTATE); vmovdqa .Liv+(0 * 4) rRIP, ROW3; diff --git a/cipher/camellia-gfni-avx512-amd64.S b/cipher/camellia-gfni-avx512-amd64.S index 15b2dc90..bddad804 100644 --- a/cipher/camellia-gfni-avx512-amd64.S +++ b/cipher/camellia-gfni-avx512-amd64.S @@ -832,7 +832,7 @@ _gcry_camellia_gfni_avx512_ctr_enc: * %rcx: iv (big endian, 128bit) */ CFI_STARTPROC(); - vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */ + spec_stop_avx512; vbroadcasti64x2 .Lbswap128_mask rRIP, %zmm19; vmovdqa64 .Lcounter0123_lo rRIP, %zmm21; @@ -985,7 +985,7 @@ _gcry_camellia_gfni_avx512_cbc_dec: * %rcx: iv */ CFI_STARTPROC(); - vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */ + spec_stop_avx512; movq %rcx, %r9; @@ -1047,7 +1047,7 @@ _gcry_camellia_gfni_avx512_cfb_dec: * %rcx: iv */ CFI_STARTPROC(); - vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */ + spec_stop_avx512; cmpl $128, key_bitlength(CTX); movl $32, %r8d; @@ -1122,7 +1122,7 @@ _gcry_camellia_gfni_avx512_ocb_enc: * %r9 : L pointers (void *L[64]) */ CFI_STARTPROC(); - vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */ + spec_stop_avx512; pushq %r12; CFI_PUSH(%r12); @@ -1285,7 +1285,7 @@ _gcry_camellia_gfni_avx512_ocb_dec: * %r9 : L pointers (void *L[64]) */ CFI_STARTPROC(); - vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */ + spec_stop_avx512; pushq %r12; CFI_PUSH(%r12); @@ -1451,7 +1451,7 @@ _gcry_camellia_gfni_avx512_enc_blk64: * %rdx: src (64 blocks) */ CFI_STARTPROC(); - vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */ + spec_stop_avx512; cmpl 
$128, key_bitlength(CTX); movl $32, %r8d; @@ -1515,7 +1515,7 @@ _gcry_camellia_gfni_avx512_dec_blk64: * %rdx: src (64 blocks) */ CFI_STARTPROC(); - vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */ + spec_stop_avx512; cmpl $128, key_bitlength(CTX); movl $32, %r8d; diff --git a/cipher/chacha20-amd64-avx512.S b/cipher/chacha20-amd64-avx512.S index 682798fe..544e7cdc 100644 --- a/cipher/chacha20-amd64-avx512.S +++ b/cipher/chacha20-amd64-avx512.S @@ -298,8 +298,7 @@ _gcry_chacha20_amd64_avx512_blocks: */ CFI_STARTPROC(); - vpxord %xmm16, %xmm16, %xmm16; - vpopcntb %xmm16, %xmm16; /* spec stop for old AVX512 CPUs */ + spec_stop_avx512; cmpq $4, NBLKS; jb .Lskip_vertical_handling; diff --git a/cipher/cipher-gcm-intel-pclmul.c b/cipher/cipher-gcm-intel-pclmul.c index 78a9e338..ec00df09 100644 --- a/cipher/cipher-gcm-intel-pclmul.c +++ b/cipher/cipher-gcm-intel-pclmul.c @@ -1513,7 +1513,7 @@ _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf, if (nblocks >= 32 && (c->u_mode.gcm.hw_impl_flags & GCM_INTEL_USE_VPCLMUL_AVX512)) { - asm volatile ("vpopcntb %%zmm7, %%zmm15\n\t" /* spec stop for old AVX512 CPUs */ + asm volatile ("vpopcntb %%xmm7, %%xmm16\n\t" /* spec stop for old AVX512 CPUs */ "vshufi64x2 $0, %%zmm7, %%zmm7, %%zmm15\n\t" "vmovdqa %%xmm1, %%xmm8\n\t" "vmovdqu64 %[swapperm], %%zmm14\n\t" @@ -1792,7 +1792,7 @@ _gcry_polyval_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf, if (nblocks >= 32 && (c->u_mode.gcm.hw_impl_flags & GCM_INTEL_USE_VPCLMUL_AVX512)) { - asm volatile ("vpopcntb %%zmm7, %%zmm15\n\t" /* spec stop for old AVX512 CPUs */ + asm volatile ("vpopcntb %%xmm1, %%xmm16\n\t" /* spec stop for old AVX512 CPUs */ "vmovdqa %%xmm1, %%xmm8\n\t" "vmovdqu64 %[swapperm], %%zmm14\n\t" : diff --git a/cipher/keccak-amd64-avx512.S b/cipher/keccak-amd64-avx512.S index f44e0285..58b4150f 100644 --- a/cipher/keccak-amd64-avx512.S +++ b/cipher/keccak-amd64-avx512.S @@ -282,6 +282,8 @@ _gcry_keccak_f1600_state_permute64_avx512: */ CFI_STARTPROC() + spec_stop_avx512; + leaq 12*8(%rdi), %rax leaq (24-1)*8(%rsi), %r11 @@ -362,6 +364,8 @@ _gcry_keccak_absorb_blocks_avx512: */ CFI_STARTPROC() + spec_stop_avx512; + leaq 12*8(%rdi), %rax leaq (24-1)*8(%rsi), %r11 diff --git a/cipher/poly1305-amd64-avx512.S b/cipher/poly1305-amd64-avx512.S index 72303e1e..5c8f838f 100644 --- a/cipher/poly1305-amd64-avx512.S +++ b/cipher/poly1305-amd64-avx512.S @@ -1580,8 +1580,7 @@ ELF(.size _gcry_poly1305_avx512_consts,.-_gcry_poly1305_avx512_consts) ELF(.type _gcry_poly1305_amd64_avx512_blocks, at function;) _gcry_poly1305_amd64_avx512_blocks: CFI_STARTPROC() - vpxord xmm16, xmm16, xmm16; - vpopcntb zmm16, zmm16; /* spec stop for old AVX512 CPUs */ + spec_stop_avx512; FUNC_ENTRY() #define _a0 gp3 diff --git a/cipher/sha512-avx512-amd64.S b/cipher/sha512-avx512-amd64.S index 0e3f44ab..145c8667 100644 --- a/cipher/sha512-avx512-amd64.S +++ b/cipher/sha512-avx512-amd64.S @@ -264,6 +264,8 @@ _gcry_sha512_transform_amd64_avx512: cmp rdx, 0 je .Lnowork + spec_stop_avx512; + /* Setup mask register for DC:BA merging. 
*/ mov eax, 0b1100 kmovd MASK_DC_00, eax diff --git a/cipher/sm4-gfni-avx512-amd64.S b/cipher/sm4-gfni-avx512-amd64.S index 1d5e9a48..0f9899d4 100644 --- a/cipher/sm4-gfni-avx512-amd64.S +++ b/cipher/sm4-gfni-avx512-amd64.S @@ -158,6 +158,7 @@ _gcry_sm4_gfni_avx512_expand_key: * %r8: ck array */ CFI_STARTPROC(); + spec_stop_avx512; vmovd 0*4(%rdi), RA0x; vmovd 1*4(%rdi), RA1x; @@ -553,6 +554,7 @@ _gcry_sm4_gfni_avx512_crypt_blk1_16: * %rcx: num blocks (1..16) */ CFI_STARTPROC(); + spec_stop_avx512; #define LOAD_INPUT(offset, yreg) \ cmpq $(1 + 2 * (offset)), %rcx; \ @@ -621,6 +623,7 @@ _gcry_sm4_gfni_avx512_ctr_enc: * %rcx: iv (big endian, 128bit) */ CFI_STARTPROC(); + spec_stop_avx512; vbroadcasti128 .Lbswap128_mask rRIP, RTMP0; vmovdqa .Lcounter0123_lo rRIP, RTMP1; @@ -728,6 +731,7 @@ _gcry_sm4_gfni_avx512_cbc_dec: * %rcx: iv */ CFI_STARTPROC(); + spec_stop_avx512; vmovdqu (0 * 32)(%rdx), RA0; vmovdqu (1 * 32)(%rdx), RA1; @@ -779,6 +783,7 @@ _gcry_sm4_gfni_avx512_cfb_dec: * %rcx: iv */ CFI_STARTPROC(); + spec_stop_avx512; /* Load input */ vmovdqu (%rcx), RNOTx; @@ -835,6 +840,7 @@ _gcry_sm4_gfni_avx512_ocb_enc: * %r9 : L pointers (void *L[16]) */ CFI_STARTPROC(); + spec_stop_avx512; subq $(4 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(4 * 8); @@ -950,6 +956,7 @@ _gcry_sm4_gfni_avx512_ocb_dec: * %r9 : L pointers (void *L[16]) */ CFI_STARTPROC(); + spec_stop_avx512; subq $(4 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(4 * 8); @@ -1066,6 +1073,7 @@ _gcry_sm4_gfni_avx512_ocb_auth: * %r8 : L pointers (void *L[16]) */ CFI_STARTPROC(); + spec_stop_avx512; subq $(4 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(4 * 8); @@ -1251,7 +1259,7 @@ _gcry_sm4_gfni_avx512_crypt_blk32: * %rdx: src (32 blocks) */ CFI_STARTPROC(); - vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */ + spec_stop_avx512; /* Load input */ vmovdqu32 (0 * 64)(%rdx), RA0z; @@ -1292,7 +1300,7 @@ _gcry_sm4_gfni_avx512_ctr_enc_blk32: * %rcx: iv (big endian, 128bit) */ CFI_STARTPROC(); - vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */ + spec_stop_avx512; vbroadcasti64x2 .Lbswap128_mask rRIP, RTMP0z; vmovdqa32 .Lcounter0123_lo rRIP, RTMP1z; @@ -1400,7 +1408,7 @@ _gcry_sm4_gfni_avx512_cbc_dec_blk32: * %rcx: iv */ CFI_STARTPROC(); - vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */ + spec_stop_avx512; vmovdqu32 (0 * 64)(%rdx), RA0z; vmovdqu32 (1 * 64)(%rdx), RA1z; @@ -1453,7 +1461,7 @@ _gcry_sm4_gfni_avx512_cfb_dec_blk32: * %rcx: iv */ CFI_STARTPROC(); - vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */ + spec_stop_avx512; /* Load input */ vmovdqu (%rcx), RA0x; @@ -1510,7 +1518,7 @@ _gcry_sm4_gfni_avx512_ocb_enc_blk32: * %r9 : L pointers (void *L[32]) */ CFI_STARTPROC(); - vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */ + spec_stop_avx512; subq $(5 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(5 * 8); @@ -1634,7 +1642,7 @@ _gcry_sm4_gfni_avx512_ocb_dec_blk32: * %r9 : L pointers (void *L[32]) */ CFI_STARTPROC(); - vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */ + spec_stop_avx512; subq $(5 * 8), %rsp; CFI_ADJUST_CFA_OFFSET(5 * 8); -- 2.37.2 From gniibe at fsij.org Tue Dec 13 09:09:23 2022 From: gniibe at fsij.org (NIIBE Yutaka) Date: Tue, 13 Dec 2022 17:09:23 +0900 Subject: Port configure script to C99 In-Reply-To: <87tu2chh6b.fsf@oldenburg.str.redhat.com> References: <87tu2chh6b.fsf@oldenburg.str.redhat.com> Message-ID: <871qp3g1xo.fsf@akagi.fsij.org> Florian Weimer wrote: > We noticed that libgcrypt fails to build because it uses implicit > function declarations in the configure script. 
The patch below should > fix that. Thank you. Applied and pushed the change (so that it has consistent tab&space) to master. Will apply to 1.10 branch, too. For libgcrypt, I found m4/ax_cc_for_build.m4 and m4/noexecstack.m4 have similar problems. I'll fix. For the source distribution (*.tar.bz2) which has generated configure script, I think that we need to fix more. IIUC, Autoconf and Libtool need to be updated. -- From jussi.kivilinna at iki.fi Wed Dec 14 18:49:09 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Wed, 14 Dec 2022 19:49:09 +0200 Subject: [PATCH 2/2] rijndael-ppc: fix wrong inline assembly constraint In-Reply-To: <20221214174909.569097-1-jussi.kivilinna@iki.fi> References: <20221214174909.569097-1-jussi.kivilinna@iki.fi> Message-ID: <20221214174909.569097-2-jussi.kivilinna@iki.fi> * cipher/rijndael-ppc-function.h (CBC_ENC_FUNC): Fix outiv constraint. -- Noticed when trying to compile with powerpc64le clang. GCC accepted the buggy constraint without complaints. Signed-off-by: Jussi Kivilinna --- cipher/rijndael-ppc-functions.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cipher/rijndael-ppc-functions.h b/cipher/rijndael-ppc-functions.h index 23fa4206..063c5358 100644 --- a/cipher/rijndael-ppc-functions.h +++ b/cipher/rijndael-ppc-functions.h @@ -373,7 +373,7 @@ void CBC_ENC_FUNC (void *context, unsigned char *iv_arg, * last one. */ __asm__ volatile ("vcipherlast %0, %0, %2\n\t" "vcipherlast %1, %1, %3\n\t" - : "+v" (iv), "+outiv" (outiv) + : "+v" (iv), "+v" (outiv) : "v" (nextiv), "v" (rkeylast)); VEC_STORE_BE ((u128_t *)out, 0, outiv, bige_const); -- 2.37.2 From jussi.kivilinna at iki.fi Wed Dec 14 18:49:08 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Wed, 14 Dec 2022 19:49:08 +0200 Subject: [PATCH 1/2] Fix building AVX512 Intel-syntax assembly with x86-64 clang Message-ID: <20221214174909.569097-1-jussi.kivilinna@iki.fi> * cipher/asm-common-amd64.h (spec_stop_avx512_intel_syntax): New. * cipher/poly1305-amd64-avx512.S: Use spec_stop_avx512_intel_syntax instead of spec_stop_avx512. * cipher/sha512-avx512-amd64.S: Likewise. -- Reported-by: Clemens Lang Signed-off-by: Jussi Kivilinna --- cipher/asm-common-amd64.h | 4 ++++ cipher/poly1305-amd64-avx512.S | 2 +- cipher/sha512-avx512-amd64.S | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/cipher/asm-common-amd64.h b/cipher/asm-common-amd64.h index dc2c4d2f..cd93abc3 100644 --- a/cipher/asm-common-amd64.h +++ b/cipher/asm-common-amd64.h @@ -198,4 +198,8 @@ vpxord %xmm16, %xmm16, %xmm16; \ vpopcntb %xmm16, %xmm16; /* Supported only by newer AVX512 CPUs. */ +#define spec_stop_avx512_intel_syntax \ + vpxord xmm16, xmm16, xmm16; \ + vpopcntb xmm16, xmm16; /* Supported only by newer AVX512 CPUs. 
*/ + #endif /* GCRY_ASM_COMMON_AMD64_H */ diff --git a/cipher/poly1305-amd64-avx512.S b/cipher/poly1305-amd64-avx512.S index 5c8f838f..6622861f 100644 --- a/cipher/poly1305-amd64-avx512.S +++ b/cipher/poly1305-amd64-avx512.S @@ -1580,7 +1580,7 @@ ELF(.size _gcry_poly1305_avx512_consts,.-_gcry_poly1305_avx512_consts) ELF(.type _gcry_poly1305_amd64_avx512_blocks, at function;) _gcry_poly1305_amd64_avx512_blocks: CFI_STARTPROC() - spec_stop_avx512; + spec_stop_avx512_intel_syntax; FUNC_ENTRY() #define _a0 gp3 diff --git a/cipher/sha512-avx512-amd64.S b/cipher/sha512-avx512-amd64.S index 145c8667..65475422 100644 --- a/cipher/sha512-avx512-amd64.S +++ b/cipher/sha512-avx512-amd64.S @@ -264,7 +264,7 @@ _gcry_sha512_transform_amd64_avx512: cmp rdx, 0 je .Lnowork - spec_stop_avx512; + spec_stop_avx512_intel_syntax; /* Setup mask register for DC:BA merging. */ mov eax, 0b1100 -- 2.37.2 From jussi.kivilinna at iki.fi Wed Dec 14 18:53:36 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Wed, 14 Dec 2022 19:53:36 +0200 Subject: [PATCH] Add clang support for ARM 32-bit assembly Message-ID: <20221214175336.604891-1-jussi.kivilinna@iki.fi> * configure.ac (gcry_cv_gcc_arm_platform_as_ok) (gcry_cv_gcc_inline_asm_neon): Remove % prefix from register names. * cipher/cipher-gcm-armv7-neon.S (vmull_p64): Prefix constant values with # character instead of $. * cipher/blowfish-arm.S: Remove % prefix from all register names. * cipher/camellia-arm.S: Likewise. * cipher/cast5-arm.S: Likewise. * cipher/rijndael-arm.S: Likewise. * cipher/rijndael-armv8-aarch32-ce.S: Likewise. * cipher/sha512-arm.S: Likewise. * cipher/sha512-armv7-neon.S: Likewise. * cipher/twofish-arm.S: Likewise. * mpi/arm/mpih-add1.S: Likewise. * mpi/arm/mpih-mul1.S: Likewise. * mpi/arm/mpih-mul2.S: Likewise. * mpi/arm/mpih-mul3.S: Likewise. * mpi/arm/mpih-sub1.S: Likewise. 
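To make the incompatibility concrete: GNU as tolerates an optional '%' prefix on ARM register names and a '$' prefix on some immediates, while clang's integrated assembler rejects both, which is why the register prefixes are dropped throughout and the vmull_p64 constants switch from '$' to '#'. A minimal stand-alone probe in the spirit of the configure checks, shown here purely as an illustration (it is not the configure.ac hunk and assumes a 32-bit ARM target):

  /* Hypothetical probe; the spelling in the comment is what clang's
   * integrated assembler rejects. */
  __asm__ (
      ".syntax unified\n\t"
      ".arm\n\t"
      ".text\n\t"
      ".globl asmfunc\n\t"
      ".type asmfunc, %function;\n\t"
      "asmfunc:\n\t"
      "add r1, r1, r2, ror #12;\n\t"  /* was: add %r1, %r1, %r2, ror #12 */
      "bx lr;\n\t"
      );
  void asmfunc(void);

  int main(void)
  {
    asmfunc();
    return 0;
  }

The gcry_cv_gcc_arm_platform_as_ok and gcry_cv_gcc_inline_asm_neon probes contained the same '%'-prefixed names, so they are updated together with the .S files.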
-- Reported-by: Dmytro Kovalov Signed-off-by: Jussi Kivilinna --- cipher/blowfish-arm.S | 216 ++++++++++++++--------------- cipher/camellia-arm.S | 68 ++++----- cipher/cast5-arm.S | 204 +++++++++++++-------------- cipher/cipher-gcm-armv7-neon.S | 24 ++-- cipher/rijndael-arm.S | 106 +++++++------- cipher/rijndael-armv8-aarch32-ce.S | 66 ++++----- cipher/sha512-arm.S | 204 +++++++++++++-------------- cipher/sha512-armv7-neon.S | 78 +++++------ cipher/twofish-arm.S | 62 ++++----- configure.ac | 10 +- mpi/arm/mpih-add1.S | 50 +++---- mpi/arm/mpih-mul1.S | 58 ++++---- mpi/arm/mpih-mul2.S | 78 +++++------ mpi/arm/mpih-mul3.S | 88 ++++++------ mpi/arm/mpih-sub1.S | 52 +++---- 15 files changed, 682 insertions(+), 682 deletions(-) diff --git a/cipher/blowfish-arm.S b/cipher/blowfish-arm.S index b30aa31f..a5101b5c 100644 --- a/cipher/blowfish-arm.S +++ b/cipher/blowfish-arm.S @@ -36,24 +36,24 @@ #define p (s3 + (1 * 256) * 4) /* register macros */ -#define CTXs0 %r0 -#define CTXs1 %r9 -#define CTXs2 %r8 -#define CTXs3 %r10 -#define RMASK %lr -#define RKEYL %r2 -#define RKEYR %ip +#define CTXs0 r0 +#define CTXs1 r9 +#define CTXs2 r8 +#define CTXs3 r10 +#define RMASK lr +#define RKEYL r2 +#define RKEYR ip -#define RL0 %r3 -#define RR0 %r4 +#define RL0 r3 +#define RR0 r4 -#define RL1 %r9 -#define RR1 %r10 +#define RL1 r9 +#define RR1 r10 -#define RT0 %r11 -#define RT1 %r7 -#define RT2 %r5 -#define RT3 %r6 +#define RT0 r11 +#define RT1 r7 +#define RT2 r5 +#define RT3 r6 /* helper macros */ #define ldr_unaligned_le(rout, rsrc, offs, rtmp) \ @@ -250,7 +250,7 @@ __blowfish_enc_blk1: * output: * [RR0, RL0]: dst */ - push {%lr}; + push {lr}; add CTXs1, CTXs0, #(s1 - s0); add CTXs2, CTXs0, #(s2 - s0); @@ -268,7 +268,7 @@ __blowfish_enc_blk1: round_enc(16); add_roundkey_enc(); - pop {%pc}; + pop {pc}; .size __blowfish_enc_blk1,.-__blowfish_enc_blk1; .align 8 @@ -277,22 +277,22 @@ __blowfish_enc_blk1: _gcry_blowfish_arm_do_encrypt: /* input: - * %r0: ctx, CTX - * %r1: u32 *ret_xl - * %r2: u32 *ret_xr + * r0: ctx, CTX + * r1: u32 *ret_xl + * r2: u32 *ret_xr */ - push {%r2, %r4-%r11, %ip, %lr}; + push {r2, r4-r11, ip, lr}; - ldr RL0, [%r1]; - ldr RR0, [%r2]; + ldr RL0, [r1]; + ldr RR0, [r2]; bl __blowfish_enc_blk1; - pop {%r2}; - str RR0, [%r1]; - str RL0, [%r2]; + pop {r2}; + str RR0, [r1]; + str RL0, [r2]; - pop {%r4-%r11, %ip, %pc}; + pop {r4-r11, ip, pc}; .size _gcry_blowfish_arm_do_encrypt,.-_gcry_blowfish_arm_do_encrypt; .align 3 @@ -301,19 +301,19 @@ _gcry_blowfish_arm_do_encrypt: _gcry_blowfish_arm_encrypt_block: /* input: - * %r0: ctx, CTX - * %r1: dst - * %r2: src + * r0: ctx, CTX + * r1: dst + * r2: src */ - push {%r4-%r11, %ip, %lr}; + push {r4-r11, ip, lr}; - read_block(%r2, 0, RL0, RR0, RT0); + read_block(r2, 0, RL0, RR0, RT0); bl __blowfish_enc_blk1; - write_block(%r1, 0, RR0, RL0, RT0, RT1); + write_block(r1, 0, RR0, RL0, RT0, RT1); - pop {%r4-%r11, %ip, %pc}; + pop {r4-r11, ip, pc}; .size _gcry_blowfish_arm_encrypt_block,.-_gcry_blowfish_arm_encrypt_block; .align 3 @@ -322,18 +322,18 @@ _gcry_blowfish_arm_encrypt_block: _gcry_blowfish_arm_decrypt_block: /* input: - * %r0: ctx, CTX - * %r1: dst - * %r2: src + * r0: ctx, CTX + * r1: dst + * r2: src */ - push {%r4-%r11, %ip, %lr}; + push {r4-r11, ip, lr}; add CTXs1, CTXs0, #(s1 - s0); add CTXs2, CTXs0, #(s2 - s0); mov RMASK, #(0xff << 2); /* byte mask */ add CTXs3, CTXs1, #(s3 - s1); - read_block(%r2, 0, RL0, RR0, RT0); + read_block(r2, 0, RL0, RR0, RT0); load_roundkey_dec(17); round_dec(15); @@ -346,9 +346,9 @@ _gcry_blowfish_arm_decrypt_block: 
round_dec(1); add_roundkey_dec(); - write_block(%r1, 0, RR0, RL0, RT0, RT1); + write_block(r1, 0, RR0, RL0, RT0, RT1); - pop {%r4-%r11, %ip, %pc}; + pop {r4-r11, ip, pc}; .size _gcry_blowfish_arm_decrypt_block,.-_gcry_blowfish_arm_decrypt_block; /*********************************************************************** @@ -548,7 +548,7 @@ _gcry_blowfish_arm_enc_blk2: * output: * [RR0, RL0], [RR1, RL1]: dst */ - push {RT0,%lr}; + push {RT0,lr}; add CTXs2, CTXs0, #(s2 - s0); mov RMASK, #(0xff << 2); /* byte mask */ @@ -568,7 +568,7 @@ _gcry_blowfish_arm_enc_blk2: host_to_be(RR1, RT0); host_to_be(RL1, RT0); - pop {RT0,%pc}; + pop {RT0,pc}; .size _gcry_blowfish_arm_enc_blk2,.-_gcry_blowfish_arm_enc_blk2; .align 3 @@ -577,40 +577,40 @@ _gcry_blowfish_arm_enc_blk2: _gcry_blowfish_arm_cfb_dec: /* input: - * %r0: CTX - * %r1: dst (2 blocks) - * %r2: src (2 blocks) - * %r3: iv (64bit) + * r0: CTX + * r1: dst (2 blocks) + * r2: src (2 blocks) + * r3: iv (64bit) */ - push {%r2, %r4-%r11, %ip, %lr}; + push {r2, r4-r11, ip, lr}; - mov %lr, %r3; + mov lr, r3; - /* Load input (iv/%r3 is aligned, src/%r2 might not be) */ - ldm %r3, {RL0, RR0}; + /* Load input (iv/r3 is aligned, src/r2 might not be) */ + ldm r3, {RL0, RR0}; host_to_be(RL0, RT0); host_to_be(RR0, RT0); - read_block(%r2, 0, RL1, RR1, RT0); + read_block(r2, 0, RL1, RR1, RT0); /* Update IV, load src[1] and save to iv[0] */ - read_block_host(%r2, 8, %r5, %r6, RT0); - stm %lr, {%r5, %r6}; + read_block_host(r2, 8, r5, r6, RT0); + stm lr, {r5, r6}; bl _gcry_blowfish_arm_enc_blk2; - /* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */ + /* result in RR0:RL0, RR1:RL1 = r4:r3, r10:r9 */ - /* %r1: dst, %r0: %src */ - pop {%r0}; + /* r1: dst, r0: src */ + pop {r0}; /* dst = src ^ result */ - read_block2_host(%r0, %r5, %r6, %r7, %r8, %lr); - eor %r5, %r4; - eor %r6, %r3; - eor %r7, %r10; - eor %r8, %r9; - write_block2_host(%r1, %r5, %r6, %r7, %r8, %r9, %r10); - - pop {%r4-%r11, %ip, %pc}; + read_block2_host(r0, r5, r6, r7, r8, lr); + eor r5, r4; + eor r6, r3; + eor r7, r10; + eor r8, r9; + write_block2_host(r1, r5, r6, r7, r8, r9, r10); + + pop {r4-r11, ip, pc}; .ltorg .size _gcry_blowfish_arm_cfb_dec,.-_gcry_blowfish_arm_cfb_dec; @@ -620,42 +620,42 @@ _gcry_blowfish_arm_cfb_dec: _gcry_blowfish_arm_ctr_enc: /* input: - * %r0: CTX - * %r1: dst (2 blocks) - * %r2: src (2 blocks) - * %r3: iv (64bit, big-endian) + * r0: CTX + * r1: dst (2 blocks) + * r2: src (2 blocks) + * r3: iv (64bit, big-endian) */ - push {%r2, %r4-%r11, %ip, %lr}; + push {r2, r4-r11, ip, lr}; - mov %lr, %r3; + mov lr, r3; /* Load IV (big => host endian) */ - read_block_aligned(%lr, 0, RL0, RR0, be_to_host, RT0); + read_block_aligned(lr, 0, RL0, RR0, be_to_host, RT0); /* Construct IVs */ adds RR1, RR0, #1; /* +1 */ adc RL1, RL0, #0; - adds %r6, RR1, #1; /* +2 */ - adc %r5, RL1, #0; + adds r6, RR1, #1; /* +2 */ + adc r5, RL1, #0; /* Store new IV (host => big-endian) */ - write_block_aligned(%lr, 0, %r5, %r6, host_to_be, RT0); + write_block_aligned(lr, 0, r5, r6, host_to_be, RT0); bl _gcry_blowfish_arm_enc_blk2; - /* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */ + /* result in RR0:RL0, RR1:RL1 = r4:r3, r10:r9 */ - /* %r1: dst, %r0: %src */ - pop {%r0}; + /* r1: dst, r0: src */ + pop {r0}; /* XOR key-stream with plaintext */ - read_block2_host(%r0, %r5, %r6, %r7, %r8, %lr); - eor %r5, %r4; - eor %r6, %r3; - eor %r7, %r10; - eor %r8, %r9; - write_block2_host(%r1, %r5, %r6, %r7, %r8, %r9, %r10); - - pop {%r4-%r11, %ip, %pc}; + read_block2_host(r0, r5, r6, r7, r8, lr); + eor r5, r4; + eor 
r6, r3; + eor r7, r10; + eor r8, r9; + write_block2_host(r1, r5, r6, r7, r8, r9, r10); + + pop {r4-r11, ip, pc}; .ltorg .size _gcry_blowfish_arm_ctr_enc,.-_gcry_blowfish_arm_ctr_enc; @@ -697,45 +697,45 @@ _gcry_blowfish_arm_dec_blk2: _gcry_blowfish_arm_cbc_dec: /* input: - * %r0: CTX - * %r1: dst (2 blocks) - * %r2: src (2 blocks) - * %r3: iv (64bit) + * r0: CTX + * r1: dst (2 blocks) + * r2: src (2 blocks) + * r3: iv (64bit) */ - push {%r2-%r11, %ip, %lr}; + push {r2-r11, ip, lr}; - read_block2(%r2, RL0, RR0, RL1, RR1, RT0); + read_block2(r2, RL0, RR0, RL1, RR1, RT0); /* dec_blk2 is only used by cbc_dec, jump directly in/out instead * of function call. */ b _gcry_blowfish_arm_dec_blk2; .Ldec_cbc_tail: - /* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */ + /* result in RR0:RL0, RR1:RL1 = r4:r3, r10:r9 */ - /* %r0: %src, %r1: dst, %r2: iv */ - pop {%r0, %r2}; + /* r0: src, r1: dst, r2: iv */ + pop {r0, r2}; - /* load IV+1 (src[0]) to %r7:%r8. Might be unaligned. */ - read_block_host(%r0, 0, %r7, %r8, %r5); - /* load IV (iv[0]) to %r5:%r6. 'iv' is aligned. */ - ldm %r2, {%r5, %r6}; + /* load IV+1 (src[0]) to r7:r8. Might be unaligned. */ + read_block_host(r0, 0, r7, r8, r5); + /* load IV (iv[0]) to r5:r6. 'iv' is aligned. */ + ldm r2, {r5, r6}; /* out[1] ^= IV+1 */ - eor %r10, %r7; - eor %r9, %r8; + eor r10, r7; + eor r9, r8; /* out[0] ^= IV */ - eor %r4, %r5; - eor %r3, %r6; + eor r4, r5; + eor r3, r6; - /* load IV+2 (src[1]) to %r7:%r8. Might be unaligned. */ - read_block_host(%r0, 8, %r7, %r8, %r5); + /* load IV+2 (src[1]) to r7:r8. Might be unaligned. */ + read_block_host(r0, 8, r7, r8, r5); /* store IV+2 to iv[0] (aligned). */ - stm %r2, {%r7, %r8}; + stm r2, {r7, r8}; /* store result to dst[0-3]. Might be unaligned. */ - write_block2_host(%r1, %r4, %r3, %r10, %r9, %r5, %r6); + write_block2_host(r1, r4, r3, r10, r9, r5, r6); - pop {%r4-%r11, %ip, %pc}; + pop {r4-r11, ip, pc}; .ltorg .size _gcry_blowfish_arm_cbc_dec,.-_gcry_blowfish_arm_cbc_dec; diff --git a/cipher/camellia-arm.S b/cipher/camellia-arm.S index a3d87d11..decd40c2 100644 --- a/cipher/camellia-arm.S +++ b/cipher/camellia-arm.S @@ -45,23 +45,23 @@ #define key_table 0 /* register macros */ -#define CTX %r0 -#define RTAB1 %ip -#define RTAB3 %r1 -#define RMASK %lr +#define CTX r0 +#define RTAB1 ip +#define RTAB3 r1 +#define RMASK lr -#define IL %r2 -#define IR %r3 +#define IL r2 +#define IR r3 -#define XL %r4 -#define XR %r5 -#define YL %r6 -#define YR %r7 +#define XL r4 +#define XR r5 +#define YL r6 +#define YR r7 -#define RT0 %r8 -#define RT1 %r9 -#define RT2 %r10 -#define RT3 %r11 +#define RT0 r8 +#define RT1 r9 +#define RT2 r10 +#define RT3 r11 /* helper macros */ #define ldr_unaligned_be(rout, rsrc, offs, rtmp) \ @@ -248,7 +248,7 @@ (n) * 2 + 0, (n) * 2 + 1); #define inpack(n) \ - ldr_input_be(%r2, XL, XR, YL, YR, RT0); \ + ldr_input_be(r2, XL, XR, YL, YR, RT0); \ ldr RT0, [CTX, #(key_table + ((n) * 8) + 0)]; \ ldr RT1, [CTX, #(key_table + ((n) * 8) + 4)]; \ eor XL, RT0; \ @@ -259,7 +259,7 @@ ldr RT1, [CTX, #(key_table + ((n) * 8) + 4)]; \ eor YL, RT0; \ eor YR, RT1; \ - str_output_be(%r1, YL, YR, XL, XR, RT0, RT1); + str_output_be(r1, YL, YR, XL, XR, RT0, RT1); .align 3 .globl _gcry_camellia_arm_encrypt_block @@ -267,17 +267,17 @@ _gcry_camellia_arm_encrypt_block: /* input: - * %r0: keytable - * %r1: dst - * %r2: src - * %r3: keybitlen + * r0: keytable + * r1: dst + * r2: src + * r3: keybitlen */ - push {%r1, %r4-%r11, %ip, %lr}; + push {r1, r4-r11, ip, lr}; GET_DATA_POINTER(RTAB1, .Lcamellia_sp1110, RTAB3); mov RMASK, 
#0xff; add RTAB3, RTAB1, #(2 * 4); - push {%r3}; + push {r3}; mov RMASK, RMASK, lsl#4 /* byte mask */ inpack(0); @@ -292,20 +292,20 @@ _gcry_camellia_arm_encrypt_block: cmp RT0, #(16 * 8); bne .Lenc_256; - pop {%r1}; + pop {r1}; outunpack(24); - pop {%r4-%r11, %ip, %pc}; + pop {r4-r11, ip, pc}; .ltorg .Lenc_256: enc_fls(24); enc_rounds(24); - pop {%r1}; + pop {r1}; outunpack(32); - pop {%r4-%r11, %ip, %pc}; + pop {r4-r11, ip, pc}; .ltorg .size _gcry_camellia_arm_encrypt_block,.-_gcry_camellia_arm_encrypt_block; @@ -315,19 +315,19 @@ _gcry_camellia_arm_encrypt_block: _gcry_camellia_arm_decrypt_block: /* input: - * %r0: keytable - * %r1: dst - * %r2: src - * %r3: keybitlen + * r0: keytable + * r1: dst + * r2: src + * r3: keybitlen */ - push {%r1, %r4-%r11, %ip, %lr}; + push {r1, r4-r11, ip, lr}; GET_DATA_POINTER(RTAB1, .Lcamellia_sp1110, RTAB3); mov RMASK, #0xff; add RTAB3, RTAB1, #(2 * 4); mov RMASK, RMASK, lsl#4 /* byte mask */ - cmp %r3, #(16 * 8); + cmp r3, #(16 * 8); bne .Ldec_256; inpack(24); @@ -339,10 +339,10 @@ _gcry_camellia_arm_decrypt_block: dec_fls(8); dec_rounds(0); - pop {%r1}; + pop {r1}; outunpack(0); - pop {%r4-%r11, %ip, %pc}; + pop {r4-r11, ip, pc}; .ltorg .Ldec_256: diff --git a/cipher/cast5-arm.S b/cipher/cast5-arm.S index 76ddd2e3..ae53e6b4 100644 --- a/cipher/cast5-arm.S +++ b/cipher/cast5-arm.S @@ -50,25 +50,25 @@ #define Kr_arm_dec (Kr_arm_enc + (16)) /* register macros */ -#define CTX %r0 -#define Rs1 %r7 -#define Rs2 %r8 -#define Rs3 %r9 -#define Rs4 %r10 -#define RMASK %r11 -#define RKM %r1 -#define RKR %r2 - -#define RL0 %r3 -#define RR0 %r4 - -#define RL1 %r9 -#define RR1 %r10 - -#define RT0 %lr -#define RT1 %ip -#define RT2 %r5 -#define RT3 %r6 +#define CTX r0 +#define Rs1 r7 +#define Rs2 r8 +#define Rs3 r9 +#define Rs4 r10 +#define RMASK r11 +#define RKM r1 +#define RKR r2 + +#define RL0 r3 +#define RR0 r4 + +#define RL1 r9 +#define RR1 r10 + +#define RT0 lr +#define RT1 ip +#define RT2 r5 +#define RT3 r6 /* helper macros */ #define ldr_unaligned_le(rout, rsrc, offs, rtmp) \ @@ -267,11 +267,11 @@ _gcry_cast5_arm_encrypt_block: /* input: - * %r0: CTX - * %r1: dst - * %r2: src + * r0: CTX + * r1: dst + * r2: src */ - push {%r1, %r4-%r11, %ip, %lr}; + push {r1, r4-r11, ip, lr}; GET_DATA_POINTER(Rs1, _gcry_cast5_s1to4, Rs2); mov RMASK, #(0xff << 2); @@ -279,7 +279,7 @@ _gcry_cast5_arm_encrypt_block: add Rs3, Rs1, #(0x100*4*2); add Rs4, Rs1, #(0x100*4*3); - read_block(%r2, 0, RL0, RR0, RT0); + read_block(r2, 0, RL0, RR0, RT0); load_km(0); load_kr(0); @@ -300,10 +300,10 @@ _gcry_cast5_arm_encrypt_block: enc_round(14, F3, RL0, RR0, load_km, shift_kr, dummy); enc_round(15, F1, RR0, RL0, dummy, dummy, dummy); - ldr %r1, [%sp], #4; - write_block(%r1, 0, RR0, RL0, RT0, RT1); + ldr r1, [sp], #4; + write_block(r1, 0, RR0, RL0, RT0, RT1); - pop {%r4-%r11, %ip, %pc}; + pop {r4-r11, ip, pc}; .ltorg .size _gcry_cast5_arm_encrypt_block,.-_gcry_cast5_arm_encrypt_block; @@ -313,11 +313,11 @@ _gcry_cast5_arm_encrypt_block: _gcry_cast5_arm_decrypt_block: /* input: - * %r0: CTX - * %r1: dst - * %r2: src + * r0: CTX + * r1: dst + * r2: src */ - push {%r1, %r4-%r11, %ip, %lr}; + push {r1, r4-r11, ip, lr}; GET_DATA_POINTER(Rs1, _gcry_cast5_s1to4, Rs2); mov RMASK, #(0xff << 2); @@ -325,7 +325,7 @@ _gcry_cast5_arm_decrypt_block: add Rs3, Rs1, #(0x100 * 4 * 2); add Rs4, Rs1, #(0x100 * 4 * 3); - read_block(%r2, 0, RL0, RR0, RT0); + read_block(r2, 0, RL0, RR0, RT0); load_km(15); load_dec_kr(15); @@ -346,10 +346,10 @@ _gcry_cast5_arm_decrypt_block: dec_round(1, F2, RL0, RR0, load_km, 
shift_kr, dummy); dec_round(0, F1, RR0, RL0, dummy, dummy, dummy); - ldr %r1, [%sp], #4; - write_block(%r1, 0, RR0, RL0, RT0, RT1); + ldr r1, [sp], #4; + write_block(r1, 0, RR0, RL0, RT0, RT1); - pop {%r4-%r11, %ip, %pc}; + pop {r4-r11, ip, pc}; .ltorg .size _gcry_cast5_arm_decrypt_block,.-_gcry_cast5_arm_decrypt_block; @@ -511,7 +511,7 @@ _gcry_cast5_arm_enc_blk2: * output: * [RR0, RL0], [RR1, RL1]: dst */ - push {%lr}; + push {lr}; GET_DATA_POINTER(Rs1, _gcry_cast5_s1to4, Rs2); mov RMASK, #(0xff << 2); @@ -541,7 +541,7 @@ _gcry_cast5_arm_enc_blk2: host_to_be(RR1, RT0); host_to_be(RL1, RT0); - pop {%pc}; + pop {pc}; .ltorg .size _gcry_cast5_arm_enc_blk2,.-_gcry_cast5_arm_enc_blk2; @@ -551,40 +551,40 @@ _gcry_cast5_arm_enc_blk2: _gcry_cast5_arm_cfb_dec: /* input: - * %r0: CTX - * %r1: dst (2 blocks) - * %r2: src (2 blocks) - * %r3: iv (64bit) + * r0: CTX + * r1: dst (2 blocks) + * r2: src (2 blocks) + * r3: iv (64bit) */ - push {%r1, %r2, %r4-%r11, %ip, %lr}; + push {r1, r2, r4-r11, ip, lr}; - mov %lr, %r3; + mov lr, r3; - /* Load input (iv/%r3 is aligned, src/%r2 might not be) */ - ldm %r3, {RL0, RR0}; + /* Load input (iv/r3 is aligned, src/r2 might not be) */ + ldm r3, {RL0, RR0}; host_to_be(RL0, RT1); host_to_be(RR0, RT1); - read_block(%r2, 0, RL1, RR1, %ip); + read_block(r2, 0, RL1, RR1, ip); /* Update IV, load src[1] and save to iv[0] */ - read_block_host(%r2, 8, %r5, %r6, %r7); - stm %lr, {%r5, %r6}; + read_block_host(r2, 8, r5, r6, r7); + stm lr, {r5, r6}; bl _gcry_cast5_arm_enc_blk2; - /* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */ + /* result in RR0:RL0, RR1:RL1 = r4:r3, r10:r9 */ - /* %r0: dst, %r1: %src */ - pop {%r0, %r1}; + /* r0: dst, r1: src */ + pop {r0, r1}; /* dst = src ^ result */ - read_block2_host(%r1, %r5, %r6, %r7, %r8, %lr); - eor %r5, %r4; - eor %r6, %r3; - eor %r7, %r10; - eor %r8, %r9; - write_block2_host(%r0, %r5, %r6, %r7, %r8, %r1, %r2); - - pop {%r4-%r11, %ip, %pc}; + read_block2_host(r1, r5, r6, r7, r8, lr); + eor r5, r4; + eor r6, r3; + eor r7, r10; + eor r8, r9; + write_block2_host(r0, r5, r6, r7, r8, r1, r2); + + pop {r4-r11, ip, pc}; .ltorg .size _gcry_cast5_arm_cfb_dec,.-_gcry_cast5_arm_cfb_dec; @@ -594,42 +594,42 @@ _gcry_cast5_arm_cfb_dec: _gcry_cast5_arm_ctr_enc: /* input: - * %r0: CTX - * %r1: dst (2 blocks) - * %r2: src (2 blocks) - * %r3: iv (64bit, big-endian) + * r0: CTX + * r1: dst (2 blocks) + * r2: src (2 blocks) + * r3: iv (64bit, big-endian) */ - push {%r1, %r2, %r4-%r11, %ip, %lr}; + push {r1, r2, r4-r11, ip, lr}; - mov %lr, %r3; + mov lr, r3; /* Load IV (big => host endian) */ - read_block_aligned(%lr, 0, RL0, RR0, be_to_host, RT1); + read_block_aligned(lr, 0, RL0, RR0, be_to_host, RT1); /* Construct IVs */ adds RR1, RR0, #1; /* +1 */ adc RL1, RL0, #0; - adds %r6, RR1, #1; /* +2 */ - adc %r5, RL1, #0; + adds r6, RR1, #1; /* +2 */ + adc r5, RL1, #0; /* Store new IV (host => big-endian) */ - write_block_aligned(%lr, 0, %r5, %r6, host_to_be, RT1); + write_block_aligned(lr, 0, r5, r6, host_to_be, RT1); bl _gcry_cast5_arm_enc_blk2; - /* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */ + /* result in RR0:RL0, RR1:RL1 = r4:r3, r10:r9 */ - /* %r0: dst, %r1: %src */ - pop {%r0, %r1}; + /* r0: dst, r1: src */ + pop {r0, r1}; /* XOR key-stream with plaintext */ - read_block2_host(%r1, %r5, %r6, %r7, %r8, %lr); - eor %r5, %r4; - eor %r6, %r3; - eor %r7, %r10; - eor %r8, %r9; - write_block2_host(%r0, %r5, %r6, %r7, %r8, %r1, %r2); - - pop {%r4-%r11, %ip, %pc}; + read_block2_host(r1, r5, r6, r7, r8, lr); + eor r5, r4; + eor r6, r3; + eor r7, 
r10; + eor r8, r9; + write_block2_host(r0, r5, r6, r7, r8, r1, r2); + + pop {r4-r11, ip, pc}; .ltorg .size _gcry_cast5_arm_ctr_enc,.-_gcry_cast5_arm_ctr_enc; @@ -682,45 +682,45 @@ _gcry_cast5_arm_dec_blk2: _gcry_cast5_arm_cbc_dec: /* input: - * %r0: CTX - * %r1: dst (2 blocks) - * %r2: src (2 blocks) - * %r3: iv (64bit) + * r0: CTX + * r1: dst (2 blocks) + * r2: src (2 blocks) + * r3: iv (64bit) */ - push {%r1-%r11, %ip, %lr}; + push {r1-r11, ip, lr}; - read_block2(%r2, RL0, RR0, RL1, RR1, RT0); + read_block2(r2, RL0, RR0, RL1, RR1, RT0); /* dec_blk2 is only used by cbc_dec, jump directly in/out instead * of function call. */ b _gcry_cast5_arm_dec_blk2; .Ldec_cbc_tail: - /* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */ + /* result in RR0:RL0, RR1:RL1 = r4:r3, r10:r9 */ - /* %r0: dst, %r1: %src, %r2: iv */ - pop {%r0-%r2}; + /* r0: dst, r1: src, r2: iv */ + pop {r0-r2}; - /* load IV+1 (src[0]) to %r7:%r8. Might be unaligned. */ - read_block_host(%r1, 0, %r7, %r8, %r5); - /* load IV (iv[0]) to %r5:%r6. 'iv' is aligned. */ - ldm %r2, {%r5, %r6}; + /* load IV+1 (src[0]) to r7:r8. Might be unaligned. */ + read_block_host(r1, 0, r7, r8, r5); + /* load IV (iv[0]) to r5:r6. 'iv' is aligned. */ + ldm r2, {r5, r6}; /* out[1] ^= IV+1 */ - eor %r10, %r7; - eor %r9, %r8; + eor r10, r7; + eor r9, r8; /* out[0] ^= IV */ - eor %r4, %r5; - eor %r3, %r6; + eor r4, r5; + eor r3, r6; - /* load IV+2 (src[1]) to %r7:%r8. Might be unaligned. */ - read_block_host(%r1, 8, %r7, %r8, %r5); + /* load IV+2 (src[1]) to r7:r8. Might be unaligned. */ + read_block_host(r1, 8, r7, r8, r5); /* store IV+2 to iv[0] (aligned). */ - stm %r2, {%r7, %r8}; + stm r2, {r7, r8}; /* store result to dst[0-3]. Might be unaligned. */ - write_block2_host(%r0, %r4, %r3, %r10, %r9, %r5, %r6); + write_block2_host(r0, r4, r3, r10, r9, r5, r6); - pop {%r4-%r11, %ip, %pc}; + pop {r4-r11, ip, pc}; .ltorg .size _gcry_cast5_arm_cbc_dec,.-_gcry_cast5_arm_cbc_dec; diff --git a/cipher/cipher-gcm-armv7-neon.S b/cipher/cipher-gcm-armv7-neon.S index 16502b4a..c7027af3 100644 --- a/cipher/cipher-gcm-armv7-neon.S +++ b/cipher/cipher-gcm-armv7-neon.S @@ -121,21 +121,21 @@ gcry_gcm_reduction_constant: * Engineering ? MoCrySEn, 2013". 
*/ #define vmull_p64(rq, rl, rh, ad, bd) \ - vext.8 t0l, ad, ad, $1; \ + vext.8 t0l, ad, ad, #1; \ vmull.p8 t0q, t0l, bd; \ - vext.8 rl, bd, bd, $1; \ + vext.8 rl, bd, bd, #1; \ vmull.p8 rq, ad, rl; \ - vext.8 t1l, ad, ad, $2; \ + vext.8 t1l, ad, ad, #2; \ vmull.p8 t1q, t1l, bd; \ - vext.8 t3l, bd, bd, $2; \ + vext.8 t3l, bd, bd, #2; \ vmull.p8 t3q, ad, t3l; \ - vext.8 t2l, ad, ad, $3; \ + vext.8 t2l, ad, ad, #3; \ vmull.p8 t2q, t2l, bd; \ veor t0q, t0q, rq; \ - vext.8 rl, bd, bd, $3; \ + vext.8 rl, bd, bd, #3; \ vmull.p8 rq, ad, rl; \ veor t1q, t1q, t3q; \ - vext.8 t3l, bd, bd, $4; \ + vext.8 t3l, bd, bd, #4; \ vmull.p8 t3q, ad, t3l; \ veor t0l, t0l, t0h; \ vand t0h, t0h, k48; \ @@ -147,13 +147,13 @@ gcry_gcm_reduction_constant: veor t2l, t2l, t2h; \ vand t2h, t2h, k16; \ veor t3l, t3l, t3h; \ - vmov.i64 t3h, $0; \ - vext.8 t0q, t0q, t0q, $15; \ + vmov.i64 t3h, #0; \ + vext.8 t0q, t0q, t0q, #15; \ veor t2l, t2l, t2h; \ - vext.8 t1q, t1q, t1q, $14; \ + vext.8 t1q, t1q, t1q, #14; \ vmull.p8 rq, ad, bd; \ - vext.8 t2q, t2q, t2q, $13; \ - vext.8 t3q, t3q, t3q, $12; \ + vext.8 t2q, t2q, t2q, #13; \ + vext.8 t3q, t3q, t3q, #12; \ veor t0q, t0q, t1q; \ veor t2q, t2q, t3q; \ veor rq, rq, t0q; \ diff --git a/cipher/rijndael-arm.S b/cipher/rijndael-arm.S index e680c817..632daac2 100644 --- a/cipher/rijndael-arm.S +++ b/cipher/rijndael-arm.S @@ -29,23 +29,23 @@ .arm /* register macros */ -#define CTX %r0 -#define RTAB %lr -#define RMASK %ip +#define CTX r0 +#define RTAB lr +#define RMASK ip -#define RA %r4 -#define RB %r5 -#define RC %r6 -#define RD %r7 +#define RA r4 +#define RB r5 +#define RC r6 +#define RD r7 -#define RNA %r8 -#define RNB %r9 -#define RNC %r10 -#define RND %r11 +#define RNA r8 +#define RNB r9 +#define RNC r10 +#define RND r11 -#define RT0 %r1 -#define RT1 %r2 -#define RT2 %r3 +#define RT0 r1 +#define RT1 r2 +#define RT2 r3 /* helper macros */ #define ldr_unaligned_le(rout, rsrc, offs, rtmp) \ @@ -216,30 +216,30 @@ _gcry_aes_arm_encrypt_block: /* input: - * %r0: keysched, CTX - * %r1: dst - * %r2: src - * %r3: number of rounds.. 10, 12 or 14 - * %st+0: encryption table + * r0: keysched, CTX + * r1: dst + * r2: src + * r3: number of rounds.. 
10, 12 or 14 + * st+0: encryption table */ - push {%r4-%r11, %ip, %lr}; + push {r4-r11, ip, lr}; /* read input block */ /* test if src is unaligned */ - tst %r2, #3; + tst r2, #3; beq 1f; /* unaligned load */ - ldr_unaligned_le(RA, %r2, 0, RNA); - ldr_unaligned_le(RB, %r2, 4, RNB); - ldr_unaligned_le(RC, %r2, 8, RNA); - ldr_unaligned_le(RD, %r2, 12, RNB); + ldr_unaligned_le(RA, r2, 0, RNA); + ldr_unaligned_le(RB, r2, 4, RNB); + ldr_unaligned_le(RC, r2, 8, RNA); + ldr_unaligned_le(RD, r2, 12, RNB); b 2f; .ltorg 1: /* aligned load */ - ldm %r2, {RA, RB, RC, RD}; + ldm r2, {RA, RB, RC, RD}; #ifndef __ARMEL__ rev RA, RA; rev RB, RB; @@ -247,12 +247,12 @@ _gcry_aes_arm_encrypt_block: rev RD, RD; #endif 2: - ldr RTAB, [%sp, #40]; - sub %sp, #16; + ldr RTAB, [sp, #40]; + sub sp, #16; - str %r1, [%sp, #4]; /* dst */ + str r1, [sp, #4]; /* dst */ mov RMASK, #0xff; - str %r3, [%sp, #8]; /* nrounds */ + str r3, [sp, #8]; /* nrounds */ mov RMASK, RMASK, lsl#2; /* byte mask */ firstencround(0, RA, RB, RC, RD, RNA, RNB, RNC, RND); @@ -264,7 +264,7 @@ _gcry_aes_arm_encrypt_block: encround(6, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key); encround(7, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key); - ldr RT0, [%sp, #8]; /* nrounds */ + ldr RT0, [sp, #8]; /* nrounds */ cmp RT0, #12; bge .Lenc_not_128; @@ -272,8 +272,8 @@ _gcry_aes_arm_encrypt_block: lastencround(9, RNA, RNB, RNC, RND, RA, RB, RC, RD); .Lenc_done: - ldr RT0, [%sp, #4]; /* dst */ - add %sp, #16; + ldr RT0, [sp, #4]; /* dst */ + add sp, #16; /* store output block */ @@ -301,7 +301,7 @@ _gcry_aes_arm_encrypt_block: 2: mov r0, #(10 * 4); - pop {%r4-%r11, %ip, %pc}; + pop {r4-r11, ip, pc}; .ltorg .Lenc_not_128: @@ -473,30 +473,30 @@ _gcry_aes_arm_encrypt_block: _gcry_aes_arm_decrypt_block: /* input: - * %r0: keysched, CTX - * %r1: dst - * %r2: src - * %r3: number of rounds.. 10, 12 or 14 - * %st+0: decryption table + * r0: keysched, CTX + * r1: dst + * r2: src + * r3: number of rounds.. 
10, 12 or 14 + * st+0: decryption table */ - push {%r4-%r11, %ip, %lr}; + push {r4-r11, ip, lr}; /* read input block */ /* test if src is unaligned */ - tst %r2, #3; + tst r2, #3; beq 1f; /* unaligned load */ - ldr_unaligned_le(RA, %r2, 0, RNA); - ldr_unaligned_le(RB, %r2, 4, RNB); - ldr_unaligned_le(RC, %r2, 8, RNA); - ldr_unaligned_le(RD, %r2, 12, RNB); + ldr_unaligned_le(RA, r2, 0, RNA); + ldr_unaligned_le(RB, r2, 4, RNB); + ldr_unaligned_le(RC, r2, 8, RNA); + ldr_unaligned_le(RD, r2, 12, RNB); b 2f; .ltorg 1: /* aligned load */ - ldm %r2, {RA, RB, RC, RD}; + ldm r2, {RA, RB, RC, RD}; #ifndef __ARMEL__ rev RA, RA; rev RB, RB; @@ -504,14 +504,14 @@ _gcry_aes_arm_decrypt_block: rev RD, RD; #endif 2: - ldr RTAB, [%sp, #40]; - sub %sp, #16; + ldr RTAB, [sp, #40]; + sub sp, #16; mov RMASK, #0xff; - str %r1, [%sp, #4]; /* dst */ + str r1, [sp, #4]; /* dst */ mov RMASK, RMASK, lsl#2; /* byte mask */ - cmp %r3, #12; + cmp r3, #12; bge .Ldec_256; firstdecround(9, RA, RB, RC, RD, RNA, RNB, RNC, RND); @@ -526,8 +526,8 @@ _gcry_aes_arm_decrypt_block: decround(1, RA, RB, RC, RD, RNA, RNB, RNC, RND, set_last_round_rmask); lastdecround(0, RNA, RNB, RNC, RND, RA, RB, RC, RD); - ldr RT0, [%sp, #4]; /* dst */ - add %sp, #16; + ldr RT0, [sp, #4]; /* dst */ + add sp, #16; /* store output block */ @@ -554,7 +554,7 @@ _gcry_aes_arm_decrypt_block: stm RT0, {RA, RB, RC, RD}; 2: mov r0, #(10 * 4); - pop {%r4-%r11, %ip, %pc}; + pop {r4-r11, ip, pc}; .ltorg .Ldec_256: diff --git a/cipher/rijndael-armv8-aarch32-ce.S b/cipher/rijndael-armv8-aarch32-ce.S index 6208652b..3c4149b3 100644 --- a/cipher/rijndael-armv8-aarch32-ce.S +++ b/cipher/rijndael-armv8-aarch32-ce.S @@ -483,9 +483,9 @@ _gcry_aes_cbc_enc_armv8_ce: * r1: outbuf * r2: inbuf * r3: iv - * %st+0: nblocks => r4 - * %st+4: cbc_mac => r5 - * %st+8: nrounds => r6 + * st+0: nblocks => r4 + * st+4: cbc_mac => r5 + * st+8: nrounds => r6 */ push {r4-r6,lr} /* 4*4 = 16b */ @@ -563,8 +563,8 @@ _gcry_aes_cbc_dec_armv8_ce: * r1: outbuf * r2: inbuf * r3: iv - * %st+0: nblocks => r4 - * %st+4: nrounds => r5 + * st+0: nblocks => r4 + * st+4: nrounds => r5 */ push {r4-r6,lr} /* 4*4 = 16b */ @@ -670,7 +670,7 @@ _gcry_aes_ecb_enc_armv8_ce: * r1: outbuf * r2: inbuf * r3: nblocks - * %st+0: nrounds => r4 + * st+0: nrounds => r4 */ push {r4-r6,lr} /* 4*4 = 16b */ @@ -755,7 +755,7 @@ _gcry_aes_ecb_dec_armv8_ce: * r1: outbuf * r2: inbuf * r3: nblocks - * %st+0: nrounds => r4 + * st+0: nrounds => r4 */ push {r4-r6,lr} /* 4*4 = 16b */ @@ -812,8 +812,8 @@ _gcry_aes_cfb_enc_armv8_ce: * r1: outbuf * r2: inbuf * r3: iv - * %st+0: nblocks => r4 - * %st+4: nrounds => r5 + * st+0: nblocks => r4 + * st+4: nrounds => r5 */ push {r4-r6,lr} /* 4*4 = 16b */ @@ -888,8 +888,8 @@ _gcry_aes_cfb_dec_armv8_ce: * r1: outbuf * r2: inbuf * r3: iv - * %st+0: nblocks => r4 - * %st+4: nrounds => r5 + * st+0: nblocks => r4 + * st+4: nrounds => r5 */ push {r4-r6,lr} /* 4*4 = 16b */ @@ -996,8 +996,8 @@ _gcry_aes_ctr_enc_armv8_ce: * r1: outbuf * r2: inbuf * r3: iv - * %st+0: nblocks => r4 - * %st+4: nrounds => r5 + * st+0: nblocks => r4 + * st+4: nrounds => r5 */ vpush {q4-q7} @@ -1176,8 +1176,8 @@ _gcry_aes_ctr32le_enc_armv8_ce: * r1: outbuf * r2: inbuf * r3: iv - * %st+0: nblocks => r4 - * %st+4: nrounds => r5 + * st+0: nblocks => r4 + * st+4: nrounds => r5 */ vpush {q4-q7} @@ -1301,11 +1301,11 @@ _gcry_aes_ocb_enc_armv8_ce: * r1: outbuf * r2: inbuf * r3: offset - * %st+0: checksum => r4 - * %st+4: Ls => r5 - * %st+8: nblocks => r6 (0 < nblocks <= 32) - * %st+12: nrounds => r7 - * %st+16: blkn => lr + * 
st+0: checksum => r4 + * st+4: Ls => r5 + * st+8: nblocks => r6 (0 < nblocks <= 32) + * st+12: nrounds => r7 + * st+16: blkn => lr */ vpush {q4-q7} @@ -1476,11 +1476,11 @@ _gcry_aes_ocb_dec_armv8_ce: * r1: outbuf * r2: inbuf * r3: offset - * %st+0: checksum => r4 - * %st+4: Ls => r5 - * %st+8: nblocks => r6 (0 < nblocks <= 32) - * %st+12: nrounds => r7 - * %st+16: blkn => lr + * st+0: checksum => r4 + * st+4: Ls => r5 + * st+8: nblocks => r6 (0 < nblocks <= 32) + * st+12: nrounds => r7 + * st+16: blkn => lr */ vpush {q4-q7} @@ -1650,10 +1650,10 @@ _gcry_aes_ocb_auth_armv8_ce: * r1: abuf * r2: offset * r3: checksum - * %st+0: Ls => r5 - * %st+4: nblocks => r6 (0 < nblocks <= 32) - * %st+8: nrounds => r7 - * %st+12: blkn => lr + * st+0: Ls => r5 + * st+4: nblocks => r6 (0 < nblocks <= 32) + * st+8: nrounds => r7 + * st+12: blkn => lr */ vpush {q4-q7} @@ -1801,8 +1801,8 @@ _gcry_aes_xts_enc_armv8_ce: * r1: outbuf * r2: inbuf * r3: iv - * %st+0: nblocks => r4 - * %st+4: nrounds => r5 + * st+0: nblocks => r4 + * st+4: nrounds => r5 */ vpush {q4-q7} @@ -1956,8 +1956,8 @@ _gcry_aes_xts_dec_armv8_ce: * r1: outbuf * r2: inbuf * r3: iv - * %st+0: nblocks => r4 - * %st+4: nrounds => r5 + * st+0: nblocks => r4 + * st+4: nrounds => r5 */ vpush {q4-q7} diff --git a/cipher/sha512-arm.S b/cipher/sha512-arm.S index 94ec0141..1e1d296f 100644 --- a/cipher/sha512-arm.S +++ b/cipher/sha512-arm.S @@ -38,23 +38,23 @@ #define hd_h ((hd_g) + 8) /* register macros */ -#define RK %r2 +#define RK r2 -#define RElo %r0 -#define REhi %r1 +#define RElo r0 +#define REhi r1 -#define RT1lo %r3 -#define RT1hi %r4 -#define RT2lo %r5 -#define RT2hi %r6 -#define RWlo %r7 -#define RWhi %r8 -#define RT3lo %r9 -#define RT3hi %r10 -#define RT4lo %r11 -#define RT4hi %ip +#define RT1lo r3 +#define RT1hi r4 +#define RT2lo r5 +#define RT2hi r6 +#define RWlo r7 +#define RWhi r8 +#define RT3lo r9 +#define RT3hi r10 +#define RT4lo r11 +#define RT4hi ip -#define RRND %lr +#define RRND lr /* variable offsets in stack */ #define ctx (0) @@ -150,13 +150,13 @@ mov RWhi, REhi, lsr#14; \ eor RWlo, RWlo, RElo, lsr#18; \ eor RWhi, RWhi, REhi, lsr#18; \ - ldr RT3lo, [%sp, #(_f)]; \ + ldr RT3lo, [sp, #(_f)]; \ adds RT1lo, RT2lo; /* t1 += K */ \ - ldr RT3hi, [%sp, #(_f) + 4]; \ + ldr RT3hi, [sp, #(_f) + 4]; \ adc RT1hi, RT2hi; \ - ldr RT4lo, [%sp, #(_g)]; \ + ldr RT4lo, [sp, #(_g)]; \ eor RWlo, RWlo, RElo, lsl#23; \ - ldr RT4hi, [%sp, #(_g) + 4]; \ + ldr RT4hi, [sp, #(_g) + 4]; \ eor RWhi, RWhi, REhi, lsl#23; \ eor RWlo, RWlo, REhi, lsl#18; \ eor RWhi, RWhi, RElo, lsl#18; \ @@ -177,29 +177,29 @@ \ /* Load D */ \ /* t1 += Cho(_e,_f,_g) */ \ - ldr RElo, [%sp, #(_d)]; \ + ldr RElo, [sp, #(_d)]; \ adds RT1lo, RT3lo; \ - ldr REhi, [%sp, #(_d) + 4]; \ + ldr REhi, [sp, #(_d) + 4]; \ adc RT1hi, RT3hi; \ \ /* Load A */ \ - ldr RT3lo, [%sp, #(_a)]; \ + ldr RT3lo, [sp, #(_a)]; \ \ /* _d += t1 */ \ adds RElo, RT1lo; \ - ldr RT3hi, [%sp, #(_a) + 4]; \ + ldr RT3hi, [sp, #(_a) + 4]; \ adc REhi, RT1hi; \ \ /* Store D */ \ - str RElo, [%sp, #(_d)]; \ + str RElo, [sp, #(_d)]; \ \ /* t2 = Sum0(_a) */ \ mov RT2lo, RT3lo, lsr#28; \ - str REhi, [%sp, #(_d) + 4]; \ + str REhi, [sp, #(_d) + 4]; \ mov RT2hi, RT3hi, lsr#28; \ - ldr RWlo, [%sp, #(_b)]; \ + ldr RWlo, [sp, #(_b)]; \ eor RT2lo, RT2lo, RT3lo, lsl#30; \ - ldr RWhi, [%sp, #(_b) + 4]; \ + ldr RWhi, [sp, #(_b) + 4]; \ eor RT2hi, RT2hi, RT3hi, lsl#30; \ eor RT2lo, RT2lo, RT3lo, lsl#25; \ eor RT2hi, RT2hi, RT3hi, lsl#25; \ @@ -212,11 +212,11 @@ \ /* t2 += t1 */ \ adds RT2lo, RT1lo; \ - ldr RT1lo, [%sp, #(_c)]; \ + ldr 
RT1lo, [sp, #(_c)]; \ adc RT2hi, RT1hi; \ \ /* Maj(_a,_b,_c) => ((_a & _b) ^ (_c & (_a ^ _b))) */ \ - ldr RT1hi, [%sp, #(_c) + 4]; \ + ldr RT1hi, [sp, #(_c) + 4]; \ and RT4lo, RWlo, RT3lo; \ and RT4hi, RWhi, RT3hi; \ eor RWlo, RWlo, RT3lo; \ @@ -229,36 +229,36 @@ /* Message expansion */ #define W_0_63(_a,_h,i) \ - ldr RT3lo, [%sp, #(w(i-2))]; \ + ldr RT3lo, [sp, #(w(i-2))]; \ adds RT2lo, RWlo; /* _h = t2 + Maj(_a,_b,_c) */ \ - ldr RT3hi, [%sp, #(w(i-2)) + 4]; \ + ldr RT3hi, [sp, #(w(i-2)) + 4]; \ adc RT2hi, RWhi; \ /* nw = S1(w[i-2]) */ \ - ldr RT1lo, [%sp, #(_h)]; /* Load H */ \ + ldr RT1lo, [sp, #(_h)]; /* Load H */ \ mov RWlo, RT3lo, lsr#19; \ - str RT2lo, [%sp, #(_a)]; \ + str RT2lo, [sp, #(_a)]; \ eor RWlo, RWlo, RT3lo, lsl#3; \ - ldr RT1hi, [%sp, #(_h) + 4]; \ + ldr RT1hi, [sp, #(_h) + 4]; \ mov RWhi, RT3hi, lsr#19; \ - ldr RT2lo, [%sp, #(w(i-7))]; \ + ldr RT2lo, [sp, #(w(i-7))]; \ eor RWhi, RWhi, RT3hi, lsl#3; \ - str RT2hi, [%sp, #(_a) + 4]; \ + str RT2hi, [sp, #(_a) + 4]; \ eor RWlo, RWlo, RT3lo, lsr#6; \ - ldr RT2hi, [%sp, #(w(i-7)) + 4]; \ + ldr RT2hi, [sp, #(w(i-7)) + 4]; \ eor RWhi, RWhi, RT3hi, lsr#6; \ eor RWlo, RWlo, RT3hi, lsl#13; \ eor RWhi, RWhi, RT3lo, lsl#13; \ eor RWlo, RWlo, RT3hi, lsr#29; \ eor RWhi, RWhi, RT3lo, lsr#29; \ - ldr RT3lo, [%sp, #(w(i-15))]; \ + ldr RT3lo, [sp, #(w(i-15))]; \ eor RWlo, RWlo, RT3hi, lsl#26; \ - ldr RT3hi, [%sp, #(w(i-15)) + 4]; \ + ldr RT3hi, [sp, #(w(i-15)) + 4]; \ \ adds RT2lo, RWlo; /* nw += w[i-7] */ \ - ldr RWlo, [%sp, #(w(i-16))]; \ + ldr RWlo, [sp, #(w(i-16))]; \ adc RT2hi, RWhi; \ mov RT4lo, RT3lo, lsr#1; /* S0(w[i-15]) */ \ - ldr RWhi, [%sp, #(w(i-16)) + 4]; \ + ldr RWhi, [sp, #(w(i-16)) + 4]; \ mov RT4hi, RT3hi, lsr#1; \ adds RT2lo, RWlo; /* nw += w[i-16] */ \ eor RT4lo, RT4lo, RT3lo, lsr#8; \ @@ -277,20 +277,20 @@ adc RT2hi, RT4hi; \ \ /* w[0] = nw */ \ - str RT2lo, [%sp, #(w(i))]; \ + str RT2lo, [sp, #(w(i))]; \ adds RT1lo, RWlo; \ - str RT2hi, [%sp, #(w(i)) + 4]; \ + str RT2hi, [sp, #(w(i)) + 4]; \ adc RT1hi, RWhi; #define W_64_79(_a,_h,i) \ adds RT2lo, RWlo; /* _h = t2 + Maj(_a,_b,_c) */ \ - ldr RWlo, [%sp, #(w(i-16))]; \ + ldr RWlo, [sp, #(w(i-16))]; \ adc RT2hi, RWhi; \ - ldr RWhi, [%sp, #(w(i-16)) + 4]; \ - ldr RT1lo, [%sp, #(_h)]; /* Load H */ \ - ldr RT1hi, [%sp, #(_h) + 4]; \ - str RT2lo, [%sp, #(_a)]; \ - str RT2hi, [%sp, #(_a) + 4]; \ + ldr RWhi, [sp, #(w(i-16)) + 4]; \ + ldr RT1lo, [sp, #(_h)]; /* Load H */ \ + ldr RT1hi, [sp, #(_h) + 4]; \ + str RT2lo, [sp, #(_a)]; \ + str RT2hi, [sp, #(_a) + 4]; \ adds RT1lo, RWlo; \ adc RT1hi, RWhi; @@ -300,72 +300,72 @@ _gcry_sha512_transform_arm: /* Input: - * %r0: SHA512_CONTEXT - * %r1: data - * %r2: u64 k[] constants - * %r3: nblks + * r0: SHA512_CONTEXT + * r1: data + * r2: u64 k[] constants + * r3: nblks */ - push {%r4-%r11, %ip, %lr}; - sub %sp, %sp, #STACK_MAX; - movs RWlo, %r3; - str %r0, [%sp, #(ctx)]; + push {r4-r11, ip, lr}; + sub sp, sp, #STACK_MAX; + movs RWlo, r3; + str r0, [sp, #(ctx)]; beq .Ldone; .Loop_blocks: - str RWlo, [%sp, #nblks]; + str RWlo, [sp, #nblks]; /* Load context to stack */ - add RWhi, %sp, #(_a); - ldm %r0!, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi} + add RWhi, sp, #(_a); + ldm r0!, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi} stm RWhi!, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi} - ldm %r0, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi} + ldm r0, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi} stm RWhi, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi} /* Load input to w[16] */ /* test if data is unaligned */ - tst 
%r1, #3; + tst r1, #3; beq 1f; /* unaligned load */ - add RWhi, %sp, #(w(0)); - read_be64_unaligned_4(%r1, 0 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo); + add RWhi, sp, #(w(0)); + read_be64_unaligned_4(r1, 0 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo); stm RWhi!, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi} - read_be64_unaligned_4(%r1, 4 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo); + read_be64_unaligned_4(r1, 4 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo); stm RWhi!, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi} - read_be64_unaligned_4(%r1, 8 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo); + read_be64_unaligned_4(r1, 8 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo); stm RWhi!, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi} - read_be64_unaligned_4(%r1, 12 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo); + read_be64_unaligned_4(r1, 12 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo); b 2f; 1: /* aligned load */ - add RWhi, %sp, #(w(0)); - read_be64_aligned_4(%r1, 0 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo); + add RWhi, sp, #(w(0)); + read_be64_aligned_4(r1, 0 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo); stm RWhi!, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi} - read_be64_aligned_4(%r1, 4 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo); + read_be64_aligned_4(r1, 4 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo); stm RWhi!, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi} - read_be64_aligned_4(%r1, 8 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo); + read_be64_aligned_4(r1, 8 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo); stm RWhi!, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi} - read_be64_aligned_4(%r1, 12 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo); + read_be64_aligned_4(r1, 12 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo); 2: - add %r1, #(16 * 8); + add r1, #(16 * 8); stm RWhi, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi} - str %r1, [%sp, #(data)]; + str r1, [sp, #(data)]; /* preload E & A */ - ldr RElo, [%sp, #(_e)]; - ldr REhi, [%sp, #(_e) + 4]; + ldr RElo, [sp, #(_e)]; + ldr REhi, [sp, #(_e) + 4]; mov RWlo, #0; - ldr RT2lo, [%sp, #(_a)]; + ldr RT2lo, [sp, #(_a)]; mov RRND, #(80-16); - ldr RT2hi, [%sp, #(_a) + 4]; + ldr RT2hi, [sp, #(_a) + 4]; mov RWhi, #0; .Loop_rounds: @@ -406,58 +406,58 @@ _gcry_sha512_transform_arm: R(_c, _d, _e, _f, _g, _h, _a, _b, W_64_79, 30); R(_b, _c, _d, _e, _f, _g, _h, _a, W_64_79, 31); - ldr %r0, [%sp, #(ctx)]; + ldr r0, [sp, #(ctx)]; adds RT2lo, RWlo; /* _h = t2 + Maj(_a,_b,_c) */ - ldr %r1, [%sp, #(data)]; + ldr r1, [sp, #(data)]; adc RT2hi, RWhi; - ldm %r0, {RT1lo,RT1hi,RWlo,RWhi,RT3lo,RT3hi,RT4lo,RT4hi} + ldm r0, {RT1lo,RT1hi,RWlo,RWhi,RT3lo,RT3hi,RT4lo,RT4hi} adds RT1lo, RT2lo; - ldr RT2lo, [%sp, #(_b + 0)]; + ldr RT2lo, [sp, #(_b + 0)]; adc RT1hi, RT2hi; - ldr RT2hi, [%sp, #(_b + 4)]; + ldr RT2hi, [sp, #(_b + 4)]; adds RWlo, RT2lo; - ldr RT2lo, [%sp, #(_c + 0)]; + ldr RT2lo, [sp, #(_c + 0)]; adc RWhi, RT2hi; - ldr RT2hi, [%sp, #(_c + 4)]; + ldr RT2hi, [sp, #(_c + 4)]; adds RT3lo, RT2lo; - ldr RT2lo, [%sp, #(_d + 0)]; + ldr RT2lo, [sp, #(_d + 0)]; adc RT3hi, RT2hi; - ldr RT2hi, [%sp, #(_d + 4)]; + ldr RT2hi, [sp, #(_d + 4)]; adds RT4lo, RT2lo; - ldr RT2lo, [%sp, #(_e + 0)]; + ldr RT2lo, [sp, #(_e 
+ 0)]; adc RT4hi, RT2hi; - stm %r0!, {RT1lo,RT1hi,RWlo,RWhi,RT3lo,RT3hi,RT4lo,RT4hi} + stm r0!, {RT1lo,RT1hi,RWlo,RWhi,RT3lo,RT3hi,RT4lo,RT4hi} - ldr RT2hi, [%sp, #(_e + 4)]; - ldm %r0, {RT1lo,RT1hi,RWlo,RWhi,RT3lo,RT3hi,RT4lo,RT4hi} + ldr RT2hi, [sp, #(_e + 4)]; + ldm r0, {RT1lo,RT1hi,RWlo,RWhi,RT3lo,RT3hi,RT4lo,RT4hi} adds RT1lo, RT2lo; - ldr RT2lo, [%sp, #(_f + 0)]; + ldr RT2lo, [sp, #(_f + 0)]; adc RT1hi, RT2hi; - ldr RT2hi, [%sp, #(_f + 4)]; + ldr RT2hi, [sp, #(_f + 4)]; adds RWlo, RT2lo; - ldr RT2lo, [%sp, #(_g + 0)]; + ldr RT2lo, [sp, #(_g + 0)]; adc RWhi, RT2hi; - ldr RT2hi, [%sp, #(_g + 4)]; + ldr RT2hi, [sp, #(_g + 4)]; adds RT3lo, RT2lo; - ldr RT2lo, [%sp, #(_h + 0)]; + ldr RT2lo, [sp, #(_h + 0)]; adc RT3hi, RT2hi; - ldr RT2hi, [%sp, #(_h + 4)]; + ldr RT2hi, [sp, #(_h + 4)]; adds RT4lo, RT2lo; adc RT4hi, RT2hi; - stm %r0, {RT1lo,RT1hi,RWlo,RWhi,RT3lo,RT3hi,RT4lo,RT4hi} - sub %r0, %r0, #(4 * 8); - ldr RWlo, [%sp, #nblks]; + stm r0, {RT1lo,RT1hi,RWlo,RWhi,RT3lo,RT3hi,RT4lo,RT4hi} + sub r0, r0, #(4 * 8); + ldr RWlo, [sp, #nblks]; sub RK, #(80 * 8); subs RWlo, #1; bne .Loop_blocks; .Ldone: - mov %r0, #STACK_MAX; + mov r0, #STACK_MAX; __out: - add %sp, %sp, #STACK_MAX; - pop {%r4-%r11, %ip, %pc}; + add sp, sp, #STACK_MAX; + pop {r4-r11, ip, pc}; .size _gcry_sha512_transform_arm,.-_gcry_sha512_transform_arm; #endif diff --git a/cipher/sha512-armv7-neon.S b/cipher/sha512-armv7-neon.S index 2b186b47..a1df73b8 100644 --- a/cipher/sha512-armv7-neon.S +++ b/cipher/sha512-armv7-neon.S @@ -40,7 +40,7 @@ #define hd_g ((hd_f) + 8) /* register macros */ -#define RK %r2 +#define RK r2 #define RA d0 #define RB d1 @@ -287,26 +287,26 @@ _gcry_sha512_transform_armv7_neon: /* Input: - * %r0: SHA512_CONTEXT - * %r1: data - * %r2: u64 k[] constants - * %r3: nblks + * r0: SHA512_CONTEXT + * r1: data + * r2: u64 k[] constants + * r3: nblks */ - push {%lr}; + push {lr}; - mov %lr, #0; + mov lr, #0; /* Load context to d0-d7 */ - vld1.64 {RA-RD}, [%r0]!; - vld1.64 {RE-RH}, [%r0]; - sub %r0, #(4*8); + vld1.64 {RA-RD}, [r0]!; + vld1.64 {RE-RH}, [r0]; + sub r0, #(4*8); /* Load input to w[16], d16-d31 */ /* NOTE: Assumes that on ARMv7 unaligned accesses are always allowed. 
*/ - vld1.64 {RW0-RW3}, [%r1]!; - vld1.64 {RW4-RW7}, [%r1]!; - vld1.64 {RW8-RW11}, [%r1]!; - vld1.64 {RW12-RW15}, [%r1]!; + vld1.64 {RW0-RW3}, [r1]!; + vld1.64 {RW4-RW7}, [r1]!; + vld1.64 {RW8-RW11}, [r1]!; + vld1.64 {RW12-RW15}, [r1]!; #ifdef __ARMEL__ /* byteswap */ vrev64.8 RW01q, RW01q; @@ -334,46 +334,46 @@ _gcry_sha512_transform_armv7_neon: rounds2_0_63(RC, RD, RE, RF, RG, RH, RA, RB, RW6, RW7, RW67q, RW8, RW89q, RW45q, RW15, RW0, vadd_RT01q, RW45q); rounds2_0_63(RA, RB, RC, RD, RE, RF, RG, RH, RW8, RW9, RW89q, RW10, RW1011q, RW67q, RW1, RW2, vadd_RT01q, RW67q); rounds2_0_63(RG, RH, RA, RB, RC, RD, RE, RF, RW10, RW11, RW1011q, RW12, RW1213q, RW89q, RW3, RW4, vadd_RT01q, RW89q); - add %lr, #16; + add lr, #16; rounds2_0_63(RE, RF, RG, RH, RA, RB, RC, RD, RW12, RW13, RW1213q, RW14, RW1415q, RW1011q, RW5, RW6, vadd_RT01q, RW1011q); - cmp %lr, #64; + cmp lr, #64; rounds2_0_63(RC, RD, RE, RF, RG, RH, RA, RB, RW14, RW15, RW1415q, RW0, RW01q, RW1213q, RW7, RW8, vadd_RT01q, RW1213q); bne .Loop_rounds; - subs %r3, #1; + subs r3, #1; rounds2_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW0, RW1, vadd_RT01q, RW1415q, dummy, _); rounds2_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW2, RW3, vadd_rg_RT0, RG, vadd_rg_RT1, RG); beq .Lhandle_tail; - vld1.64 {RW0-RW3}, [%r1]!; + vld1.64 {RW0-RW3}, [r1]!; rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW4, RW5, vadd_rg_RT0, RE, vadd_rg_RT1, RE); rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW6, RW7, vadd_rg_RT0, RC, vadd_rg_RT1, RC); #ifdef __ARMEL__ vrev64.8 RW01q, RW01q; vrev64.8 RW23q, RW23q; #endif - vld1.64 {RW4-RW7}, [%r1]!; + vld1.64 {RW4-RW7}, [r1]!; rounds2_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW8, RW9, vadd_rg_RT0, RA, vadd_rg_RT1, RA); rounds2_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW10, RW11, vadd_rg_RT0, RG, vadd_rg_RT1, RG); #ifdef __ARMEL__ vrev64.8 RW45q, RW45q; vrev64.8 RW67q, RW67q; #endif - vld1.64 {RW8-RW11}, [%r1]!; + vld1.64 {RW8-RW11}, [r1]!; rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW12, RW13, vadd_rg_RT0, RE, vadd_rg_RT1, RE); rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW14, RW15, vadd_rg_RT0, RC, vadd_rg_RT1, RC); #ifdef __ARMEL__ vrev64.8 RW89q, RW89q; vrev64.8 RW1011q, RW1011q; #endif - vld1.64 {RW12-RW15}, [%r1]!; + vld1.64 {RW12-RW15}, [r1]!; vadd_rg_RT0(RA); vadd_rg_RT1(RA); /* Load context */ - vld1.64 {RT0-RT3}, [%r0]!; - vld1.64 {RT4-RT7}, [%r0]; - sub %r0, #(4*8); + vld1.64 {RT0-RT3}, [r0]!; + vld1.64 {RT4-RT7}, [r0]; + sub r0, #(4*8); #ifdef __ARMEL__ vrev64.8 RW1213q, RW1213q; @@ -390,11 +390,11 @@ _gcry_sha512_transform_armv7_neon: vadd.u64 RH, RT7; /* Store the first half of context */ - vst1.64 {RA-RD}, [%r0]!; + vst1.64 {RA-RD}, [r0]!; sub RK, $(8*80); - vst1.64 {RE-RH}, [%r0]; /* Store the last half of context */ - mov %lr, #0; - sub %r0, #(4*8); + vst1.64 {RE-RH}, [r0]; /* Store the last half of context */ + mov lr, #0; + sub r0, #(4*8); b .Loop; .ltorg @@ -408,11 +408,11 @@ _gcry_sha512_transform_armv7_neon: rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW14, RW15, vadd_rg_RT0, RC, vadd_rg_RT1, RC); /* Load context to d16-d23 */ - vld1.64 {RW0-RW3}, [%r0]!; + vld1.64 {RW0-RW3}, [r0]!; vadd_rg_RT0(RA); - vld1.64 {RW4-RW7}, [%r0]; + vld1.64 {RW4-RW7}, [r0]; vadd_rg_RT1(RA); - sub %r0, #(4*8); + sub r0, #(4*8); vadd.u64 RA, RW0; vadd.u64 RB, RW1; @@ -424,7 +424,7 @@ _gcry_sha512_transform_armv7_neon: vadd.u64 RH, RW7; /* Store the first half of context */ - vst1.64 {RA-RD}, [%r0]!; + vst1.64 {RA-RD}, [r0]!; /* Clear used registers */ /* d16-d31 */ @@ -432,7 +432,7 @@ _gcry_sha512_transform_armv7_neon: 
CLEAR_REG(RW23q); CLEAR_REG(RW45q); CLEAR_REG(RW67q); - vst1.64 {RE-RH}, [%r0]; /* Store the last half of context */ + vst1.64 {RE-RH}, [r0]; /* Store the last half of context */ CLEAR_REG(RW89q); CLEAR_REG(RW1011q); CLEAR_REG(RW1213q); @@ -440,13 +440,13 @@ _gcry_sha512_transform_armv7_neon: /* d8-d15 */ vpop {RT0-RT7}; /* d0-d7 (q0-q3) */ - CLEAR_REG(%q0); - CLEAR_REG(%q1); - CLEAR_REG(%q2); - CLEAR_REG(%q3); + CLEAR_REG(q0); + CLEAR_REG(q1); + CLEAR_REG(q2); + CLEAR_REG(q3); - eor %r0, %r0; - pop {%pc}; + eor r0, r0; + pop {pc}; .size _gcry_sha512_transform_armv7_neon,.-_gcry_sha512_transform_armv7_neon; #endif diff --git a/cipher/twofish-arm.S b/cipher/twofish-arm.S index 2e1da6cd..b381e546 100644 --- a/cipher/twofish-arm.S +++ b/cipher/twofish-arm.S @@ -37,25 +37,25 @@ #define k ((w) + 4 * 8) /* register macros */ -#define CTX %r0 -#define CTXs0 %r0 -#define CTXs1 %r1 -#define CTXs3 %r7 +#define CTX r0 +#define CTXs0 r0 +#define CTXs1 r1 +#define CTXs3 r7 -#define RA %r3 -#define RB %r4 -#define RC %r5 -#define RD %r6 +#define RA r3 +#define RB r4 +#define RC r5 +#define RD r6 -#define RX %r2 -#define RY %ip +#define RX r2 +#define RY ip -#define RMASK %lr +#define RMASK lr -#define RT0 %r8 -#define RT1 %r9 -#define RT2 %r10 -#define RT3 %r11 +#define RT0 r8 +#define RT1 r9 +#define RT2 r10 +#define RT3 r11 /* helper macros */ #define ldr_unaligned_le(rout, rsrc, offs, rtmp) \ @@ -262,15 +262,15 @@ _gcry_twofish_arm_encrypt_block: /* input: - * %r0: ctx - * %r1: dst - * %r2: src + * r0: ctx + * r1: dst + * r2: src */ - push {%r1, %r4-%r11, %ip, %lr}; + push {r1, r4-r11, ip, lr}; add RY, CTXs0, #w; - ldr_input_le(%r2, RA, RB, RC, RD, RT0); + ldr_input_le(r2, RA, RB, RC, RD, RT0); /* Input whitening */ ldm RY, {RT0, RT1, RT2, RT3}; @@ -292,7 +292,7 @@ _gcry_twofish_arm_encrypt_block: last_encrypt_cycle(7); add RY, CTXs3, #(w + 4*4 - s3); - pop {%r1}; /* dst */ + pop {r1}; /* dst */ /* Output whitening */ ldm RY, {RT0, RT1, RT2, RT3}; @@ -301,9 +301,9 @@ _gcry_twofish_arm_encrypt_block: eor RA, RA, RT2; eor RB, RB, RT3; - str_output_le(%r1, RC, RD, RA, RB, RT0, RT1); + str_output_le(r1, RC, RD, RA, RB, RT0, RT1); - pop {%r4-%r11, %ip, %pc}; + pop {r4-r11, ip, pc}; .ltorg .size _gcry_twofish_arm_encrypt_block,.-_gcry_twofish_arm_encrypt_block; @@ -313,15 +313,15 @@ _gcry_twofish_arm_encrypt_block: _gcry_twofish_arm_decrypt_block: /* input: - * %r0: ctx - * %r1: dst - * %r2: src + * r0: ctx + * r1: dst + * r2: src */ - push {%r1, %r4-%r11, %ip, %lr}; + push {r1, r4-r11, ip, lr}; add CTXs3, CTXs0, #(s3 - s0); - ldr_input_le(%r2, RC, RD, RA, RB, RT0); + ldr_input_le(r2, RC, RD, RA, RB, RT0); add RY, CTXs3, #(w + 4*4 - s3); add CTXs3, CTXs0, #(s3 - s0); @@ -345,7 +345,7 @@ _gcry_twofish_arm_decrypt_block: last_decrypt_cycle(0); add RY, CTXs0, #w; - pop {%r1}; /* dst */ + pop {r1}; /* dst */ /* Output whitening */ ldm RY, {RT0, RT1, RT2, RT3}; @@ -354,9 +354,9 @@ _gcry_twofish_arm_decrypt_block: eor RC, RC, RT2; eor RD, RD, RT3; - str_output_le(%r1, RA, RB, RC, RD, RT0, RT1); + str_output_le(r1, RA, RB, RC, RD, RT0, RT1); - pop {%r4-%r11, %ip, %pc}; + pop {r4-r11, ip, pc}; .size _gcry_twofish_arm_decrypt_block,.-_gcry_twofish_arm_decrypt_block; #endif /*HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS*/ diff --git a/configure.ac b/configure.ac index cd804305..cc1104ca 100644 --- a/configure.ac +++ b/configure.ac @@ -1181,7 +1181,7 @@ AC_CACHE_CHECK([whether GCC assembler is compatible for ARM assembly implementat ".text\n\t" /* Following causes error if assembler ignored '.syntax unified'. 
*/ "asmfunc:\n\t" - "add %r0, %r0, %r4, ror #12;\n\t" + "add r0, r0, r4, ror #12;\n\t" /* Test if '.type' and '.size' are supported. */ ".size asmfunc,.-asmfunc;\n\t" @@ -1864,10 +1864,10 @@ AC_CACHE_CHECK([whether GCC inline assembler supports NEON instructions], ".fpu neon\n\t" ".text\n\t" "testfn:\n\t" - "vld1.64 {%q0-%q1}, [%r0]!;\n\t" - "vrev64.8 %q0, %q3;\n\t" - "vadd.u64 %q0, %q1;\n\t" - "vadd.s64 %d3, %d2, %d3;\n\t" + "vld1.64 {q0-q1}, [r0]!;\n\t" + "vrev64.8 q0, q3;\n\t" + "vadd.u64 q0, q1;\n\t" + "vadd.s64 d3, d2, d3;\n\t" ); void testfn(void); ]], [ testfn(); ])], diff --git a/mpi/arm/mpih-add1.S b/mpi/arm/mpih-add1.S index 09e8b3b2..d59d3f3d 100644 --- a/mpi/arm/mpih-add1.S +++ b/mpi/arm/mpih-add1.S @@ -29,10 +29,10 @@ /******************* * mpi_limb_t - * _gcry_mpih_add_n( mpi_ptr_t res_ptr, %r0 - * mpi_ptr_t s1_ptr, %r1 - * mpi_ptr_t s2_ptr, %r2 - * mpi_size_t size) %r3 + * _gcry_mpih_add_n( mpi_ptr_t res_ptr, r0 + * mpi_ptr_t s1_ptr, r1 + * mpi_ptr_t s2_ptr, r2 + * mpi_size_t size) r3 */ .text @@ -40,37 +40,37 @@ .globl _gcry_mpih_add_n .type _gcry_mpih_add_n,%function _gcry_mpih_add_n: - push {%r4, %r5, %r6, %r7, %r8, %r9, %r10, %lr}; - cmn %r0, #0; /* clear carry flag */ + push {r4, r5, r6, r7, r8, r9, r10, lr}; + cmn r0, #0; /* clear carry flag */ - tst %r3, #3; + tst r3, #3; beq .Large_loop; .Loop: - ldr %r4, [%r1], #4; - sub %r3, #1; - ldr %lr, [%r2], #4; - adcs %r4, %lr; - tst %r3, #3; - str %r4, [%r0], #4; + ldr r4, [r1], #4; + sub r3, #1; + ldr lr, [r2], #4; + adcs r4, lr; + tst r3, #3; + str r4, [r0], #4; bne .Loop; - teq %r3, #0; + teq r3, #0; beq .Lend; .Large_loop: - ldm %r1!, {%r4, %r6, %r8, %r10}; - ldm %r2!, {%r5, %r7, %r9, %lr}; - sub %r3, #4; - adcs %r4, %r5; - adcs %r6, %r7; - adcs %r8, %r9; - adcs %r10, %lr; - teq %r3, #0; - stm %r0!, {%r4, %r6, %r8, %r10}; + ldm r1!, {r4, r6, r8, r10}; + ldm r2!, {r5, r7, r9, lr}; + sub r3, #4; + adcs r4, r5; + adcs r6, r7; + adcs r8, r9; + adcs r10, lr; + teq r3, #0; + stm r0!, {r4, r6, r8, r10}; bne .Large_loop; .Lend: - adc %r0, %r3, #0; - pop {%r4, %r5, %r6, %r7, %r8, %r9, %r10, %pc}; + adc r0, r3, #0; + pop {r4, r5, r6, r7, r8, r9, r10, pc}; .size _gcry_mpih_add_n,.-_gcry_mpih_add_n; diff --git a/mpi/arm/mpih-mul1.S b/mpi/arm/mpih-mul1.S index c2e2854b..ea196e8b 100644 --- a/mpi/arm/mpih-mul1.S +++ b/mpi/arm/mpih-mul1.S @@ -29,10 +29,10 @@ /******************* * mpi_limb_t - * _gcry_mpih_mul_1( mpi_ptr_t res_ptr, %r0 - * mpi_ptr_t s1_ptr, %r1 - * mpi_size_t s1_size, %r2 - * mpi_limb_t s2_limb) %r3 + * _gcry_mpih_mul_1( mpi_ptr_t res_ptr, r0 + * mpi_ptr_t s1_ptr, r1 + * mpi_size_t s1_size, r2 + * mpi_limb_t s2_limb) r3 */ .text @@ -40,41 +40,41 @@ .globl _gcry_mpih_mul_1 .type _gcry_mpih_mul_1,%function _gcry_mpih_mul_1: - push {%r4, %r5, %r6, %r7, %r8, %r9, %r10, %r11, %lr}; - mov %r4, #0; + push {r4, r5, r6, r7, r8, r9, r10, r11, lr}; + mov r4, #0; - tst %r2, #3; + tst r2, #3; beq .Large_loop; .Loop: - ldr %r5, [%r1], #4; - mov %lr, #0; - umlal %r4, %lr, %r5, %r3; - sub %r2, #1; - str %r4, [%r0], #4; - tst %r2, #3; - mov %r4, %lr; + ldr r5, [r1], #4; + mov lr, #0; + umlal r4, lr, r5, r3; + sub r2, #1; + str r4, [r0], #4; + tst r2, #3; + mov r4, lr; bne .Loop; - teq %r2, #0; + teq r2, #0; beq .Lend; .Large_loop: - ldm %r1!, {%r5, %r6, %r7, %r8}; - mov %r9, #0; - mov %r10, #0; - umlal %r4, %r9, %r5, %r3; - mov %r11, #0; - umlal %r9, %r10, %r6, %r3; - str %r4, [%r0], #4; - mov %r4, #0; - umlal %r10, %r11, %r7, %r3; - subs %r2, #4; - umlal %r11, %r4, %r8, %r3; - stm %r0!, {%r9, %r10, %r11}; + ldm r1!, {r5, r6, r7, r8}; 
+ mov r9, #0; + mov r10, #0; + umlal r4, r9, r5, r3; + mov r11, #0; + umlal r9, r10, r6, r3; + str r4, [r0], #4; + mov r4, #0; + umlal r10, r11, r7, r3; + subs r2, #4; + umlal r11, r4, r8, r3; + stm r0!, {r9, r10, r11}; bne .Large_loop; .Lend: - mov %r0, %r4; - pop {%r4, %r5, %r6, %r7, %r8, %r9, %r10, %r11, %pc}; + mov r0, r4; + pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}; .size _gcry_mpih_mul_1,.-_gcry_mpih_mul_1; diff --git a/mpi/arm/mpih-mul2.S b/mpi/arm/mpih-mul2.S index bce932e9..8793b20f 100644 --- a/mpi/arm/mpih-mul2.S +++ b/mpi/arm/mpih-mul2.S @@ -29,10 +29,10 @@ /******************* * mpi_limb_t - * _gcry_mpih_addmul_1( mpi_ptr_t res_ptr, %r0 - * mpi_ptr_t s1_ptr, %r1 - * mpi_size_t s1_size, %r2 - * mpi_limb_t s2_limb) %r3 + * _gcry_mpih_addmul_1( mpi_ptr_t res_ptr, r0 + * mpi_ptr_t s1_ptr, r1 + * mpi_size_t s1_size, r2 + * mpi_limb_t s2_limb) r3 */ .text @@ -40,55 +40,55 @@ .globl _gcry_mpih_addmul_1 .type _gcry_mpih_addmul_1,%function _gcry_mpih_addmul_1: - push {%r4, %r5, %r6, %r8, %r10, %lr}; - mov %lr, #0; - cmn %r0, #0; /* clear carry flag */ + push {r4, r5, r6, r8, r10, lr}; + mov lr, #0; + cmn r0, #0; /* clear carry flag */ - tst %r2, #3; + tst r2, #3; beq .Large_loop; .Loop: - ldr %r5, [%r1], #4; - ldr %r4, [%r0]; - sub %r2, #1; - adcs %r4, %lr; - mov %lr, #0; - umlal %r4, %lr, %r5, %r3; - tst %r2, #3; - str %r4, [%r0], #4; + ldr r5, [r1], #4; + ldr r4, [r0]; + sub r2, #1; + adcs r4, lr; + mov lr, #0; + umlal r4, lr, r5, r3; + tst r2, #3; + str r4, [r0], #4; bne .Loop; - teq %r2, #0; + teq r2, #0; beq .Lend; .Large_loop: - ldr %r5, [%r1], #4; - ldm %r0, {%r4, %r6, %r8, %r10}; + ldr r5, [r1], #4; + ldm r0, {r4, r6, r8, r10}; - sub %r2, #4; - adcs %r4, %lr; - mov %lr, #0; - umlal %r4, %lr, %r5, %r3; + sub r2, #4; + adcs r4, lr; + mov lr, #0; + umlal r4, lr, r5, r3; - ldr %r5, [%r1], #4; - adcs %r6, %lr; - mov %lr, #0; - umlal %r6, %lr, %r5, %r3; + ldr r5, [r1], #4; + adcs r6, lr; + mov lr, #0; + umlal r6, lr, r5, r3; - ldr %r5, [%r1], #4; - adcs %r8, %lr; - mov %lr, #0; - umlal %r8, %lr, %r5, %r3; + ldr r5, [r1], #4; + adcs r8, lr; + mov lr, #0; + umlal r8, lr, r5, r3; - ldr %r5, [%r1], #4; - adcs %r10, %lr; - mov %lr, #0; - umlal %r10, %lr, %r5, %r3; + ldr r5, [r1], #4; + adcs r10, lr; + mov lr, #0; + umlal r10, lr, r5, r3; - teq %r2, #0; - stm %r0!, {%r4, %r6, %r8, %r10}; + teq r2, #0; + stm r0!, {r4, r6, r8, r10}; bne .Large_loop; .Lend: - adc %r0, %lr, #0; - pop {%r4, %r5, %r6, %r8, %r10, %pc}; + adc r0, lr, #0; + pop {r4, r5, r6, r8, r10, pc}; .size _gcry_mpih_addmul_1,.-_gcry_mpih_addmul_1; diff --git a/mpi/arm/mpih-mul3.S b/mpi/arm/mpih-mul3.S index 33326c78..2477c089 100644 --- a/mpi/arm/mpih-mul3.S +++ b/mpi/arm/mpih-mul3.S @@ -29,10 +29,10 @@ /******************* * mpi_limb_t - * _gcry_mpih_submul_1( mpi_ptr_t res_ptr, %r0 - * mpi_ptr_t s1_ptr, %r1 - * mpi_size_t s1_size, %r2 - * mpi_limb_t s2_limb) %r3 + * _gcry_mpih_submul_1( mpi_ptr_t res_ptr, r0 + * mpi_ptr_t s1_ptr, r1 + * mpi_size_t s1_size, r2 + * mpi_limb_t s2_limb) r3 */ .text @@ -40,61 +40,61 @@ .globl _gcry_mpih_submul_1 .type _gcry_mpih_submul_1,%function _gcry_mpih_submul_1: - push {%r4, %r5, %r6, %r8, %r9, %r10, %lr}; - mov %lr, #0; - cmp %r0, #0; /* prepare carry flag for sbc */ + push {r4, r5, r6, r8, r9, r10, lr}; + mov lr, #0; + cmp r0, #0; /* prepare carry flag for sbc */ - tst %r2, #3; + tst r2, #3; beq .Large_loop; .Loop: - ldr %r5, [%r1], #4; - mov %r4, %lr; - mov %lr, #0; - ldr %r6, [%r0]; - umlal %r4, %lr, %r5, %r3; - sub %r2, #1; - sbcs %r4, %r6, %r4; - tst %r2, #3; - str %r4, [%r0], #4; + 
ldr r5, [r1], #4; + mov r4, lr; + mov lr, #0; + ldr r6, [r0]; + umlal r4, lr, r5, r3; + sub r2, #1; + sbcs r4, r6, r4; + tst r2, #3; + str r4, [r0], #4; bne .Loop; - teq %r2, #0; + teq r2, #0; beq .Lend; .Large_loop: - ldr %r5, [%r1], #4; - mov %r9, #0; - ldr %r4, [%r0, #0]; + ldr r5, [r1], #4; + mov r9, #0; + ldr r4, [r0, #0]; - umlal %lr, %r9, %r5, %r3; - ldr %r6, [%r0, #4]; - ldr %r5, [%r1], #4; - sbcs %r4, %r4, %lr; + umlal lr, r9, r5, r3; + ldr r6, [r0, #4]; + ldr r5, [r1], #4; + sbcs r4, r4, lr; - mov %lr, #0; - umlal %r9, %lr, %r5, %r3; - ldr %r8, [%r0, #8]; - ldr %r5, [%r1], #4; - sbcs %r6, %r6, %r9; + mov lr, #0; + umlal r9, lr, r5, r3; + ldr r8, [r0, #8]; + ldr r5, [r1], #4; + sbcs r6, r6, r9; - mov %r9, #0; - umlal %lr, %r9, %r5, %r3; - ldr %r10, [%r0, #12]; - ldr %r5, [%r1], #4; - sbcs %r8, %r8, %lr; + mov r9, #0; + umlal lr, r9, r5, r3; + ldr r10, [r0, #12]; + ldr r5, [r1], #4; + sbcs r8, r8, lr; - mov %lr, #0; - umlal %r9, %lr, %r5, %r3; - sub %r2, #4; - sbcs %r10, %r10, %r9; + mov lr, #0; + umlal r9, lr, r5, r3; + sub r2, #4; + sbcs r10, r10, r9; - teq %r2, #0; - stm %r0!, {%r4, %r6, %r8, %r10}; + teq r2, #0; + stm r0!, {r4, r6, r8, r10}; bne .Large_loop; .Lend: it cc - movcc %r2, #1; - add %r0, %lr, %r2; - pop {%r4, %r5, %r6, %r8, %r9, %r10, %pc}; + movcc r2, #1; + add r0, lr, r2; + pop {r4, r5, r6, r8, r9, r10, pc}; .size _gcry_mpih_submul_1,.-_gcry_mpih_submul_1; diff --git a/mpi/arm/mpih-sub1.S b/mpi/arm/mpih-sub1.S index 593e3cde..476d8a33 100644 --- a/mpi/arm/mpih-sub1.S +++ b/mpi/arm/mpih-sub1.S @@ -29,10 +29,10 @@ /******************* * mpi_limb_t - * _gcry_mpih_sub_n( mpi_ptr_t res_ptr, %r0 - * mpi_ptr_t s1_ptr, %r1 - * mpi_ptr_t s2_ptr, %r2 - * mpi_size_t size) %r3 + * _gcry_mpih_sub_n( mpi_ptr_t res_ptr, r0 + * mpi_ptr_t s1_ptr, r1 + * mpi_ptr_t s2_ptr, r2 + * mpi_size_t size) r3 */ .text @@ -40,38 +40,38 @@ .globl _gcry_mpih_sub_n .type _gcry_mpih_sub_n,%function _gcry_mpih_sub_n: - push {%r4, %r5, %r6, %r7, %r8, %r9, %r10, %lr}; - cmp %r0, #0; /* prepare carry flag for sub */ + push {r4, r5, r6, r7, r8, r9, r10, lr}; + cmp r0, #0; /* prepare carry flag for sub */ - tst %r3, #3; + tst r3, #3; beq .Large_loop; .Loop: - ldr %r4, [%r1], #4; - sub %r3, #1; - ldr %lr, [%r2], #4; - sbcs %r4, %lr; - tst %r3, #3; - str %r4, [%r0], #4; + ldr r4, [r1], #4; + sub r3, #1; + ldr lr, [r2], #4; + sbcs r4, lr; + tst r3, #3; + str r4, [r0], #4; bne .Loop; - teq %r3, #0; + teq r3, #0; beq .Lend; .Large_loop: - ldm %r1!, {%r4, %r6, %r8, %r10}; - sub %r3, #4; - ldm %r2!, {%r5, %r7, %r9, %lr}; - sbcs %r4, %r5; - sbcs %r6, %r7; - sbcs %r8, %r9; - sbcs %r10, %lr; - teq %r3, #0; - stm %r0!, {%r4, %r6, %r8, %r10}; + ldm r1!, {r4, r6, r8, r10}; + sub r3, #4; + ldm r2!, {r5, r7, r9, lr}; + sbcs r4, r5; + sbcs r6, r7; + sbcs r8, r9; + sbcs r10, lr; + teq r3, #0; + stm r0!, {r4, r6, r8, r10}; bne .Large_loop; .Lend: - sbc %r0, %r3, #0; - neg %r0, %r0; - pop {%r4, %r5, %r6, %r7, %r8, %r9, %r10, %pc}; + sbc r0, r3, #0; + neg r0, r0; + pop {r4, r5, r6, r7, r8, r9, r10, pc}; .size _gcry_mpih_sub_n,.-_gcry_mpih_sub_n; -- 2.37.2 From mohammed.kraydiye at gmail.com Thu Dec 15 10:59:59 2022 From: mohammed.kraydiye at gmail.com (Mohammed Mohammad) Date: Thu, 15 Dec 2022 10:59:59 +0100 Subject: libgcrypt fips.c review and questions Message-ID: Hello, I'm interested in the concept of fips. I have taken a look at the source code for managing fips in libgcrypt. I have found some code that I hardly understand and in some places I feel that I'm able to contribute to improve some parts of it. 
Is it okay if I can ask some
questions about some of the code blocks and is it okay if I have
suggestions on improving the code?

Best regards!
Mohammed.

From jjelen at redhat.com Fri Dec 16 11:56:56 2022
From: jjelen at redhat.com (Jakub Jelen)
Date: Fri, 16 Dec 2022 11:56:56 +0100
Subject: libgcrypt fips.c review and questions
In-Reply-To: 
References: 
Message-ID: 

On 12/15/22 10:59, Mohammed Mohammad via Gcrypt-devel wrote:
> Hello, I'm interested in the concept of fips. I have taken a look at the
> source code for managing fips in libgcrypt. I have found some code that
> I hardly understand and in some places I feel that I'm able to
> contribute to improve some parts of it. Is it okay if I can ask some
> questions about some of the code blocks and is it okay if I have
> suggestions on improving the code?

Sure, go ahead and ask. I think contributions are always welcome as long
as they do not go against the FIPS requirements.

Regards,
--
Jakub Jelen
Crypto Team, Security Engineering
Red Hat, Inc.