[PATCH] Add CFI unwind assembly directives for AMD64

Jussi Kivilinna <jussi.kivilinna@iki.fi>
Tue Apr 16 22:04:23 CEST 2019


* configure.ac (gcry_cv_gcc_asm_cfi_directives): New.
* cipher/asm-common-amd64.h (ADD_RIP, CFI_STARTPROC, CFI_ENDPROC)
(CFI_REMEMBER_STATE, CFI_RESTORE_STATE, CFI_ADJUST_CFA_OFFSET)
(CFI_REL_OFFSET, CFI_DEF_CFA_REGISTER, CFI_REGISTER, CFI_RESTORE)
(CFI_PUSH, CFI_POP, CFI_POP_TMP_REG, CFI_LEAVE, DW_REGNO)
(DW_SLEB128_7BIT, DW_SLEB128_28BIT, CFI_CFA_ON_STACK)
(CFI_REG_ON_STACK): New.
(ENTER_SYSV_FUNC_PARAMS_0_4, EXIT_SYSV_FUNC): Add CFI directives.
* cipher/arcfour-amd64.S: Add CFI directives.
* cipher/blake2b-amd64-avx2.S: Add CFI directives.
* cipher/blake2s-amd64-avx.S: Add CFI directives.
* cipher/blowfish-amd64.S: Add CFI directives.
* cipher/camellia-aesni-avx-amd64.S: Add CFI directives; Use
'asm-common-amd64.h'.
* cipher/camellia-aesni-avx2-amd64.S: Add CFI directives; Use
'asm-common-amd64.h'.
* cipher/cast5-amd64.S: Add CFI directives.
* cipher/chacha20-amd64-avx2.S: Add CFI directives.
* cipher/chacha20-amd64-ssse3.S: Add CFI directives.
* cipher/des-amd64.S: Add CFI directives.
* cipher/rijndael-amd64.S: Add CFI directives.
* cipher/rijndael-ssse3-amd64-asm.S: Add CFI directives.
* cipher/salsa20-amd64.S: Add CFI directives; Use 'asm-common-amd64.h'.
* cipher/serpent-avx2-amd64.S: Add CFI directives; Use
'asm-common-amd64.h'.
* cipher/serpent-sse2-amd64.S: Add CFI directives; Use
'asm-common-amd64.h'.
* cipher/sha1-avx-amd64.S: Add CFI directives; Use
'asm-common-amd64.h'.
* cipher/sha1-avx-bmi2-amd64.S: Add CFI directives; Use
'asm-common-amd64.h'.
* cipher/sha1-avx2-bmi2-amd64.S: Add CFI directives; Use
'asm-common-amd64.h'.
* cipher/sha1-ssse3-amd64.S: Add CFI directives; Use
'asm-common-amd64.h'.
* cipher/sha256-avx-amd64.S: Add CFI directives; Use
'asm-common-amd64.h'.
* cipher/sha256-avx2-bmi2-amd64.S: Add CFI directives; Use
'asm-common-amd64.h'.
* cipher/sha256-ssse3-amd64.S: Add CFI directives; Use
'asm-common-amd64.h'.
* cipher/sha512-avx-amd64.S: Add CFI directives; Use
'asm-common-amd64.h'.
* cipher/sha512-avx2-bmi2-amd64.S: Add CFI directives; Use
'asm-common-amd64.h'.
* cipher/sha512-ssse3-amd64.S: Add CFI directives; Use
'asm-common-amd64.h'.
* cipher/twofish-amd64.S: Add CFI directives.
* cipher/twofish-avx2-amd64.S: Add CFI directives; Use
'asm-common-amd64.h'.
* cipher/whirlpool-sse2-amd64.S: Add CFI directives; Use
'asm-common-amd64.h'.
* mpi/amd64/func_abi.h: Include 'config.h'.
(CFI_STARTPROC, CFI_ENDPROC, CFI_ADJUST_CFA_OFFSET, CFI_REL_OFFSET)
(CFI_RESTORE, CFI_PUSH, CFI_POP): New.
(FUNC_ENTRY, FUNC_EXIT): Add CFI directives.
--

This commit adds CFI directives that provide DWARF unwinding information,
so that debuggers can generate backtraces while executing code from the
AMD64 assembly files.
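
For illustration, a minimal function annotated with the new helper macros
would look roughly like the sketch below. This assumes 'asm-common-amd64.h'
has been included; 'example_func' is a placeholder name, not one of the
patched functions.

.align 8
.globl example_func
ELF(.type example_func,@function)
example_func:
	CFI_STARTPROC();
	pushq %rbx;
	CFI_PUSH(%rbx);		/* CFA offset grows by 8; %rbx saved at new stack top */
	movq %rdi, %rbx;
	movq %rbx, %rax;
	popq %rbx;
	CFI_POP(%rbx);		/* CFA offset shrinks by 8; %rbx marked as restored */
	ret;
	CFI_ENDPROC();
ELF(.size example_func,.-example_func;)

When HAVE_GCC_ASM_CFI_DIRECTIVES is defined, the macros expand to the
corresponding .cfi_* directives (for example, CFI_PUSH(%rbx) becomes
.cfi_adjust_cfa_offset 8 followed by .cfi_rel_offset %rbx, 0); otherwise
they expand to nothing, so the annotations are a no-op for assemblers
without CFI support.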

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
---

diff --git a/cipher/arcfour-amd64.S b/cipher/arcfour-amd64.S
index c08f3453b..221dfeff7 100644
--- a/cipher/arcfour-amd64.S
+++ b/cipher/arcfour-amd64.S
@@ -25,9 +25,12 @@
 .globl _gcry_arcfour_amd64
 ELF(.type _gcry_arcfour_amd64,@function)
 _gcry_arcfour_amd64:
+	CFI_STARTPROC()
 	ENTER_SYSV_FUNC_PARAMS_0_4
 	push	%rbp
+	CFI_PUSH(%rbp)
 	push	%rbx
+	CFI_PUSH(%rbx)
 	mov	%rdi,		%rbp	# key = ARG(key)
 	mov	%rsi,		%rbx	# rbx = ARG(len)
 	mov	%rdx,		%rsi	# in = ARG(in)
@@ -92,9 +95,12 @@ _gcry_arcfour_amd64:
 	movb	%cl,		(4*256)(%rbp)	# key->y = y
 	movb	%dl,		(4*256+4)(%rbp)	# key->x = x
 	pop	%rbx
+	CFI_POP(%rbx)
 	pop	%rbp
+	CFI_POP(%rbp)
 	EXIT_SYSV_FUNC
 	ret
+	CFI_ENDPROC()
 .L__gcry_arcfour_amd64_end:
 ELF(.size _gcry_arcfour_amd64,.L__gcry_arcfour_amd64_end-_gcry_arcfour_amd64)
 
diff --git a/cipher/asm-common-amd64.h b/cipher/asm-common-amd64.h
index 7eb426495..9d4a028a0 100644
--- a/cipher/asm-common-amd64.h
+++ b/cipher/asm-common-amd64.h
@@ -41,6 +41,12 @@
 #  define RIP
 #endif
 
+#ifdef __PIC__
+#  define ADD_RIP +rip
+#else
+#  define ADD_RIP
+#endif
+
 #if defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS) || !defined(__PIC__)
 #  define GET_EXTERN_POINTER(name, reg) movabsq $name, reg
 #else
@@ -60,10 +66,101 @@
 #  endif
 #endif
 
+#ifdef HAVE_GCC_ASM_CFI_DIRECTIVES
+/* CFI directives to emit DWARF stack unwinding information. */
+# define CFI_STARTPROC()            .cfi_startproc
+# define CFI_ENDPROC()              .cfi_endproc
+# define CFI_REMEMBER_STATE()       .cfi_remember_state
+# define CFI_RESTORE_STATE()        .cfi_restore_state
+# define CFI_ADJUST_CFA_OFFSET(off) .cfi_adjust_cfa_offset off
+# define CFI_REL_OFFSET(reg,off)    .cfi_rel_offset reg, off
+# define CFI_DEF_CFA_REGISTER(reg)  .cfi_def_cfa_register reg
+# define CFI_REGISTER(ro,rn)        .cfi_register ro, rn
+# define CFI_RESTORE(reg)           .cfi_restore reg
+
+# define CFI_PUSH(reg) \
+	CFI_ADJUST_CFA_OFFSET(8); CFI_REL_OFFSET(reg, 0)
+# define CFI_POP(reg) \
+	CFI_ADJUST_CFA_OFFSET(-8); CFI_RESTORE(reg)
+# define CFI_POP_TMP_REG() \
+	CFI_ADJUST_CFA_OFFSET(-8);
+# define CFI_LEAVE() \
+	CFI_ADJUST_CFA_OFFSET(-8); CFI_DEF_CFA_REGISTER(%rsp)
+
+/* CFA expressions are used to point the CFA and saved registers at
+ * %rsp-relative offsets. */
+# define DW_REGNO_rax 0
+# define DW_REGNO_rdx 1
+# define DW_REGNO_rcx 2
+# define DW_REGNO_rbx 3
+# define DW_REGNO_rsi 4
+# define DW_REGNO_rdi 5
+# define DW_REGNO_rbp 6
+# define DW_REGNO_rsp 7
+# define DW_REGNO_r8  8
+# define DW_REGNO_r9  9
+# define DW_REGNO_r10 10
+# define DW_REGNO_r11 11
+# define DW_REGNO_r12 12
+# define DW_REGNO_r13 13
+# define DW_REGNO_r14 14
+# define DW_REGNO_r15 15
+
+# define DW_REGNO(reg) DW_REGNO_ ## reg
+
+/* Fixed-length encoding is used for integers for now. */
+# define DW_SLEB128_7BIT(value) \
+	0x00|((value) & 0x7f)
+# define DW_SLEB128_28BIT(value) \
+	0x80|((value)&0x7f), \
+	0x80|(((value)>>7)&0x7f), \
+	0x80|(((value)>>14)&0x7f), \
+	0x00|(((value)>>21)&0x7f)
+
+# define CFI_CFA_ON_STACK(rsp_offs,cfa_depth) \
+	.cfi_escape \
+	  0x0f, /* DW_CFA_def_cfa_expression */ \
+	    DW_SLEB128_7BIT(11), /* length */ \
+	  0x77, /* DW_OP_breg7, rsp + constant */ \
+	    DW_SLEB128_28BIT(rsp_offs), \
+	  0x06, /* DW_OP_deref */ \
+	  0x23, /* DW_OP_plus_uconst */ \
+	    DW_SLEB128_28BIT((cfa_depth)+8)
+
+# define CFI_REG_ON_STACK(reg,rsp_offs) \
+	.cfi_escape \
+	  0x10, /* DW_CFA_expression */ \
+	    DW_SLEB128_7BIT(DW_REGNO(reg)), \
+	    DW_SLEB128_7BIT(5), /* length */ \
+	  0x77, /* DW_OP_breg7, rsp + constant */ \
+	    DW_SLEB128_28BIT(rsp_offs)
+
+#else
+# define CFI_STARTPROC()
+# define CFI_ENDPROC()
+# define CFI_REMEMBER_STATE()
+# define CFI_RESTORE_STATE()
+# define CFI_ADJUST_CFA_OFFSET(off)
+# define CFI_REL_OFFSET(reg,off)
+# define CFI_DEF_CFA_REGISTER(reg)
+# define CFI_REGISTER(ro,rn)
+# define CFI_RESTORE(reg)
+
+# define CFI_PUSH(reg)
+# define CFI_POP(reg)
+# define CFI_POP_TMP_REG()
+# define CFI_LEAVE()
+
+# define CFI_CFA_ON_STACK(rsp_offs,cfa_depth)
+# define CFI_REG_ON_STACK(reg,rsp_offs)
+#endif
+
 #ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
 # define ENTER_SYSV_FUNC_PARAMS_0_4 \
 	pushq %rdi; \
+	CFI_PUSH(%rdi); \
 	pushq %rsi; \
+	CFI_PUSH(%rsi); \
 	movq %rcx, %rdi; \
 	movq %rdx, %rsi; \
 	movq %r8, %rdx; \
@@ -79,7 +176,9 @@
 
 # define EXIT_SYSV_FUNC \
 	popq %rsi; \
-	popq %rdi;
+	CFI_POP(%rsi); \
+	popq %rdi; \
+	CFI_POP(%rdi);
 #else
 # define ENTER_SYSV_FUNC_PARAMS_0_4
 # define ENTER_SYSV_FUNC_PARAMS_5
diff --git a/cipher/blake2b-amd64-avx2.S b/cipher/blake2b-amd64-avx2.S
index 6bcc5652d..08c816cdf 100644
--- a/cipher/blake2b-amd64-avx2.S
+++ b/cipher/blake2b-amd64-avx2.S
@@ -207,6 +207,7 @@ _gcry_blake2b_transform_amd64_avx2:
          *	%rsi: blks
          *	%rdx: num_blks
          */
+        CFI_STARTPROC();
 
         vzeroupper;
 
@@ -291,6 +292,7 @@ _gcry_blake2b_transform_amd64_avx2:
         xor %eax, %eax;
         vzeroall;
         ret;
+        CFI_ENDPROC();
 ELF(.size _gcry_blake2b_transform_amd64_avx2,
     .-_gcry_blake2b_transform_amd64_avx2;)
 
diff --git a/cipher/blake2s-amd64-avx.S b/cipher/blake2s-amd64-avx.S
index f7312dbd0..198373262 100644
--- a/cipher/blake2s-amd64-avx.S
+++ b/cipher/blake2s-amd64-avx.S
@@ -191,6 +191,7 @@ _gcry_blake2s_transform_amd64_avx:
          *	%rsi: blks
          *	%rdx: num_blks
          */
+        CFI_STARTPROC();
 
         vzeroupper;
 
@@ -269,6 +270,7 @@ _gcry_blake2s_transform_amd64_avx:
         xor %eax, %eax;
         vzeroall;
         ret;
+        CFI_ENDPROC();
 ELF(.size _gcry_blake2s_transform_amd64_avx,
     .-_gcry_blake2s_transform_amd64_avx;)
 
diff --git a/cipher/blowfish-amd64.S b/cipher/blowfish-amd64.S
index 02d3b7102..bdb361d7e 100644
--- a/cipher/blowfish-amd64.S
+++ b/cipher/blowfish-amd64.S
@@ -133,7 +133,9 @@ __blowfish_enc_blk1:
 	 * output:
 	 *	RX0: output plaintext block
 	 */
+	CFI_STARTPROC();
 	movq %rbp, %r11;
+	CFI_REGISTER(%rbp, %r11);
 
 	load_roundkey_enc(0);
 	round_enc(2);
@@ -147,8 +149,10 @@ __blowfish_enc_blk1:
 	add_roundkey_enc();
 
 	movq %r11, %rbp;
+	CFI_RESTORE(%rbp)
 
 	ret;
+	CFI_ENDPROC();
 ELF(.size __blowfish_enc_blk1,.-__blowfish_enc_blk1;)
 
 .align 8
@@ -161,6 +165,7 @@ _gcry_blowfish_amd64_do_encrypt:
 	 *	%rsi: u32 *ret_xl
 	 *	%rdx: u32 *ret_xr
 	 */
+	CFI_STARTPROC();
 	ENTER_SYSV_FUNC_PARAMS_0_4
 
 	movl (%rdx), RX0d;
@@ -178,6 +183,7 @@ _gcry_blowfish_amd64_do_encrypt:
 
 	EXIT_SYSV_FUNC
 	ret;
+	CFI_ENDPROC();
 ELF(.size _gcry_blowfish_amd64_do_encrypt,.-_gcry_blowfish_amd64_do_encrypt;)
 
 .align 8
@@ -190,6 +196,7 @@ _gcry_blowfish_amd64_encrypt_block:
 	 *	%rsi: dst
 	 *	%rdx: src
 	 */
+	CFI_STARTPROC();
 	ENTER_SYSV_FUNC_PARAMS_0_4
 
 	movq %rsi, %r10;
@@ -204,6 +211,7 @@ _gcry_blowfish_amd64_encrypt_block:
 
 	EXIT_SYSV_FUNC
 	ret;
+	CFI_ENDPROC();
 ELF(.size _gcry_blowfish_amd64_encrypt_block,.-_gcry_blowfish_amd64_encrypt_block;)
 
 .align 8
@@ -216,9 +224,11 @@ _gcry_blowfish_amd64_decrypt_block:
 	 *	%rsi: dst
 	 *	%rdx: src
 	 */
+	CFI_STARTPROC();
 	ENTER_SYSV_FUNC_PARAMS_0_4
 
 	movq %rbp, %r11;
+	CFI_REGISTER(%rbp, %r11);
 
 	movq %rsi, %r10;
 	movq %rdx, RIO;
@@ -240,9 +250,11 @@ _gcry_blowfish_amd64_decrypt_block:
 	write_block();
 
 	movq %r11, %rbp;
+	CFI_RESTORE(%rbp);
 
 	EXIT_SYSV_FUNC
 	ret;
+	CFI_ENDPROC();
 ELF(.size _gcry_blowfish_amd64_decrypt_block,.-_gcry_blowfish_amd64_decrypt_block;)
 
 /**********************************************************************
@@ -340,6 +352,7 @@ __blowfish_enc_blk4:
 	 * output:
 	 *	RX0,RX1,RX2,RX3: four output ciphertext blocks
 	 */
+	CFI_STARTPROC();
 	preload_roundkey_enc(0);
 
 	round_enc4(0);
@@ -355,6 +368,7 @@ __blowfish_enc_blk4:
 	outbswap_block4();
 
 	ret;
+	CFI_ENDPROC();
 ELF(.size __blowfish_enc_blk4,.-__blowfish_enc_blk4;)
 
 .align 8
@@ -367,6 +381,7 @@ __blowfish_dec_blk4:
 	 * output:
 	 *	RX0,RX1,RX2,RX3: four output plaintext blocks
 	 */
+	CFI_STARTPROC();
 	preload_roundkey_dec(17);
 
 	inbswap_block4();
@@ -384,6 +399,7 @@ __blowfish_dec_blk4:
 	outbswap_block4();
 
 	ret;
+	CFI_ENDPROC();
 ELF(.size __blowfish_dec_blk4,.-__blowfish_dec_blk4;)
 
 .align 8
@@ -396,12 +412,17 @@ _gcry_blowfish_amd64_ctr_enc:
 	 *	%rdx: src (4 blocks)
 	 *	%rcx: iv (big endian, 64bit)
 	 */
+	CFI_STARTPROC();
 	ENTER_SYSV_FUNC_PARAMS_0_4
 
 	pushq %rbp;
+	CFI_PUSH(%rbp);
 	pushq %rbx;
+	CFI_PUSH(%rbx);
 	pushq %r12;
+	CFI_PUSH(%r12);
 	pushq %r13;
+	CFI_PUSH(%r13);
 
 	/* %r11-%r13 are not used by __blowfish_enc_blk4 */
 	movq %rcx, %r13; /*iv*/
@@ -438,12 +459,17 @@ _gcry_blowfish_amd64_ctr_enc:
 	movq RX3, 3 * 8(%r11);
 
 	popq %r13;
+	CFI_POP(%r13);
 	popq %r12;
+	CFI_POP(%r12);
 	popq %rbx;
+	CFI_POP(%rbx);
 	popq %rbp;
+	CFI_POP(%rbp);
 
 	EXIT_SYSV_FUNC
 	ret;
+	CFI_ENDPROC();
 ELF(.size _gcry_blowfish_amd64_ctr_enc,.-_gcry_blowfish_amd64_ctr_enc;)
 
 .align 8
@@ -456,12 +482,17 @@ _gcry_blowfish_amd64_cbc_dec:
 	 *	%rdx: src (4 blocks)
 	 *	%rcx: iv (64bit)
 	 */
+	CFI_STARTPROC();
 	ENTER_SYSV_FUNC_PARAMS_0_4
 
 	pushq %rbp;
+	CFI_PUSH(%rbp);
 	pushq %rbx;
+	CFI_PUSH(%rbx);
 	pushq %r12;
+	CFI_PUSH(%r12);
 	pushq %r13;
+	CFI_PUSH(%r13);
 
 	/* %r11-%r13 are not used by __blowfish_dec_blk4 */
 	movq %rsi, %r11; /*dst*/
@@ -489,12 +520,17 @@ _gcry_blowfish_amd64_cbc_dec:
 	movq RX3, 3 * 8(%r11);
 
 	popq %r13;
+	CFI_POP(%r13);
 	popq %r12;
+	CFI_POP(%r12);
 	popq %rbx;
+	CFI_POP(%rbx);
 	popq %rbp;
+	CFI_POP(%rbp);
 
 	EXIT_SYSV_FUNC
 	ret;
+	CFI_ENDPROC();
 ELF(.size _gcry_blowfish_amd64_cbc_dec,.-_gcry_blowfish_amd64_cbc_dec;)
 
 .align 8
@@ -507,12 +543,17 @@ _gcry_blowfish_amd64_cfb_dec:
 	 *	%rdx: src (4 blocks)
 	 *	%rcx: iv (64bit)
 	 */
+	CFI_STARTPROC();
 	ENTER_SYSV_FUNC_PARAMS_0_4
 
 	pushq %rbp;
+	CFI_PUSH(%rbp);
 	pushq %rbx;
+	CFI_PUSH(%rbx);
 	pushq %r12;
+	CFI_PUSH(%r12);
 	pushq %r13;
+	CFI_PUSH(%r13);
 
 	/* %r11-%r13 are not used by __blowfish_enc_blk4 */
 	movq %rcx, %r13; /*iv*/
@@ -543,12 +584,17 @@ _gcry_blowfish_amd64_cfb_dec:
 	movq RX3, 3 * 8(%r11);
 
 	popq %r13;
+	CFI_POP(%r13);
 	popq %r12;
+	CFI_POP(%r12);
 	popq %rbx;
+	CFI_POP(%rbx);
 	popq %rbp;
+	CFI_POP(%rbp);
 
 	EXIT_SYSV_FUNC
 	ret;
+	CFI_ENDPROC();
 ELF(.size _gcry_blowfish_amd64_cfb_dec,.-_gcry_blowfish_amd64_cfb_dec;)
 
 #endif /*defined(USE_BLOWFISH)*/
diff --git a/cipher/camellia-aesni-avx-amd64.S b/cipher/camellia-aesni-avx-amd64.S
index 8022934fb..e16d4f613 100644
--- a/cipher/camellia-aesni-avx-amd64.S
+++ b/cipher/camellia-aesni-avx-amd64.S
@@ -24,17 +24,7 @@
      defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
     defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX_SUPPORT)
 
-#ifdef __PIC__
-#  define RIP (%rip)
-#else
-#  define RIP
-#endif
-
-#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
-# define ELF(...) __VA_ARGS__
-#else
-# define ELF(...) /*_*/
-#endif
+#include "asm-common-amd64.h"
 
 #define CAMELLIA_TABLE_BYTE_LEN 272
 
@@ -75,10 +65,10 @@
 	/* \
 	 * S-function with AES subbytes \
 	 */ \
-	vmovdqa .Linv_shift_row RIP, t4; \
-	vbroadcastss .L0f0f0f0f RIP, t7; \
-	vmovdqa .Lpre_tf_lo_s1 RIP, t0; \
-	vmovdqa .Lpre_tf_hi_s1 RIP, t1; \
+	vmovdqa .Linv_shift_row rRIP, t4; \
+	vbroadcastss .L0f0f0f0f rRIP, t7; \
+	vmovdqa .Lpre_tf_lo_s1 rRIP, t0; \
+	vmovdqa .Lpre_tf_hi_s1 rRIP, t1; \
 	\
 	/* AES inverse shift rows */ \
 	vpshufb t4, x0, x0; \
@@ -91,8 +81,8 @@
 	vpshufb t4, x6, x6; \
 	\
 	/* prefilter sboxes 1, 2 and 3 */ \
-	vmovdqa .Lpre_tf_lo_s4 RIP, t2; \
-	vmovdqa .Lpre_tf_hi_s4 RIP, t3; \
+	vmovdqa .Lpre_tf_lo_s4 rRIP, t2; \
+	vmovdqa .Lpre_tf_hi_s4 rRIP, t3; \
 	filter_8bit(x0, t0, t1, t7, t6); \
 	filter_8bit(x7, t0, t1, t7, t6); \
 	filter_8bit(x1, t0, t1, t7, t6); \
@@ -106,8 +96,8 @@
 	filter_8bit(x6, t2, t3, t7, t6); \
 	\
 	/* AES subbytes + AES shift rows */ \
-	vmovdqa .Lpost_tf_lo_s1 RIP, t0; \
-	vmovdqa .Lpost_tf_hi_s1 RIP, t1; \
+	vmovdqa .Lpost_tf_lo_s1 rRIP, t0; \
+	vmovdqa .Lpost_tf_hi_s1 rRIP, t1; \
 	vaesenclast t4, x0, x0; \
 	vaesenclast t4, x7, x7; \
 	vaesenclast t4, x1, x1; \
@@ -118,16 +108,16 @@
 	vaesenclast t4, x6, x6; \
 	\
 	/* postfilter sboxes 1 and 4 */ \
-	vmovdqa .Lpost_tf_lo_s3 RIP, t2; \
-	vmovdqa .Lpost_tf_hi_s3 RIP, t3; \
+	vmovdqa .Lpost_tf_lo_s3 rRIP, t2; \
+	vmovdqa .Lpost_tf_hi_s3 rRIP, t3; \
 	filter_8bit(x0, t0, t1, t7, t6); \
 	filter_8bit(x7, t0, t1, t7, t6); \
 	filter_8bit(x3, t0, t1, t7, t6); \
 	filter_8bit(x6, t0, t1, t7, t6); \
 	\
 	/* postfilter sbox 3 */ \
-	vmovdqa .Lpost_tf_lo_s2 RIP, t4; \
-	vmovdqa .Lpost_tf_hi_s2 RIP, t5; \
+	vmovdqa .Lpost_tf_lo_s2 rRIP, t4; \
+	vmovdqa .Lpost_tf_hi_s2 rRIP, t5; \
 	filter_8bit(x2, t2, t3, t7, t6); \
 	filter_8bit(x5, t2, t3, t7, t6); \
 	\
@@ -442,7 +432,7 @@
 	transpose_4x4(c0, c1, c2, c3, a0, a1); \
 	transpose_4x4(d0, d1, d2, d3, a0, a1); \
 	\
-	vmovdqu .Lshufb_16x16b RIP, a0; \
+	vmovdqu .Lshufb_16x16b rRIP, a0; \
 	vmovdqu st1, a1; \
 	vpshufb a0, a2, a2; \
 	vpshufb a0, a3, a3; \
@@ -508,7 +498,7 @@
 	vpunpcklwd t1, t3, e; \
 	vpunpckhwd t1, t3, f; \
 	\
-	vmovdqa .Ltranspose_8x8_shuf RIP, t3; \
+	vmovdqa .Ltranspose_8x8_shuf rRIP, t3; \
 	\
 	vpunpcklwd g, c, d; \
 	vpunpckhwd g, c, c; \
@@ -540,7 +530,7 @@
 #define inpack16_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
 		     y6, y7, rio, key) \
 	vmovq key, x0; \
-	vpshufb .Lpack_bswap RIP, x0, x0; \
+	vpshufb .Lpack_bswap rRIP, x0, x0; \
 	\
 	vpxor 0 * 16(rio), x0, y7; \
 	vpxor 1 * 16(rio), x0, y6; \
@@ -591,7 +581,7 @@
 	vmovdqu x0, stack_tmp0; \
 	\
 	vmovq key, x0; \
-	vpshufb .Lpack_bswap RIP, x0, x0; \
+	vpshufb .Lpack_bswap rRIP, x0, x0; \
 	\
 	vpxor x0, y7, y7; \
 	vpxor x0, y6, y6; \
@@ -786,6 +776,7 @@ __camellia_enc_blk16:
 	 *	%xmm0..%xmm15: 16 encrypted blocks, order swapped:
 	 *       7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
 	 */
+	CFI_STARTPROC();
 
 	leaq 8 * 16(%rax), %rcx;
 
@@ -859,6 +850,7 @@ __camellia_enc_blk16:
 		     %xmm15, %rax, %rcx, 24);
 
 	jmp .Lenc_done;
+	CFI_ENDPROC();
 ELF(.size __camellia_enc_blk16,.-__camellia_enc_blk16;)
 
 .align 8
@@ -874,6 +866,7 @@ __camellia_dec_blk16:
 	 *	%xmm0..%xmm15: 16 plaintext blocks, order swapped:
 	 *       7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
 	 */
+	CFI_STARTPROC();
 
 	leaq 8 * 16(%rax), %rcx;
 
@@ -944,6 +937,7 @@ __camellia_dec_blk16:
 	      ((key_table + (24) * 8) + 4)(CTX));
 
 	jmp .Ldec_max24;
+	CFI_ENDPROC();
 ELF(.size __camellia_dec_blk16,.-__camellia_dec_blk16;)
 
 #define inc_le128(x, minus_one, tmp) \
@@ -963,9 +957,12 @@ _gcry_camellia_aesni_avx_ctr_enc:
 	 *	%rdx: src (16 blocks)
 	 *	%rcx: iv (big endian, 128bit)
 	 */
+	CFI_STARTPROC();
 
 	pushq %rbp;
+	CFI_PUSH(%rbp);
 	movq %rsp, %rbp;
+	CFI_DEF_CFA_REGISTER(%rbp);
 
 	vzeroupper;
 
@@ -973,7 +970,7 @@ _gcry_camellia_aesni_avx_ctr_enc:
 	andq $~31, %rsp;
 	movq %rsp, %rax;
 
-	vmovdqa .Lbswap128_mask RIP, %xmm14;
+	vmovdqa .Lbswap128_mask rRIP, %xmm14;
 
 	/* load IV and byteswap */
 	vmovdqu (%rcx), %xmm15;
@@ -1018,12 +1015,12 @@ _gcry_camellia_aesni_avx_ctr_enc:
 	vmovdqa %xmm0, %xmm13;
 	vpshufb %xmm14, %xmm0, %xmm0;
 	inc_le128(%xmm13, %xmm15, %xmm14);
-	vpshufb .Lbswap128_mask RIP, %xmm13, %xmm13; /* le => be */
+	vpshufb .Lbswap128_mask rRIP, %xmm13, %xmm13; /* le => be */
 	vmovdqu %xmm13, (%rcx);
 
 	/* inpack16_pre: */
 	vmovq (key_table)(CTX), %xmm15;
-	vpshufb .Lpack_bswap RIP, %xmm15, %xmm15;
+	vpshufb .Lpack_bswap rRIP, %xmm15, %xmm15;
 	vpxor %xmm0, %xmm15, %xmm0;
 	vpxor %xmm1, %xmm15, %xmm1;
 	vpxor %xmm2, %xmm15, %xmm2;
@@ -1067,7 +1064,9 @@ _gcry_camellia_aesni_avx_ctr_enc:
 	vzeroall;
 
 	leave;
+	CFI_LEAVE();
 	ret;
+	CFI_ENDPROC();
 ELF(.size _gcry_camellia_aesni_avx_ctr_enc,.-_gcry_camellia_aesni_avx_ctr_enc;)
 
 .align 8
@@ -1081,9 +1080,12 @@ _gcry_camellia_aesni_avx_cbc_dec:
 	 *	%rdx: src (16 blocks)
 	 *	%rcx: iv
 	 */
+	CFI_STARTPROC();
 
 	pushq %rbp;
+	CFI_PUSH(%rbp);
 	movq %rsp, %rbp;
+	CFI_DEF_CFA_REGISTER(%rbp);
 
 	vzeroupper;
 
@@ -1135,7 +1137,9 @@ _gcry_camellia_aesni_avx_cbc_dec:
 	vzeroall;
 
 	leave;
+	CFI_LEAVE();
 	ret;
+	CFI_ENDPROC();
 ELF(.size _gcry_camellia_aesni_avx_cbc_dec,.-_gcry_camellia_aesni_avx_cbc_dec;)
 
 .align 8
@@ -1149,9 +1153,12 @@ _gcry_camellia_aesni_avx_cfb_dec:
 	 *	%rdx: src (16 blocks)
 	 *	%rcx: iv
 	 */
+	CFI_STARTPROC();
 
 	pushq %rbp;
+	CFI_PUSH(%rbp);
 	movq %rsp, %rbp;
+	CFI_DEF_CFA_REGISTER(%rbp);
 
 	vzeroupper;
 
@@ -1161,7 +1168,7 @@ _gcry_camellia_aesni_avx_cfb_dec:
 
 	/* inpack16_pre: */
 	vmovq (key_table)(CTX), %xmm0;
-	vpshufb .Lpack_bswap RIP, %xmm0, %xmm0;
+	vpshufb .Lpack_bswap rRIP, %xmm0, %xmm0;
 	vpxor (%rcx), %xmm0, %xmm15;
 	vmovdqu 15 * 16(%rdx), %xmm1;
 	vmovdqu %xmm1, (%rcx); /* store new IV */
@@ -1207,7 +1214,9 @@ _gcry_camellia_aesni_avx_cfb_dec:
 	vzeroall;
 
 	leave;
+	CFI_LEAVE();
 	ret;
+	CFI_ENDPROC();
 ELF(.size _gcry_camellia_aesni_avx_cfb_dec,.-_gcry_camellia_aesni_avx_cfb_dec;)
 
 .align 8
@@ -1223,9 +1232,12 @@ _gcry_camellia_aesni_avx_ocb_enc:
 	 *	%r8 : checksum
 	 *	%r9 : L pointers (void *L[16])
 	 */
+	CFI_STARTPROC();
 
 	pushq %rbp;
+	CFI_PUSH(%rbp);
 	movq %rsp, %rbp;
+	CFI_DEF_CFA_REGISTER(%rbp);
 
 	vzeroupper;
 
@@ -1233,10 +1245,14 @@ _gcry_camellia_aesni_avx_ocb_enc:
 	andq $~31, %rsp;
 	movq %rsp, %rax;
 
-	movq %r10, (16 * 16 + 0 * 8)(%rax);
-	movq %r11, (16 * 16 + 1 * 8)(%rax);
-	movq %r12, (16 * 16 + 2 * 8)(%rax);
-	movq %r13, (16 * 16 + 3 * 8)(%rax);
+	movq %r10, (16 * 16 + 0 * 8)(%rsp);
+	movq %r11, (16 * 16 + 1 * 8)(%rsp);
+	movq %r12, (16 * 16 + 2 * 8)(%rsp);
+	movq %r13, (16 * 16 + 3 * 8)(%rsp);
+	CFI_REG_ON_STACK(r10, 16 * 16 + 0 * 8);
+	CFI_REG_ON_STACK(r11, 16 * 16 + 1 * 8);
+	CFI_REG_ON_STACK(r12, 16 * 16 + 2 * 8);
+	CFI_REG_ON_STACK(r13, 16 * 16 + 3 * 8);
 
 	vmovdqu (%rcx), %xmm14;
 	vmovdqu (%r8), %xmm15;
@@ -1292,7 +1308,7 @@ _gcry_camellia_aesni_avx_ocb_enc:
 
 	/* inpack16_pre: */
 	vmovq (key_table)(CTX), %xmm15;
-	vpshufb .Lpack_bswap RIP, %xmm15, %xmm15;
+	vpshufb .Lpack_bswap rRIP, %xmm15, %xmm15;
 	vpxor %xmm0, %xmm15, %xmm0;
 	vpxor %xmm1, %xmm15, %xmm1;
 	vpxor %xmm2, %xmm15, %xmm2;
@@ -1335,13 +1351,19 @@ _gcry_camellia_aesni_avx_ocb_enc:
 
 	vzeroall;
 
-	movq (16 * 16 + 0 * 8)(%rax), %r10;
-	movq (16 * 16 + 1 * 8)(%rax), %r11;
-	movq (16 * 16 + 2 * 8)(%rax), %r12;
-	movq (16 * 16 + 3 * 8)(%rax), %r13;
+	movq (16 * 16 + 0 * 8)(%rsp), %r10;
+	movq (16 * 16 + 1 * 8)(%rsp), %r11;
+	movq (16 * 16 + 2 * 8)(%rsp), %r12;
+	movq (16 * 16 + 3 * 8)(%rsp), %r13;
+	CFI_RESTORE(%r10);
+	CFI_RESTORE(%r11);
+	CFI_RESTORE(%r12);
+	CFI_RESTORE(%r13);
 
 	leave;
+	CFI_LEAVE();
 	ret;
+	CFI_ENDPROC();
 ELF(.size _gcry_camellia_aesni_avx_ocb_enc,.-_gcry_camellia_aesni_avx_ocb_enc;)
 
 .align 8
@@ -1357,9 +1379,12 @@ _gcry_camellia_aesni_avx_ocb_dec:
 	 *	%r8 : checksum
 	 *	%r9 : L pointers (void *L[16])
 	 */
+	CFI_STARTPROC();
 
 	pushq %rbp;
+	CFI_PUSH(%rbp);
 	movq %rsp, %rbp;
+	CFI_DEF_CFA_REGISTER(%rbp);
 
 	vzeroupper;
 
@@ -1367,10 +1392,14 @@ _gcry_camellia_aesni_avx_ocb_dec:
 	andq $~31, %rsp;
 	movq %rsp, %rax;
 
-	movq %r10, (16 * 16 + 0 * 8)(%rax);
-	movq %r11, (16 * 16 + 1 * 8)(%rax);
-	movq %r12, (16 * 16 + 2 * 8)(%rax);
-	movq %r13, (16 * 16 + 3 * 8)(%rax);
+	movq %r10, (16 * 16 + 0 * 8)(%rsp);
+	movq %r11, (16 * 16 + 1 * 8)(%rsp);
+	movq %r12, (16 * 16 + 2 * 8)(%rsp);
+	movq %r13, (16 * 16 + 3 * 8)(%rsp);
+	CFI_REG_ON_STACK(r10, 16 * 16 + 0 * 8);
+	CFI_REG_ON_STACK(r11, 16 * 16 + 1 * 8);
+	CFI_REG_ON_STACK(r12, 16 * 16 + 2 * 8);
+	CFI_REG_ON_STACK(r13, 16 * 16 + 3 * 8);
 
 	vmovdqu (%rcx), %xmm15;
 
@@ -1428,7 +1457,7 @@ _gcry_camellia_aesni_avx_ocb_dec:
 
 	/* inpack16_pre: */
 	vmovq (key_table)(CTX, %r8, 8), %xmm15;
-	vpshufb .Lpack_bswap RIP, %xmm15, %xmm15;
+	vpshufb .Lpack_bswap rRIP, %xmm15, %xmm15;
 	vpxor %xmm0, %xmm15, %xmm0;
 	vpxor %xmm1, %xmm15, %xmm1;
 	vpxor %xmm2, %xmm15, %xmm2;
@@ -1493,13 +1522,19 @@ _gcry_camellia_aesni_avx_ocb_dec:
 
 	vzeroall;
 
-	movq (16 * 16 + 0 * 8)(%rax), %r10;
-	movq (16 * 16 + 1 * 8)(%rax), %r11;
-	movq (16 * 16 + 2 * 8)(%rax), %r12;
-	movq (16 * 16 + 3 * 8)(%rax), %r13;
+	movq (16 * 16 + 0 * 8)(%rsp), %r10;
+	movq (16 * 16 + 1 * 8)(%rsp), %r11;
+	movq (16 * 16 + 2 * 8)(%rsp), %r12;
+	movq (16 * 16 + 3 * 8)(%rsp), %r13;
+	CFI_RESTORE(%r10);
+	CFI_RESTORE(%r11);
+	CFI_RESTORE(%r12);
+	CFI_RESTORE(%r13);
 
 	leave;
+	CFI_LEAVE();
 	ret;
+	CFI_ENDPROC();
 ELF(.size _gcry_camellia_aesni_avx_ocb_dec,.-_gcry_camellia_aesni_avx_ocb_dec;)
 
 .align 8
@@ -1514,9 +1549,12 @@ _gcry_camellia_aesni_avx_ocb_auth:
 	 *	%rcx: checksum
 	 *	%r8 : L pointers (void *L[16])
 	 */
+	CFI_STARTPROC();
 
 	pushq %rbp;
+	CFI_PUSH(%rbp);
 	movq %rsp, %rbp;
+	CFI_DEF_CFA_REGISTER(%rbp);
 
 	vzeroupper;
 
@@ -1524,10 +1562,14 @@ _gcry_camellia_aesni_avx_ocb_auth:
 	andq $~31, %rsp;
 	movq %rsp, %rax;
 
-	movq %r10, (16 * 16 + 0 * 8)(%rax);
-	movq %r11, (16 * 16 + 1 * 8)(%rax);
-	movq %r12, (16 * 16 + 2 * 8)(%rax);
-	movq %r13, (16 * 16 + 3 * 8)(%rax);
+	movq %r10, (16 * 16 + 0 * 8)(%rsp);
+	movq %r11, (16 * 16 + 1 * 8)(%rsp);
+	movq %r12, (16 * 16 + 2 * 8)(%rsp);
+	movq %r13, (16 * 16 + 3 * 8)(%rsp);
+	CFI_REG_ON_STACK(r10, 16 * 16 + 0 * 8);
+	CFI_REG_ON_STACK(r11, 16 * 16 + 1 * 8);
+	CFI_REG_ON_STACK(r12, 16 * 16 + 2 * 8);
+	CFI_REG_ON_STACK(r13, 16 * 16 + 3 * 8);
 
 	vmovdqu (%rdx), %xmm15;
 
@@ -1580,7 +1622,7 @@ _gcry_camellia_aesni_avx_ocb_auth:
 
 	/* inpack16_pre: */
 	vmovq (key_table)(CTX), %xmm15;
-	vpshufb .Lpack_bswap RIP, %xmm15, %xmm15;
+	vpshufb .Lpack_bswap rRIP, %xmm15, %xmm15;
 	vpxor %xmm0, %xmm15, %xmm0;
 	vpxor %xmm1, %xmm15, %xmm1;
 	vpxor %xmm2, %xmm15, %xmm2;
@@ -1623,13 +1665,19 @@ _gcry_camellia_aesni_avx_ocb_auth:
 
 	vzeroall;
 
-	movq (16 * 16 + 0 * 8)(%rax), %r10;
-	movq (16 * 16 + 1 * 8)(%rax), %r11;
-	movq (16 * 16 + 2 * 8)(%rax), %r12;
-	movq (16 * 16 + 3 * 8)(%rax), %r13;
+	movq (16 * 16 + 0 * 8)(%rsp), %r10;
+	movq (16 * 16 + 1 * 8)(%rsp), %r11;
+	movq (16 * 16 + 2 * 8)(%rsp), %r12;
+	movq (16 * 16 + 3 * 8)(%rsp), %r13;
+	CFI_RESTORE(%r10);
+	CFI_RESTORE(%r11);
+	CFI_RESTORE(%r12);
+	CFI_RESTORE(%r13);
 
 	leave;
+	CFI_LEAVE();
 	ret;
+	CFI_ENDPROC();
 ELF(.size _gcry_camellia_aesni_avx_ocb_auth,.-_gcry_camellia_aesni_avx_ocb_auth;)
 
 /*
@@ -1657,8 +1705,8 @@ ELF(.size _gcry_camellia_aesni_avx_ocb_auth,.-_gcry_camellia_aesni_avx_ocb_auth;
 	vpand sbox4mask, t0, t0; \
 	vpor t0, x, x; \
 	\
-	vmovdqa .Lpost_tf_lo_s1 RIP, t0; \
-	vmovdqa .Lpost_tf_hi_s1 RIP, t1; \
+	vmovdqa .Lpost_tf_lo_s1 rRIP, t0; \
+	vmovdqa .Lpost_tf_hi_s1 rRIP, t1; \
 	\
 	/* prefilter sboxes */ \
 	filter_8bit(x, pre_s1lo_mask, pre_s1hi_mask, _0f0f0f0fmask, t2); \
@@ -1672,18 +1720,18 @@ ELF(.size _gcry_camellia_aesni_avx_ocb_auth,.-_gcry_camellia_aesni_avx_ocb_auth;
 	/* output rotation for sbox2 (<<< 1) */ \
 	/* output rotation for sbox3 (>>> 1) */ \
 	vpshufb inv_shift_row, x, t1; \
-	vpshufb .Lsp0044440444044404mask RIP, x, t4; \
-	vpshufb .Lsp1110111010011110mask RIP, x, x; \
+	vpshufb .Lsp0044440444044404mask rRIP, x, t4; \
+	vpshufb .Lsp1110111010011110mask rRIP, x, x; \
 	vpaddb t1, t1, t2; \
 	vpsrlw $7, t1, t0; \
 	vpsllw $7, t1, t3; \
 	vpor t0, t2, t0; \
 	vpsrlw $1, t1, t1; \
-	vpshufb .Lsp0222022222000222mask RIP, t0, t0; \
+	vpshufb .Lsp0222022222000222mask rRIP, t0, t0; \
 	vpor t1, t3, t1; \
 	\
 	vpxor x, t4, t4; \
-	vpshufb .Lsp3033303303303033mask RIP, t1, t1; \
+	vpshufb .Lsp3033303303303033mask rRIP, t1, t1; \
 	vpxor t4, t0, t0; \
 	vpxor t1, t0, t0; \
 	vpsrldq $8, t0, x; \
@@ -1741,17 +1789,19 @@ __camellia_avx_setup128:
 	 *	%rdi: ctx, CTX; subkey storage at key_table(CTX)
 	 *	%xmm0: key
 	 */
+	CFI_STARTPROC();
+
 #define cmll_sub(n, ctx) (key_table+((n)*8))(ctx)
 #define KL128 %xmm0
 #define KA128 %xmm2
 
-	vpshufb .Lbswap128_mask RIP, KL128, KL128;
+	vpshufb .Lbswap128_mask rRIP, KL128, KL128;
 
-	vmovdqa .Linv_shift_row_and_unpcklbw RIP, %xmm11;
-	vmovq .Lsbox4_input_mask RIP, %xmm12;
-	vbroadcastss .L0f0f0f0f RIP, %xmm13;
-	vmovdqa .Lpre_tf_lo_s1 RIP, %xmm14;
-	vmovdqa .Lpre_tf_hi_s1 RIP, %xmm15;
+	vmovdqa .Linv_shift_row_and_unpcklbw rRIP, %xmm11;
+	vmovq .Lsbox4_input_mask rRIP, %xmm12;
+	vbroadcastss .L0f0f0f0f rRIP, %xmm13;
+	vmovdqa .Lpre_tf_lo_s1 rRIP, %xmm14;
+	vmovdqa .Lpre_tf_hi_s1 rRIP, %xmm15;
 
 	/*
 	 * Generate KA
@@ -1763,18 +1813,18 @@ __camellia_avx_setup128:
 
 	camellia_f(%xmm2, %xmm4, %xmm1,
 		   %xmm5, %xmm6, %xmm7, %xmm8,
-		   %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma1 RIP);
+		   %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma1 rRIP);
 	vpxor %xmm4, %xmm3, %xmm3;
 	camellia_f(%xmm3, %xmm2, %xmm1,
 		   %xmm5, %xmm6, %xmm7, %xmm8,
-		   %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma2 RIP);
+		   %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma2 rRIP);
 	camellia_f(%xmm2, %xmm3, %xmm1,
 		   %xmm5, %xmm6, %xmm7, %xmm8,
-		   %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma3 RIP);
+		   %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma3 rRIP);
 	vpxor %xmm4, %xmm3, %xmm3;
 	camellia_f(%xmm3, %xmm4, %xmm1,
 		   %xmm5, %xmm6, %xmm7, %xmm8,
-		   %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma4 RIP);
+		   %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma4 rRIP);
 
 	vpslldq $8, %xmm3, %xmm3;
 	vpxor %xmm4, %xmm2, %xmm2;
@@ -2076,6 +2126,7 @@ __camellia_avx_setup128:
 	vzeroall;
 
 	ret;
+	CFI_ENDPROC();
 ELF(.size __camellia_avx_setup128,.-__camellia_avx_setup128;)
 
 .align 8
@@ -2086,19 +2137,21 @@ __camellia_avx_setup256:
 	 *	%rdi: ctx, CTX; subkey storage at key_table(CTX)
 	 *	%xmm0 & %xmm1: key
 	 */
+	CFI_STARTPROC();
+
 #define KL128 %xmm0
 #define KR128 %xmm1
 #define KA128 %xmm2
 #define KB128 %xmm3
 
-	vpshufb .Lbswap128_mask RIP, KL128, KL128;
-	vpshufb .Lbswap128_mask RIP, KR128, KR128;
+	vpshufb .Lbswap128_mask rRIP, KL128, KL128;
+	vpshufb .Lbswap128_mask rRIP, KR128, KR128;
 
-	vmovdqa .Linv_shift_row_and_unpcklbw RIP, %xmm11;
-	vmovq .Lsbox4_input_mask RIP, %xmm12;
-	vbroadcastss .L0f0f0f0f RIP, %xmm13;
-	vmovdqa .Lpre_tf_lo_s1 RIP, %xmm14;
-	vmovdqa .Lpre_tf_hi_s1 RIP, %xmm15;
+	vmovdqa .Linv_shift_row_and_unpcklbw rRIP, %xmm11;
+	vmovq .Lsbox4_input_mask rRIP, %xmm12;
+	vbroadcastss .L0f0f0f0f rRIP, %xmm13;
+	vmovdqa .Lpre_tf_lo_s1 rRIP, %xmm14;
+	vmovdqa .Lpre_tf_hi_s1 rRIP, %xmm15;
 
 	/*
 	 * Generate KA
@@ -2111,20 +2164,20 @@ __camellia_avx_setup256:
 
 	camellia_f(%xmm2, %xmm4, %xmm5,
 		   %xmm7, %xmm8, %xmm9, %xmm10,
-		   %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma1 RIP);
+		   %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma1 rRIP);
 	vpxor %xmm4, %xmm3, %xmm3;
 	camellia_f(%xmm3, %xmm2, %xmm5,
 		   %xmm7, %xmm8, %xmm9, %xmm10,
-		   %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma2 RIP);
+		   %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma2 rRIP);
 	vpxor %xmm6, %xmm2, %xmm2;
 	camellia_f(%xmm2, %xmm3, %xmm5,
 		   %xmm7, %xmm8, %xmm9, %xmm10,
-		   %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma3 RIP);
+		   %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma3 rRIP);
 	vpxor %xmm4, %xmm3, %xmm3;
 	vpxor KR128, %xmm3, %xmm3;
 	camellia_f(%xmm3, %xmm4, %xmm5,
 		   %xmm7, %xmm8, %xmm9, %xmm10,
-		   %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma4 RIP);
+		   %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma4 rRIP);
 
 	vpslldq $8, %xmm3, %xmm3;
 	vpxor %xmm4, %xmm2, %xmm2;
@@ -2142,12 +2195,12 @@ __camellia_avx_setup256:
 
 	camellia_f(%xmm4, %xmm5, %xmm6,
 		   %xmm7, %xmm8, %xmm9, %xmm10,
-		   %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma5 RIP);
+		   %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma5 rRIP);
 	vpxor %xmm5, %xmm3, %xmm3;
 
 	camellia_f(%xmm3, %xmm5, %xmm6,
 		   %xmm7, %xmm8, %xmm9, %xmm10,
-		   %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma6 RIP);
+		   %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma6 rRIP);
 	vpslldq $8, %xmm3, %xmm3;
 	vpxor %xmm5, %xmm4, %xmm4;
 	vpsrldq $8, %xmm3, %xmm3;
@@ -2553,6 +2606,7 @@ __camellia_avx_setup256:
 	vzeroall;
 
 	ret;
+	CFI_ENDPROC();
 ELF(.size __camellia_avx_setup256,.-__camellia_avx_setup256;)
 
 .align 8
@@ -2565,6 +2619,7 @@ _gcry_camellia_aesni_avx_keygen:
 	 *	%rsi: key
 	 *	%rdx: keylen
 	 */
+	CFI_STARTPROC();
 
 	vzeroupper;
 
@@ -2585,6 +2640,7 @@ _gcry_camellia_aesni_avx_keygen:
 	vpor %xmm2, %xmm1, %xmm1;
 
 	jmp __camellia_avx_setup256;
+	CFI_ENDPROC();
 ELF(.size _gcry_camellia_aesni_avx_keygen,.-_gcry_camellia_aesni_avx_keygen;)
 
 #endif /*defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX_SUPPORT)*/
diff --git a/cipher/camellia-aesni-avx2-amd64.S b/cipher/camellia-aesni-avx2-amd64.S
index 897e4aeec..cc01c7743 100644
--- a/cipher/camellia-aesni-avx2-amd64.S
+++ b/cipher/camellia-aesni-avx2-amd64.S
@@ -24,17 +24,7 @@
      defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
     defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT)
 
-#ifdef __PIC__
-#  define RIP (%rip)
-#else
-#  define RIP
-#endif
-
-#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
-# define ELF(...) __VA_ARGS__
-#else
-# define ELF(...) /*_*/
-#endif
+#include "asm-common-amd64.h"
 
 #define CAMELLIA_TABLE_BYTE_LEN 272
 
@@ -92,12 +82,12 @@
 	/* \
 	 * S-function with AES subbytes \
 	 */ \
-	vbroadcasti128 .Linv_shift_row RIP, t4; \
-	vpbroadcastd .L0f0f0f0f RIP, t7; \
-	vbroadcasti128 .Lpre_tf_lo_s1 RIP, t5; \
-	vbroadcasti128 .Lpre_tf_hi_s1 RIP, t6; \
-	vbroadcasti128 .Lpre_tf_lo_s4 RIP, t2; \
-	vbroadcasti128 .Lpre_tf_hi_s4 RIP, t3; \
+	vbroadcasti128 .Linv_shift_row rRIP, t4; \
+	vpbroadcastd .L0f0f0f0f rRIP, t7; \
+	vbroadcasti128 .Lpre_tf_lo_s1 rRIP, t5; \
+	vbroadcasti128 .Lpre_tf_hi_s1 rRIP, t6; \
+	vbroadcasti128 .Lpre_tf_lo_s4 rRIP, t2; \
+	vbroadcasti128 .Lpre_tf_hi_s4 rRIP, t3; \
 	\
 	/* AES inverse shift rows */ \
 	vpshufb t4, x0, x0; \
@@ -143,8 +133,8 @@
 	vinserti128 $1, t2##_x, x6, x6; \
 	vextracti128 $1, x1, t3##_x; \
 	vextracti128 $1, x4, t2##_x; \
-	vbroadcasti128 .Lpost_tf_lo_s1 RIP, t0; \
-	vbroadcasti128 .Lpost_tf_hi_s1 RIP, t1; \
+	vbroadcasti128 .Lpost_tf_lo_s1 rRIP, t0; \
+	vbroadcasti128 .Lpost_tf_hi_s1 rRIP, t1; \
 	vaesenclast t4##_x, x2##_x, x2##_x; \
 	vaesenclast t4##_x, t6##_x, t6##_x; \
 	vaesenclast t4##_x, x5##_x, x5##_x; \
@@ -159,16 +149,16 @@
 	vinserti128 $1, t2##_x, x4, x4; \
 	\
 	/* postfilter sboxes 1 and 4 */ \
-	vbroadcasti128 .Lpost_tf_lo_s3 RIP, t2; \
-	vbroadcasti128 .Lpost_tf_hi_s3 RIP, t3; \
+	vbroadcasti128 .Lpost_tf_lo_s3 rRIP, t2; \
+	vbroadcasti128 .Lpost_tf_hi_s3 rRIP, t3; \
 	filter_8bit(x0, t0, t1, t7, t4); \
 	filter_8bit(x7, t0, t1, t7, t4); \
 	filter_8bit(x3, t0, t1, t7, t6); \
 	filter_8bit(x6, t0, t1, t7, t6); \
 	\
 	/* postfilter sbox 3 */ \
-	vbroadcasti128 .Lpost_tf_lo_s2 RIP, t4; \
-	vbroadcasti128 .Lpost_tf_hi_s2 RIP, t5; \
+	vbroadcasti128 .Lpost_tf_lo_s2 rRIP, t4; \
+	vbroadcasti128 .Lpost_tf_hi_s2 rRIP, t5; \
 	filter_8bit(x2, t2, t3, t7, t6); \
 	filter_8bit(x5, t2, t3, t7, t6); \
 	\
@@ -485,7 +475,7 @@
 	transpose_4x4(c0, c1, c2, c3, a0, a1); \
 	transpose_4x4(d0, d1, d2, d3, a0, a1); \
 	\
-	vbroadcasti128 .Lshufb_16x16b RIP, a0; \
+	vbroadcasti128 .Lshufb_16x16b rRIP, a0; \
 	vmovdqu st1, a1; \
 	vpshufb a0, a2, a2; \
 	vpshufb a0, a3, a3; \
@@ -524,7 +514,7 @@
 #define inpack32_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
 		     y6, y7, rio, key) \
 	vpbroadcastq key, x0; \
-	vpshufb .Lpack_bswap RIP, x0, x0; \
+	vpshufb .Lpack_bswap rRIP, x0, x0; \
 	\
 	vpxor 0 * 32(rio), x0, y7; \
 	vpxor 1 * 32(rio), x0, y6; \
@@ -575,7 +565,7 @@
 	vmovdqu x0, stack_tmp0; \
 	\
 	vpbroadcastq key, x0; \
-	vpshufb .Lpack_bswap RIP, x0, x0; \
+	vpshufb .Lpack_bswap rRIP, x0, x0; \
 	\
 	vpxor x0, y7, y7; \
 	vpxor x0, y6, y6; \
@@ -765,6 +755,7 @@ __camellia_enc_blk32:
 	 *	%ymm0..%ymm15: 32 encrypted blocks, order swapped:
 	 *       7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
 	 */
+	CFI_STARTPROC();
 
 	leaq 8 * 32(%rax), %rcx;
 
@@ -838,6 +829,7 @@ __camellia_enc_blk32:
 		     %ymm15, %rax, %rcx, 24);
 
 	jmp .Lenc_done;
+	CFI_ENDPROC();
 ELF(.size __camellia_enc_blk32,.-__camellia_enc_blk32;)
 
 .align 8
@@ -853,6 +845,7 @@ __camellia_dec_blk32:
 	 *	%ymm0..%ymm15: 16 plaintext blocks, order swapped:
 	 *       7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
 	 */
+	CFI_STARTPROC();
 
 	leaq 8 * 32(%rax), %rcx;
 
@@ -923,6 +916,7 @@ __camellia_dec_blk32:
 	      ((key_table + (24) * 8) + 4)(CTX));
 
 	jmp .Ldec_max24;
+	CFI_ENDPROC();
 ELF(.size __camellia_dec_blk32,.-__camellia_dec_blk32;)
 
 #define inc_le128(x, minus_one, tmp) \
@@ -942,9 +936,12 @@ _gcry_camellia_aesni_avx2_ctr_enc:
 	 *	%rdx: src (32 blocks)
 	 *	%rcx: iv (big endian, 128bit)
 	 */
+	CFI_STARTPROC();
 
 	pushq %rbp;
+	CFI_PUSH(%rbp);
 	movq %rsp, %rbp;
+	CFI_DEF_CFA_REGISTER(%rbp);
 
 	movq 8(%rcx), %r11;
 	bswapq %r11;
@@ -960,10 +957,10 @@ _gcry_camellia_aesni_avx2_ctr_enc:
 
 	/* load IV and byteswap */
 	vmovdqu (%rcx), %xmm0;
-	vpshufb .Lbswap128_mask RIP, %xmm0, %xmm0;
+	vpshufb .Lbswap128_mask rRIP, %xmm0, %xmm0;
 	vmovdqa %xmm0, %xmm1;
 	inc_le128(%xmm0, %xmm15, %xmm14);
-	vbroadcasti128 .Lbswap128_mask RIP, %ymm14;
+	vbroadcasti128 .Lbswap128_mask rRIP, %ymm14;
 	vinserti128 $1, %xmm0, %ymm1, %ymm0;
 	vpshufb %ymm14, %ymm0, %ymm13;
 	vmovdqu %ymm13, 15 * 32(%rax);
@@ -1064,14 +1061,14 @@ _gcry_camellia_aesni_avx2_ctr_enc:
 	vextracti128 $1, %ymm0, %xmm13;
 	vpshufb %ymm14, %ymm0, %ymm0;
 	inc_le128(%xmm13, %xmm15, %xmm14);
-	vpshufb .Lbswap128_mask RIP, %xmm13, %xmm13;
+	vpshufb .Lbswap128_mask rRIP, %xmm13, %xmm13;
 	vmovdqu %xmm13, (%rcx);
 
 .align 4
 .Lload_ctr_done:
 	/* inpack16_pre: */
 	vpbroadcastq (key_table)(CTX), %ymm15;
-	vpshufb .Lpack_bswap RIP, %ymm15, %ymm15;
+	vpshufb .Lpack_bswap rRIP, %ymm15, %ymm15;
 	vpxor %ymm0, %ymm15, %ymm0;
 	vpxor %ymm1, %ymm15, %ymm1;
 	vpxor %ymm2, %ymm15, %ymm2;
@@ -1116,7 +1113,9 @@ _gcry_camellia_aesni_avx2_ctr_enc:
 	vzeroall;
 
 	leave;
+	CFI_LEAVE();
 	ret;
+	CFI_ENDPROC();
 ELF(.size _gcry_camellia_aesni_avx2_ctr_enc,.-_gcry_camellia_aesni_avx2_ctr_enc;)
 
 .align 8
@@ -1130,9 +1129,12 @@ _gcry_camellia_aesni_avx2_cbc_dec:
 	 *	%rdx: src (32 blocks)
 	 *	%rcx: iv
 	 */
+	CFI_STARTPROC();
 
 	pushq %rbp;
+	CFI_PUSH(%rbp);
 	movq %rsp, %rbp;
+	CFI_DEF_CFA_REGISTER(%rbp);
 
 	vzeroupper;
 
@@ -1188,7 +1190,9 @@ _gcry_camellia_aesni_avx2_cbc_dec:
 	vzeroall;
 
 	leave;
+	CFI_LEAVE();
 	ret;
+	CFI_ENDPROC();
 ELF(.size _gcry_camellia_aesni_avx2_cbc_dec,.-_gcry_camellia_aesni_avx2_cbc_dec;)
 
 .align 8
@@ -1202,9 +1206,12 @@ _gcry_camellia_aesni_avx2_cfb_dec:
 	 *	%rdx: src (32 blocks)
 	 *	%rcx: iv
 	 */
+	CFI_STARTPROC();
 
 	pushq %rbp;
+	CFI_PUSH(%rbp);
 	movq %rsp, %rbp;
+	CFI_DEF_CFA_REGISTER(%rbp);
 
 	vzeroupper;
 
@@ -1214,7 +1221,7 @@ _gcry_camellia_aesni_avx2_cfb_dec:
 
 	/* inpack16_pre: */
 	vpbroadcastq (key_table)(CTX), %ymm0;
-	vpshufb .Lpack_bswap RIP, %ymm0, %ymm0;
+	vpshufb .Lpack_bswap rRIP, %ymm0, %ymm0;
 	vmovdqu (%rcx), %xmm15;
 	vinserti128 $1, (%rdx), %ymm15, %ymm15;
 	vpxor %ymm15, %ymm0, %ymm15;
@@ -1262,7 +1269,9 @@ _gcry_camellia_aesni_avx2_cfb_dec:
 	vzeroall;
 
 	leave;
+	CFI_LEAVE();
 	ret;
+	CFI_ENDPROC();
 ELF(.size _gcry_camellia_aesni_avx2_cfb_dec,.-_gcry_camellia_aesni_avx2_cfb_dec;)
 
 .align 8
@@ -1278,9 +1287,12 @@ _gcry_camellia_aesni_avx2_ocb_enc:
 	 *	%r8 : checksum
 	 *	%r9 : L pointers (void *L[32])
 	 */
+	CFI_STARTPROC();
 
 	pushq %rbp;
+	CFI_PUSH(%rbp);
 	movq %rsp, %rbp;
+	CFI_DEF_CFA_REGISTER(%rbp);
 
 	vzeroupper;
 
@@ -1288,10 +1300,14 @@ _gcry_camellia_aesni_avx2_ocb_enc:
 	andq $~63, %rsp;
 	movq %rsp, %rax;
 
-	movq %r10, (16 * 32 + 0 * 8)(%rax);
-	movq %r11, (16 * 32 + 1 * 8)(%rax);
-	movq %r12, (16 * 32 + 2 * 8)(%rax);
-	movq %r13, (16 * 32 + 3 * 8)(%rax);
+	movq %r10, (16 * 32 + 0 * 8)(%rsp);
+	movq %r11, (16 * 32 + 1 * 8)(%rsp);
+	movq %r12, (16 * 32 + 2 * 8)(%rsp);
+	movq %r13, (16 * 32 + 3 * 8)(%rsp);
+	CFI_REG_ON_STACK(r10, 16 * 32 + 0 * 8);
+	CFI_REG_ON_STACK(r11, 16 * 32 + 1 * 8);
+	CFI_REG_ON_STACK(r12, 16 * 32 + 2 * 8);
+	CFI_REG_ON_STACK(r13, 16 * 32 + 3 * 8);
 
 	vmovdqu (%rcx), %xmm14;
 	vmovdqu (%r8), %xmm13;
@@ -1369,7 +1385,7 @@ _gcry_camellia_aesni_avx2_ocb_enc:
 
 	/* inpack16_pre: */
 	vpbroadcastq (key_table)(CTX), %ymm15;
-	vpshufb .Lpack_bswap RIP, %ymm15, %ymm15;
+	vpshufb .Lpack_bswap rRIP, %ymm15, %ymm15;
 	vpxor %ymm0, %ymm15, %ymm0;
 	vpxor %ymm1, %ymm15, %ymm1;
 	vpxor %ymm2, %ymm15, %ymm2;
@@ -1412,13 +1428,19 @@ _gcry_camellia_aesni_avx2_ocb_enc:
 
 	vzeroall;
 
-	movq (16 * 32 + 0 * 8)(%rax), %r10;
-	movq (16 * 32 + 1 * 8)(%rax), %r11;
-	movq (16 * 32 + 2 * 8)(%rax), %r12;
-	movq (16 * 32 + 3 * 8)(%rax), %r13;
+	movq (16 * 32 + 0 * 8)(%rsp), %r10;
+	movq (16 * 32 + 1 * 8)(%rsp), %r11;
+	movq (16 * 32 + 2 * 8)(%rsp), %r12;
+	movq (16 * 32 + 3 * 8)(%rsp), %r13;
+	CFI_RESTORE(%r10);
+	CFI_RESTORE(%r11);
+	CFI_RESTORE(%r12);
+	CFI_RESTORE(%r13);
 
 	leave;
+	CFI_LEAVE();
 	ret;
+	CFI_ENDPROC();
 ELF(.size _gcry_camellia_aesni_avx2_ocb_enc,.-_gcry_camellia_aesni_avx2_ocb_enc;)
 
 .align 8
@@ -1434,9 +1456,12 @@ _gcry_camellia_aesni_avx2_ocb_dec:
 	 *	%r8 : checksum
 	 *	%r9 : L pointers (void *L[32])
 	 */
+	CFI_STARTPROC();
 
 	pushq %rbp;
+	CFI_PUSH(%rbp);
 	movq %rsp, %rbp;
+	CFI_DEF_CFA_REGISTER(%rbp);
 
 	vzeroupper;
 
@@ -1444,10 +1469,14 @@ _gcry_camellia_aesni_avx2_ocb_dec:
 	andq $~63, %rsp;
 	movq %rsp, %rax;
 
-	movq %r10, (16 * 32 + 0 * 8)(%rax);
-	movq %r11, (16 * 32 + 1 * 8)(%rax);
-	movq %r12, (16 * 32 + 2 * 8)(%rax);
-	movq %r13, (16 * 32 + 3 * 8)(%rax);
+	movq %r10, (16 * 32 + 0 * 8)(%rsp);
+	movq %r11, (16 * 32 + 1 * 8)(%rsp);
+	movq %r12, (16 * 32 + 2 * 8)(%rsp);
+	movq %r13, (16 * 32 + 3 * 8)(%rsp);
+	CFI_REG_ON_STACK(r10, 16 * 32 + 0 * 8);
+	CFI_REG_ON_STACK(r11, 16 * 32 + 1 * 8);
+	CFI_REG_ON_STACK(r12, 16 * 32 + 2 * 8);
+	CFI_REG_ON_STACK(r13, 16 * 32 + 3 * 8);
 
 	vmovdqu (%rcx), %xmm14;
 
@@ -1525,7 +1554,7 @@ _gcry_camellia_aesni_avx2_ocb_dec:
 
 	/* inpack16_pre: */
 	vpbroadcastq (key_table)(CTX, %r8, 8), %ymm15;
-	vpshufb .Lpack_bswap RIP, %ymm15, %ymm15;
+	vpshufb .Lpack_bswap rRIP, %ymm15, %ymm15;
 	vpxor %ymm0, %ymm15, %ymm0;
 	vpxor %ymm1, %ymm15, %ymm1;
 	vpxor %ymm2, %ymm15, %ymm2;
@@ -1596,13 +1625,19 @@ _gcry_camellia_aesni_avx2_ocb_dec:
 
 	vzeroall;
 
-	movq (16 * 32 + 0 * 8)(%rax), %r10;
-	movq (16 * 32 + 1 * 8)(%rax), %r11;
-	movq (16 * 32 + 2 * 8)(%rax), %r12;
-	movq (16 * 32 + 3 * 8)(%rax), %r13;
+	movq (16 * 32 + 0 * 8)(%rsp), %r10;
+	movq (16 * 32 + 1 * 8)(%rsp), %r11;
+	movq (16 * 32 + 2 * 8)(%rsp), %r12;
+	movq (16 * 32 + 3 * 8)(%rsp), %r13;
+	CFI_RESTORE(%r10);
+	CFI_RESTORE(%r11);
+	CFI_RESTORE(%r12);
+	CFI_RESTORE(%r13);
 
 	leave;
+	CFI_LEAVE();
 	ret;
+	CFI_ENDPROC();
 ELF(.size _gcry_camellia_aesni_avx2_ocb_dec,.-_gcry_camellia_aesni_avx2_ocb_dec;)
 
 .align 8
@@ -1617,9 +1652,12 @@ _gcry_camellia_aesni_avx2_ocb_auth:
 	 *	%rcx: checksum
 	 *	%r8 : L pointers (void *L[16])
 	 */
+	CFI_STARTPROC();
 
 	pushq %rbp;
+	CFI_PUSH(%rbp);
 	movq %rsp, %rbp;
+	CFI_DEF_CFA_REGISTER(%rbp);
 
 	vzeroupper;
 
@@ -1627,10 +1665,14 @@ _gcry_camellia_aesni_avx2_ocb_auth:
 	andq $~63, %rsp;
 	movq %rsp, %rax;
 
-	movq %r10, (16 * 32 + 0 * 8)(%rax);
-	movq %r11, (16 * 32 + 1 * 8)(%rax);
-	movq %r12, (16 * 32 + 2 * 8)(%rax);
-	movq %r13, (16 * 32 + 3 * 8)(%rax);
+	movq %r10, (16 * 32 + 0 * 8)(%rsp);
+	movq %r11, (16 * 32 + 1 * 8)(%rsp);
+	movq %r12, (16 * 32 + 2 * 8)(%rsp);
+	movq %r13, (16 * 32 + 3 * 8)(%rsp);
+	CFI_REG_ON_STACK(r10, 16 * 32 + 0 * 8);
+	CFI_REG_ON_STACK(r11, 16 * 32 + 1 * 8);
+	CFI_REG_ON_STACK(r12, 16 * 32 + 2 * 8);
+	CFI_REG_ON_STACK(r13, 16 * 32 + 3 * 8);
 
 	vmovdqu (%rdx), %xmm14;
 
@@ -1703,7 +1745,7 @@ _gcry_camellia_aesni_avx2_ocb_auth:
 
 	/* inpack16_pre: */
 	vpbroadcastq (key_table)(CTX), %ymm15;
-	vpshufb .Lpack_bswap RIP, %ymm15, %ymm15;
+	vpshufb .Lpack_bswap rRIP, %ymm15, %ymm15;
 	vpxor %ymm0, %ymm15, %ymm0;
 	vpxor %ymm1, %ymm15, %ymm1;
 	vpxor %ymm2, %ymm15, %ymm2;
@@ -1749,13 +1791,19 @@ _gcry_camellia_aesni_avx2_ocb_auth:
 
 	vzeroall;
 
-	movq (16 * 32 + 0 * 8)(%rax), %r10;
-	movq (16 * 32 + 1 * 8)(%rax), %r11;
-	movq (16 * 32 + 2 * 8)(%rax), %r12;
-	movq (16 * 32 + 3 * 8)(%rax), %r13;
+	movq (16 * 32 + 0 * 8)(%rsp), %r10;
+	movq (16 * 32 + 1 * 8)(%rsp), %r11;
+	movq (16 * 32 + 2 * 8)(%rsp), %r12;
+	movq (16 * 32 + 3 * 8)(%rsp), %r13;
+	CFI_RESTORE(%r10);
+	CFI_RESTORE(%r11);
+	CFI_RESTORE(%r12);
+	CFI_RESTORE(%r13);
 
 	leave;
+	CFI_LEAVE();
 	ret;
+	CFI_ENDPROC();
 ELF(.size _gcry_camellia_aesni_avx2_ocb_auth,.-_gcry_camellia_aesni_avx2_ocb_auth;)
 
 #endif /*defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT)*/
diff --git a/cipher/cast5-amd64.S b/cipher/cast5-amd64.S
index 1a1d43fd5..82f678901 100644
--- a/cipher/cast5-amd64.S
+++ b/cipher/cast5-amd64.S
@@ -183,10 +183,13 @@ _gcry_cast5_amd64_encrypt_block:
 	 *	%rsi: dst
 	 *	%rdx: src
 	 */
+	CFI_STARTPROC();
 	ENTER_SYSV_FUNC_PARAMS_0_4
 
 	pushq %rbp;
+	CFI_PUSH(%rbp);
 	pushq %rbx;
+	CFI_PUSH(%rbx);
 
 	movq %rsi, %r10;
 
@@ -211,10 +214,13 @@ _gcry_cast5_amd64_encrypt_block:
 	write_block();
 
 	popq %rbx;
+	CFI_POP(%rbx);
 	popq %rbp;
+	CFI_POP(%rbp);
 
 	EXIT_SYSV_FUNC
 	ret;
+	CFI_ENDPROC();
 ELF(.size _gcry_cast5_amd64_encrypt_block,.-_gcry_cast5_amd64_encrypt_block;)
 
 .align 8
@@ -227,10 +233,13 @@ _gcry_cast5_amd64_decrypt_block:
 	 *	%rsi: dst
 	 *	%rdx: src
 	 */
+	CFI_STARTPROC();
 	ENTER_SYSV_FUNC_PARAMS_0_4
 
 	pushq %rbp;
+	CFI_PUSH(%rbp);
 	pushq %rbx;
+	CFI_PUSH(%rbx);
 
 	movq %rsi, %r10;
 
@@ -255,10 +264,13 @@ _gcry_cast5_amd64_decrypt_block:
 	write_block();
 
 	popq %rbx;
+	CFI_POP(%rbx);
 	popq %rbp;
+	CFI_POP(%rbp);
 
 	EXIT_SYSV_FUNC
 	ret;
+	CFI_ENDPROC();
 ELF(.size _gcry_cast5_amd64_decrypt_block,.-_gcry_cast5_amd64_decrypt_block;)
 
 /**********************************************************************
@@ -371,6 +383,7 @@ __cast5_enc_blk4:
 	 * output:
 	 *	RLR0,RLR1,RLR2,RLR3: four output ciphertext blocks
 	 */
+	CFI_STARTPROC();
 	GET_EXTERN_POINTER(_gcry_cast5_s1to4, RTAB);
 
 	get_round_km(0, RKM0d);
@@ -387,6 +400,7 @@ __cast5_enc_blk4:
 
 	outbswap_block4(RLR0, RLR1, RLR2, RLR3);
 	ret;
+	CFI_ENDPROC();
 ELF(.size __cast5_enc_blk4,.-__cast5_enc_blk4;)
 
 .align 8
@@ -399,6 +413,7 @@ __cast5_dec_blk4:
 	 * output:
 	 *	RLR0,RLR1,RLR2,RLR3: four output plaintext blocks
 	 */
+	CFI_STARTPROC();
 	GET_EXTERN_POINTER(_gcry_cast5_s1to4, RTAB);
 
 	inbswap_block4(RLR0, RLR1, RLR2, RLR3);
@@ -416,6 +431,7 @@ __cast5_dec_blk4:
 	round_dec_last4(1, F4_2, F4_1);
 
 	outbswap_block4(RLR0, RLR1, RLR2, RLR3);
 	ret;
+	CFI_ENDPROC();
 ELF(.size __cast5_dec_blk4,.-__cast5_dec_blk4;)
 
@@ -425,20 +441,28 @@ ELF(.type   _gcry_cast5_amd64_ctr_enc,@function;)
 _gcry_cast5_amd64_ctr_enc:
 	/* input:
 	 *	%rdi: ctx, CTX
-	 *	%rsi: dst (8 blocks)
-	 *	%rdx: src (8 blocks)
+	 *	%rsi: dst (4 blocks)
+	 *	%rdx: src (4 blocks)
 	 *	%rcx: iv (big endian, 64bit)
 	 */
+	CFI_STARTPROC();
 	ENTER_SYSV_FUNC_PARAMS_0_4
 
 	pushq %rbp;
+	CFI_PUSH(%rbp);
 	pushq %rbx;
+	CFI_PUSH(%rbx);
 	pushq %r12;
+	CFI_PUSH(%r12);
 	pushq %r13;
+	CFI_PUSH(%r13);
 	pushq %r14;
+	CFI_PUSH(%r14);
 
 	pushq %rsi;
+	CFI_PUSH(%rsi);
 	pushq %rdx;
+	CFI_PUSH(%rdx);
 
 	/* load IV and byteswap */
 	movq (%rcx), RX0;
@@ -458,7 +482,9 @@ _gcry_cast5_amd64_ctr_enc:
 	call __cast5_enc_blk4;
 
 	popq %r14; /*src*/
+	CFI_POP_TMP_REG();
 	popq %r13; /*dst*/
+	CFI_POP_TMP_REG();
 
 	/* XOR key-stream with plaintext */
 	xorq 0 * 8(%r14), RLR0;
@@ -471,13 +497,19 @@ _gcry_cast5_amd64_ctr_enc:
 	movq RLR3, 3 * 8(%r13);
 
 	popq %r14;
+	CFI_POP(%r14);
 	popq %r13;
+	CFI_POP(%r13);
 	popq %r12;
+	CFI_POP(%r12);
 	popq %rbx;
+	CFI_POP(%rbx);
 	popq %rbp;
+	CFI_POP(%rbp);
 
 	EXIT_SYSV_FUNC
 	ret
+	CFI_ENDPROC();
 ELF(.size _gcry_cast5_amd64_ctr_enc,.-_gcry_cast5_amd64_ctr_enc;)
 
 .align 8
@@ -486,21 +518,30 @@ ELF(.type   _gcry_cast5_amd64_cbc_dec,@function;)
 _gcry_cast5_amd64_cbc_dec:
 	/* input:
 	 *	%rdi: ctx, CTX
-	 *	%rsi: dst (8 blocks)
-	 *	%rdx: src (8 blocks)
+	 *	%rsi: dst (4 blocks)
+	 *	%rdx: src (4 blocks)
 	 *	%rcx: iv (64bit)
 	 */
+	CFI_STARTPROC();
 	ENTER_SYSV_FUNC_PARAMS_0_4
 
 	pushq %rbp;
+	CFI_PUSH(%rbp);
 	pushq %rbx;
+	CFI_PUSH(%rbx);
 	pushq %r12;
+	CFI_PUSH(%r12);
 	pushq %r13;
+	CFI_PUSH(%r13);
 	pushq %r14;
+	CFI_PUSH(%r14);
 
 	pushq %rcx;
+	CFI_PUSH(%rcx);
 	pushq %rsi;
+	CFI_PUSH(%rsi);
 	pushq %rdx;
+	CFI_PUSH(%rdx);
 
 	/* load input */
 	movq 0 * 8(%rdx), RLR0;
@@ -511,8 +552,11 @@ _gcry_cast5_amd64_cbc_dec:
 	call __cast5_dec_blk4;
 
 	popq RX0; /*src*/
+	CFI_POP_TMP_REG();
 	popq RX1; /*dst*/
+	CFI_POP_TMP_REG();
 	popq RX2; /*iv*/
+	CFI_POP_TMP_REG();
 
 	movq 3 * 8(RX0), %r14;
 	xorq      (RX2), RLR0;
@@ -527,14 +571,19 @@ _gcry_cast5_amd64_cbc_dec:
 	movq RLR3, 3 * 8(RX1);
 
 	popq %r14;
+	CFI_POP(%r14);
 	popq %r13;
+	CFI_POP(%r13);
 	popq %r12;
+	CFI_POP(%r12);
 	popq %rbx;
+	CFI_POP(%rbx);
 	popq %rbp;
+	CFI_POP(%rbp);
 
 	EXIT_SYSV_FUNC
 	ret;
-
+	CFI_ENDPROC();
 ELF(.size _gcry_cast5_amd64_cbc_dec,.-_gcry_cast5_amd64_cbc_dec;)
 
 .align 8
@@ -543,20 +592,28 @@ ELF(.type   _gcry_cast5_amd64_cfb_dec,@function;)
 _gcry_cast5_amd64_cfb_dec:
 	/* input:
 	 *	%rdi: ctx, CTX
-	 *	%rsi: dst (8 blocks)
-	 *	%rdx: src (8 blocks)
+	 *	%rsi: dst (4 blocks)
+	 *	%rdx: src (4 blocks)
 	 *	%rcx: iv (64bit)
 	 */
+	CFI_STARTPROC();
 	ENTER_SYSV_FUNC_PARAMS_0_4
 
 	pushq %rbp;
+	CFI_PUSH(%rbp);
 	pushq %rbx;
+	CFI_PUSH(%rbx);
 	pushq %r12;
+	CFI_PUSH(%r12);
 	pushq %r13;
+	CFI_PUSH(%r13);
 	pushq %r14;
+	CFI_PUSH(%r14);
 
 	pushq %rsi;
+	CFI_PUSH(%rsi);
 	pushq %rdx;
+	CFI_PUSH(%rdx);
 
 	/* Load input */
 	movq (%rcx), RLR0;
@@ -573,7 +630,9 @@ _gcry_cast5_amd64_cfb_dec:
 	call __cast5_enc_blk4;
 
 	popq %rdx; /*src*/
+	CFI_POP_TMP_REG();
 	popq %rcx; /*dst*/
+	CFI_POP_TMP_REG();
 
 	xorq 0 * 8(%rdx), RLR0;
 	xorq 1 * 8(%rdx), RLR1;
@@ -585,14 +644,19 @@ _gcry_cast5_amd64_cfb_dec:
 	movq RLR3, 3 * 8(%rcx);
 
 	popq %r14;
+	CFI_POP(%r14);
 	popq %r13;
+	CFI_POP(%r13);
 	popq %r12;
+	CFI_POP(%r12);
 	popq %rbx;
+	CFI_POP(%rbx);
 	popq %rbp;
+	CFI_POP(%rbp);
 
 	EXIT_SYSV_FUNC
 	ret;
-
+	CFI_ENDPROC();
 ELF(.size _gcry_cast5_amd64_cfb_dec,.-_gcry_cast5_amd64_cfb_dec;)
 
 #endif /*defined(USE_CAST5)*/
diff --git a/cipher/chacha20-amd64-avx2.S b/cipher/chacha20-amd64-avx2.S
index 94c8e8cf7..de6263b69 100644
--- a/cipher/chacha20-amd64-avx2.S
+++ b/cipher/chacha20-amd64-avx2.S
@@ -179,11 +179,14 @@ _gcry_chacha20_amd64_avx2_blocks8:
 	 *	%rdx: src
 	 *	%rcx: nblks (multiple of 8)
 	 */
+	CFI_STARTPROC();
 
 	vzeroupper;
 
 	pushq %rbp;
+	CFI_PUSH(%rbp);
 	movq %rsp, %rbp;
+	CFI_DEF_CFA_REGISTER(%rbp);
 
 	subq $STACK_MAX, %rsp;
 	andq $~31, %rsp;
@@ -318,7 +321,9 @@ _gcry_chacha20_amd64_avx2_blocks8:
 
 	/* eax zeroed by round loop. */
 	leave;
+	CFI_LEAVE();
 	ret;
+	CFI_ENDPROC();
 ELF(.size _gcry_chacha20_amd64_avx2_blocks8,
 	  .-_gcry_chacha20_amd64_avx2_blocks8;)
 
@@ -339,9 +344,12 @@ _gcry_chacha20_poly1305_amd64_avx2_blocks8:
 	 *	%r9: poly1305-state
 	 *	%r8: poly1305-src
 	 */
+	CFI_STARTPROC();
 
 	pushq %rbp;
+	CFI_PUSH(%rbp);
 	movq %rsp, %rbp;
+	CFI_DEF_CFA_REGISTER(%rbp);
 
 	vzeroupper;
 
@@ -353,6 +361,11 @@ _gcry_chacha20_poly1305_amd64_avx2_blocks8:
 	movq %r13, (STACK_MAX + 2 * 8)(%rsp);
 	movq %r14, (STACK_MAX + 3 * 8)(%rsp);
 	movq %r15, (STACK_MAX + 4 * 8)(%rsp);
+	CFI_REG_ON_STACK(rbx, STACK_MAX + 0 * 8);
+	CFI_REG_ON_STACK(r12, STACK_MAX + 1 * 8);
+	CFI_REG_ON_STACK(r13, STACK_MAX + 2 * 8);
+	CFI_REG_ON_STACK(r14, STACK_MAX + 3 * 8);
+	CFI_REG_ON_STACK(r15, STACK_MAX + 4 * 8);
 
 	movq %rdx, (STACK_MAX + 5 * 8)(%rsp); # SRC
 	movq %rsi, (STACK_MAX + 6 * 8)(%rsp); # DST
@@ -752,10 +765,17 @@ _gcry_chacha20_poly1305_amd64_avx2_blocks8:
 	movq (STACK_MAX + 2 * 8)(%rsp), %r13;
 	movq (STACK_MAX + 3 * 8)(%rsp), %r14;
 	movq (STACK_MAX + 4 * 8)(%rsp), %r15;
+	CFI_RESTORE(%rbx);
+	CFI_RESTORE(%r12);
+	CFI_RESTORE(%r13);
+	CFI_RESTORE(%r14);
+	CFI_RESTORE(%r15);
 
 	xorl %eax, %eax;
 	leave;
+	CFI_LEAVE();
 	ret;
+	CFI_ENDPROC();
 ELF(.size _gcry_chacha20_poly1305_amd64_avx2_blocks8,
 	  .-_gcry_chacha20_poly1305_amd64_avx2_blocks8;)
 
diff --git a/cipher/chacha20-amd64-ssse3.S b/cipher/chacha20-amd64-ssse3.S
index 1657f7712..6bbf12fc1 100644
--- a/cipher/chacha20-amd64-ssse3.S
+++ b/cipher/chacha20-amd64-ssse3.S
@@ -175,9 +175,12 @@ _gcry_chacha20_amd64_ssse3_blocks4:
 	 *	%rdx: src
 	 *	%rcx: nblks (multiple of 4)
 	 */
+	CFI_STARTPROC();
 
 	pushq %rbp;
+	CFI_PUSH(%rbp);
 	movq %rsp, %rbp;
+	CFI_DEF_CFA_REGISTER(%rbp);
 
 	subq $STACK_MAX, %rsp;
 	andq $~15, %rsp;
@@ -329,7 +332,9 @@ _gcry_chacha20_amd64_ssse3_blocks4:
 
 	/* eax zeroed by round loop. */
 	leave;
+	CFI_LEAVE();
 	ret;
+	CFI_ENDPROC();
 ELF(.size _gcry_chacha20_amd64_ssse3_blocks4,
 	  .-_gcry_chacha20_amd64_ssse3_blocks4;)
 
@@ -372,6 +377,7 @@ _gcry_chacha20_amd64_ssse3_blocks1:
 	 *	%rdx: src
 	 *	%rcx: nblks
 	 */
+	CFI_STARTPROC();
 
 	/* Load constants */
 	movdqa .Lcounter1 rRIP, X4;
@@ -497,6 +503,7 @@ _gcry_chacha20_amd64_ssse3_blocks1:
 
 	/* eax zeroed by round loop. */
 	ret;
+	CFI_ENDPROC();
 ELF(.size _gcry_chacha20_amd64_ssse3_blocks1,
 	  .-_gcry_chacha20_amd64_ssse3_blocks1;)
 
@@ -517,9 +524,12 @@ _gcry_chacha20_poly1305_amd64_ssse3_blocks4:
 	 *	%r9: poly1305-state
 	 *	%r8: poly1305-src
 	 */
+	CFI_STARTPROC();
 
 	pushq %rbp;
+	CFI_PUSH(%rbp);
 	movq %rsp, %rbp;
+	CFI_DEF_CFA_REGISTER(%rbp);
 
 	subq $(8 * 8) + STACK_MAX + 16, %rsp;
 	andq $~15, %rsp;
@@ -529,6 +539,11 @@ _gcry_chacha20_poly1305_amd64_ssse3_blocks4:
 	movq %r13, (STACK_MAX + 2 * 8)(%rsp);
 	movq %r14, (STACK_MAX + 3 * 8)(%rsp);
 	movq %r15, (STACK_MAX + 4 * 8)(%rsp);
+	CFI_REG_ON_STACK(rbx, STACK_MAX + 0 * 8);
+	CFI_REG_ON_STACK(r12, STACK_MAX + 1 * 8);
+	CFI_REG_ON_STACK(r13, STACK_MAX + 2 * 8);
+	CFI_REG_ON_STACK(r14, STACK_MAX + 3 * 8);
+	CFI_REG_ON_STACK(r15, STACK_MAX + 4 * 8);
 
 	movq %rdx, (STACK_MAX + 5 * 8)(%rsp); # SRC
 	movq %rsi, (STACK_MAX + 6 * 8)(%rsp); # DST
@@ -901,10 +916,17 @@ _gcry_chacha20_poly1305_amd64_ssse3_blocks4:
 	movq (STACK_MAX + 2 * 8)(%rsp), %r13;
 	movq (STACK_MAX + 3 * 8)(%rsp), %r14;
 	movq (STACK_MAX + 4 * 8)(%rsp), %r15;
+	CFI_RESTORE(%rbx);
+	CFI_RESTORE(%r12);
+	CFI_RESTORE(%r13);
+	CFI_RESTORE(%r14);
+	CFI_RESTORE(%r15);
 
 	xorl %eax, %eax;
 	leave;
+	CFI_LEAVE();
 	ret;
+	CFI_ENDPROC();
 ELF(.size _gcry_chacha20_poly1305_amd64_ssse3_blocks4,
 	  .-_gcry_chacha20_poly1305_amd64_ssse3_blocks4;)
 
@@ -925,8 +947,12 @@ _gcry_chacha20_poly1305_amd64_ssse3_blocks1:
 	 *	%r9: poly1305-state
 	 *	%r8: poly1305-src
 	 */
+	CFI_STARTPROC();
+
 	pushq %rbp;
+	CFI_PUSH(%rbp);
 	movq %rsp, %rbp;
+	CFI_DEF_CFA_REGISTER(%rbp);
 
 	subq $(8 * 8), %rsp;
 	movq %rbx, (0 * 8)(%rsp);
@@ -934,6 +960,11 @@ _gcry_chacha20_poly1305_amd64_ssse3_blocks1:
 	movq %r13, (2 * 8)(%rsp);
 	movq %r14, (3 * 8)(%rsp);
 	movq %r15, (4 * 8)(%rsp);
+	CFI_REG_ON_STACK(rbx, 0 * 8);
+	CFI_REG_ON_STACK(r12, 1 * 8);
+	CFI_REG_ON_STACK(r13, 2 * 8);
+	CFI_REG_ON_STACK(r14, 3 * 8);
+	CFI_REG_ON_STACK(r15, 4 * 8);
 
 	movq %rdx, (5 * 8)(%rsp); # SRC
 	movq %rsi, (6 * 8)(%rsp); # DST
@@ -1206,10 +1237,17 @@ _gcry_chacha20_poly1305_amd64_ssse3_blocks1:
 	movq (2 * 8)(%rsp), %r13;
 	movq (3 * 8)(%rsp), %r14;
 	movq (4 * 8)(%rsp), %r15;
+	CFI_RESTORE(%rbx);
+	CFI_RESTORE(%r12);
+	CFI_RESTORE(%r13);
+	CFI_RESTORE(%r14);
+	CFI_RESTORE(%r15);
 
 	xorl %eax, %eax;
 	leave;
+	CFI_LEAVE();
 	ret;
+	CFI_ENDPROC();
 ELF(.size _gcry_chacha20_poly1305_amd64_ssse3_blocks1,
 	  .-_gcry_chacha20_poly1305_amd64_ssse3_blocks1;)
 
diff --git a/cipher/des-amd64.S b/cipher/des-amd64.S
index f25573d99..a211dac38 100644
--- a/cipher/des-amd64.S
+++ b/cipher/des-amd64.S
@@ -190,15 +190,23 @@ _gcry_3des_amd64_crypt_block:
 	 *	%rsi: dst
 	 *	%rdx: src
 	 */
+	CFI_STARTPROC();
 	ENTER_SYSV_FUNC_PARAMS_0_4
 
 	pushq %rbp;
+	CFI_PUSH(%rbp);
 	pushq %rbx;
+	CFI_PUSH(%rbx);
 	pushq %r12;
+	CFI_PUSH(%r12);
 	pushq %r13;
+	CFI_PUSH(%r13);
 	pushq %r14;
+	CFI_PUSH(%r14);
 	pushq %r15;
+	CFI_PUSH(%r15);
 	pushq %rsi; /*dst*/
+	CFI_PUSH(%rsi);
 
 	leaq .L_s1 rRIP, SBOXES;
 
@@ -259,18 +267,26 @@ _gcry_3des_amd64_crypt_block:
 	round1(32+15, RL0, RR0, dummy2);
 
 	popq RW2; /*dst*/
+	CFI_POP_TMP_REG();
 	final_permutation(RR0, RL0);
 	write_block(RW2, RR0, RL0);
 
 	popq %r15;
+	CFI_POP(%r15);
 	popq %r14;
+	CFI_POP(%r14);
 	popq %r13;
+	CFI_POP(%r13);
 	popq %r12;
+	CFI_POP(%r12);
 	popq %rbx;
+	CFI_POP(%rbx);
 	popq %rbp;
+	CFI_POP(%rbp);
 
 	EXIT_SYSV_FUNC
 	ret;
+	CFI_ENDPROC();
 ELF(.size _gcry_3des_amd64_crypt_block,.-_gcry_3des_amd64_crypt_block;)
 
 /***********************************************************************
@@ -465,6 +481,7 @@ _gcry_3des_amd64_crypt_blk3:
 	 *  RL0d, RR0d, RL1d, RR1d, RL2d, RR2d: 3 input blocks
 	 *  RR0d, RL0d, RR1d, RL1d, RR2d, RL2d: 3 output blocks
 	 */
+	CFI_STARTPROC();
 
 	leaq .L_s1 rRIP, SBOXES;
 
@@ -528,6 +545,7 @@ _gcry_3des_amd64_crypt_blk3:
 	final_permutation3(RR, RL);
 
 	ret;
+	CFI_ENDPROC();
 ELF(.size _gcry_3des_amd64_crypt_blk3,.-_gcry_3des_amd64_crypt_blk3;)
 
 .align 8
@@ -540,18 +558,28 @@ _gcry_3des_amd64_cbc_dec:
 	 *	%rdx: src (3 blocks)
 	 *	%rcx: iv (64bit)
 	 */
+	CFI_STARTPROC();
 	ENTER_SYSV_FUNC_PARAMS_0_4
 
 	pushq %rbp;
+	CFI_PUSH(%rbp);
 	pushq %rbx;
+	CFI_PUSH(%rbx);
 	pushq %r12;
+	CFI_PUSH(%r12);
 	pushq %r13;
+	CFI_PUSH(%r13);
 	pushq %r14;
+	CFI_PUSH(%r14);
 	pushq %r15;
+	CFI_PUSH(%r15);
 
 	pushq %rsi; /*dst*/
+	CFI_PUSH(%rsi);
 	pushq %rdx; /*src*/
+	CFI_PUSH(%rdx);
 	pushq %rcx; /*iv*/
+	CFI_PUSH(%rcx);
 
 	/* load input */
 	movl 0 * 4(%rdx), RL0d;
@@ -571,8 +599,11 @@ _gcry_3des_amd64_cbc_dec:
 	call _gcry_3des_amd64_crypt_blk3;
 
 	popq %rcx; /*iv*/
+	CFI_POP_TMP_REG();
 	popq %rdx; /*src*/
+	CFI_POP_TMP_REG();
 	popq %rsi; /*dst*/
+	CFI_POP_TMP_REG();
 
 	bswapl RR0d;
 	bswapl RL0d;
@@ -598,14 +629,21 @@ _gcry_3des_amd64_cbc_dec:
 	movl RL2d, 5 * 4(%rsi);
 
 	popq %r15;
+	CFI_POP(%r15);
 	popq %r14;
+	CFI_POP(%r14);
 	popq %r13;
+	CFI_POP(%r13);
 	popq %r12;
+	CFI_POP(%r12);
 	popq %rbx;
+	CFI_POP(%rbx);
 	popq %rbp;
+	CFI_POP(%rbp);
 
 	EXIT_SYSV_FUNC
 	ret;
+	CFI_ENDPROC();
 ELF(.size _gcry_3des_amd64_cbc_dec,.-_gcry_3des_amd64_cbc_dec;)
 
 .align 8
@@ -618,17 +656,26 @@ _gcry_3des_amd64_ctr_enc:
 	 *	%rdx: src (3 blocks)
 	 *	%rcx: iv (64bit)
 	 */
+	CFI_STARTPROC();
 	ENTER_SYSV_FUNC_PARAMS_0_4
 
 	pushq %rbp;
+	CFI_PUSH(%rbp);
 	pushq %rbx;
+	CFI_PUSH(%rbx);
 	pushq %r12;
+	CFI_PUSH(%r12);
 	pushq %r13;
+	CFI_PUSH(%r13);
 	pushq %r14;
+	CFI_PUSH(%r14);
 	pushq %r15;
+	CFI_PUSH(%r15);
 
 	pushq %rsi; /*dst*/
+	CFI_PUSH(%rsi);
 	pushq %rdx; /*src*/
+	CFI_PUSH(%rdx);
 	movq %rcx, RW2;
 
 	/* load IV and byteswap */
@@ -654,7 +701,9 @@ _gcry_3des_amd64_ctr_enc:
 	call _gcry_3des_amd64_crypt_blk3;
 
 	popq %rdx; /*src*/
+	CFI_POP_TMP_REG();
 	popq %rsi; /*dst*/
+	CFI_POP_TMP_REG();
 
 	bswapl RR0d;
 	bswapl RL0d;
@@ -678,14 +727,21 @@ _gcry_3des_amd64_ctr_enc:
 	movl RL2d, 5 * 4(%rsi);
 
 	popq %r15;
+	CFI_POP(%r15);
 	popq %r14;
+	CFI_POP(%r14);
 	popq %r13;
+	CFI_POP(%r13);
 	popq %r12;
+	CFI_POP(%r12);
 	popq %rbx;
+	CFI_POP(%rbx);
 	popq %rbp;
+	CFI_POP(%rbp);
 
 	EXIT_SYSV_FUNC
 	ret;
+	CFI_ENDPROC();
 ELF(.size _gcry_3des_amd64_cbc_dec,.-_gcry_3des_amd64_cbc_dec;)
 
 .align 8
@@ -698,17 +754,26 @@ _gcry_3des_amd64_cfb_dec:
 	 *	%rdx: src (3 blocks)
 	 *	%rcx: iv (64bit)
 	 */
+	CFI_STARTPROC();
 	ENTER_SYSV_FUNC_PARAMS_0_4
 
 	pushq %rbp;
+	CFI_PUSH(%rbp);
 	pushq %rbx;
+	CFI_PUSH(%rbx);
 	pushq %r12;
+	CFI_PUSH(%r12);
 	pushq %r13;
+	CFI_PUSH(%r13);
 	pushq %r14;
+	CFI_PUSH(%r14);
 	pushq %r15;
+	CFI_PUSH(%r15);
 
 	pushq %rsi; /*dst*/
+	CFI_PUSH(%rsi);
 	pushq %rdx; /*src*/
+	CFI_PUSH(%rdx);
 	movq %rcx, RW2;
 
 	/* Load input */
@@ -733,7 +798,9 @@ _gcry_3des_amd64_cfb_dec:
 	call _gcry_3des_amd64_crypt_blk3;
 
 	popq %rdx; /*src*/
+	CFI_POP_TMP_REG();
 	popq %rsi; /*dst*/
+	CFI_POP_TMP_REG();
 
 	bswapl RR0d;
 	bswapl RL0d;
@@ -757,14 +824,21 @@ _gcry_3des_amd64_cfb_dec:
 	movl RL2d, 5 * 4(%rsi);
 
 	popq %r15;
+	CFI_POP(%r15);
 	popq %r14;
+	CFI_POP(%r14);
 	popq %r13;
+	CFI_POP(%r13);
 	popq %r12;
+	CFI_POP(%r12);
 	popq %rbx;
+	CFI_POP(%rbx);
 	popq %rbp;
+	CFI_POP(%rbp);
 
 	EXIT_SYSV_FUNC
 	ret;
+	CFI_ENDPROC();
 ELF(.size _gcry_3des_amd64_cfb_dec,.-_gcry_3des_amd64_cfb_dec;)
 
 .align 16
diff --git a/cipher/rijndael-amd64.S b/cipher/rijndael-amd64.S
index 798ff51af..3dcaa856b 100644
--- a/cipher/rijndael-amd64.S
+++ b/cipher/rijndael-amd64.S
@@ -212,14 +212,19 @@ _gcry_aes_amd64_encrypt_block:
 	 *	%ecx: number of rounds.. 10, 12 or 14
 	 *	%r8:  encryption tables
 	 */
+	CFI_STARTPROC();
 	ENTER_SYSV_FUNC_PARAMS_5
 
 	subq $(5 * 8), %rsp;
+	CFI_ADJUST_CFA_OFFSET(5 * 8);
 	movq %rsi, (0 * 8)(%rsp);
 	movl %ecx, (1 * 8)(%rsp);
 	movq %rbp, (2 * 8)(%rsp);
 	movq %rbx, (3 * 8)(%rsp);
 	movq %r12, (4 * 8)(%rsp);
+	CFI_REL_OFFSET(%rbp, 2 * 8);
+	CFI_REL_OFFSET(%rbx, 3 * 8);
+	CFI_REL_OFFSET(%r12, 4 * 8);
 
 	leaq (%r8), RTAB;
 
@@ -251,16 +256,23 @@ _gcry_aes_amd64_encrypt_block:
 	movl RCd, 2 * 4(%rsi);
 	movl RDd, 3 * 4(%rsi);
 
+	CFI_REMEMBER_STATE();
+
 	movq (4 * 8)(%rsp), %r12;
 	movq (3 * 8)(%rsp), %rbx;
 	movq (2 * 8)(%rsp), %rbp;
+	CFI_RESTORE(%r12);
+	CFI_RESTORE(%rbx);
+	CFI_RESTORE(%rbp);
 	addq $(5 * 8), %rsp;
+	CFI_ADJUST_CFA_OFFSET(-5 * 8);
 
 	movl $(6 * 8), %eax;
 
 	EXIT_SYSV_FUNC
 	ret;
 
+	CFI_RESTORE_STATE();
 .align 4
 .Lenc_not_128:
 	je .Lenc_192
@@ -280,6 +292,7 @@ _gcry_aes_amd64_encrypt_block:
 	lastencround(11);
 
 	jmp .Lenc_done;
+	CFI_ENDPROC();
 ELF(.size _gcry_aes_amd64_encrypt_block,.-_gcry_aes_amd64_encrypt_block;)
 
 #define do_decround(next_r) \
@@ -376,14 +389,19 @@ _gcry_aes_amd64_decrypt_block:
 	 *	%ecx: number of rounds.. 10, 12 or 14
 	 *	%r8:  decryption tables
 	 */
+	CFI_STARTPROC();
 	ENTER_SYSV_FUNC_PARAMS_5
 
 	subq $(5 * 8), %rsp;
+	CFI_ADJUST_CFA_OFFSET(5 * 8);
 	movq %rsi, (0 * 8)(%rsp);
 	movl %ecx, (1 * 8)(%rsp);
 	movq %rbp, (2 * 8)(%rsp);
 	movq %rbx, (3 * 8)(%rsp);
 	movq %r12, (4 * 8)(%rsp);
+	CFI_REL_OFFSET(%rbp, 2 * 8);
+	CFI_REL_OFFSET(%rbx, 3 * 8);
+	CFI_REL_OFFSET(%r12, 4 * 8);
 
 	leaq (%r8), RTAB;
 
@@ -416,16 +434,23 @@ _gcry_aes_amd64_decrypt_block:
 	movl RCd, 2 * 4(%rsi);
 	movl RDd, 3 * 4(%rsi);
 
+	CFI_REMEMBER_STATE();
+
 	movq (4 * 8)(%rsp), %r12;
 	movq (3 * 8)(%rsp), %rbx;
 	movq (2 * 8)(%rsp), %rbp;
+	CFI_RESTORE(%r12);
+	CFI_RESTORE(%rbx);
+	CFI_RESTORE(%rbp);
 	addq $(5 * 8), %rsp;
+	CFI_ADJUST_CFA_OFFSET(-5 * 8);
 
 	movl $(6 * 8), %eax;
 
 	EXIT_SYSV_FUNC
 	ret;
 
+	CFI_RESTORE_STATE();
 .align 4
 .Ldec_256:
 	je .Ldec_192;
@@ -445,6 +470,7 @@ _gcry_aes_amd64_decrypt_block:
 	decround(9);
 
 	jmp .Ldec_tail;
+	CFI_ENDPROC();
 ELF(.size _gcry_aes_amd64_decrypt_block,.-_gcry_aes_amd64_decrypt_block;)
 
 #endif /*USE_AES*/
diff --git a/cipher/rijndael-ssse3-amd64-asm.S b/cipher/rijndael-ssse3-amd64-asm.S
index ffce5df2f..8124eb219 100644
--- a/cipher/rijndael-ssse3-amd64-asm.S
+++ b/cipher/rijndael-ssse3-amd64-asm.S
@@ -50,6 +50,7 @@
 ELF(.type _gcry_aes_ssse3_enc_preload,@function)
 .globl _gcry_aes_ssse3_enc_preload
 _gcry_aes_ssse3_enc_preload:
+	CFI_STARTPROC();
 	ENTER_SYSV_FUNC_PARAMS_0_4
 	lea	.Laes_consts(%rip), %rax
 	movdqa	          (%rax), %xmm9  # 0F
@@ -61,6 +62,7 @@ _gcry_aes_ssse3_enc_preload:
 	movdqa	.Lk_sb2+16(%rax), %xmm14 # sb2t
 	EXIT_SYSV_FUNC
 	ret
+	CFI_ENDPROC();
 ELF(.size _gcry_aes_ssse3_enc_preload,.-_gcry_aes_ssse3_enc_preload)
 
 ##
@@ -69,6 +71,7 @@ ELF(.size _gcry_aes_ssse3_enc_preload,.-_gcry_aes_ssse3_enc_preload)
 ELF(.type _gcry_aes_ssse3_dec_preload,@function)
 .globl _gcry_aes_ssse3_dec_preload
 _gcry_aes_ssse3_dec_preload:
+	CFI_STARTPROC();
 	ENTER_SYSV_FUNC_PARAMS_0_4
 	lea	.Laes_consts(%rip), %rax
 	movdqa	          (%rax), %xmm9   # 0F
@@ -81,6 +84,7 @@ _gcry_aes_ssse3_dec_preload:
 	movdqa	.Lk_dsbe   (%rax), %xmm8  # sbeu
 	EXIT_SYSV_FUNC
 	ret
+	CFI_ENDPROC();
 ELF(.size _gcry_aes_ssse3_dec_preload,.-_gcry_aes_ssse3_dec_preload)
 
 ##
@@ -111,6 +115,7 @@ ELF(.type _gcry_aes_ssse3_encrypt_core,@function)
 .globl _gcry_aes_ssse3_encrypt_core
 _gcry_aes_ssse3_encrypt_core:
 _aes_encrypt_core:
+	CFI_STARTPROC();
 	ENTER_SYSV_FUNC_PARAMS_0_4
 	mov	%rdi,	%rdx
 	leaq	-1(%rsi), %rax
@@ -190,6 +195,7 @@ _aes_encrypt_core:
 	pshufb	.Lk_sr(%rsi,%rcx), %xmm0
 	EXIT_SYSV_FUNC
 	ret
+	CFI_ENDPROC();
 ELF(.size _aes_encrypt_core,.-_aes_encrypt_core)
 
 ##
@@ -202,6 +208,7 @@ ELF(.size _aes_encrypt_core,.-_aes_encrypt_core)
 ELF(.type _gcry_aes_ssse3_decrypt_core,@function)
 _gcry_aes_ssse3_decrypt_core:
 _aes_decrypt_core:
+	CFI_STARTPROC();
 	ENTER_SYSV_FUNC_PARAMS_0_4
 	mov	%rdi,	%rdx
 	lea	.Laes_consts(%rip), %rcx
@@ -297,6 +304,7 @@ _aes_decrypt_core:
 	pshufb	.Lk_sr(%rsi,%rcx), %xmm0
 	EXIT_SYSV_FUNC
 	ret
+	CFI_ENDPROC();
 ELF(.size _aes_decrypt_core,.-_aes_decrypt_core)
 
 ########################################################
@@ -315,6 +323,7 @@ _aes_schedule_core:
 	# rdx = buffer
 	# rcx = direction.  0=encrypt, 1=decrypt
 	# r8 = rotoffs
+	CFI_STARTPROC();
 	ENTER_SYSV_FUNC_PARAMS_5
 
 	# load the tables
@@ -671,6 +680,7 @@ _aes_schedule_core:
 	pxor	%xmm8,  %xmm8
 	EXIT_SYSV_FUNC
 	ret
+	CFI_ENDPROC();
 ELF(.size _gcry_aes_ssse3_schedule_core,.-_gcry_aes_ssse3_schedule_core)
 
 ########################################################
diff --git a/cipher/salsa20-amd64.S b/cipher/salsa20-amd64.S
index 470c32aad..ae8f27155 100644
--- a/cipher/salsa20-amd64.S
+++ b/cipher/salsa20-amd64.S
@@ -28,11 +28,7 @@
 #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && defined(USE_SALSA20)
 
-#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
-# define ELF(...) __VA_ARGS__
-#else
-# define ELF(...) /*_*/
-#endif
+#include "asm-common-amd64.h"
 
 .text
 
@@ -40,6 +36,7 @@
 .globl _gcry_salsa20_amd64_keysetup
 ELF(.type  _gcry_salsa20_amd64_keysetup,@function;)
 _gcry_salsa20_amd64_keysetup:
+	CFI_STARTPROC();
 	movl   0(%rsi),%r8d
 	movl   4(%rsi),%r9d
 	movl   8(%rsi),%eax
@@ -87,11 +84,13 @@ _gcry_salsa20_amd64_keysetup:
 	movl   %r8d,12(%rdi)
 .L_keysetupdone:
 	ret
+	CFI_ENDPROC();
 
 .align 8
 .globl _gcry_salsa20_amd64_ivsetup
 ELF(.type  _gcry_salsa20_amd64_ivsetup,@function;)
 _gcry_salsa20_amd64_ivsetup:
+	CFI_STARTPROC();
 	movl   0(%rsi),%r8d
 	movl   4(%rsi),%esi
 	mov  $0,%r9
@@ -101,6 +100,7 @@ _gcry_salsa20_amd64_ivsetup:
 	movl   %r9d,32(%rdi)
 	movl   %eax,52(%rdi)
 	ret
+	CFI_ENDPROC();
 
 .align 8
 .globl _gcry_salsa20_amd64_encrypt_blocks
@@ -112,13 +112,15 @@ _gcry_salsa20_amd64_encrypt_blocks:
 	 *  - Length is input as number of blocks, so don't handle tail bytes
 	 *    (this is done in salsa20.c).
 	 */
+	CFI_STARTPROC();
 	push %rbx
+	CFI_PUSH(%rbx);
 	shlq $6, %rcx /* blocks to bytes */
 	mov %r8, %rbx
 	mov %rsp,%r11
-	and $31,%r11
-	add $384,%r11
-	sub %r11,%rsp
+	CFI_DEF_CFA_REGISTER(%r11);
+	sub $384,%rsp
+	and $~31,%rsp
 	mov  %rdi,%r8
 	mov  %rsi,%rsi
 	mov  %rdx,%rdi
@@ -916,15 +918,22 @@ _gcry_salsa20_amd64_encrypt_blocks:
 	cmp  $64,%rdx
 	ja .L_bytes_are_128_or_192
 .L_done:
-	add %r11,%rsp
+	CFI_REMEMBER_STATE();
 	mov %r11,%rax
+	sub %rsp,%rax
+	mov %r11,%rsp
+	CFI_REGISTER(%r11, %rsp)
+	CFI_DEF_CFA_REGISTER(%rsp)
 	pop %rbx
+	CFI_POP(%rbx)
 	ret
+	CFI_RESTORE_STATE();
 .L_bytes_are_128_or_192:
 	sub  $64,%rdx
 	add  $64,%rdi
 	add  $64,%rsi
 	jmp .L_bytes_are_64_128_or_192
+	CFI_ENDPROC();
 ELF(.size _gcry_salsa20_amd64_encrypt_blocks,.-_gcry_salsa20_amd64_encrypt_blocks;)
 
 #endif /*defined(USE_SALSA20)*/
diff --git a/cipher/serpent-avx2-amd64.S b/cipher/serpent-avx2-amd64.S
index 8d60a159e..9b17c2bd1 100644
--- a/cipher/serpent-avx2-amd64.S
+++ b/cipher/serpent-avx2-amd64.S
@@ -24,17 +24,7 @@
     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && defined(USE_SERPENT) && \
     defined(ENABLE_AVX2_SUPPORT)
 
-#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
-# define ELF(...) __VA_ARGS__
-#else
-# define ELF(...) /*_*/
-#endif
-
-#ifdef __PIC__
-#  define RIP (%rip)
-#else
-#  define RIP
-#endif
+#include "asm-common-amd64.h"
 
 /* struct serpent_context: */
 #define ctx_keys 0
@@ -421,6 +411,7 @@ __serpent_enc_blk16:
 	 *	RA4, RA1, RA2, RA0, RB4, RB1, RB2, RB0: sixteen parallel
 	 * 						ciphertext blocks
 	 */
+	CFI_STARTPROC();
 
 	vpcmpeqd RNOT, RNOT, RNOT;
 
@@ -496,6 +487,7 @@ __serpent_enc_blk16:
 	transpose_4x4(RB4, RB1, RB2, RB0, RB3, RTMP0, RTMP1);
 
 	ret;
+	CFI_ENDPROC();
 ELF(.size __serpent_enc_blk16,.-__serpent_enc_blk16;)
 
 .align 8
@@ -509,6 +501,7 @@ __serpent_dec_blk16:
 	 *	RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: sixteen parallel
 	 *						plaintext blocks
 	 */
+	CFI_STARTPROC();
 
 	vpcmpeqd RNOT, RNOT, RNOT;
 
@@ -586,6 +579,7 @@ __serpent_dec_blk16:
 	transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1);
 
 	ret;
+	CFI_ENDPROC();
 ELF(.size __serpent_dec_blk16,.-__serpent_dec_blk16;)
 
 #define inc_le128(x, minus_one, tmp) \
@@ -604,13 +598,14 @@ _gcry_serpent_avx2_ctr_enc:
 	 *	%rdx: src (16 blocks)
 	 *	%rcx: iv (big endian, 128bit)
 	 */
+	CFI_STARTPROC();
 
 	movq 8(%rcx), %rax;
 	bswapq %rax;
 
 	vzeroupper;
 
-	vbroadcasti128 .Lbswap128_mask RIP, RTMP3;
+	vbroadcasti128 .Lbswap128_mask rRIP, RTMP3;
 	vpcmpeqd RNOT, RNOT, RNOT;
 	vpsrldq $8, RNOT, RNOT;   /* ab: -1:0 ; cd: -1:0 */
 	vpaddq RNOT, RNOT, RTMP2; /* ab: -2:0 ; cd: -2:0 */
@@ -701,7 +696,8 @@ _gcry_serpent_avx2_ctr_enc:
 
 	vzeroall;
 
-	ret
+	ret;
+	CFI_ENDPROC();
 ELF(.size _gcry_serpent_avx2_ctr_enc,.-_gcry_serpent_avx2_ctr_enc;)
 
 .align 8
@@ -714,6 +710,7 @@ _gcry_serpent_avx2_cbc_dec:
 	 *	%rdx: src (16 blocks)
 	 *	%rcx: iv
 	 */
+	CFI_STARTPROC();
 
 	vzeroupper;
 
@@ -752,7 +749,8 @@ _gcry_serpent_avx2_cbc_dec:
 
 	vzeroall;
 
-	ret
+	ret;
+	CFI_ENDPROC();
 ELF(.size _gcry_serpent_avx2_cbc_dec,.-_gcry_serpent_avx2_cbc_dec;)
 
 .align 8
@@ -765,6 +763,7 @@ _gcry_serpent_avx2_cfb_dec:
 	 *	%rdx: src (16 blocks)
 	 *	%rcx: iv
 	 */
+	CFI_STARTPROC();
 
 	vzeroupper;
 
@@ -805,7 +804,8 @@ _gcry_serpent_avx2_cfb_dec:
 
 	vzeroall;
 
-	ret
+	ret;
+	CFI_ENDPROC();
 ELF(.size _gcry_serpent_avx2_cfb_dec,.-_gcry_serpent_avx2_cfb_dec;)
 
 .align 8
@@ -821,15 +821,21 @@ _gcry_serpent_avx2_ocb_enc:
 	 *	%r8 : checksum
 	 *	%r9 : L pointers (void *L[16])
 	 */
+	CFI_STARTPROC();
 
 	vzeroupper;
 
 	subq $(4 * 8), %rsp;
+	CFI_ADJUST_CFA_OFFSET(4 * 8);
 
 	movq %r10, (0 * 8)(%rsp);
 	movq %r11, (1 * 8)(%rsp);
 	movq %r12, (2 * 8)(%rsp);
 	movq %r13, (3 * 8)(%rsp);
+	CFI_REL_OFFSET(%r10, 0 * 8);
+	CFI_REL_OFFSET(%r11, 1 * 8);
+	CFI_REL_OFFSET(%r12, 2 * 8);
+	CFI_REL_OFFSET(%r13, 3 * 8);
 
 	vmovdqu (%rcx), RTMP0x;
 	vmovdqu (%r8), RTMP1x;
@@ -882,10 +888,15 @@ _gcry_serpent_avx2_ocb_enc:
 	movq (1 * 8)(%rsp), %r11;
 	movq (2 * 8)(%rsp), %r12;
 	movq (3 * 8)(%rsp), %r13;
+	CFI_RESTORE(%r10);
+	CFI_RESTORE(%r11);
+	CFI_RESTORE(%r12);
+	CFI_RESTORE(%r13);
 
 	call __serpent_enc_blk16;
 
 	addq $(4 * 8), %rsp;
+	CFI_ADJUST_CFA_OFFSET(-4 * 8);
 
 	vpxor (0 * 32)(%rsi), RA4, RA4;
 	vpxor (1 * 32)(%rsi), RA1, RA1;
@@ -908,6 +919,7 @@ _gcry_serpent_avx2_ocb_enc:
 	vzeroall;
 
 	ret;
+	CFI_ENDPROC();
 ELF(.size _gcry_serpent_avx2_ocb_enc,.-_gcry_serpent_avx2_ocb_enc;)
 
 .align 8
@@ -923,15 +935,21 @@ _gcry_serpent_avx2_ocb_dec:
 	 *	%r8 : checksum
 	 *	%r9 : L pointers (void *L[16])
 	 */
+	CFI_STARTPROC();
 
 	vzeroupper;
 
 	subq $(4 * 8), %rsp;
+	CFI_ADJUST_CFA_OFFSET(4 * 8);
 
 	movq %r10, (0 * 8)(%rsp);
 	movq %r11, (1 * 8)(%rsp);
 	movq %r12, (2 * 8)(%rsp);
 	movq %r13, (3 * 8)(%rsp);
+	CFI_REL_OFFSET(%r10, 0 * 8);
+	CFI_REL_OFFSET(%r11, 1 * 8);
+	CFI_REL_OFFSET(%r12, 2 * 8);
+	CFI_REL_OFFSET(%r13, 3 * 8);
 
 	vmovdqu (%rcx), RTMP0x;
 
@@ -978,10 +996,15 @@ _gcry_serpent_avx2_ocb_dec:
 	movq (1 * 8)(%rsp), %r11;
 	movq (2 * 8)(%rsp), %r12;
 	movq (3 * 8)(%rsp), %r13;
+	CFI_RESTORE(%r10);
+	CFI_RESTORE(%r11);
+	CFI_RESTORE(%r12);
+	CFI_RESTORE(%r13);
 
 	call __serpent_dec_blk16;
 
 	addq $(4 * 8), %rsp;
+	CFI_ADJUST_CFA_OFFSET(-4 * 8);
 
 	vmovdqu (%r8), RTMP1x;
 
@@ -1020,6 +1043,7 @@ _gcry_serpent_avx2_ocb_dec:
 	vzeroall;
 
 	ret;
+	CFI_ENDPROC();
 ELF(.size _gcry_serpent_avx2_ocb_dec,.-_gcry_serpent_avx2_ocb_dec;)
 
 .align 8
@@ -1034,15 +1058,21 @@ _gcry_serpent_avx2_ocb_auth:
 	 *	%rcx: checksum
 	 *	%r8 : L pointers (void *L[16])
 	 */
+	CFI_STARTPROC();
 
 	vzeroupper;
 
 	subq $(4 * 8), %rsp;
+	CFI_ADJUST_CFA_OFFSET(4 * 8);
 
 	movq %r10, (0 * 8)(%rsp);
 	movq %r11, (1 * 8)(%rsp);
 	movq %r12, (2 * 8)(%rsp);
 	movq %r13, (3 * 8)(%rsp);
+	CFI_REL_OFFSET(%r10, 0 * 8);
+	CFI_REL_OFFSET(%r11, 1 * 8);
+	CFI_REL_OFFSET(%r12, 2 * 8);
+	CFI_REL_OFFSET(%r13, 3 * 8);
 
 	vmovdqu (%rdx), RTMP0x;
 
@@ -1088,10 +1118,15 @@ _gcry_serpent_avx2_ocb_auth:
 	movq (1 * 8)(%rsp), %r11;
 	movq (2 * 8)(%rsp), %r12;
 	movq (3 * 8)(%rsp), %r13;
+	CFI_RESTORE(%r10);
+	CFI_RESTORE(%r11);
+	CFI_RESTORE(%r12);
+	CFI_RESTORE(%r13);
 
 	call __serpent_enc_blk16;
 
 	addq $(4 * 8), %rsp;
+	CFI_ADJUST_CFA_OFFSET(-4 * 8);
 
 	vpxor RA4, RB4, RA4;
 	vpxor RA1, RB1, RA1;
@@ -1111,6 +1146,7 @@ _gcry_serpent_avx2_ocb_auth:
 	vzeroall;
 
 	ret;
+	CFI_ENDPROC();
 ELF(.size _gcry_serpent_avx2_ocb_auth,.-_gcry_serpent_avx2_ocb_auth;)
 
 .align 16
diff --git a/cipher/serpent-sse2-amd64.S b/cipher/serpent-sse2-amd64.S
index b149af24e..39cba0029 100644
--- a/cipher/serpent-sse2-amd64.S
+++ b/cipher/serpent-sse2-amd64.S
@@ -23,17 +23,7 @@
 #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && defined(USE_SERPENT)
 
-#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
-# define ELF(...) __VA_ARGS__
-#else
-# define ELF(...) /*_*/
-#endif
-
-#ifdef __PIC__
-#  define RIP (%rip)
-#else
-#  define RIP
-#endif
+#include "asm-common-amd64.h"
 
 /* struct serpent_context: */
 #define ctx_keys 0
@@ -444,6 +434,7 @@ __serpent_enc_blk8:
 	 *	RA4, RA1, RA2, RA0, RB4, RB1, RB2, RB0: eight parallel
 	 * 						ciphertext blocks
 	 */
+	CFI_STARTPROC();
 
 	pcmpeqd RNOT, RNOT;
 
@@ -519,6 +510,7 @@ __serpent_enc_blk8:
 	transpose_4x4(RB4, RB1, RB2, RB0, RB3, RTMP0, RTMP1);
 
 	ret;
+	CFI_ENDPROC();
 ELF(.size __serpent_enc_blk8,.-__serpent_enc_blk8;)
 
 .align 8
@@ -532,6 +524,7 @@ __serpent_dec_blk8:
 	 *	RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel plaintext
 	 *						blocks
 	 */
+	CFI_STARTPROC();
 
 	pcmpeqd RNOT, RNOT;
 
@@ -609,6 +602,7 @@ __serpent_dec_blk8:
 	transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1);
 
 	ret;
+	CFI_ENDPROC();
 ELF(.size __serpent_dec_blk8,.-__serpent_dec_blk8;)
 
 .align 8
@@ -621,6 +615,7 @@ _gcry_serpent_sse2_ctr_enc:
 	 *	%rdx: src (8 blocks)
 	 *	%rcx: iv (big endian, 128bit)
 	 */
+	CFI_STARTPROC();
 
 	/* load IV and byteswap */
 	movdqu (%rcx), RA0;
@@ -738,7 +733,8 @@ _gcry_serpent_sse2_ctr_enc:
 	pxor RTMP2, RTMP2;
 	pxor RNOT, RNOT;
 
-	ret
+	ret;
+	CFI_ENDPROC();
 ELF(.size _gcry_serpent_sse2_ctr_enc,.-_gcry_serpent_sse2_ctr_enc;)
 
 .align 8
@@ -751,6 +747,7 @@ _gcry_serpent_sse2_cbc_dec:
 	 *	%rdx: src (8 blocks)
 	 *	%rcx: iv
 	 */
+	CFI_STARTPROC();
 
 	movdqu (0 * 16)(%rdx), RA0;
 	movdqu (1 * 16)(%rdx), RA1;
@@ -799,7 +796,8 @@ _gcry_serpent_sse2_cbc_dec:
 	pxor RTMP2, RTMP2;
 	pxor RNOT, RNOT;
 
-	ret
+	ret;
+	CFI_ENDPROC();
 ELF(.size _gcry_serpent_sse2_cbc_dec,.-_gcry_serpent_sse2_cbc_dec;)
 
 .align 8
@@ -812,6 +810,7 @@ _gcry_serpent_sse2_cfb_dec:
 	 *	%rdx: src (8 blocks)
 	 *	%rcx: iv
 	 */
+	CFI_STARTPROC();
 
 	/* Load input */
 	movdqu (%rcx), RA0;
@@ -863,7 +862,8 @@ _gcry_serpent_sse2_cfb_dec:
 	pxor RTMP2, RTMP2;
 	pxor RNOT, RNOT;
 
-	ret
+	ret;
+	CFI_ENDPROC();
 ELF(.size _gcry_serpent_sse2_cfb_dec,.-_gcry_serpent_sse2_cfb_dec;)
 
 .align 8
@@ -879,13 +879,19 @@ _gcry_serpent_sse2_ocb_enc:
 	 *	%r8 : checksum
 	 *	%r9 : L pointers (void *L[8])
 	 */
+	CFI_STARTPROC();
 
 	subq $(4 * 8), %rsp;
+	CFI_ADJUST_CFA_OFFSET(4 * 8);
 
 	movq %r10, (0 * 8)(%rsp);
 	movq %r11, (1 * 8)(%rsp);
 	movq %r12, (2 * 8)(%rsp);
 	movq %r13, (3 * 8)(%rsp);
+	CFI_REL_OFFSET(%r10, 0 * 8);
+	CFI_REL_OFFSET(%r11, 1 * 8);
+	CFI_REL_OFFSET(%r12, 2 * 8);
+	CFI_REL_OFFSET(%r13, 3 * 8);
 
 	movdqu (%rcx), RTMP0;
 	movdqu (%r8), RTMP1;
@@ -926,10 +932,15 @@ _gcry_serpent_sse2_ocb_enc:
 	movq (1 * 8)(%rsp), %r11;
 	movq (2 * 8)(%rsp), %r12;
 	movq (3 * 8)(%rsp), %r13;
+	CFI_RESTORE(%r10);
+	CFI_RESTORE(%r11);
+	CFI_RESTORE(%r12);
+	CFI_RESTORE(%r13);
 
 	call __serpent_enc_blk8;
 
 	addq $(4 * 8), %rsp;
+	CFI_ADJUST_CFA_OFFSET(-4 * 8);
 
 	pxor_u((0 * 16)(%rsi), RA4, RTMP0);
 	pxor_u((1 * 16)(%rsi), RA1, RTMP0);
@@ -966,6 +977,7 @@ _gcry_serpent_sse2_ocb_enc:
 	pxor RNOT, RNOT;
 
 	ret;
+	CFI_ENDPROC();
 ELF(.size _gcry_serpent_sse2_ocb_enc,.-_gcry_serpent_sse2_ocb_enc;)
 
 .align 8
@@ -981,13 +993,19 @@ _gcry_serpent_sse2_ocb_dec:
 	 *	%r8 : checksum
 	 *	%r9 : L pointers (void *L[8])
 	 */
+	CFI_STARTPROC();
 
 	subq $(4 * 8), %rsp;
+	CFI_ADJUST_CFA_OFFSET(4 * 8);
 
 	movq %r10, (0 * 8)(%rsp);
 	movq %r11, (1 * 8)(%rsp);
 	movq %r12, (2 * 8)(%rsp);
 	movq %r13, (3 * 8)(%rsp);
+	CFI_REL_OFFSET(%r10, 0 * 8);
+	CFI_REL_OFFSET(%r11, 1 * 8);
+	CFI_REL_OFFSET(%r12, 2 * 8);
+	CFI_REL_OFFSET(%r13, 3 * 8);
 
 	movdqu (%rcx), RTMP0;
 
@@ -1024,10 +1042,15 @@ _gcry_serpent_sse2_ocb_dec:
 	movq (1 * 8)(%rsp), %r11;
 	movq (2 * 8)(%rsp), %r12;
 	movq (3 * 8)(%rsp), %r13;
+	CFI_RESTORE(%r10);
+	CFI_RESTORE(%r11);
+	CFI_RESTORE(%r12);
+	CFI_RESTORE(%r13);
 
 	call __serpent_dec_blk8;
 
 	addq $(4 * 8), %rsp;
+	CFI_ADJUST_CFA_OFFSET(-4 * 8);
 
 	movdqu (%r8), RTMP0;
 
@@ -1078,6 +1101,7 @@ _gcry_serpent_sse2_ocb_dec:
 	pxor RNOT, RNOT;
 
 	ret;
+	CFI_ENDPROC();
 ELF(.size _gcry_serpent_sse2_ocb_dec,.-_gcry_serpent_sse2_ocb_dec;)
 
 .align 8
@@ -1092,13 +1116,19 @@ _gcry_serpent_sse2_ocb_auth:
 	 *	%rcx: checksum
 	 *	%r8 : L pointers (void *L[8])
 	 */
+	CFI_STARTPROC();
 
 	subq $(4 * 8), %rsp;
+	CFI_ADJUST_CFA_OFFSET(4 * 8);
 
 	movq %r10, (0 * 8)(%rsp);
 	movq %r11, (1 * 8)(%rsp);
 	movq %r12, (2 * 8)(%rsp);
 	movq %r13, (3 * 8)(%rsp);
+	CFI_REL_OFFSET(%r10, 0 * 8);
+	CFI_REL_OFFSET(%r11, 1 * 8);
+	CFI_REL_OFFSET(%r12, 2 * 8);
+	CFI_REL_OFFSET(%r13, 3 * 8);
 
 	movdqu (%rdx), RTMP0;
 
@@ -1134,10 +1164,15 @@ _gcry_serpent_sse2_ocb_auth:
 	movq (1 * 8)(%rsp), %r11;
 	movq (2 * 8)(%rsp), %r12;
 	movq (3 * 8)(%rsp), %r13;
+	CFI_RESTORE(%r10);
+	CFI_RESTORE(%r11);
+	CFI_RESTORE(%r12);
+	CFI_RESTORE(%r13);
 
 	call __serpent_enc_blk8;
 
 	addq $(4 * 8), %rsp;
+	CFI_ADJUST_CFA_OFFSET(-4 * 8);
 
 	movdqu (%rcx), RTMP0;
 	pxor RB4, RA4;
@@ -1169,6 +1204,7 @@ _gcry_serpent_sse2_ocb_auth:
 	pxor RNOT, RNOT;
 
 	ret;
+	CFI_ENDPROC();
 ELF(.size _gcry_serpent_sse2_ocb_auth,.-_gcry_serpent_sse2_ocb_auth;)
 
 #endif /*defined(USE_SERPENT)*/
diff --git a/cipher/sha1-avx-amd64.S b/cipher/sha1-avx-amd64.S
index 5d674c151..85876ad41 100644
--- a/cipher/sha1-avx-amd64.S
+++ b/cipher/sha1-avx-amd64.S
@@ -33,18 +33,7 @@
      defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
     defined(HAVE_GCC_INLINE_ASM_AVX) && defined(USE_SHA1)
 
-#ifdef __PIC__
-#  define RIP (%rip)
-#else
-#  define RIP
-#endif
-
-
-#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
-# define ELF(...) __VA_ARGS__
-#else
-# define ELF(...) /*_*/
-#endif
+#include "asm-common-amd64.h"
 
 
 /* Context structure */
@@ -161,7 +150,7 @@
 	vpshufb BSWAP_REG, tmp0, W;
 
 #define W_PRECALC_00_15_2(i, W, tmp0) \
-	vpaddd (.LK_XMM + ((i)/20)*16) RIP, W, tmp0;
+	vpaddd (.LK_XMM + ((i)/20)*16) rRIP, W, tmp0;
 
 #define W_PRECALC_00_15_3(i, W, tmp0) \
 	vmovdqa tmp0, WK(i&~3);
@@ -186,7 +175,7 @@
 #define W_PRECALC_16_31_3(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \
 	vpxor W, tmp0, tmp0; \
 	vpxor tmp1, tmp0, W; \
-	vpaddd (.LK_XMM + ((i)/20)*16) RIP, W, tmp0; \
+	vpaddd (.LK_XMM + ((i)/20)*16) rRIP, W, tmp0; \
 	vmovdqa tmp0, WK((i)&~3);
 
 #define W_PRECALC_32_79_0(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \
@@ -203,7 +192,7 @@
 
 #define W_PRECALC_32_79_3(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \
 	vpor W, tmp0, W; \
-	vpaddd (.LK_XMM + ((i)/20)*16) RIP, W, tmp0; \
+	vpaddd (.LK_XMM + ((i)/20)*16) rRIP, W, tmp0; \
 	vmovdqa tmp0, WK((i)&~3);
 
 
@@ -223,6 +212,7 @@ _gcry_sha1_transform_amd64_avx:
    *	%rsi: data (64*nblks bytes)
    *	%rdx: nblks
    */
+  CFI_STARTPROC();
 
   xorl %eax, %eax;
   cmpq $0, %rdx;
@@ -234,9 +224,12 @@ _gcry_sha1_transform_amd64_avx:
   movq %rdi, RSTATE;
   movq %rsi, RDATA;
   pushq %rbx;
+  CFI_PUSH(%rbx);
   pushq %rbp;
+  CFI_PUSH(%rbp);
 
   movq %rsp, ROLDSTACK;
+  CFI_DEF_CFA_REGISTER(ROLDSTACK);
 
   subq $(16*4), %rsp;
   andq $(~31), %rsp;
@@ -248,7 +241,7 @@ _gcry_sha1_transform_amd64_avx:
   movl state_h3(RSTATE), d;
   movl state_h4(RSTATE), e;
 
-  vmovdqa .Lbswap_shufb_ctl RIP, BSWAP_REG;
+  vmovdqa .Lbswap_shufb_ctl rRIP, BSWAP_REG;
 
   /* Precalc 0-15. */
   W_PRECALC_00_15_0(0, W0, Wtmp0);
@@ -415,15 +408,20 @@ _gcry_sha1_transform_amd64_avx:
   movl e, state_h4(RSTATE);
 
   movq ROLDSTACK, %rsp;
+  CFI_REGISTER(ROLDSTACK, %rsp);
+  CFI_DEF_CFA_REGISTER(%rsp);
 
   popq %rbp;
+  CFI_POP(%rbp);
   popq %rbx;
+  CFI_POP(%rbx);
 
   /* stack already burned */
   xorl %eax, %eax;
 
 .Lret:
   ret;
+  CFI_ENDPROC();
 ELF(.size _gcry_sha1_transform_amd64_avx,
     .-_gcry_sha1_transform_amd64_avx;)
 
diff --git a/cipher/sha1-avx-bmi2-amd64.S b/cipher/sha1-avx-bmi2-amd64.S
index fe8901eff..5dfcdca97 100644
--- a/cipher/sha1-avx-bmi2-amd64.S
+++ b/cipher/sha1-avx-bmi2-amd64.S
@@ -34,18 +34,7 @@
     defined(HAVE_GCC_INLINE_ASM_BMI2) && \
     defined(HAVE_GCC_INLINE_ASM_AVX) && defined(USE_SHA1)
 
-#ifdef __PIC__
-#  define RIP (%rip)
-#else
-#  define RIP
-#endif
-
-
-#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
-# define ELF(...) __VA_ARGS__
-#else
-# define ELF(...) /*_*/
-#endif
+#include "asm-common-amd64.h"
 
 
 /* Context structure */
@@ -222,6 +211,7 @@ _gcry_sha1_transform_amd64_avx_bmi2:
    *	%rsi: data (64*nblks bytes)
    *	%rdx: nblks
    */
+  CFI_STARTPROC();
 
   xorl %eax, %eax;
   cmpq $0, %rdx;
@@ -233,10 +223,14 @@ _gcry_sha1_transform_amd64_avx_bmi2:
   movq %rdi, RSTATE;
   movq %rsi, RDATA;
   pushq %rbx;
+  CFI_PUSH(%rbx);
   pushq %rbp;
+  CFI_PUSH(%rbp);
   pushq %r12;
+  CFI_PUSH(%r12);
 
   movq %rsp, ROLDSTACK;
+  CFI_DEF_CFA_REGISTER(ROLDSTACK);
 
   subq $(16*4), %rsp;
   andq $(~31), %rsp;
@@ -249,11 +243,11 @@ _gcry_sha1_transform_amd64_avx_bmi2:
   movl state_h4(RSTATE), e;
   xorl ne, ne;
 
-  vmovdqa .Lbswap_shufb_ctl RIP, BSWAP_REG;
-  vpbroadcastd .LK1 RIP, K1;
-  vpbroadcastd .LK2 RIP, K2;
-  vpbroadcastd .LK3 RIP, K3;
-  vpbroadcastd .LK4 RIP, K4;
+  vmovdqa .Lbswap_shufb_ctl rRIP, BSWAP_REG;
+  vpbroadcastd .LK1 rRIP, K1;
+  vpbroadcastd .LK2 rRIP, K2;
+  vpbroadcastd .LK3 rRIP, K3;
+  vpbroadcastd .LK4 rRIP, K4;
 
   /* Precalc 0-15. */
   W_PRECALC_00_15_0(0, W0, Wtmp0);
@@ -424,16 +418,22 @@ _gcry_sha1_transform_amd64_avx_bmi2:
   movl e, state_h4(RSTATE);
 
   movq ROLDSTACK, %rsp;
+  CFI_REGISTER(ROLDSTACK, %rsp);
+  CFI_DEF_CFA_REGISTER(%rsp);
 
   popq %r12;
+  CFI_POP(%r12);
   popq %rbp;
+  CFI_POP(%rbp);
   popq %rbx;
+  CFI_POP(%rbx);
 
   /* stack already burned */
   xorl %eax, %eax;
 
 .Lret:
   ret;
+  CFI_ENDPROC();
 ELF(.size _gcry_sha1_transform_amd64_avx_bmi2,
     .-_gcry_sha1_transform_amd64_avx_bmi2;)
 
diff --git a/cipher/sha1-avx2-bmi2-amd64.S b/cipher/sha1-avx2-bmi2-amd64.S
index 2a2f21a56..938632305 100644
--- a/cipher/sha1-avx2-bmi2-amd64.S
+++ b/cipher/sha1-avx2-bmi2-amd64.S
@@ -34,18 +34,7 @@
      defined(HAVE_GCC_INLINE_ASM_BMI2) && defined(HAVE_GCC_INLINE_ASM_AVX) && \
      defined(HAVE_GCC_INLINE_ASM_AVX2) && defined(USE_SHA1)
 
-#ifdef __PIC__
-#  define RIP (%rip)
-#else
-#  define RIP
-#endif
-
-
-#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
-# define ELF(...) __VA_ARGS__
-#else
-# define ELF(...) /*_*/
-#endif
+#include "asm-common-amd64.h"
 
 
 /* Context structure */
@@ -228,6 +217,7 @@ _gcry_sha1_transform_amd64_avx2_bmi2:
    *	%rsi: data (64*nblks bytes)
    *	%rdx: nblks (multiple of 2, larger than 0)
    */
+  CFI_STARTPROC();
 
   vzeroupper;
 
@@ -235,10 +225,14 @@ _gcry_sha1_transform_amd64_avx2_bmi2:
   movq %rdi, RSTATE;
   movq %rsi, RDATA;
   pushq %rbx;
+  CFI_PUSH(%rbx);
   pushq %rbp;
+  CFI_PUSH(%rbp);
   pushq %r12;
+  CFI_PUSH(%r12);
 
   movq %rsp, ROLDSTACK;
+  CFI_DEF_CFA_REGISTER(ROLDSTACK);
 
   subq $(WK_STACK_WORDS*4), %rsp;
   andq $(~63), %rsp;
@@ -251,11 +245,11 @@ _gcry_sha1_transform_amd64_avx2_bmi2:
   movl state_h4(RSTATE), e;
   xorl ne, ne;
 
-  vbroadcasti128 .Lbswap_shufb_ctl RIP, BSWAP_REG;
-  vpbroadcastd .LK1 RIP, K1;
-  vpbroadcastd .LK2 RIP, K2;
-  vpbroadcastd .LK3 RIP, K3;
-  vpbroadcastd .LK4 RIP, K4;
+  vbroadcasti128 .Lbswap_shufb_ctl rRIP, BSWAP_REG;
+  vpbroadcastd .LK1 rRIP, K1;
+  vpbroadcastd .LK2 rRIP, K2;
+  vpbroadcastd .LK3 rRIP, K3;
+  vpbroadcastd .LK4 rRIP, K4;
 
   /* Precalc 0-31 for block 1 & 2. */
   W_PRECALC_00_15_0(0, W0, Wtmp0);
@@ -557,15 +551,21 @@ _gcry_sha1_transform_amd64_avx2_bmi2:
   movl e, state_h4(RSTATE);
 
   movq ROLDSTACK, %rsp;
+  CFI_REGISTER(ROLDSTACK, %rsp);
+  CFI_DEF_CFA_REGISTER(%rsp);
 
   popq %r12;
+  CFI_POP(%r12);
   popq %rbp;
+  CFI_POP(%rbp);
   popq %rbx;
+  CFI_POP(%rbx);
 
   /* stack already burned */
   xorl %eax, %eax;
 
   ret;
+  CFI_ENDPROC();
 ELF(.size _gcry_sha1_transform_amd64_avx2_bmi2,
     .-_gcry_sha1_transform_amd64_avx2_bmi2;)
 
diff --git a/cipher/sha1-ssse3-amd64.S b/cipher/sha1-ssse3-amd64.S
index fff140345..7e32b0f4b 100644
--- a/cipher/sha1-ssse3-amd64.S
+++ b/cipher/sha1-ssse3-amd64.S
@@ -33,18 +33,7 @@
      defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
     defined(HAVE_GCC_INLINE_ASM_SSSE3) && defined(USE_SHA1)
 
-#ifdef __PIC__
-#  define RIP (%rip)
-#else
-#  define RIP
-#endif
-
-
-#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
-# define ELF(...) __VA_ARGS__
-#else
-# define ELF(...) /*_*/
-#endif
+#include "asm-common-amd64.h"
 
 
 /* Context structure */
@@ -162,7 +151,7 @@
 	movdqa tmp0, W;
 
 #define W_PRECALC_00_15_2(i, W, tmp0) \
-	paddd (.LK_XMM + ((i)/20)*16) RIP, tmp0;
+	paddd (.LK_XMM + ((i)/20)*16) rRIP, tmp0;
 
 #define W_PRECALC_00_15_3(i, W, tmp0) \
 	movdqa tmp0, WK(i&~3);
@@ -193,7 +182,7 @@
 	pxor W, tmp0; \
 	pxor tmp1, tmp0; \
 	movdqa tmp0, W; \
-	paddd (.LK_XMM + ((i)/20)*16) RIP, tmp0; \
+	paddd (.LK_XMM + ((i)/20)*16) rRIP, tmp0; \
 	movdqa tmp0, WK((i)&~3);
 
 #define W_PRECALC_32_79_0(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \
@@ -213,7 +202,7 @@
 
 #define W_PRECALC_32_79_3(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \
 	movdqa tmp0, W; \
-	paddd (.LK_XMM + ((i)/20)*16) RIP, tmp0; \
+	paddd (.LK_XMM + ((i)/20)*16) rRIP, tmp0; \
 	movdqa tmp0, WK((i)&~3);
 
 #define CLEAR_REG(reg) pxor reg, reg;
@@ -235,6 +224,7 @@ _gcry_sha1_transform_amd64_ssse3:
    *	%rsi: data (64*nblks bytes)
    *	%rdx: nblks
    */
+  CFI_STARTPROC();
 
   xorl %eax, %eax;
   cmpq $0, %rdx;
@@ -244,9 +234,12 @@ _gcry_sha1_transform_amd64_ssse3:
   movq %rdi, RSTATE;
   movq %rsi, RDATA;
   pushq %rbx;
+  CFI_PUSH(%rbx);
   pushq %rbp;
+  CFI_PUSH(%rbp);
 
   movq %rsp, ROLDSTACK;
+  CFI_DEF_CFA_REGISTER(ROLDSTACK);
 
   subq $(16*4), %rsp;
   andq $(~31), %rsp;
@@ -258,7 +251,7 @@ _gcry_sha1_transform_amd64_ssse3:
   movl state_h3(RSTATE), d;
   movl state_h4(RSTATE), e;
 
-  movdqa .Lbswap_shufb_ctl RIP, BSWAP_REG;
+  movdqa .Lbswap_shufb_ctl rRIP, BSWAP_REG;
 
   /* Precalc 0-15. */
   W_PRECALC_00_15_0(0, W0, Wtmp0);
@@ -423,15 +416,20 @@ _gcry_sha1_transform_amd64_ssse3:
   movl e, state_h4(RSTATE);
 
   movq ROLDSTACK, %rsp;
+  CFI_REGISTER(ROLDSTACK, %rsp);
+  CFI_DEF_CFA_REGISTER(%rsp);
 
   popq %rbp;
+  CFI_POP(%rbp);
   popq %rbx;
+  CFI_POP(%rbx);
 
   /* stack already burned */
   xorl %eax, %eax;
 
 .Lret:
   ret;
+  CFI_ENDPROC();
 ELF(.size _gcry_sha1_transform_amd64_ssse3,
     .-_gcry_sha1_transform_amd64_ssse3;)
 
diff --git a/cipher/sha256-avx-amd64.S b/cipher/sha256-avx-amd64.S
index b8b01b15b..77143ff0e 100644
--- a/cipher/sha256-avx-amd64.S
+++ b/cipher/sha256-avx-amd64.S
@@ -59,17 +59,7 @@
     defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \
     defined(HAVE_GCC_INLINE_ASM_AVX) && defined(USE_SHA256)
 
-#ifdef __PIC__
-#  define ADD_RIP +rip
-#else
-#  define ADD_RIP
-#endif
-
-#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
-# define ELF(...) __VA_ARGS__
-#else
-# define ELF(...) /*_*/
-#endif
+#include "asm-common-amd64.h"
 
 .intel_syntax noprefix
 
@@ -380,15 +370,22 @@ rotate_Xs
 ELF(.type  _gcry_sha256_transform_amd64_avx,@function;)
 .align 16
 _gcry_sha256_transform_amd64_avx:
+	CFI_STARTPROC()
 	vzeroupper
 
 	push	rbx
+	CFI_PUSH(rbx)
 	push	rbp
+	CFI_PUSH(rbp)
 	push	r13
+	CFI_PUSH(r13)
 	push	r14
+	CFI_PUSH(r14)
 	push	r15
+	CFI_PUSH(r15)
 
 	sub	rsp, STACK_SIZE
+	CFI_ADJUST_CFA_OFFSET(STACK_SIZE);
 
 	shl	NUM_BLKS, 6	/* convert to bytes */
 	jz	.Ldone_hash
@@ -487,14 +484,21 @@ _gcry_sha256_transform_amd64_avx:
 	xor     eax, eax
 
 	add	rsp, STACK_SIZE
+	CFI_ADJUST_CFA_OFFSET(-STACK_SIZE);
 
 	pop	r15
+	CFI_POP(r15)
 	pop	r14
+	CFI_POP(r14)
 	pop	r13
+	CFI_POP(r13)
 	pop	rbp
+	CFI_POP(rbp)
 	pop	rbx
+	CFI_POP(rbx)
 
 	ret
+	CFI_ENDPROC()
 
 
 .align 16
diff --git a/cipher/sha256-avx2-bmi2-amd64.S b/cipher/sha256-avx2-bmi2-amd64.S
index 5fc402cd1..52be1a07b 100644
--- a/cipher/sha256-avx2-bmi2-amd64.S
+++ b/cipher/sha256-avx2-bmi2-amd64.S
@@ -60,17 +60,7 @@
     defined(HAVE_GCC_INLINE_ASM_AVX2) && defined(HAVE_GCC_INLINE_ASM_BMI2) && \
     defined(USE_SHA256)
 
-#ifdef __PIC__
-#  define ADD_RIP +rip
-#else
-#  define ADD_RIP
-#endif
-
-#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
-# define ELF(...) __VA_ARGS__
-#else
-# define ELF(...) /*_*/
-#endif
+#include "asm-common-amd64.h"
 
 .intel_syntax noprefix
 
@@ -314,17 +304,24 @@ a = TMP_
 ELF(.type _gcry_sha256_transform_amd64_avx2,@function)
 .align 32
 _gcry_sha256_transform_amd64_avx2:
+	CFI_STARTPROC()
 	xor eax, eax
 
 	cmp rdx, 0
 	je .Lnowork
 
 	push	rbx
+	CFI_PUSH(rbx)
 	push	rbp
+	CFI_PUSH(rbp)
 	push	r12
+	CFI_PUSH(r12)
 	push	r13
+	CFI_PUSH(r13)
 	push	r14
+	CFI_PUSH(r14)
 	push	r15
+	CFI_PUSH(r15)
 
 	vzeroupper
 
@@ -333,9 +330,11 @@ _gcry_sha256_transform_amd64_avx2:
 	vmovdqa	SHUF_DC00, [.L_SHUF_DC00 ADD_RIP]
 
 	mov	rax, rsp
+	CFI_DEF_CFA_REGISTER(rax);
 	sub	rsp, STACK_SIZE
 	and	rsp, ~63
 	mov	[rsp + _RSP], rax
+	CFI_CFA_ON_STACK(_RSP, 6 * 8)
 
 	shl	NUM_BLKS, 6	/*  convert to bytes */
 	lea	NUM_BLKS, [NUM_BLKS + INP - 64] /*  pointer to last block */
@@ -507,16 +506,24 @@ _gcry_sha256_transform_amd64_avx2:
 	xor     eax, eax
 
 	mov	rsp, [rsp + _RSP]
+	CFI_DEF_CFA_REGISTER(rsp)
 
 	pop	r15
+	CFI_POP(r15)
 	pop	r14
+	CFI_POP(r14)
 	pop	r13
+	CFI_POP(r13)
 	pop	r12
+	CFI_POP(r12)
 	pop	rbp
+	CFI_POP(rbp)
 	pop	rbx
+	CFI_POP(rbx)
 
 .Lnowork:
 	ret
+	CFI_ENDPROC()
 
 .align 64
 .LK256:
diff --git a/cipher/sha256-ssse3-amd64.S b/cipher/sha256-ssse3-amd64.S
index ca5c9fd1d..0fb94c1b3 100644
--- a/cipher/sha256-ssse3-amd64.S
+++ b/cipher/sha256-ssse3-amd64.S
@@ -60,17 +60,7 @@
     defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \
     defined(HAVE_GCC_INLINE_ASM_SSSE3) && defined(USE_SHA256)
 
-#ifdef __PIC__
-#  define ADD_RIP +rip
-#else
-#  define ADD_RIP
-#endif
-
-#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
-# define ELF(...) __VA_ARGS__
-#else
-# define ELF(...) /*_*/
-#endif
+#include "asm-common-amd64.h"
 
 .intel_syntax noprefix
 
@@ -386,13 +376,20 @@ rotate_Xs
 ELF(.type  _gcry_sha256_transform_amd64_ssse3,@function;)
 .align 16
 _gcry_sha256_transform_amd64_ssse3:
+	CFI_STARTPROC()
 	push	rbx
+	CFI_PUSH(rbx)
 	push	rbp
+	CFI_PUSH(rbp)
 	push	r13
+	CFI_PUSH(r13)
 	push	r14
+	CFI_PUSH(r14)
 	push	r15
+	CFI_PUSH(r15)
 
 	sub	rsp, STACK_SIZE
+	CFI_ADJUST_CFA_OFFSET(STACK_SIZE);
 
 	shl	NUM_BLKS, 6	/* convert to bytes */
 	jz	.Ldone_hash
@@ -508,14 +505,21 @@ _gcry_sha256_transform_amd64_ssse3:
 	xor     eax, eax
 
 	add	rsp, STACK_SIZE
+	CFI_ADJUST_CFA_OFFSET(-STACK_SIZE);
 
 	pop	r15
+	CFI_POP(r15)
 	pop	r14
+	CFI_POP(r14)
 	pop	r13
+	CFI_POP(r13)
 	pop	rbp
+	CFI_POP(rbp)
 	pop	rbx
+	CFI_POP(rbx)
 
 	ret
+	CFI_ENDPROC()
 
 
 .align 16
diff --git a/cipher/sha512-avx-amd64.S b/cipher/sha512-avx-amd64.S
index 534351e44..991fd6395 100644
--- a/cipher/sha512-avx-amd64.S
+++ b/cipher/sha512-avx-amd64.S
@@ -46,17 +46,7 @@
     defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \
     defined(HAVE_GCC_INLINE_ASM_AVX) && defined(USE_SHA512)
 
-#ifdef __PIC__
-#  define ADD_RIP +rip
-#else
-#  define ADD_RIP
-#endif
-
-#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
-# define ELF(...) __VA_ARGS__
-#else
-# define ELF(...) /*_*/
-#endif
+#include "asm-common-amd64.h"
 
 .intel_syntax noprefix
 
@@ -269,6 +259,7 @@ frame_size = ((frame_GPRSAVE) + (frame_GPRSAVE_size))
 ELF(.type _gcry_sha512_transform_amd64_avx,@function;)
 .align 16
 _gcry_sha512_transform_amd64_avx:
+	CFI_STARTPROC()
 	xor eax, eax
 
 	cmp	msglen, 0
@@ -278,6 +269,7 @@ _gcry_sha512_transform_amd64_avx:
 
 	/* Allocate Stack Space */
 	sub	rsp, frame_size
+	CFI_ADJUST_CFA_OFFSET(frame_size);
 
 	/* Save GPRs */
 	mov	[rsp + frame_GPRSAVE + 8 * 0], rbx
@@ -285,6 +277,11 @@ _gcry_sha512_transform_amd64_avx:
 	mov	[rsp + frame_GPRSAVE + 8 * 2], r13
 	mov	[rsp + frame_GPRSAVE + 8 * 3], r14
 	mov	[rsp + frame_GPRSAVE + 8 * 4], r15
+	CFI_REL_OFFSET(rbx, frame_GPRSAVE + 8 * 0);
+	CFI_REL_OFFSET(r12, frame_GPRSAVE + 8 * 1);
+	CFI_REL_OFFSET(r13, frame_GPRSAVE + 8 * 2);
+	CFI_REL_OFFSET(r14, frame_GPRSAVE + 8 * 3);
+	CFI_REL_OFFSET(r15, frame_GPRSAVE + 8 * 4);
 
 .Lupdateblock:
 
@@ -351,6 +348,11 @@ _gcry_sha512_transform_amd64_avx:
 	mov	r13, [rsp + frame_GPRSAVE + 8 * 2]
 	mov	r14, [rsp + frame_GPRSAVE + 8 * 3]
 	mov	r15, [rsp + frame_GPRSAVE + 8 * 4]
+	CFI_RESTORE(rbx)
+	CFI_RESTORE(r12)
+	CFI_RESTORE(r13)
+	CFI_RESTORE(r14)
+	CFI_RESTORE(r15)
 
 	vzeroall
 
@@ -365,9 +367,11 @@ _gcry_sha512_transform_amd64_avx:
 
 	/* Restore Stack Pointer */
 	add	rsp, frame_size
+	CFI_ADJUST_CFA_OFFSET(-frame_size);
 
 .Lnowork:
 	ret
+	CFI_ENDPROC()
 
 /*
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
diff --git a/cipher/sha512-avx2-bmi2-amd64.S b/cipher/sha512-avx2-bmi2-amd64.S
index 32cfceb0b..3b28ab6c6 100644
--- a/cipher/sha512-avx2-bmi2-amd64.S
+++ b/cipher/sha512-avx2-bmi2-amd64.S
@@ -49,17 +49,7 @@
     defined(HAVE_GCC_INLINE_ASM_AVX2) && defined(HAVE_GCC_INLINE_ASM_BMI2) && \
     defined(USE_SHA512)
 
-#ifdef __PIC__
-#  define ADD_RIP +rip
-#else
-#  define ADD_RIP
-#endif
-
-#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
-# define ELF(...) __VA_ARGS__
-#else
-# define ELF(...) /*_*/
-#endif
+#include "asm-common-amd64.h"
 
 .intel_syntax noprefix
 
@@ -352,6 +342,7 @@ y4 =    r12
 ELF(.type _gcry_sha512_transform_amd64_avx2,@function;)
 .align 16
 _gcry_sha512_transform_amd64_avx2:
+	CFI_STARTPROC()
 	xor eax, eax
 
 	cmp rdx, 0
@@ -361,9 +352,11 @@ _gcry_sha512_transform_amd64_avx2:
 
 	/* Allocate Stack Space */
 	mov	rax, rsp
+	CFI_DEF_CFA_REGISTER(rax);
 	sub	rsp, frame_size
 	and	rsp, ~(0x40 - 1)
 	mov	[rsp + frame_RSPSAVE], rax
+	CFI_CFA_ON_STACK(frame_RSPSAVE, 0)
 
 	/* Save GPRs */
 	mov	[rsp + frame_GPRSAVE + 8 * 0], rbp
@@ -372,6 +365,12 @@ _gcry_sha512_transform_amd64_avx2:
 	mov	[rsp + frame_GPRSAVE + 8 * 3], r13
 	mov	[rsp + frame_GPRSAVE + 8 * 4], r14
 	mov	[rsp + frame_GPRSAVE + 8 * 5], r15
+	CFI_REG_ON_STACK(rbp, frame_GPRSAVE + 8 * 0)
+	CFI_REG_ON_STACK(rbx, frame_GPRSAVE + 8 * 1)
+	CFI_REG_ON_STACK(r12, frame_GPRSAVE + 8 * 2)
+	CFI_REG_ON_STACK(r13, frame_GPRSAVE + 8 * 3)
+	CFI_REG_ON_STACK(r14, frame_GPRSAVE + 8 * 4)
+	CFI_REG_ON_STACK(r15, frame_GPRSAVE + 8 * 5)
 
 	mov	[rsp + frame_NBLKS], NUM_BLKS
 
@@ -494,11 +493,20 @@ _gcry_sha512_transform_amd64_avx2:
 	mov	r13, [rsp + frame_GPRSAVE + 8 * 3]
 	mov	r14, [rsp + frame_GPRSAVE + 8 * 4]
 	mov	r15, [rsp + frame_GPRSAVE + 8 * 5]
+	CFI_RESTORE(rbp)
+	CFI_RESTORE(rbx)
+	CFI_RESTORE(r12)
+	CFI_RESTORE(r13)
+	CFI_RESTORE(r14)
+	CFI_RESTORE(r15)
 
 	/* Restore Stack Pointer */
 	mov	rsp, [rsp + frame_RSPSAVE]
+	CFI_DEF_CFA_REGISTER(rsp)
+
 .Lnowork:
 	ret
+	CFI_ENDPROC()
 
 /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */
 /*;; Binary Data */
diff --git a/cipher/sha512-ssse3-amd64.S b/cipher/sha512-ssse3-amd64.S
index 8e950e0e4..39bfe3625 100644
--- a/cipher/sha512-ssse3-amd64.S
+++ b/cipher/sha512-ssse3-amd64.S
@@ -49,17 +49,7 @@
     defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \
     defined(HAVE_GCC_INLINE_ASM_SSSE3) && defined(USE_SHA512)
 
-#ifdef __PIC__
-#  define ADD_RIP +rip
-#else
-#  define ADD_RIP
-#endif
-
-#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
-# define ELF(...) __VA_ARGS__
-#else
-# define ELF(...) /*_*/
-#endif
+#include "asm-common-amd64.h"
 
 .intel_syntax noprefix
 
@@ -271,6 +261,7 @@ frame_size = ((frame_GPRSAVE) + (frame_GPRSAVE_size))
 ELF(.type _gcry_sha512_transform_amd64_ssse3,@function;)
 .align 16
 _gcry_sha512_transform_amd64_ssse3:
+	CFI_STARTPROC()
 	xor eax, eax
 
 	cmp msglen, 0
@@ -278,6 +269,7 @@ _gcry_sha512_transform_amd64_ssse3:
 
 	/* Allocate Stack Space */
 	sub	rsp, frame_size
+	CFI_ADJUST_CFA_OFFSET(frame_size);
 
 	/* Save GPRs */
 	mov	[rsp + frame_GPRSAVE + 8 * 0], rbx
@@ -285,6 +277,11 @@ _gcry_sha512_transform_amd64_ssse3:
 	mov	[rsp + frame_GPRSAVE + 8 * 2], r13
 	mov	[rsp + frame_GPRSAVE + 8 * 3], r14
 	mov	[rsp + frame_GPRSAVE + 8 * 4], r15
+	CFI_REL_OFFSET(rbx, frame_GPRSAVE + 8 * 0);
+	CFI_REL_OFFSET(r12, frame_GPRSAVE + 8 * 1);
+	CFI_REL_OFFSET(r13, frame_GPRSAVE + 8 * 2);
+	CFI_REL_OFFSET(r14, frame_GPRSAVE + 8 * 3);
+	CFI_REL_OFFSET(r15, frame_GPRSAVE + 8 * 4);
 
 .Lupdateblock:
 
@@ -351,6 +348,11 @@ _gcry_sha512_transform_amd64_ssse3:
 	mov	r13, [rsp + frame_GPRSAVE + 8 * 2]
 	mov	r14, [rsp + frame_GPRSAVE + 8 * 3]
 	mov	r15, [rsp + frame_GPRSAVE + 8 * 4]
+	CFI_RESTORE(rbx)
+	CFI_RESTORE(r12)
+	CFI_RESTORE(r13)
+	CFI_RESTORE(r14)
+	CFI_RESTORE(r15)
 
 	pxor	xmm0, xmm0
 	pxor	xmm1, xmm1
@@ -370,9 +372,11 @@ _gcry_sha512_transform_amd64_ssse3:
 
 	/* Restore Stack Pointer */
 	add	rsp, frame_size
+	CFI_ADJUST_CFA_OFFSET(-frame_size);
 
 .Lnowork:
 	ret
+	CFI_ENDPROC()
 
 /*
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
diff --git a/cipher/twofish-amd64.S b/cipher/twofish-amd64.S
index 134d6401e..3cb734317 100644
--- a/cipher/twofish-amd64.S
+++ b/cipher/twofish-amd64.S
@@ -171,12 +171,16 @@ _gcry_twofish_amd64_encrypt_block:
 	 *	%rsi: dst
 	 *	%rdx: src
 	 */
+	CFI_STARTPROC();
 	ENTER_SYSV_FUNC_PARAMS_0_4
 
 	subq $(3 * 8), %rsp;
+	CFI_ADJUST_CFA_OFFSET(3 * 8);
 	movq %rsi, (0 * 8)(%rsp);
 	movq %rbp, (1 * 8)(%rsp);
 	movq %rbx, (2 * 8)(%rsp);
+	CFI_REL_OFFSET(%rbp, 1 * 8);
+	CFI_REL_OFFSET(%rbx, 2 * 8);
 
 	movq %rdx, RX;
 	inpack(RX, 0, RAd, 0);
@@ -201,10 +205,14 @@ _gcry_twofish_amd64_encrypt_block:
 
 	movq (2 * 8)(%rsp), %rbx;
 	movq (1 * 8)(%rsp), %rbp;
+	CFI_RESTORE(%rbx);
+	CFI_RESTORE(%rbp);
 	addq $(3 * 8), %rsp;
+	CFI_ADJUST_CFA_OFFSET(-3 * 8);
 
 	EXIT_SYSV_FUNC
 	ret;
+	CFI_ENDPROC();
 ELF(.size _gcry_twofish_amd64_encrypt_block,.-_gcry_twofish_amd64_encrypt_block;)
 
 .align 8
@@ -217,12 +225,16 @@ _gcry_twofish_amd64_decrypt_block:
 	 *	%rsi: dst
 	 *	%rdx: src
 	 */
+	CFI_STARTPROC();
 	ENTER_SYSV_FUNC_PARAMS_0_4
 
 	subq $(3 * 8), %rsp;
+	CFI_ADJUST_CFA_OFFSET(3 * 8);
 	movq %rsi, (0 * 8)(%rsp);
 	movq %rbp, (1 * 8)(%rsp);
 	movq %rbx, (2 * 8)(%rsp);
+	CFI_REL_OFFSET(%rbp, 1 * 8);
+	CFI_REL_OFFSET(%rbx, 2 * 8);
 
 	movq %rdx, RX;
 	inpack(RX, 0, RCd, 4);
@@ -247,10 +259,14 @@ _gcry_twofish_amd64_decrypt_block:
 
 	movq (2 * 8)(%rsp), %rbx;
 	movq (1 * 8)(%rsp), %rbp;
+	CFI_RESTORE(%rbx);
+	CFI_RESTORE(%rbp);
 	addq $(3 * 8), %rsp;
+	CFI_ADJUST_CFA_OFFSET(-3 * 8);
 
 	EXIT_SYSV_FUNC
 	ret;
+	CFI_ENDPROC();
 ELF(.size _gcry_twofish_amd64_encrypt_block,.-_gcry_twofish_amd64_encrypt_block;)
 
 #undef CTX
@@ -480,6 +496,8 @@ __twofish_enc_blk3:
 	 * output:
 	 *	RCD0,RAB0,RCD1,RAB1,RCD2,RAB2: three ciphertext blocks
 	 */
+	CFI_STARTPROC();
+
 	inpack_enc3();
 
 	encrypt_cycle3(RAB, RCD, 0);
@@ -494,6 +512,7 @@ __twofish_enc_blk3:
 	outunpack_enc3();
 
 	ret;
+	CFI_ENDPROC();
 ELF(.size __twofish_enc_blk3,.-__twofish_enc_blk3;)
 
 .align 8
@@ -506,6 +525,8 @@ __twofish_dec_blk3:
 	 * output:
 	 *	RCD0,RAB0,RCD1,RAB1,RCD2,RAB2: three plaintext blocks
 	 */
+	CFI_STARTPROC();
+
 	inpack_dec3();
 
 	decrypt_cycle3(RAB, RCD, 7);
@@ -520,6 +541,7 @@ __twofish_dec_blk3:
 	outunpack_dec3();
 
 	ret;
+	CFI_ENDPROC();
 ELF(.size __twofish_dec_blk3,.-__twofish_dec_blk3;)
 
 .align 8
@@ -532,15 +554,23 @@ _gcry_twofish_amd64_ctr_enc:
 	 *	%rdx: src (3 blocks)
 	 *	%rcx: iv (big endian, 128bit)
 	 */
+	CFI_STARTPROC();
 	ENTER_SYSV_FUNC_PARAMS_0_4
 
 	subq $(8 * 8), %rsp;
+	CFI_ADJUST_CFA_OFFSET(8 * 8);
 	movq %rbp, (0 * 8)(%rsp);
 	movq %rbx, (1 * 8)(%rsp);
 	movq %r12, (2 * 8)(%rsp);
 	movq %r13, (3 * 8)(%rsp);
 	movq %r14, (4 * 8)(%rsp);
 	movq %r15, (5 * 8)(%rsp);
+	CFI_REL_OFFSET(%rbp, 0 * 8);
+	CFI_REL_OFFSET(%rbx, 1 * 8);
+	CFI_REL_OFFSET(%r12, 2 * 8);
+	CFI_REL_OFFSET(%r13, 3 * 8);
+	CFI_REL_OFFSET(%r14, 4 * 8);
+	CFI_REL_OFFSET(%r15, 5 * 8);
 
 	movq %rsi, (6 * 8)(%rsp);
 	movq %rdx, (7 * 8)(%rsp);
@@ -601,10 +631,18 @@ _gcry_twofish_amd64_ctr_enc:
 	movq (3 * 8)(%rsp), %r13;
 	movq (4 * 8)(%rsp), %r14;
 	movq (5 * 8)(%rsp), %r15;
+	CFI_RESTORE(%rbp);
+	CFI_RESTORE(%rbx);
+	CFI_RESTORE(%r12);
+	CFI_RESTORE(%r13);
+	CFI_RESTORE(%r14);
+	CFI_RESTORE(%r15);
 	addq $(8 * 8), %rsp;
+	CFI_ADJUST_CFA_OFFSET(-8 * 8);
 
 	EXIT_SYSV_FUNC
 	ret;
+	CFI_ENDPROC();
 ELF(.size _gcry_twofish_amd64_ctr_enc,.-_gcry_twofish_amd64_ctr_enc;)
 
 .align 8
@@ -617,15 +655,23 @@ _gcry_twofish_amd64_cbc_dec:
 	 *	%rdx: src (3 blocks)
 	 *	%rcx: iv (128bit)
 	 */
+	CFI_STARTPROC();
 	ENTER_SYSV_FUNC_PARAMS_0_4
 
 	subq $(9 * 8), %rsp;
+	CFI_ADJUST_CFA_OFFSET(9 * 8);
 	movq %rbp, (0 * 8)(%rsp);
 	movq %rbx, (1 * 8)(%rsp);
 	movq %r12, (2 * 8)(%rsp);
 	movq %r13, (3 * 8)(%rsp);
 	movq %r14, (4 * 8)(%rsp);
 	movq %r15, (5 * 8)(%rsp);
+	CFI_REL_OFFSET(%rbp, 0 * 8);
+	CFI_REL_OFFSET(%rbx, 1 * 8);
+	CFI_REL_OFFSET(%r12, 2 * 8);
+	CFI_REL_OFFSET(%r13, 3 * 8);
+	CFI_REL_OFFSET(%r14, 4 * 8);
+	CFI_REL_OFFSET(%r15, 5 * 8);
 
 	movq %rsi, (6 * 8)(%rsp);
 	movq %rdx, (7 * 8)(%rsp);
@@ -670,10 +716,18 @@ _gcry_twofish_amd64_cbc_dec:
 	movq (3 * 8)(%rsp), %r13;
 	movq (4 * 8)(%rsp), %r14;
 	movq (5 * 8)(%rsp), %r15;
+	CFI_RESTORE(%rbp);
+	CFI_RESTORE(%rbx);
+	CFI_RESTORE(%r12);
+	CFI_RESTORE(%r13);
+	CFI_RESTORE(%r14);
+	CFI_RESTORE(%r15);
 	addq $(9 * 8), %rsp;
+	CFI_ADJUST_CFA_OFFSET(-9 * 8);
 
 	EXIT_SYSV_FUNC
 	ret;
+	CFI_ENDPROC();
 ELF(.size _gcry_twofish_amd64_cbc_dec,.-_gcry_twofish_amd64_cbc_dec;)
 
 .align 8
@@ -686,15 +740,23 @@ _gcry_twofish_amd64_cfb_dec:
 	 *	%rdx: src (3 blocks)
 	 *	%rcx: iv (128bit)
 	 */
+	CFI_STARTPROC();
 	ENTER_SYSV_FUNC_PARAMS_0_4
 
 	subq $(8 * 8), %rsp;
+	CFI_ADJUST_CFA_OFFSET(8 * 8);
 	movq %rbp, (0 * 8)(%rsp);
 	movq %rbx, (1 * 8)(%rsp);
 	movq %r12, (2 * 8)(%rsp);
 	movq %r13, (3 * 8)(%rsp);
 	movq %r14, (4 * 8)(%rsp);
 	movq %r15, (5 * 8)(%rsp);
+	CFI_REL_OFFSET(%rbp, 0 * 8);
+	CFI_REL_OFFSET(%rbx, 1 * 8);
+	CFI_REL_OFFSET(%r12, 2 * 8);
+	CFI_REL_OFFSET(%r13, 3 * 8);
+	CFI_REL_OFFSET(%r14, 4 * 8);
+	CFI_REL_OFFSET(%r15, 5 * 8);
 
 	movq %rsi, (6 * 8)(%rsp);
 	movq %rdx, (7 * 8)(%rsp);
@@ -739,10 +801,18 @@ _gcry_twofish_amd64_cfb_dec:
 	movq (3 * 8)(%rsp), %r13;
 	movq (4 * 8)(%rsp), %r14;
 	movq (5 * 8)(%rsp), %r15;
+	CFI_RESTORE(%rbp);
+	CFI_RESTORE(%rbx);
+	CFI_RESTORE(%r12);
+	CFI_RESTORE(%r13);
+	CFI_RESTORE(%r14);
+	CFI_RESTORE(%r15);
 	addq $(8 * 8), %rsp;
+	CFI_ADJUST_CFA_OFFSET(-8 * 8);
 
 	EXIT_SYSV_FUNC
 	ret;
+	CFI_ENDPROC();
 ELF(.size _gcry_twofish_amd64_cfb_dec,.-_gcry_twofish_amd64_cfb_dec;)
 
 .align 8
@@ -757,15 +827,23 @@ _gcry_twofish_amd64_ocb_enc:
 	 *	%r8 : checksum
 	 *	%r9 : L pointers (void *L[3])
 	 */
+	CFI_STARTPROC();
 	ENTER_SYSV_FUNC_PARAMS_6
 
 	subq $(8 * 8), %rsp;
+	CFI_ADJUST_CFA_OFFSET(8 * 8);
 	movq %rbp, (0 * 8)(%rsp);
 	movq %rbx, (1 * 8)(%rsp);
 	movq %r12, (2 * 8)(%rsp);
 	movq %r13, (3 * 8)(%rsp);
 	movq %r14, (4 * 8)(%rsp);
 	movq %r15, (5 * 8)(%rsp);
+	CFI_REL_OFFSET(%rbp, 0 * 8);
+	CFI_REL_OFFSET(%rbx, 1 * 8);
+	CFI_REL_OFFSET(%r12, 2 * 8);
+	CFI_REL_OFFSET(%r13, 3 * 8);
+	CFI_REL_OFFSET(%r14, 4 * 8);
+	CFI_REL_OFFSET(%r15, 5 * 8);
 
 	movq %rsi, (6 * 8)(%rsp);
 	movq %rdx, RX0;
@@ -849,10 +927,18 @@ _gcry_twofish_amd64_ocb_enc:
 	movq (3 * 8)(%rsp), %r13;
 	movq (4 * 8)(%rsp), %r14;
 	movq (5 * 8)(%rsp), %r15;
+	CFI_RESTORE(%rbp);
+	CFI_RESTORE(%rbx);
+	CFI_RESTORE(%r12);
+	CFI_RESTORE(%r13);
+	CFI_RESTORE(%r14);
+	CFI_RESTORE(%r15);
 	addq $(8 * 8), %rsp;
+	CFI_ADJUST_CFA_OFFSET(-8 * 8);
 
 	EXIT_SYSV_FUNC
 	ret;
+	CFI_ENDPROC();
 ELF(.size _gcry_twofish_amd64_ocb_enc,.-_gcry_twofish_amd64_ocb_enc;)
 
 .align 8
@@ -867,15 +953,23 @@ _gcry_twofish_amd64_ocb_dec:
 	 *	%r8 : checksum
 	 *	%r9 : L pointers (void *L[3])
 	 */
+	CFI_STARTPROC();
 	ENTER_SYSV_FUNC_PARAMS_6
 
 	subq $(8 * 8), %rsp;
+	CFI_ADJUST_CFA_OFFSET(8 * 8);
 	movq %rbp, (0 * 8)(%rsp);
 	movq %rbx, (1 * 8)(%rsp);
 	movq %r12, (2 * 8)(%rsp);
 	movq %r13, (3 * 8)(%rsp);
 	movq %r14, (4 * 8)(%rsp);
 	movq %r15, (5 * 8)(%rsp);
+	CFI_REL_OFFSET(%rbp, 0 * 8);
+	CFI_REL_OFFSET(%rbx, 1 * 8);
+	CFI_REL_OFFSET(%r12, 2 * 8);
+	CFI_REL_OFFSET(%r13, 3 * 8);
+	CFI_REL_OFFSET(%r14, 4 * 8);
+	CFI_REL_OFFSET(%r15, 5 * 8);
 
 	movq %rsi, (6 * 8)(%rsp);
 	movq %r8,  (7 * 8)(%rsp);
@@ -967,10 +1061,18 @@ _gcry_twofish_amd64_ocb_dec:
 	movq (3 * 8)(%rsp), %r13;
 	movq (4 * 8)(%rsp), %r14;
 	movq (5 * 8)(%rsp), %r15;
+	CFI_RESTORE(%rbp);
+	CFI_RESTORE(%rbx);
+	CFI_RESTORE(%r12);
+	CFI_RESTORE(%r13);
+	CFI_RESTORE(%r14);
+	CFI_RESTORE(%r15);
 	addq $(8 * 8), %rsp;
+	CFI_ADJUST_CFA_OFFSET(-8 * 8);
 
 	EXIT_SYSV_FUNC
 	ret;
+	CFI_ENDPROC();
 ELF(.size _gcry_twofish_amd64_ocb_dec,.-_gcry_twofish_amd64_ocb_dec;)
 
 .align 8
@@ -984,15 +1086,23 @@ _gcry_twofish_amd64_ocb_auth:
 	 *	%rcx: checksum
 	 *	%r8 : L pointers (void *L[3])
 	 */
+	CFI_STARTPROC();
 	ENTER_SYSV_FUNC_PARAMS_5
 
 	subq $(8 * 8), %rsp;
+	CFI_ADJUST_CFA_OFFSET(8 * 8);
 	movq %rbp, (0 * 8)(%rsp);
 	movq %rbx, (1 * 8)(%rsp);
 	movq %r12, (2 * 8)(%rsp);
 	movq %r13, (3 * 8)(%rsp);
 	movq %r14, (4 * 8)(%rsp);
 	movq %r15, (5 * 8)(%rsp);
+	CFI_REL_OFFSET(%rbp, 0 * 8);
+	CFI_REL_OFFSET(%rbx, 1 * 8);
+	CFI_REL_OFFSET(%r12, 2 * 8);
+	CFI_REL_OFFSET(%r13, 3 * 8);
+	CFI_REL_OFFSET(%r14, 4 * 8);
+	CFI_REL_OFFSET(%r15, 5 * 8);
 
 	movq %rcx, (6 * 8)(%rsp);
 	movq %rsi, RX0;
@@ -1056,10 +1166,18 @@ _gcry_twofish_amd64_ocb_auth:
 	movq (3 * 8)(%rsp), %r13;
 	movq (4 * 8)(%rsp), %r14;
 	movq (5 * 8)(%rsp), %r15;
+	CFI_RESTORE(%rbp);
+	CFI_RESTORE(%rbx);
+	CFI_RESTORE(%r12);
+	CFI_RESTORE(%r13);
+	CFI_RESTORE(%r14);
+	CFI_RESTORE(%r15);
 	addq $(8 * 8), %rsp;
+	CFI_ADJUST_CFA_OFFSET(-8 * 8);
 
 	EXIT_SYSV_FUNC
 	ret;
+	CFI_ENDPROC();
 ELF(.size _gcry_twofish_amd64_ocb_auth,.-_gcry_twofish_amd64_ocb_auth;)
 
 #endif /*USE_TWOFISH*/
diff --git a/cipher/twofish-avx2-amd64.S b/cipher/twofish-avx2-amd64.S
index db6e21826..74cad3558 100644
--- a/cipher/twofish-avx2-amd64.S
+++ b/cipher/twofish-avx2-amd64.S
@@ -24,17 +24,7 @@
     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && defined(USE_TWOFISH) && \
     defined(ENABLE_AVX2_SUPPORT)
 
-#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
-# define ELF(...) __VA_ARGS__
-#else
-# define ELF(...) /*_*/
-#endif
-
-#ifdef __PIC__
-#  define RIP (%rip)
-#else
-#  define RIP
-#endif
+#include "asm-common-amd64.h"
 
 .text
 
@@ -423,6 +413,7 @@ __twofish_enc_blk16:
 	 *	RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: sixteen parallel
 	 *						ciphertext blocks
 	 */
+	CFI_STARTPROC();
 	init_round_constants();
 
 	transpose4x4_16(RA, RB, RC, RD);
@@ -441,6 +432,7 @@ __twofish_enc_blk16:
 	transpose4x4_16(RA, RB, RC, RD);
 
 	ret;
+	CFI_ENDPROC();
 ELF(.size __twofish_enc_blk16,.-__twofish_enc_blk16;)
 
 .align 8
@@ -454,6 +446,7 @@ __twofish_dec_blk16:
 	 *	RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: sixteen parallel
 	 *						ciphertext blocks
 	 */
+	CFI_STARTPROC();
 	init_round_constants();
 
 	transpose4x4_16(RA, RB, RC, RD);
@@ -472,6 +465,7 @@ __twofish_dec_blk16:
 	transpose4x4_16(RA, RB, RC, RD);
 
 	ret;
+	CFI_ENDPROC();
 ELF(.size __twofish_dec_blk16,.-__twofish_dec_blk16;)
 
 #define inc_le128(x, minus_one, tmp) \
@@ -490,13 +484,14 @@ _gcry_twofish_avx2_ctr_enc:
 	 *	%rdx: src (16 blocks)
 	 *	%rcx: iv (big endian, 128bit)
 	 */
+	CFI_STARTPROC();
 
 	movq 8(%rcx), %rax;
 	bswapq %rax;
 
 	vzeroupper;
 
-	vbroadcasti128 .Lbswap128_mask RIP, RTMP3;
+	vbroadcasti128 .Lbswap128_mask rRIP, RTMP3;
 	vpcmpeqd RNOT, RNOT, RNOT;
 	vpsrldq $8, RNOT, RNOT;   /* ab: -1:0 ; cd: -1:0 */
 	vpaddq RNOT, RNOT, RTMP2; /* ab: -2:0 ; cd: -2:0 */
@@ -587,7 +582,8 @@ _gcry_twofish_avx2_ctr_enc:
 
 	vzeroall;
 
-	ret
+	ret;
+	CFI_ENDPROC();
 ELF(.size _gcry_twofish_avx2_ctr_enc,.-_gcry_twofish_avx2_ctr_enc;)
 
 .align 8
@@ -600,6 +596,7 @@ _gcry_twofish_avx2_cbc_dec:
 	 *	%rdx: src (16 blocks)
 	 *	%rcx: iv
 	 */
+	CFI_STARTPROC();
 
 	vzeroupper;
 
@@ -638,7 +635,8 @@ _gcry_twofish_avx2_cbc_dec:
 
 	vzeroall;
 
-	ret
+	ret;
+	CFI_ENDPROC();
 ELF(.size _gcry_twofish_avx2_cbc_dec,.-_gcry_twofish_avx2_cbc_dec;)
 
 .align 8
@@ -651,6 +649,7 @@ _gcry_twofish_avx2_cfb_dec:
 	 *	%rdx: src (16 blocks)
 	 *	%rcx: iv
 	 */
+	CFI_STARTPROC();
 
 	vzeroupper;
 
@@ -691,7 +690,8 @@ _gcry_twofish_avx2_cfb_dec:
 
 	vzeroall;
 
-	ret
+	ret;
+	CFI_ENDPROC();
 ELF(.size _gcry_twofish_avx2_cfb_dec,.-_gcry_twofish_avx2_cfb_dec;)
 
 .align 8
@@ -707,15 +707,21 @@ _gcry_twofish_avx2_ocb_enc:
 	 *	%r8 : checksum
 	 *	%r9 : L pointers (void *L[16])
 	 */
+	CFI_STARTPROC();
 
 	vzeroupper;
 
 	subq $(4 * 8), %rsp;
+	CFI_ADJUST_CFA_OFFSET(4 * 8);
 
 	movq %r10, (0 * 8)(%rsp);
 	movq %r11, (1 * 8)(%rsp);
 	movq %r12, (2 * 8)(%rsp);
 	movq %r13, (3 * 8)(%rsp);
+	CFI_REL_OFFSET(%r10, 0 * 8);
+	CFI_REL_OFFSET(%r11, 1 * 8);
+	CFI_REL_OFFSET(%r12, 2 * 8);
+	CFI_REL_OFFSET(%r13, 3 * 8);
 
 	vmovdqu (%rcx), RTMP0x;
 	vmovdqu (%r8), RTMP1x;
@@ -768,10 +774,15 @@ _gcry_twofish_avx2_ocb_enc:
 	movq (1 * 8)(%rsp), %r11;
 	movq (2 * 8)(%rsp), %r12;
 	movq (3 * 8)(%rsp), %r13;
+	CFI_RESTORE(%r10);
+	CFI_RESTORE(%r11);
+	CFI_RESTORE(%r12);
+	CFI_RESTORE(%r13);
 
 	call __twofish_enc_blk16;
 
 	addq $(4 * 8), %rsp;
+	CFI_ADJUST_CFA_OFFSET(-4 * 8);
 
 	vpxor (0 * 32)(%rsi), RA0, RA0;
 	vpxor (1 * 32)(%rsi), RB0, RB0;
@@ -794,6 +805,7 @@ _gcry_twofish_avx2_ocb_enc:
 	vzeroall;
 
 	ret;
+	CFI_ENDPROC();
 ELF(.size _gcry_twofish_avx2_ocb_enc,.-_gcry_twofish_avx2_ocb_enc;)
 
 .align 8
@@ -809,15 +821,21 @@ _gcry_twofish_avx2_ocb_dec:
 	 *	%r8 : checksum
 	 *	%r9 : L pointers (void *L[16])
 	 */
+	CFI_STARTPROC();
 
 	vzeroupper;
 
 	subq $(4 * 8), %rsp;
+	CFI_ADJUST_CFA_OFFSET(4 * 8);
 
 	movq %r10, (0 * 8)(%rsp);
 	movq %r11, (1 * 8)(%rsp);
 	movq %r12, (2 * 8)(%rsp);
 	movq %r13, (3 * 8)(%rsp);
+	CFI_REL_OFFSET(%r10, 0 * 8);
+	CFI_REL_OFFSET(%r11, 1 * 8);
+	CFI_REL_OFFSET(%r12, 2 * 8);
+	CFI_REL_OFFSET(%r13, 3 * 8);
 
 	vmovdqu (%rcx), RTMP0x;
 
@@ -865,6 +883,10 @@ _gcry_twofish_avx2_ocb_dec:
 	movq (1 * 8)(%rsp), %r11;
 	movq (2 * 8)(%rsp), %r12;
 	movq (3 * 8)(%rsp), %r13;
+	CFI_RESTORE(%r10);
+	CFI_RESTORE(%r11);
+	CFI_RESTORE(%r12);
+	CFI_RESTORE(%r13);
 
 	call __twofish_dec_blk16;
 
@@ -880,6 +902,7 @@ _gcry_twofish_avx2_ocb_dec:
 	vpxor (7 * 32)(%rsi), RD1, RD1;
 
 	addq $(4 * 8), %rsp;
+	CFI_ADJUST_CFA_OFFSET(-4 * 8);
 
 	/* Checksum_i = Checksum_{i-1} xor P_i  */
 
@@ -907,6 +930,7 @@ _gcry_twofish_avx2_ocb_dec:
 	vzeroall;
 
 	ret;
+	CFI_ENDPROC();
 ELF(.size _gcry_twofish_avx2_ocb_dec,.-_gcry_twofish_avx2_ocb_dec;)
 
 .align 8
@@ -921,15 +945,21 @@ _gcry_twofish_avx2_ocb_auth:
 	 *	%rcx: checksum
 	 *	%r8 : L pointers (void *L[16])
 	 */
+	CFI_STARTPROC();
 
 	vzeroupper;
 
 	subq $(4 * 8), %rsp;
+	CFI_ADJUST_CFA_OFFSET(4 * 8);
 
 	movq %r10, (0 * 8)(%rsp);
 	movq %r11, (1 * 8)(%rsp);
 	movq %r12, (2 * 8)(%rsp);
 	movq %r13, (3 * 8)(%rsp);
+	CFI_REL_OFFSET(%r10, 0 * 8);
+	CFI_REL_OFFSET(%r11, 1 * 8);
+	CFI_REL_OFFSET(%r12, 2 * 8);
+	CFI_REL_OFFSET(%r13, 3 * 8);
 
 	vmovdqu (%rdx), RTMP0x;
 
@@ -975,6 +1005,10 @@ _gcry_twofish_avx2_ocb_auth:
 	movq (1 * 8)(%rsp), %r11;
 	movq (2 * 8)(%rsp), %r12;
 	movq (3 * 8)(%rsp), %r13;
+	CFI_RESTORE(%r10);
+	CFI_RESTORE(%r11);
+	CFI_RESTORE(%r12);
+	CFI_RESTORE(%r13);
 
 	call __twofish_enc_blk16;
 
@@ -987,6 +1021,7 @@ _gcry_twofish_avx2_ocb_auth:
 	vpxor RA1, RC1, RA1;
 
 	addq $(4 * 8), %rsp;
+	CFI_ADJUST_CFA_OFFSET(-4 * 8);
 
 	vpxor RA1, RA0, RTMP1;
 
@@ -998,6 +1033,7 @@ _gcry_twofish_avx2_ocb_auth:
 	vzeroall;
 
 	ret;
+	CFI_ENDPROC();
 ELF(.size _gcry_twofish_avx2_ocb_auth,.-_gcry_twofish_avx2_ocb_auth;)
 
 .align 16
diff --git a/cipher/whirlpool-sse2-amd64.S b/cipher/whirlpool-sse2-amd64.S
index e98b831c0..5631dc567 100644
--- a/cipher/whirlpool-sse2-amd64.S
+++ b/cipher/whirlpool-sse2-amd64.S
@@ -23,17 +23,7 @@
 #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
      defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && defined(USE_WHIRLPOOL)
 
-#ifdef __PIC__
-#  define RIP %rip
-#else
-#  define RIP
-#endif
-
-#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
-# define ELF(...) __VA_ARGS__
-#else
-# define ELF(...) /*_*/
-#endif
+#include "asm-common-amd64.h"
 
 .text
 
@@ -173,16 +163,24 @@ _gcry_whirlpool_transform_amd64:
 	 *	%rdx: nblks
 	 *      %rcx: look-up tables
 	 */
+	CFI_STARTPROC();
 	cmp $0, %rdx;
 	je .Lskip;
 
 	subq $STACK_MAX, %rsp;
+	CFI_ADJUST_CFA_OFFSET(STACK_MAX);
 	movq %rbp, STACK_RBP(%rsp);
 	movq %rbx, STACK_RBX(%rsp);
 	movq %r12, STACK_R12(%rsp);
 	movq %r13, STACK_R13(%rsp);
 	movq %r14, STACK_R14(%rsp);
 	movq %r15, STACK_R15(%rsp);
+	CFI_REL_OFFSET(%rbp, STACK_RBP);
+	CFI_REL_OFFSET(%rbx, STACK_RBX);
+	CFI_REL_OFFSET(%r12, STACK_R12);
+	CFI_REL_OFFSET(%r13, STACK_R13);
+	CFI_REL_OFFSET(%r14, STACK_R14);
+	CFI_REL_OFFSET(%r15, STACK_R15);
 
 	movq %rdx, STACK_NBLKS(%rsp);
 	movq %rdi, STACK_STATEP(%rsp);
@@ -332,10 +330,18 @@ _gcry_whirlpool_transform_amd64:
 	movq STACK_R13(%rsp), %r13;
 	movq STACK_R14(%rsp), %r14;
 	movq STACK_R15(%rsp), %r15;
+	CFI_RESTORE(%rbp);
+	CFI_RESTORE(%rbx);
+	CFI_RESTORE(%r12);
+	CFI_RESTORE(%r13);
+	CFI_RESTORE(%r14);
+	CFI_RESTORE(%r15);
 	addq $STACK_MAX, %rsp;
+	CFI_ADJUST_CFA_OFFSET(-STACK_MAX);
 .Lskip:
 	movl $(STACK_MAX + 8), %eax;
 	ret;
+	CFI_ENDPROC();
 ELF(.size _gcry_whirlpool_transform_amd64,.-_gcry_whirlpool_transform_amd64;)
 
 #endif
diff --git a/configure.ac b/configure.ac
index b54b212b3..75949f942 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1171,6 +1171,32 @@ if test "$gcry_cv_gcc_aarch64_platform_as_ok" = "yes" ; then
 fi
 
 
+#
+# Check whether GCC assembler supports CFI directives.
+#
+AC_CACHE_CHECK([whether GCC assembler supports CFI directives],
+       [gcry_cv_gcc_asm_cfi_directives],
+       [gcry_cv_gcc_asm_cfi_directives=no
+        AC_COMPILE_IFELSE([AC_LANG_SOURCE(
+          [[__asm__(
+                ".cfi_startproc\n\t"
+                ".cfi_remember_state\n\t"
+                ".cfi_adjust_cfa_offset 8\n\t"
+                ".cfi_rel_offset 0, 8\n\t"
+                ".cfi_def_cfa_register 1\n\t"
+                ".cfi_register 2, 3\n\t"
+                ".cfi_restore 2\n\t"
+                ".cfi_escape 0x0f, 0x02, 0x11, 0x00\n\t"
+                ".cfi_restore_state\n\t"
+                ".cfi_endproc\n\t"
+            );]])],
+          [gcry_cv_gcc_asm_cfi_directives=yes])])
+if test "$gcry_cv_gcc_asm_cfi_directives" = "yes" ; then
+   AC_DEFINE(HAVE_GCC_ASM_CFI_DIRECTIVES,1,
+             [Defined if underlying assembler supports CFI directives])
+fi
+
+
 #
 # Check whether underscores in symbols are required.  This needs to be
 # done before setting up the assembler stuff.
@@ -1617,7 +1643,6 @@ if test "$gcry_cv_gcc_platform_as_ok_for_intel_syntax" = "yes" ; then
             [Defined if underlying assembler is compatible with Intel syntax assembly implementations])
 fi
 
-
 #
 # Check whether compiler is configured for ARMv6 or newer architecture
 #
diff --git a/mpi/amd64/func_abi.h b/mpi/amd64/func_abi.h
index ce4467441..37d5722af 100644
--- a/mpi/amd64/func_abi.h
+++ b/mpi/amd64/func_abi.h
@@ -1,9 +1,36 @@
+#include <config.h>
+
+#ifdef HAVE_GCC_ASM_CFI_DIRECTIVES
+# define CFI_STARTPROC()            .cfi_startproc
+# define CFI_ENDPROC()              .cfi_endproc
+# define CFI_ADJUST_CFA_OFFSET(off) .cfi_adjust_cfa_offset off
+# define CFI_REL_OFFSET(reg,off)    .cfi_rel_offset reg, off
+# define CFI_RESTORE(reg)           .cfi_restore reg
+
+# define CFI_PUSH(reg) \
+	CFI_ADJUST_CFA_OFFSET(8); CFI_REL_OFFSET(reg, 0)
+# define CFI_POP(reg) \
+	CFI_ADJUST_CFA_OFFSET(-8); CFI_RESTORE(reg)
+#else
+# define CFI_STARTPROC()
+# define CFI_ENDPROC()
+# define CFI_ADJUST_CFA_OFFSET(off)
+# define CFI_REL_OFFSET(reg,off)
+# define CFI_RESTORE(reg)
+
+# define CFI_PUSH(reg)
+# define CFI_POP(reg)
+#endif
+
 #ifdef USE_MS_ABI
  /* Store registers and move four first input arguments from MS ABI to
   * SYSV ABI.  */
  #define FUNC_ENTRY() \
+	CFI_STARTPROC(); \
 	pushq %rsi; \
+	CFI_PUSH(%rsi); \
 	pushq %rdi; \
+	CFI_PUSH(%rdi); \
 	movq %rdx, %rsi; \
 	movq %rcx, %rdi; \
 	movq %r8, %rdx; \
@@ -12,8 +39,16 @@
  /* Restore registers.  */
  #define FUNC_EXIT() \
 	popq %rdi; \
-	popq %rsi;
+	CFI_POP(%rdi); \
+	popq %rsi; \
+	CFI_POP(%rsi); \
+	ret; \
+	CFI_ENDPROC();
 #else
- #define FUNC_ENTRY() /**/
- #define FUNC_EXIT() /**/
+ #define FUNC_ENTRY() \
+	CFI_STARTPROC();
+
+ #define FUNC_EXIT() \
+	ret; \
+	CFI_ENDPROC();
 #endif
diff --git a/mpi/amd64/mpih-add1.S b/mpi/amd64/mpih-add1.S
index 6a9026219..157e5f1e0 100644
--- a/mpi/amd64/mpih-add1.S
+++ b/mpi/amd64/mpih-add1.S
@@ -62,4 +62,3 @@ C_SYMBOL_NAME(_gcry_mpih_add_n:)
 	adcq	%rax, %rax
 	FUNC_EXIT()
 	ret
-	
\ No newline at end of file



