[PATCH 3/3] Enable assembler optimizations on earlier ARM cores

Dmitry Eremin-Solenikov dbaryshkov at gmail.com
Tue Oct 22 21:29:27 CEST 2013


* cipher/blowfish-armv6.S => cipher/blowfish-arm.S: adapt to pre-armv6 CPUs.
* cipher/blowfish.c: enable assembly on armv4/armv5 little-endian CPUs.
* cipher/camellia-armv6.S => cipher/camellia-arm.S: adapt to pre-armv6 CPUs.
* cipher/camellia.c, cipher/camellia-glue.c: enable assembly on armv4/armv5
  little-endian CPUs.
* cipher/cast5-armv6.S => cipher/cast5-arm.S: adapt to pre-armv6 CPUs.
* cipher/cast5.c: enable assembly on armv4/armv5 little-endian CPUs.
* cipher/rijndael-armv6.S => cipher/rijndael-arm.S: adapt to pre-armv6 CPUs.
* cipher/rijndael.c: enable assembly on armv4/armv5 little-endian CPUs.
* cipher/twofish-armv6.S => cipher/twofish-arm.S: adapt to pre-armv6 CPUs.
* cipher/twofish.c: enable assembly on armv4/armv5 little-endian CPUs.

--
Our ARMv6-optimized assembly code can easily be adapted to earlier CPUs.
The only incompatible piece is the rev instruction used for byte swapping;
on pre-ARMv6 CPUs it is replaced with a sequence of 4 instructions (shown below).

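The pre-ARMv6 replacement is the classic shift-and-mask byte swap; for
reference, here is the host_to_be macro from the new blowfish-arm.S
(camellia-arm.S and the other converted files carry the same pattern).
It reverses the byte order of 'reg' using one scratch register; 65280
is 0xff00:

	#define host_to_be(reg, rtmp) \
		eor	rtmp, reg, reg, ror #16; \
		mov	rtmp, rtmp, lsr #8; \
		bic	rtmp, rtmp, #65280; \
		eor	reg, rtmp, reg, ror #8;

On ARMv6 and newer, HAVE_ARM_ARCH_V6 keeps this a single rev instruction,
so those cores are unaffected.
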
Compare (first table: before, generic C implementation; second table: after,
with the ARM assembly enabled):
                ECB/Stream         CBC             CFB             OFB             CTR
             --------------- --------------- --------------- --------------- ---------------
AES            620ms   610ms   650ms   680ms   620ms   630ms   660ms   660ms   630ms   630ms
CAMELLIA128    720ms   720ms   780ms   790ms   770ms   760ms   780ms   780ms   770ms   760ms
CAMELLIA256    910ms   910ms   970ms   970ms   960ms   950ms   970ms   970ms   960ms   950ms
CAST5          820ms   820ms   930ms   920ms   890ms   860ms   930ms   920ms   880ms   890ms
BLOWFISH       550ms   560ms   650ms   660ms   630ms   600ms   660ms   650ms   610ms   620ms

                ECB/Stream         CBC             CFB             OFB             CTR
             --------------- --------------- --------------- --------------- ---------------
AES            130ms   140ms   180ms   200ms   160ms   170ms   190ms   200ms   170ms   170ms
CAMELLIA128    150ms   160ms   210ms   220ms   200ms   190ms   210ms   220ms   190ms   190ms
CAMELLIA256    180ms   180ms   260ms   240ms   240ms   230ms   250ms   250ms   230ms   230ms
CAST5          170ms   160ms   270ms   120ms   240ms   130ms   260ms   270ms   130ms   120ms
BLOWFISH       160ms   150ms   260ms   110ms   230ms   120ms   250ms   260ms   110ms   120ms

Signed-off-by: Dmitry Eremin-Solenikov <dbaryshkov at gmail.com>
---
 cipher/Makefile.am      |   8 +-
 cipher/blowfish-arm.S   | 743 +++++++++++++++++++++++++++++++++++++++++
 cipher/blowfish-armv6.S | 730 -----------------------------------------
 cipher/blowfish.c       |  44 +--
 cipher/camellia-arm.S   | 616 ++++++++++++++++++++++++++++++++++
 cipher/camellia-armv6.S | 604 ----------------------------------
 cipher/camellia-glue.c  |  14 +-
 cipher/camellia.c       |   8 +-
 cipher/camellia.h       |   2 +-
 cipher/cast5-arm.S      | 715 ++++++++++++++++++++++++++++++++++++++++
 cipher/cast5-armv6.S    | 702 ---------------------------------------
 cipher/cast5.c          |  44 +--
 cipher/rijndael-arm.S   | 853 ++++++++++++++++++++++++++++++++++++++++++++++++
 cipher/rijndael-armv6.S | 853 ------------------------------------------------
 cipher/rijndael.c       |  36 +-
 cipher/twofish-arm.S    | 365 +++++++++++++++++++++
 cipher/twofish-armv6.S  | 365 ---------------------
 cipher/twofish.c        |   8 +-
 configure.ac            |  10 +-
 19 files changed, 3379 insertions(+), 3341 deletions(-)
 create mode 100644 cipher/blowfish-arm.S
 delete mode 100644 cipher/blowfish-armv6.S
 create mode 100644 cipher/camellia-arm.S
 delete mode 100644 cipher/camellia-armv6.S
 create mode 100644 cipher/cast5-arm.S
 delete mode 100644 cipher/cast5-armv6.S
 create mode 100644 cipher/rijndael-arm.S
 delete mode 100644 cipher/rijndael-armv6.S
 create mode 100644 cipher/twofish-arm.S
 delete mode 100644 cipher/twofish-armv6.S

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 3d8149a..e3aed3b 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -56,7 +56,7 @@ rmd.h
 EXTRA_libcipher_la_SOURCES = \
 arcfour.c \
 blowfish.c blowfish-amd64.S \
-cast5.c cast5-amd64.S cast5-armv6.S \
+cast5.c cast5-amd64.S cast5-arm.S \
 crc.c \
 des.c \
 dsa.c \
@@ -67,7 +67,7 @@ gost28147.c gost.h \
 gostr3411-94.c \
 md4.c \
 md5.c \
-rijndael.c rijndael-tables.h rijndael-amd64.S rijndael-armv6.S \
+rijndael.c rijndael-tables.h rijndael-amd64.S rijndael-arm.S \
 rmd160.c \
 rsa.c \
 salsa20.c \
@@ -80,10 +80,10 @@ sha512.c sha512-armv7-neon.S \
 stribog.c \
 tiger.c \
 whirlpool.c \
-twofish.c twofish-amd64.S twofish-armv6.S \
+twofish.c twofish-amd64.S twofish-arm.S \
 rfc2268.c \
 camellia.c camellia.h camellia-glue.c camellia-aesni-avx-amd64.S \
-  camellia-aesni-avx2-amd64.S camellia-armv6.S
+  camellia-aesni-avx2-amd64.S camellia-arm.S
 
 if ENABLE_O_FLAG_MUNGING
 o_flag_munging = sed -e 's/-O\([2-9s][2-9s]*\)/-O1/' -e 's/-Ofast/-O1/g'
diff --git a/cipher/blowfish-arm.S b/cipher/blowfish-arm.S
new file mode 100644
index 0000000..501b085
--- /dev/null
+++ b/cipher/blowfish-arm.S
@@ -0,0 +1,743 @@
+/* blowfish-arm.S  -  ARM assembly implementation of Blowfish cipher
+ *
+ * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#if defined(__ARMEL__)
+#ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS
+
+.text
+
+.syntax unified
+.arm
+
+/* structure of crypto context */
+#define s0	0
+#define s1	(s0 + (1 * 256) * 4)
+#define s2	(s0 + (2 * 256) * 4)
+#define s3	(s0 + (3 * 256) * 4)
+#define p	(s3 + (1 * 256) * 4)
+
+/* register macros */
+#define CTXs0 %r0
+#define CTXs1 %r9
+#define CTXs2 %r8
+#define CTXs3 %r10
+#define RMASK %lr
+#define RKEYL %r2
+#define RKEYR %ip
+
+#define RL0 %r3
+#define RR0 %r4
+
+#define RL1 %r9
+#define RR1 %r10
+
+#define RT0 %r11
+#define RT1 %r7
+#define RT2 %r5
+#define RT3 %r6
+
+/* helper macros */
+#define ldr_unaligned_le(rout, rsrc, offs, rtmp) \
+	ldrb rout, [rsrc, #((offs) + 0)]; \
+	ldrb rtmp, [rsrc, #((offs) + 1)]; \
+	orr rout, rout, rtmp, lsl #8; \
+	ldrb rtmp, [rsrc, #((offs) + 2)]; \
+	orr rout, rout, rtmp, lsl #16; \
+	ldrb rtmp, [rsrc, #((offs) + 3)]; \
+	orr rout, rout, rtmp, lsl #24;
+
+#define str_unaligned_le(rin, rdst, offs, rtmp0, rtmp1) \
+	mov rtmp0, rin, lsr #8; \
+	strb rin, [rdst, #((offs) + 0)]; \
+	mov rtmp1, rin, lsr #16; \
+	strb rtmp0, [rdst, #((offs) + 1)]; \
+	mov rtmp0, rin, lsr #24; \
+	strb rtmp1, [rdst, #((offs) + 2)]; \
+	strb rtmp0, [rdst, #((offs) + 3)];
+
+#define ldr_unaligned_be(rout, rsrc, offs, rtmp) \
+	ldrb rout, [rsrc, #((offs) + 3)]; \
+	ldrb rtmp, [rsrc, #((offs) + 2)]; \
+	orr rout, rout, rtmp, lsl #8; \
+	ldrb rtmp, [rsrc, #((offs) + 1)]; \
+	orr rout, rout, rtmp, lsl #16; \
+	ldrb rtmp, [rsrc, #((offs) + 0)]; \
+	orr rout, rout, rtmp, lsl #24;
+
+#define str_unaligned_be(rin, rdst, offs, rtmp0, rtmp1) \
+	mov rtmp0, rin, lsr #8; \
+	strb rin, [rdst, #((offs) + 3)]; \
+	mov rtmp1, rin, lsr #16; \
+	strb rtmp0, [rdst, #((offs) + 2)]; \
+	mov rtmp0, rin, lsr #24; \
+	strb rtmp1, [rdst, #((offs) + 1)]; \
+	strb rtmp0, [rdst, #((offs) + 0)];
+
+#ifdef __ARMEL__
+	#define ldr_unaligned_host ldr_unaligned_le
+	#define str_unaligned_host str_unaligned_le
+
+	/* bswap on little-endian */
+#ifdef HAVE_ARM_ARCH_V6
+	#define host_to_be(reg, rtmp) \
+		rev reg, reg;
+	#define be_to_host(reg, rtmp) \
+		rev reg, reg;
+#else
+	#define host_to_be(reg, rtmp) \
+		eor	rtmp, reg, reg, ror #16; \
+		mov	rtmp, rtmp, lsr #8; \
+		bic	rtmp, rtmp, #65280; \
+		eor	reg, rtmp, reg, ror #8;
+	#define be_to_host(reg, rtmp) \
+		eor	rtmp, reg, reg, ror #16; \
+		mov	rtmp, rtmp, lsr #8; \
+		bic	rtmp, rtmp, #65280; \
+		eor	reg, rtmp, reg, ror #8;
+#endif
+#else
+	#define ldr_unaligned_host ldr_unaligned_be
+	#define str_unaligned_host str_unaligned_be
+
+	/* nop on big-endian */
+	#define host_to_be(reg, rtmp) /*_*/
+	#define be_to_host(reg, rtmp) /*_*/
+#endif
+
+#define host_to_host(x, y) /*_*/
+
+/***********************************************************************
+ * 1-way blowfish
+ ***********************************************************************/
+#define F(l, r) \
+	and RT0, RMASK, l, lsr#(24 - 2); \
+	and RT1, RMASK, l, lsr#(16 - 2); \
+	ldr RT0, [CTXs0, RT0]; \
+	and RT2, RMASK, l, lsr#(8 - 2); \
+	ldr RT1, [CTXs1, RT1]; \
+	and RT3, RMASK, l, lsl#2; \
+	ldr RT2, [CTXs2, RT2]; \
+	add RT0, RT1; \
+	ldr RT3, [CTXs3, RT3]; \
+	eor RT0, RT2; \
+	add RT0, RT3; \
+	eor r, RT0;
+
+#define load_roundkey_enc(n) \
+	ldr RKEYL, [CTXs2, #((p - s2) + (4 * (n) + 0))]; \
+	ldr RKEYR, [CTXs2, #((p - s2) + (4 * (n) + 4))];
+
+#define add_roundkey_enc() \
+	eor RL0, RKEYL; \
+	eor RR0, RKEYR;
+
+#define round_enc(n) \
+	add_roundkey_enc(); \
+	load_roundkey_enc(n); \
+	\
+	F(RL0, RR0); \
+	F(RR0, RL0);
+
+#define load_roundkey_dec(n) \
+	ldr RKEYL, [CTXs2, #((p - s2) + (4 * ((n) - 1) + 4))]; \
+	ldr RKEYR, [CTXs2, #((p - s2) + (4 * ((n) - 1) + 0))];
+
+#define add_roundkey_dec() \
+	eor RL0, RKEYL; \
+	eor RR0, RKEYR;
+
+#define round_dec(n) \
+	add_roundkey_dec(); \
+	load_roundkey_dec(n); \
+	\
+	F(RL0, RR0); \
+	F(RR0, RL0);
+
+#define read_block_aligned(rin, offs, l0, r0, convert, rtmp) \
+	ldr l0, [rin, #((offs) + 0)]; \
+	ldr r0, [rin, #((offs) + 4)]; \
+	convert(l0, rtmp); \
+	convert(r0, rtmp);
+
+#define write_block_aligned(rout, offs, l0, r0, convert, rtmp) \
+	convert(l0, rtmp); \
+	convert(r0, rtmp); \
+	str l0, [rout, #((offs) + 0)]; \
+	str r0, [rout, #((offs) + 4)];
+
+#ifdef __ARM_FEATURE_UNALIGNED
+	/* unaligned word reads allowed */
+	#define read_block(rin, offs, l0, r0, rtmp0) \
+		read_block_aligned(rin, offs, l0, r0, host_to_be, rtmp0)
+
+	#define write_block(rout, offs, r0, l0, rtmp0, rtmp1) \
+		write_block_aligned(rout, offs, r0, l0, be_to_host, rtmp0)
+
+	#define read_block_host(rin, offs, l0, r0, rtmp0) \
+		read_block_aligned(rin, offs, l0, r0, host_to_host, rtmp0)
+
+	#define write_block_host(rout, offs, r0, l0, rtmp0, rtmp1) \
+		write_block_aligned(rout, offs, r0, l0, host_to_host, rtmp0)
+#else
+	/* need to handle unaligned reads by byte reads */
+	#define read_block(rin, offs, l0, r0, rtmp0) \
+		tst rin, #3; \
+		beq 1f; \
+			ldr_unaligned_be(l0, rin, (offs) + 0, rtmp0); \
+			ldr_unaligned_be(r0, rin, (offs) + 4, rtmp0); \
+			b 2f; \
+		1:;\
+			read_block_aligned(rin, offs, l0, r0, host_to_be, rtmp0); \
+		2:;
+
+	#define write_block(rout, offs, l0, r0, rtmp0, rtmp1) \
+		tst rout, #3; \
+		beq 1f; \
+			str_unaligned_be(l0, rout, (offs) + 0, rtmp0, rtmp1); \
+			str_unaligned_be(r0, rout, (offs) + 4, rtmp0, rtmp1); \
+			b 2f; \
+		1:;\
+			write_block_aligned(rout, offs, l0, r0, be_to_host, rtmp0); \
+		2:;
+
+	#define read_block_host(rin, offs, l0, r0, rtmp0) \
+		tst rin, #3; \
+		beq 1f; \
+			ldr_unaligned_host(l0, rin, (offs) + 0, rtmp0); \
+			ldr_unaligned_host(r0, rin, (offs) + 4, rtmp0); \
+			b 2f; \
+		1:;\
+			read_block_aligned(rin, offs, l0, r0, host_to_host, rtmp0); \
+		2:;
+
+	#define write_block_host(rout, offs, l0, r0, rtmp0, rtmp1) \
+		tst rout, #3; \
+		beq 1f; \
+			str_unaligned_host(l0, rout, (offs) + 0, rtmp0, rtmp1); \
+			str_unaligned_host(r0, rout, (offs) + 4, rtmp0, rtmp1); \
+			b 2f; \
+		1:;\
+			write_block_aligned(rout, offs, l0, r0, host_to_host, rtmp0); \
+		2:;
+#endif
+
+.align 3
+.type  __blowfish_enc_blk1,%function;
+
+__blowfish_enc_blk1:
+	/* input:
+	 *	preloaded: CTX
+	 *	[RL0, RR0]: src
+	 * output:
+	 *	[RR0, RL0]: dst
+	 */
+	push {%lr};
+
+	add CTXs1, CTXs0, #(s1 - s0);
+	add CTXs2, CTXs0, #(s2 - s0);
+	mov RMASK, #(0xff << 2); /* byte mask */
+	add CTXs3, CTXs1, #(s3 - s1);
+
+	load_roundkey_enc(0);
+	round_enc(2);
+	round_enc(4);
+	round_enc(6);
+	round_enc(8);
+	round_enc(10);
+	round_enc(12);
+	round_enc(14);
+	round_enc(16);
+	add_roundkey_enc();
+
+	pop {%pc};
+.size __blowfish_enc_blk1,.-__blowfish_enc_blk1;
+
+.align 8
+.globl  _gcry_blowfish_arm_do_encrypt
+.type   _gcry_blowfish_arm_do_encrypt,%function;
+
+_gcry_blowfish_arm_do_encrypt:
+	/* input:
+	 *	%r0: ctx, CTX
+	 *	%r1: u32 *ret_xl
+	 *	%r2: u32 *ret_xr
+	 */
+	push {%r2, %r4-%r11, %ip, %lr};
+
+	ldr RL0, [%r1];
+	ldr RR0, [%r2];
+
+	bl __blowfish_enc_blk1;
+
+	pop {%r2};
+	str RR0, [%r1];
+	str RL0, [%r2];
+
+	pop {%r4-%r11, %ip, %pc};
+.size _gcry_blowfish_arm_do_encrypt,.-_gcry_blowfish_arm_do_encrypt;
+
+.align 3
+.global _gcry_blowfish_arm_encrypt_block
+.type   _gcry_blowfish_arm_encrypt_block,%function;
+
+_gcry_blowfish_arm_encrypt_block:
+	/* input:
+	 *	%r0: ctx, CTX
+	 *	%r1: dst
+	 *	%r2: src
+	 */
+	push {%r4-%r11, %ip, %lr};
+
+	read_block(%r2, 0, RL0, RR0, RT0);
+
+	bl __blowfish_enc_blk1;
+
+	write_block(%r1, 0, RR0, RL0, RT0, RT1);
+
+	pop {%r4-%r11, %ip, %pc};
+.size _gcry_blowfish_arm_encrypt_block,.-_gcry_blowfish_arm_encrypt_block;
+
+.align 3
+.global _gcry_blowfish_arm_decrypt_block
+.type   _gcry_blowfish_arm_decrypt_block,%function;
+
+_gcry_blowfish_arm_decrypt_block:
+	/* input:
+	 *	%r0: ctx, CTX
+	 *	%r1: dst
+	 *	%r2: src
+	 */
+	push {%r4-%r11, %ip, %lr};
+
+	add CTXs1, CTXs0, #(s1 - s0);
+	add CTXs2, CTXs0, #(s2 - s0);
+	mov RMASK, #(0xff << 2); /* byte mask */
+	add CTXs3, CTXs1, #(s3 - s1);
+
+	read_block(%r2, 0, RL0, RR0, RT0);
+
+	load_roundkey_dec(17);
+	round_dec(15);
+	round_dec(13);
+	round_dec(11);
+	round_dec(9);
+	round_dec(7);
+	round_dec(5);
+	round_dec(3);
+	round_dec(1);
+	add_roundkey_dec();
+
+	write_block(%r1, 0, RR0, RL0, RT0, RT1);
+
+	pop {%r4-%r11, %ip, %pc};
+.size _gcry_blowfish_arm_decrypt_block,.-_gcry_blowfish_arm_decrypt_block;
+
+/***********************************************************************
+ * 2-way blowfish
+ ***********************************************************************/
+#define F2(n, l0, r0, l1, r1, set_nextk, dec) \
+	\
+	and RT0, RMASK, l0, lsr#(24 - 2); \
+	and RT1, RMASK, l0, lsr#(16 - 2); \
+	and RT2, RMASK, l0, lsr#(8 - 2); \
+	add RT1, #(s1 - s0); \
+	\
+	ldr RT0, [CTXs0, RT0]; \
+	and RT3, RMASK, l0, lsl#2; \
+	ldr RT1, [CTXs0, RT1]; \
+	add RT3, #(s3 - s2); \
+	ldr RT2, [CTXs2, RT2]; \
+	add RT0, RT1; \
+	ldr RT3, [CTXs2, RT3]; \
+	\
+	and RT1, RMASK, l1, lsr#(24 - 2); \
+	eor RT0, RT2; \
+	and RT2, RMASK, l1, lsr#(16 - 2); \
+	add RT0, RT3; \
+	add RT2, #(s1 - s0); \
+	and RT3, RMASK, l1, lsr#(8 - 2); \
+	eor r0, RT0; \
+	\
+	ldr RT1, [CTXs0, RT1]; \
+	and RT0, RMASK, l1, lsl#2; \
+	ldr RT2, [CTXs0, RT2]; \
+	add RT0, #(s3 - s2); \
+	ldr RT3, [CTXs2, RT3]; \
+	add RT1, RT2; \
+	ldr RT0, [CTXs2, RT0]; \
+	\
+	and RT2, RMASK, r0, lsr#(24 - 2); \
+	eor RT1, RT3; \
+	and RT3, RMASK, r0, lsr#(16 - 2); \
+	add RT1, RT0; \
+	add RT3, #(s1 - s0); \
+	and RT0, RMASK, r0, lsr#(8 - 2); \
+	eor r1, RT1; \
+	\
+	ldr RT2, [CTXs0, RT2]; \
+	and RT1, RMASK, r0, lsl#2; \
+	ldr RT3, [CTXs0, RT3]; \
+	add RT1, #(s3 - s2); \
+	ldr RT0, [CTXs2, RT0]; \
+	add RT2, RT3; \
+	ldr RT1, [CTXs2, RT1]; \
+	\
+	and RT3, RMASK, r1, lsr#(24 - 2); \
+	eor RT2, RT0; \
+	and RT0, RMASK, r1, lsr#(16 - 2); \
+	add RT2, RT1; \
+	add RT0, #(s1 - s0); \
+	and RT1, RMASK, r1, lsr#(8 - 2); \
+	eor l0, RT2; \
+	\
+	ldr RT3, [CTXs0, RT3]; \
+	and RT2, RMASK, r1, lsl#2; \
+	ldr RT0, [CTXs0, RT0]; \
+	add RT2, #(s3 - s2); \
+	ldr RT1, [CTXs2, RT1]; \
+	eor l1, RKEYL; \
+	ldr RT2, [CTXs2, RT2]; \
+	\
+	eor r0, RKEYR; \
+	add RT3, RT0; \
+	eor r1, RKEYR; \
+	eor RT3, RT1; \
+	eor l0, RKEYL; \
+	add RT3, RT2; \
+	set_nextk(RKEYL, (p - s2) + (4 * (n) + ((dec) * 4))); \
+	eor l1, RT3; \
+	set_nextk(RKEYR, (p - s2) + (4 * (n) + (!(dec) * 4)));
+
+#define load_n_add_roundkey_enc2(n) \
+	load_roundkey_enc(n); \
+	eor RL0, RKEYL; \
+	eor RR0, RKEYR; \
+	eor RL1, RKEYL; \
+	eor RR1, RKEYR; \
+	load_roundkey_enc((n) + 2);
+
+#define next_key(reg, offs) \
+	ldr reg, [CTXs2, #(offs)];
+
+#define dummy(x, y) /* do nothing */
+
+#define round_enc2(n, load_next_key) \
+	F2((n) + 2, RL0, RR0, RL1, RR1, load_next_key, 0);
+
+#define load_n_add_roundkey_dec2(n) \
+	load_roundkey_dec(n); \
+	eor RL0, RKEYL; \
+	eor RR0, RKEYR; \
+	eor RL1, RKEYL; \
+	eor RR1, RKEYR; \
+	load_roundkey_dec((n) - 2);
+
+#define round_dec2(n, load_next_key) \
+	F2((n) - 3, RL0, RR0, RL1, RR1, load_next_key, 1);
+
+#define read_block2_aligned(rin, l0, r0, l1, r1, convert, rtmp) \
+	ldr l0, [rin, #(0)]; \
+	ldr r0, [rin, #(4)]; \
+	convert(l0, rtmp); \
+	ldr l1, [rin, #(8)]; \
+	convert(r0, rtmp); \
+	ldr r1, [rin, #(12)]; \
+	convert(l1, rtmp); \
+	convert(r1, rtmp);
+
+#define write_block2_aligned(rout, l0, r0, l1, r1, convert, rtmp) \
+	convert(l0, rtmp); \
+	convert(r0, rtmp); \
+	convert(l1, rtmp); \
+	str l0, [rout, #(0)]; \
+	convert(r1, rtmp); \
+	str r0, [rout, #(4)]; \
+	str l1, [rout, #(8)]; \
+	str r1, [rout, #(12)];
+
+#ifdef __ARM_FEATURE_UNALIGNED
+	/* unaligned word reads allowed */
+	#define read_block2(rin, l0, r0, l1, r1, rtmp0) \
+		read_block2_aligned(rin, l0, r0, l1, r1, host_to_be, rtmp0)
+
+	#define write_block2(rout, l0, r0, l1, r1, rtmp0, rtmp1) \
+		write_block2_aligned(rout, l0, r0, l1, r1, be_to_host, rtmp0)
+
+	#define read_block2_host(rin, l0, r0, l1, r1, rtmp0) \
+		read_block2_aligned(rin, l0, r0, l1, r1, host_to_host, rtmp0)
+
+	#define write_block2_host(rout, l0, r0, l1, r1, rtmp0, rtmp1) \
+		write_block2_aligned(rout, l0, r0, l1, r1, host_to_host, rtmp0)
+#else
+	/* need to handle unaligned reads by byte reads */
+	#define read_block2(rin, l0, r0, l1, r1, rtmp0) \
+		tst rin, #3; \
+		beq 1f; \
+			ldr_unaligned_be(l0, rin, 0, rtmp0); \
+			ldr_unaligned_be(r0, rin, 4, rtmp0); \
+			ldr_unaligned_be(l1, rin, 8, rtmp0); \
+			ldr_unaligned_be(r1, rin, 12, rtmp0); \
+			b 2f; \
+		1:;\
+			read_block2_aligned(rin, l0, r0, l1, r1, host_to_be, rtmp0); \
+		2:;
+
+	#define write_block2(rout, l0, r0, l1, r1, rtmp0, rtmp1) \
+		tst rout, #3; \
+		beq 1f; \
+			str_unaligned_be(l0, rout, 0, rtmp0, rtmp1); \
+			str_unaligned_be(r0, rout, 4, rtmp0, rtmp1); \
+			str_unaligned_be(l1, rout, 8, rtmp0, rtmp1); \
+			str_unaligned_be(r1, rout, 12, rtmp0, rtmp1); \
+			b 2f; \
+		1:;\
+			write_block2_aligned(rout, l0, r0, l1, r1, be_to_host, rtmp0); \
+		2:;
+
+	#define read_block2_host(rin, l0, r0, l1, r1, rtmp0) \
+		tst rin, #3; \
+		beq 1f; \
+			ldr_unaligned_host(l0, rin, 0, rtmp0); \
+			ldr_unaligned_host(r0, rin, 4, rtmp0); \
+			ldr_unaligned_host(l1, rin, 8, rtmp0); \
+			ldr_unaligned_host(r1, rin, 12, rtmp0); \
+			b 2f; \
+		1:;\
+			read_block2_aligned(rin, l0, r0, l1, r1, host_to_host, rtmp0); \
+		2:;
+
+	#define write_block2_host(rout, l0, r0, l1, r1, rtmp0, rtmp1) \
+		tst rout, #3; \
+		beq 1f; \
+			str_unaligned_host(l0, rout, 0, rtmp0, rtmp1); \
+			str_unaligned_host(r0, rout, 4, rtmp0, rtmp1); \
+			str_unaligned_host(l1, rout, 8, rtmp0, rtmp1); \
+			str_unaligned_host(r1, rout, 12, rtmp0, rtmp1); \
+			b 2f; \
+		1:;\
+			write_block2_aligned(rout, l0, r0, l1, r1, host_to_host, rtmp0); \
+		2:;
+#endif
+
+.align 3
+.type  _gcry_blowfish_arm_enc_blk2,%function;
+
+_gcry_blowfish_arm_enc_blk2:
+	/* input:
+	 *	preloaded: CTX
+	 *	[RL0, RR0], [RL1, RR1]: src
+	 * output:
+	 *	[RR0, RL0], [RR1, RL1]: dst
+	 */
+	push {RT0,%lr};
+
+	add CTXs2, CTXs0, #(s2 - s0);
+	mov RMASK, #(0xff << 2); /* byte mask */
+
+	load_n_add_roundkey_enc2(0);
+	round_enc2(2, next_key);
+	round_enc2(4, next_key);
+	round_enc2(6, next_key);
+	round_enc2(8, next_key);
+	round_enc2(10, next_key);
+	round_enc2(12, next_key);
+	round_enc2(14, next_key);
+	round_enc2(16, dummy);
+
+	host_to_be(RR0, RT0);
+	host_to_be(RL0, RT0);
+	host_to_be(RR1, RT0);
+	host_to_be(RL1, RT0);
+
+	pop {RT0,%pc};
+.size _gcry_blowfish_arm_enc_blk2,.-_gcry_blowfish_arm_enc_blk2;
+
+.align 3
+.globl _gcry_blowfish_arm_cfb_dec;
+.type  _gcry_blowfish_arm_cfb_dec,%function;
+
+_gcry_blowfish_arm_cfb_dec:
+	/* input:
+	 *	%r0: CTX
+	 *	%r1: dst (2 blocks)
+	 *	%r2: src (2 blocks)
+	 *	%r3: iv (64bit)
+	 */
+	push {%r2, %r4-%r11, %ip, %lr};
+
+	mov %lr, %r3;
+
+	/* Load input (iv/%r3 is aligned, src/%r2 might not be) */
+	ldm %r3, {RL0, RR0};
+	host_to_be(RL0, RT0);
+	host_to_be(RR0, RT0);
+	read_block(%r2, 0, RL1, RR1, RT0);
+
+	/* Update IV, load src[1] and save to iv[0] */
+	read_block_host(%r2, 8, %r5, %r6, RT0);
+	stm %lr, {%r5, %r6};
+
+	bl _gcry_blowfish_arm_enc_blk2;
+	/* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */
+
+	/* %r1: dst, %r0: %src */
+	pop {%r0};
+
+	/* dst = src ^ result */
+	read_block2_host(%r0, %r5, %r6, %r7, %r8, %lr);
+	eor %r5, %r4;
+	eor %r6, %r3;
+	eor %r7, %r10;
+	eor %r8, %r9;
+	write_block2_host(%r1, %r5, %r6, %r7, %r8, %r9, %r10);
+
+	pop {%r4-%r11, %ip, %pc};
+.ltorg
+.size _gcry_blowfish_arm_cfb_dec,.-_gcry_blowfish_arm_cfb_dec;
+
+.align 3
+.globl _gcry_blowfish_arm_ctr_enc;
+.type  _gcry_blowfish_arm_ctr_enc,%function;
+
+_gcry_blowfish_arm_ctr_enc:
+	/* input:
+	 *	%r0: CTX
+	 *	%r1: dst (2 blocks)
+	 *	%r2: src (2 blocks)
+	 *	%r3: iv (64bit, big-endian)
+	 */
+	push {%r2, %r4-%r11, %ip, %lr};
+
+	mov %lr, %r3;
+
+	/* Load IV (big => host endian) */
+	read_block_aligned(%lr, 0, RL0, RR0, be_to_host, RT0);
+
+	/* Construct IVs */
+	adds RR1, RR0, #1; /* +1 */
+	adc RL1, RL0, #0;
+	adds %r6, RR1, #1; /* +2 */
+	adc %r5, RL1, #0;
+
+	/* Store new IV (host => big-endian) */
+	write_block_aligned(%lr, 0, %r5, %r6, host_to_be, RT0);
+
+	bl _gcry_blowfish_arm_enc_blk2;
+	/* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */
+
+	/* %r1: dst, %r0: %src */
+	pop {%r0};
+
+	/* XOR key-stream with plaintext */
+	read_block2_host(%r0, %r5, %r6, %r7, %r8, %lr);
+	eor %r5, %r4;
+	eor %r6, %r3;
+	eor %r7, %r10;
+	eor %r8, %r9;
+	write_block2_host(%r1, %r5, %r6, %r7, %r8, %r9, %r10);
+
+	pop {%r4-%r11, %ip, %pc};
+.ltorg
+.size _gcry_blowfish_arm_ctr_enc,.-_gcry_blowfish_arm_ctr_enc;
+
+.align 3
+.type  _gcry_blowfish_arm_dec_blk2,%function;
+
+_gcry_blowfish_arm_dec_blk2:
+	/* input:
+	 *	preloaded: CTX
+	 *	[RL0, RR0], [RL1, RR1]: src
+	 * output:
+	 *	[RR0, RL0], [RR1, RL1]: dst
+	 */
+	add CTXs2, CTXs0, #(s2 - s0);
+	mov RMASK, #(0xff << 2); /* byte mask */
+
+	load_n_add_roundkey_dec2(17);
+	round_dec2(15, next_key);
+	round_dec2(13, next_key);
+	round_dec2(11, next_key);
+	round_dec2(9, next_key);
+	round_dec2(7, next_key);
+	round_dec2(5, next_key);
+	round_dec2(3, next_key);
+	round_dec2(1, dummy);
+
+	host_to_be(RR0, RT0);
+	host_to_be(RL0, RT0);
+	host_to_be(RR1, RT0);
+	host_to_be(RL1, RT0);
+
+	b .Ldec_cbc_tail;
+.ltorg
+.size _gcry_blowfish_arm_dec_blk2,.-_gcry_blowfish_arm_dec_blk2;
+
+.align 3
+.globl _gcry_blowfish_arm_cbc_dec;
+.type  _gcry_blowfish_arm_cbc_dec,%function;
+
+_gcry_blowfish_arm_cbc_dec:
+	/* input:
+	 *	%r0: CTX
+	 *	%r1: dst (2 blocks)
+	 *	%r2: src (2 blocks)
+	 *	%r3: iv (64bit)
+	 */
+	push {%r2-%r11, %ip, %lr};
+
+	read_block2(%r2, RL0, RR0, RL1, RR1, RT0);
+
+	/* dec_blk2 is only used by cbc_dec, jump directly in/out instead
+	 * of function call. */
+	b _gcry_blowfish_arm_dec_blk2;
+.Ldec_cbc_tail:
+	/* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */
+
+	/* %r0: %src, %r1: dst, %r2: iv */
+	pop {%r0, %r2};
+
+	/* load IV+1 (src[0]) to %r7:%r8. Might be unaligned. */
+	read_block_host(%r0, 0, %r7, %r8, %r5);
+	/* load IV (iv[0]) to %r5:%r6. 'iv' is aligned. */
+	ldm %r2, {%r5, %r6};
+
+	/* out[1] ^= IV+1 */
+	eor %r10, %r7;
+	eor %r9, %r8;
+	/* out[0] ^= IV */
+	eor %r4, %r5;
+	eor %r3, %r6;
+
+	/* load IV+2 (src[1]) to %r7:%r8. Might be unaligned. */
+	read_block_host(%r0, 8, %r7, %r8, %r5);
+	/* store IV+2 to iv[0] (aligned). */
+	stm %r2, {%r7, %r8};
+
+	/* store result to dst[0-3]. Might be unaligned. */
+	write_block2_host(%r1, %r4, %r3, %r10, %r9, %r5, %r6);
+
+	pop {%r4-%r11, %ip, %pc};
+.ltorg
+.size _gcry_blowfish_arm_cbc_dec,.-_gcry_blowfish_arm_cbc_dec;
+
+#endif /*HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS*/
+#endif /*__ARMEL__*/
diff --git a/cipher/blowfish-armv6.S b/cipher/blowfish-armv6.S
deleted file mode 100644
index eea879f..0000000
--- a/cipher/blowfish-armv6.S
+++ /dev/null
@@ -1,730 +0,0 @@
-/* blowfish-armv6.S  -  ARM assembly implementation of Blowfish cipher
- *
- * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna at iki.fi>
- *
- * This file is part of Libgcrypt.
- *
- * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as
- * published by the Free Software Foundation; either version 2.1 of
- * the License, or (at your option) any later version.
- *
- * Libgcrypt is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <config.h>
-
-#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__)
-#ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS
-
-.text
-
-.syntax unified
-.arm
-
-/* structure of crypto context */
-#define s0	0
-#define s1	(s0 + (1 * 256) * 4)
-#define s2	(s0 + (2 * 256) * 4)
-#define s3	(s0 + (3 * 256) * 4)
-#define p	(s3 + (1 * 256) * 4)
-
-/* register macros */
-#define CTXs0 %r0
-#define CTXs1 %r9
-#define CTXs2 %r8
-#define CTXs3 %r10
-#define RMASK %lr
-#define RKEYL %r2
-#define RKEYR %ip
-
-#define RL0 %r3
-#define RR0 %r4
-
-#define RL1 %r9
-#define RR1 %r10
-
-#define RT0 %r11
-#define RT1 %r7
-#define RT2 %r5
-#define RT3 %r6
-
-/* helper macros */
-#define ldr_unaligned_le(rout, rsrc, offs, rtmp) \
-	ldrb rout, [rsrc, #((offs) + 0)]; \
-	ldrb rtmp, [rsrc, #((offs) + 1)]; \
-	orr rout, rout, rtmp, lsl #8; \
-	ldrb rtmp, [rsrc, #((offs) + 2)]; \
-	orr rout, rout, rtmp, lsl #16; \
-	ldrb rtmp, [rsrc, #((offs) + 3)]; \
-	orr rout, rout, rtmp, lsl #24;
-
-#define str_unaligned_le(rin, rdst, offs, rtmp0, rtmp1) \
-	mov rtmp0, rin, lsr #8; \
-	strb rin, [rdst, #((offs) + 0)]; \
-	mov rtmp1, rin, lsr #16; \
-	strb rtmp0, [rdst, #((offs) + 1)]; \
-	mov rtmp0, rin, lsr #24; \
-	strb rtmp1, [rdst, #((offs) + 2)]; \
-	strb rtmp0, [rdst, #((offs) + 3)];
-
-#define ldr_unaligned_be(rout, rsrc, offs, rtmp) \
-	ldrb rout, [rsrc, #((offs) + 3)]; \
-	ldrb rtmp, [rsrc, #((offs) + 2)]; \
-	orr rout, rout, rtmp, lsl #8; \
-	ldrb rtmp, [rsrc, #((offs) + 1)]; \
-	orr rout, rout, rtmp, lsl #16; \
-	ldrb rtmp, [rsrc, #((offs) + 0)]; \
-	orr rout, rout, rtmp, lsl #24;
-
-#define str_unaligned_be(rin, rdst, offs, rtmp0, rtmp1) \
-	mov rtmp0, rin, lsr #8; \
-	strb rin, [rdst, #((offs) + 3)]; \
-	mov rtmp1, rin, lsr #16; \
-	strb rtmp0, [rdst, #((offs) + 2)]; \
-	mov rtmp0, rin, lsr #24; \
-	strb rtmp1, [rdst, #((offs) + 1)]; \
-	strb rtmp0, [rdst, #((offs) + 0)];
-
-#ifdef __ARMEL__
-	#define ldr_unaligned_host ldr_unaligned_le
-	#define str_unaligned_host str_unaligned_le
-
-	/* bswap on little-endian */
-	#define host_to_be(reg) \
-		rev reg, reg;
-	#define be_to_host(reg) \
-		rev reg, reg;
-#else
-	#define ldr_unaligned_host ldr_unaligned_be
-	#define str_unaligned_host str_unaligned_be
-
-	/* nop on big-endian */
-	#define host_to_be(reg) /*_*/
-	#define be_to_host(reg) /*_*/
-#endif
-
-#define host_to_host(x) /*_*/
-
-/***********************************************************************
- * 1-way blowfish
- ***********************************************************************/
-#define F(l, r) \
-	and RT0, RMASK, l, lsr#(24 - 2); \
-	and RT1, RMASK, l, lsr#(16 - 2); \
-	ldr RT0, [CTXs0, RT0]; \
-	and RT2, RMASK, l, lsr#(8 - 2); \
-	ldr RT1, [CTXs1, RT1]; \
-	and RT3, RMASK, l, lsl#2; \
-	ldr RT2, [CTXs2, RT2]; \
-	add RT0, RT1; \
-	ldr RT3, [CTXs3, RT3]; \
-	eor RT0, RT2; \
-	add RT0, RT3; \
-	eor r, RT0;
-
-#define load_roundkey_enc(n) \
-	ldr RKEYL, [CTXs2, #((p - s2) + (4 * (n) + 0))]; \
-	ldr RKEYR, [CTXs2, #((p - s2) + (4 * (n) + 4))];
-
-#define add_roundkey_enc() \
-	eor RL0, RKEYL; \
-	eor RR0, RKEYR;
-
-#define round_enc(n) \
-	add_roundkey_enc(); \
-	load_roundkey_enc(n); \
-	\
-	F(RL0, RR0); \
-	F(RR0, RL0);
-
-#define load_roundkey_dec(n) \
-	ldr RKEYL, [CTXs2, #((p - s2) + (4 * ((n) - 1) + 4))]; \
-	ldr RKEYR, [CTXs2, #((p - s2) + (4 * ((n) - 1) + 0))];
-
-#define add_roundkey_dec() \
-	eor RL0, RKEYL; \
-	eor RR0, RKEYR;
-
-#define round_dec(n) \
-	add_roundkey_dec(); \
-	load_roundkey_dec(n); \
-	\
-	F(RL0, RR0); \
-	F(RR0, RL0);
-
-#define read_block_aligned(rin, offs, l0, r0, convert) \
-	ldr l0, [rin, #((offs) + 0)]; \
-	ldr r0, [rin, #((offs) + 4)]; \
-	convert(l0); \
-	convert(r0);
-
-#define write_block_aligned(rout, offs, l0, r0, convert) \
-	convert(l0); \
-	convert(r0); \
-	str l0, [rout, #((offs) + 0)]; \
-	str r0, [rout, #((offs) + 4)];
-
-#ifdef __ARM_FEATURE_UNALIGNED
-	/* unaligned word reads allowed */
-	#define read_block(rin, offs, l0, r0, rtmp0) \
-		read_block_aligned(rin, offs, l0, r0, host_to_be)
-
-	#define write_block(rout, offs, r0, l0, rtmp0, rtmp1) \
-		write_block_aligned(rout, offs, r0, l0, be_to_host)
-
-	#define read_block_host(rin, offs, l0, r0, rtmp0) \
-		read_block_aligned(rin, offs, l0, r0, host_to_host)
-
-	#define write_block_host(rout, offs, r0, l0, rtmp0, rtmp1) \
-		write_block_aligned(rout, offs, r0, l0, host_to_host)
-#else
-	/* need to handle unaligned reads by byte reads */
-	#define read_block(rin, offs, l0, r0, rtmp0) \
-		tst rin, #3; \
-		beq 1f; \
-			ldr_unaligned_be(l0, rin, (offs) + 0, rtmp0); \
-			ldr_unaligned_be(r0, rin, (offs) + 4, rtmp0); \
-			b 2f; \
-		1:;\
-			read_block_aligned(rin, offs, l0, r0, host_to_be); \
-		2:;
-
-	#define write_block(rout, offs, l0, r0, rtmp0, rtmp1) \
-		tst rout, #3; \
-		beq 1f; \
-			str_unaligned_be(l0, rout, (offs) + 0, rtmp0, rtmp1); \
-			str_unaligned_be(r0, rout, (offs) + 4, rtmp0, rtmp1); \
-			b 2f; \
-		1:;\
-			write_block_aligned(rout, offs, l0, r0, be_to_host); \
-		2:;
-
-	#define read_block_host(rin, offs, l0, r0, rtmp0) \
-		tst rin, #3; \
-		beq 1f; \
-			ldr_unaligned_host(l0, rin, (offs) + 0, rtmp0); \
-			ldr_unaligned_host(r0, rin, (offs) + 4, rtmp0); \
-			b 2f; \
-		1:;\
-			read_block_aligned(rin, offs, l0, r0, host_to_host); \
-		2:;
-
-	#define write_block_host(rout, offs, l0, r0, rtmp0, rtmp1) \
-		tst rout, #3; \
-		beq 1f; \
-			str_unaligned_host(l0, rout, (offs) + 0, rtmp0, rtmp1); \
-			str_unaligned_host(r0, rout, (offs) + 4, rtmp0, rtmp1); \
-			b 2f; \
-		1:;\
-			write_block_aligned(rout, offs, l0, r0, host_to_host); \
-		2:;
-#endif
-
-.align 3
-.type  __blowfish_enc_blk1,%function;
-
-__blowfish_enc_blk1:
-	/* input:
-	 *	preloaded: CTX
-	 *	[RL0, RR0]: src
-	 * output:
-	 *	[RR0, RL0]: dst
-	 */
-	push {%lr};
-
-	add CTXs1, CTXs0, #(s1 - s0);
-	add CTXs2, CTXs0, #(s2 - s0);
-	mov RMASK, #(0xff << 2); /* byte mask */
-	add CTXs3, CTXs1, #(s3 - s1);
-
-	load_roundkey_enc(0);
-	round_enc(2);
-	round_enc(4);
-	round_enc(6);
-	round_enc(8);
-	round_enc(10);
-	round_enc(12);
-	round_enc(14);
-	round_enc(16);
-	add_roundkey_enc();
-
-	pop {%pc};
-.size __blowfish_enc_blk1,.-__blowfish_enc_blk1;
-
-.align 8
-.globl  _gcry_blowfish_armv6_do_encrypt
-.type   _gcry_blowfish_armv6_do_encrypt,%function;
-
-_gcry_blowfish_armv6_do_encrypt:
-	/* input:
-	 *	%r0: ctx, CTX
-	 *	%r1: u32 *ret_xl
-	 *	%r2: u32 *ret_xr
-	 */
-	push {%r2, %r4-%r11, %ip, %lr};
-
-	ldr RL0, [%r1];
-	ldr RR0, [%r2];
-
-	bl __blowfish_enc_blk1;
-
-	pop {%r2};
-	str RR0, [%r1];
-	str RL0, [%r2];
-
-	pop {%r4-%r11, %ip, %pc};
-.size _gcry_blowfish_armv6_do_encrypt,.-_gcry_blowfish_armv6_do_encrypt;
-
-.align 3
-.global _gcry_blowfish_armv6_encrypt_block
-.type   _gcry_blowfish_armv6_encrypt_block,%function;
-
-_gcry_blowfish_armv6_encrypt_block:
-	/* input:
-	 *	%r0: ctx, CTX
-	 *	%r1: dst
-	 *	%r2: src
-	 */
-	push {%r4-%r11, %ip, %lr};
-
-	read_block(%r2, 0, RL0, RR0, RT0);
-
-	bl __blowfish_enc_blk1;
-
-	write_block(%r1, 0, RR0, RL0, RT0, RT1);
-
-	pop {%r4-%r11, %ip, %pc};
-.size _gcry_blowfish_armv6_encrypt_block,.-_gcry_blowfish_armv6_encrypt_block;
-
-.align 3
-.global _gcry_blowfish_armv6_decrypt_block
-.type   _gcry_blowfish_armv6_decrypt_block,%function;
-
-_gcry_blowfish_armv6_decrypt_block:
-	/* input:
-	 *	%r0: ctx, CTX
-	 *	%r1: dst
-	 *	%r2: src
-	 */
-	push {%r4-%r11, %ip, %lr};
-
-	add CTXs1, CTXs0, #(s1 - s0);
-	add CTXs2, CTXs0, #(s2 - s0);
-	mov RMASK, #(0xff << 2); /* byte mask */
-	add CTXs3, CTXs1, #(s3 - s1);
-
-	read_block(%r2, 0, RL0, RR0, RT0);
-
-	load_roundkey_dec(17);
-	round_dec(15);
-	round_dec(13);
-	round_dec(11);
-	round_dec(9);
-	round_dec(7);
-	round_dec(5);
-	round_dec(3);
-	round_dec(1);
-	add_roundkey_dec();
-
-	write_block(%r1, 0, RR0, RL0, RT0, RT1);
-
-	pop {%r4-%r11, %ip, %pc};
-.size _gcry_blowfish_armv6_decrypt_block,.-_gcry_blowfish_armv6_decrypt_block;
-
-/***********************************************************************
- * 2-way blowfish
- ***********************************************************************/
-#define F2(n, l0, r0, l1, r1, set_nextk, dec) \
-	\
-	and RT0, RMASK, l0, lsr#(24 - 2); \
-	and RT1, RMASK, l0, lsr#(16 - 2); \
-	and RT2, RMASK, l0, lsr#(8 - 2); \
-	add RT1, #(s1 - s0); \
-	\
-	ldr RT0, [CTXs0, RT0]; \
-	and RT3, RMASK, l0, lsl#2; \
-	ldr RT1, [CTXs0, RT1]; \
-	add RT3, #(s3 - s2); \
-	ldr RT2, [CTXs2, RT2]; \
-	add RT0, RT1; \
-	ldr RT3, [CTXs2, RT3]; \
-	\
-	and RT1, RMASK, l1, lsr#(24 - 2); \
-	eor RT0, RT2; \
-	and RT2, RMASK, l1, lsr#(16 - 2); \
-	add RT0, RT3; \
-	add RT2, #(s1 - s0); \
-	and RT3, RMASK, l1, lsr#(8 - 2); \
-	eor r0, RT0; \
-	\
-	ldr RT1, [CTXs0, RT1]; \
-	and RT0, RMASK, l1, lsl#2; \
-	ldr RT2, [CTXs0, RT2]; \
-	add RT0, #(s3 - s2); \
-	ldr RT3, [CTXs2, RT3]; \
-	add RT1, RT2; \
-	ldr RT0, [CTXs2, RT0]; \
-	\
-	and RT2, RMASK, r0, lsr#(24 - 2); \
-	eor RT1, RT3; \
-	and RT3, RMASK, r0, lsr#(16 - 2); \
-	add RT1, RT0; \
-	add RT3, #(s1 - s0); \
-	and RT0, RMASK, r0, lsr#(8 - 2); \
-	eor r1, RT1; \
-	\
-	ldr RT2, [CTXs0, RT2]; \
-	and RT1, RMASK, r0, lsl#2; \
-	ldr RT3, [CTXs0, RT3]; \
-	add RT1, #(s3 - s2); \
-	ldr RT0, [CTXs2, RT0]; \
-	add RT2, RT3; \
-	ldr RT1, [CTXs2, RT1]; \
-	\
-	and RT3, RMASK, r1, lsr#(24 - 2); \
-	eor RT2, RT0; \
-	and RT0, RMASK, r1, lsr#(16 - 2); \
-	add RT2, RT1; \
-	add RT0, #(s1 - s0); \
-	and RT1, RMASK, r1, lsr#(8 - 2); \
-	eor l0, RT2; \
-	\
-	ldr RT3, [CTXs0, RT3]; \
-	and RT2, RMASK, r1, lsl#2; \
-	ldr RT0, [CTXs0, RT0]; \
-	add RT2, #(s3 - s2); \
-	ldr RT1, [CTXs2, RT1]; \
-	eor l1, RKEYL; \
-	ldr RT2, [CTXs2, RT2]; \
-	\
-	eor r0, RKEYR; \
-	add RT3, RT0; \
-	eor r1, RKEYR; \
-	eor RT3, RT1; \
-	eor l0, RKEYL; \
-	add RT3, RT2; \
-	set_nextk(RKEYL, (p - s2) + (4 * (n) + ((dec) * 4))); \
-	eor l1, RT3; \
-	set_nextk(RKEYR, (p - s2) + (4 * (n) + (!(dec) * 4)));
-
-#define load_n_add_roundkey_enc2(n) \
-	load_roundkey_enc(n); \
-	eor RL0, RKEYL; \
-	eor RR0, RKEYR; \
-	eor RL1, RKEYL; \
-	eor RR1, RKEYR; \
-	load_roundkey_enc((n) + 2);
-
-#define next_key(reg, offs) \
-	ldr reg, [CTXs2, #(offs)];
-
-#define dummy(x, y) /* do nothing */
-
-#define round_enc2(n, load_next_key) \
-	F2((n) + 2, RL0, RR0, RL1, RR1, load_next_key, 0);
-
-#define load_n_add_roundkey_dec2(n) \
-	load_roundkey_dec(n); \
-	eor RL0, RKEYL; \
-	eor RR0, RKEYR; \
-	eor RL1, RKEYL; \
-	eor RR1, RKEYR; \
-	load_roundkey_dec((n) - 2);
-
-#define round_dec2(n, load_next_key) \
-	F2((n) - 3, RL0, RR0, RL1, RR1, load_next_key, 1);
-
-#define read_block2_aligned(rin, l0, r0, l1, r1, convert) \
-	ldr l0, [rin, #(0)]; \
-	ldr r0, [rin, #(4)]; \
-	convert(l0); \
-	ldr l1, [rin, #(8)]; \
-	convert(r0); \
-	ldr r1, [rin, #(12)]; \
-	convert(l1); \
-	convert(r1);
-
-#define write_block2_aligned(rout, l0, r0, l1, r1, convert) \
-	convert(l0); \
-	convert(r0); \
-	convert(l1); \
-	str l0, [rout, #(0)]; \
-	convert(r1); \
-	str r0, [rout, #(4)]; \
-	str l1, [rout, #(8)]; \
-	str r1, [rout, #(12)];
-
-#ifdef __ARM_FEATURE_UNALIGNED
-	/* unaligned word reads allowed */
-	#define read_block2(rin, l0, r0, l1, r1, rtmp0) \
-		read_block2_aligned(rin, l0, r0, l1, r1, host_to_be)
-
-	#define write_block2(rout, l0, r0, l1, r1, rtmp0, rtmp1) \
-		write_block2_aligned(rout, l0, r0, l1, r1, be_to_host)
-
-	#define read_block2_host(rin, l0, r0, l1, r1, rtmp0) \
-		read_block2_aligned(rin, l0, r0, l1, r1, host_to_host)
-
-	#define write_block2_host(rout, l0, r0, l1, r1, rtmp0, rtmp1) \
-		write_block2_aligned(rout, l0, r0, l1, r1, host_to_host)
-#else
-	/* need to handle unaligned reads by byte reads */
-	#define read_block2(rin, l0, r0, l1, r1, rtmp0) \
-		tst rin, #3; \
-		beq 1f; \
-			ldr_unaligned_be(l0, rin, 0, rtmp0); \
-			ldr_unaligned_be(r0, rin, 4, rtmp0); \
-			ldr_unaligned_be(l1, rin, 8, rtmp0); \
-			ldr_unaligned_be(r1, rin, 12, rtmp0); \
-			b 2f; \
-		1:;\
-			read_block2_aligned(rin, l0, r0, l1, r1, host_to_be); \
-		2:;
-
-	#define write_block2(rout, l0, r0, l1, r1, rtmp0, rtmp1) \
-		tst rout, #3; \
-		beq 1f; \
-			str_unaligned_be(l0, rout, 0, rtmp0, rtmp1); \
-			str_unaligned_be(r0, rout, 4, rtmp0, rtmp1); \
-			str_unaligned_be(l1, rout, 8, rtmp0, rtmp1); \
-			str_unaligned_be(r1, rout, 12, rtmp0, rtmp1); \
-			b 2f; \
-		1:;\
-			write_block2_aligned(rout, l0, r0, l1, r1, be_to_host); \
-		2:;
-
-	#define read_block2_host(rin, l0, r0, l1, r1, rtmp0) \
-		tst rin, #3; \
-		beq 1f; \
-			ldr_unaligned_host(l0, rin, 0, rtmp0); \
-			ldr_unaligned_host(r0, rin, 4, rtmp0); \
-			ldr_unaligned_host(l1, rin, 8, rtmp0); \
-			ldr_unaligned_host(r1, rin, 12, rtmp0); \
-			b 2f; \
-		1:;\
-			read_block2_aligned(rin, l0, r0, l1, r1, host_to_host); \
-		2:;
-
-	#define write_block2_host(rout, l0, r0, l1, r1, rtmp0, rtmp1) \
-		tst rout, #3; \
-		beq 1f; \
-			str_unaligned_host(l0, rout, 0, rtmp0, rtmp1); \
-			str_unaligned_host(r0, rout, 4, rtmp0, rtmp1); \
-			str_unaligned_host(l1, rout, 8, rtmp0, rtmp1); \
-			str_unaligned_host(r1, rout, 12, rtmp0, rtmp1); \
-			b 2f; \
-		1:;\
-			write_block2_aligned(rout, l0, r0, l1, r1, host_to_host); \
-		2:;
-#endif
-
-.align 3
-.type  _gcry_blowfish_armv6_enc_blk2,%function;
-
-_gcry_blowfish_armv6_enc_blk2:
-	/* input:
-	 *	preloaded: CTX
-	 *	[RL0, RR0], [RL1, RR1]: src
-	 * output:
-	 *	[RR0, RL0], [RR1, RL1]: dst
-	 */
-	push {%lr};
-
-	add CTXs2, CTXs0, #(s2 - s0);
-	mov RMASK, #(0xff << 2); /* byte mask */
-
-	load_n_add_roundkey_enc2(0);
-	round_enc2(2, next_key);
-	round_enc2(4, next_key);
-	round_enc2(6, next_key);
-	round_enc2(8, next_key);
-	round_enc2(10, next_key);
-	round_enc2(12, next_key);
-	round_enc2(14, next_key);
-	round_enc2(16, dummy);
-
-	host_to_be(RR0);
-	host_to_be(RL0);
-	host_to_be(RR1);
-	host_to_be(RL1);
-
-	pop {%pc};
-.size _gcry_blowfish_armv6_enc_blk2,.-_gcry_blowfish_armv6_enc_blk2;
-
-.align 3
-.globl _gcry_blowfish_armv6_cfb_dec;
-.type  _gcry_blowfish_armv6_cfb_dec,%function;
-
-_gcry_blowfish_armv6_cfb_dec:
-	/* input:
-	 *	%r0: CTX
-	 *	%r1: dst (2 blocks)
-	 *	%r2: src (2 blocks)
-	 *	%r3: iv (64bit)
-	 */
-	push {%r2, %r4-%r11, %ip, %lr};
-
-	mov %lr, %r3;
-
-	/* Load input (iv/%r3 is aligned, src/%r2 might not be) */
-	ldm %r3, {RL0, RR0};
-	host_to_be(RL0);
-	host_to_be(RR0);
-	read_block(%r2, 0, RL1, RR1, RT0);
-
-	/* Update IV, load src[1] and save to iv[0] */
-	read_block_host(%r2, 8, %r5, %r6, RT0);
-	stm %lr, {%r5, %r6};
-
-	bl _gcry_blowfish_armv6_enc_blk2;
-	/* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */
-
-	/* %r1: dst, %r0: %src */
-	pop {%r0};
-
-	/* dst = src ^ result */
-	read_block2_host(%r0, %r5, %r6, %r7, %r8, %lr);
-	eor %r5, %r4;
-	eor %r6, %r3;
-	eor %r7, %r10;
-	eor %r8, %r9;
-	write_block2_host(%r1, %r5, %r6, %r7, %r8, %r9, %r10);
-
-	pop {%r4-%r11, %ip, %pc};
-.ltorg
-.size _gcry_blowfish_armv6_cfb_dec,.-_gcry_blowfish_armv6_cfb_dec;
-
-.align 3
-.globl _gcry_blowfish_armv6_ctr_enc;
-.type  _gcry_blowfish_armv6_ctr_enc,%function;
-
-_gcry_blowfish_armv6_ctr_enc:
-	/* input:
-	 *	%r0: CTX
-	 *	%r1: dst (2 blocks)
-	 *	%r2: src (2 blocks)
-	 *	%r3: iv (64bit, big-endian)
-	 */
-	push {%r2, %r4-%r11, %ip, %lr};
-
-	mov %lr, %r3;
-
-	/* Load IV (big => host endian) */
-	read_block_aligned(%lr, 0, RL0, RR0, be_to_host);
-
-	/* Construct IVs */
-	adds RR1, RR0, #1; /* +1 */
-	adc RL1, RL0, #0;
-	adds %r6, RR1, #1; /* +2 */
-	adc %r5, RL1, #0;
-
-	/* Store new IV (host => big-endian) */
-	write_block_aligned(%lr, 0, %r5, %r6, host_to_be);
-
-	bl _gcry_blowfish_armv6_enc_blk2;
-	/* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */
-
-	/* %r1: dst, %r0: %src */
-	pop {%r0};
-
-	/* XOR key-stream with plaintext */
-	read_block2_host(%r0, %r5, %r6, %r7, %r8, %lr);
-	eor %r5, %r4;
-	eor %r6, %r3;
-	eor %r7, %r10;
-	eor %r8, %r9;
-	write_block2_host(%r1, %r5, %r6, %r7, %r8, %r9, %r10);
-
-	pop {%r4-%r11, %ip, %pc};
-.ltorg
-.size _gcry_blowfish_armv6_ctr_enc,.-_gcry_blowfish_armv6_ctr_enc;
-
-.align 3
-.type  _gcry_blowfish_armv6_dec_blk2,%function;
-
-_gcry_blowfish_armv6_dec_blk2:
-	/* input:
-	 *	preloaded: CTX
-	 *	[RL0, RR0], [RL1, RR1]: src
-	 * output:
-	 *	[RR0, RL0], [RR1, RL1]: dst
-	 */
-	add CTXs2, CTXs0, #(s2 - s0);
-	mov RMASK, #(0xff << 2); /* byte mask */
-
-	load_n_add_roundkey_dec2(17);
-	round_dec2(15, next_key);
-	round_dec2(13, next_key);
-	round_dec2(11, next_key);
-	round_dec2(9, next_key);
-	round_dec2(7, next_key);
-	round_dec2(5, next_key);
-	round_dec2(3, next_key);
-	round_dec2(1, dummy);
-
-	host_to_be(RR0);
-	host_to_be(RL0);
-	host_to_be(RR1);
-	host_to_be(RL1);
-
-	b .Ldec_cbc_tail;
-.ltorg
-.size _gcry_blowfish_armv6_dec_blk2,.-_gcry_blowfish_armv6_dec_blk2;
-
-.align 3
-.globl _gcry_blowfish_armv6_cbc_dec;
-.type  _gcry_blowfish_armv6_cbc_dec,%function;
-
-_gcry_blowfish_armv6_cbc_dec:
-	/* input:
-	 *	%r0: CTX
-	 *	%r1: dst (2 blocks)
-	 *	%r2: src (2 blocks)
-	 *	%r3: iv (64bit)
-	 */
-	push {%r2-%r11, %ip, %lr};
-
-	read_block2(%r2, RL0, RR0, RL1, RR1, RT0);
-
-	/* dec_blk2 is only used by cbc_dec, jump directly in/out instead
-	 * of function call. */
-	b _gcry_blowfish_armv6_dec_blk2;
-.Ldec_cbc_tail:
-	/* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */
-
-	/* %r0: %src, %r1: dst, %r2: iv */
-	pop {%r0, %r2};
-
-	/* load IV+1 (src[0]) to %r7:%r8. Might be unaligned. */
-	read_block_host(%r0, 0, %r7, %r8, %r5);
-	/* load IV (iv[0]) to %r5:%r6. 'iv' is aligned. */
-	ldm %r2, {%r5, %r6};
-
-	/* out[1] ^= IV+1 */
-	eor %r10, %r7;
-	eor %r9, %r8;
-	/* out[0] ^= IV */
-	eor %r4, %r5;
-	eor %r3, %r6;
-
-	/* load IV+2 (src[1]) to %r7:%r8. Might be unaligned. */
-	read_block_host(%r0, 8, %r7, %r8, %r5);
-	/* store IV+2 to iv[0] (aligned). */
-	stm %r2, {%r7, %r8};
-
-	/* store result to dst[0-3]. Might be unaligned. */
-	write_block2_host(%r1, %r4, %r3, %r10, %r9, %r5, %r6);
-
-	pop {%r4-%r11, %ip, %pc};
-.ltorg
-.size _gcry_blowfish_armv6_cbc_dec,.-_gcry_blowfish_armv6_cbc_dec;
-
-#endif /*HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS*/
-#endif /*__ARM_ARCH >= 6*/
diff --git a/cipher/blowfish.c b/cipher/blowfish.c
index 2f739c8..2bedbea 100644
--- a/cipher/blowfish.c
+++ b/cipher/blowfish.c
@@ -50,11 +50,11 @@
 # define USE_AMD64_ASM 1
 #endif
 
-/* USE_ARMV6_ASM indicates whether to use ARMv6 assembly code. */
-#undef USE_ARMV6_ASM
-#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__)
+/* USE_ARM_ASM indicates whether to use ARM assembly code. */
+#undef USE_ARM_ASM
+#if defined(__ARMEL__)
 # if (BLOWFISH_ROUNDS == 16) && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS)
-#  define USE_ARMV6_ASM 1
+#  define USE_ARM_ASM 1
 # endif
 #endif
 
@@ -314,44 +314,44 @@ decrypt_block (void *context, byte *outbuf, const byte *inbuf)
   return /*burn_stack*/ (2*8);
 }
 
-#elif defined(USE_ARMV6_ASM)
+#elif defined(USE_ARM_ASM)
 
 /* Assembly implementations of Blowfish. */
-extern void _gcry_blowfish_armv6_do_encrypt(BLOWFISH_context *c, u32 *ret_xl,
+extern void _gcry_blowfish_arm_do_encrypt(BLOWFISH_context *c, u32 *ret_xl,
 					    u32 *ret_xr);
 
-extern void _gcry_blowfish_armv6_encrypt_block(BLOWFISH_context *c, byte *out,
+extern void _gcry_blowfish_arm_encrypt_block(BLOWFISH_context *c, byte *out,
 					       const byte *in);
 
-extern void _gcry_blowfish_armv6_decrypt_block(BLOWFISH_context *c, byte *out,
+extern void _gcry_blowfish_arm_decrypt_block(BLOWFISH_context *c, byte *out,
 					       const byte *in);
 
 /* These assembly implementations process two blocks in parallel. */
-extern void _gcry_blowfish_armv6_ctr_enc(BLOWFISH_context *ctx, byte *out,
+extern void _gcry_blowfish_arm_ctr_enc(BLOWFISH_context *ctx, byte *out,
 					 const byte *in, byte *ctr);
 
-extern void _gcry_blowfish_armv6_cbc_dec(BLOWFISH_context *ctx, byte *out,
+extern void _gcry_blowfish_arm_cbc_dec(BLOWFISH_context *ctx, byte *out,
 					 const byte *in, byte *iv);
 
-extern void _gcry_blowfish_armv6_cfb_dec(BLOWFISH_context *ctx, byte *out,
+extern void _gcry_blowfish_arm_cfb_dec(BLOWFISH_context *ctx, byte *out,
 					 const byte *in, byte *iv);
 
 static void
 do_encrypt ( BLOWFISH_context *bc, u32 *ret_xl, u32 *ret_xr )
 {
-  _gcry_blowfish_armv6_do_encrypt (bc, ret_xl, ret_xr);
+  _gcry_blowfish_arm_do_encrypt (bc, ret_xl, ret_xr);
 }
 
 static void
 do_encrypt_block (BLOWFISH_context *context, byte *outbuf, const byte *inbuf)
 {
-  _gcry_blowfish_armv6_encrypt_block (context, outbuf, inbuf);
+  _gcry_blowfish_arm_encrypt_block (context, outbuf, inbuf);
 }
 
 static void
 do_decrypt_block (BLOWFISH_context *context, byte *outbuf, const byte *inbuf)
 {
-  _gcry_blowfish_armv6_decrypt_block (context, outbuf, inbuf);
+  _gcry_blowfish_arm_decrypt_block (context, outbuf, inbuf);
 }
 
 static unsigned int
@@ -370,7 +370,7 @@ decrypt_block (void *context, byte *outbuf, const byte *inbuf)
   return /*burn_stack*/ (10*4);
 }
 
-#else /*USE_ARMV6_ASM*/
+#else /*USE_ARM_ASM*/
 
 #if BLOWFISH_ROUNDS != 16
 static inline u32
@@ -580,7 +580,7 @@ decrypt_block (void *context, byte *outbuf, const byte *inbuf)
   return /*burn_stack*/ (64);
 }
 
-#endif /*!USE_AMD64_ASM&&!USE_ARMV6_ASM*/
+#endif /*!USE_AMD64_ASM&&!USE_ARM_ASM*/
 
 
 /* Bulk encryption of complete blocks in CTR mode.  This function is only
@@ -615,12 +615,12 @@ _gcry_blowfish_ctr_enc(void *context, unsigned char *ctr, void *outbuf_arg,
     /* Use generic code to handle smaller chunks... */
     /* TODO: use caching instead? */
   }
-#elif defined(USE_ARMV6_ASM)
+#elif defined(USE_ARM_ASM)
   {
     /* Process data in 2 block chunks. */
     while (nblocks >= 2)
       {
-        _gcry_blowfish_armv6_ctr_enc(ctx, outbuf, inbuf, ctr);
+        _gcry_blowfish_arm_ctr_enc(ctx, outbuf, inbuf, ctr);
 
         nblocks -= 2;
         outbuf += 2 * BLOWFISH_BLOCKSIZE;
@@ -683,12 +683,12 @@ _gcry_blowfish_cbc_dec(void *context, unsigned char *iv, void *outbuf_arg,
 
     /* Use generic code to handle smaller chunks... */
   }
-#elif defined(USE_ARMV6_ASM)
+#elif defined(USE_ARM_ASM)
   {
     /* Process data in 2 block chunks. */
     while (nblocks >= 2)
       {
-        _gcry_blowfish_armv6_cbc_dec(ctx, outbuf, inbuf, iv);
+        _gcry_blowfish_arm_cbc_dec(ctx, outbuf, inbuf, iv);
 
         nblocks -= 2;
         outbuf += 2 * BLOWFISH_BLOCKSIZE;
@@ -746,12 +746,12 @@ _gcry_blowfish_cfb_dec(void *context, unsigned char *iv, void *outbuf_arg,
 
     /* Use generic code to handle smaller chunks... */
   }
-#elif defined(USE_ARMV6_ASM)
+#elif defined(USE_ARM_ASM)
   {
     /* Process data in 2 block chunks. */
     while (nblocks >= 2)
       {
-        _gcry_blowfish_armv6_cfb_dec(ctx, outbuf, inbuf, iv);
+        _gcry_blowfish_arm_cfb_dec(ctx, outbuf, inbuf, iv);
 
         nblocks -= 2;
         outbuf += 2 * BLOWFISH_BLOCKSIZE;
diff --git a/cipher/camellia-arm.S b/cipher/camellia-arm.S
new file mode 100644
index 0000000..820c46e
--- /dev/null
+++ b/cipher/camellia-arm.S
@@ -0,0 +1,616 @@
+/* camellia-arm.S  -  ARM assembly implementation of Camellia cipher
+ *
+ * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#if defined(__ARMEL__)
+#ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS
+
+.text
+
+.syntax unified
+.arm
+
+#define CAMELLIA_TABLE_BYTE_LEN 272
+
+/* struct camellia_ctx: */
+#define key_table 0
+#define key_length CAMELLIA_TABLE_BYTE_LEN
+
+/* register macros */
+#define CTX %r0
+#define RTAB1 %ip
+#define RTAB3 %r1
+#define RMASK %lr
+
+#define IL %r2
+#define IR %r3
+
+#define XL %r4
+#define XR %r5
+#define YL %r6
+#define YR %r7
+
+#define RT0 %r8
+#define RT1 %r9
+#define RT2 %r10
+#define RT3 %r11
+
+/* helper macros */
+#define ldr_unaligned_be(rout, rsrc, offs, rtmp) \
+	ldrb rout, [rsrc, #((offs) + 3)]; \
+	ldrb rtmp, [rsrc, #((offs) + 2)]; \
+	orr rout, rout, rtmp, lsl #8; \
+	ldrb rtmp, [rsrc, #((offs) + 1)]; \
+	orr rout, rout, rtmp, lsl #16; \
+	ldrb rtmp, [rsrc, #((offs) + 0)]; \
+	orr rout, rout, rtmp, lsl #24;
+
+#define str_unaligned_be(rin, rdst, offs, rtmp0, rtmp1) \
+	mov rtmp0, rin, lsr #8; \
+	strb rin, [rdst, #((offs) + 3)]; \
+	mov rtmp1, rin, lsr #16; \
+	strb rtmp0, [rdst, #((offs) + 2)]; \
+	mov rtmp0, rin, lsr #24; \
+	strb rtmp1, [rdst, #((offs) + 1)]; \
+	strb rtmp0, [rdst, #((offs) + 0)];
+
+#ifdef __ARMEL__
+#ifdef HAVE_ARM_ARCH_V6
+	#define host_to_be(reg, rtmp) \
+		rev reg, reg;
+	#define be_to_host(reg, rtmp) \
+		rev reg, reg;
+#else
+	#define host_to_be(reg, rtmp) \
+		eor	rtmp, reg, reg, ror #16; \
+		mov	rtmp, rtmp, lsr #8; \
+		bic	rtmp, rtmp, #65280; \
+		eor	reg, rtmp, reg, ror #8;
+	#define be_to_host(reg, rtmp) \
+		eor	rtmp, reg, reg, ror #16; \
+		mov	rtmp, rtmp, lsr #8; \
+		bic	rtmp, rtmp, #65280; \
+		eor	reg, rtmp, reg, ror #8;
+#endif
+#else
+	/* nop on big-endian */
+	#define host_to_be(reg, rtmp) /*_*/
+	#define be_to_host(reg, rtmp) /*_*/
+#endif
+
+#define ldr_input_aligned_be(rin, a, b, c, d, rtmp) \
+	ldr a, [rin, #0]; \
+	ldr b, [rin, #4]; \
+	be_to_host(a, rtmp); \
+	ldr c, [rin, #8]; \
+	be_to_host(b, rtmp); \
+	ldr d, [rin, #12]; \
+	be_to_host(c, rtmp); \
+	be_to_host(d, rtmp);
+
+#define str_output_aligned_be(rout, a, b, c, d, rtmp) \
+	be_to_host(a, rtmp); \
+	be_to_host(b, rtmp); \
+	str a, [rout, #0]; \
+	be_to_host(c, rtmp); \
+	str b, [rout, #4]; \
+	be_to_host(d, rtmp); \
+	str c, [rout, #8]; \
+	str d, [rout, #12];
+
+#ifdef __ARM_FEATURE_UNALIGNED
+	/* unaligned word reads/writes allowed */
+	#define ldr_input_be(rin, ra, rb, rc, rd, rtmp) \
+		ldr_input_aligned_be(rin, ra, rb, rc, rd, rtmp)
+
+	#define str_output_be(rout, ra, rb, rc, rd, rtmp0, rtmp1) \
+		str_output_aligned_be(rout, ra, rb, rc, rd, rtmp0)
+#else
+	/* need to handle unaligned reads/writes by byte reads */
+	#define ldr_input_be(rin, ra, rb, rc, rd, rtmp0) \
+		tst rin, #3; \
+		beq 1f; \
+			ldr_unaligned_be(ra, rin, 0, rtmp0); \
+			ldr_unaligned_be(rb, rin, 4, rtmp0); \
+			ldr_unaligned_be(rc, rin, 8, rtmp0); \
+			ldr_unaligned_be(rd, rin, 12, rtmp0); \
+			b 2f; \
+		1:;\
+			ldr_input_aligned_be(rin, ra, rb, rc, rd, rtmp0); \
+		2:;
+
+	#define str_output_be(rout, ra, rb, rc, rd, rtmp0, rtmp1) \
+		tst rout, #3; \
+		beq 1f; \
+			str_unaligned_be(ra, rout, 0, rtmp0, rtmp1); \
+			str_unaligned_be(rb, rout, 4, rtmp0, rtmp1); \
+			str_unaligned_be(rc, rout, 8, rtmp0, rtmp1); \
+			str_unaligned_be(rd, rout, 12, rtmp0, rtmp1); \
+			b 2f; \
+		1:;\
+			str_output_aligned_be(rout, ra, rb, rc, rd, rtmp0); \
+		2:;
+#endif
+
+/**********************************************************************
+  1-way camellia
+ **********************************************************************/
+#define roundsm(xl, xr, kl, kr, yl, yr) \
+	ldr RT2, [CTX, #(key_table + ((kl) * 4))]; \
+	and  IR, RMASK, xr, lsl#(4);      /*sp1110*/ \
+	ldr RT3, [CTX, #(key_table + ((kr) * 4))]; \
+	and  IL, RMASK, xl, lsr#(24 - 4); /*sp1110*/ \
+	and RT0, RMASK, xr, lsr#(16 - 4); /*sp3033*/ \
+	ldr  IR, [RTAB1,  IR]; \
+	and RT1, RMASK, xl, lsr#(8 - 4);  /*sp3033*/ \
+	eor yl, RT2; \
+	ldr  IL, [RTAB1,  IL]; \
+	eor yr, RT3; \
+	\
+	ldr RT0, [RTAB3, RT0]; \
+	add RTAB1, #4; \
+	ldr RT1, [RTAB3, RT1]; \
+	add RTAB3, #4; \
+	\
+	and RT2, RMASK, xr, lsr#(24 - 4); /*sp0222*/ \
+	and RT3, RMASK, xl, lsr#(16 - 4); /*sp0222*/ \
+	\
+	eor IR, RT0; \
+	eor IL, RT1; \
+	\
+	ldr RT2, [RTAB1, RT2]; \
+	and RT0, RMASK, xr, lsr#(8 - 4);  /*sp4404*/ \
+	ldr RT3, [RTAB1, RT3]; \
+	and RT1, RMASK, xl, lsl#(4);      /*sp4404*/ \
+	\
+	ldr RT0, [RTAB3, RT0]; \
+	sub RTAB1, #4; \
+	ldr RT1, [RTAB3, RT1]; \
+	sub RTAB3, #4; \
+	\
+	eor IR, RT2; \
+	eor IL, RT3; \
+	eor IR, RT0; \
+	eor IL, RT1; \
+	\
+	eor IR, IL; \
+	eor yr, yr, IL, ror#8; \
+	eor yl, IR; \
+	eor yr, IR;
+
+#define enc_rounds(n) \
+	roundsm(XL, XR, ((n) + 2) * 2 + 0, ((n) + 2) * 2 + 1, YL, YR); \
+	roundsm(YL, YR, ((n) + 3) * 2 + 0, ((n) + 3) * 2 + 1, XL, XR); \
+	roundsm(XL, XR, ((n) + 4) * 2 + 0, ((n) + 4) * 2 + 1, YL, YR); \
+	roundsm(YL, YR, ((n) + 5) * 2 + 0, ((n) + 5) * 2 + 1, XL, XR); \
+	roundsm(XL, XR, ((n) + 6) * 2 + 0, ((n) + 6) * 2 + 1, YL, YR); \
+	roundsm(YL, YR, ((n) + 7) * 2 + 0, ((n) + 7) * 2 + 1, XL, XR);
+
+#define dec_rounds(n) \
+	roundsm(XL, XR, ((n) + 7) * 2 + 0, ((n) + 7) * 2 + 1, YL, YR); \
+	roundsm(YL, YR, ((n) + 6) * 2 + 0, ((n) + 6) * 2 + 1, XL, XR); \
+	roundsm(XL, XR, ((n) + 5) * 2 + 0, ((n) + 5) * 2 + 1, YL, YR); \
+	roundsm(YL, YR, ((n) + 4) * 2 + 0, ((n) + 4) * 2 + 1, XL, XR); \
+	roundsm(XL, XR, ((n) + 3) * 2 + 0, ((n) + 3) * 2 + 1, YL, YR); \
+	roundsm(YL, YR, ((n) + 2) * 2 + 0, ((n) + 2) * 2 + 1, XL, XR);
+
+/* perform FL and FL⁻¹ */
+#define fls(ll, lr, rl, rr, kll, klr, krl, krr) \
+	ldr RT0, [CTX, #(key_table + ((kll) * 4))]; \
+	ldr RT2, [CTX, #(key_table + ((krr) * 4))]; \
+	and RT0, ll; \
+	ldr RT3, [CTX, #(key_table + ((krl) * 4))]; \
+	orr RT2, rr; \
+	ldr RT1, [CTX, #(key_table + ((klr) * 4))]; \
+	eor rl, RT2; \
+	eor lr, lr, RT0, ror#31; \
+	and RT3, rl; \
+	orr RT1, lr; \
+	eor ll, RT1; \
+	eor rr, rr, RT3, ror#31;
+
+#define enc_fls(n) \
+	fls(XL, XR, YL, YR, \
+	    (n) * 2 + 0, (n) * 2 + 1, \
+	    (n) * 2 + 2, (n) * 2 + 3);
+
+#define dec_fls(n) \
+	fls(XL, XR, YL, YR, \
+	    (n) * 2 + 2, (n) * 2 + 3, \
+	    (n) * 2 + 0, (n) * 2 + 1);
+
+#define inpack(n) \
+	ldr_input_be(%r2, XL, XR, YL, YR, RT0); \
+	ldr RT0, [CTX, #(key_table + ((n) * 8) + 0)]; \
+	ldr RT1, [CTX, #(key_table + ((n) * 8) + 4)]; \
+	eor XL, RT0; \
+	eor XR, RT1;
+
+#define outunpack(n) \
+	ldr RT0, [CTX, #(key_table + ((n) * 8) + 0)]; \
+	ldr RT1, [CTX, #(key_table + ((n) * 8) + 4)]; \
+	eor YL, RT0; \
+	eor YR, RT1; \
+	str_output_be(%r1, YL, YR, XL, XR, RT0, RT1);
+
+.align 3
+.global _gcry_camellia_arm_encrypt_block
+.type   _gcry_camellia_arm_encrypt_block,%function;
+
+_gcry_camellia_arm_encrypt_block:
+	/* input:
+	 *	%r0: keytable
+	 *	%r1: dst
+	 *	%r2: src
+	 *	%r3: keybitlen
+	 */
+	push {%r1, %r4-%r11, %ip, %lr};
+
+	ldr RTAB1, =.Lcamellia_sp1110;
+	mov RMASK, #0xff;
+	add RTAB3, RTAB1, #(2 * 4);
+	push {%r3};
+	mov RMASK, RMASK, lsl#4; /* byte mask */
+
+	inpack(0);
+
+	enc_rounds(0);
+	enc_fls(8);
+	enc_rounds(8);
+	enc_fls(16);
+	enc_rounds(16);
+
+	pop {RT0};
+	cmp RT0, #(16 * 8);
+	bne .Lenc_256;
+
+	pop {%r1};
+	outunpack(24);
+
+	pop {%r4-%r11, %ip, %pc};
+.ltorg
+
+.Lenc_256:
+	enc_fls(24);
+	enc_rounds(24);
+
+	pop {%r1};
+	outunpack(32);
+
+	pop {%r4-%r11, %ip, %pc};
+.ltorg
+.size _gcry_camellia_arm_encrypt_block,.-_gcry_camellia_arm_encrypt_block;
+
+.align 3
+.global _gcry_camellia_arm_decrypt_block
+.type   _gcry_camellia_arm_decrypt_block,%function;
+
+_gcry_camellia_arm_decrypt_block:
+	/* input:
+	 *	%r0: keytable
+	 *	%r1: dst
+	 *	%r2: src
+	 *	%r3: keybitlen
+	 */
+	push {%r1, %r4-%r11, %ip, %lr};
+
+	ldr RTAB1, =.Lcamellia_sp1110;
+	mov RMASK, #0xff;
+	add RTAB3, RTAB1, #(2 * 4);
+	mov RMASK, RMASK, lsl#4; /* byte mask */
+
+	cmp %r3, #(16 * 8);
+	bne .Ldec_256;
+
+	inpack(24);
+
+.Ldec_128:
+	dec_rounds(16);
+	dec_fls(16);
+	dec_rounds(8);
+	dec_fls(8);
+	dec_rounds(0);
+
+	pop {%r1};
+	outunpack(0);
+
+	pop {%r4-%r11, %ip, %pc};
+.ltorg
+
+.Ldec_256:
+	inpack(32);
+	dec_rounds(24);
+	dec_fls(24);
+
+	b .Ldec_128;
+.ltorg
+.size _gcry_camellia_arm_decrypt_block,.-_gcry_camellia_arm_decrypt_block;
+
+.data
+
+/* Encryption/Decryption tables */
+.align 5
+.Lcamellia_sp1110:
+.long 0x70707000
+.Lcamellia_sp0222:
+            .long 0x00e0e0e0
+.Lcamellia_sp3033:
+                        .long 0x38003838
+.Lcamellia_sp4404:
+                                    .long 0x70700070
+.long 0x82828200, 0x00050505, 0x41004141, 0x2c2c002c
+.long 0x2c2c2c00, 0x00585858, 0x16001616, 0xb3b300b3
+.long 0xececec00, 0x00d9d9d9, 0x76007676, 0xc0c000c0
+.long 0xb3b3b300, 0x00676767, 0xd900d9d9, 0xe4e400e4
+.long 0x27272700, 0x004e4e4e, 0x93009393, 0x57570057
+.long 0xc0c0c000, 0x00818181, 0x60006060, 0xeaea00ea
+.long 0xe5e5e500, 0x00cbcbcb, 0xf200f2f2, 0xaeae00ae
+.long 0xe4e4e400, 0x00c9c9c9, 0x72007272, 0x23230023
+.long 0x85858500, 0x000b0b0b, 0xc200c2c2, 0x6b6b006b
+.long 0x57575700, 0x00aeaeae, 0xab00abab, 0x45450045
+.long 0x35353500, 0x006a6a6a, 0x9a009a9a, 0xa5a500a5
+.long 0xeaeaea00, 0x00d5d5d5, 0x75007575, 0xeded00ed
+.long 0x0c0c0c00, 0x00181818, 0x06000606, 0x4f4f004f
+.long 0xaeaeae00, 0x005d5d5d, 0x57005757, 0x1d1d001d
+.long 0x41414100, 0x00828282, 0xa000a0a0, 0x92920092
+.long 0x23232300, 0x00464646, 0x91009191, 0x86860086
+.long 0xefefef00, 0x00dfdfdf, 0xf700f7f7, 0xafaf00af
+.long 0x6b6b6b00, 0x00d6d6d6, 0xb500b5b5, 0x7c7c007c
+.long 0x93939300, 0x00272727, 0xc900c9c9, 0x1f1f001f
+.long 0x45454500, 0x008a8a8a, 0xa200a2a2, 0x3e3e003e
+.long 0x19191900, 0x00323232, 0x8c008c8c, 0xdcdc00dc
+.long 0xa5a5a500, 0x004b4b4b, 0xd200d2d2, 0x5e5e005e
+.long 0x21212100, 0x00424242, 0x90009090, 0x0b0b000b
+.long 0xededed00, 0x00dbdbdb, 0xf600f6f6, 0xa6a600a6
+.long 0x0e0e0e00, 0x001c1c1c, 0x07000707, 0x39390039
+.long 0x4f4f4f00, 0x009e9e9e, 0xa700a7a7, 0xd5d500d5
+.long 0x4e4e4e00, 0x009c9c9c, 0x27002727, 0x5d5d005d
+.long 0x1d1d1d00, 0x003a3a3a, 0x8e008e8e, 0xd9d900d9
+.long 0x65656500, 0x00cacaca, 0xb200b2b2, 0x5a5a005a
+.long 0x92929200, 0x00252525, 0x49004949, 0x51510051
+.long 0xbdbdbd00, 0x007b7b7b, 0xde00dede, 0x6c6c006c
+.long 0x86868600, 0x000d0d0d, 0x43004343, 0x8b8b008b
+.long 0xb8b8b800, 0x00717171, 0x5c005c5c, 0x9a9a009a
+.long 0xafafaf00, 0x005f5f5f, 0xd700d7d7, 0xfbfb00fb
+.long 0x8f8f8f00, 0x001f1f1f, 0xc700c7c7, 0xb0b000b0
+.long 0x7c7c7c00, 0x00f8f8f8, 0x3e003e3e, 0x74740074
+.long 0xebebeb00, 0x00d7d7d7, 0xf500f5f5, 0x2b2b002b
+.long 0x1f1f1f00, 0x003e3e3e, 0x8f008f8f, 0xf0f000f0
+.long 0xcecece00, 0x009d9d9d, 0x67006767, 0x84840084
+.long 0x3e3e3e00, 0x007c7c7c, 0x1f001f1f, 0xdfdf00df
+.long 0x30303000, 0x00606060, 0x18001818, 0xcbcb00cb
+.long 0xdcdcdc00, 0x00b9b9b9, 0x6e006e6e, 0x34340034
+.long 0x5f5f5f00, 0x00bebebe, 0xaf00afaf, 0x76760076
+.long 0x5e5e5e00, 0x00bcbcbc, 0x2f002f2f, 0x6d6d006d
+.long 0xc5c5c500, 0x008b8b8b, 0xe200e2e2, 0xa9a900a9
+.long 0x0b0b0b00, 0x00161616, 0x85008585, 0xd1d100d1
+.long 0x1a1a1a00, 0x00343434, 0x0d000d0d, 0x04040004
+.long 0xa6a6a600, 0x004d4d4d, 0x53005353, 0x14140014
+.long 0xe1e1e100, 0x00c3c3c3, 0xf000f0f0, 0x3a3a003a
+.long 0x39393900, 0x00727272, 0x9c009c9c, 0xdede00de
+.long 0xcacaca00, 0x00959595, 0x65006565, 0x11110011
+.long 0xd5d5d500, 0x00ababab, 0xea00eaea, 0x32320032
+.long 0x47474700, 0x008e8e8e, 0xa300a3a3, 0x9c9c009c
+.long 0x5d5d5d00, 0x00bababa, 0xae00aeae, 0x53530053
+.long 0x3d3d3d00, 0x007a7a7a, 0x9e009e9e, 0xf2f200f2
+.long 0xd9d9d900, 0x00b3b3b3, 0xec00ecec, 0xfefe00fe
+.long 0x01010100, 0x00020202, 0x80008080, 0xcfcf00cf
+.long 0x5a5a5a00, 0x00b4b4b4, 0x2d002d2d, 0xc3c300c3
+.long 0xd6d6d600, 0x00adadad, 0x6b006b6b, 0x7a7a007a
+.long 0x51515100, 0x00a2a2a2, 0xa800a8a8, 0x24240024
+.long 0x56565600, 0x00acacac, 0x2b002b2b, 0xe8e800e8
+.long 0x6c6c6c00, 0x00d8d8d8, 0x36003636, 0x60600060
+.long 0x4d4d4d00, 0x009a9a9a, 0xa600a6a6, 0x69690069
+.long 0x8b8b8b00, 0x00171717, 0xc500c5c5, 0xaaaa00aa
+.long 0x0d0d0d00, 0x001a1a1a, 0x86008686, 0xa0a000a0
+.long 0x9a9a9a00, 0x00353535, 0x4d004d4d, 0xa1a100a1
+.long 0x66666600, 0x00cccccc, 0x33003333, 0x62620062
+.long 0xfbfbfb00, 0x00f7f7f7, 0xfd00fdfd, 0x54540054
+.long 0xcccccc00, 0x00999999, 0x66006666, 0x1e1e001e
+.long 0xb0b0b000, 0x00616161, 0x58005858, 0xe0e000e0
+.long 0x2d2d2d00, 0x005a5a5a, 0x96009696, 0x64640064
+.long 0x74747400, 0x00e8e8e8, 0x3a003a3a, 0x10100010
+.long 0x12121200, 0x00242424, 0x09000909, 0x00000000
+.long 0x2b2b2b00, 0x00565656, 0x95009595, 0xa3a300a3
+.long 0x20202000, 0x00404040, 0x10001010, 0x75750075
+.long 0xf0f0f000, 0x00e1e1e1, 0x78007878, 0x8a8a008a
+.long 0xb1b1b100, 0x00636363, 0xd800d8d8, 0xe6e600e6
+.long 0x84848400, 0x00090909, 0x42004242, 0x09090009
+.long 0x99999900, 0x00333333, 0xcc00cccc, 0xdddd00dd
+.long 0xdfdfdf00, 0x00bfbfbf, 0xef00efef, 0x87870087
+.long 0x4c4c4c00, 0x00989898, 0x26002626, 0x83830083
+.long 0xcbcbcb00, 0x00979797, 0xe500e5e5, 0xcdcd00cd
+.long 0xc2c2c200, 0x00858585, 0x61006161, 0x90900090
+.long 0x34343400, 0x00686868, 0x1a001a1a, 0x73730073
+.long 0x7e7e7e00, 0x00fcfcfc, 0x3f003f3f, 0xf6f600f6
+.long 0x76767600, 0x00ececec, 0x3b003b3b, 0x9d9d009d
+.long 0x05050500, 0x000a0a0a, 0x82008282, 0xbfbf00bf
+.long 0x6d6d6d00, 0x00dadada, 0xb600b6b6, 0x52520052
+.long 0xb7b7b700, 0x006f6f6f, 0xdb00dbdb, 0xd8d800d8
+.long 0xa9a9a900, 0x00535353, 0xd400d4d4, 0xc8c800c8
+.long 0x31313100, 0x00626262, 0x98009898, 0xc6c600c6
+.long 0xd1d1d100, 0x00a3a3a3, 0xe800e8e8, 0x81810081
+.long 0x17171700, 0x002e2e2e, 0x8b008b8b, 0x6f6f006f
+.long 0x04040400, 0x00080808, 0x02000202, 0x13130013
+.long 0xd7d7d700, 0x00afafaf, 0xeb00ebeb, 0x63630063
+.long 0x14141400, 0x00282828, 0x0a000a0a, 0xe9e900e9
+.long 0x58585800, 0x00b0b0b0, 0x2c002c2c, 0xa7a700a7
+.long 0x3a3a3a00, 0x00747474, 0x1d001d1d, 0x9f9f009f
+.long 0x61616100, 0x00c2c2c2, 0xb000b0b0, 0xbcbc00bc
+.long 0xdedede00, 0x00bdbdbd, 0x6f006f6f, 0x29290029
+.long 0x1b1b1b00, 0x00363636, 0x8d008d8d, 0xf9f900f9
+.long 0x11111100, 0x00222222, 0x88008888, 0x2f2f002f
+.long 0x1c1c1c00, 0x00383838, 0x0e000e0e, 0xb4b400b4
+.long 0x32323200, 0x00646464, 0x19001919, 0x78780078
+.long 0x0f0f0f00, 0x001e1e1e, 0x87008787, 0x06060006
+.long 0x9c9c9c00, 0x00393939, 0x4e004e4e, 0xe7e700e7
+.long 0x16161600, 0x002c2c2c, 0x0b000b0b, 0x71710071
+.long 0x53535300, 0x00a6a6a6, 0xa900a9a9, 0xd4d400d4
+.long 0x18181800, 0x00303030, 0x0c000c0c, 0xabab00ab
+.long 0xf2f2f200, 0x00e5e5e5, 0x79007979, 0x88880088
+.long 0x22222200, 0x00444444, 0x11001111, 0x8d8d008d
+.long 0xfefefe00, 0x00fdfdfd, 0x7f007f7f, 0x72720072
+.long 0x44444400, 0x00888888, 0x22002222, 0xb9b900b9
+.long 0xcfcfcf00, 0x009f9f9f, 0xe700e7e7, 0xf8f800f8
+.long 0xb2b2b200, 0x00656565, 0x59005959, 0xacac00ac
+.long 0xc3c3c300, 0x00878787, 0xe100e1e1, 0x36360036
+.long 0xb5b5b500, 0x006b6b6b, 0xda00dada, 0x2a2a002a
+.long 0x7a7a7a00, 0x00f4f4f4, 0x3d003d3d, 0x3c3c003c
+.long 0x91919100, 0x00232323, 0xc800c8c8, 0xf1f100f1
+.long 0x24242400, 0x00484848, 0x12001212, 0x40400040
+.long 0x08080800, 0x00101010, 0x04000404, 0xd3d300d3
+.long 0xe8e8e800, 0x00d1d1d1, 0x74007474, 0xbbbb00bb
+.long 0xa8a8a800, 0x00515151, 0x54005454, 0x43430043
+.long 0x60606000, 0x00c0c0c0, 0x30003030, 0x15150015
+.long 0xfcfcfc00, 0x00f9f9f9, 0x7e007e7e, 0xadad00ad
+.long 0x69696900, 0x00d2d2d2, 0xb400b4b4, 0x77770077
+.long 0x50505000, 0x00a0a0a0, 0x28002828, 0x80800080
+.long 0xaaaaaa00, 0x00555555, 0x55005555, 0x82820082
+.long 0xd0d0d000, 0x00a1a1a1, 0x68006868, 0xecec00ec
+.long 0xa0a0a000, 0x00414141, 0x50005050, 0x27270027
+.long 0x7d7d7d00, 0x00fafafa, 0xbe00bebe, 0xe5e500e5
+.long 0xa1a1a100, 0x00434343, 0xd000d0d0, 0x85850085
+.long 0x89898900, 0x00131313, 0xc400c4c4, 0x35350035
+.long 0x62626200, 0x00c4c4c4, 0x31003131, 0x0c0c000c
+.long 0x97979700, 0x002f2f2f, 0xcb00cbcb, 0x41410041
+.long 0x54545400, 0x00a8a8a8, 0x2a002a2a, 0xefef00ef
+.long 0x5b5b5b00, 0x00b6b6b6, 0xad00adad, 0x93930093
+.long 0x1e1e1e00, 0x003c3c3c, 0x0f000f0f, 0x19190019
+.long 0x95959500, 0x002b2b2b, 0xca00caca, 0x21210021
+.long 0xe0e0e000, 0x00c1c1c1, 0x70007070, 0x0e0e000e
+.long 0xffffff00, 0x00ffffff, 0xff00ffff, 0x4e4e004e
+.long 0x64646400, 0x00c8c8c8, 0x32003232, 0x65650065
+.long 0xd2d2d200, 0x00a5a5a5, 0x69006969, 0xbdbd00bd
+.long 0x10101000, 0x00202020, 0x08000808, 0xb8b800b8
+.long 0xc4c4c400, 0x00898989, 0x62006262, 0x8f8f008f
+.long 0x00000000, 0x00000000, 0x00000000, 0xebeb00eb
+.long 0x48484800, 0x00909090, 0x24002424, 0xcece00ce
+.long 0xa3a3a300, 0x00474747, 0xd100d1d1, 0x30300030
+.long 0xf7f7f700, 0x00efefef, 0xfb00fbfb, 0x5f5f005f
+.long 0x75757500, 0x00eaeaea, 0xba00baba, 0xc5c500c5
+.long 0xdbdbdb00, 0x00b7b7b7, 0xed00eded, 0x1a1a001a
+.long 0x8a8a8a00, 0x00151515, 0x45004545, 0xe1e100e1
+.long 0x03030300, 0x00060606, 0x81008181, 0xcaca00ca
+.long 0xe6e6e600, 0x00cdcdcd, 0x73007373, 0x47470047
+.long 0xdadada00, 0x00b5b5b5, 0x6d006d6d, 0x3d3d003d
+.long 0x09090900, 0x00121212, 0x84008484, 0x01010001
+.long 0x3f3f3f00, 0x007e7e7e, 0x9f009f9f, 0xd6d600d6
+.long 0xdddddd00, 0x00bbbbbb, 0xee00eeee, 0x56560056
+.long 0x94949400, 0x00292929, 0x4a004a4a, 0x4d4d004d
+.long 0x87878700, 0x000f0f0f, 0xc300c3c3, 0x0d0d000d
+.long 0x5c5c5c00, 0x00b8b8b8, 0x2e002e2e, 0x66660066
+.long 0x83838300, 0x00070707, 0xc100c1c1, 0xcccc00cc
+.long 0x02020200, 0x00040404, 0x01000101, 0x2d2d002d
+.long 0xcdcdcd00, 0x009b9b9b, 0xe600e6e6, 0x12120012
+.long 0x4a4a4a00, 0x00949494, 0x25002525, 0x20200020
+.long 0x90909000, 0x00212121, 0x48004848, 0xb1b100b1
+.long 0x33333300, 0x00666666, 0x99009999, 0x99990099
+.long 0x73737300, 0x00e6e6e6, 0xb900b9b9, 0x4c4c004c
+.long 0x67676700, 0x00cecece, 0xb300b3b3, 0xc2c200c2
+.long 0xf6f6f600, 0x00ededed, 0x7b007b7b, 0x7e7e007e
+.long 0xf3f3f300, 0x00e7e7e7, 0xf900f9f9, 0x05050005
+.long 0x9d9d9d00, 0x003b3b3b, 0xce00cece, 0xb7b700b7
+.long 0x7f7f7f00, 0x00fefefe, 0xbf00bfbf, 0x31310031
+.long 0xbfbfbf00, 0x007f7f7f, 0xdf00dfdf, 0x17170017
+.long 0xe2e2e200, 0x00c5c5c5, 0x71007171, 0xd7d700d7
+.long 0x52525200, 0x00a4a4a4, 0x29002929, 0x58580058
+.long 0x9b9b9b00, 0x00373737, 0xcd00cdcd, 0x61610061
+.long 0xd8d8d800, 0x00b1b1b1, 0x6c006c6c, 0x1b1b001b
+.long 0x26262600, 0x004c4c4c, 0x13001313, 0x1c1c001c
+.long 0xc8c8c800, 0x00919191, 0x64006464, 0x0f0f000f
+.long 0x37373700, 0x006e6e6e, 0x9b009b9b, 0x16160016
+.long 0xc6c6c600, 0x008d8d8d, 0x63006363, 0x18180018
+.long 0x3b3b3b00, 0x00767676, 0x9d009d9d, 0x22220022
+.long 0x81818100, 0x00030303, 0xc000c0c0, 0x44440044
+.long 0x96969600, 0x002d2d2d, 0x4b004b4b, 0xb2b200b2
+.long 0x6f6f6f00, 0x00dedede, 0xb700b7b7, 0xb5b500b5
+.long 0x4b4b4b00, 0x00969696, 0xa500a5a5, 0x91910091
+.long 0x13131300, 0x00262626, 0x89008989, 0x08080008
+.long 0xbebebe00, 0x007d7d7d, 0x5f005f5f, 0xa8a800a8
+.long 0x63636300, 0x00c6c6c6, 0xb100b1b1, 0xfcfc00fc
+.long 0x2e2e2e00, 0x005c5c5c, 0x17001717, 0x50500050
+.long 0xe9e9e900, 0x00d3d3d3, 0xf400f4f4, 0xd0d000d0
+.long 0x79797900, 0x00f2f2f2, 0xbc00bcbc, 0x7d7d007d
+.long 0xa7a7a700, 0x004f4f4f, 0xd300d3d3, 0x89890089
+.long 0x8c8c8c00, 0x00191919, 0x46004646, 0x97970097
+.long 0x9f9f9f00, 0x003f3f3f, 0xcf00cfcf, 0x5b5b005b
+.long 0x6e6e6e00, 0x00dcdcdc, 0x37003737, 0x95950095
+.long 0xbcbcbc00, 0x00797979, 0x5e005e5e, 0xffff00ff
+.long 0x8e8e8e00, 0x001d1d1d, 0x47004747, 0xd2d200d2
+.long 0x29292900, 0x00525252, 0x94009494, 0xc4c400c4
+.long 0xf5f5f500, 0x00ebebeb, 0xfa00fafa, 0x48480048
+.long 0xf9f9f900, 0x00f3f3f3, 0xfc00fcfc, 0xf7f700f7
+.long 0xb6b6b600, 0x006d6d6d, 0x5b005b5b, 0xdbdb00db
+.long 0x2f2f2f00, 0x005e5e5e, 0x97009797, 0x03030003
+.long 0xfdfdfd00, 0x00fbfbfb, 0xfe00fefe, 0xdada00da
+.long 0xb4b4b400, 0x00696969, 0x5a005a5a, 0x3f3f003f
+.long 0x59595900, 0x00b2b2b2, 0xac00acac, 0x94940094
+.long 0x78787800, 0x00f0f0f0, 0x3c003c3c, 0x5c5c005c
+.long 0x98989800, 0x00313131, 0x4c004c4c, 0x02020002
+.long 0x06060600, 0x000c0c0c, 0x03000303, 0x4a4a004a
+.long 0x6a6a6a00, 0x00d4d4d4, 0x35003535, 0x33330033
+.long 0xe7e7e700, 0x00cfcfcf, 0xf300f3f3, 0x67670067
+.long 0x46464600, 0x008c8c8c, 0x23002323, 0xf3f300f3
+.long 0x71717100, 0x00e2e2e2, 0xb800b8b8, 0x7f7f007f
+.long 0xbababa00, 0x00757575, 0x5d005d5d, 0xe2e200e2
+.long 0xd4d4d400, 0x00a9a9a9, 0x6a006a6a, 0x9b9b009b
+.long 0x25252500, 0x004a4a4a, 0x92009292, 0x26260026
+.long 0xababab00, 0x00575757, 0xd500d5d5, 0x37370037
+.long 0x42424200, 0x00848484, 0x21002121, 0x3b3b003b
+.long 0x88888800, 0x00111111, 0x44004444, 0x96960096
+.long 0xa2a2a200, 0x00454545, 0x51005151, 0x4b4b004b
+.long 0x8d8d8d00, 0x001b1b1b, 0xc600c6c6, 0xbebe00be
+.long 0xfafafa00, 0x00f5f5f5, 0x7d007d7d, 0x2e2e002e
+.long 0x72727200, 0x00e4e4e4, 0x39003939, 0x79790079
+.long 0x07070700, 0x000e0e0e, 0x83008383, 0x8c8c008c
+.long 0xb9b9b900, 0x00737373, 0xdc00dcdc, 0x6e6e006e
+.long 0x55555500, 0x00aaaaaa, 0xaa00aaaa, 0x8e8e008e
+.long 0xf8f8f800, 0x00f1f1f1, 0x7c007c7c, 0xf5f500f5
+.long 0xeeeeee00, 0x00dddddd, 0x77007777, 0xb6b600b6
+.long 0xacacac00, 0x00595959, 0x56005656, 0xfdfd00fd
+.long 0x0a0a0a00, 0x00141414, 0x05000505, 0x59590059
+.long 0x36363600, 0x006c6c6c, 0x1b001b1b, 0x98980098
+.long 0x49494900, 0x00929292, 0xa400a4a4, 0x6a6a006a
+.long 0x2a2a2a00, 0x00545454, 0x15001515, 0x46460046
+.long 0x68686800, 0x00d0d0d0, 0x34003434, 0xbaba00ba
+.long 0x3c3c3c00, 0x00787878, 0x1e001e1e, 0x25250025
+.long 0x38383800, 0x00707070, 0x1c001c1c, 0x42420042
+.long 0xf1f1f100, 0x00e3e3e3, 0xf800f8f8, 0xa2a200a2
+.long 0xa4a4a400, 0x00494949, 0x52005252, 0xfafa00fa
+.long 0x40404000, 0x00808080, 0x20002020, 0x07070007
+.long 0x28282800, 0x00505050, 0x14001414, 0x55550055
+.long 0xd3d3d300, 0x00a7a7a7, 0xe900e9e9, 0xeeee00ee
+.long 0x7b7b7b00, 0x00f6f6f6, 0xbd00bdbd, 0x0a0a000a
+.long 0xbbbbbb00, 0x00777777, 0xdd00dddd, 0x49490049
+.long 0xc9c9c900, 0x00939393, 0xe400e4e4, 0x68680068
+.long 0x43434300, 0x00868686, 0xa100a1a1, 0x38380038
+.long 0xc1c1c100, 0x00838383, 0xe000e0e0, 0xa4a400a4
+.long 0x15151500, 0x002a2a2a, 0x8a008a8a, 0x28280028
+.long 0xe3e3e300, 0x00c7c7c7, 0xf100f1f1, 0x7b7b007b
+.long 0xadadad00, 0x005b5b5b, 0xd600d6d6, 0xc9c900c9
+.long 0xf4f4f400, 0x00e9e9e9, 0x7a007a7a, 0xc1c100c1
+.long 0x77777700, 0x00eeeeee, 0xbb00bbbb, 0xe3e300e3
+.long 0xc7c7c700, 0x008f8f8f, 0xe300e3e3, 0xf4f400f4
+.long 0x80808000, 0x00010101, 0x40004040, 0xc7c700c7
+.long 0x9e9e9e00, 0x003d3d3d, 0x4f004f4f, 0x9e9e009e
+
+#endif /*HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS*/
+#endif /*__ARMEL__*/
diff --git a/cipher/camellia-armv6.S b/cipher/camellia-armv6.S
deleted file mode 100644
index 3544754..0000000
--- a/cipher/camellia-armv6.S
+++ /dev/null
@@ -1,604 +0,0 @@
-/* camellia-armv6.S  -  ARM assembly implementation of Camellia cipher
- *
- * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna at iki.fi>
- *
- * This file is part of Libgcrypt.
- *
- * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as
- * published by the Free Software Foundation; either version 2.1 of
- * the License, or (at your option) any later version.
- *
- * Libgcrypt is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <config.h>
-
-#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__)
-#ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS
-
-.text
-
-.syntax unified
-.arm
-
-#define CAMELLIA_TABLE_BYTE_LEN 272
-
-/* struct camellia_ctx: */
-#define key_table 0
-#define key_length CAMELLIA_TABLE_BYTE_LEN
-
-/* register macros */
-#define CTX %r0
-#define RTAB1 %ip
-#define RTAB3 %r1
-#define RMASK %lr
-
-#define IL %r2
-#define IR %r3
-
-#define XL %r4
-#define XR %r5
-#define YL %r6
-#define YR %r7
-
-#define RT0 %r8
-#define RT1 %r9
-#define RT2 %r10
-#define RT3 %r11
-
-/* helper macros */
-#define ldr_unaligned_be(rout, rsrc, offs, rtmp) \
-	ldrb rout, [rsrc, #((offs) + 3)]; \
-	ldrb rtmp, [rsrc, #((offs) + 2)]; \
-	orr rout, rout, rtmp, lsl #8; \
-	ldrb rtmp, [rsrc, #((offs) + 1)]; \
-	orr rout, rout, rtmp, lsl #16; \
-	ldrb rtmp, [rsrc, #((offs) + 0)]; \
-	orr rout, rout, rtmp, lsl #24;
-
-#define str_unaligned_be(rin, rdst, offs, rtmp0, rtmp1) \
-	mov rtmp0, rin, lsr #8; \
-	strb rin, [rdst, #((offs) + 3)]; \
-	mov rtmp1, rin, lsr #16; \
-	strb rtmp0, [rdst, #((offs) + 2)]; \
-	mov rtmp0, rin, lsr #24; \
-	strb rtmp1, [rdst, #((offs) + 1)]; \
-	strb rtmp0, [rdst, #((offs) + 0)];
-
-#ifdef __ARMEL__
-	/* bswap on little-endian */
-	#define host_to_be(reg) \
-		rev reg, reg;
-	#define be_to_host(reg) \
-		rev reg, reg;
-#else
-	/* nop on big-endian */
-	#define host_to_be(reg) /*_*/
-	#define be_to_host(reg) /*_*/
-#endif
-
-#define ldr_input_aligned_be(rin, a, b, c, d) \
-	ldr a, [rin, #0]; \
-	ldr b, [rin, #4]; \
-	be_to_host(a); \
-	ldr c, [rin, #8]; \
-	be_to_host(b); \
-	ldr d, [rin, #12]; \
-	be_to_host(c); \
-	be_to_host(d);
-
-#define str_output_aligned_be(rout, a, b, c, d) \
-	be_to_host(a); \
-	be_to_host(b); \
-	str a, [rout, #0]; \
-	be_to_host(c); \
-	str b, [rout, #4]; \
-	be_to_host(d); \
-	str c, [rout, #8]; \
-	str d, [rout, #12];
-
-#ifdef __ARM_FEATURE_UNALIGNED
-	/* unaligned word reads/writes allowed */
-	#define ldr_input_be(rin, ra, rb, rc, rd, rtmp) \
-		ldr_input_aligned_be(rin, ra, rb, rc, rd)
-
-	#define str_output_be(rout, ra, rb, rc, rd, rtmp0, rtmp1) \
-		str_output_aligned_be(rout, ra, rb, rc, rd)
-#else
-	/* need to handle unaligned reads/writes by byte reads */
-	#define ldr_input_be(rin, ra, rb, rc, rd, rtmp0) \
-		tst rin, #3; \
-		beq 1f; \
-			ldr_unaligned_be(ra, rin, 0, rtmp0); \
-			ldr_unaligned_be(rb, rin, 4, rtmp0); \
-			ldr_unaligned_be(rc, rin, 8, rtmp0); \
-			ldr_unaligned_be(rd, rin, 12, rtmp0); \
-			b 2f; \
-		1:;\
-			ldr_input_aligned_be(rin, ra, rb, rc, rd); \
-		2:;
-
-	#define str_output_be(rout, ra, rb, rc, rd, rtmp0, rtmp1) \
-		tst rout, #3; \
-		beq 1f; \
-			str_unaligned_be(ra, rout, 0, rtmp0, rtmp1); \
-			str_unaligned_be(rb, rout, 4, rtmp0, rtmp1); \
-			str_unaligned_be(rc, rout, 8, rtmp0, rtmp1); \
-			str_unaligned_be(rd, rout, 12, rtmp0, rtmp1); \
-			b 2f; \
-		1:;\
-			str_output_aligned_be(rout, ra, rb, rc, rd); \
-		2:;
-#endif
-
-/**********************************************************************
-  1-way camellia
- **********************************************************************/
-#define roundsm(xl, xr, kl, kr, yl, yr) \
-	ldr RT2, [CTX, #(key_table + ((kl) * 4))]; \
-	and  IR, RMASK, xr, lsl#(4);      /*sp1110*/ \
-	ldr RT3, [CTX, #(key_table + ((kr) * 4))]; \
-	and  IL, RMASK, xl, lsr#(24 - 4); /*sp1110*/ \
-	and RT0, RMASK, xr, lsr#(16 - 4); /*sp3033*/ \
-	ldr  IR, [RTAB1,  IR]; \
-	and RT1, RMASK, xl, lsr#(8 - 4);  /*sp3033*/ \
-	eor yl, RT2; \
-	ldr  IL, [RTAB1,  IL]; \
-	eor yr, RT3; \
-	\
-	ldr RT0, [RTAB3, RT0]; \
-	add RTAB1, #4; \
-	ldr RT1, [RTAB3, RT1]; \
-	add RTAB3, #4; \
-	\
-	and RT2, RMASK, xr, lsr#(24 - 4); /*sp0222*/ \
-	and RT3, RMASK, xl, lsr#(16 - 4); /*sp0222*/ \
-	\
-	eor IR, RT0; \
-	eor IL, RT1; \
-	\
-	ldr RT2, [RTAB1, RT2]; \
-	and RT0, RMASK, xr, lsr#(8 - 4);  /*sp4404*/ \
-	ldr RT3, [RTAB1, RT3]; \
-	and RT1, RMASK, xl, lsl#(4);      /*sp4404*/ \
-	\
-	ldr RT0, [RTAB3, RT0]; \
-	sub RTAB1, #4; \
-	ldr RT1, [RTAB3, RT1]; \
-	sub RTAB3, #4; \
-	\
-	eor IR, RT2; \
-	eor IL, RT3; \
-	eor IR, RT0; \
-	eor IL, RT1; \
-	\
-	eor IR, IL; \
-	eor yr, yr, IL, ror#8; \
-	eor yl, IR; \
-	eor yr, IR;
-
-#define enc_rounds(n) \
-	roundsm(XL, XR, ((n) + 2) * 2 + 0, ((n) + 2) * 2 + 1, YL, YR); \
-	roundsm(YL, YR, ((n) + 3) * 2 + 0, ((n) + 3) * 2 + 1, XL, XR); \
-	roundsm(XL, XR, ((n) + 4) * 2 + 0, ((n) + 4) * 2 + 1, YL, YR); \
-	roundsm(YL, YR, ((n) + 5) * 2 + 0, ((n) + 5) * 2 + 1, XL, XR); \
-	roundsm(XL, XR, ((n) + 6) * 2 + 0, ((n) + 6) * 2 + 1, YL, YR); \
-	roundsm(YL, YR, ((n) + 7) * 2 + 0, ((n) + 7) * 2 + 1, XL, XR);
-
-#define dec_rounds(n) \
-	roundsm(XL, XR, ((n) + 7) * 2 + 0, ((n) + 7) * 2 + 1, YL, YR); \
-	roundsm(YL, YR, ((n) + 6) * 2 + 0, ((n) + 6) * 2 + 1, XL, XR); \
-	roundsm(XL, XR, ((n) + 5) * 2 + 0, ((n) + 5) * 2 + 1, YL, YR); \
-	roundsm(YL, YR, ((n) + 4) * 2 + 0, ((n) + 4) * 2 + 1, XL, XR); \
-	roundsm(XL, XR, ((n) + 3) * 2 + 0, ((n) + 3) * 2 + 1, YL, YR); \
-	roundsm(YL, YR, ((n) + 2) * 2 + 0, ((n) + 2) * 2 + 1, XL, XR);
-
-/* perform FL and FL⁻¹ */
-#define fls(ll, lr, rl, rr, kll, klr, krl, krr) \
-	ldr RT0, [CTX, #(key_table + ((kll) * 4))]; \
-	ldr RT2, [CTX, #(key_table + ((krr) * 4))]; \
-	and RT0, ll; \
-	ldr RT3, [CTX, #(key_table + ((krl) * 4))]; \
-	orr RT2, rr; \
-	ldr RT1, [CTX, #(key_table + ((klr) * 4))]; \
-	eor rl, RT2; \
-	eor lr, lr, RT0, ror#31; \
-	and RT3, rl; \
-	orr RT1, lr; \
-	eor ll, RT1; \
-	eor rr, rr, RT3, ror#31;
-
-#define enc_fls(n) \
-	fls(XL, XR, YL, YR, \
-	    (n) * 2 + 0, (n) * 2 + 1, \
-	    (n) * 2 + 2, (n) * 2 + 3);
-
-#define dec_fls(n) \
-	fls(XL, XR, YL, YR, \
-	    (n) * 2 + 2, (n) * 2 + 3, \
-	    (n) * 2 + 0, (n) * 2 + 1);
-
-#define inpack(n) \
-	ldr_input_be(%r2, XL, XR, YL, YR, RT0); \
-	ldr RT0, [CTX, #(key_table + ((n) * 8) + 0)]; \
-	ldr RT1, [CTX, #(key_table + ((n) * 8) + 4)]; \
-	eor XL, RT0; \
-	eor XR, RT1;
-
-#define outunpack(n) \
-	ldr RT0, [CTX, #(key_table + ((n) * 8) + 0)]; \
-	ldr RT1, [CTX, #(key_table + ((n) * 8) + 4)]; \
-	eor YL, RT0; \
-	eor YR, RT1; \
-	str_output_be(%r1, YL, YR, XL, XR, RT0, RT1);
-
-.align 3
-.global _gcry_camellia_armv6_encrypt_block
-.type   _gcry_camellia_armv6_encrypt_block,%function;
-
-_gcry_camellia_armv6_encrypt_block:
-	/* input:
-	 *	%r0: keytable
-	 *	%r1: dst
-	 *	%r2: src
-	 *	%r3: keybitlen
-	 */
-	push {%r1, %r4-%r11, %ip, %lr};
-
-	ldr RTAB1, =.Lcamellia_sp1110;
-	mov RMASK, #0xff;
-	add RTAB3, RTAB1, #(2 * 4);
-	push {%r3};
-	mov RMASK, RMASK, lsl#4 /* byte mask */
-
-	inpack(0);
-
-	enc_rounds(0);
-	enc_fls(8);
-	enc_rounds(8);
-	enc_fls(16);
-	enc_rounds(16);
-
-	pop {RT0};
-	cmp RT0, #(16 * 8);
-	bne .Lenc_256;
-
-	pop {%r1};
-	outunpack(24);
-
-	pop {%r4-%r11, %ip, %pc};
-.ltorg
-
-.Lenc_256:
-	enc_fls(24);
-	enc_rounds(24);
-
-	pop {%r1};
-	outunpack(32);
-
-	pop {%r4-%r11, %ip, %pc};
-.ltorg
-.size _gcry_camellia_armv6_encrypt_block,.-_gcry_camellia_armv6_encrypt_block;
-
-.align 3
-.global _gcry_camellia_armv6_decrypt_block
-.type   _gcry_camellia_armv6_decrypt_block,%function;
-
-_gcry_camellia_armv6_decrypt_block:
-	/* input:
-	 *	%r0: keytable
-	 *	%r1: dst
-	 *	%r2: src
-	 *	%r3: keybitlen
-	 */
-	push {%r1, %r4-%r11, %ip, %lr};
-
-	ldr RTAB1, =.Lcamellia_sp1110;
-	mov RMASK, #0xff;
-	add RTAB3, RTAB1, #(2 * 4);
-	mov RMASK, RMASK, lsl#4 /* byte mask */
-
-	cmp %r3, #(16 * 8);
-	bne .Ldec_256;
-
-	inpack(24);
-
-.Ldec_128:
-	dec_rounds(16);
-	dec_fls(16);
-	dec_rounds(8);
-	dec_fls(8);
-	dec_rounds(0);
-
-	pop {%r1};
-	outunpack(0);
-
-	pop {%r4-%r11, %ip, %pc};
-.ltorg
-
-.Ldec_256:
-	inpack(32);
-	dec_rounds(24);
-	dec_fls(24);
-
-	b .Ldec_128;
-.ltorg
-.size _gcry_camellia_armv6_decrypt_block,.-_gcry_camellia_armv6_decrypt_block;
-
-.data
-
-/* Encryption/Decryption tables */
-.align 5
-.Lcamellia_sp1110:
-.long 0x70707000
-.Lcamellia_sp0222:
-            .long 0x00e0e0e0
-.Lcamellia_sp3033:
-                        .long 0x38003838
-.Lcamellia_sp4404:
-                                    .long 0x70700070
-.long 0x82828200, 0x00050505, 0x41004141, 0x2c2c002c
-.long 0x2c2c2c00, 0x00585858, 0x16001616, 0xb3b300b3
-.long 0xececec00, 0x00d9d9d9, 0x76007676, 0xc0c000c0
-.long 0xb3b3b300, 0x00676767, 0xd900d9d9, 0xe4e400e4
-.long 0x27272700, 0x004e4e4e, 0x93009393, 0x57570057
-.long 0xc0c0c000, 0x00818181, 0x60006060, 0xeaea00ea
-.long 0xe5e5e500, 0x00cbcbcb, 0xf200f2f2, 0xaeae00ae
-.long 0xe4e4e400, 0x00c9c9c9, 0x72007272, 0x23230023
-.long 0x85858500, 0x000b0b0b, 0xc200c2c2, 0x6b6b006b
-.long 0x57575700, 0x00aeaeae, 0xab00abab, 0x45450045
-.long 0x35353500, 0x006a6a6a, 0x9a009a9a, 0xa5a500a5
-.long 0xeaeaea00, 0x00d5d5d5, 0x75007575, 0xeded00ed
-.long 0x0c0c0c00, 0x00181818, 0x06000606, 0x4f4f004f
-.long 0xaeaeae00, 0x005d5d5d, 0x57005757, 0x1d1d001d
-.long 0x41414100, 0x00828282, 0xa000a0a0, 0x92920092
-.long 0x23232300, 0x00464646, 0x91009191, 0x86860086
-.long 0xefefef00, 0x00dfdfdf, 0xf700f7f7, 0xafaf00af
-.long 0x6b6b6b00, 0x00d6d6d6, 0xb500b5b5, 0x7c7c007c
-.long 0x93939300, 0x00272727, 0xc900c9c9, 0x1f1f001f
-.long 0x45454500, 0x008a8a8a, 0xa200a2a2, 0x3e3e003e
-.long 0x19191900, 0x00323232, 0x8c008c8c, 0xdcdc00dc
-.long 0xa5a5a500, 0x004b4b4b, 0xd200d2d2, 0x5e5e005e
-.long 0x21212100, 0x00424242, 0x90009090, 0x0b0b000b
-.long 0xededed00, 0x00dbdbdb, 0xf600f6f6, 0xa6a600a6
-.long 0x0e0e0e00, 0x001c1c1c, 0x07000707, 0x39390039
-.long 0x4f4f4f00, 0x009e9e9e, 0xa700a7a7, 0xd5d500d5
-.long 0x4e4e4e00, 0x009c9c9c, 0x27002727, 0x5d5d005d
-.long 0x1d1d1d00, 0x003a3a3a, 0x8e008e8e, 0xd9d900d9
-.long 0x65656500, 0x00cacaca, 0xb200b2b2, 0x5a5a005a
-.long 0x92929200, 0x00252525, 0x49004949, 0x51510051
-.long 0xbdbdbd00, 0x007b7b7b, 0xde00dede, 0x6c6c006c
-.long 0x86868600, 0x000d0d0d, 0x43004343, 0x8b8b008b
-.long 0xb8b8b800, 0x00717171, 0x5c005c5c, 0x9a9a009a
-.long 0xafafaf00, 0x005f5f5f, 0xd700d7d7, 0xfbfb00fb
-.long 0x8f8f8f00, 0x001f1f1f, 0xc700c7c7, 0xb0b000b0
-.long 0x7c7c7c00, 0x00f8f8f8, 0x3e003e3e, 0x74740074
-.long 0xebebeb00, 0x00d7d7d7, 0xf500f5f5, 0x2b2b002b
-.long 0x1f1f1f00, 0x003e3e3e, 0x8f008f8f, 0xf0f000f0
-.long 0xcecece00, 0x009d9d9d, 0x67006767, 0x84840084
-.long 0x3e3e3e00, 0x007c7c7c, 0x1f001f1f, 0xdfdf00df
-.long 0x30303000, 0x00606060, 0x18001818, 0xcbcb00cb
-.long 0xdcdcdc00, 0x00b9b9b9, 0x6e006e6e, 0x34340034
-.long 0x5f5f5f00, 0x00bebebe, 0xaf00afaf, 0x76760076
-.long 0x5e5e5e00, 0x00bcbcbc, 0x2f002f2f, 0x6d6d006d
-.long 0xc5c5c500, 0x008b8b8b, 0xe200e2e2, 0xa9a900a9
-.long 0x0b0b0b00, 0x00161616, 0x85008585, 0xd1d100d1
-.long 0x1a1a1a00, 0x00343434, 0x0d000d0d, 0x04040004
-.long 0xa6a6a600, 0x004d4d4d, 0x53005353, 0x14140014
-.long 0xe1e1e100, 0x00c3c3c3, 0xf000f0f0, 0x3a3a003a
-.long 0x39393900, 0x00727272, 0x9c009c9c, 0xdede00de
-.long 0xcacaca00, 0x00959595, 0x65006565, 0x11110011
-.long 0xd5d5d500, 0x00ababab, 0xea00eaea, 0x32320032
-.long 0x47474700, 0x008e8e8e, 0xa300a3a3, 0x9c9c009c
-.long 0x5d5d5d00, 0x00bababa, 0xae00aeae, 0x53530053
-.long 0x3d3d3d00, 0x007a7a7a, 0x9e009e9e, 0xf2f200f2
-.long 0xd9d9d900, 0x00b3b3b3, 0xec00ecec, 0xfefe00fe
-.long 0x01010100, 0x00020202, 0x80008080, 0xcfcf00cf
-.long 0x5a5a5a00, 0x00b4b4b4, 0x2d002d2d, 0xc3c300c3
-.long 0xd6d6d600, 0x00adadad, 0x6b006b6b, 0x7a7a007a
-.long 0x51515100, 0x00a2a2a2, 0xa800a8a8, 0x24240024
-.long 0x56565600, 0x00acacac, 0x2b002b2b, 0xe8e800e8
-.long 0x6c6c6c00, 0x00d8d8d8, 0x36003636, 0x60600060
-.long 0x4d4d4d00, 0x009a9a9a, 0xa600a6a6, 0x69690069
-.long 0x8b8b8b00, 0x00171717, 0xc500c5c5, 0xaaaa00aa
-.long 0x0d0d0d00, 0x001a1a1a, 0x86008686, 0xa0a000a0
-.long 0x9a9a9a00, 0x00353535, 0x4d004d4d, 0xa1a100a1
-.long 0x66666600, 0x00cccccc, 0x33003333, 0x62620062
-.long 0xfbfbfb00, 0x00f7f7f7, 0xfd00fdfd, 0x54540054
-.long 0xcccccc00, 0x00999999, 0x66006666, 0x1e1e001e
-.long 0xb0b0b000, 0x00616161, 0x58005858, 0xe0e000e0
-.long 0x2d2d2d00, 0x005a5a5a, 0x96009696, 0x64640064
-.long 0x74747400, 0x00e8e8e8, 0x3a003a3a, 0x10100010
-.long 0x12121200, 0x00242424, 0x09000909, 0x00000000
-.long 0x2b2b2b00, 0x00565656, 0x95009595, 0xa3a300a3
-.long 0x20202000, 0x00404040, 0x10001010, 0x75750075
-.long 0xf0f0f000, 0x00e1e1e1, 0x78007878, 0x8a8a008a
-.long 0xb1b1b100, 0x00636363, 0xd800d8d8, 0xe6e600e6
-.long 0x84848400, 0x00090909, 0x42004242, 0x09090009
-.long 0x99999900, 0x00333333, 0xcc00cccc, 0xdddd00dd
-.long 0xdfdfdf00, 0x00bfbfbf, 0xef00efef, 0x87870087
-.long 0x4c4c4c00, 0x00989898, 0x26002626, 0x83830083
-.long 0xcbcbcb00, 0x00979797, 0xe500e5e5, 0xcdcd00cd
-.long 0xc2c2c200, 0x00858585, 0x61006161, 0x90900090
-.long 0x34343400, 0x00686868, 0x1a001a1a, 0x73730073
-.long 0x7e7e7e00, 0x00fcfcfc, 0x3f003f3f, 0xf6f600f6
-.long 0x76767600, 0x00ececec, 0x3b003b3b, 0x9d9d009d
-.long 0x05050500, 0x000a0a0a, 0x82008282, 0xbfbf00bf
-.long 0x6d6d6d00, 0x00dadada, 0xb600b6b6, 0x52520052
-.long 0xb7b7b700, 0x006f6f6f, 0xdb00dbdb, 0xd8d800d8
-.long 0xa9a9a900, 0x00535353, 0xd400d4d4, 0xc8c800c8
-.long 0x31313100, 0x00626262, 0x98009898, 0xc6c600c6
-.long 0xd1d1d100, 0x00a3a3a3, 0xe800e8e8, 0x81810081
-.long 0x17171700, 0x002e2e2e, 0x8b008b8b, 0x6f6f006f
-.long 0x04040400, 0x00080808, 0x02000202, 0x13130013
-.long 0xd7d7d700, 0x00afafaf, 0xeb00ebeb, 0x63630063
-.long 0x14141400, 0x00282828, 0x0a000a0a, 0xe9e900e9
-.long 0x58585800, 0x00b0b0b0, 0x2c002c2c, 0xa7a700a7
-.long 0x3a3a3a00, 0x00747474, 0x1d001d1d, 0x9f9f009f
-.long 0x61616100, 0x00c2c2c2, 0xb000b0b0, 0xbcbc00bc
-.long 0xdedede00, 0x00bdbdbd, 0x6f006f6f, 0x29290029
-.long 0x1b1b1b00, 0x00363636, 0x8d008d8d, 0xf9f900f9
-.long 0x11111100, 0x00222222, 0x88008888, 0x2f2f002f
-.long 0x1c1c1c00, 0x00383838, 0x0e000e0e, 0xb4b400b4
-.long 0x32323200, 0x00646464, 0x19001919, 0x78780078
-.long 0x0f0f0f00, 0x001e1e1e, 0x87008787, 0x06060006
-.long 0x9c9c9c00, 0x00393939, 0x4e004e4e, 0xe7e700e7
-.long 0x16161600, 0x002c2c2c, 0x0b000b0b, 0x71710071
-.long 0x53535300, 0x00a6a6a6, 0xa900a9a9, 0xd4d400d4
-.long 0x18181800, 0x00303030, 0x0c000c0c, 0xabab00ab
-.long 0xf2f2f200, 0x00e5e5e5, 0x79007979, 0x88880088
-.long 0x22222200, 0x00444444, 0x11001111, 0x8d8d008d
-.long 0xfefefe00, 0x00fdfdfd, 0x7f007f7f, 0x72720072
-.long 0x44444400, 0x00888888, 0x22002222, 0xb9b900b9
-.long 0xcfcfcf00, 0x009f9f9f, 0xe700e7e7, 0xf8f800f8
-.long 0xb2b2b200, 0x00656565, 0x59005959, 0xacac00ac
-.long 0xc3c3c300, 0x00878787, 0xe100e1e1, 0x36360036
-.long 0xb5b5b500, 0x006b6b6b, 0xda00dada, 0x2a2a002a
-.long 0x7a7a7a00, 0x00f4f4f4, 0x3d003d3d, 0x3c3c003c
-.long 0x91919100, 0x00232323, 0xc800c8c8, 0xf1f100f1
-.long 0x24242400, 0x00484848, 0x12001212, 0x40400040
-.long 0x08080800, 0x00101010, 0x04000404, 0xd3d300d3
-.long 0xe8e8e800, 0x00d1d1d1, 0x74007474, 0xbbbb00bb
-.long 0xa8a8a800, 0x00515151, 0x54005454, 0x43430043
-.long 0x60606000, 0x00c0c0c0, 0x30003030, 0x15150015
-.long 0xfcfcfc00, 0x00f9f9f9, 0x7e007e7e, 0xadad00ad
-.long 0x69696900, 0x00d2d2d2, 0xb400b4b4, 0x77770077
-.long 0x50505000, 0x00a0a0a0, 0x28002828, 0x80800080
-.long 0xaaaaaa00, 0x00555555, 0x55005555, 0x82820082
-.long 0xd0d0d000, 0x00a1a1a1, 0x68006868, 0xecec00ec
-.long 0xa0a0a000, 0x00414141, 0x50005050, 0x27270027
-.long 0x7d7d7d00, 0x00fafafa, 0xbe00bebe, 0xe5e500e5
-.long 0xa1a1a100, 0x00434343, 0xd000d0d0, 0x85850085
-.long 0x89898900, 0x00131313, 0xc400c4c4, 0x35350035
-.long 0x62626200, 0x00c4c4c4, 0x31003131, 0x0c0c000c
-.long 0x97979700, 0x002f2f2f, 0xcb00cbcb, 0x41410041
-.long 0x54545400, 0x00a8a8a8, 0x2a002a2a, 0xefef00ef
-.long 0x5b5b5b00, 0x00b6b6b6, 0xad00adad, 0x93930093
-.long 0x1e1e1e00, 0x003c3c3c, 0x0f000f0f, 0x19190019
-.long 0x95959500, 0x002b2b2b, 0xca00caca, 0x21210021
-.long 0xe0e0e000, 0x00c1c1c1, 0x70007070, 0x0e0e000e
-.long 0xffffff00, 0x00ffffff, 0xff00ffff, 0x4e4e004e
-.long 0x64646400, 0x00c8c8c8, 0x32003232, 0x65650065
-.long 0xd2d2d200, 0x00a5a5a5, 0x69006969, 0xbdbd00bd
-.long 0x10101000, 0x00202020, 0x08000808, 0xb8b800b8
-.long 0xc4c4c400, 0x00898989, 0x62006262, 0x8f8f008f
-.long 0x00000000, 0x00000000, 0x00000000, 0xebeb00eb
-.long 0x48484800, 0x00909090, 0x24002424, 0xcece00ce
-.long 0xa3a3a300, 0x00474747, 0xd100d1d1, 0x30300030
-.long 0xf7f7f700, 0x00efefef, 0xfb00fbfb, 0x5f5f005f
-.long 0x75757500, 0x00eaeaea, 0xba00baba, 0xc5c500c5
-.long 0xdbdbdb00, 0x00b7b7b7, 0xed00eded, 0x1a1a001a
-.long 0x8a8a8a00, 0x00151515, 0x45004545, 0xe1e100e1
-.long 0x03030300, 0x00060606, 0x81008181, 0xcaca00ca
-.long 0xe6e6e600, 0x00cdcdcd, 0x73007373, 0x47470047
-.long 0xdadada00, 0x00b5b5b5, 0x6d006d6d, 0x3d3d003d
-.long 0x09090900, 0x00121212, 0x84008484, 0x01010001
-.long 0x3f3f3f00, 0x007e7e7e, 0x9f009f9f, 0xd6d600d6
-.long 0xdddddd00, 0x00bbbbbb, 0xee00eeee, 0x56560056
-.long 0x94949400, 0x00292929, 0x4a004a4a, 0x4d4d004d
-.long 0x87878700, 0x000f0f0f, 0xc300c3c3, 0x0d0d000d
-.long 0x5c5c5c00, 0x00b8b8b8, 0x2e002e2e, 0x66660066
-.long 0x83838300, 0x00070707, 0xc100c1c1, 0xcccc00cc
-.long 0x02020200, 0x00040404, 0x01000101, 0x2d2d002d
-.long 0xcdcdcd00, 0x009b9b9b, 0xe600e6e6, 0x12120012
-.long 0x4a4a4a00, 0x00949494, 0x25002525, 0x20200020
-.long 0x90909000, 0x00212121, 0x48004848, 0xb1b100b1
-.long 0x33333300, 0x00666666, 0x99009999, 0x99990099
-.long 0x73737300, 0x00e6e6e6, 0xb900b9b9, 0x4c4c004c
-.long 0x67676700, 0x00cecece, 0xb300b3b3, 0xc2c200c2
-.long 0xf6f6f600, 0x00ededed, 0x7b007b7b, 0x7e7e007e
-.long 0xf3f3f300, 0x00e7e7e7, 0xf900f9f9, 0x05050005
-.long 0x9d9d9d00, 0x003b3b3b, 0xce00cece, 0xb7b700b7
-.long 0x7f7f7f00, 0x00fefefe, 0xbf00bfbf, 0x31310031
-.long 0xbfbfbf00, 0x007f7f7f, 0xdf00dfdf, 0x17170017
-.long 0xe2e2e200, 0x00c5c5c5, 0x71007171, 0xd7d700d7
-.long 0x52525200, 0x00a4a4a4, 0x29002929, 0x58580058
-.long 0x9b9b9b00, 0x00373737, 0xcd00cdcd, 0x61610061
-.long 0xd8d8d800, 0x00b1b1b1, 0x6c006c6c, 0x1b1b001b
-.long 0x26262600, 0x004c4c4c, 0x13001313, 0x1c1c001c
-.long 0xc8c8c800, 0x00919191, 0x64006464, 0x0f0f000f
-.long 0x37373700, 0x006e6e6e, 0x9b009b9b, 0x16160016
-.long 0xc6c6c600, 0x008d8d8d, 0x63006363, 0x18180018
-.long 0x3b3b3b00, 0x00767676, 0x9d009d9d, 0x22220022
-.long 0x81818100, 0x00030303, 0xc000c0c0, 0x44440044
-.long 0x96969600, 0x002d2d2d, 0x4b004b4b, 0xb2b200b2
-.long 0x6f6f6f00, 0x00dedede, 0xb700b7b7, 0xb5b500b5
-.long 0x4b4b4b00, 0x00969696, 0xa500a5a5, 0x91910091
-.long 0x13131300, 0x00262626, 0x89008989, 0x08080008
-.long 0xbebebe00, 0x007d7d7d, 0x5f005f5f, 0xa8a800a8
-.long 0x63636300, 0x00c6c6c6, 0xb100b1b1, 0xfcfc00fc
-.long 0x2e2e2e00, 0x005c5c5c, 0x17001717, 0x50500050
-.long 0xe9e9e900, 0x00d3d3d3, 0xf400f4f4, 0xd0d000d0
-.long 0x79797900, 0x00f2f2f2, 0xbc00bcbc, 0x7d7d007d
-.long 0xa7a7a700, 0x004f4f4f, 0xd300d3d3, 0x89890089
-.long 0x8c8c8c00, 0x00191919, 0x46004646, 0x97970097
-.long 0x9f9f9f00, 0x003f3f3f, 0xcf00cfcf, 0x5b5b005b
-.long 0x6e6e6e00, 0x00dcdcdc, 0x37003737, 0x95950095
-.long 0xbcbcbc00, 0x00797979, 0x5e005e5e, 0xffff00ff
-.long 0x8e8e8e00, 0x001d1d1d, 0x47004747, 0xd2d200d2
-.long 0x29292900, 0x00525252, 0x94009494, 0xc4c400c4
-.long 0xf5f5f500, 0x00ebebeb, 0xfa00fafa, 0x48480048
-.long 0xf9f9f900, 0x00f3f3f3, 0xfc00fcfc, 0xf7f700f7
-.long 0xb6b6b600, 0x006d6d6d, 0x5b005b5b, 0xdbdb00db
-.long 0x2f2f2f00, 0x005e5e5e, 0x97009797, 0x03030003
-.long 0xfdfdfd00, 0x00fbfbfb, 0xfe00fefe, 0xdada00da
-.long 0xb4b4b400, 0x00696969, 0x5a005a5a, 0x3f3f003f
-.long 0x59595900, 0x00b2b2b2, 0xac00acac, 0x94940094
-.long 0x78787800, 0x00f0f0f0, 0x3c003c3c, 0x5c5c005c
-.long 0x98989800, 0x00313131, 0x4c004c4c, 0x02020002
-.long 0x06060600, 0x000c0c0c, 0x03000303, 0x4a4a004a
-.long 0x6a6a6a00, 0x00d4d4d4, 0x35003535, 0x33330033
-.long 0xe7e7e700, 0x00cfcfcf, 0xf300f3f3, 0x67670067
-.long 0x46464600, 0x008c8c8c, 0x23002323, 0xf3f300f3
-.long 0x71717100, 0x00e2e2e2, 0xb800b8b8, 0x7f7f007f
-.long 0xbababa00, 0x00757575, 0x5d005d5d, 0xe2e200e2
-.long 0xd4d4d400, 0x00a9a9a9, 0x6a006a6a, 0x9b9b009b
-.long 0x25252500, 0x004a4a4a, 0x92009292, 0x26260026
-.long 0xababab00, 0x00575757, 0xd500d5d5, 0x37370037
-.long 0x42424200, 0x00848484, 0x21002121, 0x3b3b003b
-.long 0x88888800, 0x00111111, 0x44004444, 0x96960096
-.long 0xa2a2a200, 0x00454545, 0x51005151, 0x4b4b004b
-.long 0x8d8d8d00, 0x001b1b1b, 0xc600c6c6, 0xbebe00be
-.long 0xfafafa00, 0x00f5f5f5, 0x7d007d7d, 0x2e2e002e
-.long 0x72727200, 0x00e4e4e4, 0x39003939, 0x79790079
-.long 0x07070700, 0x000e0e0e, 0x83008383, 0x8c8c008c
-.long 0xb9b9b900, 0x00737373, 0xdc00dcdc, 0x6e6e006e
-.long 0x55555500, 0x00aaaaaa, 0xaa00aaaa, 0x8e8e008e
-.long 0xf8f8f800, 0x00f1f1f1, 0x7c007c7c, 0xf5f500f5
-.long 0xeeeeee00, 0x00dddddd, 0x77007777, 0xb6b600b6
-.long 0xacacac00, 0x00595959, 0x56005656, 0xfdfd00fd
-.long 0x0a0a0a00, 0x00141414, 0x05000505, 0x59590059
-.long 0x36363600, 0x006c6c6c, 0x1b001b1b, 0x98980098
-.long 0x49494900, 0x00929292, 0xa400a4a4, 0x6a6a006a
-.long 0x2a2a2a00, 0x00545454, 0x15001515, 0x46460046
-.long 0x68686800, 0x00d0d0d0, 0x34003434, 0xbaba00ba
-.long 0x3c3c3c00, 0x00787878, 0x1e001e1e, 0x25250025
-.long 0x38383800, 0x00707070, 0x1c001c1c, 0x42420042
-.long 0xf1f1f100, 0x00e3e3e3, 0xf800f8f8, 0xa2a200a2
-.long 0xa4a4a400, 0x00494949, 0x52005252, 0xfafa00fa
-.long 0x40404000, 0x00808080, 0x20002020, 0x07070007
-.long 0x28282800, 0x00505050, 0x14001414, 0x55550055
-.long 0xd3d3d300, 0x00a7a7a7, 0xe900e9e9, 0xeeee00ee
-.long 0x7b7b7b00, 0x00f6f6f6, 0xbd00bdbd, 0x0a0a000a
-.long 0xbbbbbb00, 0x00777777, 0xdd00dddd, 0x49490049
-.long 0xc9c9c900, 0x00939393, 0xe400e4e4, 0x68680068
-.long 0x43434300, 0x00868686, 0xa100a1a1, 0x38380038
-.long 0xc1c1c100, 0x00838383, 0xe000e0e0, 0xa4a400a4
-.long 0x15151500, 0x002a2a2a, 0x8a008a8a, 0x28280028
-.long 0xe3e3e300, 0x00c7c7c7, 0xf100f1f1, 0x7b7b007b
-.long 0xadadad00, 0x005b5b5b, 0xd600d6d6, 0xc9c900c9
-.long 0xf4f4f400, 0x00e9e9e9, 0x7a007a7a, 0xc1c100c1
-.long 0x77777700, 0x00eeeeee, 0xbb00bbbb, 0xe3e300e3
-.long 0xc7c7c700, 0x008f8f8f, 0xe300e3e3, 0xf4f400f4
-.long 0x80808000, 0x00010101, 0x40004040, 0xc7c700c7
-.long 0x9e9e9e00, 0x003d3d3d, 0x4f004f4f, 0x9e9e009e
-
-#endif /*HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS*/
-#endif /*__ARM_ARCH >= 6*/
diff --git a/cipher/camellia-glue.c b/cipher/camellia-glue.c
index 29cb7a5..e6d4029 100644
--- a/cipher/camellia-glue.c
+++ b/cipher/camellia-glue.c
@@ -193,14 +193,14 @@ camellia_setkey(void *c, const byte *key, unsigned keylen)
   return 0;
 }
 
-#ifdef USE_ARMV6_ASM
+#ifdef USE_ARM_ASM
 
 /* Assembly implementations of Camellia. */
-extern void _gcry_camellia_armv6_encrypt_block(const KEY_TABLE_TYPE keyTable,
+extern void _gcry_camellia_arm_encrypt_block(const KEY_TABLE_TYPE keyTable,
 					       byte *outbuf, const byte *inbuf,
 					       const int keybits);
 
-extern void _gcry_camellia_armv6_decrypt_block(const KEY_TABLE_TYPE keyTable,
+extern void _gcry_camellia_arm_decrypt_block(const KEY_TABLE_TYPE keyTable,
 					       byte *outbuf, const byte *inbuf,
 					       const int keybits);
 
@@ -209,7 +209,7 @@ static void Camellia_EncryptBlock(const int keyBitLength,
 				  const KEY_TABLE_TYPE keyTable,
 				  unsigned char *cipherText)
 {
-  _gcry_camellia_armv6_encrypt_block(keyTable, cipherText, plaintext,
+  _gcry_camellia_arm_encrypt_block(keyTable, cipherText, plaintext,
 				     keyBitLength);
 }
 
@@ -218,7 +218,7 @@ static void Camellia_DecryptBlock(const int keyBitLength,
 				  const KEY_TABLE_TYPE keyTable,
 				  unsigned char *plaintext)
 {
-  _gcry_camellia_armv6_decrypt_block(keyTable, plaintext, cipherText,
+  _gcry_camellia_arm_decrypt_block(keyTable, plaintext, cipherText,
 				     keyBitLength);
 }
 
@@ -240,7 +240,7 @@ camellia_decrypt(void *c, byte *outbuf, const byte *inbuf)
   return /*burn_stack*/ (CAMELLIA_decrypt_stack_burn_size);
 }
 
-#else /*USE_ARMV6_ASM*/
+#else /*USE_ARM_ASM*/
 
 static unsigned int
 camellia_encrypt(void *c, byte *outbuf, const byte *inbuf)
@@ -276,7 +276,7 @@ camellia_decrypt(void *c, byte *outbuf, const byte *inbuf)
   return /*burn_stack*/ (CAMELLIA_decrypt_stack_burn_size);
 }
 
-#endif /*!USE_ARMV6_ASM*/
+#endif /*!USE_ARM_ASM*/
 
 /* Bulk encryption of complete blocks in CTR mode.  This function is only
    intended for the bulk encryption feature of cipher.c.  CTR is expected to be
diff --git a/cipher/camellia.c b/cipher/camellia.c
index 03510a3..9067246 100644
--- a/cipher/camellia.c
+++ b/cipher/camellia.c
@@ -861,7 +861,7 @@ void camellia_setup192(const unsigned char *key, u32 *subkey)
 }
 
 
-#ifndef USE_ARMV6_ASM
+#ifndef USE_ARM_ASM
 /**
  * Stuff related to camellia encryption/decryption
  *
@@ -1321,7 +1321,7 @@ void camellia_decrypt256(const u32 *subkey, u32 *blocks)
 
     return;
 }
-#endif /*!USE_ARMV6_ASM*/
+#endif /*!USE_ARM_ASM*/
 
 
 /***
@@ -1349,7 +1349,7 @@ void Camellia_Ekeygen(const int keyBitLength,
 }
 
 
-#ifndef USE_ARMV6_ASM
+#ifndef USE_ARM_ASM
 void Camellia_EncryptBlock(const int keyBitLength,
 			   const unsigned char *plaintext,
 			   const KEY_TABLE_TYPE keyTable,
@@ -1410,4 +1410,4 @@ void Camellia_DecryptBlock(const int keyBitLength,
     PUTU32(plaintext + 8, tmp[2]);
     PUTU32(plaintext + 12, tmp[3]);
 }
-#endif /*!USE_ARMV6_ASM*/
+#endif /*!USE_ARM_ASM*/
diff --git a/cipher/camellia.h b/cipher/camellia.h
index 72f2d1f..20faa2c 100644
--- a/cipher/camellia.h
+++ b/cipher/camellia.h
@@ -32,7 +32,7 @@
 #include <config.h>
-/* USE_ARMV6_ASM indicates whether to use ARMv6 assembly code. */
-# undef USE_ARMV6_ASM
-# if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__)
+/* USE_ARM_ASM indicates whether to use ARM assembly code. */
+# undef USE_ARM_ASM
+# if defined(__ARMEL__)
 #  ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS
-#   define USE_ARMV6_ASM 1
+#   define USE_ARM_ASM 1
 #  endif
diff --git a/cipher/cast5-arm.S b/cipher/cast5-arm.S
new file mode 100644
index 0000000..ce7fa93
--- /dev/null
+++ b/cipher/cast5-arm.S
@@ -0,0 +1,715 @@
+/* cast5-arm.S  -  ARM assembly implementation of CAST5 cipher
+ *
+ * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#if defined(__ARMEL__)
+#ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS
+
+.text
+
+.syntax unified
+.arm
+
+.extern _gcry_cast5_s1to4;
+
+/* structure of crypto context */
+#define Km 0
+#define Kr (Km + (16 * 4))
+#define Kr_arm_enc (Kr + (16))
+#define Kr_arm_dec (Kr_arm_enc + (16))
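+/* Kr_arm_enc and Kr_arm_dec are assumed to be filled by the C key setup:
+ * the 16 rotation amounts packed one byte each, in encryption order and in
+ * reversed order for decryption, so load_kr fetches four rounds' worth per
+ * word. */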
+
+/* register macros */
+#define CTX %r0
+#define Rs1 %r7
+#define Rs2 %r8
+#define Rs3 %r9
+#define Rs4 %r10
+#define RMASK %r11
+#define RKM %r1
+#define RKR %r2
+
+#define RL0 %r3
+#define RR0 %r4
+
+#define RL1 %r9
+#define RR1 %r10
+
+#define RT0 %lr
+#define RT1 %ip
+#define RT2 %r5
+#define RT3 %r6
+
+/* helper macros */
+#define ldr_unaligned_le(rout, rsrc, offs, rtmp) \
+	ldrb rout, [rsrc, #((offs) + 0)]; \
+	ldrb rtmp, [rsrc, #((offs) + 1)]; \
+	orr rout, rout, rtmp, lsl #8; \
+	ldrb rtmp, [rsrc, #((offs) + 2)]; \
+	orr rout, rout, rtmp, lsl #16; \
+	ldrb rtmp, [rsrc, #((offs) + 3)]; \
+	orr rout, rout, rtmp, lsl #24;
+
+#define str_unaligned_le(rin, rdst, offs, rtmp0, rtmp1) \
+	mov rtmp0, rin, lsr #8; \
+	strb rin, [rdst, #((offs) + 0)]; \
+	mov rtmp1, rin, lsr #16; \
+	strb rtmp0, [rdst, #((offs) + 1)]; \
+	mov rtmp0, rin, lsr #24; \
+	strb rtmp1, [rdst, #((offs) + 2)]; \
+	strb rtmp0, [rdst, #((offs) + 3)];
+
+#define ldr_unaligned_be(rout, rsrc, offs, rtmp) \
+	ldrb rout, [rsrc, #((offs) + 3)]; \
+	ldrb rtmp, [rsrc, #((offs) + 2)]; \
+	orr rout, rout, rtmp, lsl #8; \
+	ldrb rtmp, [rsrc, #((offs) + 1)]; \
+	orr rout, rout, rtmp, lsl #16; \
+	ldrb rtmp, [rsrc, #((offs) + 0)]; \
+	orr rout, rout, rtmp, lsl #24;
+
+#define str_unaligned_be(rin, rdst, offs, rtmp0, rtmp1) \
+	mov rtmp0, rin, lsr #8; \
+	strb rin, [rdst, #((offs) + 3)]; \
+	mov rtmp1, rin, lsr #16; \
+	strb rtmp0, [rdst, #((offs) + 2)]; \
+	mov rtmp0, rin, lsr #24; \
+	strb rtmp1, [rdst, #((offs) + 1)]; \
+	strb rtmp0, [rdst, #((offs) + 0)];
+
+#ifdef __ARMEL__
+	#define ldr_unaligned_host ldr_unaligned_le
+	#define str_unaligned_host str_unaligned_le
+
+	/* bswap on little-endian */
+#ifdef HAVE_ARM_ARCH_V6
+	#define host_to_be(reg, rtmp) \
+		rev reg, reg;
+	#define be_to_host(reg, rtmp) \
+		rev reg, reg;
+#else
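+	/* Pre-v6 cores lack 'rev'; bytes ABCD are swapped to DCBA with:
+	 *   tmp = reg ^ ror(reg,16)     -> (A^C)(B^D)(A^C)(B^D)
+	 *   tmp = (tmp>>8) & ~0xff00    -> 0,(A^C),0,(A^C)
+	 *   reg = tmp ^ ror(reg,8)      -> D,C,B,A */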
+	#define host_to_be(reg, rtmp) \
+		eor	rtmp, reg, reg, ror #16; \
+		mov	rtmp, rtmp, lsr #8; \
+		bic	rtmp, rtmp, #65280; \
+		eor	reg, rtmp, reg, ror #8;
+	#define be_to_host(reg, rtmp) \
+		eor	rtmp, reg, reg, ror #16; \
+		mov	rtmp, rtmp, lsr #8; \
+		bic	rtmp, rtmp, #65280; \
+		eor	reg, rtmp, reg, ror #8;
+#endif
+#else
+	#define ldr_unaligned_host ldr_unaligned_be
+	#define str_unaligned_host str_unaligned_be
+
+	/* nop on big-endian */
+	#define host_to_be(reg, rtmp) /*_*/
+	#define be_to_host(reg, rtmp) /*_*/
+#endif
+
+#define host_to_host(x, y) /*_*/
+
+/**********************************************************************
+  1-way cast5
+ **********************************************************************/
+
+#define dummy(n) /*_*/
+
+#define load_kr(n) \
+	ldr RKR, [CTX, #(Kr_arm_enc + (n))]; /* Kr[n] */
+
+#define load_dec_kr(n) \
+	ldr RKR, [CTX, #(Kr_arm_dec + (n) - 3)]; /* Kr[n] */
+
+#define load_km(n) \
+	ldr RKM, [CTX, #(Km + (n) * 4)]; /* Km[n] */
+
+#define shift_kr(dummy) \
+	mov RKR, RKR, lsr #8;
+
+#define F(n, rl, rr, op1, op2, op3, op4, dec, loadkm, shiftkr, loadkr) \
+	op1 RKM, rr; \
+	mov RKM, RKM, ror RKR; \
+	\
+	and RT0, RMASK, RKM, ror #(24 - 2); \
+	and RT1, RMASK, RKM, lsr #(16 - 2); \
+	and RT2, RMASK, RKM, lsr #(8 - 2); \
+	ldr RT0, [Rs1, RT0]; \
+	and RT3, RMASK, RKM, lsl #2; \
+	ldr RT1, [Rs2, RT1]; \
+	shiftkr(RKR); \
+	\
+	ldr RT2, [Rs3, RT2]; \
+	\
+	op2 RT0, RT1; \
+	ldr RT3, [Rs4, RT3]; \
+	op3 RT0, RT2; \
+	loadkm((n) + (1 - ((dec) * 2))); \
+	op4 RT0, RT3; \
+	loadkr((n) + (1 - ((dec) * 2))); \
+	eor rl, RT0;
+
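+/* The three CAST5 round types differ only in which of add/eor/sub mix the
+ * masking key with the data and combine the four s-box lookups; the
+ * loadkm/shiftkr/loadkr hooks prefetch the next round's keys between the
+ * arithmetic. */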
+#define F1(n, rl, rr, dec, loadkm, shiftkr, loadkr) \
+	F(n, rl, rr, add, eor, sub, add, dec, loadkm, shiftkr, loadkr)
+#define F2(n, rl, rr, dec, loadkm, shiftkr, loadkr) \
+	F(n, rl, rr, eor, sub, add, eor, dec, loadkm, shiftkr, loadkr)
+#define F3(n, rl, rr, dec, loadkm, shiftkr, loadkr) \
+	F(n, rl, rr, sub, add, eor, sub, dec, loadkm, shiftkr, loadkr)
+
+#define enc_round(n, Fx, rl, rr, loadkm, shiftkr, loadkr) \
+	Fx(n, rl, rr, 0, loadkm, shiftkr, loadkr)
+
+#define dec_round(n, Fx, rl, rr, loadkm, shiftkr, loadkr) \
+	Fx(n, rl, rr, 1, loadkm, shiftkr, loadkr)
+
+#define read_block_aligned(rin, offs, l0, r0, convert, rtmp) \
+	ldr l0, [rin, #((offs) + 0)]; \
+	ldr r0, [rin, #((offs) + 4)]; \
+	convert(l0, rtmp); \
+	convert(r0, rtmp);
+
+#define write_block_aligned(rout, offs, l0, r0, convert, rtmp) \
+	convert(l0, rtmp); \
+	convert(r0, rtmp); \
+	str l0, [rout, #((offs) + 0)]; \
+	str r0, [rout, #((offs) + 4)];
+
+#ifdef __ARM_FEATURE_UNALIGNED
+	/* unaligned word reads allowed */
+	#define read_block(rin, offs, l0, r0, rtmp0) \
+		read_block_aligned(rin, offs, l0, r0, host_to_be, rtmp0)
+
+	#define write_block(rout, offs, r0, l0, rtmp0, rtmp1) \
+		write_block_aligned(rout, offs, r0, l0, be_to_host, rtmp0)
+
+	#define read_block_host(rin, offs, l0, r0, rtmp0) \
+		read_block_aligned(rin, offs, l0, r0, host_to_host, rtmp0)
+
+	#define write_block_host(rout, offs, r0, l0, rtmp0, rtmp1) \
+		write_block_aligned(rout, offs, r0, l0, host_to_host, rtmp0)
+#else
+	/* need to handle unaligned reads by byte reads */
+	#define read_block(rin, offs, l0, r0, rtmp0) \
+		tst rin, #3; \
+		beq 1f; \
+			ldr_unaligned_be(l0, rin, (offs) + 0, rtmp0); \
+			ldr_unaligned_be(r0, rin, (offs) + 4, rtmp0); \
+			b 2f; \
+		1:;\
+			read_block_aligned(rin, offs, l0, r0, host_to_be, rtmp0); \
+		2:;
+
+	#define write_block(rout, offs, l0, r0, rtmp0, rtmp1) \
+		tst rout, #3; \
+		beq 1f; \
+			str_unaligned_be(l0, rout, (offs) + 0, rtmp0, rtmp1); \
+			str_unaligned_be(r0, rout, (offs) + 4, rtmp0, rtmp1); \
+			b 2f; \
+		1:;\
+			write_block_aligned(rout, offs, l0, r0, be_to_host, rtmp0); \
+		2:;
+
+	#define read_block_host(rin, offs, l0, r0, rtmp0) \
+		tst rin, #3; \
+		beq 1f; \
+			ldr_unaligned_host(l0, rin, (offs) + 0, rtmp0); \
+			ldr_unaligned_host(r0, rin, (offs) + 4, rtmp0); \
+			b 2f; \
+		1:;\
+			read_block_aligned(rin, offs, l0, r0, host_to_host, rtmp0); \
+		2:;
+
+	#define write_block_host(rout, offs, l0, r0, rtmp0, rtmp1) \
+		tst rout, #3; \
+		beq 1f; \
+			str_unaligned_host(l0, rout, (offs) + 0, rtmp0, rtmp1); \
+			str_unaligned_host(r0, rout, (offs) + 4, rtmp0, rtmp1); \
+			b 2f; \
+		1:;\
+			write_block_aligned(rout, offs, l0, r0, host_to_host, rtmp0); \
+		2:;
+#endif
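+/* The *_host load/store variants skip the endian conversion; the bulk
+ * functions below use them for data that is only XORed against the
+ * keystream and never interpreted as words. */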
+
+.align 3
+.globl _gcry_cast5_arm_encrypt_block
+.type  _gcry_cast5_arm_encrypt_block,%function;
+
+_gcry_cast5_arm_encrypt_block:
+	/* input:
+	 *	%r0: CTX
+	 *	%r1: dst
+	 *	%r2: src
+	 */
+	push {%r1, %r4-%r11, %ip, %lr};
+
+	ldr Rs1, =_gcry_cast5_s1to4;
+	mov RMASK, #(0xff << 2);
+	add Rs2, Rs1, #(0x100*4);
+	add Rs3, Rs1, #(0x100*4*2);
+	add Rs4, Rs1, #(0x100*4*3);
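+	/* s-box entries are 32-bit words, so RMASK keeps each byte index
+	 * pre-multiplied by four and the lookups need no shifted offset */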
+
+	read_block(%r2, 0, RL0, RR0, RT0);
+
+	load_km(0);
+	load_kr(0);
+	enc_round(0, F1, RL0, RR0, load_km, shift_kr, dummy);
+	enc_round(1, F2, RR0, RL0, load_km, shift_kr, dummy);
+	enc_round(2, F3, RL0, RR0, load_km, shift_kr, dummy);
+	enc_round(3, F1, RR0, RL0, load_km, dummy, load_kr);
+	enc_round(4, F2, RL0, RR0, load_km, shift_kr, dummy);
+	enc_round(5, F3, RR0, RL0, load_km, shift_kr, dummy);
+	enc_round(6, F1, RL0, RR0, load_km, shift_kr, dummy);
+	enc_round(7, F2, RR0, RL0, load_km, dummy, load_kr);
+	enc_round(8, F3, RL0, RR0, load_km, shift_kr, dummy);
+	enc_round(9, F1, RR0, RL0, load_km, shift_kr, dummy);
+	enc_round(10, F2, RL0, RR0, load_km, shift_kr, dummy);
+	enc_round(11, F3, RR0, RL0, load_km, dummy, load_kr);
+	enc_round(12, F1, RL0, RR0, load_km, shift_kr, dummy);
+	enc_round(13, F2, RR0, RL0, load_km, shift_kr, dummy);
+	enc_round(14, F3, RL0, RR0, load_km, shift_kr, dummy);
+	enc_round(15, F1, RR0, RL0, dummy, dummy, dummy);
+
+	ldr %r1, [%sp], #4;
+	write_block(%r1, 0, RR0, RL0, RT0, RT1);
+
+	pop {%r4-%r11, %ip, %pc};
+.ltorg
+.size _gcry_cast5_arm_encrypt_block,.-_gcry_cast5_arm_encrypt_block;
+
+.align 3
+.globl _gcry_cast5_arm_decrypt_block
+.type  _gcry_cast5_arm_decrypt_block,%function;
+
+_gcry_cast5_arm_decrypt_block:
+	/* input:
+	 *	%r0: CTX
+	 *	%r1: dst
+	 *	%r2: src
+	 */
+	push {%r1, %r4-%r11, %ip, %lr};
+
+	ldr Rs1, =_gcry_cast5_s1to4;
+	mov RMASK, #(0xff << 2);
+	add Rs2, Rs1, #(0x100 * 4);
+	add Rs3, Rs1, #(0x100 * 4 * 2);
+	add Rs4, Rs1, #(0x100 * 4 * 3);
+
+	read_block(%r2, 0, RL0, RR0, RT0);
+
+	load_km(15);
+	load_dec_kr(15);
+	dec_round(15, F1, RL0, RR0, load_km, shift_kr, dummy);
+	dec_round(14, F3, RR0, RL0, load_km, shift_kr, dummy);
+	dec_round(13, F2, RL0, RR0, load_km, shift_kr, dummy);
+	dec_round(12, F1, RR0, RL0, load_km, dummy, load_dec_kr);
+	dec_round(11, F3, RL0, RR0, load_km, shift_kr, dummy);
+	dec_round(10, F2, RR0, RL0, load_km, shift_kr, dummy);
+	dec_round(9, F1, RL0, RR0, load_km, shift_kr, dummy);
+	dec_round(8, F3, RR0, RL0, load_km, dummy, load_dec_kr);
+	dec_round(7, F2, RL0, RR0, load_km, shift_kr, dummy);
+	dec_round(6, F1, RR0, RL0, load_km, shift_kr, dummy);
+	dec_round(5, F3, RL0, RR0, load_km, shift_kr, dummy);
+	dec_round(4, F2, RR0, RL0, load_km, dummy, load_dec_kr);
+	dec_round(3, F1, RL0, RR0, load_km, shift_kr, dummy);
+	dec_round(2, F3, RR0, RL0, load_km, shift_kr, dummy);
+	dec_round(1, F2, RL0, RR0, load_km, shift_kr, dummy);
+	dec_round(0, F1, RR0, RL0, dummy, dummy, dummy);
+
+	ldr %r1, [%sp], #4;
+	write_block(%r1, 0, RR0, RL0, RT0, RT1);
+
+	pop {%r4-%r11, %ip, %pc};
+.ltorg
+.size _gcry_cast5_arm_decrypt_block,.-_gcry_cast5_arm_decrypt_block;
+
+/**********************************************************************
+  2-way cast5
+ **********************************************************************/
+
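+/* Rs3 and Rs4 alias RL1/RR1 here, so only Rs1/Rs2 are loaded and the two
+ * upper tables are reached by adding 0x400/0x800 to the byte index. */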
+#define F_2w(n, rl0, rr0, rl1, rr1, op1, op2, op3, op4, dec, loadkm, shiftkr, \
+	     loadkr) \
+	op1 RT3, RKM, rr0; \
+	op1 RKM, RKM, rr1; \
+	mov RT3, RT3, ror RKR; \
+	mov RKM, RKM, ror RKR; \
+	\
+	and RT0, RMASK, RT3, ror #(24 - 2); \
+	and RT1, RMASK, RT3, lsr #(16 - 2); \
+	and RT2, RMASK, RT3, lsr #(8 - 2); \
+	and RT3, RMASK, RT3, lsl #2; \
+	\
+	ldr RT0, [Rs1, RT0]; \
+	add RT2, #(0x100 * 4); \
+	ldr RT1, [Rs2, RT1]; \
+	add RT3, #(0x100 * 4 * 2); \
+	\
+	ldr RT2, [Rs2, RT2]; \
+	\
+	op2 RT0, RT1; \
+	ldr RT3, [Rs2, RT3]; \
+	and RT1, RMASK, RKM, ror #(24 - 2); \
+	op3 RT0, RT2; \
+	and RT2, RMASK, RKM, lsr #(16 - 2); \
+	op4 RT0, RT3; \
+	and RT3, RMASK, RKM, lsr #(8 - 2); \
+	eor rl0, RT0; \
+	add RT3, #(0x100 * 4); \
+	ldr RT1, [Rs1, RT1]; \
+	and RT0, RMASK, RKM, lsl #2; \
+	ldr RT2, [Rs2, RT2]; \
+	add RT0, #(0x100 * 4 * 2); \
+	\
+	ldr RT3, [Rs2, RT3]; \
+	\
+	op2 RT1, RT2; \
+	ldr RT0, [Rs2, RT0]; \
+	op3 RT1, RT3; \
+	loadkm((n) + (1 - ((dec) * 2))); \
+	op4 RT1, RT0; \
+	loadkr((n) + (1 - ((dec) * 2))); \
+	shiftkr(RKR); \
+	eor rl1, RT1;
+
+#define F1_2w(n, rl0, rr0, rl1, rr1, dec, loadkm, shiftkr, loadkr) \
+	F_2w(n, rl0, rr0, rl1, rr1, add, eor, sub, add, dec, \
+	     loadkm, shiftkr, loadkr)
+#define F2_2w(n, rl0, rr0, rl1, rr1, dec, loadkm, shiftkr, loadkr) \
+	F_2w(n, rl0, rr0, rl1, rr1, eor, sub, add, eor, dec, \
+	     loadkm, shiftkr, loadkr)
+#define F3_2w(n, rl0, rr0, rl1, rr1, dec, loadkm, shiftkr, loadkr) \
+	F_2w(n, rl0, rr0, rl1, rr1, sub, add, eor, sub, dec, \
+	     loadkm, shiftkr, loadkr)
+
+#define enc_round2(n, Fx, rl, rr, loadkm, shiftkr, loadkr) \
+	Fx##_2w(n, rl##0, rr##0, rl##1, rr##1, 0, loadkm, shiftkr, loadkr)
+
+#define dec_round2(n, Fx, rl, rr, loadkm, shiftkr, loadkr) \
+	Fx##_2w(n, rl##0, rr##0, rl##1, rr##1, 1, loadkm, shiftkr, loadkr)
+
+#define read_block2_aligned(rin, l0, r0, l1, r1, convert, rtmp) \
+	ldr l0, [rin, #(0)]; \
+	ldr r0, [rin, #(4)]; \
+	convert(l0, rtmp); \
+	ldr l1, [rin, #(8)]; \
+	convert(r0, rtmp); \
+	ldr r1, [rin, #(12)]; \
+	convert(l1, rtmp); \
+	convert(r1, rtmp);
+
+#define write_block2_aligned(rout, l0, r0, l1, r1, convert, rtmp) \
+	convert(l0, rtmp); \
+	convert(r0, rtmp); \
+	convert(l1, rtmp); \
+	str l0, [rout, #(0)]; \
+	convert(r1, rtmp); \
+	str r0, [rout, #(4)]; \
+	str l1, [rout, #(8)]; \
+	str r1, [rout, #(12)];
+
+#ifdef __ARM_FEATURE_UNALIGNED
+	/* unaligned word reads allowed */
+	#define read_block2(rin, l0, r0, l1, r1, rtmp0) \
+		read_block2_aligned(rin, l0, r0, l1, r1, host_to_be, rtmp0)
+
+	#define write_block2(rout, l0, r0, l1, r1, rtmp0, rtmp1) \
+		write_block2_aligned(rout, l0, r0, l1, r1, be_to_host, rtmp0)
+
+	#define read_block2_host(rin, l0, r0, l1, r1, rtmp0) \
+		read_block2_aligned(rin, l0, r0, l1, r1, host_to_host, rtmp0)
+
+	#define write_block2_host(rout, l0, r0, l1, r1, rtmp0, rtmp1) \
+		write_block2_aligned(rout, l0, r0, l1, r1, host_to_host, rtmp0)
+#else
+	/* need to handle unaligned reads by byte reads */
+	#define read_block2(rin, l0, r0, l1, r1, rtmp0) \
+		tst rin, #3; \
+		beq 1f; \
+			ldr_unaligned_be(l0, rin, 0, rtmp0); \
+			ldr_unaligned_be(r0, rin, 4, rtmp0); \
+			ldr_unaligned_be(l1, rin, 8, rtmp0); \
+			ldr_unaligned_be(r1, rin, 12, rtmp0); \
+			b 2f; \
+		1:;\
+			read_block2_aligned(rin, l0, r0, l1, r1, host_to_be, rtmp0); \
+		2:;
+
+	#define write_block2(rout, l0, r0, l1, r1, rtmp0, rtmp1) \
+		tst rout, #3; \
+		beq 1f; \
+			str_unaligned_be(l0, rout, 0, rtmp0, rtmp1); \
+			str_unaligned_be(r0, rout, 4, rtmp0, rtmp1); \
+			str_unaligned_be(l1, rout, 8, rtmp0, rtmp1); \
+			str_unaligned_be(r1, rout, 12, rtmp0, rtmp1); \
+			b 2f; \
+		1:;\
+			write_block2_aligned(rout, l0, r0, l1, r1, be_to_host, rtmp0); \
+		2:;
+
+	#define read_block2_host(rin, l0, r0, l1, r1, rtmp0) \
+		tst rin, #3; \
+		beq 1f; \
+			ldr_unaligned_host(l0, rin, 0, rtmp0); \
+			ldr_unaligned_host(r0, rin, 4, rtmp0); \
+			ldr_unaligned_host(l1, rin, 8, rtmp0); \
+			ldr_unaligned_host(r1, rin, 12, rtmp0); \
+			b 2f; \
+		1:;\
+			read_block2_aligned(rin, l0, r0, l1, r1, host_to_host, rtmp0); \
+		2:;
+
+	#define write_block2_host(rout, l0, r0, l1, r1, rtmp0, rtmp1) \
+		tst rout, #3; \
+		beq 1f; \
+			str_unaligned_host(l0, rout, 0, rtmp0, rtmp1); \
+			str_unaligned_host(r0, rout, 4, rtmp0, rtmp1); \
+			str_unaligned_host(l1, rout, 8, rtmp0, rtmp1); \
+			str_unaligned_host(r1, rout, 12, rtmp0, rtmp1); \
+			b 2f; \
+		1:;\
+			write_block2_aligned(rout, l0, r0, l1, r1, host_to_host, rtmp0); \
+		2:;
+#endif
+
+.align 3
+.type  _gcry_cast5_arm_enc_blk2,%function;
+
+_gcry_cast5_arm_enc_blk2:
+	/* input:
+	 *	preloaded: CTX
+	 *	[RL0, RR0], [RL1, RR1]: src
+	 * output:
+	 *	[RR0, RL0], [RR1, RL1]: dst
+	 */
+	push {%lr};
+
+	ldr Rs1, =_gcry_cast5_s1to4;
+	mov RMASK, #(0xff << 2);
+	add Rs2, Rs1, #(0x100 * 4);
+
+	load_km(0);
+	load_kr(0);
+	enc_round2(0, F1, RL, RR, load_km, shift_kr, dummy);
+	enc_round2(1, F2, RR, RL, load_km, shift_kr, dummy);
+	enc_round2(2, F3, RL, RR, load_km, shift_kr, dummy);
+	enc_round2(3, F1, RR, RL, load_km, dummy, load_kr);
+	enc_round2(4, F2, RL, RR, load_km, shift_kr, dummy);
+	enc_round2(5, F3, RR, RL, load_km, shift_kr, dummy);
+	enc_round2(6, F1, RL, RR, load_km, shift_kr, dummy);
+	enc_round2(7, F2, RR, RL, load_km, dummy, load_kr);
+	enc_round2(8, F3, RL, RR, load_km, shift_kr, dummy);
+	enc_round2(9, F1, RR, RL, load_km, shift_kr, dummy);
+	enc_round2(10, F2, RL, RR, load_km, shift_kr, dummy);
+	enc_round2(11, F3, RR, RL, load_km, dummy, load_kr);
+	enc_round2(12, F1, RL, RR, load_km, shift_kr, dummy);
+	enc_round2(13, F2, RR, RL, load_km, shift_kr, dummy);
+	enc_round2(14, F3, RL, RR, load_km, shift_kr, dummy);
+	enc_round2(15, F1, RR, RL, dummy, dummy, dummy);
+
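+	/* results are left in big-endian byte order so the callers can XOR
+	 * them directly against raw, byte-order-preserved data */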
+	host_to_be(RR0, RT0);
+	host_to_be(RL0, RT0);
+	host_to_be(RR1, RT0);
+	host_to_be(RL1, RT0);
+
+	pop {%pc};
+.ltorg
+.size _gcry_cast5_arm_enc_blk2,.-_gcry_cast5_arm_enc_blk2;
+
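+/* 2-way CFB decryption; a roughly equivalent C sketch (illustrative,
+ * with encrypt() standing in for 1-block CAST5 encryption):
+ *
+ *	dst[0] = src[0] ^ encrypt(ctx, iv);
+ *	dst[1] = src[1] ^ encrypt(ctx, src[0]);
+ *	iv     = src[1];
+ */
+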
+.align 3
+.globl _gcry_cast5_arm_cfb_dec;
+.type  _gcry_cast5_arm_cfb_dec,%function;
+
+_gcry_cast5_arm_cfb_dec:
+	/* input:
+	 *	%r0: CTX
+	 *	%r1: dst (2 blocks)
+	 *	%r2: src (2 blocks)
+	 *	%r3: iv (64bit)
+	 */
+	push {%r1, %r2, %r4-%r11, %ip, %lr};
+
+	mov %lr, %r3;
+
+	/* Load input (iv/%r3 is aligned, src/%r2 might not be) */
+	ldm %r3, {RL0, RR0};
+	host_to_be(RL0, RT1);
+	host_to_be(RR0, RT1);
+	read_block(%r2, 0, RL1, RR1, %ip);
+
+	/* Update IV, load src[1] and save to iv[0] */
+	read_block_host(%r2, 8, %r5, %r6, %r7);
+	stm %lr, {%r5, %r6};
+
+	bl _gcry_cast5_arm_enc_blk2;
+	/* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */
+
+	/* %r0: dst, %r1: src */
+	pop {%r0, %r1};
+
+	/* dst = src ^ result */
+	read_block2_host(%r1, %r5, %r6, %r7, %r8, %lr);
+	eor %r5, %r4;
+	eor %r6, %r3;
+	eor %r7, %r10;
+	eor %r8, %r9;
+	write_block2_host(%r0, %r5, %r6, %r7, %r8, %r1, %r2);
+
+	pop {%r4-%r11, %ip, %pc};
+.ltorg
+.size _gcry_cast5_arm_cfb_dec,.-_gcry_cast5_arm_cfb_dec;
+
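+/* 2-way CTR encryption; a roughly equivalent C sketch (illustrative,
+ * with encrypt() standing in for 1-block CAST5 encryption), treating
+ * the IV as a 64-bit big-endian counter:
+ *
+ *	dst[0] = src[0] ^ encrypt(ctx, ctr);
+ *	dst[1] = src[1] ^ encrypt(ctx, ctr + 1);
+ *	ctr   += 2;
+ */
+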
+.align 3
+.globl _gcry_cast5_arm_ctr_enc;
+.type  _gcry_cast5_arm_ctr_enc,%function;
+
+_gcry_cast5_arm_ctr_enc:
+	/* input:
+	 *	%r0: CTX
+	 *	%r1: dst (2 blocks)
+	 *	%r2: src (2 blocks)
+	 *	%r3: iv (64bit, big-endian)
+	 */
+	push {%r1, %r2, %r4-%r11, %ip, %lr};
+
+	mov %lr, %r3;
+
+	/* Load IV (big => host endian) */
+	read_block_aligned(%lr, 0, RL0, RR0, be_to_host, RT1);
+
+	/* Construct IVs */
+	adds RR1, RR0, #1; /* +1 */
+	adc RL1, RL0, #0;
+	adds %r6, RR1, #1; /* +2 */
+	adc %r5, RL1, #0;
+
+	/* Store new IV (host => big-endian) */
+	write_block_aligned(%lr, 0, %r5, %r6, host_to_be, RT1);
+
+	bl _gcry_cast5_arm_enc_blk2;
+	/* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */
+
+	/* %r0: dst, %r1: src */
+	pop {%r0, %r1};
+
+	/* XOR key-stream with plaintext */
+	read_block2_host(%r1, %r5, %r6, %r7, %r8, %lr);
+	eor %r5, %r4;
+	eor %r6, %r3;
+	eor %r7, %r10;
+	eor %r8, %r9;
+	write_block2_host(%r0, %r5, %r6, %r7, %r8, %r1, %r2);
+
+	pop {%r4-%r11, %ip, %pc};
+.ltorg
+.size _gcry_cast5_arm_ctr_enc,.-_gcry_cast5_arm_ctr_enc;
+
+.align 3
+.type  _gcry_cast5_arm_dec_blk2,%function;
+
+_gcry_cast5_arm_dec_blk2:
+	/* input:
+	 *	preloaded: CTX
+	 *	[RL0, RR0], [RL1, RR1]: src
+	 * output:
+	 *	[RR0, RL0], [RR1, RL1]: dst
+	 */
+
+	ldr Rs1, =_gcry_cast5_s1to4;
+	mov RMASK, #(0xff << 2);
+	add Rs2, Rs1, #(0x100 * 4);
+
+	load_km(15);
+	load_dec_kr(15);
+	dec_round2(15, F1, RL, RR, load_km, shift_kr, dummy);
+	dec_round2(14, F3, RR, RL, load_km, shift_kr, dummy);
+	dec_round2(13, F2, RL, RR, load_km, shift_kr, dummy);
+	dec_round2(12, F1, RR, RL, load_km, dummy, load_dec_kr);
+	dec_round2(11, F3, RL, RR, load_km, shift_kr, dummy);
+	dec_round2(10, F2, RR, RL, load_km, shift_kr, dummy);
+	dec_round2(9, F1, RL, RR, load_km, shift_kr, dummy);
+	dec_round2(8, F3, RR, RL, load_km, dummy, load_dec_kr);
+	dec_round2(7, F2, RL, RR, load_km, shift_kr, dummy);
+	dec_round2(6, F1, RR, RL, load_km, shift_kr, dummy);
+	dec_round2(5, F3, RL, RR, load_km, shift_kr, dummy);
+	dec_round2(4, F2, RR, RL, load_km, dummy, load_dec_kr);
+	dec_round2(3, F1, RL, RR, load_km, shift_kr, dummy);
+	dec_round2(2, F3, RR, RL, load_km, shift_kr, dummy);
+	dec_round2(1, F2, RL, RR, load_km, shift_kr, dummy);
+	dec_round2(0, F1, RR, RL, dummy, dummy, dummy);
+
+	host_to_be(RR0, RT0);
+	host_to_be(RL0, RT0);
+	host_to_be(RR1, RT0);
+	host_to_be(RL1, RT0);
+
+	b .Ldec_cbc_tail;
+.ltorg
+.size _gcry_cast5_arm_dec_blk2,.-_gcry_cast5_arm_dec_blk2;
+
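+/* 2-way CBC decryption (_gcry_cast5_arm_dec_blk2 above falls through
+ * to .Ldec_cbc_tail); a roughly equivalent C sketch (illustrative,
+ * with decrypt() standing in for 1-block CAST5 decryption):
+ *
+ *	dst[0] = decrypt(ctx, src[0]) ^ iv;
+ *	dst[1] = decrypt(ctx, src[1]) ^ src[0];
+ *	iv     = src[1];
+ */
+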
+.align 3
+.globl _gcry_cast5_arm_cbc_dec;
+.type  _gcry_cast5_arm_cbc_dec,%function;
+
+_gcry_cast5_arm_cbc_dec:
+	/* input:
+	 *	%r0: CTX
+	 *	%r1: dst (2 blocks)
+	 *	%r2: src (2 blocks)
+	 *	%r3: iv (64bit)
+	 */
+	push {%r1-%r11, %ip, %lr};
+
+	read_block2(%r2, RL0, RR0, RL1, RR1, RT0);
+
+	/* dec_blk2 is only used by cbc_dec, jump directly in/out instead
+	 * of function call. */
+	b _gcry_cast5_arm_dec_blk2;
+.Ldec_cbc_tail:
+	/* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */
+
+	/* %r0: dst, %r1: src, %r2: iv */
+	pop {%r0-%r2};
+
+	/* load IV+1 (src[0]) to %r7:%r8. Might be unaligned. */
+	read_block_host(%r1, 0, %r7, %r8, %r5);
+	/* load IV (iv[0]) to %r5:%r6. 'iv' is aligned. */
+	ldm %r2, {%r5, %r6};
+
+	/* out[1] ^= IV+1 */
+	eor %r10, %r7;
+	eor %r9, %r8;
+	/* out[0] ^= IV */
+	eor %r4, %r5;
+	eor %r3, %r6;
+
+	/* load IV+2 (src[1]) to %r7:%r8. Might be unaligned. */
+	read_block_host(%r1, 8, %r7, %r8, %r5);
+	/* store IV+2 to iv[0] (aligned). */
+	stm %r2, {%r7, %r8};
+
+	/* store result to dst[0-3]. Might be unaligned. */
+	write_block2_host(%r0, %r4, %r3, %r10, %r9, %r5, %r6);
+
+	pop {%r4-%r11, %ip, %pc};
+.ltorg
+.size _gcry_cast5_arm_cbc_dec,.-_gcry_cast5_arm_cbc_dec;
+
+#endif /*HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS*/
+#endif /*__ARMEL__*/
diff --git a/cipher/cast5-armv6.S b/cipher/cast5-armv6.S
deleted file mode 100644
index 038fc4f..0000000
--- a/cipher/cast5-armv6.S
+++ /dev/null
@@ -1,702 +0,0 @@
-/* cast5-armv6.S  -  ARM assembly implementation of CAST5 cipher
- *
- * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna at iki.fi>
- *
- * This file is part of Libgcrypt.
- *
- * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as
- * published by the Free Software Foundation; either version 2.1 of
- * the License, or (at your option) any later version.
- *
- * Libgcrypt is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <config.h>
-
-#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__)
-#ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS
-
-.text
-
-.syntax unified
-.arm
-
-.extern _gcry_cast5_s1to4;
-
-/* structure of crypto context */
-#define Km 0
-#define Kr (Km + (16 * 4))
-#define Kr_arm_enc (Kr + (16))
-#define Kr_arm_dec (Kr_arm_enc + (16))
-
-/* register macros */
-#define CTX %r0
-#define Rs1 %r7
-#define Rs2 %r8
-#define Rs3 %r9
-#define Rs4 %r10
-#define RMASK %r11
-#define RKM %r1
-#define RKR %r2
-
-#define RL0 %r3
-#define RR0 %r4
-
-#define RL1 %r9
-#define RR1 %r10
-
-#define RT0 %lr
-#define RT1 %ip
-#define RT2 %r5
-#define RT3 %r6
-
-/* helper macros */
-#define ldr_unaligned_le(rout, rsrc, offs, rtmp) \
-	ldrb rout, [rsrc, #((offs) + 0)]; \
-	ldrb rtmp, [rsrc, #((offs) + 1)]; \
-	orr rout, rout, rtmp, lsl #8; \
-	ldrb rtmp, [rsrc, #((offs) + 2)]; \
-	orr rout, rout, rtmp, lsl #16; \
-	ldrb rtmp, [rsrc, #((offs) + 3)]; \
-	orr rout, rout, rtmp, lsl #24;
-
-#define str_unaligned_le(rin, rdst, offs, rtmp0, rtmp1) \
-	mov rtmp0, rin, lsr #8; \
-	strb rin, [rdst, #((offs) + 0)]; \
-	mov rtmp1, rin, lsr #16; \
-	strb rtmp0, [rdst, #((offs) + 1)]; \
-	mov rtmp0, rin, lsr #24; \
-	strb rtmp1, [rdst, #((offs) + 2)]; \
-	strb rtmp0, [rdst, #((offs) + 3)];
-
-#define ldr_unaligned_be(rout, rsrc, offs, rtmp) \
-	ldrb rout, [rsrc, #((offs) + 3)]; \
-	ldrb rtmp, [rsrc, #((offs) + 2)]; \
-	orr rout, rout, rtmp, lsl #8; \
-	ldrb rtmp, [rsrc, #((offs) + 1)]; \
-	orr rout, rout, rtmp, lsl #16; \
-	ldrb rtmp, [rsrc, #((offs) + 0)]; \
-	orr rout, rout, rtmp, lsl #24;
-
-#define str_unaligned_be(rin, rdst, offs, rtmp0, rtmp1) \
-	mov rtmp0, rin, lsr #8; \
-	strb rin, [rdst, #((offs) + 3)]; \
-	mov rtmp1, rin, lsr #16; \
-	strb rtmp0, [rdst, #((offs) + 2)]; \
-	mov rtmp0, rin, lsr #24; \
-	strb rtmp1, [rdst, #((offs) + 1)]; \
-	strb rtmp0, [rdst, #((offs) + 0)];
-
-#ifdef __ARMEL__
-	#define ldr_unaligned_host ldr_unaligned_le
-	#define str_unaligned_host str_unaligned_le
-
-	/* bswap on little-endian */
-	#define host_to_be(reg) \
-		rev reg, reg;
-	#define be_to_host(reg) \
-		rev reg, reg;
-#else
-	#define ldr_unaligned_host ldr_unaligned_be
-	#define str_unaligned_host str_unaligned_be
-
-	/* nop on big-endian */
-	#define host_to_be(reg) /*_*/
-	#define be_to_host(reg) /*_*/
-#endif
-
-#define host_to_host(x) /*_*/
-
-/**********************************************************************
-  1-way cast5
- **********************************************************************/
-
-#define dummy(n) /*_*/
-
-#define load_kr(n) \
-	ldr RKR, [CTX, #(Kr_arm_enc + (n))]; /* Kr[n] */
-
-#define load_dec_kr(n) \
-	ldr RKR, [CTX, #(Kr_arm_dec + (n) - 3)]; /* Kr[n] */
-
-#define load_km(n) \
-	ldr RKM, [CTX, #(Km + (n) * 4)]; /* Km[n] */
-
-#define shift_kr(dummy) \
-	mov RKR, RKR, lsr #8;
-
-#define F(n, rl, rr, op1, op2, op3, op4, dec, loadkm, shiftkr, loadkr) \
-	op1 RKM, rr; \
-	mov RKM, RKM, ror RKR; \
-	\
-	and RT0, RMASK, RKM, ror #(24); \
-	and RT1, RMASK, RKM, lsr #(16); \
-	and RT2, RMASK, RKM, lsr #(8); \
-	ldr RT0, [Rs1, RT0]; \
-	and RT3, RMASK, RKM; \
-	ldr RT1, [Rs2, RT1]; \
-	shiftkr(RKR); \
-	\
-	ldr RT2, [Rs3, RT2]; \
-	\
-	op2 RT0, RT1; \
-	ldr RT3, [Rs4, RT3]; \
-	op3 RT0, RT2; \
-	loadkm((n) + (1 - ((dec) * 2))); \
-	op4 RT0, RT3; \
-	loadkr((n) + (1 - ((dec) * 2))); \
-	eor rl, RT0;
-
-#define F1(n, rl, rr, dec, loadkm, shiftkr, loadkr) \
-	F(n, rl, rr, add, eor, sub, add, dec, loadkm, shiftkr, loadkr)
-#define F2(n, rl, rr, dec, loadkm, shiftkr, loadkr) \
-	F(n, rl, rr, eor, sub, add, eor, dec, loadkm, shiftkr, loadkr)
-#define F3(n, rl, rr, dec, loadkm, shiftkr, loadkr) \
-	F(n, rl, rr, sub, add, eor, sub, dec, loadkm, shiftkr, loadkr)
-
-#define enc_round(n, Fx, rl, rr, loadkm, shiftkr, loadkr) \
-	Fx(n, rl, rr, 0, loadkm, shiftkr, loadkr)
-
-#define dec_round(n, Fx, rl, rr, loadkm, shiftkr, loadkr) \
-	Fx(n, rl, rr, 1, loadkm, shiftkr, loadkr)
-
-#define read_block_aligned(rin, offs, l0, r0, convert) \
-	ldr l0, [rin, #((offs) + 0)]; \
-	ldr r0, [rin, #((offs) + 4)]; \
-	convert(l0); \
-	convert(r0);
-
-#define write_block_aligned(rout, offs, l0, r0, convert) \
-	convert(l0); \
-	convert(r0); \
-	str l0, [rout, #((offs) + 0)]; \
-	str r0, [rout, #((offs) + 4)];
-
-#ifdef __ARM_FEATURE_UNALIGNED
-	/* unaligned word reads allowed */
-	#define read_block(rin, offs, l0, r0, rtmp0) \
-		read_block_aligned(rin, offs, l0, r0, host_to_be)
-
-	#define write_block(rout, offs, r0, l0, rtmp0, rtmp1) \
-		write_block_aligned(rout, offs, r0, l0, be_to_host)
-
-	#define read_block_host(rin, offs, l0, r0, rtmp0) \
-		read_block_aligned(rin, offs, l0, r0, host_to_host)
-
-	#define write_block_host(rout, offs, r0, l0, rtmp0, rtmp1) \
-		write_block_aligned(rout, offs, r0, l0, host_to_host)
-#else
-	/* need to handle unaligned reads by byte reads */
-	#define read_block(rin, offs, l0, r0, rtmp0) \
-		tst rin, #3; \
-		beq 1f; \
-			ldr_unaligned_be(l0, rin, (offs) + 0, rtmp0); \
-			ldr_unaligned_be(r0, rin, (offs) + 4, rtmp0); \
-			b 2f; \
-		1:;\
-			read_block_aligned(rin, offs, l0, r0, host_to_be); \
-		2:;
-
-	#define write_block(rout, offs, l0, r0, rtmp0, rtmp1) \
-		tst rout, #3; \
-		beq 1f; \
-			str_unaligned_be(l0, rout, (offs) + 0, rtmp0, rtmp1); \
-			str_unaligned_be(r0, rout, (offs) + 4, rtmp0, rtmp1); \
-			b 2f; \
-		1:;\
-			write_block_aligned(rout, offs, l0, r0, be_to_host); \
-		2:;
-
-	#define read_block_host(rin, offs, l0, r0, rtmp0) \
-		tst rin, #3; \
-		beq 1f; \
-			ldr_unaligned_host(l0, rin, (offs) + 0, rtmp0); \
-			ldr_unaligned_host(r0, rin, (offs) + 4, rtmp0); \
-			b 2f; \
-		1:;\
-			read_block_aligned(rin, offs, l0, r0, host_to_host); \
-		2:;
-
-	#define write_block_host(rout, offs, l0, r0, rtmp0, rtmp1) \
-		tst rout, #3; \
-		beq 1f; \
-			str_unaligned_host(l0, rout, (offs) + 0, rtmp0, rtmp1); \
-			str_unaligned_host(r0, rout, (offs) + 4, rtmp0, rtmp1); \
-			b 2f; \
-		1:;\
-			write_block_aligned(rout, offs, l0, r0, host_to_host); \
-		2:;
-#endif
-
-.align 3
-.globl _gcry_cast5_armv6_encrypt_block
-.type  _gcry_cast5_armv6_encrypt_block,%function;
-
-_gcry_cast5_armv6_encrypt_block:
-	/* input:
-	 *	%r0: CTX
-	 *	%r1: dst
-	 *	%r2: src
-	 */
-	push {%r1, %r4-%r11, %ip, %lr};
-
-	ldr Rs1, =_gcry_cast5_s1to4;
-	mov RMASK, #(0xff << 2);
-	add Rs2, Rs1, #(0x100*4);
-	add Rs3, Rs1, #(0x100*4*2);
-	add Rs4, Rs1, #(0x100*4*3);
-
-	read_block(%r2, 0, RL0, RR0, RT0);
-
-	load_km(0);
-	load_kr(0);
-	enc_round(0, F1, RL0, RR0, load_km, shift_kr, dummy);
-	enc_round(1, F2, RR0, RL0, load_km, shift_kr, dummy);
-	enc_round(2, F3, RL0, RR0, load_km, shift_kr, dummy);
-	enc_round(3, F1, RR0, RL0, load_km, dummy, load_kr);
-	enc_round(4, F2, RL0, RR0, load_km, shift_kr, dummy);
-	enc_round(5, F3, RR0, RL0, load_km, shift_kr, dummy);
-	enc_round(6, F1, RL0, RR0, load_km, shift_kr, dummy);
-	enc_round(7, F2, RR0, RL0, load_km, dummy, load_kr);
-	enc_round(8, F3, RL0, RR0, load_km, shift_kr, dummy);
-	enc_round(9, F1, RR0, RL0, load_km, shift_kr, dummy);
-	enc_round(10, F2, RL0, RR0, load_km, shift_kr, dummy);
-	enc_round(11, F3, RR0, RL0, load_km, dummy, load_kr);
-	enc_round(12, F1, RL0, RR0, load_km, shift_kr, dummy);
-	enc_round(13, F2, RR0, RL0, load_km, shift_kr, dummy);
-	enc_round(14, F3, RL0, RR0, load_km, shift_kr, dummy);
-	enc_round(15, F1, RR0, RL0, dummy, dummy, dummy);
-
-	ldr %r1, [%sp], #4;
-	write_block(%r1, 0, RR0, RL0, RT0, RT1);
-
-	pop {%r4-%r11, %ip, %pc};
-.ltorg
-.size _gcry_cast5_armv6_encrypt_block,.-_gcry_cast5_armv6_encrypt_block;
-
-.align 3
-.globl _gcry_cast5_armv6_decrypt_block
-.type  _gcry_cast5_armv6_decrypt_block,%function;
-
-_gcry_cast5_armv6_decrypt_block:
-	/* input:
-	 *	%r0: CTX
-	 *	%r1: dst
-	 *	%r2: src
-	 */
-	push {%r1, %r4-%r11, %ip, %lr};
-
-	ldr Rs1, =_gcry_cast5_s1to4;
-	mov RMASK, #(0xff << 2);
-	add Rs2, Rs1, #(0x100 * 4);
-	add Rs3, Rs1, #(0x100 * 4 * 2);
-	add Rs4, Rs1, #(0x100 * 4 * 3);
-
-	read_block(%r2, 0, RL0, RR0, RT0);
-
-	load_km(15);
-	load_dec_kr(15);
-	dec_round(15, F1, RL0, RR0, load_km, shift_kr, dummy);
-	dec_round(14, F3, RR0, RL0, load_km, shift_kr, dummy);
-	dec_round(13, F2, RL0, RR0, load_km, shift_kr, dummy);
-	dec_round(12, F1, RR0, RL0, load_km, dummy, load_dec_kr);
-	dec_round(11, F3, RL0, RR0, load_km, shift_kr, dummy);
-	dec_round(10, F2, RR0, RL0, load_km, shift_kr, dummy);
-	dec_round(9, F1, RL0, RR0, load_km, shift_kr, dummy);
-	dec_round(8, F3, RR0, RL0, load_km, dummy, load_dec_kr);
-	dec_round(7, F2, RL0, RR0, load_km, shift_kr, dummy);
-	dec_round(6, F1, RR0, RL0, load_km, shift_kr, dummy);
-	dec_round(5, F3, RL0, RR0, load_km, shift_kr, dummy);
-	dec_round(4, F2, RR0, RL0, load_km, dummy, load_dec_kr);
-	dec_round(3, F1, RL0, RR0, load_km, shift_kr, dummy);
-	dec_round(2, F3, RR0, RL0, load_km, shift_kr, dummy);
-	dec_round(1, F2, RL0, RR0, load_km, shift_kr, dummy);
-	dec_round(0, F1, RR0, RL0, dummy, dummy, dummy);
-
-	ldr %r1, [%sp], #4;
-	write_block(%r1, 0, RR0, RL0, RT0, RT1);
-
-	pop {%r4-%r11, %ip, %pc};
-.ltorg
-.size _gcry_cast5_armv6_decrypt_block,.-_gcry_cast5_armv6_decrypt_block;
-
-/**********************************************************************
-  2-way cast5
- **********************************************************************/
-
-#define F_2w(n, rl0, rr0, rl1, rr1, op1, op2, op3, op4, dec, loadkm, shiftkr, \
-	     loadkr) \
-	op1 RT3, RKM, rr0; \
-	op1 RKM, RKM, rr1; \
-	mov RT3, RT3, ror RKR; \
-	mov RKM, RKM, ror RKR; \
-	\
-	and RT0, RMASK, RT3, ror #(24); \
-	and RT1, RMASK, RT3, lsr #(16); \
-	and RT2, RMASK, RT3, lsr #(8); \
-	and RT3, RMASK, RT3; \
-	\
-	ldr RT0, [Rs1, RT0]; \
-	add RT2, #(0x100 * 4); \
-	ldr RT1, [Rs2, RT1]; \
-	add RT3, #(0x100 * 4 * 2); \
-	\
-	ldr RT2, [Rs2, RT2]; \
-	\
-	op2 RT0, RT1; \
-	ldr RT3, [Rs2, RT3]; \
-	and RT1, RMASK, RKM, ror #(24); \
-	op3 RT0, RT2; \
-	and RT2, RMASK, RKM, lsr #(16); \
-	op4 RT0, RT3; \
-	and RT3, RMASK, RKM, lsr #(8); \
-	eor rl0, RT0; \
-	add RT3, #(0x100 * 4); \
-	ldr RT1, [Rs1, RT1]; \
-	and RT0, RMASK, RKM; \
-	ldr RT2, [Rs2, RT2]; \
-	add RT0, #(0x100 * 4 * 2); \
-	\
-	ldr RT3, [Rs2, RT3]; \
-	\
-	op2 RT1, RT2; \
-	ldr RT0, [Rs2, RT0]; \
-	op3 RT1, RT3; \
-	loadkm((n) + (1 - ((dec) * 2))); \
-	op4 RT1, RT0; \
-	loadkr((n) + (1 - ((dec) * 2))); \
-	shiftkr(RKR); \
-	eor rl1, RT1;
-
-#define F1_2w(n, rl0, rr0, rl1, rr1, dec, loadkm, shiftkr, loadkr) \
-	F_2w(n, rl0, rr0, rl1, rr1, add, eor, sub, add, dec, \
-	     loadkm, shiftkr, loadkr)
-#define F2_2w(n, rl0, rr0, rl1, rr1, dec, loadkm, shiftkr, loadkr) \
-	F_2w(n, rl0, rr0, rl1, rr1, eor, sub, add, eor, dec, \
-	     loadkm, shiftkr, loadkr)
-#define F3_2w(n, rl0, rr0, rl1, rr1, dec, loadkm, shiftkr, loadkr) \
-	F_2w(n, rl0, rr0, rl1, rr1, sub, add, eor, sub, dec, \
-	     loadkm, shiftkr, loadkr)
-
-#define enc_round2(n, Fx, rl, rr, loadkm, shiftkr, loadkr) \
-	Fx##_2w(n, rl##0, rr##0, rl##1, rr##1, 0, loadkm, shiftkr, loadkr)
-
-#define dec_round2(n, Fx, rl, rr, loadkm, shiftkr, loadkr) \
-	Fx##_2w(n, rl##0, rr##0, rl##1, rr##1, 1, loadkm, shiftkr, loadkr)
-
-#define read_block2_aligned(rin, l0, r0, l1, r1, convert) \
-	ldr l0, [rin, #(0)]; \
-	ldr r0, [rin, #(4)]; \
-	convert(l0); \
-	ldr l1, [rin, #(8)]; \
-	convert(r0); \
-	ldr r1, [rin, #(12)]; \
-	convert(l1); \
-	convert(r1);
-
-#define write_block2_aligned(rout, l0, r0, l1, r1, convert) \
-	convert(l0); \
-	convert(r0); \
-	convert(l1); \
-	str l0, [rout, #(0)]; \
-	convert(r1); \
-	str r0, [rout, #(4)]; \
-	str l1, [rout, #(8)]; \
-	str r1, [rout, #(12)];
-
-#ifdef __ARM_FEATURE_UNALIGNED
-	/* unaligned word reads allowed */
-	#define read_block2(rin, l0, r0, l1, r1, rtmp0) \
-		read_block2_aligned(rin, l0, r0, l1, r1, host_to_be)
-
-	#define write_block2(rout, l0, r0, l1, r1, rtmp0, rtmp1) \
-		write_block2_aligned(rout, l0, r0, l1, r1, be_to_host)
-
-	#define read_block2_host(rin, l0, r0, l1, r1, rtmp0) \
-		read_block2_aligned(rin, l0, r0, l1, r1, host_to_host)
-
-	#define write_block2_host(rout, l0, r0, l1, r1, rtmp0, rtmp1) \
-		write_block2_aligned(rout, l0, r0, l1, r1, host_to_host)
-#else
-	/* need to handle unaligned reads by byte reads */
-	#define read_block2(rin, l0, r0, l1, r1, rtmp0) \
-		tst rin, #3; \
-		beq 1f; \
-			ldr_unaligned_be(l0, rin, 0, rtmp0); \
-			ldr_unaligned_be(r0, rin, 4, rtmp0); \
-			ldr_unaligned_be(l1, rin, 8, rtmp0); \
-			ldr_unaligned_be(r1, rin, 12, rtmp0); \
-			b 2f; \
-		1:;\
-			read_block2_aligned(rin, l0, r0, l1, r1, host_to_be); \
-		2:;
-
-	#define write_block2(rout, l0, r0, l1, r1, rtmp0, rtmp1) \
-		tst rout, #3; \
-		beq 1f; \
-			str_unaligned_be(l0, rout, 0, rtmp0, rtmp1); \
-			str_unaligned_be(r0, rout, 4, rtmp0, rtmp1); \
-			str_unaligned_be(l1, rout, 8, rtmp0, rtmp1); \
-			str_unaligned_be(r1, rout, 12, rtmp0, rtmp1); \
-			b 2f; \
-		1:;\
-			write_block2_aligned(rout, l0, r0, l1, r1, be_to_host); \
-		2:;
-
-	#define read_block2_host(rin, l0, r0, l1, r1, rtmp0) \
-		tst rin, #3; \
-		beq 1f; \
-			ldr_unaligned_host(l0, rin, 0, rtmp0); \
-			ldr_unaligned_host(r0, rin, 4, rtmp0); \
-			ldr_unaligned_host(l1, rin, 8, rtmp0); \
-			ldr_unaligned_host(r1, rin, 12, rtmp0); \
-			b 2f; \
-		1:;\
-			read_block2_aligned(rin, l0, r0, l1, r1, host_to_host); \
-		2:;
-
-	#define write_block2_host(rout, l0, r0, l1, r1, rtmp0, rtmp1) \
-		tst rout, #3; \
-		beq 1f; \
-			str_unaligned_host(l0, rout, 0, rtmp0, rtmp1); \
-			str_unaligned_host(r0, rout, 4, rtmp0, rtmp1); \
-			str_unaligned_host(l1, rout, 8, rtmp0, rtmp1); \
-			str_unaligned_host(r1, rout, 12, rtmp0, rtmp1); \
-			b 2f; \
-		1:;\
-			write_block2_aligned(rout, l0, r0, l1, r1, host_to_host); \
-		2:;
-#endif
-
-.align 3
-.type  _gcry_cast5_armv6_enc_blk2,%function;
-
-_gcry_cast5_armv6_enc_blk2:
-	/* input:
-	 *	preloaded: CTX
-	 *	[RL0, RR0], [RL1, RR1]: src
-	 * output:
-	 *	[RR0, RL0], [RR1, RL1]: dst
-	 */
-	push {%lr};
-
-	ldr Rs1, =_gcry_cast5_s1to4;
-	mov RMASK, #(0xff << 2);
-	add Rs2, Rs1, #(0x100 * 4);
-
-	load_km(0);
-	load_kr(0);
-	enc_round2(0, F1, RL, RR, load_km, shift_kr, dummy);
-	enc_round2(1, F2, RR, RL, load_km, shift_kr, dummy);
-	enc_round2(2, F3, RL, RR, load_km, shift_kr, dummy);
-	enc_round2(3, F1, RR, RL, load_km, dummy, load_kr);
-	enc_round2(4, F2, RL, RR, load_km, shift_kr, dummy);
-	enc_round2(5, F3, RR, RL, load_km, shift_kr, dummy);
-	enc_round2(6, F1, RL, RR, load_km, shift_kr, dummy);
-	enc_round2(7, F2, RR, RL, load_km, dummy, load_kr);
-	enc_round2(8, F3, RL, RR, load_km, shift_kr, dummy);
-	enc_round2(9, F1, RR, RL, load_km, shift_kr, dummy);
-	enc_round2(10, F2, RL, RR, load_km, shift_kr, dummy);
-	enc_round2(11, F3, RR, RL, load_km, dummy, load_kr);
-	enc_round2(12, F1, RL, RR, load_km, shift_kr, dummy);
-	enc_round2(13, F2, RR, RL, load_km, shift_kr, dummy);
-	enc_round2(14, F3, RL, RR, load_km, shift_kr, dummy);
-	enc_round2(15, F1, RR, RL, dummy, dummy, dummy);
-
-	host_to_be(RR0);
-	host_to_be(RL0);
-	host_to_be(RR1);
-	host_to_be(RL1);
-
-	pop {%pc};
-.ltorg
-.size _gcry_cast5_armv6_enc_blk2,.-_gcry_cast5_armv6_enc_blk2;
-
-.align 3
-.globl _gcry_cast5_armv6_cfb_dec;
-.type  _gcry_cast5_armv6_cfb_dec,%function;
-
-_gcry_cast5_armv6_cfb_dec:
-	/* input:
-	 *	%r0: CTX
-	 *	%r1: dst (2 blocks)
-	 *	%r2: src (2 blocks)
-	 *	%r3: iv (64bit)
-	 */
-	push {%r1, %r2, %r4-%r11, %ip, %lr};
-
-	mov %lr, %r3;
-
-	/* Load input (iv/%r3 is aligned, src/%r2 might not be) */
-	ldm %r3, {RL0, RR0};
-	host_to_be(RL0);
-	host_to_be(RR0);
-	read_block(%r2, 0, RL1, RR1, %ip);
-
-	/* Update IV, load src[1] and save to iv[0] */
-	read_block_host(%r2, 8, %r5, %r6, %r7);
-	stm %lr, {%r5, %r6};
-
-	bl _gcry_cast5_armv6_enc_blk2;
-	/* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */
-
-	/* %r0: dst, %r1: %src */
-	pop {%r0, %r1};
-
-	/* dst = src ^ result */
-	read_block2_host(%r1, %r5, %r6, %r7, %r8, %lr);
-	eor %r5, %r4;
-	eor %r6, %r3;
-	eor %r7, %r10;
-	eor %r8, %r9;
-	write_block2_host(%r0, %r5, %r6, %r7, %r8, %r1, %r2);
-
-	pop {%r4-%r11, %ip, %pc};
-.ltorg
-.size _gcry_cast5_armv6_cfb_dec,.-_gcry_cast5_armv6_cfb_dec;
-
-.align 3
-.globl _gcry_cast5_armv6_ctr_enc;
-.type  _gcry_cast5_armv6_ctr_enc,%function;
-
-_gcry_cast5_armv6_ctr_enc:
-	/* input:
-	 *	%r0: CTX
-	 *	%r1: dst (2 blocks)
-	 *	%r2: src (2 blocks)
-	 *	%r3: iv (64bit, big-endian)
-	 */
-	push {%r1, %r2, %r4-%r11, %ip, %lr};
-
-	mov %lr, %r3;
-
-	/* Load IV (big => host endian) */
-	read_block_aligned(%lr, 0, RL0, RR0, be_to_host);
-
-	/* Construct IVs */
-	adds RR1, RR0, #1; /* +1 */
-	adc RL1, RL0, #0;
-	adds %r6, RR1, #1; /* +2 */
-	adc %r5, RL1, #0;
-
-	/* Store new IV (host => big-endian) */
-	write_block_aligned(%lr, 0, %r5, %r6, host_to_be);
-
-	bl _gcry_cast5_armv6_enc_blk2;
-	/* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */
-
-	/* %r0: dst, %r1: %src */
-	pop {%r0, %r1};
-
-	/* XOR key-stream with plaintext */
-	read_block2_host(%r1, %r5, %r6, %r7, %r8, %lr);
-	eor %r5, %r4;
-	eor %r6, %r3;
-	eor %r7, %r10;
-	eor %r8, %r9;
-	write_block2_host(%r0, %r5, %r6, %r7, %r8, %r1, %r2);
-
-	pop {%r4-%r11, %ip, %pc};
-.ltorg
-.size _gcry_cast5_armv6_ctr_enc,.-_gcry_cast5_armv6_ctr_enc;
-
-.align 3
-.type  _gcry_cast5_armv6_dec_blk2,%function;
-
-_gcry_cast5_armv6_dec_blk2:
-	/* input:
-	 *	preloaded: CTX
-	 *	[RL0, RR0], [RL1, RR1]: src
-	 * output:
-	 *	[RR0, RL0], [RR1, RL1]: dst
-	 */
-
-	ldr Rs1, =_gcry_cast5_s1to4;
-	mov RMASK, #(0xff << 2);
-	add Rs2, Rs1, #(0x100 * 4);
-
-	load_km(15);
-	load_dec_kr(15);
-	dec_round2(15, F1, RL, RR, load_km, shift_kr, dummy);
-	dec_round2(14, F3, RR, RL, load_km, shift_kr, dummy);
-	dec_round2(13, F2, RL, RR, load_km, shift_kr, dummy);
-	dec_round2(12, F1, RR, RL, load_km, dummy, load_dec_kr);
-	dec_round2(11, F3, RL, RR, load_km, shift_kr, dummy);
-	dec_round2(10, F2, RR, RL, load_km, shift_kr, dummy);
-	dec_round2(9, F1, RL, RR, load_km, shift_kr, dummy);
-	dec_round2(8, F3, RR, RL, load_km, dummy, load_dec_kr);
-	dec_round2(7, F2, RL, RR, load_km, shift_kr, dummy);
-	dec_round2(6, F1, RR, RL, load_km, shift_kr, dummy);
-	dec_round2(5, F3, RL, RR, load_km, shift_kr, dummy);
-	dec_round2(4, F2, RR, RL, load_km, dummy, load_dec_kr);
-	dec_round2(3, F1, RL, RR, load_km, shift_kr, dummy);
-	dec_round2(2, F3, RR, RL, load_km, shift_kr, dummy);
-	dec_round2(1, F2, RL, RR, load_km, shift_kr, dummy);
-	dec_round2(0, F1, RR, RL, dummy, dummy, dummy);
-
-	host_to_be(RR0);
-	host_to_be(RL0);
-	host_to_be(RR1);
-	host_to_be(RL1);
-
-	b .Ldec_cbc_tail;
-.ltorg
-.size _gcry_cast5_armv6_dec_blk2,.-_gcry_cast5_armv6_dec_blk2;
-
-.align 3
-.globl _gcry_cast5_armv6_cbc_dec;
-.type  _gcry_cast5_armv6_cbc_dec,%function;
-
-_gcry_cast5_armv6_cbc_dec:
-	/* input:
-	 *	%r0: CTX
-	 *	%r1: dst (2 blocks)
-	 *	%r2: src (2 blocks)
-	 *	%r3: iv (64bit)
-	 */
-	push {%r1-%r11, %ip, %lr};
-
-	read_block2(%r2, RL0, RR0, RL1, RR1, RT0);
-
-	/* dec_blk2 is only used by cbc_dec, jump directly in/out instead
-	 * of function call. */
-	b _gcry_cast5_armv6_dec_blk2;
-.Ldec_cbc_tail:
-	/* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */
-
-	/* %r0: dst, %r1: %src, %r2: iv */
-	pop {%r0-%r2};
-
-	/* load IV+1 (src[0]) to %r7:%r8. Might be unaligned. */
-	read_block_host(%r1, 0, %r7, %r8, %r5);
-	/* load IV (iv[0]) to %r5:%r6. 'iv' is aligned. */
-	ldm %r2, {%r5, %r6};
-
-	/* out[1] ^= IV+1 */
-	eor %r10, %r7;
-	eor %r9, %r8;
-	/* out[0] ^= IV */
-	eor %r4, %r5;
-	eor %r3, %r6;
-
-	/* load IV+2 (src[1]) to %r7:%r8. Might be unaligned. */
-	read_block_host(%r1, 8, %r7, %r8, %r5);
-	/* store IV+2 to iv[0] (aligned). */
-	stm %r2, {%r7, %r8};
-
-	/* store result to dst[0-3]. Might be unaligned. */
-	write_block2_host(%r0, %r4, %r3, %r10, %r9, %r5, %r6);
-
-	pop {%r4-%r11, %ip, %pc};
-.ltorg
-.size _gcry_cast5_armv6_cbc_dec,.-_gcry_cast5_armv6_cbc_dec;
-
-#endif /*HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS*/
-#endif /*__ARM_ARCH >= 6*/
diff --git a/cipher/cast5.c b/cipher/cast5.c
index 92d9af8..a954657 100644
--- a/cipher/cast5.c
+++ b/cipher/cast5.c
@@ -52,11 +52,11 @@
 # define USE_AMD64_ASM 1
 #endif
 
-/* USE_ARMV6_ASM indicates whether to use ARMv6 assembly code. */
-#undef USE_ARMV6_ASM
-#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__)
+/* USE_ARM_ASM indicates whether to use ARM assembly code. */
+#undef USE_ARM_ASM
+#if defined(__ARMEL__)
 # ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS
-#  define USE_ARMV6_ASM 1
+#  define USE_ARM_ASM 1
 # endif
 #endif
 
@@ -65,7 +65,7 @@
 typedef struct {
     u32  Km[16];
     byte Kr[16];
-#ifdef USE_ARMV6_ASM
+#ifdef USE_ARM_ASM
     u32 Kr_arm_enc[16 / sizeof(u32)];
     u32 Kr_arm_dec[16 / sizeof(u32)];
 #endif
@@ -400,35 +400,35 @@ decrypt_block (void *context, byte *outbuf, const byte *inbuf)
   return /*burn_stack*/ (2*8);
 }
 
-#elif defined(USE_ARMV6_ASM)
+#elif defined(USE_ARM_ASM)
 
-/* ARMv6 assembly implementations of CAST5. */
+/* ARM assembly implementations of CAST5. */
-extern void _gcry_cast5_armv6_encrypt_block(CAST5_context *c, byte *outbuf,
+extern void _gcry_cast5_arm_encrypt_block(CAST5_context *c, byte *outbuf,
 					    const byte *inbuf);
 
-extern void _gcry_cast5_armv6_decrypt_block(CAST5_context *c, byte *outbuf,
+extern void _gcry_cast5_arm_decrypt_block(CAST5_context *c, byte *outbuf,
 					    const byte *inbuf);
 
 /* These assembly implementations process two blocks in parallel. */
-extern void _gcry_cast5_armv6_ctr_enc(CAST5_context *ctx, byte *out,
+extern void _gcry_cast5_arm_ctr_enc(CAST5_context *ctx, byte *out,
 				      const byte *in, byte *ctr);
 
-extern void _gcry_cast5_armv6_cbc_dec(CAST5_context *ctx, byte *out,
+extern void _gcry_cast5_arm_cbc_dec(CAST5_context *ctx, byte *out,
 				      const byte *in, byte *iv);
 
-extern void _gcry_cast5_armv6_cfb_dec(CAST5_context *ctx, byte *out,
+extern void _gcry_cast5_arm_cfb_dec(CAST5_context *ctx, byte *out,
 				      const byte *in, byte *iv);
 
 static void
 do_encrypt_block (CAST5_context *context, byte *outbuf, const byte *inbuf)
 {
-  _gcry_cast5_armv6_encrypt_block (context, outbuf, inbuf);
+  _gcry_cast5_arm_encrypt_block (context, outbuf, inbuf);
 }
 
 static void
 do_decrypt_block (CAST5_context *context, byte *outbuf, const byte *inbuf)
 {
-  _gcry_cast5_armv6_decrypt_block (context, outbuf, inbuf);
+  _gcry_cast5_arm_decrypt_block (context, outbuf, inbuf);
 }
 
 static unsigned int
@@ -447,7 +447,7 @@ decrypt_block (void *context, byte *outbuf, const byte *inbuf)
   return /*burn_stack*/ (10*4);
 }
 
-#else /*USE_ARMV6_ASM*/
+#else /*USE_ARM_ASM*/
 
 #define F1(D,m,r)  (  (I = ((m) + (D))), (I=rol(I,(r))),   \
     (((s1[I >> 24] ^ s2[(I>>16)&0xff]) - s3[(I>>8)&0xff]) + s4[I&0xff]) )
@@ -556,7 +556,7 @@ decrypt_block (void *context, byte *outbuf, const byte *inbuf)
   return /*burn_stack*/ (20+4*sizeof(void*));
 }
 
-#endif /*!USE_ARMV6_ASM*/
+#endif /*!USE_ARM_ASM*/
 
 
 /* Bulk encryption of complete blocks in CTR mode.  This function is only
@@ -592,12 +592,12 @@ _gcry_cast5_ctr_enc(void *context, unsigned char *ctr, void *outbuf_arg,
     /* Use generic code to handle smaller chunks... */
     /* TODO: use caching instead? */
   }
-#elif defined(USE_ARMV6_ASM)
+#elif defined(USE_ARM_ASM)
   {
     /* Process data in 2 block chunks. */
     while (nblocks >= 2)
       {
-        _gcry_cast5_armv6_ctr_enc(ctx, outbuf, inbuf, ctr);
+        _gcry_cast5_arm_ctr_enc(ctx, outbuf, inbuf, ctr);
 
         nblocks -= 2;
         outbuf += 2 * CAST5_BLOCKSIZE;
@@ -660,12 +660,12 @@ _gcry_cast5_cbc_dec(void *context, unsigned char *iv, void *outbuf_arg,
 
     /* Use generic code to handle smaller chunks... */
   }
-#elif defined(USE_ARMV6_ASM)
+#elif defined(USE_ARM_ASM)
   {
     /* Process data in 2 block chunks. */
     while (nblocks >= 2)
       {
-        _gcry_cast5_armv6_cbc_dec(ctx, outbuf, inbuf, iv);
+        _gcry_cast5_arm_cbc_dec(ctx, outbuf, inbuf, iv);
 
         nblocks -= 2;
         outbuf += 2 * CAST5_BLOCKSIZE;
@@ -722,12 +722,12 @@ _gcry_cast5_cfb_dec(void *context, unsigned char *iv, void *outbuf_arg,
 
     /* Use generic code to handle smaller chunks... */
   }
-#elif defined(USE_ARMV6_ASM)
+#elif defined(USE_ARM_ASM)
   {
     /* Process data in 2 block chunks. */
     while (nblocks >= 2)
       {
-        _gcry_cast5_armv6_cfb_dec(ctx, outbuf, inbuf, iv);
+        _gcry_cast5_arm_cfb_dec(ctx, outbuf, inbuf, iv);
 
         nblocks -= 2;
         outbuf += 2 * CAST5_BLOCKSIZE;
@@ -936,7 +936,7 @@ do_cast_setkey( CAST5_context *c, const byte *key, unsigned keylen )
   for(i=0; i < 16; i++ )
     c->Kr[i] = k[i] & 0x1f;
 
-#ifdef USE_ARMV6_ASM
+#ifdef USE_ARM_ASM
   for (i = 0; i < 4; i++)
     {
       byte Kr_arm[4];
diff --git a/cipher/rijndael-arm.S b/cipher/rijndael-arm.S
new file mode 100644
index 0000000..2a747bf
--- /dev/null
+++ b/cipher/rijndael-arm.S
@@ -0,0 +1,853 @@
+/* rijndael-arm.S  -  ARM assembly implementation of AES cipher
+ *
+ * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#if defined(__ARMEL__)
+#ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS
+
+.text
+
+.syntax unified
+.arm
+
+/* register macros */
+#define CTX	%r0
+#define RTAB	%lr
+#define RMASK	%ip
+
+#define RA	%r4
+#define RB	%r5
+#define RC	%r6
+#define RD	%r7
+
+#define RNA	%r8
+#define RNB	%r9
+#define RNC	%r10
+#define RND	%r11
+
+#define RT0	%r1
+#define RT1	%r2
+#define RT2	%r3
+
+/* helper macros */
+#define ldr_unaligned_le(rout, rsrc, offs, rtmp) \
+	ldrb rout, [rsrc, #((offs) + 0)]; \
+	ldrb rtmp, [rsrc, #((offs) + 1)]; \
+	orr rout, rout, rtmp, lsl #8; \
+	ldrb rtmp, [rsrc, #((offs) + 2)]; \
+	orr rout, rout, rtmp, lsl #16; \
+	ldrb rtmp, [rsrc, #((offs) + 3)]; \
+	orr rout, rout, rtmp, lsl #24;
+
+#define str_unaligned_le(rin, rdst, offs, rtmp0, rtmp1) \
+	mov rtmp0, rin, lsr #8; \
+	strb rin, [rdst, #((offs) + 0)]; \
+	mov rtmp1, rin, lsr #16; \
+	strb rtmp0, [rdst, #((offs) + 1)]; \
+	mov rtmp0, rin, lsr #24; \
+	strb rtmp1, [rdst, #((offs) + 2)]; \
+	strb rtmp0, [rdst, #((offs) + 3)];
+
+/***********************************************************************
+ * ARM assembly implementation of the AES cipher
+ ***********************************************************************/
+#define preload_first_key(round, ra) \
+	ldr ra, [CTX, #(((round) * 16) + 0 * 4)];
+
+#define dummy(round, ra) /* nothing */
+
+#define addroundkey(ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_key) \
+	ldm CTX, {rna, rnb, rnc, rnd}; \
+	eor ra, rna; \
+	eor rb, rnb; \
+	eor rc, rnc; \
+	preload_key(1, rna); \
+	eor rd, rnd;
+
+#define do_encround(next_r, ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_key) \
+	ldr rnb, [CTX, #(((next_r) * 16) + 1 * 4)]; \
+	\
+	and RT0, RMASK, ra, lsl#3; \
+	ldr rnc, [CTX, #(((next_r) * 16) + 2 * 4)]; \
+	and RT1, RMASK, ra, lsr#(8 - 3); \
+	ldr rnd, [CTX, #(((next_r) * 16) + 3 * 4)]; \
+	and RT2, RMASK, ra, lsr#(16 - 3); \
+	ldr RT0, [RTAB, RT0]; \
+	and ra,  RMASK, ra, lsr#(24 - 3); \
+	\
+	ldr RT1, [RTAB, RT1]; \
+	eor rna, rna, RT0; \
+	ldr RT2, [RTAB, RT2]; \
+	and RT0, RMASK, rd, lsl#3; \
+	ldr ra,  [RTAB, ra]; \
+	\
+	eor rnd, rnd, RT1, ror #24; \
+	and RT1, RMASK, rd, lsr#(8 - 3); \
+	eor rnc, rnc, RT2, ror #16; \
+	and RT2, RMASK, rd, lsr#(16 - 3); \
+	eor rnb, rnb, ra, ror #8; \
+	ldr RT0, [RTAB, RT0]; \
+	and rd,  RMASK, rd, lsr#(24 - 3); \
+	\
+	ldr RT1, [RTAB, RT1]; \
+	eor rnd, rnd, RT0; \
+	ldr RT2, [RTAB, RT2]; \
+	and RT0, RMASK, rc, lsl#3; \
+	ldr rd,  [RTAB, rd]; \
+	\
+	eor rnc, rnc, RT1, ror #24; \
+	and RT1, RMASK, rc, lsr#(8 - 3); \
+	eor rnb, rnb, RT2, ror #16; \
+	and RT2, RMASK, rc, lsr#(16 - 3); \
+	eor rna, rna, rd, ror #8; \
+	ldr RT0, [RTAB, RT0]; \
+	and rc,  RMASK, rc, lsr#(24 - 3); \
+	\
+	ldr RT1, [RTAB, RT1]; \
+	eor rnc, rnc, RT0; \
+	ldr RT2, [RTAB, RT2]; \
+	and RT0, RMASK, rb, lsl#3; \
+	ldr rc,  [RTAB, rc]; \
+	\
+	eor rnb, rnb, RT1, ror #24; \
+	and RT1, RMASK, rb, lsr#(8 - 3); \
+	eor rna, rna, RT2, ror #16; \
+	and RT2, RMASK, rb, lsr#(16 - 3); \
+	eor rnd, rnd, rc, ror #8; \
+	ldr RT0, [RTAB, RT0]; \
+	and rb,  RMASK, rb, lsr#(24 - 3); \
+	\
+	ldr RT1, [RTAB, RT1]; \
+	eor rnb, rnb, RT0; \
+	ldr RT2, [RTAB, RT2]; \
+	eor rna, rna, RT1, ror #24; \
+	ldr rb,  [RTAB, rb]; \
+	\
+	eor rnd, rnd, RT2, ror #16; \
+	preload_key((next_r) + 1, ra); \
+	eor rnc, rnc, rb, ror #8;
+
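+/* do_encround performs one AES round for all four state words with a
+ * single lookup table, using rotations in place of the usual four
+ * T-tables.  Per input word 'a', in C-like pseudocode (illustrative;
+ * T0 and ror32 are just names for the table and a 32-bit rotate):
+ *
+ *	rna ^= T0[(a >>  0) & 0xff];
+ *	rnd ^= ror32(T0[(a >>  8) & 0xff], 24);
+ *	rnc ^= ror32(T0[(a >> 16) & 0xff], 16);
+ *	rnb ^= ror32(T0[(a >> 24) & 0xff], 8);
+ */
+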
+#define do_lastencround(ra, rb, rc, rd, rna, rnb, rnc, rnd) \
+	and RT0, RMASK, ra, lsl#3; \
+	and RT1, RMASK, ra, lsr#(8 - 3); \
+	and RT2, RMASK, ra, lsr#(16 - 3); \
+	ldr rna, [RTAB, RT0]; \
+	and ra,  RMASK, ra, lsr#(24 - 3); \
+	ldr rnd, [RTAB, RT1]; \
+	and RT0, RMASK, rd, lsl#3; \
+	ldr rnc, [RTAB, RT2]; \
+	mov rnd, rnd, ror #24; \
+	ldr rnb, [RTAB, ra]; \
+	and RT1, RMASK, rd, lsr#(8 - 3); \
+	mov rnc, rnc, ror #16; \
+	and RT2, RMASK, rd, lsr#(16 - 3); \
+	mov rnb, rnb, ror #8; \
+	ldr RT0, [RTAB, RT0]; \
+	and rd,  RMASK, rd, lsr#(24 - 3); \
+	ldr RT1, [RTAB, RT1]; \
+	\
+	orr rnd, rnd, RT0; \
+	ldr RT2, [RTAB, RT2]; \
+	and RT0, RMASK, rc, lsl#3; \
+	ldr rd,  [RTAB, rd]; \
+	orr rnc, rnc, RT1, ror #24; \
+	and RT1, RMASK, rc, lsr#(8 - 3); \
+	orr rnb, rnb, RT2, ror #16; \
+	and RT2, RMASK, rc, lsr#(16 - 3); \
+	orr rna, rna, rd, ror #8; \
+	ldr RT0, [RTAB, RT0]; \
+	and rc,  RMASK, rc, lsr#(24 - 3); \
+	ldr RT1, [RTAB, RT1]; \
+	\
+	orr rnc, rnc, RT0; \
+	ldr RT2, [RTAB, RT2]; \
+	and RT0, RMASK, rb, lsl#3; \
+	ldr rc,  [RTAB, rc]; \
+	orr rnb, rnb, RT1, ror #24; \
+	and RT1, RMASK, rb, lsr#(8 - 3); \
+	orr rna, rna, RT2, ror #16; \
+	ldr RT0, [RTAB, RT0]; \
+	and RT2, RMASK, rb, lsr#(16 - 3); \
+	ldr RT1, [RTAB, RT1]; \
+	orr rnd, rnd, rc, ror #8; \
+	ldr RT2, [RTAB, RT2]; \
+	and rb,  RMASK, rb, lsr#(24 - 3); \
+	ldr rb,  [RTAB, rb]; \
+	\
+	orr rnb, rnb, RT0; \
+	orr rna, rna, RT1, ror #24; \
+	orr rnd, rnd, RT2, ror #16; \
+	orr rnc, rnc, rb, ror #8;
+
+#define firstencround(round, ra, rb, rc, rd, rna, rnb, rnc, rnd) \
+	addroundkey(ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_first_key); \
+	do_encround((round) + 1, ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_first_key);
+
+#define encround(round, ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_key) \
+	do_encround((round) + 1, ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_key);
+
+#define lastencround(round, ra, rb, rc, rd, rna, rnb, rnc, rnd) \
+	add CTX, #(((round) + 1) * 16); \
+	add RTAB, #4; \
+	do_lastencround(ra, rb, rc, rd, rna, rnb, rnc, rnd); \
+	addroundkey(rna, rnb, rnc, rnd, ra, rb, rc, rd, dummy);
+
+.align 3
+.global _gcry_aes_arm_encrypt_block
+.type   _gcry_aes_arm_encrypt_block,%function;
+
+_gcry_aes_arm_encrypt_block:
+	/* input:
+	 *	%r0: keysched, CTX
+	 *	%r1: dst
+	 *	%r2: src
+	 *	%r3: number of rounds: 10, 12 or 14
+	 */
+	push {%r4-%r11, %ip, %lr};
+
+	/* read input block */
+#ifndef __ARM_FEATURE_UNALIGNED
+	/* test if src is unaligned */
+	tst	%r2, #3;
+	beq	1f;
+
+	/* unaligned load */
+	ldr_unaligned_le(RA, %r2, 0, RNA);
+	ldr_unaligned_le(RB, %r2, 4, RNB);
+	ldr_unaligned_le(RC, %r2, 8, RNA);
+	ldr_unaligned_le(RD, %r2, 12, RNB);
+	b	2f;
+.ltorg
+1:
+#endif
+	/* aligned load */
+	ldm	%r2, {RA, RB, RC, RD};
+#ifndef __ARMEL__
+	rev	RA, RA;
+	rev	RB, RB;
+	rev	RC, RC;
+	rev	RD, RD;
+#endif
+2:
+	sub	%sp, #16;
+
+	ldr	RTAB, =.LtableE0;
+
+	str	%r1, [%sp, #4];		/* dst */
+	mov	RMASK, #0xff;
+	str	%r3, [%sp, #8];		/* nrounds */
+	mov	RMASK, RMASK, lsl#3;	/* byte mask */
+
+	firstencround(0, RA, RB, RC, RD, RNA, RNB, RNC, RND);
+	encround(1, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+	encround(2, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
+	encround(3, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+	encround(4, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
+	encround(5, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+	encround(6, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
+	encround(7, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+
+	ldr	RT0, [%sp, #8];		/* nrounds */
+	cmp	RT0, #12;
+	bge	.Lenc_not_128;
+
+	encround(8, RA, RB, RC, RD, RNA, RNB, RNC, RND, dummy);
+	lastencround(9, RNA, RNB, RNC, RND, RA, RB, RC, RD);
+
+.Lenc_done:
+	ldr	RT0, [%sp, #4];		/* dst */
+	add	%sp, #16;
+
+	/* store output block */
+#ifndef __ARM_FEATURE_UNALIGNED
+	/* test if dst is unaligned */
+	tst	RT0, #3;
+	beq	1f;
+
+	/* unaligned store */
+	str_unaligned_le(RA, RT0, 0, RNA, RNB);
+	str_unaligned_le(RB, RT0, 4, RNA, RNB);
+	str_unaligned_le(RC, RT0, 8, RNA, RNB);
+	str_unaligned_le(RD, RT0, 12, RNA, RNB);
+	b	2f;
+.ltorg
+1:
+#endif
+	/* aligned store */
+#ifndef __ARMEL__
+	rev	RA, RA;
+	rev	RB, RB;
+	rev	RC, RC;
+	rev	RD, RD;
+#endif
+	/* write output block */
+	stm	RT0, {RA, RB, RC, RD};
+2:
+	pop {%r4-%r11, %ip, %pc};
+
+.ltorg
+.Lenc_not_128:
+	beq .Lenc_192;
+
+	encround(8, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
+	encround(9, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+	encround(10, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
+	encround(11, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+	encround(12, RA, RB, RC, RD, RNA, RNB, RNC, RND, dummy);
+	lastencround(13, RNA, RNB, RNC, RND, RA, RB, RC, RD);
+
+	b .Lenc_done;
+
+.ltorg
+.Lenc_192:
+	encround(8, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
+	encround(9, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+	encround(10, RA, RB, RC, RD, RNA, RNB, RNC, RND, dummy);
+	lastencround(11, RNA, RNB, RNC, RND, RA, RB, RC, RD);
+
+	b .Lenc_done;
+.size _gcry_aes_arm_encrypt_block,.-_gcry_aes_arm_encrypt_block;
+
+#define addroundkey_dec(round, ra, rb, rc, rd, rna, rnb, rnc, rnd) \
+	ldr rna, [CTX, #(((round) * 16) + 0 * 4)]; \
+	ldr rnb, [CTX, #(((round) * 16) + 1 * 4)]; \
+	eor ra, rna; \
+	ldr rnc, [CTX, #(((round) * 16) + 2 * 4)]; \
+	eor rb, rnb; \
+	ldr rnd, [CTX, #(((round) * 16) + 3 * 4)]; \
+	eor rc, rnc; \
+	preload_first_key((round) - 1, rna); \
+	eor rd, rnd;
+
+#define do_decround(next_r, ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_key) \
+	ldr rnb, [CTX, #(((next_r) * 16) + 1 * 4)]; \
+	\
+	and RT0, RMASK, ra, lsl#3; \
+	ldr rnc, [CTX, #(((next_r) * 16) + 2 * 4)]; \
+	and RT1, RMASK, ra, lsr#(8 - 3); \
+	ldr rnd, [CTX, #(((next_r) * 16) + 3 * 4)]; \
+	and RT2, RMASK, ra, lsr#(16 - 3); \
+	ldr RT0, [RTAB, RT0]; \
+	and ra,  RMASK, ra, lsr#(24 - 3); \
+	\
+	ldr RT1, [RTAB, RT1]; \
+	eor rna, rna, RT0; \
+	ldr RT2, [RTAB, RT2]; \
+	and RT0, RMASK, rb, lsl#3; \
+	ldr ra,  [RTAB, ra]; \
+	\
+	eor rnb, rnb, RT1, ror #24; \
+	and RT1, RMASK, rb, lsr#(8 - 3); \
+	eor rnc, rnc, RT2, ror #16; \
+	and RT2, RMASK, rb, lsr#(16 - 3); \
+	eor rnd, rnd, ra, ror #8; \
+	ldr RT0, [RTAB, RT0]; \
+	and rb,  RMASK, rb, lsr#(24 - 3); \
+	\
+	ldr RT1, [RTAB, RT1]; \
+	eor rnb, rnb, RT0; \
+	ldr RT2, [RTAB, RT2]; \
+	and RT0, RMASK, rc, lsl#3; \
+	ldr rb,  [RTAB, rb]; \
+	\
+	eor rnc, rnc, RT1, ror #24; \
+	and RT1, RMASK, rc, lsr#(8 - 3); \
+	eor rnd, rnd, RT2, ror #16; \
+	and RT2, RMASK, rc, lsr#(16 - 3); \
+	eor rna, rna, rb, ror #8; \
+	ldr RT0, [RTAB, RT0]; \
+	and rc,  RMASK, rc, lsr#(24 - 3); \
+	\
+	ldr RT1, [RTAB, RT1]; \
+	eor rnc, rnc, RT0; \
+	ldr RT2, [RTAB, RT2]; \
+	and RT0, RMASK, rd, lsl#3; \
+	ldr rc,  [RTAB, rc]; \
+	\
+	eor rnd, rnd, RT1, ror #24; \
+	and RT1, RMASK, rd, lsr#(8 - 3); \
+	eor rna, rna, RT2, ror #16; \
+	and RT2, RMASK, rd, lsr#(16 - 3); \
+	eor rnb, rnb, rc, ror #8; \
+	ldr RT0, [RTAB, RT0]; \
+	and rd,  RMASK, rd, lsr#(24 - 3); \
+	\
+	ldr RT1, [RTAB, RT1]; \
+	eor rnd, rnd, RT0; \
+	ldr RT2, [RTAB, RT2]; \
+	eor rna, rna, RT1, ror #24; \
+	ldr rd,  [RTAB, rd]; \
+	\
+	eor rnb, rnb, RT2, ror #16; \
+	preload_key((next_r) - 1, ra); \
+	eor rnc, rnc, rd, ror #8;
+
+#define do_lastdecround(ra, rb, rc, rd, rna, rnb, rnc, rnd) \
+	and RT0, RMASK, ra, lsl#3; \
+	and RT1, RMASK, ra, lsr#(8 - 3); \
+	and RT2, RMASK, ra, lsr#(16 - 3); \
+	ldr rna, [RTAB, RT0]; \
+	and ra,  RMASK, ra, lsr#(24 - 3); \
+	ldr rnb, [RTAB, RT1]; \
+	and RT0, RMASK, rb, lsl#3; \
+	ldr rnc, [RTAB, RT2]; \
+	mov rnb, rnb, ror #24; \
+	ldr rnd, [RTAB, ra]; \
+	and RT1, RMASK, rb, lsr#(8 - 3); \
+	mov rnc, rnc, ror #16; \
+	and RT2, RMASK, rb, lsr#(16 - 3); \
+	mov rnd, rnd, ror #8; \
+	ldr RT0, [RTAB, RT0]; \
+	and rb,  RMASK, rb, lsr#(24 - 3); \
+	ldr RT1, [RTAB, RT1]; \
+	\
+	orr rnb, rnb, RT0; \
+	ldr RT2, [RTAB, RT2]; \
+	and RT0, RMASK, rc, lsl#3; \
+	ldr rb,  [RTAB, rb]; \
+	orr rnc, rnc, RT1, ror #24; \
+	and RT1, RMASK, rc, lsr#(8 - 3); \
+	orr rnd, rnd, RT2, ror #16; \
+	and RT2, RMASK, rc, lsr#(16 - 3); \
+	orr rna, rna, rb, ror #8; \
+	ldr RT0, [RTAB, RT0]; \
+	and rc,  RMASK, rc, lsr#(24 - 3); \
+	ldr RT1, [RTAB, RT1]; \
+	\
+	orr rnc, rnc, RT0; \
+	ldr RT2, [RTAB, RT2]; \
+	and RT0, RMASK, rd, lsl#3; \
+	ldr rc,  [RTAB, rc]; \
+	orr rnd, rnd, RT1, ror #24; \
+	and RT1, RMASK, rd, lsr#(8 - 3); \
+	orr rna, rna, RT2, ror #16; \
+	ldr RT0, [RTAB, RT0]; \
+	and RT2, RMASK, rd, lsr#(16 - 3); \
+	ldr RT1, [RTAB, RT1]; \
+	orr rnb, rnb, rc, ror #8; \
+	ldr RT2, [RTAB, RT2]; \
+	and rd,  RMASK, rd, lsr#(24 - 3); \
+	ldr rd,  [RTAB, rd]; \
+	\
+	orr rnd, rnd, RT0; \
+	orr rna, rna, RT1, ror #24; \
+	orr rnb, rnb, RT2, ror #16; \
+	orr rnc, rnc, rd, ror #8;
+
+#define firstdecround(round, ra, rb, rc, rd, rna, rnb, rnc, rnd) \
+	addroundkey_dec(((round) + 1), ra, rb, rc, rd, rna, rnb, rnc, rnd); \
+	do_decround(round, ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_first_key);
+
+#define decround(round, ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_key) \
+	do_decround(round, ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_key);
+
+#define lastdecround(round, ra, rb, rc, rd, rna, rnb, rnc, rnd) \
+	add RTAB, #4; \
+	do_lastdecround(ra, rb, rc, rd, rna, rnb, rnc, rnd); \
+	addroundkey(rna, rnb, rnc, rnd, ra, rb, rc, rd, dummy);
+
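+/* Decryption mirrors the encryption flow above: the key schedule is
+ * walked backwards and the inverse table .LtableD0 is used, with the
+ * inverse S-box at offset 4 ('add RTAB, #4') for the final round. */
+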
+.align 3
+.global _gcry_aes_arm_decrypt_block
+.type   _gcry_aes_arm_decrypt_block,%function;
+
+_gcry_aes_arm_decrypt_block:
+	/* input:
+	 *	%r0: keysched, CTX
+	 *	%r1: dst
+	 *	%r2: src
+	 *	%r3: number of rounds: 10, 12 or 14
+	 */
+	push {%r4-%r11, %ip, %lr};
+
+	/* read input block */
+#ifndef __ARM_FEATURE_UNALIGNED
+	/* test if src is unaligned */
+	tst	%r2, #3;
+	beq	1f;
+
+	/* unaligned load */
+	ldr_unaligned_le(RA, %r2, 0, RNA);
+	ldr_unaligned_le(RB, %r2, 4, RNB);
+	ldr_unaligned_le(RC, %r2, 8, RNA);
+	ldr_unaligned_le(RD, %r2, 12, RNB);
+	b	2f;
+.ltorg
+1:
+#endif
+	/* aligned load */
+	ldm	%r2, {RA, RB, RC, RD};
+#ifndef __ARMEL__
+	rev	RA, RA;
+	rev	RB, RB;
+	rev	RC, RC;
+	rev	RD, RD;
+#endif
+2:
+	sub	%sp, #16;
+
+	ldr	RTAB, =.LtableD0;
+
+	mov	RMASK, #0xff;
+	str	%r1, [%sp, #4];		/* dst */
+	mov	RMASK, RMASK, lsl#3;	/* byte mask */
+
+	cmp	%r3, #12;
+	bge	.Ldec_256;
+
+	firstdecround(9, RA, RB, RC, RD, RNA, RNB, RNC, RND);
+.Ldec_tail:
+	decround(8, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+	decround(7, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
+	decround(6, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+	decround(5, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
+	decround(4, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+	decround(3, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
+	decround(2, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+	decround(1, RA, RB, RC, RD, RNA, RNB, RNC, RND, dummy);
+	lastdecround(0, RNA, RNB, RNC, RND, RA, RB, RC, RD);
+
+	ldr	RT0, [%sp, #4];		/* dst */
+	add	%sp, #16;
+
+	/* store output block */
+#ifndef __ARM_FEATURE_UNALIGNED
+	/* test if dst is unaligned */
+	tst	RT0, #3;
+	beq	1f;
+
+	/* unaligned store */
+	str_unaligned_le(RA, RT0, 0, RNA, RNB);
+	str_unaligned_le(RB, RT0, 4, RNA, RNB);
+	str_unaligned_le(RC, RT0, 8, RNA, RNB);
+	str_unaligned_le(RD, RT0, 12, RNA, RNB);
+	b	2f;
+.ltorg
+1:
+#endif
+	/* aligned store */
+#ifndef __ARMEL__
+	rev	RA, RA;
+	rev	RB, RB;
+	rev	RC, RC;
+	rev	RD, RD;
+#endif
+	/* write output block */
+	stm	RT0, {RA, RB, RC, RD};
+2:
+	pop {%r4-%r11, %ip, %pc};
+
+.ltorg
+.Ldec_256:
+	beq .Ldec_192;
+
+	firstdecround(13, RA, RB, RC, RD, RNA, RNB, RNC, RND);
+	decround(12, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+	decround(11, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
+	decround(10, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+	decround(9, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
+
+	b .Ldec_tail;
+
+.ltorg
+.Ldec_192:
+	firstdecround(11, RA, RB, RC, RD, RNA, RNB, RNC, RND);
+	decround(10, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+	decround(9, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
+
+	b .Ldec_tail;
+.size _gcry_aes_arm_decrypt_block,.-_gcry_aes_arm_decrypt_block;
+
+.data
+
+/* Encryption tables */
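+/* Two logical tables are interleaved below with an 8-byte stride: the
+ * 32-bit T0 words (.LtableE0) and the zero-extended S-box bytes
+ * (.LtableEs0, at offset 4).  This is why the round code masks indices
+ * with (0xff << 3) and the last round adds 4 to RTAB. */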
+.align 5
+.type .LtableE0, %object
+.type .LtableEs0, %object
+.LtableE0:
+.long 0xa56363c6
+.LtableEs0:
+.long             0x00000063, 0x847c7cf8, 0x0000007c
+.long 0x997777ee, 0x00000077, 0x8d7b7bf6, 0x0000007b
+.long 0x0df2f2ff, 0x000000f2, 0xbd6b6bd6, 0x0000006b
+.long 0xb16f6fde, 0x0000006f, 0x54c5c591, 0x000000c5
+.long 0x50303060, 0x00000030, 0x03010102, 0x00000001
+.long 0xa96767ce, 0x00000067, 0x7d2b2b56, 0x0000002b
+.long 0x19fefee7, 0x000000fe, 0x62d7d7b5, 0x000000d7
+.long 0xe6abab4d, 0x000000ab, 0x9a7676ec, 0x00000076
+.long 0x45caca8f, 0x000000ca, 0x9d82821f, 0x00000082
+.long 0x40c9c989, 0x000000c9, 0x877d7dfa, 0x0000007d
+.long 0x15fafaef, 0x000000fa, 0xeb5959b2, 0x00000059
+.long 0xc947478e, 0x00000047, 0x0bf0f0fb, 0x000000f0
+.long 0xecadad41, 0x000000ad, 0x67d4d4b3, 0x000000d4
+.long 0xfda2a25f, 0x000000a2, 0xeaafaf45, 0x000000af
+.long 0xbf9c9c23, 0x0000009c, 0xf7a4a453, 0x000000a4
+.long 0x967272e4, 0x00000072, 0x5bc0c09b, 0x000000c0
+.long 0xc2b7b775, 0x000000b7, 0x1cfdfde1, 0x000000fd
+.long 0xae93933d, 0x00000093, 0x6a26264c, 0x00000026
+.long 0x5a36366c, 0x00000036, 0x413f3f7e, 0x0000003f
+.long 0x02f7f7f5, 0x000000f7, 0x4fcccc83, 0x000000cc
+.long 0x5c343468, 0x00000034, 0xf4a5a551, 0x000000a5
+.long 0x34e5e5d1, 0x000000e5, 0x08f1f1f9, 0x000000f1
+.long 0x937171e2, 0x00000071, 0x73d8d8ab, 0x000000d8
+.long 0x53313162, 0x00000031, 0x3f15152a, 0x00000015
+.long 0x0c040408, 0x00000004, 0x52c7c795, 0x000000c7
+.long 0x65232346, 0x00000023, 0x5ec3c39d, 0x000000c3
+.long 0x28181830, 0x00000018, 0xa1969637, 0x00000096
+.long 0x0f05050a, 0x00000005, 0xb59a9a2f, 0x0000009a
+.long 0x0907070e, 0x00000007, 0x36121224, 0x00000012
+.long 0x9b80801b, 0x00000080, 0x3de2e2df, 0x000000e2
+.long 0x26ebebcd, 0x000000eb, 0x6927274e, 0x00000027
+.long 0xcdb2b27f, 0x000000b2, 0x9f7575ea, 0x00000075
+.long 0x1b090912, 0x00000009, 0x9e83831d, 0x00000083
+.long 0x742c2c58, 0x0000002c, 0x2e1a1a34, 0x0000001a
+.long 0x2d1b1b36, 0x0000001b, 0xb26e6edc, 0x0000006e
+.long 0xee5a5ab4, 0x0000005a, 0xfba0a05b, 0x000000a0
+.long 0xf65252a4, 0x00000052, 0x4d3b3b76, 0x0000003b
+.long 0x61d6d6b7, 0x000000d6, 0xceb3b37d, 0x000000b3
+.long 0x7b292952, 0x00000029, 0x3ee3e3dd, 0x000000e3
+.long 0x712f2f5e, 0x0000002f, 0x97848413, 0x00000084
+.long 0xf55353a6, 0x00000053, 0x68d1d1b9, 0x000000d1
+.long 0x00000000, 0x00000000, 0x2cededc1, 0x000000ed
+.long 0x60202040, 0x00000020, 0x1ffcfce3, 0x000000fc
+.long 0xc8b1b179, 0x000000b1, 0xed5b5bb6, 0x0000005b
+.long 0xbe6a6ad4, 0x0000006a, 0x46cbcb8d, 0x000000cb
+.long 0xd9bebe67, 0x000000be, 0x4b393972, 0x00000039
+.long 0xde4a4a94, 0x0000004a, 0xd44c4c98, 0x0000004c
+.long 0xe85858b0, 0x00000058, 0x4acfcf85, 0x000000cf
+.long 0x6bd0d0bb, 0x000000d0, 0x2aefefc5, 0x000000ef
+.long 0xe5aaaa4f, 0x000000aa, 0x16fbfbed, 0x000000fb
+.long 0xc5434386, 0x00000043, 0xd74d4d9a, 0x0000004d
+.long 0x55333366, 0x00000033, 0x94858511, 0x00000085
+.long 0xcf45458a, 0x00000045, 0x10f9f9e9, 0x000000f9
+.long 0x06020204, 0x00000002, 0x817f7ffe, 0x0000007f
+.long 0xf05050a0, 0x00000050, 0x443c3c78, 0x0000003c
+.long 0xba9f9f25, 0x0000009f, 0xe3a8a84b, 0x000000a8
+.long 0xf35151a2, 0x00000051, 0xfea3a35d, 0x000000a3
+.long 0xc0404080, 0x00000040, 0x8a8f8f05, 0x0000008f
+.long 0xad92923f, 0x00000092, 0xbc9d9d21, 0x0000009d
+.long 0x48383870, 0x00000038, 0x04f5f5f1, 0x000000f5
+.long 0xdfbcbc63, 0x000000bc, 0xc1b6b677, 0x000000b6
+.long 0x75dadaaf, 0x000000da, 0x63212142, 0x00000021
+.long 0x30101020, 0x00000010, 0x1affffe5, 0x000000ff
+.long 0x0ef3f3fd, 0x000000f3, 0x6dd2d2bf, 0x000000d2
+.long 0x4ccdcd81, 0x000000cd, 0x140c0c18, 0x0000000c
+.long 0x35131326, 0x00000013, 0x2fececc3, 0x000000ec
+.long 0xe15f5fbe, 0x0000005f, 0xa2979735, 0x00000097
+.long 0xcc444488, 0x00000044, 0x3917172e, 0x00000017
+.long 0x57c4c493, 0x000000c4, 0xf2a7a755, 0x000000a7
+.long 0x827e7efc, 0x0000007e, 0x473d3d7a, 0x0000003d
+.long 0xac6464c8, 0x00000064, 0xe75d5dba, 0x0000005d
+.long 0x2b191932, 0x00000019, 0x957373e6, 0x00000073
+.long 0xa06060c0, 0x00000060, 0x98818119, 0x00000081
+.long 0xd14f4f9e, 0x0000004f, 0x7fdcdca3, 0x000000dc
+.long 0x66222244, 0x00000022, 0x7e2a2a54, 0x0000002a
+.long 0xab90903b, 0x00000090, 0x8388880b, 0x00000088
+.long 0xca46468c, 0x00000046, 0x29eeeec7, 0x000000ee
+.long 0xd3b8b86b, 0x000000b8, 0x3c141428, 0x00000014
+.long 0x79dedea7, 0x000000de, 0xe25e5ebc, 0x0000005e
+.long 0x1d0b0b16, 0x0000000b, 0x76dbdbad, 0x000000db
+.long 0x3be0e0db, 0x000000e0, 0x56323264, 0x00000032
+.long 0x4e3a3a74, 0x0000003a, 0x1e0a0a14, 0x0000000a
+.long 0xdb494992, 0x00000049, 0x0a06060c, 0x00000006
+.long 0x6c242448, 0x00000024, 0xe45c5cb8, 0x0000005c
+.long 0x5dc2c29f, 0x000000c2, 0x6ed3d3bd, 0x000000d3
+.long 0xefacac43, 0x000000ac, 0xa66262c4, 0x00000062
+.long 0xa8919139, 0x00000091, 0xa4959531, 0x00000095
+.long 0x37e4e4d3, 0x000000e4, 0x8b7979f2, 0x00000079
+.long 0x32e7e7d5, 0x000000e7, 0x43c8c88b, 0x000000c8
+.long 0x5937376e, 0x00000037, 0xb76d6dda, 0x0000006d
+.long 0x8c8d8d01, 0x0000008d, 0x64d5d5b1, 0x000000d5
+.long 0xd24e4e9c, 0x0000004e, 0xe0a9a949, 0x000000a9
+.long 0xb46c6cd8, 0x0000006c, 0xfa5656ac, 0x00000056
+.long 0x07f4f4f3, 0x000000f4, 0x25eaeacf, 0x000000ea
+.long 0xaf6565ca, 0x00000065, 0x8e7a7af4, 0x0000007a
+.long 0xe9aeae47, 0x000000ae, 0x18080810, 0x00000008
+.long 0xd5baba6f, 0x000000ba, 0x887878f0, 0x00000078
+.long 0x6f25254a, 0x00000025, 0x722e2e5c, 0x0000002e
+.long 0x241c1c38, 0x0000001c, 0xf1a6a657, 0x000000a6
+.long 0xc7b4b473, 0x000000b4, 0x51c6c697, 0x000000c6
+.long 0x23e8e8cb, 0x000000e8, 0x7cdddda1, 0x000000dd
+.long 0x9c7474e8, 0x00000074, 0x211f1f3e, 0x0000001f
+.long 0xdd4b4b96, 0x0000004b, 0xdcbdbd61, 0x000000bd
+.long 0x868b8b0d, 0x0000008b, 0x858a8a0f, 0x0000008a
+.long 0x907070e0, 0x00000070, 0x423e3e7c, 0x0000003e
+.long 0xc4b5b571, 0x000000b5, 0xaa6666cc, 0x00000066
+.long 0xd8484890, 0x00000048, 0x05030306, 0x00000003
+.long 0x01f6f6f7, 0x000000f6, 0x120e0e1c, 0x0000000e
+.long 0xa36161c2, 0x00000061, 0x5f35356a, 0x00000035
+.long 0xf95757ae, 0x00000057, 0xd0b9b969, 0x000000b9
+.long 0x91868617, 0x00000086, 0x58c1c199, 0x000000c1
+.long 0x271d1d3a, 0x0000001d, 0xb99e9e27, 0x0000009e
+.long 0x38e1e1d9, 0x000000e1, 0x13f8f8eb, 0x000000f8
+.long 0xb398982b, 0x00000098, 0x33111122, 0x00000011
+.long 0xbb6969d2, 0x00000069, 0x70d9d9a9, 0x000000d9
+.long 0x898e8e07, 0x0000008e, 0xa7949433, 0x00000094
+.long 0xb69b9b2d, 0x0000009b, 0x221e1e3c, 0x0000001e
+.long 0x92878715, 0x00000087, 0x20e9e9c9, 0x000000e9
+.long 0x49cece87, 0x000000ce, 0xff5555aa, 0x00000055
+.long 0x78282850, 0x00000028, 0x7adfdfa5, 0x000000df
+.long 0x8f8c8c03, 0x0000008c, 0xf8a1a159, 0x000000a1
+.long 0x80898909, 0x00000089, 0x170d0d1a, 0x0000000d
+.long 0xdabfbf65, 0x000000bf, 0x31e6e6d7, 0x000000e6
+.long 0xc6424284, 0x00000042, 0xb86868d0, 0x00000068
+.long 0xc3414182, 0x00000041, 0xb0999929, 0x00000099
+.long 0x772d2d5a, 0x0000002d, 0x110f0f1e, 0x0000000f
+.long 0xcbb0b07b, 0x000000b0, 0xfc5454a8, 0x00000054
+.long 0xd6bbbb6d, 0x000000bb, 0x3a16162c, 0x00000016
+
+/* Decryption tables */
+.align 5
+.type .LtableD0, %object
+.type .LtableDs0, %object
+.LtableD0:
+.long 0x50a7f451
+.LtableDs0:
+.long             0x00000052, 0x5365417e, 0x00000009
+.long 0xc3a4171a, 0x0000006a, 0x965e273a, 0x000000d5
+.long 0xcb6bab3b, 0x00000030, 0xf1459d1f, 0x00000036
+.long 0xab58faac, 0x000000a5, 0x9303e34b, 0x00000038
+.long 0x55fa3020, 0x000000bf, 0xf66d76ad, 0x00000040
+.long 0x9176cc88, 0x000000a3, 0x254c02f5, 0x0000009e
+.long 0xfcd7e54f, 0x00000081, 0xd7cb2ac5, 0x000000f3
+.long 0x80443526, 0x000000d7, 0x8fa362b5, 0x000000fb
+.long 0x495ab1de, 0x0000007c, 0x671bba25, 0x000000e3
+.long 0x980eea45, 0x00000039, 0xe1c0fe5d, 0x00000082
+.long 0x02752fc3, 0x0000009b, 0x12f04c81, 0x0000002f
+.long 0xa397468d, 0x000000ff, 0xc6f9d36b, 0x00000087
+.long 0xe75f8f03, 0x00000034, 0x959c9215, 0x0000008e
+.long 0xeb7a6dbf, 0x00000043, 0xda595295, 0x00000044
+.long 0x2d83bed4, 0x000000c4, 0xd3217458, 0x000000de
+.long 0x2969e049, 0x000000e9, 0x44c8c98e, 0x000000cb
+.long 0x6a89c275, 0x00000054, 0x78798ef4, 0x0000007b
+.long 0x6b3e5899, 0x00000094, 0xdd71b927, 0x00000032
+.long 0xb64fe1be, 0x000000a6, 0x17ad88f0, 0x000000c2
+.long 0x66ac20c9, 0x00000023, 0xb43ace7d, 0x0000003d
+.long 0x184adf63, 0x000000ee, 0x82311ae5, 0x0000004c
+.long 0x60335197, 0x00000095, 0x457f5362, 0x0000000b
+.long 0xe07764b1, 0x00000042, 0x84ae6bbb, 0x000000fa
+.long 0x1ca081fe, 0x000000c3, 0x942b08f9, 0x0000004e
+.long 0x58684870, 0x00000008, 0x19fd458f, 0x0000002e
+.long 0x876cde94, 0x000000a1, 0xb7f87b52, 0x00000066
+.long 0x23d373ab, 0x00000028, 0xe2024b72, 0x000000d9
+.long 0x578f1fe3, 0x00000024, 0x2aab5566, 0x000000b2
+.long 0x0728ebb2, 0x00000076, 0x03c2b52f, 0x0000005b
+.long 0x9a7bc586, 0x000000a2, 0xa50837d3, 0x00000049
+.long 0xf2872830, 0x0000006d, 0xb2a5bf23, 0x0000008b
+.long 0xba6a0302, 0x000000d1, 0x5c8216ed, 0x00000025
+.long 0x2b1ccf8a, 0x00000072, 0x92b479a7, 0x000000f8
+.long 0xf0f207f3, 0x000000f6, 0xa1e2694e, 0x00000064
+.long 0xcdf4da65, 0x00000086, 0xd5be0506, 0x00000068
+.long 0x1f6234d1, 0x00000098, 0x8afea6c4, 0x00000016
+.long 0x9d532e34, 0x000000d4, 0xa055f3a2, 0x000000a4
+.long 0x32e18a05, 0x0000005c, 0x75ebf6a4, 0x000000cc
+.long 0x39ec830b, 0x0000005d, 0xaaef6040, 0x00000065
+.long 0x069f715e, 0x000000b6, 0x51106ebd, 0x00000092
+.long 0xf98a213e, 0x0000006c, 0x3d06dd96, 0x00000070
+.long 0xae053edd, 0x00000048, 0x46bde64d, 0x00000050
+.long 0xb58d5491, 0x000000fd, 0x055dc471, 0x000000ed
+.long 0x6fd40604, 0x000000b9, 0xff155060, 0x000000da
+.long 0x24fb9819, 0x0000005e, 0x97e9bdd6, 0x00000015
+.long 0xcc434089, 0x00000046, 0x779ed967, 0x00000057
+.long 0xbd42e8b0, 0x000000a7, 0x888b8907, 0x0000008d
+.long 0x385b19e7, 0x0000009d, 0xdbeec879, 0x00000084
+.long 0x470a7ca1, 0x00000090, 0xe90f427c, 0x000000d8
+.long 0xc91e84f8, 0x000000ab, 0x00000000, 0x00000000
+.long 0x83868009, 0x0000008c, 0x48ed2b32, 0x000000bc
+.long 0xac70111e, 0x000000d3, 0x4e725a6c, 0x0000000a
+.long 0xfbff0efd, 0x000000f7, 0x5638850f, 0x000000e4
+.long 0x1ed5ae3d, 0x00000058, 0x27392d36, 0x00000005
+.long 0x64d90f0a, 0x000000b8, 0x21a65c68, 0x000000b3
+.long 0xd1545b9b, 0x00000045, 0x3a2e3624, 0x00000006
+.long 0xb1670a0c, 0x000000d0, 0x0fe75793, 0x0000002c
+.long 0xd296eeb4, 0x0000001e, 0x9e919b1b, 0x0000008f
+.long 0x4fc5c080, 0x000000ca, 0xa220dc61, 0x0000003f
+.long 0x694b775a, 0x0000000f, 0x161a121c, 0x00000002
+.long 0x0aba93e2, 0x000000c1, 0xe52aa0c0, 0x000000af
+.long 0x43e0223c, 0x000000bd, 0x1d171b12, 0x00000003
+.long 0x0b0d090e, 0x00000001, 0xadc78bf2, 0x00000013
+.long 0xb9a8b62d, 0x0000008a, 0xc8a91e14, 0x0000006b
+.long 0x8519f157, 0x0000003a, 0x4c0775af, 0x00000091
+.long 0xbbdd99ee, 0x00000011, 0xfd607fa3, 0x00000041
+.long 0x9f2601f7, 0x0000004f, 0xbcf5725c, 0x00000067
+.long 0xc53b6644, 0x000000dc, 0x347efb5b, 0x000000ea
+.long 0x7629438b, 0x00000097, 0xdcc623cb, 0x000000f2
+.long 0x68fcedb6, 0x000000cf, 0x63f1e4b8, 0x000000ce
+.long 0xcadc31d7, 0x000000f0, 0x10856342, 0x000000b4
+.long 0x40229713, 0x000000e6, 0x2011c684, 0x00000073
+.long 0x7d244a85, 0x00000096, 0xf83dbbd2, 0x000000ac
+.long 0x1132f9ae, 0x00000074, 0x6da129c7, 0x00000022
+.long 0x4b2f9e1d, 0x000000e7, 0xf330b2dc, 0x000000ad
+.long 0xec52860d, 0x00000035, 0xd0e3c177, 0x00000085
+.long 0x6c16b32b, 0x000000e2, 0x99b970a9, 0x000000f9
+.long 0xfa489411, 0x00000037, 0x2264e947, 0x000000e8
+.long 0xc48cfca8, 0x0000001c, 0x1a3ff0a0, 0x00000075
+.long 0xd82c7d56, 0x000000df, 0xef903322, 0x0000006e
+.long 0xc74e4987, 0x00000047, 0xc1d138d9, 0x000000f1
+.long 0xfea2ca8c, 0x0000001a, 0x360bd498, 0x00000071
+.long 0xcf81f5a6, 0x0000001d, 0x28de7aa5, 0x00000029
+.long 0x268eb7da, 0x000000c5, 0xa4bfad3f, 0x00000089
+.long 0xe49d3a2c, 0x0000006f, 0x0d927850, 0x000000b7
+.long 0x9bcc5f6a, 0x00000062, 0x62467e54, 0x0000000e
+.long 0xc2138df6, 0x000000aa, 0xe8b8d890, 0x00000018
+.long 0x5ef7392e, 0x000000be, 0xf5afc382, 0x0000001b
+.long 0xbe805d9f, 0x000000fc, 0x7c93d069, 0x00000056
+.long 0xa92dd56f, 0x0000003e, 0xb31225cf, 0x0000004b
+.long 0x3b99acc8, 0x000000c6, 0xa77d1810, 0x000000d2
+.long 0x6e639ce8, 0x00000079, 0x7bbb3bdb, 0x00000020
+.long 0x097826cd, 0x0000009a, 0xf418596e, 0x000000db
+.long 0x01b79aec, 0x000000c0, 0xa89a4f83, 0x000000fe
+.long 0x656e95e6, 0x00000078, 0x7ee6ffaa, 0x000000cd
+.long 0x08cfbc21, 0x0000005a, 0xe6e815ef, 0x000000f4
+.long 0xd99be7ba, 0x0000001f, 0xce366f4a, 0x000000dd
+.long 0xd4099fea, 0x000000a8, 0xd67cb029, 0x00000033
+.long 0xafb2a431, 0x00000088, 0x31233f2a, 0x00000007
+.long 0x3094a5c6, 0x000000c7, 0xc066a235, 0x00000031
+.long 0x37bc4e74, 0x000000b1, 0xa6ca82fc, 0x00000012
+.long 0xb0d090e0, 0x00000010, 0x15d8a733, 0x00000059
+.long 0x4a9804f1, 0x00000027, 0xf7daec41, 0x00000080
+.long 0x0e50cd7f, 0x000000ec, 0x2ff69117, 0x0000005f
+.long 0x8dd64d76, 0x00000060, 0x4db0ef43, 0x00000051
+.long 0x544daacc, 0x0000007f, 0xdf0496e4, 0x000000a9
+.long 0xe3b5d19e, 0x00000019, 0x1b886a4c, 0x000000b5
+.long 0xb81f2cc1, 0x0000004a, 0x7f516546, 0x0000000d
+.long 0x04ea5e9d, 0x0000002d, 0x5d358c01, 0x000000e5
+.long 0x737487fa, 0x0000007a, 0x2e410bfb, 0x0000009f
+.long 0x5a1d67b3, 0x00000093, 0x52d2db92, 0x000000c9
+.long 0x335610e9, 0x0000009c, 0x1347d66d, 0x000000ef
+.long 0x8c61d79a, 0x000000a0, 0x7a0ca137, 0x000000e0
+.long 0x8e14f859, 0x0000003b, 0x893c13eb, 0x0000004d
+.long 0xee27a9ce, 0x000000ae, 0x35c961b7, 0x0000002a
+.long 0xede51ce1, 0x000000f5, 0x3cb1477a, 0x000000b0
+.long 0x59dfd29c, 0x000000c8, 0x3f73f255, 0x000000eb
+.long 0x79ce1418, 0x000000bb, 0xbf37c773, 0x0000003c
+.long 0xeacdf753, 0x00000083, 0x5baafd5f, 0x00000053
+.long 0x146f3ddf, 0x00000099, 0x86db4478, 0x00000061
+.long 0x81f3afca, 0x00000017, 0x3ec468b9, 0x0000002b
+.long 0x2c342438, 0x00000004, 0x5f40a3c2, 0x0000007e
+.long 0x72c31d16, 0x000000ba, 0x0c25e2bc, 0x00000077
+.long 0x8b493c28, 0x000000d6, 0x41950dff, 0x00000026
+.long 0x7101a839, 0x000000e1, 0xdeb30c08, 0x00000069
+.long 0x9ce4b4d8, 0x00000014, 0x90c15664, 0x00000063
+.long 0x6184cb7b, 0x00000055, 0x70b632d5, 0x00000021
+.long 0x745c6c48, 0x0000000c, 0x4257b8d0, 0x0000007d
+
+#endif /*HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS*/
+#endif /*__ARMEL__*/
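
The encryption and decryption tables above use an interleaved layout: each
8-byte entry pairs a precomputed round-table word (.LtableE0/.LtableD0) with
the raw S-box value (.LtableEs0/.LtableDs0, at base + 4). That is why the
round macros index with an 8-byte stride and why the last round switches
columns by simply adding 4 to the table pointer. In rough C terms
(illustrative names only, not code from this patch):

    #include <stdint.h>

    struct aes_tab_entry {
      uint32_t t;    /* mixed-columns word, as at .LtableE0      */
      uint32_t sbox; /* plain S-box value, as at .LtableEs0 (+4) */
    };

    extern const struct aes_tab_entry tab_e0[256]; /* illustrative */

    /* Main rounds: 8-byte stride, i.e. "index = byte << 3". */
    static inline uint32_t t_lookup (uint8_t b) { return tab_e0[b].t; }

    /* Last round: same index, +4 byte offset ("add RTAB, #4"). */
    static inline uint32_t s_lookup (uint8_t b) { return tab_e0[b].sbox; }
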
diff --git a/cipher/rijndael-armv6.S b/cipher/rijndael-armv6.S
deleted file mode 100644
index bbbfb0e..0000000
--- a/cipher/rijndael-armv6.S
+++ /dev/null
@@ -1,853 +0,0 @@
-/* rijndael-armv6.S  -  ARM assembly implementation of AES cipher
- *
- * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna at iki.fi>
- *
- * This file is part of Libgcrypt.
- *
- * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as
- * published by the Free Software Foundation; either version 2.1 of
- * the License, or (at your option) any later version.
- *
- * Libgcrypt is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <config.h>
-
-#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__)
-#ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS
-
-.text
-
-.syntax unified
-.arm
-
-/* register macros */
-#define CTX	%r0
-#define RTAB	%lr
-#define RMASK	%ip
-
-#define RA	%r4
-#define RB	%r5
-#define RC	%r6
-#define RD	%r7
-
-#define RNA	%r8
-#define RNB	%r9
-#define RNC	%r10
-#define RND	%r11
-
-#define RT0	%r1
-#define RT1	%r2
-#define RT2	%r3
-
-/* helper macros */
-#define ldr_unaligned_le(rout, rsrc, offs, rtmp) \
-	ldrb rout, [rsrc, #((offs) + 0)]; \
-	ldrb rtmp, [rsrc, #((offs) + 1)]; \
-	orr rout, rout, rtmp, lsl #8; \
-	ldrb rtmp, [rsrc, #((offs) + 2)]; \
-	orr rout, rout, rtmp, lsl #16; \
-	ldrb rtmp, [rsrc, #((offs) + 3)]; \
-	orr rout, rout, rtmp, lsl #24;
-
-#define str_unaligned_le(rin, rdst, offs, rtmp0, rtmp1) \
-	mov rtmp0, rin, lsr #8; \
-	strb rin, [rdst, #((offs) + 0)]; \
-	mov rtmp1, rin, lsr #16; \
-	strb rtmp0, [rdst, #((offs) + 1)]; \
-	mov rtmp0, rin, lsr #24; \
-	strb rtmp1, [rdst, #((offs) + 2)]; \
-	strb rtmp0, [rdst, #((offs) + 3)];
-
-/***********************************************************************
- * ARM assembly implementation of the AES cipher
- ***********************************************************************/
-#define preload_first_key(round, ra) \
-	ldr ra, [CTX, #(((round) * 16) + 0 * 4)];
-
-#define dummy(round, ra) /* nothing */
-
-#define addroundkey(ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_key) \
-	ldm CTX, {rna, rnb, rnc, rnd}; \
-	eor ra, rna; \
-	eor rb, rnb; \
-	eor rc, rnc; \
-	preload_key(1, rna); \
-	eor rd, rnd;
-
-#define do_encround(next_r, ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_key) \
-	ldr rnb, [CTX, #(((next_r) * 16) + 1 * 4)]; \
-	\
-	and RT0, RMASK, ra, lsl#3; \
-	ldr rnc, [CTX, #(((next_r) * 16) + 2 * 4)]; \
-	and RT1, RMASK, ra, lsr#(8 - 3); \
-	ldr rnd, [CTX, #(((next_r) * 16) + 3 * 4)]; \
-	and RT2, RMASK, ra, lsr#(16 - 3); \
-	ldr RT0, [RTAB, RT0]; \
-	and ra,  RMASK, ra, lsr#(24 - 3); \
-	\
-	ldr RT1, [RTAB, RT1]; \
-	eor rna, rna, RT0; \
-	ldr RT2, [RTAB, RT2]; \
-	and RT0, RMASK, rd, lsl#3; \
-	ldr ra,  [RTAB, ra]; \
-	\
-	eor rnd, rnd, RT1, ror #24; \
-	and RT1, RMASK, rd, lsr#(8 - 3); \
-	eor rnc, rnc, RT2, ror #16; \
-	and RT2, RMASK, rd, lsr#(16 - 3); \
-	eor rnb, rnb, ra, ror #8; \
-	ldr RT0, [RTAB, RT0]; \
-	and rd,  RMASK, rd, lsr#(24 - 3); \
-	\
-	ldr RT1, [RTAB, RT1]; \
-	eor rnd, rnd, RT0; \
-	ldr RT2, [RTAB, RT2]; \
-	and RT0, RMASK, rc, lsl#3; \
-	ldr rd,  [RTAB, rd]; \
-	\
-	eor rnc, rnc, RT1, ror #24; \
-	and RT1, RMASK, rc, lsr#(8 - 3); \
-	eor rnb, rnb, RT2, ror #16; \
-	and RT2, RMASK, rc, lsr#(16 - 3); \
-	eor rna, rna, rd, ror #8; \
-	ldr RT0, [RTAB, RT0]; \
-	and rc,  RMASK, rc, lsr#(24 - 3); \
-	\
-	ldr RT1, [RTAB, RT1]; \
-	eor rnc, rnc, RT0; \
-	ldr RT2, [RTAB, RT2]; \
-	and RT0, RMASK, rb, lsl#3; \
-	ldr rc,  [RTAB, rc]; \
-	\
-	eor rnb, rnb, RT1, ror #24; \
-	and RT1, RMASK, rb, lsr#(8 - 3); \
-	eor rna, rna, RT2, ror #16; \
-	and RT2, RMASK, rb, lsr#(16 - 3); \
-	eor rnd, rnd, rc, ror #8; \
-	ldr RT0, [RTAB, RT0]; \
-	and rb,  RMASK, rb, lsr#(24 - 3); \
-	\
-	ldr RT1, [RTAB, RT1]; \
-	eor rnb, rnb, RT0; \
-	ldr RT2, [RTAB, RT2]; \
-	eor rna, rna, RT1, ror #24; \
-	ldr rb,  [RTAB, rb]; \
-	\
-	eor rnd, rnd, RT2, ror #16; \
-	preload_key((next_r) + 1, ra); \
-	eor rnc, rnc, rb, ror #8;
-
-#define do_lastencround(ra, rb, rc, rd, rna, rnb, rnc, rnd) \
-	and RT0, RMASK, ra, lsl#3; \
-	and RT1, RMASK, ra, lsr#(8 - 3); \
-	and RT2, RMASK, ra, lsr#(16 - 3); \
-	ldr rna, [RTAB, RT0]; \
-	and ra,  RMASK, ra, lsr#(24 - 3); \
-	ldr rnd, [RTAB, RT1]; \
-	and RT0, RMASK, rd, lsl#3; \
-	ldr rnc, [RTAB, RT2]; \
-	mov rnd, rnd, ror #24; \
-	ldr rnb, [RTAB, ra]; \
-	and RT1, RMASK, rd, lsr#(8 - 3); \
-	mov rnc, rnc, ror #16; \
-	and RT2, RMASK, rd, lsr#(16 - 3); \
-	mov rnb, rnb, ror #8; \
-	ldr RT0, [RTAB, RT0]; \
-	and rd,  RMASK, rd, lsr#(24 - 3); \
-	ldr RT1, [RTAB, RT1]; \
-	\
-	orr rnd, rnd, RT0; \
-	ldr RT2, [RTAB, RT2]; \
-	and RT0, RMASK, rc, lsl#3; \
-	ldr rd,  [RTAB, rd]; \
-	orr rnc, rnc, RT1, ror #24; \
-	and RT1, RMASK, rc, lsr#(8 - 3); \
-	orr rnb, rnb, RT2, ror #16; \
-	and RT2, RMASK, rc, lsr#(16 - 3); \
-	orr rna, rna, rd, ror #8; \
-	ldr RT0, [RTAB, RT0]; \
-	and rc,  RMASK, rc, lsr#(24 - 3); \
-	ldr RT1, [RTAB, RT1]; \
-	\
-	orr rnc, rnc, RT0; \
-	ldr RT2, [RTAB, RT2]; \
-	and RT0, RMASK, rb, lsl#3; \
-	ldr rc,  [RTAB, rc]; \
-	orr rnb, rnb, RT1, ror #24; \
-	and RT1, RMASK, rb, lsr#(8 - 3); \
-	orr rna, rna, RT2, ror #16; \
-	ldr RT0, [RTAB, RT0]; \
-	and RT2, RMASK, rb, lsr#(16 - 3); \
-	ldr RT1, [RTAB, RT1]; \
-	orr rnd, rnd, rc, ror #8; \
-	ldr RT2, [RTAB, RT2]; \
-	and rb,  RMASK, rb, lsr#(24 - 3); \
-	ldr rb,  [RTAB, rb]; \
-	\
-	orr rnb, rnb, RT0; \
-	orr rna, rna, RT1, ror #24; \
-	orr rnd, rnd, RT2, ror #16; \
-	orr rnc, rnc, rb, ror #8;
-
-#define firstencround(round, ra, rb, rc, rd, rna, rnb, rnc, rnd) \
-	addroundkey(ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_first_key); \
-	do_encround((round) + 1, ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_first_key);
-
-#define encround(round, ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_key) \
-	do_encround((round) + 1, ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_key);
-
-#define lastencround(round, ra, rb, rc, rd, rna, rnb, rnc, rnd) \
-	add CTX, #(((round) + 1) * 16); \
-	add RTAB, #4; \
-	do_lastencround(ra, rb, rc, rd, rna, rnb, rnc, rnd); \
-	addroundkey(rna, rnb, rnc, rnd, ra, rb, rc, rd, dummy);
-
-.align 3
-.global _gcry_aes_armv6_encrypt_block
-.type   _gcry_aes_armv6_encrypt_block,%function;
-
-_gcry_aes_armv6_encrypt_block:
-	/* input:
-	 *	%r0: keysched, CTX
-	 *	%r1: dst
-	 *	%r2: src
-	 *	%r3: number of rounds.. 10, 12 or 14
-	 */
-	push {%r4-%r11, %ip, %lr};
-
-	/* read input block */
-#ifndef __ARM_FEATURE_UNALIGNED
-	/* test if src is unaligned */
-	tst	%r2, #3;
-	beq	1f;
-
-	/* unaligned load */
-	ldr_unaligned_le(RA, %r2, 0, RNA);
-	ldr_unaligned_le(RB, %r2, 4, RNB);
-	ldr_unaligned_le(RC, %r2, 8, RNA);
-	ldr_unaligned_le(RD, %r2, 12, RNB);
-	b	2f;
-.ltorg
-1:
-#endif
-	/* aligned load */
-	ldm	%r2, {RA, RB, RC, RD};
-#ifndef __ARMEL__
-	rev	RA, RA;
-	rev	RB, RB;
-	rev	RC, RC;
-	rev	RD, RD;
-#endif
-2:
-	sub	%sp, #16;
-
-	ldr	RTAB, =.LtableE0;
-
-	str	%r1, [%sp, #4];		/* dst */
-	mov	RMASK, #0xff;
-	str	%r3, [%sp, #8];		/* nrounds */
-	mov	RMASK, RMASK, lsl#3;	/* byte mask */
-
-	firstencround(0, RA, RB, RC, RD, RNA, RNB, RNC, RND);
-	encround(1, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
-	encround(2, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
-	encround(3, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
-	encround(4, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
-	encround(5, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
-	encround(6, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
-	encround(7, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
-
-	ldr	RT0, [%sp, #8];		/* nrounds */
-	cmp	RT0, #12;
-	bge	.Lenc_not_128;
-
-	encround(8, RA, RB, RC, RD, RNA, RNB, RNC, RND, dummy);
-	lastencround(9, RNA, RNB, RNC, RND, RA, RB, RC, RD);
-
-.Lenc_done:
-	ldr	RT0, [%sp, #4];		/* dst */
-	add	%sp, #16;
-
-	/* store output block */
-#ifndef __ARM_FEATURE_UNALIGNED
-	/* test if dst is unaligned */
-	tst	RT0, #3;
-	beq	1f;
-
-	/* unaligned store */
-	str_unaligned_le(RA, RT0, 0, RNA, RNB);
-	str_unaligned_le(RB, RT0, 4, RNA, RNB);
-	str_unaligned_le(RC, RT0, 8, RNA, RNB);
-	str_unaligned_le(RD, RT0, 12, RNA, RNB);
-	b	2f;
-.ltorg
-1:
-#endif
-	/* aligned store */
-#ifndef __ARMEL__
-	rev	RA, RA;
-	rev	RB, RB;
-	rev	RC, RC;
-	rev	RD, RD;
-#endif
-	/* write output block */
-	stm	RT0, {RA, RB, RC, RD};
-2:
-	pop {%r4-%r11, %ip, %pc};
-
-.ltorg
-.Lenc_not_128:
-	beq .Lenc_192
-
-	encround(8, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
-	encround(9, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
-	encround(10, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
-	encround(11, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
-	encround(12, RA, RB, RC, RD, RNA, RNB, RNC, RND, dummy);
-	lastencround(13, RNA, RNB, RNC, RND, RA, RB, RC, RD);
-
-	b .Lenc_done;
-
-.ltorg
-.Lenc_192:
-	encround(8, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
-	encround(9, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
-	encround(10, RA, RB, RC, RD, RNA, RNB, RNC, RND, dummy);
-	lastencround(11, RNA, RNB, RNC, RND, RA, RB, RC, RD);
-
-	b .Lenc_done;
-.size _gcry_aes_armv6_encrypt_block,.-_gcry_aes_armv6_encrypt_block;
-
-#define addroundkey_dec(round, ra, rb, rc, rd, rna, rnb, rnc, rnd) \
-	ldr rna, [CTX, #(((round) * 16) + 0 * 4)]; \
-	ldr rnb, [CTX, #(((round) * 16) + 1 * 4)]; \
-	eor ra, rna; \
-	ldr rnc, [CTX, #(((round) * 16) + 2 * 4)]; \
-	eor rb, rnb; \
-	ldr rnd, [CTX, #(((round) * 16) + 3 * 4)]; \
-	eor rc, rnc; \
-	preload_first_key((round) - 1, rna); \
-	eor rd, rnd;
-
-#define do_decround(next_r, ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_key) \
-	ldr rnb, [CTX, #(((next_r) * 16) + 1 * 4)]; \
-	\
-	and RT0, RMASK, ra, lsl#3; \
-	ldr rnc, [CTX, #(((next_r) * 16) + 2 * 4)]; \
-	and RT1, RMASK, ra, lsr#(8 - 3); \
-	ldr rnd, [CTX, #(((next_r) * 16) + 3 * 4)]; \
-	and RT2, RMASK, ra, lsr#(16 - 3); \
-	ldr RT0, [RTAB, RT0]; \
-	and ra,  RMASK, ra, lsr#(24 - 3); \
-	\
-	ldr RT1, [RTAB, RT1]; \
-	eor rna, rna, RT0; \
-	ldr RT2, [RTAB, RT2]; \
-	and RT0, RMASK, rb, lsl#3; \
-	ldr ra,  [RTAB, ra]; \
-	\
-	eor rnb, rnb, RT1, ror #24; \
-	and RT1, RMASK, rb, lsr#(8 - 3); \
-	eor rnc, rnc, RT2, ror #16; \
-	and RT2, RMASK, rb, lsr#(16 - 3); \
-	eor rnd, rnd, ra, ror #8; \
-	ldr RT0, [RTAB, RT0]; \
-	and rb,  RMASK, rb, lsr#(24 - 3); \
-	\
-	ldr RT1, [RTAB, RT1]; \
-	eor rnb, rnb, RT0; \
-	ldr RT2, [RTAB, RT2]; \
-	and RT0, RMASK, rc, lsl#3; \
-	ldr rb,  [RTAB, rb]; \
-	\
-	eor rnc, rnc, RT1, ror #24; \
-	and RT1, RMASK, rc, lsr#(8 - 3); \
-	eor rnd, rnd, RT2, ror #16; \
-	and RT2, RMASK, rc, lsr#(16 - 3); \
-	eor rna, rna, rb, ror #8; \
-	ldr RT0, [RTAB, RT0]; \
-	and rc,  RMASK, rc, lsr#(24 - 3); \
-	\
-	ldr RT1, [RTAB, RT1]; \
-	eor rnc, rnc, RT0; \
-	ldr RT2, [RTAB, RT2]; \
-	and RT0, RMASK, rd, lsl#3; \
-	ldr rc,  [RTAB, rc]; \
-	\
-	eor rnd, rnd, RT1, ror #24; \
-	and RT1, RMASK, rd, lsr#(8 - 3); \
-	eor rna, rna, RT2, ror #16; \
-	and RT2, RMASK, rd, lsr#(16 - 3); \
-	eor rnb, rnb, rc, ror #8; \
-	ldr RT0, [RTAB, RT0]; \
-	and rd,  RMASK, rd, lsr#(24 - 3); \
-	\
-	ldr RT1, [RTAB, RT1]; \
-	eor rnd, rnd, RT0; \
-	ldr RT2, [RTAB, RT2]; \
-	eor rna, rna, RT1, ror #24; \
-	ldr rd,  [RTAB, rd]; \
-	\
-	eor rnb, rnb, RT2, ror #16; \
-	preload_key((next_r) - 1, ra); \
-	eor rnc, rnc, rd, ror #8;
-
-#define do_lastdecround(ra, rb, rc, rd, rna, rnb, rnc, rnd) \
-	and RT0, RMASK, ra, lsl#3; \
-	and RT1, RMASK, ra, lsr#(8 - 3); \
-	and RT2, RMASK, ra, lsr#(16 - 3); \
-	ldr rna, [RTAB, RT0]; \
-	and ra,  RMASK, ra, lsr#(24 - 3); \
-	ldr rnb, [RTAB, RT1]; \
-	and RT0, RMASK, rb, lsl#3; \
-	ldr rnc, [RTAB, RT2]; \
-	mov rnb, rnb, ror #24; \
-	ldr rnd, [RTAB, ra]; \
-	and RT1, RMASK, rb, lsr#(8 - 3); \
-	mov rnc, rnc, ror #16; \
-	and RT2, RMASK, rb, lsr#(16 - 3); \
-	mov rnd, rnd, ror #8; \
-	ldr RT0, [RTAB, RT0]; \
-	and rb,  RMASK, rb, lsr#(24 - 3); \
-	ldr RT1, [RTAB, RT1]; \
-	\
-	orr rnb, rnb, RT0; \
-	ldr RT2, [RTAB, RT2]; \
-	and RT0, RMASK, rc, lsl#3; \
-	ldr rb,  [RTAB, rb]; \
-	orr rnc, rnc, RT1, ror #24; \
-	and RT1, RMASK, rc, lsr#(8 - 3); \
-	orr rnd, rnd, RT2, ror #16; \
-	and RT2, RMASK, rc, lsr#(16 - 3); \
-	orr rna, rna, rb, ror #8; \
-	ldr RT0, [RTAB, RT0]; \
-	and rc,  RMASK, rc, lsr#(24 - 3); \
-	ldr RT1, [RTAB, RT1]; \
-	\
-	orr rnc, rnc, RT0; \
-	ldr RT2, [RTAB, RT2]; \
-	and RT0, RMASK, rd, lsl#3; \
-	ldr rc,  [RTAB, rc]; \
-	orr rnd, rnd, RT1, ror #24; \
-	and RT1, RMASK, rd, lsr#(8 - 3); \
-	orr rna, rna, RT2, ror #16; \
-	ldr RT0, [RTAB, RT0]; \
-	and RT2, RMASK, rd, lsr#(16 - 3); \
-	ldr RT1, [RTAB, RT1]; \
-	orr rnb, rnb, rc, ror #8; \
-	ldr RT2, [RTAB, RT2]; \
-	and rd,  RMASK, rd, lsr#(24 - 3); \
-	ldr rd,  [RTAB, rd]; \
-	\
-	orr rnd, rnd, RT0; \
-	orr rna, rna, RT1, ror #24; \
-	orr rnb, rnb, RT2, ror #16; \
-	orr rnc, rnc, rd, ror #8;
-
-#define firstdecround(round, ra, rb, rc, rd, rna, rnb, rnc, rnd) \
-	addroundkey_dec(((round) + 1), ra, rb, rc, rd, rna, rnb, rnc, rnd); \
-	do_decround(round, ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_first_key);
-
-#define decround(round, ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_key) \
-	do_decround(round, ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_key);
-
-#define lastdecround(round, ra, rb, rc, rd, rna, rnb, rnc, rnd) \
-	add RTAB, #4; \
-	do_lastdecround(ra, rb, rc, rd, rna, rnb, rnc, rnd); \
-	addroundkey(rna, rnb, rnc, rnd, ra, rb, rc, rd, dummy);
-
-.align 3
-.global _gcry_aes_armv6_decrypt_block
-.type   _gcry_aes_armv6_decrypt_block,%function;
-
-_gcry_aes_armv6_decrypt_block:
-	/* input:
-	 *	%r0: keysched, CTX
-	 *	%r1: dst
-	 *	%r2: src
-	 *	%r3: number of rounds.. 10, 12 or 14
-	 */
-	push {%r4-%r11, %ip, %lr};
-
-	/* read input block */
-#ifndef __ARM_FEATURE_UNALIGNED
-	/* test if src is unaligned */
-	tst	%r2, #3;
-	beq	1f;
-
-	/* unaligned load */
-	ldr_unaligned_le(RA, %r2, 0, RNA);
-	ldr_unaligned_le(RB, %r2, 4, RNB);
-	ldr_unaligned_le(RC, %r2, 8, RNA);
-	ldr_unaligned_le(RD, %r2, 12, RNB);
-	b	2f;
-.ltorg
-1:
-#endif
-	/* aligned load */
-	ldm	%r2, {RA, RB, RC, RD};
-#ifndef __ARMEL__
-	rev	RA, RA;
-	rev	RB, RB;
-	rev	RC, RC;
-	rev	RD, RD;
-#endif
-2:
-	sub	%sp, #16;
-
-	ldr	RTAB, =.LtableD0;
-
-	mov	RMASK, #0xff;
-	str	%r1, [%sp, #4];		/* dst */
-	mov	RMASK, RMASK, lsl#3;	/* byte mask */
-
-	cmp	%r3, #12;
-	bge	.Ldec_256;
-
-	firstdecround(9, RA, RB, RC, RD, RNA, RNB, RNC, RND);
-.Ldec_tail:
-	decround(8, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
-	decround(7, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
-	decround(6, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
-	decround(5, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
-	decround(4, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
-	decround(3, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
-	decround(2, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
-	decround(1, RA, RB, RC, RD, RNA, RNB, RNC, RND, dummy);
-	lastdecround(0, RNA, RNB, RNC, RND, RA, RB, RC, RD);
-
-	ldr	RT0, [%sp, #4];		/* dst */
-	add	%sp, #16;
-
-	/* store output block */
-#ifndef __ARM_FEATURE_UNALIGNED
-	/* test if dst is unaligned */
-	tst	RT0, #3;
-	beq	1f;
-
-	/* unaligned store */
-	str_unaligned_le(RA, RT0, 0, RNA, RNB);
-	str_unaligned_le(RB, RT0, 4, RNA, RNB);
-	str_unaligned_le(RC, RT0, 8, RNA, RNB);
-	str_unaligned_le(RD, RT0, 12, RNA, RNB);
-	b	2f;
-.ltorg
-1:
-#endif
-	/* aligned store */
-#ifndef __ARMEL__
-	rev	RA, RA;
-	rev	RB, RB;
-	rev	RC, RC;
-	rev	RD, RD;
-#endif
-	/* write output block */
-	stm	RT0, {RA, RB, RC, RD};
-2:
-	pop {%r4-%r11, %ip, %pc};
-
-.ltorg
-.Ldec_256:
-	beq .Ldec_192;
-
-	firstdecround(13, RA, RB, RC, RD, RNA, RNB, RNC, RND);
-	decround(12, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
-	decround(11, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
-	decround(10, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
-	decround(9, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
-
-	b .Ldec_tail;
-
-.ltorg
-.Ldec_192:
-	firstdecround(11, RA, RB, RC, RD, RNA, RNB, RNC, RND);
-	decround(10, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
-	decround(9, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
-
-	b .Ldec_tail;
-.size _gcry_aes_armv6_encrypt_block,.-_gcry_aes_armv6_encrypt_block;
-
-.data
-
-/* Encryption tables */
-.align 5
-.type .LtableE0, %object
-.type .LtableEs0, %object
-.LtableE0:
-.long 0xa56363c6
-.LtableEs0:
-.long             0x00000063, 0x847c7cf8, 0x0000007c
-.long 0x997777ee, 0x00000077, 0x8d7b7bf6, 0x0000007b
-.long 0x0df2f2ff, 0x000000f2, 0xbd6b6bd6, 0x0000006b
-.long 0xb16f6fde, 0x0000006f, 0x54c5c591, 0x000000c5
-.long 0x50303060, 0x00000030, 0x03010102, 0x00000001
-.long 0xa96767ce, 0x00000067, 0x7d2b2b56, 0x0000002b
-.long 0x19fefee7, 0x000000fe, 0x62d7d7b5, 0x000000d7
-.long 0xe6abab4d, 0x000000ab, 0x9a7676ec, 0x00000076
-.long 0x45caca8f, 0x000000ca, 0x9d82821f, 0x00000082
-.long 0x40c9c989, 0x000000c9, 0x877d7dfa, 0x0000007d
-.long 0x15fafaef, 0x000000fa, 0xeb5959b2, 0x00000059
-.long 0xc947478e, 0x00000047, 0x0bf0f0fb, 0x000000f0
-.long 0xecadad41, 0x000000ad, 0x67d4d4b3, 0x000000d4
-.long 0xfda2a25f, 0x000000a2, 0xeaafaf45, 0x000000af
-.long 0xbf9c9c23, 0x0000009c, 0xf7a4a453, 0x000000a4
-.long 0x967272e4, 0x00000072, 0x5bc0c09b, 0x000000c0
-.long 0xc2b7b775, 0x000000b7, 0x1cfdfde1, 0x000000fd
-.long 0xae93933d, 0x00000093, 0x6a26264c, 0x00000026
-.long 0x5a36366c, 0x00000036, 0x413f3f7e, 0x0000003f
-.long 0x02f7f7f5, 0x000000f7, 0x4fcccc83, 0x000000cc
-.long 0x5c343468, 0x00000034, 0xf4a5a551, 0x000000a5
-.long 0x34e5e5d1, 0x000000e5, 0x08f1f1f9, 0x000000f1
-.long 0x937171e2, 0x00000071, 0x73d8d8ab, 0x000000d8
-.long 0x53313162, 0x00000031, 0x3f15152a, 0x00000015
-.long 0x0c040408, 0x00000004, 0x52c7c795, 0x000000c7
-.long 0x65232346, 0x00000023, 0x5ec3c39d, 0x000000c3
-.long 0x28181830, 0x00000018, 0xa1969637, 0x00000096
-.long 0x0f05050a, 0x00000005, 0xb59a9a2f, 0x0000009a
-.long 0x0907070e, 0x00000007, 0x36121224, 0x00000012
-.long 0x9b80801b, 0x00000080, 0x3de2e2df, 0x000000e2
-.long 0x26ebebcd, 0x000000eb, 0x6927274e, 0x00000027
-.long 0xcdb2b27f, 0x000000b2, 0x9f7575ea, 0x00000075
-.long 0x1b090912, 0x00000009, 0x9e83831d, 0x00000083
-.long 0x742c2c58, 0x0000002c, 0x2e1a1a34, 0x0000001a
-.long 0x2d1b1b36, 0x0000001b, 0xb26e6edc, 0x0000006e
-.long 0xee5a5ab4, 0x0000005a, 0xfba0a05b, 0x000000a0
-.long 0xf65252a4, 0x00000052, 0x4d3b3b76, 0x0000003b
-.long 0x61d6d6b7, 0x000000d6, 0xceb3b37d, 0x000000b3
-.long 0x7b292952, 0x00000029, 0x3ee3e3dd, 0x000000e3
-.long 0x712f2f5e, 0x0000002f, 0x97848413, 0x00000084
-.long 0xf55353a6, 0x00000053, 0x68d1d1b9, 0x000000d1
-.long 0x00000000, 0x00000000, 0x2cededc1, 0x000000ed
-.long 0x60202040, 0x00000020, 0x1ffcfce3, 0x000000fc
-.long 0xc8b1b179, 0x000000b1, 0xed5b5bb6, 0x0000005b
-.long 0xbe6a6ad4, 0x0000006a, 0x46cbcb8d, 0x000000cb
-.long 0xd9bebe67, 0x000000be, 0x4b393972, 0x00000039
-.long 0xde4a4a94, 0x0000004a, 0xd44c4c98, 0x0000004c
-.long 0xe85858b0, 0x00000058, 0x4acfcf85, 0x000000cf
-.long 0x6bd0d0bb, 0x000000d0, 0x2aefefc5, 0x000000ef
-.long 0xe5aaaa4f, 0x000000aa, 0x16fbfbed, 0x000000fb
-.long 0xc5434386, 0x00000043, 0xd74d4d9a, 0x0000004d
-.long 0x55333366, 0x00000033, 0x94858511, 0x00000085
-.long 0xcf45458a, 0x00000045, 0x10f9f9e9, 0x000000f9
-.long 0x06020204, 0x00000002, 0x817f7ffe, 0x0000007f
-.long 0xf05050a0, 0x00000050, 0x443c3c78, 0x0000003c
-.long 0xba9f9f25, 0x0000009f, 0xe3a8a84b, 0x000000a8
-.long 0xf35151a2, 0x00000051, 0xfea3a35d, 0x000000a3
-.long 0xc0404080, 0x00000040, 0x8a8f8f05, 0x0000008f
-.long 0xad92923f, 0x00000092, 0xbc9d9d21, 0x0000009d
-.long 0x48383870, 0x00000038, 0x04f5f5f1, 0x000000f5
-.long 0xdfbcbc63, 0x000000bc, 0xc1b6b677, 0x000000b6
-.long 0x75dadaaf, 0x000000da, 0x63212142, 0x00000021
-.long 0x30101020, 0x00000010, 0x1affffe5, 0x000000ff
-.long 0x0ef3f3fd, 0x000000f3, 0x6dd2d2bf, 0x000000d2
-.long 0x4ccdcd81, 0x000000cd, 0x140c0c18, 0x0000000c
-.long 0x35131326, 0x00000013, 0x2fececc3, 0x000000ec
-.long 0xe15f5fbe, 0x0000005f, 0xa2979735, 0x00000097
-.long 0xcc444488, 0x00000044, 0x3917172e, 0x00000017
-.long 0x57c4c493, 0x000000c4, 0xf2a7a755, 0x000000a7
-.long 0x827e7efc, 0x0000007e, 0x473d3d7a, 0x0000003d
-.long 0xac6464c8, 0x00000064, 0xe75d5dba, 0x0000005d
-.long 0x2b191932, 0x00000019, 0x957373e6, 0x00000073
-.long 0xa06060c0, 0x00000060, 0x98818119, 0x00000081
-.long 0xd14f4f9e, 0x0000004f, 0x7fdcdca3, 0x000000dc
-.long 0x66222244, 0x00000022, 0x7e2a2a54, 0x0000002a
-.long 0xab90903b, 0x00000090, 0x8388880b, 0x00000088
-.long 0xca46468c, 0x00000046, 0x29eeeec7, 0x000000ee
-.long 0xd3b8b86b, 0x000000b8, 0x3c141428, 0x00000014
-.long 0x79dedea7, 0x000000de, 0xe25e5ebc, 0x0000005e
-.long 0x1d0b0b16, 0x0000000b, 0x76dbdbad, 0x000000db
-.long 0x3be0e0db, 0x000000e0, 0x56323264, 0x00000032
-.long 0x4e3a3a74, 0x0000003a, 0x1e0a0a14, 0x0000000a
-.long 0xdb494992, 0x00000049, 0x0a06060c, 0x00000006
-.long 0x6c242448, 0x00000024, 0xe45c5cb8, 0x0000005c
-.long 0x5dc2c29f, 0x000000c2, 0x6ed3d3bd, 0x000000d3
-.long 0xefacac43, 0x000000ac, 0xa66262c4, 0x00000062
-.long 0xa8919139, 0x00000091, 0xa4959531, 0x00000095
-.long 0x37e4e4d3, 0x000000e4, 0x8b7979f2, 0x00000079
-.long 0x32e7e7d5, 0x000000e7, 0x43c8c88b, 0x000000c8
-.long 0x5937376e, 0x00000037, 0xb76d6dda, 0x0000006d
-.long 0x8c8d8d01, 0x0000008d, 0x64d5d5b1, 0x000000d5
-.long 0xd24e4e9c, 0x0000004e, 0xe0a9a949, 0x000000a9
-.long 0xb46c6cd8, 0x0000006c, 0xfa5656ac, 0x00000056
-.long 0x07f4f4f3, 0x000000f4, 0x25eaeacf, 0x000000ea
-.long 0xaf6565ca, 0x00000065, 0x8e7a7af4, 0x0000007a
-.long 0xe9aeae47, 0x000000ae, 0x18080810, 0x00000008
-.long 0xd5baba6f, 0x000000ba, 0x887878f0, 0x00000078
-.long 0x6f25254a, 0x00000025, 0x722e2e5c, 0x0000002e
-.long 0x241c1c38, 0x0000001c, 0xf1a6a657, 0x000000a6
-.long 0xc7b4b473, 0x000000b4, 0x51c6c697, 0x000000c6
-.long 0x23e8e8cb, 0x000000e8, 0x7cdddda1, 0x000000dd
-.long 0x9c7474e8, 0x00000074, 0x211f1f3e, 0x0000001f
-.long 0xdd4b4b96, 0x0000004b, 0xdcbdbd61, 0x000000bd
-.long 0x868b8b0d, 0x0000008b, 0x858a8a0f, 0x0000008a
-.long 0x907070e0, 0x00000070, 0x423e3e7c, 0x0000003e
-.long 0xc4b5b571, 0x000000b5, 0xaa6666cc, 0x00000066
-.long 0xd8484890, 0x00000048, 0x05030306, 0x00000003
-.long 0x01f6f6f7, 0x000000f6, 0x120e0e1c, 0x0000000e
-.long 0xa36161c2, 0x00000061, 0x5f35356a, 0x00000035
-.long 0xf95757ae, 0x00000057, 0xd0b9b969, 0x000000b9
-.long 0x91868617, 0x00000086, 0x58c1c199, 0x000000c1
-.long 0x271d1d3a, 0x0000001d, 0xb99e9e27, 0x0000009e
-.long 0x38e1e1d9, 0x000000e1, 0x13f8f8eb, 0x000000f8
-.long 0xb398982b, 0x00000098, 0x33111122, 0x00000011
-.long 0xbb6969d2, 0x00000069, 0x70d9d9a9, 0x000000d9
-.long 0x898e8e07, 0x0000008e, 0xa7949433, 0x00000094
-.long 0xb69b9b2d, 0x0000009b, 0x221e1e3c, 0x0000001e
-.long 0x92878715, 0x00000087, 0x20e9e9c9, 0x000000e9
-.long 0x49cece87, 0x000000ce, 0xff5555aa, 0x00000055
-.long 0x78282850, 0x00000028, 0x7adfdfa5, 0x000000df
-.long 0x8f8c8c03, 0x0000008c, 0xf8a1a159, 0x000000a1
-.long 0x80898909, 0x00000089, 0x170d0d1a, 0x0000000d
-.long 0xdabfbf65, 0x000000bf, 0x31e6e6d7, 0x000000e6
-.long 0xc6424284, 0x00000042, 0xb86868d0, 0x00000068
-.long 0xc3414182, 0x00000041, 0xb0999929, 0x00000099
-.long 0x772d2d5a, 0x0000002d, 0x110f0f1e, 0x0000000f
-.long 0xcbb0b07b, 0x000000b0, 0xfc5454a8, 0x00000054
-.long 0xd6bbbb6d, 0x000000bb, 0x3a16162c, 0x00000016
-
-/* Decryption tables */
-.align 5
-.type .LtableD0, %object
-.type .LtableDs0, %object
-.LtableD0:
-.long 0x50a7f451
-.LtableDs0:
-.long             0x00000052, 0x5365417e, 0x00000009
-.long 0xc3a4171a, 0x0000006a, 0x965e273a, 0x000000d5
-.long 0xcb6bab3b, 0x00000030, 0xf1459d1f, 0x00000036
-.long 0xab58faac, 0x000000a5, 0x9303e34b, 0x00000038
-.long 0x55fa3020, 0x000000bf, 0xf66d76ad, 0x00000040
-.long 0x9176cc88, 0x000000a3, 0x254c02f5, 0x0000009e
-.long 0xfcd7e54f, 0x00000081, 0xd7cb2ac5, 0x000000f3
-.long 0x80443526, 0x000000d7, 0x8fa362b5, 0x000000fb
-.long 0x495ab1de, 0x0000007c, 0x671bba25, 0x000000e3
-.long 0x980eea45, 0x00000039, 0xe1c0fe5d, 0x00000082
-.long 0x02752fc3, 0x0000009b, 0x12f04c81, 0x0000002f
-.long 0xa397468d, 0x000000ff, 0xc6f9d36b, 0x00000087
-.long 0xe75f8f03, 0x00000034, 0x959c9215, 0x0000008e
-.long 0xeb7a6dbf, 0x00000043, 0xda595295, 0x00000044
-.long 0x2d83bed4, 0x000000c4, 0xd3217458, 0x000000de
-.long 0x2969e049, 0x000000e9, 0x44c8c98e, 0x000000cb
-.long 0x6a89c275, 0x00000054, 0x78798ef4, 0x0000007b
-.long 0x6b3e5899, 0x00000094, 0xdd71b927, 0x00000032
-.long 0xb64fe1be, 0x000000a6, 0x17ad88f0, 0x000000c2
-.long 0x66ac20c9, 0x00000023, 0xb43ace7d, 0x0000003d
-.long 0x184adf63, 0x000000ee, 0x82311ae5, 0x0000004c
-.long 0x60335197, 0x00000095, 0x457f5362, 0x0000000b
-.long 0xe07764b1, 0x00000042, 0x84ae6bbb, 0x000000fa
-.long 0x1ca081fe, 0x000000c3, 0x942b08f9, 0x0000004e
-.long 0x58684870, 0x00000008, 0x19fd458f, 0x0000002e
-.long 0x876cde94, 0x000000a1, 0xb7f87b52, 0x00000066
-.long 0x23d373ab, 0x00000028, 0xe2024b72, 0x000000d9
-.long 0x578f1fe3, 0x00000024, 0x2aab5566, 0x000000b2
-.long 0x0728ebb2, 0x00000076, 0x03c2b52f, 0x0000005b
-.long 0x9a7bc586, 0x000000a2, 0xa50837d3, 0x00000049
-.long 0xf2872830, 0x0000006d, 0xb2a5bf23, 0x0000008b
-.long 0xba6a0302, 0x000000d1, 0x5c8216ed, 0x00000025
-.long 0x2b1ccf8a, 0x00000072, 0x92b479a7, 0x000000f8
-.long 0xf0f207f3, 0x000000f6, 0xa1e2694e, 0x00000064
-.long 0xcdf4da65, 0x00000086, 0xd5be0506, 0x00000068
-.long 0x1f6234d1, 0x00000098, 0x8afea6c4, 0x00000016
-.long 0x9d532e34, 0x000000d4, 0xa055f3a2, 0x000000a4
-.long 0x32e18a05, 0x0000005c, 0x75ebf6a4, 0x000000cc
-.long 0x39ec830b, 0x0000005d, 0xaaef6040, 0x00000065
-.long 0x069f715e, 0x000000b6, 0x51106ebd, 0x00000092
-.long 0xf98a213e, 0x0000006c, 0x3d06dd96, 0x00000070
-.long 0xae053edd, 0x00000048, 0x46bde64d, 0x00000050
-.long 0xb58d5491, 0x000000fd, 0x055dc471, 0x000000ed
-.long 0x6fd40604, 0x000000b9, 0xff155060, 0x000000da
-.long 0x24fb9819, 0x0000005e, 0x97e9bdd6, 0x00000015
-.long 0xcc434089, 0x00000046, 0x779ed967, 0x00000057
-.long 0xbd42e8b0, 0x000000a7, 0x888b8907, 0x0000008d
-.long 0x385b19e7, 0x0000009d, 0xdbeec879, 0x00000084
-.long 0x470a7ca1, 0x00000090, 0xe90f427c, 0x000000d8
-.long 0xc91e84f8, 0x000000ab, 0x00000000, 0x00000000
-.long 0x83868009, 0x0000008c, 0x48ed2b32, 0x000000bc
-.long 0xac70111e, 0x000000d3, 0x4e725a6c, 0x0000000a
-.long 0xfbff0efd, 0x000000f7, 0x5638850f, 0x000000e4
-.long 0x1ed5ae3d, 0x00000058, 0x27392d36, 0x00000005
-.long 0x64d90f0a, 0x000000b8, 0x21a65c68, 0x000000b3
-.long 0xd1545b9b, 0x00000045, 0x3a2e3624, 0x00000006
-.long 0xb1670a0c, 0x000000d0, 0x0fe75793, 0x0000002c
-.long 0xd296eeb4, 0x0000001e, 0x9e919b1b, 0x0000008f
-.long 0x4fc5c080, 0x000000ca, 0xa220dc61, 0x0000003f
-.long 0x694b775a, 0x0000000f, 0x161a121c, 0x00000002
-.long 0x0aba93e2, 0x000000c1, 0xe52aa0c0, 0x000000af
-.long 0x43e0223c, 0x000000bd, 0x1d171b12, 0x00000003
-.long 0x0b0d090e, 0x00000001, 0xadc78bf2, 0x00000013
-.long 0xb9a8b62d, 0x0000008a, 0xc8a91e14, 0x0000006b
-.long 0x8519f157, 0x0000003a, 0x4c0775af, 0x00000091
-.long 0xbbdd99ee, 0x00000011, 0xfd607fa3, 0x00000041
-.long 0x9f2601f7, 0x0000004f, 0xbcf5725c, 0x00000067
-.long 0xc53b6644, 0x000000dc, 0x347efb5b, 0x000000ea
-.long 0x7629438b, 0x00000097, 0xdcc623cb, 0x000000f2
-.long 0x68fcedb6, 0x000000cf, 0x63f1e4b8, 0x000000ce
-.long 0xcadc31d7, 0x000000f0, 0x10856342, 0x000000b4
-.long 0x40229713, 0x000000e6, 0x2011c684, 0x00000073
-.long 0x7d244a85, 0x00000096, 0xf83dbbd2, 0x000000ac
-.long 0x1132f9ae, 0x00000074, 0x6da129c7, 0x00000022
-.long 0x4b2f9e1d, 0x000000e7, 0xf330b2dc, 0x000000ad
-.long 0xec52860d, 0x00000035, 0xd0e3c177, 0x00000085
-.long 0x6c16b32b, 0x000000e2, 0x99b970a9, 0x000000f9
-.long 0xfa489411, 0x00000037, 0x2264e947, 0x000000e8
-.long 0xc48cfca8, 0x0000001c, 0x1a3ff0a0, 0x00000075
-.long 0xd82c7d56, 0x000000df, 0xef903322, 0x0000006e
-.long 0xc74e4987, 0x00000047, 0xc1d138d9, 0x000000f1
-.long 0xfea2ca8c, 0x0000001a, 0x360bd498, 0x00000071
-.long 0xcf81f5a6, 0x0000001d, 0x28de7aa5, 0x00000029
-.long 0x268eb7da, 0x000000c5, 0xa4bfad3f, 0x00000089
-.long 0xe49d3a2c, 0x0000006f, 0x0d927850, 0x000000b7
-.long 0x9bcc5f6a, 0x00000062, 0x62467e54, 0x0000000e
-.long 0xc2138df6, 0x000000aa, 0xe8b8d890, 0x00000018
-.long 0x5ef7392e, 0x000000be, 0xf5afc382, 0x0000001b
-.long 0xbe805d9f, 0x000000fc, 0x7c93d069, 0x00000056
-.long 0xa92dd56f, 0x0000003e, 0xb31225cf, 0x0000004b
-.long 0x3b99acc8, 0x000000c6, 0xa77d1810, 0x000000d2
-.long 0x6e639ce8, 0x00000079, 0x7bbb3bdb, 0x00000020
-.long 0x097826cd, 0x0000009a, 0xf418596e, 0x000000db
-.long 0x01b79aec, 0x000000c0, 0xa89a4f83, 0x000000fe
-.long 0x656e95e6, 0x00000078, 0x7ee6ffaa, 0x000000cd
-.long 0x08cfbc21, 0x0000005a, 0xe6e815ef, 0x000000f4
-.long 0xd99be7ba, 0x0000001f, 0xce366f4a, 0x000000dd
-.long 0xd4099fea, 0x000000a8, 0xd67cb029, 0x00000033
-.long 0xafb2a431, 0x00000088, 0x31233f2a, 0x00000007
-.long 0x3094a5c6, 0x000000c7, 0xc066a235, 0x00000031
-.long 0x37bc4e74, 0x000000b1, 0xa6ca82fc, 0x00000012
-.long 0xb0d090e0, 0x00000010, 0x15d8a733, 0x00000059
-.long 0x4a9804f1, 0x00000027, 0xf7daec41, 0x00000080
-.long 0x0e50cd7f, 0x000000ec, 0x2ff69117, 0x0000005f
-.long 0x8dd64d76, 0x00000060, 0x4db0ef43, 0x00000051
-.long 0x544daacc, 0x0000007f, 0xdf0496e4, 0x000000a9
-.long 0xe3b5d19e, 0x00000019, 0x1b886a4c, 0x000000b5
-.long 0xb81f2cc1, 0x0000004a, 0x7f516546, 0x0000000d
-.long 0x04ea5e9d, 0x0000002d, 0x5d358c01, 0x000000e5
-.long 0x737487fa, 0x0000007a, 0x2e410bfb, 0x0000009f
-.long 0x5a1d67b3, 0x00000093, 0x52d2db92, 0x000000c9
-.long 0x335610e9, 0x0000009c, 0x1347d66d, 0x000000ef
-.long 0x8c61d79a, 0x000000a0, 0x7a0ca137, 0x000000e0
-.long 0x8e14f859, 0x0000003b, 0x893c13eb, 0x0000004d
-.long 0xee27a9ce, 0x000000ae, 0x35c961b7, 0x0000002a
-.long 0xede51ce1, 0x000000f5, 0x3cb1477a, 0x000000b0
-.long 0x59dfd29c, 0x000000c8, 0x3f73f255, 0x000000eb
-.long 0x79ce1418, 0x000000bb, 0xbf37c773, 0x0000003c
-.long 0xeacdf753, 0x00000083, 0x5baafd5f, 0x00000053
-.long 0x146f3ddf, 0x00000099, 0x86db4478, 0x00000061
-.long 0x81f3afca, 0x00000017, 0x3ec468b9, 0x0000002b
-.long 0x2c342438, 0x00000004, 0x5f40a3c2, 0x0000007e
-.long 0x72c31d16, 0x000000ba, 0x0c25e2bc, 0x00000077
-.long 0x8b493c28, 0x000000d6, 0x41950dff, 0x00000026
-.long 0x7101a839, 0x000000e1, 0xdeb30c08, 0x00000069
-.long 0x9ce4b4d8, 0x00000014, 0x90c15664, 0x00000063
-.long 0x6184cb7b, 0x00000055, 0x70b632d5, 0x00000021
-.long 0x745c6c48, 0x0000000c, 0x4257b8d0, 0x0000007d
-
-#endif /*HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS*/
-#endif /*__ARM_ARCH >= 6*/
diff --git a/cipher/rijndael.c b/cipher/rijndael.c
index 85c1a41..68ab5ea 100644
--- a/cipher/rijndael.c
+++ b/cipher/rijndael.c
@@ -67,11 +67,11 @@
 # define USE_AMD64_ASM 1
 #endif
 
-/* USE_ARMV6_ASM indicates whether to use ARMv6 assembly code. */
-#undef USE_ARMV6_ASM
-#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__)
+/* USE_ARM_ASM indicates whether to use ARM assembly code. */
+#undef USE_ARM_ASM
+#if defined(__ARMEL__)
 # ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS
-#  define USE_ARMV6_ASM 1
+#  define USE_ARM_ASM 1
 # endif
 #endif
 
@@ -123,18 +123,18 @@ extern void _gcry_aes_amd64_decrypt_block(const void *keysched_dec,
 					  int rounds);
 #endif /*USE_AMD64_ASM*/
 
-#ifdef USE_ARMV6_ASM
+#ifdef USE_ARM_ASM
 /* ARMv6 assembly implementations of AES */
-extern void _gcry_aes_armv6_encrypt_block(const void *keysched_enc,
+extern void _gcry_aes_arm_encrypt_block(const void *keysched_enc,
 					  unsigned char *out,
 					  const unsigned char *in,
 					  int rounds);
 
-extern void _gcry_aes_armv6_decrypt_block(const void *keysched_dec,
+extern void _gcry_aes_arm_decrypt_block(const void *keysched_dec,
 					  unsigned char *out,
 					  const unsigned char *in,
 					  int rounds);
-#endif /*USE_ARMV6_ASM*/
+#endif /*USE_ARM_ASM*/
 
 
 

@@ -567,8 +567,8 @@ do_encrypt_aligned (const RIJNDAEL_context *ctx,
 {
 #ifdef USE_AMD64_ASM
   _gcry_aes_amd64_encrypt_block(ctx->keyschenc, b, a, ctx->rounds);
-#elif defined(USE_ARMV6_ASM)
-  _gcry_aes_armv6_encrypt_block(ctx->keyschenc, b, a, ctx->rounds);
+#elif defined(USE_ARM_ASM)
+  _gcry_aes_arm_encrypt_block(ctx->keyschenc, b, a, ctx->rounds);
 #else
 #define rk (ctx->keyschenc)
   int rounds = ctx->rounds;
@@ -651,7 +651,7 @@ do_encrypt_aligned (const RIJNDAEL_context *ctx,
   *((u32_a_t*)(b+ 8)) ^= *((u32_a_t*)rk[rounds][2]);
   *((u32_a_t*)(b+12)) ^= *((u32_a_t*)rk[rounds][3]);
 #undef rk
-#endif /*!USE_AMD64_ASM && !USE_ARMV6_ASM*/
+#endif /*!USE_AMD64_ASM && !USE_ARM_ASM*/
 }
 
 
@@ -659,7 +659,7 @@ static void
 do_encrypt (const RIJNDAEL_context *ctx,
             unsigned char *bx, const unsigned char *ax)
 {
-#if !defined(USE_AMD64_ASM) && !defined(USE_ARMV6_ASM)
+#if !defined(USE_AMD64_ASM) && !defined(USE_ARM_ASM)
   /* BX and AX are not necessary correctly aligned.  Thus we might
      need to copy them here.  We try to align to a 16 bytes.  */
   if (((size_t)ax & 0x0f) || ((size_t)bx & 0x0f))
@@ -680,7 +680,7 @@ do_encrypt (const RIJNDAEL_context *ctx,
       memcpy (bx, b.b, 16);
     }
   else
-#endif /*!USE_AMD64_ASM && !USE_ARMV6_ASM*/
+#endif /*!USE_AMD64_ASM && !USE_ARM_ASM*/
     {
       do_encrypt_aligned (ctx, bx, ax);
     }
@@ -1694,8 +1694,8 @@ do_decrypt_aligned (RIJNDAEL_context *ctx,
 {
 #ifdef USE_AMD64_ASM
   _gcry_aes_amd64_decrypt_block(ctx->keyschdec, b, a, ctx->rounds);
-#elif defined(USE_ARMV6_ASM)
-  _gcry_aes_armv6_decrypt_block(ctx->keyschdec, b, a, ctx->rounds);
+#elif defined(USE_ARM_ASM)
+  _gcry_aes_arm_decrypt_block(ctx->keyschdec, b, a, ctx->rounds);
 #else
 #define rk  (ctx->keyschdec)
   int rounds = ctx->rounds;
@@ -1779,7 +1779,7 @@ do_decrypt_aligned (RIJNDAEL_context *ctx,
   *((u32_a_t*)(b+ 8)) ^= *((u32_a_t*)rk[0][2]);
   *((u32_a_t*)(b+12)) ^= *((u32_a_t*)rk[0][3]);
 #undef rk
-#endif /*!USE_AMD64_ASM && !USE_ARMV6_ASM*/
+#endif /*!USE_AMD64_ASM && !USE_ARM_ASM*/
 }
 
 
@@ -1794,7 +1794,7 @@ do_decrypt (RIJNDAEL_context *ctx, byte *bx, const byte *ax)
       ctx->decryption_prepared = 1;
     }
 
-#if !defined(USE_AMD64_ASM) && !defined(USE_ARMV6_ASM)
+#if !defined(USE_AMD64_ASM) && !defined(USE_ARM_ASM)
   /* BX and AX are not necessary correctly aligned.  Thus we might
      need to copy them here.  We try to align to a 16 bytes. */
   if (((size_t)ax & 0x0f) || ((size_t)bx & 0x0f))
@@ -1815,7 +1815,7 @@ do_decrypt (RIJNDAEL_context *ctx, byte *bx, const byte *ax)
       memcpy (bx, b.b, 16);
     }
   else
-#endif /*!USE_AMD64_ASM && !USE_ARMV6_ASM*/
+#endif /*!USE_AMD64_ASM && !USE_ARM_ASM*/
     {
       do_decrypt_aligned (ctx, bx, ax);
     }
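
The net effect of the renames above is that the ARM dispatch no longer
depends on the architecture level at all; condensed from the hunks above,
the preprocessor logic now reads:

    #undef USE_ARM_ASM
    #if defined(__ARMEL__)              /* any little-endian ARM, v4 and up */
    # ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS
    #  define USE_ARM_ASM 1
    # endif
    #endif

    #ifdef USE_AMD64_ASM
      /* AMD64 assembly implementation */
    #elif defined(USE_ARM_ASM)
      /* ARM assembly implementation, now reachable on armv4/armv5 */
    #else
      /* generic C implementation */
    #endif
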
diff --git a/cipher/twofish-arm.S b/cipher/twofish-arm.S
new file mode 100644
index 0000000..9565c4a
--- /dev/null
+++ b/cipher/twofish-arm.S
@@ -0,0 +1,365 @@
+/* twofish-arm.S  -  ARM assembly implementation of Twofish cipher
+ *
+ * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#if defined(__ARMEL__)
+#ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS
+
+.text
+
+.syntax unified
+.arm
+
+/* structure of TWOFISH_context: */
+#define s0 0
+#define s1 ((s0) + 4 * 256)
+#define s2 ((s1) + 4 * 256)
+#define s3 ((s2) + 4 * 256)
+#define w  ((s3) + 4 * 256)
+#define k  ((w) + 4 * 8)
+
+/* register macros */
+#define CTX %r0
+#define CTXs0 %r0
+#define CTXs1 %r1
+#define CTXs3 %r7
+
+#define RA %r3
+#define RB %r4
+#define RC %r5
+#define RD %r6
+
+#define RX %r2
+#define RY %ip
+
+#define RMASK %lr
+
+#define RT0 %r8
+#define RT1 %r9
+#define RT2 %r10
+#define RT3 %r11
+
+/* helper macros */
+#define ldr_unaligned_le(rout, rsrc, offs, rtmp) \
+	ldrb rout, [rsrc, #((offs) + 0)]; \
+	ldrb rtmp, [rsrc, #((offs) + 1)]; \
+	orr rout, rout, rtmp, lsl #8; \
+	ldrb rtmp, [rsrc, #((offs) + 2)]; \
+	orr rout, rout, rtmp, lsl #16; \
+	ldrb rtmp, [rsrc, #((offs) + 3)]; \
+	orr rout, rout, rtmp, lsl #24;
+
+#define str_unaligned_le(rin, rdst, offs, rtmp0, rtmp1) \
+	mov rtmp0, rin, lsr #8; \
+	strb rin, [rdst, #((offs) + 0)]; \
+	mov rtmp1, rin, lsr #16; \
+	strb rtmp0, [rdst, #((offs) + 1)]; \
+	mov rtmp0, rin, lsr #24; \
+	strb rtmp1, [rdst, #((offs) + 2)]; \
+	strb rtmp0, [rdst, #((offs) + 3)];
+
+#ifndef __ARMEL__
+	/* bswap on big-endian */
+	#define host_to_le(reg) \
+		rev reg, reg;
+	#define le_to_host(reg) \
+		rev reg, reg;
+#else
+	/* nop on little-endian */
+	#define host_to_le(reg) /*_*/
+	#define le_to_host(reg) /*_*/
+#endif
+
+#define ldr_input_aligned_le(rin, a, b, c, d) \
+	ldr a, [rin, #0]; \
+	ldr b, [rin, #4]; \
+	le_to_host(a); \
+	ldr c, [rin, #8]; \
+	le_to_host(b); \
+	ldr d, [rin, #12]; \
+	le_to_host(c); \
+	le_to_host(d);
+
+#define str_output_aligned_le(rout, a, b, c, d) \
+	le_to_host(a); \
+	le_to_host(b); \
+	str a, [rout, #0]; \
+	le_to_host(c); \
+	str b, [rout, #4]; \
+	le_to_host(d); \
+	str c, [rout, #8]; \
+	str d, [rout, #12];
+
+#ifdef __ARM_FEATURE_UNALIGNED
+	/* unaligned word reads/writes allowed */
+	#define ldr_input_le(rin, ra, rb, rc, rd, rtmp) \
+		ldr_input_aligned_le(rin, ra, rb, rc, rd)
+
+	#define str_output_le(rout, ra, rb, rc, rd, rtmp0, rtmp1) \
+		str_output_aligned_le(rout, ra, rb, rc, rd)
+#else
+	/* need to handle unaligned reads/writes with byte accesses */
+	#define ldr_input_le(rin, ra, rb, rc, rd, rtmp0) \
+		tst rin, #3; \
+		beq 1f; \
+			ldr_unaligned_le(ra, rin, 0, rtmp0); \
+			ldr_unaligned_le(rb, rin, 4, rtmp0); \
+			ldr_unaligned_le(rc, rin, 8, rtmp0); \
+			ldr_unaligned_le(rd, rin, 12, rtmp0); \
+			b 2f; \
+		1:;\
+			ldr_input_aligned_le(rin, ra, rb, rc, rd); \
+		2:;
+
+	#define str_output_le(rout, ra, rb, rc, rd, rtmp0, rtmp1) \
+		tst rout, #3; \
+		beq 1f; \
+			str_unaligned_le(ra, rout, 0, rtmp0, rtmp1); \
+			str_unaligned_le(rb, rout, 4, rtmp0, rtmp1); \
+			str_unaligned_le(rc, rout, 8, rtmp0, rtmp1); \
+			str_unaligned_le(rd, rout, 12, rtmp0, rtmp1); \
+			b 2f; \
+		1:;\
+			str_output_aligned_le(rout, ra, rb, rc, rd); \
+		2:;
+#endif
+
+/**********************************************************************
+  1-way twofish
+ **********************************************************************/
+#define encrypt_round(a, b, rc, rd, n, ror_a, adj_a) \
+	and RT0, RMASK, b, lsr#(8 - 2); \
+	and RY, RMASK, b, lsr#(16 - 2); \
+	add RT0, RT0, #(s2 - s1); \
+	and RT1, RMASK, b, lsr#(24 - 2); \
+	ldr RY, [CTXs3, RY]; \
+	and RT2, RMASK, b, lsl#(2); \
+	ldr RT0, [CTXs1, RT0]; \
+	and RT3, RMASK, a, lsr#(16 - 2 + (adj_a)); \
+	ldr RT1, [CTXs0, RT1]; \
+	and RX, RMASK, a, lsr#(8 - 2 + (adj_a)); \
+	ldr RT2, [CTXs1, RT2]; \
+	add RT3, RT3, #(s2 - s1); \
+	ldr RX, [CTXs1, RX]; \
+	ror_a(a); \
+	\
+	eor RY, RY, RT0; \
+	ldr RT3, [CTXs1, RT3]; \
+	and RT0, RMASK, a, lsl#(2); \
+	eor RY, RY, RT1; \
+	and RT1, RMASK, a, lsr#(24 - 2); \
+	eor RY, RY, RT2; \
+	ldr RT0, [CTXs0, RT0]; \
+	eor RX, RX, RT3; \
+	ldr RT1, [CTXs3, RT1]; \
+	eor RX, RX, RT0; \
+	\
+	ldr RT3, [CTXs3, #(k - s3 + 8 * (n) + 4)]; \
+	eor RX, RX, RT1; \
+	ldr RT2, [CTXs3, #(k - s3 + 8 * (n))]; \
+	\
+	add RT0, RX, RY, lsl #1; \
+	add RX, RX, RY; \
+	add RT0, RT0, RT3; \
+	add RX, RX, RT2; \
+	eor rd, RT0, rd, ror #31; \
+	eor rc, rc, RX;
+
+#define dummy(x) /*_*/
+
+#define ror1(r) \
+	ror r, r, #1;
+
+#define decrypt_round(a, b, rc, rd, n, ror_b, adj_b) \
+	and RT3, RMASK, b, lsl#(2 - (adj_b)); \
+	and RT1, RMASK, b, lsr#(8 - 2 + (adj_b)); \
+	ror_b(b); \
+	and RT2, RMASK, a, lsl#(2); \
+	and RT0, RMASK, a, lsr#(8 - 2); \
+	\
+	ldr RY, [CTXs1, RT3]; \
+	add RT1, RT1, #(s2 - s1); \
+	ldr RX, [CTXs0, RT2]; \
+	and RT3, RMASK, b, lsr#(16 - 2); \
+	ldr RT1, [CTXs1, RT1]; \
+	and RT2, RMASK, a, lsr#(16 - 2); \
+	ldr RT0, [CTXs1, RT0]; \
+	\
+	add RT2, RT2, #(s2 - s1); \
+	ldr RT3, [CTXs3, RT3]; \
+	eor RY, RY, RT1; \
+	\
+	and RT1, RMASK, b, lsr#(24 - 2); \
+	eor RX, RX, RT0; \
+	ldr RT2, [CTXs1, RT2]; \
+	and RT0, RMASK, a, lsr#(24 - 2); \
+	\
+	ldr RT1, [CTXs0, RT1]; \
+	\
+	eor RY, RY, RT3; \
+	ldr RT0, [CTXs3, RT0]; \
+	eor RX, RX, RT2; \
+	eor RY, RY, RT1; \
+	\
+	ldr RT1, [CTXs3, #(k - s3 + 8 * (n) + 4)]; \
+	eor RX, RX, RT0; \
+	ldr RT2, [CTXs3, #(k - s3 + 8 * (n))]; \
+	\
+	add RT0, RX, RY, lsl #1; \
+	add RX, RX, RY; \
+	add RT0, RT0, RT1; \
+	add RX, RX, RT2; \
+	eor rd, rd, RT0; \
+	eor rc, RX, rc, ror #31;
+
+#define first_encrypt_cycle(nc) \
+	encrypt_round(RA, RB, RC, RD, (nc) * 2, dummy, 0); \
+	encrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, ror1, 1);
+
+#define encrypt_cycle(nc) \
+	encrypt_round(RA, RB, RC, RD, (nc) * 2, ror1, 1); \
+	encrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, ror1, 1);
+
+#define last_encrypt_cycle(nc) \
+	encrypt_round(RA, RB, RC, RD, (nc) * 2, ror1, 1); \
+	encrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, ror1, 1); \
+	ror1(RA);
+
+#define first_decrypt_cycle(nc) \
+	decrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, dummy, 0); \
+	decrypt_round(RA, RB, RC, RD, (nc) * 2, ror1, 1);
+
+#define decrypt_cycle(nc) \
+	decrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, ror1, 1); \
+	decrypt_round(RA, RB, RC, RD, (nc) * 2, ror1, 1);
+
+#define last_decrypt_cycle(nc) \
+	decrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, ror1, 1); \
+	decrypt_round(RA, RB, RC, RD, (nc) * 2, ror1, 1); \
+	ror1(RD);
+
+.align 3
+.global _gcry_twofish_arm_encrypt_block
+.type   _gcry_twofish_arm_encrypt_block,%function;
+
+_gcry_twofish_arm_encrypt_block:
+	/* input:
+	 *	%r0: ctx
+	 *	%r1: dst
+	 *	%r2: src
+	 */
+	push {%r1, %r4-%r11, %ip, %lr};
+
+	add RY, CTXs0, #w;
+
+	ldr_input_le(%r2, RA, RB, RC, RD, RT0);
+
+	/* Input whitening */
+	ldm RY, {RT0, RT1, RT2, RT3};
+	add CTXs3, CTXs0, #(s3 - s0);
+	add CTXs1, CTXs0, #(s1 - s0);
+	mov RMASK, #(0xff << 2);
+	eor RA, RA, RT0;
+	eor RB, RB, RT1;
+	eor RC, RC, RT2;
+	eor RD, RD, RT3;
+
+	first_encrypt_cycle(0);
+	encrypt_cycle(1);
+	encrypt_cycle(2);
+	encrypt_cycle(3);
+	encrypt_cycle(4);
+	encrypt_cycle(5);
+	encrypt_cycle(6);
+	last_encrypt_cycle(7);
+
+	add RY, CTXs3, #(w + 4*4 - s3);
+	pop {%r1}; /* dst */
+
+	/* Output whitening */
+	ldm RY, {RT0, RT1, RT2, RT3};
+	eor RC, RC, RT0;
+	eor RD, RD, RT1;
+	eor RA, RA, RT2;
+	eor RB, RB, RT3;
+
+	str_output_le(%r1, RC, RD, RA, RB, RT0, RT1);
+
+	pop {%r4-%r11, %ip, %lr};
+	bx %lr;
+.ltorg
+.size _gcry_twofish_arm_encrypt_block,.-_gcry_twofish_arm_encrypt_block;
+
+.align 3
+.global _gcry_twofish_arm_decrypt_block
+.type   _gcry_twofish_arm_decrypt_block,%function;
+
+_gcry_twofish_arm_decrypt_block:
+	/* input:
+	 *	%r0: ctx
+	 *	%r1: dst
+	 *	%r2: src
+	 */
+	push {%r1, %r4-%r11, %ip, %lr};
+
+	add CTXs3, CTXs0, #(s3 - s0);
+
+	ldr_input_le(%r2, RC, RD, RA, RB, RT0);
+
+	add RY, CTXs3, #(w + 4*4 - s3);
+	add CTXs3, CTXs0, #(s3 - s0);
+
+	/* Input whitening */
+	ldm RY, {RT0, RT1, RT2, RT3};
+	add CTXs1, CTXs0, #(s1 - s0);
+	mov RMASK, #(0xff << 2);
+	eor RC, RC, RT0;
+	eor RD, RD, RT1;
+	eor RA, RA, RT2;
+	eor RB, RB, RT3;
+
+	first_decrypt_cycle(7);
+	decrypt_cycle(6);
+	decrypt_cycle(5);
+	decrypt_cycle(4);
+	decrypt_cycle(3);
+	decrypt_cycle(2);
+	decrypt_cycle(1);
+	last_decrypt_cycle(0);
+
+	add RY, CTXs0, #w;
+	pop {%r1}; /* dst */
+
+	/* Output whitening */
+	ldm RY, {RT0, RT1, RT2, RT3};
+	eor RA, RA, RT0;
+	eor RB, RB, RT1;
+	eor RC, RC, RT2;
+	eor RD, RD, RT3;
+
+	str_output_le(%r1, RA, RB, RC, RD, RT0, RT1);
+
+	pop {%r4-%r11, %ip, %lr};
+	bx %lr;
+.size _gcry_twofish_arm_decrypt_block,.-_gcry_twofish_arm_decrypt_block;
+
+#endif /*HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS*/
+#endif /*__ARMEL__*/
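
The ldr_unaligned_le/str_unaligned_le helpers above carry the whole
unaligned-access story of this file: when the target lacks
__ARM_FEATURE_UNALIGNED, words are assembled and split one byte at a time.
Roughly, in C:

    #include <stdint.h>

    /* Equivalent of ldr_unaligned_le: four ldrb plus orr-with-shift. */
    static inline uint32_t load_le32 (const uint8_t *p)
    {
      return (uint32_t)p[0]
             | ((uint32_t)p[1] << 8)
             | ((uint32_t)p[2] << 16)
             | ((uint32_t)p[3] << 24);
    }

    /* Equivalent of str_unaligned_le: shift plus strb per byte. */
    static inline void store_le32 (uint8_t *p, uint32_t v)
    {
      p[0] = (uint8_t) v;
      p[1] = (uint8_t)(v >> 8);
      p[2] = (uint8_t)(v >> 16);
      p[3] = (uint8_t)(v >> 24);
    }
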
diff --git a/cipher/twofish-armv6.S b/cipher/twofish-armv6.S
deleted file mode 100644
index b76ab37..0000000
--- a/cipher/twofish-armv6.S
+++ /dev/null
@@ -1,365 +0,0 @@
-/* twofish-armv6.S  -  ARM assembly implementation of Twofish cipher
- *
- * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna at iki.fi>
- *
- * This file is part of Libgcrypt.
- *
- * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as
- * published by the Free Software Foundation; either version 2.1 of
- * the License, or (at your option) any later version.
- *
- * Libgcrypt is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <config.h>
-
-#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__)
-#ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS
-
-.text
-
-.syntax unified
-.arm
-
-/* structure of TWOFISH_context: */
-#define s0 0
-#define s1 ((s0) + 4 * 256)
-#define s2 ((s1) + 4 * 256)
-#define s3 ((s2) + 4 * 256)
-#define w  ((s3) + 4 * 256)
-#define k  ((w) + 4 * 8)
-
-/* register macros */
-#define CTX %r0
-#define CTXs0 %r0
-#define CTXs1 %r1
-#define CTXs3 %r7
-
-#define RA %r3
-#define RB %r4
-#define RC %r5
-#define RD %r6
-
-#define RX %r2
-#define RY %ip
-
-#define RMASK %lr
-
-#define RT0 %r8
-#define RT1 %r9
-#define RT2 %r10
-#define RT3 %r11
-
-/* helper macros */
-#define ldr_unaligned_le(rout, rsrc, offs, rtmp) \
-	ldrb rout, [rsrc, #((offs) + 0)]; \
-	ldrb rtmp, [rsrc, #((offs) + 1)]; \
-	orr rout, rout, rtmp, lsl #8; \
-	ldrb rtmp, [rsrc, #((offs) + 2)]; \
-	orr rout, rout, rtmp, lsl #16; \
-	ldrb rtmp, [rsrc, #((offs) + 3)]; \
-	orr rout, rout, rtmp, lsl #24;
-
-#define str_unaligned_le(rin, rdst, offs, rtmp0, rtmp1) \
-	mov rtmp0, rin, lsr #8; \
-	strb rin, [rdst, #((offs) + 0)]; \
-	mov rtmp1, rin, lsr #16; \
-	strb rtmp0, [rdst, #((offs) + 1)]; \
-	mov rtmp0, rin, lsr #24; \
-	strb rtmp1, [rdst, #((offs) + 2)]; \
-	strb rtmp0, [rdst, #((offs) + 3)];
-
-#ifndef __ARMEL__
-	/* bswap on big-endian */
-	#define host_to_le(reg) \
-		rev reg, reg;
-	#define le_to_host(reg) \
-		rev reg, reg;
-#else
-	/* nop on little-endian */
-	#define host_to_le(reg) /*_*/
-	#define le_to_host(reg) /*_*/
-#endif
-
-#define ldr_input_aligned_le(rin, a, b, c, d) \
-	ldr a, [rin, #0]; \
-	ldr b, [rin, #4]; \
-	le_to_host(a); \
-	ldr c, [rin, #8]; \
-	le_to_host(b); \
-	ldr d, [rin, #12]; \
-	le_to_host(c); \
-	le_to_host(d);
-
-#define str_output_aligned_le(rout, a, b, c, d) \
-	le_to_host(a); \
-	le_to_host(b); \
-	str a, [rout, #0]; \
-	le_to_host(c); \
-	str b, [rout, #4]; \
-	le_to_host(d); \
-	str c, [rout, #8]; \
-	str d, [rout, #12];
-
-#ifdef __ARM_FEATURE_UNALIGNED
-	/* unaligned word reads/writes allowed */
-	#define ldr_input_le(rin, ra, rb, rc, rd, rtmp) \
-		ldr_input_aligned_le(rin, ra, rb, rc, rd)
-
-	#define str_output_le(rout, ra, rb, rc, rd, rtmp0, rtmp1) \
-		str_output_aligned_le(rout, ra, rb, rc, rd)
-#else
-	/* need to handle unaligned reads/writes by byte reads */
-	#define ldr_input_le(rin, ra, rb, rc, rd, rtmp0) \
-		tst rin, #3; \
-		beq 1f; \
-			ldr_unaligned_le(ra, rin, 0, rtmp0); \
-			ldr_unaligned_le(rb, rin, 4, rtmp0); \
-			ldr_unaligned_le(rc, rin, 8, rtmp0); \
-			ldr_unaligned_le(rd, rin, 12, rtmp0); \
-			b 2f; \
-		1:;\
-			ldr_input_aligned_le(rin, ra, rb, rc, rd); \
-		2:;
-
-	#define str_output_le(rout, ra, rb, rc, rd, rtmp0, rtmp1) \
-		tst rout, #3; \
-		beq 1f; \
-			str_unaligned_le(ra, rout, 0, rtmp0, rtmp1); \
-			str_unaligned_le(rb, rout, 4, rtmp0, rtmp1); \
-			str_unaligned_le(rc, rout, 8, rtmp0, rtmp1); \
-			str_unaligned_le(rd, rout, 12, rtmp0, rtmp1); \
-			b 2f; \
-		1:;\
-			str_output_aligned_le(rout, ra, rb, rc, rd); \
-		2:;
-#endif
-
-/**********************************************************************
-  1-way twofish
- **********************************************************************/
-#define encrypt_round(a, b, rc, rd, n, ror_a, adj_a) \
-	and RT0, RMASK, b, lsr#(8 - 2); \
-	and RY, RMASK, b, lsr#(16 - 2); \
-	add RT0, RT0, #(s2 - s1); \
-	and RT1, RMASK, b, lsr#(24 - 2); \
-	ldr RY, [CTXs3, RY]; \
-	and RT2, RMASK, b, lsl#(2); \
-	ldr RT0, [CTXs1, RT0]; \
-	and RT3, RMASK, a, lsr#(16 - 2 + (adj_a)); \
-	ldr RT1, [CTXs0, RT1]; \
-	and RX, RMASK, a, lsr#(8 - 2 + (adj_a)); \
-	ldr RT2, [CTXs1, RT2]; \
-	add RT3, RT3, #(s2 - s1); \
-	ldr RX, [CTXs1, RX]; \
-	ror_a(a); \
-	\
-	eor RY, RY, RT0; \
-	ldr RT3, [CTXs1, RT3]; \
-	and RT0, RMASK, a, lsl#(2); \
-	eor RY, RY, RT1; \
-	and RT1, RMASK, a, lsr#(24 - 2); \
-	eor RY, RY, RT2; \
-	ldr RT0, [CTXs0, RT0]; \
-	eor RX, RX, RT3; \
-	ldr RT1, [CTXs3, RT1]; \
-	eor RX, RX, RT0; \
-	\
-	ldr RT3, [CTXs3, #(k - s3 + 8 * (n) + 4)]; \
-	eor RX, RX, RT1; \
-	ldr RT2, [CTXs3, #(k - s3 + 8 * (n))]; \
-	\
-	add RT0, RX, RY, lsl #1; \
-	add RX, RX, RY; \
-	add RT0, RT0, RT3; \
-	add RX, RX, RT2; \
-	eor rd, RT0, rd, ror #31; \
-	eor rc, rc, RX;
-
-#define dummy(x) /*_*/
-
-#define ror1(r) \
-	ror r, r, #1;
-
-#define decrypt_round(a, b, rc, rd, n, ror_b, adj_b) \
-	and RT3, RMASK, b, lsl#(2 - (adj_b)); \
-	and RT1, RMASK, b, lsr#(8 - 2 + (adj_b)); \
-	ror_b(b); \
-	and RT2, RMASK, a, lsl#(2); \
-	and RT0, RMASK, a, lsr#(8 - 2); \
-	\
-	ldr RY, [CTXs1, RT3]; \
-	add RT1, RT1, #(s2 - s1); \
-	ldr RX, [CTXs0, RT2]; \
-	and RT3, RMASK, b, lsr#(16 - 2); \
-	ldr RT1, [CTXs1, RT1]; \
-	and RT2, RMASK, a, lsr#(16 - 2); \
-	ldr RT0, [CTXs1, RT0]; \
-	\
-	add RT2, RT2, #(s2 - s1); \
-	ldr RT3, [CTXs3, RT3]; \
-	eor RY, RY, RT1; \
-	\
-	and RT1, RMASK, b, lsr#(24 - 2); \
-	eor RX, RX, RT0; \
-	ldr RT2, [CTXs1, RT2]; \
-	and RT0, RMASK, a, lsr#(24 - 2); \
-	\
-	ldr RT1, [CTXs0, RT1]; \
-	\
-	eor RY, RY, RT3; \
-	ldr RT0, [CTXs3, RT0]; \
-	eor RX, RX, RT2; \
-	eor RY, RY, RT1; \
-	\
-	ldr RT1, [CTXs3, #(k - s3 + 8 * (n) + 4)]; \
-	eor RX, RX, RT0; \
-	ldr RT2, [CTXs3, #(k - s3 + 8 * (n))]; \
-	\
-	add RT0, RX, RY, lsl #1; \
-	add RX, RX, RY; \
-	add RT0, RT0, RT1; \
-	add RX, RX, RT2; \
-	eor rd, rd, RT0; \
-	eor rc, RX, rc, ror #31;
-
-#define first_encrypt_cycle(nc) \
-	encrypt_round(RA, RB, RC, RD, (nc) * 2, dummy, 0); \
-	encrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, ror1, 1);
-
-#define encrypt_cycle(nc) \
-	encrypt_round(RA, RB, RC, RD, (nc) * 2, ror1, 1); \
-	encrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, ror1, 1);
-
-#define last_encrypt_cycle(nc) \
-	encrypt_round(RA, RB, RC, RD, (nc) * 2, ror1, 1); \
-	encrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, ror1, 1); \
-	ror1(RA);
-
-#define first_decrypt_cycle(nc) \
-	decrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, dummy, 0); \
-	decrypt_round(RA, RB, RC, RD, (nc) * 2, ror1, 1);
-
-#define decrypt_cycle(nc) \
-	decrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, ror1, 1); \
-	decrypt_round(RA, RB, RC, RD, (nc) * 2, ror1, 1);
-
-#define last_decrypt_cycle(nc) \
-	decrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, ror1, 1); \
-	decrypt_round(RA, RB, RC, RD, (nc) * 2, ror1, 1); \
-	ror1(RD);
-
-.align 3
-.global _gcry_twofish_armv6_encrypt_block
-.type   _gcry_twofish_armv6_encrypt_block,%function;
-
-_gcry_twofish_armv6_encrypt_block:
-	/* input:
-	 *	%r0: ctx
-	 *	%r1: dst
-	 *	%r2: src
-	 */
-	push {%r1, %r4-%r11, %ip, %lr};
-
-	add RY, CTXs0, #w;
-
-	ldr_input_le(%r2, RA, RB, RC, RD, RT0);
-
-	/* Input whitening */
-	ldm RY, {RT0, RT1, RT2, RT3};
-	add CTXs3, CTXs0, #(s3 - s0);
-	add CTXs1, CTXs0, #(s1 - s0);
-	mov RMASK, #(0xff << 2);
-	eor RA, RA, RT0;
-	eor RB, RB, RT1;
-	eor RC, RC, RT2;
-	eor RD, RD, RT3;
-
-	first_encrypt_cycle(0);
-	encrypt_cycle(1);
-	encrypt_cycle(2);
-	encrypt_cycle(3);
-	encrypt_cycle(4);
-	encrypt_cycle(5);
-	encrypt_cycle(6);
-	last_encrypt_cycle(7);
-
-	add RY, CTXs3, #(w + 4*4 - s3);
-	pop {%r1}; /* dst */
-
-	/* Output whitening */
-	ldm RY, {RT0, RT1, RT2, RT3};
-	eor RC, RC, RT0;
-	eor RD, RD, RT1;
-	eor RA, RA, RT2;
-	eor RB, RB, RT3;
-
-	str_output_le(%r1, RC, RD, RA, RB, RT0, RT1);
-
-	pop {%r4-%r11, %ip, %lr};
-	bx %lr;
-.ltorg
-.size _gcry_twofish_armv6_encrypt_block,.-_gcry_twofish_armv6_encrypt_block;
-
-.align 3
-.global _gcry_twofish_armv6_decrypt_block
-.type   _gcry_twofish_armv6_decrypt_block,%function;
-
-_gcry_twofish_armv6_decrypt_block:
-	/* input:
-	 *	%r0: ctx
-	 *	%r1: dst
-	 *	%r2: src
-	 */
-	push {%r1, %r4-%r11, %ip, %lr};
-
-	add CTXs3, CTXs0, #(s3 - s0);
-
-	ldr_input_le(%r2, RC, RD, RA, RB, RT0);
-
-	add RY, CTXs3, #(w + 4*4 - s3);
-	add CTXs3, CTXs0, #(s3 - s0);
-
-	/* Input whitening */
-	ldm RY, {RT0, RT1, RT2, RT3};
-	add CTXs1, CTXs0, #(s1 - s0);
-	mov RMASK, #(0xff << 2);
-	eor RC, RC, RT0;
-	eor RD, RD, RT1;
-	eor RA, RA, RT2;
-	eor RB, RB, RT3;
-
-	first_decrypt_cycle(7);
-	decrypt_cycle(6);
-	decrypt_cycle(5);
-	decrypt_cycle(4);
-	decrypt_cycle(3);
-	decrypt_cycle(2);
-	decrypt_cycle(1);
-	last_decrypt_cycle(0);
-
-	add RY, CTXs0, #w;
-	pop {%r1}; /* dst */
-
-	/* Output whitening */
-	ldm RY, {RT0, RT1, RT2, RT3};
-	eor RA, RA, RT0;
-	eor RB, RB, RT1;
-	eor RC, RC, RT2;
-	eor RD, RD, RT3;
-
-	str_output_le(%r1, RA, RB, RC, RD, RT0, RT1);
-
-	pop {%r4-%r11, %ip, %lr};
-	bx %lr;
-.size _gcry_twofish_armv6_decrypt_block,.-_gcry_twofish_armv6_decrypt_block;
-
-#endif /*HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS*/
-#endif /*__ARM_ARCH >= 6*/
diff --git a/cipher/twofish.c b/cipher/twofish.c
index d2cabbe..8f9f3fc 100644
--- a/cipher/twofish.c
+++ b/cipher/twofish.c
@@ -757,10 +757,10 @@ extern void _gcry_twofish_amd64_cfb_dec(const TWOFISH_context *c, byte *out,
 #elif defined(USE_ARMV6_ASM)
 
 /* Assembly implementations of Twofish. */
-extern void _gcry_twofish_armv6_encrypt_block(const TWOFISH_context *c,
+extern void _gcry_twofish_arm_encrypt_block(const TWOFISH_context *c,
 					      byte *out, const byte *in);
 
-extern void _gcry_twofish_armv6_decrypt_block(const TWOFISH_context *c,
+extern void _gcry_twofish_arm_decrypt_block(const TWOFISH_context *c,
 					      byte *out, const byte *in);
 
 #else /*!USE_AMD64_ASM && !USE_ARMV6_ASM*/
@@ -843,7 +843,7 @@ static unsigned int
 twofish_encrypt (void *context, byte *out, const byte *in)
 {
   TWOFISH_context *ctx = context;
-  _gcry_twofish_armv6_encrypt_block(ctx, out, in);
+  _gcry_twofish_arm_encrypt_block(ctx, out, in);
   return /*burn_stack*/ (4*sizeof (void*));
 }
 
@@ -910,7 +910,7 @@ static unsigned int
 twofish_decrypt (void *context, byte *out, const byte *in)
 {
   TWOFISH_context *ctx = context;
-  _gcry_twofish_armv6_decrypt_block(ctx, out, in);
+  _gcry_twofish_arm_decrypt_block(ctx, out, in);
   return /*burn_stack*/ (4*sizeof (void*));
 }
 
diff --git a/configure.ac b/configure.ac
index 58916e8..9b8ce33 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1424,7 +1424,7 @@ if test "$found" = "1" ; then
       ;;
       arm*-*-*)
          # Build with the assembly implementation
-         GCRYPT_CIPHERS="$GCRYPT_CIPHERS blowfish-armv6.lo"
+         GCRYPT_CIPHERS="$GCRYPT_CIPHERS blowfish-arm.lo"
       ;;
    esac
 fi
@@ -1441,7 +1441,7 @@ if test "$found" = "1" ; then
       ;;
       arm*-*-*)
          # Build with the assembly implementation
-         GCRYPT_CIPHERS="$GCRYPT_CIPHERS cast5-armv6.lo"
+         GCRYPT_CIPHERS="$GCRYPT_CIPHERS cast5-arm.lo"
       ;;
    esac
 fi
@@ -1464,7 +1464,7 @@ if test "$found" = "1" ; then
       ;;
       arm*-*-*)
          # Build with the assembly implementation
-         GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-armv6.lo"
+         GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-arm.lo"
       ;;
    esac
 fi
@@ -1481,7 +1481,7 @@ if test "$found" = "1" ; then
       ;;
       arm*-*-*)
          # Build with the assembly implementation
-         GCRYPT_CIPHERS="$GCRYPT_CIPHERS twofish-armv6.lo"
+         GCRYPT_CIPHERS="$GCRYPT_CIPHERS twofish-arm.lo"
       ;;
    esac
 fi
@@ -1524,7 +1524,7 @@ if test "$found" = "1" ; then
    case "${host}" in
       arm*-*-*)
          # Build with the assembly implementation
-         GCRYPT_CIPHERS="$GCRYPT_CIPHERS camellia-armv6.lo"
+         GCRYPT_CIPHERS="$GCRYPT_CIPHERS camellia-arm.lo"
       ;;
    esac
 
-- 
1.8.4.rc3