[PATCH] Add ARM assembly implementation of SHA-512

Sun Jan 31 00:11:38 CET 2016

* cipher/Makefile.am: Add 'sha512-arm.S'.
* cipher/sha512-arm.S: New.
* cipher/sha512.c (USE_ARM_ASM): New.
(_gcry_sha512_transform_arm): New.
(transform) [USE_ARM_ASM]: Use ARM assembly implementation instead of
generic.
* configure.ac: Add 'sha512-arm.lo'.
--

Benchmark on Cortex-A8 (armv6, 1008 Mhz):

 Before:
                 |  nanosecs/byte   mebibytes/sec   cycles/byte
  SHA512         |     112.0 ns/B      8.52 MiB/s     112.9 c/B

 After (3.3x faster):
                 |  nanosecs/byte   mebibytes/sec   cycles/byte
  SHA512         |     34.01 ns/B     28.04 MiB/s     34.28 c/B

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 cipher/Makefile.am  |    2 
 cipher/sha512-arm.S |  465 +++++++++++++++++++++++++++++++++++++++++++++++++++
 cipher/sha512.c     |   82 +++++----
 configure.ac        |    4 
 4 files changed, 520 insertions(+), 33 deletions(-)
 create mode 100644 cipher/sha512-arm.S

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 88c8fbf..65d7afb 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -89,7 +89,7 @@ sha1.c sha1-ssse3-amd64.S sha1-avx-amd64.S sha1-avx-bmi2-amd64.S \
   sha1-armv7-neon.S \
 sha256.c sha256-ssse3-amd64.S sha256-avx-amd64.S sha256-avx2-bmi2-amd64.S \
 sha512.c sha512-ssse3-amd64.S sha512-avx-amd64.S sha512-avx2-bmi2-amd64.S \
-  sha512-armv7-neon.S \
+  sha512-armv7-neon.S sha512-arm.S \
 keccak.c keccak_permute_32.h keccak_permute_64.h keccak-armv7-neon.S \
 stribog.c \
 tiger.c \
diff --git a/cipher/sha512-arm.S b/cipher/sha512-arm.S
new file mode 100644
index 0000000..28f156e
--- /dev/null
+++ b/cipher/sha512-arm.S
@@ -0,0 +1,465 @@
+/* sha512-arm.S  -  ARM assembly implementation of SHA-512 transform
+ *
+ * Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+#include <config.h>
+
+#if defined(__ARMEL__)
+#ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS
+
+.text
+
+.syntax unified
+.arm
+
+/* structure of SHA512_CONTEXT */
+#define hd_a 0
+#define hd_b ((hd_a) + 8)
+#define hd_c ((hd_b) + 8)
+#define hd_d ((hd_c) + 8)
+#define hd_e ((hd_d) + 8)
+#define hd_f ((hd_e) + 8)
+#define hd_g ((hd_f) + 8)
+#define hd_h ((hd_g) + 8)
+
+/* register macros */
+#define RK    %r2
+
+#define RElo %r0
+#define REhi %r1
+
+#define RT1lo %r3
+#define RT1hi %r4
+#define RT2lo %r5
+#define RT2hi %r6
+#define RWlo  %r7
+#define RWhi  %r8
+#define RT3lo %r9
+#define RT3hi %r10
+#define RT4lo %r11
+#define RT4hi %ip
+
+#define RRND  %lr
+
+/* variable offsets in stack */
+#define ctx (0)
+#define data ((ctx) + 4)
+#define nblks ((data) + 4)
+#define _a ((nblks) + 4)
+#define _b ((_a) + 8)
+#define _c ((_b) + 8)
+#define _d ((_c) + 8)
+#define _e ((_d) + 8)
+#define _f ((_e) + 8)
+#define _g ((_f) + 8)
+#define _h ((_g) + 8)
+
+#define w(i) ((_h) + 8 + ((i) % 16) * 8)
+
+#define STACK_MAX (w(15) + 8)
+
+/* helper macros */
+#define ldr_unaligned_be(rout, rsrc, offs, rtmp) \
+    ldrb rout, [rsrc, #((offs) + 3)]; \
+    ldrb rtmp, [rsrc, #((offs) + 2)]; \
+    orr rout, rout, rtmp, lsl #8; \
+    ldrb rtmp, [rsrc, #((offs) + 1)]; \
+    orr rout, rout, rtmp, lsl #16; \
+    ldrb rtmp, [rsrc, #((offs) + 0)]; \
+    orr rout, rout, rtmp, lsl #24;
+
+#ifdef __ARMEL__
+    /* bswap on little-endian */
+#ifdef HAVE_ARM_ARCH_V6
+    #define be_to_host(reg, rtmp) \
+	rev reg, reg;
+#else
+    #define be_to_host(reg, rtmp) \
+	eor rtmp, reg, reg, ror #16; \
+	mov rtmp, rtmp, lsr #8; \
+	bic rtmp, rtmp, #65280; \
+	eor reg, rtmp, reg, ror #8;
+#endif
+#else
+    /* nop on big-endian */
+    #define be_to_host(reg, rtmp) /*_*/
+#endif
+
+#define host_to_host(x, y) /*_*/
+
+#define read_u64_aligned_4(rin, offs, lo0, hi0, lo1, hi1, lo2, hi2, lo3, hi3, convert, rtmp) \
+    ldr lo0, [rin, #((offs) + 0 * 8 + 4)]; \
+    ldr hi0, [rin, #((offs) + 0 * 8 + 0)]; \
+    ldr lo1, [rin, #((offs) + 1 * 8 + 4)]; \
+    ldr hi1, [rin, #((offs) + 1 * 8 + 0)]; \
+    ldr lo2, [rin, #((offs) + 2 * 8 + 4)]; \
+    convert(lo0, rtmp); \
+    ldr hi2, [rin, #((offs) + 2 * 8 + 0)]; \
+    convert(hi0, rtmp); \
+    ldr lo3, [rin, #((offs) + 3 * 8 + 4)]; \
+    convert(lo1, rtmp); \
+    ldr hi3, [rin, #((offs) + 3 * 8 + 0)]; \
+    convert(hi1, rtmp); \
+    convert(lo2, rtmp); \
+    convert(hi2, rtmp); \
+    convert(lo3, rtmp); \
+    convert(hi3, rtmp);
+
+#define read_be64_aligned_4(rin, offs, lo0, hi0, lo1, hi1, lo2, hi2, lo3, hi3, rtmp0) \
+    read_u64_aligned_4(rin, offs, lo0, hi0, lo1, hi1, lo2, hi2, lo3, hi3, be_to_host, rtmp0)
+
+/* need to handle unaligned reads by byte reads */
+#define read_be64_unaligned_4(rin, offs, lo0, hi0, lo1, hi1, lo2, hi2, lo3, hi3, rtmp0) \
+    ldr_unaligned_be(lo0, rin, (offs) + 0 * 8 + 4, rtmp0); \
+    ldr_unaligned_be(hi0, rin, (offs) + 0 * 8 + 0, rtmp0); \
+    ldr_unaligned_be(lo1, rin, (offs) + 1 * 8 + 4, rtmp0); \
+    ldr_unaligned_be(hi1, rin, (offs) + 1 * 8 + 0, rtmp0); \
+    ldr_unaligned_be(lo2, rin, (offs) + 2 * 8 + 4, rtmp0); \
+    ldr_unaligned_be(hi2, rin, (offs) + 2 * 8 + 0, rtmp0); \
+    ldr_unaligned_be(lo3, rin, (offs) + 3 * 8 + 4, rtmp0); \
+    ldr_unaligned_be(hi3, rin, (offs) + 3 * 8 + 0, rtmp0);
+
+/***********************************************************************
+ * ARM assembly implementation of sha512 transform
+ ***********************************************************************/
+
+/* Round function */
+
+#define R(_a,_b,_c,_d,_e,_f,_g,_h,W,wi) \
+    /* Message expansion, t1 = _h + w[i] */ \
+    W(_a,_h,wi); \
+    \
+    /* w = Sum1(_e) */ \
+    mov RWlo, RElo, lsr#14; \
+    ldm RK!, {RT2lo-RT2hi}; \
+    mov RWhi, REhi, lsr#14; \
+    eor RWlo, RWlo, RElo, lsr#18; \
+    eor RWhi, RWhi, REhi, lsr#18; \
+    ldr RT3lo, [%sp, #(_f)]; \
+    adds RT1lo, RT2lo; /* t1 += K */ \
+    ldr RT3hi, [%sp, #(_f) + 4]; \
+    adc RT1hi, RT2hi; \
+    ldr RT4lo, [%sp, #(_g)]; \
+    eor RWlo, RWlo, RElo, lsl#23; \
+    ldr RT4hi, [%sp, #(_g) + 4]; \
+    eor RWhi, RWhi, REhi, lsl#23; \
+    eor RWlo, RWlo, REhi, lsl#18; \
+    eor RWhi, RWhi, RElo, lsl#18; \
+    eor RWlo, RWlo, REhi, lsl#14; \
+    eor RWhi, RWhi, RElo, lsl#14; \
+    eor RWlo, RWlo, REhi, lsr#9; \
+    eor RWhi, RWhi, RElo, lsr#9; \
+    \
+    /* Cho(_e,_f,_g) => (_e & _f) ^ (~_e & _g) */ \
+    adds RT1lo, RWlo; /* t1 += Sum1(_e) */ \
+    and RT3lo, RT3lo, RElo; \
+    adc RT1hi, RWhi; \
+    and RT3hi, RT3hi, REhi; \
+    bic RT4lo, RT4lo, RElo; \
+    bic RT4hi, RT4hi, REhi; \
+    eor RT3lo, RT3lo, RT4lo; \
+    eor RT3hi, RT3hi, RT4hi; \
+    \
+    /* Load D */ \
+    /* t1 += Cho(_e,_f,_g) */ \
+    ldr RElo, [%sp, #(_d)]; \
+    adds RT1lo, RT3lo; \
+    ldr REhi, [%sp, #(_d) + 4]; \
+    adc RT1hi, RT3hi; \
+    \
+    /* Load A */ \
+    ldr RT3lo, [%sp, #(_a)]; \
+    \
+    /* _d += t1 */ \
+    adds RElo, RT1lo; \
+    ldr RT3hi, [%sp, #(_a) + 4]; \
+    adc REhi, RT1hi; \
+    \
+    /* Store D */ \
+    str RElo, [%sp, #(_d)]; \
+    \
+    /* t2 = Sum0(_a) */ \
+    mov RT2lo, RT3lo, lsr#28; \
+    str REhi, [%sp, #(_d) + 4]; \
+    mov RT2hi, RT3hi, lsr#28; \
+    ldr RWlo, [%sp, #(_b)]; \
+    eor RT2lo, RT2lo, RT3lo, lsl#30; \
+    ldr RWhi, [%sp, #(_b) + 4]; \
+    eor RT2hi, RT2hi, RT3hi, lsl#30; \
+    eor RT2lo, RT2lo, RT3lo, lsl#25; \
+    eor RT2hi, RT2hi, RT3hi, lsl#25; \
+    eor RT2lo, RT2lo, RT3hi, lsl#4; \
+    eor RT2hi, RT2hi, RT3lo, lsl#4; \
+    eor RT2lo, RT2lo, RT3hi, lsr#2; \
+    eor RT2hi, RT2hi, RT3lo, lsr#2; \
+    eor RT2lo, RT2lo, RT3hi, lsr#7; \
+    eor RT2hi, RT2hi, RT3lo, lsr#7; \
+    \
+    /* t2 += t1 */ \
+    adds RT2lo, RT1lo; \
+    ldr RT1lo, [%sp, #(_c)]; \
+    adc RT2hi, RT1hi; \
+    \
+    /* Maj(_a,_b,_c) => ((_a & _b) ^ (_c & (_a ^ _b))) */ \
+    ldr RT1hi, [%sp, #(_c) + 4]; \
+    and RT4lo, RWlo, RT3lo; \
+    and RT4hi, RWhi, RT3hi; \
+    eor RWlo, RWlo, RT3lo; \
+    eor RWhi, RWhi, RT3hi; \
+    and RWlo, RWlo, RT1lo; \
+    and RWhi, RWhi, RT1hi; \
+    eor RWlo, RWlo, RT4lo; \
+    eor RWhi, RWhi, RT4hi; \
+
+/* Message expansion */
+
+#define W_0_63(_a,_h,i) \
+    ldr RT3lo, [%sp, #(w(i-2))]; \
+    adds RT2lo, RWlo; /* _h = t2 + Maj(_a,_b,_c) */ \
+    ldr RT3hi, [%sp, #(w(i-2)) + 4]; \
+    adc RT2hi, RWhi; \
+    /* nw = S1(w[i-2]) */ \
+    ldr RT1lo, [%sp, #(_h)]; /* Load H */ \
+    mov RWlo, RT3lo, lsr#19; \
+    str RT2lo, [%sp, #(_a)]; \
+    eor RWlo, RWlo, RT3lo, lsl#3; \
+    ldr RT1hi, [%sp, #(_h) + 4]; \
+    mov RWhi, RT3hi, lsr#19; \
+    ldr RT2lo, [%sp, #(w(i-7))]; \
+    eor RWhi, RWhi, RT3hi, lsl#3; \
+    str RT2hi, [%sp, #(_a) + 4]; \
+    eor RWlo, RWlo, RT3lo, lsr#6; \
+    ldr RT2hi, [%sp, #(w(i-7)) + 4]; \
+    eor RWhi, RWhi, RT3hi, lsr#6; \
+    eor RWlo, RWlo, RT3hi, lsl#13; \
+    eor RWhi, RWhi, RT3lo, lsl#13; \
+    eor RWlo, RWlo, RT3hi, lsr#29; \
+    eor RWhi, RWhi, RT3lo, lsr#29; \
+    ldr RT3lo, [%sp, #(w(i-15))]; \
+    eor RWlo, RWlo, RT3hi, lsl#26; \
+    ldr RT3hi, [%sp, #(w(i-15)) + 4]; \
+    \
+    adds RT2lo, RWlo; /* nw += w[i-7] */ \
+    ldr RWlo, [%sp, #(w(i-16))]; \
+    adc RT2hi, RWhi; \
+    mov RT4lo, RT3lo, lsr#1; /* S0(w[i-15]) */ \
+    ldr RWhi, [%sp, #(w(i-16)) + 4]; \
+    mov RT4hi, RT3hi, lsr#1; \
+    adds RT2lo, RWlo; /* nw += w[i-16] */ \
+    eor RT4lo, RT4lo, RT3lo, lsr#8; \
+    eor RT4hi, RT4hi, RT3hi, lsr#8; \
+    eor RT4lo, RT4lo, RT3lo, lsr#7; \
+    eor RT4hi, RT4hi, RT3hi, lsr#7; \
+    eor RT4lo, RT4lo, RT3hi, lsl#31; \
+    eor RT4hi, RT4hi, RT3lo, lsl#31; \
+    eor RT4lo, RT4lo, RT3hi, lsl#24; \
+    eor RT4hi, RT4hi, RT3lo, lsl#24; \
+    eor RT4lo, RT4lo, RT3hi, lsl#25; \
+    adc RT2hi, RWhi; \
+    \
+    /* nw += S0(w[i-15]) */ \
+    adds RT2lo, RT4lo; \
+    adc RT2hi, RT4hi; \
+    \
+    /* w[0] = nw */ \
+    str RT2lo, [%sp, #(w(i))]; \
+    adds RT1lo, RWlo; \
+    str RT2hi, [%sp, #(w(i)) + 4]; \
+    adc RT1hi, RWhi;
+
+#define W_64_79(_a,_h,i) \
+    adds RT2lo, RWlo; /* _h = t2 + Maj(_a,_b,_c) */ \
+    ldr RWlo, [%sp, #(w(i-16))]; \
+    adc RT2hi, RWhi; \
+    ldr RWhi, [%sp, #(w(i-16)) + 4]; \
+    ldr RT1lo, [%sp, #(_h)]; /* Load H */ \
+    ldr RT1hi, [%sp, #(_h) + 4]; \
+    str RT2lo, [%sp, #(_a)]; \
+    str RT2hi, [%sp, #(_a) + 4]; \
+    adds RT1lo, RWlo; \
+    adc RT1hi, RWhi;
+
+.align 3
+.globl _gcry_sha512_transform_arm
+.type  _gcry_sha512_transform_arm,%function;
+
+_gcry_sha512_transform_arm:
+	/* Input:
+	 *	%r0: SHA512_CONTEXT
+	 *	%r1: data
+	 *	%r2: u64 k[] constants
+	 *	%r3: nblks
+	 */
+	push {%r4-%r11, %ip, %lr};
+	sub %sp, %sp, #STACK_MAX;
+	movs RWlo, %r3;
+	str %r0, [%sp, #(ctx)];
+
+	beq .Ldone;
+
+.Loop_blocks:
+	str RWlo, [%sp, #nblks];
+
+	/* Load context to stack */
+	add RWhi, %sp, #(_a);
+	ldm %r0!,  {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi}
+	stm RWhi!, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi}
+	ldm %r0,  {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi}
+	stm RWhi, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi}
+
+	/* Load input to w[16] */
+#ifndef __ARM_FEATURE_UNALIGNED
+	/* test if data is unaligned */
+	tst %r1, #3;
+	beq 1f;
+
+	/* unaligned load */
+	add RWhi, %sp, #(w(0));
+	read_be64_unaligned_4(%r1, 0 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo);
+	stm RWhi!, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi}
+
+	read_be64_unaligned_4(%r1, 4 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo);
+	stm RWhi!, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi}
+
+	read_be64_unaligned_4(%r1, 8 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo);
+	stm RWhi!, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi}
+
+	read_be64_unaligned_4(%r1, 12 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo);
+	b 2f;
+#endif
+1:
+	/* aligned load */
+	add RWhi, %sp, #(w(0));
+	read_be64_aligned_4(%r1, 0 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo);
+	stm RWhi!, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi}
+
+	read_be64_aligned_4(%r1, 4 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo);
+	stm RWhi!, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi}
+
+	read_be64_aligned_4(%r1, 8 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo);
+	stm RWhi!, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi}
+
+	read_be64_aligned_4(%r1, 12 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo);
+2:
+	add %r1, #(16 * 8);
+	stm RWhi, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi}
+	str %r1, [%sp, #(data)];
+
+	/* preload E & A */
+	ldr RElo, [%sp, #(_e)];
+	ldr REhi, [%sp, #(_e) + 4];
+	mov RWlo, #0;
+	ldr RT2lo, [%sp, #(_a)];
+	mov RRND, #(80-16);
+	ldr RT2hi, [%sp, #(_a) + 4];
+	mov RWhi, #0;
+
+.Loop_rounds:
+	R(_a, _b, _c, _d, _e, _f, _g, _h, W_0_63, 16);
+	R(_h, _a, _b, _c, _d, _e, _f, _g, W_0_63, 17);
+	R(_g, _h, _a, _b, _c, _d, _e, _f, W_0_63, 18);
+	R(_f, _g, _h, _a, _b, _c, _d, _e, W_0_63, 19);
+	R(_e, _f, _g, _h, _a, _b, _c, _d, W_0_63, 20);
+	R(_d, _e, _f, _g, _h, _a, _b, _c, W_0_63, 21);
+	R(_c, _d, _e, _f, _g, _h, _a, _b, W_0_63, 22);
+	R(_b, _c, _d, _e, _f, _g, _h, _a, W_0_63, 23);
+	R(_a, _b, _c, _d, _e, _f, _g, _h, W_0_63, 24);
+	R(_h, _a, _b, _c, _d, _e, _f, _g, W_0_63, 25);
+	R(_g, _h, _a, _b, _c, _d, _e, _f, W_0_63, 26);
+	R(_f, _g, _h, _a, _b, _c, _d, _e, W_0_63, 27);
+	R(_e, _f, _g, _h, _a, _b, _c, _d, W_0_63, 28);
+	R(_d, _e, _f, _g, _h, _a, _b, _c, W_0_63, 29);
+	R(_c, _d, _e, _f, _g, _h, _a, _b, W_0_63, 30);
+	R(_b, _c, _d, _e, _f, _g, _h, _a, W_0_63, 31);
+
+	subs RRND, #16;
+	bne .Loop_rounds;
+
+	R(_a, _b, _c, _d, _e, _f, _g, _h, W_64_79, 16);
+	R(_h, _a, _b, _c, _d, _e, _f, _g, W_64_79, 17);
+	R(_g, _h, _a, _b, _c, _d, _e, _f, W_64_79, 18);
+	R(_f, _g, _h, _a, _b, _c, _d, _e, W_64_79, 19);
+	R(_e, _f, _g, _h, _a, _b, _c, _d, W_64_79, 20);
+	R(_d, _e, _f, _g, _h, _a, _b, _c, W_64_79, 21);
+	R(_c, _d, _e, _f, _g, _h, _a, _b, W_64_79, 22);
+	R(_b, _c, _d, _e, _f, _g, _h, _a, W_64_79, 23);
+	R(_a, _b, _c, _d, _e, _f, _g, _h, W_64_79, 24);
+	R(_h, _a, _b, _c, _d, _e, _f, _g, W_64_79, 25);
+	R(_g, _h, _a, _b, _c, _d, _e, _f, W_64_79, 26);
+	R(_f, _g, _h, _a, _b, _c, _d, _e, W_64_79, 27);
+	R(_e, _f, _g, _h, _a, _b, _c, _d, W_64_79, 28);
+	R(_d, _e, _f, _g, _h, _a, _b, _c, W_64_79, 29);
+	R(_c, _d, _e, _f, _g, _h, _a, _b, W_64_79, 30);
+	R(_b, _c, _d, _e, _f, _g, _h, _a, W_64_79, 31);
+
+	ldr %r0, [%sp, #(ctx)];
+	adds RT2lo, RWlo; /* _h = t2 + Maj(_a,_b,_c) */
+	ldr %r1, [%sp, #(data)];
+	adc RT2hi, RWhi;
+
+	ldm %r0, {RT1lo,RT1hi,RWlo,RWhi,RT3lo,RT3hi,RT4lo,RT4hi}
+	adds RT1lo, RT2lo;
+	ldr RT2lo, [%sp, #(_b + 0)];
+	adc  RT1hi, RT2hi;
+	ldr RT2hi, [%sp, #(_b + 4)];
+	adds RWlo, RT2lo;
+	ldr RT2lo, [%sp, #(_c + 0)];
+	adc  RWhi, RT2hi;
+	ldr RT2hi, [%sp, #(_c + 4)];
+	adds RT3lo, RT2lo;
+	ldr RT2lo, [%sp, #(_d + 0)];
+	adc  RT3hi, RT2hi;
+	ldr RT2hi, [%sp, #(_d + 4)];
+	adds RT4lo, RT2lo;
+	ldr RT2lo, [%sp, #(_e + 0)];
+	adc  RT4hi, RT2hi;
+	stm %r0!, {RT1lo,RT1hi,RWlo,RWhi,RT3lo,RT3hi,RT4lo,RT4hi}
+
+	ldr RT2hi, [%sp, #(_e + 4)];
+	ldm %r0, {RT1lo,RT1hi,RWlo,RWhi,RT3lo,RT3hi,RT4lo,RT4hi}
+	adds RT1lo, RT2lo;
+	ldr RT2lo, [%sp, #(_f + 0)];
+	adc  RT1hi, RT2hi;
+	ldr RT2hi, [%sp, #(_f + 4)];
+	adds RWlo, RT2lo;
+	ldr RT2lo, [%sp, #(_g + 0)];
+	adc  RWhi, RT2hi;
+	ldr RT2hi, [%sp, #(_g + 4)];
+	adds RT3lo, RT2lo;
+	ldr RT2lo, [%sp, #(_h + 0)];
+	adc  RT3hi, RT2hi;
+	ldr RT2hi, [%sp, #(_h + 4)];
+	adds RT4lo, RT2lo;
+	adc  RT4hi, RT2hi;
+	stm %r0, {RT1lo,RT1hi,RWlo,RWhi,RT3lo,RT3hi,RT4lo,RT4hi}
+	sub %r0, %r0, #(4 * 8);
+	ldr RWlo, [%sp, #nblks];
+
+	sub RK, #(80 * 8);
+	subs RWlo, #1;
+	bne .Loop_blocks;
+
+.Ldone:
+	mov %r0, #STACK_MAX;
+__out:
+	add %sp, %sp, #STACK_MAX;
+	pop {%r4-%r11, %ip, %pc};
+.size _gcry_sha512_transform_arm,.-_gcry_sha512_transform_arm;
+
+#endif
+#endif
diff --git a/cipher/sha512.c b/cipher/sha512.c
index 1196db9..5b25965 100644
--- a/cipher/sha512.c
+++ b/cipher/sha512.c
@@ -66,6 +66,13 @@
 #endif /*ENABLE_NEON_SUPPORT*/
 
 
+/* USE_ARM_ASM indicates whether to enable ARM assembly code. */
+#undef USE_ARM_ASM
+#if defined(__ARMEL__) && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS)
+# define USE_ARM_ASM 1
+#endif
+
+
 /* USE_SSSE3 indicates whether to compile with Intel SSSE3 code. */
 #undef USE_SSSE3
 #if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_SSSE3) && \
@@ -204,36 +211,6 @@ sha384_init (void *context, unsigned int flags)
 }
 
 
-static inline u64
-ROTR (u64 x, u64 n)
-{
-  return ((x >> n) | (x << (64 - n)));
-}
-
-static inline u64
-Ch (u64 x, u64 y, u64 z)
-{
-  return ((x & y) ^ ( ~x & z));
-}
-
-static inline u64
-Maj (u64 x, u64 y, u64 z)
-{
-  return ((x & y) ^ (x & z) ^ (y & z));
-}
-
-static inline u64
-Sum0 (u64 x)
-{
-  return (ROTR (x, 28) ^ ROTR (x, 34) ^ ROTR (x, 39));
-}
-
-static inline u64
-Sum1 (u64 x)
-{
-  return (ROTR (x, 14) ^ ROTR (x, 18) ^ ROTR (x, 41));
-}
-
 static const u64 k[] =
   {
     U64_C(0x428a2f98d728ae22), U64_C(0x7137449123ef65cd),
@@ -278,6 +255,38 @@ static const u64 k[] =
     U64_C(0x5fcb6fab3ad6faec), U64_C(0x6c44198c4a475817)
   };
 
+#ifndef USE_ARM_ASM
+
+static inline u64
+ROTR (u64 x, u64 n)
+{
+  return ((x >> n) | (x << (64 - n)));
+}
+
+static inline u64
+Ch (u64 x, u64 y, u64 z)
+{
+  return ((x & y) ^ ( ~x & z));
+}
+
+static inline u64
+Maj (u64 x, u64 y, u64 z)
+{
+  return ((x & y) ^ (x & z) ^ (y & z));
+}
+
+static inline u64
+Sum0 (u64 x)
+{
+  return (ROTR (x, 28) ^ ROTR (x, 34) ^ ROTR (x, 39));
+}
+
+static inline u64
+Sum1 (u64 x)
+{
+  return (ROTR (x, 14) ^ ROTR (x, 18) ^ ROTR (x, 41));
+}
+
 /****************
  * Transform the message W which consists of 16 64-bit-words
  */
@@ -304,7 +313,6 @@ transform_blk (SHA512_STATE *hd, const unsigned char *data)
 #define S0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
 #define S1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
 
-
   for (t = 0; t < 80 - 16; )
     {
       u64 t1, t2;
@@ -545,7 +553,7 @@ transform_blk (SHA512_STATE *hd, const unsigned char *data)
   return /* burn_stack */ (8 + 16) * sizeof(u64) + sizeof(u32) +
                           3 * sizeof(void*);
 }
-
+#endif /*!USE_ARM_ASM*/
 
 /* AMD64 assembly implementations use SystemV ABI, ABI conversion and additional
  * stack to store XMM6-XMM15 needed on Win64. */
@@ -568,6 +576,12 @@ void _gcry_sha512_transform_armv7_neon (SHA512_STATE *hd,
 					const u64 k[], size_t num_blks);
 #endif
 
+#ifdef USE_ARM_ASM
+unsigned int _gcry_sha512_transform_arm (SHA512_STATE *hd,
+					 const unsigned char *data,
+					 const u64 k[], size_t num_blks);
+#endif
+
 #ifdef USE_SSSE3
 unsigned int _gcry_sha512_transform_amd64_ssse3(const void *input_data,
                                                 void *state,
@@ -622,6 +636,9 @@ transform (void *context, const unsigned char *data, size_t nblks)
     }
 #endif
 
+#ifdef USE_ARM_ASM
+  burn = _gcry_sha512_transform_arm (&ctx->state, data, k, nblks);
+#else
   do
     {
       burn = transform_blk (&ctx->state, data) + 3 * sizeof(void*);
@@ -636,6 +653,7 @@ transform (void *context, const unsigned char *data, size_t nblks)
    */
   burn += ASM_EXTRA_STACK;
 #endif
+#endif
 
   return burn;
 }
diff --git a/configure.ac b/configure.ac
index ed37ab5..8b50360 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2086,6 +2086,10 @@ if test "$found" = "1" ; then
          GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha512-avx-amd64.lo"
          GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha512-avx2-bmi2-amd64.lo"
       ;;
+      arm*-*-*)
+         # Build with the assembly implementation
+         GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha512-arm.lo"
+      ;;
    esac
 
    if test x"$neonsupport" = xyes ; then