[git] GCRYPT - branch, master, updated. libgcrypt-1.6.0-312-g8353884

Mon Feb 8 19:16:52 CET 2016

This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "The GNU crypto library".

The branch, master has been updated
       via  8353884bc65c820d5bcacaf1ac23cdee72091a09 (commit)
      from  b8b3361504950689ef1e779fb3357cecf8a9f739 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
commit 8353884bc65c820d5bcacaf1ac23cdee72091a09
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date:   Mon Feb 8 20:13:38 2016 +0200

    Add ARM assembly implementation of SHA-512
    
    * cipher/Makefile.am: Add 'sha512-arm.S'.
    * cipher/sha512-arm.S: New.
    * cipher/sha512.c (USE_ARM_ASM): New.
    (_gcry_sha512_transform_arm): New.
    (transform) [USE_ARM_ASM]: Use ARM assembly implementation instead of
    generic.
    * configure.ac: Add 'sha512-arm.lo'.
    --
    
    Benchmark on Cortex-A8 (armv6, 1008 Mhz):
    
     Before:
                     |  nanosecs/byte   mebibytes/sec   cycles/byte
      SHA512         |     112.0 ns/B      8.52 MiB/s     112.9 c/B
    
     After (3.3x faster):
                     |  nanosecs/byte   mebibytes/sec   cycles/byte
      SHA512         |     34.01 ns/B     28.04 MiB/s     34.28 c/B
    
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 88c8fbf..65d7afb 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -89,7 +89,7 @@ sha1.c sha1-ssse3-amd64.S sha1-avx-amd64.S sha1-avx-bmi2-amd64.S \
   sha1-armv7-neon.S \
 sha256.c sha256-ssse3-amd64.S sha256-avx-amd64.S sha256-avx2-bmi2-amd64.S \
 sha512.c sha512-ssse3-amd64.S sha512-avx-amd64.S sha512-avx2-bmi2-amd64.S \
-  sha512-armv7-neon.S \
+  sha512-armv7-neon.S sha512-arm.S \
 keccak.c keccak_permute_32.h keccak_permute_64.h keccak-armv7-neon.S \
 stribog.c \
 tiger.c \
diff --git a/cipher/sha512-arm.S b/cipher/sha512-arm.S
new file mode 100644
index 0000000..28f156e
--- /dev/null
+++ b/cipher/sha512-arm.S
@@ -0,0 +1,465 @@
+/* sha512-arm.S  -  ARM assembly implementation of SHA-512 transform
+ *
+ * Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+#include <config.h>
+
+#if defined(__ARMEL__)
+#ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS
+
+.text
+
+.syntax unified
+.arm
+
+/* structure of SHA512_CONTEXT */
+#define hd_a 0
+#define hd_b ((hd_a) + 8)
+#define hd_c ((hd_b) + 8)
+#define hd_d ((hd_c) + 8)
+#define hd_e ((hd_d) + 8)
+#define hd_f ((hd_e) + 8)
+#define hd_g ((hd_f) + 8)
+#define hd_h ((hd_g) + 8)
+
+/* register macros */
+#define RK    %r2
+
+#define RElo %r0
+#define REhi %r1
+
+#define RT1lo %r3
+#define RT1hi %r4
+#define RT2lo %r5
+#define RT2hi %r6
+#define RWlo  %r7
+#define RWhi  %r8
+#define RT3lo %r9
+#define RT3hi %r10
+#define RT4lo %r11
+#define RT4hi %ip
+
+#define RRND  %lr
+
+/* variable offsets in stack */
+#define ctx (0)
+#define data ((ctx) + 4)
+#define nblks ((data) + 4)
+#define _a ((nblks) + 4)
+#define _b ((_a) + 8)
+#define _c ((_b) + 8)
+#define _d ((_c) + 8)
+#define _e ((_d) + 8)
+#define _f ((_e) + 8)
+#define _g ((_f) + 8)
+#define _h ((_g) + 8)
+
+#define w(i) ((_h) + 8 + ((i) % 16) * 8)
+
+#define STACK_MAX (w(15) + 8)
+
+/* helper macros */
+#define ldr_unaligned_be(rout, rsrc, offs, rtmp) \
+    ldrb rout, [rsrc, #((offs) + 3)]; \
+    ldrb rtmp, [rsrc, #((offs) + 2)]; \
+    orr rout, rout, rtmp, lsl #8; \
+    ldrb rtmp, [rsrc, #((offs) + 1)]; \
+    orr rout, rout, rtmp, lsl #16; \
+    ldrb rtmp, [rsrc, #((offs) + 0)]; \
+    orr rout, rout, rtmp, lsl #24;
+
+#ifdef __ARMEL__
+    /* bswap on little-endian */
+#ifdef HAVE_ARM_ARCH_V6
+    #define be_to_host(reg, rtmp) \
+	rev reg, reg;
+#else
+    #define be_to_host(reg, rtmp) \
+	eor rtmp, reg, reg, ror #16; \
+	mov rtmp, rtmp, lsr #8; \
+	bic rtmp, rtmp, #65280; \
+	eor reg, rtmp, reg, ror #8;
+#endif
+#else
+    /* nop on big-endian */
+    #define be_to_host(reg, rtmp) /*_*/
+#endif
+
+#define host_to_host(x, y) /*_*/
+
+#define read_u64_aligned_4(rin, offs, lo0, hi0, lo1, hi1, lo2, hi2, lo3, hi3, convert, rtmp) \
+    ldr lo0, [rin, #((offs) + 0 * 8 + 4)]; \
+    ldr hi0, [rin, #((offs) + 0 * 8 + 0)]; \
+    ldr lo1, [rin, #((offs) + 1 * 8 + 4)]; \
+    ldr hi1, [rin, #((offs) + 1 * 8 + 0)]; \
+    ldr lo2, [rin, #((offs) + 2 * 8 + 4)]; \
+    convert(lo0, rtmp); \
+    ldr hi2, [rin, #((offs) + 2 * 8 + 0)]; \
+    convert(hi0, rtmp); \
+    ldr lo3, [rin, #((offs) + 3 * 8 + 4)]; \
+    convert(lo1, rtmp); \
+    ldr hi3, [rin, #((offs) + 3 * 8 + 0)]; \
+    convert(hi1, rtmp); \
+    convert(lo2, rtmp); \
+    convert(hi2, rtmp); \
+    convert(lo3, rtmp); \
+    convert(hi3, rtmp);
+
+#define read_be64_aligned_4(rin, offs, lo0, hi0, lo1, hi1, lo2, hi2, lo3, hi3, rtmp0) \
+    read_u64_aligned_4(rin, offs, lo0, hi0, lo1, hi1, lo2, hi2, lo3, hi3, be_to_host, rtmp0)
+
+/* need to handle unaligned reads by byte reads */
+#define read_be64_unaligned_4(rin, offs, lo0, hi0, lo1, hi1, lo2, hi2, lo3, hi3, rtmp0) \
+    ldr_unaligned_be(lo0, rin, (offs) + 0 * 8 + 4, rtmp0); \
+    ldr_unaligned_be(hi0, rin, (offs) + 0 * 8 + 0, rtmp0); \
+    ldr_unaligned_be(lo1, rin, (offs) + 1 * 8 + 4, rtmp0); \
+    ldr_unaligned_be(hi1, rin, (offs) + 1 * 8 + 0, rtmp0); \
+    ldr_unaligned_be(lo2, rin, (offs) + 2 * 8 + 4, rtmp0); \
+    ldr_unaligned_be(hi2, rin, (offs) + 2 * 8 + 0, rtmp0); \
+    ldr_unaligned_be(lo3, rin, (offs) + 3 * 8 + 4, rtmp0); \
+    ldr_unaligned_be(hi3, rin, (offs) + 3 * 8 + 0, rtmp0);
+
+/***********************************************************************
+ * ARM assembly implementation of sha512 transform
+ ***********************************************************************/
+
+/* Round function */
+
+#define R(_a,_b,_c,_d,_e,_f,_g,_h,W,wi) \
+    /* Message expansion, t1 = _h + w[i] */ \
+    W(_a,_h,wi); \
+    \
+    /* w = Sum1(_e) */ \
+    mov RWlo, RElo, lsr#14; \
+    ldm RK!, {RT2lo-RT2hi}; \
+    mov RWhi, REhi, lsr#14; \
+    eor RWlo, RWlo, RElo, lsr#18; \
+    eor RWhi, RWhi, REhi, lsr#18; \
+    ldr RT3lo, [%sp, #(_f)]; \
+    adds RT1lo, RT2lo; /* t1 += K */ \
+    ldr RT3hi, [%sp, #(_f) + 4]; \
+    adc RT1hi, RT2hi; \
+    ldr RT4lo, [%sp, #(_g)]; \
+    eor RWlo, RWlo, RElo, lsl#23; \
+    ldr RT4hi, [%sp, #(_g) + 4]; \
+    eor RWhi, RWhi, REhi, lsl#23; \
+    eor RWlo, RWlo, REhi, lsl#18; \
+    eor RWhi, RWhi, RElo, lsl#18; \
+    eor RWlo, RWlo, REhi, lsl#14; \
+    eor RWhi, RWhi, RElo, lsl#14; \
+    eor RWlo, RWlo, REhi, lsr#9; \
+    eor RWhi, RWhi, RElo, lsr#9; \
+    \
+    /* Cho(_e,_f,_g) => (_e & _f) ^ (~_e & _g) */ \
+    adds RT1lo, RWlo; /* t1 += Sum1(_e) */ \
+    and RT3lo, RT3lo, RElo; \
+    adc RT1hi, RWhi; \
+    and RT3hi, RT3hi, REhi; \
+    bic RT4lo, RT4lo, RElo; \
+    bic RT4hi, RT4hi, REhi; \
+    eor RT3lo, RT3lo, RT4lo; \
+    eor RT3hi, RT3hi, RT4hi; \
+    \
+    /* Load D */ \
+    /* t1 += Cho(_e,_f,_g) */ \
+    ldr RElo, [%sp, #(_d)]; \
+    adds RT1lo, RT3lo; \
+    ldr REhi, [%sp, #(_d) + 4]; \
+    adc RT1hi, RT3hi; \
+    \
+    /* Load A */ \
+    ldr RT3lo, [%sp, #(_a)]; \
+    \
+    /* _d += t1 */ \
+    adds RElo, RT1lo; \
+    ldr RT3hi, [%sp, #(_a) + 4]; \
+    adc REhi, RT1hi; \
+    \
+    /* Store D */ \
+    str RElo, [%sp, #(_d)]; \
+    \
+    /* t2 = Sum0(_a) */ \
+    mov RT2lo, RT3lo, lsr#28; \
+    str REhi, [%sp, #(_d) + 4]; \
+    mov RT2hi, RT3hi, lsr#28; \
+    ldr RWlo, [%sp, #(_b)]; \
+    eor RT2lo, RT2lo, RT3lo, lsl#30; \
+    ldr RWhi, [%sp, #(_b) + 4]; \
+    eor RT2hi, RT2hi, RT3hi, lsl#30; \
+    eor RT2lo, RT2lo, RT3lo, lsl#25; \
+    eor RT2hi, RT2hi, RT3hi, lsl#25; \
+    eor RT2lo, RT2lo, RT3hi, lsl#4; \
+    eor RT2hi, RT2hi, RT3lo, lsl#4; \
+    eor RT2lo, RT2lo, RT3hi, lsr#2; \
+    eor RT2hi, RT2hi, RT3lo, lsr#2; \
+    eor RT2lo, RT2lo, RT3hi, lsr#7; \
+    eor RT2hi, RT2hi, RT3lo, lsr#7; \
+    \
+    /* t2 += t1 */ \
+    adds RT2lo, RT1lo; \
+    ldr RT1lo, [%sp, #(_c)]; \
+    adc RT2hi, RT1hi; \
+    \
+    /* Maj(_a,_b,_c) => ((_a & _b) ^ (_c & (_a ^ _b))) */ \
+    ldr RT1hi, [%sp, #(_c) + 4]; \
+    and RT4lo, RWlo, RT3lo; \
+    and RT4hi, RWhi, RT3hi; \
+    eor RWlo, RWlo, RT3lo; \
+    eor RWhi, RWhi, RT3hi; \
+    and RWlo, RWlo, RT1lo; \
+    and RWhi, RWhi, RT1hi; \
+    eor RWlo, RWlo, RT4lo; \
+    eor RWhi, RWhi, RT4hi; \
+
+/* Message expansion */
+
+#define W_0_63(_a,_h,i) \
+    ldr RT3lo, [%sp, #(w(i-2))]; \
+    adds RT2lo, RWlo; /* _h = t2 + Maj(_a,_b,_c) */ \
+    ldr RT3hi, [%sp, #(w(i-2)) + 4]; \
+    adc RT2hi, RWhi; \
+    /* nw = S1(w[i-2]) */ \
+    ldr RT1lo, [%sp, #(_h)]; /* Load H */ \
+    mov RWlo, RT3lo, lsr#19; \
+    str RT2lo, [%sp, #(_a)]; \
+    eor RWlo, RWlo, RT3lo, lsl#3; \
+    ldr RT1hi, [%sp, #(_h) + 4]; \
+    mov RWhi, RT3hi, lsr#19; \
+    ldr RT2lo, [%sp, #(w(i-7))]; \
+    eor RWhi, RWhi, RT3hi, lsl#3; \
+    str RT2hi, [%sp, #(_a) + 4]; \
+    eor RWlo, RWlo, RT3lo, lsr#6; \
+    ldr RT2hi, [%sp, #(w(i-7)) + 4]; \
+    eor RWhi, RWhi, RT3hi, lsr#6; \
+    eor RWlo, RWlo, RT3hi, lsl#13; \
+    eor RWhi, RWhi, RT3lo, lsl#13; \
+    eor RWlo, RWlo, RT3hi, lsr#29; \
+    eor RWhi, RWhi, RT3lo, lsr#29; \
+    ldr RT3lo, [%sp, #(w(i-15))]; \
+    eor RWlo, RWlo, RT3hi, lsl#26; \
+    ldr RT3hi, [%sp, #(w(i-15)) + 4]; \
+    \
+    adds RT2lo, RWlo; /* nw += w[i-7] */ \
+    ldr RWlo, [%sp, #(w(i-16))]; \
+    adc RT2hi, RWhi; \
+    mov RT4lo, RT3lo, lsr#1; /* S0(w[i-15]) */ \
+    ldr RWhi, [%sp, #(w(i-16)) + 4]; \
+    mov RT4hi, RT3hi, lsr#1; \
+    adds RT2lo, RWlo; /* nw += w[i-16] */ \
+    eor RT4lo, RT4lo, RT3lo, lsr#8; \
+    eor RT4hi, RT4hi, RT3hi, lsr#8; \
+    eor RT4lo, RT4lo, RT3lo, lsr#7; \
+    eor RT4hi, RT4hi, RT3hi, lsr#7; \
+    eor RT4lo, RT4lo, RT3hi, lsl#31; \
+    eor RT4hi, RT4hi, RT3lo, lsl#31; \
+    eor RT4lo, RT4lo, RT3hi, lsl#24; \
+    eor RT4hi, RT4hi, RT3lo, lsl#24; \
+    eor RT4lo, RT4lo, RT3hi, lsl#25; \
+    adc RT2hi, RWhi; \
+    \
+    /* nw += S0(w[i-15]) */ \
+    adds RT2lo, RT4lo; \
+    adc RT2hi, RT4hi; \
+    \
+    /* w[0] = nw */ \
+    str RT2lo, [%sp, #(w(i))]; \
+    adds RT1lo, RWlo; \
+    str RT2hi, [%sp, #(w(i)) + 4]; \
+    adc RT1hi, RWhi;
+
+#define W_64_79(_a,_h,i) \
+    adds RT2lo, RWlo; /* _h = t2 + Maj(_a,_b,_c) */ \
+    ldr RWlo, [%sp, #(w(i-16))]; \
+    adc RT2hi, RWhi; \
+    ldr RWhi, [%sp, #(w(i-16)) + 4]; \
+    ldr RT1lo, [%sp, #(_h)]; /* Load H */ \
+    ldr RT1hi, [%sp, #(_h) + 4]; \
+    str RT2lo, [%sp, #(_a)]; \
+    str RT2hi, [%sp, #(_a) + 4]; \
+    adds RT1lo, RWlo; \
+    adc RT1hi, RWhi;
+
+.align 3
+.globl _gcry_sha512_transform_arm
+.type  _gcry_sha512_transform_arm,%function;
+
+_gcry_sha512_transform_arm:
+	/* Input:
+	 *	%r0: SHA512_CONTEXT
+	 *	%r1: data
+	 *	%r2: u64 k[] constants
+	 *	%r3: nblks
+	 */
+	push {%r4-%r11, %ip, %lr};
+	sub %sp, %sp, #STACK_MAX;
+	movs RWlo, %r3;
+	str %r0, [%sp, #(ctx)];
+
+	beq .Ldone;
+
+.Loop_blocks:
+	str RWlo, [%sp, #nblks];
+
+	/* Load context to stack */
+	add RWhi, %sp, #(_a);
+	ldm %r0!,  {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi}
+	stm RWhi!, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi}
+	ldm %r0,  {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi}
+	stm RWhi, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi}
+
+	/* Load input to w[16] */
+#ifndef __ARM_FEATURE_UNALIGNED
+	/* test if data is unaligned */
+	tst %r1, #3;
+	beq 1f;
+
+	/* unaligned load */
+	add RWhi, %sp, #(w(0));
+	read_be64_unaligned_4(%r1, 0 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo);
+	stm RWhi!, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi}
+
+	read_be64_unaligned_4(%r1, 4 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo);
+	stm RWhi!, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi}
+
+	read_be64_unaligned_4(%r1, 8 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo);
+	stm RWhi!, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi}
+
+	read_be64_unaligned_4(%r1, 12 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo);
+	b 2f;
+#endif
+1:
+	/* aligned load */
+	add RWhi, %sp, #(w(0));
+	read_be64_aligned_4(%r1, 0 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo);
+	stm RWhi!, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi}
+
+	read_be64_aligned_4(%r1, 4 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo);
+	stm RWhi!, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi}
+
+	read_be64_aligned_4(%r1, 8 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo);
+	stm RWhi!, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi}
+
+	read_be64_aligned_4(%r1, 12 * 8, RT1lo, RT1hi, RT2lo, RT2hi, RT3lo, RT3hi, RT4lo, RT4hi, RWlo);
+2:
+	add %r1, #(16 * 8);
+	stm RWhi, {RT1lo,RT1hi,RT2lo,RT2hi,RT3lo,RT3hi,RT4lo,RT4hi}
+	str %r1, [%sp, #(data)];
+
+	/* preload E & A */
+	ldr RElo, [%sp, #(_e)];
+	ldr REhi, [%sp, #(_e) + 4];
+	mov RWlo, #0;
+	ldr RT2lo, [%sp, #(_a)];
+	mov RRND, #(80-16);
+	ldr RT2hi, [%sp, #(_a) + 4];
+	mov RWhi, #0;
+
+.Loop_rounds:
+	R(_a, _b, _c, _d, _e, _f, _g, _h, W_0_63, 16);
+	R(_h, _a, _b, _c, _d, _e, _f, _g, W_0_63, 17);
+	R(_g, _h, _a, _b, _c, _d, _e, _f, W_0_63, 18);
+	R(_f, _g, _h, _a, _b, _c, _d, _e, W_0_63, 19);
+	R(_e, _f, _g, _h, _a, _b, _c, _d, W_0_63, 20);
+	R(_d, _e, _f, _g, _h, _a, _b, _c, W_0_63, 21);
+	R(_c, _d, _e, _f, _g, _h, _a, _b, W_0_63, 22);
+	R(_b, _c, _d, _e, _f, _g, _h, _a, W_0_63, 23);
+	R(_a, _b, _c, _d, _e, _f, _g, _h, W_0_63, 24);
+	R(_h, _a, _b, _c, _d, _e, _f, _g, W_0_63, 25);
+	R(_g, _h, _a, _b, _c, _d, _e, _f, W_0_63, 26);
+	R(_f, _g, _h, _a, _b, _c, _d, _e, W_0_63, 27);
+	R(_e, _f, _g, _h, _a, _b, _c, _d, W_0_63, 28);
+	R(_d, _e, _f, _g, _h, _a, _b, _c, W_0_63, 29);
+	R(_c, _d, _e, _f, _g, _h, _a, _b, W_0_63, 30);
+	R(_b, _c, _d, _e, _f, _g, _h, _a, W_0_63, 31);
+
+	subs RRND, #16;
+	bne .Loop_rounds;
+
+	R(_a, _b, _c, _d, _e, _f, _g, _h, W_64_79, 16);
+	R(_h, _a, _b, _c, _d, _e, _f, _g, W_64_79, 17);
+	R(_g, _h, _a, _b, _c, _d, _e, _f, W_64_79, 18);
+	R(_f, _g, _h, _a, _b, _c, _d, _e, W_64_79, 19);
+	R(_e, _f, _g, _h, _a, _b, _c, _d, W_64_79, 20);
+	R(_d, _e, _f, _g, _h, _a, _b, _c, W_64_79, 21);
+	R(_c, _d, _e, _f, _g, _h, _a, _b, W_64_79, 22);
+	R(_b, _c, _d, _e, _f, _g, _h, _a, W_64_79, 23);
+	R(_a, _b, _c, _d, _e, _f, _g, _h, W_64_79, 24);
+	R(_h, _a, _b, _c, _d, _e, _f, _g, W_64_79, 25);
+	R(_g, _h, _a, _b, _c, _d, _e, _f, W_64_79, 26);
+	R(_f, _g, _h, _a, _b, _c, _d, _e, W_64_79, 27);
+	R(_e, _f, _g, _h, _a, _b, _c, _d, W_64_79, 28);
+	R(_d, _e, _f, _g, _h, _a, _b, _c, W_64_79, 29);
+	R(_c, _d, _e, _f, _g, _h, _a, _b, W_64_79, 30);
+	R(_b, _c, _d, _e, _f, _g, _h, _a, W_64_79, 31);
+
+	ldr %r0, [%sp, #(ctx)];
+	adds RT2lo, RWlo; /* _h = t2 + Maj(_a,_b,_c) */
+	ldr %r1, [%sp, #(data)];
+	adc RT2hi, RWhi;
+
+	ldm %r0, {RT1lo,RT1hi,RWlo,RWhi,RT3lo,RT3hi,RT4lo,RT4hi}
+	adds RT1lo, RT2lo;
+	ldr RT2lo, [%sp, #(_b + 0)];
+	adc  RT1hi, RT2hi;
+	ldr RT2hi, [%sp, #(_b + 4)];
+	adds RWlo, RT2lo;
+	ldr RT2lo, [%sp, #(_c + 0)];
+	adc  RWhi, RT2hi;
+	ldr RT2hi, [%sp, #(_c + 4)];
+	adds RT3lo, RT2lo;
+	ldr RT2lo, [%sp, #(_d + 0)];
+	adc  RT3hi, RT2hi;
+	ldr RT2hi, [%sp, #(_d + 4)];
+	adds RT4lo, RT2lo;
+	ldr RT2lo, [%sp, #(_e + 0)];
+	adc  RT4hi, RT2hi;
+	stm %r0!, {RT1lo,RT1hi,RWlo,RWhi,RT3lo,RT3hi,RT4lo,RT4hi}
+
+	ldr RT2hi, [%sp, #(_e + 4)];
+	ldm %r0, {RT1lo,RT1hi,RWlo,RWhi,RT3lo,RT3hi,RT4lo,RT4hi}
+	adds RT1lo, RT2lo;
+	ldr RT2lo, [%sp, #(_f + 0)];
+	adc  RT1hi, RT2hi;
+	ldr RT2hi, [%sp, #(_f + 4)];
+	adds RWlo, RT2lo;
+	ldr RT2lo, [%sp, #(_g + 0)];
+	adc  RWhi, RT2hi;
+	ldr RT2hi, [%sp, #(_g + 4)];
+	adds RT3lo, RT2lo;
+	ldr RT2lo, [%sp, #(_h + 0)];
+	adc  RT3hi, RT2hi;
+	ldr RT2hi, [%sp, #(_h + 4)];
+	adds RT4lo, RT2lo;
+	adc  RT4hi, RT2hi;
+	stm %r0, {RT1lo,RT1hi,RWlo,RWhi,RT3lo,RT3hi,RT4lo,RT4hi}
+	sub %r0, %r0, #(4 * 8);
+	ldr RWlo, [%sp, #nblks];
+
+	sub RK, #(80 * 8);
+	subs RWlo, #1;
+	bne .Loop_blocks;
+
+.Ldone:
+	mov %r0, #STACK_MAX;
+__out:
+	add %sp, %sp, #STACK_MAX;
+	pop {%r4-%r11, %ip, %pc};
+.size _gcry_sha512_transform_arm,.-_gcry_sha512_transform_arm;
+
+#endif
+#endif
diff --git a/cipher/sha512.c b/cipher/sha512.c
index 1196db9..5b25965 100644
--- a/cipher/sha512.c
+++ b/cipher/sha512.c
@@ -66,6 +66,13 @@
 #endif /*ENABLE_NEON_SUPPORT*/
 
 
+/* USE_ARM_ASM indicates whether to enable ARM assembly code. */
+#undef USE_ARM_ASM
+#if defined(__ARMEL__) && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS)
+# define USE_ARM_ASM 1
+#endif
+
+
 /* USE_SSSE3 indicates whether to compile with Intel SSSE3 code. */
 #undef USE_SSSE3
 #if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_SSSE3) && \
@@ -204,36 +211,6 @@ sha384_init (void *context, unsigned int flags)
 }
 
 
-static inline u64
-ROTR (u64 x, u64 n)
-{
-  return ((x >> n) | (x << (64 - n)));
-}
-
-static inline u64
-Ch (u64 x, u64 y, u64 z)
-{
-  return ((x & y) ^ ( ~x & z));
-}
-
-static inline u64
-Maj (u64 x, u64 y, u64 z)
-{
-  return ((x & y) ^ (x & z) ^ (y & z));
-}
-
-static inline u64
-Sum0 (u64 x)
-{
-  return (ROTR (x, 28) ^ ROTR (x, 34) ^ ROTR (x, 39));
-}
-
-static inline u64
-Sum1 (u64 x)
-{
-  return (ROTR (x, 14) ^ ROTR (x, 18) ^ ROTR (x, 41));
-}
-
 static const u64 k[] =
   {
     U64_C(0x428a2f98d728ae22), U64_C(0x7137449123ef65cd),
@@ -278,6 +255,38 @@ static const u64 k[] =
     U64_C(0x5fcb6fab3ad6faec), U64_C(0x6c44198c4a475817)
   };
 
+#ifndef USE_ARM_ASM
+
+static inline u64
+ROTR (u64 x, u64 n)
+{
+  return ((x >> n) | (x << (64 - n)));
+}
+
+static inline u64
+Ch (u64 x, u64 y, u64 z)
+{
+  return ((x & y) ^ ( ~x & z));
+}
+
+static inline u64
+Maj (u64 x, u64 y, u64 z)
+{
+  return ((x & y) ^ (x & z) ^ (y & z));
+}
+
+static inline u64
+Sum0 (u64 x)
+{
+  return (ROTR (x, 28) ^ ROTR (x, 34) ^ ROTR (x, 39));
+}
+
+static inline u64
+Sum1 (u64 x)
+{
+  return (ROTR (x, 14) ^ ROTR (x, 18) ^ ROTR (x, 41));
+}
+
 /****************
  * Transform the message W which consists of 16 64-bit-words
  */
@@ -304,7 +313,6 @@ transform_blk (SHA512_STATE *hd, const unsigned char *data)
 #define S0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
 #define S1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
 
-
   for (t = 0; t < 80 - 16; )
     {
       u64 t1, t2;
@@ -545,7 +553,7 @@ transform_blk (SHA512_STATE *hd, const unsigned char *data)
   return /* burn_stack */ (8 + 16) * sizeof(u64) + sizeof(u32) +
                           3 * sizeof(void*);
 }
-
+#endif /*!USE_ARM_ASM*/
 
 /* AMD64 assembly implementations use SystemV ABI, ABI conversion and additional
  * stack to store XMM6-XMM15 needed on Win64. */
@@ -568,6 +576,12 @@ void _gcry_sha512_transform_armv7_neon (SHA512_STATE *hd,
 					const u64 k[], size_t num_blks);
 #endif
 
+#ifdef USE_ARM_ASM
+unsigned int _gcry_sha512_transform_arm (SHA512_STATE *hd,
+					 const unsigned char *data,
+					 const u64 k[], size_t num_blks);
+#endif
+
 #ifdef USE_SSSE3
 unsigned int _gcry_sha512_transform_amd64_ssse3(const void *input_data,
                                                 void *state,
@@ -622,6 +636,9 @@ transform (void *context, const unsigned char *data, size_t nblks)
     }
 #endif
 
+#ifdef USE_ARM_ASM
+  burn = _gcry_sha512_transform_arm (&ctx->state, data, k, nblks);
+#else
   do
     {
       burn = transform_blk (&ctx->state, data) + 3 * sizeof(void*);
@@ -636,6 +653,7 @@ transform (void *context, const unsigned char *data, size_t nblks)
    */
   burn += ASM_EXTRA_STACK;
 #endif
+#endif
 
   return burn;
 }
diff --git a/configure.ac b/configure.ac
index ed37ab5..8b50360 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2086,6 +2086,10 @@ if test "$found" = "1" ; then
          GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha512-avx-amd64.lo"
          GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha512-avx2-bmi2-amd64.lo"
       ;;
+      arm*-*-*)
+         # Build with the assembly implementation
+         GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha512-arm.lo"
+      ;;
    esac
 
    if test x"$neonsupport" = xyes ; then

-----------------------------------------------------------------------

Summary of changes:
 cipher/Makefile.am  |   2 +-
 cipher/sha512-arm.S | 465 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 cipher/sha512.c     |  82 +++++----
 configure.ac        |   4 +
 4 files changed, 520 insertions(+), 33 deletions(-)
 create mode 100644 cipher/sha512-arm.S


hooks/post-receive
-- 
The GNU crypto library
http://git.gnupg.org