[git] GCRYPT - branch, master, updated. libgcrypt-1.5.0-202-g99d1554

Sun Sep 1 16:14:40 CEST 2013

This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "The GNU crypto library".

The branch, master has been updated
       via  99d15543b8d94a8f1ef66c6ccb862b0ce82c514d (commit)
       via  03da7f8ba3ec24d4639a2bcebbc0d9d831734c08 (commit)
       via  9c95be105f518d18407115c2c06893857c24b116 (commit)
       via  7b0ebe69fe35f2ee13e1e1beb2766a1eaadb7f0c (commit)
      from  e9b711e6ddb480a71d2996465074e436c752c005 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
commit 99d15543b8d94a8f1ef66c6ccb862b0ce82c514d
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date:   Sat Aug 31 12:48:31 2013 +0300

    sha512: add ARM/NEON assembly version of transform function
    
    * cipher/Makefile.am: Add 'sha512-armv7-neon.S'.
    * cipher/sha512-armv7-neon.S: New file.
    * cipher/sha512.c (USE_ARM_NEON_ASM): New macro.
    (SHA512_CONTEXT) [USE_ARM_NEON_ASM]: Add 'use_neon'.
    (sha512_init, sha384_init) [USE_ARM_NEON_ASM]: Enable 'use_neon' if
    CPU supports NEON instructions.
    (k): Round constant array moved outside of 'transform' function.
    (__transform): Renamed from 'transform' function.
    [USE_ARM_NEON_ASM] (_gcry_sha512_transform_armv7_neon): New prototype.
    (transform): New wrapper function for different transform versions.
    (sha512_write, sha512_final): Burn stack by the amount returned by
    transform function.
    * configure.ac (sha512) [neonsupport]: Add 'sha512-armv7-neon.lo'.
    --
    
    Add NEON assembly for the transform function for faster SHA512 on ARM. The
    major speed-up is thanks to the 64-bit integer registers and the large
    register file, which can hold the full input buffer.
    
    Benchmark results on Cortex-A8, 1 GHz:
    
    Old:
    $ tests/benchmark --hash-repetitions 100 md sha512 sha384
    SHA512       17050ms 18780ms 29120ms 18040ms 17190ms
    SHA384       17130ms 18720ms 29160ms 18090ms 17280ms
    
    New:
    $ tests/benchmark --hash-repetitions 100 md sha512 sha384
    SHA512        3600ms  5070ms 15330ms  4510ms  3480ms
    SHA384        3590ms  5060ms 15350ms  4510ms  3520ms
    
    New vs old:
    SHA512        4.74x   3.70x   1.90x   4.00x   4.94x
    SHA384        4.77x   3.70x   1.90x   4.01x   4.91x
    
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index e233e79..3dd6f88 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -73,7 +73,7 @@ seed.c \
 serpent.c serpent-sse2-amd64.S serpent-avx2-amd64.S \
 sha1.c \
 sha256.c \
-sha512.c \
+sha512.c sha512-armv7-neon.S \
 tiger.c \
 whirlpool.c \
 twofish.c twofish-amd64.S \
diff --git a/cipher/sha512-armv7-neon.S b/cipher/sha512-armv7-neon.S
new file mode 100644
index 0000000..042b15a
--- /dev/null
+++ b/cipher/sha512-armv7-neon.S
@@ -0,0 +1,316 @@
+/* sha512-armv7-neon.S  -  ARM/NEON assembly implementation of SHA-512 transform
+ *
+ * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) && \
+    defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \
+    defined(HAVE_GCC_INLINE_ASM_NEON)
+
+.text
+
+.syntax unified
+.fpu neon
+.arm
+
+/* structure of SHA512_CONTEXT */
+#define hd_a 0
+#define hd_b ((hd_a) + 8)
+#define hd_c ((hd_b) + 8)
+#define hd_d ((hd_c) + 8)
+#define hd_e ((hd_d) + 8)
+#define hd_f ((hd_e) + 8)
+#define hd_g ((hd_f) + 8)
+
+/* register macros */
+#define RK %r2
+
+#define RA d0
+#define RB d1
+#define RC d2
+#define RD d3
+#define RE d4
+#define RF d5
+#define RG d6
+#define RH d7
+
+#define RT0 d8
+#define RT1 d9
+#define RT2 d10
+#define RT3 d11
+#define RT4 d12
+#define RT5 d13
+#define RT6 d14
+#define RT7 d15
+
+#define RW0 d16
+#define RW1 d17
+#define RW2 d18
+#define RW3 d19
+#define RW4 d20
+#define RW5 d21
+#define RW6 d22
+#define RW7 d23
+#define RW8 d24
+#define RW9 d25
+#define RW10 d26
+#define RW11 d27
+#define RW12 d28
+#define RW13 d29
+#define RW14 d30
+#define RW15 d31
+
+#define RW01q q8
+#define RW23q q9
+#define RW45q q10
+#define RW67q q11
+#define RW89q q12
+#define RW1011q q13
+#define RW1213q q14
+#define RW1415q q15
+
+/***********************************************************************
+ * ARM assembly implementation of sha512 transform
+ ***********************************************************************/
+#define round_0_63(ra, rb, rc, rd, re, rf, rg, rh, rw0, rw14, rw9, rw1) \
+	/* t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t]; */ \
+	vshr.u64 RT1, re, #14; \
+	vshl.u64 RT3, re, #64 - 14; \
+	vshr.u64 RT4, re, #18; \
+	vshl.u64 RT5, re, #64 - 18; \
+	veor.64 RT1, RT1, RT3; \
+	vld1.64 {RT0}, [RK]!; \
+	veor.64 RT1, RT1, RT4; \
+	vshr.u64 RT3, re, #41; \
+	vshl.u64 RT4, re, #64 - 41; \
+	veor.64 RT1, RT1, RT5; \
+	vadd.u64 RT0, RT0, rw0; \
+	veor.64 RT1, RT1, RT3; \
+	vand.64 RT2, re, rf; \
+	veor.64 RT1, RT1, RT4; \
+	vbic.64 RT6, rg, re; \
+	\
+	vadd.u64 RT1, RT1, rh; \
+	veor.64 RT2, RT2, RT6; \
+	vshr.u64 rh, ra, #28; \
+	vshl.u64 RT3, ra, #64 - 28; \
+	vadd.u64 RT1, RT1, RT0; \
+	vshr.u64 RT4, ra, #34; \
+	veor.64 rh, rh, RT3; \
+	vshl.u64 RT5, ra, #64 - 34; \
+	vadd.u64 RT1, RT1, RT2; \
+	\
+	/* h = Sum0 (a) + Maj (a, b, c); */ \
+	veor.64 rh, rh, RT4; \
+	vshr.u64 RT3, ra, #39; \
+	vshl.u64 RT4, ra, #64 - 39; \
+	vorr.64 RT6, ra, rb; \
+	vand.64 RT0, ra, rb; \
+	veor.64 rh, rh, RT5; \
+	vand.64 RT6, RT6, rc; \
+	veor.64 rh, rh, RT3; \
+	vorr.64 RT0, RT0, RT6; \
+	veor.64 rh, rh, RT4; \
+	vshr.u64 RT4, rw14, #19; \
+	vadd.u64 rh, rh, RT0; \
+	vshl.u64 RT2, rw14, #64 - 19; \
+	\
+	/* w[0] += S1 (w[14]) + w[9] + S0 (w[1]); */ \
+	vshr.u64 RT3, rw14, #61; \
+	vshl.u64 RT6, rw14, #64 - 61; \
+	veor.64 RT0, RT4, RT2; \
+	vshr.u64 RT2, rw14, 6; \
+	veor.64 RT0, RT0, RT3; \
+	vshr.u64 RT7, rw1, #1; \
+	veor.64 RT0, RT0, RT6; \
+	vshl.u64 RT4, rw1, #64 - 1; \
+	veor.64 RT0, RT0, RT2; \
+	vshr.u64 RT5, rw1, #8; \
+	vadd.u64 rw0, rw0, RT0; \
+	vshl.u64 RT6, rw1, #64 - 8; \
+	veor.64 RT7, RT7, RT4; \
+	vshr.u64 RT4, rw1, 7; \
+	veor.64 RT7, RT7, RT5; \
+	vadd.u64 rw0, rw0, rw9; /* w[0]+=w[9]; */\
+	veor.64 RT7, RT7, RT6; \
+	vadd.u64 rd, rd, RT1; /* d+=t1; */ \
+	veor.64 RT7, RT7, RT4; \
+	vadd.u64 rh, rh, RT1; /* h+=t1; */ \
+	vadd.u64 rw0, rw0, RT7; \
+
+#define round_64_79(ra, rb, rc, rd, re, rf, rg, rh, rw0) \
+	/* t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t]; */ \
+	vld1.64 {RT0}, [RK]!; \
+	vshr.u64 RT1, re, #14; \
+	vshl.u64 RT3, re, #64 - 14; \
+	vshr.u64 RT4, re, #18; \
+	vshl.u64 RT5, re, #64 - 18; \
+	veor.64 RT1, RT1, RT3; \
+	vshr.u64 RT7, ra, #28; \
+	veor.64 RT1, RT1, RT4; \
+	vshr.u64 RT3, re, #41; \
+	vshl.u64 RT4, re, #64 - 41; \
+	veor.64 RT1, RT1, RT5; \
+	vadd.u64 RT0, RT0, rw0; \
+	veor.64 RT1, RT1, RT3; \
+	vand.64 RT2, re, rf; \
+	veor.64 RT1, RT1, RT4; \
+	vbic.64 RT6, rg, re; \
+	\
+	vadd.u64 RT1, RT1, rh; \
+	veor.64 RT2, RT2, RT6; \
+	vadd.u64 RT1, RT1, RT0; \
+	vshr.u64 RT4, ra, #34; \
+	vshl.u64 RT5, ra, #64 - 34; \
+	\
+	/* t7 = Sum0 (a) + Maj (a, b, c); */ \
+	vshl.u64 RT6, ra, #64 - 28; \
+	veor.64 RT7, RT7, RT4; \
+	vshr.u64 RT3, ra, #39; \
+	veor.64 RT7, RT7, RT6; \
+	vshl.u64 RT4, ra, #64 - 39; \
+	vorr.64 RT6, ra, rb; \
+	vand.64 RT0, ra, rb; \
+	veor.64 RT7, RT7, RT5; \
+	vand.64 RT6, RT6, rc; \
+	veor.64 RT7, RT7, RT3; \
+	vorr.64 RT0, RT0, RT6; \
+	veor.64 RT7, RT7, RT4; \
+	vadd.u64 RT1, RT1, RT2; \
+	vadd.u64 RT7, RT7, RT0; \
+	vadd.u64 rd, rd, RT1; /* d+=t1; */ \
+	vadd.u64 rh, RT7, RT1; /* h=t7+t1; */
+
+.align 3
+.globl _gcry_sha512_transform_armv7_neon
+.type  _gcry_sha512_transform_armv7_neon,%function;
+
+_gcry_sha512_transform_armv7_neon:
+	/* Input:
+	 *	%r0: SHA512_CONTEXT
+	 *	%r1: data
+	 *	%r2: u64 k[] constants
+	 */
+	mov %r3, #0;
+
+	/* Load context to d0-d7 */
+	vld1.64 {RA-RD}, [%r0]!;
+	vld1.64 {RE-RH}, [%r0];
+	sub %r0, #(4*8);
+
+	/* Load input to w[16], d16-d31 */
+	/* NOTE: Assumes that on ARMv7 unaligned accesses are always allowed. */
+	vld1.64 {RW0-RW3}, [%r1]!;
+	vld1.64 {RW4-RW7}, [%r1]!;
+	vld1.64 {RW8-RW11}, [%r1]!;
+	vld1.64 {RW12-RW15}, [%r1];
+#ifdef __ARMEL__
+	/* byteswap */
+	vrev64.8 RW01q, RW01q;
+	vrev64.8 RW23q, RW23q;
+	vrev64.8 RW45q, RW45q;
+	vrev64.8 RW67q, RW67q;
+	vrev64.8 RW89q, RW89q;
+	vrev64.8 RW1011q, RW1011q;
+	vrev64.8 RW1213q, RW1213q;
+	vrev64.8 RW1415q, RW1415q;
+#endif
+
+	/* EABI says that d8-d15 must be preserved by callee. */
+	vpush {RT0-RT7};
+
+.Loop:
+	add %r3, #16;
+	round_0_63(RA, RB, RC, RD, RE, RF, RG, RH, RW0, RW14, RW9, RW1);
+	cmp %r3, #64;
+	round_0_63(RH, RA, RB, RC, RD, RE, RF, RG, RW1, RW15, RW10, RW2);
+	round_0_63(RG, RH, RA, RB, RC, RD, RE, RF, RW2, RW0, RW11, RW3);
+	round_0_63(RF, RG, RH, RA, RB, RC, RD, RE, RW3, RW1, RW12, RW4);
+	round_0_63(RE, RF, RG, RH, RA, RB, RC, RD, RW4, RW2, RW13, RW5);
+	round_0_63(RD, RE, RF, RG, RH, RA, RB, RC, RW5, RW3, RW14, RW6);
+	round_0_63(RC, RD, RE, RF, RG, RH, RA, RB, RW6, RW4, RW15, RW7);
+	round_0_63(RB, RC, RD, RE, RF, RG, RH, RA, RW7, RW5, RW0, RW8);
+	round_0_63(RA, RB, RC, RD, RE, RF, RG, RH, RW8, RW6, RW1, RW9);
+	round_0_63(RH, RA, RB, RC, RD, RE, RF, RG, RW9, RW7, RW2, RW10);
+	round_0_63(RG, RH, RA, RB, RC, RD, RE, RF, RW10, RW8, RW3, RW11);
+	round_0_63(RF, RG, RH, RA, RB, RC, RD, RE, RW11, RW9, RW4, RW12);
+	round_0_63(RE, RF, RG, RH, RA, RB, RC, RD, RW12, RW10, RW5, RW13);
+	round_0_63(RD, RE, RF, RG, RH, RA, RB, RC, RW13, RW11, RW6, RW14);
+	round_0_63(RC, RD, RE, RF, RG, RH, RA, RB, RW14, RW12, RW7, RW15);
+	round_0_63(RB, RC, RD, RE, RF, RG, RH, RA, RW15, RW13, RW8, RW0);
+	bne .Loop;
+
+	round_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW0);
+	round_64_79(RH, RA, RB, RC, RD, RE, RF, RG, RW1);
+	round_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW2);
+	round_64_79(RF, RG, RH, RA, RB, RC, RD, RE, RW3);
+	round_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW4);
+	round_64_79(RD, RE, RF, RG, RH, RA, RB, RC, RW5);
+	round_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW6);
+	round_64_79(RB, RC, RD, RE, RF, RG, RH, RA, RW7);
+	round_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW8);
+	round_64_79(RH, RA, RB, RC, RD, RE, RF, RG, RW9);
+	round_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW10);
+	round_64_79(RF, RG, RH, RA, RB, RC, RD, RE, RW11);
+	round_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW12);
+	round_64_79(RD, RE, RF, RG, RH, RA, RB, RC, RW13);
+	round_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW14);
+	round_64_79(RB, RC, RD, RE, RF, RG, RH, RA, RW15);
+
+	/* Load context to d16-d23 */
+	vld1.64 {RW0-RW3}, [%r0]!;
+	vld1.64 {RW4-RW7}, [%r0];
+	sub %r0, #(4*8);
+
+	vadd.u64 RA, RW0;
+	vadd.u64 RB, RW1;
+	vadd.u64 RC, RW2;
+	vadd.u64 RD, RW3;
+	vadd.u64 RE, RW4;
+	vadd.u64 RF, RW5;
+	vadd.u64 RG, RW6;
+	vadd.u64 RH, RW7;
+
+	/* Store the first half of context */
+	vst1.64 {RA-RD}, [%r0]!;
+
+	/* Clear used registers */
+	/* d16-d31 */
+	veor.u64 RW01q, RW01q;
+	veor.u64 RW23q, RW23q;
+	veor.u64 RW45q, RW45q;
+	veor.u64 RW67q, RW67q;
+	vst1.64 {RE-RH}, [%r0]; /* Store the last half of context */
+	veor.u64 RW89q, RW89q;
+	veor.u64 RW1011q, RW1011q;
+	veor.u64 RW1213q, RW1213q;
+	veor.u64 RW1415q, RW1415q;
+	/* d8-d15 */
+	vpop {RT0-RT7};
+	/* d0-d7 (q0-q3) */
+	veor.u64 %q0, %q0;
+	veor.u64 %q1, %q1;
+	veor.u64 %q2, %q2;
+	veor.u64 %q3, %q3;
+
+	bx %lr;
+.size _gcry_sha512_transform_armv7_neon,.-_gcry_sha512_transform_armv7_neon;
+
+#endif
diff --git a/cipher/sha512.c b/cipher/sha512.c
index 1bbcd11..fee3e71 100644
--- a/cipher/sha512.c
+++ b/cipher/sha512.c
@@ -53,12 +53,26 @@
 #include "cipher.h"
 #include "hash-common.h"
 
+
+/* USE_ARM_NEON_ASM indicates whether to enable ARM NEON assembly code. */
+#undef USE_ARM_NEON_ASM
+#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__)
+# if defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \
+     defined(HAVE_GCC_INLINE_ASM_NEON)
+#  define USE_ARM_NEON_ASM 1
+# endif
+#endif
+
+
 typedef struct
 {
   u64 h0, h1, h2, h3, h4, h5, h6, h7;
   u64 nblocks;
   byte buf[128];
   int count;
+#ifdef USE_ARM_NEON_ASM
+  int use_neon;
+#endif
 } SHA512_CONTEXT;
 
 static void
@@ -77,6 +91,9 @@ sha512_init (void *context)
 
   hd->nblocks = 0;
   hd->count = 0;
+#ifdef USE_ARM_NEON_ASM
+  hd->use_neon = (_gcry_get_hw_features () & HWF_ARM_NEON) != 0;
+#endif
 }
 
 static void
@@ -95,6 +112,9 @@ sha384_init (void *context)
 
   hd->nblocks = 0;
   hd->count = 0;
+#ifdef USE_ARM_NEON_ASM
+  hd->use_neon = (_gcry_get_hw_features () & HWF_ARM_NEON) != 0;
+#endif
 }
 
 
@@ -128,58 +148,59 @@ Sum1 (u64 x)
   return (ROTR (x, 14) ^ ROTR (x, 18) ^ ROTR (x, 41));
 }
 
+static const u64 k[] =
+  {
+    U64_C(0x428a2f98d728ae22), U64_C(0x7137449123ef65cd),
+    U64_C(0xb5c0fbcfec4d3b2f), U64_C(0xe9b5dba58189dbbc),
+    U64_C(0x3956c25bf348b538), U64_C(0x59f111f1b605d019),
+    U64_C(0x923f82a4af194f9b), U64_C(0xab1c5ed5da6d8118),
+    U64_C(0xd807aa98a3030242), U64_C(0x12835b0145706fbe),
+    U64_C(0x243185be4ee4b28c), U64_C(0x550c7dc3d5ffb4e2),
+    U64_C(0x72be5d74f27b896f), U64_C(0x80deb1fe3b1696b1),
+    U64_C(0x9bdc06a725c71235), U64_C(0xc19bf174cf692694),
+    U64_C(0xe49b69c19ef14ad2), U64_C(0xefbe4786384f25e3),
+    U64_C(0x0fc19dc68b8cd5b5), U64_C(0x240ca1cc77ac9c65),
+    U64_C(0x2de92c6f592b0275), U64_C(0x4a7484aa6ea6e483),
+    U64_C(0x5cb0a9dcbd41fbd4), U64_C(0x76f988da831153b5),
+    U64_C(0x983e5152ee66dfab), U64_C(0xa831c66d2db43210),
+    U64_C(0xb00327c898fb213f), U64_C(0xbf597fc7beef0ee4),
+    U64_C(0xc6e00bf33da88fc2), U64_C(0xd5a79147930aa725),
+    U64_C(0x06ca6351e003826f), U64_C(0x142929670a0e6e70),
+    U64_C(0x27b70a8546d22ffc), U64_C(0x2e1b21385c26c926),
+    U64_C(0x4d2c6dfc5ac42aed), U64_C(0x53380d139d95b3df),
+    U64_C(0x650a73548baf63de), U64_C(0x766a0abb3c77b2a8),
+    U64_C(0x81c2c92e47edaee6), U64_C(0x92722c851482353b),
+    U64_C(0xa2bfe8a14cf10364), U64_C(0xa81a664bbc423001),
+    U64_C(0xc24b8b70d0f89791), U64_C(0xc76c51a30654be30),
+    U64_C(0xd192e819d6ef5218), U64_C(0xd69906245565a910),
+    U64_C(0xf40e35855771202a), U64_C(0x106aa07032bbd1b8),
+    U64_C(0x19a4c116b8d2d0c8), U64_C(0x1e376c085141ab53),
+    U64_C(0x2748774cdf8eeb99), U64_C(0x34b0bcb5e19b48a8),
+    U64_C(0x391c0cb3c5c95a63), U64_C(0x4ed8aa4ae3418acb),
+    U64_C(0x5b9cca4f7763e373), U64_C(0x682e6ff3d6b2b8a3),
+    U64_C(0x748f82ee5defb2fc), U64_C(0x78a5636f43172f60),
+    U64_C(0x84c87814a1f0ab72), U64_C(0x8cc702081a6439ec),
+    U64_C(0x90befffa23631e28), U64_C(0xa4506cebde82bde9),
+    U64_C(0xbef9a3f7b2c67915), U64_C(0xc67178f2e372532b),
+    U64_C(0xca273eceea26619c), U64_C(0xd186b8c721c0c207),
+    U64_C(0xeada7dd6cde0eb1e), U64_C(0xf57d4f7fee6ed178),
+    U64_C(0x06f067aa72176fba), U64_C(0x0a637dc5a2c898a6),
+    U64_C(0x113f9804bef90dae), U64_C(0x1b710b35131c471b),
+    U64_C(0x28db77f523047d84), U64_C(0x32caab7b40c72493),
+    U64_C(0x3c9ebe0a15c9bebc), U64_C(0x431d67c49c100d4c),
+    U64_C(0x4cc5d4becb3e42b6), U64_C(0x597f299cfc657e2a),
+    U64_C(0x5fcb6fab3ad6faec), U64_C(0x6c44198c4a475817)
+  };
+
 /****************
  * Transform the message W which consists of 16 64-bit-words
  */
 static void
-transform (SHA512_CONTEXT *hd, const unsigned char *data)
+__transform (SHA512_CONTEXT *hd, const unsigned char *data)
 {
   u64 a, b, c, d, e, f, g, h;
   u64 w[16];
   int t;
-  static const u64 k[] =
-    {
-      U64_C(0x428a2f98d728ae22), U64_C(0x7137449123ef65cd),
-      U64_C(0xb5c0fbcfec4d3b2f), U64_C(0xe9b5dba58189dbbc),
-      U64_C(0x3956c25bf348b538), U64_C(0x59f111f1b605d019),
-      U64_C(0x923f82a4af194f9b), U64_C(0xab1c5ed5da6d8118),
-      U64_C(0xd807aa98a3030242), U64_C(0x12835b0145706fbe),
-      U64_C(0x243185be4ee4b28c), U64_C(0x550c7dc3d5ffb4e2),
-      U64_C(0x72be5d74f27b896f), U64_C(0x80deb1fe3b1696b1),
-      U64_C(0x9bdc06a725c71235), U64_C(0xc19bf174cf692694),
-      U64_C(0xe49b69c19ef14ad2), U64_C(0xefbe4786384f25e3),
-      U64_C(0x0fc19dc68b8cd5b5), U64_C(0x240ca1cc77ac9c65),
-      U64_C(0x2de92c6f592b0275), U64_C(0x4a7484aa6ea6e483),
-      U64_C(0x5cb0a9dcbd41fbd4), U64_C(0x76f988da831153b5),
-      U64_C(0x983e5152ee66dfab), U64_C(0xa831c66d2db43210),
-      U64_C(0xb00327c898fb213f), U64_C(0xbf597fc7beef0ee4),
-      U64_C(0xc6e00bf33da88fc2), U64_C(0xd5a79147930aa725),
-      U64_C(0x06ca6351e003826f), U64_C(0x142929670a0e6e70),
-      U64_C(0x27b70a8546d22ffc), U64_C(0x2e1b21385c26c926),
-      U64_C(0x4d2c6dfc5ac42aed), U64_C(0x53380d139d95b3df),
-      U64_C(0x650a73548baf63de), U64_C(0x766a0abb3c77b2a8),
-      U64_C(0x81c2c92e47edaee6), U64_C(0x92722c851482353b),
-      U64_C(0xa2bfe8a14cf10364), U64_C(0xa81a664bbc423001),
-      U64_C(0xc24b8b70d0f89791), U64_C(0xc76c51a30654be30),
-      U64_C(0xd192e819d6ef5218), U64_C(0xd69906245565a910),
-      U64_C(0xf40e35855771202a), U64_C(0x106aa07032bbd1b8),
-      U64_C(0x19a4c116b8d2d0c8), U64_C(0x1e376c085141ab53),
-      U64_C(0x2748774cdf8eeb99), U64_C(0x34b0bcb5e19b48a8),
-      U64_C(0x391c0cb3c5c95a63), U64_C(0x4ed8aa4ae3418acb),
-      U64_C(0x5b9cca4f7763e373), U64_C(0x682e6ff3d6b2b8a3),
-      U64_C(0x748f82ee5defb2fc), U64_C(0x78a5636f43172f60),
-      U64_C(0x84c87814a1f0ab72), U64_C(0x8cc702081a6439ec),
-      U64_C(0x90befffa23631e28), U64_C(0xa4506cebde82bde9),
-      U64_C(0xbef9a3f7b2c67915), U64_C(0xc67178f2e372532b),
-      U64_C(0xca273eceea26619c), U64_C(0xd186b8c721c0c207),
-      U64_C(0xeada7dd6cde0eb1e), U64_C(0xf57d4f7fee6ed178),
-      U64_C(0x06f067aa72176fba), U64_C(0x0a637dc5a2c898a6),
-      U64_C(0x113f9804bef90dae), U64_C(0x1b710b35131c471b),
-      U64_C(0x28db77f523047d84), U64_C(0x32caab7b40c72493),
-      U64_C(0x3c9ebe0a15c9bebc), U64_C(0x431d67c49c100d4c),
-      U64_C(0x4cc5d4becb3e42b6), U64_C(0x597f299cfc657e2a),
-      U64_C(0x5fcb6fab3ad6faec), U64_C(0x6c44198c4a475817)
-    };
 
   /* get values from the chaining vars */
   a = hd->h0;
@@ -455,6 +476,33 @@ transform (SHA512_CONTEXT *hd, const unsigned char *data)
 }
 
 
+#ifdef USE_ARM_NEON_ASM
+void _gcry_sha512_transform_armv7_neon (SHA512_CONTEXT *hd,
+					const unsigned char *data,
+					const u64 k[]);
+#endif
+
+
+static unsigned int
+transform (SHA512_CONTEXT *hd, const unsigned char *data)
+{
+#ifdef USE_ARM_NEON_ASM
+  if (hd->use_neon)
+    {
+      _gcry_sha512_transform_armv7_neon(hd, data, k);
+
+      /* return stack burn depth */
+      return (sizeof(void *) * 3);
+    }
+#endif
+
+  __transform (hd, data);
+
+  /* return stack burn depth */
+  return 256;
+}
+
+
 /* Update the message digest with the contents
  * of INBUF with length INLEN.
  */
@@ -463,11 +511,12 @@ sha512_write (void *context, const void *inbuf_arg, size_t inlen)
 {
   const unsigned char *inbuf = inbuf_arg;
   SHA512_CONTEXT *hd = context;
+  unsigned int stack_burn_depth = 0;
 
   if (hd->count == 128)
     {				/* flush the buffer */
-      transform (hd, hd->buf);
-      _gcry_burn_stack (256);
+      stack_burn_depth = transform (hd, hd->buf);
+      _gcry_burn_stack (stack_burn_depth);
       hd->count = 0;
       hd->nblocks++;
     }
@@ -484,13 +533,13 @@ sha512_write (void *context, const void *inbuf_arg, size_t inlen)
 
   while (inlen >= 128)
     {
-      transform (hd, inbuf);
+      stack_burn_depth = transform (hd, inbuf);
       hd->count = 0;
       hd->nblocks++;
       inlen -= 128;
       inbuf += 128;
     }
-  _gcry_burn_stack (256);
+  _gcry_burn_stack (stack_burn_depth);
   for (; inlen && hd->count < 128; inlen--)
     hd->buf[hd->count++] = *inbuf++;
 }
@@ -508,6 +557,7 @@ static void
 sha512_final (void *context)
 {
   SHA512_CONTEXT *hd = context;
+  unsigned int stack_burn_depth;
   u64 t, msb, lsb;
   byte *p;
 
@@ -559,8 +609,8 @@ sha512_final (void *context)
   hd->buf[125] = lsb >> 16;
   hd->buf[126] = lsb >> 8;
   hd->buf[127] = lsb;
-  transform (hd, hd->buf);
-  _gcry_burn_stack (256);
+  stack_burn_depth = transform (hd, hd->buf);
+  _gcry_burn_stack (stack_burn_depth);
 
   p = hd->buf;
 #ifdef WORDS_BIGENDIAN
diff --git a/configure.ac b/configure.ac
index 89b9366..959327a 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1526,6 +1526,11 @@ LIST_MEMBER(sha512, $enabled_digests)
 if test "$found" = "1" ; then
    GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha512.lo"
    AC_DEFINE(USE_SHA512, 1, [Defined if this module should be included])
+
+   if test x"$neonsupport" = xyes ; then
+     # Build with the NEON implementation
+     GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha512-armv7-neon.lo"
+   fi
 fi
 
 LIST_MEMBER(tiger, $enabled_digests)

commit 03da7f8ba3ec24d4639a2bcebbc0d9d831734c08
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date:   Sat Aug 31 12:48:30 2013 +0300

    sha512: reduce stack use in transform function by 512 bytes
    
    * cipher/sha512.c (transform): Change 'u64 w[80]' to 'u64 w[16]' and
    inline input expansion to first 64 rounds.
    (sha512_write, sha512_final): Reduce burn_stack depth by 512 bytes.
    --
    
    The input expansion into the w[] array can be inlined into the rounds and the
    size of the array reduced from u64[80] to u64[16]. On Cortex-A8, this change
    gives a small boost, possibly thanks to the reduced burn_stack depth.
    
    New vs old (tests/benchmark md sha512 sha384):
    SHA512	1.09x	1.11x	1.06x	1.09x	1.08x
    SHA384	1.09x	1.11x	1.06x	1.09x	1.09x
    
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>

diff --git a/cipher/sha512.c b/cipher/sha512.c
index 2163e60..1bbcd11 100644
--- a/cipher/sha512.c
+++ b/cipher/sha512.c
@@ -135,7 +135,7 @@ static void
 transform (SHA512_CONTEXT *hd, const unsigned char *data)
 {
   u64 a, b, c, d, e, f, g, h;
-  u64 w[80];
+  u64 w[16];
   int t;
   static const u64 k[] =
     {
@@ -215,11 +215,8 @@ transform (SHA512_CONTEXT *hd, const unsigned char *data)
 #define S0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
 #define S1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
 
-  for (t = 16; t < 80; t++)
-    w[t] = S1 (w[t - 2]) + w[t - 7] + S0 (w[t - 15]) + w[t - 16];
 
-
-  for (t = 0; t < 80; )
+  for (t = 0; t < 80 - 16; )
     {
       u64 t1, t2;
 
@@ -232,7 +229,125 @@ transform (SHA512_CONTEXT *hd, const unsigned char *data)
          Unrolled with inline:      330ms
       */
 #if 0 /* Not unrolled.  */
-      t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t];
+      t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t%16];
+      w[t%16] += S1 (w[(t - 2)%16]) + w[(t - 7)%16] + S0 (w[(t - 15)%16]);
+      t2 = Sum0 (a) + Maj (a, b, c);
+      h = g;
+      g = f;
+      f = e;
+      e = d + t1;
+      d = c;
+      c = b;
+      b = a;
+      a = t1 + t2;
+      t++;
+#else /* Unrolled to interweave the chain variables.  */
+      t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[0];
+      w[0] += S1 (w[14]) + w[9] + S0 (w[1]);
+      t2 = Sum0 (a) + Maj (a, b, c);
+      d += t1;
+      h = t1 + t2;
+
+      t1 = g + Sum1 (d) + Ch (d, e, f) + k[t+1] + w[1];
+      w[1] += S1 (w[15]) + w[10] + S0 (w[2]);
+      t2 = Sum0 (h) + Maj (h, a, b);
+      c += t1;
+      g  = t1 + t2;
+
+      t1 = f + Sum1 (c) + Ch (c, d, e) + k[t+2] + w[2];
+      w[2] += S1 (w[0]) + w[11] + S0 (w[3]);
+      t2 = Sum0 (g) + Maj (g, h, a);
+      b += t1;
+      f  = t1 + t2;
+
+      t1 = e + Sum1 (b) + Ch (b, c, d) + k[t+3] + w[3];
+      w[3] += S1 (w[1]) + w[12] + S0 (w[4]);
+      t2 = Sum0 (f) + Maj (f, g, h);
+      a += t1;
+      e  = t1 + t2;
+
+      t1 = d + Sum1 (a) + Ch (a, b, c) + k[t+4] + w[4];
+      w[4] += S1 (w[2]) + w[13] + S0 (w[5]);
+      t2 = Sum0 (e) + Maj (e, f, g);
+      h += t1;
+      d  = t1 + t2;
+
+      t1 = c + Sum1 (h) + Ch (h, a, b) + k[t+5] + w[5];
+      w[5] += S1 (w[3]) + w[14] + S0 (w[6]);
+      t2 = Sum0 (d) + Maj (d, e, f);
+      g += t1;
+      c  = t1 + t2;
+
+      t1 = b + Sum1 (g) + Ch (g, h, a) + k[t+6] + w[6];
+      w[6] += S1 (w[4]) + w[15] + S0 (w[7]);
+      t2 = Sum0 (c) + Maj (c, d, e);
+      f += t1;
+      b  = t1 + t2;
+
+      t1 = a + Sum1 (f) + Ch (f, g, h) + k[t+7] + w[7];
+      w[7] += S1 (w[5]) + w[0] + S0 (w[8]);
+      t2 = Sum0 (b) + Maj (b, c, d);
+      e += t1;
+      a  = t1 + t2;
+
+      t1 = h + Sum1 (e) + Ch (e, f, g) + k[t+8] + w[8];
+      w[8] += S1 (w[6]) + w[1] + S0 (w[9]);
+      t2 = Sum0 (a) + Maj (a, b, c);
+      d += t1;
+      h  = t1 + t2;
+
+      t1 = g + Sum1 (d) + Ch (d, e, f) + k[t+9] + w[9];
+      w[9] += S1 (w[7]) + w[2] + S0 (w[10]);
+      t2 = Sum0 (h) + Maj (h, a, b);
+      c += t1;
+      g  = t1 + t2;
+
+      t1 = f + Sum1 (c) + Ch (c, d, e) + k[t+10] + w[10];
+      w[10] += S1 (w[8]) + w[3] + S0 (w[11]);
+      t2 = Sum0 (g) + Maj (g, h, a);
+      b += t1;
+      f  = t1 + t2;
+
+      t1 = e + Sum1 (b) + Ch (b, c, d) + k[t+11] + w[11];
+      w[11] += S1 (w[9]) + w[4] + S0 (w[12]);
+      t2 = Sum0 (f) + Maj (f, g, h);
+      a += t1;
+      e  = t1 + t2;
+
+      t1 = d + Sum1 (a) + Ch (a, b, c) + k[t+12] + w[12];
+      w[12] += S1 (w[10]) + w[5] + S0 (w[13]);
+      t2 = Sum0 (e) + Maj (e, f, g);
+      h += t1;
+      d  = t1 + t2;
+
+      t1 = c + Sum1 (h) + Ch (h, a, b) + k[t+13] + w[13];
+      w[13] += S1 (w[11]) + w[6] + S0 (w[14]);
+      t2 = Sum0 (d) + Maj (d, e, f);
+      g += t1;
+      c  = t1 + t2;
+
+      t1 = b + Sum1 (g) + Ch (g, h, a) + k[t+14] + w[14];
+      w[14] += S1 (w[12]) + w[7] + S0 (w[15]);
+      t2 = Sum0 (c) + Maj (c, d, e);
+      f += t1;
+      b  = t1 + t2;
+
+      t1 = a + Sum1 (f) + Ch (f, g, h) + k[t+15] + w[15];
+      w[15] += S1 (w[13]) + w[8] + S0 (w[0]);
+      t2 = Sum0 (b) + Maj (b, c, d);
+      e += t1;
+      a  = t1 + t2;
+
+      t += 16;
+#endif
+    }
+
+  for (; t < 80; )
+    {
+      u64 t1, t2;
+
+#if 0 /* Not unrolled.  */
+      t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t%16];
       t2 = Sum0 (a) + Maj (a, b, c);
       h = g;
       g = f;
@@ -244,47 +359,87 @@ transform (SHA512_CONTEXT *hd, const unsigned char *data)
       a = t1 + t2;
       t++;
 #else /* Unrolled to interweave the chain variables.  */
-      t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t];
+      t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[0];
+      t2 = Sum0 (a) + Maj (a, b, c);
+      d += t1;
+      h  = t1 + t2;
+
+      t1 = g + Sum1 (d) + Ch (d, e, f) + k[t+1] + w[1];
+      t2 = Sum0 (h) + Maj (h, a, b);
+      c += t1;
+      g  = t1 + t2;
+
+      t1 = f + Sum1 (c) + Ch (c, d, e) + k[t+2] + w[2];
+      t2 = Sum0 (g) + Maj (g, h, a);
+      b += t1;
+      f  = t1 + t2;
+
+      t1 = e + Sum1 (b) + Ch (b, c, d) + k[t+3] + w[3];
+      t2 = Sum0 (f) + Maj (f, g, h);
+      a += t1;
+      e  = t1 + t2;
+
+      t1 = d + Sum1 (a) + Ch (a, b, c) + k[t+4] + w[4];
+      t2 = Sum0 (e) + Maj (e, f, g);
+      h += t1;
+      d  = t1 + t2;
+
+      t1 = c + Sum1 (h) + Ch (h, a, b) + k[t+5] + w[5];
+      t2 = Sum0 (d) + Maj (d, e, f);
+      g += t1;
+      c  = t1 + t2;
+
+      t1 = b + Sum1 (g) + Ch (g, h, a) + k[t+6] + w[6];
+      t2 = Sum0 (c) + Maj (c, d, e);
+      f += t1;
+      b  = t1 + t2;
+
+      t1 = a + Sum1 (f) + Ch (f, g, h) + k[t+7] + w[7];
+      t2 = Sum0 (b) + Maj (b, c, d);
+      e += t1;
+      a  = t1 + t2;
+
+      t1 = h + Sum1 (e) + Ch (e, f, g) + k[t+8] + w[8];
       t2 = Sum0 (a) + Maj (a, b, c);
       d += t1;
       h  = t1 + t2;
 
-      t1 = g + Sum1 (d) + Ch (d, e, f) + k[t+1] + w[t+1];
+      t1 = g + Sum1 (d) + Ch (d, e, f) + k[t+9] + w[9];
       t2 = Sum0 (h) + Maj (h, a, b);
       c += t1;
       g  = t1 + t2;
 
-      t1 = f + Sum1 (c) + Ch (c, d, e) + k[t+2] + w[t+2];
+      t1 = f + Sum1 (c) + Ch (c, d, e) + k[t+10] + w[10];
       t2 = Sum0 (g) + Maj (g, h, a);
       b += t1;
       f  = t1 + t2;
 
-      t1 = e + Sum1 (b) + Ch (b, c, d) + k[t+3] + w[t+3];
+      t1 = e + Sum1 (b) + Ch (b, c, d) + k[t+11] + w[11];
       t2 = Sum0 (f) + Maj (f, g, h);
       a += t1;
       e  = t1 + t2;
 
-      t1 = d + Sum1 (a) + Ch (a, b, c) + k[t+4] + w[t+4];
+      t1 = d + Sum1 (a) + Ch (a, b, c) + k[t+12] + w[12];
       t2 = Sum0 (e) + Maj (e, f, g);
       h += t1;
       d  = t1 + t2;
 
-      t1 = c + Sum1 (h) + Ch (h, a, b) + k[t+5] + w[t+5];
+      t1 = c + Sum1 (h) + Ch (h, a, b) + k[t+13] + w[13];
       t2 = Sum0 (d) + Maj (d, e, f);
       g += t1;
       c  = t1 + t2;
 
-      t1 = b + Sum1 (g) + Ch (g, h, a) + k[t+6] + w[t+6];
+      t1 = b + Sum1 (g) + Ch (g, h, a) + k[t+14] + w[14];
       t2 = Sum0 (c) + Maj (c, d, e);
       f += t1;
       b  = t1 + t2;
 
-      t1 = a + Sum1 (f) + Ch (f, g, h) + k[t+7] + w[t+7];
+      t1 = a + Sum1 (f) + Ch (f, g, h) + k[t+15] + w[15];
       t2 = Sum0 (b) + Maj (b, c, d);
       e += t1;
       a  = t1 + t2;
 
-      t += 8;
+      t += 16;
 #endif
     }
 
@@ -312,7 +467,7 @@ sha512_write (void *context, const void *inbuf_arg, size_t inlen)
   if (hd->count == 128)
     {				/* flush the buffer */
       transform (hd, hd->buf);
-      _gcry_burn_stack (768);
+      _gcry_burn_stack (256);
       hd->count = 0;
       hd->nblocks++;
     }
@@ -335,7 +490,7 @@ sha512_write (void *context, const void *inbuf_arg, size_t inlen)
       inlen -= 128;
       inbuf += 128;
     }
-  _gcry_burn_stack (768);
+  _gcry_burn_stack (256);
   for (; inlen && hd->count < 128; inlen--)
     hd->buf[hd->count++] = *inbuf++;
 }
@@ -405,7 +560,7 @@ sha512_final (void *context)
   hd->buf[126] = lsb >> 8;
   hd->buf[127] = lsb;
   transform (hd, hd->buf);
-  _gcry_burn_stack (768);
+  _gcry_burn_stack (256);
 
   p = hd->buf;
 #ifdef WORDS_BIGENDIAN

commit 9c95be105f518d18407115c2c06893857c24b116
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date:   Sat Aug 31 12:48:30 2013 +0300

    Add ARM HW feature detection module and add NEON detection
    
    * configure.ac: Add option --disable-neon-support.
    (HAVE_GCC_INLINE_ASM_NEON): New.
    (ENABLE_NEON_SUPPORT): New.
    [arm]: Add 'hwf-arm.lo' as HW feature module.
    * src/Makefile.am: Add 'hwf-arm.c'.
    * src/g10lib.h (HWF_ARM_NEON): New macro.
    * src/global.c (hwflist): Add HWF_ARM_NEON entry.
    * src/hwf-arm.c: New file.
    * src/hwf-common.h (_gcry_hwf_detect_arm): New prototype.
    * src/hwfeatures.c (_gcry_detect_hw_features) [HAVE_CPU_ARCH_ARM]: Add
    call to _gcry_hwf_detect_arm.
    --
    
    Add a HW detection module for detecting the ARM NEON instruction set. ARM does
    not have a cpuid instruction, so we have to rely on the OS to pass feature set
    information to user-space. On Linux, NEON support can be detected by parsing
    '/proc/self/auxv' for hardware capabilities information. On other OSes, NEON
    can be detected by checking whether the platform/compiler only supports
    NEON-capable CPUs (by checking if the __ARM_NEON__ macro is defined).
    
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>

diff --git a/configure.ac b/configure.ac
index b54b4d6..89b9366 100644
--- a/configure.ac
+++ b/configure.ac
@@ -595,6 +595,14 @@ AC_ARG_ENABLE(avx2-support,
 	      avx2support=$enableval,avx2support=yes)
 AC_MSG_RESULT($avx2support)
 
+# Implementation of the --disable-neon-support switch.
+AC_MSG_CHECKING([whether NEON support is requested])
+AC_ARG_ENABLE(neon-support,
+              AC_HELP_STRING([--disable-neon-support],
+                 [Disable support for the ARM NEON instructions]),
+	      neonsupport=$enableval,neonsupport=yes)
+AC_MSG_RESULT($neonsupport)
+
 # Implementation of the --disable-O-flag-munging switch.
 AC_MSG_CHECKING([whether a -O flag munging is requested])
 AC_ARG_ENABLE([O-flag-munging],
@@ -988,6 +996,30 @@ fi
 
 
 #
+# Check whether GCC inline assembler supports NEON instructions
+#
+AC_CACHE_CHECK([whether GCC inline assembler supports NEON instructions],
+       [gcry_cv_gcc_inline_asm_neon],
+       [gcry_cv_gcc_inline_asm_neon=no
+        AC_COMPILE_IFELSE([AC_LANG_SOURCE(
+          [[__asm__(
+                ".syntax unified\n\t"
+                ".thumb\n\t"
+                ".fpu neon\n\t"
+                "vld1.64 {%q0-%q1}, [%r0]!;\n\t"
+                "vrev64.8 %q0, %q3;\n\t"
+                "vadd.u64 %q0, %q1;\n\t"
+                "vadd.s64 %d3, %d2, %d3;\n\t"
+                );
+            ]])],
+          [gcry_cv_gcc_inline_asm_neon=yes])])
+if test "$gcry_cv_gcc_inline_asm_neon" = "yes" ; then
+   AC_DEFINE(HAVE_GCC_INLINE_ASM_NEON,1,
+     [Defined if inline assembler supports NEON instructions])
+fi
+
+
+#
 # Check whether GCC assembler supports features needed for our ARM
 # implementations
 #
@@ -1269,6 +1301,11 @@ if test x"$avx2support" = xyes ; then
     avx2support="no (unsupported by compiler)"
   fi
 fi
+if test x"$neonsupport" = xyes ; then
+  if test "$gcry_cv_gcc_inline_asm_neon" != "yes" ; then
+    neonsupport="no (unsupported by compiler)"
+  fi
+fi
 
 if test x"$aesnisupport" = xyes ; then
   AC_DEFINE(ENABLE_AESNI_SUPPORT, 1,
@@ -1282,6 +1319,10 @@ if test x"$avx2support" = xyes ; then
   AC_DEFINE(ENABLE_AVX2_SUPPORT,1,
             [Enable support for Intel AVX2 instructions.])
 fi
+if test x"$neonsupport" = xyes ; then
+  AC_DEFINE(ENABLE_NEON_SUPPORT,1,
+            [Enable support for ARM NEON instructions.])
+fi
 
 
 # Define conditional sources and config.h symbols depending on the
@@ -1597,6 +1638,7 @@ case "$mpi_cpu_arch" in
         ;;
      arm)
         AC_DEFINE(HAVE_CPU_ARCH_ARM, 1,   [Defined for ARM platforms])
+        GCRYPT_HWF_MODULES="hwf-arm.lo"
         ;;
 esac
 AC_SUBST([GCRYPT_HWF_MODULES])
@@ -1660,6 +1702,7 @@ GCRY_MSG_SHOW([Try using AES-NI crypto:  ],[$aesnisupport])
 GCRY_MSG_SHOW([Try using DRNG (RDRAND):  ],[$drngsupport])
 GCRY_MSG_SHOW([Try using Intel AVX:      ],[$avxsupport])
 GCRY_MSG_SHOW([Try using Intel AVX2:     ],[$avx2support])
+GCRY_MSG_SHOW([Try using ARM NEON:       ],[$neonsupport])
 GCRY_MSG_SHOW([],[])
 
 if test "$print_egd_notice" = "yes"; then
diff --git a/src/Makefile.am b/src/Makefile.am
index 8eb46e6..d4329c9 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -64,7 +64,7 @@ libgcrypt_la_SOURCES = \
 	ec-context.h \
 	ath.h ath.c
 
-EXTRA_libgcrypt_la_SOURCES = hwf-x86.c
+EXTRA_libgcrypt_la_SOURCES = hwf-x86.c hwf-arm.c
 gcrypt_hwf_modules = @GCRYPT_HWF_MODULES@
 
 
diff --git a/src/g10lib.h b/src/g10lib.h
index 198ab38..31131a5 100644
--- a/src/g10lib.h
+++ b/src/g10lib.h
@@ -155,6 +155,8 @@ int _gcry_log_verbosity( int level );
 #define HWF_INTEL_AVX    1024
 #define HWF_INTEL_AVX2   2048
 
+#define HWF_ARM_NEON     4096
+
 
 unsigned int _gcry_get_hw_features (void);
 void _gcry_detect_hw_features (unsigned int);
diff --git a/src/global.c b/src/global.c
index 9c80573..44667cf 100644
--- a/src/global.c
+++ b/src/global.c
@@ -70,6 +70,7 @@ static struct
     { HWF_INTEL_RDRAND,"intel-rdrand" },
     { HWF_INTEL_AVX,   "intel-avx" },
     { HWF_INTEL_AVX2,  "intel-avx2" },
+    { HWF_ARM_NEON,    "arm-neon" },
     { 0, NULL}
   };
 
diff --git a/src/hwf-arm.c b/src/hwf-arm.c
new file mode 100644
index 0000000..9ab4cd0
--- /dev/null
+++ b/src/hwf-arm.c
@@ -0,0 +1,113 @@
+/* hwf-arm.c - Detect hardware features - ARM part
+ * Copyright © 2013  Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdarg.h>
+#include <unistd.h>
+
+#include "g10lib.h"
+#include "hwf-common.h"
+
+#if !defined (__arm__)
+# error Module build for wrong CPU.
+#endif
+
+#undef HAS_SYS_AT_HWCAP
+#ifdef __linux__
+
+#define HAS_SYS_AT_HWCAP 1
+
+#define AT_HWCAP 16
+#define HWCAP_NEON 4096
+
+static int get_hwcap(unsigned int *hwcap)
+{
+  struct { unsigned int a_type; unsigned int a_val; } auxv;
+  FILE *f;
+  int err = -1;
+  static int hwcap_initialized = 0;
+  static unsigned int stored_hwcap;
+
+  if (hwcap_initialized)
+    {
+      *hwcap = stored_hwcap;
+      return 0;
+    }
+
+  f = fopen("/proc/self/auxv", "r");
+  if (!f)
+    {
+      *hwcap = stored_hwcap;
+      return -1;
+    }
+
+  while (fread(&auxv, sizeof(auxv), 1, f) > 0)
+    {
+      if (auxv.a_type != AT_HWCAP)
+        continue;
+
+      stored_hwcap = auxv.a_val;
+      hwcap_initialized = 1;
+      err = 0;
+      break;
+    }
+
+  fclose(f);
+  *hwcap = stored_hwcap;
+  return err;
+}
+
+static unsigned int detect_arm_at_hwcap(void)
+{
+  unsigned int hwcap;
+  unsigned int features = 0;
+
+  if (get_hwcap(&hwcap) < 0)
+    return features;
+
+#ifdef ENABLE_NEON_SUPPORT
+  if (hwcap & HWCAP_NEON)
+    features |= HWF_ARM_NEON;
+#endif
+
+  return features;
+}
+
+#endif /* __linux__ */
+
+unsigned int
+_gcry_hwf_detect_arm (void)
+{
+  unsigned int ret = 0;
+
+#if defined (HAS_SYS_AT_HWCAP)
+  ret |= detect_arm_at_hwcap ();
+#else
+  ret |= 0;
+#endif
+
+#if defined(__ARM_NEON__) && defined(ENABLE_NEON_SUPPORT)
+  ret |= HWF_ARM_NEON;
+#endif
+
+  return ret;
+}
diff --git a/src/hwf-common.h b/src/hwf-common.h
index 974f47d..8f156b5 100644
--- a/src/hwf-common.h
+++ b/src/hwf-common.h
@@ -21,6 +21,7 @@
 #define HWF_COMMON_H
 
 unsigned int _gcry_hwf_detect_x86 (void);
+unsigned int _gcry_hwf_detect_arm (void);
 
 
 #endif /*HWF_COMMON_H*/
diff --git a/src/hwfeatures.c b/src/hwfeatures.c
index 87d05d8..1e3c27d 100644
--- a/src/hwfeatures.c
+++ b/src/hwfeatures.c
@@ -59,6 +59,11 @@ _gcry_detect_hw_features (unsigned int disabled_features)
     hw_features = _gcry_hwf_detect_x86 ();
   }
 #endif /* HAVE_CPU_ARCH_X86 */
+#if defined (HAVE_CPU_ARCH_ARM)
+  {
+    hw_features = _gcry_hwf_detect_arm ();
+  }
+#endif /* HAVE_CPU_ARCH_ARM */
 
   hw_features &= ~disabled_features;
 }

commit 7b0ebe69fe35f2ee13e1e1beb2766a1eaadb7f0c
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date:   Sat Aug 31 12:48:30 2013 +0300

    Correct mpi_cpu_arch for ARMv6
    
    * mpi/config.links [armv6]: Set mpi_cpu_arch to "arm", instead of
    "armv6".
    --
    
    Without this change, HAVE_CPU_ARCH_ARM stays undefined.
    
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>

diff --git a/mpi/config.links b/mpi/config.links
index f300255..9fb4f10 100644
--- a/mpi/config.links
+++ b/mpi/config.links
@@ -141,7 +141,7 @@ case "${host}" in
 	  if test "$gcry_cv_cc_arm_arch_is_v6" = "yes" ; then
 	    echo '/* configured for armv6 */' >>./mpi/asm-syntax.h
 	    path="armv6"
-	    mpi_cpu_arch="armv6"
+	    mpi_cpu_arch="arm"
 	  else
 	    echo '/* No assembler modules configured */' >>./mpi/asm-syntax.h
 	    path=""

-----------------------------------------------------------------------

Summary of changes:
 cipher/Makefile.am         |    2 +-
 cipher/sha512-armv7-neon.S |  316 +++++++++++++++++++++++++++++++++++++++++
 cipher/sha512.c            |  335 +++++++++++++++++++++++++++++++++++---------
 configure.ac               |   48 +++++++
 mpi/config.links           |    2 +-
 src/Makefile.am            |    2 +-
 src/g10lib.h               |    2 +
 src/global.c               |    1 +
 src/hwf-arm.c              |  113 +++++++++++++++
 src/hwf-common.h           |    1 +
 src/hwfeatures.c           |    5 +
 11 files changed, 759 insertions(+), 68 deletions(-)
 create mode 100644 cipher/sha512-armv7-neon.S
 create mode 100644 src/hwf-arm.c


hooks/post-receive
-- 
The GNU crypto library
http://git.gnupg.org