[git] GCRYPT - branch, master, updated. libgcrypt-1.6.0-280-g89fa74d

by Jussi Kivilinna cvs at cvs.gnupg.org
Thu Nov 5 18:11:43 CET 2015


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "The GNU crypto library".

The branch, master has been updated
       via  89fa74d6b3e58cd4fcd6e0939a35e46cbaca2ea0 (commit)
       via  a1cc7bb15473a2419b24ecac765ae0ce5989a13b (commit)
       via  2857cb89c6dc1c02266600bc1fd2967a3cd5cf88 (commit)
       via  07e4839e75a7bca3a6c0a94aecfe75efe61d7ff2 (commit)
      from  c0b9eee2d93a13930244f9ce0c14ed6b4aeb6c29 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
commit 89fa74d6b3e58cd4fcd6e0939a35e46cbaca2ea0
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date:   Sun Nov 1 20:44:09 2015 +0200

    Improve performance of Tiger hash algorithms
    
    * cipher/tiger.c (tiger_round, pass, key_schedule): Convert functions
    to macros.
    (transform_blk): Pass variable names instead of pointers to 'pass'.
    --
    
    Benchmark results on Intel Haswell @ 3.2 Ghz:
    
    Before:
                    |  nanosecs/byte   mebibytes/sec   cycles/byte
     TIGER          |      3.25 ns/B     293.5 MiB/s     10.40 c/B
    
    After (1.75x faster):
                    |  nanosecs/byte   mebibytes/sec   cycles/byte
     TIGER          |      1.85 ns/B     515.3 MiB/s      5.92 c/B
    
    Benchmark results on Cortex-A8 @ 1008 Mhz:
    
    Before:
                    |  nanosecs/byte   mebibytes/sec   cycles/byte
     TIGER          |     63.42 ns/B     15.04 MiB/s     63.93 c/B
    
    After (1.26x faster):
                    |  nanosecs/byte   mebibytes/sec   cycles/byte
     TIGER          |     49.99 ns/B     19.08 MiB/s     50.39 c/B
    
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>

diff --git a/cipher/tiger.c b/cipher/tiger.c
index 078133a..516bd44 100644
--- a/cipher/tiger.c
+++ b/cipher/tiger.c
@@ -633,68 +633,44 @@ tiger2_init (void *context, unsigned int flags)
   do_init (context, 2);
 }
 
-static void
-tiger_round( u64 *ra, u64 *rb, u64 *rc, u64 x, int mul )
-{
-  u64 a = *ra;
-  u64 b = *rb;
-  u64 c = *rc;
-
-  c ^= x;
-  a -= (  sbox1[  c        & 0xff ] ^ sbox2[ (c >> 16) & 0xff ]
-        ^ sbox3[ (c >> 32) & 0xff ] ^ sbox4[ (c >> 48) & 0xff ]);
-  b += (  sbox4[ (c >>  8) & 0xff ] ^ sbox3[ (c >> 24) & 0xff ]
-        ^ sbox2[ (c >> 40) & 0xff ] ^ sbox1[ (c >> 56) & 0xff ]);
-  b *= mul;
-
-  *ra = a;
-  *rb = b;
-  *rc = c;
-}
-
-
-static void
-pass( u64 *ra, u64 *rb, u64 *rc, u64 *x, int mul )
-{
-  u64 a = *ra;
-  u64 b = *rb;
-  u64 c = *rc;
-
-  tiger_round( &a, &b, &c, x[0], mul );
-  tiger_round( &b, &c, &a, x[1], mul );
-  tiger_round( &c, &a, &b, x[2], mul );
-  tiger_round( &a, &b, &c, x[3], mul );
-  tiger_round( &b, &c, &a, x[4], mul );
-  tiger_round( &c, &a, &b, x[5], mul );
-  tiger_round( &a, &b, &c, x[6], mul );
-  tiger_round( &b, &c, &a, x[7], mul );
-
-  *ra = a;
-  *rb = b;
-  *rc = c;
-}
-
 
-static void
-key_schedule( u64 *x )
-{
-  x[0] -= x[7] ^ 0xa5a5a5a5a5a5a5a5LL;
-  x[1] ^= x[0];
-  x[2] += x[1];
-  x[3] -= x[2] ^ ((~x[1]) << 19 );
-  x[4] ^= x[3];
-  x[5] += x[4];
-  x[6] -= x[5] ^ ((~x[4]) >> 23 );
-  x[7] ^= x[6];
-  x[0] += x[7];
-  x[1] -= x[0] ^ ((~x[7]) << 19 );
-  x[2] ^= x[1];
-  x[3] += x[2];
-  x[4] -= x[3] ^ ((~x[2]) >> 23 );
-  x[5] ^= x[4];
-  x[6] += x[5];
-  x[7] -= x[6] ^ 0x0123456789abcdefLL;
-}
+#define tiger_round(xa, xb, xc, xx, xmul) { \
+  xc ^= xx; \
+  xa -= (  sbox1[  (xc)        & 0xff ] ^ sbox2[ ((xc) >> 16) & 0xff ] \
+         ^ sbox3[ ((xc) >> 32) & 0xff ] ^ sbox4[ ((xc) >> 48) & 0xff ]); \
+  xb += (  sbox4[ ((xc) >>  8) & 0xff ] ^ sbox3[ ((xc) >> 24) & 0xff ] \
+         ^ sbox2[ ((xc) >> 40) & 0xff ] ^ sbox1[ ((xc) >> 56) & 0xff ]); \
+  xb *= xmul; }
+
+
+#define pass(ya, yb, yc, yx, ymul) { \
+  tiger_round( ya, yb, yc, yx[0], ymul ); \
+  tiger_round( yb, yc, ya, yx[1], ymul ); \
+  tiger_round( yc, ya, yb, yx[2], ymul ); \
+  tiger_round( ya, yb, yc, yx[3], ymul ); \
+  tiger_round( yb, yc, ya, yx[4], ymul ); \
+  tiger_round( yc, ya, yb, yx[5], ymul ); \
+  tiger_round( ya, yb, yc, yx[6], ymul ); \
+  tiger_round( yb, yc, ya, yx[7], ymul ); }
+
+
+#define key_schedule(x) { \
+  x[0] -= x[7] ^ 0xa5a5a5a5a5a5a5a5LL; \
+  x[1] ^= x[0]; \
+  x[2] += x[1]; \
+  x[3] -= x[2] ^ ((~x[1]) << 19 ); \
+  x[4] ^= x[3]; \
+  x[5] += x[4]; \
+  x[6] -= x[5] ^ ((~x[4]) >> 23 ); \
+  x[7] ^= x[6]; \
+  x[0] += x[7]; \
+  x[1] -= x[0] ^ ((~x[7]) << 19 ); \
+  x[2] ^= x[1]; \
+  x[3] += x[2]; \
+  x[4] -= x[3] ^ ((~x[2]) >> 23 ); \
+  x[5] ^= x[4]; \
+  x[6] += x[5]; \
+  x[7] -= x[6] ^ 0x0123456789abcdefLL; }
 
 
 /****************
@@ -716,11 +692,11 @@ transform_blk ( void *ctx, const unsigned char *data )
   b = bb = hd->b;
   c = cc = hd->c;
 
-  pass( &a, &b, &c, x, 5);
+  pass( a, b, c, x, 5);
   key_schedule( x );
-  pass( &c, &a, &b, x, 7);
+  pass( c, a, b, x, 7);
   key_schedule( x );
-  pass( &b, &c, &a, x, 9);
+  pass( b, c, a, x, 9);
 
   /* feedforward */
   a ^= aa;

commit a1cc7bb15473a2419b24ecac765ae0ce5989a13b
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date:   Sun Nov 1 16:06:26 2015 +0200

    Add ARMv7/NEON implementation of Keccak
    
    * cipher/Makefile.am: Add 'keccak-armv7-neon.S'.
    * cipher/keccak-armv7-neon.S: New.
    * cipher/keccak.c (USE_64BIT_ARM_NEON): New.
    (NEED_COMMON64): Select if USE_64BIT_ARM_NEON.
    [NEED_COMMON64] (round_consts_64bit): Rename to...
    [NEED_COMMON64] (_gcry_keccak_round_consts_64bit): ...this; Add
    terminator at end.
    [USE_64BIT_ARM_NEON] (_gcry_keccak_permute_armv7_neon)
    (_gcry_keccak_absorb_lanes64_armv7_neon, keccak_permute64_armv7_neon)
    (keccak_absorb_lanes64_armv7_neon, keccak_armv7_neon_64_ops): New.
    (keccak_init) [USE_64BIT_ARM_NEON]: Select ARM/NEON implementation
    if supported by HW.
    * cipher/keccak_permute_64.h (KECCAK_F1600_PERMUTE_FUNC_NAME): Update
    to use new round constant table.
    * configure.ac: Add 'keccak-armv7-neon.lo'.
    --
    
    Patch adds ARMv7/NEON implementation of Keccak (SHAKE/SHA3). Patch
    is based on public-domain implementation by Ronny Van Keer from
    SUPERCOP package:
     https://github.com/floodyberry/supercop/blob/master/crypto_hash/\
    keccakc1024/inplace-armv7a-neon/keccak2.s
    
    Benchmark results on Cortex-A8 @ 1008 Mhz:
    
    Before (generic 32-bit bit-interleaved impl.):
                    |  nanosecs/byte   mebibytes/sec   cycles/byte
     SHAKE128       |     83.00 ns/B     11.49 MiB/s     83.67 c/B
     SHAKE256       |     101.7 ns/B      9.38 MiB/s     102.5 c/B
     SHA3-224       |     96.13 ns/B      9.92 MiB/s     96.90 c/B
     SHA3-256       |     101.5 ns/B      9.40 MiB/s     102.3 c/B
     SHA3-384       |     131.4 ns/B      7.26 MiB/s     132.5 c/B
     SHA3-512       |     189.1 ns/B      5.04 MiB/s     190.6 c/B
    
    After (ARM/NEON, ~3.2x faster):
                    |  nanosecs/byte   mebibytes/sec   cycles/byte
     SHAKE128       |     25.09 ns/B     38.01 MiB/s     25.29 c/B
     SHAKE256       |     30.95 ns/B     30.82 MiB/s     31.19 c/B
     SHA3-224       |     29.24 ns/B     32.61 MiB/s     29.48 c/B
     SHA3-256       |     30.95 ns/B     30.82 MiB/s     31.19 c/B
     SHA3-384       |     40.42 ns/B     23.59 MiB/s     40.74 c/B
     SHA3-512       |     58.37 ns/B     16.34 MiB/s     58.84 c/B
    
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index be03d06..88c8fbf 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -90,7 +90,7 @@ sha1.c sha1-ssse3-amd64.S sha1-avx-amd64.S sha1-avx-bmi2-amd64.S \
 sha256.c sha256-ssse3-amd64.S sha256-avx-amd64.S sha256-avx2-bmi2-amd64.S \
 sha512.c sha512-ssse3-amd64.S sha512-avx-amd64.S sha512-avx2-bmi2-amd64.S \
   sha512-armv7-neon.S \
-keccak.c keccak_permute_32.h keccak_permute_64.h \
+keccak.c keccak_permute_32.h keccak_permute_64.h keccak-armv7-neon.S \
 stribog.c \
 tiger.c \
 whirlpool.c whirlpool-sse2-amd64.S \
diff --git a/cipher/keccak-armv7-neon.S b/cipher/keccak-armv7-neon.S
new file mode 100644
index 0000000..0bec8d5
--- /dev/null
+++ b/cipher/keccak-armv7-neon.S
@@ -0,0 +1,945 @@
+/* keccak-armv7-neon.S  -  ARMv7/NEON implementation of Keccak
+ *
+ * Copyright (C) 2015 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) && \
+    defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \
+    defined(HAVE_GCC_INLINE_ASM_NEON)
+
+/* Based on public-domain/CC0 implementation from SUPERCOP package
+ * (keccakc1024/inplace-armv7a-neon/keccak2.s)
+ *
+ * Original copyright header follows:
+ */
+
+@ The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
+@ Michaël Peeters and Gilles Van Assche. For more information, feedback or
+@ questions, please refer to our website: http://keccak.noekeon.org/
+@
+@ Implementation by Ronny Van Keer, hereby denoted as "the implementer".
+@
+@ To the extent possible under law, the implementer has waived all copyright
+@ and related or neighboring rights to the source code in this file.
+@ http://creativecommons.org/publicdomain/zero/1.0/
+
+.text
+
+.syntax unified
+.fpu neon
+.arm
+
+
+.extern _gcry_keccak_round_consts_64bit;
+
+#ifdef __PIC__
+#  define GET_DATA_POINTER(reg, name, rtmp) \
+		ldr reg, 1f; \
+		ldr rtmp, 2f; \
+		b 3f; \
+	1:	.word _GLOBAL_OFFSET_TABLE_-(3f+8); \
+	2:	.word name(GOT); \
+	3:	add reg, pc, reg; \
+		ldr reg, [reg, rtmp];
+#else
+#  define GET_DATA_POINTER(reg, name, rtmp) ldr reg, =name
+#endif
+
+
+@//  --- offsets in state
+.equ Aba, 0*8
+.equ Aga, 1*8
+.equ Aka, 2*8
+.equ Ama, 3*8
+.equ Asa, 4*8
+
+@// --- macros
+
+.macro    KeccakThetaRhoPiChiIota argA1, argA2, argA3, argA4, argA5
+
+    @Prepare Theta
+    @Ca = Aba^Aga^Aka^Ama^Asa@
+    @Ce = Abe^Age^Ake^Ame^Ase@
+    @Ci = Abi^Agi^Aki^Ami^Asi@
+    @Co = Abo^Ago^Ako^Amo^Aso@
+    @Cu = Abu^Agu^Aku^Amu^Asu@
+    @De = Ca^ROL64(Ci, 1)@
+    @Di = Ce^ROL64(Co, 1)@
+    @Do = Ci^ROL64(Cu, 1)@
+    @Du = Co^ROL64(Ca, 1)@
+    @Da = Cu^ROL64(Ce, 1)@
+
+    veor.64 q4, q6, q7
+    veor.64 q5, q9, q10
+    veor.64 d8,  d8,   d9
+    veor.64 d10,  d10,   d11
+    veor.64 d1,  d8,   d16
+    veor.64 d2,  d10,   d17
+
+    veor.64 q4, q11, q12
+    veor.64 q5, q14, q15
+    veor.64 d8,  d8,   d9
+    veor.64 d10,  d10,   d11
+    veor.64 d3,  d8,   d26
+
+    vadd.u64 q4, q1, q1
+    veor.64 d4,  d10,   d27
+    vmov.64  d0, d5
+    vsri.64 q4, q1, #63
+
+    vadd.u64 q5, q2, q2
+    veor.64 q4, q4, q0
+    vsri.64 q5, q2, #63
+    vadd.u64 d7, d1, d1
+    veor.64 \argA2, \argA2, d8
+    veor.64 q5, q5, q1
+
+    vsri.64 d7, d1, #63
+    vshl.u64 d1, \argA2, #44
+    veor.64 \argA3, \argA3, d9
+    veor.64 d7, d7, d4
+
+    @Ba = argA1^Da@
+    @Be = ROL64((argA2^De), 44)@
+    @Bi = ROL64((argA3^Di), 43)@
+    @Bo = ROL64((argA4^Do), 21)@
+    @Bu = ROL64((argA5^Du), 14)@
+    @argA2 =   Be ^((~Bi)& Bo )@
+    @argA3 =   Bi ^((~Bo)& Bu )@
+    @argA4 =   Bo ^((~Bu)& Ba )@
+    @argA5 =   Bu ^((~Ba)& Be )@
+    @argA1 =   Ba ^((~Be)& Bi )@ argA1 ^= KeccakF1600RoundConstants[i+round]@
+    vsri.64 d1, \argA2, #64-44
+    vshl.u64 d2, \argA3, #43
+    vldr.64 d0, [sp, #\argA1]
+    veor.64 \argA4, \argA4, d10
+    vsri.64 d2, \argA3, #64-43
+    vshl.u64 d3, \argA4, #21
+    veor.64 \argA5, \argA5, d11
+    veor.64 d0, d0, d7
+    vsri.64 d3, \argA4, #64-21
+    vbic.64   d5, d2, d1
+    vshl.u64 d4, \argA5, #14
+    vbic.64   \argA2, d3, d2
+    vld1.64   d6, [ip]!
+    veor.64   d5, d0
+    vsri.64 d4, \argA5, #64-14
+    veor.64   d5, d6
+    vbic.64   \argA5, d1, d0
+    vbic.64   \argA3, d4, d3
+    vbic.64   \argA4, d0, d4
+    veor.64   \argA2, d1
+    vstr.64   d5, [sp, #\argA1]
+    veor.64   \argA3, d2
+    veor.64   \argA4, d3
+    veor.64   \argA5, d4
+
+    .endm
+
+.macro    KeccakThetaRhoPiChi1   argA1, argA2, argA3, argA4, argA5
+
+    @d2 = ROL64((argA1^Da), 3)@
+    @d3 = ROL64((argA2^De), 45)@
+    @d4 = ROL64((argA3^Di), 61)@
+    @d0 = ROL64((argA4^Do), 28)@
+    @d1 = ROL64((argA5^Du), 20)@
+    @argA1 =   Ba ^((~Be)&  Bi )@ Ca ^= argA1@
+    @argA2 =   Be ^((~Bi)&  Bo )@
+    @argA3 =   Bi ^((~Bo)&  Bu )@
+    @argA4 =   Bo ^((~Bu)&  Ba )@
+    @argA5 =   Bu ^((~Ba)&  Be )@
+
+    veor.64 \argA2, \argA2, d8
+    veor.64 \argA3, \argA3, d9
+    vshl.u64  d3, \argA2, #45
+    vldr.64 d6, [sp, #\argA1]
+    vshl.u64  d4, \argA3, #61
+    veor.64 \argA4, \argA4, d10
+    vsri.64  d3, \argA2, #64-45
+    veor.64 \argA5, \argA5, d11
+    vsri.64  d4, \argA3, #64-61
+    vshl.u64  d0, \argA4, #28
+    veor.64 d6, d6, d7
+    vshl.u64  d1, \argA5, #20
+    vbic.64   \argA3, d4, d3
+    vsri.64  d0, \argA4, #64-28
+    vbic.64   \argA4, d0, d4
+    vshl.u64  d2, d6, #3
+    vsri.64  d1, \argA5, #64-20
+    veor.64   \argA4, d3
+    vsri.64  d2, d6, #64-3
+    vbic.64   \argA5, d1, d0
+    vbic.64   d6, d2, d1
+    vbic.64   \argA2, d3, d2
+    veor.64   d6, d0
+    veor.64   \argA2, d1
+    vstr.64   d6, [sp, #\argA1]
+    veor.64   \argA3, d2
+    veor.64  d5, d6
+    veor.64   \argA5, d4
+
+    .endm
+
+.macro    KeccakThetaRhoPiChi2 argA1, argA2, argA3, argA4, argA5
+
+    @d4 = ROL64((argA1^Da), 18)@
+    @d0 = ROL64((argA2^De), 1)@
+    @d1 = ROL64((argA3^Di), 6)@
+    @d2 = ROL64((argA4^Do), 25)@
+    @d3 = ROL64((argA5^Du), 8)@
+    @argA1 =   Ba ^((~Be)&  Bi )@ Ca ^= argA1@
+    @argA2 =   Be ^((~Bi)&  Bo )@
+    @argA3 =   Bi ^((~Bo)&  Bu )@
+    @argA4 =   Bo ^((~Bu)&  Ba )@
+    @argA5 =   Bu ^((~Ba)&  Be )@
+
+    veor.64 \argA3, \argA3, d9
+    veor.64 \argA4, \argA4, d10
+    vshl.u64  d1, \argA3, #6
+    vldr.64 d6, [sp, #\argA1]
+    vshl.u64  d2, \argA4, #25
+    veor.64 \argA5, \argA5, d11
+    vsri.64  d1, \argA3, #64-6
+    veor.64 \argA2, \argA2, d8
+    vsri.64  d2, \argA4, #64-25
+    vext.8  d3, \argA5, \argA5, #7
+    veor.64 d6, d6, d7
+    vbic.64  \argA3, d2, d1
+    vadd.u64  d0, \argA2, \argA2
+    vbic.64   \argA4, d3, d2
+    vsri.64  d0, \argA2, #64-1
+    vshl.u64  d4, d6, #18
+    veor.64  \argA2, d1, \argA4
+    veor.64  \argA3, d0
+    vsri.64  d4, d6, #64-18
+    vstr.64   \argA3, [sp, #\argA1]
+    veor.64  d5, \argA3
+    vbic.64   \argA5, d1, d0
+    vbic.64   \argA3, d4, d3
+    vbic.64   \argA4, d0, d4
+    veor.64   \argA3, d2
+    veor.64   \argA4, d3
+    veor.64   \argA5, d4
+
+    .endm
+
+.macro    KeccakThetaRhoPiChi3 argA1, argA2, argA3, argA4, argA5
+
+    @d1 = ROL64((argA1^Da), 36)@
+    @d2 = ROL64((argA2^De), 10)@
+    @d3 = ROL64((argA3^Di), 15)@
+    @d4 = ROL64((argA4^Do), 56)@
+    @d0 = ROL64((argA5^Du), 27)@
+    @argA1 =   Ba ^((~Be)&  Bi )@ Ca ^= argA1@
+    @argA2 =   Be ^((~Bi)&  Bo )@
+    @argA3 =   Bi ^((~Bo)&  Bu )@
+    @argA4 =   Bo ^((~Bu)&  Ba )@
+    @argA5 =   Bu ^((~Ba)&  Be )@
+
+    veor.64 \argA2, \argA2, d8
+    veor.64 \argA3, \argA3, d9
+    vshl.u64  d2, \argA2, #10
+    vldr.64 d6, [sp, #\argA1]
+    vshl.u64  d3, \argA3, #15
+    veor.64 \argA4, \argA4, d10
+    vsri.64  d2, \argA2, #64-10
+    vsri.64  d3, \argA3, #64-15
+    veor.64 \argA5, \argA5, d11
+    vext.8  d4, \argA4, \argA4, #1
+    vbic.64   \argA2, d3, d2
+    vshl.u64  d0, \argA5, #27
+    veor.64 d6, d6, d7
+    vbic.64   \argA3, d4, d3
+    vsri.64  d0, \argA5, #64-27
+    vshl.u64  d1, d6, #36
+    veor.64   \argA3, d2
+    vbic.64   \argA4, d0, d4
+    vsri.64  d1, d6, #64-36
+
+    veor.64   \argA4, d3
+    vbic.64   d6, d2, d1
+    vbic.64   \argA5, d1, d0
+    veor.64   d6, d0
+    veor.64   \argA2, d1
+    vstr.64   d6, [sp, #\argA1]
+    veor.64  d5, d6
+    veor.64   \argA5, d4
+
+    .endm
+
+.macro    KeccakThetaRhoPiChi4 argA1, argA2, argA3, argA4, argA5
+
+    @d3 = ROL64((argA1^Da), 41)@
+    @d4 = ROL64((argA2^De), 2)@
+    @d0 = ROL64((argA3^Di), 62)@
+    @d1 = ROL64((argA4^Do), 55)@
+    @d2 = ROL64((argA5^Du), 39)@
+    @argA1 =   Ba ^((~Be)&  Bi )@ Ca ^= argA1@
+    @argA2 =   Be ^((~Bi)&  Bo )@
+    @argA3 =   Bi ^((~Bo)&  Bu )@
+    @argA4 =   Bo ^((~Bu)&  Ba )@
+    @argA5 =   Bu ^((~Ba)&  Be )@
+
+    veor.64 \argA2, \argA2, d8
+    veor.64 \argA3, \argA3, d9
+    vshl.u64  d4, \argA2, #2
+    veor.64 \argA5, \argA5, d11
+    vshl.u64  d0, \argA3, #62
+    vldr.64 d6, [sp, #\argA1]
+    vsri.64  d4, \argA2, #64-2
+    veor.64 \argA4, \argA4, d10
+    vsri.64  d0, \argA3, #64-62
+
+    vshl.u64  d1, \argA4, #55
+    veor.64 d6, d6, d7
+    vshl.u64  d2, \argA5, #39
+    vsri.64  d1, \argA4, #64-55
+    vbic.64  \argA4, d0, d4
+    vsri.64  d2, \argA5, #64-39
+    vbic.64  \argA2, d1, d0
+    vshl.u64  d3, d6, #41
+    veor.64  \argA5, d4, \argA2
+    vbic.64  \argA2, d2, d1
+    vsri.64  d3, d6, #64-41
+    veor.64  d6, d0, \argA2
+
+    vbic.64 \argA2, d3, d2
+    vbic.64 \argA3, d4, d3
+    veor.64 \argA2, d1
+    vstr.64 d6, [sp, #\argA1]
+    veor.64 d5, d6
+    veor.64 \argA3, d2
+    veor.64 \argA4, d3
+
+    .endm
+
+
+@// --- code
+
+ at not callable from C!
+.p2align 3
+.type  KeccakF_armv7a_neon_asm,%function;
+KeccakF_armv7a_neon_asm:  @
+
+.LroundLoop:
+
+    KeccakThetaRhoPiChiIota  Aba, d13, d19, d25, d31
+    KeccakThetaRhoPiChi1    Aka, d15, d21, d22, d28
+    KeccakThetaRhoPiChi2    Asa, d12, d18, d24, d30
+    KeccakThetaRhoPiChi3    Aga, d14, d20, d26, d27
+    KeccakThetaRhoPiChi4    Ama, d16, d17, d23, d29
+
+    KeccakThetaRhoPiChiIota  Aba, d15, d18, d26, d29
+    KeccakThetaRhoPiChi1    Asa, d14, d17, d25, d28
+    KeccakThetaRhoPiChi2    Ama, d13, d21, d24, d27
+    KeccakThetaRhoPiChi3    Aka, d12, d20, d23, d31
+    KeccakThetaRhoPiChi4    Aga, d16, d19, d22, d30
+
+    KeccakThetaRhoPiChiIota Aba, d14, d21, d23, d30
+    KeccakThetaRhoPiChi1    Ama, d12, d19, d26, d28
+    KeccakThetaRhoPiChi2    Aga, d15, d17, d24, d31
+    KeccakThetaRhoPiChi3    Asa, d13, d20, d22, d29
+    KeccakThetaRhoPiChi4    Aka, d16, d18, d25, d27
+
+    KeccakThetaRhoPiChiIota Aba, d12, d17, d22, d27
+    KeccakThetaRhoPiChi1    Aga, d13, d18, d23, d28
+    KeccakThetaRhoPiChi2    Aka, d14, d19, d24, d29
+    ldr    r0, [ip]
+    KeccakThetaRhoPiChi3    Ama, d15, d20, d25, d30
+    cmp    r0, #0xFFFFFFFF
+    KeccakThetaRhoPiChi4    Asa, d16, d21, d26, d31
+
+    bne    .LroundLoop
+    sub    ip, #(8*24)
+    bx    lr
+.p2align 2
+.ltorg
+.size KeccakF_armv7a_neon_asm,.-KeccakF_armv7a_neon_asm;
+
+
+@//unsigned _gcry_keccak_permute_armv7_neon(u64 *state)  callable from C
+.p2align 3
+.global   _gcry_keccak_permute_armv7_neon
+.type  _gcry_keccak_permute_armv7_neon,%function;
+_gcry_keccak_permute_armv7_neon:
+
+    push   {ip, lr}
+    vpush  {q4-q7}
+    sub    sp,sp, #5*8
+
+    vldr.64  d0,  [r0, #0*8]
+    vldr.64  d12, [r0, #1*8]
+    vldr.64  d17, [r0, #2*8]
+    vldr.64  d22, [r0, #3*8]
+    vldr.64  d27, [r0, #4*8]
+
+    GET_DATA_POINTER(ip, _gcry_keccak_round_consts_64bit, lr);
+
+    vldr.64  d1,  [r0, #5*8]
+    vldr.64  d13, [r0, #6*8]
+    vldr.64  d18, [r0, #7*8]
+    vldr.64  d23, [r0, #8*8]
+    vldr.64  d28, [r0, #9*8]
+
+    vldr.64  d2,  [r0, #10*8]
+    vldr.64  d14, [r0, #11*8]
+    vldr.64  d19, [r0, #12*8]
+    vldr.64  d24, [r0, #13*8]
+    vldr.64  d29, [r0, #14*8]
+
+    vldr.64  d3,  [r0, #15*8]
+    vldr.64  d15, [r0, #16*8]
+    vldr.64  d20, [r0, #17*8]
+    vldr.64  d25, [r0, #18*8]
+    vldr.64  d30, [r0, #19*8]
+
+    vldr.64  d4,  [r0, #20*8]
+    vldr.64  d16, [r0, #21*8]
+    vldr.64  d21, [r0, #22*8]
+    vldr.64  d26, [r0, #23*8]
+    vldr.64  d31, [r0, #24*8]
+
+    vstr.64  d0, [sp, #Aba]
+    vstr.64  d1, [sp, #Aga]
+    veor.64 q0, q0, q1
+    vstr.64  d2, [sp, #Aka]
+    veor.64 d5, d0,  d1
+    vstr.64  d3, [sp, #Ama]
+    mov      r1, r0
+    vstr.64  d4, [sp, #Asa]
+    veor.64 d5, d5,  d4
+
+    bl KeccakF_armv7a_neon_asm
+
+    vpop.64  { d0- d4 }
+
+    vstr.64  d0,  [r1, #0*8]
+    vstr.64  d12, [r1, #1*8]
+    vstr.64  d17, [r1, #2*8]
+    vstr.64  d22, [r1, #3*8]
+    vstr.64  d27, [r1, #4*8]
+
+    vstr.64  d1,  [r1, #5*8]
+    vstr.64  d13, [r1, #6*8]
+    vstr.64  d18, [r1, #7*8]
+    vstr.64  d23, [r1, #8*8]
+    vstr.64  d28, [r1, #9*8]
+
+    vstr.64  d2,  [r1, #10*8]
+    vstr.64  d14, [r1, #11*8]
+    vstr.64  d19, [r1, #12*8]
+    vstr.64  d24, [r1, #13*8]
+    vstr.64  d29, [r1, #14*8]
+
+    vstr.64  d3,  [r1, #15*8]
+    vstr.64  d15, [r1, #16*8]
+    vstr.64  d20, [r1, #17*8]
+    vstr.64  d25, [r1, #18*8]
+    vstr.64  d30, [r1, #19*8]
+
+    vstr.64  d4,  [r1, #20*8]
+    vstr.64  d16, [r1, #21*8]
+    vstr.64  d21, [r1, #22*8]
+    vstr.64  d26, [r1, #23*8]
+    vstr.64  d31, [r1, #24*8]
+
+    mov   r0, #112
+    vpop  {q4-q7}
+    pop   {ip, pc}
+.p2align 2
+.ltorg
+.size _gcry_keccak_permute_armv7_neon,.-_gcry_keccak_permute_armv7_neon;
+
+@//unsigned _gcry_keccak_permute_armv7_neon(u64 *state, @r4
+@					    int pos,    @r1
+@					    const byte *lanes,   @r2
+@					    unsigned int nlanes, @r3
+@					    int blocklanes) @ r5 callable from C
+.p2align 3
+.global   _gcry_keccak_absorb_lanes64_armv7_neon
+.type  _gcry_keccak_absorb_lanes64_armv7_neon,%function;
+_gcry_keccak_absorb_lanes64_armv7_neon:
+
+    cmp    r3, #0	@ nlanes == 0
+    itt eq
+    moveq  r0, #0
+    bxeq   lr
+
+    push   {r4-r5, ip, lr}
+    beq    .Lout
+    mov    r4, r0
+    ldr    r5, [sp, #(4*4)]
+    vpush  {q4-q7}
+
+    @ load state
+    vldr.64  d0,  [r4, #0*8]
+    vldr.64  d12, [r4, #1*8]
+    vldr.64  d17, [r4, #2*8]
+    vldr.64  d22, [r4, #3*8]
+    vldr.64  d27, [r4, #4*8]
+
+    GET_DATA_POINTER(ip, _gcry_keccak_round_consts_64bit, lr);
+
+    vldr.64  d1,  [r4, #5*8]
+    vldr.64  d13, [r4, #6*8]
+    vldr.64  d18, [r4, #7*8]
+    vldr.64  d23, [r4, #8*8]
+    vldr.64  d28, [r4, #9*8]
+
+    vldr.64  d2,  [r4, #10*8]
+    vldr.64  d14, [r4, #11*8]
+    vldr.64  d19, [r4, #12*8]
+    vldr.64  d24, [r4, #13*8]
+    vldr.64  d29, [r4, #14*8]
+
+    vldr.64  d3,  [r4, #15*8]
+    vldr.64  d15, [r4, #16*8]
+    vldr.64  d20, [r4, #17*8]
+    vldr.64  d25, [r4, #18*8]
+    vldr.64  d30, [r4, #19*8]
+
+    vldr.64  d4,  [r4, #20*8]
+    vldr.64  d16, [r4, #21*8]
+    vldr.64  d21, [r4, #22*8]
+    vldr.64  d26, [r4, #23*8]
+    vldr.64  d31, [r4, #24*8]
+
+.Lmain_loop:
+
+    @ detect absorb mode (full blocks vs lanes)
+
+    cmp r1, #0		@ pos != 0
+    bne .Llanes_loop
+
+.Lmain_loop_pos0:
+
+    @ full blocks mode
+
+    @ switch (blocksize)
+    cmp r5, #21
+    beq .Lfull_block_21
+    cmp r5, #18
+    beq .Lfull_block_18
+    cmp r5, #17
+    beq .Lfull_block_17
+    cmp r5, #13
+    beq .Lfull_block_13
+    cmp r5, #9
+    beq .Lfull_block_9
+
+    @ unknown blocksize
+    b .Llanes_loop
+
+.Lfull_block_21:
+
+    @ SHAKE128
+
+    cmp r3, #21		@ nlanes < blocklanes
+    blo .Llanes_loop
+
+    sub    sp,sp, #5*8
+
+    vld1.64 {d5-d8}, [r2]!
+    veor d0,  d5
+    vld1.64 {d9-d11}, [r2]!
+    veor d12, d6
+    veor d17, d7
+    veor d22, d8
+    vld1.64 {d5-d8}, [r2]!
+    veor d27, d9
+
+    veor d1,  d10
+    veor d13, d11
+    vld1.64 {d9-d11}, [r2]!
+    veor d18, d5
+    veor d23, d6
+    veor d28, d7
+
+    veor d2,  d8
+    vld1.64 {d5-d8}, [r2]!
+    veor d14, d9
+    veor d19, d10
+    veor d24, d11
+    vld1.64 {d9-d11}, [r2]!
+    veor d29, d5
+
+    veor d3,  d6
+    veor d15, d7
+    veor d20, d8
+    veor d25, d9
+    veor d30, d10
+
+    veor d4,  d11
+
+    vstr.64  d0, [sp, #Aba]
+    vstr.64  d1, [sp, #Aga]
+    veor.64 q0, q0, q1
+    vstr.64  d2, [sp, #Aka]
+    veor.64 d5, d0,  d1
+    vstr.64  d3, [sp, #Ama]
+    vstr.64  d4, [sp, #Asa]
+    veor.64 d5, d5,  d4
+
+    bl KeccakF_armv7a_neon_asm
+
+    subs r3, #21	@ nlanes -= 21
+    vpop.64  { d0-d4 }
+
+    beq .Ldone
+
+    b .Lfull_block_21
+
+.Lfull_block_18:
+
+    @ SHA3-224
+
+    cmp r3, #18		@ nlanes < blocklanes
+    blo .Llanes_loop
+
+    sub    sp,sp, #5*8
+
+    vld1.64 {d5-d8}, [r2]!
+    veor d0,  d5
+    vld1.64 {d9-d11}, [r2]!
+    veor d12, d6
+    veor d17, d7
+    veor d22, d8
+    vld1.64 {d5-d8}, [r2]!
+    veor d27, d9
+
+    veor d1,  d10
+    veor d13, d11
+    vld1.64 {d9-d11}, [r2]!
+    veor d18, d5
+    veor d23, d6
+    veor d28, d7
+
+    veor d2,  d8
+    vld1.64 {d5-d8}, [r2]!
+    veor d14, d9
+    veor d19, d10
+    veor d24, d11
+    veor d29, d5
+
+    veor d3,  d6
+    veor d15, d7
+    veor d20, d8
+
+    vstr.64  d0, [sp, #Aba]
+    vstr.64  d1, [sp, #Aga]
+    veor.64 q0, q0, q1
+    vstr.64  d2, [sp, #Aka]
+    veor.64 d5, d0,  d1
+    vstr.64  d3, [sp, #Ama]
+    vstr.64  d4, [sp, #Asa]
+    veor.64 d5, d5,  d4
+
+    bl KeccakF_armv7a_neon_asm
+
+    subs r3, #18	@ nlanes -= 18
+    vpop.64  { d0-d4 }
+
+    beq .Ldone
+
+    b .Lfull_block_18
+
+.Lfull_block_17:
+
+    @ SHA3-256 & SHAKE256
+
+    cmp r3, #17		@ nlanes < blocklanes
+    blo .Llanes_loop
+
+    sub    sp,sp, #5*8
+
+    vld1.64 {d5-d8}, [r2]!
+    veor d0,  d5
+    vld1.64 {d9-d11}, [r2]!
+    veor d12, d6
+    veor d17, d7
+    veor d22, d8
+    vld1.64 {d5-d8}, [r2]!
+    veor d27, d9
+
+    veor d1,  d10
+    veor d13, d11
+    vld1.64 {d9-d11}, [r2]!
+    veor d18, d5
+    veor d23, d6
+    veor d28, d7
+
+    veor d2,  d8
+    vld1.64 {d5-d7}, [r2]!
+    veor d14, d9
+    veor d19, d10
+    veor d24, d11
+    veor d29, d5
+
+    veor d3,  d6
+    veor d15, d7
+
+    vstr.64  d0, [sp, #Aba]
+    vstr.64  d1, [sp, #Aga]
+    veor.64 q0, q0, q1
+    vstr.64  d2, [sp, #Aka]
+    veor.64 d5, d0,  d1
+    vstr.64  d3, [sp, #Ama]
+    vstr.64  d4, [sp, #Asa]
+    veor.64 d5, d5,  d4
+
+    bl KeccakF_armv7a_neon_asm
+
+    subs r3, #17	@ nlanes -= 17
+    vpop.64  { d0-d4 }
+
+    beq .Ldone
+
+    b .Lfull_block_17
+
+.Lfull_block_13:
+
+    @ SHA3-384
+
+    cmp r3, #13		@ nlanes < blocklanes
+    blo .Llanes_loop
+
+    sub    sp,sp, #5*8
+
+    vld1.64 {d5-d8}, [r2]!
+    veor d0,  d5
+    vld1.64 {d9-d11}, [r2]!
+    veor d12, d6
+    veor d17, d7
+    veor d22, d8
+    vld1.64 {d5-d8}, [r2]!
+    veor d27, d9
+
+    veor d1,  d10
+    veor d13, d11
+    vld1.64 {d9-d10}, [r2]!
+    veor d18, d5
+    veor d23, d6
+    veor d28, d7
+
+    veor d2,  d8
+    veor d14, d9
+    veor d19, d10
+
+    vstr.64  d0, [sp, #Aba]
+    vstr.64  d1, [sp, #Aga]
+    veor.64 q0, q0, q1
+    vstr.64  d2, [sp, #Aka]
+    veor.64 d5, d0,  d1
+    vstr.64  d3, [sp, #Ama]
+    vstr.64  d4, [sp, #Asa]
+    veor.64 d5, d5,  d4
+
+    bl KeccakF_armv7a_neon_asm
+
+    subs r3, #13	@ nlanes -= 13
+    vpop.64  { d0-d4 }
+
+    beq .Ldone
+
+    b .Lfull_block_13
+
+.Lfull_block_9:
+
+    @ SHA3-512
+
+    cmp r3, #9		@ nlanes < blocklanes
+    blo .Llanes_loop
+
+    sub    sp,sp, #5*8
+
+    vld1.64 {d5-d8}, [r2]!
+    veor d0,  d5
+    vld1.64 {d9-d11}, [r2]!
+    veor d12, d6
+    veor d17, d7
+    veor d22, d8
+    vld1.64 {d5-d6}, [r2]!
+    veor d27, d9
+
+    veor d1,  d10
+    veor d13, d11
+    veor d18, d5
+    veor d23, d6
+
+    vstr.64  d0, [sp, #Aba]
+    vstr.64  d1, [sp, #Aga]
+    veor.64 q0, q0, q1
+    vstr.64  d2, [sp, #Aka]
+    veor.64 d5, d0,  d1
+    vstr.64  d3, [sp, #Ama]
+    vstr.64  d4, [sp, #Asa]
+    veor.64 d5, d5,  d4
+
+    bl KeccakF_armv7a_neon_asm
+
+    subs r3, #9		@ nlanes -= 9
+    vpop.64  { d0-d4 }
+
+    beq .Ldone
+
+    b .Lfull_block_9
+
+.Llanes_loop:
+
+    @ per-lane mode
+
+    @ switch (pos)
+    ldrb r0, [pc, r1]
+    add pc, pc, r0, lsl #2
+.Lswitch_table:
+    .byte (.Llane0-.Lswitch_table-4)/4
+    .byte (.Llane1-.Lswitch_table-4)/4
+    .byte (.Llane2-.Lswitch_table-4)/4
+    .byte (.Llane3-.Lswitch_table-4)/4
+    .byte (.Llane4-.Lswitch_table-4)/4
+    .byte (.Llane5-.Lswitch_table-4)/4
+    .byte (.Llane6-.Lswitch_table-4)/4
+    .byte (.Llane7-.Lswitch_table-4)/4
+    .byte (.Llane8-.Lswitch_table-4)/4
+    .byte (.Llane9-.Lswitch_table-4)/4
+    .byte (.Llane10-.Lswitch_table-4)/4
+    .byte (.Llane11-.Lswitch_table-4)/4
+    .byte (.Llane12-.Lswitch_table-4)/4
+    .byte (.Llane13-.Lswitch_table-4)/4
+    .byte (.Llane14-.Lswitch_table-4)/4
+    .byte (.Llane15-.Lswitch_table-4)/4
+    .byte (.Llane16-.Lswitch_table-4)/4
+    .byte (.Llane17-.Lswitch_table-4)/4
+    .byte (.Llane18-.Lswitch_table-4)/4
+    .byte (.Llane19-.Lswitch_table-4)/4
+    .byte (.Llane20-.Lswitch_table-4)/4
+    .byte (.Llane21-.Lswitch_table-4)/4
+    .byte (.Llane22-.Lswitch_table-4)/4
+    .byte (.Llane23-.Lswitch_table-4)/4
+    .byte (.Llane24-.Lswitch_table-4)/4
+.p2align 2
+
+#define ABSORB_LANE(label, vreg) \
+    label: \
+      add     r1, #1; \
+      vld1.64 d5, [r2]!; \
+      cmp     r1, r5; /* pos == blocklanes */ \
+      veor    vreg, vreg, d5; \
+      beq     .Llanes_permute; \
+      subs    r3, #1; \
+      beq     .Ldone;
+
+    ABSORB_LANE(.Llane0, d0)
+    ABSORB_LANE(.Llane1, d12)
+    ABSORB_LANE(.Llane2, d17)
+    ABSORB_LANE(.Llane3, d22)
+    ABSORB_LANE(.Llane4, d27)
+
+    ABSORB_LANE(.Llane5, d1)
+    ABSORB_LANE(.Llane6, d13)
+    ABSORB_LANE(.Llane7, d18)
+    ABSORB_LANE(.Llane8, d23)
+    ABSORB_LANE(.Llane9, d28)
+
+    ABSORB_LANE(.Llane10, d2)
+    ABSORB_LANE(.Llane11, d14)
+    ABSORB_LANE(.Llane12, d19)
+    ABSORB_LANE(.Llane13, d24)
+    ABSORB_LANE(.Llane14, d29)
+
+    ABSORB_LANE(.Llane15, d3)
+    ABSORB_LANE(.Llane16, d15)
+    ABSORB_LANE(.Llane17, d20)
+    ABSORB_LANE(.Llane18, d25)
+    ABSORB_LANE(.Llane19, d30)
+
+    ABSORB_LANE(.Llane20, d4)
+    ABSORB_LANE(.Llane21, d16)
+    ABSORB_LANE(.Llane22, d21)
+    ABSORB_LANE(.Llane23, d26)
+    ABSORB_LANE(.Llane24, d31)
+
+    b .Llanes_loop
+
+.Llanes_permute:
+
+    sub    sp,sp, #5*8
+    vstr.64  d0, [sp, #Aba]
+    vstr.64  d1, [sp, #Aga]
+    veor.64 q0, q0, q1
+    vstr.64  d2, [sp, #Aka]
+    veor.64 d5, d0,  d1
+    vstr.64  d3, [sp, #Ama]
+    vstr.64  d4, [sp, #Asa]
+    veor.64 d5, d5,  d4
+
+    bl KeccakF_armv7a_neon_asm
+
+    mov  r1, #0   @ pos <= 0
+    subs r3, #1
+
+    vpop.64  { d0-d4 }
+
+    beq  .Ldone
+
+    b .Lmain_loop_pos0
+
+.Ldone:
+
+    @ save state
+    vstr.64  d0,  [r4, #0*8]
+    vstr.64  d12, [r4, #1*8]
+    vstr.64  d17, [r4, #2*8]
+    vstr.64  d22, [r4, #3*8]
+    vstr.64  d27, [r4, #4*8]
+
+    vstr.64  d1,  [r4, #5*8]
+    vstr.64  d13, [r4, #6*8]
+    vstr.64  d18, [r4, #7*8]
+    vstr.64  d23, [r4, #8*8]
+    vstr.64  d28, [r4, #9*8]
+
+    vstr.64  d2,  [r4, #10*8]
+    vstr.64  d14, [r4, #11*8]
+    vstr.64  d19, [r4, #12*8]
+    vstr.64  d24, [r4, #13*8]
+    vstr.64  d29, [r4, #14*8]
+
+    vstr.64  d3,  [r4, #15*8]
+    vstr.64  d15, [r4, #16*8]
+    vstr.64  d20, [r4, #17*8]
+    vstr.64  d25, [r4, #18*8]
+    vstr.64  d30, [r4, #19*8]
+
+    vstr.64  d4,  [r4, #20*8]
+    vstr.64  d16, [r4, #21*8]
+    vstr.64  d21, [r4, #22*8]
+    vstr.64  d26, [r4, #23*8]
+    vstr.64  d31, [r4, #24*8]
+
+    mov   r0, #120
+    vpop  {q4-q7}
+.Lout:
+    pop   {r4-r5, ip, pc}
+.p2align 2
+.ltorg
+.size _gcry_keccak_absorb_lanes64_armv7_neon,.-_gcry_keccak_absorb_lanes64_armv7_neon;
+
+#endif
diff --git a/cipher/keccak.c b/cipher/keccak.c
index ce57860..0bb3155 100644
--- a/cipher/keccak.c
+++ b/cipher/keccak.c
@@ -59,7 +59,19 @@
 #endif
 
 
-#ifdef USE_64BIT
+/* USE_64BIT_ARM_NEON indicates whether to enable 64-bit ARM/NEON assembly
+ * code. */
+#undef USE_64BIT_ARM_NEON
+#ifdef ENABLE_NEON_SUPPORT
+# if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \
+     && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \
+     && defined(HAVE_GCC_INLINE_ASM_NEON)
+#  define USE_64BIT_ARM_NEON 1
+# endif
+#endif /*ENABLE_NEON_SUPPORT*/
+
+
+#if defined(USE_64BIT) || defined(USE_64BIT_ARM_NEON)
 # define NEED_COMMON64 1
 #endif
 
@@ -109,7 +121,7 @@ typedef struct KECCAK_CONTEXT_S
 
 #ifdef NEED_COMMON64
 
-static const u64 round_consts_64bit[24] =
+const u64 _gcry_keccak_round_consts_64bit[24 + 1] =
 {
   U64_C(0x0000000000000001), U64_C(0x0000000000008082),
   U64_C(0x800000000000808A), U64_C(0x8000000080008000),
@@ -122,7 +134,8 @@ static const u64 round_consts_64bit[24] =
   U64_C(0x8000000000008002), U64_C(0x8000000000000080),
   U64_C(0x000000000000800A), U64_C(0x800000008000000A),
   U64_C(0x8000000080008081), U64_C(0x8000000000008080),
-  U64_C(0x0000000080000001), U64_C(0x8000000080008008)
+  U64_C(0x0000000080000001), U64_C(0x8000000080008008),
+  U64_C(0xFFFFFFFFFFFFFFFF)
 };
 
 static unsigned int
@@ -400,6 +413,54 @@ static const keccak_ops_t keccak_bmi2_64_ops =
 #endif /* USE_64BIT_BMI2 */
 
 
+/* 64-bit ARMv7/NEON implementation. */
+#ifdef USE_64BIT_ARM_NEON
+
+unsigned int _gcry_keccak_permute_armv7_neon(u64 *state);
+unsigned int _gcry_keccak_absorb_lanes64_armv7_neon(u64 *state, int pos,
+						    const byte *lanes,
+						    unsigned int nlanes,
+						    int blocklanes);
+
+static unsigned int keccak_permute64_armv7_neon(KECCAK_STATE *hd)
+{
+  return _gcry_keccak_permute_armv7_neon(hd->u.state64);
+}
+
+static unsigned int
+keccak_absorb_lanes64_armv7_neon(KECCAK_STATE *hd, int pos, const byte *lanes,
+				 unsigned int nlanes, int blocklanes)
+{
+  if (blocklanes < 0)
+    {
+      /* blocklanes == -1, permutationless absorb from keccak_final. */
+
+      while (nlanes)
+	{
+	  hd->u.state64[pos] ^= buf_get_le64(lanes);
+	  lanes += 8;
+	  nlanes--;
+	}
+
+      return 0;
+    }
+  else
+    {
+      return _gcry_keccak_absorb_lanes64_armv7_neon(hd->u.state64, pos, lanes,
+						    nlanes, blocklanes);
+    }
+}
+
+static const keccak_ops_t keccak_armv7_neon_64_ops =
+{
+  .permute = keccak_permute64_armv7_neon,
+  .absorb = keccak_absorb_lanes64_armv7_neon,
+  .extract = keccak_extract64,
+};
+
+#endif /* USE_64BIT_ARM_NEON */
+
+
 /* Construct generic 32-bit implementation. */
 #ifdef USE_32BIT
 
@@ -662,6 +723,10 @@ keccak_init (int algo, void *context, unsigned int flags)
 
   /* Select optimized implementation based in hw features. */
   if (0) {}
+#ifdef USE_64BIT_ARM_NEON
+  else if (features & HWF_ARM_NEON)
+    ctx->ops = &keccak_armv7_neon_64_ops;
+#endif
 #ifdef USE_64BIT_BMI2
   else if (features & HWF_INTEL_BMI2)
     ctx->ops = &keccak_bmi2_64_ops;
diff --git a/cipher/keccak_permute_64.h b/cipher/keccak_permute_64.h
index 6f24217..1a80192 100644
--- a/cipher/keccak_permute_64.h
+++ b/cipher/keccak_permute_64.h
@@ -25,7 +25,7 @@
 static unsigned int
 KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd)
 {
-  const u64 *round_consts = round_consts_64bit;
+  const u64 *round_consts = _gcry_keccak_round_consts_64bit;
   u64 Aba, Abe, Abi, Abo, Abu;
   u64 Aga, Age, Agi, Ago, Agu;
   u64 Aka, Ake, Aki, Ako, Aku;
diff --git a/configure.ac b/configure.ac
index 2acfa36..ed37ab5 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2108,7 +2108,7 @@ if test "$found" = "1" ; then
 
    if test x"$neonsupport" = xyes ; then
      # Build with the NEON implementation
-     :
+     GCRYPT_DIGESTS="$GCRYPT_DIGESTS keccak-armv7-neon.lo"
    fi
 fi
 

commit 2857cb89c6dc1c02266600bc1fd2967a3cd5cf88
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date:   Sat Oct 31 21:29:56 2015 +0200

    Optimize Keccak 64-bit absorb functions
    
    * cipher/keccak.c [USE_64BIT] [__x86_64__] (absorb_lanes64_8)
    (absorb_lanes64_4, absorb_lanes64_2, absorb_lanes64_1): New.
    * cipher/keccak.c [USE_64BIT] [!__x86_64__] (absorb_lanes64_8)
    (absorb_lanes64_4, absorb_lanes64_2, absorb_lanes64_1): New.
    [USE_64BIT] (KECCAK_F1600_ABSORB_FUNC_NAME): New.
    [USE_64BIT] (keccak_absorb_lanes64): Remove.
    [USE_64BIT_SHLD] (KECCAK_F1600_ABSORB_FUNC_NAME): New.
    [USE_64BIT_SHLD] (keccak_absorb_lanes64_shld): Remove.
    [USE_64BIT_BMI2] (KECCAK_F1600_ABSORB_FUNC_NAME): New.
    [USE_64BIT_BMI2] (keccak_absorb_lanes64_bmi2): Remove.
    * cipher/keccak_permute_64.h (KECCAK_F1600_ABSORB_FUNC_NAME): New.
    --
    
    Optimize 64-bit absorb functions for small speed-up. After this
    change, 64-bit BMI2 implementation matches speed of fastest results
    from SUPERCOP for Intel Haswell CPUs (long messages).
    
    Benchmark on Intel Haswell @ 3.2 Ghz:
    
    Before:
                    |  nanosecs/byte   mebibytes/sec   cycles/byte
     SHAKE128       |      2.32 ns/B     411.7 MiB/s      7.41 c/B
     SHAKE256       |      2.84 ns/B     336.2 MiB/s      9.08 c/B
     SHA3-224       |      2.69 ns/B     354.9 MiB/s      8.60 c/B
     SHA3-256       |      2.84 ns/B     336.0 MiB/s      9.08 c/B
     SHA3-384       |      3.69 ns/B     258.4 MiB/s     11.81 c/B
     SHA3-512       |      5.30 ns/B     179.9 MiB/s     16.97 c/B
    
    After:
                    |  nanosecs/byte   mebibytes/sec   cycles/byte
     SHAKE128       |      2.27 ns/B     420.6 MiB/s      7.26 c/B
     SHAKE256       |      2.79 ns/B     341.4 MiB/s      8.94 c/B
     SHA3-224       |      2.64 ns/B     361.7 MiB/s      8.44 c/B
     SHA3-256       |      2.79 ns/B     341.5 MiB/s      8.94 c/B
     SHA3-384       |      3.65 ns/B     261.4 MiB/s     11.68 c/B
     SHA3-512       |      5.27 ns/B     181.0 MiB/s     16.87 c/B
    
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>

diff --git a/cipher/keccak.c b/cipher/keccak.c
index f4f0ef3..ce57860 100644
--- a/cipher/keccak.c
+++ b/cipher/keccak.c
@@ -223,38 +223,105 @@ keccak_absorb_lane32bi(u32 *lane, u32 x0, u32 x1)
 /* Construct generic 64-bit implementation. */
 #ifdef USE_64BIT
 
+#if __GNUC__ >= 4 && defined(__x86_64__)
+
+static inline void absorb_lanes64_8(u64 *dst, const byte *in)
+{
+  asm ("movdqu 0*16(%[dst]), %%xmm0\n\t"
+       "movdqu 0*16(%[in]), %%xmm4\n\t"
+       "movdqu 1*16(%[dst]), %%xmm1\n\t"
+       "movdqu 1*16(%[in]), %%xmm5\n\t"
+       "movdqu 2*16(%[dst]), %%xmm2\n\t"
+       "movdqu 3*16(%[dst]), %%xmm3\n\t"
+       "pxor %%xmm4, %%xmm0\n\t"
+       "pxor %%xmm5, %%xmm1\n\t"
+       "movdqu 2*16(%[in]), %%xmm4\n\t"
+       "movdqu 3*16(%[in]), %%xmm5\n\t"
+       "movdqu %%xmm0, 0*16(%[dst])\n\t"
+       "pxor %%xmm4, %%xmm2\n\t"
+       "movdqu %%xmm1, 1*16(%[dst])\n\t"
+       "pxor %%xmm5, %%xmm3\n\t"
+       "movdqu %%xmm2, 2*16(%[dst])\n\t"
+       "movdqu %%xmm3, 3*16(%[dst])\n\t"
+       :
+       : [dst] "r" (dst), [in] "r" (in)
+       : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "memory");
+}
+
+static inline void absorb_lanes64_4(u64 *dst, const byte *in)
+{
+  asm ("movdqu 0*16(%[dst]), %%xmm0\n\t"
+       "movdqu 0*16(%[in]), %%xmm4\n\t"
+       "movdqu 1*16(%[dst]), %%xmm1\n\t"
+       "movdqu 1*16(%[in]), %%xmm5\n\t"
+       "pxor %%xmm4, %%xmm0\n\t"
+       "pxor %%xmm5, %%xmm1\n\t"
+       "movdqu %%xmm0, 0*16(%[dst])\n\t"
+       "movdqu %%xmm1, 1*16(%[dst])\n\t"
+       :
+       : [dst] "r" (dst), [in] "r" (in)
+       : "xmm0", "xmm1", "xmm4", "xmm5", "memory");
+}
+
+static inline void absorb_lanes64_2(u64 *dst, const byte *in)
+{
+  asm ("movdqu 0*16(%[dst]), %%xmm0\n\t"
+       "movdqu 0*16(%[in]), %%xmm4\n\t"
+       "pxor %%xmm4, %%xmm0\n\t"
+       "movdqu %%xmm0, 0*16(%[dst])\n\t"
+       :
+       : [dst] "r" (dst), [in] "r" (in)
+       : "xmm0", "xmm4", "memory");
+}
+
+#else /* __x86_64__ */
+
+static inline void absorb_lanes64_8(u64 *dst, const byte *in)
+{
+  dst[0] ^= buf_get_le64(in + 8 * 0);
+  dst[1] ^= buf_get_le64(in + 8 * 1);
+  dst[2] ^= buf_get_le64(in + 8 * 2);
+  dst[3] ^= buf_get_le64(in + 8 * 3);
+  dst[4] ^= buf_get_le64(in + 8 * 4);
+  dst[5] ^= buf_get_le64(in + 8 * 5);
+  dst[6] ^= buf_get_le64(in + 8 * 6);
+  dst[7] ^= buf_get_le64(in + 8 * 7);
+}
+
+static inline void absorb_lanes64_4(u64 *dst, const byte *in)
+{
+  dst[0] ^= buf_get_le64(in + 8 * 0);
+  dst[1] ^= buf_get_le64(in + 8 * 1);
+  dst[2] ^= buf_get_le64(in + 8 * 2);
+  dst[3] ^= buf_get_le64(in + 8 * 3);
+}
+
+static inline void absorb_lanes64_2(u64 *dst, const byte *in)
+{
+  dst[0] ^= buf_get_le64(in + 8 * 0);
+  dst[1] ^= buf_get_le64(in + 8 * 1);
+}
+
+#endif /* !__x86_64__ */
+
+static inline void absorb_lanes64_1(u64 *dst, const byte *in)
+{
+  dst[0] ^= buf_get_le64(in + 8 * 0);
+}
+
+
 # define ANDN64(x, y) (~(x) & (y))
 # define ROL64(x, n) (((x) << ((unsigned int)n & 63)) | \
 		      ((x) >> ((64 - (unsigned int)(n)) & 63)))
 
 # define KECCAK_F1600_PERMUTE_FUNC_NAME keccak_f1600_state_permute64
+# define KECCAK_F1600_ABSORB_FUNC_NAME keccak_absorb_lanes64
 # include "keccak_permute_64.h"
 
 # undef ANDN64
 # undef ROL64
 # undef KECCAK_F1600_PERMUTE_FUNC_NAME
-
-static unsigned int
-keccak_absorb_lanes64(KECCAK_STATE *hd, int pos, const byte *lanes,
-		      unsigned int nlanes, int blocklanes)
-{
-  unsigned int burn = 0;
-
-  while (nlanes)
-    {
-      hd->u.state64[pos] ^= buf_get_le64(lanes);
-      lanes += 8;
-      nlanes--;
-
-      if (++pos == blocklanes)
-	{
-	  burn = keccak_f1600_state_permute64(hd);
-	  pos = 0;
-	}
-    }
-
-  return burn;
-}
+# undef KECCAK_F1600_ABSORB_FUNC_NAME
 
 static const keccak_ops_t keccak_generic64_ops =
 {
@@ -279,33 +346,13 @@ static const keccak_ops_t keccak_generic64_ops =
 			tmp; })
 
 # define KECCAK_F1600_PERMUTE_FUNC_NAME keccak_f1600_state_permute64_shld
+# define KECCAK_F1600_ABSORB_FUNC_NAME keccak_absorb_lanes64_shld
 # include "keccak_permute_64.h"
 
 # undef ANDN64
 # undef ROL64
 # undef KECCAK_F1600_PERMUTE_FUNC_NAME
-
-static unsigned int
-keccak_absorb_lanes64_shld(KECCAK_STATE *hd, int pos, const byte *lanes,
-			   unsigned int nlanes, int blocklanes)
-{
-  unsigned int burn = 0;
-
-  while (nlanes)
-    {
-      hd->u.state64[pos] ^= buf_get_le64(lanes);
-      lanes += 8;
-      nlanes--;
-
-      if (++pos == blocklanes)
-	{
-	  burn = keccak_f1600_state_permute64_shld(hd);
-	  pos = 0;
-	}
-    }
-
-  return burn;
-}
+# undef KECCAK_F1600_ABSORB_FUNC_NAME
 
 static const keccak_ops_t keccak_shld_64_ops =
 {
@@ -335,33 +382,13 @@ static const keccak_ops_t keccak_shld_64_ops =
 			tmp; })
 
 # define KECCAK_F1600_PERMUTE_FUNC_NAME keccak_f1600_state_permute64_bmi2
+# define KECCAK_F1600_ABSORB_FUNC_NAME keccak_absorb_lanes64_bmi2
 # include "keccak_permute_64.h"
 
 # undef ANDN64
 # undef ROL64
 # undef KECCAK_F1600_PERMUTE_FUNC_NAME
-
-static unsigned int
-keccak_absorb_lanes64_bmi2(KECCAK_STATE *hd, int pos, const byte *lanes,
-			   unsigned int nlanes, int blocklanes)
-{
-  unsigned int burn = 0;
-
-  while (nlanes)
-    {
-      hd->u.state64[pos] ^= buf_get_le64(lanes);
-      lanes += 8;
-      nlanes--;
-
-      if (++pos == blocklanes)
-	{
-	  burn = keccak_f1600_state_permute64_bmi2(hd);
-	  pos = 0;
-	}
-    }
-
-  return burn;
-}
+# undef KECCAK_F1600_ABSORB_FUNC_NAME
 
 static const keccak_ops_t keccak_bmi2_64_ops =
 {
diff --git a/cipher/keccak_permute_64.h b/cipher/keccak_permute_64.h
index 1264f19..6f24217 100644
--- a/cipher/keccak_permute_64.h
+++ b/cipher/keccak_permute_64.h
@@ -288,3 +288,102 @@ KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd)
 
   return sizeof(void *) * 4 + sizeof(u64) * 12 * 5;
 }
+
+static unsigned int
+KECCAK_F1600_ABSORB_FUNC_NAME(KECCAK_STATE *hd, int pos, const byte *lanes,
+			      unsigned int nlanes, int blocklanes)
+{
+  unsigned int burn = 0;
+
+  while (nlanes)
+    {
+      switch (blocklanes)
+	{
+	case 21:
+	  /* SHAKE128 */
+	  while (pos == 0 && nlanes >= 21)
+	    {
+	      absorb_lanes64_8(&hd->u.state64[0], lanes + 8 * 0);
+	      absorb_lanes64_4(&hd->u.state64[8], lanes + 8 * 8);
+	      absorb_lanes64_8(&hd->u.state64[12], lanes + 8 * 12);
+	      absorb_lanes64_1(&hd->u.state64[20], lanes + 8 * 20);
+	      lanes += 8 * 21;
+	      nlanes -= 21;
+
+	      burn = KECCAK_F1600_PERMUTE_FUNC_NAME(hd);
+	    }
+	  break;
+
+	case 18:
+	  /* SHA3-224 */
+	  while (pos == 0 && nlanes >= 18)
+	    {
+	      absorb_lanes64_8(&hd->u.state64[0], lanes + 8 * 0);
+	      absorb_lanes64_2(&hd->u.state64[8], lanes + 8 * 8);
+	      absorb_lanes64_8(&hd->u.state64[10], lanes + 8 * 10);
+	      lanes += 8 * 18;
+	      nlanes -= 18;
+
+	      burn = KECCAK_F1600_PERMUTE_FUNC_NAME(hd);
+	    }
+	  break;
+
+	case 17:
+	  /* SHA3-256 & SHAKE256 */
+	  while (pos == 0 && nlanes >= 17)
+	    {
+	      absorb_lanes64_8(&hd->u.state64[0], lanes + 8 * 0);
+	      absorb_lanes64_8(&hd->u.state64[8], lanes + 8 * 8);
+	      absorb_lanes64_1(&hd->u.state64[16], lanes + 8 * 16);
+	      lanes += 8 * 17;
+	      nlanes -= 17;
+
+	      burn = KECCAK_F1600_PERMUTE_FUNC_NAME(hd);
+	    }
+	  break;
+
+	case 13:
+	  /* SHA3-384 */
+	  while (pos == 0 && nlanes >= 13)
+	    {
+	      absorb_lanes64_8(&hd->u.state64[0], lanes + 8 * 0);
+	      absorb_lanes64_4(&hd->u.state64[8], lanes + 8 * 8);
+	      absorb_lanes64_1(&hd->u.state64[12], lanes + 8 * 12);
+	      lanes += 8 * 13;
+	      nlanes -= 13;
+
+	      burn = KECCAK_F1600_PERMUTE_FUNC_NAME(hd);
+	    }
+	  break;
+
+	case 9:
+	  /* SHA3-512 */
+	  while (pos == 0 && nlanes >= 9)
+	    {
+	      absorb_lanes64_8(&hd->u.state64[0], lanes + 8 * 0);
+	      absorb_lanes64_1(&hd->u.state64[8], lanes + 8 * 8);
+	      lanes += 8 * 9;
+	      nlanes -= 9;
+
+	      burn = KECCAK_F1600_PERMUTE_FUNC_NAME(hd);
+	    }
+	  break;
+	}
+
+      while (nlanes)
+	{
+	  hd->u.state64[pos] ^= buf_get_le64(lanes);
+	  lanes += 8;
+	  nlanes--;
+
+	  if (++pos == blocklanes)
+	    {
+	      burn = KECCAK_F1600_PERMUTE_FUNC_NAME(hd);
+	      pos = 0;
+	      break;
+	    }
+	}
+    }
+
+  return burn;
+}

commit 07e4839e75a7bca3a6c0a94aecfe75efe61d7ff2
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date:   Sat Oct 31 20:19:59 2015 +0200

    Enable CRC test vectors with zero bytes
    
    * tests/basic.c (check_digests): Enable CRC test-vectors with zero
    bytes.
    --
    
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>

diff --git a/tests/basic.c b/tests/basic.c
index 0762a89..7d5de00 100644
--- a/tests/basic.c
+++ b/tests/basic.c
@@ -5851,16 +5851,12 @@ check_digests (void)
       {	GCRY_MD_CRC32_RFC1510, "test0123456789", "\xb8\x3e\x88\xd6" },
       {	GCRY_MD_CRC32_RFC1510, "MASSACHVSETTS INSTITVTE OF TECHNOLOGY",
 	"\xe3\x41\x80\xf7" },
-#if 0
-      {	GCRY_MD_CRC32_RFC1510, "\x80\x00", "\x3b\x83\x98\x4b" },
-      {	GCRY_MD_CRC32_RFC1510, "\x00\x08", "\x0e\xdb\x88\x32" },
-      {	GCRY_MD_CRC32_RFC1510, "\x00\x80", "\xed\xb8\x83\x20" },
-#endif
+      {	GCRY_MD_CRC32_RFC1510, "\x80\x00", "\x3b\x83\x98\x4b", 2 },
+      {	GCRY_MD_CRC32_RFC1510, "\x00\x08", "\x0e\xdb\x88\x32", 2 },
+      {	GCRY_MD_CRC32_RFC1510, "\x00\x80", "\xed\xb8\x83\x20", 2 },
       {	GCRY_MD_CRC32_RFC1510, "\x80", "\xed\xb8\x83\x20" },
-#if 0
-      {	GCRY_MD_CRC32_RFC1510, "\x80\x00\x00\x00", "\xed\x59\xb6\x3b" },
-      {	GCRY_MD_CRC32_RFC1510, "\x00\x00\x00\x01", "\x77\x07\x30\x96" },
-#endif
+      {	GCRY_MD_CRC32_RFC1510, "\x80\x00\x00\x00", "\xed\x59\xb6\x3b", 4 },
+      {	GCRY_MD_CRC32_RFC1510, "\x00\x00\x00\x01", "\x77\x07\x30\x96", 4 },
       { GCRY_MD_CRC32_RFC1510, "123456789", "\x2d\xfd\x2d\x88" },
       {	GCRY_MD_CRC24_RFC2440, "", "\xb7\x04\xce" },
       {	GCRY_MD_CRC24_RFC2440, "foo", "\x4f\xc2\x55" },

-----------------------------------------------------------------------

Summary of changes:
 cipher/Makefile.am         |   2 +-
 cipher/keccak-armv7-neon.S | 945 +++++++++++++++++++++++++++++++++++++++++++++
 cipher/keccak.c            | 220 ++++++++---
 cipher/keccak_permute_64.h | 101 ++++-
 cipher/tiger.c             | 104 ++---
 configure.ac               |   2 +-
 tests/basic.c              |  14 +-
 7 files changed, 1248 insertions(+), 140 deletions(-)
 create mode 100644 cipher/keccak-armv7-neon.S


hooks/post-receive
-- 
The GNU crypto library
http://git.gnupg.org


_______________________________________________
Gnupg-commits mailing list
Gnupg-commits at gnupg.org
http://lists.gnupg.org/mailman/listinfo/gnupg-commits


More information about the Gcrypt-devel mailing list