[git] GCRYPT - branch, master, updated. libgcrypt-1.6.0-280-g89fa74d
by Jussi Kivilinna
cvs at cvs.gnupg.org
Thu Nov 5 18:11:43 CET 2015
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "The GNU crypto library".
The branch, master has been updated
via 89fa74d6b3e58cd4fcd6e0939a35e46cbaca2ea0 (commit)
via a1cc7bb15473a2419b24ecac765ae0ce5989a13b (commit)
via 2857cb89c6dc1c02266600bc1fd2967a3cd5cf88 (commit)
via 07e4839e75a7bca3a6c0a94aecfe75efe61d7ff2 (commit)
from c0b9eee2d93a13930244f9ce0c14ed6b4aeb6c29 (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
- Log -----------------------------------------------------------------
commit 89fa74d6b3e58cd4fcd6e0939a35e46cbaca2ea0
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date: Sun Nov 1 20:44:09 2015 +0200
Improve performance of Tiger hash algorithms
* cipher/tiger.c (tiger_round, pass, key_schedule): Convert functions
to macros.
(transform_blk): Pass variable names instead of pointers to 'pass'.
--
Benchmark results on Intel Haswell @ 3.2 Ghz:
Before:
| nanosecs/byte mebibytes/sec cycles/byte
TIGER | 3.25 ns/B 293.5 MiB/s 10.40 c/B
After (1.75x faster):
| nanosecs/byte mebibytes/sec cycles/byte
TIGER | 1.85 ns/B 515.3 MiB/s 5.92 c/B
Benchmark results on Cortex-A8 @ 1008 Mhz:
Before:
| nanosecs/byte mebibytes/sec cycles/byte
TIGER | 63.42 ns/B 15.04 MiB/s 63.93 c/B
After (1.26x faster):
| nanosecs/byte mebibytes/sec cycles/byte
TIGER | 49.99 ns/B 19.08 MiB/s 50.39 c/B
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
diff --git a/cipher/tiger.c b/cipher/tiger.c
index 078133a..516bd44 100644
--- a/cipher/tiger.c
+++ b/cipher/tiger.c
@@ -633,68 +633,44 @@ tiger2_init (void *context, unsigned int flags)
do_init (context, 2);
}
-static void
-tiger_round( u64 *ra, u64 *rb, u64 *rc, u64 x, int mul )
-{
- u64 a = *ra;
- u64 b = *rb;
- u64 c = *rc;
-
- c ^= x;
- a -= ( sbox1[ c & 0xff ] ^ sbox2[ (c >> 16) & 0xff ]
- ^ sbox3[ (c >> 32) & 0xff ] ^ sbox4[ (c >> 48) & 0xff ]);
- b += ( sbox4[ (c >> 8) & 0xff ] ^ sbox3[ (c >> 24) & 0xff ]
- ^ sbox2[ (c >> 40) & 0xff ] ^ sbox1[ (c >> 56) & 0xff ]);
- b *= mul;
-
- *ra = a;
- *rb = b;
- *rc = c;
-}
-
-
-static void
-pass( u64 *ra, u64 *rb, u64 *rc, u64 *x, int mul )
-{
- u64 a = *ra;
- u64 b = *rb;
- u64 c = *rc;
-
- tiger_round( &a, &b, &c, x[0], mul );
- tiger_round( &b, &c, &a, x[1], mul );
- tiger_round( &c, &a, &b, x[2], mul );
- tiger_round( &a, &b, &c, x[3], mul );
- tiger_round( &b, &c, &a, x[4], mul );
- tiger_round( &c, &a, &b, x[5], mul );
- tiger_round( &a, &b, &c, x[6], mul );
- tiger_round( &b, &c, &a, x[7], mul );
-
- *ra = a;
- *rb = b;
- *rc = c;
-}
-
-static void
-key_schedule( u64 *x )
-{
- x[0] -= x[7] ^ 0xa5a5a5a5a5a5a5a5LL;
- x[1] ^= x[0];
- x[2] += x[1];
- x[3] -= x[2] ^ ((~x[1]) << 19 );
- x[4] ^= x[3];
- x[5] += x[4];
- x[6] -= x[5] ^ ((~x[4]) >> 23 );
- x[7] ^= x[6];
- x[0] += x[7];
- x[1] -= x[0] ^ ((~x[7]) << 19 );
- x[2] ^= x[1];
- x[3] += x[2];
- x[4] -= x[3] ^ ((~x[2]) >> 23 );
- x[5] ^= x[4];
- x[6] += x[5];
- x[7] -= x[6] ^ 0x0123456789abcdefLL;
-}
+#define tiger_round(xa, xb, xc, xx, xmul) { \
+ xc ^= xx; \
+ xa -= ( sbox1[ (xc) & 0xff ] ^ sbox2[ ((xc) >> 16) & 0xff ] \
+ ^ sbox3[ ((xc) >> 32) & 0xff ] ^ sbox4[ ((xc) >> 48) & 0xff ]); \
+ xb += ( sbox4[ ((xc) >> 8) & 0xff ] ^ sbox3[ ((xc) >> 24) & 0xff ] \
+ ^ sbox2[ ((xc) >> 40) & 0xff ] ^ sbox1[ ((xc) >> 56) & 0xff ]); \
+ xb *= xmul; }
+
+
+#define pass(ya, yb, yc, yx, ymul) { \
+ tiger_round( ya, yb, yc, yx[0], ymul ); \
+ tiger_round( yb, yc, ya, yx[1], ymul ); \
+ tiger_round( yc, ya, yb, yx[2], ymul ); \
+ tiger_round( ya, yb, yc, yx[3], ymul ); \
+ tiger_round( yb, yc, ya, yx[4], ymul ); \
+ tiger_round( yc, ya, yb, yx[5], ymul ); \
+ tiger_round( ya, yb, yc, yx[6], ymul ); \
+ tiger_round( yb, yc, ya, yx[7], ymul ); }
+
+
+#define key_schedule(x) { \
+ x[0] -= x[7] ^ 0xa5a5a5a5a5a5a5a5LL; \
+ x[1] ^= x[0]; \
+ x[2] += x[1]; \
+ x[3] -= x[2] ^ ((~x[1]) << 19 ); \
+ x[4] ^= x[3]; \
+ x[5] += x[4]; \
+ x[6] -= x[5] ^ ((~x[4]) >> 23 ); \
+ x[7] ^= x[6]; \
+ x[0] += x[7]; \
+ x[1] -= x[0] ^ ((~x[7]) << 19 ); \
+ x[2] ^= x[1]; \
+ x[3] += x[2]; \
+ x[4] -= x[3] ^ ((~x[2]) >> 23 ); \
+ x[5] ^= x[4]; \
+ x[6] += x[5]; \
+ x[7] -= x[6] ^ 0x0123456789abcdefLL; }
/****************
@@ -716,11 +692,11 @@ transform_blk ( void *ctx, const unsigned char *data )
b = bb = hd->b;
c = cc = hd->c;
- pass( &a, &b, &c, x, 5);
+ pass( a, b, c, x, 5);
key_schedule( x );
- pass( &c, &a, &b, x, 7);
+ pass( c, a, b, x, 7);
key_schedule( x );
- pass( &b, &c, &a, x, 9);
+ pass( b, c, a, x, 9);
/* feedforward */
a ^= aa;
commit a1cc7bb15473a2419b24ecac765ae0ce5989a13b
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date: Sun Nov 1 16:06:26 2015 +0200
Add ARMv7/NEON implementation of Keccak
* cipher/Makefile.am: Add 'keccak-armv7-neon.S'.
* cipher/keccak-armv7-neon.S: New.
* cipher/keccak.c (USE_64BIT_ARM_NEON): New.
(NEED_COMMON64): Select if USE_64BIT_ARM_NEON.
[NEED_COMMON64] (round_consts_64bit): Rename to...
[NEED_COMMON64] (_gcry_keccak_round_consts_64bit): ...this; Add
terminator at end.
[USE_64BIT_ARM_NEON] (_gcry_keccak_permute_armv7_neon)
(_gcry_keccak_absorb_lanes64_armv7_neon, keccak_permute64_armv7_neon)
(keccak_absorb_lanes64_armv7_neon, keccak_armv7_neon_64_ops): New.
(keccak_init) [USE_64BIT_ARM_NEON]: Select ARM/NEON implementation
if supported by HW.
* cipher/keccak_permute_64.h (KECCAK_F1600_PERMUTE_FUNC_NAME): Update
to use new round constant table.
* configure.ac: Add 'keccak-armv7-neon.lo'.
--
Patch adds ARMv7/NEON implementation of Keccak (SHAKE/SHA3). Patch
is based on public-domain implementation by Ronny Van Keer from
SUPERCOP package:
https://github.com/floodyberry/supercop/blob/master/crypto_hash/\
keccakc1024/inplace-armv7a-neon/keccak2.s
Benchmark results on Cortex-A8 @ 1008 Mhz:
Before (generic 32-bit bit-interleaved impl.):
| nanosecs/byte mebibytes/sec cycles/byte
SHAKE128 | 83.00 ns/B 11.49 MiB/s 83.67 c/B
SHAKE256 | 101.7 ns/B 9.38 MiB/s 102.5 c/B
SHA3-224 | 96.13 ns/B 9.92 MiB/s 96.90 c/B
SHA3-256 | 101.5 ns/B 9.40 MiB/s 102.3 c/B
SHA3-384 | 131.4 ns/B 7.26 MiB/s 132.5 c/B
SHA3-512 | 189.1 ns/B 5.04 MiB/s 190.6 c/B
After (ARM/NEON, ~3.2x faster):
| nanosecs/byte mebibytes/sec cycles/byte
SHAKE128 | 25.09 ns/B 38.01 MiB/s 25.29 c/B
SHAKE256 | 30.95 ns/B 30.82 MiB/s 31.19 c/B
SHA3-224 | 29.24 ns/B 32.61 MiB/s 29.48 c/B
SHA3-256 | 30.95 ns/B 30.82 MiB/s 31.19 c/B
SHA3-384 | 40.42 ns/B 23.59 MiB/s 40.74 c/B
SHA3-512 | 58.37 ns/B 16.34 MiB/s 58.84 c/B
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index be03d06..88c8fbf 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -90,7 +90,7 @@ sha1.c sha1-ssse3-amd64.S sha1-avx-amd64.S sha1-avx-bmi2-amd64.S \
sha256.c sha256-ssse3-amd64.S sha256-avx-amd64.S sha256-avx2-bmi2-amd64.S \
sha512.c sha512-ssse3-amd64.S sha512-avx-amd64.S sha512-avx2-bmi2-amd64.S \
sha512-armv7-neon.S \
-keccak.c keccak_permute_32.h keccak_permute_64.h \
+keccak.c keccak_permute_32.h keccak_permute_64.h keccak-armv7-neon.S \
stribog.c \
tiger.c \
whirlpool.c whirlpool-sse2-amd64.S \
diff --git a/cipher/keccak-armv7-neon.S b/cipher/keccak-armv7-neon.S
new file mode 100644
index 0000000..0bec8d5
--- /dev/null
+++ b/cipher/keccak-armv7-neon.S
@@ -0,0 +1,945 @@
+/* keccak-armv7-neon.S - ARMv7/NEON implementation of Keccak
+ *
+ * Copyright (C) 2015 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) && \
+ defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \
+ defined(HAVE_GCC_INLINE_ASM_NEON)
+
+/* Based on public-domain/CC0 implementation from SUPERCOP package
+ * (keccakc1024/inplace-armv7a-neon/keccak2.s)
+ *
+ * Original copyright header follows:
+ */
+
+@ The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
+@ Michaël Peeters and Gilles Van Assche. For more information, feedback or
+@ questions, please refer to our website: http://keccak.noekeon.org/
+@
+@ Implementation by Ronny Van Keer, hereby denoted as "the implementer".
+@
+@ To the extent possible under law, the implementer has waived all copyright
+@ and related or neighboring rights to the source code in this file.
+@ http://creativecommons.org/publicdomain/zero/1.0/
+
+.text
+
+.syntax unified
+.fpu neon
+.arm
+
+
+.extern _gcry_keccak_round_consts_64bit;
+
+#ifdef __PIC__
+# define GET_DATA_POINTER(reg, name, rtmp) \
+ ldr reg, 1f; \
+ ldr rtmp, 2f; \
+ b 3f; \
+ 1: .word _GLOBAL_OFFSET_TABLE_-(3f+8); \
+ 2: .word name(GOT); \
+ 3: add reg, pc, reg; \
+ ldr reg, [reg, rtmp];
+#else
+# define GET_DATA_POINTER(reg, name, rtmp) ldr reg, =name
+#endif
+
+
+@// --- offsets in state
+.equ Aba, 0*8
+.equ Aga, 1*8
+.equ Aka, 2*8
+.equ Ama, 3*8
+.equ Asa, 4*8
+
+@// --- macros
+
+.macro KeccakThetaRhoPiChiIota argA1, argA2, argA3, argA4, argA5
+
+ @Prepare Theta
+ @Ca = Aba^Aga^Aka^Ama^Asa@
+ @Ce = Abe^Age^Ake^Ame^Ase@
+ @Ci = Abi^Agi^Aki^Ami^Asi@
+ @Co = Abo^Ago^Ako^Amo^Aso@
+ @Cu = Abu^Agu^Aku^Amu^Asu@
+ @De = Ca^ROL64(Ci, 1)@
+ @Di = Ce^ROL64(Co, 1)@
+ @Do = Ci^ROL64(Cu, 1)@
+ @Du = Co^ROL64(Ca, 1)@
+ @Da = Cu^ROL64(Ce, 1)@
+
+ veor.64 q4, q6, q7
+ veor.64 q5, q9, q10
+ veor.64 d8, d8, d9
+ veor.64 d10, d10, d11
+ veor.64 d1, d8, d16
+ veor.64 d2, d10, d17
+
+ veor.64 q4, q11, q12
+ veor.64 q5, q14, q15
+ veor.64 d8, d8, d9
+ veor.64 d10, d10, d11
+ veor.64 d3, d8, d26
+
+ vadd.u64 q4, q1, q1
+ veor.64 d4, d10, d27
+ vmov.64 d0, d5
+ vsri.64 q4, q1, #63
+
+ vadd.u64 q5, q2, q2
+ veor.64 q4, q4, q0
+ vsri.64 q5, q2, #63
+ vadd.u64 d7, d1, d1
+ veor.64 \argA2, \argA2, d8
+ veor.64 q5, q5, q1
+
+ vsri.64 d7, d1, #63
+ vshl.u64 d1, \argA2, #44
+ veor.64 \argA3, \argA3, d9
+ veor.64 d7, d7, d4
+
+ @Ba = argA1^Da@
+ @Be = ROL64((argA2^De), 44)@
+ @Bi = ROL64((argA3^Di), 43)@
+ @Bo = ROL64((argA4^Do), 21)@
+ @Bu = ROL64((argA5^Du), 14)@
+ @argA2 = Be ^((~Bi)& Bo )@
+ @argA3 = Bi ^((~Bo)& Bu )@
+ @argA4 = Bo ^((~Bu)& Ba )@
+ @argA5 = Bu ^((~Ba)& Be )@
+ @argA1 = Ba ^((~Be)& Bi )@ argA1 ^= KeccakF1600RoundConstants[i+round]@
+ vsri.64 d1, \argA2, #64-44
+ vshl.u64 d2, \argA3, #43
+ vldr.64 d0, [sp, #\argA1]
+ veor.64 \argA4, \argA4, d10
+ vsri.64 d2, \argA3, #64-43
+ vshl.u64 d3, \argA4, #21
+ veor.64 \argA5, \argA5, d11
+ veor.64 d0, d0, d7
+ vsri.64 d3, \argA4, #64-21
+ vbic.64 d5, d2, d1
+ vshl.u64 d4, \argA5, #14
+ vbic.64 \argA2, d3, d2
+ vld1.64 d6, [ip]!
+ veor.64 d5, d0
+ vsri.64 d4, \argA5, #64-14
+ veor.64 d5, d6
+ vbic.64 \argA5, d1, d0
+ vbic.64 \argA3, d4, d3
+ vbic.64 \argA4, d0, d4
+ veor.64 \argA2, d1
+ vstr.64 d5, [sp, #\argA1]
+ veor.64 \argA3, d2
+ veor.64 \argA4, d3
+ veor.64 \argA5, d4
+
+ .endm
+
+.macro KeccakThetaRhoPiChi1 argA1, argA2, argA3, argA4, argA5
+
+ @d2 = ROL64((argA1^Da), 3)@
+ @d3 = ROL64((argA2^De), 45)@
+ @d4 = ROL64((argA3^Di), 61)@
+ @d0 = ROL64((argA4^Do), 28)@
+ @d1 = ROL64((argA5^Du), 20)@
+ @argA1 = Ba ^((~Be)& Bi )@ Ca ^= argA1@
+ @argA2 = Be ^((~Bi)& Bo )@
+ @argA3 = Bi ^((~Bo)& Bu )@
+ @argA4 = Bo ^((~Bu)& Ba )@
+ @argA5 = Bu ^((~Ba)& Be )@
+
+ veor.64 \argA2, \argA2, d8
+ veor.64 \argA3, \argA3, d9
+ vshl.u64 d3, \argA2, #45
+ vldr.64 d6, [sp, #\argA1]
+ vshl.u64 d4, \argA3, #61
+ veor.64 \argA4, \argA4, d10
+ vsri.64 d3, \argA2, #64-45
+ veor.64 \argA5, \argA5, d11
+ vsri.64 d4, \argA3, #64-61
+ vshl.u64 d0, \argA4, #28
+ veor.64 d6, d6, d7
+ vshl.u64 d1, \argA5, #20
+ vbic.64 \argA3, d4, d3
+ vsri.64 d0, \argA4, #64-28
+ vbic.64 \argA4, d0, d4
+ vshl.u64 d2, d6, #3
+ vsri.64 d1, \argA5, #64-20
+ veor.64 \argA4, d3
+ vsri.64 d2, d6, #64-3
+ vbic.64 \argA5, d1, d0
+ vbic.64 d6, d2, d1
+ vbic.64 \argA2, d3, d2
+ veor.64 d6, d0
+ veor.64 \argA2, d1
+ vstr.64 d6, [sp, #\argA1]
+ veor.64 \argA3, d2
+ veor.64 d5, d6
+ veor.64 \argA5, d4
+
+ .endm
+
+.macro KeccakThetaRhoPiChi2 argA1, argA2, argA3, argA4, argA5
+
+ @d4 = ROL64((argA1^Da), 18)@
+ @d0 = ROL64((argA2^De), 1)@
+ @d1 = ROL64((argA3^Di), 6)@
+ @d2 = ROL64((argA4^Do), 25)@
+ @d3 = ROL64((argA5^Du), 8)@
+ @argA1 = Ba ^((~Be)& Bi )@ Ca ^= argA1@
+ @argA2 = Be ^((~Bi)& Bo )@
+ @argA3 = Bi ^((~Bo)& Bu )@
+ @argA4 = Bo ^((~Bu)& Ba )@
+ @argA5 = Bu ^((~Ba)& Be )@
+
+ veor.64 \argA3, \argA3, d9
+ veor.64 \argA4, \argA4, d10
+ vshl.u64 d1, \argA3, #6
+ vldr.64 d6, [sp, #\argA1]
+ vshl.u64 d2, \argA4, #25
+ veor.64 \argA5, \argA5, d11
+ vsri.64 d1, \argA3, #64-6
+ veor.64 \argA2, \argA2, d8
+ vsri.64 d2, \argA4, #64-25
+ vext.8 d3, \argA5, \argA5, #7
+ veor.64 d6, d6, d7
+ vbic.64 \argA3, d2, d1
+ vadd.u64 d0, \argA2, \argA2
+ vbic.64 \argA4, d3, d2
+ vsri.64 d0, \argA2, #64-1
+ vshl.u64 d4, d6, #18
+ veor.64 \argA2, d1, \argA4
+ veor.64 \argA3, d0
+ vsri.64 d4, d6, #64-18
+ vstr.64 \argA3, [sp, #\argA1]
+ veor.64 d5, \argA3
+ vbic.64 \argA5, d1, d0
+ vbic.64 \argA3, d4, d3
+ vbic.64 \argA4, d0, d4
+ veor.64 \argA3, d2
+ veor.64 \argA4, d3
+ veor.64 \argA5, d4
+
+ .endm
+
+.macro KeccakThetaRhoPiChi3 argA1, argA2, argA3, argA4, argA5
+
+ @d1 = ROL64((argA1^Da), 36)@
+ @d2 = ROL64((argA2^De), 10)@
+ @d3 = ROL64((argA3^Di), 15)@
+ @d4 = ROL64((argA4^Do), 56)@
+ @d0 = ROL64((argA5^Du), 27)@
+ @argA1 = Ba ^((~Be)& Bi )@ Ca ^= argA1@
+ @argA2 = Be ^((~Bi)& Bo )@
+ @argA3 = Bi ^((~Bo)& Bu )@
+ @argA4 = Bo ^((~Bu)& Ba )@
+ @argA5 = Bu ^((~Ba)& Be )@
+
+ veor.64 \argA2, \argA2, d8
+ veor.64 \argA3, \argA3, d9
+ vshl.u64 d2, \argA2, #10
+ vldr.64 d6, [sp, #\argA1]
+ vshl.u64 d3, \argA3, #15
+ veor.64 \argA4, \argA4, d10
+ vsri.64 d2, \argA2, #64-10
+ vsri.64 d3, \argA3, #64-15
+ veor.64 \argA5, \argA5, d11
+ vext.8 d4, \argA4, \argA4, #1
+ vbic.64 \argA2, d3, d2
+ vshl.u64 d0, \argA5, #27
+ veor.64 d6, d6, d7
+ vbic.64 \argA3, d4, d3
+ vsri.64 d0, \argA5, #64-27
+ vshl.u64 d1, d6, #36
+ veor.64 \argA3, d2
+ vbic.64 \argA4, d0, d4
+ vsri.64 d1, d6, #64-36
+
+ veor.64 \argA4, d3
+ vbic.64 d6, d2, d1
+ vbic.64 \argA5, d1, d0
+ veor.64 d6, d0
+ veor.64 \argA2, d1
+ vstr.64 d6, [sp, #\argA1]
+ veor.64 d5, d6
+ veor.64 \argA5, d4
+
+ .endm
+
+.macro KeccakThetaRhoPiChi4 argA1, argA2, argA3, argA4, argA5
+
+ @d3 = ROL64((argA1^Da), 41)@
+ @d4 = ROL64((argA2^De), 2)@
+ @d0 = ROL64((argA3^Di), 62)@
+ @d1 = ROL64((argA4^Do), 55)@
+ @d2 = ROL64((argA5^Du), 39)@
+ @argA1 = Ba ^((~Be)& Bi )@ Ca ^= argA1@
+ @argA2 = Be ^((~Bi)& Bo )@
+ @argA3 = Bi ^((~Bo)& Bu )@
+ @argA4 = Bo ^((~Bu)& Ba )@
+ @argA5 = Bu ^((~Ba)& Be )@
+
+ veor.64 \argA2, \argA2, d8
+ veor.64 \argA3, \argA3, d9
+ vshl.u64 d4, \argA2, #2
+ veor.64 \argA5, \argA5, d11
+ vshl.u64 d0, \argA3, #62
+ vldr.64 d6, [sp, #\argA1]
+ vsri.64 d4, \argA2, #64-2
+ veor.64 \argA4, \argA4, d10
+ vsri.64 d0, \argA3, #64-62
+
+ vshl.u64 d1, \argA4, #55
+ veor.64 d6, d6, d7
+ vshl.u64 d2, \argA5, #39
+ vsri.64 d1, \argA4, #64-55
+ vbic.64 \argA4, d0, d4
+ vsri.64 d2, \argA5, #64-39
+ vbic.64 \argA2, d1, d0
+ vshl.u64 d3, d6, #41
+ veor.64 \argA5, d4, \argA2
+ vbic.64 \argA2, d2, d1
+ vsri.64 d3, d6, #64-41
+ veor.64 d6, d0, \argA2
+
+ vbic.64 \argA2, d3, d2
+ vbic.64 \argA3, d4, d3
+ veor.64 \argA2, d1
+ vstr.64 d6, [sp, #\argA1]
+ veor.64 d5, d6
+ veor.64 \argA3, d2
+ veor.64 \argA4, d3
+
+ .endm
+
+
+@// --- code
+
+ at not callable from C!
+.p2align 3
+.type KeccakF_armv7a_neon_asm,%function;
+KeccakF_armv7a_neon_asm: @
+
+.LroundLoop:
+
+ KeccakThetaRhoPiChiIota Aba, d13, d19, d25, d31
+ KeccakThetaRhoPiChi1 Aka, d15, d21, d22, d28
+ KeccakThetaRhoPiChi2 Asa, d12, d18, d24, d30
+ KeccakThetaRhoPiChi3 Aga, d14, d20, d26, d27
+ KeccakThetaRhoPiChi4 Ama, d16, d17, d23, d29
+
+ KeccakThetaRhoPiChiIota Aba, d15, d18, d26, d29
+ KeccakThetaRhoPiChi1 Asa, d14, d17, d25, d28
+ KeccakThetaRhoPiChi2 Ama, d13, d21, d24, d27
+ KeccakThetaRhoPiChi3 Aka, d12, d20, d23, d31
+ KeccakThetaRhoPiChi4 Aga, d16, d19, d22, d30
+
+ KeccakThetaRhoPiChiIota Aba, d14, d21, d23, d30
+ KeccakThetaRhoPiChi1 Ama, d12, d19, d26, d28
+ KeccakThetaRhoPiChi2 Aga, d15, d17, d24, d31
+ KeccakThetaRhoPiChi3 Asa, d13, d20, d22, d29
+ KeccakThetaRhoPiChi4 Aka, d16, d18, d25, d27
+
+ KeccakThetaRhoPiChiIota Aba, d12, d17, d22, d27
+ KeccakThetaRhoPiChi1 Aga, d13, d18, d23, d28
+ KeccakThetaRhoPiChi2 Aka, d14, d19, d24, d29
+ ldr r0, [ip]
+ KeccakThetaRhoPiChi3 Ama, d15, d20, d25, d30
+ cmp r0, #0xFFFFFFFF
+ KeccakThetaRhoPiChi4 Asa, d16, d21, d26, d31
+
+ bne .LroundLoop
+ sub ip, #(8*24)
+ bx lr
+.p2align 2
+.ltorg
+.size KeccakF_armv7a_neon_asm,.-KeccakF_armv7a_neon_asm;
+
+
+@//unsigned _gcry_keccak_permute_armv7_neon(u64 *state) callable from C
+.p2align 3
+.global _gcry_keccak_permute_armv7_neon
+.type _gcry_keccak_permute_armv7_neon,%function;
+_gcry_keccak_permute_armv7_neon:
+
+ push {ip, lr}
+ vpush {q4-q7}
+ sub sp,sp, #5*8
+
+ vldr.64 d0, [r0, #0*8]
+ vldr.64 d12, [r0, #1*8]
+ vldr.64 d17, [r0, #2*8]
+ vldr.64 d22, [r0, #3*8]
+ vldr.64 d27, [r0, #4*8]
+
+ GET_DATA_POINTER(ip, _gcry_keccak_round_consts_64bit, lr);
+
+ vldr.64 d1, [r0, #5*8]
+ vldr.64 d13, [r0, #6*8]
+ vldr.64 d18, [r0, #7*8]
+ vldr.64 d23, [r0, #8*8]
+ vldr.64 d28, [r0, #9*8]
+
+ vldr.64 d2, [r0, #10*8]
+ vldr.64 d14, [r0, #11*8]
+ vldr.64 d19, [r0, #12*8]
+ vldr.64 d24, [r0, #13*8]
+ vldr.64 d29, [r0, #14*8]
+
+ vldr.64 d3, [r0, #15*8]
+ vldr.64 d15, [r0, #16*8]
+ vldr.64 d20, [r0, #17*8]
+ vldr.64 d25, [r0, #18*8]
+ vldr.64 d30, [r0, #19*8]
+
+ vldr.64 d4, [r0, #20*8]
+ vldr.64 d16, [r0, #21*8]
+ vldr.64 d21, [r0, #22*8]
+ vldr.64 d26, [r0, #23*8]
+ vldr.64 d31, [r0, #24*8]
+
+ vstr.64 d0, [sp, #Aba]
+ vstr.64 d1, [sp, #Aga]
+ veor.64 q0, q0, q1
+ vstr.64 d2, [sp, #Aka]
+ veor.64 d5, d0, d1
+ vstr.64 d3, [sp, #Ama]
+ mov r1, r0
+ vstr.64 d4, [sp, #Asa]
+ veor.64 d5, d5, d4
+
+ bl KeccakF_armv7a_neon_asm
+
+ vpop.64 { d0- d4 }
+
+ vstr.64 d0, [r1, #0*8]
+ vstr.64 d12, [r1, #1*8]
+ vstr.64 d17, [r1, #2*8]
+ vstr.64 d22, [r1, #3*8]
+ vstr.64 d27, [r1, #4*8]
+
+ vstr.64 d1, [r1, #5*8]
+ vstr.64 d13, [r1, #6*8]
+ vstr.64 d18, [r1, #7*8]
+ vstr.64 d23, [r1, #8*8]
+ vstr.64 d28, [r1, #9*8]
+
+ vstr.64 d2, [r1, #10*8]
+ vstr.64 d14, [r1, #11*8]
+ vstr.64 d19, [r1, #12*8]
+ vstr.64 d24, [r1, #13*8]
+ vstr.64 d29, [r1, #14*8]
+
+ vstr.64 d3, [r1, #15*8]
+ vstr.64 d15, [r1, #16*8]
+ vstr.64 d20, [r1, #17*8]
+ vstr.64 d25, [r1, #18*8]
+ vstr.64 d30, [r1, #19*8]
+
+ vstr.64 d4, [r1, #20*8]
+ vstr.64 d16, [r1, #21*8]
+ vstr.64 d21, [r1, #22*8]
+ vstr.64 d26, [r1, #23*8]
+ vstr.64 d31, [r1, #24*8]
+
+ mov r0, #112
+ vpop {q4-q7}
+ pop {ip, pc}
+.p2align 2
+.ltorg
+.size _gcry_keccak_permute_armv7_neon,.-_gcry_keccak_permute_armv7_neon;
+
+@//unsigned _gcry_keccak_permute_armv7_neon(u64 *state, @r4
+@ int pos, @r1
+@ const byte *lanes, @r2
+@ unsigned int nlanes, @r3
+@ int blocklanes) @ r5 callable from C
+.p2align 3
+.global _gcry_keccak_absorb_lanes64_armv7_neon
+.type _gcry_keccak_absorb_lanes64_armv7_neon,%function;
+_gcry_keccak_absorb_lanes64_armv7_neon:
+
+ cmp r3, #0 @ nlanes == 0
+ itt eq
+ moveq r0, #0
+ bxeq lr
+
+ push {r4-r5, ip, lr}
+ beq .Lout
+ mov r4, r0
+ ldr r5, [sp, #(4*4)]
+ vpush {q4-q7}
+
+ @ load state
+ vldr.64 d0, [r4, #0*8]
+ vldr.64 d12, [r4, #1*8]
+ vldr.64 d17, [r4, #2*8]
+ vldr.64 d22, [r4, #3*8]
+ vldr.64 d27, [r4, #4*8]
+
+ GET_DATA_POINTER(ip, _gcry_keccak_round_consts_64bit, lr);
+
+ vldr.64 d1, [r4, #5*8]
+ vldr.64 d13, [r4, #6*8]
+ vldr.64 d18, [r4, #7*8]
+ vldr.64 d23, [r4, #8*8]
+ vldr.64 d28, [r4, #9*8]
+
+ vldr.64 d2, [r4, #10*8]
+ vldr.64 d14, [r4, #11*8]
+ vldr.64 d19, [r4, #12*8]
+ vldr.64 d24, [r4, #13*8]
+ vldr.64 d29, [r4, #14*8]
+
+ vldr.64 d3, [r4, #15*8]
+ vldr.64 d15, [r4, #16*8]
+ vldr.64 d20, [r4, #17*8]
+ vldr.64 d25, [r4, #18*8]
+ vldr.64 d30, [r4, #19*8]
+
+ vldr.64 d4, [r4, #20*8]
+ vldr.64 d16, [r4, #21*8]
+ vldr.64 d21, [r4, #22*8]
+ vldr.64 d26, [r4, #23*8]
+ vldr.64 d31, [r4, #24*8]
+
+.Lmain_loop:
+
+ @ detect absorb mode (full blocks vs lanes)
+
+ cmp r1, #0 @ pos != 0
+ bne .Llanes_loop
+
+.Lmain_loop_pos0:
+
+ @ full blocks mode
+
+ @ switch (blocksize)
+ cmp r5, #21
+ beq .Lfull_block_21
+ cmp r5, #18
+ beq .Lfull_block_18
+ cmp r5, #17
+ beq .Lfull_block_17
+ cmp r5, #13
+ beq .Lfull_block_13
+ cmp r5, #9
+ beq .Lfull_block_9
+
+ @ unknown blocksize
+ b .Llanes_loop
+
+.Lfull_block_21:
+
+ @ SHAKE128
+
+ cmp r3, #21 @ nlanes < blocklanes
+ blo .Llanes_loop
+
+ sub sp,sp, #5*8
+
+ vld1.64 {d5-d8}, [r2]!
+ veor d0, d5
+ vld1.64 {d9-d11}, [r2]!
+ veor d12, d6
+ veor d17, d7
+ veor d22, d8
+ vld1.64 {d5-d8}, [r2]!
+ veor d27, d9
+
+ veor d1, d10
+ veor d13, d11
+ vld1.64 {d9-d11}, [r2]!
+ veor d18, d5
+ veor d23, d6
+ veor d28, d7
+
+ veor d2, d8
+ vld1.64 {d5-d8}, [r2]!
+ veor d14, d9
+ veor d19, d10
+ veor d24, d11
+ vld1.64 {d9-d11}, [r2]!
+ veor d29, d5
+
+ veor d3, d6
+ veor d15, d7
+ veor d20, d8
+ veor d25, d9
+ veor d30, d10
+
+ veor d4, d11
+
+ vstr.64 d0, [sp, #Aba]
+ vstr.64 d1, [sp, #Aga]
+ veor.64 q0, q0, q1
+ vstr.64 d2, [sp, #Aka]
+ veor.64 d5, d0, d1
+ vstr.64 d3, [sp, #Ama]
+ vstr.64 d4, [sp, #Asa]
+ veor.64 d5, d5, d4
+
+ bl KeccakF_armv7a_neon_asm
+
+ subs r3, #21 @ nlanes -= 21
+ vpop.64 { d0-d4 }
+
+ beq .Ldone
+
+ b .Lfull_block_21
+
+.Lfull_block_18:
+
+ @ SHA3-224
+
+ cmp r3, #18 @ nlanes < blocklanes
+ blo .Llanes_loop
+
+ sub sp,sp, #5*8
+
+ vld1.64 {d5-d8}, [r2]!
+ veor d0, d5
+ vld1.64 {d9-d11}, [r2]!
+ veor d12, d6
+ veor d17, d7
+ veor d22, d8
+ vld1.64 {d5-d8}, [r2]!
+ veor d27, d9
+
+ veor d1, d10
+ veor d13, d11
+ vld1.64 {d9-d11}, [r2]!
+ veor d18, d5
+ veor d23, d6
+ veor d28, d7
+
+ veor d2, d8
+ vld1.64 {d5-d8}, [r2]!
+ veor d14, d9
+ veor d19, d10
+ veor d24, d11
+ veor d29, d5
+
+ veor d3, d6
+ veor d15, d7
+ veor d20, d8
+
+ vstr.64 d0, [sp, #Aba]
+ vstr.64 d1, [sp, #Aga]
+ veor.64 q0, q0, q1
+ vstr.64 d2, [sp, #Aka]
+ veor.64 d5, d0, d1
+ vstr.64 d3, [sp, #Ama]
+ vstr.64 d4, [sp, #Asa]
+ veor.64 d5, d5, d4
+
+ bl KeccakF_armv7a_neon_asm
+
+ subs r3, #18 @ nlanes -= 18
+ vpop.64 { d0-d4 }
+
+ beq .Ldone
+
+ b .Lfull_block_18
+
+.Lfull_block_17:
+
+ @ SHA3-256 & SHAKE256
+
+ cmp r3, #17 @ nlanes < blocklanes
+ blo .Llanes_loop
+
+ sub sp,sp, #5*8
+
+ vld1.64 {d5-d8}, [r2]!
+ veor d0, d5
+ vld1.64 {d9-d11}, [r2]!
+ veor d12, d6
+ veor d17, d7
+ veor d22, d8
+ vld1.64 {d5-d8}, [r2]!
+ veor d27, d9
+
+ veor d1, d10
+ veor d13, d11
+ vld1.64 {d9-d11}, [r2]!
+ veor d18, d5
+ veor d23, d6
+ veor d28, d7
+
+ veor d2, d8
+ vld1.64 {d5-d7}, [r2]!
+ veor d14, d9
+ veor d19, d10
+ veor d24, d11
+ veor d29, d5
+
+ veor d3, d6
+ veor d15, d7
+
+ vstr.64 d0, [sp, #Aba]
+ vstr.64 d1, [sp, #Aga]
+ veor.64 q0, q0, q1
+ vstr.64 d2, [sp, #Aka]
+ veor.64 d5, d0, d1
+ vstr.64 d3, [sp, #Ama]
+ vstr.64 d4, [sp, #Asa]
+ veor.64 d5, d5, d4
+
+ bl KeccakF_armv7a_neon_asm
+
+ subs r3, #17 @ nlanes -= 17
+ vpop.64 { d0-d4 }
+
+ beq .Ldone
+
+ b .Lfull_block_17
+
+.Lfull_block_13:
+
+ @ SHA3-384
+
+ cmp r3, #13 @ nlanes < blocklanes
+ blo .Llanes_loop
+
+ sub sp,sp, #5*8
+
+ vld1.64 {d5-d8}, [r2]!
+ veor d0, d5
+ vld1.64 {d9-d11}, [r2]!
+ veor d12, d6
+ veor d17, d7
+ veor d22, d8
+ vld1.64 {d5-d8}, [r2]!
+ veor d27, d9
+
+ veor d1, d10
+ veor d13, d11
+ vld1.64 {d9-d10}, [r2]!
+ veor d18, d5
+ veor d23, d6
+ veor d28, d7
+
+ veor d2, d8
+ veor d14, d9
+ veor d19, d10
+
+ vstr.64 d0, [sp, #Aba]
+ vstr.64 d1, [sp, #Aga]
+ veor.64 q0, q0, q1
+ vstr.64 d2, [sp, #Aka]
+ veor.64 d5, d0, d1
+ vstr.64 d3, [sp, #Ama]
+ vstr.64 d4, [sp, #Asa]
+ veor.64 d5, d5, d4
+
+ bl KeccakF_armv7a_neon_asm
+
+ subs r3, #13 @ nlanes -= 13
+ vpop.64 { d0-d4 }
+
+ beq .Ldone
+
+ b .Lfull_block_13
+
+.Lfull_block_9:
+
+ @ SHA3-512
+
+ cmp r3, #9 @ nlanes < blocklanes
+ blo .Llanes_loop
+
+ sub sp,sp, #5*8
+
+ vld1.64 {d5-d8}, [r2]!
+ veor d0, d5
+ vld1.64 {d9-d11}, [r2]!
+ veor d12, d6
+ veor d17, d7
+ veor d22, d8
+ vld1.64 {d5-d6}, [r2]!
+ veor d27, d9
+
+ veor d1, d10
+ veor d13, d11
+ veor d18, d5
+ veor d23, d6
+
+ vstr.64 d0, [sp, #Aba]
+ vstr.64 d1, [sp, #Aga]
+ veor.64 q0, q0, q1
+ vstr.64 d2, [sp, #Aka]
+ veor.64 d5, d0, d1
+ vstr.64 d3, [sp, #Ama]
+ vstr.64 d4, [sp, #Asa]
+ veor.64 d5, d5, d4
+
+ bl KeccakF_armv7a_neon_asm
+
+ subs r3, #9 @ nlanes -= 9
+ vpop.64 { d0-d4 }
+
+ beq .Ldone
+
+ b .Lfull_block_9
+
+.Llanes_loop:
+
+ @ per-lane mode
+
+ @ switch (pos)
+ ldrb r0, [pc, r1]
+ add pc, pc, r0, lsl #2
+.Lswitch_table:
+ .byte (.Llane0-.Lswitch_table-4)/4
+ .byte (.Llane1-.Lswitch_table-4)/4
+ .byte (.Llane2-.Lswitch_table-4)/4
+ .byte (.Llane3-.Lswitch_table-4)/4
+ .byte (.Llane4-.Lswitch_table-4)/4
+ .byte (.Llane5-.Lswitch_table-4)/4
+ .byte (.Llane6-.Lswitch_table-4)/4
+ .byte (.Llane7-.Lswitch_table-4)/4
+ .byte (.Llane8-.Lswitch_table-4)/4
+ .byte (.Llane9-.Lswitch_table-4)/4
+ .byte (.Llane10-.Lswitch_table-4)/4
+ .byte (.Llane11-.Lswitch_table-4)/4
+ .byte (.Llane12-.Lswitch_table-4)/4
+ .byte (.Llane13-.Lswitch_table-4)/4
+ .byte (.Llane14-.Lswitch_table-4)/4
+ .byte (.Llane15-.Lswitch_table-4)/4
+ .byte (.Llane16-.Lswitch_table-4)/4
+ .byte (.Llane17-.Lswitch_table-4)/4
+ .byte (.Llane18-.Lswitch_table-4)/4
+ .byte (.Llane19-.Lswitch_table-4)/4
+ .byte (.Llane20-.Lswitch_table-4)/4
+ .byte (.Llane21-.Lswitch_table-4)/4
+ .byte (.Llane22-.Lswitch_table-4)/4
+ .byte (.Llane23-.Lswitch_table-4)/4
+ .byte (.Llane24-.Lswitch_table-4)/4
+.p2align 2
+
+#define ABSORB_LANE(label, vreg) \
+ label: \
+ add r1, #1; \
+ vld1.64 d5, [r2]!; \
+ cmp r1, r5; /* pos == blocklanes */ \
+ veor vreg, vreg, d5; \
+ beq .Llanes_permute; \
+ subs r3, #1; \
+ beq .Ldone;
+
+ ABSORB_LANE(.Llane0, d0)
+ ABSORB_LANE(.Llane1, d12)
+ ABSORB_LANE(.Llane2, d17)
+ ABSORB_LANE(.Llane3, d22)
+ ABSORB_LANE(.Llane4, d27)
+
+ ABSORB_LANE(.Llane5, d1)
+ ABSORB_LANE(.Llane6, d13)
+ ABSORB_LANE(.Llane7, d18)
+ ABSORB_LANE(.Llane8, d23)
+ ABSORB_LANE(.Llane9, d28)
+
+ ABSORB_LANE(.Llane10, d2)
+ ABSORB_LANE(.Llane11, d14)
+ ABSORB_LANE(.Llane12, d19)
+ ABSORB_LANE(.Llane13, d24)
+ ABSORB_LANE(.Llane14, d29)
+
+ ABSORB_LANE(.Llane15, d3)
+ ABSORB_LANE(.Llane16, d15)
+ ABSORB_LANE(.Llane17, d20)
+ ABSORB_LANE(.Llane18, d25)
+ ABSORB_LANE(.Llane19, d30)
+
+ ABSORB_LANE(.Llane20, d4)
+ ABSORB_LANE(.Llane21, d16)
+ ABSORB_LANE(.Llane22, d21)
+ ABSORB_LANE(.Llane23, d26)
+ ABSORB_LANE(.Llane24, d31)
+
+ b .Llanes_loop
+
+.Llanes_permute:
+
+ sub sp,sp, #5*8
+ vstr.64 d0, [sp, #Aba]
+ vstr.64 d1, [sp, #Aga]
+ veor.64 q0, q0, q1
+ vstr.64 d2, [sp, #Aka]
+ veor.64 d5, d0, d1
+ vstr.64 d3, [sp, #Ama]
+ vstr.64 d4, [sp, #Asa]
+ veor.64 d5, d5, d4
+
+ bl KeccakF_armv7a_neon_asm
+
+ mov r1, #0 @ pos <= 0
+ subs r3, #1
+
+ vpop.64 { d0-d4 }
+
+ beq .Ldone
+
+ b .Lmain_loop_pos0
+
+.Ldone:
+
+ @ save state
+ vstr.64 d0, [r4, #0*8]
+ vstr.64 d12, [r4, #1*8]
+ vstr.64 d17, [r4, #2*8]
+ vstr.64 d22, [r4, #3*8]
+ vstr.64 d27, [r4, #4*8]
+
+ vstr.64 d1, [r4, #5*8]
+ vstr.64 d13, [r4, #6*8]
+ vstr.64 d18, [r4, #7*8]
+ vstr.64 d23, [r4, #8*8]
+ vstr.64 d28, [r4, #9*8]
+
+ vstr.64 d2, [r4, #10*8]
+ vstr.64 d14, [r4, #11*8]
+ vstr.64 d19, [r4, #12*8]
+ vstr.64 d24, [r4, #13*8]
+ vstr.64 d29, [r4, #14*8]
+
+ vstr.64 d3, [r4, #15*8]
+ vstr.64 d15, [r4, #16*8]
+ vstr.64 d20, [r4, #17*8]
+ vstr.64 d25, [r4, #18*8]
+ vstr.64 d30, [r4, #19*8]
+
+ vstr.64 d4, [r4, #20*8]
+ vstr.64 d16, [r4, #21*8]
+ vstr.64 d21, [r4, #22*8]
+ vstr.64 d26, [r4, #23*8]
+ vstr.64 d31, [r4, #24*8]
+
+ mov r0, #120
+ vpop {q4-q7}
+.Lout:
+ pop {r4-r5, ip, pc}
+.p2align 2
+.ltorg
+.size _gcry_keccak_absorb_lanes64_armv7_neon,.-_gcry_keccak_absorb_lanes64_armv7_neon;
+
+#endif
diff --git a/cipher/keccak.c b/cipher/keccak.c
index ce57860..0bb3155 100644
--- a/cipher/keccak.c
+++ b/cipher/keccak.c
@@ -59,7 +59,19 @@
#endif
-#ifdef USE_64BIT
+/* USE_64BIT_ARM_NEON indicates whether to enable 64-bit ARM/NEON assembly
+ * code. */
+#undef USE_64BIT_ARM_NEON
+#ifdef ENABLE_NEON_SUPPORT
+# if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \
+ && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \
+ && defined(HAVE_GCC_INLINE_ASM_NEON)
+# define USE_64BIT_ARM_NEON 1
+# endif
+#endif /*ENABLE_NEON_SUPPORT*/
+
+
+#if defined(USE_64BIT) || defined(USE_64BIT_ARM_NEON)
# define NEED_COMMON64 1
#endif
@@ -109,7 +121,7 @@ typedef struct KECCAK_CONTEXT_S
#ifdef NEED_COMMON64
-static const u64 round_consts_64bit[24] =
+const u64 _gcry_keccak_round_consts_64bit[24 + 1] =
{
U64_C(0x0000000000000001), U64_C(0x0000000000008082),
U64_C(0x800000000000808A), U64_C(0x8000000080008000),
@@ -122,7 +134,8 @@ static const u64 round_consts_64bit[24] =
U64_C(0x8000000000008002), U64_C(0x8000000000000080),
U64_C(0x000000000000800A), U64_C(0x800000008000000A),
U64_C(0x8000000080008081), U64_C(0x8000000000008080),
- U64_C(0x0000000080000001), U64_C(0x8000000080008008)
+ U64_C(0x0000000080000001), U64_C(0x8000000080008008),
+ U64_C(0xFFFFFFFFFFFFFFFF)
};
static unsigned int
@@ -400,6 +413,54 @@ static const keccak_ops_t keccak_bmi2_64_ops =
#endif /* USE_64BIT_BMI2 */
+/* 64-bit ARMv7/NEON implementation. */
+#ifdef USE_64BIT_ARM_NEON
+
+unsigned int _gcry_keccak_permute_armv7_neon(u64 *state);
+unsigned int _gcry_keccak_absorb_lanes64_armv7_neon(u64 *state, int pos,
+ const byte *lanes,
+ unsigned int nlanes,
+ int blocklanes);
+
+static unsigned int keccak_permute64_armv7_neon(KECCAK_STATE *hd)
+{
+ return _gcry_keccak_permute_armv7_neon(hd->u.state64);
+}
+
+static unsigned int
+keccak_absorb_lanes64_armv7_neon(KECCAK_STATE *hd, int pos, const byte *lanes,
+ unsigned int nlanes, int blocklanes)
+{
+ if (blocklanes < 0)
+ {
+ /* blocklanes == -1, permutationless absorb from keccak_final. */
+
+ while (nlanes)
+ {
+ hd->u.state64[pos] ^= buf_get_le64(lanes);
+ lanes += 8;
+ nlanes--;
+ }
+
+ return 0;
+ }
+ else
+ {
+ return _gcry_keccak_absorb_lanes64_armv7_neon(hd->u.state64, pos, lanes,
+ nlanes, blocklanes);
+ }
+}
+
+static const keccak_ops_t keccak_armv7_neon_64_ops =
+{
+ .permute = keccak_permute64_armv7_neon,
+ .absorb = keccak_absorb_lanes64_armv7_neon,
+ .extract = keccak_extract64,
+};
+
+#endif /* USE_64BIT_ARM_NEON */
+
+
/* Construct generic 32-bit implementation. */
#ifdef USE_32BIT
@@ -662,6 +723,10 @@ keccak_init (int algo, void *context, unsigned int flags)
/* Select optimized implementation based in hw features. */
if (0) {}
+#ifdef USE_64BIT_ARM_NEON
+ else if (features & HWF_ARM_NEON)
+ ctx->ops = &keccak_armv7_neon_64_ops;
+#endif
#ifdef USE_64BIT_BMI2
else if (features & HWF_INTEL_BMI2)
ctx->ops = &keccak_bmi2_64_ops;
diff --git a/cipher/keccak_permute_64.h b/cipher/keccak_permute_64.h
index 6f24217..1a80192 100644
--- a/cipher/keccak_permute_64.h
+++ b/cipher/keccak_permute_64.h
@@ -25,7 +25,7 @@
static unsigned int
KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd)
{
- const u64 *round_consts = round_consts_64bit;
+ const u64 *round_consts = _gcry_keccak_round_consts_64bit;
u64 Aba, Abe, Abi, Abo, Abu;
u64 Aga, Age, Agi, Ago, Agu;
u64 Aka, Ake, Aki, Ako, Aku;
diff --git a/configure.ac b/configure.ac
index 2acfa36..ed37ab5 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2108,7 +2108,7 @@ if test "$found" = "1" ; then
if test x"$neonsupport" = xyes ; then
# Build with the NEON implementation
- :
+ GCRYPT_DIGESTS="$GCRYPT_DIGESTS keccak-armv7-neon.lo"
fi
fi
commit 2857cb89c6dc1c02266600bc1fd2967a3cd5cf88
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date: Sat Oct 31 21:29:56 2015 +0200
Optimize Keccak 64-bit absorb functions
* cipher/keccak.c [USE_64BIT] [__x86_64__] (absorb_lanes64_8)
(absorb_lanes64_4, absorb_lanes64_2, absorb_lanes64_1): New.
* cipher/keccak.c [USE_64BIT] [!__x86_64__] (absorb_lanes64_8)
(absorb_lanes64_4, absorb_lanes64_2, absorb_lanes64_1): New.
[USE_64BIT] (KECCAK_F1600_ABSORB_FUNC_NAME): New.
[USE_64BIT] (keccak_absorb_lanes64): Remove.
[USE_64BIT_SHLD] (KECCAK_F1600_ABSORB_FUNC_NAME): New.
[USE_64BIT_SHLD] (keccak_absorb_lanes64_shld): Remove.
[USE_64BIT_BMI2] (KECCAK_F1600_ABSORB_FUNC_NAME): New.
[USE_64BIT_BMI2] (keccak_absorb_lanes64_bmi2): Remove.
* cipher/keccak_permute_64.h (KECCAK_F1600_ABSORB_FUNC_NAME): New.
--
Optimize 64-bit absorb functions for small speed-up. After this
change, 64-bit BMI2 implementation matches speed of fastest results
from SUPERCOP for Intel Haswell CPUs (long messages).
Benchmark on Intel Haswell @ 3.2 Ghz:
Before:
| nanosecs/byte mebibytes/sec cycles/byte
SHAKE128 | 2.32 ns/B 411.7 MiB/s 7.41 c/B
SHAKE256 | 2.84 ns/B 336.2 MiB/s 9.08 c/B
SHA3-224 | 2.69 ns/B 354.9 MiB/s 8.60 c/B
SHA3-256 | 2.84 ns/B 336.0 MiB/s 9.08 c/B
SHA3-384 | 3.69 ns/B 258.4 MiB/s 11.81 c/B
SHA3-512 | 5.30 ns/B 179.9 MiB/s 16.97 c/B
After:
| nanosecs/byte mebibytes/sec cycles/byte
SHAKE128 | 2.27 ns/B 420.6 MiB/s 7.26 c/B
SHAKE256 | 2.79 ns/B 341.4 MiB/s 8.94 c/B
SHA3-224 | 2.64 ns/B 361.7 MiB/s 8.44 c/B
SHA3-256 | 2.79 ns/B 341.5 MiB/s 8.94 c/B
SHA3-384 | 3.65 ns/B 261.4 MiB/s 11.68 c/B
SHA3-512 | 5.27 ns/B 181.0 MiB/s 16.87 c/B
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
diff --git a/cipher/keccak.c b/cipher/keccak.c
index f4f0ef3..ce57860 100644
--- a/cipher/keccak.c
+++ b/cipher/keccak.c
@@ -223,38 +223,105 @@ keccak_absorb_lane32bi(u32 *lane, u32 x0, u32 x1)
/* Construct generic 64-bit implementation. */
#ifdef USE_64BIT
+#if __GNUC__ >= 4 && defined(__x86_64__)
+
+static inline void absorb_lanes64_8(u64 *dst, const byte *in)
+{
+ asm ("movdqu 0*16(%[dst]), %%xmm0\n\t"
+ "movdqu 0*16(%[in]), %%xmm4\n\t"
+ "movdqu 1*16(%[dst]), %%xmm1\n\t"
+ "movdqu 1*16(%[in]), %%xmm5\n\t"
+ "movdqu 2*16(%[dst]), %%xmm2\n\t"
+ "movdqu 3*16(%[dst]), %%xmm3\n\t"
+ "pxor %%xmm4, %%xmm0\n\t"
+ "pxor %%xmm5, %%xmm1\n\t"
+ "movdqu 2*16(%[in]), %%xmm4\n\t"
+ "movdqu 3*16(%[in]), %%xmm5\n\t"
+ "movdqu %%xmm0, 0*16(%[dst])\n\t"
+ "pxor %%xmm4, %%xmm2\n\t"
+ "movdqu %%xmm1, 1*16(%[dst])\n\t"
+ "pxor %%xmm5, %%xmm3\n\t"
+ "movdqu %%xmm2, 2*16(%[dst])\n\t"
+ "movdqu %%xmm3, 3*16(%[dst])\n\t"
+ :
+ : [dst] "r" (dst), [in] "r" (in)
+ : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "memory");
+}
+
+static inline void absorb_lanes64_4(u64 *dst, const byte *in)
+{
+ asm ("movdqu 0*16(%[dst]), %%xmm0\n\t"
+ "movdqu 0*16(%[in]), %%xmm4\n\t"
+ "movdqu 1*16(%[dst]), %%xmm1\n\t"
+ "movdqu 1*16(%[in]), %%xmm5\n\t"
+ "pxor %%xmm4, %%xmm0\n\t"
+ "pxor %%xmm5, %%xmm1\n\t"
+ "movdqu %%xmm0, 0*16(%[dst])\n\t"
+ "movdqu %%xmm1, 1*16(%[dst])\n\t"
+ :
+ : [dst] "r" (dst), [in] "r" (in)
+ : "xmm0", "xmm1", "xmm4", "xmm5", "memory");
+}
+
+static inline void absorb_lanes64_2(u64 *dst, const byte *in)
+{
+ asm ("movdqu 0*16(%[dst]), %%xmm0\n\t"
+ "movdqu 0*16(%[in]), %%xmm4\n\t"
+ "pxor %%xmm4, %%xmm0\n\t"
+ "movdqu %%xmm0, 0*16(%[dst])\n\t"
+ :
+ : [dst] "r" (dst), [in] "r" (in)
+ : "xmm0", "xmm4", "memory");
+}
+
+#else /* __x86_64__ */
+
+static inline void absorb_lanes64_8(u64 *dst, const byte *in)
+{
+ dst[0] ^= buf_get_le64(in + 8 * 0);
+ dst[1] ^= buf_get_le64(in + 8 * 1);
+ dst[2] ^= buf_get_le64(in + 8 * 2);
+ dst[3] ^= buf_get_le64(in + 8 * 3);
+ dst[4] ^= buf_get_le64(in + 8 * 4);
+ dst[5] ^= buf_get_le64(in + 8 * 5);
+ dst[6] ^= buf_get_le64(in + 8 * 6);
+ dst[7] ^= buf_get_le64(in + 8 * 7);
+}
+
+static inline void absorb_lanes64_4(u64 *dst, const byte *in)
+{
+ dst[0] ^= buf_get_le64(in + 8 * 0);
+ dst[1] ^= buf_get_le64(in + 8 * 1);
+ dst[2] ^= buf_get_le64(in + 8 * 2);
+ dst[3] ^= buf_get_le64(in + 8 * 3);
+}
+
+static inline void absorb_lanes64_2(u64 *dst, const byte *in)
+{
+ dst[0] ^= buf_get_le64(in + 8 * 0);
+ dst[1] ^= buf_get_le64(in + 8 * 1);
+}
+
+#endif /* !__x86_64__ */
+
+static inline void absorb_lanes64_1(u64 *dst, const byte *in)
+{
+ dst[0] ^= buf_get_le64(in + 8 * 0);
+}
+
+
# define ANDN64(x, y) (~(x) & (y))
# define ROL64(x, n) (((x) << ((unsigned int)n & 63)) | \
((x) >> ((64 - (unsigned int)(n)) & 63)))
# define KECCAK_F1600_PERMUTE_FUNC_NAME keccak_f1600_state_permute64
+# define KECCAK_F1600_ABSORB_FUNC_NAME keccak_absorb_lanes64
# include "keccak_permute_64.h"
# undef ANDN64
# undef ROL64
# undef KECCAK_F1600_PERMUTE_FUNC_NAME
-
-static unsigned int
-keccak_absorb_lanes64(KECCAK_STATE *hd, int pos, const byte *lanes,
- unsigned int nlanes, int blocklanes)
-{
- unsigned int burn = 0;
-
- while (nlanes)
- {
- hd->u.state64[pos] ^= buf_get_le64(lanes);
- lanes += 8;
- nlanes--;
-
- if (++pos == blocklanes)
- {
- burn = keccak_f1600_state_permute64(hd);
- pos = 0;
- }
- }
-
- return burn;
-}
+# undef KECCAK_F1600_ABSORB_FUNC_NAME
static const keccak_ops_t keccak_generic64_ops =
{
@@ -279,33 +346,13 @@ static const keccak_ops_t keccak_generic64_ops =
tmp; })
# define KECCAK_F1600_PERMUTE_FUNC_NAME keccak_f1600_state_permute64_shld
+# define KECCAK_F1600_ABSORB_FUNC_NAME keccak_absorb_lanes64_shld
# include "keccak_permute_64.h"
# undef ANDN64
# undef ROL64
# undef KECCAK_F1600_PERMUTE_FUNC_NAME
-
-static unsigned int
-keccak_absorb_lanes64_shld(KECCAK_STATE *hd, int pos, const byte *lanes,
- unsigned int nlanes, int blocklanes)
-{
- unsigned int burn = 0;
-
- while (nlanes)
- {
- hd->u.state64[pos] ^= buf_get_le64(lanes);
- lanes += 8;
- nlanes--;
-
- if (++pos == blocklanes)
- {
- burn = keccak_f1600_state_permute64_shld(hd);
- pos = 0;
- }
- }
-
- return burn;
-}
+# undef KECCAK_F1600_ABSORB_FUNC_NAME
static const keccak_ops_t keccak_shld_64_ops =
{
@@ -335,33 +382,13 @@ static const keccak_ops_t keccak_shld_64_ops =
tmp; })
# define KECCAK_F1600_PERMUTE_FUNC_NAME keccak_f1600_state_permute64_bmi2
+# define KECCAK_F1600_ABSORB_FUNC_NAME keccak_absorb_lanes64_bmi2
# include "keccak_permute_64.h"
# undef ANDN64
# undef ROL64
# undef KECCAK_F1600_PERMUTE_FUNC_NAME
-
-static unsigned int
-keccak_absorb_lanes64_bmi2(KECCAK_STATE *hd, int pos, const byte *lanes,
- unsigned int nlanes, int blocklanes)
-{
- unsigned int burn = 0;
-
- while (nlanes)
- {
- hd->u.state64[pos] ^= buf_get_le64(lanes);
- lanes += 8;
- nlanes--;
-
- if (++pos == blocklanes)
- {
- burn = keccak_f1600_state_permute64_bmi2(hd);
- pos = 0;
- }
- }
-
- return burn;
-}
+# undef KECCAK_F1600_ABSORB_FUNC_NAME
static const keccak_ops_t keccak_bmi2_64_ops =
{
diff --git a/cipher/keccak_permute_64.h b/cipher/keccak_permute_64.h
index 1264f19..6f24217 100644
--- a/cipher/keccak_permute_64.h
+++ b/cipher/keccak_permute_64.h
@@ -288,3 +288,102 @@ KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd)
return sizeof(void *) * 4 + sizeof(u64) * 12 * 5;
}
+
+static unsigned int
+KECCAK_F1600_ABSORB_FUNC_NAME(KECCAK_STATE *hd, int pos, const byte *lanes,
+ unsigned int nlanes, int blocklanes)
+{
+ unsigned int burn = 0;
+
+ while (nlanes)
+ {
+ switch (blocklanes)
+ {
+ case 21:
+ /* SHAKE128 */
+ while (pos == 0 && nlanes >= 21)
+ {
+ absorb_lanes64_8(&hd->u.state64[0], lanes + 8 * 0);
+ absorb_lanes64_4(&hd->u.state64[8], lanes + 8 * 8);
+ absorb_lanes64_8(&hd->u.state64[12], lanes + 8 * 12);
+ absorb_lanes64_1(&hd->u.state64[20], lanes + 8 * 20);
+ lanes += 8 * 21;
+ nlanes -= 21;
+
+ burn = KECCAK_F1600_PERMUTE_FUNC_NAME(hd);
+ }
+ break;
+
+ case 18:
+ /* SHA3-224 */
+ while (pos == 0 && nlanes >= 18)
+ {
+ absorb_lanes64_8(&hd->u.state64[0], lanes + 8 * 0);
+ absorb_lanes64_2(&hd->u.state64[8], lanes + 8 * 8);
+ absorb_lanes64_8(&hd->u.state64[10], lanes + 8 * 10);
+ lanes += 8 * 18;
+ nlanes -= 18;
+
+ burn = KECCAK_F1600_PERMUTE_FUNC_NAME(hd);
+ }
+ break;
+
+ case 17:
+ /* SHA3-256 & SHAKE256 */
+ while (pos == 0 && nlanes >= 17)
+ {
+ absorb_lanes64_8(&hd->u.state64[0], lanes + 8 * 0);
+ absorb_lanes64_8(&hd->u.state64[8], lanes + 8 * 8);
+ absorb_lanes64_1(&hd->u.state64[16], lanes + 8 * 16);
+ lanes += 8 * 17;
+ nlanes -= 17;
+
+ burn = KECCAK_F1600_PERMUTE_FUNC_NAME(hd);
+ }
+ break;
+
+ case 13:
+ /* SHA3-384 */
+ while (pos == 0 && nlanes >= 13)
+ {
+ absorb_lanes64_8(&hd->u.state64[0], lanes + 8 * 0);
+ absorb_lanes64_4(&hd->u.state64[8], lanes + 8 * 8);
+ absorb_lanes64_1(&hd->u.state64[12], lanes + 8 * 12);
+ lanes += 8 * 13;
+ nlanes -= 13;
+
+ burn = KECCAK_F1600_PERMUTE_FUNC_NAME(hd);
+ }
+ break;
+
+ case 9:
+ /* SHA3-512 */
+ while (pos == 0 && nlanes >= 9)
+ {
+ absorb_lanes64_8(&hd->u.state64[0], lanes + 8 * 0);
+ absorb_lanes64_1(&hd->u.state64[8], lanes + 8 * 8);
+ lanes += 8 * 9;
+ nlanes -= 9;
+
+ burn = KECCAK_F1600_PERMUTE_FUNC_NAME(hd);
+ }
+ break;
+ }
+
+ while (nlanes)
+ {
+ hd->u.state64[pos] ^= buf_get_le64(lanes);
+ lanes += 8;
+ nlanes--;
+
+ if (++pos == blocklanes)
+ {
+ burn = KECCAK_F1600_PERMUTE_FUNC_NAME(hd);
+ pos = 0;
+ break;
+ }
+ }
+ }
+
+ return burn;
+}
commit 07e4839e75a7bca3a6c0a94aecfe75efe61d7ff2
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date: Sat Oct 31 20:19:59 2015 +0200
Enable CRC test vectors with zero bytes
* tests/basic.c (check_digests): Enable CRC test-vectors with zero
bytes.
--
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
diff --git a/tests/basic.c b/tests/basic.c
index 0762a89..7d5de00 100644
--- a/tests/basic.c
+++ b/tests/basic.c
@@ -5851,16 +5851,12 @@ check_digests (void)
{ GCRY_MD_CRC32_RFC1510, "test0123456789", "\xb8\x3e\x88\xd6" },
{ GCRY_MD_CRC32_RFC1510, "MASSACHVSETTS INSTITVTE OF TECHNOLOGY",
"\xe3\x41\x80\xf7" },
-#if 0
- { GCRY_MD_CRC32_RFC1510, "\x80\x00", "\x3b\x83\x98\x4b" },
- { GCRY_MD_CRC32_RFC1510, "\x00\x08", "\x0e\xdb\x88\x32" },
- { GCRY_MD_CRC32_RFC1510, "\x00\x80", "\xed\xb8\x83\x20" },
-#endif
+ { GCRY_MD_CRC32_RFC1510, "\x80\x00", "\x3b\x83\x98\x4b", 2 },
+ { GCRY_MD_CRC32_RFC1510, "\x00\x08", "\x0e\xdb\x88\x32", 2 },
+ { GCRY_MD_CRC32_RFC1510, "\x00\x80", "\xed\xb8\x83\x20", 2 },
{ GCRY_MD_CRC32_RFC1510, "\x80", "\xed\xb8\x83\x20" },
-#if 0
- { GCRY_MD_CRC32_RFC1510, "\x80\x00\x00\x00", "\xed\x59\xb6\x3b" },
- { GCRY_MD_CRC32_RFC1510, "\x00\x00\x00\x01", "\x77\x07\x30\x96" },
-#endif
+ { GCRY_MD_CRC32_RFC1510, "\x80\x00\x00\x00", "\xed\x59\xb6\x3b", 4 },
+ { GCRY_MD_CRC32_RFC1510, "\x00\x00\x00\x01", "\x77\x07\x30\x96", 4 },
{ GCRY_MD_CRC32_RFC1510, "123456789", "\x2d\xfd\x2d\x88" },
{ GCRY_MD_CRC24_RFC2440, "", "\xb7\x04\xce" },
{ GCRY_MD_CRC24_RFC2440, "foo", "\x4f\xc2\x55" },
-----------------------------------------------------------------------
Summary of changes:
cipher/Makefile.am | 2 +-
cipher/keccak-armv7-neon.S | 945 +++++++++++++++++++++++++++++++++++++++++++++
cipher/keccak.c | 220 ++++++++---
cipher/keccak_permute_64.h | 101 ++++-
cipher/tiger.c | 104 ++---
configure.ac | 2 +-
tests/basic.c | 14 +-
7 files changed, 1248 insertions(+), 140 deletions(-)
create mode 100644 cipher/keccak-armv7-neon.S
hooks/post-receive
--
The GNU crypto library
http://git.gnupg.org
More information about the Gnupg-commits
mailing list