From jussi.kivilinna at iki.fi  Sun Nov  1 20:06:09 2015
From: jussi.kivilinna at iki.fi (Jussi Kivilinna)
Date: Sun, 01 Nov 2015 21:06:09 +0200
Subject: [PATCH 1/4] Enable CRC test vectors with zero bytes
Message-ID: <20151101190609.1607.2281.stgit@localhost6.localdomain6>


---
 tests/basic.c |   14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/tests/basic.c b/tests/basic.c
index 0762a89..7d5de00 100644
--- a/tests/basic.c
+++ b/tests/basic.c
@@ -5851,16 +5851,12 @@ check_digests (void)
       {	GCRY_MD_CRC32_RFC1510, "test0123456789", "\xb8\x3e\x88\xd6" },
       {	GCRY_MD_CRC32_RFC1510, "MASSACHVSETTS INSTITVTE OF TECHNOLOGY",
 	"\xe3\x41\x80\xf7" },
-#if 0
-      {	GCRY_MD_CRC32_RFC1510, "\x80\x00", "\x3b\x83\x98\x4b" },
-      {	GCRY_MD_CRC32_RFC1510, "\x00\x08", "\x0e\xdb\x88\x32" },
-      {	GCRY_MD_CRC32_RFC1510, "\x00\x80", "\xed\xb8\x83\x20" },
-#endif
+      {	GCRY_MD_CRC32_RFC1510, "\x80\x00", "\x3b\x83\x98\x4b", 2 },
+      {	GCRY_MD_CRC32_RFC1510, "\x00\x08", "\x0e\xdb\x88\x32", 2 },
+      {	GCRY_MD_CRC32_RFC1510, "\x00\x80", "\xed\xb8\x83\x20", 2 },
       {	GCRY_MD_CRC32_RFC1510, "\x80", "\xed\xb8\x83\x20" },
-#if 0
-      {	GCRY_MD_CRC32_RFC1510, "\x80\x00\x00\x00", "\xed\x59\xb6\x3b" },
-      {	GCRY_MD_CRC32_RFC1510, "\x00\x00\x00\x01", "\x77\x07\x30\x96" },
-#endif
+      {	GCRY_MD_CRC32_RFC1510, "\x80\x00\x00\x00", "\xed\x59\xb6\x3b", 4 },
+      {	GCRY_MD_CRC32_RFC1510, "\x00\x00\x00\x01", "\x77\x07\x30\x96", 4 },
       { GCRY_MD_CRC32_RFC1510, "123456789", "\x2d\xfd\x2d\x88" },
       {	GCRY_MD_CRC24_RFC2440, "", "\xb7\x04\xce" },
       {	GCRY_MD_CRC24_RFC2440, "foo", "\x4f\xc2\x55" },


From jussi.kivilinna at iki.fi  Sun Nov  1 20:06:20 2015
From: jussi.kivilinna at iki.fi (Jussi Kivilinna)
Date: Sun, 01 Nov 2015 21:06:20 +0200
Subject: [PATCH 3/4] Add ARMv7/NEON implementation of Keccak
In-Reply-To: <20151101190609.1607.2281.stgit@localhost6.localdomain6>
References: <20151101190609.1607.2281.stgit@localhost6.localdomain6>
Message-ID: <20151101190619.1607.71346.stgit@localhost6.localdomain6>

* cipher/Makefile.am: Add 'keccak-armv7-neon.S'.
* cipher/keccak-armv7-neon.S: New.
* cipher/keccak.c (USE_64BIT_ARM_NEON): New.
(NEED_COMMON64): Select if USE_64BIT_ARM_NEON.
[NEED_COMMON64] (round_consts_64bit): Rename to...
[NEED_COMMON64] (_gcry_keccak_round_consts_64bit): ...this; Add
terminator at end.
[USE_64BIT_ARM_NEON] (_gcry_keccak_permute_armv7_neon)
(_gcry_keccak_absorb_lanes64_armv7_neon, keccak_permute64_armv7_neon)
(keccak_absorb_lanes64_armv7_neon, keccak_armv7_neon_64_ops): New.
(keccak_init) [USE_64BIT_ARM_NEON]: Select ARM/NEON implementation
if supported by HW.
* cipher/keccak_permute_64.h (KECCAK_F1600_PERMUTE_FUNC_NAME): Update
to use new round constant table.
* configure.ac: Add 'keccak-armv7-neon.lo'.
--

Patch adds ARMv7/NEON implementation of Keccak (SHAKE/SHA3). Patch
is based on public-domain implementation by Ronny Van Keer from
SUPERCOP package:
 https://github.com/floodyberry/supercop/blob/master/crypto_hash/\
keccakc1024/inplace-armv7a-neon/keccak2.s

Benchmark results on Cortex-A8 @ 1008 MHz:

Before (generic 32-bit bit-interleaved impl.):
                |  nanosecs/byte   mebibytes/sec   cycles/byte
 SHAKE128       |     83.00 ns/B     11.49 MiB/s     83.67 c/B
 SHAKE256       |     101.7 ns/B      9.38 MiB/s     102.5 c/B
 SHA3-224       |     96.13 ns/B      9.92 MiB/s     96.90 c/B
 SHA3-256       |     101.5 ns/B      9.40 MiB/s     102.3 c/B
 SHA3-384       |     131.4 ns/B      7.26 MiB/s     132.5 c/B
 SHA3-512       |     189.1 ns/B      5.04 MiB/s     190.6 c/B

After (ARM/NEON, ~3.2x faster):
                |  nanosecs/byte   mebibytes/sec   cycles/byte
 SHAKE128       |     25.09 ns/B     38.01 MiB/s     25.29 c/B
 SHAKE256       |     30.95 ns/B     30.82 MiB/s     31.19 c/B
 SHA3-224       |     29.24 ns/B     32.61 MiB/s     29.48 c/B
 SHA3-256       |     30.95 ns/B     30.82 MiB/s     31.19 c/B
 SHA3-384       |     40.42 ns/B     23.59 MiB/s     40.74 c/B
 SHA3-512       |     58.37 ns/B     16.34 MiB/s     58.84 c/B

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 cipher/Makefile.am         |    2 
 cipher/keccak-armv7-neon.S |  945 ++++++++++++++++++++++++++++++++++++++++++++
 cipher/keccak.c            |   71 +++
 cipher/keccak_permute_64.h |    2 
 configure.ac               |    2 
 5 files changed, 1016 insertions(+), 6 deletions(-)
 create mode 100644 cipher/keccak-armv7-neon.S

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index be03d06..88c8fbf 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -90,7 +90,7 @@ sha1.c sha1-ssse3-amd64.S sha1-avx-amd64.S sha1-avx-bmi2-amd64.S \
 sha256.c sha256-ssse3-amd64.S sha256-avx-amd64.S sha256-avx2-bmi2-amd64.S \
 sha512.c sha512-ssse3-amd64.S sha512-avx-amd64.S sha512-avx2-bmi2-amd64.S \
   sha512-armv7-neon.S \
-keccak.c keccak_permute_32.h keccak_permute_64.h \
+keccak.c keccak_permute_32.h keccak_permute_64.h keccak-armv7-neon.S \
 stribog.c \
 tiger.c \
 whirlpool.c whirlpool-sse2-amd64.S \
diff --git a/cipher/keccak-armv7-neon.S b/cipher/keccak-armv7-neon.S
new file mode 100644
index 0000000..6118ce4
--- /dev/null
+++ b/cipher/keccak-armv7-neon.S
@@ -0,0 +1,945 @@
+/* keccak-armv7-neon.S  -  ARMv7/NEON implementation of Keccak
+ *
+ * Copyright (C) 2015 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) && \
+    defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \
+    defined(HAVE_GCC_INLINE_ASM_NEON)
+
+/* Based on public-domain/CC0 implementation from SUPERCOP package
+ * (keccakc1024/inplace-armv7a-neon/keccak2.s)
+ *
+ * Original copyright header follows:
+ */
+
+@ The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
+@ Michaël Peeters and Gilles Van Assche. For more information, feedback or
+@ questions, please refer to our website: http://keccak.noekeon.org/
+@
+@ Implementation by Ronny Van Keer, hereby denoted as "the implementer".
+@
+@ To the extent possible under law, the implementer has waived all copyright
+@ and related or neighboring rights to the source code in this file.
+@ http://creativecommons.org/publicdomain/zero/1.0/
+
+.text
+
+.syntax unified
+.fpu neon
+.arm
+
+
+.extern _gcry_keccak_round_consts_64bit;
+
+#ifdef __PIC__
+#  define GET_DATA_POINTER(reg, name, rtmp) \
+		ldr reg, 1f; \
+		ldr rtmp, 2f; \
+		b 3f; \
+	1:	.word _GLOBAL_OFFSET_TABLE_-(3f+8); \
+	2:	.word name(GOT); \
+	3:	add reg, pc, reg; \
+		ldr reg, [reg, rtmp];
+#else
+#  define GET_DATA_POINTER(reg, name, rtmp) ldr reg, =name
+#endif
+
+
+@//  --- offsets in state
+.equ Aba, 0*8
+.equ Aga, 1*8
+.equ Aka, 2*8
+.equ Ama, 3*8
+.equ Asa, 4*8
+
+@// --- macros
+
+.macro    KeccakThetaRhoPiChiIota argA1, argA2, argA3, argA4, argA5
+
+    @Prepare Theta
+    @Ca = Aba^Aga^Aka^Ama^Asa@
+    @Ce = Abe^Age^Ake^Ame^Ase@
+    @Ci = Abi^Agi^Aki^Ami^Asi@
+    @Co = Abo^Ago^Ako^Amo^Aso@
+    @Cu = Abu^Agu^Aku^Amu^Asu@
+    @De = Ca^ROL64(Ci, 1)@
+    @Di = Ce^ROL64(Co, 1)@
+    @Do = Ci^ROL64(Cu, 1)@
+    @Du = Co^ROL64(Ca, 1)@
+    @Da = Cu^ROL64(Ce, 1)@
+
+    veor.64 q4, q6, q7
+    veor.64 q5, q9, q10
+    veor.64 d8,  d8,   d9
+    veor.64 d10,  d10,   d11
+    veor.64 d1,  d8,   d16
+    veor.64 d2,  d10,   d17
+
+    veor.64 q4, q11, q12
+    veor.64 q5, q14, q15
+    veor.64 d8,  d8,   d9
+    veor.64 d10,  d10,   d11
+    veor.64 d3,  d8,   d26
+
+    vadd.u64 q4, q1, q1
+    veor.64 d4,  d10,   d27
+    vmov.64  d0, d5
+    vsri.64 q4, q1, #63
+
+    vadd.u64 q5, q2, q2
+    veor.64 q4, q4, q0
+    vsri.64 q5, q2, #63
+    vadd.u64 d7, d1, d1
+    veor.64 \argA2, \argA2, d8
+    veor.64 q5, q5, q1
+
+    vsri.64 d7, d1, #63
+    vshl.u64 d1, \argA2, #44
+    veor.64 \argA3, \argA3, d9
+    veor.64 d7, d7, d4
+
+    @Ba = argA1^Da@
+    @Be = ROL64((argA2^De), 44)@
+    @Bi = ROL64((argA3^Di), 43)@
+    @Bo = ROL64((argA4^Do), 21)@
+    @Bu = ROL64((argA5^Du), 14)@
+    @argA2 =   Be ^((~Bi)& Bo )@
+    @argA3 =   Bi ^((~Bo)& Bu )@
+    @argA4 =   Bo ^((~Bu)& Ba )@
+    @argA5 =   Bu ^((~Ba)& Be )@ 
+    @argA1 =   Ba ^((~Be)& Bi )@ argA1 ^= KeccakF1600RoundConstants[i+round]@
+    vsri.64 d1, \argA2, #64-44
+    vshl.u64 d2, \argA3, #43
+    vldr.64 d0, [sp, #\argA1]
+    veor.64 \argA4, \argA4, d10
+    vsri.64 d2, \argA3, #64-43
+    vshl.u64 d3, \argA4, #21
+    veor.64 \argA5, \argA5, d11
+    veor.64 d0, d0, d7
+    vsri.64 d3, \argA4, #64-21
+    vbic.64   d5, d2, d1
+    vshl.u64 d4, \argA5, #14
+    vbic.64   \argA2, d3, d2
+    vld1.64   d6, [ip]!
+    veor.64   d5, d0
+    vsri.64 d4, \argA5, #64-14
+    veor.64   d5, d6
+    vbic.64   \argA5, d1, d0
+    vbic.64   \argA3, d4, d3
+    vbic.64   \argA4, d0, d4
+    veor.64   \argA2, d1
+    vstr.64   d5, [sp, #\argA1]
+    veor.64   \argA3, d2    
+    veor.64   \argA4, d3
+    veor.64   \argA5, d4
+
+    .endm
+
+.macro    KeccakThetaRhoPiChi1   argA1, argA2, argA3, argA4, argA5
+
+    @d2 = ROL64((argA1^Da), 3)@
+    @d3 = ROL64((argA2^De), 45)@
+    @d4 = ROL64((argA3^Di), 61)@
+    @d0 = ROL64((argA4^Do), 28)@
+    @d1 = ROL64((argA5^Du), 20)@
+    @argA1 =   Ba ^((~Be)&  Bi )@ Ca ^= argA1@
+    @argA2 =   Be ^((~Bi)&  Bo )@
+    @argA3 =   Bi ^((~Bo)&  Bu )@
+    @argA4 =   Bo ^((~Bu)&  Ba )@
+    @argA5 =   Bu ^((~Ba)&  Be )@
+
+    veor.64 \argA2, \argA2, d8
+    veor.64 \argA3, \argA3, d9
+    vshl.u64  d3, \argA2, #45
+    vldr.64 d6, [sp, #\argA1]
+    vshl.u64  d4, \argA3, #61
+    veor.64 \argA4, \argA4, d10
+    vsri.64  d3, \argA2, #64-45
+    veor.64 \argA5, \argA5, d11
+    vsri.64  d4, \argA3, #64-61
+    vshl.u64  d0, \argA4, #28
+    veor.64 d6, d6, d7
+    vshl.u64  d1, \argA5, #20
+    vbic.64   \argA3, d4, d3
+    vsri.64  d0, \argA4, #64-28
+    vbic.64   \argA4, d0, d4
+    vshl.u64  d2, d6, #3
+    vsri.64  d1, \argA5, #64-20
+    veor.64   \argA4, d3
+    vsri.64  d2, d6, #64-3
+    vbic.64   \argA5, d1, d0
+    vbic.64   d6, d2, d1
+    vbic.64   \argA2, d3, d2
+    veor.64   d6, d0
+    veor.64   \argA2, d1
+    vstr.64   d6, [sp, #\argA1]
+    veor.64   \argA3, d2
+    veor.64  d5, d6
+    veor.64   \argA5, d4
+
+    .endm
+
+.macro    KeccakThetaRhoPiChi2 argA1, argA2, argA3, argA4, argA5
+
+    @d4 = ROL64((argA1^Da), 18)@
+    @d0 = ROL64((argA2^De), 1)@
+    @d1 = ROL64((argA3^Di), 6)@
+    @d2 = ROL64((argA4^Do), 25)@
+    @d3 = ROL64((argA5^Du), 8)@
+    @argA1 =   Ba ^((~Be)&  Bi )@ Ca ^= argA1@
+    @argA2 =   Be ^((~Bi)&  Bo )@
+    @argA3 =   Bi ^((~Bo)&  Bu )@
+    @argA4 =   Bo ^((~Bu)&  Ba )@
+    @argA5 =   Bu ^((~Ba)&  Be )@
+
+    veor.64 \argA3, \argA3, d9
+    veor.64 \argA4, \argA4, d10
+    vshl.u64  d1, \argA3, #6
+    vldr.64 d6, [sp, #\argA1]
+    vshl.u64  d2, \argA4, #25
+    veor.64 \argA5, \argA5, d11
+    vsri.64  d1, \argA3, #64-6
+    veor.64 \argA2, \argA2, d8
+    vsri.64  d2, \argA4, #64-25
+    vext.8  d3, \argA5, \argA5, #7
+    veor.64 d6, d6, d7
+    vbic.64  \argA3, d2, d1
+    vadd.u64  d0, \argA2, \argA2
+    vbic.64   \argA4, d3, d2
+    vsri.64  d0, \argA2, #64-1
+    vshl.u64  d4, d6, #18
+    veor.64  \argA2, d1, \argA4
+    veor.64  \argA3, d0
+    vsri.64  d4, d6, #64-18
+    vstr.64   \argA3, [sp, #\argA1]
+    veor.64  d5, \argA3
+    vbic.64   \argA5, d1, d0
+    vbic.64   \argA3, d4, d3
+    vbic.64   \argA4, d0, d4
+    veor.64   \argA3, d2
+    veor.64   \argA4, d3
+    veor.64   \argA5, d4
+
+    .endm
+
+.macro    KeccakThetaRhoPiChi3 argA1, argA2, argA3, argA4, argA5
+
+    @d1 = ROL64((argA1^Da), 36)@
+    @d2 = ROL64((argA2^De), 10)@
+    @d3 = ROL64((argA3^Di), 15)@
+    @d4 = ROL64((argA4^Do), 56)@
+    @d0 = ROL64((argA5^Du), 27)@
+    @argA1 =   Ba ^((~Be)&  Bi )@ Ca ^= argA1@
+    @argA2 =   Be ^((~Bi)&  Bo )@
+    @argA3 =   Bi ^((~Bo)&  Bu )@
+    @argA4 =   Bo ^((~Bu)&  Ba )@
+    @argA5 =   Bu ^((~Ba)&  Be )@
+
+    veor.64 \argA2, \argA2, d8
+    veor.64 \argA3, \argA3, d9
+    vshl.u64  d2, \argA2, #10
+    vldr.64 d6, [sp, #\argA1]
+    vshl.u64  d3, \argA3, #15
+    veor.64 \argA4, \argA4, d10
+    vsri.64  d2, \argA2, #64-10
+    vsri.64  d3, \argA3, #64-15
+    veor.64 \argA5, \argA5, d11
+    vext.8  d4, \argA4, \argA4, #1
+    vbic.64   \argA2, d3, d2
+    vshl.u64  d0, \argA5, #27
+    veor.64 d6, d6, d7
+    vbic.64   \argA3, d4, d3
+    vsri.64  d0, \argA5, #64-27
+    vshl.u64  d1, d6, #36
+    veor.64   \argA3, d2
+    vbic.64   \argA4, d0, d4
+    vsri.64  d1, d6, #64-36
+    
+    veor.64   \argA4, d3
+    vbic.64   d6, d2, d1
+    vbic.64   \argA5, d1, d0
+    veor.64   d6, d0
+    veor.64   \argA2, d1
+    vstr.64   d6, [sp, #\argA1]
+    veor.64  d5, d6
+    veor.64   \argA5, d4
+
+    .endm
+
+.macro    KeccakThetaRhoPiChi4 argA1, argA2, argA3, argA4, argA5
+
+    @d3 = ROL64((argA1^Da), 41)@
+    @d4 = ROL64((argA2^De), 2)@
+    @d0 = ROL64((argA3^Di), 62)@
+    @d1 = ROL64((argA4^Do), 55)@
+    @d2 = ROL64((argA5^Du), 39)@
+    @argA1 =   Ba ^((~Be)&  Bi )@ Ca ^= argA1@
+    @argA2 =   Be ^((~Bi)&  Bo )@
+    @argA3 =   Bi ^((~Bo)&  Bu )@
+    @argA4 =   Bo ^((~Bu)&  Ba )@
+    @argA5 =   Bu ^((~Ba)&  Be )@
+
+    veor.64 \argA2, \argA2, d8
+    veor.64 \argA3, \argA3, d9
+    vshl.u64  d4, \argA2, #2
+    veor.64 \argA5, \argA5, d11
+    vshl.u64  d0, \argA3, #62
+    vldr.64 d6, [sp, #\argA1]
+    vsri.64  d4, \argA2, #64-2
+    veor.64 \argA4, \argA4, d10
+    vsri.64  d0, \argA3, #64-62
+
+    vshl.u64  d1, \argA4, #55
+    veor.64 d6, d6, d7
+    vshl.u64  d2, \argA5, #39
+    vsri.64  d1, \argA4, #64-55
+    vbic.64  \argA4, d0, d4
+    vsri.64  d2, \argA5, #64-39
+    vbic.64  \argA2, d1, d0
+    vshl.u64  d3, d6, #41
+    veor.64  \argA5, d4, \argA2
+    vbic.64  \argA2, d2, d1
+    vsri.64  d3, d6, #64-41
+    veor.64  d6, d0, \argA2
+    
+    vbic.64 \argA2, d3, d2
+    vbic.64 \argA3, d4, d3
+    veor.64 \argA2, d1
+    vstr.64 d6, [sp, #\argA1]
+    veor.64 d5, d6
+    veor.64 \argA3, d2
+    veor.64 \argA4, d3
+
+    .endm
+
+
+@// --- code 
+
+@not callable from C!
+.p2align 3
+.type  KeccakF_armv7a_neon_asm,%function;
+KeccakF_armv7a_neon_asm:  @
+
+.LroundLoop:
+
+    KeccakThetaRhoPiChiIota  Aba, d13, d19, d25, d31
+    KeccakThetaRhoPiChi1    Aka, d15, d21, d22, d28
+    KeccakThetaRhoPiChi2    Asa, d12, d18, d24, d30
+    KeccakThetaRhoPiChi3    Aga, d14, d20, d26, d27
+    KeccakThetaRhoPiChi4    Ama, d16, d17, d23, d29
+
+    KeccakThetaRhoPiChiIota  Aba, d15, d18, d26, d29
+    KeccakThetaRhoPiChi1    Asa, d14, d17, d25, d28
+    KeccakThetaRhoPiChi2    Ama, d13, d21, d24, d27
+    KeccakThetaRhoPiChi3    Aka, d12, d20, d23, d31
+    KeccakThetaRhoPiChi4    Aga, d16, d19, d22, d30
+
+    KeccakThetaRhoPiChiIota Aba, d14, d21, d23, d30
+    KeccakThetaRhoPiChi1    Ama, d12, d19, d26, d28
+    KeccakThetaRhoPiChi2    Aga, d15, d17, d24, d31
+    KeccakThetaRhoPiChi3    Asa, d13, d20, d22, d29
+    KeccakThetaRhoPiChi4    Aka, d16, d18, d25, d27
+
+    KeccakThetaRhoPiChiIota Aba, d12, d17, d22, d27
+    KeccakThetaRhoPiChi1    Aga, d13, d18, d23, d28
+    KeccakThetaRhoPiChi2    Aka, d14, d19, d24, d29
+    ldr    r0, [ip]
+    KeccakThetaRhoPiChi3    Ama, d15, d20, d25, d30
+    cmp    r0, #0xFFFFFFFF
+    KeccakThetaRhoPiChi4    Asa, d16, d21, d26, d31
+
+    bne    .LroundLoop
+    sub    ip, #(8*24)
+    bx    lr
+.p2align 2
+.ltorg
+.size KeccakF_armv7a_neon_asm,.-KeccakF_armv7a_neon_asm;
+
+
+@//unsigned _gcry_keccak_permute_armv7_neon(u64 *state)  callable from C
+.p2align 3
+.global   _gcry_keccak_permute_armv7_neon
+.type  _gcry_keccak_permute_armv7_neon,%function;
+_gcry_keccak_permute_armv7_neon:
+
+    push   {ip, lr}
+    vpush  {q4-q7}
+    sub    sp,sp, #5*8
+
+    vldr.64  d0,  [r0, #0*8]
+    vldr.64  d12, [r0, #1*8]
+    vldr.64  d17, [r0, #2*8]
+    vldr.64  d22, [r0, #3*8]
+    vldr.64  d27, [r0, #4*8]
+
+    GET_DATA_POINTER(ip, _gcry_keccak_round_consts_64bit, lr);
+
+    vldr.64  d1,  [r0, #5*8]
+    vldr.64  d13, [r0, #6*8]
+    vldr.64  d18, [r0, #7*8]
+    vldr.64  d23, [r0, #8*8]
+    vldr.64  d28, [r0, #9*8]
+
+    vldr.64  d2,  [r0, #10*8]
+    vldr.64  d14, [r0, #11*8]
+    vldr.64  d19, [r0, #12*8]
+    vldr.64  d24, [r0, #13*8]
+    vldr.64  d29, [r0, #14*8]
+
+    vldr.64  d3,  [r0, #15*8]
+    vldr.64  d15, [r0, #16*8]
+    vldr.64  d20, [r0, #17*8]
+    vldr.64  d25, [r0, #18*8]
+    vldr.64  d30, [r0, #19*8]
+
+    vldr.64  d4,  [r0, #20*8]
+    vldr.64  d16, [r0, #21*8]
+    vldr.64  d21, [r0, #22*8]
+    vldr.64  d26, [r0, #23*8]
+    vldr.64  d31, [r0, #24*8]
+
+    vstr.64  d0, [sp, #Aba]
+    vstr.64  d1, [sp, #Aga]
+    veor.64 q0, q0, q1
+    vstr.64  d2, [sp, #Aka]
+    veor.64 d5, d0,  d1
+    vstr.64  d3, [sp, #Ama]
+    mov      r1, r0
+    vstr.64  d4, [sp, #Asa]
+    veor.64 d5, d5,  d4
+
+    bl KeccakF_armv7a_neon_asm
+
+    vpop.64  { d0- d4 }
+
+    vstr.64  d0,  [r1, #0*8]
+    vstr.64  d12, [r1, #1*8]
+    vstr.64  d17, [r1, #2*8]
+    vstr.64  d22, [r1, #3*8]
+    vstr.64  d27, [r1, #4*8]
+
+    vstr.64  d1,  [r1, #5*8]
+    vstr.64  d13, [r1, #6*8]
+    vstr.64  d18, [r1, #7*8]
+    vstr.64  d23, [r1, #8*8]
+    vstr.64  d28, [r1, #9*8]
+
+    vstr.64  d2,  [r1, #10*8]
+    vstr.64  d14, [r1, #11*8]
+    vstr.64  d19, [r1, #12*8]
+    vstr.64  d24, [r1, #13*8]
+    vstr.64  d29, [r1, #14*8]
+
+    vstr.64  d3,  [r1, #15*8]
+    vstr.64  d15, [r1, #16*8]
+    vstr.64  d20, [r1, #17*8]
+    vstr.64  d25, [r1, #18*8]
+    vstr.64  d30, [r1, #19*8]
+
+    vstr.64  d4,  [r1, #20*8]
+    vstr.64  d16, [r1, #21*8]
+    vstr.64  d21, [r1, #22*8]
+    vstr.64  d26, [r1, #23*8]
+    vstr.64  d31, [r1, #24*8]
+
+    mov   r0, #112
+    vpop  {q4-q7}
+    pop   {ip, pc}
+.p2align 2
+.ltorg
+.size _gcry_keccak_permute_armv7_neon,.-_gcry_keccak_permute_armv7_neon;
+
+@//unsigned _gcry_keccak_absorb_lanes64_armv7_neon(u64 *state, @r0 (kept in r4)
+@					    int pos,    @r1
+@					    const byte *lanes,   @r2
+@					    unsigned int nlanes, @r3
+@					    int blocklanes) @ stack (kept in r5) callable from C
+.p2align 3
+.global   _gcry_keccak_absorb_lanes64_armv7_neon
+.type  _gcry_keccak_absorb_lanes64_armv7_neon,%function;
+_gcry_keccak_absorb_lanes64_armv7_neon:
+
+    cmp    r3, #0	@ nlanes == 0
+    itt eq
+    moveq  r0, #0
+    bxeq   lr
+
+    push   {r4-r5, ip, lr}
+    beq    .Lout
+    mov    r4, r0
+    ldr    r5, [sp, #(4*4)]
+    vpush  {q4-q7}
+
+    @ load state
+    vldr.64  d0,  [r4, #0*8]
+    vldr.64  d12, [r4, #1*8]
+    vldr.64  d17, [r4, #2*8]
+    vldr.64  d22, [r4, #3*8]
+    vldr.64  d27, [r4, #4*8]
+
+    GET_DATA_POINTER(ip, _gcry_keccak_round_consts_64bit, lr);
+
+    vldr.64  d1,  [r4, #5*8]
+    vldr.64  d13, [r4, #6*8]
+    vldr.64  d18, [r4, #7*8]
+    vldr.64  d23, [r4, #8*8]
+    vldr.64  d28, [r4, #9*8]
+
+    vldr.64  d2,  [r4, #10*8]
+    vldr.64  d14, [r4, #11*8]
+    vldr.64  d19, [r4, #12*8]
+    vldr.64  d24, [r4, #13*8]
+    vldr.64  d29, [r4, #14*8]
+
+    vldr.64  d3,  [r4, #15*8]
+    vldr.64  d15, [r4, #16*8]
+    vldr.64  d20, [r4, #17*8]
+    vldr.64  d25, [r4, #18*8]
+    vldr.64  d30, [r4, #19*8]
+
+    vldr.64  d4,  [r4, #20*8]
+    vldr.64  d16, [r4, #21*8]
+    vldr.64  d21, [r4, #22*8]
+    vldr.64  d26, [r4, #23*8]
+    vldr.64  d31, [r4, #24*8]
+
+.Lmain_loop:
+
+    @ detect absorb mode (full blocks vs lanes)
+
+    cmp r1, #0		@ pos != 0
+    bne .Llanes_loop
+
+.Lmain_loop_pos0:
+
+    @ full blocks mode
+
+    @ switch (blocksize)
+    cmp r5, #21
+    beq .Lfull_block_21
+    cmp r5, #18
+    beq .Lfull_block_18
+    cmp r5, #17
+    beq .Lfull_block_17
+    cmp r5, #13
+    beq .Lfull_block_13
+    cmp r5, #9
+    beq .Lfull_block_9
+
+    @ unknown blocksize
+    b .Llanes_loop
+
+.Lfull_block_21:
+
+    @ SHAKE128
+
+    cmp r3, #21		@ nlanes < blocklanes
+    blo .Llanes_loop
+
+    sub    sp,sp, #5*8
+
+    vld1.64 {d5-d8}, [r2]!
+    veor d0,  d5
+    vld1.64 {d9-d11}, [r2]!
+    veor d12, d6
+    veor d17, d7
+    veor d22, d8
+    vld1.64 {d5-d8}, [r2]!
+    veor d27, d9
+
+    veor d1,  d10
+    veor d13, d11
+    vld1.64 {d9-d11}, [r2]!
+    veor d18, d5
+    veor d23, d6
+    veor d28, d7
+
+    veor d2,  d8
+    vld1.64 {d5-d8}, [r2]!
+    veor d14, d9
+    veor d19, d10
+    veor d24, d11
+    vld1.64 {d9-d11}, [r2]!
+    veor d29, d5
+
+    veor d3,  d6
+    veor d15, d7
+    veor d20, d8
+    veor d25, d9
+    veor d30, d10
+
+    veor d4,  d11
+
+    vstr.64  d0, [sp, #Aba]
+    vstr.64  d1, [sp, #Aga]
+    veor.64 q0, q0, q1
+    vstr.64  d2, [sp, #Aka]
+    veor.64 d5, d0,  d1
+    vstr.64  d3, [sp, #Ama]
+    vstr.64  d4, [sp, #Asa]
+    veor.64 d5, d5,  d4
+
+    bl KeccakF_armv7a_neon_asm
+
+    subs r3, #21	@ nlanes -= 21
+    vpop.64  { d0-d4 }
+
+    beq .Ldone
+
+    b .Lfull_block_21
+
+.Lfull_block_18:
+
+    @ SHA3-224
+
+    cmp r3, #18		@ nlanes < blocklanes
+    blo .Llanes_loop
+
+    sub    sp,sp, #5*8
+
+    vld1.64 {d5-d8}, [r2]!
+    veor d0,  d5
+    vld1.64 {d9-d11}, [r2]!
+    veor d12, d6
+    veor d17, d7
+    veor d22, d8
+    vld1.64 {d5-d8}, [r2]!
+    veor d27, d9
+
+    veor d1,  d10
+    veor d13, d11
+    vld1.64 {d9-d11}, [r2]!
+    veor d18, d5
+    veor d23, d6
+    veor d28, d7
+
+    veor d2,  d8
+    vld1.64 {d5-d8}, [r2]!
+    veor d14, d9
+    veor d19, d10
+    veor d24, d11
+    veor d29, d5
+
+    veor d3,  d6
+    veor d15, d7
+    veor d20, d8
+
+    vstr.64  d0, [sp, #Aba]
+    vstr.64  d1, [sp, #Aga]
+    veor.64 q0, q0, q1
+    vstr.64  d2, [sp, #Aka]
+    veor.64 d5, d0,  d1
+    vstr.64  d3, [sp, #Ama]
+    vstr.64  d4, [sp, #Asa]
+    veor.64 d5, d5,  d4
+
+    bl KeccakF_armv7a_neon_asm
+
+    subs r3, #18	@ nlanes -= 18
+    vpop.64  { d0-d4 }
+
+    beq .Ldone
+
+    b .Lfull_block_18
+
+.Lfull_block_17:
+
+    @ SHA3-256 & SHAKE256
+
+    cmp r3, #17		@ nlanes < blocklanes
+    blo .Llanes_loop
+
+    sub    sp,sp, #5*8
+
+    vld1.64 {d5-d8}, [r2]!
+    veor d0,  d5
+    vld1.64 {d9-d11}, [r2]!
+    veor d12, d6
+    veor d17, d7
+    veor d22, d8
+    vld1.64 {d5-d8}, [r2]!
+    veor d27, d9
+
+    veor d1,  d10
+    veor d13, d11
+    vld1.64 {d9-d11}, [r2]!
+    veor d18, d5
+    veor d23, d6
+    veor d28, d7
+
+    veor d2,  d8
+    vld1.64 {d5-d7}, [r2]!
+    veor d14, d9
+    veor d19, d10
+    veor d24, d11
+    veor d29, d5
+
+    veor d3,  d6
+    veor d15, d7
+
+    vstr.64  d0, [sp, #Aba]
+    vstr.64  d1, [sp, #Aga]
+    veor.64 q0, q0, q1
+    vstr.64  d2, [sp, #Aka]
+    veor.64 d5, d0,  d1
+    vstr.64  d3, [sp, #Ama]
+    vstr.64  d4, [sp, #Asa]
+    veor.64 d5, d5,  d4
+
+    bl KeccakF_armv7a_neon_asm
+
+    subs r3, #17	@ nlanes -= 17
+    vpop.64  { d0-d4 }
+
+    beq .Ldone
+
+    b .Lfull_block_17
+
+.Lfull_block_13:
+
+    @ SHA3-384
+
+    cmp r3, #13		@ nlanes < blocklanes
+    blo .Llanes_loop
+
+    sub    sp,sp, #5*8
+
+    vld1.64 {d5-d8}, [r2]!
+    veor d0,  d5
+    vld1.64 {d9-d11}, [r2]!
+    veor d12, d6
+    veor d17, d7
+    veor d22, d8
+    vld1.64 {d5-d8}, [r2]!
+    veor d27, d9
+
+    veor d1,  d10
+    veor d13, d11
+    vld1.64 {d9-d10}, [r2]!
+    veor d18, d5
+    veor d23, d6
+    veor d28, d7
+
+    veor d2,  d8
+    veor d14, d9
+    veor d19, d10
+
+    vstr.64  d0, [sp, #Aba]
+    vstr.64  d1, [sp, #Aga]
+    veor.64 q0, q0, q1
+    vstr.64  d2, [sp, #Aka]
+    veor.64 d5, d0,  d1
+    vstr.64  d3, [sp, #Ama]
+    vstr.64  d4, [sp, #Asa]
+    veor.64 d5, d5,  d4
+
+    bl KeccakF_armv7a_neon_asm
+
+    subs r3, #13	@ nlanes -= 13
+    vpop.64  { d0-d4 }
+
+    beq .Ldone
+
+    b .Lfull_block_13
+
+.Lfull_block_9:
+
+    @ SHA3-512
+
+    cmp r3, #9		@ nlanes < blocklanes
+    blo .Llanes_loop
+
+    sub    sp,sp, #5*8
+
+    vld1.64 {d5-d8}, [r2]!
+    veor d0,  d5
+    vld1.64 {d9-d11}, [r2]!
+    veor d12, d6
+    veor d17, d7
+    veor d22, d8
+    vld1.64 {d5-d6}, [r2]!
+    veor d27, d9
+
+    veor d1,  d10
+    veor d13, d11
+    veor d18, d5
+    veor d23, d6
+
+    vstr.64  d0, [sp, #Aba]
+    vstr.64  d1, [sp, #Aga]
+    veor.64 q0, q0, q1
+    vstr.64  d2, [sp, #Aka]
+    veor.64 d5, d0,  d1
+    vstr.64  d3, [sp, #Ama]
+    vstr.64  d4, [sp, #Asa]
+    veor.64 d5, d5,  d4
+
+    bl KeccakF_armv7a_neon_asm
+
+    subs r3, #9		@ nlanes -= 9
+    vpop.64  { d0-d4 }
+
+    beq .Ldone
+
+    b .Lfull_block_9
+
+.Llanes_loop:
+
+    @ per-lane mode
+
+    @ switch (pos)
+    ldrb r0, [pc, r1]
+    add pc, pc, r0, lsl #2
+.Lswitch_table:
+    .byte (.Llane0-.Lswitch_table-4)/4
+    .byte (.Llane1-.Lswitch_table-4)/4
+    .byte (.Llane2-.Lswitch_table-4)/4
+    .byte (.Llane3-.Lswitch_table-4)/4
+    .byte (.Llane4-.Lswitch_table-4)/4
+    .byte (.Llane5-.Lswitch_table-4)/4
+    .byte (.Llane6-.Lswitch_table-4)/4
+    .byte (.Llane7-.Lswitch_table-4)/4
+    .byte (.Llane8-.Lswitch_table-4)/4
+    .byte (.Llane9-.Lswitch_table-4)/4
+    .byte (.Llane10-.Lswitch_table-4)/4
+    .byte (.Llane11-.Lswitch_table-4)/4
+    .byte (.Llane12-.Lswitch_table-4)/4
+    .byte (.Llane13-.Lswitch_table-4)/4
+    .byte (.Llane14-.Lswitch_table-4)/4
+    .byte (.Llane15-.Lswitch_table-4)/4
+    .byte (.Llane16-.Lswitch_table-4)/4
+    .byte (.Llane17-.Lswitch_table-4)/4
+    .byte (.Llane18-.Lswitch_table-4)/4
+    .byte (.Llane19-.Lswitch_table-4)/4
+    .byte (.Llane20-.Lswitch_table-4)/4
+    .byte (.Llane21-.Lswitch_table-4)/4
+    .byte (.Llane22-.Lswitch_table-4)/4
+    .byte (.Llane23-.Lswitch_table-4)/4
+    .byte (.Llane24-.Lswitch_table-4)/4
+.p2align 2
+
+#define ABSORB_LANE(label, vreg) \
+    label: \
+      add     r1, #1; \
+      vld1.64 d5, [r2]!; \
+      cmp     r1, r5; /* pos == blocklanes */ \
+      veor    vreg, vreg, d5; \
+      beq     .Llanes_permute; \
+      subs    r3, #1; \
+      beq     .Ldone;
+
+    ABSORB_LANE(.Llane0, d0)
+    ABSORB_LANE(.Llane1, d12)
+    ABSORB_LANE(.Llane2, d17)
+    ABSORB_LANE(.Llane3, d22)
+    ABSORB_LANE(.Llane4, d27)
+
+    ABSORB_LANE(.Llane5, d1)
+    ABSORB_LANE(.Llane6, d13)
+    ABSORB_LANE(.Llane7, d18)
+    ABSORB_LANE(.Llane8, d23)
+    ABSORB_LANE(.Llane9, d28)
+
+    ABSORB_LANE(.Llane10, d2)
+    ABSORB_LANE(.Llane11, d14)
+    ABSORB_LANE(.Llane12, d19)
+    ABSORB_LANE(.Llane13, d24)
+    ABSORB_LANE(.Llane14, d29)
+
+    ABSORB_LANE(.Llane15, d3)
+    ABSORB_LANE(.Llane16, d15)
+    ABSORB_LANE(.Llane17, d20)
+    ABSORB_LANE(.Llane18, d25)
+    ABSORB_LANE(.Llane19, d30)
+
+    ABSORB_LANE(.Llane20, d4)
+    ABSORB_LANE(.Llane21, d16)
+    ABSORB_LANE(.Llane22, d21)
+    ABSORB_LANE(.Llane23, d26)
+    ABSORB_LANE(.Llane24, d31)
+
+    b .Llanes_loop
+
+.Llanes_permute:
+
+    sub    sp,sp, #5*8
+    vstr.64  d0, [sp, #Aba]
+    vstr.64  d1, [sp, #Aga]
+    veor.64 q0, q0, q1
+    vstr.64  d2, [sp, #Aka]
+    veor.64 d5, d0,  d1
+    vstr.64  d3, [sp, #Ama]
+    vstr.64  d4, [sp, #Asa]
+    veor.64 d5, d5,  d4
+
+    bl KeccakF_armv7a_neon_asm
+
+    mov  r1, #0   @ pos <= 0
+    subs r3, #1
+
+    vpop.64  { d0-d4 }
+
+    beq  .Ldone
+
+    b .Lmain_loop_pos0
+
+.Ldone:
+
+    @ save state
+    vstr.64  d0,  [r4, #0*8]
+    vstr.64  d12, [r4, #1*8]
+    vstr.64  d17, [r4, #2*8]
+    vstr.64  d22, [r4, #3*8]
+    vstr.64  d27, [r4, #4*8]
+
+    vstr.64  d1,  [r4, #5*8]
+    vstr.64  d13, [r4, #6*8]
+    vstr.64  d18, [r4, #7*8]
+    vstr.64  d23, [r4, #8*8]
+    vstr.64  d28, [r4, #9*8]
+
+    vstr.64  d2,  [r4, #10*8]
+    vstr.64  d14, [r4, #11*8]
+    vstr.64  d19, [r4, #12*8]
+    vstr.64  d24, [r4, #13*8]
+    vstr.64  d29, [r4, #14*8]
+
+    vstr.64  d3,  [r4, #15*8]
+    vstr.64  d15, [r4, #16*8]
+    vstr.64  d20, [r4, #17*8]
+    vstr.64  d25, [r4, #18*8]
+    vstr.64  d30, [r4, #19*8]
+
+    vstr.64  d4,  [r4, #20*8]
+    vstr.64  d16, [r4, #21*8]
+    vstr.64  d21, [r4, #22*8]
+    vstr.64  d26, [r4, #23*8]
+    vstr.64  d31, [r4, #24*8]
+
+    mov   r0, #120
+    vpop  {q4-q7}
+.Lout:
+    pop   {r4-r5, ip, pc}
+.p2align 2
+.ltorg
+.size _gcry_keccak_absorb_lanes64_armv7_neon,.-_gcry_keccak_absorb_lanes64_armv7_neon;
+
+#endif
diff --git a/cipher/keccak.c b/cipher/keccak.c
index ce57860..0bb3155 100644
--- a/cipher/keccak.c
+++ b/cipher/keccak.c
@@ -59,7 +59,19 @@
 #endif
 
 
-#ifdef USE_64BIT
+/* USE_64BIT_ARM_NEON indicates whether to enable 64-bit ARM/NEON assembly
+ * code. */
+#undef USE_64BIT_ARM_NEON
+#ifdef ENABLE_NEON_SUPPORT
+# if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \
+     && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \
+     && defined(HAVE_GCC_INLINE_ASM_NEON)
+#  define USE_64BIT_ARM_NEON 1
+# endif
+#endif /*ENABLE_NEON_SUPPORT*/
+
+
+#if defined(USE_64BIT) || defined(USE_64BIT_ARM_NEON)
 # define NEED_COMMON64 1
 #endif
 
@@ -109,7 +121,7 @@ typedef struct KECCAK_CONTEXT_S
 
 #ifdef NEED_COMMON64
 
-static const u64 round_consts_64bit[24] =
+const u64 _gcry_keccak_round_consts_64bit[24 + 1] =
 {
   U64_C(0x0000000000000001), U64_C(0x0000000000008082),
   U64_C(0x800000000000808A), U64_C(0x8000000080008000),
@@ -122,7 +134,8 @@ static const u64 round_consts_64bit[24] =
   U64_C(0x8000000000008002), U64_C(0x8000000000000080),
   U64_C(0x000000000000800A), U64_C(0x800000008000000A),
   U64_C(0x8000000080008081), U64_C(0x8000000000008080),
-  U64_C(0x0000000080000001), U64_C(0x8000000080008008)
+  U64_C(0x0000000080000001), U64_C(0x8000000080008008),
+  U64_C(0xFFFFFFFFFFFFFFFF)
 };
 
 static unsigned int
@@ -400,6 +413,54 @@ static const keccak_ops_t keccak_bmi2_64_ops =
 #endif /* USE_64BIT_BMI2 */
 
 
+/* 64-bit ARMv7/NEON implementation. */
+#ifdef USE_64BIT_ARM_NEON
+
+unsigned int _gcry_keccak_permute_armv7_neon(u64 *state);
+unsigned int _gcry_keccak_absorb_lanes64_armv7_neon(u64 *state, int pos,
+						    const byte *lanes,
+						    unsigned int nlanes,
+						    int blocklanes);
+
+static unsigned int keccak_permute64_armv7_neon(KECCAK_STATE *hd)
+{
+  return _gcry_keccak_permute_armv7_neon(hd->u.state64);
+}
+
+static unsigned int
+keccak_absorb_lanes64_armv7_neon(KECCAK_STATE *hd, int pos, const byte *lanes,
+				 unsigned int nlanes, int blocklanes)
+{
+  if (blocklanes < 0)
+    {
+      /* blocklanes == -1, permutationless absorb from keccak_final:
+       * XOR each input lane into its own consecutive state lane. */
+      while (nlanes)
+	{
+	  hd->u.state64[pos++] ^= buf_get_le64(lanes);
+	  lanes += 8;
+	  nlanes--;
+	}
+
+      return 0;
+    }
+  else
+    {
+      return _gcry_keccak_absorb_lanes64_armv7_neon(hd->u.state64, pos, lanes,
+						    nlanes, blocklanes);
+    }
+}
+
+static const keccak_ops_t keccak_armv7_neon_64_ops =
+{
+  .permute = keccak_permute64_armv7_neon,
+  .absorb = keccak_absorb_lanes64_armv7_neon,
+  .extract = keccak_extract64,
+};
+
+#endif /* USE_64BIT_ARM_NEON */
+
+
 /* Construct generic 32-bit implementation. */
 #ifdef USE_32BIT
 
@@ -662,6 +723,10 @@ keccak_init (int algo, void *context, unsigned int flags)
 
   /* Select optimized implementation based in hw features. */
   if (0) {}
+#ifdef USE_64BIT_ARM_NEON
+  else if (features & HWF_ARM_NEON)
+    ctx->ops = &keccak_armv7_neon_64_ops;
+#endif
 #ifdef USE_64BIT_BMI2
   else if (features & HWF_INTEL_BMI2)
     ctx->ops = &keccak_bmi2_64_ops;
diff --git a/cipher/keccak_permute_64.h b/cipher/keccak_permute_64.h
index 6f24217..1a80192 100644
--- a/cipher/keccak_permute_64.h
+++ b/cipher/keccak_permute_64.h
@@ -25,7 +25,7 @@
 static unsigned int
 KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd)
 {
-  const u64 *round_consts = round_consts_64bit;
+  const u64 *round_consts = _gcry_keccak_round_consts_64bit;
   u64 Aba, Abe, Abi, Abo, Abu;
   u64 Aga, Age, Agi, Ago, Agu;
   u64 Aka, Ake, Aki, Ako, Aku;
diff --git a/configure.ac b/configure.ac
index 2acfa36..ed37ab5 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2108,7 +2108,7 @@ if test "$found" = "1" ; then
 
    if test x"$neonsupport" = xyes ; then
      # Build with the NEON implementation
-     :
+     GCRYPT_DIGESTS="$GCRYPT_DIGESTS keccak-armv7-neon.lo"
    fi
 fi
 

From jussi.kivilinna at iki.fi  Sun Nov  1 20:06:14 2015
From: jussi.kivilinna at iki.fi (Jussi Kivilinna)
Date: Sun, 01 Nov 2015 21:06:14 +0200
Subject: [PATCH 2/4] Optimize Keccak 64-bit absorb functions
In-Reply-To: <20151101190609.1607.2281.stgit@localhost6.localdomain6>
References: <20151101190609.1607.2281.stgit@localhost6.localdomain6>
Message-ID: <20151101190614.1607.39351.stgit@localhost6.localdomain6>

* cipher/keccak.c [USE_64BIT] [__x86_64__] (absorb_lanes64_8)
(absorb_lanes64_4, absorb_lanes64_2, absorb_lanes64_1): New.
* cipher/keccak.c [USE_64BIT] [!__x86_64__] (absorb_lanes64_8)
(absorb_lanes64_4, absorb_lanes64_2, absorb_lanes64_1): New.
[USE_64BIT] (KECCAK_F1600_ABSORB_FUNC_NAME): New.
[USE_64BIT] (keccak_absorb_lanes64): Remove.
[USE_64BIT_SHLD] (KECCAK_F1600_ABSORB_FUNC_NAME): New.
[USE_64BIT_SHLD] (keccak_absorb_lanes64_shld): Remove.
[USE_64BIT_BMI2] (KECCAK_F1600_ABSORB_FUNC_NAME): New.
[USE_64BIT_BMI2] (keccak_absorb_lanes64_bmi2): Remove.
* cipher/keccak_permute_64.h (KECCAK_F1600_ABSORB_FUNC_NAME): New.
--

Optimize 64-bit absorb functions for small speed-up. After this
change, 64-bit BMI2 implementation matches speed of fastest results
from SUPERCOP for Intel Haswell CPUs (long messages).

Benchmark on Intel Haswell @ 3.2 GHz:

Before:
                |  nanosecs/byte   mebibytes/sec   cycles/byte
 SHAKE128       |      2.32 ns/B     411.7 MiB/s      7.41 c/B
 SHAKE256       |      2.84 ns/B     336.2 MiB/s      9.08 c/B
 SHA3-224       |      2.69 ns/B     354.9 MiB/s      8.60 c/B
 SHA3-256       |      2.84 ns/B     336.0 MiB/s      9.08 c/B
 SHA3-384       |      3.69 ns/B     258.4 MiB/s     11.81 c/B
 SHA3-512       |      5.30 ns/B     179.9 MiB/s     16.97 c/B

After:
                |  nanosecs/byte   mebibytes/sec   cycles/byte
 SHAKE128       |      2.27 ns/B     420.6 MiB/s      7.26 c/B
 SHAKE256       |      2.79 ns/B     341.4 MiB/s      8.94 c/B
 SHA3-224       |      2.64 ns/B     361.7 MiB/s      8.44 c/B
 SHA3-256       |      2.79 ns/B     341.5 MiB/s      8.94 c/B
 SHA3-384       |      3.65 ns/B     261.4 MiB/s     11.68 c/B
 SHA3-512       |      5.27 ns/B     181.0 MiB/s     16.87 c/B

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 cipher/keccak.c            |  159 ++++++++++++++++++++++++++------------------
 cipher/keccak_permute_64.h |   99 +++++++++++++++++++++++++++
 2 files changed, 192 insertions(+), 66 deletions(-)

diff --git a/cipher/keccak.c b/cipher/keccak.c
index f4f0ef3..ce57860 100644
--- a/cipher/keccak.c
+++ b/cipher/keccak.c
@@ -223,38 +223,105 @@ keccak_absorb_lane32bi(u32 *lane, u32 x0, u32 x1)
 /* Construct generic 64-bit implementation. */
 #ifdef USE_64BIT
 
+#if __GNUC__ >= 4 && defined(__x86_64__)
+
+static inline void absorb_lanes64_8(u64 *dst, const byte *in)
+{
+  asm ("movdqu 0*16(%[dst]), %%xmm0\n\t"
+       "movdqu 0*16(%[in]), %%xmm4\n\t"
+       "movdqu 1*16(%[dst]), %%xmm1\n\t"
+       "movdqu 1*16(%[in]), %%xmm5\n\t"
+       "movdqu 2*16(%[dst]), %%xmm2\n\t"
+       "movdqu 3*16(%[dst]), %%xmm3\n\t"
+       "pxor %%xmm4, %%xmm0\n\t"
+       "pxor %%xmm5, %%xmm1\n\t"
+       "movdqu 2*16(%[in]), %%xmm4\n\t"
+       "movdqu 3*16(%[in]), %%xmm5\n\t"
+       "movdqu %%xmm0, 0*16(%[dst])\n\t"
+       "pxor %%xmm4, %%xmm2\n\t"
+       "movdqu %%xmm1, 1*16(%[dst])\n\t"
+       "pxor %%xmm5, %%xmm3\n\t"
+       "movdqu %%xmm2, 2*16(%[dst])\n\t"
+       "movdqu %%xmm3, 3*16(%[dst])\n\t"
+       :
+       : [dst] "r" (dst), [in] "r" (in)
+       : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "memory");
+}
+
+static inline void absorb_lanes64_4(u64 *dst, const byte *in)
+{
+  asm ("movdqu 0*16(%[dst]), %%xmm0\n\t"
+       "movdqu 0*16(%[in]), %%xmm4\n\t"
+       "movdqu 1*16(%[dst]), %%xmm1\n\t"
+       "movdqu 1*16(%[in]), %%xmm5\n\t"
+       "pxor %%xmm4, %%xmm0\n\t"
+       "pxor %%xmm5, %%xmm1\n\t"
+       "movdqu %%xmm0, 0*16(%[dst])\n\t"
+       "movdqu %%xmm1, 1*16(%[dst])\n\t"
+       :
+       : [dst] "r" (dst), [in] "r" (in)
+       : "xmm0", "xmm1", "xmm4", "xmm5", "memory");
+}
+
+static inline void absorb_lanes64_2(u64 *dst, const byte *in)
+{
+  asm ("movdqu 0*16(%[dst]), %%xmm0\n\t"
+       "movdqu 0*16(%[in]), %%xmm4\n\t"
+       "pxor %%xmm4, %%xmm0\n\t"
+       "movdqu %%xmm0, 0*16(%[dst])\n\t"
+       :
+       : [dst] "r" (dst), [in] "r" (in)
+       : "xmm0", "xmm4", "memory");
+}
+
+#else /* __x86_64__ */
+
+static inline void absorb_lanes64_8(u64 *dst, const byte *in)
+{
+  dst[0] ^= buf_get_le64(in + 8 * 0);
+  dst[1] ^= buf_get_le64(in + 8 * 1);
+  dst[2] ^= buf_get_le64(in + 8 * 2);
+  dst[3] ^= buf_get_le64(in + 8 * 3);
+  dst[4] ^= buf_get_le64(in + 8 * 4);
+  dst[5] ^= buf_get_le64(in + 8 * 5);
+  dst[6] ^= buf_get_le64(in + 8 * 6);
+  dst[7] ^= buf_get_le64(in + 8 * 7);
+}
+
+static inline void absorb_lanes64_4(u64 *dst, const byte *in)
+{
+  dst[0] ^= buf_get_le64(in + 8 * 0);
+  dst[1] ^= buf_get_le64(in + 8 * 1);
+  dst[2] ^= buf_get_le64(in + 8 * 2);
+  dst[3] ^= buf_get_le64(in + 8 * 3);
+}
+
+static inline void absorb_lanes64_2(u64 *dst, const byte *in)
+{
+  dst[0] ^= buf_get_le64(in + 8 * 0);
+  dst[1] ^= buf_get_le64(in + 8 * 1);
+}
+
+#endif /* !__x86_64__ */
+
+static inline void absorb_lanes64_1(u64 *dst, const byte *in)
+{
+  dst[0] ^= buf_get_le64(in + 8 * 0);
+}
+
+
 # define ANDN64(x, y) (~(x) & (y))
 # define ROL64(x, n) (((x) << ((unsigned int)n & 63)) | \
 		      ((x) >> ((64 - (unsigned int)(n)) & 63)))
 
 # define KECCAK_F1600_PERMUTE_FUNC_NAME keccak_f1600_state_permute64
+# define KECCAK_F1600_ABSORB_FUNC_NAME keccak_absorb_lanes64
 # include "keccak_permute_64.h"
 
 # undef ANDN64
 # undef ROL64
 # undef KECCAK_F1600_PERMUTE_FUNC_NAME
-
-static unsigned int
-keccak_absorb_lanes64(KECCAK_STATE *hd, int pos, const byte *lanes,
-		      unsigned int nlanes, int blocklanes)
-{
-  unsigned int burn = 0;
-
-  while (nlanes)
-    {
-      hd->u.state64[pos] ^= buf_get_le64(lanes);
-      lanes += 8;
-      nlanes--;
-
-      if (++pos == blocklanes)
-	{
-	  burn = keccak_f1600_state_permute64(hd);
-	  pos = 0;
-	}
-    }
-
-  return burn;
-}
+# undef KECCAK_F1600_ABSORB_FUNC_NAME
 
 static const keccak_ops_t keccak_generic64_ops =
 {
@@ -279,33 +346,13 @@ static const keccak_ops_t keccak_generic64_ops =
 			tmp; })
 
 # define KECCAK_F1600_PERMUTE_FUNC_NAME keccak_f1600_state_permute64_shld
+# define KECCAK_F1600_ABSORB_FUNC_NAME keccak_absorb_lanes64_shld
 # include "keccak_permute_64.h"
 
 # undef ANDN64
 # undef ROL64
 # undef KECCAK_F1600_PERMUTE_FUNC_NAME
-
-static unsigned int
-keccak_absorb_lanes64_shld(KECCAK_STATE *hd, int pos, const byte *lanes,
-			   unsigned int nlanes, int blocklanes)
-{
-  unsigned int burn = 0;
-
-  while (nlanes)
-    {
-      hd->u.state64[pos] ^= buf_get_le64(lanes);
-      lanes += 8;
-      nlanes--;
-
-      if (++pos == blocklanes)
-	{
-	  burn = keccak_f1600_state_permute64_shld(hd);
-	  pos = 0;
-	}
-    }
-
-  return burn;
-}
+# undef KECCAK_F1600_ABSORB_FUNC_NAME
 
 static const keccak_ops_t keccak_shld_64_ops =
 {
@@ -335,33 +382,13 @@ static const keccak_ops_t keccak_shld_64_ops =
 			tmp; })
 
 # define KECCAK_F1600_PERMUTE_FUNC_NAME keccak_f1600_state_permute64_bmi2
+# define KECCAK_F1600_ABSORB_FUNC_NAME keccak_absorb_lanes64_bmi2
 # include "keccak_permute_64.h"
 
 # undef ANDN64
 # undef ROL64
 # undef KECCAK_F1600_PERMUTE_FUNC_NAME
-
-static unsigned int
-keccak_absorb_lanes64_bmi2(KECCAK_STATE *hd, int pos, const byte *lanes,
-			   unsigned int nlanes, int blocklanes)
-{
-  unsigned int burn = 0;
-
-  while (nlanes)
-    {
-      hd->u.state64[pos] ^= buf_get_le64(lanes);
-      lanes += 8;
-      nlanes--;
-
-      if (++pos == blocklanes)
-	{
-	  burn = keccak_f1600_state_permute64_bmi2(hd);
-	  pos = 0;
-	}
-    }
-
-  return burn;
-}
+# undef KECCAK_F1600_ABSORB_FUNC_NAME
 
 static const keccak_ops_t keccak_bmi2_64_ops =
 {
diff --git a/cipher/keccak_permute_64.h b/cipher/keccak_permute_64.h
index 1264f19..6f24217 100644
--- a/cipher/keccak_permute_64.h
+++ b/cipher/keccak_permute_64.h
@@ -288,3 +288,102 @@ KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd)
 
   return sizeof(void *) * 4 + sizeof(u64) * 12 * 5;
 }
+
+static unsigned int
+KECCAK_F1600_ABSORB_FUNC_NAME(KECCAK_STATE *hd, int pos, const byte *lanes,
+			      unsigned int nlanes, int blocklanes)
+{
+  unsigned int burn = 0;
+
+  while (nlanes)
+    {
+      switch (blocklanes)
+	{
+	case 21:
+	  /* SHAKE128 */
+	  while (pos == 0 && nlanes >= 21)
+	    {
+	      absorb_lanes64_8(&hd->u.state64[0], lanes + 8 * 0);
+	      absorb_lanes64_4(&hd->u.state64[8], lanes + 8 * 8);
+	      absorb_lanes64_8(&hd->u.state64[12], lanes + 8 * 12);
+	      absorb_lanes64_1(&hd->u.state64[20], lanes + 8 * 20);
+	      lanes += 8 * 21;
+	      nlanes -= 21;
+
+	      burn = KECCAK_F1600_PERMUTE_FUNC_NAME(hd);
+	    }
+	  break;
+
+	case 18:
+	  /* SHA3-224 */
+	  while (pos == 0 && nlanes >= 18)
+	    {
+	      absorb_lanes64_8(&hd->u.state64[0], lanes + 8 * 0);
+	      absorb_lanes64_2(&hd->u.state64[8], lanes + 8 * 8);
+	      absorb_lanes64_8(&hd->u.state64[10], lanes + 8 * 10);
+	      lanes += 8 * 18;
+	      nlanes -= 18;
+
+	      burn = KECCAK_F1600_PERMUTE_FUNC_NAME(hd);
+	    }
+	  break;
+
+	case 17:
+	  /* SHA3-256 & SHAKE256 */
+	  while (pos == 0 && nlanes >= 17)
+	    {
+	      absorb_lanes64_8(&hd->u.state64[0], lanes + 8 * 0);
+	      absorb_lanes64_8(&hd->u.state64[8], lanes + 8 * 8);
+	      absorb_lanes64_1(&hd->u.state64[16], lanes + 8 * 16);
+	      lanes += 8 * 17;
+	      nlanes -= 17;
+
+	      burn = KECCAK_F1600_PERMUTE_FUNC_NAME(hd);
+	    }
+	  break;
+
+	case 13:
+	  /* SHA3-384 */
+	  while (pos == 0 && nlanes >= 13)
+	    {
+	      absorb_lanes64_8(&hd->u.state64[0], lanes + 8 * 0);
+	      absorb_lanes64_4(&hd->u.state64[8], lanes + 8 * 8);
+	      absorb_lanes64_1(&hd->u.state64[12], lanes + 8 * 12);
+	      lanes += 8 * 13;
+	      nlanes -= 13;
+
+	      burn = KECCAK_F1600_PERMUTE_FUNC_NAME(hd);
+	    }
+	  break;
+
+	case 9:
+	  /* SHA3-512 */
+	  while (pos == 0 && nlanes >= 9)
+	    {
+	      absorb_lanes64_8(&hd->u.state64[0], lanes + 8 * 0);
+	      absorb_lanes64_1(&hd->u.state64[8], lanes + 8 * 8);
+	      lanes += 8 * 9;
+	      nlanes -= 9;
+
+	      burn = KECCAK_F1600_PERMUTE_FUNC_NAME(hd);
+	    }
+	  break;
+	}
+
+      while (nlanes)
+	{
+	  hd->u.state64[pos] ^= buf_get_le64(lanes);
+	  lanes += 8;
+	  nlanes--;
+
+	  if (++pos == blocklanes)
+	    {
+	      burn = KECCAK_F1600_PERMUTE_FUNC_NAME(hd);
+	      pos = 0;
+	      break;
+	    }
+	}
+    }
+
+  return burn;
+}


From jussi.kivilinna at iki.fi  Sun Nov  1 20:06:25 2015
From: jussi.kivilinna at iki.fi (Jussi Kivilinna)
Date: Sun, 01 Nov 2015 21:06:25 +0200
Subject: [PATCH 4/4] Improve performance of Tiger hash algorithms
In-Reply-To: <20151101190609.1607.2281.stgit@localhost6.localdomain6>
References: <20151101190609.1607.2281.stgit@localhost6.localdomain6>
Message-ID: <20151101190625.1607.56947.stgit@localhost6.localdomain6>

* cipher/tiger.c (tiger_round, pass, key_schedule): Convert functions
to macros.
(transform_blk): Pass variable names instead of pointers to 'pass'.
--

Benchmark results on Intel Haswell @ 3.2 GHz:

Before:
                |  nanosecs/byte   mebibytes/sec   cycles/byte
 TIGER          |      3.25 ns/B     293.5 MiB/s     10.40 c/B

After (1.75x faster):
                |  nanosecs/byte   mebibytes/sec   cycles/byte
 TIGER          |      1.85 ns/B     515.3 MiB/s      5.92 c/B

Benchmark results on Cortex-A8 @ 1008 MHz:

Before:
                |  nanosecs/byte   mebibytes/sec   cycles/byte
 TIGER          |     63.42 ns/B     15.04 MiB/s     63.93 c/B

After (1.26x faster):
                |  nanosecs/byte   mebibytes/sec   cycles/byte
 TIGER          |     49.99 ns/B     19.08 MiB/s     50.39 c/B

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 cipher/tiger.c |  104 ++++++++++++++++++++++----------------------------------
 1 file changed, 40 insertions(+), 64 deletions(-)

diff --git a/cipher/tiger.c b/cipher/tiger.c
index 078133a..516bd44 100644
--- a/cipher/tiger.c
+++ b/cipher/tiger.c
@@ -633,68 +633,44 @@ tiger2_init (void *context, unsigned int flags)
   do_init (context, 2);
 }
 
-static void
-tiger_round( u64 *ra, u64 *rb, u64 *rc, u64 x, int mul )
-{
-  u64 a = *ra;
-  u64 b = *rb;
-  u64 c = *rc;
-
-  c ^= x;
-  a -= (  sbox1[  c        & 0xff ] ^ sbox2[ (c >> 16) & 0xff ]
-        ^ sbox3[ (c >> 32) & 0xff ] ^ sbox4[ (c >> 48) & 0xff ]);
-  b += (  sbox4[ (c >>  8) & 0xff ] ^ sbox3[ (c >> 24) & 0xff ]
-        ^ sbox2[ (c >> 40) & 0xff ] ^ sbox1[ (c >> 56) & 0xff ]);
-  b *= mul;
-
-  *ra = a;
-  *rb = b;
-  *rc = c;
-}
-
-
-static void
-pass( u64 *ra, u64 *rb, u64 *rc, u64 *x, int mul )
-{
-  u64 a = *ra;
-  u64 b = *rb;
-  u64 c = *rc;
-
-  tiger_round( &a, &b, &c, x[0], mul );
-  tiger_round( &b, &c, &a, x[1], mul );
-  tiger_round( &c, &a, &b, x[2], mul );
-  tiger_round( &a, &b, &c, x[3], mul );
-  tiger_round( &b, &c, &a, x[4], mul );
-  tiger_round( &c, &a, &b, x[5], mul );
-  tiger_round( &a, &b, &c, x[6], mul );
-  tiger_round( &b, &c, &a, x[7], mul );
-
-  *ra = a;
-  *rb = b;
-  *rc = c;
-}
-
 
-static void
-key_schedule( u64 *x )
-{
-  x[0] -= x[7] ^ 0xa5a5a5a5a5a5a5a5LL;
-  x[1] ^= x[0];
-  x[2] += x[1];
-  x[3] -= x[2] ^ ((~x[1]) << 19 );
-  x[4] ^= x[3];
-  x[5] += x[4];
-  x[6] -= x[5] ^ ((~x[4]) >> 23 );
-  x[7] ^= x[6];
-  x[0] += x[7];
-  x[1] -= x[0] ^ ((~x[7]) << 19 );
-  x[2] ^= x[1];
-  x[3] += x[2];
-  x[4] -= x[3] ^ ((~x[2]) >> 23 );
-  x[5] ^= x[4];
-  x[6] += x[5];
-  x[7] -= x[6] ^ 0x0123456789abcdefLL;
-}
+#define tiger_round(xa, xb, xc, xx, xmul) { \
+  xc ^= xx; \
+  xa -= (  sbox1[  (xc)        & 0xff ] ^ sbox2[ ((xc) >> 16) & 0xff ] \
+         ^ sbox3[ ((xc) >> 32) & 0xff ] ^ sbox4[ ((xc) >> 48) & 0xff ]); \
+  xb += (  sbox4[ ((xc) >>  8) & 0xff ] ^ sbox3[ ((xc) >> 24) & 0xff ] \
+         ^ sbox2[ ((xc) >> 40) & 0xff ] ^ sbox1[ ((xc) >> 56) & 0xff ]); \
+  xb *= xmul; }
+
+
+#define pass(ya, yb, yc, yx, ymul) { \
+  tiger_round( ya, yb, yc, yx[0], ymul ); \
+  tiger_round( yb, yc, ya, yx[1], ymul ); \
+  tiger_round( yc, ya, yb, yx[2], ymul ); \
+  tiger_round( ya, yb, yc, yx[3], ymul ); \
+  tiger_round( yb, yc, ya, yx[4], ymul ); \
+  tiger_round( yc, ya, yb, yx[5], ymul ); \
+  tiger_round( ya, yb, yc, yx[6], ymul ); \
+  tiger_round( yb, yc, ya, yx[7], ymul ); }
+
+
+#define key_schedule(x) { \
+  x[0] -= x[7] ^ 0xa5a5a5a5a5a5a5a5LL; \
+  x[1] ^= x[0]; \
+  x[2] += x[1]; \
+  x[3] -= x[2] ^ ((~x[1]) << 19 ); \
+  x[4] ^= x[3]; \
+  x[5] += x[4]; \
+  x[6] -= x[5] ^ ((~x[4]) >> 23 ); \
+  x[7] ^= x[6]; \
+  x[0] += x[7]; \
+  x[1] -= x[0] ^ ((~x[7]) << 19 ); \
+  x[2] ^= x[1]; \
+  x[3] += x[2]; \
+  x[4] -= x[3] ^ ((~x[2]) >> 23 ); \
+  x[5] ^= x[4]; \
+  x[6] += x[5]; \
+  x[7] -= x[6] ^ 0x0123456789abcdefLL; }
 
 
 /****************
@@ -716,11 +692,11 @@ transform_blk ( void *ctx, const unsigned char *data )
   b = bb = hd->b;
   c = cc = hd->c;
 
-  pass( &a, &b, &c, x, 5);
+  pass( a, b, c, x, 5);
   key_schedule( x );
-  pass( &c, &a, &b, x, 7);
+  pass( c, a, b, x, 7);
   key_schedule( x );
-  pass( &b, &c, &a, x, 9);
+  pass( b, c, a, x, 9);
 
   /* feedforward */
   a ^= aa;


From cvs at cvs.gnupg.org  Sun Nov  1 20:01:41 2015
From: cvs at cvs.gnupg.org (by Jussi Kivilinna)
Date: Sun, 01 Nov 2015 20:01:41 +0100
Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.6.0-276-gc0b9eee
Message-ID: <E1Zsxjd-0005qx-BE@lists.gnupg.org>

This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "The GNU crypto library".

The branch, master has been updated
       via  c0b9eee2d93a13930244f9ce0c14ed6b4aeb6c29 (commit)
       via  28de6f9e16e386018e81a9cdaee596be7616ccab (commit)
       via  92ad19873562cfce7bcc4a0b5aed8195d8284cfc (commit)
       via  577dc2b63ceca6a8a716256d034ea4e7414f65fa (commit)
       via  cee2e122ec6c1886957a8d47498eb63a6a921725 (commit)
      from  74184c28fbe7ff58cf57f0094ef957d94045da7d (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
commit c0b9eee2d93a13930244f9ce0c14ed6b4aeb6c29
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date:   Sun Oct 25 20:34:50 2015 +0200

    Keccak: Add SHAKE Extendable-Output Functions
    
    * cipher/hash-common.c (_gcry_hash_selftest_check_one): Add handling for
    XOFs.
    * cipher/keccak.c (keccak_ops_t): Rename 'extract_inplace' to 'extract'
    and add 'pos' argument.
    (KECCAK_CONTEXT): Add 'suffix'.
    (keccak_extract_inplace64): Rename to...
    (keccak_extract64): ...this; Add handling for 'pos' argument.
    (keccak_extract_inplace32bi): Rename to...
    (keccak_extract32bi): ...this; Add handling for 'pos' argument.
    (keccak_extract_inplace64): Rename to...
    (keccak_extract64): ...this; Add handling for 'pos' argument.
    (keccak_extract_inplace32bi_bmi2): Rename to...
    (keccak_extract32bi_bmi2): ...this; Add handling for 'pos' argument.
    (keccak_init): Setup 'suffix'; add SHAKE128 & SHAKE256.
    (shake128_init, shake256_init): New.
    (keccak_final): Do not initial permute for SHAKE output; use correct
    suffix for SHAKE.
    (keccak_extract): New.
    (keccak_selftests_keccak): Add SHAKE128 & SHAKE256 test-vectors.
    (run_selftests): Add SHAKE128 & SHAKE256.
    (shake128_asn, oid_spec_shake128, shake256_asn, oid_spec_shake256)
    (_gcry_digest_spec_shake128, _gcry_digest_spec_shake256): New.
    * cipher/md.c (digest_list): Add SHAKE128 & SHAKE256.
    * doc/gcrypt.texi: Ditto.
    * src/cipher.h (_gcry_digest_spec_shake128)
    (_gcry_digest_spec_shake256): New.
    * src/gcrypt.h.in (GCRY_MD_SHAKE128, GCRY_MD_SHAKE256): New.
    * tests/basic.c (check_one_md): Add XOF check; Add 'elen' argument.
    (check_one_md_multi): Skip if algo is XOF.
    (check_digests): Add SHAKE128 & SHAKE256 test vectors.
    * tests/bench-slope.c (kdf_bench_one): Skip XOFs.
    --
    
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>

diff --git a/cipher/hash-common.c b/cipher/hash-common.c
index 6743f09..a750d644 100644
--- a/cipher/hash-common.c
+++ b/cipher/hash-common.c
@@ -49,8 +49,12 @@ _gcry_hash_selftest_check_one (int algo,
   gcry_error_t err = 0;
   gcry_md_hd_t hd;
   unsigned char *digest;
+  char aaa[1000];
+  int xof = 0;
 
-  if (_gcry_md_get_algo_dlen (algo) != expectlen)
+  if (_gcry_md_get_algo_dlen (algo) == 0)
+    xof = 1;
+  else if (_gcry_md_get_algo_dlen (algo) != expectlen)
     return "digest size does not match expected size";
 
   err = _gcry_md_open (&hd, algo, 0);
@@ -65,7 +69,6 @@ _gcry_hash_selftest_check_one (int algo,
 
     case 1: /* Hash one million times an "a". */
       {
-        char aaa[1000];
         int i;
 
         /* Write in odd size chunks so that we test the buffering.  */
@@ -81,10 +84,23 @@ _gcry_hash_selftest_check_one (int algo,
 
   if (!result)
     {
-      digest = _gcry_md_read (hd, algo);
-
-      if ( memcmp (digest, expect, expectlen) )
-        result = "digest mismatch";
+      if (!xof)
+	{
+	  digest = _gcry_md_read (hd, algo);
+
+	  if ( memcmp (digest, expect, expectlen) )
+	    result = "digest mismatch";
+	}
+      else
+	{
+	  gcry_assert(expectlen <= sizeof(aaa));
+
+	  err = _gcry_md_extract (hd, algo, aaa, expectlen);
+	  if (err)
+	    result = "error extracting output from XOF";
+	  else if ( memcmp (aaa, expect, expectlen) )
+	    result = "digest mismatch";
+	}
     }
 
   _gcry_md_close (hd);
diff --git a/cipher/keccak.c b/cipher/keccak.c
index d46d9cb..f4f0ef3 100644
--- a/cipher/keccak.c
+++ b/cipher/keccak.c
@@ -90,7 +90,8 @@ typedef struct
   unsigned int (*permute)(KECCAK_STATE *hd);
   unsigned int (*absorb)(KECCAK_STATE *hd, int pos, const byte *lanes,
 			 unsigned int nlanes, int blocklanes);
-  unsigned int (*extract_inplace) (KECCAK_STATE *hd, unsigned int outlen);
+  unsigned int (*extract) (KECCAK_STATE *hd, unsigned int pos, byte *outbuf,
+			   unsigned int outlen);
 } keccak_ops_t;
 
 
@@ -100,6 +101,7 @@ typedef struct KECCAK_CONTEXT_S
   unsigned int outlen;
   unsigned int blocksize;
   unsigned int count;
+  unsigned int suffix;
   const keccak_ops_t *ops;
 } KECCAK_CONTEXT;
 
@@ -124,13 +126,18 @@ static const u64 round_consts_64bit[24] =
 };
 
 static unsigned int
-keccak_extract_inplace64(KECCAK_STATE *hd, unsigned int outlen)
+keccak_extract64(KECCAK_STATE *hd, unsigned int pos, byte *outbuf,
+		 unsigned int outlen)
 {
   unsigned int i;
 
-  for (i = 0; i < outlen / 8 + !!(outlen % 8); i++)
+  /* NOTE: when pos == 0, hd and outbuf may point to same memory (SHA-3). */
+
+  for (i = pos; i < pos + outlen / 8 + !!(outlen % 8); i++)
     {
-      hd->u.state64[i] = le_bswap64(hd->u.state64[i]);
+      u64 tmp = hd->u.state64[i];
+      buf_put_le64(outbuf, tmp);
+      outbuf += 8;
     }
 
   return 0;
@@ -158,14 +165,17 @@ static const u32 round_consts_32bit[2 * 24] =
 };
 
 static unsigned int
-keccak_extract_inplace32bi(KECCAK_STATE *hd, unsigned int outlen)
+keccak_extract32bi(KECCAK_STATE *hd, unsigned int pos, byte *outbuf,
+		   unsigned int outlen)
 {
   unsigned int i;
   u32 x0;
   u32 x1;
   u32 t;
 
-  for (i = 0; i < outlen / 8 + !!(outlen % 8); i++)
+  /* NOTE: when pos == 0, hd and outbuf may point to same memory (SHA-3). */
+
+  for (i = pos; i < pos + outlen / 8 + !!(outlen % 8); i++)
     {
       x0 = hd->u.state32bi[i * 2 + 0];
       x1 = hd->u.state32bi[i * 2 + 1];
@@ -182,8 +192,9 @@ keccak_extract_inplace32bi(KECCAK_STATE *hd, unsigned int outlen)
       t = (x1 ^ (x1 >> 2)) & 0x0C0C0C0CUL; x1 = x1 ^ t ^ (t << 2);
       t = (x1 ^ (x1 >> 1)) & 0x22222222UL; x1 = x1 ^ t ^ (t << 1);
 
-      hd->u.state32bi[i * 2 + 0] = le_bswap32(x0);
-      hd->u.state32bi[i * 2 + 1] = le_bswap32(x1);
+      buf_put_le32(&outbuf[0], x0);
+      buf_put_le32(&outbuf[4], x1);
+      outbuf += 8;
     }
 
   return 0;
@@ -249,7 +260,7 @@ static const keccak_ops_t keccak_generic64_ops =
 {
   .permute = keccak_f1600_state_permute64,
   .absorb = keccak_absorb_lanes64,
-  .extract_inplace = keccak_extract_inplace64,
+  .extract = keccak_extract64,
 };
 
 #endif /* USE_64BIT */
@@ -300,7 +311,7 @@ static const keccak_ops_t keccak_shld_64_ops =
 {
   .permute = keccak_f1600_state_permute64_shld,
   .absorb = keccak_absorb_lanes64_shld,
-  .extract_inplace = keccak_extract_inplace64,
+  .extract = keccak_extract64,
 };
 
 #endif /* USE_64BIT_SHLD */
@@ -356,7 +367,7 @@ static const keccak_ops_t keccak_bmi2_64_ops =
 {
   .permute = keccak_f1600_state_permute64_bmi2,
   .absorb = keccak_absorb_lanes64_bmi2,
-  .extract_inplace = keccak_extract_inplace64,
+  .extract = keccak_extract64,
 };
 
 #endif /* USE_64BIT_BMI2 */
@@ -404,7 +415,7 @@ static const keccak_ops_t keccak_generic32bi_ops =
 {
   .permute = keccak_f1600_state_permute32bi,
   .absorb = keccak_absorb_lanes32bi,
-  .extract_inplace = keccak_extract_inplace32bi,
+  .extract = keccak_extract32bi,
 };
 
 #endif /* USE_32BIT */
@@ -483,14 +494,17 @@ keccak_absorb_lanes32bi_bmi2(KECCAK_STATE *hd, int pos, const byte *lanes,
 }
 
 static unsigned int
-keccak_extract_inplace32bi_bmi2(KECCAK_STATE *hd, unsigned int outlen)
+keccak_extract32bi_bmi2(KECCAK_STATE *hd, unsigned int pos, byte *outbuf,
+			unsigned int outlen)
 {
   unsigned int i;
   u32 x0;
   u32 x1;
   u32 t;
 
-  for (i = 0; i < outlen / 8 + !!(outlen % 8); i++)
+  /* NOTE: when pos == 0, hd and outbuf may point to same memory (SHA-3). */
+
+  for (i = pos; i < pos + outlen / 8 + !!(outlen % 8); i++)
     {
       x0 = hd->u.state32bi[i * 2 + 0];
       x1 = hd->u.state32bi[i * 2 + 1];
@@ -502,8 +516,9 @@ keccak_extract_inplace32bi_bmi2(KECCAK_STATE *hd, unsigned int outlen)
       x0 = pdep(pext(x0, 0xffff0001), 0xaaaaaaab) | pdep(x0 >> 1, 0x55555554);
       x1 = pdep(pext(x1, 0xffff0001), 0xaaaaaaab) | pdep(x1 >> 1, 0x55555554);
 
-      hd->u.state32bi[i * 2 + 0] = le_bswap32(x0);
-      hd->u.state32bi[i * 2 + 1] = le_bswap32(x1);
+      buf_put_le32(&outbuf[0], x0);
+      buf_put_le32(&outbuf[4], x1);
+      outbuf += 8;
     }
 
   return 0;
@@ -513,7 +528,7 @@ static const keccak_ops_t keccak_bmi2_32bi_ops =
 {
   .permute = keccak_f1600_state_permute32bi_bmi2,
   .absorb = keccak_absorb_lanes32bi_bmi2,
-  .extract_inplace = keccak_extract_inplace32bi_bmi2,
+  .extract = keccak_extract32bi_bmi2,
 };
 
 #endif /* USE_32BIT */
@@ -638,21 +653,35 @@ keccak_init (int algo, void *context, unsigned int flags)
   switch (algo)
     {
     case GCRY_MD_SHA3_224:
+      ctx->suffix = SHA3_DELIMITED_SUFFIX;
       ctx->blocksize = 1152 / 8;
       ctx->outlen = 224 / 8;
       break;
     case GCRY_MD_SHA3_256:
+      ctx->suffix = SHA3_DELIMITED_SUFFIX;
       ctx->blocksize = 1088 / 8;
       ctx->outlen = 256 / 8;
       break;
     case GCRY_MD_SHA3_384:
+      ctx->suffix = SHA3_DELIMITED_SUFFIX;
       ctx->blocksize = 832 / 8;
       ctx->outlen = 384 / 8;
       break;
     case GCRY_MD_SHA3_512:
+      ctx->suffix = SHA3_DELIMITED_SUFFIX;
       ctx->blocksize = 576 / 8;
       ctx->outlen = 512 / 8;
       break;
+    case GCRY_MD_SHAKE128:
+      ctx->suffix = SHAKE_DELIMITED_SUFFIX;
+      ctx->blocksize = 1344 / 8;
+      ctx->outlen = 0;
+      break;
+    case GCRY_MD_SHAKE256:
+      ctx->suffix = SHAKE_DELIMITED_SUFFIX;
+      ctx->blocksize = 1088 / 8;
+      ctx->outlen = 0;
+      break;
     default:
       BUG();
     }
@@ -682,6 +711,17 @@ sha3_512_init (void *context, unsigned int flags)
   keccak_init (GCRY_MD_SHA3_512, context, flags);
 }
 
+static void
+shake128_init (void *context, unsigned int flags)
+{
+  keccak_init (GCRY_MD_SHAKE128, context, flags);
+}
+
+static void
+shake256_init (void *context, unsigned int flags)
+{
+  keccak_init (GCRY_MD_SHAKE256, context, flags);
+}
 
 /* The routine final terminates the computation and
  * returns the digest.
@@ -696,7 +736,7 @@ keccak_final (void *context)
   KECCAK_CONTEXT *ctx = context;
   KECCAK_STATE *hd = &ctx->state;
   const size_t bsize = ctx->blocksize;
-  const byte suffix = SHA3_DELIMITED_SUFFIX;
+  const byte suffix = ctx->suffix;
   unsigned int nburn, burn = 0;
   unsigned int lastbytes;
   byte lane[8];
@@ -716,21 +756,21 @@ keccak_final (void *context)
   nburn = ctx->ops->absorb(&ctx->state, (bsize - 1) / 8, lane, 1, -1);
   burn = nburn > burn ? nburn : burn;
 
-  /* Switch to the squeezing phase. */
-  nburn = ctx->ops->permute(hd);
-  burn = nburn > burn ? nburn : burn;
-
-  /* Squeeze out all the output blocks */
-  if (ctx->outlen < bsize)
+  if (suffix == SHA3_DELIMITED_SUFFIX)
     {
-      /* Output SHA3 digest. */
-      nburn = ctx->ops->extract_inplace(hd, ctx->outlen);
+      /* Switch to the squeezing phase. */
+      nburn = ctx->ops->permute(hd);
+      burn = nburn > burn ? nburn : burn;
+
+      /* Squeeze out the SHA3 digest. */
+      nburn = ctx->ops->extract(hd, 0, (void *)hd, ctx->outlen);
       burn = nburn > burn ? nburn : burn;
     }
   else
     {
-      /* Output SHAKE digest. */
-      BUG();
+      /* Output for SHAKE can now be read with md_extract(). */
+
+      ctx->count = 0;
     }
 
   wipememory(lane, sizeof(lane));
@@ -748,6 +788,124 @@ keccak_read (void *context)
 }
 
 
+static void
+keccak_extract (void *context, void *out, size_t outlen)
+{
+  KECCAK_CONTEXT *ctx = context;
+  KECCAK_STATE *hd = &ctx->state;
+  const size_t bsize = ctx->blocksize;
+  unsigned int nburn, burn = 0;
+  byte *outbuf = out;
+  unsigned int nlanes;
+  unsigned int nleft;
+  unsigned int count;
+  unsigned int i;
+  byte lane[8];
+
+  count = ctx->count;
+
+  while (count && outlen && (outlen < 8 || count % 8))
+    {
+      /* Extract partial lane. */
+      nburn = ctx->ops->extract(hd, count / 8, lane, 8);
+      burn = nburn > burn ? nburn : burn;
+
+      for (i = count % 8; outlen && i < 8; i++)
+	{
+	  *outbuf++ = lane[i];
+	  outlen--;
+	  count++;
+	}
+
+      gcry_assert(count <= bsize);
+
+      if (count == bsize)
+	count = 0;
+    }
+
+  if (outlen >= 8 && count)
+    {
+      /* Extract tail of partial block. */
+      nlanes = outlen / 8;
+      nleft = (bsize - count) / 8;
+      nlanes = nlanes < nleft ? nlanes : nleft;
+
+      nburn = ctx->ops->extract(hd, count / 8, outbuf, nlanes * 8);
+      burn = nburn > burn ? nburn : burn;
+      outlen -= nlanes * 8;
+      outbuf += nlanes * 8;
+      count += nlanes * 8;
+
+      gcry_assert(count <= bsize);
+
+      if (count == bsize)
+	count = 0;
+    }
+
+  while (outlen >= bsize)
+    {
+      gcry_assert(count == 0);
+
+      /* Squeeze more. */
+      nburn = ctx->ops->permute(hd);
+      burn = nburn > burn ? nburn : burn;
+
+      /* Extract full block. */
+      nburn = ctx->ops->extract(hd, 0, outbuf, bsize);
+      burn = nburn > burn ? nburn : burn;
+
+      outlen -= bsize;
+      outbuf += bsize;
+    }
+
+  if (outlen)
+    {
+      gcry_assert(outlen < bsize);
+
+      if (count == 0)
+	{
+	  /* Squeeze more. */
+	  nburn = ctx->ops->permute(hd);
+	  burn = nburn > burn ? nburn : burn;
+	}
+
+      if (outlen >= 8)
+	{
+	  /* Extract head of partial block. */
+	  nlanes = outlen / 8;
+	  nburn = ctx->ops->extract(hd, count / 8, outbuf, nlanes * 8);
+	  burn = nburn > burn ? nburn : burn;
+	  outlen -= nlanes * 8;
+	  outbuf += nlanes * 8;
+	  count += nlanes * 8;
+
+	  gcry_assert(count < bsize);
+	}
+
+      if (outlen)
+	{
+	  /* Extract head of partial lane. */
+	  nburn = ctx->ops->extract(hd, count / 8, lane, 8);
+	  burn = nburn > burn ? nburn : burn;
+
+	  for (i = count % 8; outlen && i < 8; i++)
+	    {
+	      *outbuf++ = lane[i];
+	      outlen--;
+	      count++;
+	    }
+
+	  gcry_assert(count < bsize);
+	}
+    }
+
+  ctx->count = count;
+
+  if (burn)
+    _gcry_burn_stack (burn);
+}
+
+
 
 /*
      Self-test section.
@@ -829,6 +987,32 @@ selftests_keccak (int algo, int extended, selftest_report_func_t report)
 	"\xa8\xaa\x18\xac\xe8\x28\x2a\x0e\x0d\xb5\x96\xc9\x0b\x0a\x7b\x87";
       hash_len = 64;
       break;
+
+    case GCRY_MD_SHAKE128:
+      short_hash =
+	"\x58\x81\x09\x2d\xd8\x18\xbf\x5c\xf8\xa3\xdd\xb7\x93\xfb\xcb\xa7"
+	"\x40\x97\xd5\xc5\x26\xa6\xd3\x5f\x97\xb8\x33\x51\x94\x0f\x2c\xc8";
+      long_hash =
+	"\x7b\x6d\xf6\xff\x18\x11\x73\xb6\xd7\x89\x8d\x7f\xf6\x3f\xb0\x7b"
+	"\x7c\x23\x7d\xaf\x47\x1a\x5a\xe5\x60\x2a\xdb\xcc\xef\x9c\xcf\x4b";
+      one_million_a_hash =
+	"\x9d\x22\x2c\x79\xc4\xff\x9d\x09\x2c\xf6\xca\x86\x14\x3a\xa4\x11"
+	"\xe3\x69\x97\x38\x08\xef\x97\x09\x32\x55\x82\x6c\x55\x72\xef\x58";
+      hash_len = 32;
+      break;
+
+    case GCRY_MD_SHAKE256:
+      short_hash =
+	"\x48\x33\x66\x60\x13\x60\xa8\x77\x1c\x68\x63\x08\x0c\xc4\x11\x4d"
+	"\x8d\xb4\x45\x30\xf8\xf1\xe1\xee\x4f\x94\xea\x37\xe7\x8b\x57\x39";
+      long_hash =
+	"\x98\xbe\x04\x51\x6c\x04\xcc\x73\x59\x3f\xef\x3e\xd0\x35\x2e\xa9"
+	"\xf6\x44\x39\x42\xd6\x95\x0e\x29\xa3\x72\xa6\x81\xc3\xde\xaf\x45";
+      one_million_a_hash =
+	"\x35\x78\xa7\xa4\xca\x91\x37\x56\x9c\xdf\x76\xed\x61\x7d\x31\xbb"
+	"\x99\x4f\xca\x9c\x1b\xbf\x8b\x18\x40\x13\xde\x82\x34\xdf\xd1\x3a";
+      hash_len = 32;
+      break;
   }
 
   what = "short string";
@@ -876,6 +1060,8 @@ run_selftests (int algo, int extended, selftest_report_func_t report)
     case GCRY_MD_SHA3_256:
     case GCRY_MD_SHA3_384:
     case GCRY_MD_SHA3_512:
+    case GCRY_MD_SHAKE128:
+    case GCRY_MD_SHAKE256:
       ec = selftests_keccak (algo, extended, report);
       break;
     default:
@@ -921,7 +1107,22 @@ static gcry_md_oid_spec_t oid_spec_sha3_512[] =
     { "?" },
     { NULL }
   };
-
+static byte shake128_asn[] = { 0x30 };
+static gcry_md_oid_spec_t oid_spec_shake128[] =
+  {
+    { "2.16.840.1.101.3.4.2.11" },
+    /* PKCS#1 shake128WithRSAEncryption */
+    { "?" },
+    { NULL }
+  };
+static byte shake256_asn[] = { 0x30 };
+static gcry_md_oid_spec_t oid_spec_shake256[] =
+  {
+    { "2.16.840.1.101.3.4.2.12" },
+    /* PKCS#1 shake256WithRSAEncryption */
+    { "?" },
+    { NULL }
+  };
 
 gcry_md_spec_t _gcry_digest_spec_sha3_224 =
   {
@@ -955,3 +1156,19 @@ gcry_md_spec_t _gcry_digest_spec_sha3_512 =
     sizeof (KECCAK_CONTEXT),
     run_selftests
   };
+gcry_md_spec_t _gcry_digest_spec_shake128 =
+  {
+    GCRY_MD_SHAKE128, {0, 1},
+    "SHAKE128", shake128_asn, DIM (shake128_asn), oid_spec_shake128, 0,
+    shake128_init, keccak_write, keccak_final, NULL, keccak_extract,
+    sizeof (KECCAK_CONTEXT),
+    run_selftests
+  };
+gcry_md_spec_t _gcry_digest_spec_shake256 =
+  {
+    GCRY_MD_SHAKE256, {0, 1},
+    "SHAKE256", shake256_asn, DIM (shake256_asn), oid_spec_shake256, 0,
+    shake256_init, keccak_write, keccak_final, NULL, keccak_extract,
+    sizeof (KECCAK_CONTEXT),
+    run_selftests
+  };
diff --git a/cipher/md.c b/cipher/md.c
index 6ef8fee..15d944d 100644
--- a/cipher/md.c
+++ b/cipher/md.c
@@ -56,6 +56,8 @@ static gcry_md_spec_t *digest_list[] =
      &_gcry_digest_spec_sha3_256,
      &_gcry_digest_spec_sha3_384,
      &_gcry_digest_spec_sha3_512,
+     &_gcry_digest_spec_shake128,
+     &_gcry_digest_spec_shake256,
 #endif
 #ifdef USE_GOST_R_3411_94
      &_gcry_digest_spec_gost3411_94,
diff --git a/doc/gcrypt.texi b/doc/gcrypt.texi
index facdf65..cdb7644 100644
--- a/doc/gcrypt.texi
+++ b/doc/gcrypt.texi
@@ -3037,7 +3037,7 @@ are also supported.
 @c begin table of hash algorithms
 @cindex SHA-1
 @cindex SHA-224, SHA-256, SHA-384, SHA-512
-@cindex SHA3-224, SHA3-256, SHA3-384, SHA3-512
+@cindex SHA3-224, SHA3-256, SHA3-384, SHA3-512, SHAKE128, SHAKE256
 @cindex RIPE-MD-160
 @cindex MD2, MD4, MD5
 @cindex TIGER, TIGER1, TIGER2
@@ -3126,6 +3126,16 @@ See FIPS 202 for the specification.
 This is the SHA3-384 algorithm which yields a message digest of 64 bytes.
 See FIPS 202 for the specification.
 
+@item GCRY_MD_SHAKE128
+This is the SHAKE128 extendable-output function (XOF) algorithm with 128 bit
+security strength.
+See FIPS 202 for the specification.
+
+@item GCRY_MD_SHAKE256
+This is the SHAKE256 extendable-output function (XOF) algorithm with 256 bit
+security strength.
+See FIPS 202 for the specification.
+
 @item GCRY_MD_CRC32
 This is the ISO 3309 and ITU-T V.42 cyclic redundancy check.  It yields
 an output of 4 bytes.  Note that this is not a hash algorithm in the
diff --git a/src/cipher.h b/src/cipher.h
index d96fdb9..c4b306a 100644
--- a/src/cipher.h
+++ b/src/cipher.h
@@ -295,6 +295,8 @@ extern gcry_md_spec_t _gcry_digest_spec_sha3_224;
 extern gcry_md_spec_t _gcry_digest_spec_sha3_256;
 extern gcry_md_spec_t _gcry_digest_spec_sha3_512;
 extern gcry_md_spec_t _gcry_digest_spec_sha3_384;
+extern gcry_md_spec_t _gcry_digest_spec_shake128;
+extern gcry_md_spec_t _gcry_digest_spec_shake256;
 extern gcry_md_spec_t _gcry_digest_spec_tiger;
 extern gcry_md_spec_t _gcry_digest_spec_tiger1;
 extern gcry_md_spec_t _gcry_digest_spec_tiger2;
diff --git a/src/gcrypt.h.in b/src/gcrypt.h.in
index 39be37a..5ddeee3 100644
--- a/src/gcrypt.h.in
+++ b/src/gcrypt.h.in
@@ -1174,7 +1174,9 @@ enum gcry_md_algos
     GCRY_MD_SHA3_224      = 312,
     GCRY_MD_SHA3_256      = 313,
     GCRY_MD_SHA3_384      = 314,
-    GCRY_MD_SHA3_512      = 315
+    GCRY_MD_SHA3_512      = 315,
+    GCRY_MD_SHAKE128      = 316,
+    GCRY_MD_SHAKE256      = 317
   };
 
 /* Flags used with the open function.  */
diff --git a/tests/basic.c b/tests/basic.c
index 75ff349..0762a89 100644
--- a/tests/basic.c
+++ b/tests/basic.c
@@ -5265,13 +5265,15 @@ check_cipher_modes(void)
     fprintf (stderr, "Completed Cipher Mode checks.\n");
 }
 
+
 static void
-check_one_md (int algo, const char *data, int len, const char *expect)
+check_one_md (int algo, const char *data, int len, const char *expect, int elen)
 {
   gcry_md_hd_t hd, hd2;
   unsigned char *p;
   int mdlen;
   int i;
+  int xof = 0;
   gcry_error_t err = 0;
 
   err = gcry_md_open (&hd, algo, 0);
@@ -5284,8 +5286,15 @@ check_one_md (int algo, const char *data, int len, const char *expect)
   mdlen = gcry_md_get_algo_dlen (algo);
   if (mdlen < 1 || mdlen > 500)
     {
-      fail ("algo %d, gcry_md_get_algo_dlen failed: %d\n", algo, mdlen);
-      return;
+      if (mdlen == 0 && (algo == GCRY_MD_SHAKE128 || algo == GCRY_MD_SHAKE256))
+        {
+          xof = 1;
+        }
+      else
+        {
+          fail ("algo %d, gcry_md_get_algo_dlen failed: %d\n", algo, mdlen);
+          return;
+        }
     }
 
   if (*data == '!' && !data[1])
@@ -5326,19 +5335,168 @@ check_one_md (int algo, const char *data, int len, const char *expect)
 
   gcry_md_close (hd);
 
-  p = gcry_md_read (hd2, algo);
+  if (!xof)
+    {
+      p = gcry_md_read (hd2, algo);
 
-  if (memcmp (p, expect, mdlen))
+      if (memcmp (p, expect, mdlen))
+        {
+          printf ("computed: ");
+          for (i = 0; i < mdlen; i++)
+            printf ("%02x ", p[i] & 0xFF);
+          printf ("\nexpected: ");
+          for (i = 0; i < mdlen; i++)
+            printf ("%02x ", expect[i] & 0xFF);
+          printf ("\n");
+
+          fail ("algo %d, digest mismatch\n", algo);
+        }
+
+    }
+  else
     {
-      printf ("computed: ");
-      for (i = 0; i < mdlen; i++)
-	printf ("%02x ", p[i] & 0xFF);
-      printf ("\nexpected: ");
-      for (i = 0; i < mdlen; i++)
-	printf ("%02x ", expect[i] & 0xFF);
-      printf ("\n");
+      char buf[1000];
+      int outmax = sizeof(buf) > elen ? elen : sizeof(buf);
 
-      fail ("algo %d, digest mismatch\n", algo);
+      err = gcry_md_copy (&hd, hd2);
+      if (err)
+	{
+	  fail ("algo %d, gcry_md_copy failed: %s\n", algo, gpg_strerror (err));
+	}
+
+      err = gcry_md_extract(hd2, algo, buf, outmax);
+      if (err)
+	{
+	  fail ("algo %d, gcry_md_extract failed: %s\n", algo, gpg_strerror (err));
+	}
+
+      if (memcmp (buf, expect, outmax))
+	{
+	  printf ("computed: ");
+	  for (i = 0; i < outmax; i++)
+	    printf ("%02x ", buf[i] & 0xFF);
+	  printf ("\nexpected: ");
+	  for (i = 0; i < outmax; i++)
+	    printf ("%02x ", expect[i] & 0xFF);
+	  printf ("\n");
+
+	  fail ("algo %d, digest mismatch\n", algo);
+	}
+
+      memset(buf, 0, sizeof(buf));
+
+      /* Extract one byte at a time. */
+      for (i = 0; i < outmax && !err; i++)
+	err = gcry_md_extract(hd, algo, &buf[i], 1);
+      if (err)
+	{
+	  fail ("algo %d, gcry_md_extract failed: %s\n", algo, gpg_strerror (err));
+	}
+
+      if (memcmp (buf, expect, outmax))
+	{
+	  printf ("computed: ");
+	  for (i = 0; i < outmax; i++)
+	    printf ("%02x ", buf[i] & 0xFF);
+	  printf ("\nexpected: ");
+	  for (i = 0; i < outmax; i++)
+	    printf ("%02x ", expect[i] & 0xFF);
+	  printf ("\n");
+
+	  fail ("algo %d, digest mismatch\n", algo);
+	}
+
+      if (*data == '!' && !data[1])
+	{
+	  int crcalgo = GCRY_MD_RMD160;
+	  gcry_md_hd_t crc1, crc2;
+	  size_t startlen;
+	  size_t piecelen;
+	  size_t left;
+	  const unsigned char *p1, *p2;
+	  int crclen;
+
+	  crclen = gcry_md_get_algo_dlen (crcalgo);
+
+	  err = gcry_md_open (&crc1, crcalgo, 0);
+	  if (err)
+	    {
+	      fail ("algo %d, crcalgo: %d, gcry_md_open failed: %s\n", algo,
+		    crcalgo, gpg_strerror (err));
+	      return;
+	    }
+
+	  err = gcry_md_open (&crc2, crcalgo, 0);
+	  if (err)
+	    {
+	      fail ("algo %d, crcalgo: %d, gcry_md_open failed: %s\n", algo,
+		    crcalgo, gpg_strerror (err));
+	      return;
+	    }
+
+	  /* Extract large chunks, total 1000000 additional bytes. */
+	  for (i = 0; i < 1000; i++)
+	    {
+	      err = gcry_md_extract(hd, algo, buf, 1000);
+	      if (!err)
+		gcry_md_write(crc1, buf, 1000);
+	    }
+	  if (err)
+	    {
+	      fail ("algo %d, gcry_md_extract failed: %s\n", algo,
+		    gpg_strerror (err));
+	    }
+
+	  /* Extract in odd size chunks, total 1000000 additional bytes.  */
+	  left = 1000 * 1000;
+	  startlen = 1;
+	  piecelen = startlen;
+
+	  while (!err && left > 0)
+	    {
+	      if (piecelen > sizeof(buf))
+		piecelen = sizeof(buf);
+	      if (piecelen > left)
+		piecelen = left;
+
+	      err = gcry_md_extract (hd2, algo, buf, piecelen);
+	      if (!err)
+		gcry_md_write(crc2, buf, piecelen);
+	      if (err)
+		{
+		  fail ("algo %d, gcry_md_extract failed: %s\n", algo,
+			gpg_strerror (err));
+		}
+
+	      left -= piecelen;
+
+	      if (piecelen == sizeof(buf))
+		piecelen = ++startlen;
+	      else
+		piecelen = piecelen * 2 - ((piecelen != startlen) ? startlen : 0);
+	    }
+
+	  p1 = gcry_md_read (crc1, crcalgo);
+	  p2 = gcry_md_read (crc2, crcalgo);
+
+	  if (memcmp (p1, p2, crclen))
+	    {
+	      printf ("computed: ");
+	      for (i = 0; i < crclen; i++)
+		printf ("%02x ", p2[i] & 0xFF);
+	      printf ("\nexpected: ");
+	      for (i = 0; i < crclen; i++)
+		printf ("%02x ", p1[i] & 0xFF);
+	      printf ("\n");
+
+	      fail ("algo %d, large xof output mismatch\n", algo);
+	    }
+
+	  gcry_md_close (crc1);
+	  gcry_md_close (crc2);
+	}
+
+      gcry_md_close (hd);
     }
 
   gcry_md_close (hd2);
@@ -5358,6 +5516,9 @@ check_one_md_multi (int algo, const char *data, int len, const char *expect)
   mdlen = gcry_md_get_algo_dlen (algo);
   if (mdlen < 1 || mdlen > 64)
     {
+      if (mdlen == 0 && (algo == GCRY_MD_SHAKE128 || algo == GCRY_MD_SHAKE256))
+        return;
+
       fail ("check_one_md_multi: algo %d, gcry_md_get_algo_dlen failed: %d\n",
             algo, mdlen);
       return;
@@ -5420,6 +5581,7 @@ check_digests (void)
     const char *data;
     const char *expect;
     int datalen;
+    int expectlen;
   } algos[] =
     {
       { GCRY_MD_MD2, "",
@@ -5917,7 +6079,238 @@ check_digests (void)
 #include "./sha3-256.h"
 #include "./sha3-384.h"
 #include "./sha3-512.h"
-      {	0 }
+      { GCRY_MD_SHAKE128,
+	"",
+	"\x7F\x9C\x2B\xA4\xE8\x8F\x82\x7D\x61\x60\x45\x50\x76\x05\x85\x3E"
+	"\xD7\x3B\x80\x93\xF6\xEF\xBC\x88\xEB\x1A\x6E\xAC\xFA\x66\xEF\x26"
+	"\x3C\xB1\xEE\xA9\x88\x00\x4B\x93\x10\x3C\xFB\x0A\xEE\xFD\x2A\x68"
+	"\x6E\x01\xFA\x4A\x58\xE8\xA3\x63\x9C\xA8\xA1\xE3\xF9\xAE\x57\xE2"
+	"\x35\xB8\xCC\x87\x3C\x23\xDC\x62\xB8\xD2\x60\x16\x9A\xFA\x2F\x75"
+	"\xAB\x91\x6A\x58\xD9\x74\x91\x88\x35\xD2\x5E\x6A\x43\x50\x85\xB2"
+	"\xBA\xDF\xD6\xDF\xAA\xC3\x59\xA5\xEF\xBB\x7B\xCC\x4B\x59\xD5\x38"
+	"\xDF\x9A\x04\x30\x2E\x10\xC8\xBC\x1C\xBF\x1A\x0B\x3A\x51\x20\xEA"
+	"\x17\xCD\xA7\xCF\xAD\x76\x5F\x56\x23\x47\x4D\x36\x8C\xCC\xA8\xAF"
+	"\x00\x07\xCD\x9F\x5E\x4C\x84\x9F\x16\x7A\x58\x0B\x14\xAA\xBD\xEF"
+	"\xAE\xE7\xEE\xF4\x7C\xB0\xFC\xA9\x76\x7B\xE1\xFD\xA6\x94\x19\xDF"
+	"\xB9\x27\xE9\xDF\x07\x34\x8B\x19\x66\x91\xAB\xAE\xB5\x80\xB3\x2D"
+	"\xEF\x58\x53\x8B\x8D\x23\xF8\x77\x32\xEA\x63\xB0\x2B\x4F\xA0\xF4"
+	"\x87\x33\x60\xE2\x84\x19\x28\xCD\x60\xDD\x4C\xEE\x8C\xC0\xD4\xC9"
+	"\x22\xA9\x61\x88\xD0\x32\x67\x5C\x8A\xC8\x50\x93\x3C\x7A\xFF\x15"
+	"\x33\xB9\x4C\x83\x4A\xDB\xB6\x9C\x61\x15\xBA\xD4\x69\x2D\x86\x19"
+	"\xF9\x0B\x0C\xDF\x8A\x7B\x9C\x26\x40\x29\xAC\x18\x5B\x70\xB8\x3F"
+	"\x28\x01\xF2\xF4\xB3\xF7\x0C\x59\x3E\xA3\xAE\xEB\x61\x3A\x7F\x1B"
+	"\x1D\xE3\x3F\xD7\x50\x81\xF5\x92\x30\x5F\x2E\x45\x26\xED\xC0\x96"
+	"\x31\xB1\x09\x58\xF4\x64\xD8\x89\xF3\x1B\xA0\x10\x25\x0F\xDA\x7F"
+	"\x13\x68\xEC\x29\x67\xFC\x84\xEF\x2A\xE9\xAF\xF2\x68\xE0\xB1\x70"
+	"\x0A\xFF\xC6\x82\x0B\x52\x3A\x3D\x91\x71\x35\xF2\xDF\xF2\xEE\x06"
+	"\xBF\xE7\x2B\x31\x24\x72\x1D\x4A\x26\xC0\x4E\x53\xA7\x5E\x30\xE7"
+	"\x3A\x7A\x9C\x4A\x95\xD9\x1C\x55\xD4\x95\xE9\xF5\x1D\xD0\xB5\xE9"
+	"\xD8\x3C\x6D\x5E\x8C\xE8\x03\xAA\x62\xB8\xD6\x54\xDB\x53\xD0\x9B"
+	"\x8D\xCF\xF2\x73\xCD\xFE\xB5\x73\xFA\xD8\xBC\xD4\x55\x78\xBE\xC2"
+	"\xE7\x70\xD0\x1E\xFD\xE8\x6E\x72\x1A\x3F\x7C\x6C\xCE\x27\x5D\xAB"
+	"\xE6\xE2\x14\x3F\x1A\xF1\x8D\xA7\xEF\xDD\xC4\xC7\xB7\x0B\x5E\x34"
+	"\x5D\xB9\x3C\xC9\x36\xBE\xA3\x23\x49\x1C\xCB\x38\xA3\x88\xF5\x46"
+	"\xA9\xFF\x00\xDD\x4E\x13\x00\xB9\xB2\x15\x3D\x20\x41\xD2\x05\xB4"
+	"\x43\xE4\x1B\x45\xA6\x53\xF2\xA5\xC4\x49\x2C\x1A\xDD\x54\x45\x12"
+	"\xDD\xA2\x52\x98\x33\x46\x2B\x71\xA4\x1A\x45\xBE\x97\x29\x0B\x6F",
+	0, 512, },
+      { GCRY_MD_SHAKE128,
+	"\x5A\xAB\x62\x75\x6D\x30\x7A\x66\x9D\x14\x6A\xBA\x98\x8D\x90\x74"
+	"\xC5\xA1\x59\xB3\xDE\x85\x15\x1A\x81\x9B\x11\x7C\xA1\xFF\x65\x97"
+	"\xF6\x15\x6E\x80\xFD\xD2\x8C\x9C\x31\x76\x83\x51\x64\xD3\x7D\xA7"
+	"\xDA\x11\xD9\x4E\x09\xAD\xD7\x70\xB6\x8A\x6E\x08\x1C\xD2\x2C\xA0"
+	"\xC0\x04\xBF\xE7\xCD\x28\x3B\xF4\x3A\x58\x8D\xA9\x1F\x50\x9B\x27"
+	"\xA6\x58\x4C\x47\x4A\x4A\x2F\x3E\xE0\xF1\xF5\x64\x47\x37\x92\x40"
+	"\xA5\xAB\x1F\xB7\x7F\xDC\xA4\x9B\x30\x5F\x07\xBA\x86\xB6\x27\x56"
+	"\xFB\x9E\xFB\x4F\xC2\x25\xC8\x68\x45\xF0\x26\xEA\x54\x20\x76\xB9"
+	"\x1A\x0B\xC2\xCD\xD1\x36\xE1\x22\xC6\x59\xBE\x25\x9D\x98\xE5\x84"
+	"\x1D\xF4\xC2\xF6\x03\x30\xD4\xD8\xCD\xEE\x7B\xF1\xA0\xA2\x44\x52"
+	"\x4E\xEC\xC6\x8F\xF2\xAE\xF5\xBF\x00\x69\xC9\xE8\x7A\x11\xC6\xE5"
+	"\x19\xDE\x1A\x40\x62\xA1\x0C\x83\x83\x73\x88\xF7\xEF\x58\x59\x8A"
+	"\x38\x46\xF4\x9D\x49\x96\x82\xB6\x83\xC4\xA0\x62\xB4\x21\x59\x4F"
+	"\xAF\xBC\x13\x83\xC9\x43\xBA\x83\xBD\xEF\x51\x5E\xFC\xF1\x0D",
+	"\xF0\x71\x5D\xE3\x56\x92\xFD\x70\x12\x3D\xC6\x83\x68\xD0\xFE\xEC"
+	"\x06\xA0\xC7\x4C\xF8\xAD\xB0\x5D\xDC\x25\x54\x87\xB1\xA8\xD4\xD1"
+	"\x21\x3E\x9E\xAB\xAF\x41\xF1\x16\x17\x19\xD0\x65\xD7\x94\xB7\x50"
+	"\xF8\x4B\xE3\x2A\x32\x34\xB4\xD5\x36\x46\x0D\x55\x20\x68\x8A\x5A"
+	"\x79\xA1\x7A\x4B\xA8\x98\x7F\xCB\x61\xBF\x7D\xAA\x8B\x54\x7B\xF5"
+	"\xC1\xCE\x36\xB5\x6A\x73\x25\x7D\xBB\xF1\xBA\xBB\x64\xF2\x49\xBD"
+	"\xCE\xB6\x7B\xA1\xC8\x88\x37\x0A\x96\x3D\xFD\x6B\x6A\x2A\xDE\x2C"
+	"\xEF\xD1\x4C\x32\x52\xCB\x37\x58\x52\x0F\x0C\x65\xF4\x52\x46\x82"
+	"\x77\x24\x99\x46\x3A\xE1\xA3\x41\x80\x01\x83\xAA\x60\xEF\xA0\x51"
+	"\x18\xA2\x82\x01\x74\x4F\x7B\xA0\xB0\xA3\x92\x8D\xD7\xC0\x26\x3F"
+	"\xD2\x64\xB7\xCD\x7B\x2E\x2E\x09\xB3\x22\xBF\xCE\xA8\xEE\xD0\x42"
+	"\x75\x79\x5B\xE7\xC0\xF0\x0E\x11\x38\x27\x37\x0D\x05\x1D\x50\x26"
+	"\x95\x80\x30\x00\x05\xAC\x12\x88\xFE\xA6\xCD\x9A\xE9\xF4\xF3\x7C"
+	"\xE0\xF8\xAC\xE8\xBF\x3E\xBE\x1D\x70\x56\x25\x59\x54\xC7\x61\x93"
+	"\x1D\x3C\x42\xED\x62\xF7\xF1\xCE\x1B\x94\x5C\xDE\xCC\x0A\x74\x32"
+	"\x2D\x7F\x64\xD6\x00\x4F\xF2\x16\x84\x14\x93\x07\x28\x8B\x44\x8E"
+	"\x45\x43\x34\x75\xB1\xEA\x13\x14\xB0\x0F\x1F\xC4\x50\x08\x9A\x9D"
+	"\x1F\x77\x10\xC6\xD7\x65\x2E\xCF\x65\x4F\x3B\x48\x7D\x02\x83\xD4"
+	"\xD8\xA2\x8E\xFB\x50\x66\xC4\x25\x0D\x5A\xD6\x98\xE1\x5D\xBA\x88"
+	"\xE9\x25\xE4\xDE\x99\xB6\x9B\xC3\x83\xAC\x80\x45\xB7\xF1\x02\x2A"
+	"\xDD\x39\xD4\x43\x54\x6A\xE0\x92\x4F\x13\xF4\x89\x60\x96\xDF\xDF"
+	"\x37\xCA\x72\x20\x79\x87\xC4\xA7\x70\x5A\x7A\xBE\x72\x4B\x7F\xA1"
+	"\x0C\x90\x9F\x39\x25\x44\x9F\x01\x0D\x61\xE2\x07\xAD\xD9\x52\x19"
+	"\x07\x1A\xCE\xED\xB9\xB9\xDC\xED\x32\xA9\xE1\x23\x56\x1D\x60\x82"
+	"\xD4\x6A\xEF\xAE\x07\xEE\x1B\xD1\x32\x76\x5E\x3E\x51\x3C\x66\x50"
+	"\x1B\x38\x7A\xB2\xEE\x09\xA0\x4A\xE6\x3E\x25\x80\x85\x17\xAF\xEA"
+	"\x3E\x05\x11\x69\xCF\xD2\xFF\xF8\xC5\x85\x8E\x2D\x96\x23\x89\x7C"
+	"\x9E\x85\x17\x5A\xC5\xA8\x63\x94\xCD\x0A\x32\xA0\xA6\x2A\x8F\x5D"
+	"\x6C\xCC\xBF\x49\x3D\xAA\x43\xF7\x83\x62\xBB\xCA\x40\xAD\xF7\x33"
+	"\xF8\x71\xE0\xC0\x09\x98\xD9\xBF\xD6\x88\x06\x56\x66\x6C\xD7\xBE"
+	"\x4F\xE9\x89\x2C\x61\xDC\xD5\xCD\x23\xA5\xE4\x27\x7E\xEE\x8B\x4A"
+	"\xFD\x29\xB6\x9B\xBA\x55\x66\x0A\x21\x71\x12\xFF\x6E\x34\x56\xB1",
+	223, 512, },
+      { GCRY_MD_SHAKE128,
+	"!",
+	"\x9d\x22\x2c\x79\xc4\xff\x9d\x09\x2c\xf6\xca\x86\x14\x3a\xa4\x11"
+	"\xe3\x69\x97\x38\x08\xef\x97\x09\x32\x55\x82\x6c\x55\x72\xef\x58"
+	"\x42\x4c\x4b\x5c\x28\x47\x5f\xfd\xcf\x98\x16\x63\x86\x7f\xec\x63"
+	"\x21\xc1\x26\x2e\x38\x7b\xcc\xf8\xca\x67\x68\x84\xc4\xa9\xd0\xc1"
+	"\x3b\xfa\x68\x69\x76\x3d\x5a\xe4\xbb\xc9\xb3\xcc\xd0\x9d\x1c\xa5"
+	"\xea\x74\x46\x53\x8d\x69\xb3\xfb\x98\xc7\x2b\x59\xa2\xb4\x81\x7d"
+	"\xb5\xea\xdd\x90\x11\xf9\x0f\xa7\x10\x91\x93\x1f\x81\x34\xf4\xf0"
+	"\x0b\x56\x2e\x2f\xe1\x05\x93\x72\x70\x36\x1c\x19\x09\x86\x2a\xd4"
+	"\x50\x46\xe3\x93\x2f\x5d\xd3\x11\xec\x72\xfe\xc5\xf8\xfb\x8f\x60"
+	"\xb4\x5a\x3b\xee\x3f\x85\xbb\xf7\xfc\xed\xc6\xa5\x55\x67\x76\x48"
+	"\xe0\x65\x4b\x38\x19\x41\xa8\x6b\xd3\xe5\x12\x65\x7b\x0d\x57\xa7"
+	"\x99\x1f\xc4\x54\x3f\x89\xd8\x29\x04\x92\x22\x2c\xe4\xa3\x3e\x17"
+	"\x60\x2b\x3b\x99\xc0\x09\xf7\x65\x5f\x87\x53\x5c\xda\xa3\x71\x6f"
+	"\x58\xc4\x7b\x8a\x15\x7a\xd1\x95\xf0\x28\x09\xf2\x75\x00\xb9\x25"
+	"\x49\x79\x31\x1c\x6b\xb4\x15\x96\x8c\xd1\x04\x31\x16\x9a\x27\xd5"
+	"\xa8\xd6\x1e\x13\xa6\xb8\xb7\x7a\xf1\xf8\xb6\xdd\x2e\xef\xde\xa0"
+	"\x40\x78\x96\x80\x49\x0b\x5e\xdc\xb1\xd3\xe5\x38\xa4\x66\xf7\x57"
+	"\xad\x71\x8f\xe1\xfd\x9f\xae\xef\xa4\x72\x46\xad\x5e\x36\x7f\x87"
+	"\xd3\xb4\x85\x0d\x44\x86\xeb\x21\x99\xe9\x4a\x79\x79\xe2\x09\x1a"
+	"\xbc\xdf\x3b\xc1\x33\x79\xc8\x96\xdc\xeb\x79\xa8\xfd\x08\xf1\x10"
+	"\x73\xf3\x3e\x3f\x99\x23\x22\xb3\x12\x02\xde\xe2\x34\x33\x0c\xf3"
+	"\x30\x4a\x58\x8f\x0d\x59\xda\xe4\xe6\x3b\xa2\xac\x3c\xe6\x82\xcc"
+	"\x19\xd4\xe3\x41\x67\x8c\xc3\xa6\x7a\x47\xc1\x13\xb4\xdb\x89\x0f"
+	"\x30\xa9\x2a\xa0\x8a\x1f\x6d\xc8\xfb\x64\x63\xf8\x03\x8c\x2b\x40"
+	"\xb2\x53\x00\x77\xb2\x36\xce\x88\xaf\xcc\xcd\xa0\x8a\xd6\xd7\x5e"
+	"\xee\x18\x99\xb1\x0c\xd8\x00\xc2\xce\x53\x72\xbf\xf2\x2e\xe3\xa3"
+	"\x39\xd4\xb9\xc1\xa2\xf5\xf4\xb8\x20\xf6\x87\xe5\x51\x9b\xd0\x5b"
+	"\x1f\xc5\xda\x0e\xb4\x53\x36\x81\x4f\x48\x13\x2c\x64\x0e\x66\xc3"
+	"\xa0\x2a\x22\xe6\x35\x98\xf9\x4f\x22\xf3\x51\x84\x11\x04\x46\xb6"
+	"\x48\xcf\x84\x74\xf3\x0c\x43\xea\xd5\x83\x09\xfb\x25\x90\x16\x09"
+	"\xe2\x41\x87\xe8\x01\xc8\x09\x56\x1a\x64\x80\x94\x50\xe6\x03\xc4"
+	"\xa8\x03\x95\x25\xc4\x76\xb5\x8e\x32\xce\x2c\x47\xb3\x7d\xa5\x91",
+	0, 512, },
+      { GCRY_MD_SHAKE256,
+	"",
+	"\x46\xB9\xDD\x2B\x0B\xA8\x8D\x13\x23\x3B\x3F\xEB\x74\x3E\xEB\x24"
+	"\x3F\xCD\x52\xEA\x62\xB8\x1B\x82\xB5\x0C\x27\x64\x6E\xD5\x76\x2F"
+	"\xD7\x5D\xC4\xDD\xD8\xC0\xF2\x00\xCB\x05\x01\x9D\x67\xB5\x92\xF6"
+	"\xFC\x82\x1C\x49\x47\x9A\xB4\x86\x40\x29\x2E\xAC\xB3\xB7\xC4\xBE"
+	"\x14\x1E\x96\x61\x6F\xB1\x39\x57\x69\x2C\xC7\xED\xD0\xB4\x5A\xE3"
+	"\xDC\x07\x22\x3C\x8E\x92\x93\x7B\xEF\x84\xBC\x0E\xAB\x86\x28\x53"
+	"\x34\x9E\xC7\x55\x46\xF5\x8F\xB7\xC2\x77\x5C\x38\x46\x2C\x50\x10"
+	"\xD8\x46\xC1\x85\xC1\x51\x11\xE5\x95\x52\x2A\x6B\xCD\x16\xCF\x86"
+	"\xF3\xD1\x22\x10\x9E\x3B\x1F\xDD\x94\x3B\x6A\xEC\x46\x8A\x2D\x62"
+	"\x1A\x7C\x06\xC6\xA9\x57\xC6\x2B\x54\xDA\xFC\x3B\xE8\x75\x67\xD6"
+	"\x77\x23\x13\x95\xF6\x14\x72\x93\xB6\x8C\xEA\xB7\xA9\xE0\xC5\x8D"
+	"\x86\x4E\x8E\xFD\xE4\xE1\xB9\xA4\x6C\xBE\x85\x47\x13\x67\x2F\x5C"
+	"\xAA\xAE\x31\x4E\xD9\x08\x3D\xAB\x4B\x09\x9F\x8E\x30\x0F\x01\xB8"
+	"\x65\x0F\x1F\x4B\x1D\x8F\xCF\x3F\x3C\xB5\x3F\xB8\xE9\xEB\x2E\xA2"
+	"\x03\xBD\xC9\x70\xF5\x0A\xE5\x54\x28\xA9\x1F\x7F\x53\xAC\x26\x6B"
+	"\x28\x41\x9C\x37\x78\xA1\x5F\xD2\x48\xD3\x39\xED\xE7\x85\xFB\x7F"
+	"\x5A\x1A\xAA\x96\xD3\x13\xEA\xCC\x89\x09\x36\xC1\x73\xCD\xCD\x0F"
+	"\xAB\x88\x2C\x45\x75\x5F\xEB\x3A\xED\x96\xD4\x77\xFF\x96\x39\x0B"
+	"\xF9\xA6\x6D\x13\x68\xB2\x08\xE2\x1F\x7C\x10\xD0\x4A\x3D\xBD\x4E"
+	"\x36\x06\x33\xE5\xDB\x4B\x60\x26\x01\xC1\x4C\xEA\x73\x7D\xB3\xDC"
+	"\xF7\x22\x63\x2C\xC7\x78\x51\xCB\xDD\xE2\xAA\xF0\xA3\x3A\x07\xB3"
+	"\x73\x44\x5D\xF4\x90\xCC\x8F\xC1\xE4\x16\x0F\xF1\x18\x37\x8F\x11"
+	"\xF0\x47\x7D\xE0\x55\xA8\x1A\x9E\xDA\x57\xA4\xA2\xCF\xB0\xC8\x39"
+	"\x29\xD3\x10\x91\x2F\x72\x9E\xC6\xCF\xA3\x6C\x6A\xC6\xA7\x58\x37"
+	"\x14\x30\x45\xD7\x91\xCC\x85\xEF\xF5\xB2\x19\x32\xF2\x38\x61\xBC"
+	"\xF2\x3A\x52\xB5\xDA\x67\xEA\xF7\xBA\xAE\x0F\x5F\xB1\x36\x9D\xB7"
+	"\x8F\x3A\xC4\x5F\x8C\x4A\xC5\x67\x1D\x85\x73\x5C\xDD\xDB\x09\xD2"
+	"\xB1\xE3\x4A\x1F\xC0\x66\xFF\x4A\x16\x2C\xB2\x63\xD6\x54\x12\x74"
+	"\xAE\x2F\xCC\x86\x5F\x61\x8A\xBE\x27\xC1\x24\xCD\x8B\x07\x4C\xCD"
+	"\x51\x63\x01\xB9\x18\x75\x82\x4D\x09\x95\x8F\x34\x1E\xF2\x74\xBD"
+	"\xAB\x0B\xAE\x31\x63\x39\x89\x43\x04\xE3\x58\x77\xB0\xC2\x8A\x9B"
+	"\x1F\xD1\x66\xC7\x96\xB9\xCC\x25\x8A\x06\x4A\x8F\x57\xE2\x7F\x2A",
+	0, 512, },
+      { GCRY_MD_SHAKE256,
+	"\xB3\x2D\x95\xB0\xB9\xAA\xD2\xA8\x81\x6D\xE6\xD0\x6D\x1F\x86\x00"
+	"\x85\x05\xBD\x8C\x14\x12\x4F\x6E\x9A\x16\x3B\x5A\x2A\xDE\x55\xF8"
+	"\x35\xD0\xEC\x38\x80\xEF\x50\x70\x0D\x3B\x25\xE4\x2C\xC0\xAF\x05"
+	"\x0C\xCD\x1B\xE5\xE5\x55\xB2\x30\x87\xE0\x4D\x7B\xF9\x81\x36\x22"
+	"\x78\x0C\x73\x13\xA1\x95\x4F\x87\x40\xB6\xEE\x2D\x3F\x71\xF7\x68"
+	"\xDD\x41\x7F\x52\x04\x82\xBD\x3A\x08\xD4\xF2\x22\xB4\xEE\x9D\xBD"
+	"\x01\x54\x47\xB3\x35\x07\xDD\x50\xF3\xAB\x42\x47\xC5\xDE\x9A\x8A"
+	"\xBD\x62\xA8\xDE\xCE\xA0\x1E\x3B\x87\xC8\xB9\x27\xF5\xB0\x8B\xEB"
+	"\x37\x67\x4C\x6F\x8E\x38\x0C\x04",
+	"\xCC\x2E\xAA\x04\xEE\xF8\x47\x9C\xDA\xE8\x56\x6E\xB8\xFF\xA1\x10"
+	"\x0A\x40\x79\x95\xBF\x99\x9A\xE9\x7E\xDE\x52\x66\x81\xDC\x34\x90"
+	"\x61\x6F\x28\x44\x2D\x20\xDA\x92\x12\x4C\xE0\x81\x58\x8B\x81\x49"
+	"\x1A\xED\xF6\x5C\xAA\xF0\xD2\x7E\x82\xA4\xB0\xE1\xD1\xCA\xB2\x38"
+	"\x33\x32\x8F\x1B\x8D\xA4\x30\xC8\xA0\x87\x66\xA8\x63\x70\xFA\x84"
+	"\x8A\x79\xB5\x99\x8D\xB3\xCF\xFD\x05\x7B\x96\xE1\xE2\xEE\x0E\xF2"
+	"\x29\xEC\xA1\x33\xC1\x55\x48\xF9\x83\x99\x02\x04\x37\x30\xE4\x4B"
+	"\xC5\x2C\x39\xFA\xDC\x1D\xDE\xEA\xD9\x5F\x99\x39\xF2\x20\xCA\x30"
+	"\x06\x61\x54\x0D\xF7\xED\xD9\xAF\x37\x8A\x5D\x4A\x19\xB2\xB9\x3E"
+	"\x6C\x78\xF4\x9C\x35\x33\x43\xA0\xB5\xF1\x19\x13\x2B\x53\x12\xD0"
+	"\x04\x83\x1D\x01\x76\x9A\x31\x6D\x2F\x51\xBF\x64\xCC\xB2\x0A\x21"
+	"\xC2\xCF\x7A\xC8\xFB\x6F\x6E\x90\x70\x61\x26\xBD\xAE\x06\x11\xDD"
+	"\x13\x96\x2E\x8B\x53\xD6\xEA\xE2\x6C\x7B\x0D\x25\x51\xDA\xF6\x24"
+	"\x8E\x9D\x65\x81\x73\x82\xB0\x4D\x23\x39\x2D\x10\x8E\x4D\x34\x43"
+	"\xDE\x5A\xDC\x72\x73\xC7\x21\xA8\xF8\x32\x0E\xCF\xE8\x17\x7A\xC0"
+	"\x67\xCA\x8A\x50\x16\x9A\x6E\x73\x00\x0E\xBC\xDC\x1E\x4E\xE6\x33"
+	"\x9F\xC8\x67\xC3\xD7\xAE\xAB\x84\x14\x63\x98\xD7\xBA\xDE\x12\x1D"
+	"\x19\x89\xFA\x45\x73\x35\x56\x4E\x97\x57\x70\xA3\xA0\x02\x59\xCA"
+	"\x08\x70\x61\x08\x26\x1A\xA2\xD3\x4D\xE0\x0F\x8C\xAC\x7D\x45\xD3"
+	"\x5E\x5A\xA6\x3E\xA6\x9E\x1D\x1A\x2F\x7D\xAB\x39\x00\xD5\x1E\x0B"
+	"\xC6\x53\x48\xA2\x55\x54\x00\x70\x39\xA5\x2C\x3C\x30\x99\x80\xD1"
+	"\x7C\xAD\x20\xF1\x15\x63\x10\xA3\x9C\xD3\x93\x76\x0C\xFE\x58\xF6"
+	"\xF8\xAD\xE4\x21\x31\x28\x82\x80\xA3\x5E\x1D\xB8\x70\x81\x83\xB9"
+	"\x1C\xFA\xF5\x82\x7E\x96\xB0\xF7\x74\xC4\x50\x93\xB4\x17\xAF\xF9"
+	"\xDD\x64\x17\xE5\x99\x64\xA0\x1B\xD2\xA6\x12\xFF\xCF\xBA\x18\xA0"
+	"\xF1\x93\xDB\x29\x7B\x9A\x6C\xC1\xD2\x70\xD9\x7A\xAE\x8F\x8A\x3A"
+	"\x6B\x26\x69\x5A\xB6\x64\x31\xC2\x02\xE1\x39\xD6\x3D\xD3\xA2\x47"
+	"\x78\x67\x6C\xEF\xE3\xE2\x1B\x02\xEC\x4E\x8F\x5C\xFD\x66\x58\x7A"
+	"\x12\xB4\x40\x78\xFC\xD3\x9E\xEE\x44\xBB\xEF\x4A\x94\x9A\x63\xC0"
+	"\xDF\xD5\x8C\xF2\xFB\x2C\xD5\xF0\x02\xE2\xB0\x21\x92\x66\xCF\xC0"
+	"\x31\x81\x74\x86\xDE\x70\xB4\x28\x5A\x8A\x70\xF3\xD3\x8A\x61\xD3"
+	"\x15\x5D\x99\xAA\xF4\xC2\x53\x90\xD7\x36\x45\xAB\x3E\x8D\x80\xF0",
+	136, 512, },
+      { GCRY_MD_SHAKE256,
+	"!",
+	"\x35\x78\xa7\xa4\xca\x91\x37\x56\x9c\xdf\x76\xed\x61\x7d\x31\xbb"
+	"\x99\x4f\xca\x9c\x1b\xbf\x8b\x18\x40\x13\xde\x82\x34\xdf\xd1\x3a"
+	"\x3f\xd1\x24\xd4\xdf\x76\xc0\xa5\x39\xee\x7d\xd2\xf6\xe1\xec\x34"
+	"\x61\x24\xc8\x15\xd9\x41\x0e\x14\x5e\xb5\x61\xbc\xd9\x7b\x18\xab"
+	"\x6c\xe8\xd5\x55\x3e\x0e\xab\x3d\x1f\x7d\xfb\x8f\x9d\xee\xfe\x16"
+	"\x84\x7e\x21\x92\xf6\xf6\x1f\xb8\x2f\xb9\x0d\xde\x60\xb1\x90\x63"
+	"\xc5\x6a\x4c\x55\xcd\xd7\xb6\x72\xb7\x5b\xf5\x15\xad\xbf\xe2\x04"
+	"\x90\x3c\x8c\x00\x36\xde\x54\xa2\x99\x9a\x92\x0d\xe9\x0f\x66\xd7"
+	"\xff\x6e\xc8\xe4\xc9\x3d\x24\xae\x34\x6f\xdc\xb3\xa5\xa5\xbd\x57"
+	"\x39\xec\x15\xa6\xed\xdb\x5c\xe5\xb0\x2d\xa5\x30\x39\xfa\xc6\x3e"
+	"\x19\x55\x5f\xaa\x2e\xdd\xc6\x93\xb1\xf0\xc2\xa6\xfc\xbe\x7c\x0a"
+	"\x0a\x09\x1d\x0e\xe7\x00\xd7\x32\x2e\x4b\x0f\xf0\x95\x90\xde\x16"
+	"\x64\x22\xf9\xea\xd5\xda\x4c\x99\x3d\x60\x5f\xe4\xd9\xc6\x34\x84"
+	"\x3a\xa1\x78\xb1\x76\x72\xc6\x56\x8c\x8a\x2e\x62\xab\xeb\xea\x2c"
+	"\x21\xc3\x02\xbd\x36\x6a\xd6\x98\x95\x9e\x1f\x6e\x43\x4a\xf1\x55"
+	"\x56\x8b\x27\x34\xd8\x37\x9f\xcd\x3f\xfe\x64\x89\xba\xff\xa6\xd7"
+	"\x11\x09\x44\x2e\x1b\x34\x4f\x13\x8a\x09\xca\xe3\xe2\xd3\x94\x2e"
+	"\xee\x82\x8f\xc4\x7e\x64\xde\xb5\xe0\x0a\x02\x4a\xe1\xf2\xc0\x77"
+	"\xe6\xb7\xb1\x33\xf6\xc1\xde\x91\x30\x92\xd4\xe8\x29\xec\xd2\xb2"
+	"\xef\x28\xca\x80\x20\x82\x1e\x2b\x8b\xe5\x17\xd9\x3e\xd0\x88\x36"
+	"\xf6\xf0\x66\xcc\x3d\x03\xb6\x25\xd8\x49\x7f\x29\xdb\xc1\xc3\x9e"
+	"\x6f\xe4\x63\x22\x6f\x85\xc1\x28\xa2\xc2\x98\x88\x11\x2e\x06\xa9"
+	"\x9c\x5d\x17\xb2\x5e\x90\x0d\x20\x4f\x39\x72\x31\xcd\xf7\x9c\x31"
+	"\x34\x46\x53\x2d\xad\x07\xf4\xc0\xbd\x9f\xba\x1d\xd4\x13\xd8\xa7"
+	"\xe6\xcb\xc0\xa0\x86\x2c\xc7\x69\x23\x9a\x89\xf9\xdb\x08\x5b\x78"
+	"\xa0\x54\x59\x6a\xd7\x08\x0d\xdf\x96\x01\x9b\x73\x99\xb5\x03\x48"
+	"\x0e\x5a\x65\xa2\x20\x8d\x74\x72\x4c\x98\x7d\x32\x5e\x9b\x0e\x82"
+	"\xfe\xcd\x4f\x27\xf3\x13\x5b\x1d\x9e\x27\xb4\x8e\x69\xdd\x6f\x59"
+	"\x62\xb8\xa6\x3b\x48\x92\x1e\xc8\xee\x53\x86\x9f\x1a\xc1\xc8\x18"
+	"\x23\x87\xee\x0d\x6c\xfe\xf6\x53\xff\x8b\xf6\x05\xf1\x47\x04\xb7"
+	"\x1b\xeb\x65\x53\xf2\x81\xfa\x75\x69\x48\xc4\x38\x49\x4b\x19\xb4"
+	"\xee\x69\xa5\x43\x6b\x22\x2b\xc9\x88\xed\xa4\xac\x60\x00\x24\xc9",
+	0, 512, },
+      { 0 }
     };
   gcry_error_t err;
   int i;
@@ -5950,7 +6343,7 @@ check_digests (void)
       check_one_md (algos[i].md, algos[i].data,
 		    algos[i].datalen > 0 ? algos[i].datalen
 					 : strlen (algos[i].data),
-		    algos[i].expect);
+		    algos[i].expect, algos[i].expectlen);
       check_one_md_multi (algos[i].md, algos[i].data,
 			  algos[i].datalen > 0 ? algos[i].datalen
 					       : strlen (algos[i].data),
diff --git a/tests/bench-slope.c b/tests/bench-slope.c
index 2679556..3a2aa38 100644
--- a/tests/bench-slope.c
+++ b/tests/bench-slope.c
@@ -1651,6 +1651,12 @@ kdf_bench_one (int algo, int subalgo)
       return;
     }
 
+  if (gcry_md_get_algo_dlen (subalgo) == 0)
+    {
+      /* Skip XOFs */
+      return;
+    }
+
   *algo_name = 0;
 
   if (algo == GCRY_KDF_PBKDF2)

commit 28de6f9e16e386018e81a9cdaee596be7616ccab
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date:   Sun Oct 25 18:57:15 2015 +0200

    Few updates to documentation
    
    * doc/gcrypt.texi: Add mention of new 'intel-fast-shld' hw feature
    flag; Add mention of x86 RDRAND support in rndhw.
    --
    
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>

diff --git a/doc/gcrypt.texi b/doc/gcrypt.texi
index 3450bb2..facdf65 100644
--- a/doc/gcrypt.texi
+++ b/doc/gcrypt.texi
@@ -556,6 +556,7 @@ are
 @item padlock-sha
 @item padlock-mmul
 @item intel-cpu
+@item intel-fast-shld
 @item intel-bmi2
 @item intel-ssse3
 @item intel-pclmul
@@ -5610,9 +5611,9 @@ that system and is the only gathering module available for that OS.
 
 @item rndhw
 Extra module to collect additional entropy by utilizing a hardware
-random number generator.  As of now the only supported hardware RNG is
-the Padlock engine of VIA (Centaur) CPUs.  It is not available in FIPS
-mode.
+random number generator.  As of now the supported hardware RNGs are
+the Padlock engine of VIA (Centaur) CPUs and the RDRAND instruction
+of x86 CPUs.  It is not available in FIPS mode.
 
 @end table
 

commit 92ad19873562cfce7bcc4a0b5aed8195d8284cfc
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date:   Sun Oct 25 17:59:33 2015 +0200

    Add HMAC-SHA3 test vectors
    
    * tests/basic.c (check_mac): Add HMAC_SHA3 test vectors.
    --
    
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>

diff --git a/tests/basic.c b/tests/basic.c
index 4ea91a9..75ff349 100644
--- a/tests/basic.c
+++ b/tests/basic.c
@@ -6784,6 +6784,169 @@ check_mac (void)
         "\xde\xbd\x71\xf8\x86\x72\x89\x86\x5d\xf5\xa3\x2d\x20\xcd\xc9\x44"
         "\xb6\x02\x2c\xac\x3c\x49\x82\xb1\x0d\x5e\xeb\x55\xc3\xe4\xde\x15"
         "\x13\x46\x76\xfb\x6d\xe0\x44\x60\x65\xc9\x74\x40\xfa\x8c\x6a\x58" },
+      /* HMAC-SHA3 test vectors from
+       * http://wolfgang-ehrhardt.de/hmac-sha3-testvectors.html */
+      { GCRY_MAC_HMAC_SHA3_224,
+	"Hi There",
+	"\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b"
+	"\x0b\x0b\x0b",
+	"\x3b\x16\x54\x6b\xbc\x7b\xe2\x70\x6a\x03\x1d\xca\xfd\x56\x37\x3d"
+	"\x98\x84\x36\x76\x41\xd8\xc5\x9a\xf3\xc8\x60\xf7" },
+      { GCRY_MAC_HMAC_SHA3_256,
+	"Hi There",
+	"\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b"
+	"\x0b\x0b\x0b",
+	"\xba\x85\x19\x23\x10\xdf\xfa\x96\xe2\xa3\xa4\x0e\x69\x77\x43\x51"
+	"\x14\x0b\xb7\x18\x5e\x12\x02\xcd\xcc\x91\x75\x89\xf9\x5e\x16\xbb" },
+      { GCRY_MAC_HMAC_SHA3_512,
+	"Hi There",
+	"\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b"
+	"\x0b\x0b\x0b",
+	"\xeb\x3f\xbd\x4b\x2e\xaa\xb8\xf5\xc5\x04\xbd\x3a\x41\x46\x5a\xac"
+	"\xec\x15\x77\x0a\x7c\xab\xac\x53\x1e\x48\x2f\x86\x0b\x5e\xc7\xba"
+	"\x47\xcc\xb2\xc6\xf2\xaf\xce\x8f\x88\xd2\x2b\x6d\xc6\x13\x80\xf2"
+	"\x3a\x66\x8f\xd3\x88\x8b\xb8\x05\x37\xc0\xa0\xb8\x64\x07\x68\x9e" },
+      { GCRY_MAC_HMAC_SHA3_224, "what do ya want for nothing?", "Jefe",
+	"\x7f\xdb\x8d\xd8\x8b\xd2\xf6\x0d\x1b\x79\x86\x34\xad\x38\x68\x11"
+	"\xc2\xcf\xc8\x5b\xfa\xf5\xd5\x2b\xba\xce\x5e\x66" },
+      { GCRY_MAC_HMAC_SHA3_256, "what do ya want for nothing?", "Jefe",
+	"\xc7\xd4\x07\x2e\x78\x88\x77\xae\x35\x96\xbb\xb0\xda\x73\xb8\x87"
+	"\xc9\x17\x1f\x93\x09\x5b\x29\x4a\xe8\x57\xfb\xe2\x64\x5e\x1b\xa5" },
+      { GCRY_MAC_HMAC_SHA3_384, "what do ya want for nothing?", "Jefe",
+	"\xf1\x10\x1f\x8c\xbf\x97\x66\xfd\x67\x64\xd2\xed\x61\x90\x3f\x21"
+	"\xca\x9b\x18\xf5\x7c\xf3\xe1\xa2\x3c\xa1\x35\x08\xa9\x32\x43\xce"
+	"\x48\xc0\x45\xdc\x00\x7f\x26\xa2\x1b\x3f\x5e\x0e\x9d\xf4\xc2\x0a" },
+      { GCRY_MAC_HMAC_SHA3_512, "what do ya want for nothing?", "Jefe",
+	"\x5a\x4b\xfe\xab\x61\x66\x42\x7c\x7a\x36\x47\xb7\x47\x29\x2b\x83"
+	"\x84\x53\x7c\xdb\x89\xaf\xb3\xbf\x56\x65\xe4\xc5\xe7\x09\x35\x0b"
+	"\x28\x7b\xae\xc9\x21\xfd\x7c\xa0\xee\x7a\x0c\x31\xd0\x22\xa9\x5e"
+	"\x1f\xc9\x2b\xa9\xd7\x7d\xf8\x83\x96\x02\x75\xbe\xb4\xe6\x20\x24" },
+      { GCRY_MAC_HMAC_SHA3_224,
+	"Test Using Larger Than Block-Size Key - Hash Key First",
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa",
+	"\xb9\x6d\x73\x0c\x14\x8c\x2d\xaa\xd8\x64\x9d\x83\xde\xfa\xa3\x71"
+	"\x97\x38\xd3\x47\x75\x39\x7b\x75\x71\xc3\x85\x15" },
+      { GCRY_MAC_HMAC_SHA3_256,
+	"Test Using Larger Than Block-Size Key - Hash Key First",
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa",
+	"\xa6\x07\x2f\x86\xde\x52\xb3\x8b\xb3\x49\xfe\x84\xcd\x6d\x97\xfb"
+	"\x6a\x37\xc4\xc0\xf6\x2a\xae\x93\x98\x11\x93\xa7\x22\x9d\x34\x67" },
+      { GCRY_MAC_HMAC_SHA3_384,
+	"Test Using Larger Than Block-Size Key - Hash Key First",
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa",
+	"\x71\x3d\xff\x03\x02\xc8\x50\x86\xec\x5a\xd0\x76\x8d\xd6\x5a\x13"
+	"\xdd\xd7\x90\x68\xd8\xd4\xc6\x21\x2b\x71\x2e\x41\x64\x94\x49\x11"
+	"\x14\x80\x23\x00\x44\x18\x5a\x99\x10\x3e\xd8\x20\x04\xdd\xbf\xcc" },
+      { GCRY_MAC_HMAC_SHA3_512,
+	"Test Using Larger Than Block-Size Key - Hash Key First",
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa",
+	"\xb1\x48\x35\xc8\x19\xa2\x90\xef\xb0\x10\xac\xe6\xd8\x56\x8d\xc6"
+	"\xb8\x4d\xe6\x0b\xc4\x9b\x00\x4c\x3b\x13\xed\xa7\x63\x58\x94\x51"
+	"\xe5\xdd\x74\x29\x28\x84\xd1\xbd\xce\x64\xe6\xb9\x19\xdd\x61\xdc"
+	"\x9c\x56\xa2\x82\xa8\x1c\x0b\xd1\x4f\x1f\x36\x5b\x49\xb8\x3a\x5b" },
+      { GCRY_MAC_HMAC_SHA3_224,
+	"This is a test using a larger than block-size key and a larger "
+	"than block-size data. The key needs to be hashed before being "
+	"used by the HMAC algorithm.",
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa",
+	"\xc7\x9c\x9b\x09\x34\x24\xe5\x88\xa9\x87\x8b\xbc\xb0\x89\xe0\x18"
+	"\x27\x00\x96\xe9\xb4\xb1\xa9\xe8\x22\x0c\x86\x6a" },
+      { GCRY_MAC_HMAC_SHA3_256,
+	"This is a test using a larger than block-size key and a larger "
+	"than block-size data. The key needs to be hashed before being "
+	"used by the HMAC algorithm.",
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa",
+	"\xe6\xa3\x6d\x9b\x91\x5f\x86\xa0\x93\xca\xc7\xd1\x10\xe9\xe0\x4c"
+	"\xf1\xd6\x10\x0d\x30\x47\x55\x09\xc2\x47\x5f\x57\x1b\x75\x8b\x5a" },
+      { GCRY_MAC_HMAC_SHA3_384,
+	"This is a test using a larger than block-size key and a larger "
+	"than block-size data. The key needs to be hashed before being "
+	"used by the HMAC algorithm.",
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa",
+	"\xca\xd1\x8a\x8f\xf6\xc4\xcc\x3a\xd4\x87\xb9\x5f\x97\x69\xe9\xb6"
+	"\x1c\x06\x2a\xef\xd6\x95\x25\x69\xe6\xe6\x42\x18\x97\x05\x4c\xfc"
+	"\x70\xb5\xfd\xc6\x60\x5c\x18\x45\x71\x12\xfc\x6a\xaa\xd4\x55\x85" },
+      { GCRY_MAC_HMAC_SHA3_512,
+	"This is a test using a larger than block-size key and a larger "
+	"than block-size data. The key needs to be hashed before being "
+	"used by the HMAC algorithm.",
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+	"\xaa\xaa\xaa",
+	"\xdc\x03\x0e\xe7\x88\x70\x34\xf3\x2c\xf4\x02\xdf\x34\x62\x2f\x31"
+	"\x1f\x3e\x6c\xf0\x48\x60\xc6\xbb\xd7\xfa\x48\x86\x74\x78\x2b\x46"
+	"\x59\xfd\xbd\xf3\xfd\x87\x78\x52\x88\x5c\xfe\x6e\x22\x18\x5f\xe7"
+	"\xb2\xee\x95\x20\x43\x62\x9b\xc9\xd5\xf3\x29\x8a\x41\xd0\x2c\x66" },
       /* CMAC AES and DES test vectors from
          http://web.archive.org/web/20130930212819/http://csrc.nist.gov/publica\
          tions/nistpubs/800-38B/Updated_CMAC_Examples.pdf */

commit 577dc2b63ceca6a8a716256d034ea4e7414f65fa
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date:   Sun Oct 25 14:50:41 2015 +0200

    md: add variable length output interface
    
    * cipher/crc.c (_gcry_digest_spec_crc32)
    (_gcry_digest_spec_crc32_rfc1510, _gcry_digest_spec_crc24_rfc2440): Set
    'extract' NULL.
    * cipher/gostr3411-94.c (_gcry_digest_spec_gost3411_94)
    (_gcry_digest_spec_gost3411_cp): Ditto.
    * cipher/keccak.c (_gcry_digest_spec_sha3_224)
    (_gcry_digest_spec_sha3_256, _gcry_digest_spec_sha3_384)
    (_gcry_digest_spec_sha3_512): Ditto.
    * cipher/md2.c (_gcry_digest_spec_md2): Ditto.
    * cipher/md4.c (_gcry_digest_spec_md4): Ditto.
    * cipher/md5.c (_gcry_digest_spec_md5): Ditto.
    * cipher/rmd160.c (_gcry_digest_spec_rmd160): Ditto.
    * cipher/sha1.c (_gcry_digest_spec_sha1): Ditto.
    * cipher/sha256.c (_gcry_digest_spec_sha224)
    (_gcry_digest_spec_sha256): Ditto.
    * cipher/sha512.c (_gcry_digest_spec_sha384)
    (_gcry_digest_spec_sha512): Ditto.
    * cipher/stribog.c (_gcry_digest_spec_stribog_256)
    (_gcry_digest_spec_stribog_512): Ditto.
    * cipher/tiger.c (_gcry_digest_spec_tiger)
    (_gcry_digest_spec_tiger1, _gcry_digest_spec_tiger2): Ditto.
    * cipher/whirlpool.c (_gcry_digest_spec_whirlpool): Ditto.
    * cipher/md.c (md_enable): Do not allow combination of HMAC and
    'expandable-output function'.
    (md_final): Check if spec->read is NULL before calling.
    (md_read): Ditto.
    (md_extract, _gcry_md_extract): New.
    * doc/gcrypt.texi: Add SHA3 algorithms and gcry_md_extract.
    * src/cipher-proto.h (gcry_md_extract_t): New.
    (gcry_md_spec_t): Add 'extract'.
    * src/gcrypt-int.h (_gcry_md_extract): New.
    * src/gcrypt.h.in (gcry_md_extract): New.
    * src/libgcrypt.def: Add gcry_md_extract.
    * src/libgcrypt.vers: Add gcry_md_extract.
    * src/visibility.c (gcry_md_extract): New.
    * src/visibility.h (gcry_md_extract): New.
    --
    
    Patch adds new interface for reading output from 'expandable-output
    function' MD algorithms that can give variable length output (ie.
    SHAKE algorithms from FIPS-202). New function to read output is
    
     gpg_error_t gcry_md_extract(gcry_md_hd_t md, int algo,
    			     void *buffer, size_t length);
    
    Function implicitly finalizes algorithm so that no new input can
    be given. Subsequent calls of the function return more output
    bytes from the algorithm.
    
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>

diff --git a/cipher/crc.c b/cipher/crc.c
index 9105dfe..46a185a 100644
--- a/cipher/crc.c
+++ b/cipher/crc.c
@@ -785,7 +785,7 @@ gcry_md_spec_t _gcry_digest_spec_crc32 =
   {
     GCRY_MD_CRC32, {0, 1},
     "CRC32", NULL, 0, NULL, 4,
-    crc32_init, crc32_write, crc32_final, crc32_read,
+    crc32_init, crc32_write, crc32_final, crc32_read, NULL,
     sizeof (CRC_CONTEXT)
   };
 
@@ -793,8 +793,7 @@ gcry_md_spec_t _gcry_digest_spec_crc32_rfc1510 =
   {
     GCRY_MD_CRC32_RFC1510, {0, 1},
     "CRC32RFC1510", NULL, 0, NULL, 4,
-    crc32rfc1510_init, crc32_write,
-    crc32rfc1510_final, crc32_read,
+    crc32rfc1510_init, crc32_write, crc32rfc1510_final, crc32_read, NULL,
     sizeof (CRC_CONTEXT)
   };
 
@@ -802,7 +801,6 @@ gcry_md_spec_t _gcry_digest_spec_crc24_rfc2440 =
   {
     GCRY_MD_CRC24_RFC2440, {0, 1},
     "CRC24RFC2440", NULL, 0, NULL, 3,
-    crc24rfc2440_init, crc24rfc2440_write,
-    crc24rfc2440_final, crc32_read,
+    crc24rfc2440_init, crc24rfc2440_write, crc24rfc2440_final, crc32_read, NULL,
     sizeof (CRC_CONTEXT)
   };
diff --git a/cipher/gostr3411-94.c b/cipher/gostr3411-94.c
index 7b16e61..a782427 100644
--- a/cipher/gostr3411-94.c
+++ b/cipher/gostr3411-94.c
@@ -343,13 +343,13 @@ gcry_md_spec_t _gcry_digest_spec_gost3411_94 =
   {
     GCRY_MD_GOSTR3411_94, {0, 0},
     "GOSTR3411_94", NULL, 0, NULL, 32,
-    gost3411_init, _gcry_md_block_write, gost3411_final, gost3411_read,
+    gost3411_init, _gcry_md_block_write, gost3411_final, gost3411_read, NULL,
     sizeof (GOSTR3411_CONTEXT)
   };
 gcry_md_spec_t _gcry_digest_spec_gost3411_cp =
   {
     GCRY_MD_GOSTR3411_CP, {0, 0},
     "GOSTR3411_CP", asn, DIM (asn), oid_spec_gostr3411, 32,
-    gost3411_cp_init, _gcry_md_block_write, gost3411_final, gost3411_read,
+    gost3411_cp_init, _gcry_md_block_write, gost3411_final, gost3411_read, NULL,
     sizeof (GOSTR3411_CONTEXT)
   };
diff --git a/cipher/keccak.c b/cipher/keccak.c
index 3a72294..d46d9cb 100644
--- a/cipher/keccak.c
+++ b/cipher/keccak.c
@@ -927,7 +927,7 @@ gcry_md_spec_t _gcry_digest_spec_sha3_224 =
   {
     GCRY_MD_SHA3_224, {0, 1},
     "SHA3-224", sha3_224_asn, DIM (sha3_224_asn), oid_spec_sha3_224, 28,
-    sha3_224_init, keccak_write, keccak_final, keccak_read,
+    sha3_224_init, keccak_write, keccak_final, keccak_read, NULL,
     sizeof (KECCAK_CONTEXT),
     run_selftests
   };
@@ -935,7 +935,7 @@ gcry_md_spec_t _gcry_digest_spec_sha3_256 =
   {
     GCRY_MD_SHA3_256, {0, 1},
     "SHA3-256", sha3_256_asn, DIM (sha3_256_asn), oid_spec_sha3_256, 32,
-    sha3_256_init, keccak_write, keccak_final, keccak_read,
+    sha3_256_init, keccak_write, keccak_final, keccak_read, NULL,
     sizeof (KECCAK_CONTEXT),
     run_selftests
   };
@@ -943,7 +943,7 @@ gcry_md_spec_t _gcry_digest_spec_sha3_384 =
   {
     GCRY_MD_SHA3_384, {0, 1},
     "SHA3-384", sha3_384_asn, DIM (sha3_384_asn), oid_spec_sha3_384, 48,
-    sha3_384_init, keccak_write, keccak_final, keccak_read,
+    sha3_384_init, keccak_write, keccak_final, keccak_read, NULL,
     sizeof (KECCAK_CONTEXT),
     run_selftests
   };
@@ -951,7 +951,7 @@ gcry_md_spec_t _gcry_digest_spec_sha3_512 =
   {
     GCRY_MD_SHA3_512, {0, 1},
     "SHA3-512", sha3_512_asn, DIM (sha3_512_asn), oid_spec_sha3_512, 64,
-    sha3_512_init, keccak_write, keccak_final, keccak_read,
+    sha3_512_init, keccak_write, keccak_final, keccak_read, NULL,
     sizeof (KECCAK_CONTEXT),
     run_selftests
   };
diff --git a/cipher/md.c b/cipher/md.c
index 948d269..6ef8fee 100644
--- a/cipher/md.c
+++ b/cipher/md.c
@@ -408,6 +408,12 @@ md_enable (gcry_md_hd_t hd, int algorithm)
         }
     }
 
+  if (!err && h->flags.hmac && spec->read == NULL)
+    {
+      /* Expandable output function cannot act as part of HMAC. */
+      err = GPG_ERR_DIGEST_ALGO;
+    }
+
   if (!err)
     {
       size_t size = (sizeof (*entry)
@@ -638,11 +644,16 @@ md_final (gcry_md_hd_t a)
 
   for (r = a->ctx->list; r; r = r->next)
     {
-      byte *p = r->spec->read (&r->context.c);
+      byte *p;
       size_t dlen = r->spec->mdlen;
       byte *hash;
       gcry_err_code_t err;
 
+      if (r->spec->read == NULL)
+        continue;
+
+      p = r->spec->read (&r->context.c);
+
       if (a->ctx->flags.secure)
         hash = xtrymalloc_secure (dlen);
       else
@@ -821,6 +832,8 @@ md_read( gcry_md_hd_t a, int algo )
         {
           if (r->next)
             log_debug ("more than one algorithm in md_read(0)\n");
+          if (r->spec->read == NULL)
+            return NULL;
           return r->spec->read (&r->context.c);
         }
     }
@@ -828,7 +841,11 @@ md_read( gcry_md_hd_t a, int algo )
     {
       for (r = a->ctx->list; r; r = r->next)
 	if (r->spec->algo == algo)
-	  return r->spec->read (&r->context.c);
+	  {
+	    if (r->spec->read == NULL)
+	      return NULL;
+	    return r->spec->read (&r->context.c);
+	  }
     }
   BUG();
   return NULL;
@@ -850,6 +867,52 @@ _gcry_md_read (gcry_md_hd_t hd, int algo)
 }
 
 
+/****************
+ * If ALGO is null get the digest for the used algo (which should be
+ * only one)
+ */
+static gcry_err_code_t
+md_extract(gcry_md_hd_t a, int algo, void *out, size_t outlen)
+{
+  GcryDigestEntry *r = a->ctx->list;
+
+  if (!algo)
+    {
+      /* Return the first algorithm */
+      if (r && r->spec->extract)
+	{
+	  if (r->next)
+	    log_debug ("more than one algorithm in md_extract(0)\n");
+	  r->spec->extract (&r->context.c, out, outlen);
+	  return 0;
+	}
+    }
+  else
+    {
+      for (r = a->ctx->list; r; r = r->next)
+	if (r->spec->algo == algo && r->spec->extract)
+	  {
+	    r->spec->extract (&r->context.c, out, outlen);
+	    return 0;
+	  }
+    }
+
+  return GPG_ERR_DIGEST_ALGO;
+}
+
+
+/*
+ * Expand the output from XOF class digest, this function implicitly finalizes
+ * the hash.
+ */
+gcry_err_code_t
+_gcry_md_extract (gcry_md_hd_t hd, int algo, void *out, size_t outlen)
+{
+  _gcry_md_ctl (hd, GCRYCTL_FINALIZE, NULL, 0);
+  return md_extract (hd, algo, out, outlen);
+}
+
+
 /*
  * Read out an intermediate digest.  Not yet functional.
  */
diff --git a/cipher/md2.c b/cipher/md2.c
index 97682e5..e339b28 100644
--- a/cipher/md2.c
+++ b/cipher/md2.c
@@ -177,6 +177,6 @@ gcry_md_spec_t _gcry_digest_spec_md2 =
   {
     GCRY_MD_MD2, {0, 0},
     "MD2", asn, DIM (asn), oid_spec_md2, 16,
-    md2_init, _gcry_md_block_write, md2_final, md2_read,
+    md2_init, _gcry_md_block_write, md2_final, md2_read, NULL,
     sizeof (MD2_CONTEXT)
   };
diff --git a/cipher/md4.c b/cipher/md4.c
index c9b4154..afa6382 100644
--- a/cipher/md4.c
+++ b/cipher/md4.c
@@ -286,6 +286,6 @@ gcry_md_spec_t _gcry_digest_spec_md4 =
   {
     GCRY_MD_MD4, {0, 0},
     "MD4", asn, DIM (asn), oid_spec_md4,16,
-    md4_init, _gcry_md_block_write, md4_final, md4_read,
+    md4_init, _gcry_md_block_write, md4_final, md4_read, NULL,
     sizeof (MD4_CONTEXT)
   };
diff --git a/cipher/md5.c b/cipher/md5.c
index f17af7a..66cc5f6 100644
--- a/cipher/md5.c
+++ b/cipher/md5.c
@@ -312,6 +312,6 @@ gcry_md_spec_t _gcry_digest_spec_md5 =
   {
     GCRY_MD_MD5, {0, 1},
     "MD5", asn, DIM (asn), oid_spec_md5, 16,
-    md5_init, _gcry_md_block_write, md5_final, md5_read,
+    md5_init, _gcry_md_block_write, md5_final, md5_read, NULL,
     sizeof (MD5_CONTEXT)
   };
diff --git a/cipher/rmd160.c b/cipher/rmd160.c
index 2695db2..cf7531e 100644
--- a/cipher/rmd160.c
+++ b/cipher/rmd160.c
@@ -526,6 +526,6 @@ gcry_md_spec_t _gcry_digest_spec_rmd160 =
   {
     GCRY_MD_RMD160, {0, 0},
     "RIPEMD160", asn, DIM (asn), oid_spec_rmd160, 20,
-    rmd160_init, _gcry_md_block_write, rmd160_final, rmd160_read,
+    rmd160_init, _gcry_md_block_write, rmd160_final, rmd160_read, NULL,
     sizeof (RMD160_CONTEXT)
   };
diff --git a/cipher/sha1.c b/cipher/sha1.c
index 554d55c..0de8412 100644
--- a/cipher/sha1.c
+++ b/cipher/sha1.c
@@ -573,7 +573,7 @@ gcry_md_spec_t _gcry_digest_spec_sha1 =
   {
     GCRY_MD_SHA1, {0, 1},
     "SHA1", asn, DIM (asn), oid_spec_sha1, 20,
-    sha1_init, _gcry_md_block_write, sha1_final, sha1_read,
+    sha1_init, _gcry_md_block_write, sha1_final, sha1_read, NULL,
     sizeof (SHA1_CONTEXT),
     run_selftests
   };
diff --git a/cipher/sha256.c b/cipher/sha256.c
index 63869d5..bc326e0 100644
--- a/cipher/sha256.c
+++ b/cipher/sha256.c
@@ -633,7 +633,7 @@ gcry_md_spec_t _gcry_digest_spec_sha224 =
   {
     GCRY_MD_SHA224, {0, 1},
     "SHA224", asn224, DIM (asn224), oid_spec_sha224, 28,
-    sha224_init, _gcry_md_block_write, sha256_final, sha256_read,
+    sha224_init, _gcry_md_block_write, sha256_final, sha256_read, NULL,
     sizeof (SHA256_CONTEXT),
     run_selftests
   };
@@ -642,7 +642,7 @@ gcry_md_spec_t _gcry_digest_spec_sha256 =
   {
     GCRY_MD_SHA256, {0, 1},
     "SHA256", asn256, DIM (asn256), oid_spec_sha256, 32,
-    sha256_init, _gcry_md_block_write, sha256_final, sha256_read,
+    sha256_init, _gcry_md_block_write, sha256_final, sha256_read, NULL,
     sizeof (SHA256_CONTEXT),
     run_selftests
   };
diff --git a/cipher/sha512.c b/cipher/sha512.c
index 4be1cab..1196db9 100644
--- a/cipher/sha512.c
+++ b/cipher/sha512.c
@@ -877,7 +877,7 @@ gcry_md_spec_t _gcry_digest_spec_sha512 =
   {
     GCRY_MD_SHA512, {0, 1},
     "SHA512", sha512_asn, DIM (sha512_asn), oid_spec_sha512, 64,
-    sha512_init, _gcry_md_block_write, sha512_final, sha512_read,
+    sha512_init, _gcry_md_block_write, sha512_final, sha512_read, NULL,
     sizeof (SHA512_CONTEXT),
     run_selftests
   };
@@ -903,7 +903,7 @@ gcry_md_spec_t _gcry_digest_spec_sha384 =
   {
     GCRY_MD_SHA384, {0, 1},
     "SHA384", sha384_asn, DIM (sha384_asn), oid_spec_sha384, 48,
-    sha384_init, _gcry_md_block_write, sha512_final, sha512_read,
+    sha384_init, _gcry_md_block_write, sha512_final, sha512_read, NULL,
     sizeof (SHA512_CONTEXT),
     run_selftests
   };
diff --git a/cipher/stribog.c b/cipher/stribog.c
index de167a7..7f38e6f 100644
--- a/cipher/stribog.c
+++ b/cipher/stribog.c
@@ -1326,6 +1326,7 @@ gcry_md_spec_t _gcry_digest_spec_stribog_256 =
     GCRY_MD_STRIBOG256, {0, 0},
     "STRIBOG256", NULL, 0, NULL, 32,
     stribog_init_256, _gcry_md_block_write, stribog_final, stribog_read_256,
+    NULL,
     sizeof (STRIBOG_CONTEXT)
   };
 
@@ -1334,5 +1335,6 @@ gcry_md_spec_t _gcry_digest_spec_stribog_512 =
     GCRY_MD_STRIBOG512, {0, 0},
     "STRIBOG512", NULL, 0, NULL, 64,
     stribog_init_512, _gcry_md_block_write, stribog_final, stribog_read_512,
+    NULL,
     sizeof (STRIBOG_CONTEXT)
   };
diff --git a/cipher/tiger.c b/cipher/tiger.c
index 8a08953..078133a 100644
--- a/cipher/tiger.c
+++ b/cipher/tiger.c
@@ -840,7 +840,7 @@ gcry_md_spec_t _gcry_digest_spec_tiger =
   {
     GCRY_MD_TIGER, {0, 0},
     "TIGER192", NULL, 0, NULL, 24,
-    tiger_init, _gcry_md_block_write, tiger_final, tiger_read,
+    tiger_init, _gcry_md_block_write, tiger_final, tiger_read, NULL,
     sizeof (TIGER_CONTEXT)
   };
 
@@ -863,7 +863,7 @@ gcry_md_spec_t _gcry_digest_spec_tiger1 =
   {
     GCRY_MD_TIGER1, {0, 0},
     "TIGER", asn1, DIM (asn1), oid_spec_tiger1, 24,
-    tiger1_init, _gcry_md_block_write, tiger_final, tiger_read,
+    tiger1_init, _gcry_md_block_write, tiger_final, tiger_read, NULL,
     sizeof (TIGER_CONTEXT)
   };
 
@@ -874,7 +874,7 @@ gcry_md_spec_t _gcry_digest_spec_tiger2 =
   {
     GCRY_MD_TIGER2, {0, 0},
     "TIGER2", NULL, 0, NULL, 24,
-    tiger2_init, _gcry_md_block_write, tiger_final, tiger_read,
+    tiger2_init, _gcry_md_block_write, tiger_final, tiger_read, NULL,
     sizeof (TIGER_CONTEXT)
   };
 
diff --git a/cipher/whirlpool.c b/cipher/whirlpool.c
index 5f224a1..8a06939 100644
--- a/cipher/whirlpool.c
+++ b/cipher/whirlpool.c
@@ -1525,6 +1525,6 @@ gcry_md_spec_t _gcry_digest_spec_whirlpool =
   {
     GCRY_MD_WHIRLPOOL, {0, 0},
     "WHIRLPOOL", NULL, 0, NULL, 64,
-    whirlpool_init, whirlpool_write, whirlpool_final, whirlpool_read,
+    whirlpool_init, whirlpool_write, whirlpool_final, whirlpool_read, NULL,
     sizeof (whirlpool_context_t)
   };
diff --git a/doc/gcrypt.texi b/doc/gcrypt.texi
index f13695a..3450bb2 100644
--- a/doc/gcrypt.texi
+++ b/doc/gcrypt.texi
@@ -3036,6 +3036,7 @@ are also supported.
 @c begin table of hash algorithms
 @cindex SHA-1
 @cindex SHA-224, SHA-256, SHA-384, SHA-512
+@cindex SHA3-224, SHA3-256, SHA3-384, SHA3-512
 @cindex RIPE-MD-160
 @cindex MD2, MD4, MD5
 @cindex TIGER, TIGER1, TIGER2
@@ -3108,6 +3109,22 @@ See FIPS 180-2 for the specification.
 This is the SHA-384 algorithm which yields a message digest of 64 bytes.
 See FIPS 180-2 for the specification.
 
+@item GCRY_MD_SHA3_224
+This is the SHA3-224 algorithm which yields a message digest of 28 bytes.
+See FIPS 202 for the specification.
+
+@item GCRY_MD_SHA3_256
+This is the SHA3-256 algorithm which yields a message digest of 32 bytes.
+See FIPS 202 for the specification.
+
+@item GCRY_MD_SHA3_384
+This is the SHA3-384 algorithm which yields a message digest of 48 bytes.
+See FIPS 202 for the specification.
+
+@item GCRY_MD_SHA3_512
+This is the SHA3-512 algorithm which yields a message digest of 64 bytes.
+See FIPS 202 for the specification.
+
 @item GCRY_MD_CRC32
 This is the ISO 3309 and ITU-T V.42 cyclic redundancy check.  It yields
 an output of 4 bytes.  Note that this is not a hash algorithm in the
@@ -3170,11 +3187,12 @@ this is the hashed data is highly confidential.
 @item GCRY_MD_FLAG_HMAC
 @cindex HMAC
 Turn the algorithm into a HMAC message authentication algorithm.  This
-only works if just one algorithm is enabled for the handle.  Note that
-the function @code{gcry_md_setkey} must be used to set the MAC key.
-The size of the MAC is equal to the message digest of the underlying
-hash algorithm.  If you want CBC message authentication codes based on
-a cipher, see @xref{Working with cipher handles}.
+only works if just one algorithm is enabled for the handle and that
+algorithm is not an extendable-output function.  Note that the function
+@code{gcry_md_setkey} must be used to set the MAC key.  The size of the
+MAC is equal to the message digest of the underlying hash algorithm.
+If you want CBC message authentication codes based on a cipher,
+see @xref{Working with cipher handles}.
 
 @item GCRY_MD_FLAG_BUGEMU1
 @cindex bug emulation
@@ -3293,9 +3311,9 @@ message digest or some padding.
 @deftypefun void gcry_md_final (gcry_md_hd_t @var{h})
 
 Finalize the message digest calculation.  This is not really needed
-because @code{gcry_md_read} does this implicitly.  After this has been
-done no further updates (by means of @code{gcry_md_write} or
-@code{gcry_md_putc} should be done; However, to mitigate timing
+because @code{gcry_md_read} and @code{gcry_md_extract} do this implicitly.
+After this has been done no further updates (by means of @code{gcry_md_write}
+or @code{gcry_md_putc}) should be done; however, to mitigate timing
 attacks it is sometimes useful to keep on updating the context after
 having stored away the actual digest.  Only the first call to this function
 has an effect. It is implemented as a macro.
@@ -3318,6 +3336,22 @@ The function does return @code{NULL} if the requested algorithm has not
 been enabled.
 @end deftypefun
 
+The way to read output of extendable-output function is by using the
+function:
+
+@deftypefun gpg_err_code_t gcry_md_extract (gcry_md_hd_t @var{h}, @
+  int @var{algo}, void *@var{buffer}, size_t @var{length})
+
+@code{gcry_md_extract} returns output from extendable-output function.
+This function may be used as often as required to generate more output
+byte stream from the algorithm.  Function extracts the new output bytes
+to @var{buffer} of the length @var{length}.  Buffer will be fully
+populated with new output.  @var{algo} may be given as 0 to return the only
+enabled message digest or it may specify one of the enabled algorithms.
+The function does return non-zero value if the requested algorithm has not
+been enabled.
+@end deftypefun
+
 Because it is often necessary to get the message digest of blocks of
 memory, two fast convenience function are available for this task:
 
@@ -3493,6 +3527,7 @@ provided by Libgcrypt.
 @c begin table of MAC algorithms
 @cindex HMAC-SHA-1
 @cindex HMAC-SHA-224, HMAC-SHA-256, HMAC-SHA-384, HMAC-SHA-512
+@cindex HMAC-SHA3-224, HMAC-SHA3-256, HMAC-SHA3-384, HMAC-SHA3-512
 @cindex HMAC-RIPE-MD-160
 @cindex HMAC-MD2, HMAC-MD4, HMAC-MD5
 @cindex HMAC-TIGER1
@@ -3520,6 +3555,22 @@ algorithm.
 This is HMAC message authentication algorithm based on the SHA-384 hash
 algorithm.
 
+@item GCRY_MAC_HMAC_SHA3_256
+This is HMAC message authentication algorithm based on the SHA3-256 hash
+algorithm.
+
+@item GCRY_MAC_HMAC_SHA3_224
+This is HMAC message authentication algorithm based on the SHA3-224 hash
+algorithm.
+
+@item GCRY_MAC_HMAC_SHA3_512
+This is HMAC message authentication algorithm based on the SHA3-512 hash
+algorithm.
+
+@item GCRY_MAC_HMAC_SHA3_384
+This is HMAC message authentication algorithm based on the SHA3-384 hash
+algorithm.
+
 @item GCRY_MAC_HMAC_SHA1
 This is HMAC message authentication algorithm based on the SHA-1 hash
 algorithm.
diff --git a/src/cipher-proto.h b/src/cipher-proto.h
index 8267791..3bca9c7 100644
--- a/src/cipher-proto.h
+++ b/src/cipher-proto.h
@@ -215,6 +215,9 @@ typedef void (*gcry_md_final_t) (void *c);
 /* Type for the md_read function.  */
 typedef unsigned char *(*gcry_md_read_t) (void *c);
 
+/* Type for the md_extract function.  */
+typedef void (*gcry_md_extract_t) (void *c, void *outbuf, size_t nbytes);
+
 typedef struct gcry_md_oid_spec
 {
   const char *oidstring;
@@ -237,6 +240,7 @@ typedef struct gcry_md_spec
   gcry_md_write_t write;
   gcry_md_final_t final;
   gcry_md_read_t read;
+  gcry_md_extract_t extract;
   size_t contextsize; /* allocate this amount of context */
   selftest_func_t selftest;
 } gcry_md_spec_t;
diff --git a/src/gcrypt-int.h b/src/gcrypt-int.h
index 8014d61..d367307 100644
--- a/src/gcrypt-int.h
+++ b/src/gcrypt-int.h
@@ -129,6 +129,8 @@ gpg_err_code_t _gcry_md_ctl (gcry_md_hd_t hd, int cmd,
                           void *buffer, size_t buflen);
 void _gcry_md_write (gcry_md_hd_t hd, const void *buffer, size_t length);
 unsigned char *_gcry_md_read (gcry_md_hd_t hd, int algo);
+gpg_error_t _gcry_md_extract (gcry_md_hd_t hd, int algo, void *buffer,
+                              size_t length);
 void _gcry_md_hash_buffer (int algo, void *digest,
                            const void *buffer, size_t length);
 gpg_err_code_t _gcry_md_hash_buffers (int algo, unsigned int flags,
diff --git a/src/gcrypt.h.in b/src/gcrypt.h.in
index 585da6a..39be37a 100644
--- a/src/gcrypt.h.in
+++ b/src/gcrypt.h.in
@@ -473,7 +473,7 @@ char *gcry_sexp_nth_string (gcry_sexp_t list, int number);
    value can't be converted to an MPI, `NULL' is returned.  */
 gcry_mpi_t gcry_sexp_nth_mpi (gcry_sexp_t list, int number, int mpifmt);
 
-/* Convenience fucntion to extract parameters from an S-expression
+/* Convenience function to extract parameters from an S-expression
  * using a list of single letter parameters.  */
 gpg_error_t gcry_sexp_extract_param (gcry_sexp_t sexp,
                                      const char *path,
@@ -1170,7 +1170,7 @@ enum gcry_md_algos
     GCRY_MD_GOSTR3411_94  = 308, /* GOST R 34.11-94.  */
     GCRY_MD_STRIBOG256    = 309, /* GOST R 34.11-2012, 256 bit.  */
     GCRY_MD_STRIBOG512    = 310, /* GOST R 34.11-2012, 512 bit.  */
-    GCRY_MD_GOSTR3411_CP  = 311,  /* GOST R 34.11-94 with CryptoPro-A S-Box.  */
+    GCRY_MD_GOSTR3411_CP  = 311, /* GOST R 34.11-94 with CryptoPro-A S-Box.  */
     GCRY_MD_SHA3_224      = 312,
     GCRY_MD_SHA3_256      = 313,
     GCRY_MD_SHA3_384      = 314,
@@ -1239,6 +1239,11 @@ void gcry_md_write (gcry_md_hd_t hd, const void *buffer, size_t length);
    algorithm ALGO. */
 unsigned char *gcry_md_read (gcry_md_hd_t hd, int algo);
 
+/* Read more output from algorithm ALGO to BUFFER of size LENGTH from
+ * digest object HD. Algorithm needs to be 'extendable-output function'. */
+gpg_error_t gcry_md_extract (gcry_md_hd_t hd, int algo, void *buffer,
+                             size_t length);
+
 /* Convenience function to calculate the hash from the data in BUFFER
    of size LENGTH using the algorithm ALGO avoiding the creating of a
    hash object.  The hash is returned in the caller provided buffer
diff --git a/src/libgcrypt.def b/src/libgcrypt.def
index 924f17f..f3e074b 100644
--- a/src/libgcrypt.def
+++ b/src/libgcrypt.def
@@ -278,5 +278,6 @@ EXPORTS
 
       gcry_mpi_ec_sub           @244
 
+      gcry_md_extract           @245
 
 ;; end of file with public symbols for Windows.
diff --git a/src/libgcrypt.vers b/src/libgcrypt.vers
index 7e8df3f..5b3d419 100644
--- a/src/libgcrypt.vers
+++ b/src/libgcrypt.vers
@@ -41,7 +41,7 @@ GCRYPT_1.6 {
     gcry_md_get_algo; gcry_md_get_algo_dlen; gcry_md_hash_buffer;
     gcry_md_hash_buffers;
     gcry_md_info; gcry_md_is_enabled; gcry_md_is_secure;
-    gcry_md_map_name; gcry_md_open; gcry_md_read;
+    gcry_md_map_name; gcry_md_open; gcry_md_read; gcry_md_extract;
     gcry_md_reset; gcry_md_setkey;
     gcry_md_write; gcry_md_debug;
 
diff --git a/src/visibility.c b/src/visibility.c
index cbf24e7..23a2705 100644
--- a/src/visibility.c
+++ b/src/visibility.c
@@ -1165,6 +1165,12 @@ gcry_md_read (gcry_md_hd_t hd, int algo)
   return _gcry_md_read (hd, algo);
 }
 
+gcry_err_code_t
+gcry_md_extract (gcry_md_hd_t hd, int algo, void *buffer, size_t length)
+{
+  return _gcry_md_extract(hd, algo, buffer, length);
+}
+
 void
 gcry_md_hash_buffer (int algo, void *digest,
                      const void *buffer, size_t length)
diff --git a/src/visibility.h b/src/visibility.h
index fa3c763..bb25de0 100644
--- a/src/visibility.h
+++ b/src/visibility.h
@@ -110,6 +110,7 @@ MARK_VISIBLEX (gcry_md_is_secure)
 MARK_VISIBLEX (gcry_md_map_name)
 MARK_VISIBLEX (gcry_md_open)
 MARK_VISIBLEX (gcry_md_read)
+MARK_VISIBLEX (gcry_md_extract)
 MARK_VISIBLEX (gcry_md_reset)
 MARK_VISIBLEX (gcry_md_setkey)
 MARK_VISIBLEX (gcry_md_write)
@@ -374,6 +375,7 @@ MARK_VISIBLEX (_gcry_mpi_get_const)
 #define gcry_md_map_name            _gcry_USE_THE_UNDERSCORED_FUNCTION
 #define gcry_md_open                _gcry_USE_THE_UNDERSCORED_FUNCTION
 #define gcry_md_read                _gcry_USE_THE_UNDERSCORED_FUNCTION
+#define gcry_md_extract             _gcry_USE_THE_UNDERSCORED_FUNCTION
 #define gcry_md_reset               _gcry_USE_THE_UNDERSCORED_FUNCTION
 #define gcry_md_setkey              _gcry_USE_THE_UNDERSCORED_FUNCTION
 #define gcry_md_write               _gcry_USE_THE_UNDERSCORED_FUNCTION

commit cee2e122ec6c1886957a8d47498eb63a6a921725
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date:   Sun Oct 25 15:11:14 2015 +0200

    md: check hmac flag in prepare_macpads
    
    * cipher/md.c (prepare_macpads): Check hmac flag.
    --
    
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>

diff --git a/cipher/md.c b/cipher/md.c
index c6bf90d..948d269 100644
--- a/cipher/md.c
+++ b/cipher/md.c
@@ -671,6 +671,9 @@ prepare_macpads (gcry_md_hd_t a, const unsigned char *key, size_t keylen)
   if (!a->ctx->list)
     return GPG_ERR_DIGEST_ALGO; /* Might happen if no algo is enabled.  */
 
+  if (!a->ctx->flags.hmac)
+    return GPG_ERR_DIGEST_ALGO; /* Tried setkey for non-HMAC md. */
+
   for (r = a->ctx->list; r; r = r->next)
     {
       const unsigned char *k;

-----------------------------------------------------------------------

Summary of changes:
 cipher/crc.c          |   8 +-
 cipher/gostr3411-94.c |   4 +-
 cipher/hash-common.c  |  28 ++-
 cipher/keccak.c       | 283 +++++++++++++++++++++---
 cipher/md.c           |  72 ++++++-
 cipher/md2.c          |   2 +-
 cipher/md4.c          |   2 +-
 cipher/md5.c          |   2 +-
 cipher/rmd160.c       |   2 +-
 cipher/sha1.c         |   2 +-
 cipher/sha256.c       |   4 +-
 cipher/sha512.c       |   4 +-
 cipher/stribog.c      |   2 +
 cipher/tiger.c        |   6 +-
 cipher/whirlpool.c    |   2 +-
 doc/gcrypt.texi       |  84 +++++++-
 src/cipher-proto.h    |   4 +
 src/cipher.h          |   2 +
 src/gcrypt-int.h      |   2 +
 src/gcrypt.h.in       |  13 +-
 src/libgcrypt.def     |   1 +
 src/libgcrypt.vers    |   2 +-
 src/visibility.c      |   6 +
 src/visibility.h      |   2 +
 tests/basic.c         | 586 ++++++++++++++++++++++++++++++++++++++++++++++++--
 tests/bench-slope.c   |   6 +
 26 files changed, 1040 insertions(+), 91 deletions(-)


hooks/post-receive
-- 
The GNU crypto library
http://git.gnupg.org


_______________________________________________
Gnupg-commits mailing list
Gnupg-commits at gnupg.org
http://lists.gnupg.org/mailman/listinfo/gnupg-commits


From gniibe at fsij.org  Mon Nov  2 09:31:07 2015
From: gniibe at fsij.org (NIIBE Yutaka)
Date: Mon, 02 Nov 2015 17:31:07 +0900
Subject: random: Use poll instead of select
Message-ID: <56371F4B.1080206@fsij.org>

Hello,

I'm trying to fix a bug in GnuPG 1.4 (caused by (a kind of bug in)
duplicity which uses many file descriptors):

    https://bugs.gnupg.org/gnupg/issue1818
    https://bugs.debian.org/771263

And I found a patch for libgcrypt in Fedora:

    http://pkgs.fedoraproject.org/cgit/libgcrypt.git/tree/
    libgcrypt-1.6.1-use-poll.patch

I think that the patch by Fedora is almost good, but it's not that
accurate in the comment and the behavior of setting any_need_entropy to 1
when poll times out.

So, this is the version which keeps the original behavior; it simply
replaces select by poll.  Note: poll uses milliseconds for its timeout.

I think that all Linux kernels nowadays support poll(2).


diff --git a/random/rndlinux.c b/random/rndlinux.c
index 9eeec57..267a07e 100644
--- a/random/rndlinux.c
+++ b/random/rndlinux.c
@@ -32,6 +32,7 @@
 #include <string.h>
 #include <unistd.h>
 #include <fcntl.h>
+#include <poll.h>
 #include "types.h"
 #include "g10lib.h"
 #include "rand-internal.h"
@@ -179,12 +180,14 @@ _gcry_rndlinux_gather_random (void (*add)(const void*, size_t,
                  return with something we will actually use 100ms. */
   while (length)
     {
-      fd_set rfds;
-      struct timeval tv;
       int rc;
+      struct pollfd pfd;
+
+      pfd.fd = fd;
+      pfd.events = POLLIN;

       /* If we collected some bytes update the progress indicator.  We
-         do this always and not just if the select timed out because
+         do this always and not just if the poll timed out because
          often just a few bytes are gathered within the timeout
          period.  */
       if (any_need_entropy || last_so_far != (want - length) )
@@ -195,33 +198,19 @@ _gcry_rndlinux_gather_random (void (*add)(const void*, size_t,
           any_need_entropy = 1;
         }

-      /* If the system has no limit on the number of file descriptors
-         and we encounter an fd which is larger than the fd_set size,
-         we don't use the select at all.  The select code is only used
-         to emit progress messages.  A better solution would be to
-         fall back to poll() if available.  */
-#ifdef FD_SETSIZE
-      if (fd < FD_SETSIZE)
-#endif
+      if ( !(rc = poll (&pfd, 1, delay)) )
         {
-          FD_ZERO(&rfds);
-          FD_SET(fd, &rfds);
-          tv.tv_sec = delay;
-          tv.tv_usec = delay? 0 : 100000;
-          if ( !(rc=select(fd+1, &rfds, NULL, NULL, &tv)) )
-            {
-              any_need_entropy = 1;
-              delay = 3; /* Use 3 seconds henceforth.  */
-              continue;
-            }
-          else if( rc == -1 )
-            {
-              log_error ("select() error: %s\n", strerror(errno));
-              if (!delay)
-                delay = 1; /* Use 1 second if we encounter an error before
-                              we have ever blocked.  */
-              continue;
-            }
+          any_need_entropy = 1;
+          delay = 3000; /* Use 3 seconds henceforth.  */
+          continue;
+        }
+      else if( rc == -1 )
+        {
+          log_error ("poll() error: %s\n", strerror (errno));
+          if (!delay)
+            delay = 1000; /* Use 1 second if we encounter an error before
+                             we have ever blocked.  */
+          continue;
         }

       do
--


From wk at gnupg.org  Mon Nov  2 17:22:59 2015
From: wk at gnupg.org (Werner Koch)
Date: Mon, 02 Nov 2015 17:22:59 +0100
Subject: random: Use poll instead of select
In-Reply-To: <56371F4B.1080206@fsij.org> (NIIBE Yutaka's message of "Mon, 02
 Nov 2015 17:31:07 +0900")
References: <56371F4B.1080206@fsij.org>
Message-ID: <87si4oxtek.fsf@vigenere.g10code.de>

On Mon,  2 Nov 2015 09:31, gniibe at fsij.org said:

> I'm trying to fix a bug in GnuPG 1.4 (caused by (a kind of bug in)
> duplicity which uses many file descriptors):

and doesn't close them before execing gpg?  That is a real bug in
duplicity and we should better not work around it but only detect it.

For Libgcrypt however, we can't limit the number of used file
descriptors.

> So, this is the version which keeps the original behavior; It simply
> replaces select by poll.  Note: poll uses millisecond for its timeout.
>
> I think that all Linux kernel nowadays support poll(2).

rndlinux is a misnomer these days because it is actually an access
module for /dev/random style devices as available on all kinds of
platforms.  I am not sure whether poll is available on all
these platforms.  You should add a test for a working poll or use poll
only for linux (to simplify the configure test).


Salam-Shalom,

   Werner

-- 
Die Gedanken sind frei.  Ausnahmen regelt ein Bundesgesetz.


From vcizek at suse.com  Thu Nov  5 14:14:24 2015
From: vcizek at suse.com (Vitezslav Cizek)
Date: Thu, 5 Nov 2015 14:14:24 +0100
Subject: DCO signature
Message-ID: <20151105131424.GA32700@kolac.suse.cz>

Libgcrypt Developer's Certificate of Origin.  Version 1.0
=========================================================

By making a contribution to the Libgcrypt project, I certify that:

(a) The contribution was created in whole or in part by me and I
    have the right to submit it under the free software license
    indicated in the file; or

(b) The contribution is based upon previous work that, to the
    best of my knowledge, is covered under an appropriate free
    software license and I have the right under that license to
    submit that work with modifications, whether created in whole
    or in part by me, under the same free software license
    (unless I am permitted to submit under a different license),
    as indicated in the file; or

(c) The contribution was provided directly to me by some other
    person who certified (a), (b) or (c) and I have not modified
    it.

(d) I understand and agree that this project and the contribution
    are public and that a record of the contribution (including
    all personal information I submit with it, including my
    sign-off) is maintained indefinitely and may be redistributed
    consistent with this project or the free software license(s)
    involved.

Signed-off-by: Vitezslav Cizek <vcizek at suse.com>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 819 bytes
Desc: Digital signature
URL: </pipermail/attachments/20151105/d629fad9/attachment.sig>

From jussi.kivilinna at iki.fi  Thu Nov  5 18:38:12 2015
From: jussi.kivilinna at iki.fi (Jussi Kivilinna)
Date: Thu, 05 Nov 2015 19:38:12 +0200
Subject: [PATCH] Update license information for CRC
Message-ID: <20151105173812.19735.9576.stgit@localhost6.localdomain6>

* LICENSES: Remove 'Simple permissive' and 'IETF permissive' licenses
for 'cipher/crc.c' as result of rewrite of CRC implementations.
--

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 LICENSES |   50 --------------------------------------------------
 1 file changed, 50 deletions(-)

diff --git a/LICENSES b/LICENSES
index 6c09e1f..ff8b7fa 100644
--- a/LICENSES
+++ b/LICENSES
@@ -54,56 +54,6 @@ with any binary distributions derived from the GNU C Library.
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #+end_quote
 
-* Simple permissive licenses
-
-  For files:
-  - cipher/crc.c
-
-#+begin_quote
-  Copyright (c) 1996 L. Peter Deutsch
-
-  Permission is granted to copy and distribute this document for
-  any purpose and without charge, including translations into
-  other languages and incorporation into compilations, provided
-  that the copyright notice and this notice are preserved, and
-  that any substantive changes or deletions from the original are
-  clearly marked.
-#+end_quote
-
-* IETF permissive licenses
-
-  For files:
-  - cipher/crc.c
-
-#+begin_quote
-  Copyright (C) The Internet Society (1998).  All Rights Reserved.
-
-  This document and translations of it may be copied and furnished
-  to others, and derivative works that comment on or otherwise
-  explain it or assist in its implementation may be prepared,
-  copied, published and distributed, in whole or in part, without
-  restriction of any kind, provided that the above copyright notice
-  and this paragraph are included on all such copies and derivative
-  works.  However, this document itself may not be modified in any
-  way, such as by removing the copyright notice or references to
-  the Internet Society or other Internet organizations, except as
-  needed for the purpose of developing Internet standards in which
-  case the procedures for copyrights defined in the Internet
-  Standards process must be followed, or as required to translate
-  it into languages other than English.
-
-  The limited permissions granted above are perpetual and will not be
-  revoked by the Internet Society or its successors or assigns.
-
-  This document and the information contained herein is provided on
-  an "AS IS" basis and THE INTERNET SOCIETY AND THE INTERNET
-  ENGINEERING TASK FORCE DISCLAIMS ALL WARRANTIES, EXPRESS OR
-  IMPLIED, INCLUDING BUT NOT LIMITED TO ANY WARRANTY THAT THE USE
-  OF THE INFORMATION HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY
-  IMPLIED WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR
-  PURPOSE.
-#+end_quote
-
 * X License
 
   For files:


From cvs at cvs.gnupg.org  Thu Nov  5 18:11:43 2015
From: cvs at cvs.gnupg.org (by Jussi Kivilinna)
Date: Thu, 05 Nov 2015 18:11:43 +0100
Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.6.0-280-g89fa74d
Message-ID: <E1ZuNvI-0005TO-Uy@lists.gnupg.org>

This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "The GNU crypto library".

The branch, master has been updated
       via  89fa74d6b3e58cd4fcd6e0939a35e46cbaca2ea0 (commit)
       via  a1cc7bb15473a2419b24ecac765ae0ce5989a13b (commit)
       via  2857cb89c6dc1c02266600bc1fd2967a3cd5cf88 (commit)
       via  07e4839e75a7bca3a6c0a94aecfe75efe61d7ff2 (commit)
      from  c0b9eee2d93a13930244f9ce0c14ed6b4aeb6c29 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
commit 89fa74d6b3e58cd4fcd6e0939a35e46cbaca2ea0
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date:   Sun Nov 1 20:44:09 2015 +0200

    Improve performance of Tiger hash algorithms
    
    * cipher/tiger.c (tiger_round, pass, key_schedule): Convert functions
    to macros.
    (transform_blk): Pass variable names instead of pointers to 'pass'.
    --
    
    Benchmark results on Intel Haswell @ 3.2 Ghz:
    
    Before:
                    |  nanosecs/byte   mebibytes/sec   cycles/byte
     TIGER          |      3.25 ns/B     293.5 MiB/s     10.40 c/B
    
    After (1.75x faster):
                    |  nanosecs/byte   mebibytes/sec   cycles/byte
     TIGER          |      1.85 ns/B     515.3 MiB/s      5.92 c/B
    
    Benchmark results on Cortex-A8 @ 1008 Mhz:
    
    Before:
                    |  nanosecs/byte   mebibytes/sec   cycles/byte
     TIGER          |     63.42 ns/B     15.04 MiB/s     63.93 c/B
    
    After (1.26x faster):
                    |  nanosecs/byte   mebibytes/sec   cycles/byte
     TIGER          |     49.99 ns/B     19.08 MiB/s     50.39 c/B
    
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>

diff --git a/cipher/tiger.c b/cipher/tiger.c
index 078133a..516bd44 100644
--- a/cipher/tiger.c
+++ b/cipher/tiger.c
@@ -633,68 +633,44 @@ tiger2_init (void *context, unsigned int flags)
   do_init (context, 2);
 }
 
-static void
-tiger_round( u64 *ra, u64 *rb, u64 *rc, u64 x, int mul )
-{
-  u64 a = *ra;
-  u64 b = *rb;
-  u64 c = *rc;
-
-  c ^= x;
-  a -= (  sbox1[  c        & 0xff ] ^ sbox2[ (c >> 16) & 0xff ]
-        ^ sbox3[ (c >> 32) & 0xff ] ^ sbox4[ (c >> 48) & 0xff ]);
-  b += (  sbox4[ (c >>  8) & 0xff ] ^ sbox3[ (c >> 24) & 0xff ]
-        ^ sbox2[ (c >> 40) & 0xff ] ^ sbox1[ (c >> 56) & 0xff ]);
-  b *= mul;
-
-  *ra = a;
-  *rb = b;
-  *rc = c;
-}
-
-
-static void
-pass( u64 *ra, u64 *rb, u64 *rc, u64 *x, int mul )
-{
-  u64 a = *ra;
-  u64 b = *rb;
-  u64 c = *rc;
-
-  tiger_round( &a, &b, &c, x[0], mul );
-  tiger_round( &b, &c, &a, x[1], mul );
-  tiger_round( &c, &a, &b, x[2], mul );
-  tiger_round( &a, &b, &c, x[3], mul );
-  tiger_round( &b, &c, &a, x[4], mul );
-  tiger_round( &c, &a, &b, x[5], mul );
-  tiger_round( &a, &b, &c, x[6], mul );
-  tiger_round( &b, &c, &a, x[7], mul );
-
-  *ra = a;
-  *rb = b;
-  *rc = c;
-}
-
 
-static void
-key_schedule( u64 *x )
-{
-  x[0] -= x[7] ^ 0xa5a5a5a5a5a5a5a5LL;
-  x[1] ^= x[0];
-  x[2] += x[1];
-  x[3] -= x[2] ^ ((~x[1]) << 19 );
-  x[4] ^= x[3];
-  x[5] += x[4];
-  x[6] -= x[5] ^ ((~x[4]) >> 23 );
-  x[7] ^= x[6];
-  x[0] += x[7];
-  x[1] -= x[0] ^ ((~x[7]) << 19 );
-  x[2] ^= x[1];
-  x[3] += x[2];
-  x[4] -= x[3] ^ ((~x[2]) >> 23 );
-  x[5] ^= x[4];
-  x[6] += x[5];
-  x[7] -= x[6] ^ 0x0123456789abcdefLL;
-}
+#define tiger_round(xa, xb, xc, xx, xmul) { \
+  xc ^= xx; \
+  xa -= (  sbox1[  (xc)        & 0xff ] ^ sbox2[ ((xc) >> 16) & 0xff ] \
+         ^ sbox3[ ((xc) >> 32) & 0xff ] ^ sbox4[ ((xc) >> 48) & 0xff ]); \
+  xb += (  sbox4[ ((xc) >>  8) & 0xff ] ^ sbox3[ ((xc) >> 24) & 0xff ] \
+         ^ sbox2[ ((xc) >> 40) & 0xff ] ^ sbox1[ ((xc) >> 56) & 0xff ]); \
+  xb *= xmul; }
+
+
+#define pass(ya, yb, yc, yx, ymul) { \
+  tiger_round( ya, yb, yc, yx[0], ymul ); \
+  tiger_round( yb, yc, ya, yx[1], ymul ); \
+  tiger_round( yc, ya, yb, yx[2], ymul ); \
+  tiger_round( ya, yb, yc, yx[3], ymul ); \
+  tiger_round( yb, yc, ya, yx[4], ymul ); \
+  tiger_round( yc, ya, yb, yx[5], ymul ); \
+  tiger_round( ya, yb, yc, yx[6], ymul ); \
+  tiger_round( yb, yc, ya, yx[7], ymul ); }
+
+
+#define key_schedule(x) { \
+  x[0] -= x[7] ^ 0xa5a5a5a5a5a5a5a5LL; \
+  x[1] ^= x[0]; \
+  x[2] += x[1]; \
+  x[3] -= x[2] ^ ((~x[1]) << 19 ); \
+  x[4] ^= x[3]; \
+  x[5] += x[4]; \
+  x[6] -= x[5] ^ ((~x[4]) >> 23 ); \
+  x[7] ^= x[6]; \
+  x[0] += x[7]; \
+  x[1] -= x[0] ^ ((~x[7]) << 19 ); \
+  x[2] ^= x[1]; \
+  x[3] += x[2]; \
+  x[4] -= x[3] ^ ((~x[2]) >> 23 ); \
+  x[5] ^= x[4]; \
+  x[6] += x[5]; \
+  x[7] -= x[6] ^ 0x0123456789abcdefLL; }
 
 
 /****************
@@ -716,11 +692,11 @@ transform_blk ( void *ctx, const unsigned char *data )
   b = bb = hd->b;
   c = cc = hd->c;
 
-  pass( &a, &b, &c, x, 5);
+  pass( a, b, c, x, 5);
   key_schedule( x );
-  pass( &c, &a, &b, x, 7);
+  pass( c, a, b, x, 7);
   key_schedule( x );
-  pass( &b, &c, &a, x, 9);
+  pass( b, c, a, x, 9);
 
   /* feedforward */
   a ^= aa;

commit a1cc7bb15473a2419b24ecac765ae0ce5989a13b
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date:   Sun Nov 1 16:06:26 2015 +0200

    Add ARMv7/NEON implementation of Keccak
    
    * cipher/Makefile.am: Add 'keccak-armv7-neon.S'.
    * cipher/keccak-armv7-neon.S: New.
    * cipher/keccak.c (USE_64BIT_ARM_NEON): New.
    (NEED_COMMON64): Select if USE_64BIT_ARM_NEON.
    [NEED_COMMON64] (round_consts_64bit): Rename to...
    [NEED_COMMON64] (_gcry_keccak_round_consts_64bit): ...this; Add
    terminator at end.
    [USE_64BIT_ARM_NEON] (_gcry_keccak_permute_armv7_neon)
    (_gcry_keccak_absorb_lanes64_armv7_neon, keccak_permute64_armv7_neon)
    (keccak_absorb_lanes64_armv7_neon, keccak_armv7_neon_64_ops): New.
    (keccak_init) [USE_64BIT_ARM_NEON]: Select ARM/NEON implementation
    if supported by HW.
    * cipher/keccak_permute_64.h (KECCAK_F1600_PERMUTE_FUNC_NAME): Update
    to use new round constant table.
    * configure.ac: Add 'keccak-armv7-neon.lo'.
    --
    
    Patch adds ARMv7/NEON implementation of Keccak (SHAKE/SHA3). Patch
    is based on public-domain implementation by Ronny Van Keer from
    SUPERCOP package:
     https://github.com/floodyberry/supercop/blob/master/crypto_hash/\
    keccakc1024/inplace-armv7a-neon/keccak2.s
    
    Benchmark results on Cortex-A8 @ 1008 Mhz:
    
    Before (generic 32-bit bit-interleaved impl.):
                    |  nanosecs/byte   mebibytes/sec   cycles/byte
     SHAKE128       |     83.00 ns/B     11.49 MiB/s     83.67 c/B
     SHAKE256       |     101.7 ns/B      9.38 MiB/s     102.5 c/B
     SHA3-224       |     96.13 ns/B      9.92 MiB/s     96.90 c/B
     SHA3-256       |     101.5 ns/B      9.40 MiB/s     102.3 c/B
     SHA3-384       |     131.4 ns/B      7.26 MiB/s     132.5 c/B
     SHA3-512       |     189.1 ns/B      5.04 MiB/s     190.6 c/B
    
    After (ARM/NEON, ~3.2x faster):
                    |  nanosecs/byte   mebibytes/sec   cycles/byte
     SHAKE128       |     25.09 ns/B     38.01 MiB/s     25.29 c/B
     SHAKE256       |     30.95 ns/B     30.82 MiB/s     31.19 c/B
     SHA3-224       |     29.24 ns/B     32.61 MiB/s     29.48 c/B
     SHA3-256       |     30.95 ns/B     30.82 MiB/s     31.19 c/B
     SHA3-384       |     40.42 ns/B     23.59 MiB/s     40.74 c/B
     SHA3-512       |     58.37 ns/B     16.34 MiB/s     58.84 c/B
    
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index be03d06..88c8fbf 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -90,7 +90,7 @@ sha1.c sha1-ssse3-amd64.S sha1-avx-amd64.S sha1-avx-bmi2-amd64.S \
 sha256.c sha256-ssse3-amd64.S sha256-avx-amd64.S sha256-avx2-bmi2-amd64.S \
 sha512.c sha512-ssse3-amd64.S sha512-avx-amd64.S sha512-avx2-bmi2-amd64.S \
   sha512-armv7-neon.S \
-keccak.c keccak_permute_32.h keccak_permute_64.h \
+keccak.c keccak_permute_32.h keccak_permute_64.h keccak-armv7-neon.S \
 stribog.c \
 tiger.c \
 whirlpool.c whirlpool-sse2-amd64.S \
diff --git a/cipher/keccak-armv7-neon.S b/cipher/keccak-armv7-neon.S
new file mode 100644
index 0000000..0bec8d5
--- /dev/null
+++ b/cipher/keccak-armv7-neon.S
@@ -0,0 +1,945 @@
+/* keccak-armv7-neon.S  -  ARMv7/NEON implementation of Keccak
+ *
+ * Copyright (C) 2015 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) && \
+    defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \
+    defined(HAVE_GCC_INLINE_ASM_NEON)
+
+/* Based on public-domain/CC0 implementation from SUPERCOP package
+ * (keccakc1024/inplace-armv7a-neon/keccak2.s)
+ *
+ * Original copyright header follows:
+ */
+
+@ The Keccak sponge function, designed by Guido Bertoni, Joan Daemen,
+@ Michaël Peeters and Gilles Van Assche. For more information, feedback or
+@ questions, please refer to our website: http://keccak.noekeon.org/
+@
+@ Implementation by Ronny Van Keer, hereby denoted as "the implementer".
+@
+@ To the extent possible under law, the implementer has waived all copyright
+@ and related or neighboring rights to the source code in this file.
+@ http://creativecommons.org/publicdomain/zero/1.0/
+
+.text
+
+.syntax unified
+.fpu neon
+.arm
+
+
+.extern _gcry_keccak_round_consts_64bit;
+
+#ifdef __PIC__
+#  define GET_DATA_POINTER(reg, name, rtmp) \
+		ldr reg, 1f; \
+		ldr rtmp, 2f; \
+		b 3f; \
+	1:	.word _GLOBAL_OFFSET_TABLE_-(3f+8); \
+	2:	.word name(GOT); \
+	3:	add reg, pc, reg; \
+		ldr reg, [reg, rtmp];
+#else
+#  define GET_DATA_POINTER(reg, name, rtmp) ldr reg, =name
+#endif
+
+
+@//  --- offsets in state
+.equ Aba, 0*8
+.equ Aga, 1*8
+.equ Aka, 2*8
+.equ Ama, 3*8
+.equ Asa, 4*8
+
+@// --- macros
+
+.macro    KeccakThetaRhoPiChiIota argA1, argA2, argA3, argA4, argA5
+
+    @Prepare Theta
+    @Ca = Aba^Aga^Aka^Ama^Asa@
+    @Ce = Abe^Age^Ake^Ame^Ase@
+    @Ci = Abi^Agi^Aki^Ami^Asi@
+    @Co = Abo^Ago^Ako^Amo^Aso@
+    @Cu = Abu^Agu^Aku^Amu^Asu@
+    @De = Ca^ROL64(Ci, 1)@
+    @Di = Ce^ROL64(Co, 1)@
+    @Do = Ci^ROL64(Cu, 1)@
+    @Du = Co^ROL64(Ca, 1)@
+    @Da = Cu^ROL64(Ce, 1)@
+
+    veor.64 q4, q6, q7
+    veor.64 q5, q9, q10
+    veor.64 d8,  d8,   d9
+    veor.64 d10,  d10,   d11
+    veor.64 d1,  d8,   d16
+    veor.64 d2,  d10,   d17
+
+    veor.64 q4, q11, q12
+    veor.64 q5, q14, q15
+    veor.64 d8,  d8,   d9
+    veor.64 d10,  d10,   d11
+    veor.64 d3,  d8,   d26
+
+    vadd.u64 q4, q1, q1
+    veor.64 d4,  d10,   d27
+    vmov.64  d0, d5
+    vsri.64 q4, q1, #63
+
+    vadd.u64 q5, q2, q2
+    veor.64 q4, q4, q0
+    vsri.64 q5, q2, #63
+    vadd.u64 d7, d1, d1
+    veor.64 \argA2, \argA2, d8
+    veor.64 q5, q5, q1
+
+    vsri.64 d7, d1, #63
+    vshl.u64 d1, \argA2, #44
+    veor.64 \argA3, \argA3, d9
+    veor.64 d7, d7, d4
+
+    @Ba = argA1^Da@
+    @Be = ROL64((argA2^De), 44)@
+    @Bi = ROL64((argA3^Di), 43)@
+    @Bo = ROL64((argA4^Do), 21)@
+    @Bu = ROL64((argA5^Du), 14)@
+    @argA2 =   Be ^((~Bi)& Bo )@
+    @argA3 =   Bi ^((~Bo)& Bu )@
+    @argA4 =   Bo ^((~Bu)& Ba )@
+    @argA5 =   Bu ^((~Ba)& Be )@
+    @argA1 =   Ba ^((~Be)& Bi )@ argA1 ^= KeccakF1600RoundConstants[i+round]@
+    vsri.64 d1, \argA2, #64-44
+    vshl.u64 d2, \argA3, #43
+    vldr.64 d0, [sp, #\argA1]
+    veor.64 \argA4, \argA4, d10
+    vsri.64 d2, \argA3, #64-43
+    vshl.u64 d3, \argA4, #21
+    veor.64 \argA5, \argA5, d11
+    veor.64 d0, d0, d7
+    vsri.64 d3, \argA4, #64-21
+    vbic.64   d5, d2, d1
+    vshl.u64 d4, \argA5, #14
+    vbic.64   \argA2, d3, d2
+    vld1.64   d6, [ip]!
+    veor.64   d5, d0
+    vsri.64 d4, \argA5, #64-14
+    veor.64   d5, d6
+    vbic.64   \argA5, d1, d0
+    vbic.64   \argA3, d4, d3
+    vbic.64   \argA4, d0, d4
+    veor.64   \argA2, d1
+    vstr.64   d5, [sp, #\argA1]
+    veor.64   \argA3, d2
+    veor.64   \argA4, d3
+    veor.64   \argA5, d4
+
+    .endm
+
+.macro    KeccakThetaRhoPiChi1   argA1, argA2, argA3, argA4, argA5
+
+    @d2 = ROL64((argA1^Da), 3)@
+    @d3 = ROL64((argA2^De), 45)@
+    @d4 = ROL64((argA3^Di), 61)@
+    @d0 = ROL64((argA4^Do), 28)@
+    @d1 = ROL64((argA5^Du), 20)@
+    @argA1 =   Ba ^((~Be)&  Bi )@ Ca ^= argA1@
+    @argA2 =   Be ^((~Bi)&  Bo )@
+    @argA3 =   Bi ^((~Bo)&  Bu )@
+    @argA4 =   Bo ^((~Bu)&  Ba )@
+    @argA5 =   Bu ^((~Ba)&  Be )@
+
+    veor.64 \argA2, \argA2, d8
+    veor.64 \argA3, \argA3, d9
+    vshl.u64  d3, \argA2, #45
+    vldr.64 d6, [sp, #\argA1]
+    vshl.u64  d4, \argA3, #61
+    veor.64 \argA4, \argA4, d10
+    vsri.64  d3, \argA2, #64-45
+    veor.64 \argA5, \argA5, d11
+    vsri.64  d4, \argA3, #64-61
+    vshl.u64  d0, \argA4, #28
+    veor.64 d6, d6, d7
+    vshl.u64  d1, \argA5, #20
+    vbic.64   \argA3, d4, d3
+    vsri.64  d0, \argA4, #64-28
+    vbic.64   \argA4, d0, d4
+    vshl.u64  d2, d6, #3
+    vsri.64  d1, \argA5, #64-20
+    veor.64   \argA4, d3
+    vsri.64  d2, d6, #64-3
+    vbic.64   \argA5, d1, d0
+    vbic.64   d6, d2, d1
+    vbic.64   \argA2, d3, d2
+    veor.64   d6, d0
+    veor.64   \argA2, d1
+    vstr.64   d6, [sp, #\argA1]
+    veor.64   \argA3, d2
+    veor.64  d5, d6
+    veor.64   \argA5, d4
+
+    .endm
+
+.macro    KeccakThetaRhoPiChi2 argA1, argA2, argA3, argA4, argA5
+
+    @d4 = ROL64((argA1^Da), 18)@
+    @d0 = ROL64((argA2^De), 1)@
+    @d1 = ROL64((argA3^Di), 6)@
+    @d2 = ROL64((argA4^Do), 25)@
+    @d3 = ROL64((argA5^Du), 8)@
+    @argA1 =   Ba ^((~Be)&  Bi )@ Ca ^= argA1@
+    @argA2 =   Be ^((~Bi)&  Bo )@
+    @argA3 =   Bi ^((~Bo)&  Bu )@
+    @argA4 =   Bo ^((~Bu)&  Ba )@
+    @argA5 =   Bu ^((~Ba)&  Be )@
+
+    veor.64 \argA3, \argA3, d9
+    veor.64 \argA4, \argA4, d10
+    vshl.u64  d1, \argA3, #6
+    vldr.64 d6, [sp, #\argA1]
+    vshl.u64  d2, \argA4, #25
+    veor.64 \argA5, \argA5, d11
+    vsri.64  d1, \argA3, #64-6
+    veor.64 \argA2, \argA2, d8
+    vsri.64  d2, \argA4, #64-25
+    vext.8  d3, \argA5, \argA5, #7
+    veor.64 d6, d6, d7
+    vbic.64  \argA3, d2, d1
+    vadd.u64  d0, \argA2, \argA2
+    vbic.64   \argA4, d3, d2
+    vsri.64  d0, \argA2, #64-1
+    vshl.u64  d4, d6, #18
+    veor.64  \argA2, d1, \argA4
+    veor.64  \argA3, d0
+    vsri.64  d4, d6, #64-18
+    vstr.64   \argA3, [sp, #\argA1]
+    veor.64  d5, \argA3
+    vbic.64   \argA5, d1, d0
+    vbic.64   \argA3, d4, d3
+    vbic.64   \argA4, d0, d4
+    veor.64   \argA3, d2
+    veor.64   \argA4, d3
+    veor.64   \argA5, d4
+
+    .endm
+
+.macro    KeccakThetaRhoPiChi3 argA1, argA2, argA3, argA4, argA5
+
+    @d1 = ROL64((argA1^Da), 36)@
+    @d2 = ROL64((argA2^De), 10)@
+    @d3 = ROL64((argA3^Di), 15)@
+    @d4 = ROL64((argA4^Do), 56)@
+    @d0 = ROL64((argA5^Du), 27)@
+    @argA1 =   Ba ^((~Be)&  Bi )@ Ca ^= argA1@
+    @argA2 =   Be ^((~Bi)&  Bo )@
+    @argA3 =   Bi ^((~Bo)&  Bu )@
+    @argA4 =   Bo ^((~Bu)&  Ba )@
+    @argA5 =   Bu ^((~Ba)&  Be )@
+
+    veor.64 \argA2, \argA2, d8
+    veor.64 \argA3, \argA3, d9
+    vshl.u64  d2, \argA2, #10
+    vldr.64 d6, [sp, #\argA1]
+    vshl.u64  d3, \argA3, #15
+    veor.64 \argA4, \argA4, d10
+    vsri.64  d2, \argA2, #64-10
+    vsri.64  d3, \argA3, #64-15
+    veor.64 \argA5, \argA5, d11
+    vext.8  d4, \argA4, \argA4, #1
+    vbic.64   \argA2, d3, d2
+    vshl.u64  d0, \argA5, #27
+    veor.64 d6, d6, d7
+    vbic.64   \argA3, d4, d3
+    vsri.64  d0, \argA5, #64-27
+    vshl.u64  d1, d6, #36
+    veor.64   \argA3, d2
+    vbic.64   \argA4, d0, d4
+    vsri.64  d1, d6, #64-36
+
+    veor.64   \argA4, d3
+    vbic.64   d6, d2, d1
+    vbic.64   \argA5, d1, d0
+    veor.64   d6, d0
+    veor.64   \argA2, d1
+    vstr.64   d6, [sp, #\argA1]
+    veor.64  d5, d6
+    veor.64   \argA5, d4
+
+    .endm
+
+.macro    KeccakThetaRhoPiChi4 argA1, argA2, argA3, argA4, argA5
+
+    @d3 = ROL64((argA1^Da), 41)@
+    @d4 = ROL64((argA2^De), 2)@
+    @d0 = ROL64((argA3^Di), 62)@
+    @d1 = ROL64((argA4^Do), 55)@
+    @d2 = ROL64((argA5^Du), 39)@
+    @argA1 =   Ba ^((~Be)&  Bi )@ Ca ^= argA1@
+    @argA2 =   Be ^((~Bi)&  Bo )@
+    @argA3 =   Bi ^((~Bo)&  Bu )@
+    @argA4 =   Bo ^((~Bu)&  Ba )@
+    @argA5 =   Bu ^((~Ba)&  Be )@
+
+    veor.64 \argA2, \argA2, d8
+    veor.64 \argA3, \argA3, d9
+    vshl.u64  d4, \argA2, #2
+    veor.64 \argA5, \argA5, d11
+    vshl.u64  d0, \argA3, #62
+    vldr.64 d6, [sp, #\argA1]
+    vsri.64  d4, \argA2, #64-2
+    veor.64 \argA4, \argA4, d10
+    vsri.64  d0, \argA3, #64-62
+
+    vshl.u64  d1, \argA4, #55
+    veor.64 d6, d6, d7
+    vshl.u64  d2, \argA5, #39
+    vsri.64  d1, \argA4, #64-55
+    vbic.64  \argA4, d0, d4
+    vsri.64  d2, \argA5, #64-39
+    vbic.64  \argA2, d1, d0
+    vshl.u64  d3, d6, #41
+    veor.64  \argA5, d4, \argA2
+    vbic.64  \argA2, d2, d1
+    vsri.64  d3, d6, #64-41
+    veor.64  d6, d0, \argA2
+
+    vbic.64 \argA2, d3, d2
+    vbic.64 \argA3, d4, d3
+    veor.64 \argA2, d1
+    vstr.64 d6, [sp, #\argA1]
+    veor.64 d5, d6
+    veor.64 \argA3, d2
+    veor.64 \argA4, d3
+
+    .endm
+
+
+@// --- code
+
+@not callable from C!
+.p2align 3
+.type  KeccakF_armv7a_neon_asm,%function;
+KeccakF_armv7a_neon_asm:  @
+
+.LroundLoop:
+
+    KeccakThetaRhoPiChiIota  Aba, d13, d19, d25, d31
+    KeccakThetaRhoPiChi1    Aka, d15, d21, d22, d28
+    KeccakThetaRhoPiChi2    Asa, d12, d18, d24, d30
+    KeccakThetaRhoPiChi3    Aga, d14, d20, d26, d27
+    KeccakThetaRhoPiChi4    Ama, d16, d17, d23, d29
+
+    KeccakThetaRhoPiChiIota  Aba, d15, d18, d26, d29
+    KeccakThetaRhoPiChi1    Asa, d14, d17, d25, d28
+    KeccakThetaRhoPiChi2    Ama, d13, d21, d24, d27
+    KeccakThetaRhoPiChi3    Aka, d12, d20, d23, d31
+    KeccakThetaRhoPiChi4    Aga, d16, d19, d22, d30
+
+    KeccakThetaRhoPiChiIota Aba, d14, d21, d23, d30
+    KeccakThetaRhoPiChi1    Ama, d12, d19, d26, d28
+    KeccakThetaRhoPiChi2    Aga, d15, d17, d24, d31
+    KeccakThetaRhoPiChi3    Asa, d13, d20, d22, d29
+    KeccakThetaRhoPiChi4    Aka, d16, d18, d25, d27
+
+    KeccakThetaRhoPiChiIota Aba, d12, d17, d22, d27
+    KeccakThetaRhoPiChi1    Aga, d13, d18, d23, d28
+    KeccakThetaRhoPiChi2    Aka, d14, d19, d24, d29
+    ldr    r0, [ip]
+    KeccakThetaRhoPiChi3    Ama, d15, d20, d25, d30
+    cmp    r0, #0xFFFFFFFF
+    KeccakThetaRhoPiChi4    Asa, d16, d21, d26, d31
+
+    bne    .LroundLoop
+    sub    ip, #(8*24)
+    bx    lr
+.p2align 2
+.ltorg
+.size KeccakF_armv7a_neon_asm,.-KeccakF_armv7a_neon_asm;
+
+
+@//unsigned _gcry_keccak_permute_armv7_neon(u64 *state)  callable from C
+.p2align 3
+.global   _gcry_keccak_permute_armv7_neon
+.type  _gcry_keccak_permute_armv7_neon,%function;
+_gcry_keccak_permute_armv7_neon:
+
+    push   {ip, lr}
+    vpush  {q4-q7}
+    sub    sp,sp, #5*8
+
+    vldr.64  d0,  [r0, #0*8]
+    vldr.64  d12, [r0, #1*8]
+    vldr.64  d17, [r0, #2*8]
+    vldr.64  d22, [r0, #3*8]
+    vldr.64  d27, [r0, #4*8]
+
+    GET_DATA_POINTER(ip, _gcry_keccak_round_consts_64bit, lr);
+
+    vldr.64  d1,  [r0, #5*8]
+    vldr.64  d13, [r0, #6*8]
+    vldr.64  d18, [r0, #7*8]
+    vldr.64  d23, [r0, #8*8]
+    vldr.64  d28, [r0, #9*8]
+
+    vldr.64  d2,  [r0, #10*8]
+    vldr.64  d14, [r0, #11*8]
+    vldr.64  d19, [r0, #12*8]
+    vldr.64  d24, [r0, #13*8]
+    vldr.64  d29, [r0, #14*8]
+
+    vldr.64  d3,  [r0, #15*8]
+    vldr.64  d15, [r0, #16*8]
+    vldr.64  d20, [r0, #17*8]
+    vldr.64  d25, [r0, #18*8]
+    vldr.64  d30, [r0, #19*8]
+
+    vldr.64  d4,  [r0, #20*8]
+    vldr.64  d16, [r0, #21*8]
+    vldr.64  d21, [r0, #22*8]
+    vldr.64  d26, [r0, #23*8]
+    vldr.64  d31, [r0, #24*8]
+
+    vstr.64  d0, [sp, #Aba]
+    vstr.64  d1, [sp, #Aga]
+    veor.64 q0, q0, q1
+    vstr.64  d2, [sp, #Aka]
+    veor.64 d5, d0,  d1
+    vstr.64  d3, [sp, #Ama]
+    mov      r1, r0
+    vstr.64  d4, [sp, #Asa]
+    veor.64 d5, d5,  d4
+
+    bl KeccakF_armv7a_neon_asm
+
+    vpop.64  { d0- d4 }
+
+    vstr.64  d0,  [r1, #0*8]
+    vstr.64  d12, [r1, #1*8]
+    vstr.64  d17, [r1, #2*8]
+    vstr.64  d22, [r1, #3*8]
+    vstr.64  d27, [r1, #4*8]
+
+    vstr.64  d1,  [r1, #5*8]
+    vstr.64  d13, [r1, #6*8]
+    vstr.64  d18, [r1, #7*8]
+    vstr.64  d23, [r1, #8*8]
+    vstr.64  d28, [r1, #9*8]
+
+    vstr.64  d2,  [r1, #10*8]
+    vstr.64  d14, [r1, #11*8]
+    vstr.64  d19, [r1, #12*8]
+    vstr.64  d24, [r1, #13*8]
+    vstr.64  d29, [r1, #14*8]
+
+    vstr.64  d3,  [r1, #15*8]
+    vstr.64  d15, [r1, #16*8]
+    vstr.64  d20, [r1, #17*8]
+    vstr.64  d25, [r1, #18*8]
+    vstr.64  d30, [r1, #19*8]
+
+    vstr.64  d4,  [r1, #20*8]
+    vstr.64  d16, [r1, #21*8]
+    vstr.64  d21, [r1, #22*8]
+    vstr.64  d26, [r1, #23*8]
+    vstr.64  d31, [r1, #24*8]
+
+    mov   r0, #112
+    vpop  {q4-q7}
+    pop   {ip, pc}
+.p2align 2
+.ltorg
+.size _gcry_keccak_permute_armv7_neon,.-_gcry_keccak_permute_armv7_neon;
+
+@//unsigned _gcry_keccak_absorb_lanes64_armv7_neon(u64 *state, @r4
+@					    int pos,    @r1
+@					    const byte *lanes,   @r2
+@					    unsigned int nlanes, @r3
+@					    int blocklanes) @ r5 callable from C
+.p2align 3
+.global   _gcry_keccak_absorb_lanes64_armv7_neon
+.type  _gcry_keccak_absorb_lanes64_armv7_neon,%function;
+_gcry_keccak_absorb_lanes64_armv7_neon:
+
+    cmp    r3, #0	@ nlanes == 0
+    itt eq
+    moveq  r0, #0
+    bxeq   lr
+
+    push   {r4-r5, ip, lr}
+    beq    .Lout
+    mov    r4, r0
+    ldr    r5, [sp, #(4*4)]
+    vpush  {q4-q7}
+
+    @ load state
+    vldr.64  d0,  [r4, #0*8]
+    vldr.64  d12, [r4, #1*8]
+    vldr.64  d17, [r4, #2*8]
+    vldr.64  d22, [r4, #3*8]
+    vldr.64  d27, [r4, #4*8]
+
+    GET_DATA_POINTER(ip, _gcry_keccak_round_consts_64bit, lr);
+
+    vldr.64  d1,  [r4, #5*8]
+    vldr.64  d13, [r4, #6*8]
+    vldr.64  d18, [r4, #7*8]
+    vldr.64  d23, [r4, #8*8]
+    vldr.64  d28, [r4, #9*8]
+
+    vldr.64  d2,  [r4, #10*8]
+    vldr.64  d14, [r4, #11*8]
+    vldr.64  d19, [r4, #12*8]
+    vldr.64  d24, [r4, #13*8]
+    vldr.64  d29, [r4, #14*8]
+
+    vldr.64  d3,  [r4, #15*8]
+    vldr.64  d15, [r4, #16*8]
+    vldr.64  d20, [r4, #17*8]
+    vldr.64  d25, [r4, #18*8]
+    vldr.64  d30, [r4, #19*8]
+
+    vldr.64  d4,  [r4, #20*8]
+    vldr.64  d16, [r4, #21*8]
+    vldr.64  d21, [r4, #22*8]
+    vldr.64  d26, [r4, #23*8]
+    vldr.64  d31, [r4, #24*8]
+
+.Lmain_loop:
+
+    @ detect absorb mode (full blocks vs lanes)
+
+    cmp r1, #0		@ pos != 0
+    bne .Llanes_loop
+
+.Lmain_loop_pos0:
+
+    @ full blocks mode
+
+    @ switch (blocksize)
+    cmp r5, #21
+    beq .Lfull_block_21
+    cmp r5, #18
+    beq .Lfull_block_18
+    cmp r5, #17
+    beq .Lfull_block_17
+    cmp r5, #13
+    beq .Lfull_block_13
+    cmp r5, #9
+    beq .Lfull_block_9
+
+    @ unknown blocksize
+    b .Llanes_loop
+
+.Lfull_block_21:
+
+    @ SHAKE128
+
+    cmp r3, #21		@ nlanes < blocklanes
+    blo .Llanes_loop
+
+    sub    sp,sp, #5*8
+
+    vld1.64 {d5-d8}, [r2]!
+    veor d0,  d5
+    vld1.64 {d9-d11}, [r2]!
+    veor d12, d6
+    veor d17, d7
+    veor d22, d8
+    vld1.64 {d5-d8}, [r2]!
+    veor d27, d9
+
+    veor d1,  d10
+    veor d13, d11
+    vld1.64 {d9-d11}, [r2]!
+    veor d18, d5
+    veor d23, d6
+    veor d28, d7
+
+    veor d2,  d8
+    vld1.64 {d5-d8}, [r2]!
+    veor d14, d9
+    veor d19, d10
+    veor d24, d11
+    vld1.64 {d9-d11}, [r2]!
+    veor d29, d5
+
+    veor d3,  d6
+    veor d15, d7
+    veor d20, d8
+    veor d25, d9
+    veor d30, d10
+
+    veor d4,  d11
+
+    vstr.64  d0, [sp, #Aba]
+    vstr.64  d1, [sp, #Aga]
+    veor.64 q0, q0, q1
+    vstr.64  d2, [sp, #Aka]
+    veor.64 d5, d0,  d1
+    vstr.64  d3, [sp, #Ama]
+    vstr.64  d4, [sp, #Asa]
+    veor.64 d5, d5,  d4
+
+    bl KeccakF_armv7a_neon_asm
+
+    subs r3, #21	@ nlanes -= 21
+    vpop.64  { d0-d4 }
+
+    beq .Ldone
+
+    b .Lfull_block_21
+
+.Lfull_block_18:
+
+    @ SHA3-224
+
+    cmp r3, #18		@ nlanes < blocklanes
+    blo .Llanes_loop
+
+    sub    sp,sp, #5*8
+
+    vld1.64 {d5-d8}, [r2]!
+    veor d0,  d5
+    vld1.64 {d9-d11}, [r2]!
+    veor d12, d6
+    veor d17, d7
+    veor d22, d8
+    vld1.64 {d5-d8}, [r2]!
+    veor d27, d9
+
+    veor d1,  d10
+    veor d13, d11
+    vld1.64 {d9-d11}, [r2]!
+    veor d18, d5
+    veor d23, d6
+    veor d28, d7
+
+    veor d2,  d8
+    vld1.64 {d5-d8}, [r2]!
+    veor d14, d9
+    veor d19, d10
+    veor d24, d11
+    veor d29, d5
+
+    veor d3,  d6
+    veor d15, d7
+    veor d20, d8
+
+    vstr.64  d0, [sp, #Aba]
+    vstr.64  d1, [sp, #Aga]
+    veor.64 q0, q0, q1
+    vstr.64  d2, [sp, #Aka]
+    veor.64 d5, d0,  d1
+    vstr.64  d3, [sp, #Ama]
+    vstr.64  d4, [sp, #Asa]
+    veor.64 d5, d5,  d4
+
+    bl KeccakF_armv7a_neon_asm
+
+    subs r3, #18	@ nlanes -= 18
+    vpop.64  { d0-d4 }
+
+    beq .Ldone
+
+    b .Lfull_block_18
+
+.Lfull_block_17:
+
+    @ SHA3-256 & SHAKE256
+
+    cmp r3, #17		@ nlanes < blocklanes
+    blo .Llanes_loop
+
+    sub    sp,sp, #5*8
+
+    vld1.64 {d5-d8}, [r2]!
+    veor d0,  d5
+    vld1.64 {d9-d11}, [r2]!
+    veor d12, d6
+    veor d17, d7
+    veor d22, d8
+    vld1.64 {d5-d8}, [r2]!
+    veor d27, d9
+
+    veor d1,  d10
+    veor d13, d11
+    vld1.64 {d9-d11}, [r2]!
+    veor d18, d5
+    veor d23, d6
+    veor d28, d7
+
+    veor d2,  d8
+    vld1.64 {d5-d7}, [r2]!
+    veor d14, d9
+    veor d19, d10
+    veor d24, d11
+    veor d29, d5
+
+    veor d3,  d6
+    veor d15, d7
+
+    vstr.64  d0, [sp, #Aba]
+    vstr.64  d1, [sp, #Aga]
+    veor.64 q0, q0, q1
+    vstr.64  d2, [sp, #Aka]
+    veor.64 d5, d0,  d1
+    vstr.64  d3, [sp, #Ama]
+    vstr.64  d4, [sp, #Asa]
+    veor.64 d5, d5,  d4
+
+    bl KeccakF_armv7a_neon_asm
+
+    subs r3, #17	@ nlanes -= 17
+    vpop.64  { d0-d4 }
+
+    beq .Ldone
+
+    b .Lfull_block_17
+
+.Lfull_block_13:
+
+    @ SHA3-384
+
+    cmp r3, #13		@ nlanes < blocklanes
+    blo .Llanes_loop
+
+    sub    sp,sp, #5*8
+
+    vld1.64 {d5-d8}, [r2]!
+    veor d0,  d5
+    vld1.64 {d9-d11}, [r2]!
+    veor d12, d6
+    veor d17, d7
+    veor d22, d8
+    vld1.64 {d5-d8}, [r2]!
+    veor d27, d9
+
+    veor d1,  d10
+    veor d13, d11
+    vld1.64 {d9-d10}, [r2]!
+    veor d18, d5
+    veor d23, d6
+    veor d28, d7
+
+    veor d2,  d8
+    veor d14, d9
+    veor d19, d10
+
+    vstr.64  d0, [sp, #Aba]
+    vstr.64  d1, [sp, #Aga]
+    veor.64 q0, q0, q1
+    vstr.64  d2, [sp, #Aka]
+    veor.64 d5, d0,  d1
+    vstr.64  d3, [sp, #Ama]
+    vstr.64  d4, [sp, #Asa]
+    veor.64 d5, d5,  d4
+
+    bl KeccakF_armv7a_neon_asm
+
+    subs r3, #13	@ nlanes -= 13
+    vpop.64  { d0-d4 }
+
+    beq .Ldone
+
+    b .Lfull_block_13
+
+.Lfull_block_9:
+
+    @ SHA3-512
+
+    cmp r3, #9		@ nlanes < blocklanes
+    blo .Llanes_loop
+
+    sub    sp,sp, #5*8
+
+    vld1.64 {d5-d8}, [r2]!
+    veor d0,  d5
+    vld1.64 {d9-d11}, [r2]!
+    veor d12, d6
+    veor d17, d7
+    veor d22, d8
+    vld1.64 {d5-d6}, [r2]!
+    veor d27, d9
+
+    veor d1,  d10
+    veor d13, d11
+    veor d18, d5
+    veor d23, d6
+
+    vstr.64  d0, [sp, #Aba]
+    vstr.64  d1, [sp, #Aga]
+    veor.64 q0, q0, q1
+    vstr.64  d2, [sp, #Aka]
+    veor.64 d5, d0,  d1
+    vstr.64  d3, [sp, #Ama]
+    vstr.64  d4, [sp, #Asa]
+    veor.64 d5, d5,  d4
+
+    bl KeccakF_armv7a_neon_asm
+
+    subs r3, #9		@ nlanes -= 9
+    vpop.64  { d0-d4 }
+
+    beq .Ldone
+
+    b .Lfull_block_9
+
+.Llanes_loop:
+
+    @ per-lane mode
+
+    @ switch (pos)
+    ldrb r0, [pc, r1]
+    add pc, pc, r0, lsl #2
+.Lswitch_table:
+    .byte (.Llane0-.Lswitch_table-4)/4
+    .byte (.Llane1-.Lswitch_table-4)/4
+    .byte (.Llane2-.Lswitch_table-4)/4
+    .byte (.Llane3-.Lswitch_table-4)/4
+    .byte (.Llane4-.Lswitch_table-4)/4
+    .byte (.Llane5-.Lswitch_table-4)/4
+    .byte (.Llane6-.Lswitch_table-4)/4
+    .byte (.Llane7-.Lswitch_table-4)/4
+    .byte (.Llane8-.Lswitch_table-4)/4
+    .byte (.Llane9-.Lswitch_table-4)/4
+    .byte (.Llane10-.Lswitch_table-4)/4
+    .byte (.Llane11-.Lswitch_table-4)/4
+    .byte (.Llane12-.Lswitch_table-4)/4
+    .byte (.Llane13-.Lswitch_table-4)/4
+    .byte (.Llane14-.Lswitch_table-4)/4
+    .byte (.Llane15-.Lswitch_table-4)/4
+    .byte (.Llane16-.Lswitch_table-4)/4
+    .byte (.Llane17-.Lswitch_table-4)/4
+    .byte (.Llane18-.Lswitch_table-4)/4
+    .byte (.Llane19-.Lswitch_table-4)/4
+    .byte (.Llane20-.Lswitch_table-4)/4
+    .byte (.Llane21-.Lswitch_table-4)/4
+    .byte (.Llane22-.Lswitch_table-4)/4
+    .byte (.Llane23-.Lswitch_table-4)/4
+    .byte (.Llane24-.Lswitch_table-4)/4
+.p2align 2
+
+#define ABSORB_LANE(label, vreg) \
+    label: \
+      add     r1, #1; \
+      vld1.64 d5, [r2]!; \
+      cmp     r1, r5; /* pos == blocklanes */ \
+      veor    vreg, vreg, d5; \
+      beq     .Llanes_permute; \
+      subs    r3, #1; \
+      beq     .Ldone;
+
+    ABSORB_LANE(.Llane0, d0)
+    ABSORB_LANE(.Llane1, d12)
+    ABSORB_LANE(.Llane2, d17)
+    ABSORB_LANE(.Llane3, d22)
+    ABSORB_LANE(.Llane4, d27)
+
+    ABSORB_LANE(.Llane5, d1)
+    ABSORB_LANE(.Llane6, d13)
+    ABSORB_LANE(.Llane7, d18)
+    ABSORB_LANE(.Llane8, d23)
+    ABSORB_LANE(.Llane9, d28)
+
+    ABSORB_LANE(.Llane10, d2)
+    ABSORB_LANE(.Llane11, d14)
+    ABSORB_LANE(.Llane12, d19)
+    ABSORB_LANE(.Llane13, d24)
+    ABSORB_LANE(.Llane14, d29)
+
+    ABSORB_LANE(.Llane15, d3)
+    ABSORB_LANE(.Llane16, d15)
+    ABSORB_LANE(.Llane17, d20)
+    ABSORB_LANE(.Llane18, d25)
+    ABSORB_LANE(.Llane19, d30)
+
+    ABSORB_LANE(.Llane20, d4)
+    ABSORB_LANE(.Llane21, d16)
+    ABSORB_LANE(.Llane22, d21)
+    ABSORB_LANE(.Llane23, d26)
+    ABSORB_LANE(.Llane24, d31)
+
+    b .Llanes_loop
+
+.Llanes_permute:
+
+    sub    sp,sp, #5*8
+    vstr.64  d0, [sp, #Aba]
+    vstr.64  d1, [sp, #Aga]
+    veor.64 q0, q0, q1
+    vstr.64  d2, [sp, #Aka]
+    veor.64 d5, d0,  d1
+    vstr.64  d3, [sp, #Ama]
+    vstr.64  d4, [sp, #Asa]
+    veor.64 d5, d5,  d4
+
+    bl KeccakF_armv7a_neon_asm
+
+    mov  r1, #0   @ pos <= 0
+    subs r3, #1
+
+    vpop.64  { d0-d4 }
+
+    beq  .Ldone
+
+    b .Lmain_loop_pos0
+
+.Ldone:
+
+    @ save state
+    vstr.64  d0,  [r4, #0*8]
+    vstr.64  d12, [r4, #1*8]
+    vstr.64  d17, [r4, #2*8]
+    vstr.64  d22, [r4, #3*8]
+    vstr.64  d27, [r4, #4*8]
+
+    vstr.64  d1,  [r4, #5*8]
+    vstr.64  d13, [r4, #6*8]
+    vstr.64  d18, [r4, #7*8]
+    vstr.64  d23, [r4, #8*8]
+    vstr.64  d28, [r4, #9*8]
+
+    vstr.64  d2,  [r4, #10*8]
+    vstr.64  d14, [r4, #11*8]
+    vstr.64  d19, [r4, #12*8]
+    vstr.64  d24, [r4, #13*8]
+    vstr.64  d29, [r4, #14*8]
+
+    vstr.64  d3,  [r4, #15*8]
+    vstr.64  d15, [r4, #16*8]
+    vstr.64  d20, [r4, #17*8]
+    vstr.64  d25, [r4, #18*8]
+    vstr.64  d30, [r4, #19*8]
+
+    vstr.64  d4,  [r4, #20*8]
+    vstr.64  d16, [r4, #21*8]
+    vstr.64  d21, [r4, #22*8]
+    vstr.64  d26, [r4, #23*8]
+    vstr.64  d31, [r4, #24*8]
+
+    mov   r0, #120
+    vpop  {q4-q7}
+.Lout:
+    pop   {r4-r5, ip, pc}
+.p2align 2
+.ltorg
+.size _gcry_keccak_absorb_lanes64_armv7_neon,.-_gcry_keccak_absorb_lanes64_armv7_neon;
+
+#endif
diff --git a/cipher/keccak.c b/cipher/keccak.c
index ce57860..0bb3155 100644
--- a/cipher/keccak.c
+++ b/cipher/keccak.c
@@ -59,7 +59,19 @@
 #endif
 
 
-#ifdef USE_64BIT
+/* USE_64BIT_ARM_NEON indicates whether to enable 64-bit ARM/NEON assembly
+ * code. */
+#undef USE_64BIT_ARM_NEON
+#ifdef ENABLE_NEON_SUPPORT
+# if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \
+     && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \
+     && defined(HAVE_GCC_INLINE_ASM_NEON)
+#  define USE_64BIT_ARM_NEON 1
+# endif
+#endif /*ENABLE_NEON_SUPPORT*/
+
+
+#if defined(USE_64BIT) || defined(USE_64BIT_ARM_NEON)
 # define NEED_COMMON64 1
 #endif
 
@@ -109,7 +121,7 @@ typedef struct KECCAK_CONTEXT_S
 
 #ifdef NEED_COMMON64
 
-static const u64 round_consts_64bit[24] =
+const u64 _gcry_keccak_round_consts_64bit[24 + 1] =
 {
   U64_C(0x0000000000000001), U64_C(0x0000000000008082),
   U64_C(0x800000000000808A), U64_C(0x8000000080008000),
@@ -122,7 +134,8 @@ static const u64 round_consts_64bit[24] =
   U64_C(0x8000000000008002), U64_C(0x8000000000000080),
   U64_C(0x000000000000800A), U64_C(0x800000008000000A),
   U64_C(0x8000000080008081), U64_C(0x8000000000008080),
-  U64_C(0x0000000080000001), U64_C(0x8000000080008008)
+  U64_C(0x0000000080000001), U64_C(0x8000000080008008),
+  U64_C(0xFFFFFFFFFFFFFFFF)
 };
 
 static unsigned int
@@ -400,6 +413,54 @@ static const keccak_ops_t keccak_bmi2_64_ops =
 #endif /* USE_64BIT_BMI2 */
 
 
+/* 64-bit ARMv7/NEON implementation. */
+#ifdef USE_64BIT_ARM_NEON
+
+unsigned int _gcry_keccak_permute_armv7_neon(u64 *state);
+unsigned int _gcry_keccak_absorb_lanes64_armv7_neon(u64 *state, int pos,
+						    const byte *lanes,
+						    unsigned int nlanes,
+						    int blocklanes);
+
+static unsigned int keccak_permute64_armv7_neon(KECCAK_STATE *hd)
+{
+  return _gcry_keccak_permute_armv7_neon(hd->u.state64);
+}
+
+/* Absorb NLANES little-endian 64-bit lanes from LANES at state lane POS;
+ * BLOCKLANES < 0 means XOR only, otherwise defer to the NEON assembly. */
+static unsigned int
+keccak_absorb_lanes64_armv7_neon(KECCAK_STATE *hd, int pos, const byte *lanes,
+				 unsigned int nlanes, int blocklanes)
+{
+  if (blocklanes < 0)
+    {
+      /* blocklanes == -1, permutationless absorb from keccak_final. */
+      while (nlanes)
+	{
+	  /* Advance pos so each lane lands in its own state word. */
+	  hd->u.state64[pos++] ^= buf_get_le64(lanes);
+	  lanes += 8;
+	  nlanes--;
+	}
+
+      return 0;
+    }
+
+  return _gcry_keccak_absorb_lanes64_armv7_neon(hd->u.state64, pos, lanes,
+						nlanes, blocklanes);
+}
+
+static const keccak_ops_t keccak_armv7_neon_64_ops =
+{
+  .permute = keccak_permute64_armv7_neon,
+  .absorb = keccak_absorb_lanes64_armv7_neon,
+  .extract = keccak_extract64,
+};
+
+#endif /* USE_64BIT_ARM_NEON */
+
+
 /* Construct generic 32-bit implementation. */
 #ifdef USE_32BIT
 
@@ -662,6 +723,10 @@ keccak_init (int algo, void *context, unsigned int flags)
 
   /* Select optimized implementation based in hw features. */
   if (0) {}
+#ifdef USE_64BIT_ARM_NEON
+  else if (features & HWF_ARM_NEON)
+    ctx->ops = &keccak_armv7_neon_64_ops;
+#endif
 #ifdef USE_64BIT_BMI2
   else if (features & HWF_INTEL_BMI2)
     ctx->ops = &keccak_bmi2_64_ops;
diff --git a/cipher/keccak_permute_64.h b/cipher/keccak_permute_64.h
index 6f24217..1a80192 100644
--- a/cipher/keccak_permute_64.h
+++ b/cipher/keccak_permute_64.h
@@ -25,7 +25,7 @@
 static unsigned int
 KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd)
 {
-  const u64 *round_consts = round_consts_64bit;
+  const u64 *round_consts = _gcry_keccak_round_consts_64bit;
   u64 Aba, Abe, Abi, Abo, Abu;
   u64 Aga, Age, Agi, Ago, Agu;
   u64 Aka, Ake, Aki, Ako, Aku;
diff --git a/configure.ac b/configure.ac
index 2acfa36..ed37ab5 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2108,7 +2108,7 @@ if test "$found" = "1" ; then
 
    if test x"$neonsupport" = xyes ; then
      # Build with the NEON implementation
-     :
+     GCRYPT_DIGESTS="$GCRYPT_DIGESTS keccak-armv7-neon.lo"
    fi
 fi
 

commit 2857cb89c6dc1c02266600bc1fd2967a3cd5cf88
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date:   Sat Oct 31 21:29:56 2015 +0200

    Optimize Keccak 64-bit absorb functions
    
    * cipher/keccak.c [USE_64BIT] [__x86_64__] (absorb_lanes64_8)
    (absorb_lanes64_4, absorb_lanes64_2, absorb_lanes64_1): New.
    * cipher/keccak.c [USE_64BIT] [!__x86_64__] (absorb_lanes64_8)
    (absorb_lanes64_4, absorb_lanes64_2, absorb_lanes64_1): New.
    [USE_64BIT] (KECCAK_F1600_ABSORB_FUNC_NAME): New.
    [USE_64BIT] (keccak_absorb_lanes64): Remove.
    [USE_64BIT_SHLD] (KECCAK_F1600_ABSORB_FUNC_NAME): New.
    [USE_64BIT_SHLD] (keccak_absorb_lanes64_shld): Remove.
    [USE_64BIT_BMI2] (KECCAK_F1600_ABSORB_FUNC_NAME): New.
    [USE_64BIT_BMI2] (keccak_absorb_lanes64_bmi2): Remove.
    * cipher/keccak_permute_64.h (KECCAK_F1600_ABSORB_FUNC_NAME): New.
    --
    
    Optimize 64-bit absorb functions for small speed-up. After this
    change, 64-bit BMI2 implementation matches speed of fastest results
    from SUPERCOP for Intel Haswell CPUs (long messages).
    
    Benchmark on Intel Haswell @ 3.2 GHz:
    
    Before:
                    |  nanosecs/byte   mebibytes/sec   cycles/byte
     SHAKE128       |      2.32 ns/B     411.7 MiB/s      7.41 c/B
     SHAKE256       |      2.84 ns/B     336.2 MiB/s      9.08 c/B
     SHA3-224       |      2.69 ns/B     354.9 MiB/s      8.60 c/B
     SHA3-256       |      2.84 ns/B     336.0 MiB/s      9.08 c/B
     SHA3-384       |      3.69 ns/B     258.4 MiB/s     11.81 c/B
     SHA3-512       |      5.30 ns/B     179.9 MiB/s     16.97 c/B
    
    After:
                    |  nanosecs/byte   mebibytes/sec   cycles/byte
     SHAKE128       |      2.27 ns/B     420.6 MiB/s      7.26 c/B
     SHAKE256       |      2.79 ns/B     341.4 MiB/s      8.94 c/B
     SHA3-224       |      2.64 ns/B     361.7 MiB/s      8.44 c/B
     SHA3-256       |      2.79 ns/B     341.5 MiB/s      8.94 c/B
     SHA3-384       |      3.65 ns/B     261.4 MiB/s     11.68 c/B
     SHA3-512       |      5.27 ns/B     181.0 MiB/s     16.87 c/B
    
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>

diff --git a/cipher/keccak.c b/cipher/keccak.c
index f4f0ef3..ce57860 100644
--- a/cipher/keccak.c
+++ b/cipher/keccak.c
@@ -223,38 +223,105 @@ keccak_absorb_lane32bi(u32 *lane, u32 x0, u32 x1)
 /* Construct generic 64-bit implementation. */
 #ifdef USE_64BIT
 
+#if __GNUC__ >= 4 && defined(__x86_64__)
+
+static inline void absorb_lanes64_8(u64 *dst, const byte *in)
+{
+  asm ("movdqu 0*16(%[dst]), %%xmm0\n\t"
+       "movdqu 0*16(%[in]), %%xmm4\n\t"
+       "movdqu 1*16(%[dst]), %%xmm1\n\t"
+       "movdqu 1*16(%[in]), %%xmm5\n\t"
+       "movdqu 2*16(%[dst]), %%xmm2\n\t"
+       "movdqu 3*16(%[dst]), %%xmm3\n\t"
+       "pxor %%xmm4, %%xmm0\n\t"
+       "pxor %%xmm5, %%xmm1\n\t"
+       "movdqu 2*16(%[in]), %%xmm4\n\t"
+       "movdqu 3*16(%[in]), %%xmm5\n\t"
+       "movdqu %%xmm0, 0*16(%[dst])\n\t"
+       "pxor %%xmm4, %%xmm2\n\t"
+       "movdqu %%xmm1, 1*16(%[dst])\n\t"
+       "pxor %%xmm5, %%xmm3\n\t"
+       "movdqu %%xmm2, 2*16(%[dst])\n\t"
+       "movdqu %%xmm3, 3*16(%[dst])\n\t"
+       :
+       : [dst] "r" (dst), [in] "r" (in)
+       : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "memory");
+}
+
+static inline void absorb_lanes64_4(u64 *dst, const byte *in)
+{
+  asm ("movdqu 0*16(%[dst]), %%xmm0\n\t"
+       "movdqu 0*16(%[in]), %%xmm4\n\t"
+       "movdqu 1*16(%[dst]), %%xmm1\n\t"
+       "movdqu 1*16(%[in]), %%xmm5\n\t"
+       "pxor %%xmm4, %%xmm0\n\t"
+       "pxor %%xmm5, %%xmm1\n\t"
+       "movdqu %%xmm0, 0*16(%[dst])\n\t"
+       "movdqu %%xmm1, 1*16(%[dst])\n\t"
+       :
+       : [dst] "r" (dst), [in] "r" (in)
+       : "xmm0", "xmm1", "xmm4", "xmm5", "memory");
+}
+
+static inline void absorb_lanes64_2(u64 *dst, const byte *in)
+{
+  asm ("movdqu 0*16(%[dst]), %%xmm0\n\t"
+       "movdqu 0*16(%[in]), %%xmm4\n\t"
+       "pxor %%xmm4, %%xmm0\n\t"
+       "movdqu %%xmm0, 0*16(%[dst])\n\t"
+       :
+       : [dst] "r" (dst), [in] "r" (in)
+       : "xmm0", "xmm4", "memory");
+}
+
+#else /* __x86_64__ */
+
+static inline void absorb_lanes64_8(u64 *dst, const byte *in)
+{
+  dst[0] ^= buf_get_le64(in + 8 * 0);
+  dst[1] ^= buf_get_le64(in + 8 * 1);
+  dst[2] ^= buf_get_le64(in + 8 * 2);
+  dst[3] ^= buf_get_le64(in + 8 * 3);
+  dst[4] ^= buf_get_le64(in + 8 * 4);
+  dst[5] ^= buf_get_le64(in + 8 * 5);
+  dst[6] ^= buf_get_le64(in + 8 * 6);
+  dst[7] ^= buf_get_le64(in + 8 * 7);
+}
+
+static inline void absorb_lanes64_4(u64 *dst, const byte *in)
+{
+  dst[0] ^= buf_get_le64(in + 8 * 0);
+  dst[1] ^= buf_get_le64(in + 8 * 1);
+  dst[2] ^= buf_get_le64(in + 8 * 2);
+  dst[3] ^= buf_get_le64(in + 8 * 3);
+}
+
+static inline void absorb_lanes64_2(u64 *dst, const byte *in)
+{
+  dst[0] ^= buf_get_le64(in + 8 * 0);
+  dst[1] ^= buf_get_le64(in + 8 * 1);
+}
+
+#endif /* !__x86_64__ */
+
+static inline void absorb_lanes64_1(u64 *dst, const byte *in)
+{
+  dst[0] ^= buf_get_le64(in + 8 * 0);
+}
+
+
 # define ANDN64(x, y) (~(x) & (y))
 # define ROL64(x, n) (((x) << ((unsigned int)n & 63)) | \
 		      ((x) >> ((64 - (unsigned int)(n)) & 63)))
 
 # define KECCAK_F1600_PERMUTE_FUNC_NAME keccak_f1600_state_permute64
+# define KECCAK_F1600_ABSORB_FUNC_NAME keccak_absorb_lanes64
 # include "keccak_permute_64.h"
 
 # undef ANDN64
 # undef ROL64
 # undef KECCAK_F1600_PERMUTE_FUNC_NAME
-
-static unsigned int
-keccak_absorb_lanes64(KECCAK_STATE *hd, int pos, const byte *lanes,
-		      unsigned int nlanes, int blocklanes)
-{
-  unsigned int burn = 0;
-
-  while (nlanes)
-    {
-      hd->u.state64[pos] ^= buf_get_le64(lanes);
-      lanes += 8;
-      nlanes--;
-
-      if (++pos == blocklanes)
-	{
-	  burn = keccak_f1600_state_permute64(hd);
-	  pos = 0;
-	}
-    }
-
-  return burn;
-}
+# undef KECCAK_F1600_ABSORB_FUNC_NAME
 
 static const keccak_ops_t keccak_generic64_ops =
 {
@@ -279,33 +346,13 @@ static const keccak_ops_t keccak_generic64_ops =
 			tmp; })
 
 # define KECCAK_F1600_PERMUTE_FUNC_NAME keccak_f1600_state_permute64_shld
+# define KECCAK_F1600_ABSORB_FUNC_NAME keccak_absorb_lanes64_shld
 # include "keccak_permute_64.h"
 
 # undef ANDN64
 # undef ROL64
 # undef KECCAK_F1600_PERMUTE_FUNC_NAME
-
-static unsigned int
-keccak_absorb_lanes64_shld(KECCAK_STATE *hd, int pos, const byte *lanes,
-			   unsigned int nlanes, int blocklanes)
-{
-  unsigned int burn = 0;
-
-  while (nlanes)
-    {
-      hd->u.state64[pos] ^= buf_get_le64(lanes);
-      lanes += 8;
-      nlanes--;
-
-      if (++pos == blocklanes)
-	{
-	  burn = keccak_f1600_state_permute64_shld(hd);
-	  pos = 0;
-	}
-    }
-
-  return burn;
-}
+# undef KECCAK_F1600_ABSORB_FUNC_NAME
 
 static const keccak_ops_t keccak_shld_64_ops =
 {
@@ -335,33 +382,13 @@ static const keccak_ops_t keccak_shld_64_ops =
 			tmp; })
 
 # define KECCAK_F1600_PERMUTE_FUNC_NAME keccak_f1600_state_permute64_bmi2
+# define KECCAK_F1600_ABSORB_FUNC_NAME keccak_absorb_lanes64_bmi2
 # include "keccak_permute_64.h"
 
 # undef ANDN64
 # undef ROL64
 # undef KECCAK_F1600_PERMUTE_FUNC_NAME
-
-static unsigned int
-keccak_absorb_lanes64_bmi2(KECCAK_STATE *hd, int pos, const byte *lanes,
-			   unsigned int nlanes, int blocklanes)
-{
-  unsigned int burn = 0;
-
-  while (nlanes)
-    {
-      hd->u.state64[pos] ^= buf_get_le64(lanes);
-      lanes += 8;
-      nlanes--;
-
-      if (++pos == blocklanes)
-	{
-	  burn = keccak_f1600_state_permute64_bmi2(hd);
-	  pos = 0;
-	}
-    }
-
-  return burn;
-}
+# undef KECCAK_F1600_ABSORB_FUNC_NAME
 
 static const keccak_ops_t keccak_bmi2_64_ops =
 {
diff --git a/cipher/keccak_permute_64.h b/cipher/keccak_permute_64.h
index 1264f19..6f24217 100644
--- a/cipher/keccak_permute_64.h
+++ b/cipher/keccak_permute_64.h
@@ -288,3 +288,102 @@ KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd)
 
   return sizeof(void *) * 4 + sizeof(u64) * 12 * 5;
 }
+
+static unsigned int
+KECCAK_F1600_ABSORB_FUNC_NAME(KECCAK_STATE *hd, int pos, const byte *lanes,
+			      unsigned int nlanes, int blocklanes)
+{
+  unsigned int burn = 0;
+
+  while (nlanes)
+    {
+      switch (blocklanes)
+	{
+	case 21:
+	  /* SHAKE128 */
+	  while (pos == 0 && nlanes >= 21)
+	    {
+	      absorb_lanes64_8(&hd->u.state64[0], lanes + 8 * 0);
+	      absorb_lanes64_4(&hd->u.state64[8], lanes + 8 * 8);
+	      absorb_lanes64_8(&hd->u.state64[12], lanes + 8 * 12);
+	      absorb_lanes64_1(&hd->u.state64[20], lanes + 8 * 20);
+	      lanes += 8 * 21;
+	      nlanes -= 21;
+
+	      burn = KECCAK_F1600_PERMUTE_FUNC_NAME(hd);
+	    }
+	  break;
+
+	case 18:
+	  /* SHA3-224 */
+	  while (pos == 0 && nlanes >= 18)
+	    {
+	      absorb_lanes64_8(&hd->u.state64[0], lanes + 8 * 0);
+	      absorb_lanes64_2(&hd->u.state64[8], lanes + 8 * 8);
+	      absorb_lanes64_8(&hd->u.state64[10], lanes + 8 * 10);
+	      lanes += 8 * 18;
+	      nlanes -= 18;
+
+	      burn = KECCAK_F1600_PERMUTE_FUNC_NAME(hd);
+	    }
+	  break;
+
+	case 17:
+	  /* SHA3-256 & SHAKE256 */
+	  while (pos == 0 && nlanes >= 17)
+	    {
+	      absorb_lanes64_8(&hd->u.state64[0], lanes + 8 * 0);
+	      absorb_lanes64_8(&hd->u.state64[8], lanes + 8 * 8);
+	      absorb_lanes64_1(&hd->u.state64[16], lanes + 8 * 16);
+	      lanes += 8 * 17;
+	      nlanes -= 17;
+
+	      burn = KECCAK_F1600_PERMUTE_FUNC_NAME(hd);
+	    }
+	  break;
+
+	case 13:
+	  /* SHA3-384 */
+	  while (pos == 0 && nlanes >= 13)
+	    {
+	      absorb_lanes64_8(&hd->u.state64[0], lanes + 8 * 0);
+	      absorb_lanes64_4(&hd->u.state64[8], lanes + 8 * 8);
+	      absorb_lanes64_1(&hd->u.state64[12], lanes + 8 * 12);
+	      lanes += 8 * 13;
+	      nlanes -= 13;
+
+	      burn = KECCAK_F1600_PERMUTE_FUNC_NAME(hd);
+	    }
+	  break;
+
+	case 9:
+	  /* SHA3-512 */
+	  while (pos == 0 && nlanes >= 9)
+	    {
+	      absorb_lanes64_8(&hd->u.state64[0], lanes + 8 * 0);
+	      absorb_lanes64_1(&hd->u.state64[8], lanes + 8 * 8);
+	      lanes += 8 * 9;
+	      nlanes -= 9;
+
+	      burn = KECCAK_F1600_PERMUTE_FUNC_NAME(hd);
+	    }
+	  break;
+	}
+
+      while (nlanes)
+	{
+	  hd->u.state64[pos] ^= buf_get_le64(lanes);
+	  lanes += 8;
+	  nlanes--;
+
+	  if (++pos == blocklanes)
+	    {
+	      burn = KECCAK_F1600_PERMUTE_FUNC_NAME(hd);
+	      pos = 0;
+	      break;
+	    }
+	}
+    }
+
+  return burn;
+}

commit 07e4839e75a7bca3a6c0a94aecfe75efe61d7ff2
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date:   Sat Oct 31 20:19:59 2015 +0200

    Enable CRC test vectors with zero bytes
    
    * tests/basic.c (check_digests): Enable CRC test-vectors with zero
    bytes.
    --
    
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>

diff --git a/tests/basic.c b/tests/basic.c
index 0762a89..7d5de00 100644
--- a/tests/basic.c
+++ b/tests/basic.c
@@ -5851,16 +5851,12 @@ check_digests (void)
       {	GCRY_MD_CRC32_RFC1510, "test0123456789", "\xb8\x3e\x88\xd6" },
       {	GCRY_MD_CRC32_RFC1510, "MASSACHVSETTS INSTITVTE OF TECHNOLOGY",
 	"\xe3\x41\x80\xf7" },
-#if 0
-      {	GCRY_MD_CRC32_RFC1510, "\x80\x00", "\x3b\x83\x98\x4b" },
-      {	GCRY_MD_CRC32_RFC1510, "\x00\x08", "\x0e\xdb\x88\x32" },
-      {	GCRY_MD_CRC32_RFC1510, "\x00\x80", "\xed\xb8\x83\x20" },
-#endif
+      {	GCRY_MD_CRC32_RFC1510, "\x80\x00", "\x3b\x83\x98\x4b", 2 },
+      {	GCRY_MD_CRC32_RFC1510, "\x00\x08", "\x0e\xdb\x88\x32", 2 },
+      {	GCRY_MD_CRC32_RFC1510, "\x00\x80", "\xed\xb8\x83\x20", 2 },
       {	GCRY_MD_CRC32_RFC1510, "\x80", "\xed\xb8\x83\x20" },
-#if 0
-      {	GCRY_MD_CRC32_RFC1510, "\x80\x00\x00\x00", "\xed\x59\xb6\x3b" },
-      {	GCRY_MD_CRC32_RFC1510, "\x00\x00\x00\x01", "\x77\x07\x30\x96" },
-#endif
+      {	GCRY_MD_CRC32_RFC1510, "\x80\x00\x00\x00", "\xed\x59\xb6\x3b", 4 },
+      {	GCRY_MD_CRC32_RFC1510, "\x00\x00\x00\x01", "\x77\x07\x30\x96", 4 },
       { GCRY_MD_CRC32_RFC1510, "123456789", "\x2d\xfd\x2d\x88" },
       {	GCRY_MD_CRC24_RFC2440, "", "\xb7\x04\xce" },
       {	GCRY_MD_CRC24_RFC2440, "foo", "\x4f\xc2\x55" },

-----------------------------------------------------------------------

Summary of changes:
 cipher/Makefile.am         |   2 +-
 cipher/keccak-armv7-neon.S | 945 +++++++++++++++++++++++++++++++++++++++++++++
 cipher/keccak.c            | 220 ++++++++---
 cipher/keccak_permute_64.h | 101 ++++-
 cipher/tiger.c             | 104 ++---
 configure.ac               |   2 +-
 tests/basic.c              |  14 +-
 7 files changed, 1248 insertions(+), 140 deletions(-)
 create mode 100644 cipher/keccak-armv7-neon.S


hooks/post-receive
-- 
The GNU crypto library
http://git.gnupg.org


_______________________________________________
Gnupg-commits mailing list
Gnupg-commits at gnupg.org
http://lists.gnupg.org/mailman/listinfo/gnupg-commits

From jussi.kivilinna at iki.fi  Sat Nov  7 16:20:29 2015
From: jussi.kivilinna at iki.fi (Jussi Kivilinna)
Date: Sat, 07 Nov 2015 17:20:29 +0200
Subject: [PATCH] Tweak Keccak for small speed-up
Message-ID: <20151107152029.18057.23575.stgit@localhost6.localdomain6>

* cipher/keccak_permute_32.h (KECCAK_F1600_PERMUTE_FUNC_NAME): Track
rounds with round constant pointer instead of separate round counter.
* cipher/keccak_permute_64.h (KECCAK_F1600_PERMUTE_FUNC_NAME): Ditto.
(KECCAK_F1600_ABSORB_FUNC_NAME): Tweak lanes pointer increment for bulk
absorb loops.
--

Patch makes small tweaks to improve performance.

Benchmark on Intel Haswell @ 3.2 GHz:

Before:
                |  nanosecs/byte   mebibytes/sec   cycles/byte
 SHAKE128       |      2.27 ns/B     420.5 MiB/s      7.26 c/B
 SHAKE256       |      2.79 ns/B     341.4 MiB/s      8.94 c/B
 SHA3-224       |      2.64 ns/B     361.7 MiB/s      8.44 c/B
 SHA3-256       |      2.79 ns/B     341.4 MiB/s      8.94 c/B
 SHA3-384       |      3.65 ns/B     261.3 MiB/s     11.68 c/B
 SHA3-512       |      5.27 ns/B     181.0 MiB/s     16.86 c/B

After:
                |  nanosecs/byte   mebibytes/sec   cycles/byte
 SHAKE128       |      2.25 ns/B     423.5 MiB/s      7.21 c/B
 SHAKE256       |      2.77 ns/B     343.9 MiB/s      8.88 c/B
 SHA3-224       |      2.62 ns/B     364.1 MiB/s      8.38 c/B
 SHA3-256       |      2.77 ns/B     343.8 MiB/s      8.88 c/B
 SHA3-384       |      3.63 ns/B     262.6 MiB/s     11.63 c/B
 SHA3-512       |      5.23 ns/B     182.3 MiB/s     16.75 c/B

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 cipher/keccak_permute_32.h |   13 +++++++------
 cipher/keccak_permute_64.h |   44 ++++++++++++++++++++------------------------
 2 files changed, 27 insertions(+), 30 deletions(-)

diff --git a/cipher/keccak_permute_32.h b/cipher/keccak_permute_32.h
index fed9383..1ce42a4 100644
--- a/cipher/keccak_permute_32.h
+++ b/cipher/keccak_permute_32.h
@@ -27,6 +27,7 @@ static unsigned int
 KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd)
 {
   const u32 *round_consts = round_consts_32bit;
+  const u32 *round_consts_end = round_consts_32bit + 2 * 24;
   u32 Aba0, Abe0, Abi0, Abo0, Abu0;
   u32 Aba1, Abe1, Abi1, Abo1, Abu1;
   u32 Aga0, Age0, Agi0, Ago0, Agu0;
@@ -52,7 +53,6 @@ KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd)
   u32 Esa0, Ese0, Esi0, Eso0, Esu0;
   u32 Esa1, Ese1, Esi1, Eso1, Esu1;
   u32 *state = hd->u.state32bi;
-  unsigned int round;
 
   Aba0 = state[0];
   Aba1 = state[1];
@@ -105,7 +105,7 @@ KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd)
   Asu0 = state[48];
   Asu1 = state[49];
 
-  for (round = 0; round < 24; round += 2)
+  do
     {
       /* prepareTheta */
       BCa0 = Aba0 ^ Aga0 ^ Aka0 ^ Ama0 ^ Asa0;
@@ -142,7 +142,7 @@ KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd)
       Asu0 ^= Du0;
       BCu0 = ROL32(Asu0, 7);
       Eba0 = BCa0 ^ ANDN32(BCe0, BCi0);
-      Eba0 ^= round_consts[round * 2 + 0];
+      Eba0 ^= *(round_consts++);
       Ebe0 = BCe0 ^ ANDN32(BCi0, BCo0);
       Ebi0 = BCi0 ^ ANDN32(BCo0, BCu0);
       Ebo0 = BCo0 ^ ANDN32(BCu0, BCa0);
@@ -159,7 +159,7 @@ KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd)
       Asu1 ^= Du1;
       BCu1 = ROL32(Asu1, 7);
       Eba1 = BCa1 ^ ANDN32(BCe1, BCi1);
-      Eba1 ^= round_consts[round * 2 + 1];
+      Eba1 ^= *(round_consts++);
       Ebe1 = BCe1 ^ ANDN32(BCi1, BCo1);
       Ebi1 = BCi1 ^ ANDN32(BCo1, BCu1);
       Ebo1 = BCo1 ^ ANDN32(BCu1, BCa1);
@@ -328,7 +328,7 @@ KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd)
       Esu0 ^= Du0;
       BCu0 = ROL32(Esu0, 7);
       Aba0 = BCa0 ^ ANDN32(BCe0, BCi0);
-      Aba0 ^= round_consts[round * 2 + 2];
+      Aba0 ^= *(round_consts++);
       Abe0 = BCe0 ^ ANDN32(BCi0, BCo0);
       Abi0 = BCi0 ^ ANDN32(BCo0, BCu0);
       Abo0 = BCo0 ^ ANDN32(BCu0, BCa0);
@@ -345,7 +345,7 @@ KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd)
       Esu1 ^= Du1;
       BCu1 = ROL32(Esu1, 7);
       Aba1 = BCa1 ^ ANDN32(BCe1, BCi1);
-      Aba1 ^= round_consts[round * 2 + 3];
+      Aba1 ^= *(round_consts++);
       Abe1 = BCe1 ^ ANDN32(BCi1, BCo1);
       Abi1 = BCi1 ^ ANDN32(BCo1, BCu1);
       Abo1 = BCo1 ^ ANDN32(BCu1, BCa1);
@@ -479,6 +479,7 @@ KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd)
       Aso1 = BCo1 ^ ANDN32(BCu1, BCa1);
       Asu1 = BCu1 ^ ANDN32(BCa1, BCe1);
     }
+  while (round_consts < round_consts_end);
 
   state[0] = Aba0;
   state[1] = Aba1;
diff --git a/cipher/keccak_permute_64.h b/cipher/keccak_permute_64.h
index 1a80192..b28c871 100644
--- a/cipher/keccak_permute_64.h
+++ b/cipher/keccak_permute_64.h
@@ -26,6 +26,7 @@ static unsigned int
 KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd)
 {
   const u64 *round_consts = _gcry_keccak_round_consts_64bit;
+  const u64 *round_consts_end = _gcry_keccak_round_consts_64bit + 24;
   u64 Aba, Abe, Abi, Abo, Abu;
   u64 Aga, Age, Agi, Ago, Agu;
   u64 Aka, Ake, Aki, Ako, Aku;
@@ -39,7 +40,6 @@ KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd)
   u64 Ema, Eme, Emi, Emo, Emu;
   u64 Esa, Ese, Esi, Eso, Esu;
   u64 *state = hd->u.state64;
-  unsigned int round;
 
   Aba = state[0];
   Abe = state[1];
@@ -67,7 +67,7 @@ KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd)
   Aso = state[23];
   Asu = state[24];
 
-  for (round = 0; round < 24; round += 2)
+  do
     {
       /* prepareTheta */
       BCa = Aba ^ Aga ^ Aka ^ Ama ^ Asa;
@@ -94,7 +94,7 @@ KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd)
       Asu ^= Du;
       BCu = ROL64(Asu, 14);
       Eba = BCa ^ ANDN64(BCe, BCi);
-      Eba ^= (u64)round_consts[round];
+      Eba ^= *(round_consts++);
       Ebe = BCe ^ ANDN64(BCi, BCo);
       Ebi = BCi ^ ANDN64(BCo, BCu);
       Ebo = BCo ^ ANDN64(BCu, BCa);
@@ -189,7 +189,7 @@ KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd)
       Esu ^= Du;
       BCu = ROL64(Esu, 14);
       Aba = BCa ^ ANDN64(BCe, BCi);
-      Aba ^= (u64)round_consts[round + 1];
+      Aba ^= *(round_consts++);
       Abe = BCe ^ ANDN64(BCi, BCo);
       Abi = BCi ^ ANDN64(BCo, BCu);
       Abo = BCo ^ ANDN64(BCu, BCa);
@@ -259,6 +259,7 @@ KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd)
       Aso = BCo ^ ANDN64(BCu, BCa);
       Asu = BCu ^ ANDN64(BCa, BCe);
     }
+  while (round_consts < round_consts_end);
 
   state[0] = Aba;
   state[1] = Abe;
@@ -303,12 +304,11 @@ KECCAK_F1600_ABSORB_FUNC_NAME(KECCAK_STATE *hd, int pos, const byte *lanes,
 	  /* SHAKE128 */
 	  while (pos == 0 && nlanes >= 21)
 	    {
-	      absorb_lanes64_8(&hd->u.state64[0], lanes + 8 * 0);
-	      absorb_lanes64_4(&hd->u.state64[8], lanes + 8 * 8);
-	      absorb_lanes64_8(&hd->u.state64[12], lanes + 8 * 12);
-	      absorb_lanes64_1(&hd->u.state64[20], lanes + 8 * 20);
-	      lanes += 8 * 21;
 	      nlanes -= 21;
+	      absorb_lanes64_8(&hd->u.state64[0], lanes); lanes += 8 * 8;
+	      absorb_lanes64_8(&hd->u.state64[8], lanes); lanes += 8 * 8;
+	      absorb_lanes64_4(&hd->u.state64[16], lanes); lanes += 8 * 4;
+	      absorb_lanes64_1(&hd->u.state64[20], lanes); lanes += 8 * 1;
 
 	      burn = KECCAK_F1600_PERMUTE_FUNC_NAME(hd);
 	    }
@@ -318,11 +318,10 @@ KECCAK_F1600_ABSORB_FUNC_NAME(KECCAK_STATE *hd, int pos, const byte *lanes,
 	  /* SHA3-224 */
 	  while (pos == 0 && nlanes >= 18)
 	    {
-	      absorb_lanes64_8(&hd->u.state64[0], lanes + 8 * 0);
-	      absorb_lanes64_2(&hd->u.state64[8], lanes + 8 * 8);
-	      absorb_lanes64_8(&hd->u.state64[10], lanes + 8 * 10);
-	      lanes += 8 * 18;
 	      nlanes -= 18;
+	      absorb_lanes64_8(&hd->u.state64[0], lanes); lanes += 8 * 8;
+	      absorb_lanes64_8(&hd->u.state64[8], lanes); lanes += 8 * 8;
+	      absorb_lanes64_2(&hd->u.state64[16], lanes); lanes += 8 * 2;
 
 	      burn = KECCAK_F1600_PERMUTE_FUNC_NAME(hd);
 	    }
@@ -332,11 +331,10 @@ KECCAK_F1600_ABSORB_FUNC_NAME(KECCAK_STATE *hd, int pos, const byte *lanes,
 	  /* SHA3-256 & SHAKE256 */
 	  while (pos == 0 && nlanes >= 17)
 	    {
-	      absorb_lanes64_8(&hd->u.state64[0], lanes + 8 * 0);
-	      absorb_lanes64_8(&hd->u.state64[8], lanes + 8 * 8);
-	      absorb_lanes64_1(&hd->u.state64[16], lanes + 8 * 16);
-	      lanes += 8 * 17;
 	      nlanes -= 17;
+	      absorb_lanes64_8(&hd->u.state64[0], lanes); lanes += 8 * 8;
+	      absorb_lanes64_8(&hd->u.state64[8], lanes); lanes += 8 * 8;
+	      absorb_lanes64_1(&hd->u.state64[16], lanes); lanes += 8 * 1;
 
 	      burn = KECCAK_F1600_PERMUTE_FUNC_NAME(hd);
 	    }
@@ -346,11 +344,10 @@ KECCAK_F1600_ABSORB_FUNC_NAME(KECCAK_STATE *hd, int pos, const byte *lanes,
 	  /* SHA3-384 */
 	  while (pos == 0 && nlanes >= 13)
 	    {
-	      absorb_lanes64_8(&hd->u.state64[0], lanes + 8 * 0);
-	      absorb_lanes64_4(&hd->u.state64[8], lanes + 8 * 8);
-	      absorb_lanes64_1(&hd->u.state64[12], lanes + 8 * 12);
-	      lanes += 8 * 13;
 	      nlanes -= 13;
+	      absorb_lanes64_8(&hd->u.state64[0], lanes); lanes += 8 * 8;
+	      absorb_lanes64_4(&hd->u.state64[8], lanes); lanes += 8 * 4;
+	      absorb_lanes64_1(&hd->u.state64[12], lanes); lanes += 8 * 1;
 
 	      burn = KECCAK_F1600_PERMUTE_FUNC_NAME(hd);
 	    }
@@ -360,10 +357,9 @@ KECCAK_F1600_ABSORB_FUNC_NAME(KECCAK_STATE *hd, int pos, const byte *lanes,
 	  /* SHA3-512 */
 	  while (pos == 0 && nlanes >= 9)
 	    {
-	      absorb_lanes64_8(&hd->u.state64[0], lanes + 8 * 0);
-	      absorb_lanes64_1(&hd->u.state64[8], lanes + 8 * 8);
-	      lanes += 8 * 9;
 	      nlanes -= 9;
+	      absorb_lanes64_8(&hd->u.state64[0], lanes); lanes += 8 * 8;
+	      absorb_lanes64_1(&hd->u.state64[8], lanes); lanes += 8 * 1;
 
 	      burn = KECCAK_F1600_PERMUTE_FUNC_NAME(hd);
 	    }


From campbell+gcrypt at mumble.net  Sat Nov  7 23:08:27 2015
From: campbell+gcrypt at mumble.net (Taylor R Campbell)
Date: Sat, 7 Nov 2015 22:08:27 +0000
Subject: multiple timing side channels
Message-ID: <20151107220814.9C7BB60322@jupiter.mumble.net>

This morning I had occasion to glance at the libgcrypt source code,
and to my unpleasant surprise I found a collection of independent,
obvious timing side channels in the code:

- In twisted Edwards scalar multiplication, _gcry_mpi_ec_mul_point
branches depending on whether a bit in a secret scalar is set:

(mpi/ec.c, _gcry_mpi_ec_mul_point)
  1232                _gcry_mpi_ec_add_points (&tmppnt, result, point, ctx);
  1233                if (mpi_test_bit (scalar, j))
  1234                  point_set (result, &tmppnt);

Presumably this is intended to run in constant time, because the
scalar is a secret -- there's even a comment above saying `we use
constant time operation' in that case.  But secret-dependent branches
are not that.  (Easy fix: add point_swap_cond like point_set, using
mpi_cond_swap, and use it here.)

- In conditional mpi swapping, _gcry_mpi_cond_swap branches depending
on the swap condition:

(mpi/mpiutil.c, _gcry_mpi_cond_swap)
   576    mpi_limb_t mask = ((mpi_limb_t)0) - !!swap;

Some compilers may not turn this into a branch -- but some do.
Presumably this is intended to run in constant time because it is used
in code that operates on secrets, so I suggest it be documented as
such.  (Easy fix: saturate instead of !!, e.g. iterate swap |= (swap
<< (1<<i)) | (swap >> (1<<i)) for i in 0..6.)

- In mpi bit testing, _gcry_mpi_test_bit branches depending on the
value of the limb:

(mpi/mpi-bit.c, _gcry_mpi_test_bit)
   109      return (limb & (A_LIMB_1 << bitno))? 1: 0;

Again, some compilers may use a constant-time conditional move here --
but some will use a branch.  (Easy fix: return 1 & (limb >> bitno).)

- ~All general-purpose modular reduction involves numerator- and
denominator-dependent branches, e.g.:

(mpi/mpih-div.c, _gcry_mpih_divrem)
   319              if( n0 >= dX ) {

Here n0 and dX are limbs of the numerator and the denominator.  This
includes modular reduction for elliptic curve arithmetic, in which the
numerator is secret; and modular reduction for RSA, in which the
numerator (plaintext message) or denominator (p, q) can be secret.

- Many arithmetic routines normalize their inputs and outputs, so that
secrets flow into nlimbs and thence into loop counts and memory
reference patterns.


I don't have exploits for these -- but I hope the world is at a stage
in crypto engineering where it is not necessary to demonstrate remote
exploitability of every obvious secret-dependent branch and memory
reference.  These are all I found in half an hour of code inspection;
until I found the first two, I hadn't even thought to look for
timing side channels.

I started writing a patch, but the code is so riddled with
secret-dependent branches and memory references that it's not an easy
effort.  In order to avoid a new paper year after year demonstrating a
new timing side channel exploit on GnuPG, I suggest:

1. Avoid general-purpose division.  For divisors fixed by an
algorithm, divisor-specific reduction is usually better, especially
for divisors chosen for it such as 2^255 - 19 or 2^448 - 2^224 - 1.
For a priori unknown divisors, Montgomery reduction is likely much
faster anyway -- and together with Barrett reduction, it is never
necessary to combine attacker-controlled data with secrets in a
general-purpose division.  (For RSA, you need to divide a constant,
4^k, by p and by q, every time you load a key, but that's all.)

2. Eliminate mpi normalization.  Most k-bit multiprecision integers
that libgcrypt handles are uniformly distributed in [0, 2^k), or at
least in [0, p) where 2^(k - 1) <= p <= 2^k.  It's hard to imagine
that there's much value in saving a handful of integer operations on
the last limb once in every ~2^32 cases for an mpi operation -- but
this frequency is high enough that it's not hard to imagine devising a
timing attack where you learn something after a billion messages.

3. Eliminate mpi altogether for arithmetic in fixed finite fields,
such as GF(2^255 - 19) as used in Curve25519.  There's plenty of
easy-to-use, high-quality, high-performance, constant-time code to
compute it -- faster and more safely than the generic mpi code.

4. Eliminate the generic elliptic-curve abstraction, especially for
new curves.  For modern curve design, it offers no benefits over
curve-specific code, and makes variable-time code much more tempting.
Applications don't care that there are elliptic curves or points on
them involved -- applications deal in opaque octet strings.

5. Eliminate mpi_is_secure.  This interleaves code paths that may
operate on secret or public data.  Better to statically distinguish
the code paths that operate on secrets.  While it is OK to use code
paths designed for secret data on public data, making the code paths
conditional makes it harder to audit.  Auditability is critical for
code that millions of people rely on for crypto.

6. Aggressively reject all new code, and prune old code, that uses
secret-dependent branches and memory references.  Kocher's paper was
published in 1996; it shouldn't take twenty years for the world to
learn its lesson.  I understand why the RSA code written long ago
might be vulnerable -- but twisted Edwards arithmetic was designed
from the beginning to make constant-time evaluation easy.


From gniibe at fsij.org  Tue Nov 10 09:00:37 2015
From: gniibe at fsij.org (NIIBE Yutaka)
Date: Tue, 10 Nov 2015 17:00:37 +0900
Subject: multiple timing side channels
In-Reply-To: <20151107220814.9C7BB60322@jupiter.mumble.net>
References: <20151107220814.9C7BB60322@jupiter.mumble.net>
Message-ID: <5641A425.20103@fsij.org>

On 11/08/2015 07:08 AM, Taylor R Campbell wrote:
> This morning I had occasion to glance at the libgcrypt source code,
> and to my unpleasant surprise I found a collection of independent,
> obvious timing side channels in the code:

Thank you for your review and suggestions.  While I share your view,
it's not that easy to fix everything at once.  I'd like to fix the
issues one by one, as far as I am able.  I understand your suggestions,
but it is not clear to me how to achieve them.  When I implemented
Curve25519, I did my best on top of the existing code.

Let me fix where I can.


> - In twisted Edwards scalar multiplication, _gcry_mpi_ec_mul_point
> branches depending on whether a bit in a secret scalar is set:
> 
> (mpi/ec.c, _gcry_mpi_ec_mul_point)
>   1232                _gcry_mpi_ec_add_points (&tmppnt, result, point, ctx);
>   1233                if (mpi_test_bit (scalar, j))
>   1234                  point_set (result, &tmppnt);
> 
> Presumably this is intended to run in constant time, because the
> scalar is a secret -- there's even a comment above saying `we use
> constant time operation' in that case.  But secret-dependent branches
> are not that.  (Easy fix: add point_swap_cond like point_set, using
> mpi_cond_swap, and use it here.)

Yes.  I'll fix that.

> - In conditional mpi swapping, _gcry_mpi_cond_swap branches depending
> on the swap condition:
> 
> (mpi/mpiutil.c, _gcry_mpi_cond_swap)
>    576    mpi_limb_t mask = ((mpi_limb_t)0) - !!swap;
> 
> Some compilers may not turn this into a branch -- but some do.
> Presumably this is intended to run in constant time because it is used
> in code that operates on secrets, so I suggest it be documented as
> such.  (Easy fix: saturate instead of !!, e.g. iterate swap |= (swap
> << (1<<i)) | (swap >> (1<<i)) for i in 0..6.)

I understand your point.  It is possible for compilers to turn this
into a branch, that's true, but so far I haven't seen that happen
with the existing compilers used for libgcrypt (on supported
architectures).  Let me consider it.  It would also be good to
consider the API itself.

> - In mpi bit testing, _gcry_mpi_test_bit branches depending on the
> value of the limb:
> 
> (mpi/mpi-bit.c, _gcry_mpi_test_bit)
>    109      return (limb & (A_LIMB_1 << bitno))? 1: 0;
> 
> Again, some compilers may use a constant-time conditional move here --
> but some will use a branch.  (Easy fix: return 1 & (limb >> bitno).)

I understand your point.  While good compilers turn the expression
into the one you suggest, I'm afraid it might not be constant time on
some architectures which don't have a barrel shifter at the lower
level.  Let me consider it.

> - ~All general-purpose modular reduction involves numerator- and
> denominator-dependent branches, e.g.:
> 
> (mpi/mpih-div.c, _gcry_mpih_divrem)
>    319              if( n0 >= dX ) {
> 
> Here n0 and dX are limbs of the numerator and the denominator.  This
> includes modular reduction for elliptic curve arithmetic, in which the
> numerator is secret; and modular reduction for RSA, in which the
> numerator (plaintext message) or denominator (p, q) can be secret.
> 
> - Many arithmetic routines normalize their inputs and outputs, so that
> secrets flow into nlimbs and thence into loop counts and memory
> reference patterns.

Yes.  I understand.


For now, here is a patch to address the first issue.  Built and
tested.

diff --git a/mpi/ec.c b/mpi/ec.c
index 7266f2a..671cf78 100644
--- a/mpi/ec.c
+++ b/mpi/ec.c
@@ -138,6 +138,22 @@ point_set (mpi_point_t d, mpi_point_t s)
   mpi_set (d->z, s->z);
 }

+static void
+point_resize (mpi_point_t p, size_t nlimbs)
+{
+  mpi_resize (p->x, nlimbs);
+  mpi_resize (p->y, nlimbs);
+  mpi_resize (p->z, nlimbs);
+}
+
+static void
+point_swap_cond (mpi_point_t d, mpi_point_t s, unsigned long swap)
+{
+  mpi_swap_cond (d->x, s->x, swap);
+  mpi_swap_cond (d->y, s->y, swap);
+  mpi_swap_cond (d->z, s->z, swap);
+}
+

 /* Set the projective coordinates from POINT into X, Y, and Z.  If a
    coordinate is not required, X, Y, or Z may be passed as NULL.  */
@@ -1224,14 +1240,16 @@ _gcry_mpi_ec_mul_point (mpi_point_t result,
           /* If SCALAR is in secure memory we assume that it is the
              secret key we use constant time operation.  */
           mpi_point_struct tmppnt;
+          size_t nlimbs = 2*(nbits+BITS_PER_MPI_LIMB-1)/BITS_PER_MPI_LIMB+1;

           point_init (&tmppnt);
+          point_resize (result, nlimbs);
+          point_resize (&tmppnt, nlimbs);
           for (j=nbits-1; j >= 0; j--)
             {
               _gcry_mpi_ec_dup_point (result, result, ctx);
               _gcry_mpi_ec_add_points (&tmppnt, result, point, ctx);
-              if (mpi_test_bit (scalar, j))
-                point_set (result, &tmppnt);
+              point_swap_cond (result, &tmppnt, mpi_test_bit (scalar, j));
             }
           point_free (&tmppnt);
         }
-- 


From campbell+gcrypt at mumble.net  Tue Nov 10 18:49:12 2015
From: campbell+gcrypt at mumble.net (Taylor R Campbell)
Date: Tue, 10 Nov 2015 17:49:12 +0000
Subject: multiple timing side channels
In-Reply-To: <5641A425.20103@fsij.org> (gniibe@fsij.org)
Message-ID: <20151110174857.33A75604DD@jupiter.mumble.net>

   Date: Tue, 10 Nov 2015 17:00:37 +0900
   From: NIIBE Yutaka <gniibe at fsij.org>

   On 11/08/2015 07:08 AM, Taylor R Campbell wrote:
   > - In conditional mpi swapping, _gcry_mpi_cond_swap branches depending
   > on the swap condition:
   > 
   > (mpi/mpiutil.c, _gcry_mpi_cond_swap)
   >    576    mpi_limb_t mask = ((mpi_limb_t)0) - !!swap;
   > 
   > Some compilers may not turn this into a branch -- but some do.
   > Presumably this is intended to run in constant time because it is used
   > in code that operates on secrets, so I suggest it be documented as
   > such.  (Easy fix: saturate instead of !!, e.g. iterate swap |= (swap
   > << (1<<i)) | (swap >> (1<<i)) for i in 0..6.)

   I understand your point.  It is possible for compilers to turn this
   into a branch, that's true, but I haven't had any experience for
   existing compilers for libgcrypt (for supported architectures), so
   far.  Let me consider.  It would be also good considering its API
   itself.

It's certainly worth naming, with cheaper CPU-specific versions.  The
last time I ran into this, I wrote down that `it's not hard to find
CPU/compiler combinations with branches for ``!res'' ' -- but I
foolishly neglected to write down which combinations.

Another possibly cheaper generic option, with fewer shifts, is:

for (i = 1; i < CHAR_BIT*sizeof(swap); i <<= 1)
        swap |= swap >> i;
swap = ~((swap & 1) - 1);

(It is, of course, theoretically possible that a compiler would
translate even this into a conditional branch -- but I've never heard
of a compiler doing that, and that would be rather surprising to many
people.)

   > - In mpi bit testing, _gcry_mpi_test_bit branches depending on the
   > value of the limb:
   > 
   > (mpi/mpi-bit.c, _gcry_mpi_test_bit)
   >    109      return (limb & (A_LIMB_1 << bitno))? 1: 0;
   > 
   > Again, some compilers may use a constant-time conditional move here --
   > but some will use a branch.  (Easy fix: return 1 & (limb >> bitno).)

   I understand your point.  While good compilers turns the expression to
   the one you suggest, I'm afraid it could not be constant time on some
   architecture which doesn't have barrel shifter in lower level.  Let me
   consider.

In this case, bitno is not (or should not be) secret, so it's OK for
the time of the shift to vary depending on bitno.  What's not OK is
when the time of an operation varies depending on the value of limb --
that's secret.  The problem here is not the shift, but using the
secret (limb) in the condition part of a ?: expression.

   For now, here is a patch to address the first issue.  Built and
   tested.

Great, thanks!  That looks better.

Some other parts of _gcry_mpi_ec_mul_point look likely to be
problematic:

- `if (p1.z->nlimbs == 0) ...'  I don't see how this could be true --
  but if it can be, that's probably secret-dependent and thus leads to
  a timing leak.
- `if ( mpi_has_sign (k) )'  Even allowing for negative scalars seems
  to me likely to be a mistake.
- The rest of the routine, after branches for the twisted Edwards and
  Montgomery cases, I assume handles Weierstrass coordinates, for
  which timing side channels are not surprising -- but I started to
  list them before I realized it was for Weierstrass coordinates,
  since nothing in the routine says so.


From jussi.kivilinna at iki.fi  Tue Nov 10 21:09:20 2015
From: jussi.kivilinna at iki.fi (Jussi Kivilinna)
Date: Tue, 10 Nov 2015 22:09:20 +0200
Subject: multiple timing side channels
In-Reply-To: <20151110174857.33A75604DD@jupiter.mumble.net>
References: <20151110174857.33A75604DD@jupiter.mumble.net>
Message-ID: <56424EF0.2090707@iki.fi>

On 10.11.2015 19:49, Taylor R Campbell wrote:
>    Date: Tue, 10 Nov 2015 17:00:37 +0900
>    From: NIIBE Yutaka <gniibe at fsij.org>
> 
>    On 11/08/2015 07:08 AM, Taylor R Campbell wrote:
>    > - In conditional mpi swapping, _gcry_mpi_cond_swap branches depending
>    > on the swap condition:
>    > 
>    > (mpi/mpiutil.c, _gcry_mpi_cond_swap)
>    >    576    mpi_limb_t mask = ((mpi_limb_t)0) - !!swap;
>    > 
>    > Some compilers may not turn this into a branch -- but some do.
>    > Presumably this is intended to run in constant time because it is used
>    > in code that operates on secrets, so I suggest it be documented as
>    > such.  (Easy fix: saturate instead of !!, e.g. iterate swap |= (swap
>    > << (1<<i)) | (swap >> (1<<i)) for i in 0..6.)
> 
>    I understand your point.  It is possible for compilers to turn this
>    into a branch, that's true, but I haven't had any experience for
>    existing compilers for libgcrypt (for supported architectures), so
>    far.  Let me consider.  It would be also good considering its API
>    itself.
> 
> It's certainly worth naming, with cheaper CPU-specific versions.  The
> last time I ran into this, I wrote down that `it's not hard to find
> CPU/compiler combinations with branches for ``!res'' ' -- but I
> foolishly neglected to write down which combinations.
> 
> Another possibly cheaper generic option, with fewer shifts, is:
> 
> for (i = 1; i < CHAR_BIT*sizeof(swap); i <<= 1)
>         swap |= swap >> i;
> swap = ~((swap & 1) - 1);
> 
> (It is, of course, theoretically possible for a compiler would
> translate even this into a conditional branch -- but I've never heard
> of a compiler doing that, and that would be rather surprising to many
> people.)
> 

Another option is to do '!!' by bitwise ORing the number with its
negation and extracting the sign bit, which will be set only if the
number was non-zero:

 /* Convert non-zero values to '1' and zero to '0' without using a
  * comparison or the '!' operator: the result is derived from VAL
  * purely with negation, OR, shift and mask operations.
  *
  * VAL is the value to test.  Returns 1 if VAL != 0, otherwise 0.
  * CHAR_BIT comes from <limits.h>; the bit-width computation assumes
  * 'unsigned long' has no padding bits. */

 static inline int is_not_zero(unsigned long val)
 {
   /* Unsigned negation wraps modulo 2^N, so for any non-zero VAL at
    * least one of VAL and -VAL has the top bit set, while for zero
    * both are zero. */
   val |= -val; /* sign-bit will be set if 'val != 0' */
   /* Move the top bit down to bit 0 and mask it to yield 0 or 1. */
   return (val >> (CHAR_BIT * sizeof(val) - 1)) & 1;
 }

 ...

 mpi_limb_t mask = ((mpi_limb_t)0) - is_not_zero(swap);

With the above, GCC/x86-64 generates four instructions (in: swap = rdx,
out: mask = rdx):

 mov    %rdx,%rax
 neg    %rax
 or     %rax,%rdx
 sar    $0x3f,%rdx

Which is the same number as with the original '!!swap' (in: swap = rdx,
out: mask = r10):

 xor    %r10d,%r10d
 test   %rdx,%rdx
 setne  %r10b
 neg    %r10

-Jussi


From yunlian at google.com  Fri Nov 13 18:43:29 2015
From: yunlian at google.com (Yunlian Jiang)
Date: Fri, 13 Nov 2015 09:43:29 -0800
Subject: [PATCH] Enable --noexecstack with -g for clang
Message-ID: <CAMsPy2vfoRECQ8_Hdrb4DtLWq9V-m6EJDwESPVxbYh7JYsX_iw@mail.gmail.com>

modify the configuration a little bit.

When I try to build libgcrypt with clang, the --noexecstack is not
used with CFLAGS="-g". This is a similar bug to
 https://sourceware.org/bugzilla/show_bug.cgi?id=6428

diff --git a/m4/noexecstack.m4 b/m4/noexecstack.m4
index 4aab484..ef1bafd 100644
--- a/m4/noexecstack.m4
+++ b/m4/noexecstack.m4
@@ -38,7 +38,7 @@ EOF
   if AC_TRY_COMMAND([${CC} $CFLAGS $CPPFLAGS
                      -S -o conftest.s conftest.c >/dev/null]) \
      && grep .note.GNU-stack conftest.s >/dev/null \
-     && AC_TRY_COMMAND([${CCAS} $CCASFLAGS $CPPFLAGS -Wa,--noexecstack
+     && AC_TRY_COMMAND([${CCAS} $ASFLAGS -Wa,--noexecstack
                        -c -o conftest.o conftest.s >/dev/null])
   then
     cl_cv_as_noexecstack=yes


From gniibe at fsij.org  Mon Nov 16 04:20:56 2015
From: gniibe at fsij.org (NIIBE Yutaka)
Date: Mon, 16 Nov 2015 12:20:56 +0900
Subject: ecc: Montgomery curve always uses the prefix 0x40
Message-ID: <56494B98.4020103@fsij.org>

Hello,

I'm checking GnuPG's Curve25519 implementation.

Currently, in the GnuPG implementation, the ECDH shared point and
ephemeral public key (with x-coordinate only) are represented in native
little-endian format with no prefix.

I think that it should be prefixed by 0x40, so that it can be also
accessed as a fixed size MPI.

Here is the change for libgcrypt, so that libgcrypt always uses the
prefix 0x40 for the representation of a key for a Montgomery curve
(regardless of the PUBKEY_FLAG_COMP flag).

(The change for GnuPG will be posted soon, after this patch is
committed.)

OK to commit?

diff --git a/cipher/ecc-misc.c b/cipher/ecc-misc.c
index 2f2e593..79708f2 100644
--- a/cipher/ecc-misc.c
+++ b/cipher/ecc-misc.c
@@ -292,6 +292,7 @@ _gcry_ecc_compute_public (mpi_point_t Q, mpi_ec_t ec,
 gpg_err_code_t
 _gcry_ecc_mont_decodepoint (gcry_mpi_t pk, mpi_ec_t ctx, mpi_point_t result)
 {
+  unsigned char *a;
   unsigned char *rawmpi;
   unsigned int rawmpilen;

@@ -311,8 +312,8 @@ _gcry_ecc_mont_decodepoint (gcry_mpi_t pk, mpi_ec_t ctx, mpi_point_t result)
           buf++;
         }

-      rawmpi = xtrymalloc (rawmpilen? rawmpilen:1);
-      if (!rawmpi)
+      a = rawmpi = xtrymalloc (rawmpilen? rawmpilen:1);
+      if (!a)
         return gpg_err_code_from_syserror ();

       p = rawmpi + rawmpilen;
@@ -321,16 +322,19 @@ _gcry_ecc_mont_decodepoint (gcry_mpi_t pk, mpi_ec_t ctx, mpi_point_t result)
     }
   else
     {
-      /* Note: Without using an opaque MPI it is not reliable possible
-         to find out whether the public key has been given in
-         uncompressed format.  Thus we expect native EdDSA format.  */
-      rawmpi = _gcry_mpi_get_buffer (pk, ctx->nbits/8, &rawmpilen, NULL);
-      if (!rawmpi)
+      a = rawmpi = _gcry_mpi_get_buffer (pk, ctx->nbits/8, &rawmpilen, NULL);
+      if (!a)
         return gpg_err_code_from_syserror ();
+      /*
+       * It is not reliable to assume that 0x40 means the prefix.
+       * Now, we always put 0x40 for X-only coordinate, it is reliable.
+       */
+      if (a[0] == 0x40 && (rawmpilen%2))
+	rawmpi++;
     }

   _gcry_mpi_set_buffer (result->x, rawmpi, rawmpilen, 0);
-  xfree (rawmpi);
+  xfree (a);
   mpi_set_ui (result->z, 1);

   return 0;
diff --git a/cipher/ecc.c b/cipher/ecc.c
index 4958fbb..6932489 100644
--- a/cipher/ecc.c
+++ b/cipher/ecc.c
@@ -606,17 +606,14 @@ ecc_generate (const gcry_sexp_t genparms, gcry_sexp_t *r_skey)
                                           &encpk, &encpklen);
       else
         {
-          int off = !!(flags & PUBKEY_FLAG_COMP);
-
-          encpk = _gcry_mpi_get_buffer_extra (Qx, ctx->nbits/8, off?-1:0,
+          encpk = _gcry_mpi_get_buffer_extra (Qx, ctx->nbits/8, -1,
                                               &encpklen, NULL);
           if (encpk == NULL)
             rc = gpg_err_code_from_syserror ();
           else
             {
-              if (off)
-                encpk[0] = 0x40;
-              encpklen += off;
+              encpk[0] = 0x40;
+              encpklen++;
             }
         }
       if (rc)
@@ -1374,11 +1371,13 @@ ecc_encrypt_raw (gcry_sexp_t *r_ciph, gcry_sexp_t s_data, gcry_sexp_t keyparms)
       mpi_s = _gcry_ecc_ec2os (x, y, pk.E.p);
     else
       {
-        rawmpi = _gcry_mpi_get_buffer (x, ec->nbits/8, &rawmpilen, NULL);
+        rawmpi = _gcry_mpi_get_buffer_extra (x, ec->nbits/8, -1,
+                                             &rawmpilen, NULL);
         if (!rawmpi)
           rc = gpg_err_code_from_syserror ();
         else
           {
+            rawmpi[0] = 0x40;
             mpi_s = mpi_new (0);
             mpi_set_opaque (mpi_s, rawmpi, rawmpilen*8);
           }
@@ -1393,11 +1392,13 @@ ecc_encrypt_raw (gcry_sexp_t *r_ciph, gcry_sexp_t s_data, gcry_sexp_t keyparms)
       mpi_e = _gcry_ecc_ec2os (x, y, pk.E.p);
     else
       {
-        rawmpi = _gcry_mpi_get_buffer (x, ec->nbits/8, &rawmpilen, NULL);
+        rawmpi = _gcry_mpi_get_buffer_extra (x, ec->nbits/8, -1,
+                                             &rawmpilen, NULL);
         if (!rawmpi)
           rc = gpg_err_code_from_syserror ();
         else
           {
+            rawmpi[0] = 0x40;
             mpi_e = mpi_new (0);
             mpi_set_opaque (mpi_e, rawmpi, rawmpilen*8);
           }
@@ -1587,11 +1588,13 @@ ecc_decrypt_raw (gcry_sexp_t *r_plain, gcry_sexp_t s_data, gcry_sexp_t keyparms)
         unsigned char *rawmpi;
         unsigned int rawmpilen;

-        rawmpi = _gcry_mpi_get_buffer (x, ec->nbits/8, &rawmpilen, NULL);
+        rawmpi = _gcry_mpi_get_buffer_extra (x, ec->nbits/8, -1,
+                                             &rawmpilen, NULL);
         if (!rawmpi)
           rc = gpg_err_code_from_syserror ();
         else
           {
+            rawmpi[0] = 0x40;
             r = mpi_new (0);
             mpi_set_opaque (r, rawmpi, rawmpilen*8);
           }
--


From cvs at cvs.gnupg.org  Tue Nov 17 11:20:22 2015
From: cvs at cvs.gnupg.org (by Justus Winter)
Date: Tue, 17 Nov 2015 11:20:22 +0100
Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.6.0-281-g0e39594
Message-ID: <E1ZydDV-0008Vq-MV@lists.gnupg.org>

This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "The GNU crypto library".

The branch, master has been updated
       via  0e395944b70c7a92a6437f6bcc14f287c19ce9de (commit)
      from  89fa74d6b3e58cd4fcd6e0939a35e46cbaca2ea0 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
commit 0e395944b70c7a92a6437f6bcc14f287c19ce9de
Author: Justus Winter <justus at g10code.com>
Date:   Mon Nov 16 12:18:47 2015 +0100

    Fix typos found using codespell
    
    * cipher/cipher-ocb.c: Fix typos.
    * cipher/des.c: Likewise.
    * cipher/dsa-common.c: Likewise.
    * cipher/ecc.c: Likewise.
    * cipher/pubkey.c: Likewise.
    * cipher/rsa-common.c: Likewise.
    * cipher/scrypt.c: Likewise.
    * random/random-csprng.c: Likewise.
    * random/random-fips.c: Likewise.
    * random/rndw32.c: Likewise.
    * src/cipher-proto.h: Likewise.
    * src/context.c: Likewise.
    * src/fips.c: Likewise.
    * src/gcrypt.h.in: Likewise.
    * src/global.c: Likewise.
    * src/sexp.c: Likewise.
    * tests/mpitests.c: Likewise.
    * tests/t-lock.c: Likewise.
    
    Signed-off-by: Justus Winter <justus at g10code.com>

diff --git a/cipher/cipher-ocb.c b/cipher/cipher-ocb.c
index a3a2c9b..6db1db3 100644
--- a/cipher/cipher-ocb.c
+++ b/cipher/cipher-ocb.c
@@ -307,7 +307,7 @@ _gcry_cipher_ocb_authenticate (gcry_cipher_hd_t c, const unsigned char *abuf,
       c->spec->encrypt (&c->context.c, l_tmp, l_tmp);
       buf_xor_1 (c->u_mode.ocb.aad_sum, l_tmp, OCB_BLOCK_LEN);
 
-      /* Mark AAD as finalized to avoid accidently calling this
+      /* Mark AAD as finalized to avoid accidentally calling this
          function again after a non-full block has been processed.  */
       c->u_mode.ocb.aad_finalized = 1;
     }
diff --git a/cipher/des.c b/cipher/des.c
index be62763..5c99f50 100644
--- a/cipher/des.c
+++ b/cipher/des.c
@@ -49,7 +49,7 @@
  * encrypt or decrypt data in 64bit blocks in Electronic Codebook Mode.
  *
  * (In the examples below the slashes at the beginning and ending of comments
- * are omited.)
+ * are omitted.)
  *
  * DES Example
  * -----------
@@ -68,7 +68,7 @@
  *     * Encrypt the plaintext *
  *     des_ecb_encrypt(context, plaintext, ciphertext);
  *
- *     * To recover the orginal plaintext from ciphertext use: *
+ *     * To recover the original plaintext from ciphertext use: *
  *     des_ecb_decrypt(context, ciphertext, recoverd);
  *
  *
diff --git a/cipher/dsa-common.c b/cipher/dsa-common.c
index a5e42a2..6f2c2f9 100644
--- a/cipher/dsa-common.c
+++ b/cipher/dsa-common.c
@@ -319,7 +319,7 @@ _gcry_dsa_gen_rfc6979_k (gcry_mpi_t *r_k,
 
   /* The caller may have requested that we introduce some extra loops.
      This is for example useful if the caller wants another value for
-     K because the last returned one yielded an R of 0.  Becuase this
+     K because the last returned one yielded an R of 0.  Because this
      is very unlikely we implement it in a straightforward way.  */
   if (extraloops)
     {
diff --git a/cipher/ecc.c b/cipher/ecc.c
index 4958fbb..bd3e754 100644
--- a/cipher/ecc.c
+++ b/cipher/ecc.c
@@ -30,7 +30,7 @@
       Ramiro Moreno Chiral
       Mikael Mylnikov (mmr)
   For use in Libgcrypt the code has been heavily modified and cleaned
-  up. In fact there is not much left of the orginally code except for
+  up. In fact there is not much left of the originally code except for
   some variable names and the text book implementaion of the sign and
   verification algorithms.  The arithmetic functions have entirely
   been rewritten and moved to mpi/ec.c.
diff --git a/cipher/pubkey.c b/cipher/pubkey.c
index e3842c0..b321a89 100644
--- a/cipher/pubkey.c
+++ b/cipher/pubkey.c
@@ -114,7 +114,7 @@ spec_from_name (const char *name)
  * set the function will only succeed if a private key has been given.
  * On success the spec is stored at R_SPEC.  On error NULL is stored
  * at R_SPEC and an error code returned.  If R_PARMS is not NULL and
- * the fucntion returns success, the parameter list below
+ * the function returns success, the parameter list below
  * "private-key" or "public-key" is stored there and the caller must
  * call gcry_sexp_release on it.
  */
@@ -945,7 +945,7 @@ _gcry_pk_selftest (int algo, int extended, selftest_report_func_t report)
   else
     {
       ec = GPG_ERR_PUBKEY_ALGO;
-      /* Fixme: We need to change the report fucntion to allow passing
+      /* Fixme: We need to change the report function to allow passing
          of an encryption mode (e.g. pkcs1, ecdsa, or ecdh).  */
       if (report)
         report ("pubkey", algo, "module",
diff --git a/cipher/rsa-common.c b/cipher/rsa-common.c
index b260142..7b56237 100644
--- a/cipher/rsa-common.c
+++ b/cipher/rsa-common.c
@@ -46,7 +46,7 @@ octet_string_from_mpi (unsigned char **r_frame, void *space,
 
 
 /* Encode {VALUE,VALUELEN} for an NBITS keys using the pkcs#1 block
-   type 2 padding.  On sucess the result is stored as a new MPI at
+   type 2 padding.  On success the result is stored as a new MPI at
    R_RESULT.  On error the value at R_RESULT is undefined.
 
    If {RANDOM_OVERRIDE, RANDOM_OVERRIDE_LEN} is given it is used as
@@ -675,7 +675,7 @@ _gcry_rsa_oaep_decode (unsigned char **r_result, size_t *r_resultlen,
     }
   db = seed + hlen;
 
-  /* To avoid choosen ciphertext attacks from now on we make sure to
+  /* To avoid chosen ciphertext attacks from now on we make sure to
      run all code even in the error case; this avoids possible timing
      attacks as described by Manger.  */
 
diff --git a/cipher/scrypt.c b/cipher/scrypt.c
index 3c21c2a..a05b5bf 100644
--- a/cipher/scrypt.c
+++ b/cipher/scrypt.c
@@ -246,7 +246,7 @@ _gcry_kdf_scrypt (const unsigned char *passwd, size_t passwdlen,
                   unsigned long iterations,
                   size_t dkLen, unsigned char *DK)
 {
-  u64 N = subalgo;    /* CPU/memory cost paramter.  */
+  u64 N = subalgo;    /* CPU/memory cost parameter.  */
   u32 r;              /* Block size.  */
   u32 p = iterations; /* Parallelization parameter.  */
 
diff --git a/random/random-csprng.c b/random/random-csprng.c
index da50fda..dbebe98 100644
--- a/random/random-csprng.c
+++ b/random/random-csprng.c
@@ -1234,7 +1234,7 @@ do_fast_random_poll (void)
 # endif /*!RUSAGE_SELF*/
 #endif /*HAVE_GETRUSAGE*/
 
-  /* Time and clock are availabe on all systems - so we better do it
+  /* Time and clock are available on all systems - so we better do it
      just in case one of the above functions didn't work.  */
   {
     time_t x = time(NULL);
@@ -1275,12 +1275,12 @@ _gcry_rngcsprng_fast_poll (void)
 
 
 static void
-read_random_source (enum random_origins orgin, size_t length, int level )
+read_random_source (enum random_origins origin, size_t length, int level)
 {
   if ( !slow_gather_fnc )
     log_fatal ("Slow entropy gathering module not yet initialized\n");
 
-  if ( slow_gather_fnc (add_randomness, orgin, length, level) < 0)
+  if (slow_gather_fnc (add_randomness, origin, length, level) < 0)
     log_fatal ("No way to gather entropy for the RNG\n");
 }
 
diff --git a/random/random-fips.c b/random/random-fips.c
index 0a76362..3a641b2 100644
--- a/random/random-fips.c
+++ b/random/random-fips.c
@@ -104,7 +104,7 @@ static size_t entropy_collect_buffer_size;     /* Allocated length.  */
 /* This random context type is used to track properties of one random
    generator. Thee context are usually allocated in secure memory so
    that the seed value is well protected.  There are a couble of guard
-   fields to help detecting applications accidently overwriting parts
+   fields to help detecting applications accidentally overwriting parts
    of the memory. */
 struct rng_context
 {
@@ -315,7 +315,7 @@ x931_get_dt (unsigned char *buffer, size_t length, rng_context_t rng_ctx)
     if (gettimeofday (&tv, NULL))
       log_fatal ("gettimeofday() failed: %s\n", strerror (errno));
 
-    /* The microseconds part is always less than 1 millon (0x0f4240).
+    /* The microseconds part is always less than 1 million (0x0f4240).
        Thus we don't care about the MSB and in addition shift it to
        the left by 4 bits.  */
     usec = tv.tv_usec;
diff --git a/random/rndw32.c b/random/rndw32.c
index 1325b18..1c0fc3d 100644
--- a/random/rndw32.c
+++ b/random/rndw32.c
@@ -955,7 +955,7 @@ _gcry_rndw32_gather_random_fast (void (*add)(const void*, size_t,
      However, the kernel appears to synchronise the TSCs across CPUs at
      boot time (it resets the TSC as part of its system init), so this
      shouldn't really be a problem.  Under WinCE it's completely platform-
-     dependant, if there's no hardware performance counter available, it
+     dependent, if there's no hardware performance counter available, it
      uses the 1ms system timer.
 
      Another feature of the TSC (although it doesn't really affect us here)
diff --git a/src/cipher-proto.h b/src/cipher-proto.h
index 3bca9c7..d1ddc5d 100644
--- a/src/cipher-proto.h
+++ b/src/cipher-proto.h
@@ -92,7 +92,7 @@ typedef const char *(*pk_get_curve_t)(gcry_sexp_t keyparms, int iterator,
 typedef gcry_sexp_t (*pk_get_curve_param_t)(const char *name);
 
 
-/* Module specification structure for public key algoritms.  */
+/* Module specification structure for public key algorithms.  */
 typedef struct gcry_pk_spec
 {
   int algo;
diff --git a/src/context.c b/src/context.c
index 94e5be9..f77878b 100644
--- a/src/context.c
+++ b/src/context.c
@@ -47,7 +47,7 @@ struct gcry_context
 
 /* Allocate a fresh generic context of contect TYPE and allocate
    LENGTH extra bytes for private use of the type handler. DEINIT is a
-   fucntion used called to deinitialize the private part; it may be
+   function used called to deinitialize the private part; it may be
    NULL if de-initialization is not required.  Returns NULL and sets
    ERRNO if memory allocation failed.  */
 gcry_ctx_t
diff --git a/src/fips.c b/src/fips.c
index 7939abd..edcbeac 100644
--- a/src/fips.c
+++ b/src/fips.c
@@ -102,7 +102,7 @@ _gcry_initialize_fips_mode (int force)
   static int done;
   gpg_error_t err;
 
-  /* Make sure we are not accidently called twice.  */
+  /* Make sure we are not accidentally called twice.  */
   if (done)
     {
       if ( fips_mode () )
@@ -377,7 +377,7 @@ _gcry_fips_is_operational (void)
              (GCRYCTL_INITIALIZATION_FINISHED) where the latter will
              run the selftests.  The drawback of these on-demand
              self-tests are a small chance that self-tests are
-             performed by severeal threads; that is no problem because
+             performed by several threads; that is no problem because
              our FSM make sure that we won't oversee any error. */
           unlock_fsm ();
           _gcry_fips_run_selftests (0);
diff --git a/src/gcrypt.h.in b/src/gcrypt.h.in
index 5ddeee3..93b1f43 100644
--- a/src/gcrypt.h.in
+++ b/src/gcrypt.h.in
@@ -583,7 +583,7 @@ gcry_error_t gcry_mpi_print (enum gcry_mpi_format format,
                              size_t *nwritten,
                              const gcry_mpi_t a);
 
-/* Convert the big integer A int the external representation described
+/* Convert the big integer A into the external representation described
    by FORMAT and store it in a newly allocated buffer which address
    will be put into BUFFER.  NWRITTEN receives the actual lengths of the
    external representation. */
@@ -1598,7 +1598,7 @@ gcry_error_t gcry_prime_generate (gcry_mpi_t *prime,
 /* Find a generator for PRIME where the factorization of (prime-1) is
    in the NULL terminated array FACTORS. Return the generator as a
    newly allocated MPI in R_G.  If START_G is not NULL, use this as
-   teh start for the search. */
+   the start for the search. */
 gcry_error_t gcry_prime_group_generator (gcry_mpi_t *r_g,
                                          gcry_mpi_t prime,
                                          gcry_mpi_t *factors,
diff --git a/src/global.c b/src/global.c
index 2290393..889de4c 100644
--- a/src/global.c
+++ b/src/global.c
@@ -305,7 +305,7 @@ print_config ( int (*fnc)(FILE *fp, const char *format, ...), FILE *fp)
       fnc (fp, "%s:", s);
   fnc (fp, "\n");
   /* We use y/n instead of 1/0 for the simple reason that Emacsen's
-     compile error parser would accidently flag that line when printed
+     compile error parser would accidentally flag that line when printed
      during "make check" as an error.  */
   fnc (fp, "fips-mode:%c:%c:\n",
        fips_mode ()? 'y':'n',
@@ -867,7 +867,7 @@ _gcry_free (void *p)
     return;
 
   /* In case ERRNO is set we better save it so that the free machinery
-     may not accidently change ERRNO.  We restore it only if it was
+     may not accidentally change ERRNO.  We restore it only if it was
      already set to comply with the usual C semantic for ERRNO.  */
   save_errno = errno;
   if (free_func)
diff --git a/src/sexp.c b/src/sexp.c
index f1bbffa..d063962 100644
--- a/src/sexp.c
+++ b/src/sexp.c
@@ -53,7 +53,7 @@
    data) is required as well.  The close_tag finishes the list and
    would actually be sufficient.  For fail-safe reasons a final stop
    tag is always the last byte in a buffer; it has a value of 0 so
-   that string function accidently applied to an S-expression will
+   that string function accidentally applied to an S-expression will
    never access unallocated data.  We do not support display hints and
    thus don't need to represent them.  A list may have more an
    arbitrary number of data elements but at least one is required.
diff --git a/tests/mpitests.c b/tests/mpitests.c
index d75aca9..e6f8525 100644
--- a/tests/mpitests.c
+++ b/tests/mpitests.c
@@ -426,7 +426,7 @@ test_mul (void)
 
 
 /* What we test here is that we don't overwrite our args and that
-   using thne same mpi for several args works.  */
+   using the same mpi for several args works.  */
 static int
 test_powm (void)
 {
diff --git a/tests/t-lock.c b/tests/t-lock.c
index 22b67ef..815f63b 100644
--- a/tests/t-lock.c
+++ b/tests/t-lock.c
@@ -229,7 +229,7 @@ init_accounts (void)
 }
 
 
-/* Check that the sum of all accounts matches the intial sum.  */
+/* Check that the sum of all accounts matches the initial sum.  */
 static void
 check_accounts (void)
 {
@@ -261,7 +261,7 @@ get_rand (int high)
 }
 
 
-/* Pick a random account.  Note that this fucntion is not
+/* Pick a random account.  Note that this function is not
    thread-safe. */
 static int
 pick_account (void)

-----------------------------------------------------------------------

Summary of changes:
 cipher/cipher-ocb.c    | 2 +-
 cipher/des.c           | 4 ++--
 cipher/dsa-common.c    | 2 +-
 cipher/ecc.c           | 2 +-
 cipher/pubkey.c        | 4 ++--
 cipher/rsa-common.c    | 4 ++--
 cipher/scrypt.c        | 2 +-
 random/random-csprng.c | 6 +++---
 random/random-fips.c   | 4 ++--
 random/rndw32.c        | 2 +-
 src/cipher-proto.h     | 2 +-
 src/context.c          | 2 +-
 src/fips.c             | 4 ++--
 src/gcrypt.h.in        | 4 ++--
 src/global.c           | 4 ++--
 src/sexp.c             | 2 +-
 tests/mpitests.c       | 2 +-
 tests/t-lock.c         | 4 ++--
 18 files changed, 28 insertions(+), 28 deletions(-)


hooks/post-receive
-- 
The GNU crypto library
http://git.gnupg.org


_______________________________________________
Gnupg-commits mailing list
Gnupg-commits at gnupg.org
http://lists.gnupg.org/mailman/listinfo/gnupg-commits


From cvs at cvs.gnupg.org  Wed Nov 18 08:52:43 2015
From: cvs at cvs.gnupg.org (by Jussi Kivilinna)
Date: Wed, 18 Nov 2015 08:52:43 +0100
Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.6.0-283-g6571a64
Message-ID: <E1ZyxO9-00069u-2o@lists.gnupg.org>

This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "The GNU crypto library".

The branch, master has been updated
       via  6571a64331839d7d952292163afbf34c8bef62e0 (commit)
       via  15ea0acf8bb0aa307eccc23024a0bd7878fb8080 (commit)
      from  0e395944b70c7a92a6437f6bcc14f287c19ce9de (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
commit 6571a64331839d7d952292163afbf34c8bef62e0
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date:   Wed Nov 18 09:44:18 2015 +0200

    Tweak Keccak for small speed-up
    
    * cipher/keccak_permute_32.h (KECCAK_F1600_PERMUTE_FUNC_NAME): Track
    rounds with round constant pointer instead of separate round counter.
    * cipher/keccak_permute_64.h (KECCAK_F1600_PERMUTE_FUNC_NAME): Ditto.
    (KECCAK_F1600_ABSORB_FUNC_NAME): Tweak lanes pointer increment for bulk
    absorb loops.
    --
    
    Patch makes small tweaks to improve performance.
    
    Benchmark on Intel Haswell @ 3.2 Ghz:
    
    Before:
                    |  nanosecs/byte   mebibytes/sec   cycles/byte
     SHAKE128       |      2.27 ns/B     420.5 MiB/s      7.26 c/B
     SHAKE256       |      2.79 ns/B     341.4 MiB/s      8.94 c/B
     SHA3-224       |      2.64 ns/B     361.7 MiB/s      8.44 c/B
     SHA3-256       |      2.79 ns/B     341.4 MiB/s      8.94 c/B
     SHA3-384       |      3.65 ns/B     261.3 MiB/s     11.68 c/B
     SHA3-512       |      5.27 ns/B     181.0 MiB/s     16.86 c/B
    
    After:
                    |  nanosecs/byte   mebibytes/sec   cycles/byte
     SHAKE128       |      2.25 ns/B     423.5 MiB/s      7.21 c/B
     SHAKE256       |      2.77 ns/B     343.9 MiB/s      8.88 c/B
     SHA3-224       |      2.62 ns/B     364.1 MiB/s      8.38 c/B
     SHA3-256       |      2.77 ns/B     343.8 MiB/s      8.88 c/B
     SHA3-384       |      3.63 ns/B     262.6 MiB/s     11.63 c/B
     SHA3-512       |      5.23 ns/B     182.3 MiB/s     16.75 c/B
    
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>

diff --git a/cipher/keccak_permute_32.h b/cipher/keccak_permute_32.h
index fed9383..1ce42a4 100644
--- a/cipher/keccak_permute_32.h
+++ b/cipher/keccak_permute_32.h
@@ -27,6 +27,7 @@ static unsigned int
 KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd)
 {
   const u32 *round_consts = round_consts_32bit;
+  const u32 *round_consts_end = round_consts_32bit + 2 * 24;
   u32 Aba0, Abe0, Abi0, Abo0, Abu0;
   u32 Aba1, Abe1, Abi1, Abo1, Abu1;
   u32 Aga0, Age0, Agi0, Ago0, Agu0;
@@ -52,7 +53,6 @@ KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd)
   u32 Esa0, Ese0, Esi0, Eso0, Esu0;
   u32 Esa1, Ese1, Esi1, Eso1, Esu1;
   u32 *state = hd->u.state32bi;
-  unsigned int round;
 
   Aba0 = state[0];
   Aba1 = state[1];
@@ -105,7 +105,7 @@ KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd)
   Asu0 = state[48];
   Asu1 = state[49];
 
-  for (round = 0; round < 24; round += 2)
+  do
     {
       /* prepareTheta */
       BCa0 = Aba0 ^ Aga0 ^ Aka0 ^ Ama0 ^ Asa0;
@@ -142,7 +142,7 @@ KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd)
       Asu0 ^= Du0;
       BCu0 = ROL32(Asu0, 7);
       Eba0 = BCa0 ^ ANDN32(BCe0, BCi0);
-      Eba0 ^= round_consts[round * 2 + 0];
+      Eba0 ^= *(round_consts++);
       Ebe0 = BCe0 ^ ANDN32(BCi0, BCo0);
       Ebi0 = BCi0 ^ ANDN32(BCo0, BCu0);
       Ebo0 = BCo0 ^ ANDN32(BCu0, BCa0);
@@ -159,7 +159,7 @@ KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd)
       Asu1 ^= Du1;
       BCu1 = ROL32(Asu1, 7);
       Eba1 = BCa1 ^ ANDN32(BCe1, BCi1);
-      Eba1 ^= round_consts[round * 2 + 1];
+      Eba1 ^= *(round_consts++);
       Ebe1 = BCe1 ^ ANDN32(BCi1, BCo1);
       Ebi1 = BCi1 ^ ANDN32(BCo1, BCu1);
       Ebo1 = BCo1 ^ ANDN32(BCu1, BCa1);
@@ -328,7 +328,7 @@ KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd)
       Esu0 ^= Du0;
       BCu0 = ROL32(Esu0, 7);
       Aba0 = BCa0 ^ ANDN32(BCe0, BCi0);
-      Aba0 ^= round_consts[round * 2 + 2];
+      Aba0 ^= *(round_consts++);
       Abe0 = BCe0 ^ ANDN32(BCi0, BCo0);
       Abi0 = BCi0 ^ ANDN32(BCo0, BCu0);
       Abo0 = BCo0 ^ ANDN32(BCu0, BCa0);
@@ -345,7 +345,7 @@ KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd)
       Esu1 ^= Du1;
       BCu1 = ROL32(Esu1, 7);
       Aba1 = BCa1 ^ ANDN32(BCe1, BCi1);
-      Aba1 ^= round_consts[round * 2 + 3];
+      Aba1 ^= *(round_consts++);
       Abe1 = BCe1 ^ ANDN32(BCi1, BCo1);
       Abi1 = BCi1 ^ ANDN32(BCo1, BCu1);
       Abo1 = BCo1 ^ ANDN32(BCu1, BCa1);
@@ -479,6 +479,7 @@ KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd)
       Aso1 = BCo1 ^ ANDN32(BCu1, BCa1);
       Asu1 = BCu1 ^ ANDN32(BCa1, BCe1);
     }
+  while (round_consts < round_consts_end);
 
   state[0] = Aba0;
   state[1] = Aba1;
diff --git a/cipher/keccak_permute_64.h b/cipher/keccak_permute_64.h
index 1a80192..b28c871 100644
--- a/cipher/keccak_permute_64.h
+++ b/cipher/keccak_permute_64.h
@@ -26,6 +26,7 @@ static unsigned int
 KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd)
 {
   const u64 *round_consts = _gcry_keccak_round_consts_64bit;
+  const u64 *round_consts_end = _gcry_keccak_round_consts_64bit + 24;
   u64 Aba, Abe, Abi, Abo, Abu;
   u64 Aga, Age, Agi, Ago, Agu;
   u64 Aka, Ake, Aki, Ako, Aku;
@@ -39,7 +40,6 @@ KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd)
   u64 Ema, Eme, Emi, Emo, Emu;
   u64 Esa, Ese, Esi, Eso, Esu;
   u64 *state = hd->u.state64;
-  unsigned int round;
 
   Aba = state[0];
   Abe = state[1];
@@ -67,7 +67,7 @@ KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd)
   Aso = state[23];
   Asu = state[24];
 
-  for (round = 0; round < 24; round += 2)
+  do
     {
       /* prepareTheta */
       BCa = Aba ^ Aga ^ Aka ^ Ama ^ Asa;
@@ -94,7 +94,7 @@ KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd)
       Asu ^= Du;
       BCu = ROL64(Asu, 14);
       Eba = BCa ^ ANDN64(BCe, BCi);
-      Eba ^= (u64)round_consts[round];
+      Eba ^= *(round_consts++);
       Ebe = BCe ^ ANDN64(BCi, BCo);
       Ebi = BCi ^ ANDN64(BCo, BCu);
       Ebo = BCo ^ ANDN64(BCu, BCa);
@@ -189,7 +189,7 @@ KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd)
       Esu ^= Du;
       BCu = ROL64(Esu, 14);
       Aba = BCa ^ ANDN64(BCe, BCi);
-      Aba ^= (u64)round_consts[round + 1];
+      Aba ^= *(round_consts++);
       Abe = BCe ^ ANDN64(BCi, BCo);
       Abi = BCi ^ ANDN64(BCo, BCu);
       Abo = BCo ^ ANDN64(BCu, BCa);
@@ -259,6 +259,7 @@ KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd)
       Aso = BCo ^ ANDN64(BCu, BCa);
       Asu = BCu ^ ANDN64(BCa, BCe);
     }
+  while (round_consts < round_consts_end);
 
   state[0] = Aba;
   state[1] = Abe;
@@ -303,12 +304,11 @@ KECCAK_F1600_ABSORB_FUNC_NAME(KECCAK_STATE *hd, int pos, const byte *lanes,
 	  /* SHAKE128 */
 	  while (pos == 0 && nlanes >= 21)
 	    {
-	      absorb_lanes64_8(&hd->u.state64[0], lanes + 8 * 0);
-	      absorb_lanes64_4(&hd->u.state64[8], lanes + 8 * 8);
-	      absorb_lanes64_8(&hd->u.state64[12], lanes + 8 * 12);
-	      absorb_lanes64_1(&hd->u.state64[20], lanes + 8 * 20);
-	      lanes += 8 * 21;
 	      nlanes -= 21;
+	      absorb_lanes64_8(&hd->u.state64[0], lanes); lanes += 8 * 8;
+	      absorb_lanes64_8(&hd->u.state64[8], lanes); lanes += 8 * 8;
+	      absorb_lanes64_4(&hd->u.state64[16], lanes); lanes += 8 * 4;
+	      absorb_lanes64_1(&hd->u.state64[20], lanes); lanes += 8 * 1;
 
 	      burn = KECCAK_F1600_PERMUTE_FUNC_NAME(hd);
 	    }
@@ -318,11 +318,10 @@ KECCAK_F1600_ABSORB_FUNC_NAME(KECCAK_STATE *hd, int pos, const byte *lanes,
 	  /* SHA3-224 */
 	  while (pos == 0 && nlanes >= 18)
 	    {
-	      absorb_lanes64_8(&hd->u.state64[0], lanes + 8 * 0);
-	      absorb_lanes64_2(&hd->u.state64[8], lanes + 8 * 8);
-	      absorb_lanes64_8(&hd->u.state64[10], lanes + 8 * 10);
-	      lanes += 8 * 18;
 	      nlanes -= 18;
+	      absorb_lanes64_8(&hd->u.state64[0], lanes); lanes += 8 * 8;
+	      absorb_lanes64_8(&hd->u.state64[8], lanes); lanes += 8 * 8;
+	      absorb_lanes64_2(&hd->u.state64[16], lanes); lanes += 8 * 2;
 
 	      burn = KECCAK_F1600_PERMUTE_FUNC_NAME(hd);
 	    }
@@ -332,11 +331,10 @@ KECCAK_F1600_ABSORB_FUNC_NAME(KECCAK_STATE *hd, int pos, const byte *lanes,
 	  /* SHA3-256 & SHAKE256 */
 	  while (pos == 0 && nlanes >= 17)
 	    {
-	      absorb_lanes64_8(&hd->u.state64[0], lanes + 8 * 0);
-	      absorb_lanes64_8(&hd->u.state64[8], lanes + 8 * 8);
-	      absorb_lanes64_1(&hd->u.state64[16], lanes + 8 * 16);
-	      lanes += 8 * 17;
 	      nlanes -= 17;
+	      absorb_lanes64_8(&hd->u.state64[0], lanes); lanes += 8 * 8;
+	      absorb_lanes64_8(&hd->u.state64[8], lanes); lanes += 8 * 8;
+	      absorb_lanes64_1(&hd->u.state64[16], lanes); lanes += 8 * 1;
 
 	      burn = KECCAK_F1600_PERMUTE_FUNC_NAME(hd);
 	    }
@@ -346,11 +344,10 @@ KECCAK_F1600_ABSORB_FUNC_NAME(KECCAK_STATE *hd, int pos, const byte *lanes,
 	  /* SHA3-384 */
 	  while (pos == 0 && nlanes >= 13)
 	    {
-	      absorb_lanes64_8(&hd->u.state64[0], lanes + 8 * 0);
-	      absorb_lanes64_4(&hd->u.state64[8], lanes + 8 * 8);
-	      absorb_lanes64_1(&hd->u.state64[12], lanes + 8 * 12);
-	      lanes += 8 * 13;
 	      nlanes -= 13;
+	      absorb_lanes64_8(&hd->u.state64[0], lanes); lanes += 8 * 8;
+	      absorb_lanes64_4(&hd->u.state64[8], lanes); lanes += 8 * 4;
+	      absorb_lanes64_1(&hd->u.state64[12], lanes); lanes += 8 * 1;
 
 	      burn = KECCAK_F1600_PERMUTE_FUNC_NAME(hd);
 	    }
@@ -360,10 +357,9 @@ KECCAK_F1600_ABSORB_FUNC_NAME(KECCAK_STATE *hd, int pos, const byte *lanes,
 	  /* SHA3-512 */
 	  while (pos == 0 && nlanes >= 9)
 	    {
-	      absorb_lanes64_8(&hd->u.state64[0], lanes + 8 * 0);
-	      absorb_lanes64_1(&hd->u.state64[8], lanes + 8 * 8);
-	      lanes += 8 * 9;
 	      nlanes -= 9;
+	      absorb_lanes64_8(&hd->u.state64[0], lanes); lanes += 8 * 8;
+	      absorb_lanes64_1(&hd->u.state64[8], lanes); lanes += 8 * 1;
 
 	      burn = KECCAK_F1600_PERMUTE_FUNC_NAME(hd);
 	    }

commit 15ea0acf8bb0aa307eccc23024a0bd7878fb8080
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date:   Wed Nov 18 09:44:18 2015 +0200

    Update license information for CRC
    
    * LICENSES: Remove 'Simple permissive' and 'IETF permissive' licenses
    for 'cipher/crc.c' as result of rewrite of CRC implementations.
    --
    
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>

diff --git a/LICENSES b/LICENSES
index 6c09e1f..ff8b7fa 100644
--- a/LICENSES
+++ b/LICENSES
@@ -54,56 +54,6 @@ with any binary distributions derived from the GNU C Library.
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #+end_quote
 
-* Simple permissive licenses
-
-  For files:
-  - cipher/crc.c
-
-#+begin_quote
-  Copyright (c) 1996 L. Peter Deutsch
-
-  Permission is granted to copy and distribute this document for
-  any purpose and without charge, including translations into
-  other languages and incorporation into compilations, provided
-  that the copyright notice and this notice are preserved, and
-  that any substantive changes or deletions from the original are
-  clearly marked.
-#+end_quote
-
-* IETF permissive licenses
-
-  For files:
-  - cipher/crc.c
-
-#+begin_quote
-  Copyright (C) The Internet Society (1998).  All Rights Reserved.
-
-  This document and translations of it may be copied and furnished
-  to others, and derivative works that comment on or otherwise
-  explain it or assist in its implementation may be prepared,
-  copied, published and distributed, in whole or in part, without
-  restriction of any kind, provided that the above copyright notice
-  and this paragraph are included on all such copies and derivative
-  works.  However, this document itself may not be modified in any
-  way, such as by removing the copyright notice or references to
-  the Internet Society or other Internet organizations, except as
-  needed for the purpose of developing Internet standards in which
-  case the procedures for copyrights defined in the Internet
-  Standards process must be followed, or as required to translate
-  it into languages other than English.
-
-  The limited permissions granted above are perpetual and will not be
-  revoked by the Internet Society or its successors or assigns.
-
-  This document and the information contained herein is provided on
-  an "AS IS" basis and THE INTERNET SOCIETY AND THE INTERNET
-  ENGINEERING TASK FORCE DISCLAIMS ALL WARRANTIES, EXPRESS OR
-  IMPLIED, INCLUDING BUT NOT LIMITED TO ANY WARRANTY THAT THE USE
-  OF THE INFORMATION HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY
-  IMPLIED WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR
-  PURPOSE.
-#+end_quote
-
 * X License
 
   For files:

-----------------------------------------------------------------------

Summary of changes:
 LICENSES                   | 50 ----------------------------------------------
 cipher/keccak_permute_32.h | 13 ++++++------
 cipher/keccak_permute_64.h | 44 +++++++++++++++++++---------------------
 3 files changed, 27 insertions(+), 80 deletions(-)


hooks/post-receive
-- 
The GNU crypto library
http://git.gnupg.org


_______________________________________________
Gnupg-commits mailing list
Gnupg-commits at gnupg.org
http://lists.gnupg.org/mailman/listinfo/gnupg-commits


From cvs at cvs.gnupg.org  Wed Nov 18 17:35:43 2015
From: cvs at cvs.gnupg.org (by Justus Winter)
Date: Wed, 18 Nov 2015 17:35:43 +0100
Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.6.0-284-g940dc8a
Message-ID: <E1Zz5YG-0008OE-VO@lists.gnupg.org>

This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "The GNU crypto library".

The branch, master has been updated
       via  940dc8adc034a6c6c38742f6bfd7d837a532d537 (commit)
      from  6571a64331839d7d952292163afbf34c8bef62e0 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
commit 940dc8adc034a6c6c38742f6bfd7d837a532d537
Author: Justus Winter <justus at g10code.com>
Date:   Tue Nov 17 16:00:16 2015 +0100

    cipher: Fix error handling.
    
    * cipher/cipher.c (_gcry_cipher_ctl): Fix error handling.
    --
    Found using the Clang Static Analyzer.
    
    Signed-off-by: Justus Winter <justus at g10code.com>

diff --git a/cipher/cipher.c b/cipher/cipher.c
index ab9f0dc..f163bde 100644
--- a/cipher/cipher.c
+++ b/cipher/cipher.c
@@ -1359,6 +1359,7 @@ _gcry_cipher_ctl (gcry_cipher_hd_t h, int cmd, void *buffer, size_t buflen)
           (&h->context.c, GCRYCTL_SET_SBOX, buffer, buflen);
       else
         rc = GPG_ERR_NOT_SUPPORTED;
+      break;
 
     default:
       rc = GPG_ERR_INV_OP;

-----------------------------------------------------------------------

Summary of changes:
 cipher/cipher.c | 1 +
 1 file changed, 1 insertion(+)


hooks/post-receive
-- 
The GNU crypto library
http://git.gnupg.org


_______________________________________________
Gnupg-commits mailing list
Gnupg-commits at gnupg.org
http://lists.gnupg.org/mailman/listinfo/gnupg-commits


From gniibe at fsij.org  Mon Nov 23 04:11:31 2015
From: gniibe at fsij.org (NIIBE Yutaka)
Date: Mon, 23 Nov 2015 12:11:31 +0900
Subject: multiple timing side channels
In-Reply-To: <56424EF0.2090707@iki.fi>
References: <20151110174857.33A75604DD@jupiter.mumble.net>
 <56424EF0.2090707@iki.fi>
Message-ID: <565283E3.5020008@fsij.org>

On 11/11/2015 05:09 AM, Jussi Kivilinna wrote:
> Another is to do '!!' by bit-wise ORing number and its negative and
> extracting sign-bit, which will be set only if number was non-zero:
> 
>  /* Convert non-zero values to '1' and zero to '0'. */
> 
>  static inline int is_not_zero(unsigned long val)
>  {
>    val |= -val; /* sign-bit will be set if 'val != 0' */
>    return (val >> (CHAR_BIT * sizeof(val) - 1)) & 1;
>  }
> 
>  ...
> 
>  mpi_limb_t mask = ((mpi_limb_t)0) - is_not_zero(swap);
> 
> With above GCC/x86-64 generates four instructions (in: swap = rdx,
> out: mask = rdx):
> 
>  mov    %rdx,%rax
>  neg    %rax
>  or     %rax,%rdx
>  sar    $0x3f,%rdx
> 
> Which is same amount as with original '!!swap' (in: swap = rdx,
> out: mask = r10):

Thank you for discussions.  I'll be back to this issue.

Before this fine-grained timing issue, I have to handle the following, as
we have attacks now (as you had already imagined).  Please don't get
me wrong when you see other fixes before this particular fix.  I
am not ignoring your point.

Well, I am considering how to fix libgcrypt ECC to be constant-time.

> 2. Eliminate mpi normalization.  Most k-bit multiprecision integers
> that libgcrypt handles are uniformly distributed in [0, 2^k), or at
> least in [0, p) where 2^(k - 1) <= p <= 2^k.  It's hard to imagine
> that there's much value in saving a handful of integer operations on
> the last limb once in every ~2^32 cases for an mpi operation -- but
> this frequency is high enough that it's not hard to imagine devising a
> timing attack where you learn something after a billion messages.

Yes.  When I did the implementation for Montgomery curve computation,
I also thought that MPI normalization should be done at the highest layer
only, and we should avoid the normalization in the middle of
computation.  A chosen-ciphertext attack would surely be possible.

I'm going to fix the functions in mpi/ec.c (ec_mod, ec_addm, ec_subm,
ec_mulm, ec_mul2, and possibly ec_powm), so that they will use a fixed
number of limbs.  Perhaps we need another implementation of mpi_invm
which is constant-time.

> 3. Eliminate mpi altogether for arithmetic in fixed finite fields,
> such as GF(2^255 - 19) as used in Curve25519.  There's plenty of
> easy-to-use, high-quality, high-performance, constant-time code to
> compute it -- faster and more safely than the generic mpi code.

I basically agree with this view, and I do something like this for my own
project (Gnuk).

For the maintenance and development of libgcrypt itself, it's not that
easy though, since we need to maintain API for existing applications.

> 4. Eliminate the generic elliptic-curve abstraction, especially for
> new curves.  For modern curve design, it offers no benefits over
> curve-specific code, and makes variable-time code much more tempting.
> Applications don't care that there are elliptic curves or points on
> them involved -- applications deal in opaque octet strings.

While I agree, I'm afraid this would require major API changes.

I'm going to fix the major timing differences in the ec_* functions, small
timing-difference issues like point_set_cond, and micro timing-difference
issues like the use of !!.
-- 


From cvs at cvs.gnupg.org  Wed Nov 25 04:48:27 2015
From: cvs at cvs.gnupg.org (by NIIBE Yutaka)
Date: Wed, 25 Nov 2015 04:48:27 +0100
Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.6.0-289-g88e1358
Message-ID: <E1a1QuT-0001qI-0j@lists.gnupg.org>

This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "The GNU crypto library".

The branch, master has been updated
       via  88e1358962e902ff1cbec8d53ba3eee46407851a (commit)
       via  f88adee3e1f3e2de7d63f92f90bfb3078afd3b4f (commit)
       via  8ad682c412047d3b9196950709dbd7bd14ac8732 (commit)
       via  295b1c3540752af4fc5e6f41480e6db215222fba (commit)
       via  b6015176df6bfae107ac82f9baa29ef2c175c9f9 (commit)
      from  940dc8adc034a6c6c38742f6bfd7d837a532d537 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
commit 88e1358962e902ff1cbec8d53ba3eee46407851a
Author: NIIBE Yutaka <gniibe at fsij.org>
Date:   Wed Nov 25 12:46:19 2015 +0900

    ecc: Constant-time multiplication for Weierstrass curve.
    
    * mpi/ec.c (_gcry_mpi_ec_mul_point): Use simple left-to-right binary
    method for Weierstrass curve when SCALAR is secure.

diff --git a/mpi/ec.c b/mpi/ec.c
index 9394d89..4d59a7e 100644
--- a/mpi/ec.c
+++ b/mpi/ec.c
@@ -1236,16 +1236,27 @@ _gcry_mpi_ec_mul_point (mpi_point_t result,
   unsigned int i, loops;
   mpi_point_struct p1, p2, p1inv;
 
-  if (ctx->model == MPI_EC_EDWARDS)
+  if (ctx->model == MPI_EC_EDWARDS
+      || (ctx->model == MPI_EC_WEIERSTRASS
+          && mpi_is_secure (scalar)))
     {
       /* Simple left to right binary method.  GECC Algorithm 3.27 */
       unsigned int nbits;
       int j;
 
       nbits = mpi_get_nbits (scalar);
-      mpi_set_ui (result->x, 0);
-      mpi_set_ui (result->y, 1);
-      mpi_set_ui (result->z, 1);
+      if (ctx->model == MPI_EC_WEIERSTRASS)
+        {
+          mpi_set_ui (result->x, 1);
+          mpi_set_ui (result->y, 1);
+          mpi_set_ui (result->z, 0);
+        }
+      else
+        {
+          mpi_set_ui (result->x, 0);
+          mpi_set_ui (result->y, 1);
+          mpi_set_ui (result->z, 1);
+        }
 
       if (mpi_is_secure (scalar))
         {

commit f88adee3e1f3e2de7d63f92f90bfb3078afd3b4f
Author: NIIBE Yutaka <gniibe at fsij.org>
Date:   Wed Nov 25 12:13:04 2015 +0900

    mpi: fix gcry_mpi_swap_cond.
    
    * mpi/mpiutil.c (_gcry_mpi_swap_cond): Relax the condition.

diff --git a/mpi/mpiutil.c b/mpi/mpiutil.c
index d3264c7..99402b8 100644
--- a/mpi/mpiutil.c
+++ b/mpi/mpiutil.c
@@ -582,11 +582,15 @@ void
 _gcry_mpi_swap_cond (gcry_mpi_t a, gcry_mpi_t b, unsigned long swap)
 {
   mpi_size_t i;
-  mpi_size_t nlimbs = a->alloced;
+  mpi_size_t nlimbs;
   mpi_limb_t mask = ((mpi_limb_t)0) - swap;
   mpi_limb_t x;
 
-  if (a->alloced != b->alloced)
+  if (a->alloced > b->alloced)
+    nlimbs = b->alloced;
+  else
+    nlimbs = a->alloced;
+  if (a->nlimbs > nlimbs || b->nlimbs > nlimbs)
     log_bug ("mpi_swap_cond: different sizes\n");
 
   for (i = 0; i < nlimbs; i++)

commit 8ad682c412047d3b9196950709dbd7bd14ac8732
Author: NIIBE Yutaka <gniibe at fsij.org>
Date:   Wed Nov 25 10:52:57 2015 +0900

    mpi: Fix mpi_set_cond and mpi_swap_cond .
    
    * mpi/mpiutil.c (_gcry_mpi_set_cond, _gcry_mpi_swap_cond): Don't use
    the operator of !!, but assume SET/SWAP is 0 or 1.
    
    --
    
    If the code for !! would include a branch, it spoils the purpose of
    mpi_set_cond/mpi_swap_cond at all.  It's better to make sure the use
    of this function to be called with 0 or 1 for SET/SWAP.  Note that it
    conforms when SET/SWAP is the result of conditional expression of
    mpi_test_bit.
    
    Reported-by: Taylor R Campbell.

diff --git a/mpi/mpiutil.c b/mpi/mpiutil.c
index 71b3f1c..d3264c7 100644
--- a/mpi/mpiutil.c
+++ b/mpi/mpiutil.c
@@ -483,12 +483,17 @@ _gcry_mpi_set (gcry_mpi_t w, gcry_mpi_t u)
   return w;
 }
 
+/****************
+ * Set the value of W by the one of U, when SET is 1.
+ * Leave the value when SET is 0.
+ * This implementation should be constant-time regardless of SET.
+ */
 gcry_mpi_t
 _gcry_mpi_set_cond (gcry_mpi_t w, const gcry_mpi_t u, unsigned long set)
 {
   mpi_size_t i;
   mpi_size_t nlimbs = u->alloced;
-  mpi_limb_t mask = ((mpi_limb_t)0) - !!set;
+  mpi_limb_t mask = ((mpi_limb_t)0) - set;
   mpi_limb_t x;
 
   if (w->alloced != u->alloced)
@@ -568,12 +573,17 @@ _gcry_mpi_swap (gcry_mpi_t a, gcry_mpi_t b)
 }
 
 
+/****************
+ * Swap the value of A and B, when SWAP is 1.
+ * Leave the value when SWAP is 0.
+ * This implementation should be constant-time regardless of SWAP.
+ */
 void
 _gcry_mpi_swap_cond (gcry_mpi_t a, gcry_mpi_t b, unsigned long swap)
 {
   mpi_size_t i;
   mpi_size_t nlimbs = a->alloced;
-  mpi_limb_t mask = ((mpi_limb_t)0) - !!swap;
+  mpi_limb_t mask = ((mpi_limb_t)0) - swap;
   mpi_limb_t x;
 
   if (a->alloced != b->alloced)

commit 295b1c3540752af4fc5e6f41480e6db215222fba
Author: NIIBE Yutaka <gniibe at fsij.org>
Date:   Wed Nov 25 10:42:47 2015 +0900

    ecc: multiplication of Edwards curve to be constant-time.
    
    * mpi/ec.c (_gcry_mpi_ec_mul_point): Use point_swap_cond.
    
    --
    
    Reported-by: Taylor R Campbell.

diff --git a/mpi/ec.c b/mpi/ec.c
index 1644942..9394d89 100644
--- a/mpi/ec.c
+++ b/mpi/ec.c
@@ -1254,12 +1254,13 @@ _gcry_mpi_ec_mul_point (mpi_point_t result,
           mpi_point_struct tmppnt;
 
           point_init (&tmppnt);
+          point_resize (result, ctx);
+          point_resize (&tmppnt, ctx);
           for (j=nbits-1; j >= 0; j--)
             {
               _gcry_mpi_ec_dup_point (result, result, ctx);
               _gcry_mpi_ec_add_points (&tmppnt, result, point, ctx);
-              if (mpi_test_bit (scalar, j))
-                point_set (result, &tmppnt);
+              point_swap_cond (result, &tmppnt, mpi_test_bit (scalar, j), ctx);
             }
           point_free (&tmppnt);
         }

commit b6015176df6bfae107ac82f9baa29ef2c175c9f9
Author: NIIBE Yutaka <gniibe at fsij.org>
Date:   Wed Nov 25 10:19:39 2015 +0900

    ecc: Add point_resize and point_swap_cond.
    
    * mpi/ec.c (point_resize, point_swap_cond): New.
    (_gcry_mpi_ec_mul_point): Use point_resize and point_swap_cond.
    
    --
    
    Thanks to Taylor R Campbell who suggests.

diff --git a/mpi/ec.c b/mpi/ec.c
index 7266f2a..1644942 100644
--- a/mpi/ec.c
+++ b/mpi/ec.c
@@ -139,6 +139,34 @@ point_set (mpi_point_t d, mpi_point_t s)
 }
 
 
+static void
+point_resize (mpi_point_t p, mpi_ec_t ctx)
+{
+  /*
+   * For now, we allocate enough limbs for our EC computation of ec_*.
+   * Once we will improve ec_* to be constant size (and constant
+   * time), NLIMBS can be ctx->p->nlimbs.
+   */
+  size_t nlimbs = 2*ctx->p->nlimbs+1;
+
+  mpi_resize (p->x, nlimbs);
+  if (ctx->model != MPI_EC_MONTGOMERY)
+    mpi_resize (p->y, nlimbs);
+  mpi_resize (p->z, nlimbs);
+}
+
+
+static void
+point_swap_cond (mpi_point_t d, mpi_point_t s, unsigned long swap,
+                 mpi_ec_t ctx)
+{
+  mpi_swap_cond (d->x, s->x, swap);
+  if (ctx->model != MPI_EC_MONTGOMERY)
+    mpi_swap_cond (d->y, s->y, swap);
+  mpi_swap_cond (d->z, s->z, swap);
+}
+
+
 /* Set the projective coordinates from POINT into X, Y, and Z.  If a
    coordinate is not required, X, Y, or Z may be passed as NULL.  */
 void
@@ -1253,7 +1281,6 @@ _gcry_mpi_ec_mul_point (mpi_point_t result,
       mpi_point_struct p1_, p2_;
       mpi_point_t q1, q2, prd, sum;
       unsigned long sw;
-      size_t nlimbs;
 
       /* Compute scalar point multiplication with Montgomery Ladder.
          Note that we don't use Y-coordinate in the points at all.
@@ -1269,15 +1296,10 @@ _gcry_mpi_ec_mul_point (mpi_point_t result,
       p2.x  = mpi_copy (point->x);
       mpi_set_ui (p2.z, 1);
 
-      nlimbs = 2*(nbits+BITS_PER_MPI_LIMB-1)/BITS_PER_MPI_LIMB+1;
-      mpi_resize (p1.x, nlimbs);
-      mpi_resize (p1.z, nlimbs);
-      mpi_resize (p2.x, nlimbs);
-      mpi_resize (p2.z, nlimbs);
-      mpi_resize (p1_.x, nlimbs);
-      mpi_resize (p1_.z, nlimbs);
-      mpi_resize (p2_.x, nlimbs);
-      mpi_resize (p2_.z, nlimbs);
+      point_resize (&p1, ctx);
+      point_resize (&p2, ctx);
+      point_resize (&p1_, ctx);
+      point_resize (&p2_, ctx);
 
       q1 = &p1;
       q2 = &p2;
@@ -1289,19 +1311,16 @@ _gcry_mpi_ec_mul_point (mpi_point_t result,
           mpi_point_t t;
 
           sw = mpi_test_bit (scalar, j);
-          mpi_swap_cond (q1->x, q2->x, sw);
-          mpi_swap_cond (q1->z, q2->z, sw);
+          point_swap_cond (q1, q2, sw, ctx);
           montgomery_ladder (prd, sum, q1, q2, point->x, ctx);
-          mpi_swap_cond (prd->x, sum->x, sw);
-          mpi_swap_cond (prd->z, sum->z, sw);
+          point_swap_cond (prd, sum, sw, ctx);
           t = q1;  q1 = prd;  prd = t;
           t = q2;  q2 = sum;  sum = t;
         }
 
       mpi_clear (result->y);
       sw = (nbits & 1);
-      mpi_swap_cond (p1.x, p1_.x, sw);
-      mpi_swap_cond (p1.z, p1_.z, sw);
+      point_swap_cond (&p1, &p1_, sw, ctx);
 
       if (p1.z->nlimbs == 0)
         {

-----------------------------------------------------------------------

Summary of changes:
 mpi/ec.c      | 75 +++++++++++++++++++++++++++++++++++++++++------------------
 mpi/mpiutil.c | 22 ++++++++++++++----
 2 files changed, 71 insertions(+), 26 deletions(-)


hooks/post-receive
-- 
The GNU crypto library
http://git.gnupg.org


_______________________________________________
Gnupg-commits mailing list
Gnupg-commits at gnupg.org
http://lists.gnupg.org/mailman/listinfo/gnupg-commits


From gniibe at fsij.org  Thu Nov 26 02:12:05 2015
From: gniibe at fsij.org (NIIBE Yutaka)
Date: Thu, 26 Nov 2015 10:12:05 +0900
Subject: multiple timing side channels
In-Reply-To: <565283E3.5020008@fsij.org>
References: <20151110174857.33A75604DD@jupiter.mumble.net>
 <56424EF0.2090707@iki.fi> <565283E3.5020008@fsij.org>
Message-ID: <56565C65.2020007@fsij.org>

Hello,

Please have a look at the development version.

    http://git.gnupg.org/cgi-bin/gitweb.cgi?p=libgcrypt.git

I committed five changes.  I'll keep considering further changes to
the ec_* implementation.


commit 88e1358962e902ff1cbec8d53ba3eee46407851a
Author: NIIBE Yutaka <gniibe at fsij.org>
Date:   Wed Nov 25 12:46:19 2015 +0900

    ecc: Constant-time multiplication for Weierstrass curve.

    * mpi/ec.c (_gcry_mpi_ec_mul_point): Use simple left-to-right binary
    method for Weierstrass curve when SCALAR is secure.

commit f88adee3e1f3e2de7d63f92f90bfb3078afd3b4f
Author: NIIBE Yutaka <gniibe at fsij.org>
Date:   Wed Nov 25 12:13:04 2015 +0900

    mpi: fix gcry_mpi_swap_cond.

    * mpi/mpiutil.c (_gcry_mpi_swap_cond): Relax the condition.

commit 8ad682c412047d3b9196950709dbd7bd14ac8732
Author: NIIBE Yutaka <gniibe at fsij.org>
Date:   Wed Nov 25 10:52:57 2015 +0900

    mpi: Fix mpi_set_cond and mpi_swap_cond .

    * mpi/mpiutil.c (_gcry_mpi_set_cond, _gcry_mpi_swap_cond): Don't use
    the operator of !!, but assume SET/SWAP is 0 or 1.

    --

    If the code for !! would include a branch, it spoils the purpose of
    mpi_set_cond/mpi_swap_cond at all.  It's better to make sure the use
    of this function to be called with 0 or 1 for SET/SWAP.  Note that it
    conforms when SET/SWAP is the result of conditional expression of
    mpi_test_bit.

    Reported-by: Taylor R Campbell.

commit 295b1c3540752af4fc5e6f41480e6db215222fba
Author: NIIBE Yutaka <gniibe at fsij.org>
Date:   Wed Nov 25 10:42:47 2015 +0900

    ecc: multiplication of Edwards curve to be constant-time.

    * mpi/ec.c (_gcry_mpi_ec_mul_point): Use point_swap_cond.

    --

    Reported-by: Taylor R Campbell.

commit b6015176df6bfae107ac82f9baa29ef2c175c9f9
Author: NIIBE Yutaka <gniibe at fsij.org>
Date:   Wed Nov 25 10:19:39 2015 +0900

    ecc: Add point_resize and point_swap_cond.

    * mpi/ec.c (point_resize, point_swap_cond): New.
    (_gcry_mpi_ec_mul_point): Use point_resize and point_swap_cond.

    --

    Thanks to Taylor R Campbell who suggests.
-- 


From cvs at cvs.gnupg.org  Thu Nov 26 03:41:00 2015
From: cvs at cvs.gnupg.org (by NIIBE Yutaka)
Date: Thu, 26 Nov 2015 03:41:00 +0100
Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.6.0-290-g3658afd
Message-ID: <E1a1mKg-0002p4-6A@lists.gnupg.org>

This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "The GNU crypto library".

The branch, master has been updated
       via  3658afd09c3b03b4398aaa5748387220c93b1a94 (commit)
      from  88e1358962e902ff1cbec8d53ba3eee46407851a (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
commit 3658afd09c3b03b4398aaa5748387220c93b1a94
Author: NIIBE Yutaka <gniibe at fsij.org>
Date:   Thu Nov 26 11:37:47 2015 +0900

    ecc: minor improvement of point multiplication.
    
    * mpi/ec.c (_gcry_mpi_ec_mul_point): Move ec_subm out of the loop.

diff --git a/mpi/ec.c b/mpi/ec.c
index 4d59a7e..40e09be 100644
--- a/mpi/ec.c
+++ b/mpi/ec.c
@@ -1415,6 +1415,10 @@ _gcry_mpi_ec_mul_point (mpi_point_t result,
   point_init (&p2);
   point_init (&p1inv);
 
+  /* Invert point: y = p - y mod p  */
+  point_set (&p1inv, &p1);
+  ec_subm (p1inv.y, ctx->p, p1inv.y, ctx);
+
   for (i=loops-2; i > 0; i--)
     {
       _gcry_mpi_ec_dup_point (result, result, ctx);
@@ -1426,9 +1430,6 @@ _gcry_mpi_ec_mul_point (mpi_point_t result,
       if (mpi_test_bit (h, i) == 0 && mpi_test_bit (k, i) == 1)
         {
           point_set (&p2, result);
-          /* Invert point: y = p - y mod p  */
-          point_set (&p1inv, &p1);
-          ec_subm (p1inv.y, ctx->p, p1inv.y, ctx);
           _gcry_mpi_ec_add_points (result, &p2, &p1inv, ctx);
         }
     }

-----------------------------------------------------------------------

Summary of changes:
 mpi/ec.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)


hooks/post-receive
-- 
The GNU crypto library
http://git.gnupg.org


_______________________________________________
Gnupg-commits mailing list
Gnupg-commits at gnupg.org
http://lists.gnupg.org/mailman/listinfo/gnupg-commits


From gniibe at fsij.org  Thu Nov 26 03:48:15 2015
From: gniibe at fsij.org (NIIBE Yutaka)
Date: Thu, 26 Nov 2015 11:48:15 +0900
Subject: ecc: Montgomery curve always uses the prefix 0x40
In-Reply-To: <56494B98.4020103@fsij.org>
References: <56494B98.4020103@fsij.org>
Message-ID: <565672EF.10400@fsij.org>

On 11/16/2015 12:20 PM, NIIBE Yutaka wrote:
> I'm checking GnuPG's Curve25519 implementation.
[..]
> I think that it should be prefixed by 0x40, so that it can be also
> accessed as a fixed size MPI.

Here is an update.  This can handle old data with no prefix, too.


diff --git a/cipher/ecc-misc.c b/cipher/ecc-misc.c
index 2f2e593..b9ad060 100644
--- a/cipher/ecc-misc.c
+++ b/cipher/ecc-misc.c
@@ -292,6 +292,7 @@ _gcry_ecc_compute_public (mpi_point_t Q, mpi_ec_t ec,
 gpg_err_code_t
 _gcry_ecc_mont_decodepoint (gcry_mpi_t pk, mpi_ec_t ctx, mpi_point_t result)
 {
+  unsigned char *a;
   unsigned char *rawmpi;
   unsigned int rawmpilen;

@@ -311,8 +312,8 @@ _gcry_ecc_mont_decodepoint (gcry_mpi_t pk, mpi_ec_t ctx, mpi_point_t result)
           buf++;
         }

-      rawmpi = xtrymalloc (rawmpilen? rawmpilen:1);
-      if (!rawmpi)
+      a = rawmpi = xtrymalloc (rawmpilen? rawmpilen:1);
+      if (!a)
         return gpg_err_code_from_syserror ();

       p = rawmpi + rawmpilen;
@@ -321,16 +322,27 @@ _gcry_ecc_mont_decodepoint (gcry_mpi_t pk, mpi_ec_t ctx, mpi_point_t result)
     }
   else
     {
-      /* Note: Without using an opaque MPI it is not reliable possible
-         to find out whether the public key has been given in
-         uncompressed format.  Thus we expect native EdDSA format.  */
-      rawmpi = _gcry_mpi_get_buffer (pk, ctx->nbits/8, &rawmpilen, NULL);
-      if (!rawmpi)
+      a = rawmpi = _gcry_mpi_get_buffer (pk, ctx->nbits/8, &rawmpilen, NULL);
+      if (!a)
         return gpg_err_code_from_syserror ();
+      /*
+       * It is not reliable to assume that 0x40 means the prefix.
+       *
+       * For newer implementation, it is reliable since we always put
+       * 0x40 for x-only coordinate.
+       *
+       * For older implementation (non-released development version),
+       * it is possible to have the 0x40 as a part of data.
+       *
+       * So, we need to check if it's really the prefix or not.
+       * Only when it's the prefix, we remove it.
+       */
+      if (a[0] == 0x40 && ctx->nbits/8 == rawmpilen - 1)
+        rawmpi++;
     }

   _gcry_mpi_set_buffer (result->x, rawmpi, rawmpilen, 0);
-  xfree (rawmpi);
+  xfree (a);
   mpi_set_ui (result->z, 1);

   return 0;
diff --git a/cipher/ecc.c b/cipher/ecc.c
index bd3e754..51621f8 100644
--- a/cipher/ecc.c
+++ b/cipher/ecc.c
@@ -606,17 +606,14 @@ ecc_generate (const gcry_sexp_t genparms, gcry_sexp_t *r_skey)
                                           &encpk, &encpklen);
       else
         {
-          int off = !!(flags & PUBKEY_FLAG_COMP);
-
-          encpk = _gcry_mpi_get_buffer_extra (Qx, ctx->nbits/8, off?-1:0,
+          encpk = _gcry_mpi_get_buffer_extra (Qx, ctx->nbits/8, -1,
                                               &encpklen, NULL);
           if (encpk == NULL)
             rc = gpg_err_code_from_syserror ();
           else
             {
-              if (off)
-                encpk[0] = 0x40;
-              encpklen += off;
+              encpk[0] = 0x40;
+              encpklen++;
             }
         }
       if (rc)
@@ -1374,11 +1371,13 @@ ecc_encrypt_raw (gcry_sexp_t *r_ciph, gcry_sexp_t s_data, gcry_sexp_t keyparms)
       mpi_s = _gcry_ecc_ec2os (x, y, pk.E.p);
     else
       {
-        rawmpi = _gcry_mpi_get_buffer (x, ec->nbits/8, &rawmpilen, NULL);
+        rawmpi = _gcry_mpi_get_buffer_extra (x, ec->nbits/8, -1,
+                                             &rawmpilen, NULL);
         if (!rawmpi)
           rc = gpg_err_code_from_syserror ();
         else
           {
+            rawmpi[0] = 0x40;
             mpi_s = mpi_new (0);
             mpi_set_opaque (mpi_s, rawmpi, rawmpilen*8);
           }
@@ -1393,11 +1392,13 @@ ecc_encrypt_raw (gcry_sexp_t *r_ciph, gcry_sexp_t s_data, gcry_sexp_t keyparms)
       mpi_e = _gcry_ecc_ec2os (x, y, pk.E.p);
     else
       {
-        rawmpi = _gcry_mpi_get_buffer (x, ec->nbits/8, &rawmpilen, NULL);
+        rawmpi = _gcry_mpi_get_buffer_extra (x, ec->nbits/8, -1,
+                                             &rawmpilen, NULL);
         if (!rawmpi)
           rc = gpg_err_code_from_syserror ();
         else
           {
+            rawmpi[0] = 0x40;
             mpi_e = mpi_new (0);
             mpi_set_opaque (mpi_e, rawmpi, rawmpilen*8);
           }
@@ -1587,11 +1588,13 @@ ecc_decrypt_raw (gcry_sexp_t *r_plain, gcry_sexp_t s_data, gcry_sexp_t keyparms)
         unsigned char *rawmpi;
         unsigned int rawmpilen;

-        rawmpi = _gcry_mpi_get_buffer (x, ec->nbits/8, &rawmpilen, NULL);
+        rawmpi = _gcry_mpi_get_buffer_extra (x, ec->nbits/8, -1,
+                                             &rawmpilen, NULL);
         if (!rawmpi)
           rc = gpg_err_code_from_syserror ();
         else
           {
+            rawmpi[0] = 0x40;
             r = mpi_new (0);
             mpi_set_opaque (r, rawmpi, rawmpilen*8);
           }
-- 


From cpm at fbsd.es  Thu Nov 26 10:37:29 2015
From: cpm at fbsd.es (Carlos J Puga Medina)
Date: Thu, 26 Nov 2015 10:37:29 +0100
Subject: Patch to fix libgcrypt and KeepassX issue on FreeBSD
Message-ID: <1448530649.1549.14.camel@fbsd.es>

Hi people,

The following patch fixes a current problem on FreeBSD. Please, can
someone commit it for the next libgcrypt release?

--- cipher/Makefile.in.orig	2015-09-08 06:32:11 UTC
+++ cipher/Makefile.in
@@ -818,13 +818,19 @@ uninstall-am:
 	tags tags-am uninstall uninstall-am
 
 
-# We need to lower the optimization for this module.
+# We need to lower the optimization for these modules.
 tiger.o: $(srcdir)/tiger.c
 	`echo $(COMPILE) -c $(srcdir)/tiger.c | $(o_flag_munging) `
 
 tiger.lo: $(srcdir)/tiger.c
 	`echo $(LTCOMPILE) -c $(srcdir)/tiger.c | $(o_flag_munging) `
 
+salsa20.o: $(srcdir)/salsa20.c
+	`echo $(COMPILE) -c $(srcdir)/salsa20.c | $(o_flag_munging) `
+
+salsa20.lo: $(srcdir)/salsa20.c
+	`echo $(LTCOMPILE) -c $(srcdir)/salsa20.c | $(o_flag_munging) `
+
 # Tell versions [3.59,3.63) of GNU make to not export all variables.
 # Otherwise a system limit (for SysV at least) may be exceeded.
 .NOEXPORT:

See bug 204323 for further details:

https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=204323

Kind regards,
-- 
Carlos Jacobo Puga Medina <cpm at fbsd.es>
PGP fingerprint = C60E 9497 5302 793B CC2D  BB89 A1F3 5D66 E6D0 5453
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 473 bytes
Desc: This is a digitally signed message part
URL: </pipermail/attachments/20151126/6a997bca/attachment.sig>

From jussi.kivilinna at iki.fi  Thu Nov 26 19:25:28 2015
From: jussi.kivilinna at iki.fi (Jussi Kivilinna)
Date: Thu, 26 Nov 2015 20:25:28 +0200
Subject: Patch to fix libgcrypt and KeepassX issue on FreeBSD
In-Reply-To: <1448530649.1549.14.camel@fbsd.es>
References: <1448530649.1549.14.camel@fbsd.es>
Message-ID: <56574E98.50707@iki.fi>

Hello,

On 26.11.2015 11:37, Carlos J Puga Medina wrote:
> Hi people,
> 
> The following patch fixes a current problem on FreeBSD. Please, can
> someone commit it for the next libgcrypt release?
> 

I managed to reproduce this on Ubuntu/clang and found a bug in the salsa20
selftest code. Does the attached patch fix the issue for you?

-Jussi

> --- cipher/Makefile.in.orig	2015-09-08 06:32:11 UTC
> +++ cipher/Makefile.in
> @@ -818,13 +818,19 @@ uninstall-am:
>  	tags tags-am uninstall uninstall-am
>  
>  
> -# We need to lower the optimization for this module.
> +# We need to lower the optimization for these modules.
>  tiger.o: $(srcdir)/tiger.c
>  	`echo $(COMPILE) -c $(srcdir)/tiger.c | $(o_flag_munging) `
>  
>  tiger.lo: $(srcdir)/tiger.c
>  	`echo $(LTCOMPILE) -c $(srcdir)/tiger.c | $(o_flag_munging) `
>  
> +salsa20.o: $(srcdir)/salsa20.c
> +	`echo $(COMPILE) -c $(srcdir)/salsa20.c | $(o_flag_munging) `
> +
> +salsa20.lo: $(srcdir)/salsa20.c
> +	`echo $(LTCOMPILE) -c $(srcdir)/salsa20.c | $(o_flag_munging)
> `
> +
>  # Tell versions [3.59,3.63) of GNU make to not export all variables.
>  # Otherwise a system limit (for SysV at least) may be exceeded.
>  .NOEXPORT:
> 
> See bug 204323 for further details:
> 
> https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=204323
> 
> Kind regards,
> 
> 
> 
> _______________________________________________
> Gcrypt-devel mailing list
> Gcrypt-devel at gnupg.org
> http://lists.gnupg.org/mailman/listinfo/gcrypt-devel
> 

-------------- next part --------------
A non-text attachment was scrubbed...
Name: 01-salsa20-fix-alignment-of-self.patch
Type: text/x-patch
Size: 3109 bytes
Desc: not available
URL: </pipermail/attachments/20151126/90b4e3ef/attachment.bin>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 648 bytes
Desc: OpenPGP digital signature
URL: </pipermail/attachments/20151126/90b4e3ef/attachment.sig>

From jussi.kivilinna at iki.fi  Sun Nov 29 12:07:37 2015
From: jussi.kivilinna at iki.fi (Jussi Kivilinna)
Date: Sun, 29 Nov 2015 13:07:37 +0200
Subject: [PATCH 1/2] salsa20: fix alignment of self-test context
Message-ID: <20151129110737.20931.4361.stgit@localhost6.localdomain6>

* cipher/salsa20.c (selftest): Ensure 16-byte alignment for salsa20
context structure.
--

Reported-by: Carlos J Puga Medina <cpm at fbsd.es>
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 cipher/salsa20.c |   34 +++++++++++++++++++---------------
 1 file changed, 19 insertions(+), 15 deletions(-)

diff --git a/cipher/salsa20.c b/cipher/salsa20.c
index fa3d23b..9768198 100644
--- a/cipher/salsa20.c
+++ b/cipher/salsa20.c
@@ -501,7 +501,8 @@ salsa20r12_encrypt_stream (void *context,
 static const char*
 selftest (void)
 {
-  SALSA20_context_t ctx;
+  byte ctxbuf[sizeof(SALSA20_context_t) + 15];
+  SALSA20_context_t *ctx;
   byte scratch[8+1];
   byte buf[256+64+4];
   int i;
@@ -518,32 +519,35 @@ selftest (void)
   static const byte ciphertext_1[] =
     { 0xE3, 0xBE, 0x8F, 0xDD, 0x8B, 0xEC, 0xA2, 0xE3};
 
-  salsa20_setkey (&ctx, key_1, sizeof key_1);
-  salsa20_setiv  (&ctx, nonce_1, sizeof nonce_1);
+  /* 16-byte alignment required for amd64 implementation. */
+  ctx = (SALSA20_context_t *)((uintptr_t)(ctxbuf + 15) & ~(uintptr_t)15);
+
+  salsa20_setkey (ctx, key_1, sizeof key_1);
+  salsa20_setiv  (ctx, nonce_1, sizeof nonce_1);
   scratch[8] = 0;
-  salsa20_encrypt_stream (&ctx, scratch, plaintext_1, sizeof plaintext_1);
+  salsa20_encrypt_stream (ctx, scratch, plaintext_1, sizeof plaintext_1);
   if (memcmp (scratch, ciphertext_1, sizeof ciphertext_1))
     return "Salsa20 encryption test 1 failed.";
   if (scratch[8])
     return "Salsa20 wrote too much.";
-  salsa20_setkey( &ctx, key_1, sizeof(key_1));
-  salsa20_setiv  (&ctx, nonce_1, sizeof nonce_1);
-  salsa20_encrypt_stream (&ctx, scratch, scratch, sizeof plaintext_1);
+  salsa20_setkey( ctx, key_1, sizeof(key_1));
+  salsa20_setiv  (ctx, nonce_1, sizeof nonce_1);
+  salsa20_encrypt_stream (ctx, scratch, scratch, sizeof plaintext_1);
   if (memcmp (scratch, plaintext_1, sizeof plaintext_1))
     return "Salsa20 decryption test 1 failed.";
 
   for (i = 0; i < sizeof buf; i++)
     buf[i] = i;
-  salsa20_setkey (&ctx, key_1, sizeof key_1);
-  salsa20_setiv (&ctx, nonce_1, sizeof nonce_1);
+  salsa20_setkey (ctx, key_1, sizeof key_1);
+  salsa20_setiv (ctx, nonce_1, sizeof nonce_1);
   /*encrypt*/
-  salsa20_encrypt_stream (&ctx, buf, buf, sizeof buf);
+  salsa20_encrypt_stream (ctx, buf, buf, sizeof buf);
   /*decrypt*/
-  salsa20_setkey (&ctx, key_1, sizeof key_1);
-  salsa20_setiv (&ctx, nonce_1, sizeof nonce_1);
-  salsa20_encrypt_stream (&ctx, buf, buf, 1);
-  salsa20_encrypt_stream (&ctx, buf+1, buf+1, (sizeof buf)-1-1);
-  salsa20_encrypt_stream (&ctx, buf+(sizeof buf)-1, buf+(sizeof buf)-1, 1);
+  salsa20_setkey (ctx, key_1, sizeof key_1);
+  salsa20_setiv (ctx, nonce_1, sizeof nonce_1);
+  salsa20_encrypt_stream (ctx, buf, buf, 1);
+  salsa20_encrypt_stream (ctx, buf+1, buf+1, (sizeof buf)-1-1);
+  salsa20_encrypt_stream (ctx, buf+(sizeof buf)-1, buf+(sizeof buf)-1, 1);
   for (i = 0; i < sizeof buf; i++)
     if (buf[i] != (byte)i)
       return "Salsa20 encryption test 2 failed.";


From jussi.kivilinna at iki.fi  Sun Nov 29 12:07:42 2015
From: jussi.kivilinna at iki.fi (Jussi Kivilinna)
Date: Sun, 29 Nov 2015 13:07:42 +0200
Subject: [PATCH 2/2] chacha20: fix alignment of self-test context
In-Reply-To: <20151129110737.20931.4361.stgit@localhost6.localdomain6>
References: <20151129110737.20931.4361.stgit@localhost6.localdomain6>
Message-ID: <20151129110742.20931.53162.stgit@localhost6.localdomain6>

* cipher/chacha20.c (selftest): Ensure 16-byte alignment for chacha20
context structure.
--

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 cipher/chacha20.c |   46 +++++++++++++++++++++++++---------------------
 1 file changed, 25 insertions(+), 21 deletions(-)

diff --git a/cipher/chacha20.c b/cipher/chacha20.c
index e25e239..613fa82 100644
--- a/cipher/chacha20.c
+++ b/cipher/chacha20.c
@@ -514,7 +514,8 @@ chacha20_encrypt_stream (void *context, byte * outbuf, const byte * inbuf,
 static const char *
 selftest (void)
 {
-  CHACHA20_context_t ctx;
+  byte ctxbuf[sizeof(CHACHA20_context_t) + 15];
+  CHACHA20_context_t *ctx;
   byte scratch[127 + 1];
   byte buf[512 + 64 + 4];
   int i;
@@ -565,46 +566,49 @@ selftest (void)
     0x05, 0x3c, 0x84, 0xe4, 0x9a, 0x4a, 0x33
   };
 
-  chacha20_setkey (&ctx, key_1, sizeof key_1);
-  chacha20_setiv (&ctx, nonce_1, sizeof nonce_1);
+  /* 16-byte alignment required for amd64 implementation. */
+  ctx = (CHACHA20_context_t *)((uintptr_t)(ctxbuf + 15) & ~(uintptr_t)15);
+
+  chacha20_setkey (ctx, key_1, sizeof key_1);
+  chacha20_setiv (ctx, nonce_1, sizeof nonce_1);
   scratch[sizeof (scratch) - 1] = 0;
-  chacha20_encrypt_stream (&ctx, scratch, plaintext_1, sizeof plaintext_1);
+  chacha20_encrypt_stream (ctx, scratch, plaintext_1, sizeof plaintext_1);
   if (memcmp (scratch, ciphertext_1, sizeof ciphertext_1))
     return "ChaCha20 encryption test 1 failed.";
   if (scratch[sizeof (scratch) - 1])
     return "ChaCha20 wrote too much.";
-  chacha20_setkey (&ctx, key_1, sizeof (key_1));
-  chacha20_setiv (&ctx, nonce_1, sizeof nonce_1);
-  chacha20_encrypt_stream (&ctx, scratch, scratch, sizeof plaintext_1);
+  chacha20_setkey (ctx, key_1, sizeof (key_1));
+  chacha20_setiv (ctx, nonce_1, sizeof nonce_1);
+  chacha20_encrypt_stream (ctx, scratch, scratch, sizeof plaintext_1);
   if (memcmp (scratch, plaintext_1, sizeof plaintext_1))
     return "ChaCha20 decryption test 1 failed.";
 
   for (i = 0; i < sizeof buf; i++)
     buf[i] = i;
-  chacha20_setkey (&ctx, key_1, sizeof key_1);
-  chacha20_setiv (&ctx, nonce_1, sizeof nonce_1);
+  chacha20_setkey (ctx, key_1, sizeof key_1);
+  chacha20_setiv (ctx, nonce_1, sizeof nonce_1);
   /*encrypt */
-  chacha20_encrypt_stream (&ctx, buf, buf, sizeof buf);
+  chacha20_encrypt_stream (ctx, buf, buf, sizeof buf);
   /*decrypt */
-  chacha20_setkey (&ctx, key_1, sizeof key_1);
-  chacha20_setiv (&ctx, nonce_1, sizeof nonce_1);
-  chacha20_encrypt_stream (&ctx, buf, buf, 1);
-  chacha20_encrypt_stream (&ctx, buf + 1, buf + 1, (sizeof buf) - 1 - 1);
-  chacha20_encrypt_stream (&ctx, buf + (sizeof buf) - 1,
+  chacha20_setkey (ctx, key_1, sizeof key_1);
+  chacha20_setiv (ctx, nonce_1, sizeof nonce_1);
+  chacha20_encrypt_stream (ctx, buf, buf, 1);
+  chacha20_encrypt_stream (ctx, buf + 1, buf + 1, (sizeof buf) - 1 - 1);
+  chacha20_encrypt_stream (ctx, buf + (sizeof buf) - 1,
                            buf + (sizeof buf) - 1, 1);
   for (i = 0; i < sizeof buf; i++)
     if (buf[i] != (byte) i)
       return "ChaCha20 encryption test 2 failed.";
 
-  chacha20_setkey (&ctx, key_1, sizeof key_1);
-  chacha20_setiv (&ctx, nonce_1, sizeof nonce_1);
+  chacha20_setkey (ctx, key_1, sizeof key_1);
+  chacha20_setiv (ctx, nonce_1, sizeof nonce_1);
   /* encrypt */
   for (i = 0; i < sizeof buf; i++)
-    chacha20_encrypt_stream (&ctx, &buf[i], &buf[i], 1);
+    chacha20_encrypt_stream (ctx, &buf[i], &buf[i], 1);
   /* decrypt */
-  chacha20_setkey (&ctx, key_1, sizeof key_1);
-  chacha20_setiv (&ctx, nonce_1, sizeof nonce_1);
-  chacha20_encrypt_stream (&ctx, buf, buf, sizeof buf);
+  chacha20_setkey (ctx, key_1, sizeof key_1);
+  chacha20_setiv (ctx, nonce_1, sizeof nonce_1);
+  chacha20_encrypt_stream (ctx, buf, buf, sizeof buf);
   for (i = 0; i < sizeof buf; i++)
     if (buf[i] != (byte) i)
       return "ChaCha20 encryption test 3 failed.";