[git] GCRYPT - branch, master, updated. libgcrypt-1.6.0-70-ga39ee75

by Jussi Kivilinna cvs at cvs.gnupg.org
Sun May 11 11:06:09 CEST 2014


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "The GNU crypto library".

The branch, master has been updated
       via  a39ee7555691d18cae97560f130aaf952bfbd278 (commit)
       via  def7d4cad386271c6d4e2f10aabe0cb4abd871e4 (commit)
       via  23f33d57c9b6f2295a8ddfc9a8eee5a2c30cf406 (commit)
      from  246b7aaae1ee459f440260bbc4ec2c01c5dc3362 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
commit a39ee7555691d18cae97560f130aaf952bfbd278
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date:   Sun May 11 12:00:19 2014 +0300

    chacha20: add AVX2/AMD64 assembly implementation
    
    * cipher/Makefile.am: Add 'chacha20-avx2-amd64.S'.
    * cipher/chacha20-avx2-amd64.S: New.
    * cipher/chacha20.c (USE_AVX2): New macro.
    [USE_AVX2] (_gcry_chacha20_amd64_avx2_blocks): New.
    (chacha20_do_setkey): Select AVX2 implementation if there is HW
    support.
    (selftest): Increase size of buf by 256.
    * configure.ac [host=x86-64]: Add 'chacha20-avx2-amd64.lo'.
    --
    
    Add AVX2 optimized implementation for ChaCha20. Based on implementation by
    Andrew Moon.
    
    SSSE3 (Intel Haswell):
    
     CHACHA20       |  nanosecs/byte   mebibytes/sec   cycles/byte
         STREAM enc |     0.742 ns/B    1284.8 MiB/s      2.38 c/B
         STREAM dec |     0.741 ns/B    1286.5 MiB/s      2.37 c/B
    
    AVX2:
    
     CHACHA20       |  nanosecs/byte   mebibytes/sec   cycles/byte
         STREAM enc |     0.393 ns/B    2428.0 MiB/s      1.26 c/B
         STREAM dec |     0.392 ns/B    2433.6 MiB/s      1.25 c/B
    
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 27ca7ac..26d13d2 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -59,7 +59,7 @@ EXTRA_libcipher_la_SOURCES = \
 arcfour.c arcfour-amd64.S \
 blowfish.c blowfish-amd64.S blowfish-arm.S \
 cast5.c cast5-amd64.S cast5-arm.S \
-chacha20.c chacha20-ssse3-amd64.S \
+chacha20.c chacha20-ssse3-amd64.S chacha20-avx2-amd64.S \
 crc.c \
 des.c des-amd64.S \
 dsa.c \
diff --git a/cipher/chacha20-avx2-amd64.S b/cipher/chacha20-avx2-amd64.S
new file mode 100644
index 0000000..c50a0c0
--- /dev/null
+++ b/cipher/chacha20-avx2-amd64.S
@@ -0,0 +1,949 @@
+/* chacha20-avx2-amd64.S  -  AMD64/AVX2 implementation of ChaCha20
+ *
+ * Copyright (C) 2014 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * Based on public domain implementation by Andrew Moon at
+ *  https://github.com/floodyberry/chacha-opt
+ */
+
+#ifdef __x86_64__
+#include <config.h>
+
+#if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \
+    defined(HAVE_GCC_INLINE_ASM_AVX2) && USE_CHACHA20
+
+#ifdef __PIC__
+#  define RIP (%rip)
+#else
+#  define RIP
+#endif
+
+.text
+
+.align 8
+.globl _gcry_chacha20_amd64_avx2_blocks
+.type  _gcry_chacha20_amd64_avx2_blocks,@function;
+_gcry_chacha20_amd64_avx2_blocks:	/* in: %rdi = state, %rsi = input (may be NULL), %rdx = output, %rcx = byte count */
+.Lchacha_blocks_avx2_local:
+	pushq %rbx	/* save the callee-saved registers clobbered below */
+	pushq %rbp
+	pushq %r12
+	pushq %r13
+	pushq %r14
+	movq %rsp, %rbp	/* remember the caller's stack pointer; restored before returning */
+	andq $~63, %rsp	/* 64-byte align the scratch area (it is accessed with vmovdqa) */
+	subq $512, %rsp	/* reserve 512 bytes of scratch */
+	leaq .LC RIP, %rax
+	vmovdqu 0(%rax), %xmm6	/* xmm6 = pshufb mask, rotate each dword by 16 */
+	vmovdqu 16(%rax), %xmm7	/* xmm7 = pshufb mask, rotate each dword by 8 */
+	vmovdqu 0(%rdi), %xmm8	/* xmm8..xmm11 = the four 16-byte rows of the 64-byte state */
+	vmovdqu 16(%rdi), %xmm9
+	vmovdqu 32(%rdi), %xmm10
+	vmovdqu 48(%rdi), %xmm11
+	movl $20, %eax	/* round count; every round loop subtracts 2 per pass */
+	movq $1, %r9	/* counter increment used by the single-block path */
+	vmovdqa %xmm8, 0(%rsp)	/* 0..63(%rsp) = copy of the state */
+	vmovdqa %xmm9, 16(%rsp)
+	vmovdqa %xmm10, 32(%rsp)
+	vmovdqa %xmm11, 48(%rsp)
+	movq %rax, 64(%rsp)	/* 64(%rsp) = round count */
+	vmovdqa %xmm6, 448(%rsp)	/* 448..479(%rsp) = rotate-by-16 mask in both 128-bit halves */
+	vmovdqa %xmm6, 464(%rsp)
+	vmovdqa %xmm7, 480(%rsp)	/* 480..511(%rsp) = rotate-by-8 mask in both 128-bit halves */
+	vmovdqa %xmm7, 496(%rsp)
+	cmpq $512, %rcx	/* dispatch on length: 8-block, 4-block or single-block path */
+	jae .Lchacha_blocks_avx2_atleast512
+	cmp $256, %rcx
+	jae .Lchacha_blocks_avx2_atleast256
+	jmp .Lchacha_blocks_avx2_below256
+	.p2align 6,,63
+.Lchacha_blocks_avx2_atleast512:	/* 8 blocks (512 bytes) per pass, one block per 32-bit lane of each ymm register */
+	movq 48(%rsp), %rax	/* rax = 64-bit counter kept in state words 12..13 */
+	leaq 1(%rax), %r8	/* counter+1 .. counter+7 for the other seven lanes */
+	leaq 2(%rax), %r9
+	leaq 3(%rax), %r10
+	leaq 4(%rax), %rbx
+	leaq 5(%rax), %r11
+	leaq 6(%rax), %r12
+	leaq 7(%rax), %r13
+	leaq 8(%rax), %r14	/* counter+8 = counter value after this pass */
+	movl %eax, 128(%rsp)	/* 128..159(%rsp) = low counter halves, one per lane (word 12) */
+	movl %r8d, 4+128(%rsp)
+	movl %r9d, 8+128(%rsp)
+	movl %r10d, 12+128(%rsp)
+	movl %ebx, 16+128(%rsp)
+	movl %r11d, 20+128(%rsp)
+	movl %r12d, 24+128(%rsp)
+	movl %r13d, 28+128(%rsp)
+	shrq $32, %rax	/* move the high counter halves down */
+	shrq $32, %r8
+	shrq $32, %r9
+	shrq $32, %r10
+	shrq $32, %rbx
+	shrq $32, %r11
+	shrq $32, %r12
+	shrq $32, %r13
+	movl %eax, 160(%rsp)	/* 160..191(%rsp) = high counter halves, one per lane (word 13) */
+	movl %r8d, 4+160(%rsp)
+	movl %r9d, 8+160(%rsp)
+	movl %r10d, 12+160(%rsp)
+	movl %ebx, 16+160(%rsp)
+	movl %r11d, 20+160(%rsp)
+	movl %r12d, 24+160(%rsp)
+	movl %r13d, 28+160(%rsp)
+	movq %r14, 48(%rsp)	/* store the advanced counter back into the stack copy of the state */
+	movq 64(%rsp), %rax	/* rax = round count */
+	vpbroadcastd 0(%rsp), %ymm0	/* ymm0..ymm11 = state words 0..11 replicated to all lanes */
+	vpbroadcastd 4+0(%rsp), %ymm1
+	vpbroadcastd 8+0(%rsp), %ymm2
+	vpbroadcastd 12+0(%rsp), %ymm3
+	vpbroadcastd 16(%rsp), %ymm4
+	vpbroadcastd 4+16(%rsp), %ymm5
+	vpbroadcastd 8+16(%rsp), %ymm6
+	vpbroadcastd 12+16(%rsp), %ymm7
+	vpbroadcastd 32(%rsp), %ymm8
+	vpbroadcastd 4+32(%rsp), %ymm9
+	vpbroadcastd 8+32(%rsp), %ymm10
+	vpbroadcastd 12+32(%rsp), %ymm11
+	vpbroadcastd 8+48(%rsp), %ymm14	/* ymm14, ymm15 = state words 14, 15 replicated */
+	vpbroadcastd 12+48(%rsp), %ymm15
+	vmovdqa 128(%rsp), %ymm12	/* ymm12, ymm13 = per-lane counter words 12, 13 */
+	vmovdqa 160(%rsp), %ymm13
+.Lchacha_blocks_avx2_mainloop1:	/* one pass = two rounds; ymmN holds state word N for 8 blocks */
+	vpaddd %ymm0, %ymm4, %ymm0	/* first round: word groups (0,4,8,12) (1,5,9,13) (2,6,10,14) (3,7,11,15); a += b */
+	vpaddd %ymm1, %ymm5, %ymm1
+	vpxor %ymm12, %ymm0, %ymm12	/* d ^= a */
+	vpxor %ymm13, %ymm1, %ymm13
+	vpaddd %ymm2, %ymm6, %ymm2
+	vpaddd %ymm3, %ymm7, %ymm3
+	vpxor %ymm14, %ymm2, %ymm14
+	vpxor %ymm15, %ymm3, %ymm15
+	vpshufb 448(%rsp), %ymm12, %ymm12	/* d = rotate(d, 16) using the mask at 448(%rsp) */
+	vpshufb 448(%rsp), %ymm13, %ymm13
+	vpaddd %ymm8, %ymm12, %ymm8	/* c += d */
+	vpaddd %ymm9, %ymm13, %ymm9
+	vpshufb 448(%rsp), %ymm14, %ymm14
+	vpshufb 448(%rsp), %ymm15, %ymm15
+	vpaddd %ymm10, %ymm14, %ymm10
+	vpaddd %ymm11, %ymm15, %ymm11
+	vmovdqa %ymm12, 96(%rsp)	/* spill word 12 so ymm12 can serve as the rotate temporary */
+	vpxor %ymm4, %ymm8, %ymm4	/* b ^= c */
+	vpxor %ymm5, %ymm9, %ymm5
+	vpslld $ 12, %ymm4, %ymm12	/* b = rotate-left(b, 12): (b << 12) ^ (b >> 20) */
+	vpsrld $20, %ymm4, %ymm4
+	vpxor %ymm4, %ymm12, %ymm4
+	vpslld $ 12, %ymm5, %ymm12
+	vpsrld $20, %ymm5, %ymm5
+	vpxor %ymm5, %ymm12, %ymm5
+	vpxor %ymm6, %ymm10, %ymm6
+	vpxor %ymm7, %ymm11, %ymm7
+	vpslld $ 12, %ymm6, %ymm12
+	vpsrld $20, %ymm6, %ymm6
+	vpxor %ymm6, %ymm12, %ymm6
+	vpslld $ 12, %ymm7, %ymm12
+	vpsrld $20, %ymm7, %ymm7
+	vpxor %ymm7, %ymm12, %ymm7
+	vpaddd %ymm0, %ymm4, %ymm0	/* a += b */
+	vpaddd %ymm1, %ymm5, %ymm1
+	vpxor 96(%rsp), %ymm0, %ymm12	/* d ^= a, reading the spilled word 12 back */
+	vpxor %ymm13, %ymm1, %ymm13
+	vpaddd %ymm2, %ymm6, %ymm2
+	vpaddd %ymm3, %ymm7, %ymm3
+	vpxor %ymm14, %ymm2, %ymm14
+	vpxor %ymm15, %ymm3, %ymm15
+	vpshufb 480(%rsp), %ymm12, %ymm12	/* d = rotate(d, 8) using the mask at 480(%rsp) */
+	vpshufb 480(%rsp), %ymm13, %ymm13
+	vpaddd %ymm8, %ymm12, %ymm8	/* c += d */
+	vpaddd %ymm9, %ymm13, %ymm9
+	vpshufb 480(%rsp), %ymm14, %ymm14
+	vpshufb 480(%rsp), %ymm15, %ymm15
+	vpaddd %ymm10, %ymm14, %ymm10
+	vpaddd %ymm11, %ymm15, %ymm11
+	vmovdqa %ymm12, 96(%rsp)	/* spill word 12 again */
+	vpxor %ymm4, %ymm8, %ymm4	/* b ^= c */
+	vpxor %ymm5, %ymm9, %ymm5
+	vpslld $ 7, %ymm4, %ymm12	/* b = rotate-left(b, 7): (b << 7) ^ (b >> 25) */
+	vpsrld $25, %ymm4, %ymm4
+	vpxor %ymm4, %ymm12, %ymm4
+	vpslld $ 7, %ymm5, %ymm12
+	vpsrld $25, %ymm5, %ymm5
+	vpxor %ymm5, %ymm12, %ymm5
+	vpxor %ymm6, %ymm10, %ymm6
+	vpxor %ymm7, %ymm11, %ymm7
+	vpslld $ 7, %ymm6, %ymm12
+	vpsrld $25, %ymm6, %ymm6
+	vpxor %ymm6, %ymm12, %ymm6
+	vpslld $ 7, %ymm7, %ymm12
+	vpsrld $25, %ymm7, %ymm7
+	vpxor %ymm7, %ymm12, %ymm7
+	vpaddd %ymm0, %ymm5, %ymm0	/* second round: word groups (0,5,10,15) (1,6,11,12) (2,7,8,13) (3,4,9,14); a += b */
+	vpaddd %ymm1, %ymm6, %ymm1
+	vpxor %ymm15, %ymm0, %ymm15	/* d ^= a */
+	vpxor 96(%rsp), %ymm1, %ymm12	/* word 12 comes back from the spill slot */
+	vpaddd %ymm2, %ymm7, %ymm2
+	vpaddd %ymm3, %ymm4, %ymm3
+	vpxor %ymm13, %ymm2, %ymm13
+	vpxor %ymm14, %ymm3, %ymm14
+	vpshufb 448(%rsp), %ymm15, %ymm15	/* d = rotate(d, 16) */
+	vpshufb 448(%rsp), %ymm12, %ymm12
+	vpaddd %ymm10, %ymm15, %ymm10	/* c += d */
+	vpaddd %ymm11, %ymm12, %ymm11
+	vpshufb 448(%rsp), %ymm13, %ymm13
+	vpshufb 448(%rsp), %ymm14, %ymm14
+	vpaddd %ymm8, %ymm13, %ymm8
+	vpaddd %ymm9, %ymm14, %ymm9
+	vmovdqa %ymm15, 96(%rsp)	/* spill word 15; ymm15 is the rotate temporary in this round */
+	vpxor %ymm5, %ymm10, %ymm5	/* b ^= c */
+	vpxor %ymm6, %ymm11, %ymm6
+	vpslld $ 12, %ymm5, %ymm15	/* b = rotate-left(b, 12) */
+	vpsrld $20, %ymm5, %ymm5
+	vpxor %ymm5, %ymm15, %ymm5
+	vpslld $ 12, %ymm6, %ymm15
+	vpsrld $20, %ymm6, %ymm6
+	vpxor %ymm6, %ymm15, %ymm6
+	vpxor %ymm7, %ymm8, %ymm7
+	vpxor %ymm4, %ymm9, %ymm4
+	vpslld $ 12, %ymm7, %ymm15
+	vpsrld $20, %ymm7, %ymm7
+	vpxor %ymm7, %ymm15, %ymm7
+	vpslld $ 12, %ymm4, %ymm15
+	vpsrld $20, %ymm4, %ymm4
+	vpxor %ymm4, %ymm15, %ymm4
+	vpaddd %ymm0, %ymm5, %ymm0	/* a += b */
+	vpaddd %ymm1, %ymm6, %ymm1
+	vpxor 96(%rsp), %ymm0, %ymm15	/* d ^= a, reading the spilled word 15 back */
+	vpxor %ymm12, %ymm1, %ymm12
+	vpaddd %ymm2, %ymm7, %ymm2
+	vpaddd %ymm3, %ymm4, %ymm3
+	vpxor %ymm13, %ymm2, %ymm13
+	vpxor %ymm14, %ymm3, %ymm14
+	vpshufb 480(%rsp), %ymm15, %ymm15	/* d = rotate(d, 8) */
+	vpshufb 480(%rsp), %ymm12, %ymm12
+	vpaddd %ymm10, %ymm15, %ymm10	/* c += d */
+	vpaddd %ymm11, %ymm12, %ymm11
+	vpshufb 480(%rsp), %ymm13, %ymm13
+	vpshufb 480(%rsp), %ymm14, %ymm14
+	vpaddd %ymm8, %ymm13, %ymm8
+	vpaddd %ymm9, %ymm14, %ymm9
+	vmovdqa %ymm15, 96(%rsp)	/* spill word 15 again */
+	vpxor %ymm5, %ymm10, %ymm5	/* b ^= c */
+	vpxor %ymm6, %ymm11, %ymm6
+	vpslld $ 7, %ymm5, %ymm15	/* b = rotate-left(b, 7) */
+	vpsrld $25, %ymm5, %ymm5
+	vpxor %ymm5, %ymm15, %ymm5
+	vpslld $ 7, %ymm6, %ymm15
+	vpsrld $25, %ymm6, %ymm6
+	vpxor %ymm6, %ymm15, %ymm6
+	vpxor %ymm7, %ymm8, %ymm7
+	vpxor %ymm4, %ymm9, %ymm4
+	vpslld $ 7, %ymm7, %ymm15
+	vpsrld $25, %ymm7, %ymm7
+	vpxor %ymm7, %ymm15, %ymm7
+	vpslld $ 7, %ymm4, %ymm15
+	vpsrld $25, %ymm4, %ymm4
+	vpxor %ymm4, %ymm15, %ymm4
+	vmovdqa 96(%rsp), %ymm15	/* restore word 15 for the next pass */
+	subq $2, %rax	/* two rounds done */
+	jnz .Lchacha_blocks_avx2_mainloop1
+	vmovdqa %ymm8, 192(%rsp)	/* park words 8..15 at 192..447(%rsp) while words 0..7 are written out */
+	vmovdqa %ymm9, 224(%rsp)
+	vmovdqa %ymm10, 256(%rsp)
+	vmovdqa %ymm11, 288(%rsp)
+	vmovdqa %ymm12, 320(%rsp)
+	vmovdqa %ymm13, 352(%rsp)
+	vmovdqa %ymm14, 384(%rsp)
+	vmovdqa %ymm15, 416(%rsp)
+	vpbroadcastd 0(%rsp), %ymm8	/* reload input state words 0..7, replicated to all lanes */
+	vpbroadcastd 4+0(%rsp), %ymm9
+	vpbroadcastd 8+0(%rsp), %ymm10
+	vpbroadcastd 12+0(%rsp), %ymm11
+	vpbroadcastd 16(%rsp), %ymm12
+	vpbroadcastd 4+16(%rsp), %ymm13
+	vpbroadcastd 8+16(%rsp), %ymm14
+	vpbroadcastd 12+16(%rsp), %ymm15
+	vpaddd %ymm8, %ymm0, %ymm0	/* add the input state words to the round output */
+	vpaddd %ymm9, %ymm1, %ymm1
+	vpaddd %ymm10, %ymm2, %ymm2
+	vpaddd %ymm11, %ymm3, %ymm3
+	vpaddd %ymm12, %ymm4, %ymm4
+	vpaddd %ymm13, %ymm5, %ymm5
+	vpaddd %ymm14, %ymm6, %ymm6
+	vpaddd %ymm15, %ymm7, %ymm7
+	vpunpckldq %ymm1, %ymm0, %ymm8	/* 8x8 dword transpose: from one word per register to one block per register */
+	vpunpckldq %ymm3, %ymm2, %ymm9
+	vpunpckhdq %ymm1, %ymm0, %ymm12
+	vpunpckhdq %ymm3, %ymm2, %ymm13
+	vpunpckldq %ymm5, %ymm4, %ymm10
+	vpunpckldq %ymm7, %ymm6, %ymm11
+	vpunpckhdq %ymm5, %ymm4, %ymm14
+	vpunpckhdq %ymm7, %ymm6, %ymm15
+	vpunpcklqdq %ymm9, %ymm8, %ymm0
+	vpunpcklqdq %ymm11, %ymm10, %ymm1
+	vpunpckhqdq %ymm9, %ymm8, %ymm2
+	vpunpckhqdq %ymm11, %ymm10, %ymm3
+	vpunpcklqdq %ymm13, %ymm12, %ymm4
+	vpunpcklqdq %ymm15, %ymm14, %ymm5
+	vpunpckhqdq %ymm13, %ymm12, %ymm6
+	vpunpckhqdq %ymm15, %ymm14, %ymm7
+	vperm2i128 $0x20, %ymm1, %ymm0, %ymm8	/* ymm8..ymm11 = words 0..7 of blocks 0..3 */
+	vperm2i128 $0x20, %ymm3, %ymm2, %ymm9
+	vperm2i128 $0x31, %ymm1, %ymm0, %ymm12	/* ymm12..ymm15 = words 0..7 of blocks 4..7 */
+	vperm2i128 $0x31, %ymm3, %ymm2, %ymm13
+	vperm2i128 $0x20, %ymm5, %ymm4, %ymm10
+	vperm2i128 $0x20, %ymm7, %ymm6, %ymm11
+	vperm2i128 $0x31, %ymm5, %ymm4, %ymm14
+	vperm2i128 $0x31, %ymm7, %ymm6, %ymm15
+	andq %rsi, %rsi	/* NULL input pointer: store the raw keystream instead */
+	jz .Lchacha_blocks_avx2_noinput1
+	vpxor 0(%rsi), %ymm8, %ymm8	/* xor the first 32 bytes of each 64-byte input block */
+	vpxor 64(%rsi), %ymm9, %ymm9
+	vpxor 128(%rsi), %ymm10, %ymm10
+	vpxor 192(%rsi), %ymm11, %ymm11
+	vpxor 256(%rsi), %ymm12, %ymm12
+	vpxor 320(%rsi), %ymm13, %ymm13
+	vpxor 384(%rsi), %ymm14, %ymm14
+	vpxor 448(%rsi), %ymm15, %ymm15
+	vmovdqu %ymm8, 0(%rdx)	/* write the first 32 bytes of each 64-byte output block */
+	vmovdqu %ymm9, 64(%rdx)
+	vmovdqu %ymm10, 128(%rdx)
+	vmovdqu %ymm11, 192(%rdx)
+	vmovdqu %ymm12, 256(%rdx)
+	vmovdqu %ymm13, 320(%rdx)
+	vmovdqu %ymm14, 384(%rdx)
+	vmovdqu %ymm15, 448(%rdx)
+	vmovdqa 192(%rsp), %ymm0	/* bring back the parked words 8..15 */
+	vmovdqa 224(%rsp), %ymm1
+	vmovdqa 256(%rsp), %ymm2
+	vmovdqa 288(%rsp), %ymm3
+	vmovdqa 320(%rsp), %ymm4
+	vmovdqa 352(%rsp), %ymm5
+	vmovdqa 384(%rsp), %ymm6
+	vmovdqa 416(%rsp), %ymm7
+	vpbroadcastd 32(%rsp), %ymm8	/* input state words 8..11, replicated */
+	vpbroadcastd 4+32(%rsp), %ymm9
+	vpbroadcastd 8+32(%rsp), %ymm10
+	vpbroadcastd 12+32(%rsp), %ymm11
+	vmovdqa 128(%rsp), %ymm12	/* per-lane counter words 12, 13 prepared before the round loop */
+	vmovdqa 160(%rsp), %ymm13
+	vpbroadcastd 8+48(%rsp), %ymm14	/* input state words 14, 15, replicated */
+	vpbroadcastd 12+48(%rsp), %ymm15
+	vpaddd %ymm8, %ymm0, %ymm0	/* add the input state words to the round output */
+	vpaddd %ymm9, %ymm1, %ymm1
+	vpaddd %ymm10, %ymm2, %ymm2
+	vpaddd %ymm11, %ymm3, %ymm3
+	vpaddd %ymm12, %ymm4, %ymm4
+	vpaddd %ymm13, %ymm5, %ymm5
+	vpaddd %ymm14, %ymm6, %ymm6
+	vpaddd %ymm15, %ymm7, %ymm7
+	vpunpckldq %ymm1, %ymm0, %ymm8	/* same 8x8 dword transpose, now for words 8..15 */
+	vpunpckldq %ymm3, %ymm2, %ymm9
+	vpunpckhdq %ymm1, %ymm0, %ymm12
+	vpunpckhdq %ymm3, %ymm2, %ymm13
+	vpunpckldq %ymm5, %ymm4, %ymm10
+	vpunpckldq %ymm7, %ymm6, %ymm11
+	vpunpckhdq %ymm5, %ymm4, %ymm14
+	vpunpckhdq %ymm7, %ymm6, %ymm15
+	vpunpcklqdq %ymm9, %ymm8, %ymm0
+	vpunpcklqdq %ymm11, %ymm10, %ymm1
+	vpunpckhqdq %ymm9, %ymm8, %ymm2
+	vpunpckhqdq %ymm11, %ymm10, %ymm3
+	vpunpcklqdq %ymm13, %ymm12, %ymm4
+	vpunpcklqdq %ymm15, %ymm14, %ymm5
+	vpunpckhqdq %ymm13, %ymm12, %ymm6
+	vpunpckhqdq %ymm15, %ymm14, %ymm7
+	vperm2i128 $0x20, %ymm1, %ymm0, %ymm8
+	vperm2i128 $0x20, %ymm3, %ymm2, %ymm9
+	vperm2i128 $0x31, %ymm1, %ymm0, %ymm12
+	vperm2i128 $0x31, %ymm3, %ymm2, %ymm13
+	vperm2i128 $0x20, %ymm5, %ymm4, %ymm10
+	vperm2i128 $0x20, %ymm7, %ymm6, %ymm11
+	vperm2i128 $0x31, %ymm5, %ymm4, %ymm14
+	vperm2i128 $0x31, %ymm7, %ymm6, %ymm15
+	vpxor 32(%rsi), %ymm8, %ymm8	/* xor the second 32 bytes of each 64-byte input block */
+	vpxor 96(%rsi), %ymm9, %ymm9
+	vpxor 160(%rsi), %ymm10, %ymm10
+	vpxor 224(%rsi), %ymm11, %ymm11
+	vpxor 288(%rsi), %ymm12, %ymm12
+	vpxor 352(%rsi), %ymm13, %ymm13
+	vpxor 416(%rsi), %ymm14, %ymm14
+	vpxor 480(%rsi), %ymm15, %ymm15
+	vmovdqu %ymm8, 32(%rdx)	/* write the second 32 bytes of each 64-byte output block */
+	vmovdqu %ymm9, 96(%rdx)
+	vmovdqu %ymm10, 160(%rdx)
+	vmovdqu %ymm11, 224(%rdx)
+	vmovdqu %ymm12, 288(%rdx)
+	vmovdqu %ymm13, 352(%rdx)
+	vmovdqu %ymm14, 416(%rdx)
+	vmovdqu %ymm15, 480(%rdx)
+	addq $512, %rsi	/* advance the input pointer; the output pointer is advanced at the shared continuation */
+	jmp .Lchacha_blocks_avx2_mainloop1_cont
+.Lchacha_blocks_avx2_noinput1:	/* input pointer is NULL: store the keystream without xoring */
+	vmovdqu %ymm8, 0(%rdx)	/* first 32 bytes of each 64-byte output block */
+	vmovdqu %ymm9, 64(%rdx)
+	vmovdqu %ymm10, 128(%rdx)
+	vmovdqu %ymm11, 192(%rdx)
+	vmovdqu %ymm12, 256(%rdx)
+	vmovdqu %ymm13, 320(%rdx)
+	vmovdqu %ymm14, 384(%rdx)
+	vmovdqu %ymm15, 448(%rdx)
+	vmovdqa 192(%rsp), %ymm0	/* bring back the parked words 8..15 */
+	vmovdqa 224(%rsp), %ymm1
+	vmovdqa 256(%rsp), %ymm2
+	vmovdqa 288(%rsp), %ymm3
+	vmovdqa 320(%rsp), %ymm4
+	vmovdqa 352(%rsp), %ymm5
+	vmovdqa 384(%rsp), %ymm6
+	vmovdqa 416(%rsp), %ymm7
+	vpbroadcastd 32(%rsp), %ymm8	/* input state words 8..11, replicated */
+	vpbroadcastd 4+32(%rsp), %ymm9
+	vpbroadcastd 8+32(%rsp), %ymm10
+	vpbroadcastd 12+32(%rsp), %ymm11
+	vmovdqa 128(%rsp), %ymm12	/* per-lane counter words 12, 13 */
+	vmovdqa 160(%rsp), %ymm13
+	vpbroadcastd 8+48(%rsp), %ymm14	/* input state words 14, 15, replicated */
+	vpbroadcastd 12+48(%rsp), %ymm15
+	vpaddd %ymm8, %ymm0, %ymm0	/* add the input state words to the round output */
+	vpaddd %ymm9, %ymm1, %ymm1
+	vpaddd %ymm10, %ymm2, %ymm2
+	vpaddd %ymm11, %ymm3, %ymm3
+	vpaddd %ymm12, %ymm4, %ymm4
+	vpaddd %ymm13, %ymm5, %ymm5
+	vpaddd %ymm14, %ymm6, %ymm6
+	vpaddd %ymm15, %ymm7, %ymm7
+	vpunpckldq %ymm1, %ymm0, %ymm8	/* 8x8 dword transpose: one block per register */
+	vpunpckldq %ymm3, %ymm2, %ymm9
+	vpunpckhdq %ymm1, %ymm0, %ymm12
+	vpunpckhdq %ymm3, %ymm2, %ymm13
+	vpunpckldq %ymm5, %ymm4, %ymm10
+	vpunpckldq %ymm7, %ymm6, %ymm11
+	vpunpckhdq %ymm5, %ymm4, %ymm14
+	vpunpckhdq %ymm7, %ymm6, %ymm15
+	vpunpcklqdq %ymm9, %ymm8, %ymm0
+	vpunpcklqdq %ymm11, %ymm10, %ymm1
+	vpunpckhqdq %ymm9, %ymm8, %ymm2
+	vpunpckhqdq %ymm11, %ymm10, %ymm3
+	vpunpcklqdq %ymm13, %ymm12, %ymm4
+	vpunpcklqdq %ymm15, %ymm14, %ymm5
+	vpunpckhqdq %ymm13, %ymm12, %ymm6
+	vpunpckhqdq %ymm15, %ymm14, %ymm7
+	vperm2i128 $0x20, %ymm1, %ymm0, %ymm8
+	vperm2i128 $0x20, %ymm3, %ymm2, %ymm9
+	vperm2i128 $0x31, %ymm1, %ymm0, %ymm12
+	vperm2i128 $0x31, %ymm3, %ymm2, %ymm13
+	vperm2i128 $0x20, %ymm5, %ymm4, %ymm10
+	vperm2i128 $0x20, %ymm7, %ymm6, %ymm11
+	vperm2i128 $0x31, %ymm5, %ymm4, %ymm14
+	vperm2i128 $0x31, %ymm7, %ymm6, %ymm15
+	vmovdqu %ymm8, 32(%rdx)	/* second 32 bytes of each 64-byte output block */
+	vmovdqu %ymm9, 96(%rdx)
+	vmovdqu %ymm10, 160(%rdx)
+	vmovdqu %ymm11, 224(%rdx)
+	vmovdqu %ymm12, 288(%rdx)
+	vmovdqu %ymm13, 352(%rdx)
+	vmovdqu %ymm14, 416(%rdx)
+	vmovdqu %ymm15, 480(%rdx)
+.Lchacha_blocks_avx2_mainloop1_cont:	/* bookkeeping after one 512-byte pass */
+	addq $512, %rdx	/* advance the output pointer */
+	subq $512, %rcx	/* 512 fewer bytes remaining */
+	cmp $512, %rcx
+	jae .Lchacha_blocks_avx2_atleast512	/* another full 8-block pass fits */
+	cmp $256, %rcx
+	jb .Lchacha_blocks_avx2_below256_fixup	/* under 256 left: reload xmm state, then single-block path; otherwise fall through to the 4-block path */
+.Lchacha_blocks_avx2_atleast256:	/* 4 blocks (256 bytes) per pass, one block per 32-bit lane of each xmm register */
+	movq 48(%rsp), %rax	/* rax = 64-bit counter kept in state words 12..13 */
+	leaq 1(%rax), %r8	/* counter+1 .. counter+3 for the other three lanes */
+	leaq 2(%rax), %r9
+	leaq 3(%rax), %r10
+	leaq 4(%rax), %rbx	/* counter+4 = counter value after this pass */
+	movl %eax, 128(%rsp)	/* 128..143(%rsp) = low counter halves, one per lane (word 12) */
+	movl %r8d, 4+128(%rsp)
+	movl %r9d, 8+128(%rsp)
+	movl %r10d, 12+128(%rsp)
+	shrq $32, %rax	/* move the high counter halves down */
+	shrq $32, %r8
+	shrq $32, %r9
+	shrq $32, %r10
+	movl %eax, 160(%rsp)	/* 160..175(%rsp) = high counter halves, one per lane (word 13) */
+	movl %r8d, 4+160(%rsp)
+	movl %r9d, 8+160(%rsp)
+	movl %r10d, 12+160(%rsp)
+	movq %rbx, 48(%rsp)	/* store the advanced counter back into the stack copy of the state */
+	movq 64(%rsp), %rax	/* rax = round count */
+	vpbroadcastd 0(%rsp), %xmm0	/* xmm0..xmm11 = state words 0..11 replicated to all lanes */
+	vpbroadcastd 4+0(%rsp), %xmm1
+	vpbroadcastd 8+0(%rsp), %xmm2
+	vpbroadcastd 12+0(%rsp), %xmm3
+	vpbroadcastd 16(%rsp), %xmm4
+	vpbroadcastd 4+16(%rsp), %xmm5
+	vpbroadcastd 8+16(%rsp), %xmm6
+	vpbroadcastd 12+16(%rsp), %xmm7
+	vpbroadcastd 32(%rsp), %xmm8
+	vpbroadcastd 4+32(%rsp), %xmm9
+	vpbroadcastd 8+32(%rsp), %xmm10
+	vpbroadcastd 12+32(%rsp), %xmm11
+	vmovdqa 128(%rsp), %xmm12	/* xmm12, xmm13 = per-lane counter words 12, 13 */
+	vmovdqa 160(%rsp), %xmm13
+	vpbroadcastd 8+48(%rsp), %xmm14	/* xmm14, xmm15 = state words 14, 15 replicated */
+	vpbroadcastd 12+48(%rsp), %xmm15
+.Lchacha_blocks_avx2_mainloop2:	/* one pass = two rounds; xmmN holds state word N for 4 blocks */
+	vpaddd %xmm0, %xmm4, %xmm0	/* first round: word groups (0,4,8,12) (1,5,9,13) (2,6,10,14) (3,7,11,15); a += b */
+	vpaddd %xmm1, %xmm5, %xmm1
+	vpxor %xmm12, %xmm0, %xmm12	/* d ^= a */
+	vpxor %xmm13, %xmm1, %xmm13
+	vpaddd %xmm2, %xmm6, %xmm2
+	vpaddd %xmm3, %xmm7, %xmm3
+	vpxor %xmm14, %xmm2, %xmm14
+	vpxor %xmm15, %xmm3, %xmm15
+	vpshufb 448(%rsp), %xmm12, %xmm12	/* d = rotate(d, 16) using the mask at 448(%rsp) */
+	vpshufb 448(%rsp), %xmm13, %xmm13
+	vpaddd %xmm8, %xmm12, %xmm8	/* c += d */
+	vpaddd %xmm9, %xmm13, %xmm9
+	vpshufb 448(%rsp), %xmm14, %xmm14
+	vpshufb 448(%rsp), %xmm15, %xmm15
+	vpaddd %xmm10, %xmm14, %xmm10
+	vpaddd %xmm11, %xmm15, %xmm11
+	vmovdqa %xmm12, 96(%rsp)	/* spill word 12 so xmm12 can serve as the rotate temporary */
+	vpxor %xmm4, %xmm8, %xmm4	/* b ^= c */
+	vpxor %xmm5, %xmm9, %xmm5
+	vpslld $ 12, %xmm4, %xmm12	/* b = rotate-left(b, 12): (b << 12) ^ (b >> 20) */
+	vpsrld $20, %xmm4, %xmm4
+	vpxor %xmm4, %xmm12, %xmm4
+	vpslld $ 12, %xmm5, %xmm12
+	vpsrld $20, %xmm5, %xmm5
+	vpxor %xmm5, %xmm12, %xmm5
+	vpxor %xmm6, %xmm10, %xmm6
+	vpxor %xmm7, %xmm11, %xmm7
+	vpslld $ 12, %xmm6, %xmm12
+	vpsrld $20, %xmm6, %xmm6
+	vpxor %xmm6, %xmm12, %xmm6
+	vpslld $ 12, %xmm7, %xmm12
+	vpsrld $20, %xmm7, %xmm7
+	vpxor %xmm7, %xmm12, %xmm7
+	vpaddd %xmm0, %xmm4, %xmm0	/* a += b */
+	vpaddd %xmm1, %xmm5, %xmm1
+	vpxor 96(%rsp), %xmm0, %xmm12	/* d ^= a, reading the spilled word 12 back */
+	vpxor %xmm13, %xmm1, %xmm13
+	vpaddd %xmm2, %xmm6, %xmm2
+	vpaddd %xmm3, %xmm7, %xmm3
+	vpxor %xmm14, %xmm2, %xmm14
+	vpxor %xmm15, %xmm3, %xmm15
+	vpshufb 480(%rsp), %xmm12, %xmm12	/* d = rotate(d, 8) using the mask at 480(%rsp) */
+	vpshufb 480(%rsp), %xmm13, %xmm13
+	vpaddd %xmm8, %xmm12, %xmm8	/* c += d */
+	vpaddd %xmm9, %xmm13, %xmm9
+	vpshufb 480(%rsp), %xmm14, %xmm14
+	vpshufb 480(%rsp), %xmm15, %xmm15
+	vpaddd %xmm10, %xmm14, %xmm10
+	vpaddd %xmm11, %xmm15, %xmm11
+	vmovdqa %xmm12, 96(%rsp)	/* spill word 12 again */
+	vpxor %xmm4, %xmm8, %xmm4	/* b ^= c */
+	vpxor %xmm5, %xmm9, %xmm5
+	vpslld $ 7, %xmm4, %xmm12	/* b = rotate-left(b, 7): (b << 7) ^ (b >> 25) */
+	vpsrld $25, %xmm4, %xmm4
+	vpxor %xmm4, %xmm12, %xmm4
+	vpslld $ 7, %xmm5, %xmm12
+	vpsrld $25, %xmm5, %xmm5
+	vpxor %xmm5, %xmm12, %xmm5
+	vpxor %xmm6, %xmm10, %xmm6
+	vpxor %xmm7, %xmm11, %xmm7
+	vpslld $ 7, %xmm6, %xmm12
+	vpsrld $25, %xmm6, %xmm6
+	vpxor %xmm6, %xmm12, %xmm6
+	vpslld $ 7, %xmm7, %xmm12
+	vpsrld $25, %xmm7, %xmm7
+	vpxor %xmm7, %xmm12, %xmm7
+	vpaddd %xmm0, %xmm5, %xmm0	/* second round: word groups (0,5,10,15) (1,6,11,12) (2,7,8,13) (3,4,9,14); a += b */
+	vpaddd %xmm1, %xmm6, %xmm1
+	vpxor %xmm15, %xmm0, %xmm15	/* d ^= a */
+	vpxor 96(%rsp), %xmm1, %xmm12	/* word 12 comes back from the spill slot */
+	vpaddd %xmm2, %xmm7, %xmm2
+	vpaddd %xmm3, %xmm4, %xmm3
+	vpxor %xmm13, %xmm2, %xmm13
+	vpxor %xmm14, %xmm3, %xmm14
+	vpshufb 448(%rsp), %xmm15, %xmm15	/* d = rotate(d, 16) */
+	vpshufb 448(%rsp), %xmm12, %xmm12
+	vpaddd %xmm10, %xmm15, %xmm10	/* c += d */
+	vpaddd %xmm11, %xmm12, %xmm11
+	vpshufb 448(%rsp), %xmm13, %xmm13
+	vpshufb 448(%rsp), %xmm14, %xmm14
+	vpaddd %xmm8, %xmm13, %xmm8
+	vpaddd %xmm9, %xmm14, %xmm9
+	vmovdqa %xmm15, 96(%rsp)	/* spill word 15; xmm15 is the rotate temporary in this round */
+	vpxor %xmm5, %xmm10, %xmm5	/* b ^= c */
+	vpxor %xmm6, %xmm11, %xmm6
+	vpslld $ 12, %xmm5, %xmm15	/* b = rotate-left(b, 12) */
+	vpsrld $20, %xmm5, %xmm5
+	vpxor %xmm5, %xmm15, %xmm5
+	vpslld $ 12, %xmm6, %xmm15
+	vpsrld $20, %xmm6, %xmm6
+	vpxor %xmm6, %xmm15, %xmm6
+	vpxor %xmm7, %xmm8, %xmm7
+	vpxor %xmm4, %xmm9, %xmm4
+	vpslld $ 12, %xmm7, %xmm15
+	vpsrld $20, %xmm7, %xmm7
+	vpxor %xmm7, %xmm15, %xmm7
+	vpslld $ 12, %xmm4, %xmm15
+	vpsrld $20, %xmm4, %xmm4
+	vpxor %xmm4, %xmm15, %xmm4
+	vpaddd %xmm0, %xmm5, %xmm0	/* a += b */
+	vpaddd %xmm1, %xmm6, %xmm1
+	vpxor 96(%rsp), %xmm0, %xmm15	/* d ^= a, reading the spilled word 15 back */
+	vpxor %xmm12, %xmm1, %xmm12
+	vpaddd %xmm2, %xmm7, %xmm2
+	vpaddd %xmm3, %xmm4, %xmm3
+	vpxor %xmm13, %xmm2, %xmm13
+	vpxor %xmm14, %xmm3, %xmm14
+	vpshufb 480(%rsp), %xmm15, %xmm15	/* d = rotate(d, 8) */
+	vpshufb 480(%rsp), %xmm12, %xmm12
+	vpaddd %xmm10, %xmm15, %xmm10	/* c += d */
+	vpaddd %xmm11, %xmm12, %xmm11
+	vpshufb 480(%rsp), %xmm13, %xmm13
+	vpshufb 480(%rsp), %xmm14, %xmm14
+	vpaddd %xmm8, %xmm13, %xmm8
+	vpaddd %xmm9, %xmm14, %xmm9
+	vmovdqa %xmm15, 96(%rsp)	/* spill word 15 again */
+	vpxor %xmm5, %xmm10, %xmm5	/* b ^= c */
+	vpxor %xmm6, %xmm11, %xmm6
+	vpslld $ 7, %xmm5, %xmm15	/* b = rotate-left(b, 7) */
+	vpsrld $25, %xmm5, %xmm5
+	vpxor %xmm5, %xmm15, %xmm5
+	vpslld $ 7, %xmm6, %xmm15
+	vpsrld $25, %xmm6, %xmm6
+	vpxor %xmm6, %xmm15, %xmm6
+	vpxor %xmm7, %xmm8, %xmm7
+	vpxor %xmm4, %xmm9, %xmm4
+	vpslld $ 7, %xmm7, %xmm15
+	vpsrld $25, %xmm7, %xmm7
+	vpxor %xmm7, %xmm15, %xmm7
+	vpslld $ 7, %xmm4, %xmm15
+	vpsrld $25, %xmm4, %xmm4
+	vpxor %xmm4, %xmm15, %xmm4
+	vmovdqa 96(%rsp), %xmm15	/* restore word 15 for the next pass */
+	subq $2, %rax	/* two rounds done */
+	jnz .Lchacha_blocks_avx2_mainloop2
+	vmovdqa %xmm8, 192(%rsp)	/* park words 8..15 at 192..319(%rsp) while words 0..7 are written out */
+	vmovdqa %xmm9, 208(%rsp)
+	vmovdqa %xmm10, 224(%rsp)
+	vmovdqa %xmm11, 240(%rsp)
+	vmovdqa %xmm12, 256(%rsp)
+	vmovdqa %xmm13, 272(%rsp)
+	vmovdqa %xmm14, 288(%rsp)
+	vmovdqa %xmm15, 304(%rsp)
+	vpbroadcastd 0(%rsp), %xmm8	/* reload input state words 0..7, replicated to all lanes */
+	vpbroadcastd 4+0(%rsp), %xmm9
+	vpbroadcastd 8+0(%rsp), %xmm10
+	vpbroadcastd 12+0(%rsp), %xmm11
+	vpbroadcastd 16(%rsp), %xmm12
+	vpbroadcastd 4+16(%rsp), %xmm13
+	vpbroadcastd 8+16(%rsp), %xmm14
+	vpbroadcastd 12+16(%rsp), %xmm15
+	vpaddd %xmm8, %xmm0, %xmm0	/* add the input state words to the round output */
+	vpaddd %xmm9, %xmm1, %xmm1
+	vpaddd %xmm10, %xmm2, %xmm2
+	vpaddd %xmm11, %xmm3, %xmm3
+	vpaddd %xmm12, %xmm4, %xmm4
+	vpaddd %xmm13, %xmm5, %xmm5
+	vpaddd %xmm14, %xmm6, %xmm6
+	vpaddd %xmm15, %xmm7, %xmm7
+	vpunpckldq %xmm1, %xmm0, %xmm8	/* 4x4 dword transposes: from one word per register to 16 contiguous bytes of one block */
+	vpunpckldq %xmm3, %xmm2, %xmm9
+	vpunpckhdq %xmm1, %xmm0, %xmm12
+	vpunpckhdq %xmm3, %xmm2, %xmm13
+	vpunpckldq %xmm5, %xmm4, %xmm10
+	vpunpckldq %xmm7, %xmm6, %xmm11
+	vpunpckhdq %xmm5, %xmm4, %xmm14
+	vpunpckhdq %xmm7, %xmm6, %xmm15
+	vpunpcklqdq %xmm9, %xmm8, %xmm0	/* xmm0, xmm1 = words 0..3, 4..7 of block 0; the following pairs cover blocks 1..3 */
+	vpunpcklqdq %xmm11, %xmm10, %xmm1
+	vpunpckhqdq %xmm9, %xmm8, %xmm2
+	vpunpckhqdq %xmm11, %xmm10, %xmm3
+	vpunpcklqdq %xmm13, %xmm12, %xmm4
+	vpunpcklqdq %xmm15, %xmm14, %xmm5
+	vpunpckhqdq %xmm13, %xmm12, %xmm6
+	vpunpckhqdq %xmm15, %xmm14, %xmm7
+	andq %rsi, %rsi	/* NULL input pointer: store the raw keystream instead */
+	jz .Lchacha_blocks_avx2_noinput2
+	vpxor 0(%rsi), %xmm0, %xmm0	/* xor the first 32 bytes of each 64-byte input block */
+	vpxor 16(%rsi), %xmm1, %xmm1
+	vpxor 64(%rsi), %xmm2, %xmm2
+	vpxor 80(%rsi), %xmm3, %xmm3
+	vpxor 128(%rsi), %xmm4, %xmm4
+	vpxor 144(%rsi), %xmm5, %xmm5
+	vpxor 192(%rsi), %xmm6, %xmm6
+	vpxor 208(%rsi), %xmm7, %xmm7
+	vmovdqu %xmm0, 0(%rdx)	/* write the first 32 bytes of each 64-byte output block */
+	vmovdqu %xmm1, 16(%rdx)
+	vmovdqu %xmm2, 64(%rdx)
+	vmovdqu %xmm3, 80(%rdx)
+	vmovdqu %xmm4, 128(%rdx)
+	vmovdqu %xmm5, 144(%rdx)
+	vmovdqu %xmm6, 192(%rdx)
+	vmovdqu %xmm7, 208(%rdx)
+	vmovdqa 192(%rsp), %xmm0	/* bring back the parked words 8..15 */
+	vmovdqa 208(%rsp), %xmm1
+	vmovdqa 224(%rsp), %xmm2
+	vmovdqa 240(%rsp), %xmm3
+	vmovdqa 256(%rsp), %xmm4
+	vmovdqa 272(%rsp), %xmm5
+	vmovdqa 288(%rsp), %xmm6
+	vmovdqa 304(%rsp), %xmm7
+	vpbroadcastd 32(%rsp), %xmm8	/* input state words 8..11, replicated */
+	vpbroadcastd 4+32(%rsp), %xmm9
+	vpbroadcastd 8+32(%rsp), %xmm10
+	vpbroadcastd 12+32(%rsp), %xmm11
+	vmovdqa 128(%rsp), %xmm12	/* per-lane counter words 12, 13 prepared before the round loop */
+	vmovdqa 160(%rsp), %xmm13
+	vpbroadcastd 8+48(%rsp), %xmm14	/* input state words 14, 15, replicated */
+	vpbroadcastd 12+48(%rsp), %xmm15
+	vpaddd %xmm8, %xmm0, %xmm0	/* add the input state words to the round output */
+	vpaddd %xmm9, %xmm1, %xmm1
+	vpaddd %xmm10, %xmm2, %xmm2
+	vpaddd %xmm11, %xmm3, %xmm3
+	vpaddd %xmm12, %xmm4, %xmm4
+	vpaddd %xmm13, %xmm5, %xmm5
+	vpaddd %xmm14, %xmm6, %xmm6
+	vpaddd %xmm15, %xmm7, %xmm7
+	vpunpckldq %xmm1, %xmm0, %xmm8	/* same transposes, now for words 8..15 */
+	vpunpckldq %xmm3, %xmm2, %xmm9
+	vpunpckhdq %xmm1, %xmm0, %xmm12
+	vpunpckhdq %xmm3, %xmm2, %xmm13
+	vpunpckldq %xmm5, %xmm4, %xmm10
+	vpunpckldq %xmm7, %xmm6, %xmm11
+	vpunpckhdq %xmm5, %xmm4, %xmm14
+	vpunpckhdq %xmm7, %xmm6, %xmm15
+	vpunpcklqdq %xmm9, %xmm8, %xmm0
+	vpunpcklqdq %xmm11, %xmm10, %xmm1
+	vpunpckhqdq %xmm9, %xmm8, %xmm2
+	vpunpckhqdq %xmm11, %xmm10, %xmm3
+	vpunpcklqdq %xmm13, %xmm12, %xmm4
+	vpunpcklqdq %xmm15, %xmm14, %xmm5
+	vpunpckhqdq %xmm13, %xmm12, %xmm6
+	vpunpckhqdq %xmm15, %xmm14, %xmm7
+	vpxor 32(%rsi), %xmm0, %xmm0	/* xor the second 32 bytes of each 64-byte input block */
+	vpxor 48(%rsi), %xmm1, %xmm1
+	vpxor 96(%rsi), %xmm2, %xmm2
+	vpxor 112(%rsi), %xmm3, %xmm3
+	vpxor 160(%rsi), %xmm4, %xmm4
+	vpxor 176(%rsi), %xmm5, %xmm5
+	vpxor 224(%rsi), %xmm6, %xmm6
+	vpxor 240(%rsi), %xmm7, %xmm7
+	vmovdqu %xmm0, 32(%rdx)	/* write the second 32 bytes of each 64-byte output block */
+	vmovdqu %xmm1, 48(%rdx)
+	vmovdqu %xmm2, 96(%rdx)
+	vmovdqu %xmm3, 112(%rdx)
+	vmovdqu %xmm4, 160(%rdx)
+	vmovdqu %xmm5, 176(%rdx)
+	vmovdqu %xmm6, 224(%rdx)
+	vmovdqu %xmm7, 240(%rdx)
+	addq $256, %rsi	/* advance the input pointer; the output pointer is advanced at the shared continuation */
+	jmp .Lchacha_blocks_avx2_mainloop2_cont
+.Lchacha_blocks_avx2_noinput2:	/* input pointer is NULL: store the keystream without xoring */
+	vmovdqu %xmm0, 0(%rdx)	/* first 32 bytes of each 64-byte output block */
+	vmovdqu %xmm1, 16(%rdx)
+	vmovdqu %xmm2, 64(%rdx)
+	vmovdqu %xmm3, 80(%rdx)
+	vmovdqu %xmm4, 128(%rdx)
+	vmovdqu %xmm5, 144(%rdx)
+	vmovdqu %xmm6, 192(%rdx)
+	vmovdqu %xmm7, 208(%rdx)
+	vmovdqa 192(%rsp), %xmm0	/* bring back the parked words 8..15 */
+	vmovdqa 208(%rsp), %xmm1
+	vmovdqa 224(%rsp), %xmm2
+	vmovdqa 240(%rsp), %xmm3
+	vmovdqa 256(%rsp), %xmm4
+	vmovdqa 272(%rsp), %xmm5
+	vmovdqa 288(%rsp), %xmm6
+	vmovdqa 304(%rsp), %xmm7
+	vpbroadcastd 32(%rsp), %xmm8	/* input state words 8..11, replicated */
+	vpbroadcastd 4+32(%rsp), %xmm9
+	vpbroadcastd 8+32(%rsp), %xmm10
+	vpbroadcastd 12+32(%rsp), %xmm11
+	vmovdqa 128(%rsp), %xmm12	/* per-lane counter words 12, 13 */
+	vmovdqa 160(%rsp), %xmm13
+	vpbroadcastd 8+48(%rsp), %xmm14	/* input state words 14, 15, replicated */
+	vpbroadcastd 12+48(%rsp), %xmm15
+	vpaddd %xmm8, %xmm0, %xmm0	/* add the input state words to the round output */
+	vpaddd %xmm9, %xmm1, %xmm1
+	vpaddd %xmm10, %xmm2, %xmm2
+	vpaddd %xmm11, %xmm3, %xmm3
+	vpaddd %xmm12, %xmm4, %xmm4
+	vpaddd %xmm13, %xmm5, %xmm5
+	vpaddd %xmm14, %xmm6, %xmm6
+	vpaddd %xmm15, %xmm7, %xmm7
+	vpunpckldq %xmm1, %xmm0, %xmm8	/* 4x4 dword transposes: 16 contiguous bytes of one block per register */
+	vpunpckldq %xmm3, %xmm2, %xmm9
+	vpunpckhdq %xmm1, %xmm0, %xmm12
+	vpunpckhdq %xmm3, %xmm2, %xmm13
+	vpunpckldq %xmm5, %xmm4, %xmm10
+	vpunpckldq %xmm7, %xmm6, %xmm11
+	vpunpckhdq %xmm5, %xmm4, %xmm14
+	vpunpckhdq %xmm7, %xmm6, %xmm15
+	vpunpcklqdq %xmm9, %xmm8, %xmm0
+	vpunpcklqdq %xmm11, %xmm10, %xmm1
+	vpunpckhqdq %xmm9, %xmm8, %xmm2
+	vpunpckhqdq %xmm11, %xmm10, %xmm3
+	vpunpcklqdq %xmm13, %xmm12, %xmm4
+	vpunpcklqdq %xmm15, %xmm14, %xmm5
+	vpunpckhqdq %xmm13, %xmm12, %xmm6
+	vpunpckhqdq %xmm15, %xmm14, %xmm7
+	vmovdqu %xmm0, 32(%rdx)	/* second 32 bytes of each 64-byte output block */
+	vmovdqu %xmm1, 48(%rdx)
+	vmovdqu %xmm2, 96(%rdx)
+	vmovdqu %xmm3, 112(%rdx)
+	vmovdqu %xmm4, 160(%rdx)
+	vmovdqu %xmm5, 176(%rdx)
+	vmovdqu %xmm6, 224(%rdx)
+	vmovdqu %xmm7, 240(%rdx)
+.Lchacha_blocks_avx2_mainloop2_cont:	/* bookkeeping after one 256-byte pass */
+	addq $256, %rdx	/* advance the output pointer */
+	subq $256, %rcx	/* 256 fewer bytes remaining */
+	cmp $256, %rcx
+	jae .Lchacha_blocks_avx2_atleast256	/* another 4-block pass fits; otherwise fall through to the fixup */
+.Lchacha_blocks_avx2_below256_fixup:	/* the wide paths clobbered xmm6..xmm11 and %r9; rebuild what the single-block path expects */
+	vmovdqa 448(%rsp), %xmm6	/* xmm6 = rotate-by-16 mask */
+	vmovdqa 480(%rsp), %xmm7	/* xmm7 = rotate-by-8 mask */
+	vmovdqa 0(%rsp), %xmm8	/* xmm8..xmm11 = state rows from the stack copy */
+	vmovdqa 16(%rsp), %xmm9
+	vmovdqa 32(%rsp), %xmm10
+	vmovdqa 48(%rsp), %xmm11	/* row 3 includes the counter as advanced by the wide paths */
+	movq $1, %r9	/* counter increment for the single-block path */
+.Lchacha_blocks_avx2_below256:	/* single-block path: one 64-byte block per pass */
+	vmovq %r9, %xmm5	/* xmm5 = counter increment; %r9 holds 1 on every path that reaches this label */
+	andq %rcx, %rcx
+	jz .Lchacha_blocks_avx2_done	/* nothing left to process */
+	cmpq $64, %rcx
+	jae .Lchacha_blocks_avx2_above63	/* full block: read and write the caller's buffers directly */
+	movq %rdx, %r9	/* partial final block: remember the real output pointer in %r9 */
+	andq %rsi, %rsi
+	jz .Lchacha_blocks_avx2_noinput3	/* NULL input: nothing to stage */
+	movq %rcx, %r10	/* r10 = number of input bytes to stage */
+	movq %rsp, %rdx	/* staging buffer is the start of the stack scratch area */
+	addq %r10, %rsi	/* point both at the end, then index with a negative offset */
+	addq %r10, %rdx
+	negq %r10
+.Lchacha_blocks_avx2_copyinput:	/* byte-wise copy of the partial input into the staging buffer */
+	movb (%rsi, %r10), %al
+	movb %al, (%rdx, %r10)
+	incq %r10
+	jnz .Lchacha_blocks_avx2_copyinput
+	movq %rsp, %rsi	/* read the input from the staging buffer */
+.Lchacha_blocks_avx2_noinput3:
+	movq %rsp, %rdx	/* write the whole 64-byte result to the staging buffer; copied out at the finishup label */
+.Lchacha_blocks_avx2_above63:	/* compute one 64-byte block; xmm0..xmm3 hold state rows a, b, c, d */
+	vmovdqa %xmm8, %xmm0	/* working copy of the state rows kept in xmm8..xmm11 */
+	vmovdqa %xmm9, %xmm1
+	vmovdqa %xmm10, %xmm2
+	vmovdqa %xmm11, %xmm3
+	movq 64(%rsp), %rax	/* rax = round count */
+.Lchacha_blocks_avx2_mainloop3:	/* one pass = two rounds */
+	vpaddd %xmm0, %xmm1, %xmm0	/* a += b */
+	vpxor %xmm3, %xmm0, %xmm3	/* d ^= a */
+	vpshufb %xmm6, %xmm3, %xmm3	/* d = rotate(d, 16) */
+	vpaddd %xmm2, %xmm3, %xmm2	/* c += d */
+	vpxor %xmm1, %xmm2, %xmm1	/* b ^= c */
+	vpslld $12, %xmm1, %xmm4	/* b = rotate-left(b, 12), xmm4 is the temporary */
+	vpsrld $20, %xmm1, %xmm1
+	vpxor %xmm1, %xmm4, %xmm1
+	vpaddd %xmm0, %xmm1, %xmm0	/* a += b */
+	vpxor %xmm3, %xmm0, %xmm3	/* d ^= a */
+	vpshufb %xmm7, %xmm3, %xmm3	/* d = rotate(d, 8) */
+	vpshufd $0x93, %xmm0, %xmm0	/* rotate the dwords of rows a, d, c so the next round works on different word groupings */
+	vpaddd %xmm2, %xmm3, %xmm2	/* c += d */
+	vpshufd $0x4e, %xmm3, %xmm3
+	vpxor %xmm1, %xmm2, %xmm1	/* b ^= c */
+	vpshufd $0x39, %xmm2, %xmm2
+	vpslld $7, %xmm1, %xmm4	/* b = rotate-left(b, 7) */
+	vpsrld $25, %xmm1, %xmm1
+	vpxor %xmm1, %xmm4, %xmm1
+	vpaddd %xmm0, %xmm1, %xmm0	/* second round, same sequence on the shuffled rows */
+	vpxor %xmm3, %xmm0, %xmm3
+	vpshufb %xmm6, %xmm3, %xmm3
+	vpaddd %xmm2, %xmm3, %xmm2
+	vpxor %xmm1, %xmm2, %xmm1
+	vpslld $12, %xmm1, %xmm4
+	vpsrld $20, %xmm1, %xmm1
+	vpxor %xmm1, %xmm4, %xmm1
+	vpaddd %xmm0, %xmm1, %xmm0
+	vpxor %xmm3, %xmm0, %xmm3
+	vpshufb %xmm7, %xmm3, %xmm3
+	vpshufd $0x39, %xmm0, %xmm0	/* undo the dword rotation of rows a, d, c */
+	vpaddd %xmm2, %xmm3, %xmm2
+	vpshufd $0x4e, %xmm3, %xmm3
+	vpxor %xmm1, %xmm2, %xmm1
+	vpshufd $0x93, %xmm2, %xmm2
+	vpslld $7, %xmm1, %xmm4
+	vpsrld $25, %xmm1, %xmm1
+	vpxor %xmm1, %xmm4, %xmm1
+	subq $2, %rax	/* two rounds done */
+	jnz .Lchacha_blocks_avx2_mainloop3
+	vpaddd %xmm0, %xmm8, %xmm0	/* add the input state rows to the round output */
+	vpaddd %xmm1, %xmm9, %xmm1
+	vpaddd %xmm2, %xmm10, %xmm2
+	vpaddd %xmm3, %xmm11, %xmm3
+	andq %rsi, %rsi
+	jz .Lchacha_blocks_avx2_noinput4	/* NULL input: store the raw keystream */
+	vpxor 0(%rsi), %xmm0, %xmm0	/* xor the keystream with 64 input bytes */
+	vpxor 16(%rsi), %xmm1, %xmm1
+	vpxor 32(%rsi), %xmm2, %xmm2
+	vpxor 48(%rsi), %xmm3, %xmm3
+	addq $64, %rsi
+.Lchacha_blocks_avx2_noinput4:
+	vmovdqu %xmm0, 0(%rdx)	/* store 64 bytes (to the stack staging buffer for a partial block) */
+	vmovdqu %xmm1, 16(%rdx)
+	vmovdqu %xmm2, 32(%rdx)
+	vmovdqu %xmm3, 48(%rdx)
+	vpaddq %xmm11, %xmm5, %xmm11	/* advance the 64-bit counter in the low qword of row 3 by the increment in xmm5 */
+	cmpq $64, %rcx
+	jbe .Lchacha_blocks_avx2_mainloop3_finishup	/* that was the last block */
+	addq $64, %rdx
+	subq $64, %rcx
+	jmp .Lchacha_blocks_avx2_below256
+.Lchacha_blocks_avx2_mainloop3_finishup:	/* reached after the last block, with %rcx <= 64 */
+	cmpq $64, %rcx
+	je .Lchacha_blocks_avx2_done	/* full final block was written straight to the caller's buffer */
+	addq %rcx, %r9	/* r9 = saved real output pointer, rdx = stack staging buffer; point both at the end */
+	addq %rcx, %rdx
+	negq %rcx	/* index upward from -count to 0 */
+.Lchacha_blocks_avx2_copyoutput:	/* byte-wise copy of the partial block from the staging buffer to the real output */
+	movb (%rdx, %rcx), %al
+	movb %al, (%r9, %rcx)
+	incq %rcx
+	jnz .Lchacha_blocks_avx2_copyoutput
+.Lchacha_blocks_avx2_done:	/* common exit */
+	vmovdqu %xmm11, 48(%rdi)	/* write state row 3 (with the advanced counter) back to the caller's state */
+	movq %rbp, %rsp	/* drop the aligned scratch area */
+	popq %r14	/* restore callee-saved registers in reverse push order */
+	popq %r13
+	popq %r12
+	popq %rbp
+	popq %rbx
+	vzeroall	/* zero all ymm registers before returning */
+	movl $(63 + 512), %eax	/* return value: 512 scratch bytes plus up to 63 bytes of alignment; NOTE(review): presumably the stack depth the caller should wipe - confirm in chacha20.c */
+	ret
+.size _gcry_chacha20_amd64_avx2_blocks,.-_gcry_chacha20_amd64_avx2_blocks;
+
+.data
+.align 16
+.LC:	/* two 16-byte vpshufb masks, loaded into %xmm6 and %xmm7 at function entry */
+.byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13       /* pshufb rotate by 16 */
+.byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14       /* pshufb rotate by 8 */
+
+#endif /*defined(USE_CHACHA20)*/
+#endif /*__x86_64*/
diff --git a/cipher/chacha20.c b/cipher/chacha20.c
index de8982b..2ac5a32 100644
--- a/cipher/chacha20.c
+++ b/cipher/chacha20.c
@@ -54,6 +54,13 @@
 # define USE_SSSE3 1
 #endif
 
+/* USE_AVX2 indicates whether to compile with Intel AVX2 code. */
+#undef USE_AVX2
+#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \
+    defined(HAVE_GCC_INLINE_ASM_AVX2)
+# define USE_AVX2 1
+#endif
+
 
 struct CHACHA20_context_s;
 
@@ -77,6 +84,13 @@ unsigned int _gcry_chacha20_amd64_ssse3_blocks(u32 *state, const byte *in,
 
 #endif /* USE_SSSE3 */
 
+#ifdef USE_AVX2
+
+unsigned int _gcry_chacha20_amd64_avx2_blocks(u32 *state, const byte *in,
+                                              byte *out, size_t bytes);
+
+#endif /* USE_AVX2 */
+
 
 static void chacha20_setiv (void *context, const byte * iv, size_t ivlen);
 static const char *selftest (void);
@@ -314,6 +328,10 @@ chacha20_do_setkey (CHACHA20_context_t * ctx,
   if (features & HWF_INTEL_SSSE3)
     ctx->blocks = _gcry_chacha20_amd64_ssse3_blocks;
 #endif
+#ifdef USE_AVX2
+  if (features & HWF_INTEL_AVX2)
+    ctx->blocks = _gcry_chacha20_amd64_avx2_blocks;
+#endif
 
   (void)features;
 
@@ -422,7 +440,7 @@ selftest (void)
 {
   CHACHA20_context_t ctx;
   byte scratch[127 + 1];
-  byte buf[256 + 64 + 4];
+  byte buf[512 + 64 + 4];
   int i;
 
   /* From draft-strombergson-chacha-test-vectors */
diff --git a/configure.ac b/configure.ac
index 0342067..3a0fd52 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1816,6 +1816,7 @@ if test "$found" = "1" ; then
       x86_64-*-*)
          # Build with the assembly implementation
          GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-ssse3-amd64.lo"
+         GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-avx2-amd64.lo"
       ;;
    esac
 fi

commit def7d4cad386271c6d4e2f10aabe0cb4abd871e4
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date:   Sun May 11 12:00:19 2014 +0300

    chacha20: add SSSE3 assembly implementation
    
    * cipher/Makefile.am: Add 'chacha20-ssse3-amd64.S'.
    * cipher/chacha20-ssse3-amd64.S: New.
    * cipher/chacha20.c (USE_SSSE3): New macro.
    [USE_SSSE3] (_gcry_chacha20_amd64_ssse3_blocks): New.
    (chacha20_do_setkey): Select SSSE3 implementation if there is HW
    support.
    * configure.ac [host=x86-64]: Add 'chacha20-ssse3-amd64.lo'.
    --
    
    Add SSSE3 optimized implementation for ChaCha20. Based on implementation
    by Andrew Moon.
    
    Before (Intel Haswell):
    
     CHACHA20       |  nanosecs/byte   mebibytes/sec   cycles/byte
         STREAM enc |      1.97 ns/B     483.6 MiB/s      6.31 c/B
         STREAM dec |      1.97 ns/B     484.0 MiB/s      6.31 c/B
    
    After:
    
     CHACHA20       |  nanosecs/byte   mebibytes/sec   cycles/byte
         STREAM enc |     0.742 ns/B    1284.8 MiB/s      2.38 c/B
         STREAM dec |     0.741 ns/B    1286.5 MiB/s      2.37 c/B
    
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index bc7959a..27ca7ac 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -59,7 +59,7 @@ EXTRA_libcipher_la_SOURCES = \
 arcfour.c arcfour-amd64.S \
 blowfish.c blowfish-amd64.S blowfish-arm.S \
 cast5.c cast5-amd64.S cast5-arm.S \
-chacha20.c \
+chacha20.c chacha20-ssse3-amd64.S \
 crc.c \
 des.c des-amd64.S \
 dsa.c \
diff --git a/cipher/chacha20-ssse3-amd64.S b/cipher/chacha20-ssse3-amd64.S
new file mode 100644
index 0000000..aaa7e5b
--- /dev/null
+++ b/cipher/chacha20-ssse3-amd64.S
@@ -0,0 +1,610 @@
+/* chacha20-ssse3-amd64.S  -  AMD64/SSSE3 implementation of ChaCha20
+ *
+ * Copyright (C) 2014 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * Based on public domain implementation by Andrew Moon at
+ *  https://github.com/floodyberry/chacha-opt
+ */
+
+#ifdef __x86_64__
+#include <config.h>
+
+#if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \
+    defined(HAVE_GCC_INLINE_ASM_SSSE3) && USE_CHACHA20
+
+#ifdef __PIC__
+#  define RIP (%rip)
+#else
+#  define RIP
+#endif
+
+.text
+
+.align 8
+.globl _gcry_chacha20_amd64_ssse3_blocks
+.type  _gcry_chacha20_amd64_ssse3_blocks,@function;
+_gcry_chacha20_amd64_ssse3_blocks:	/* rdi=state, rsi=in (may be NULL), rdx=out, rcx=bytes */
+.Lchacha_blocks_ssse3_local:
+	pushq %rbx
+	pushq %rbp
+	movq %rsp, %rbp	/* rbp keeps the caller's rsp for the epilogue */
+	andq $~63, %rsp	/* 64-byte align the scratch frame */
+	subq $512, %rsp	/* 512 bytes of scratch, addressed off rsp below */
+	leaq .LC RIP, %rax
+	movdqa 0(%rax), %xmm6	/* xmm6 = pshufb mask: rotate by 16 */
+	movdqa 16(%rax), %xmm7	/* xmm7 = pshufb mask: rotate by 8 */
+	movdqu 0(%rdi), %xmm8	/* xmm8..xmm11 = the 16 state words (4 rows) */
+	movdqu 16(%rdi), %xmm9
+	movdqu 32(%rdi), %xmm10
+	movdqu 48(%rdi), %xmm11
+	movl $20, %eax	/* round count; both loops subtract 2 per iteration */
+	movq $1, %r9	/* per-block counter increment (moved to xmm5 later) */
+	movdqa %xmm8, 0(%rsp)	/* 0..63(%rsp): copy of the state rows */
+	movdqa %xmm9, 16(%rsp)
+	movdqa %xmm10, 32(%rsp)
+	movdqa %xmm11, 48(%rsp)
+	movdqa %xmm6, 80(%rsp)	/* 80(%rsp): rotate-by-16 mask */
+	movdqa %xmm7, 96(%rsp)	/* 96(%rsp): rotate-by-8 mask */
+	movq %rax, 64(%rsp)	/* 64(%rsp): round count */
+	cmpq $256, %rcx
+	jb .Lchacha_blocks_ssse3_below256	/* under 256 bytes: one block at a time */
+	pshufd $0x00, %xmm8, %xmm0	/* broadcast each state word to all four lanes */
+	pshufd $0x55, %xmm8, %xmm1
+	pshufd $0xaa, %xmm8, %xmm2
+	pshufd $0xff, %xmm8, %xmm3
+	movdqa %xmm0, 128(%rsp)	/* 128..(%rsp): one 16-byte vector per state word */
+	movdqa %xmm1, 144(%rsp)
+	movdqa %xmm2, 160(%rsp)
+	movdqa %xmm3, 176(%rsp)
+	pshufd $0x00, %xmm9, %xmm0
+	pshufd $0x55, %xmm9, %xmm1
+	pshufd $0xaa, %xmm9, %xmm2
+	pshufd $0xff, %xmm9, %xmm3
+	movdqa %xmm0, 192(%rsp)
+	movdqa %xmm1, 208(%rsp)
+	movdqa %xmm2, 224(%rsp)
+	movdqa %xmm3, 240(%rsp)
+	pshufd $0x00, %xmm10, %xmm0
+	pshufd $0x55, %xmm10, %xmm1
+	pshufd $0xaa, %xmm10, %xmm2
+	pshufd $0xff, %xmm10, %xmm3
+	movdqa %xmm0, 256(%rsp)
+	movdqa %xmm1, 272(%rsp)
+	movdqa %xmm2, 288(%rsp)
+	movdqa %xmm3, 304(%rsp)
+	pshufd $0xaa, %xmm11, %xmm0	/* only words 14 and 15 here; 12/13 are built per pass */
+	pshufd $0xff, %xmm11, %xmm1
+	movdqa %xmm0, 352(%rsp)
+	movdqa %xmm1, 368(%rsp)
+	jmp .Lchacha_blocks_ssse3_atleast256
+.p2align 6,,63
+	# align to 4 mod 64
+	nop;nop;nop;nop;
+.Lchacha_blocks_ssse3_atleast256:	/* four blocks (256 bytes) per pass */
+	movq 48(%rsp), %rax	/* rax = 64-bit block counter (state words 12 and 13) */
+	leaq 1(%rax), %r8
+	leaq 2(%rax), %r9
+	leaq 3(%rax), %r10
+	leaq 4(%rax), %rbx
+	movl %eax, 320(%rsp)	/* 320(%rsp): low counter halves for blocks n..n+3 */
+	movl %r8d, 4+320(%rsp)
+	movl %r9d, 8+320(%rsp)
+	movl %r10d, 12+320(%rsp)
+	shrq $32, %rax
+	shrq $32, %r8
+	shrq $32, %r9
+	shrq $32, %r10
+	movl %eax, 336(%rsp)	/* 336(%rsp): high counter halves for blocks n..n+3 */
+	movl %r8d, 4+336(%rsp)
+	movl %r9d, 8+336(%rsp)
+	movl %r10d, 12+336(%rsp)
+	movq %rbx, 48(%rsp)	/* stored counter advances by 4 */
+	movq 64(%rsp), %rax	/* rax = rounds remaining */
+	movdqa 128(%rsp), %xmm0	/* xmm0..xmm15 = state words 0..15, four blocks wide */
+	movdqa 144(%rsp), %xmm1
+	movdqa 160(%rsp), %xmm2
+	movdqa 176(%rsp), %xmm3
+	movdqa 192(%rsp), %xmm4
+	movdqa 208(%rsp), %xmm5
+	movdqa 224(%rsp), %xmm6
+	movdqa 240(%rsp), %xmm7
+	movdqa 256(%rsp), %xmm8
+	movdqa 272(%rsp), %xmm9
+	movdqa 288(%rsp), %xmm10
+	movdqa 304(%rsp), %xmm11
+	movdqa 320(%rsp), %xmm12
+	movdqa 336(%rsp), %xmm13
+	movdqa 352(%rsp), %xmm14
+	movdqa 368(%rsp), %xmm15
+.Lchacha_blocks_ssse3_mainloop1:	/* one double round per iteration */
+	paddd %xmm4, %xmm0	/* column quarter-rounds: words (0,4,8,12) .. (3,7,11,15) */
+	paddd %xmm5, %xmm1
+	pxor %xmm0, %xmm12
+	pxor %xmm1, %xmm13
+	paddd %xmm6, %xmm2
+	paddd %xmm7, %xmm3
+	pxor %xmm2, %xmm14
+	pxor %xmm3, %xmm15
+	pshufb 80(%rsp), %xmm12	/* rotate by 16 via the mask at 80(%rsp) */
+	pshufb 80(%rsp), %xmm13
+	paddd %xmm12, %xmm8
+	paddd %xmm13, %xmm9
+	pshufb 80(%rsp), %xmm14
+	pshufb 80(%rsp), %xmm15
+	paddd %xmm14, %xmm10
+	paddd %xmm15, %xmm11
+	movdqa %xmm12, 112(%rsp)	/* spill xmm12 so it can serve as the rotate temporary */
+	pxor %xmm8, %xmm4
+	pxor %xmm9, %xmm5
+	movdqa %xmm4, %xmm12
+	pslld $ 12, %xmm4	/* rotate left by 12: shift-left / shift-right / xor */
+	psrld $20, %xmm12
+	pxor %xmm12, %xmm4
+	movdqa %xmm5, %xmm12
+	pslld $ 12, %xmm5
+	psrld $20, %xmm12
+	pxor %xmm12, %xmm5
+	pxor %xmm10, %xmm6
+	pxor %xmm11, %xmm7
+	movdqa %xmm6, %xmm12
+	pslld $ 12, %xmm6
+	psrld $20, %xmm12
+	pxor %xmm12, %xmm6
+	movdqa %xmm7, %xmm12
+	pslld $ 12, %xmm7
+	psrld $20, %xmm12
+	pxor %xmm12, %xmm7
+	movdqa 112(%rsp), %xmm12	/* restore the spilled word */
+	paddd %xmm4, %xmm0
+	paddd %xmm5, %xmm1
+	pxor %xmm0, %xmm12
+	pxor %xmm1, %xmm13
+	paddd %xmm6, %xmm2
+	paddd %xmm7, %xmm3
+	pxor %xmm2, %xmm14
+	pxor %xmm3, %xmm15
+	pshufb 96(%rsp), %xmm12	/* rotate by 8 via the mask at 96(%rsp) */
+	pshufb 96(%rsp), %xmm13
+	paddd %xmm12, %xmm8
+	paddd %xmm13, %xmm9
+	pshufb 96(%rsp), %xmm14
+	pshufb 96(%rsp), %xmm15
+	paddd %xmm14, %xmm10
+	paddd %xmm15, %xmm11
+	movdqa %xmm12, 112(%rsp)
+	pxor %xmm8, %xmm4
+	pxor %xmm9, %xmm5
+	movdqa %xmm4, %xmm12
+	pslld $ 7, %xmm4	/* rotate left by 7 */
+	psrld $25, %xmm12
+	pxor %xmm12, %xmm4
+	movdqa %xmm5, %xmm12
+	pslld $ 7, %xmm5
+	psrld $25, %xmm12
+	pxor %xmm12, %xmm5
+	pxor %xmm10, %xmm6
+	pxor %xmm11, %xmm7
+	movdqa %xmm6, %xmm12
+	pslld $ 7, %xmm6
+	psrld $25, %xmm12
+	pxor %xmm12, %xmm6
+	movdqa %xmm7, %xmm12
+	pslld $ 7, %xmm7
+	psrld $25, %xmm12
+	pxor %xmm12, %xmm7
+	movdqa 112(%rsp), %xmm12
+	paddd %xmm5, %xmm0	/* diagonal quarter-rounds: words (0,5,10,15) .. (3,4,9,14) */
+	paddd %xmm6, %xmm1
+	pxor %xmm0, %xmm15
+	pxor %xmm1, %xmm12
+	paddd %xmm7, %xmm2
+	paddd %xmm4, %xmm3
+	pxor %xmm2, %xmm13
+	pxor %xmm3, %xmm14
+	pshufb 80(%rsp), %xmm15
+	pshufb 80(%rsp), %xmm12
+	paddd %xmm15, %xmm10
+	paddd %xmm12, %xmm11
+	pshufb 80(%rsp), %xmm13
+	pshufb 80(%rsp), %xmm14
+	paddd %xmm13, %xmm8
+	paddd %xmm14, %xmm9
+	movdqa %xmm15, 112(%rsp)	/* this half spills xmm15 as the rotate temporary */
+	pxor %xmm10, %xmm5
+	pxor %xmm11, %xmm6
+	movdqa %xmm5, %xmm15
+	pslld $ 12, %xmm5
+	psrld $20, %xmm15
+	pxor %xmm15, %xmm5
+	movdqa %xmm6, %xmm15
+	pslld $ 12, %xmm6
+	psrld $20, %xmm15
+	pxor %xmm15, %xmm6
+	pxor %xmm8, %xmm7
+	pxor %xmm9, %xmm4
+	movdqa %xmm7, %xmm15
+	pslld $ 12, %xmm7
+	psrld $20, %xmm15
+	pxor %xmm15, %xmm7
+	movdqa %xmm4, %xmm15
+	pslld $ 12, %xmm4
+	psrld $20, %xmm15
+	pxor %xmm15, %xmm4
+	movdqa 112(%rsp), %xmm15
+	paddd %xmm5, %xmm0
+	paddd %xmm6, %xmm1
+	pxor %xmm0, %xmm15
+	pxor %xmm1, %xmm12
+	paddd %xmm7, %xmm2
+	paddd %xmm4, %xmm3
+	pxor %xmm2, %xmm13
+	pxor %xmm3, %xmm14
+	pshufb 96(%rsp), %xmm15
+	pshufb 96(%rsp), %xmm12
+	paddd %xmm15, %xmm10
+	paddd %xmm12, %xmm11
+	pshufb 96(%rsp), %xmm13
+	pshufb 96(%rsp), %xmm14
+	paddd %xmm13, %xmm8
+	paddd %xmm14, %xmm9
+	movdqa %xmm15, 112(%rsp)
+	pxor %xmm10, %xmm5
+	pxor %xmm11, %xmm6
+	movdqa %xmm5, %xmm15
+	pslld $ 7, %xmm5
+	psrld $25, %xmm15
+	pxor %xmm15, %xmm5
+	movdqa %xmm6, %xmm15
+	pslld $ 7, %xmm6
+	psrld $25, %xmm15
+	pxor %xmm15, %xmm6
+	pxor %xmm8, %xmm7
+	pxor %xmm9, %xmm4
+	movdqa %xmm7, %xmm15
+	pslld $ 7, %xmm7
+	psrld $25, %xmm15
+	pxor %xmm15, %xmm7
+	movdqa %xmm4, %xmm15
+	pslld $ 7, %xmm4
+	psrld $25, %xmm15
+	pxor %xmm15, %xmm4
+	subq $2, %rax	/* two rounds done; sets ZF for the jnz below */
+	movdqa 112(%rsp), %xmm15	/* movdqa leaves the flags from subq intact */
+	jnz .Lchacha_blocks_ssse3_mainloop1
+	paddd 128(%rsp), %xmm0	/* feed-forward: add the original input words */
+	paddd 144(%rsp), %xmm1
+	paddd 160(%rsp), %xmm2
+	paddd 176(%rsp), %xmm3
+	paddd 192(%rsp), %xmm4
+	paddd 208(%rsp), %xmm5
+	paddd 224(%rsp), %xmm6
+	paddd 240(%rsp), %xmm7
+	paddd 256(%rsp), %xmm8
+	paddd 272(%rsp), %xmm9
+	paddd 288(%rsp), %xmm10
+	paddd 304(%rsp), %xmm11
+	paddd 320(%rsp), %xmm12
+	paddd 336(%rsp), %xmm13
+	paddd 352(%rsp), %xmm14
+	paddd 368(%rsp), %xmm15
+	movdqa %xmm8, 384(%rsp)	/* park words 8..15 while words 0..7 are transposed */
+	movdqa %xmm9, 400(%rsp)
+	movdqa %xmm10, 416(%rsp)
+	movdqa %xmm11, 432(%rsp)
+	movdqa %xmm12, 448(%rsp)
+	movdqa %xmm13, 464(%rsp)
+	movdqa %xmm14, 480(%rsp)
+	movdqa %xmm15, 496(%rsp)
+	movdqa %xmm0, %xmm8	/* transpose: per-word vectors -> 16 contiguous bytes per block */
+	movdqa %xmm2, %xmm9
+	movdqa %xmm4, %xmm10
+	movdqa %xmm6, %xmm11
+	punpckhdq %xmm1, %xmm0
+	punpckhdq %xmm3, %xmm2
+	punpckhdq %xmm5, %xmm4
+	punpckhdq %xmm7, %xmm6
+	punpckldq %xmm1, %xmm8
+	punpckldq %xmm3, %xmm9
+	punpckldq %xmm5, %xmm10
+	punpckldq %xmm7, %xmm11
+	movdqa %xmm0, %xmm1
+	movdqa %xmm4, %xmm3
+	movdqa %xmm8, %xmm5
+	movdqa %xmm10, %xmm7
+	punpckhqdq %xmm2, %xmm0
+	punpckhqdq %xmm6, %xmm4
+	punpckhqdq %xmm9, %xmm8
+	punpckhqdq %xmm11, %xmm10
+	punpcklqdq %xmm2, %xmm1
+	punpcklqdq %xmm6, %xmm3
+	punpcklqdq %xmm9, %xmm5
+	punpcklqdq %xmm11, %xmm7
+	andq %rsi, %rsi	/* NULL input pointer: emit raw keystream */
+	jz .Lchacha_blocks_ssse3_noinput1
+	movdqu 0(%rsi), %xmm2	/* first 32 bytes of each of the four 64-byte blocks */
+	movdqu 16(%rsi), %xmm6
+	movdqu 64(%rsi), %xmm9
+	movdqu 80(%rsi), %xmm11
+	movdqu 128(%rsi), %xmm12
+	movdqu 144(%rsi), %xmm13
+	movdqu 192(%rsi), %xmm14
+	movdqu 208(%rsi), %xmm15
+	pxor %xmm2, %xmm5
+	pxor %xmm6, %xmm7
+	pxor %xmm9, %xmm8
+	pxor %xmm11, %xmm10
+	pxor %xmm12, %xmm1
+	pxor %xmm13, %xmm3
+	pxor %xmm14, %xmm0
+	pxor %xmm15, %xmm4
+	movdqu %xmm5, 0(%rdx)
+	movdqu %xmm7, 16(%rdx)
+	movdqu %xmm8, 64(%rdx)
+	movdqu %xmm10, 80(%rdx)
+	movdqu %xmm1, 128(%rdx)
+	movdqu %xmm3, 144(%rdx)
+	movdqu %xmm0, 192(%rdx)
+	movdqu %xmm4, 208(%rdx)
+	movdqa 384(%rsp), %xmm0	/* reload the parked words 8..15 and transpose them */
+	movdqa 400(%rsp), %xmm1
+	movdqa 416(%rsp), %xmm2
+	movdqa 432(%rsp), %xmm3
+	movdqa 448(%rsp), %xmm4
+	movdqa 464(%rsp), %xmm5
+	movdqa 480(%rsp), %xmm6
+	movdqa 496(%rsp), %xmm7
+	movdqa %xmm0, %xmm8
+	movdqa %xmm2, %xmm9
+	movdqa %xmm4, %xmm10
+	movdqa %xmm6, %xmm11
+	punpckldq %xmm1, %xmm8
+	punpckldq %xmm3, %xmm9
+	punpckhdq %xmm1, %xmm0
+	punpckhdq %xmm3, %xmm2
+	punpckldq %xmm5, %xmm10
+	punpckldq %xmm7, %xmm11
+	punpckhdq %xmm5, %xmm4
+	punpckhdq %xmm7, %xmm6
+	movdqa %xmm8, %xmm1
+	movdqa %xmm0, %xmm3
+	movdqa %xmm10, %xmm5
+	movdqa %xmm4, %xmm7
+	punpcklqdq %xmm9, %xmm1
+	punpcklqdq %xmm11, %xmm5
+	punpckhqdq %xmm9, %xmm8
+	punpckhqdq %xmm11, %xmm10
+	punpcklqdq %xmm2, %xmm3
+	punpcklqdq %xmm6, %xmm7
+	punpckhqdq %xmm2, %xmm0
+	punpckhqdq %xmm6, %xmm4
+	movdqu 32(%rsi), %xmm2	/* second 32 bytes of each of the four blocks */
+	movdqu 48(%rsi), %xmm6
+	movdqu 96(%rsi), %xmm9
+	movdqu 112(%rsi), %xmm11
+	movdqu 160(%rsi), %xmm12
+	movdqu 176(%rsi), %xmm13
+	movdqu 224(%rsi), %xmm14
+	movdqu 240(%rsi), %xmm15
+	pxor %xmm2, %xmm1
+	pxor %xmm6, %xmm5
+	pxor %xmm9, %xmm8
+	pxor %xmm11, %xmm10
+	pxor %xmm12, %xmm3
+	pxor %xmm13, %xmm7
+	pxor %xmm14, %xmm0
+	pxor %xmm15, %xmm4
+	movdqu %xmm1, 32(%rdx)
+	movdqu %xmm5, 48(%rdx)
+	movdqu %xmm8, 96(%rdx)
+	movdqu %xmm10, 112(%rdx)
+	movdqu %xmm3, 160(%rdx)
+	movdqu %xmm7, 176(%rdx)
+	movdqu %xmm0, 224(%rdx)
+	movdqu %xmm4, 240(%rdx)
+	addq $256, %rsi	/* input advances only when it is non-NULL */
+	jmp .Lchacha_blocks_ssse3_mainloop_cont
+.Lchacha_blocks_ssse3_noinput1:	/* same stores as above, without the input XOR */
+	movdqu %xmm5, 0(%rdx)
+	movdqu %xmm7, 16(%rdx)
+	movdqu %xmm8, 64(%rdx)
+	movdqu %xmm10, 80(%rdx)
+	movdqu %xmm1, 128(%rdx)
+	movdqu %xmm3, 144(%rdx)
+	movdqu %xmm0, 192(%rdx)
+	movdqu %xmm4, 208(%rdx)
+	movdqa 384(%rsp), %xmm0
+	movdqa 400(%rsp), %xmm1
+	movdqa 416(%rsp), %xmm2
+	movdqa 432(%rsp), %xmm3
+	movdqa 448(%rsp), %xmm4
+	movdqa 464(%rsp), %xmm5
+	movdqa 480(%rsp), %xmm6
+	movdqa 496(%rsp), %xmm7
+	movdqa %xmm0, %xmm8
+	movdqa %xmm2, %xmm9
+	movdqa %xmm4, %xmm10
+	movdqa %xmm6, %xmm11
+	punpckldq %xmm1, %xmm8
+	punpckldq %xmm3, %xmm9
+	punpckhdq %xmm1, %xmm0
+	punpckhdq %xmm3, %xmm2
+	punpckldq %xmm5, %xmm10
+	punpckldq %xmm7, %xmm11
+	punpckhdq %xmm5, %xmm4
+	punpckhdq %xmm7, %xmm6
+	movdqa %xmm8, %xmm1
+	movdqa %xmm0, %xmm3
+	movdqa %xmm10, %xmm5
+	movdqa %xmm4, %xmm7
+	punpcklqdq %xmm9, %xmm1
+	punpcklqdq %xmm11, %xmm5
+	punpckhqdq %xmm9, %xmm8
+	punpckhqdq %xmm11, %xmm10
+	punpcklqdq %xmm2, %xmm3
+	punpcklqdq %xmm6, %xmm7
+	punpckhqdq %xmm2, %xmm0
+	punpckhqdq %xmm6, %xmm4
+	movdqu %xmm1, 32(%rdx)
+	movdqu %xmm5, 48(%rdx)
+	movdqu %xmm8, 96(%rdx)
+	movdqu %xmm10, 112(%rdx)
+	movdqu %xmm3, 160(%rdx)
+	movdqu %xmm7, 176(%rdx)
+	movdqu %xmm0, 224(%rdx)
+	movdqu %xmm4, 240(%rdx)
+.Lchacha_blocks_ssse3_mainloop_cont:
+	addq $256, %rdx
+	subq $256, %rcx
+	cmp $256, %rcx
+	jae .Lchacha_blocks_ssse3_atleast256	/* unsigned compare: rcx is a byte count */
+	movdqa 80(%rsp), %xmm6	/* restore masks and state rows for the one-block path */
+	movdqa 96(%rsp), %xmm7
+	movdqa 0(%rsp), %xmm8
+	movdqa 16(%rsp), %xmm9
+	movdqa 32(%rsp), %xmm10
+	movdqa 48(%rsp), %xmm11	/* row 3 now carries the advanced counter */
+	movq $1, %r9	/* r9 was used as scratch above; reset the increment */
+.Lchacha_blocks_ssse3_below256:
+	movq %r9, %xmm5	/* xmm5 = 64-bit increment of 1, upper half zero */
+	andq %rcx, %rcx
+	jz .Lchacha_blocks_ssse3_done
+	cmpq $64, %rcx
+	jae .Lchacha_blocks_ssse3_above63
+	movq %rdx, %r9	/* partial block: remember the real output pointer */
+	andq %rsi, %rsi
+	jz .Lchacha_blocks_ssse3_noinput2
+	movq %rcx, %r10
+	movq %rsp, %rdx
+	addq %r10, %rsi
+	addq %r10, %rdx
+	negq %r10	/* r10 runs from -bytes up to 0 */
+.Lchacha_blocks_ssse3_copyinput:	/* byte-copy the partial input to 0(%rsp) */
+	movb (%rsi, %r10), %al
+	movb %al, (%rdx, %r10)
+	incq %r10
+	jnz .Lchacha_blocks_ssse3_copyinput
+	movq %rsp, %rsi	/* read the input from the stack bounce buffer */
+.Lchacha_blocks_ssse3_noinput2:
+	movq %rsp, %rdx	/* write the full block to the stack bounce buffer */
+.Lchacha_blocks_ssse3_above63:
+	movdqa %xmm8, %xmm0	/* working copy; xmm8..xmm11 keep the input rows */
+	movdqa %xmm9, %xmm1
+	movdqa %xmm10, %xmm2
+	movdqa %xmm11, %xmm3
+	movq 64(%rsp), %rax	/* rax = round count */
+.Lchacha_blocks_ssse3_mainloop2:	/* one double round on a single block */
+	paddd %xmm1, %xmm0
+	pxor %xmm0, %xmm3
+	pshufb %xmm6, %xmm3	/* rotate by 16 */
+	paddd %xmm3, %xmm2
+	pxor %xmm2, %xmm1
+	movdqa %xmm1, %xmm4
+	pslld $12, %xmm4	/* rotate left by 12 */
+	psrld $20, %xmm1
+	pxor %xmm4, %xmm1
+	paddd %xmm1, %xmm0
+	pxor %xmm0, %xmm3
+	pshufb %xmm7, %xmm3	/* rotate by 8 */
+	pshufd $0x93, %xmm0, %xmm0	/* lane-rotate rows so the next half works on diagonals */
+	paddd %xmm3, %xmm2
+	pshufd $0x4e, %xmm3, %xmm3
+	pxor %xmm2, %xmm1
+	pshufd $0x39, %xmm2, %xmm2
+	movdqa %xmm1, %xmm4
+	pslld $7, %xmm4	/* rotate left by 7 */
+	psrld $25, %xmm1
+	pxor %xmm4, %xmm1
+	paddd %xmm1, %xmm0
+	pxor %xmm0, %xmm3
+	pshufb %xmm6, %xmm3
+	paddd %xmm3, %xmm2
+	pxor %xmm2, %xmm1
+	movdqa %xmm1, %xmm4
+	pslld $12, %xmm4
+	psrld $20, %xmm1
+	pxor %xmm4, %xmm1
+	paddd %xmm1, %xmm0
+	pxor %xmm0, %xmm3
+	pshufb %xmm7, %xmm3
+	pshufd $0x39, %xmm0, %xmm0	/* inverse lane rotation: back to column order */
+	paddd %xmm3, %xmm2
+	pshufd $0x4e, %xmm3, %xmm3
+	pxor %xmm2, %xmm1
+	pshufd $0x93, %xmm2, %xmm2
+	movdqa %xmm1, %xmm4
+	pslld $7, %xmm4
+	psrld $25, %xmm1
+	pxor %xmm4, %xmm1
+	subq $2, %rax
+	jnz .Lchacha_blocks_ssse3_mainloop2
+	paddd %xmm8, %xmm0	/* feed-forward: add the input rows */
+	paddd %xmm9, %xmm1
+	paddd %xmm10, %xmm2
+	paddd %xmm11, %xmm3
+	andq %rsi, %rsi
+	jz .Lchacha_blocks_ssse3_noinput3
+	movdqu 0(%rsi), %xmm12
+	movdqu 16(%rsi), %xmm13
+	movdqu 32(%rsi), %xmm14
+	movdqu 48(%rsi), %xmm15
+	pxor %xmm12, %xmm0
+	pxor %xmm13, %xmm1
+	pxor %xmm14, %xmm2
+	pxor %xmm15, %xmm3
+	addq $64, %rsi
+.Lchacha_blocks_ssse3_noinput3:
+	movdqu %xmm0, 0(%rdx)
+	movdqu %xmm1, 16(%rdx)
+	movdqu %xmm2, 32(%rdx)
+	movdqu %xmm3, 48(%rdx)
+	paddq %xmm5, %xmm11	/* 64-bit counter += 1 in the low qword of row 3 */
+	cmpq $64, %rcx
+	jbe .Lchacha_blocks_ssse3_mainloop2_finishup
+	addq $64, %rdx
+	subq $64, %rcx
+	jmp .Lchacha_blocks_ssse3_below256
+.Lchacha_blocks_ssse3_mainloop2_finishup:
+	cmpq $64, %rcx
+	je .Lchacha_blocks_ssse3_done	/* exact block: output already in place */
+	addq %rcx, %r9
+	addq %rcx, %rdx
+	negq %rcx
+.Lchacha_blocks_ssse3_copyoutput:	/* copy the partial result from the stack to the real output */
+	movb (%rdx, %rcx), %al
+	movb %al, (%r9, %rcx)
+	incq %rcx
+	jnz .Lchacha_blocks_ssse3_copyoutput
+.Lchacha_blocks_ssse3_done:
+	movdqu %xmm11, 48(%rdi)	/* write row 3 (with the updated counter) back to state */
+	movq %rbp, %rsp
+	popq %rbp	/* NOTE(review): no xmm wipe here; the AVX2 variant issues vzeroall - confirm intended */
+	popq %rbx
+	movl $(63 + 512 + 16), %eax	/* return value; the C caller passes it to _gcry_burn_stack */
+	ret
+.size _gcry_chacha20_amd64_ssse3_blocks,.-_gcry_chacha20_amd64_ssse3_blocks;
+
+.data
+.align 16;
+.LC:	/* pshufb masks; the function entry loads them into xmm6 and xmm7 */
+.byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13       /* pshufb rotate by 16 */
+.byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14       /* pshufb rotate by 8 */
+
+#endif /*defined(USE_CHACHA20)*/
+#endif /*__x86_64*/
diff --git a/cipher/chacha20.c b/cipher/chacha20.c
index ff0366d..de8982b 100644
--- a/cipher/chacha20.c
+++ b/cipher/chacha20.c
@@ -47,6 +47,13 @@
 #define CHACHA20_MAX_IV_SIZE  12        /* Bytes.  */
 #define CHACHA20_INPUT_LENGTH (CHACHA20_BLOCK_SIZE / 4)
 
+/* USE_SSSE3 indicates whether to compile with Intel SSSE3 code. */
+#undef USE_SSSE3
+#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \
+    defined(HAVE_GCC_INLINE_ASM_SSSE3)
+# define USE_SSSE3 1
+#endif
+
 
 struct CHACHA20_context_s;
 
@@ -63,6 +70,14 @@ typedef struct CHACHA20_context_s
 } CHACHA20_context_t;
 
 
+#ifdef USE_SSSE3
+
+unsigned int _gcry_chacha20_amd64_ssse3_blocks(u32 *state, const byte *in,
+                                               byte *out, size_t bytes);
+
+#endif /* USE_SSSE3 */
+
+
 static void chacha20_setiv (void *context, const byte * iv, size_t ivlen);
 static const char *selftest (void);
 

@@ -279,6 +294,7 @@ chacha20_do_setkey (CHACHA20_context_t * ctx,
 {
   static int initialized;
   static const char *selftest_failed;
+  unsigned int features = _gcry_get_hw_features ();
 
   if (!initialized)
     {
@@ -294,6 +310,12 @@ chacha20_do_setkey (CHACHA20_context_t * ctx,
     return GPG_ERR_INV_KEYLEN;
 
   ctx->blocks = chacha20_blocks;
+#ifdef USE_SSSE3
+  if (features & HWF_INTEL_SSSE3)
+    ctx->blocks = _gcry_chacha20_amd64_ssse3_blocks;
+#endif
+
+  (void)features;
 
   chacha20_keysetup (ctx, key, keylen);
 
diff --git a/configure.ac b/configure.ac
index 7573952..0342067 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1811,6 +1811,13 @@ LIST_MEMBER(chacha20, $enabled_ciphers)
 if test "$found" = "1" ; then
    GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20.lo"
    AC_DEFINE(USE_CHACHA20, 1, [Defined if this module should be included])
+
+   case "${host}" in
+      x86_64-*-*)
+         # Build with the assembly implementation
+         GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-ssse3-amd64.lo"
+      ;;
+   esac
 fi
 
 LIST_MEMBER(dsa, $enabled_pubkey_ciphers)

commit 23f33d57c9b6f2295a8ddfc9a8eee5a2c30cf406
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date:   Sun May 11 12:00:19 2014 +0300

    Add ChaCha20 stream cipher
    
    * cipher/Makefile.am: Add 'chacha20.c'.
    * cipher/chacha20.c: New.
    * cipher/cipher.c (cipher_list): Add ChaCha20.
    * configure.ac: Add ChaCha20.
    * doc/gcrypt.texi: Add ChaCha20.
    * src/cipher.h (_gcry_cipher_spec_chacha20): New.
    * src/gcrypt.h.in (GCRY_CIPHER_CHACHA20): Add new algo.
    * tests/basic.c (MAX_DATA_LEN): Increase to 128 from 100.
    (check_stream_cipher): Add ChaCha20 test-vectors.
    (check_ciphers): Add ChaCha20.
    --
    
    Patch adds Bernstein's ChaCha20 cipher to libgcrypt. Implementation is based
    on public domain implementations.
    
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 3c20d3c..bc7959a 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -59,6 +59,7 @@ EXTRA_libcipher_la_SOURCES = \
 arcfour.c arcfour-amd64.S \
 blowfish.c blowfish-amd64.S blowfish-arm.S \
 cast5.c cast5-amd64.S cast5-arm.S \
+chacha20.c \
 crc.c \
 des.c des-amd64.S \
 dsa.c \
diff --git a/cipher/chacha20.c b/cipher/chacha20.c
new file mode 100644
index 0000000..ff0366d
--- /dev/null
+++ b/cipher/chacha20.c
@@ -0,0 +1,504 @@
+/* chacha20.c  -  Bernstein's ChaCha20 cipher
+ * Copyright (C) 2014 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * For a description of the algorithm, see:
+ *   http://cr.yp.to/chacha.html
+ */
+
+/* The code is based on salsa20.c and public-domain ChaCha implementations:
+ *  chacha-ref.c version 20080118
+ *  D. J. Bernstein
+ *  Public domain.
+ * and
+ *  Andrew Moon
+ *  https://github.com/floodyberry/chacha-opt
+ */
+
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "types.h"
+#include "g10lib.h"
+#include "cipher.h"
+#include "bufhelp.h"
+
+
+#define CHACHA20_MIN_KEY_SIZE 16        /* Bytes.  */
+#define CHACHA20_MAX_KEY_SIZE 32        /* Bytes.  */
+#define CHACHA20_BLOCK_SIZE   64        /* Bytes.  */
+#define CHACHA20_MIN_IV_SIZE   8        /* Bytes.  */
+#define CHACHA20_MAX_IV_SIZE  12        /* Bytes.  */
+#define CHACHA20_INPUT_LENGTH (CHACHA20_BLOCK_SIZE / 4)
+
+
+struct CHACHA20_context_s;
+
+/* Block function: process BYTES from SRC to DST; returns stack bytes to burn. */
+typedef unsigned int (* chacha20_blocks_t)(u32 *state, const byte *src,
+                                           byte *dst, size_t bytes);
+
+typedef struct CHACHA20_context_s
+{
+  u32 input[CHACHA20_INPUT_LENGTH];  /* constants, key, counter and nonce */
+  u32 pad[CHACHA20_INPUT_LENGTH];    /* keystream block for partial output */
+  chacha20_blocks_t blocks;          /* block function chosen at setkey time */
+  unsigned int unused; /* bytes in the pad.  */
+} CHACHA20_context_t;
+
+
+static void chacha20_setiv (void *context, const byte * iv, size_t ivlen);
+static const char *selftest (void);
+

+
+/* ChaCha quarter-round on four state words (rotate by 16, 12, 8, 7). */
+#define QROUND(a,b,c,d)         \
+  do {                          \
+    a += b; d = rol(d ^ a, 16); \
+    c += d; b = rol(b ^ c, 12); \
+    a += b; d = rol(d ^ a, 8);  \
+    c += d; b = rol(b ^ c, 7);  \
+  } while (0)
+
+#define QOUT(ai, bi, ci, di) \
+  DO_OUT(ai); DO_OUT(bi); DO_OUT(ci); DO_OUT(di)
+/* Generic C block function: DST = keystream, XORed with SRC if non-NULL. */
+static unsigned int
+chacha20_blocks (u32 *state, const byte *src, byte *dst, size_t bytes)
+{
+  u32 pad[CHACHA20_INPUT_LENGTH];
+  u32 inp[CHACHA20_INPUT_LENGTH];
+  unsigned int i;
+
+  /* Note: 'bytes' must be multiple of 64 and not zero. */
+  /* Work on a local copy; only the counter words are written back. */
+  inp[0] = state[0];
+  inp[1] = state[1];
+  inp[2] = state[2];
+  inp[3] = state[3];
+  inp[4] = state[4];
+  inp[5] = state[5];
+  inp[6] = state[6];
+  inp[7] = state[7];
+  inp[8] = state[8];
+  inp[9] = state[9];
+  inp[10] = state[10];
+  inp[11] = state[11];
+  inp[12] = state[12];
+  inp[13] = state[13];
+  inp[14] = state[14];
+  inp[15] = state[15];
+
+  do
+    {
+      /* First double round; pad[] is loaded from inp[] column by column. */
+      pad[0] = inp[0];
+      pad[4] = inp[4];
+      pad[8] = inp[8];
+      pad[12] = inp[12];
+      QROUND (pad[0], pad[4], pad[8], pad[12]);
+      pad[1] = inp[1];
+      pad[5] = inp[5];
+      pad[9] = inp[9];
+      pad[13] = inp[13];
+      QROUND (pad[1], pad[5], pad[9], pad[13]);
+      pad[2] = inp[2];
+      pad[6] = inp[6];
+      pad[10] = inp[10];
+      pad[14] = inp[14];
+      QROUND (pad[2], pad[6], pad[10], pad[14]);
+      pad[3] = inp[3];
+      pad[7] = inp[7];
+      pad[11] = inp[11];
+      pad[15] = inp[15];
+      QROUND (pad[3], pad[7], pad[11], pad[15]);
+
+      QROUND (pad[0], pad[5], pad[10], pad[15]);
+      QROUND (pad[1], pad[6], pad[11], pad[12]);
+      QROUND (pad[2], pad[7], pad[8], pad[13]);
+      QROUND (pad[3], pad[4], pad[9], pad[14]);
+      /* Eight middle double rounds; i counts single rounds 2, 4, ..., 16. */
+      for (i = 2; i < 20 - 2; i += 2)
+      {
+        QROUND (pad[0], pad[4], pad[8], pad[12]);
+        QROUND (pad[1], pad[5], pad[9], pad[13]);
+        QROUND (pad[2], pad[6], pad[10], pad[14]);
+        QROUND (pad[3], pad[7], pad[11], pad[15]);
+
+        QROUND (pad[0], pad[5], pad[10], pad[15]);
+        QROUND (pad[1], pad[6], pad[11], pad[12]);
+        QROUND (pad[2], pad[7], pad[8], pad[13]);
+        QROUND (pad[3], pad[4], pad[9], pad[14]);
+      }
+      /* Column half of the final double round. */
+      QROUND (pad[0], pad[4], pad[8], pad[12]);
+      QROUND (pad[1], pad[5], pad[9], pad[13]);
+      QROUND (pad[2], pad[6], pad[10], pad[14]);
+      QROUND (pad[3], pad[7], pad[11], pad[15]);
+
+      if (src)
+        {
+#define DO_OUT(idx) buf_put_le32(dst + (idx) * 4, \
+                                 (pad[idx] + inp[idx]) ^ \
+                                  buf_get_le32(src + (idx) * 4))
+          /* Last (diagonal) round; each diagonal is output once final. */
+          QROUND (pad[0], pad[5], pad[10], pad[15]);
+          QOUT(0, 5, 10, 15);
+          QROUND (pad[1], pad[6], pad[11], pad[12]);
+          QOUT(1, 6, 11, 12);
+          QROUND (pad[2], pad[7], pad[8], pad[13]);
+          QOUT(2, 7, 8, 13);
+          QROUND (pad[3], pad[4], pad[9], pad[14]);
+          QOUT(3, 4, 9, 14);
+#undef DO_OUT
+        }
+      else
+        {
+#define DO_OUT(idx) buf_put_le32(dst + (idx) * 4, pad[idx] + inp[idx])
+          /* Last (diagonal) round; raw keystream, no input to XOR. */
+          QROUND (pad[0], pad[5], pad[10], pad[15]);
+          QOUT(0, 5, 10, 15);
+          QROUND (pad[1], pad[6], pad[11], pad[12]);
+          QOUT(1, 6, 11, 12);
+          QROUND (pad[2], pad[7], pad[8], pad[13]);
+          QOUT(2, 7, 8, 13);
+          QROUND (pad[3], pad[4], pad[9], pad[14]);
+          QOUT(3, 4, 9, 14);
+#undef DO_OUT
+        }
+
+      /* Update counter: increment word 12 and carry into word 13 on wrap. */
+      inp[13] += (!++inp[12]);
+
+      bytes -= CHACHA20_BLOCK_SIZE;
+      dst += CHACHA20_BLOCK_SIZE;
+      src += (src) ? CHACHA20_BLOCK_SIZE : 0;  /* a NULL src stays NULL */
+    }
+  while (bytes >= CHACHA20_BLOCK_SIZE);
+  /* Store the advanced block counter back into the caller's state. */
+  state[12] = inp[12];
+  state[13] = inp[13];
+
+  /* burn_stack: size of pad[] and inp[] plus six pointer-sized slots. */
+  return (2 * CHACHA20_INPUT_LENGTH * sizeof(u32) + 6 * sizeof(void *));
+}
+
+#undef QROUND
+#undef QOUT
+
+/* Generate one keystream block into DST (src is NULL, so nothing is XORed). */
+static unsigned int
+chacha20_core(u32 *dst, struct CHACHA20_context_s *ctx)
+{
+  return ctx->blocks(ctx->input, NULL, (byte *)dst, CHACHA20_BLOCK_SIZE);
+}
+
+/* Fill state words 0..11: four constants plus a 16- or 32-byte KEY. */
+static void
+chacha20_keysetup (CHACHA20_context_t * ctx, const byte * key,
+                   unsigned int keylen)
+{
+  /* These constants are the little endian encoding of the string
+     "expand 32-byte k".  For the 128 bit variant, the "32" in that
+     string will be fixed up to "16".  */
+  ctx->input[0] = 0x61707865;        /* "apxe"  */
+  ctx->input[1] = 0x3320646e;        /* "3 dn"  */
+  ctx->input[2] = 0x79622d32;        /* "yb-2"  */
+  ctx->input[3] = 0x6b206574;        /* "k et"  */
+
+  ctx->input[4] = buf_get_le32 (key + 0);
+  ctx->input[5] = buf_get_le32 (key + 4);
+  ctx->input[6] = buf_get_le32 (key + 8);
+  ctx->input[7] = buf_get_le32 (key + 12);
+
+  if (keylen == CHACHA20_MAX_KEY_SIZE) /* 256 bits */
+    {
+      ctx->input[8] = buf_get_le32 (key + 16);
+      ctx->input[9] = buf_get_le32 (key + 20);
+      ctx->input[10] = buf_get_le32 (key + 24);
+      ctx->input[11] = buf_get_le32 (key + 28);
+    }
+  else /* 128 bits */
+    {
+      ctx->input[8] = ctx->input[4];  /* the 16-byte key is used twice */
+      ctx->input[9] = ctx->input[5];
+      ctx->input[10] = ctx->input[6];
+      ctx->input[11] = ctx->input[7];
+
+      ctx->input[1] -= 0x02000000;        /* Change to "1 dn".  */
+      ctx->input[2] += 0x00000004;        /* Change to "yb-6".  */
+    }
+}
+
+/* Zero the block counter and load the nonce; IV is not read if IVLEN is bad. */
+static void
+chacha20_ivsetup (CHACHA20_context_t * ctx, const byte * iv, size_t ivlen)
+{
+  ctx->input[12] = 0;  /* low counter word always restarts at zero */
+
+  if (ivlen == CHACHA20_MAX_IV_SIZE)
+    {
+      ctx->input[13] = buf_get_le32 (iv + 0);  /* 96-bit nonce: words 13..15 */
+      ctx->input[14] = buf_get_le32 (iv + 4);
+      ctx->input[15] = buf_get_le32 (iv + 8);
+    }
+  else if (ivlen == CHACHA20_MIN_IV_SIZE)
+    {
+      ctx->input[13] = 0;  /* 64-bit nonce: word 13 is zeroed as well */
+      ctx->input[14] = buf_get_le32 (iv + 0);
+      ctx->input[15] = buf_get_le32 (iv + 4);
+    }
+  else
+    {
+      ctx->input[13] = 0;  /* any other length: all-zero nonce */
+      ctx->input[14] = 0;
+      ctx->input[15] = 0;
+    }
+}
+
+/* One-time selftest, key length check, key setup and zero-nonce reset. */
+static gcry_err_code_t
+chacha20_do_setkey (CHACHA20_context_t * ctx,
+                    const byte * key, unsigned int keylen)
+{
+  static int initialized;
+  static const char *selftest_failed;
+  /* Run the selftest on first use only; its verdict is cached in statics. */
+  if (!initialized)
+    {
+      initialized = 1;
+      selftest_failed = selftest ();
+      if (selftest_failed)
+        log_error ("CHACHA20 selftest failed (%s)\n", selftest_failed);
+    }
+  if (selftest_failed)
+    return GPG_ERR_SELFTEST_FAILED;
+
+  if (keylen != CHACHA20_MAX_KEY_SIZE && keylen != CHACHA20_MIN_KEY_SIZE)
+    return GPG_ERR_INV_KEYLEN;
+
+  ctx->blocks = chacha20_blocks;  /* generic C block function */
+
+  chacha20_keysetup (ctx, key, keylen);
+
+  /* We default to a zero nonce.  */
+  chacha20_setiv (ctx, NULL, 0);
+
+  return 0;
+}
+
+/* Wrapper around chacha20_do_setkey that burns stack before returning. */
+static gcry_err_code_t
+chacha20_setkey (void *context, const byte * key, unsigned int keylen)
+{
+  CHACHA20_context_t *ctx = (CHACHA20_context_t *) context;
+  gcry_err_code_t rc = chacha20_do_setkey (ctx, key, keylen);
+  _gcry_burn_stack (4 + sizeof (void *) + 4 * sizeof (void *));
+  return rc;
+}
+
+/* Set an 8- or 12-byte nonce (anything else selects zeros); reset the pad. */
+static void
+chacha20_setiv (void *context, const byte * iv, size_t ivlen)
+{
+  CHACHA20_context_t *ctx = (CHACHA20_context_t *) context;
+
+  /* draft-nir-cfrg-chacha20-poly1305-02 defines 96-bit and 64-bit nonce. */
+  if (iv && ivlen != CHACHA20_MAX_IV_SIZE && ivlen != CHACHA20_MIN_IV_SIZE)
+    log_info ("WARNING: chacha20_setiv: bad ivlen=%u\n", (u32) ivlen);
+
+  if (iv && (ivlen == CHACHA20_MAX_IV_SIZE || ivlen == CHACHA20_MIN_IV_SIZE))
+    chacha20_ivsetup (ctx, iv, ivlen);
+  else
+    chacha20_ivsetup (ctx, NULL, 0);  /* NULL or bad length: zero nonce */
+
+  /* Reset the unused pad bytes counter.  */
+  ctx->unused = 0;
+}
+

+
+/* XOR LENGTH bytes of INBUF with keystream into OUTBUF, buffering leftovers. */
+/* Note: This function requires LENGTH > 0.  */
+static void
+chacha20_do_encrypt_stream (CHACHA20_context_t * ctx,
+                            byte * outbuf, const byte * inbuf, size_t length)
+{
+  unsigned int nburn, burn = 0;
+  /* First drain keystream bytes left over in ctx->pad by an earlier call. */
+  if (ctx->unused)
+    {
+      unsigned char *p = (void *) ctx->pad;
+      size_t n;
+
+      gcry_assert (ctx->unused < CHACHA20_BLOCK_SIZE);
+
+      n = ctx->unused;
+      if (n > length)
+        n = length;
+      buf_xor (outbuf, inbuf, p + CHACHA20_BLOCK_SIZE - ctx->unused, n);
+      length -= n;
+      outbuf += n;
+      inbuf += n;
+      ctx->unused -= n;
+      if (!length)
+        return;
+      gcry_assert (!ctx->unused);
+    }
+  /* Hand all whole blocks to the block function in a single call. */
+  if (length >= CHACHA20_BLOCK_SIZE)
+    {
+      size_t nblocks = length / CHACHA20_BLOCK_SIZE;
+      size_t bytes = nblocks * CHACHA20_BLOCK_SIZE;
+      burn = ctx->blocks(ctx->input, inbuf, outbuf, bytes);
+      length -= bytes;
+      outbuf += bytes;
+      inbuf  += bytes;
+    }
+  /* Tail: generate one block into ctx->pad and use its first LENGTH bytes. */
+  if (length > 0)
+    {
+      nburn = chacha20_core (ctx->pad, ctx);
+      burn = nburn > burn ? nburn : burn;
+
+      buf_xor (outbuf, inbuf, ctx->pad, length);
+      ctx->unused = CHACHA20_BLOCK_SIZE - length;
+    }
+
+  _gcry_burn_stack (burn);
+}
+
+/* Wrapper that skips zero-length requests (the worker needs LENGTH > 0). */
+static void
+chacha20_encrypt_stream (void *context, byte * outbuf, const byte * inbuf,
+                         size_t length)
+{
+  CHACHA20_context_t *ctx = (CHACHA20_context_t *) context;
+
+  if (length)
+    chacha20_do_encrypt_stream (ctx, outbuf, inbuf, length);
+}
+
+
+static const char *
+selftest (void)
+{
+  CHACHA20_context_t ctx;
+  byte scratch[127 + 1];  /* 127 data bytes plus one guard byte.  */
+  byte buf[256 + 64 + 4];
+  int i;
+
+  /* From draft-strombergson-chacha-test-vectors */
+  static byte key_1[] = {
+    0xc4, 0x6e, 0xc1, 0xb1, 0x8c, 0xe8, 0xa8, 0x78,
+    0x72, 0x5a, 0x37, 0xe7, 0x80, 0xdf, 0xb7, 0x35,
+    0x1f, 0x68, 0xed, 0x2e, 0x19, 0x4c, 0x79, 0xfb,
+    0xc6, 0xae, 0xbe, 0xe1, 0xa6, 0x67, 0x97, 0x5d
+  };
+  static const byte nonce_1[] =
+    { 0x1a, 0xda, 0x31, 0xd5, 0xcf, 0x68, 0x82, 0x21 };
+  static const byte plaintext_1[127] = {
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  };
+  static const byte ciphertext_1[127] = {
+    0xf6, 0x3a, 0x89, 0xb7, 0x5c, 0x22, 0x71, 0xf9,
+    0x36, 0x88, 0x16, 0x54, 0x2b, 0xa5, 0x2f, 0x06,
+    0xed, 0x49, 0x24, 0x17, 0x92, 0x30, 0x2b, 0x00,
+    0xb5, 0xe8, 0xf8, 0x0a, 0xe9, 0xa4, 0x73, 0xaf,
+    0xc2, 0x5b, 0x21, 0x8f, 0x51, 0x9a, 0xf0, 0xfd,
+    0xd4, 0x06, 0x36, 0x2e, 0x8d, 0x69, 0xde, 0x7f,
+    0x54, 0xc6, 0x04, 0xa6, 0xe0, 0x0f, 0x35, 0x3f,
+    0x11, 0x0f, 0x77, 0x1b, 0xdc, 0xa8, 0xab, 0x92,
+    0xe5, 0xfb, 0xc3, 0x4e, 0x60, 0xa1, 0xd9, 0xa9,
+    0xdb, 0x17, 0x34, 0x5b, 0x0a, 0x40, 0x27, 0x36,
+    0x85, 0x3b, 0xf9, 0x10, 0xb0, 0x60, 0xbd, 0xf1,
+    0xf8, 0x97, 0xb6, 0x29, 0x0f, 0x01, 0xd1, 0x38,
+    0xae, 0x2c, 0x4c, 0x90, 0x22, 0x5b, 0xa9, 0xea,
+    0x14, 0xd5, 0x18, 0xf5, 0x59, 0x29, 0xde, 0xa0,
+    0x98, 0xca, 0x7a, 0x6c, 0xcf, 0xe6, 0x12, 0x27,
+    0x05, 0x3c, 0x84, 0xe4, 0x9a, 0x4a, 0x33
+  };
+
+  chacha20_setkey (&ctx, key_1, sizeof key_1);
+  chacha20_setiv (&ctx, nonce_1, sizeof nonce_1);
+  scratch[sizeof (scratch) - 1] = 0;  /* Arm guard byte past the output.  */
+  chacha20_encrypt_stream (&ctx, scratch, plaintext_1, sizeof plaintext_1);
+  if (memcmp (scratch, ciphertext_1, sizeof ciphertext_1))
+    return "ChaCha20 encryption test 1 failed.";
+  if (scratch[sizeof (scratch) - 1])  /* Guard byte was overwritten.  */
+    return "ChaCha20 wrote too much.";
+  chacha20_setkey (&ctx, key_1, sizeof (key_1));  /* Re-key to decrypt.  */
+  chacha20_setiv (&ctx, nonce_1, sizeof nonce_1);
+  chacha20_encrypt_stream (&ctx, scratch, scratch, sizeof plaintext_1);
+  if (memcmp (scratch, plaintext_1, sizeof plaintext_1))
+    return "ChaCha20 decryption test 1 failed.";
+
+  for (i = 0; i < sizeof buf; i++)
+    buf[i] = i;  /* Pattern 0, 1, 2, ... truncated to a byte.  */
+  chacha20_setkey (&ctx, key_1, sizeof key_1);
+  chacha20_setiv (&ctx, nonce_1, sizeof nonce_1);
+  /* Encrypt the whole buffer in place with one call.  */
+  chacha20_encrypt_stream (&ctx, buf, buf, sizeof buf);
+  /* Decrypt in place in three calls: 1, sizeof buf - 2 and 1 bytes.  */
+  chacha20_setkey (&ctx, key_1, sizeof key_1);
+  chacha20_setiv (&ctx, nonce_1, sizeof nonce_1);
+  chacha20_encrypt_stream (&ctx, buf, buf, 1);
+  chacha20_encrypt_stream (&ctx, buf + 1, buf + 1, (sizeof buf) - 1 - 1);
+  chacha20_encrypt_stream (&ctx, buf + (sizeof buf) - 1,
+                           buf + (sizeof buf) - 1, 1);
+  for (i = 0; i < sizeof buf; i++)
+    if (buf[i] != (byte) i)  /* Round trip must restore the pattern.  */
+      return "ChaCha20 encryption test 2 failed.";
+
+  return NULL;  /* All checks passed.  */
+}
+
+
+gcry_cipher_spec_t _gcry_cipher_spec_chacha20 = {
+  GCRY_CIPHER_CHACHA20,
+  {0, 0},                       /* flags */
+  "CHACHA20",                   /* name */
+  NULL,                         /* aliases */
+  NULL,                         /* oids */
+  1,                            /* blocksize in bytes. */
+  CHACHA20_MAX_KEY_SIZE * 8,    /* standard key length in bits. */
+  sizeof (CHACHA20_context_t),  /* size of the cipher context */
+  chacha20_setkey,              /* key setup */
+  NULL,                         /* NOTE(review): presumably the block */
+  NULL,                         /* encrypt/decrypt slots -- confirm */
+  chacha20_encrypt_stream,      /* presumably stream encrypt -- confirm */
+  chacha20_encrypt_stream,      /* presumably stream decrypt; same routine */
+  NULL,
+  NULL,
+  chacha20_setiv                /* IV setup */
+};
diff --git a/cipher/cipher.c b/cipher/cipher.c
index 6552ed3..4751302 100644
--- a/cipher/cipher.c
+++ b/cipher/cipher.c
@@ -83,6 +83,9 @@ static gcry_cipher_spec_t *cipher_list[] =
 #if USE_GOST28147
      &_gcry_cipher_spec_gost28147,
 #endif
+#if USE_CHACHA20
+     &_gcry_cipher_spec_chacha20,
+#endif
     NULL
   };
 
diff --git a/configure.ac b/configure.ac
index 6539a96..7573952 100644
--- a/configure.ac
+++ b/configure.ac
@@ -187,7 +187,7 @@ LIBGCRYPT_CONFIG_HOST="$host"
 
 # Definitions for symmetric ciphers.
 available_ciphers="arcfour blowfish cast5 des aes twofish serpent rfc2268 seed"
-available_ciphers="$available_ciphers camellia idea salsa20 gost28147"
+available_ciphers="$available_ciphers camellia idea salsa20 gost28147 chacha20"
 enabled_ciphers=""
 
 # Definitions for public-key ciphers.
@@ -1807,6 +1807,12 @@ if test "$found" = "1" ; then
    AC_DEFINE(USE_GOST28147, 1, [Defined if this module should be included])
 fi
 
+LIST_MEMBER(chacha20, $enabled_ciphers)
+if test "$found" = "1" ; then
+   GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20.lo"
+   AC_DEFINE(USE_CHACHA20, 1, [Defined if this module should be included])
+fi
+
 LIST_MEMBER(dsa, $enabled_pubkey_ciphers)
 if test "$found" = "1" ; then
    GCRYPT_PUBKEY_CIPHERS="$GCRYPT_PUBKEY_CIPHERS dsa.lo"
diff --git a/doc/gcrypt.texi b/doc/gcrypt.texi
index c5c3b45..d202b8b 100644
--- a/doc/gcrypt.texi
+++ b/doc/gcrypt.texi
@@ -1564,6 +1564,10 @@ This is the Salsa20/12 - reduced round version of Salsa20 stream cipher.
 The GOST 28147-89 cipher, defined in the respective GOST standard.
 Translation of this GOST into English is provided in the RFC-5830.
 
+@item GCRY_CIPHER_CHACHA20
+@cindex ChaCha20
+This is the ChaCha20 stream cipher.
+
 @end table
 
 @node Available cipher modes
@@ -1720,9 +1724,9 @@ vector is passed as the buffer @var{K} of length @var{l} bytes and
 copied to internal data structures.  The function checks that the IV
 matches the requirement of the selected algorithm and mode.
 
-This function is also used with the Salsa20 stream cipher to set or
-update the required nonce.  In this case it needs to be called after
-setting the key.
+This function is also used with Salsa20 and ChaCha20 stream ciphers
+to set or update the required nonce.  In this case it needs to be
+called after setting the key.
 
 This function is also used with the AEAD cipher modes to set or
 update the required nonce.
diff --git a/src/cipher.h b/src/cipher.h
index 5d1b5f6..ed57d3c 100644
--- a/src/cipher.h
+++ b/src/cipher.h
@@ -251,6 +251,7 @@ extern gcry_cipher_spec_t _gcry_cipher_spec_idea;
 extern gcry_cipher_spec_t _gcry_cipher_spec_salsa20;
 extern gcry_cipher_spec_t _gcry_cipher_spec_salsa20r12;
 extern gcry_cipher_spec_t _gcry_cipher_spec_gost28147;
+extern gcry_cipher_spec_t _gcry_cipher_spec_chacha20;
 
 /* Declarations for the digest specifications.  */
 extern gcry_md_spec_t _gcry_digest_spec_crc32;
diff --git a/src/gcrypt.h.in b/src/gcrypt.h.in
index c84a3f7..d4e9bb2 100644
--- a/src/gcrypt.h.in
+++ b/src/gcrypt.h.in
@@ -880,7 +880,8 @@ enum gcry_cipher_algos
     GCRY_CIPHER_CAMELLIA256 = 312,
     GCRY_CIPHER_SALSA20     = 313,
     GCRY_CIPHER_SALSA20R12  = 314,
-    GCRY_CIPHER_GOST28147   = 315
+    GCRY_CIPHER_GOST28147   = 315,
+    GCRY_CIPHER_CHACHA20    = 316
   };
 
 /* The Rijndael algorithm is basically AES, so provide some macros. */
diff --git a/tests/basic.c b/tests/basic.c
index 5c6c51c..406d82d 100644
--- a/tests/basic.c
+++ b/tests/basic.c
@@ -195,7 +195,7 @@ show_mac_not_available (int algo)
 
 
 
-#define MAX_DATA_LEN 100
+#define MAX_DATA_LEN 128
 
 void
 progress_handler (void *cb_data, const char *what, int printchar,
@@ -2583,8 +2583,331 @@ check_stream_cipher (void)
           "\x44\xC9\x70\x0A\x0F\x21\x38\xE8\xC1\xA2\x86\xFB\x8C\x1F\xBF\xA0"
         }
       }
-    }
+    },
 #endif /*USE_SALSA20*/
+#ifdef USE_CHACHA20
+    /* From draft-strombergson-chacha-test-vectors-01 */
+    {
+      "ChaCha20 128 bit, TC1",
+      GCRY_CIPHER_CHACHA20, 16, 8,
+      "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
+      "\x00\x00\x00\x00\x00\x00\x00\x00",
+      {
+        { 8,
+          "\x00\x00\x00\x00\x00\x00\x00\x00",
+          "\x89\x67\x09\x52\x60\x83\x64\xfd"
+        },
+        { 112,
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
+          "\x89\x67\x09\x52\x60\x83\x64\xfd\x00\xb2\xf9\x09\x36\xf0\x31\xc8"
+          "\xe7\x56\xe1\x5d\xba\x04\xb8\x49\x3d\x00\x42\x92\x59\xb2\x0f\x46"
+          "\xcc\x04\xf1\x11\x24\x6b\x6c\x2c\xe0\x66\xbe\x3b\xfb\x32\xd9\xaa"
+          "\x0f\xdd\xfb\xc1\x21\x23\xd4\xb9\xe4\x4f\x34\xdc\xa0\x5a\x10\x3f"
+          "\x6c\xd1\x35\xc2\x87\x8c\x83\x2b\x58\x96\xb1\x34\xf6\x14\x2a\x9d"
+          "\x4d\x8d\x0d\x8f\x10\x26\xd2\x0a\x0a\x81\x51\x2c\xbc\xe6\xe9\x75"
+          "\x8a\x71\x43\xd0\x21\x97\x80\x22\xa3\x84\x14\x1a\x80\xce\xa3\x06"
+        },
+        { 128,
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
+          "\x89\x67\x09\x52\x60\x83\x64\xfd\x00\xb2\xf9\x09\x36\xf0\x31\xc8"
+          "\xe7\x56\xe1\x5d\xba\x04\xb8\x49\x3d\x00\x42\x92\x59\xb2\x0f\x46"
+          "\xcc\x04\xf1\x11\x24\x6b\x6c\x2c\xe0\x66\xbe\x3b\xfb\x32\xd9\xaa"
+          "\x0f\xdd\xfb\xc1\x21\x23\xd4\xb9\xe4\x4f\x34\xdc\xa0\x5a\x10\x3f"
+          "\x6c\xd1\x35\xc2\x87\x8c\x83\x2b\x58\x96\xb1\x34\xf6\x14\x2a\x9d"
+          "\x4d\x8d\x0d\x8f\x10\x26\xd2\x0a\x0a\x81\x51\x2c\xbc\xe6\xe9\x75"
+          "\x8a\x71\x43\xd0\x21\x97\x80\x22\xa3\x84\x14\x1a\x80\xce\xa3\x06"
+          "\x2f\x41\xf6\x7a\x75\x2e\x66\xad\x34\x11\x98\x4c\x78\x7e\x30\xad"
+        }
+      }
+    },
+    {
+      "ChaCha20 256 bit, TC1",
+      GCRY_CIPHER_CHACHA20, 32, 8,
+      "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+      "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
+      "\x00\x00\x00\x00\x00\x00\x00\x00",
+      {
+        { 8,
+          "\x00\x00\x00\x00\x00\x00\x00\x00",
+          "\x76\xb8\xe0\xad\xa0\xf1\x3d\x90"
+        },
+        { 112,
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
+          "\x76\xb8\xe0\xad\xa0\xf1\x3d\x90\x40\x5d\x6a\xe5\x53\x86\xbd\x28"
+          "\xbd\xd2\x19\xb8\xa0\x8d\xed\x1a\xa8\x36\xef\xcc\x8b\x77\x0d\xc7"
+          "\xda\x41\x59\x7c\x51\x57\x48\x8d\x77\x24\xe0\x3f\xb8\xd8\x4a\x37"
+          "\x6a\x43\xb8\xf4\x15\x18\xa1\x1c\xc3\x87\xb6\x69\xb2\xee\x65\x86"
+          "\x9f\x07\xe7\xbe\x55\x51\x38\x7a\x98\xba\x97\x7c\x73\x2d\x08\x0d"
+          "\xcb\x0f\x29\xa0\x48\xe3\x65\x69\x12\xc6\x53\x3e\x32\xee\x7a\xed"
+          "\x29\xb7\x21\x76\x9c\xe6\x4e\x43\xd5\x71\x33\xb0\x74\xd8\x39\xd5"
+        },
+        { 128,
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
+          "\x76\xb8\xe0\xad\xa0\xf1\x3d\x90\x40\x5d\x6a\xe5\x53\x86\xbd\x28"
+          "\xbd\xd2\x19\xb8\xa0\x8d\xed\x1a\xa8\x36\xef\xcc\x8b\x77\x0d\xc7"
+          "\xda\x41\x59\x7c\x51\x57\x48\x8d\x77\x24\xe0\x3f\xb8\xd8\x4a\x37"
+          "\x6a\x43\xb8\xf4\x15\x18\xa1\x1c\xc3\x87\xb6\x69\xb2\xee\x65\x86"
+          "\x9f\x07\xe7\xbe\x55\x51\x38\x7a\x98\xba\x97\x7c\x73\x2d\x08\x0d"
+          "\xcb\x0f\x29\xa0\x48\xe3\x65\x69\x12\xc6\x53\x3e\x32\xee\x7a\xed"
+          "\x29\xb7\x21\x76\x9c\xe6\x4e\x43\xd5\x71\x33\xb0\x74\xd8\x39\xd5"
+          "\x31\xed\x1f\x28\x51\x0a\xfb\x45\xac\xe1\x0a\x1f\x4b\x79\x4d\x6f"
+        }
+      }
+    },
+    {
+      "ChaCha20 256 bit, TC2",
+      GCRY_CIPHER_CHACHA20, 32, 8,
+      "\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+      "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
+      "\x00\x00\x00\x00\x00\x00\x00\x00",
+      {
+        { 128,
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
+          "\xc5\xd3\x0a\x7c\xe1\xec\x11\x93\x78\xc8\x4f\x48\x7d\x77\x5a\x85"
+          "\x42\xf1\x3e\xce\x23\x8a\x94\x55\xe8\x22\x9e\x88\x8d\xe8\x5b\xbd"
+          "\x29\xeb\x63\xd0\xa1\x7a\x5b\x99\x9b\x52\xda\x22\xbe\x40\x23\xeb"
+          "\x07\x62\x0a\x54\xf6\xfa\x6a\xd8\x73\x7b\x71\xeb\x04\x64\xda\xc0"
+          "\x10\xf6\x56\xe6\xd1\xfd\x55\x05\x3e\x50\xc4\x87\x5c\x99\x30\xa3"
+          "\x3f\x6d\x02\x63\xbd\x14\xdf\xd6\xab\x8c\x70\x52\x1c\x19\x33\x8b"
+          "\x23\x08\xb9\x5c\xf8\xd0\xbb\x7d\x20\x2d\x21\x02\x78\x0e\xa3\x52"
+          "\x8f\x1c\xb4\x85\x60\xf7\x6b\x20\xf3\x82\xb9\x42\x50\x0f\xce\xac"
+        }
+      }
+    },
+    {
+      "ChaCha20 256 bit, TC3",
+      GCRY_CIPHER_CHACHA20, 32, 8,
+      "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+      "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
+      "\x01\x00\x00\x00\x00\x00\x00\x00",
+      {
+        { 128,
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
+          "\xef\x3f\xdf\xd6\xc6\x15\x78\xfb\xf5\xcf\x35\xbd\x3d\xd3\x3b\x80"
+          "\x09\x63\x16\x34\xd2\x1e\x42\xac\x33\x96\x0b\xd1\x38\xe5\x0d\x32"
+          "\x11\x1e\x4c\xaf\x23\x7e\xe5\x3c\xa8\xad\x64\x26\x19\x4a\x88\x54"
+          "\x5d\xdc\x49\x7a\x0b\x46\x6e\x7d\x6b\xbd\xb0\x04\x1b\x2f\x58\x6b"
+          "\x53\x05\xe5\xe4\x4a\xff\x19\xb2\x35\x93\x61\x44\x67\x5e\xfb\xe4"
+          "\x40\x9e\xb7\xe8\xe5\xf1\x43\x0f\x5f\x58\x36\xae\xb4\x9b\xb5\x32"
+          "\x8b\x01\x7c\x4b\x9d\xc1\x1f\x8a\x03\x86\x3f\xa8\x03\xdc\x71\xd5"
+          "\x72\x6b\x2b\x6b\x31\xaa\x32\x70\x8a\xfe\x5a\xf1\xd6\xb6\x90\x58"
+        }
+      }
+    },
+    {
+      "ChaCha20 256 bit, TC4",
+      GCRY_CIPHER_CHACHA20, 32, 8,
+      "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"
+      "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff",
+      "\xff\xff\xff\xff\xff\xff\xff\xff",
+      {
+        { 128,
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
+          "\xd9\xbf\x3f\x6b\xce\x6e\xd0\xb5\x42\x54\x55\x77\x67\xfb\x57\x44"
+          "\x3d\xd4\x77\x89\x11\xb6\x06\x05\x5c\x39\xcc\x25\xe6\x74\xb8\x36"
+          "\x3f\xea\xbc\x57\xfd\xe5\x4f\x79\x0c\x52\xc8\xae\x43\x24\x0b\x79"
+          "\xd4\x90\x42\xb7\x77\xbf\xd6\xcb\x80\xe9\x31\x27\x0b\x7f\x50\xeb"
+          "\x5b\xac\x2a\xcd\x86\xa8\x36\xc5\xdc\x98\xc1\x16\xc1\x21\x7e\xc3"
+          "\x1d\x3a\x63\xa9\x45\x13\x19\xf0\x97\xf3\xb4\xd6\xda\xb0\x77\x87"
+          "\x19\x47\x7d\x24\xd2\x4b\x40\x3a\x12\x24\x1d\x7c\xca\x06\x4f\x79"
+          "\x0f\x1d\x51\xcc\xaf\xf6\xb1\x66\x7d\x4b\xbc\xa1\x95\x8c\x43\x06"
+        }
+      }
+    },
+    {
+      "ChaCha20 256 bit, TC5",
+      GCRY_CIPHER_CHACHA20, 32, 8,
+      "\x55\x55\x55\x55\x55\x55\x55\x55\x55\x55\x55\x55\x55\x55\x55\x55"
+      "\x55\x55\x55\x55\x55\x55\x55\x55\x55\x55\x55\x55\x55\x55\x55\x55",
+      "\x55\x55\x55\x55\x55\x55\x55\x55",
+      {
+        { 128,
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
+          "\xbe\xa9\x41\x1a\xa4\x53\xc5\x43\x4a\x5a\xe8\xc9\x28\x62\xf5\x64"
+          "\x39\x68\x55\xa9\xea\x6e\x22\xd6\xd3\xb5\x0a\xe1\xb3\x66\x33\x11"
+          "\xa4\xa3\x60\x6c\x67\x1d\x60\x5c\xe1\x6c\x3a\xec\xe8\xe6\x1e\xa1"
+          "\x45\xc5\x97\x75\x01\x7b\xee\x2f\xa6\xf8\x8a\xfc\x75\x80\x69\xf7"
+          "\xe0\xb8\xf6\x76\xe6\x44\x21\x6f\x4d\x2a\x34\x22\xd7\xfa\x36\xc6"
+          "\xc4\x93\x1a\xca\x95\x0e\x9d\xa4\x27\x88\xe6\xd0\xb6\xd1\xcd\x83"
+          "\x8e\xf6\x52\xe9\x7b\x14\x5b\x14\x87\x1e\xae\x6c\x68\x04\xc7\x00"
+          "\x4d\xb5\xac\x2f\xce\x4c\x68\xc7\x26\xd0\x04\xb1\x0f\xca\xba\x86"
+        }
+      }
+    },
+    {
+      "ChaCha20 256 bit, TC6",
+      GCRY_CIPHER_CHACHA20, 32, 8,
+      "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa"
+      "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa",
+      "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa",
+      {
+        { 128,
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
+          "\x9a\xa2\xa9\xf6\x56\xef\xde\x5a\xa7\x59\x1c\x5f\xed\x4b\x35\xae"
+          "\xa2\x89\x5d\xec\x7c\xb4\x54\x3b\x9e\x9f\x21\xf5\xe7\xbc\xbc\xf3"
+          "\xc4\x3c\x74\x8a\x97\x08\x88\xf8\x24\x83\x93\xa0\x9d\x43\xe0\xb7"
+          "\xe1\x64\xbc\x4d\x0b\x0f\xb2\x40\xa2\xd7\x21\x15\xc4\x80\x89\x06"
+          "\x72\x18\x44\x89\x44\x05\x45\xd0\x21\xd9\x7e\xf6\xb6\x93\xdf\xe5"
+          "\xb2\xc1\x32\xd4\x7e\x6f\x04\x1c\x90\x63\x65\x1f\x96\xb6\x23\xe6"
+          "\x2a\x11\x99\x9a\x23\xb6\xf7\xc4\x61\xb2\x15\x30\x26\xad\x5e\x86"
+          "\x6a\x2e\x59\x7e\xd0\x7b\x84\x01\xde\xc6\x3a\x09\x34\xc6\xb2\xa9"
+        }
+      }
+    },
+    {
+      "ChaCha20 256 bit, TC7",
+      GCRY_CIPHER_CHACHA20, 32, 8,
+      "\x00\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa\xbb\xcc\xdd\xee\xff"
+      "\xff\xee\xdd\xcc\xbb\xaa\x99\x88\x77\x66\x55\x44\x33\x22\x11\x00",
+      "\x0f\x1e\x2d\x3c\x4b\x5a\x69\x78",
+      {
+        { 128,
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
+          "\x9f\xad\xf4\x09\xc0\x08\x11\xd0\x04\x31\xd6\x7e\xfb\xd8\x8f\xba"
+          "\x59\x21\x8d\x5d\x67\x08\xb1\xd6\x85\x86\x3f\xab\xbb\x0e\x96\x1e"
+          "\xea\x48\x0f\xd6\xfb\x53\x2b\xfd\x49\x4b\x21\x51\x01\x50\x57\x42"
+          "\x3a\xb6\x0a\x63\xfe\x4f\x55\xf7\xa2\x12\xe2\x16\x7c\xca\xb9\x31"
+          "\xfb\xfd\x29\xcf\x7b\xc1\xd2\x79\xed\xdf\x25\xdd\x31\x6b\xb8\x84"
+          "\x3d\x6e\xde\xe0\xbd\x1e\xf1\x21\xd1\x2f\xa1\x7c\xbc\x2c\x57\x4c"
+          "\xcc\xab\x5e\x27\x51\x67\xb0\x8b\xd6\x86\xf8\xa0\x9d\xf8\x7e\xc3"
+          "\xff\xb3\x53\x61\xb9\x4e\xbf\xa1\x3f\xec\x0e\x48\x89\xd1\x8d\xa5"
+        }
+      }
+    },
+    {
+      "ChaCha20 256 bit, TC8",
+      GCRY_CIPHER_CHACHA20, 32, 8,
+      "\xc4\x6e\xc1\xb1\x8c\xe8\xa8\x78\x72\x5a\x37\xe7\x80\xdf\xb7\x35"
+      "\x1f\x68\xed\x2e\x19\x4c\x79\xfb\xc6\xae\xbe\xe1\xa6\x67\x97\x5d",
+      "\x1a\xda\x31\xd5\xcf\x68\x82\x21",
+      {
+        { 128,
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
+          "\xf6\x3a\x89\xb7\x5c\x22\x71\xf9\x36\x88\x16\x54\x2b\xa5\x2f\x06"
+          "\xed\x49\x24\x17\x92\x30\x2b\x00\xb5\xe8\xf8\x0a\xe9\xa4\x73\xaf"
+          "\xc2\x5b\x21\x8f\x51\x9a\xf0\xfd\xd4\x06\x36\x2e\x8d\x69\xde\x7f"
+          "\x54\xc6\x04\xa6\xe0\x0f\x35\x3f\x11\x0f\x77\x1b\xdc\xa8\xab\x92"
+          "\xe5\xfb\xc3\x4e\x60\xa1\xd9\xa9\xdb\x17\x34\x5b\x0a\x40\x27\x36"
+          "\x85\x3b\xf9\x10\xb0\x60\xbd\xf1\xf8\x97\xb6\x29\x0f\x01\xd1\x38"
+          "\xae\x2c\x4c\x90\x22\x5b\xa9\xea\x14\xd5\x18\xf5\x59\x29\xde\xa0"
+          "\x98\xca\x7a\x6c\xcf\xe6\x12\x27\x05\x3c\x84\xe4\x9a\x4a\x33\x32"
+        },
+        { 127,
+          "\xf6\x3a\x89\xb7\x5c\x22\x71\xf9\x36\x88\x16\x54\x2b\xa5\x2f\x06"
+          "\xed\x49\x24\x17\x92\x30\x2b\x00\xb5\xe8\xf8\x0a\xe9\xa4\x73\xaf"
+          "\xc2\x5b\x21\x8f\x51\x9a\xf0\xfd\xd4\x06\x36\x2e\x8d\x69\xde\x7f"
+          "\x54\xc6\x04\xa6\xe0\x0f\x35\x3f\x11\x0f\x77\x1b\xdc\xa8\xab\x92"
+          "\xe5\xfb\xc3\x4e\x60\xa1\xd9\xa9\xdb\x17\x34\x5b\x0a\x40\x27\x36"
+          "\x85\x3b\xf9\x10\xb0\x60\xbd\xf1\xf8\x97\xb6\x29\x0f\x01\xd1\x38"
+          "\xae\x2c\x4c\x90\x22\x5b\xa9\xea\x14\xd5\x18\xf5\x59\x29\xde\xa0"
+          "\x98\xca\x7a\x6c\xcf\xe6\x12\x27\x05\x3c\x84\xe4\x9a\x4a\x33",
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+        }
+      }
+    },
+    /* from draft-nir-cfrg-chacha20-poly1305-02 */
+    {
+      "ChaCha20 256 bit, IV96-bit",
+      GCRY_CIPHER_CHACHA20, 32, 12,
+      "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f"
+      "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f",
+      "\x07\x00\x00\x00\x40\x41\x42\x43\x44\x45\x46\x47",
+      {
+        { 64,
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+          "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
+          "\x7b\xac\x2b\x25\x2d\xb4\x47\xaf\x09\xb6\x7a\x55\xa4\xe9\x55\x84"
+          "\x0a\xe1\xd6\x73\x10\x75\xd9\xeb\x2a\x93\x75\x78\x3e\xd5\x53\xff"
+          "\xa2\x7e\xcc\xde\xad\xdb\x4d\xb4\xd1\x17\x9c\xe4\xc9\x0b\x43\xd8"
+          "\xbc\xb7\x94\x8c\x4b\x4b\x7d\x8b\x7d\xf6\x27\x39\x32\xa4\x69\x16"
+        },
+      },
+    },
+#endif /*USE_CHACHA20*/
   };
 
   gcry_cipher_hd_t hde, hdd;
@@ -3649,6 +3972,9 @@ check_ciphers (void)
     GCRY_CIPHER_SALSA20,
     GCRY_CIPHER_SALSA20R12,
 #endif
+#if USE_CHACHA20
+    GCRY_CIPHER_CHACHA20,
+#endif
     0
   };
   int i;

-----------------------------------------------------------------------

Summary of changes:
 cipher/Makefile.am            |    1 +
 cipher/chacha20-avx2-amd64.S  |  949 +++++++++++++++++++++++++++++++++++++++++
 cipher/chacha20-ssse3-amd64.S |  610 ++++++++++++++++++++++++++
 cipher/chacha20.c             |  544 +++++++++++++++++++++++
 cipher/cipher.c               |    3 +
 configure.ac                  |   16 +-
 doc/gcrypt.texi               |   10 +-
 src/cipher.h                  |    1 +
 src/gcrypt.h.in               |    3 +-
 tests/basic.c                 |  330 +++++++++++++-
 10 files changed, 2460 insertions(+), 7 deletions(-)
 create mode 100644 cipher/chacha20-avx2-amd64.S
 create mode 100644 cipher/chacha20-ssse3-amd64.S
 create mode 100644 cipher/chacha20.c


hooks/post-receive
-- 
The GNU crypto library
http://git.gnupg.org




More information about the Gnupg-commits mailing list