[PATCH 1/2] Add AMD64 assembly implementation of Salsa20

Jussi Kivilinna <jussi.kivilinna@iki.fi>
Sat Oct 26 15:03:53 CEST 2013


* cipher/Makefile.am: Add 'salsa20-amd64.S'.
* cipher/salsa20-amd64.S: New.
* cipher/salsa20.c (USE_AMD64): New macro.
[USE_AMD64] (_gcry_salsa20_amd64_keysetup, _gcry_salsa20_amd64_ivsetup)
(_gcry_salsa20_amd64_encrypt_blocks): New prototypes.
[USE_AMD64] (salsa20_keysetup, salsa20_ivsetup, salsa20_core): New.
[!USE_AMD64] (salsa20_core): Change 'src' to non-constant, update block
counter, and return burn stack depth.
[!USE_AMD64] (salsa20_keysetup, salsa20_ivsetup): New.
(salsa20_do_setkey): Move generic key setup to 'salsa20_keysetup'.
(salsa20_setkey): Fix burn stack depth.
(salsa20_setiv): Move generic IV setup to 'salsa20_ivsetup'.
(salsa20_do_encrypt_stream) [USE_AMD64]: Process large buffers in AMD64
implementation.
(salsa20_do_encrypt_stream): Move stack burning to this function...
(salsa20_encrypt_stream, salsa20r12_encrypt_stream): ...from these
functions.
* configure.ac [x86-64]: Add 'salsa20-amd64.lo'.
--

This patch adds a fast AMD64 assembly implementation of Salsa20. The
implementation is based on the public domain code by D. J. Bernstein,
available at http://cr.yp.to/snuffle.html (amd64-xmm6). It gains extra
speed by processing four blocks in parallel with the help of SSE2
instructions.
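
To illustrate the parallelization: SSE2 has no 32-bit vector rotate, so
each ROTL32 of the Salsa20 quarter-round is emulated with a shift-left,
a shift-right and two XORs, applied to four independent blocks at once
(one block per 32-bit lane of an XMM register). Below is a rough
intrinsics sketch of one quarter-round step, equivalent to the
paddd/pslld/psrld/pxor sequences in the assembly; the helper name is
made up and the snippet is for illustration only, not part of the
patch:

  #include <emmintrin.h>

  /* x1 ^= ROTL32 (7, x0 + x3), computed for four blocks at once.  */
  static inline __m128i
  qround_step (__m128i x1, __m128i x0, __m128i x3)
  {
    __m128i t = _mm_add_epi32 (x0, x3);               /* paddd        */
    x1 = _mm_xor_si128 (x1, _mm_slli_epi32 (t, 7));   /* pslld + pxor */
    x1 = _mm_xor_si128 (x1, _mm_srli_epi32 (t, 25));  /* psrld + pxor */
    return x1;
  }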

Benchmark results on Intel Core i5-4570 (3.2 GHz):

Before:
 SALSA20        |  nanosecs/byte   mebibytes/sec   cycles/byte
     STREAM enc |      3.88 ns/B     246.0 MiB/s     12.41 c/B
     STREAM dec |      3.88 ns/B     246.0 MiB/s     12.41 c/B
                =
 SALSA20R12     |  nanosecs/byte   mebibytes/sec   cycles/byte
     STREAM enc |      2.46 ns/B     387.9 MiB/s      7.87 c/B
     STREAM dec |      2.46 ns/B     387.7 MiB/s      7.87 c/B

After:
 SALSA20        |  nanosecs/byte   mebibytes/sec   cycles/byte
     STREAM enc |     0.985 ns/B     967.8 MiB/s      3.15 c/B
     STREAM dec |     0.987 ns/B     966.5 MiB/s      3.16 c/B
                =
 SALSA20R12     |  nanosecs/byte   mebibytes/sec   cycles/byte
     STREAM enc |     0.636 ns/B    1500.5 MiB/s      2.03 c/B
     STREAM dec |     0.636 ns/B    1499.2 MiB/s      2.04 c/B

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
---
 cipher/Makefile.am     |    2 
 cipher/salsa20-amd64.S |  924 ++++++++++++++++++++++++++++++++++++++++++++++++
 cipher/salsa20.c       |  197 ++++++----
 configure.ac           |    7 
 4 files changed, 1056 insertions(+), 74 deletions(-)
 create mode 100644 cipher/salsa20-amd64.S

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index d7db933..e786713 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -71,7 +71,7 @@ md5.c \
 rijndael.c rijndael-tables.h rijndael-amd64.S rijndael-arm.S \
 rmd160.c \
 rsa.c \
-salsa20.c \
+salsa20.c salsa20-amd64.S \
 scrypt.c \
 seed.c \
 serpent.c serpent-sse2-amd64.S serpent-avx2-amd64.S \
diff --git a/cipher/salsa20-amd64.S b/cipher/salsa20-amd64.S
new file mode 100644
index 0000000..691df58
--- /dev/null
+++ b/cipher/salsa20-amd64.S
@@ -0,0 +1,924 @@
+/* salsa20-amd64.S  -  AMD64 implementation of Salsa20
+ *
+ * Copyright (C) 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * Based on public domain implementation by D. J. Bernstein at
+ *  http://cr.yp.to/snuffle.html
+ */
+
+#ifdef __x86_64
+#include <config.h>
+#if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && defined(USE_SALSA20)
+
+.text
+
+.align 8
+.globl _gcry_salsa20_amd64_keysetup
+.type  _gcry_salsa20_amd64_keysetup,@function;
+_gcry_salsa20_amd64_keysetup:
+	movl   0(%rsi),%r8d
+	movl   4(%rsi),%r9d
+	movl   8(%rsi),%eax
+	movl   12(%rsi),%r10d
+	movl   %r8d,20(%rdi)
+	movl   %r9d,40(%rdi)
+	movl   %eax,60(%rdi)
+	movl   %r10d,48(%rdi)
+	cmp  $256,%rdx
+	jb ._kbits128
+._kbits256:
+	movl   16(%rsi),%edx
+	movl   20(%rsi),%ecx
+	movl   24(%rsi),%r8d
+	movl   28(%rsi),%esi
+	movl   %edx,28(%rdi)
+	movl   %ecx,16(%rdi)
+	movl   %r8d,36(%rdi)
+	movl   %esi,56(%rdi)
+	mov  $1634760805,%rsi
+	mov  $857760878,%rdx
+	mov  $2036477234,%rcx
+	mov  $1797285236,%r8
+	movl   %esi,0(%rdi)
+	movl   %edx,4(%rdi)
+	movl   %ecx,8(%rdi)
+	movl   %r8d,12(%rdi)
+	jmp ._keysetupdone
+._kbits128:
+	movl   0(%rsi),%edx
+	movl   4(%rsi),%ecx
+	movl   8(%rsi),%r8d
+	movl   12(%rsi),%esi
+	movl   %edx,28(%rdi)
+	movl   %ecx,16(%rdi)
+	movl   %r8d,36(%rdi)
+	movl   %esi,56(%rdi)
+	mov  $1634760805,%rsi
+	mov  $824206446,%rdx
+	mov  $2036477238,%rcx
+	mov  $1797285236,%r8
+	movl   %esi,0(%rdi)
+	movl   %edx,4(%rdi)
+	movl   %ecx,8(%rdi)
+	movl   %r8d,12(%rdi)
+._keysetupdone:
+	ret
+
+.align 8
+.globl _gcry_salsa20_amd64_ivsetup
+.type  _gcry_salsa20_amd64_ivsetup,@function;
+_gcry_salsa20_amd64_ivsetup:
+	movl   0(%rsi),%r8d
+	movl   4(%rsi),%esi
+	mov  $0,%r9
+	mov  $0,%rax
+	movl   %r8d,24(%rdi)
+	movl   %esi,44(%rdi)
+	movl   %r9d,32(%rdi)
+	movl   %eax,52(%rdi)
+	ret
+
+.align 8
+.globl _gcry_salsa20_amd64_encrypt_blocks
+.type  _gcry_salsa20_amd64_encrypt_blocks,@function;
+_gcry_salsa20_amd64_encrypt_blocks:
+	/*
+	 * Modifications to original implementation:
+	 *  - Number of rounds is passed in register %r8 (for Salsa20/12).
+	 *  - Length is input as a number of blocks, so tail bytes are not
+	 *    handled here (that is done in salsa20.c).
+	 */
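+	/*
+	 * After the argument shuffle below: %r8 = state, %rsi = src,
+	 * %rdi = dst, %rdx = remaining bytes, %rbx = rounds.
+	 */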
+	push %rbx
+	shlq $6, %rcx /* blocks to bytes */
+	mov %r8, %rbx
+	mov %rsp,%r11
+	and $31,%r11
+	add $384,%r11
+	sub %r11,%rsp
+	mov  %rdi,%r8
+	mov  %rsi,%rsi
+	mov  %rdx,%rdi
+	mov  %rcx,%rdx
+	cmp  $0,%rdx
+	jbe ._done
+._start:
+	cmp  $256,%rdx
+	jb ._bytes_are_64_128_or_192
+	movdqa 0(%r8),%xmm0
+	pshufd $0x55,%xmm0,%xmm1
+	pshufd $0xaa,%xmm0,%xmm2
+	pshufd $0xff,%xmm0,%xmm3
+	pshufd $0x00,%xmm0,%xmm0
+	movdqa %xmm1,0(%rsp)
+	movdqa %xmm2,16(%rsp)
+	movdqa %xmm3,32(%rsp)
+	movdqa %xmm0,48(%rsp)
+	movdqa 16(%r8),%xmm0
+	pshufd $0xaa,%xmm0,%xmm1
+	pshufd $0xff,%xmm0,%xmm2
+	pshufd $0x00,%xmm0,%xmm3
+	pshufd $0x55,%xmm0,%xmm0
+	movdqa %xmm1,64(%rsp)
+	movdqa %xmm2,80(%rsp)
+	movdqa %xmm3,96(%rsp)
+	movdqa %xmm0,112(%rsp)
+	movdqa 32(%r8),%xmm0
+	pshufd $0xff,%xmm0,%xmm1
+	pshufd $0x55,%xmm0,%xmm2
+	pshufd $0xaa,%xmm0,%xmm0
+	movdqa %xmm1,128(%rsp)
+	movdqa %xmm2,144(%rsp)
+	movdqa %xmm0,160(%rsp)
+	movdqa 48(%r8),%xmm0
+	pshufd $0x00,%xmm0,%xmm1
+	pshufd $0xaa,%xmm0,%xmm2
+	pshufd $0xff,%xmm0,%xmm0
+	movdqa %xmm1,176(%rsp)
+	movdqa %xmm2,192(%rsp)
+	movdqa %xmm0,208(%rsp)
+._bytesatleast256:
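+	/* Four or more blocks left: store four consecutive values of the
+	 * 64-bit block counter, one per 32-bit SIMD lane, and advance the
+	 * counter in the context by four. */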
+	movl   32(%r8),%ecx
+	movl   52(%r8),%r9d
+	movl %ecx,224(%rsp)
+	movl %r9d,240(%rsp)
+	add  $1,%ecx
+	adc  $0,%r9d
+	movl %ecx,4+224(%rsp)
+	movl %r9d,4+240(%rsp)
+	add  $1,%ecx
+	adc  $0,%r9d
+	movl %ecx,8+224(%rsp)
+	movl %r9d,8+240(%rsp)
+	add  $1,%ecx
+	adc  $0,%r9d
+	movl %ecx,12+224(%rsp)
+	movl %r9d,12+240(%rsp)
+	add  $1,%ecx
+	adc  $0,%r9d
+	movl   %ecx,32(%r8)
+	movl   %r9d,52(%r8)
+	movq %rdx,288(%rsp)
+	mov  %rbx,%rdx
+	movdqa 0(%rsp),%xmm0
+	movdqa 16(%rsp),%xmm1
+	movdqa 32(%rsp),%xmm2
+	movdqa 192(%rsp),%xmm3
+	movdqa 208(%rsp),%xmm4
+	movdqa 64(%rsp),%xmm5
+	movdqa 80(%rsp),%xmm6
+	movdqa 112(%rsp),%xmm7
+	movdqa 128(%rsp),%xmm8
+	movdqa 144(%rsp),%xmm9
+	movdqa 160(%rsp),%xmm10
+	movdqa 240(%rsp),%xmm11
+	movdqa 48(%rsp),%xmm12
+	movdqa 96(%rsp),%xmm13
+	movdqa 176(%rsp),%xmm14
+	movdqa 224(%rsp),%xmm15
+._mainloop1:
+	movdqa %xmm1,256(%rsp)
+	movdqa %xmm2,272(%rsp)
+	movdqa %xmm13,%xmm1
+	paddd %xmm12,%xmm1
+	movdqa %xmm1,%xmm2
+	pslld $7,%xmm1
+	pxor  %xmm1,%xmm14
+	psrld $25,%xmm2
+	pxor  %xmm2,%xmm14
+	movdqa %xmm7,%xmm1
+	paddd %xmm0,%xmm1
+	movdqa %xmm1,%xmm2
+	pslld $7,%xmm1
+	pxor  %xmm1,%xmm11
+	psrld $25,%xmm2
+	pxor  %xmm2,%xmm11
+	movdqa %xmm12,%xmm1
+	paddd %xmm14,%xmm1
+	movdqa %xmm1,%xmm2
+	pslld $9,%xmm1
+	pxor  %xmm1,%xmm15
+	psrld $23,%xmm2
+	pxor  %xmm2,%xmm15
+	movdqa %xmm0,%xmm1
+	paddd %xmm11,%xmm1
+	movdqa %xmm1,%xmm2
+	pslld $9,%xmm1
+	pxor  %xmm1,%xmm9
+	psrld $23,%xmm2
+	pxor  %xmm2,%xmm9
+	movdqa %xmm14,%xmm1
+	paddd %xmm15,%xmm1
+	movdqa %xmm1,%xmm2
+	pslld $13,%xmm1
+	pxor  %xmm1,%xmm13
+	psrld $19,%xmm2
+	pxor  %xmm2,%xmm13
+	movdqa %xmm11,%xmm1
+	paddd %xmm9,%xmm1
+	movdqa %xmm1,%xmm2
+	pslld $13,%xmm1
+	pxor  %xmm1,%xmm7
+	psrld $19,%xmm2
+	pxor  %xmm2,%xmm7
+	movdqa %xmm15,%xmm1
+	paddd %xmm13,%xmm1
+	movdqa %xmm1,%xmm2
+	pslld $18,%xmm1
+	pxor  %xmm1,%xmm12
+	psrld $14,%xmm2
+	pxor  %xmm2,%xmm12
+	movdqa 256(%rsp),%xmm1
+	movdqa %xmm12,256(%rsp)
+	movdqa %xmm9,%xmm2
+	paddd %xmm7,%xmm2
+	movdqa %xmm2,%xmm12
+	pslld $18,%xmm2
+	pxor  %xmm2,%xmm0
+	psrld $14,%xmm12
+	pxor  %xmm12,%xmm0
+	movdqa %xmm5,%xmm2
+	paddd %xmm1,%xmm2
+	movdqa %xmm2,%xmm12
+	pslld $7,%xmm2
+	pxor  %xmm2,%xmm3
+	psrld $25,%xmm12
+	pxor  %xmm12,%xmm3
+	movdqa 272(%rsp),%xmm2
+	movdqa %xmm0,272(%rsp)
+	movdqa %xmm6,%xmm0
+	paddd %xmm2,%xmm0
+	movdqa %xmm0,%xmm12
+	pslld $7,%xmm0
+	pxor  %xmm0,%xmm4
+	psrld $25,%xmm12
+	pxor  %xmm12,%xmm4
+	movdqa %xmm1,%xmm0
+	paddd %xmm3,%xmm0
+	movdqa %xmm0,%xmm12
+	pslld $9,%xmm0
+	pxor  %xmm0,%xmm10
+	psrld $23,%xmm12
+	pxor  %xmm12,%xmm10
+	movdqa %xmm2,%xmm0
+	paddd %xmm4,%xmm0
+	movdqa %xmm0,%xmm12
+	pslld $9,%xmm0
+	pxor  %xmm0,%xmm8
+	psrld $23,%xmm12
+	pxor  %xmm12,%xmm8
+	movdqa %xmm3,%xmm0
+	paddd %xmm10,%xmm0
+	movdqa %xmm0,%xmm12
+	pslld $13,%xmm0
+	pxor  %xmm0,%xmm5
+	psrld $19,%xmm12
+	pxor  %xmm12,%xmm5
+	movdqa %xmm4,%xmm0
+	paddd %xmm8,%xmm0
+	movdqa %xmm0,%xmm12
+	pslld $13,%xmm0
+	pxor  %xmm0,%xmm6
+	psrld $19,%xmm12
+	pxor  %xmm12,%xmm6
+	movdqa %xmm10,%xmm0
+	paddd %xmm5,%xmm0
+	movdqa %xmm0,%xmm12
+	pslld $18,%xmm0
+	pxor  %xmm0,%xmm1
+	psrld $14,%xmm12
+	pxor  %xmm12,%xmm1
+	movdqa 256(%rsp),%xmm0
+	movdqa %xmm1,256(%rsp)
+	movdqa %xmm4,%xmm1
+	paddd %xmm0,%xmm1
+	movdqa %xmm1,%xmm12
+	pslld $7,%xmm1
+	pxor  %xmm1,%xmm7
+	psrld $25,%xmm12
+	pxor  %xmm12,%xmm7
+	movdqa %xmm8,%xmm1
+	paddd %xmm6,%xmm1
+	movdqa %xmm1,%xmm12
+	pslld $18,%xmm1
+	pxor  %xmm1,%xmm2
+	psrld $14,%xmm12
+	pxor  %xmm12,%xmm2
+	movdqa 272(%rsp),%xmm12
+	movdqa %xmm2,272(%rsp)
+	movdqa %xmm14,%xmm1
+	paddd %xmm12,%xmm1
+	movdqa %xmm1,%xmm2
+	pslld $7,%xmm1
+	pxor  %xmm1,%xmm5
+	psrld $25,%xmm2
+	pxor  %xmm2,%xmm5
+	movdqa %xmm0,%xmm1
+	paddd %xmm7,%xmm1
+	movdqa %xmm1,%xmm2
+	pslld $9,%xmm1
+	pxor  %xmm1,%xmm10
+	psrld $23,%xmm2
+	pxor  %xmm2,%xmm10
+	movdqa %xmm12,%xmm1
+	paddd %xmm5,%xmm1
+	movdqa %xmm1,%xmm2
+	pslld $9,%xmm1
+	pxor  %xmm1,%xmm8
+	psrld $23,%xmm2
+	pxor  %xmm2,%xmm8
+	movdqa %xmm7,%xmm1
+	paddd %xmm10,%xmm1
+	movdqa %xmm1,%xmm2
+	pslld $13,%xmm1
+	pxor  %xmm1,%xmm4
+	psrld $19,%xmm2
+	pxor  %xmm2,%xmm4
+	movdqa %xmm5,%xmm1
+	paddd %xmm8,%xmm1
+	movdqa %xmm1,%xmm2
+	pslld $13,%xmm1
+	pxor  %xmm1,%xmm14
+	psrld $19,%xmm2
+	pxor  %xmm2,%xmm14
+	movdqa %xmm10,%xmm1
+	paddd %xmm4,%xmm1
+	movdqa %xmm1,%xmm2
+	pslld $18,%xmm1
+	pxor  %xmm1,%xmm0
+	psrld $14,%xmm2
+	pxor  %xmm2,%xmm0
+	movdqa 256(%rsp),%xmm1
+	movdqa %xmm0,256(%rsp)
+	movdqa %xmm8,%xmm0
+	paddd %xmm14,%xmm0
+	movdqa %xmm0,%xmm2
+	pslld $18,%xmm0
+	pxor  %xmm0,%xmm12
+	psrld $14,%xmm2
+	pxor  %xmm2,%xmm12
+	movdqa %xmm11,%xmm0
+	paddd %xmm1,%xmm0
+	movdqa %xmm0,%xmm2
+	pslld $7,%xmm0
+	pxor  %xmm0,%xmm6
+	psrld $25,%xmm2
+	pxor  %xmm2,%xmm6
+	movdqa 272(%rsp),%xmm2
+	movdqa %xmm12,272(%rsp)
+	movdqa %xmm3,%xmm0
+	paddd %xmm2,%xmm0
+	movdqa %xmm0,%xmm12
+	pslld $7,%xmm0
+	pxor  %xmm0,%xmm13
+	psrld $25,%xmm12
+	pxor  %xmm12,%xmm13
+	movdqa %xmm1,%xmm0
+	paddd %xmm6,%xmm0
+	movdqa %xmm0,%xmm12
+	pslld $9,%xmm0
+	pxor  %xmm0,%xmm15
+	psrld $23,%xmm12
+	pxor  %xmm12,%xmm15
+	movdqa %xmm2,%xmm0
+	paddd %xmm13,%xmm0
+	movdqa %xmm0,%xmm12
+	pslld $9,%xmm0
+	pxor  %xmm0,%xmm9
+	psrld $23,%xmm12
+	pxor  %xmm12,%xmm9
+	movdqa %xmm6,%xmm0
+	paddd %xmm15,%xmm0
+	movdqa %xmm0,%xmm12
+	pslld $13,%xmm0
+	pxor  %xmm0,%xmm11
+	psrld $19,%xmm12
+	pxor  %xmm12,%xmm11
+	movdqa %xmm13,%xmm0
+	paddd %xmm9,%xmm0
+	movdqa %xmm0,%xmm12
+	pslld $13,%xmm0
+	pxor  %xmm0,%xmm3
+	psrld $19,%xmm12
+	pxor  %xmm12,%xmm3
+	movdqa %xmm15,%xmm0
+	paddd %xmm11,%xmm0
+	movdqa %xmm0,%xmm12
+	pslld $18,%xmm0
+	pxor  %xmm0,%xmm1
+	psrld $14,%xmm12
+	pxor  %xmm12,%xmm1
+	movdqa %xmm9,%xmm0
+	paddd %xmm3,%xmm0
+	movdqa %xmm0,%xmm12
+	pslld $18,%xmm0
+	pxor  %xmm0,%xmm2
+	psrld $14,%xmm12
+	pxor  %xmm12,%xmm2
+	movdqa 256(%rsp),%xmm12
+	movdqa 272(%rsp),%xmm0
+	sub  $2,%rdx
+	ja ._mainloop1
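+	/* Rounds done: add the saved input state back and XOR the
+	 * resulting keystream of all four blocks with the source. */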
+	paddd 48(%rsp),%xmm12
+	paddd 112(%rsp),%xmm7
+	paddd 160(%rsp),%xmm10
+	paddd 208(%rsp),%xmm4
+	movd   %xmm12,%rdx
+	movd   %xmm7,%rcx
+	movd   %xmm10,%r9
+	movd   %xmm4,%rax
+	pshufd $0x39,%xmm12,%xmm12
+	pshufd $0x39,%xmm7,%xmm7
+	pshufd $0x39,%xmm10,%xmm10
+	pshufd $0x39,%xmm4,%xmm4
+	xorl 0(%rsi),%edx
+	xorl 4(%rsi),%ecx
+	xorl 8(%rsi),%r9d
+	xorl 12(%rsi),%eax
+	movl   %edx,0(%rdi)
+	movl   %ecx,4(%rdi)
+	movl   %r9d,8(%rdi)
+	movl   %eax,12(%rdi)
+	movd   %xmm12,%rdx
+	movd   %xmm7,%rcx
+	movd   %xmm10,%r9
+	movd   %xmm4,%rax
+	pshufd $0x39,%xmm12,%xmm12
+	pshufd $0x39,%xmm7,%xmm7
+	pshufd $0x39,%xmm10,%xmm10
+	pshufd $0x39,%xmm4,%xmm4
+	xorl 64(%rsi),%edx
+	xorl 68(%rsi),%ecx
+	xorl 72(%rsi),%r9d
+	xorl 76(%rsi),%eax
+	movl   %edx,64(%rdi)
+	movl   %ecx,68(%rdi)
+	movl   %r9d,72(%rdi)
+	movl   %eax,76(%rdi)
+	movd   %xmm12,%rdx
+	movd   %xmm7,%rcx
+	movd   %xmm10,%r9
+	movd   %xmm4,%rax
+	pshufd $0x39,%xmm12,%xmm12
+	pshufd $0x39,%xmm7,%xmm7
+	pshufd $0x39,%xmm10,%xmm10
+	pshufd $0x39,%xmm4,%xmm4
+	xorl 128(%rsi),%edx
+	xorl 132(%rsi),%ecx
+	xorl 136(%rsi),%r9d
+	xorl 140(%rsi),%eax
+	movl   %edx,128(%rdi)
+	movl   %ecx,132(%rdi)
+	movl   %r9d,136(%rdi)
+	movl   %eax,140(%rdi)
+	movd   %xmm12,%rdx
+	movd   %xmm7,%rcx
+	movd   %xmm10,%r9
+	movd   %xmm4,%rax
+	xorl 192(%rsi),%edx
+	xorl 196(%rsi),%ecx
+	xorl 200(%rsi),%r9d
+	xorl 204(%rsi),%eax
+	movl   %edx,192(%rdi)
+	movl   %ecx,196(%rdi)
+	movl   %r9d,200(%rdi)
+	movl   %eax,204(%rdi)
+	paddd 176(%rsp),%xmm14
+	paddd 0(%rsp),%xmm0
+	paddd 64(%rsp),%xmm5
+	paddd 128(%rsp),%xmm8
+	movd   %xmm14,%rdx
+	movd   %xmm0,%rcx
+	movd   %xmm5,%r9
+	movd   %xmm8,%rax
+	pshufd $0x39,%xmm14,%xmm14
+	pshufd $0x39,%xmm0,%xmm0
+	pshufd $0x39,%xmm5,%xmm5
+	pshufd $0x39,%xmm8,%xmm8
+	xorl 16(%rsi),%edx
+	xorl 20(%rsi),%ecx
+	xorl 24(%rsi),%r9d
+	xorl 28(%rsi),%eax
+	movl   %edx,16(%rdi)
+	movl   %ecx,20(%rdi)
+	movl   %r9d,24(%rdi)
+	movl   %eax,28(%rdi)
+	movd   %xmm14,%rdx
+	movd   %xmm0,%rcx
+	movd   %xmm5,%r9
+	movd   %xmm8,%rax
+	pshufd $0x39,%xmm14,%xmm14
+	pshufd $0x39,%xmm0,%xmm0
+	pshufd $0x39,%xmm5,%xmm5
+	pshufd $0x39,%xmm8,%xmm8
+	xorl 80(%rsi),%edx
+	xorl 84(%rsi),%ecx
+	xorl 88(%rsi),%r9d
+	xorl 92(%rsi),%eax
+	movl   %edx,80(%rdi)
+	movl   %ecx,84(%rdi)
+	movl   %r9d,88(%rdi)
+	movl   %eax,92(%rdi)
+	movd   %xmm14,%rdx
+	movd   %xmm0,%rcx
+	movd   %xmm5,%r9
+	movd   %xmm8,%rax
+	pshufd $0x39,%xmm14,%xmm14
+	pshufd $0x39,%xmm0,%xmm0
+	pshufd $0x39,%xmm5,%xmm5
+	pshufd $0x39,%xmm8,%xmm8
+	xorl 144(%rsi),%edx
+	xorl 148(%rsi),%ecx
+	xorl 152(%rsi),%r9d
+	xorl 156(%rsi),%eax
+	movl   %edx,144(%rdi)
+	movl   %ecx,148(%rdi)
+	movl   %r9d,152(%rdi)
+	movl   %eax,156(%rdi)
+	movd   %xmm14,%rdx
+	movd   %xmm0,%rcx
+	movd   %xmm5,%r9
+	movd   %xmm8,%rax
+	xorl 208(%rsi),%edx
+	xorl 212(%rsi),%ecx
+	xorl 216(%rsi),%r9d
+	xorl 220(%rsi),%eax
+	movl   %edx,208(%rdi)
+	movl   %ecx,212(%rdi)
+	movl   %r9d,216(%rdi)
+	movl   %eax,220(%rdi)
+	paddd 224(%rsp),%xmm15
+	paddd 240(%rsp),%xmm11
+	paddd 16(%rsp),%xmm1
+	paddd 80(%rsp),%xmm6
+	movd   %xmm15,%rdx
+	movd   %xmm11,%rcx
+	movd   %xmm1,%r9
+	movd   %xmm6,%rax
+	pshufd $0x39,%xmm15,%xmm15
+	pshufd $0x39,%xmm11,%xmm11
+	pshufd $0x39,%xmm1,%xmm1
+	pshufd $0x39,%xmm6,%xmm6
+	xorl 32(%rsi),%edx
+	xorl 36(%rsi),%ecx
+	xorl 40(%rsi),%r9d
+	xorl 44(%rsi),%eax
+	movl   %edx,32(%rdi)
+	movl   %ecx,36(%rdi)
+	movl   %r9d,40(%rdi)
+	movl   %eax,44(%rdi)
+	movd   %xmm15,%rdx
+	movd   %xmm11,%rcx
+	movd   %xmm1,%r9
+	movd   %xmm6,%rax
+	pshufd $0x39,%xmm15,%xmm15
+	pshufd $0x39,%xmm11,%xmm11
+	pshufd $0x39,%xmm1,%xmm1
+	pshufd $0x39,%xmm6,%xmm6
+	xorl 96(%rsi),%edx
+	xorl 100(%rsi),%ecx
+	xorl 104(%rsi),%r9d
+	xorl 108(%rsi),%eax
+	movl   %edx,96(%rdi)
+	movl   %ecx,100(%rdi)
+	movl   %r9d,104(%rdi)
+	movl   %eax,108(%rdi)
+	movd   %xmm15,%rdx
+	movd   %xmm11,%rcx
+	movd   %xmm1,%r9
+	movd   %xmm6,%rax
+	pshufd $0x39,%xmm15,%xmm15
+	pshufd $0x39,%xmm11,%xmm11
+	pshufd $0x39,%xmm1,%xmm1
+	pshufd $0x39,%xmm6,%xmm6
+	xorl 160(%rsi),%edx
+	xorl 164(%rsi),%ecx
+	xorl 168(%rsi),%r9d
+	xorl 172(%rsi),%eax
+	movl   %edx,160(%rdi)
+	movl   %ecx,164(%rdi)
+	movl   %r9d,168(%rdi)
+	movl   %eax,172(%rdi)
+	movd   %xmm15,%rdx
+	movd   %xmm11,%rcx
+	movd   %xmm1,%r9
+	movd   %xmm6,%rax
+	xorl 224(%rsi),%edx
+	xorl 228(%rsi),%ecx
+	xorl 232(%rsi),%r9d
+	xorl 236(%rsi),%eax
+	movl   %edx,224(%rdi)
+	movl   %ecx,228(%rdi)
+	movl   %r9d,232(%rdi)
+	movl   %eax,236(%rdi)
+	paddd 96(%rsp),%xmm13
+	paddd 144(%rsp),%xmm9
+	paddd 192(%rsp),%xmm3
+	paddd 32(%rsp),%xmm2
+	movd   %xmm13,%rdx
+	movd   %xmm9,%rcx
+	movd   %xmm3,%r9
+	movd   %xmm2,%rax
+	pshufd $0x39,%xmm13,%xmm13
+	pshufd $0x39,%xmm9,%xmm9
+	pshufd $0x39,%xmm3,%xmm3
+	pshufd $0x39,%xmm2,%xmm2
+	xorl 48(%rsi),%edx
+	xorl 52(%rsi),%ecx
+	xorl 56(%rsi),%r9d
+	xorl 60(%rsi),%eax
+	movl   %edx,48(%rdi)
+	movl   %ecx,52(%rdi)
+	movl   %r9d,56(%rdi)
+	movl   %eax,60(%rdi)
+	movd   %xmm13,%rdx
+	movd   %xmm9,%rcx
+	movd   %xmm3,%r9
+	movd   %xmm2,%rax
+	pshufd $0x39,%xmm13,%xmm13
+	pshufd $0x39,%xmm9,%xmm9
+	pshufd $0x39,%xmm3,%xmm3
+	pshufd $0x39,%xmm2,%xmm2
+	xorl 112(%rsi),%edx
+	xorl 116(%rsi),%ecx
+	xorl 120(%rsi),%r9d
+	xorl 124(%rsi),%eax
+	movl   %edx,112(%rdi)
+	movl   %ecx,116(%rdi)
+	movl   %r9d,120(%rdi)
+	movl   %eax,124(%rdi)
+	movd   %xmm13,%rdx
+	movd   %xmm9,%rcx
+	movd   %xmm3,%r9
+	movd   %xmm2,%rax
+	pshufd $0x39,%xmm13,%xmm13
+	pshufd $0x39,%xmm9,%xmm9
+	pshufd $0x39,%xmm3,%xmm3
+	pshufd $0x39,%xmm2,%xmm2
+	xorl 176(%rsi),%edx
+	xorl 180(%rsi),%ecx
+	xorl 184(%rsi),%r9d
+	xorl 188(%rsi),%eax
+	movl   %edx,176(%rdi)
+	movl   %ecx,180(%rdi)
+	movl   %r9d,184(%rdi)
+	movl   %eax,188(%rdi)
+	movd   %xmm13,%rdx
+	movd   %xmm9,%rcx
+	movd   %xmm3,%r9
+	movd   %xmm2,%rax
+	xorl 240(%rsi),%edx
+	xorl 244(%rsi),%ecx
+	xorl 248(%rsi),%r9d
+	xorl 252(%rsi),%eax
+	movl   %edx,240(%rdi)
+	movl   %ecx,244(%rdi)
+	movl   %r9d,248(%rdi)
+	movl   %eax,252(%rdi)
+	movq 288(%rsp),%rdx
+	sub  $256,%rdx
+	add  $256,%rsi
+	add  $256,%rdi
+	cmp  $256,%rdx
+	jae ._bytesatleast256
+	cmp  $0,%rdx
+	jbe ._done
+._bytes_are_64_128_or_192:
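+	/* Fewer than four blocks remain: process one 64-byte block at a
+	 * time. */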
+	movq %rdx,288(%rsp)
+	movdqa 0(%r8),%xmm0
+	movdqa 16(%r8),%xmm1
+	movdqa 32(%r8),%xmm2
+	movdqa 48(%r8),%xmm3
+	movdqa %xmm1,%xmm4
+	mov  %rbx,%rdx
+._mainloop2:
+	paddd %xmm0,%xmm4
+	movdqa %xmm0,%xmm5
+	movdqa %xmm4,%xmm6
+	pslld $7,%xmm4
+	psrld $25,%xmm6
+	pxor  %xmm4,%xmm3
+	pxor  %xmm6,%xmm3
+	paddd %xmm3,%xmm5
+	movdqa %xmm3,%xmm4
+	movdqa %xmm5,%xmm6
+	pslld $9,%xmm5
+	psrld $23,%xmm6
+	pxor  %xmm5,%xmm2
+	pshufd $0x93,%xmm3,%xmm3
+	pxor  %xmm6,%xmm2
+	paddd %xmm2,%xmm4
+	movdqa %xmm2,%xmm5
+	movdqa %xmm4,%xmm6
+	pslld $13,%xmm4
+	psrld $19,%xmm6
+	pxor  %xmm4,%xmm1
+	pshufd $0x4e,%xmm2,%xmm2
+	pxor  %xmm6,%xmm1
+	paddd %xmm1,%xmm5
+	movdqa %xmm3,%xmm4
+	movdqa %xmm5,%xmm6
+	pslld $18,%xmm5
+	psrld $14,%xmm6
+	pxor  %xmm5,%xmm0
+	pshufd $0x39,%xmm1,%xmm1
+	pxor  %xmm6,%xmm0
+	paddd %xmm0,%xmm4
+	movdqa %xmm0,%xmm5
+	movdqa %xmm4,%xmm6
+	pslld $7,%xmm4
+	psrld $25,%xmm6
+	pxor  %xmm4,%xmm1
+	pxor  %xmm6,%xmm1
+	paddd %xmm1,%xmm5
+	movdqa %xmm1,%xmm4
+	movdqa %xmm5,%xmm6
+	pslld $9,%xmm5
+	psrld $23,%xmm6
+	pxor  %xmm5,%xmm2
+	pshufd $0x93,%xmm1,%xmm1
+	pxor  %xmm6,%xmm2
+	paddd %xmm2,%xmm4
+	movdqa %xmm2,%xmm5
+	movdqa %xmm4,%xmm6
+	pslld $13,%xmm4
+	psrld $19,%xmm6
+	pxor  %xmm4,%xmm3
+	pshufd $0x4e,%xmm2,%xmm2
+	pxor  %xmm6,%xmm3
+	paddd %xmm3,%xmm5
+	movdqa %xmm1,%xmm4
+	movdqa %xmm5,%xmm6
+	pslld $18,%xmm5
+	psrld $14,%xmm6
+	pxor  %xmm5,%xmm0
+	pshufd $0x39,%xmm3,%xmm3
+	pxor  %xmm6,%xmm0
+	paddd %xmm0,%xmm4
+	movdqa %xmm0,%xmm5
+	movdqa %xmm4,%xmm6
+	pslld $7,%xmm4
+	psrld $25,%xmm6
+	pxor  %xmm4,%xmm3
+	pxor  %xmm6,%xmm3
+	paddd %xmm3,%xmm5
+	movdqa %xmm3,%xmm4
+	movdqa %xmm5,%xmm6
+	pslld $9,%xmm5
+	psrld $23,%xmm6
+	pxor  %xmm5,%xmm2
+	pshufd $0x93,%xmm3,%xmm3
+	pxor  %xmm6,%xmm2
+	paddd %xmm2,%xmm4
+	movdqa %xmm2,%xmm5
+	movdqa %xmm4,%xmm6
+	pslld $13,%xmm4
+	psrld $19,%xmm6
+	pxor  %xmm4,%xmm1
+	pshufd $0x4e,%xmm2,%xmm2
+	pxor  %xmm6,%xmm1
+	paddd %xmm1,%xmm5
+	movdqa %xmm3,%xmm4
+	movdqa %xmm5,%xmm6
+	pslld $18,%xmm5
+	psrld $14,%xmm6
+	pxor  %xmm5,%xmm0
+	pshufd $0x39,%xmm1,%xmm1
+	pxor  %xmm6,%xmm0
+	paddd %xmm0,%xmm4
+	movdqa %xmm0,%xmm5
+	movdqa %xmm4,%xmm6
+	pslld $7,%xmm4
+	psrld $25,%xmm6
+	pxor  %xmm4,%xmm1
+	pxor  %xmm6,%xmm1
+	paddd %xmm1,%xmm5
+	movdqa %xmm1,%xmm4
+	movdqa %xmm5,%xmm6
+	pslld $9,%xmm5
+	psrld $23,%xmm6
+	pxor  %xmm5,%xmm2
+	pshufd $0x93,%xmm1,%xmm1
+	pxor  %xmm6,%xmm2
+	paddd %xmm2,%xmm4
+	movdqa %xmm2,%xmm5
+	movdqa %xmm4,%xmm6
+	pslld $13,%xmm4
+	psrld $19,%xmm6
+	pxor  %xmm4,%xmm3
+	pshufd $0x4e,%xmm2,%xmm2
+	pxor  %xmm6,%xmm3
+	sub  $4,%rdx
+	paddd %xmm3,%xmm5
+	movdqa %xmm1,%xmm4
+	movdqa %xmm5,%xmm6
+	pslld $18,%xmm5
+	pxor   %xmm7,%xmm7
+	psrld $14,%xmm6
+	pxor  %xmm5,%xmm0
+	pshufd $0x39,%xmm3,%xmm3
+	pxor  %xmm6,%xmm0
+	ja ._mainloop2
+	paddd 0(%r8),%xmm0
+	paddd 16(%r8),%xmm1
+	paddd 32(%r8),%xmm2
+	paddd 48(%r8),%xmm3
+	movd   %xmm0,%rdx
+	movd   %xmm1,%rcx
+	movd   %xmm2,%rax
+	movd   %xmm3,%r10
+	pshufd $0x39,%xmm0,%xmm0
+	pshufd $0x39,%xmm1,%xmm1
+	pshufd $0x39,%xmm2,%xmm2
+	pshufd $0x39,%xmm3,%xmm3
+	xorl 0(%rsi),%edx
+	xorl 48(%rsi),%ecx
+	xorl 32(%rsi),%eax
+	xorl 16(%rsi),%r10d
+	movl   %edx,0(%rdi)
+	movl   %ecx,48(%rdi)
+	movl   %eax,32(%rdi)
+	movl   %r10d,16(%rdi)
+	movd   %xmm0,%rdx
+	movd   %xmm1,%rcx
+	movd   %xmm2,%rax
+	movd   %xmm3,%r10
+	pshufd $0x39,%xmm0,%xmm0
+	pshufd $0x39,%xmm1,%xmm1
+	pshufd $0x39,%xmm2,%xmm2
+	pshufd $0x39,%xmm3,%xmm3
+	xorl 20(%rsi),%edx
+	xorl 4(%rsi),%ecx
+	xorl 52(%rsi),%eax
+	xorl 36(%rsi),%r10d
+	movl   %edx,20(%rdi)
+	movl   %ecx,4(%rdi)
+	movl   %eax,52(%rdi)
+	movl   %r10d,36(%rdi)
+	movd   %xmm0,%rdx
+	movd   %xmm1,%rcx
+	movd   %xmm2,%rax
+	movd   %xmm3,%r10
+	pshufd $0x39,%xmm0,%xmm0
+	pshufd $0x39,%xmm1,%xmm1
+	pshufd $0x39,%xmm2,%xmm2
+	pshufd $0x39,%xmm3,%xmm3
+	xorl 40(%rsi),%edx
+	xorl 24(%rsi),%ecx
+	xorl 8(%rsi),%eax
+	xorl 56(%rsi),%r10d
+	movl   %edx,40(%rdi)
+	movl   %ecx,24(%rdi)
+	movl   %eax,8(%rdi)
+	movl   %r10d,56(%rdi)
+	movd   %xmm0,%rdx
+	movd   %xmm1,%rcx
+	movd   %xmm2,%rax
+	movd   %xmm3,%r10
+	xorl 60(%rsi),%edx
+	xorl 44(%rsi),%ecx
+	xorl 28(%rsi),%eax
+	xorl 12(%rsi),%r10d
+	movl   %edx,60(%rdi)
+	movl   %ecx,44(%rdi)
+	movl   %eax,28(%rdi)
+	movl   %r10d,12(%rdi)
+	movq 288(%rsp),%rdx
+	movl   32(%r8),%ecx
+	movl   52(%r8),%eax
+	add  $1,%ecx
+	adc  $0,%eax
+	movl   %ecx,32(%r8)
+	movl   %eax,52(%r8)
+	cmp  $64,%rdx
+	ja ._bytes_are_128_or_192
+._done:
+	add %r11,%rsp
+	mov %r11,%rax
+	pop %rbx
+	ret
+._bytes_are_128_or_192:
+	sub  $64,%rdx
+	add  $64,%rdi
+	add  $64,%rsi
+	jmp ._bytes_are_64_128_or_192
+.size _gcry_salsa20_amd64_encrypt_blocks,.-_gcry_salsa20_amd64_encrypt_blocks;
+
+#endif /*defined(USE_SALSA20)*/
+#endif /*__x86_64*/
diff --git a/cipher/salsa20.c b/cipher/salsa20.c
index 6189bca..892b9fc 100644
--- a/cipher/salsa20.c
+++ b/cipher/salsa20.c
@@ -40,6 +40,14 @@
 #include "cipher.h"
 #include "bufhelp.h"
 
+
+/* USE_AMD64 indicates whether to compile with AMD64 code. */
+#undef USE_AMD64
+#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)
+# define USE_AMD64 1
+#endif
+
+
 #define SALSA20_MIN_KEY_SIZE 16  /* Bytes.  */
 #define SALSA20_MAX_KEY_SIZE 32  /* Bytes.  */
 #define SALSA20_BLOCK_SIZE   64  /* Bytes.  */
@@ -83,6 +91,36 @@ typedef struct
 static void salsa20_setiv (void *context, const byte *iv, unsigned int ivlen);
 static const char *selftest (void);
 
+
+#ifdef USE_AMD64
+/* AMD64 assembly implementations of Salsa20. */
+void _gcry_salsa20_amd64_keysetup(u32 *ctxinput, const void *key, int keybits);
+void _gcry_salsa20_amd64_ivsetup(u32 *ctxinput, const void *iv);
+unsigned int
+_gcry_salsa20_amd64_encrypt_blocks(u32 *ctxinput, const void *src, void *dst,
+                                   size_t len, int rounds);
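+/* Note: 'len' is a number of 64-byte blocks and the return value is the
+   stack burn depth of the assembly routine.  */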
+
+static void
+salsa20_keysetup(SALSA20_context_t *ctx, const byte *key, int keylen)
+{
+  _gcry_salsa20_amd64_keysetup(ctx->input, key, keylen * 8);
+}
+
+static void
+salsa20_ivsetup(SALSA20_context_t *ctx, const byte *iv)
+{
+  _gcry_salsa20_amd64_ivsetup(ctx->input, iv);
+}
+
+static unsigned int
+salsa20_core (u32 *dst, u32 *src, unsigned int rounds)
+{
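+  /* Encrypting an all-zero block yields the raw keystream in 'dst';
+     the assembly routine also advances the block counter in 'src'.  */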
+  memset(dst, 0, SALSA20_BLOCK_SIZE);
+  return _gcry_salsa20_amd64_encrypt_blocks(src, dst, dst, 1, rounds);
+}
+
+#else /* USE_AMD64 */
+
 

 
 #if 0
@@ -110,8 +148,8 @@ static const char *selftest (void);
     x0 ^= ROTL32 (18, x3 + x2);	    \
   } while(0)
 
-static void
-salsa20_core (u32 *dst, const u32 *src, unsigned rounds)
+static unsigned int
+salsa20_core (u32 *dst, u32 *src, unsigned int rounds)
 {
   u32 pad[SALSA20_INPUT_LENGTH];
   unsigned int i;
@@ -138,31 +176,24 @@ salsa20_core (u32 *dst, const u32 *src, unsigned rounds)
       u32 t = pad[i] + src[i];
       dst[i] = LE_SWAP32 (t);
     }
+
+  /* Update counter. */
+  if (!++src[8])
+    src[9]++;
+
+  /* burn_stack */
+  return ( 3*sizeof (void*) \
+         + 2*sizeof (void*) \
+         + 64 \
+         + sizeof (unsigned int) \
+         + sizeof (u32) );
 }
 #undef QROUND
 #undef SALSA20_CORE_DEBUG
 
-static gcry_err_code_t
-salsa20_do_setkey (SALSA20_context_t *ctx,
-                   const byte *key, unsigned int keylen)
+static void
+salsa20_keysetup(SALSA20_context_t *ctx, const byte *key, int keylen)
 {
-  static int initialized;
-  static const char *selftest_failed;
-
-  if (!initialized )
-    {
-      initialized = 1;
-      selftest_failed = selftest ();
-      if (selftest_failed)
-        log_error ("SALSA20 selftest failed (%s)\n", selftest_failed );
-    }
-  if (selftest_failed)
-    return GPG_ERR_SELFTEST_FAILED;
-
-  if (keylen != SALSA20_MIN_KEY_SIZE
-      && keylen != SALSA20_MAX_KEY_SIZE)
-    return GPG_ERR_INV_KEYLEN;
-
   /* These constants are the little endian encoding of the string
      "expand 32-byte k".  For the 128 bit variant, the "32" in that
      string will be fixed up to "16".  */
@@ -192,6 +223,41 @@ salsa20_do_setkey (SALSA20_context_t *ctx,
       ctx->input[5]  -= 0x02000000; /* Change to "1 dn".  */
       ctx->input[10] += 0x00000004; /* Change to "yb-6".  */
     }
+}
+
+static void salsa20_ivsetup(SALSA20_context_t *ctx, const byte *iv)
+{
+  ctx->input[6] = LE_READ_UINT32(iv + 0);
+  ctx->input[7] = LE_READ_UINT32(iv + 4);
+  /* Reset the block counter.  */
+  ctx->input[8] = 0;
+  ctx->input[9] = 0;
+}
+
+#endif /*!USE_AMD64*/
+
+static gcry_err_code_t
+salsa20_do_setkey (SALSA20_context_t *ctx,
+                   const byte *key, unsigned int keylen)
+{
+  static int initialized;
+  static const char *selftest_failed;
+
+  if (!initialized )
+    {
+      initialized = 1;
+      selftest_failed = selftest ();
+      if (selftest_failed)
+        log_error ("SALSA20 selftest failed (%s)\n", selftest_failed );
+    }
+  if (selftest_failed)
+    return GPG_ERR_SELFTEST_FAILED;
+
+  if (keylen != SALSA20_MIN_KEY_SIZE
+      && keylen != SALSA20_MAX_KEY_SIZE)
+    return GPG_ERR_INV_KEYLEN;
+
+  salsa20_keysetup (ctx, key, keylen);
 
   /* We default to a zero nonce.  */
   salsa20_setiv (ctx, NULL, 0);
@@ -205,7 +271,7 @@ salsa20_setkey (void *context, const byte *key, unsigned int keylen)
 {
   SALSA20_context_t *ctx = (SALSA20_context_t *)context;
   gcry_err_code_t rc = salsa20_do_setkey (ctx, key, keylen);
-  _gcry_burn_stack (300/* FIXME*/);
+  _gcry_burn_stack (4 + sizeof (void *) + 4 * sizeof (void *));
   return rc;
 }
 
@@ -214,28 +280,22 @@ static void
 salsa20_setiv (void *context, const byte *iv, unsigned int ivlen)
 {
   SALSA20_context_t *ctx = (SALSA20_context_t *)context;
+  byte tmp[SALSA20_IV_SIZE];
 
-  if (!iv)
-    {
-      ctx->input[6] = 0;
-      ctx->input[7] = 0;
-    }
-  else if (ivlen == SALSA20_IV_SIZE)
-    {
-      ctx->input[6] = LE_READ_UINT32(iv + 0);
-      ctx->input[7] = LE_READ_UINT32(iv + 4);
-    }
+  if (iv && ivlen != SALSA20_IV_SIZE)
+    log_info ("WARNING: salsa20_setiv: bad ivlen=%u\n", ivlen);
+
+  if (!iv || ivlen != SALSA20_IV_SIZE)
+    memset (tmp, 0, sizeof(tmp));
   else
-    {
-      log_info ("WARNING: salsa20_setiv: bad ivlen=%u\n", ivlen);
-      ctx->input[6] = 0;
-      ctx->input[7] = 0;
-    }
-  /* Reset the block counter.  */
-  ctx->input[8] = 0;
-  ctx->input[9] = 0;
+    memcpy (tmp, iv, SALSA20_IV_SIZE);
+
+  salsa20_ivsetup (ctx, tmp);
+
   /* Reset the unused pad bytes counter.  */
   ctx->unused = 0;
+
+  wipememory (tmp, sizeof(tmp));
 }
 
 
@@ -246,6 +306,8 @@ salsa20_do_encrypt_stream (SALSA20_context_t *ctx,
                            byte *outbuf, const byte *inbuf,
                            unsigned int length, unsigned rounds)
 {
+  unsigned int nburn, burn = 0;
+
   if (ctx->unused)
     {
       unsigned char *p = (void*)ctx->pad;
@@ -266,26 +328,39 @@ salsa20_do_encrypt_stream (SALSA20_context_t *ctx,
       gcry_assert (!ctx->unused);
     }
 
-  for (;;)
+#ifdef USE_AMD64
+  if (length >= SALSA20_BLOCK_SIZE)
+    {
+      unsigned int nblocks = length / SALSA20_BLOCK_SIZE;
+      burn = _gcry_salsa20_amd64_encrypt_blocks(ctx->input, inbuf, outbuf,
+                                                nblocks, rounds);
+      length -= SALSA20_BLOCK_SIZE * nblocks;
+      outbuf += SALSA20_BLOCK_SIZE * nblocks;
+      inbuf  += SALSA20_BLOCK_SIZE * nblocks;
+    }
+#endif
+
+  while (length > 0)
     {
       /* Create the next pad and bump the block counter.  Note that it
          is the user's duty to change to another nonce not later than
          after 2^70 processed bytes.  */
-      salsa20_core (ctx->pad, ctx->input, rounds);
-      if (!++ctx->input[8])
-        ctx->input[9]++;
+      nburn = salsa20_core (ctx->pad, ctx->input, rounds);
+      burn = nburn > burn ? nburn : burn;
 
       if (length <= SALSA20_BLOCK_SIZE)
 	{
 	  buf_xor (outbuf, inbuf, ctx->pad, length);
           ctx->unused = SALSA20_BLOCK_SIZE - length;
-	  return;
+	  break;
 	}
       buf_xor (outbuf, inbuf, ctx->pad, SALSA20_BLOCK_SIZE);
       length -= SALSA20_BLOCK_SIZE;
       outbuf += SALSA20_BLOCK_SIZE;
       inbuf  += SALSA20_BLOCK_SIZE;
-  }
+    }
+
+  _gcry_burn_stack (burn);
 }
 
 
@@ -296,19 +371,7 @@ salsa20_encrypt_stream (void *context,
   SALSA20_context_t *ctx = (SALSA20_context_t *)context;
 
   if (length)
-    {
-      salsa20_do_encrypt_stream (ctx, outbuf, inbuf, length, SALSA20_ROUNDS);
-      _gcry_burn_stack (/* salsa20_do_encrypt_stream: */
-                        2*sizeof (void*)
-                        + 3*sizeof (void*) + sizeof (unsigned int)
-                        /* salsa20_core: */
-                        + 2*sizeof (void*)
-                        + 2*sizeof (void*)
-                        + 64
-                        + sizeof (unsigned int)
-                        + sizeof (u32)
-                        );
-    }
+    salsa20_do_encrypt_stream (ctx, outbuf, inbuf, length, SALSA20_ROUNDS);
 }
 
 
@@ -319,19 +382,7 @@ salsa20r12_encrypt_stream (void *context,
   SALSA20_context_t *ctx = (SALSA20_context_t *)context;
 
   if (length)
-    {
-      salsa20_do_encrypt_stream (ctx, outbuf, inbuf, length, SALSA20R12_ROUNDS);
-      _gcry_burn_stack (/* salsa20_do_encrypt_stream: */
-                        2*sizeof (void*)
-                        + 3*sizeof (void*) + sizeof (unsigned int)
-                        /* salsa20_core: */
-                        + 2*sizeof (void*)
-                        + 2*sizeof (void*)
-                        + 64
-                        + sizeof (unsigned int)
-                        + sizeof (u32)
-                        );
-    }
+    salsa20_do_encrypt_stream (ctx, outbuf, inbuf, length, SALSA20R12_ROUNDS);
 }
 
 
diff --git a/configure.ac b/configure.ac
index 5b7ba0d..114460c 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1553,6 +1553,13 @@ LIST_MEMBER(salsa20, $enabled_ciphers)
 if test "$found" = "1" ; then
    GCRYPT_CIPHERS="$GCRYPT_CIPHERS salsa20.lo"
    AC_DEFINE(USE_SALSA20, 1, [Defined if this module should be included])
+
+   case "${host}" in
+      x86_64-*-*)
+         # Build with the assembly implementation
+         GCRYPT_CIPHERS="$GCRYPT_CIPHERS salsa20-amd64.lo"
+      ;;
+   esac
 fi
 
 LIST_MEMBER(gost28147, $enabled_ciphers)
