[PATCH] 3des: add amd64 assembly implementation for 3DES

Jussi Kivilinna jussi.kivilinna at iki.fi
Sun Mar 30 17:27:32 CEST 2014


* cipher/Makefile.am: Add 'des-amd64.S'.
* cipher/cipher-selftests.c (_gcry_selftest_helper_cbc)
(_gcry_selftest_helper_cfb, _gcry_selftest_helper_ctr): Handle failures
from 'setkey' function.
* cipher/cipher.c (_gcry_cipher_open_internal) [USE_DES]: Setup bulk
functions for 3DES.
* cipher/des-amd64.S: New file.
* cipher/des.c (USE_AMD64_ASM, ATTR_ALIGNED_16): New macros.
[USE_AMD64_ASM] (_gcry_3des_amd64_crypt_block)
(_gcry_3des_amd64_ctr_enc), _gcry_3des_amd64_cbc_dec)
(_gcry_3des_amd64_cfb_dec): New prototypes.
[USE_AMD64_ASM] (tripledes_ecb_crypt): New function.
(TRIPLEDES_ECB_BURN_STACK): New macro.
(_gcry_3des_ctr_enc, _gcry_3des_cbc_dec, _gcry_3des_cfb_dec)
(bulk_selftest_setkey, selftest_ctr, selftest_cbc, selftest_cfb): New
functions.
(selftest): Add call to CTR, CBC and CFB selftest functions.
(do_tripledes_encrypt, do_tripledes_decrypt): Use
TRIPLEDES_ECB_BURN_STACK.
* configure.ac [host=x86-64]: Add 'des-amd64.lo'.
* src/cipher.h (_gcry_3des_ctr_enc, _gcry_3des_cbc_dec)
(_gcry_3des_cfb_dec): New prototypes.
--

Add non-parallel functions for small speed-up and 3-way parallel functions for
modes of operation that support parallel processing.

Old vs new (Intel Core i5-4570):
================================
        enc    dec
 ECB    1.17x  1.17x
 CBC    1.17x  2.51x
 CFB    1.16x  2.49x
 OFB    1.17x  1.17x
 CTR    2.56x  2.56x

Old vs new (Intel Core i5-2450M):
=================================
        enc    dec
 ECB    1.28x  1.28x
 CBC    1.27x  2.33x
 CFB    1.27x  2.34x
 OFB    1.27x  1.27x
 CTR    2.36x  2.35x

New (Intel Core i5-4570):
=========================
 3DES           |  nanosecs/byte   mebibytes/sec   cycles/byte
        ECB enc |     28.39 ns/B     33.60 MiB/s     90.84 c/B
        ECB dec |     28.27 ns/B     33.74 MiB/s     90.45 c/B
        CBC enc |     29.50 ns/B     32.33 MiB/s     94.40 c/B
        CBC dec |     13.35 ns/B     71.45 MiB/s     42.71 c/B
        CFB enc |     29.59 ns/B     32.23 MiB/s     94.68 c/B
        CFB dec |     13.41 ns/B     71.12 MiB/s     42.91 c/B
        OFB enc |     28.90 ns/B     33.00 MiB/s     92.47 c/B
        OFB dec |     28.90 ns/B     33.00 MiB/s     92.48 c/B
        CTR enc |     13.39 ns/B     71.20 MiB/s     42.86 c/B
        CTR dec |     13.39 ns/B     71.21 MiB/s     42.86 c/B

Old (Intel Core i5-4570):
=========================
 3DES           |  nanosecs/byte   mebibytes/sec   cycles/byte
        ECB enc |     33.24 ns/B     28.69 MiB/s     106.4 c/B
        ECB dec |     33.26 ns/B     28.67 MiB/s     106.4 c/B
        CBC enc |     34.45 ns/B     27.69 MiB/s     110.2 c/B
        CBC dec |     33.45 ns/B     28.51 MiB/s     107.1 c/B
        CFB enc |     34.43 ns/B     27.70 MiB/s     110.2 c/B
        CFB dec |     33.41 ns/B     28.55 MiB/s     106.9 c/B
        OFB enc |     33.79 ns/B     28.22 MiB/s     108.1 c/B
        OFB dec |     33.79 ns/B     28.22 MiB/s     108.1 c/B
        CTR enc |     34.27 ns/B     27.83 MiB/s     109.7 c/B
        CTR dec |     34.27 ns/B     27.83 MiB/s     109.7 c/B

New (Intel Core i5-2450M):
==========================
 3DES           |  nanosecs/byte   mebibytes/sec   cycles/byte
        ECB enc |     42.21 ns/B     22.59 MiB/s     105.5 c/B
        ECB dec |     42.23 ns/B     22.58 MiB/s     105.6 c/B
        CBC enc |     43.70 ns/B     21.82 MiB/s     109.2 c/B
        CBC dec |     23.25 ns/B     41.02 MiB/s     58.12 c/B
        CFB enc |     43.71 ns/B     21.82 MiB/s     109.3 c/B
        CFB dec |     23.23 ns/B     41.05 MiB/s     58.08 c/B
        OFB enc |     42.73 ns/B     22.32 MiB/s     106.8 c/B
        OFB dec |     42.73 ns/B     22.32 MiB/s     106.8 c/B
        CTR enc |     23.31 ns/B     40.92 MiB/s     58.27 c/B
        CTR dec |     23.35 ns/B     40.84 MiB/s     58.38 c/B

Old (Intel Core i5-2450M):
==========================
 3DES           |  nanosecs/byte   mebibytes/sec   cycles/byte
        ECB enc |     53.98 ns/B     17.67 MiB/s     134.9 c/B
        ECB dec |     54.00 ns/B     17.66 MiB/s     135.0 c/B
        CBC enc |     55.43 ns/B     17.20 MiB/s     138.6 c/B
        CBC dec |     54.27 ns/B     17.57 MiB/s     135.7 c/B
        CFB enc |     55.42 ns/B     17.21 MiB/s     138.6 c/B
        CFB dec |     54.35 ns/B     17.55 MiB/s     135.9 c/B
        OFB enc |     54.49 ns/B     17.50 MiB/s     136.2 c/B
        OFB dec |     54.49 ns/B     17.50 MiB/s     136.2 c/B
        CTR enc |     55.02 ns/B     17.33 MiB/s     137.5 c/B
        CTR dec |     55.01 ns/B     17.34 MiB/s     137.5 c/B

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 cipher/Makefile.am       |    2 
 cipher/cipher-selftest.c |   18 +
 cipher/cipher.c          |    7 
 cipher/des-amd64.S       | 1030 ++++++++++++++++++++++++++++++++++++++++++++++
 cipher/des.c             |  292 +++++++++++++
 configure.ac             |    7 
 src/cipher.h             |   13 +
 7 files changed, 1362 insertions(+), 7 deletions(-)
 create mode 100644 cipher/des-amd64.S

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 462e6db..3c20d3c 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -60,7 +60,7 @@ arcfour.c arcfour-amd64.S \
 blowfish.c blowfish-amd64.S blowfish-arm.S \
 cast5.c cast5-amd64.S cast5-arm.S \
 crc.c \
-des.c \
+des.c des-amd64.S \
 dsa.c \
 elgamal.c \
 ecc.c ecc-curves.c ecc-misc.c ecc-common.h \
diff --git a/cipher/cipher-selftest.c b/cipher/cipher-selftest.c
index 5e95814..852368a 100644
--- a/cipher/cipher-selftest.c
+++ b/cipher/cipher-selftest.c
@@ -82,7 +82,11 @@ _gcry_selftest_helper_cbc (const char *cipher, gcry_cipher_setkey_t setkey_func,
   ciphertext = plaintext2 + nblocks * blocksize;
 
   /* Initialize ctx */
-  setkey_func (ctx, key, sizeof(key));
+  if (setkey_func (ctx, key, sizeof(key)) != GPG_ERR_NO_ERROR)
+   {
+     xfree(mem);
+     return "setkey failed";
+   }
 
   /* Test single block code path */
   memset (iv, 0x4e, blocksize);
@@ -199,7 +203,11 @@ _gcry_selftest_helper_cfb (const char *cipher, gcry_cipher_setkey_t setkey_func,
   ciphertext = plaintext2 + nblocks * blocksize;
 
   /* Initialize ctx */
-  setkey_func (ctx, key, sizeof(key));
+  if (setkey_func (ctx, key, sizeof(key)) != GPG_ERR_NO_ERROR)
+   {
+     xfree(mem);
+     return "setkey failed";
+   }
 
   /* Test single block code path */
   memset(iv, 0xd3, blocksize);
@@ -316,7 +324,11 @@ _gcry_selftest_helper_ctr (const char *cipher, gcry_cipher_setkey_t setkey_func,
   ciphertext2 = ciphertext + nblocks * blocksize;
 
   /* Initialize ctx */
-  setkey_func (ctx, key, sizeof(key));
+  if (setkey_func (ctx, key, sizeof(key)) != GPG_ERR_NO_ERROR)
+   {
+     xfree(mem);
+     return "setkey failed";
+   }
 
   /* Test single block code path */
   memset (iv, 0xff, blocksize);
diff --git a/cipher/cipher.c b/cipher/cipher.c
index baa4720..6552ed3 100644
--- a/cipher/cipher.c
+++ b/cipher/cipher.c
@@ -513,6 +513,13 @@ _gcry_cipher_open_internal (gcry_cipher_hd_t *handle,
               h->bulk.ctr_enc = _gcry_camellia_ctr_enc;
               break;
 #endif /*USE_CAMELLIA*/
+#ifdef USE_DES
+            case GCRY_CIPHER_3DES:
+              h->bulk.cbc_dec =  _gcry_3des_cbc_dec;
+              h->bulk.cfb_dec =  _gcry_3des_cfb_dec;
+              h->bulk.ctr_enc =  _gcry_3des_ctr_enc;
+              break;
+#endif /*USE_DES*/
 #ifdef USE_SERPENT
 	    case GCRY_CIPHER_SERPENT128:
 	    case GCRY_CIPHER_SERPENT192:
diff --git a/cipher/des-amd64.S b/cipher/des-amd64.S
new file mode 100644
index 0000000..e8b2c56
--- /dev/null
+++ b/cipher/des-amd64.S
@@ -0,0 +1,1030 @@
+/* des-amd64.S  -  AMD64 assembly implementation of 3DES cipher
+ *
+ * Copyright (C) 2014 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifdef __x86_64
+#include <config.h>
+#if defined(USE_DES) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)
+
+#ifdef __PIC__
+#  define RIP (%rip)
+#else
+#  define RIP
+#endif
+
+.text
+
+#define s1 0
+#define s2 ((s1) + (64*8))
+#define s3 ((s2) + (64*8))
+#define s4 ((s3) + (64*8))
+#define s5 ((s4) + (64*8))
+#define s6 ((s5) + (64*8))
+#define s7 ((s6) + (64*8))
+#define s8 ((s7) + (64*8))
+
+/* register macros */
+#define CTX %rdi
+#define SBOXES %rbp
+
+#define RL0 %r8
+#define RL1 %r9
+#define RL2 %r10
+
+#define RL0d %r8d
+#define RL1d %r9d
+#define RL2d %r10d
+
+#define RR0 %r11
+#define RR1 %r12
+#define RR2 %r13
+
+#define RR0d %r11d
+#define RR1d %r12d
+#define RR2d %r13d
+
+#define RW0 %rax
+#define RW1 %rbx
+#define RW2 %rcx
+
+#define RW0d %eax
+#define RW1d %ebx
+#define RW2d %ecx
+
+#define RW0bl %al
+#define RW1bl %bl
+#define RW2bl %cl
+
+#define RW0bh %ah
+#define RW1bh %bh
+#define RW2bh %ch
+
+#define RT0 %r15
+#define RT1 %rsi
+#define RT2 %r14
+#define RT3 %rdx
+
+#define RT0d %r15d
+#define RT1d %esi
+#define RT2d %r14d
+#define RT3d %edx
+
+/***********************************************************************
+ * 1-way 3DES
+ ***********************************************************************/
+#define do_permutation(a, b, offset, mask) \
+	movl a, RT0d; \
+	shrl $(offset), RT0d; \
+	xorl b, RT0d; \
+	andl $(mask), RT0d; \
+	xorl RT0d, b; \
+	shll $(offset), RT0d; \
+	xorl RT0d, a;
+
+#define expand_to_64bits(val, mask) \
+	movl val##d, RT0d; \
+	rorl $4, RT0d; \
+	shlq $32, RT0; \
+	orq RT0, val; \
+	andq mask, val;
+
+#define compress_to_64bits(val) \
+	movq val, RT0; \
+	shrq $32, RT0; \
+	roll $4, RT0d; \
+	orl RT0d, val##d;
+
+#define initial_permutation(left, right) \
+	do_permutation(left##d, right##d,  4, 0x0f0f0f0f); \
+	do_permutation(left##d, right##d, 16, 0x0000ffff); \
+	do_permutation(right##d, left##d,  2, 0x33333333); \
+	do_permutation(right##d, left##d,  8, 0x00ff00ff); \
+	movabs $0x3f3f3f3f3f3f3f3f, RT3; \
+	movl left##d, RW0d; \
+	roll $1, right##d; \
+	xorl right##d, RW0d; \
+	andl $0xaaaaaaaa, RW0d; \
+	xorl RW0d, left##d; \
+	xorl RW0d, right##d; \
+	roll $1, left##d; \
+	expand_to_64bits(right, RT3); \
+	expand_to_64bits(left, RT3);
+
+#define final_permutation(left, right) \
+	compress_to_64bits(right); \
+	compress_to_64bits(left); \
+	movl right##d, RW0d; \
+	rorl $1, left##d; \
+	xorl left##d, RW0d; \
+	andl $0xaaaaaaaa, RW0d; \
+	xorl RW0d, right##d; \
+	xorl RW0d, left##d; \
+	rorl $1, right##d; \
+	do_permutation(right##d, left##d,  8, 0x00ff00ff); \
+	do_permutation(right##d, left##d,  2, 0x33333333); \
+	do_permutation(left##d, right##d, 16, 0x0000ffff); \
+	do_permutation(left##d, right##d,  4, 0x0f0f0f0f);
+
+#define round1(n, from, to, load_next_key) \
+	xorq from, RW0; \
+	\
+	movzbl RW0bl, RT0d; \
+	movzbl RW0bh, RT1d; \
+	shrq $16, RW0; \
+	movzbl RW0bl, RT2d; \
+	movzbl RW0bh, RT3d; \
+	shrq $16, RW0; \
+	movq s8(SBOXES, RT0, 8), RT0; \
+	xorq s6(SBOXES, RT1, 8), to; \
+	movzbl RW0bl, RL1d; \
+	movzbl RW0bh, RT1d; \
+	shrl $16, RW0d; \
+	xorq s4(SBOXES, RT2, 8), RT0; \
+	xorq s2(SBOXES, RT3, 8), to; \
+	movzbl RW0bl, RT2d; \
+	movzbl RW0bh, RT3d; \
+	xorq s7(SBOXES, RL1, 8), RT0; \
+	xorq s5(SBOXES, RT1, 8), to; \
+	xorq s3(SBOXES, RT2, 8), RT0; \
+	load_next_key(n, RW0); \
+	xorq RT0, to; \
+	xorq s1(SBOXES, RT3, 8), to; \
+
+#define load_next_key(n, RWx) \
+	movq (((n) + 1) * 8)(CTX), RWx;
+
+#define dummy2(a, b) /*_*/
+
+#define read_block(io, left, right) \
+	movl    (io), left##d; \
+	movl   4(io), right##d; \
+	bswapl left##d; \
+	bswapl right##d;
+
+#define write_block(io, left, right) \
+	bswapl left##d; \
+	bswapl right##d; \
+	movl   left##d,   (io); \
+	movl   right##d, 4(io);
+
+.align 8
+.globl _gcry_3des_amd64_crypt_block
+.type  _gcry_3des_amd64_crypt_block, at function;
+
+_gcry_3des_amd64_crypt_block:
+	/* input:
+	 *	%rdi: round keys, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+	pushq %rbp;
+	pushq %rbx;
+	pushq %r12;
+	pushq %r13;
+	pushq %r14;
+	pushq %r15;
+	pushq %rsi; /*dst*/
+
+	leaq .L_s1 RIP, SBOXES;
+
+	read_block(%rdx, RL0, RR0);
+	initial_permutation(RL0, RR0);
+
+	movq (CTX), RW0;
+
+	round1(0, RR0, RL0, load_next_key);
+	round1(1, RL0, RR0, load_next_key);
+	round1(2, RR0, RL0, load_next_key);
+	round1(3, RL0, RR0, load_next_key);
+	round1(4, RR0, RL0, load_next_key);
+	round1(5, RL0, RR0, load_next_key);
+	round1(6, RR0, RL0, load_next_key);
+	round1(7, RL0, RR0, load_next_key);
+	round1(8, RR0, RL0, load_next_key);
+	round1(9, RL0, RR0, load_next_key);
+	round1(10, RR0, RL0, load_next_key);
+	round1(11, RL0, RR0, load_next_key);
+	round1(12, RR0, RL0, load_next_key);
+	round1(13, RL0, RR0, load_next_key);
+	round1(14, RR0, RL0, load_next_key);
+	round1(15, RL0, RR0, load_next_key);
+
+	round1(16+0, RL0, RR0, load_next_key);
+	round1(16+1, RR0, RL0, load_next_key);
+	round1(16+2, RL0, RR0, load_next_key);
+	round1(16+3, RR0, RL0, load_next_key);
+	round1(16+4, RL0, RR0, load_next_key);
+	round1(16+5, RR0, RL0, load_next_key);
+	round1(16+6, RL0, RR0, load_next_key);
+	round1(16+7, RR0, RL0, load_next_key);
+	round1(16+8, RL0, RR0, load_next_key);
+	round1(16+9, RR0, RL0, load_next_key);
+	round1(16+10, RL0, RR0, load_next_key);
+	round1(16+11, RR0, RL0, load_next_key);
+	round1(16+12, RL0, RR0, load_next_key);
+	round1(16+13, RR0, RL0, load_next_key);
+	round1(16+14, RL0, RR0, load_next_key);
+	round1(16+15, RR0, RL0, load_next_key);
+
+	round1(32+0, RR0, RL0, load_next_key);
+	round1(32+1, RL0, RR0, load_next_key);
+	round1(32+2, RR0, RL0, load_next_key);
+	round1(32+3, RL0, RR0, load_next_key);
+	round1(32+4, RR0, RL0, load_next_key);
+	round1(32+5, RL0, RR0, load_next_key);
+	round1(32+6, RR0, RL0, load_next_key);
+	round1(32+7, RL0, RR0, load_next_key);
+	round1(32+8, RR0, RL0, load_next_key);
+	round1(32+9, RL0, RR0, load_next_key);
+	round1(32+10, RR0, RL0, load_next_key);
+	round1(32+11, RL0, RR0, load_next_key);
+	round1(32+12, RR0, RL0, load_next_key);
+	round1(32+13, RL0, RR0, load_next_key);
+	round1(32+14, RR0, RL0, load_next_key);
+	round1(32+15, RL0, RR0, dummy2);
+
+	popq RW2; /*dst*/
+	final_permutation(RR0, RL0);
+	write_block(RW2, RR0, RL0);
+
+	popq %r15;
+	popq %r14;
+	popq %r13;
+	popq %r12;
+	popq %rbx;
+	popq %rbp;
+
+	ret;
+.size _gcry_3des_amd64_crypt_block,.-_gcry_3des_amd64_crypt_block;
+
+/***********************************************************************
+ * 3-way 3DES
+ ***********************************************************************/
+#define expand_to_64bits(val, mask) \
+	movl val##d, RT0d; \
+	rorl $4, RT0d; \
+	shlq $32, RT0; \
+	orq RT0, val; \
+	andq mask, val;
+
+#define compress_to_64bits(val) \
+	movq val, RT0; \
+	shrq $32, RT0; \
+	roll $4, RT0d; \
+	orl RT0d, val##d;
+
+#define initial_permutation3(left, right) \
+	do_permutation(left##0d, right##0d,  4, 0x0f0f0f0f); \
+	do_permutation(left##0d, right##0d, 16, 0x0000ffff); \
+	  do_permutation(left##1d, right##1d,  4, 0x0f0f0f0f); \
+	  do_permutation(left##1d, right##1d, 16, 0x0000ffff); \
+	    do_permutation(left##2d, right##2d,  4, 0x0f0f0f0f); \
+	    do_permutation(left##2d, right##2d, 16, 0x0000ffff); \
+	    \
+	do_permutation(right##0d, left##0d,  2, 0x33333333); \
+	do_permutation(right##0d, left##0d,  8, 0x00ff00ff); \
+	  do_permutation(right##1d, left##1d,  2, 0x33333333); \
+	  do_permutation(right##1d, left##1d,  8, 0x00ff00ff); \
+	    do_permutation(right##2d, left##2d,  2, 0x33333333); \
+	    do_permutation(right##2d, left##2d,  8, 0x00ff00ff); \
+	    \
+	movabs $0x3f3f3f3f3f3f3f3f, RT3; \
+	    \
+	movl left##0d, RW0d; \
+	roll $1, right##0d; \
+	xorl right##0d, RW0d; \
+	andl $0xaaaaaaaa, RW0d; \
+	xorl RW0d, left##0d; \
+	xorl RW0d, right##0d; \
+	roll $1, left##0d; \
+	expand_to_64bits(right##0, RT3); \
+	expand_to_64bits(left##0, RT3); \
+	  movl left##1d, RW1d; \
+	  roll $1, right##1d; \
+	  xorl right##1d, RW1d; \
+	  andl $0xaaaaaaaa, RW1d; \
+	  xorl RW1d, left##1d; \
+	  xorl RW1d, right##1d; \
+	  roll $1, left##1d; \
+	  expand_to_64bits(right##1, RT3); \
+	  expand_to_64bits(left##1, RT3); \
+	    movl left##2d, RW2d; \
+	    roll $1, right##2d; \
+	    xorl right##2d, RW2d; \
+	    andl $0xaaaaaaaa, RW2d; \
+	    xorl RW2d, left##2d; \
+	    xorl RW2d, right##2d; \
+	    roll $1, left##2d; \
+	    expand_to_64bits(right##2, RT3); \
+	    expand_to_64bits(left##2, RT3);
+
+#define final_permutation3(left, right) \
+	compress_to_64bits(right##0); \
+	compress_to_64bits(left##0); \
+	movl right##0d, RW0d; \
+	rorl $1, left##0d; \
+	xorl left##0d, RW0d; \
+	andl $0xaaaaaaaa, RW0d; \
+	xorl RW0d, right##0d; \
+	xorl RW0d, left##0d; \
+	rorl $1, right##0d; \
+	  compress_to_64bits(right##1); \
+	  compress_to_64bits(left##1); \
+	  movl right##1d, RW1d; \
+	  rorl $1, left##1d; \
+	  xorl left##1d, RW1d; \
+	  andl $0xaaaaaaaa, RW1d; \
+	  xorl RW1d, right##1d; \
+	  xorl RW1d, left##1d; \
+	  rorl $1, right##1d; \
+	    compress_to_64bits(right##2); \
+	    compress_to_64bits(left##2); \
+	    movl right##2d, RW2d; \
+	    rorl $1, left##2d; \
+	    xorl left##2d, RW2d; \
+	    andl $0xaaaaaaaa, RW2d; \
+	    xorl RW2d, right##2d; \
+	    xorl RW2d, left##2d; \
+	    rorl $1, right##2d; \
+	    \
+	do_permutation(right##0d, left##0d,  8, 0x00ff00ff); \
+	do_permutation(right##0d, left##0d,  2, 0x33333333); \
+	  do_permutation(right##1d, left##1d,  8, 0x00ff00ff); \
+	  do_permutation(right##1d, left##1d,  2, 0x33333333); \
+	    do_permutation(right##2d, left##2d,  8, 0x00ff00ff); \
+	    do_permutation(right##2d, left##2d,  2, 0x33333333); \
+	    \
+	do_permutation(left##0d, right##0d, 16, 0x0000ffff); \
+	do_permutation(left##0d, right##0d,  4, 0x0f0f0f0f); \
+	  do_permutation(left##1d, right##1d, 16, 0x0000ffff); \
+	  do_permutation(left##1d, right##1d,  4, 0x0f0f0f0f); \
+	    do_permutation(left##2d, right##2d, 16, 0x0000ffff); \
+	    do_permutation(left##2d, right##2d,  4, 0x0f0f0f0f);
+
+#define round3(n, from, to, load_next_key, do_movq) \
+	xorq from##0, RW0; \
+	movzbl RW0bl, RT3d; \
+	movzbl RW0bh, RT1d; \
+	shrq $16, RW0; \
+	xorq s8(SBOXES, RT3, 8), to##0; \
+	xorq s6(SBOXES, RT1, 8), to##0; \
+	movzbl RW0bl, RT3d; \
+	movzbl RW0bh, RT1d; \
+	shrq $16, RW0; \
+	xorq s4(SBOXES, RT3, 8), to##0; \
+	xorq s2(SBOXES, RT1, 8), to##0; \
+	movzbl RW0bl, RT3d; \
+	movzbl RW0bh, RT1d; \
+	shrl $16, RW0d; \
+	xorq s7(SBOXES, RT3, 8), to##0; \
+	xorq s5(SBOXES, RT1, 8), to##0; \
+	movzbl RW0bl, RT3d; \
+	movzbl RW0bh, RT1d; \
+	load_next_key(n, RW0); \
+	xorq s3(SBOXES, RT3, 8), to##0; \
+	xorq s1(SBOXES, RT1, 8), to##0; \
+		xorq from##1, RW1; \
+		movzbl RW1bl, RT3d; \
+		movzbl RW1bh, RT1d; \
+		shrq $16, RW1; \
+		xorq s8(SBOXES, RT3, 8), to##1; \
+		xorq s6(SBOXES, RT1, 8), to##1; \
+		movzbl RW1bl, RT3d; \
+		movzbl RW1bh, RT1d; \
+		shrq $16, RW1; \
+		xorq s4(SBOXES, RT3, 8), to##1; \
+		xorq s2(SBOXES, RT1, 8), to##1; \
+		movzbl RW1bl, RT3d; \
+		movzbl RW1bh, RT1d; \
+		shrl $16, RW1d; \
+		xorq s7(SBOXES, RT3, 8), to##1; \
+		xorq s5(SBOXES, RT1, 8), to##1; \
+		movzbl RW1bl, RT3d; \
+		movzbl RW1bh, RT1d; \
+		do_movq(RW0, RW1); \
+		xorq s3(SBOXES, RT3, 8), to##1; \
+		xorq s1(SBOXES, RT1, 8), to##1; \
+			xorq from##2, RW2; \
+			movzbl RW2bl, RT3d; \
+			movzbl RW2bh, RT1d; \
+			shrq $16, RW2; \
+			xorq s8(SBOXES, RT3, 8), to##2; \
+			xorq s6(SBOXES, RT1, 8), to##2; \
+			movzbl RW2bl, RT3d; \
+			movzbl RW2bh, RT1d; \
+			shrq $16, RW2; \
+			xorq s4(SBOXES, RT3, 8), to##2; \
+			xorq s2(SBOXES, RT1, 8), to##2; \
+			movzbl RW2bl, RT3d; \
+			movzbl RW2bh, RT1d; \
+			shrl $16, RW2d; \
+			xorq s7(SBOXES, RT3, 8), to##2; \
+			xorq s5(SBOXES, RT1, 8), to##2; \
+			movzbl RW2bl, RT3d; \
+			movzbl RW2bh, RT1d; \
+			do_movq(RW0, RW2); \
+			xorq s3(SBOXES, RT3, 8), to##2; \
+			xorq s1(SBOXES, RT1, 8), to##2;
+
+#define __movq(src, dst) \
+	movq src, dst;
+
+#define read_block(io, left, right) \
+	movl    (io), left##d; \
+	movl   4(io), right##d; \
+	bswapl left##d; \
+	bswapl right##d;
+
+#define write_block(io, left, right) \
+	bswapl left##d; \
+	bswapl right##d; \
+	movl   left##d,   (io); \
+	movl   right##d, 4(io);
+
+.align 8
+.type  _gcry_3des_amd64_crypt_blk3, at function;
+_gcry_3des_amd64_crypt_blk3:
+	/* input:
+	 *  %rdi: round keys, CTX
+	 *  RL0d, RR0d, RL1d, RR1d, RL2d, RR2d: 3 input blocks
+	 *  RR0d, RL0d, RR1d, RL1d, RR2d, RL2d: 3 output blocks
+	 */
+
+	leaq .L_s1 RIP, SBOXES;
+
+	initial_permutation3(RL, RR);
+
+	movq 0(CTX), RW0;
+	movq RW0, RW1;
+	movq RW0, RW2;
+
+	round3(0, RR, RL, load_next_key, __movq);
+	round3(1, RL, RR, load_next_key, __movq);
+	round3(2, RR, RL, load_next_key, __movq);
+	round3(3, RL, RR, load_next_key, __movq);
+	round3(4, RR, RL, load_next_key, __movq);
+	round3(5, RL, RR, load_next_key, __movq);
+	round3(6, RR, RL, load_next_key, __movq);
+	round3(7, RL, RR, load_next_key, __movq);
+	round3(8, RR, RL, load_next_key, __movq);
+	round3(9, RL, RR, load_next_key, __movq);
+	round3(10, RR, RL, load_next_key, __movq);
+	round3(11, RL, RR, load_next_key, __movq);
+	round3(12, RR, RL, load_next_key, __movq);
+	round3(13, RL, RR, load_next_key, __movq);
+	round3(14, RR, RL, load_next_key, __movq);
+	round3(15, RL, RR, load_next_key, __movq);
+
+	round3(16+0, RL, RR, load_next_key, __movq);
+	round3(16+1, RR, RL, load_next_key, __movq);
+	round3(16+2, RL, RR, load_next_key, __movq);
+	round3(16+3, RR, RL, load_next_key, __movq);
+	round3(16+4, RL, RR, load_next_key, __movq);
+	round3(16+5, RR, RL, load_next_key, __movq);
+	round3(16+6, RL, RR, load_next_key, __movq);
+	round3(16+7, RR, RL, load_next_key, __movq);
+	round3(16+8, RL, RR, load_next_key, __movq);
+	round3(16+9, RR, RL, load_next_key, __movq);
+	round3(16+10, RL, RR, load_next_key, __movq);
+	round3(16+11, RR, RL, load_next_key, __movq);
+	round3(16+12, RL, RR, load_next_key, __movq);
+	round3(16+13, RR, RL, load_next_key, __movq);
+	round3(16+14, RL, RR, load_next_key, __movq);
+	round3(16+15, RR, RL, load_next_key, __movq);
+
+	round3(32+0, RR, RL, load_next_key, __movq);
+	round3(32+1, RL, RR, load_next_key, __movq);
+	round3(32+2, RR, RL, load_next_key, __movq);
+	round3(32+3, RL, RR, load_next_key, __movq);
+	round3(32+4, RR, RL, load_next_key, __movq);
+	round3(32+5, RL, RR, load_next_key, __movq);
+	round3(32+6, RR, RL, load_next_key, __movq);
+	round3(32+7, RL, RR, load_next_key, __movq);
+	round3(32+8, RR, RL, load_next_key, __movq);
+	round3(32+9, RL, RR, load_next_key, __movq);
+	round3(32+10, RR, RL, load_next_key, __movq);
+	round3(32+11, RL, RR, load_next_key, __movq);
+	round3(32+12, RR, RL, load_next_key, __movq);
+	round3(32+13, RL, RR, load_next_key, __movq);
+	round3(32+14, RR, RL, load_next_key, __movq);
+	round3(32+15, RL, RR, dummy2, dummy2);
+
+	final_permutation3(RR, RL);
+
+	ret;
+.size _gcry_3des_amd64_crypt_blk3,.-_gcry_3des_amd64_crypt_blk3;
+
+.align 8
+.globl  _gcry_3des_amd64_cbc_dec
+.type   _gcry_3des_amd64_cbc_dec, at function;
+_gcry_3des_amd64_cbc_dec:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (3 blocks)
+	 *	%rdx: src (3 blocks)
+	 *	%rcx: iv (64bit)
+	 */
+
+	pushq %rbp;
+	pushq %rbx;
+	pushq %r12;
+	pushq %r13;
+	pushq %r14;
+	pushq %r15;
+
+	pushq %rsi; /*dst*/
+	pushq %rdx; /*src*/
+	pushq %rcx; /*iv*/
+
+	/* load input */
+	movl 0 * 4(%rdx), RL0d;
+	movl 1 * 4(%rdx), RR0d;
+	movl 2 * 4(%rdx), RL1d;
+	movl 3 * 4(%rdx), RR1d;
+	movl 4 * 4(%rdx), RL2d;
+	movl 5 * 4(%rdx), RR2d;
+
+	bswapl RL0d;
+	bswapl RR0d;
+	bswapl RL1d;
+	bswapl RR1d;
+	bswapl RL2d;
+	bswapl RR2d;
+
+	call _gcry_3des_amd64_crypt_blk3;
+
+	popq %rcx; /*iv*/
+	popq %rdx; /*src*/
+	popq %rsi; /*dst*/
+
+	bswapl RR0d;
+	bswapl RL0d;
+	bswapl RR1d;
+	bswapl RL1d;
+	bswapl RR2d;
+	bswapl RL2d;
+
+	movq 2 * 8(%rdx), RT0;
+	xorl 0 * 4(%rcx), RR0d;
+	xorl 1 * 4(%rcx), RL0d;
+	xorl 0 * 4(%rdx), RR1d;
+	xorl 1 * 4(%rdx), RL1d;
+	xorl 2 * 4(%rdx), RR2d;
+	xorl 3 * 4(%rdx), RL2d;
+	movq RT0, (%rcx); /* store new IV */
+
+	movl RR0d, 0 * 4(%rsi);
+	movl RL0d, 1 * 4(%rsi);
+	movl RR1d, 2 * 4(%rsi);
+	movl RL1d, 3 * 4(%rsi);
+	movl RR2d, 4 * 4(%rsi);
+	movl RL2d, 5 * 4(%rsi);
+
+	popq %r15;
+	popq %r14;
+	popq %r13;
+	popq %r12;
+	popq %rbx;
+	popq %rbp;
+
+	ret;
+.size _gcry_3des_amd64_cbc_dec,.-_gcry_3des_amd64_cbc_dec;
+
+.align 8
+.globl  _gcry_3des_amd64_ctr_enc
+.type   _gcry_3des_amd64_ctr_enc, at function;
+_gcry_3des_amd64_ctr_enc:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (3 blocks)
+	 *	%rdx: src (3 blocks)
+	 *	%rcx: iv (64bit)
+	 */
+
+	pushq %rbp;
+	pushq %rbx;
+	pushq %r12;
+	pushq %r13;
+	pushq %r14;
+	pushq %r15;
+
+	pushq %rsi; /*dst*/
+	pushq %rdx; /*src*/
+	movq %rcx, RW2;
+
+	/* load IV and byteswap */
+	movq (RW2), RT0;
+	bswapq RT0;
+	movq RT0, RR0;
+
+	/* construct IVs */
+	leaq 1(RT0), RR1;
+	leaq 2(RT0), RR2;
+	leaq 3(RT0), RT0;
+	movq RR0, RL0;
+	movq RR1, RL1;
+	movq RR2, RL2;
+	bswapq RT0;
+	shrq $32, RL0;
+	shrq $32, RL1;
+	shrq $32, RL2;
+
+	/* store new IV */
+	movq RT0, (RW2);
+
+	call _gcry_3des_amd64_crypt_blk3;
+
+	popq %rdx; /*src*/
+	popq %rsi; /*dst*/
+
+	bswapl RR0d;
+	bswapl RL0d;
+	bswapl RR1d;
+	bswapl RL1d;
+	bswapl RR2d;
+	bswapl RL2d;
+
+	xorl 0 * 4(%rdx), RR0d;
+	xorl 1 * 4(%rdx), RL0d;
+	xorl 2 * 4(%rdx), RR1d;
+	xorl 3 * 4(%rdx), RL1d;
+	xorl 4 * 4(%rdx), RR2d;
+	xorl 5 * 4(%rdx), RL2d;
+
+	movl RR0d, 0 * 4(%rsi);
+	movl RL0d, 1 * 4(%rsi);
+	movl RR1d, 2 * 4(%rsi);
+	movl RL1d, 3 * 4(%rsi);
+	movl RR2d, 4 * 4(%rsi);
+	movl RL2d, 5 * 4(%rsi);
+
+	popq %r15;
+	popq %r14;
+	popq %r13;
+	popq %r12;
+	popq %rbx;
+	popq %rbp;
+
+	ret;
+.size _gcry_3des_amd64_cbc_dec,.-_gcry_3des_amd64_cbc_dec;
+
+.align 8
+.globl  _gcry_3des_amd64_cfb_dec
+.type   _gcry_3des_amd64_cfb_dec, at function;
+_gcry_3des_amd64_cfb_dec:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (3 blocks)
+	 *	%rdx: src (3 blocks)
+	 *	%rcx: iv (64bit)
+	 */
+	pushq %rbp;
+	pushq %rbx;
+	pushq %r12;
+	pushq %r13;
+	pushq %r14;
+	pushq %r15;
+
+	pushq %rsi; /*dst*/
+	pushq %rdx; /*src*/
+	movq %rcx, RW2;
+
+	/* Load input */
+	movl 0 * 4(RW2), RL0d;
+	movl 1 * 4(RW2), RR0d;
+	movl 0 * 4(%rdx), RL1d;
+	movl 1 * 4(%rdx), RR1d;
+	movl 2 * 4(%rdx), RL2d;
+	movl 3 * 4(%rdx), RR2d;
+
+	bswapl RL0d;
+	bswapl RR0d;
+	bswapl RL1d;
+	bswapl RR1d;
+	bswapl RL2d;
+	bswapl RR2d;
+
+	/* Update IV */
+	movq 4 * 4(%rdx), RW0;
+	movq RW0, (RW2);
+
+	call _gcry_3des_amd64_crypt_blk3;
+
+	popq %rdx; /*src*/
+	popq %rsi; /*dst*/
+
+	bswapl RR0d;
+	bswapl RL0d;
+	bswapl RR1d;
+	bswapl RL1d;
+	bswapl RR2d;
+	bswapl RL2d;
+
+	xorl 0 * 4(%rdx), RR0d;
+	xorl 1 * 4(%rdx), RL0d;
+	xorl 2 * 4(%rdx), RR1d;
+	xorl 3 * 4(%rdx), RL1d;
+	xorl 4 * 4(%rdx), RR2d;
+	xorl 5 * 4(%rdx), RL2d;
+
+	movl RR0d, 0 * 4(%rsi);
+	movl RL0d, 1 * 4(%rsi);
+	movl RR1d, 2 * 4(%rsi);
+	movl RL1d, 3 * 4(%rsi);
+	movl RR2d, 4 * 4(%rsi);
+	movl RL2d, 5 * 4(%rsi);
+
+	popq %r15;
+	popq %r14;
+	popq %r13;
+	popq %r12;
+	popq %rbx;
+	popq %rbp;
+	ret;
+.size _gcry_3des_amd64_cfb_dec,.-_gcry_3des_amd64_cfb_dec;
+
+.data
+.align 16
+.L_s1:
+	.quad 0x0010100001010400, 0x0000000000000000
+	.quad 0x0000100000010000, 0x0010100001010404
+	.quad 0x0010100001010004, 0x0000100000010404
+	.quad 0x0000000000000004, 0x0000100000010000
+	.quad 0x0000000000000400, 0x0010100001010400
+	.quad 0x0010100001010404, 0x0000000000000400
+	.quad 0x0010000001000404, 0x0010100001010004
+	.quad 0x0010000001000000, 0x0000000000000004
+	.quad 0x0000000000000404, 0x0010000001000400
+	.quad 0x0010000001000400, 0x0000100000010400
+	.quad 0x0000100000010400, 0x0010100001010000
+	.quad 0x0010100001010000, 0x0010000001000404
+	.quad 0x0000100000010004, 0x0010000001000004
+	.quad 0x0010000001000004, 0x0000100000010004
+	.quad 0x0000000000000000, 0x0000000000000404
+	.quad 0x0000100000010404, 0x0010000001000000
+	.quad 0x0000100000010000, 0x0010100001010404
+	.quad 0x0000000000000004, 0x0010100001010000
+	.quad 0x0010100001010400, 0x0010000001000000
+	.quad 0x0010000001000000, 0x0000000000000400
+	.quad 0x0010100001010004, 0x0000100000010000
+	.quad 0x0000100000010400, 0x0010000001000004
+	.quad 0x0000000000000400, 0x0000000000000004
+	.quad 0x0010000001000404, 0x0000100000010404
+	.quad 0x0010100001010404, 0x0000100000010004
+	.quad 0x0010100001010000, 0x0010000001000404
+	.quad 0x0010000001000004, 0x0000000000000404
+	.quad 0x0000100000010404, 0x0010100001010400
+	.quad 0x0000000000000404, 0x0010000001000400
+	.quad 0x0010000001000400, 0x0000000000000000
+	.quad 0x0000100000010004, 0x0000100000010400
+	.quad 0x0000000000000000, 0x0010100001010004
+.L_s2:
+	.quad 0x0801080200100020, 0x0800080000000000
+	.quad 0x0000080000000000, 0x0001080200100020
+	.quad 0x0001000000100000, 0x0000000200000020
+	.quad 0x0801000200100020, 0x0800080200000020
+	.quad 0x0800000200000020, 0x0801080200100020
+	.quad 0x0801080000100000, 0x0800000000000000
+	.quad 0x0800080000000000, 0x0001000000100000
+	.quad 0x0000000200000020, 0x0801000200100020
+	.quad 0x0001080000100000, 0x0001000200100020
+	.quad 0x0800080200000020, 0x0000000000000000
+	.quad 0x0800000000000000, 0x0000080000000000
+	.quad 0x0001080200100020, 0x0801000000100000
+	.quad 0x0001000200100020, 0x0800000200000020
+	.quad 0x0000000000000000, 0x0001080000100000
+	.quad 0x0000080200000020, 0x0801080000100000
+	.quad 0x0801000000100000, 0x0000080200000020
+	.quad 0x0000000000000000, 0x0001080200100020
+	.quad 0x0801000200100020, 0x0001000000100000
+	.quad 0x0800080200000020, 0x0801000000100000
+	.quad 0x0801080000100000, 0x0000080000000000
+	.quad 0x0801000000100000, 0x0800080000000000
+	.quad 0x0000000200000020, 0x0801080200100020
+	.quad 0x0001080200100020, 0x0000000200000020
+	.quad 0x0000080000000000, 0x0800000000000000
+	.quad 0x0000080200000020, 0x0801080000100000
+	.quad 0x0001000000100000, 0x0800000200000020
+	.quad 0x0001000200100020, 0x0800080200000020
+	.quad 0x0800000200000020, 0x0001000200100020
+	.quad 0x0001080000100000, 0x0000000000000000
+	.quad 0x0800080000000000, 0x0000080200000020
+	.quad 0x0800000000000000, 0x0801000200100020
+	.quad 0x0801080200100020, 0x0001080000100000
+.L_s3:
+	.quad 0x0000002000000208, 0x0000202008020200
+	.quad 0x0000000000000000, 0x0000200008020008
+	.quad 0x0000002008000200, 0x0000000000000000
+	.quad 0x0000202000020208, 0x0000002008000200
+	.quad 0x0000200000020008, 0x0000000008000008
+	.quad 0x0000000008000008, 0x0000200000020000
+	.quad 0x0000202008020208, 0x0000200000020008
+	.quad 0x0000200008020000, 0x0000002000000208
+	.quad 0x0000000008000000, 0x0000000000000008
+	.quad 0x0000202008020200, 0x0000002000000200
+	.quad 0x0000202000020200, 0x0000200008020000
+	.quad 0x0000200008020008, 0x0000202000020208
+	.quad 0x0000002008000208, 0x0000202000020200
+	.quad 0x0000200000020000, 0x0000002008000208
+	.quad 0x0000000000000008, 0x0000202008020208
+	.quad 0x0000002000000200, 0x0000000008000000
+	.quad 0x0000202008020200, 0x0000000008000000
+	.quad 0x0000200000020008, 0x0000002000000208
+	.quad 0x0000200000020000, 0x0000202008020200
+	.quad 0x0000002008000200, 0x0000000000000000
+	.quad 0x0000002000000200, 0x0000200000020008
+	.quad 0x0000202008020208, 0x0000002008000200
+	.quad 0x0000000008000008, 0x0000002000000200
+	.quad 0x0000000000000000, 0x0000200008020008
+	.quad 0x0000002008000208, 0x0000200000020000
+	.quad 0x0000000008000000, 0x0000202008020208
+	.quad 0x0000000000000008, 0x0000202000020208
+	.quad 0x0000202000020200, 0x0000000008000008
+	.quad 0x0000200008020000, 0x0000002008000208
+	.quad 0x0000002000000208, 0x0000200008020000
+	.quad 0x0000202000020208, 0x0000000000000008
+	.quad 0x0000200008020008, 0x0000202000020200
+.L_s4:
+	.quad 0x1008020000002001, 0x1000020800002001
+	.quad 0x1000020800002001, 0x0000000800000000
+	.quad 0x0008020800002000, 0x1008000800000001
+	.quad 0x1008000000000001, 0x1000020000002001
+	.quad 0x0000000000000000, 0x0008020000002000
+	.quad 0x0008020000002000, 0x1008020800002001
+	.quad 0x1000000800000001, 0x0000000000000000
+	.quad 0x0008000800000000, 0x1008000000000001
+	.quad 0x1000000000000001, 0x0000020000002000
+	.quad 0x0008000000000000, 0x1008020000002001
+	.quad 0x0000000800000000, 0x0008000000000000
+	.quad 0x1000020000002001, 0x0000020800002000
+	.quad 0x1008000800000001, 0x1000000000000001
+	.quad 0x0000020800002000, 0x0008000800000000
+	.quad 0x0000020000002000, 0x0008020800002000
+	.quad 0x1008020800002001, 0x1000000800000001
+	.quad 0x0008000800000000, 0x1008000000000001
+	.quad 0x0008020000002000, 0x1008020800002001
+	.quad 0x1000000800000001, 0x0000000000000000
+	.quad 0x0000000000000000, 0x0008020000002000
+	.quad 0x0000020800002000, 0x0008000800000000
+	.quad 0x1008000800000001, 0x1000000000000001
+	.quad 0x1008020000002001, 0x1000020800002001
+	.quad 0x1000020800002001, 0x0000000800000000
+	.quad 0x1008020800002001, 0x1000000800000001
+	.quad 0x1000000000000001, 0x0000020000002000
+	.quad 0x1008000000000001, 0x1000020000002001
+	.quad 0x0008020800002000, 0x1008000800000001
+	.quad 0x1000020000002001, 0x0000020800002000
+	.quad 0x0008000000000000, 0x1008020000002001
+	.quad 0x0000000800000000, 0x0008000000000000
+	.quad 0x0000020000002000, 0x0008020800002000
+.L_s5:
+	.quad 0x0000001000000100, 0x0020001002080100
+	.quad 0x0020000002080000, 0x0420001002000100
+	.quad 0x0000000000080000, 0x0000001000000100
+	.quad 0x0400000000000000, 0x0020000002080000
+	.quad 0x0400001000080100, 0x0000000000080000
+	.quad 0x0020001002000100, 0x0400001000080100
+	.quad 0x0420001002000100, 0x0420000002080000
+	.quad 0x0000001000080100, 0x0400000000000000
+	.quad 0x0020000002000000, 0x0400000000080000
+	.quad 0x0400000000080000, 0x0000000000000000
+	.quad 0x0400001000000100, 0x0420001002080100
+	.quad 0x0420001002080100, 0x0020001002000100
+	.quad 0x0420000002080000, 0x0400001000000100
+	.quad 0x0000000000000000, 0x0420000002000000
+	.quad 0x0020001002080100, 0x0020000002000000
+	.quad 0x0420000002000000, 0x0000001000080100
+	.quad 0x0000000000080000, 0x0420001002000100
+	.quad 0x0000001000000100, 0x0020000002000000
+	.quad 0x0400000000000000, 0x0020000002080000
+	.quad 0x0420001002000100, 0x0400001000080100
+	.quad 0x0020001002000100, 0x0400000000000000
+	.quad 0x0420000002080000, 0x0020001002080100
+	.quad 0x0400001000080100, 0x0000001000000100
+	.quad 0x0020000002000000, 0x0420000002080000
+	.quad 0x0420001002080100, 0x0000001000080100
+	.quad 0x0420000002000000, 0x0420001002080100
+	.quad 0x0020000002080000, 0x0000000000000000
+	.quad 0x0400000000080000, 0x0420000002000000
+	.quad 0x0000001000080100, 0x0020001002000100
+	.quad 0x0400001000000100, 0x0000000000080000
+	.quad 0x0000000000000000, 0x0400000000080000
+	.quad 0x0020001002080100, 0x0400001000000100
+.L_s6:
+	.quad 0x0200000120000010, 0x0204000020000000
+	.quad 0x0000040000000000, 0x0204040120000010
+	.quad 0x0204000020000000, 0x0000000100000010
+	.quad 0x0204040120000010, 0x0004000000000000
+	.quad 0x0200040020000000, 0x0004040100000010
+	.quad 0x0004000000000000, 0x0200000120000010
+	.quad 0x0004000100000010, 0x0200040020000000
+	.quad 0x0200000020000000, 0x0000040100000010
+	.quad 0x0000000000000000, 0x0004000100000010
+	.quad 0x0200040120000010, 0x0000040000000000
+	.quad 0x0004040000000000, 0x0200040120000010
+	.quad 0x0000000100000010, 0x0204000120000010
+	.quad 0x0204000120000010, 0x0000000000000000
+	.quad 0x0004040100000010, 0x0204040020000000
+	.quad 0x0000040100000010, 0x0004040000000000
+	.quad 0x0204040020000000, 0x0200000020000000
+	.quad 0x0200040020000000, 0x0000000100000010
+	.quad 0x0204000120000010, 0x0004040000000000
+	.quad 0x0204040120000010, 0x0004000000000000
+	.quad 0x0000040100000010, 0x0200000120000010
+	.quad 0x0004000000000000, 0x0200040020000000
+	.quad 0x0200000020000000, 0x0000040100000010
+	.quad 0x0200000120000010, 0x0204040120000010
+	.quad 0x0004040000000000, 0x0204000020000000
+	.quad 0x0004040100000010, 0x0204040020000000
+	.quad 0x0000000000000000, 0x0204000120000010
+	.quad 0x0000000100000010, 0x0000040000000000
+	.quad 0x0204000020000000, 0x0004040100000010
+	.quad 0x0000040000000000, 0x0004000100000010
+	.quad 0x0200040120000010, 0x0000000000000000
+	.quad 0x0204040020000000, 0x0200000020000000
+	.quad 0x0004000100000010, 0x0200040120000010
+.L_s7:
+	.quad 0x0002000000200000, 0x2002000004200002
+	.quad 0x2000000004000802, 0x0000000000000000
+	.quad 0x0000000000000800, 0x2000000004000802
+	.quad 0x2002000000200802, 0x0002000004200800
+	.quad 0x2002000004200802, 0x0002000000200000
+	.quad 0x0000000000000000, 0x2000000004000002
+	.quad 0x2000000000000002, 0x0000000004000000
+	.quad 0x2002000004200002, 0x2000000000000802
+	.quad 0x0000000004000800, 0x2002000000200802
+	.quad 0x2002000000200002, 0x0000000004000800
+	.quad 0x2000000004000002, 0x0002000004200000
+	.quad 0x0002000004200800, 0x2002000000200002
+	.quad 0x0002000004200000, 0x0000000000000800
+	.quad 0x2000000000000802, 0x2002000004200802
+	.quad 0x0002000000200800, 0x2000000000000002
+	.quad 0x0000000004000000, 0x0002000000200800
+	.quad 0x0000000004000000, 0x0002000000200800
+	.quad 0x0002000000200000, 0x2000000004000802
+	.quad 0x2000000004000802, 0x2002000004200002
+	.quad 0x2002000004200002, 0x2000000000000002
+	.quad 0x2002000000200002, 0x0000000004000000
+	.quad 0x0000000004000800, 0x0002000000200000
+	.quad 0x0002000004200800, 0x2000000000000802
+	.quad 0x2002000000200802, 0x0002000004200800
+	.quad 0x2000000000000802, 0x2000000004000002
+	.quad 0x2002000004200802, 0x0002000004200000
+	.quad 0x0002000000200800, 0x0000000000000000
+	.quad 0x2000000000000002, 0x2002000004200802
+	.quad 0x0000000000000000, 0x2002000000200802
+	.quad 0x0002000004200000, 0x0000000000000800
+	.quad 0x2000000004000002, 0x0000000004000800
+	.quad 0x0000000000000800, 0x2002000000200002
+.L_s8:
+	.quad 0x0100010410001000, 0x0000010000001000
+	.quad 0x0000000000040000, 0x0100010410041000
+	.quad 0x0100000010000000, 0x0100010410001000
+	.quad 0x0000000400000000, 0x0100000010000000
+	.quad 0x0000000400040000, 0x0100000010040000
+	.quad 0x0100010410041000, 0x0000010000041000
+	.quad 0x0100010010041000, 0x0000010400041000
+	.quad 0x0000010000001000, 0x0000000400000000
+	.quad 0x0100000010040000, 0x0100000410000000
+	.quad 0x0100010010001000, 0x0000010400001000
+	.quad 0x0000010000041000, 0x0000000400040000
+	.quad 0x0100000410040000, 0x0100010010041000
+	.quad 0x0000010400001000, 0x0000000000000000
+	.quad 0x0000000000000000, 0x0100000410040000
+	.quad 0x0100000410000000, 0x0100010010001000
+	.quad 0x0000010400041000, 0x0000000000040000
+	.quad 0x0000010400041000, 0x0000000000040000
+	.quad 0x0100010010041000, 0x0000010000001000
+	.quad 0x0000000400000000, 0x0100000410040000
+	.quad 0x0000010000001000, 0x0000010400041000
+	.quad 0x0100010010001000, 0x0000000400000000
+	.quad 0x0100000410000000, 0x0100000010040000
+	.quad 0x0100000410040000, 0x0100000010000000
+	.quad 0x0000000000040000, 0x0100010410001000
+	.quad 0x0000000000000000, 0x0100010410041000
+	.quad 0x0000000400040000, 0x0100000410000000
+	.quad 0x0100000010040000, 0x0100010010001000
+	.quad 0x0100010410001000, 0x0000000000000000
+	.quad 0x0100010410041000, 0x0000010000041000
+	.quad 0x0000010000041000, 0x0000010400001000
+	.quad 0x0000010400001000, 0x0000000400040000
+	.quad 0x0100000010000000, 0x0100010010041000
+
+#endif
+#endif
diff --git a/cipher/des.c b/cipher/des.c
index 6611fd3..bc2a474 100644
--- a/cipher/des.c
+++ b/cipher/des.c
@@ -119,9 +119,27 @@
 #include "g10lib.h"
 #include "cipher.h"
 #include "bufhelp.h"
+#include "cipher-selftest.h"
+
+
+#define DES_BLOCKSIZE 8
+
+
+/* USE_AMD64_ASM indicates whether to use AMD64 assembly code. */
+#undef USE_AMD64_ASM
+#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)
+# define USE_AMD64_ASM 1
+#endif
+
+/* Helper macro to force alignment to 16 bytes.  */
+#ifdef HAVE_GCC_ATTRIBUTE_ALIGNED
+# define ATTR_ALIGNED_16  __attribute__ ((aligned (16)))
+#else
+# define ATTR_ALIGNED_16
+#endif
 
 #if defined(__GNUC__) && defined(__GNU_LIBRARY__)
-#define working_memcmp memcmp
+# define working_memcmp memcmp
 #else
 /*
  * According to the SunOS man page, memcmp returns indeterminate sign
@@ -171,6 +189,12 @@ static int tripledes_ecb_crypt (struct _tripledes_ctx *,
                                 const byte *, byte *, int);
 static int is_weak_key ( const byte *key );
 static const char *selftest (void);
+static unsigned int do_tripledes_encrypt(void *context, byte *outbuf,
+					 const byte *inbuf );
+static unsigned int do_tripledes_decrypt(void *context, byte *outbuf,
+					 const byte *inbuf );
+static gcry_err_code_t do_tripledes_setkey(void *context, const byte *key,
+                                           unsigned keylen);
 
 static int initialized;
 
@@ -727,6 +751,46 @@ tripledes_set3keys (struct _tripledes_ctx *ctx,
 
 
 
+#ifdef USE_AMD64_ASM
+
+/* Assembly implementation of triple-DES. */
+extern void _gcry_3des_amd64_crypt_block(const void *keys, byte *out,
+                                         const byte *in);
+
+/* These assembly implementations process three blocks in parallel. */
+extern void _gcry_3des_amd64_ctr_enc(const void *keys, byte *out,
+                                     const byte *in, byte *ctr);
+
+extern void _gcry_3des_amd64_cbc_dec(const void *keys, byte *out,
+                                     const byte *in, byte *iv);
+
+extern void _gcry_3des_amd64_cfb_dec(const void *keys, byte *out,
+                                     const byte *in, byte *iv);
+
+#define TRIPLEDES_ECB_BURN_STACK (8 * sizeof(void *))
+
+/*
+ * Electronic Codebook Mode Triple-DES encryption/decryption of data
+ * according to 'mode'.  Sometimes this mode is named 'EDE' mode
+ * (Encryption-Decryption-Encryption).
+ */
+static inline int
+tripledes_ecb_crypt (struct _tripledes_ctx *ctx, const byte * from,
+                     byte * to, int mode)
+{
+  u32 *keys;
+
+  keys = mode ? ctx->decrypt_subkeys : ctx->encrypt_subkeys;
+
+  _gcry_3des_amd64_crypt_block(keys, to, from);
+
+  return 0;
+}
+
+#else /*USE_AMD64_ASM*/
+
+#define TRIPLEDES_ECB_BURN_STACK 32
+
 /*
  * Electronic Codebook Mode Triple-DES encryption/decryption of data
  * according to 'mode'.  Sometimes this mode is named 'EDE' mode
@@ -777,8 +841,158 @@ tripledes_ecb_crypt (struct _tripledes_ctx *ctx, const byte * from,
   return 0;
 }
 
+#endif /*!USE_AMD64_ASM*/
+
+
+
+/* Bulk encryption of complete blocks in CTR mode.  This function is only
+   intended for the bulk encryption feature of cipher.c.  CTR is expected to be
+   of size DES_BLOCKSIZE. */
+void
+_gcry_3des_ctr_enc(void *context, unsigned char *ctr, void *outbuf_arg,
+                   const void *inbuf_arg, size_t nblocks)
+{
+  struct _tripledes_ctx *ctx = context;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+  unsigned char tmpbuf[DES_BLOCKSIZE];
+  int burn_stack_depth = TRIPLEDES_ECB_BURN_STACK;
+  int i;
+
+#ifdef USE_AMD64_ASM
+  {
+    int asm_burn_depth = 9 * sizeof(void *);
+
+    if (nblocks >= 3 && burn_stack_depth < asm_burn_depth)
+      burn_stack_depth = asm_burn_depth;
+
+    /* Process data in 3 block chunks. */
+    while (nblocks >= 3)
+      {
+        _gcry_3des_amd64_ctr_enc(ctx->encrypt_subkeys, outbuf, inbuf, ctr);
+
+        nblocks -= 3;
+        outbuf += 3 * DES_BLOCKSIZE;
+        inbuf  += 3 * DES_BLOCKSIZE;
+      }
+
+    /* Use generic code to handle smaller chunks... */
+  }
+#endif
+
+  for ( ;nblocks; nblocks-- )
+    {
+      /* Encrypt the counter. */
+      tripledes_ecb_encrypt (ctx, ctr, tmpbuf);
+      /* XOR the input with the encrypted counter and store in output.  */
+      buf_xor(outbuf, tmpbuf, inbuf, DES_BLOCKSIZE);
+      outbuf += DES_BLOCKSIZE;
+      inbuf  += DES_BLOCKSIZE;
+      /* Increment the counter.  */
+      for (i = DES_BLOCKSIZE; i > 0; i--)
+        {
+          ctr[i-1]++;
+          if (ctr[i-1])
+            break;
+        }
+    }
+
+  wipememory(tmpbuf, sizeof(tmpbuf));
+  _gcry_burn_stack(burn_stack_depth);
+}
+
+
+/* Bulk decryption of complete blocks in CBC mode.  This function is only
+   intended for the bulk encryption feature of cipher.c. */
+void
+_gcry_3des_cbc_dec(void *context, unsigned char *iv, void *outbuf_arg,
+                   const void *inbuf_arg, size_t nblocks)
+{
+  struct _tripledes_ctx *ctx = context;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+  unsigned char savebuf[DES_BLOCKSIZE];
+  int burn_stack_depth = TRIPLEDES_ECB_BURN_STACK;
+
+#ifdef USE_AMD64_ASM
+  {
+    int asm_burn_depth = 10 * sizeof(void *);
+
+    if (nblocks >= 3 && burn_stack_depth < asm_burn_depth)
+      burn_stack_depth = asm_burn_depth;
+
+    /* Process data in 3 block chunks. */
+    while (nblocks >= 3)
+      {
+        _gcry_3des_amd64_cbc_dec(ctx->decrypt_subkeys, outbuf, inbuf, iv);
+
+        nblocks -= 3;
+        outbuf += 3 * DES_BLOCKSIZE;
+        inbuf  += 3 * DES_BLOCKSIZE;
+      }
+
+    /* Use generic code to handle smaller chunks... */
+  }
+#endif
+
+  for ( ;nblocks; nblocks-- )
+    {
+      /* INBUF is needed later and it may be identical to OUTBUF, so store
+         the intermediate result to SAVEBUF.  */
+      tripledes_ecb_decrypt (ctx, inbuf, savebuf);
+
+      buf_xor_n_copy_2(outbuf, savebuf, iv, inbuf, DES_BLOCKSIZE);
+      inbuf += DES_BLOCKSIZE;
+      outbuf += DES_BLOCKSIZE;
+    }
+
+  wipememory(savebuf, sizeof(savebuf));
+  _gcry_burn_stack(burn_stack_depth);
+}
+
+
+/* Bulk decryption of complete blocks in CFB mode.  This function is only
+   intended for the bulk encryption feature of cipher.c. */
+void
+_gcry_3des_cfb_dec(void *context, unsigned char *iv, void *outbuf_arg,
+		   const void *inbuf_arg, size_t nblocks)
+{
+  struct _tripledes_ctx *ctx = context;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+  int burn_stack_depth = TRIPLEDES_ECB_BURN_STACK;
+
+#ifdef USE_AMD64_ASM
+  {
+    int asm_burn_depth = 9 * sizeof(void *);
+
+    if (nblocks >= 3 && burn_stack_depth < asm_burn_depth)
+      burn_stack_depth = asm_burn_depth;
 
+    /* Process data in 3 block chunks. */
+    while (nblocks >= 3)
+      {
+        _gcry_3des_amd64_cfb_dec(ctx->encrypt_subkeys, outbuf, inbuf, iv);
 
+        nblocks -= 3;
+        outbuf += 3 * DES_BLOCKSIZE;
+        inbuf  += 3 * DES_BLOCKSIZE;
+      }
+
+    /* Use generic code to handle smaller chunks... */
+  }
+#endif
+
+  for ( ;nblocks; nblocks-- )
+    {
+      tripledes_ecb_encrypt (ctx, iv, iv);
+      buf_xor_n_copy(outbuf, iv, inbuf, DES_BLOCKSIZE);
+      outbuf += DES_BLOCKSIZE;
+      inbuf  += DES_BLOCKSIZE;
+    }
+
+  _gcry_burn_stack(burn_stack_depth);
+}
 
 
 /*
@@ -815,6 +1029,67 @@ is_weak_key ( const byte *key )
 }
 
 
+/* Alternative setkey for selftests; need larger key than default. */
+static gcry_err_code_t
+bulk_selftest_setkey (void *context, const byte *__key, unsigned __keylen)
+{
+  static const unsigned char key[24] ATTR_ALIGNED_16 = {
+      0x66,0x9A,0x00,0x7F,0xC7,0x6A,0x45,0x9F,
+      0x98,0xBA,0xF9,0x17,0xFE,0xDF,0x95,0x22,
+      0x18,0x2A,0x39,0x47,0x5E,0x6F,0x75,0x82
+    };
+
+  (void)__key;
+  (void)__keylen;
+
+  return do_tripledes_setkey(context, key, sizeof(key));
+}
+
+
+/* Run the self-tests for DES-CTR, tests IV increment of bulk CTR
+   encryption.  Returns NULL on success. */
+static const char *
+selftest_ctr (void)
+{
+  const int nblocks = 3+1;
+  const int blocksize = DES_BLOCKSIZE;
+  const int context_size = sizeof(struct _tripledes_ctx);
+
+  return _gcry_selftest_helper_ctr("3DES", &bulk_selftest_setkey,
+           &do_tripledes_encrypt, &_gcry_3des_ctr_enc, nblocks, blocksize,
+           context_size);
+}
+
+
+/* Run the self-tests for DES-CBC, tests bulk CBC decryption.
+   Returns NULL on success. */
+static const char *
+selftest_cbc (void)
+{
+  const int nblocks = 3+2;
+  const int blocksize = DES_BLOCKSIZE;
+  const int context_size = sizeof(struct _tripledes_ctx);
+
+  return _gcry_selftest_helper_cbc("3DES", &bulk_selftest_setkey,
+           &do_tripledes_encrypt, &_gcry_3des_cbc_dec, nblocks, blocksize,
+           context_size);
+}
+
+
+/* Run the self-tests for DES-CFB, tests bulk CBC decryption.
+   Returns NULL on success. */
+static const char *
+selftest_cfb (void)
+{
+  const int nblocks = 3+2;
+  const int blocksize = DES_BLOCKSIZE;
+  const int context_size = sizeof(struct _tripledes_ctx);
+
+  return _gcry_selftest_helper_cfb("3DES", &bulk_selftest_setkey,
+           &do_tripledes_encrypt, &_gcry_3des_cfb_dec, nblocks, blocksize,
+           context_size);
+}
+
 
 /*
  * Performs a selftest of this DES/Triple-DES implementation.
@@ -824,6 +1099,8 @@ is_weak_key ( const byte *key )
 static const char *
 selftest (void)
 {
+  const char *r;
+
   /*
    * Check if 'u32' is really 32 bits wide. This DES / 3DES implementation
    * need this.
@@ -1003,6 +1280,15 @@ selftest (void)
         return "DES weak key detection failed";
   }
 
+  if ( (r = selftest_cbc ()) )
+    return r;
+
+  if ( (r = selftest_cfb ()) )
+    return r;
+
+  if ( (r = selftest_ctr ()) )
+    return r;
+
   return 0;
 }
 
@@ -1060,7 +1346,7 @@ do_tripledes_encrypt( void *context, byte *outbuf, const byte *inbuf )
   struct _tripledes_ctx *ctx = (struct _tripledes_ctx *) context;
 
   tripledes_ecb_encrypt ( ctx, inbuf, outbuf );
-  return /*burn_stack*/ (32);
+  return /*burn_stack*/ TRIPLEDES_ECB_BURN_STACK;
 }
 
 static unsigned int
@@ -1068,7 +1354,7 @@ do_tripledes_decrypt( void *context, byte *outbuf, const byte *inbuf )
 {
   struct _tripledes_ctx *ctx = (struct _tripledes_ctx *) context;
   tripledes_ecb_decrypt ( ctx, inbuf, outbuf );
-  return /*burn_stack*/ (32);
+  return /*burn_stack*/ TRIPLEDES_ECB_BURN_STACK;
 }
 
 static gcry_err_code_t
diff --git a/configure.ac b/configure.ac
index a0f75a5..79f79ef 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1671,6 +1671,13 @@ LIST_MEMBER(des, $enabled_ciphers)
 if test "$found" = "1" ; then
    GCRYPT_CIPHERS="$GCRYPT_CIPHERS des.lo"
    AC_DEFINE(USE_DES, 1, [Defined if this module should be included])
+
+   case "${host}" in
+      x86_64-*-*)
+         # Build with the assembly implementation
+         GCRYPT_CIPHERS="$GCRYPT_CIPHERS des-amd64.lo"
+      ;;
+   esac
 fi
 
 LIST_MEMBER(aes, $enabled_ciphers)
diff --git a/src/cipher.h b/src/cipher.h
index cd981b3..5d1b5f6 100644
--- a/src/cipher.h
+++ b/src/cipher.h
@@ -173,6 +173,19 @@ void _gcry_camellia_cfb_dec (void *context, unsigned char *iv,
                              void *outbuf_arg, const void *inbuf_arg,
                              size_t nblocks);
 
+/*-- des.c --*/
+void _gcry_3des_ctr_enc (void *context, unsigned char *ctr,
+                         void *outbuf_arg, const void *inbuf_arg,
+                         size_t nblocks);
+
+void _gcry_3des_cbc_dec (void *context, unsigned char *iv,
+                         void *outbuf_arg, const void *inbuf_arg,
+                         size_t nblocks);
+
+void _gcry_3des_cfb_dec (void *context, unsigned char *iv,
+                         void *outbuf_arg, const void *inbuf_arg,
+                         size_t nblocks);
+
 /*-- serpent.c --*/
 void _gcry_serpent_ctr_enc (void *context, unsigned char *ctr,
                             void *outbuf_arg, const void *inbuf_arg,




More information about the Gcrypt-devel mailing list