[git] GCRYPT - branch, master, updated. libgcrypt-1.5.0-138-gb402de8

by Werner Koch cvs@cvs.gnupg.org
Thu May 23 12:45:35 CEST 2013


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "The GNU crypto library".

The branch, master has been updated
       via  b402de8b9c4a9f269faf03ca952b1eb68a1f33c8 (commit)
       via  2fd06e207dcea1d8a7f0e7e92f3359615a99421b (commit)
       via  c85501af8222913f0a1e20e77fceb88e93417925 (commit)
      from  4e46d8bc78008ba06f106b368cefb0dddf15fe38 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
commit b402de8b9c4a9f269faf03ca952b1eb68a1f33c8
Author: Werner Koch <wk@gnupg.org>
Date:   Thu Apr 18 14:40:43 2013 +0200

    Avoid compiler warning due to the global symbol setkey.
    
    * cipher/cipher-selftest.c (_gcry_selftest_helper_cbc_128)
    (_gcry_selftest_helper_ctr_128): Rename setkey to setkey_func.
    --
    
    setkey is a POSIX.1 function declared in <stdlib.h>.
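
For illustration, a minimal sketch of the clash (ad hoc names; the typedef
merely stands in for libgcrypt's internal gcry_cipher_setkey_t):

#include <stdlib.h>  /* on XSI/POSIX systems declares setkey(const char *) */

typedef void (*setkey_fn_t)(void *ctx, const void *key, size_t keylen);

/* With the parameter named `setkey', GCC's -Wshadow reports
 *   warning: declaration of 'setkey' shadows a global declaration
 * so the commit renames the parameter to setkey_func.  */
static const char *
helper (void *ctx, setkey_fn_t setkey_func)
{
  static const unsigned char key[16];
  setkey_func (ctx, key, sizeof key);
  return NULL;
}

int
main (void)
{
  (void) helper;
  return 0;
}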

diff --git a/cipher/cipher-selftest.c b/cipher/cipher-selftest.c
index 50c7752..439f3ae 100644
--- a/cipher/cipher-selftest.c
+++ b/cipher/cipher-selftest.c
@@ -47,7 +47,8 @@
 /* Run the self-tests for <block cipher>-CBC-128, tests bulk CBC
    decryption.  Returns NULL on success. */
 const char *
-_gcry_selftest_helper_cbc_128 (const char *cipher, gcry_cipher_setkey_t setkey,
+_gcry_selftest_helper_cbc_128 (const char *cipher,
+                               gcry_cipher_setkey_t setkey_func,
 			       gcry_cipher_encrypt_t encrypt_one,
 			       gcry_cipher_bulk_cbc_dec_t bulk_cbc_dec,
 			       const int nblocks, const int blocksize,
@@ -81,7 +82,7 @@ _gcry_selftest_helper_cbc_128 (const char *cipher, gcry_cipher_setkey_t setkey,
   ciphertext = plaintext2 + nblocks * blocksize;
 
   /* Initialize ctx */
-  setkey (ctx, key, sizeof(key));
+  setkey_func (ctx, key, sizeof(key));
 
   /* Test single block code path */
   memset (iv, 0x4e, blocksize);
@@ -162,7 +163,8 @@ _gcry_selftest_helper_cbc_128 (const char *cipher, gcry_cipher_setkey_t setkey,
 /* Run the self-tests for <block cipher>-CTR-128, tests IV increment of bulk CTR
    encryption.  Returns NULL on success. */
 const char *
-_gcry_selftest_helper_ctr_128 (const char *cipher, gcry_cipher_setkey_t setkey,
+_gcry_selftest_helper_ctr_128 (const char *cipher,
+                               gcry_cipher_setkey_t setkey_func,
 			       gcry_cipher_encrypt_t encrypt_one,
 			       gcry_cipher_bulk_ctr_enc_t bulk_ctr_enc,
 			       const int nblocks, const int blocksize,
@@ -196,7 +198,7 @@ _gcry_selftest_helper_ctr_128 (const char *cipher, gcry_cipher_setkey_t setkey,
   ciphertext = plaintext2 + nblocks * blocksize;
 
   /* Initialize ctx */
-  setkey (ctx, key, sizeof(key));
+  setkey_func (ctx, key, sizeof(key));
 
   /* Test single block code path */
   memset (iv, 0xff, blocksize);

commit 2fd06e207dcea1d8a7f0e7e92f3359615a99421b
Author: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Date:   Thu May 23 11:04:18 2013 +0300

    serpent: add SSE2 accelerated amd64 implementation
    
    * configure.ac (serpent): Add 'serpent-sse2-amd64.lo'.
    * cipher/Makefile.am (EXTRA_libcipher_la_SOURCES): Add
    'serpent-sse2-amd64.S'.
    * cipher/cipher.c (gcry_cipher_open) [USE_SERPENT]: Register bulk
    functions for CBC-decryption and CTR-mode.
    * cipher/serpent.c (USE_SSE2): New macro.
    [USE_SSE2] (_gcry_serpent_sse2_ctr_enc, _gcry_serpent_sse2_cbc_dec):
    New prototypes for assembler functions.
    (serpent_setkey): Set 'serpent_init_done' before calling serpent_test.
    (_gcry_serpent_ctr_enc): New function.
    (_gcry_serpent_cbc_dec): New function.
    (selftest_ctr_128): New function.
    (selftest_cbc_128): New function.
    (selftest): Call selftest_ctr_128 and selftest_cbc_128.
    * cipher/serpent-sse2-amd64.S: New file.
    * src/cipher.h (_gcry_serpent_ctr_enc): New prototype.
    (_gcry_serpent_cbc_dec): New prototype.
    --
    
    [v2]: Converted to SSE2 to support all amd64 processors (SSE2 is a
          mandatory feature of the AMD64 SysV ABI).
    
    The patch adds a word-sliced SSE2 implementation of Serpent for amd64 to
    speed up parallelizable workloads (CTR mode and CBC-mode decryption). The
    implementation processes eight blocks in parallel, with two four-block
    sets interleaved to aid out-of-order scheduling (modeled in plain C after
    this log message).
    
    Speed old vs. new on Intel Core i5-2450M (Sandy-Bridge):
                    ECB/Stream         CBC             CFB             OFB             CTR
                 --------------- --------------- --------------- --------------- ---------------
    SERPENT128    1.00x   0.99x   1.00x   3.98x   1.00x   1.01x   1.00x   1.01x   4.04x   4.04x
    
    Speed old vs. new on AMD Phenom II X6 1055T:
                    ECB/Stream         CBC             CFB             OFB             CTR
                 --------------- --------------- --------------- --------------- ---------------
    SERPENT128    1.02x   1.01x   1.00x   2.83x   1.00x   1.00x   1.00x   1.00x   2.72x   2.72x
    
    Speed old vs. new on Intel Core2 Duo T8100:
                    ECB/Stream         CBC             CFB             OFB             CTR
                 --------------- --------------- --------------- --------------- ---------------
    SERPENT128    1.00x   1.02x   0.97x   4.02x   0.98x   1.01x   0.98x   1.00x   3.82x   3.91x
    
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
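
As a rough plain-C model of the word-sliced layout described above (names
are ad hoc, not from the patch): four 128-bit blocks are transposed so that
each vector holds word i of all four blocks, after which every 32-bit
operation acts on four blocks at once; the assembly keeps two such
four-block sets (the RA* and RB* registers) in flight.

#include <stdint.h>
#include <stdio.h>

/* v[i][b] models one XMM register lane: after the transpose, row i holds
   word i of blocks 0..3, so a scalar op on row i touches four blocks.  */
static void
transpose_4x4 (uint32_t v[4][4])
{
  int i, j;
  for (i = 0; i < 4; i++)
    for (j = i + 1; j < 4; j++)
      {
        uint32_t t = v[i][j];
        v[i][j] = v[j][i];
        v[j][i] = t;
      }
}

int
main (void)
{
  /* blocks[b][i]: word i of block b, as loaded from memory.  */
  uint32_t blocks[4][4] = {
    { 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f },
    { 0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f },
    { 0x20212223, 0x24252627, 0x28292a2b, 0x2c2d2e2f },
    { 0x30313233, 0x34353637, 0x38393a3b, 0x3c3d3e3f },
  };
  uint32_t key_word = 0xdeadbeef;
  int b;

  transpose_4x4 (blocks);      /* now blocks[i][b]: word i of block b */

  /* One sliced operation, e.g. XORing a broadcast round-key word into
     word 0 of all four blocks (models pbroadcastd followed by pxor):  */
  for (b = 0; b < 4; b++)
    blocks[0][b] ^= key_word;

  printf ("word 0 of block 2: %08x\n", blocks[0][2]);
  return 0;
}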

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 0808bd2..69f1e6d 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -68,7 +68,7 @@ rmd160.c \
 rsa.c \
 scrypt.c \
 seed.c \
-serpent.c \
+serpent.c serpent-sse2-amd64.S \
 sha1.c \
 sha256.c \
 sha512.c \
diff --git a/cipher/cipher.c b/cipher/cipher.c
index f1224af..20ac2c7 100644
--- a/cipher/cipher.c
+++ b/cipher/cipher.c
@@ -726,6 +726,14 @@ gcry_cipher_open (gcry_cipher_hd_t *handle,
               h->bulk.ctr_enc = _gcry_camellia_ctr_enc;
               break;
 #endif /*USE_CAMELLIA*/
+#ifdef USE_SERPENT
+	    case GCRY_CIPHER_SERPENT128:
+	    case GCRY_CIPHER_SERPENT192:
+	    case GCRY_CIPHER_SERPENT256:
+              h->bulk.cbc_dec = _gcry_serpent_cbc_dec;
+              h->bulk.ctr_enc = _gcry_serpent_ctr_enc;
+              break;
+#endif /*USE_SERPENT*/
 
             default:
               break;
diff --git a/cipher/serpent-sse2-amd64.S b/cipher/serpent-sse2-amd64.S
new file mode 100644
index 0000000..8d8c8dd
--- /dev/null
+++ b/cipher/serpent-sse2-amd64.S
@@ -0,0 +1,826 @@
+/* serpent-sse2-amd64.S  -  SSE2 implementation of Serpent cipher
+ *
+ * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifdef __x86_64
+#include <config.h>
+#if defined(USE_SERPENT)
+
+#ifdef __PIC__
+#  define RIP (%rip)
+#else
+#  define RIP
+#endif
+
+/* struct serpent_context: */
+#define ctx_keys 0
+
+/* register macros */
+#define CTX %rdi
+
+/* vector registers */
+.set RA0, %xmm0
+.set RA1, %xmm1
+.set RA2, %xmm2
+.set RA3, %xmm3
+.set RA4, %xmm4
+
+.set RB0, %xmm5
+.set RB1, %xmm6
+.set RB2, %xmm7
+.set RB3, %xmm8
+.set RB4, %xmm9
+
+.set RNOT, %xmm10
+.set RTMP0, %xmm11
+.set RTMP1, %xmm12
+.set RTMP2, %xmm13
+
+/**********************************************************************
+  helper macros
+ **********************************************************************/
+
+/* preprocessor macro for renaming vector registers using GAS macros */
+#define sbox_reg_rename(r0, r1, r2, r3, r4, \
+			new_r0, new_r1, new_r2, new_r3, new_r4) \
+	.set rename_reg0, new_r0; \
+	.set rename_reg1, new_r1; \
+	.set rename_reg2, new_r2; \
+	.set rename_reg3, new_r3; \
+	.set rename_reg4, new_r4; \
+	\
+	.set r0, rename_reg0; \
+	.set r1, rename_reg1; \
+	.set r2, rename_reg2; \
+	.set r3, rename_reg3; \
+	.set r4, rename_reg4;
+
+/* vector 32-bit rotation to left */
+#define vec_rol(reg, nleft, tmp) \
+	movdqa reg, tmp; 		\
+	pslld $(nleft), tmp;		\
+	psrld $(32 - (nleft)), reg;	\
+	por tmp, reg;
+
+/* vector 32-bit rotation to right */
+#define vec_ror(reg, nright, tmp) \
+	vec_rol(reg, 32 - nright, tmp)
+
+/* 4x4 32-bit integer matrix transpose */
+#define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \
+	movdqa    x0, t2; \
+	punpckhdq x1, t2; \
+	punpckldq x1, x0; \
+	\
+	movdqa    x2, t1; \
+	punpckldq x3, t1; \
+	punpckhdq x3, x2; \
+	\
+	movdqa     x0, x1; \
+	punpckhqdq t1, x1; \
+	punpcklqdq t1, x0; \
+	\
+	movdqa     t2, x3; \
+	punpckhqdq x2, x3; \
+	punpcklqdq x2, t2; \
+	movdqa     t2, x2;
+
+/* fill xmm register with 32-bit value from memory */
+#define pbroadcastd(mem32, xreg) \
+	movd mem32, xreg; \
+	pshufd $0, xreg, xreg;
+
+/* xor with unaligned memory operand */
+#define pxor_u(umem128, xreg, t) \
+	movdqu umem128, t; \
+	pxor t, xreg;
+
+/* 128-bit wide byte swap */
+#define pbswap(xreg, t0) \
+	/* reorder 32-bit words, [a,b,c,d] => [d,c,b,a] */ \
+	pshufd $0x1b, xreg, xreg; \
+	/* reorder high&low 16-bit words, [d0,d1,c0,c1] => [d1,d0,c1,c0] */ \
+	pshuflw $0xb1, xreg, xreg; \
+	pshufhw $0xb1, xreg, xreg; \
+	/* reorder bytes in 16-bit words */ \
+	movdqa xreg, t0; \
+	psrlw $8, t0; \
+	psllw $8, xreg; \
+	por t0, xreg;
+
+/**********************************************************************
+  8-way serpent
+ **********************************************************************/
+
+/*
+ * These are the S-Boxes of Serpent from following research paper.
+ *
+ *  D. A. Osvik, “Speeding up Serpent,” in Third AES Candidate Conference,
+ *   (New York, New York, USA), p. 317–329, National Institute of Standards and
+ *   Technology, 2000.
+ *
+ * Paper is also available at: http://www.ii.uib.no/~osvik/pub/aes3.pdf
+ *
+ */
+#define SBOX0(r0, r1, r2, r3, r4) \
+	pxor	r0, r3;		movdqa	r1, r4;		\
+	pand	r3, r1;		pxor	r2, r4;		\
+	pxor	r0, r1;		por	r3, r0;		\
+	pxor	r4, r0;		pxor	r3, r4;		\
+	pxor	r2, r3;		por	r1, r2;		\
+	pxor	r4, r2;		pxor	RNOT, r4;	\
+	por	r1, r4;		pxor	r3, r1;		\
+	pxor	r4, r1;		por	r0, r3;		\
+	pxor	r3, r1;		pxor	r3, r4;		\
+	\
+	sbox_reg_rename(r0,r1,r2,r3,r4, r1,r4,r2,r0,r3);
+
+#define SBOX0_INVERSE(r0, r1, r2, r3, r4) \
+	pxor	RNOT, r2;	movdqa	r1, r4;		\
+	por	r0, r1;		pxor	RNOT, r4;	\
+	pxor	r2, r1;		por	r4, r2;		\
+	pxor	r3, r1;		pxor	r4, r0;		\
+	pxor	r0, r2;		pand	r3, r0;		\
+	pxor	r0, r4;		por	r1, r0;		\
+	pxor	r2, r0;		pxor	r4, r3;		\
+	pxor	r1, r2;		pxor	r0, r3;		\
+	pxor	r1, r3;	\
+	pand	r3, r2;	\
+	pxor	r2, r4;	\
+	\
+	sbox_reg_rename(r0,r1,r2,r3,r4, r0,r4,r1,r3,r2);
+
+#define SBOX1(r0, r1, r2, r3, r4) \
+	pxor	RNOT, r0;	pxor	RNOT, r2;	\
+	movdqa	r0, r4;		pand	r1, r0;		\
+	pxor	r0, r2;		por	r3, r0;		\
+	pxor	r2, r3;		pxor	r0, r1;		\
+	pxor	r4, r0;		por	r1, r4;		\
+	pxor	r3, r1;		por	r0, r2;		\
+	pand	r4, r2;		pxor	r1, r0;		\
+	pand	r2, r1;	\
+	pxor	r0, r1;		pand	r2, r0;		\
+	pxor	r4, r0;	\
+	\
+	sbox_reg_rename(r0,r1,r2,r3,r4, r2,r0,r3,r1,r4);
+
+#define SBOX1_INVERSE(r0, r1, r2, r3, r4) \
+	movdqa	r1, r4;		pxor	r3, r1;		\
+	pand	r1, r3;		pxor	r2, r4;		\
+	pxor	r0, r3;		por	r1, r0;		\
+	pxor	r3, r2;		pxor	r4, r0;		\
+	por	r2, r0;		pxor	r3, r1;		\
+	pxor	r1, r0;		por	r3, r1;		\
+	pxor	r0, r1;		pxor	RNOT, r4;	\
+	pxor	r1, r4;		por	r0, r1;		\
+	pxor	r0, r1;	\
+	por	r4, r1;	\
+	pxor	r1, r3;	\
+	\
+	sbox_reg_rename(r0,r1,r2,r3,r4, r4,r0,r3,r2,r1);
+
+#define SBOX2(r0, r1, r2, r3, r4) \
+	movdqa	r0, r4;		pand	r2, r0;		\
+	pxor	r3, r0;		pxor	r1, r2;		\
+	pxor	r0, r2;		por	r4, r3;		\
+	pxor	r1, r3;		pxor	r2, r4;		\
+	movdqa	r3, r1;		por	r4, r3;		\
+	pxor	r0, r3;		pand	r1, r0;		\
+	pxor	r0, r4;		pxor	r3, r1;		\
+	pxor	r4, r1;		pxor	RNOT, r4;	\
+	\
+	sbox_reg_rename(r0,r1,r2,r3,r4, r2,r3,r1,r4,r0);
+
+#define SBOX2_INVERSE(r0, r1, r2, r3, r4) \
+	pxor	r3, r2;		pxor	r0, r3;		\
+	movdqa	r3, r4;		pand	r2, r3;		\
+	pxor	r1, r3;		por	r2, r1;		\
+	pxor	r4, r1;		pand	r3, r4;		\
+	pxor	r3, r2;		pand	r0, r4;		\
+	pxor	r2, r4;		pand	r1, r2;		\
+	por	r0, r2;		pxor	RNOT, r3;	\
+	pxor	r3, r2;		pxor	r3, r0;		\
+	pand	r1, r0;		pxor	r4, r3;		\
+	pxor	r0, r3;	\
+	\
+	sbox_reg_rename(r0,r1,r2,r3,r4, r1,r4,r2,r3,r0);
+
+#define SBOX3(r0, r1, r2, r3, r4) \
+	movdqa	r0, r4;		por	r3, r0;		\
+	pxor	r1, r3;		pand	r4, r1;		\
+	pxor	r2, r4;		pxor	r3, r2;		\
+	pand	r0, r3;		por	r1, r4;		\
+	pxor	r4, r3;		pxor	r1, r0;		\
+	pand	r0, r4;		pxor	r3, r1;		\
+	pxor	r2, r4;		por	r0, r1;		\
+	pxor	r2, r1;		pxor	r3, r0;		\
+	movdqa	r1, r2;		por	r3, r1;		\
+	pxor	r0, r1;	\
+	\
+	sbox_reg_rename(r0,r1,r2,r3,r4, r1,r2,r3,r4,r0);
+
+#define SBOX3_INVERSE(r0, r1, r2, r3, r4) \
+	movdqa	r2, r4;		pxor	r1, r2;		\
+	pxor	r2, r0;		pand	r2, r4;		\
+	pxor	r0, r4;		pand	r1, r0;		\
+	pxor	r3, r1;		por	r4, r3;		\
+	pxor	r3, r2;		pxor	r3, r0;		\
+	pxor	r4, r1;		pand	r2, r3;		\
+	pxor	r1, r3;		pxor	r0, r1;		\
+	por	r2, r1;		pxor	r3, r0;		\
+	pxor	r4, r1;	\
+	pxor	r1, r0;	\
+	\
+	sbox_reg_rename(r0,r1,r2,r3,r4, r2,r1,r3,r0,r4);
+
+#define SBOX4(r0, r1, r2, r3, r4) \
+	pxor	r3, r1;		pxor	RNOT, r3;	\
+	pxor	r3, r2;		pxor	r0, r3;		\
+	movdqa	r1, r4;		pand	r3, r1;		\
+	pxor	r2, r1;		pxor	r3, r4;		\
+	pxor	r4, r0;		pand	r4, r2;		\
+	pxor	r0, r2;		pand	r1, r0;		\
+	pxor	r0, r3;		por	r1, r4;		\
+	pxor	r0, r4;		por	r3, r0;		\
+	pxor	r2, r0;		pand	r3, r2;		\
+	pxor	RNOT, r0;	pxor	r2, r4;		\
+	\
+	sbox_reg_rename(r0,r1,r2,r3,r4, r1,r4,r0,r3,r2);
+
+#define SBOX4_INVERSE(r0, r1, r2, r3, r4) \
+	movdqa	r2, r4;		pand	r3, r2;		\
+	pxor	r1, r2;		por	r3, r1;		\
+	pand	r0, r1;		pxor	r2, r4;		\
+	pxor	r1, r4;		pand	r2, r1;		\
+	pxor	RNOT, r0;	pxor	r4, r3;		\
+	pxor	r3, r1;		pand	r0, r3;		\
+	pxor	r2, r3;		pxor	r1, r0;		\
+	pand	r0, r2;		pxor	r0, r3;		\
+	pxor	r4, r2;	\
+	por	r3, r2;		pxor	r0, r3;		\
+	pxor	r1, r2;	\
+	\
+	sbox_reg_rename(r0,r1,r2,r3,r4, r0,r3,r2,r4,r1);
+
+#define SBOX5(r0, r1, r2, r3, r4) \
+	pxor	r1, r0;		pxor	r3, r1;		\
+	pxor	RNOT, r3;	movdqa	r1, r4;		\
+	pand	r0, r1;		pxor	r3, r2;		\
+	pxor	r2, r1;		por	r4, r2;		\
+	pxor	r3, r4;		pand	r1, r3;		\
+	pxor	r0, r3;		pxor	r1, r4;		\
+	pxor	r2, r4;		pxor	r0, r2;		\
+	pand	r3, r0;		pxor	RNOT, r2;	\
+	pxor	r4, r0;		por	r3, r4;		\
+	pxor	r4, r2;	\
+	\
+	sbox_reg_rename(r0,r1,r2,r3,r4, r1,r3,r0,r2,r4);
+
+#define SBOX5_INVERSE(r0, r1, r2, r3, r4) \
+	pxor	RNOT, r1;	movdqa	r3, r4;		\
+	pxor	r1, r2;		por	r0, r3;		\
+	pxor	r2, r3;		por	r1, r2;		\
+	pand	r0, r2;		pxor	r3, r4;		\
+	pxor	r4, r2;		por	r0, r4;		\
+	pxor	r1, r4;		pand	r2, r1;		\
+	pxor	r3, r1;		pxor	r2, r4;		\
+	pand	r4, r3;		pxor	r1, r4;		\
+	pxor	r4, r3;		pxor	RNOT, r4;	\
+	pxor	r0, r3;	\
+	\
+	sbox_reg_rename(r0,r1,r2,r3,r4, r1,r4,r3,r2,r0);
+
+#define SBOX6(r0, r1, r2, r3, r4) \
+	pxor	RNOT, r2;	movdqa	r3, r4;		\
+	pand	r0, r3;		pxor	r4, r0;		\
+	pxor	r2, r3;		por	r4, r2;		\
+	pxor	r3, r1;		pxor	r0, r2;		\
+	por	r1, r0;		pxor	r1, r2;		\
+	pxor	r0, r4;		por	r3, r0;		\
+	pxor	r2, r0;		pxor	r3, r4;		\
+	pxor	r0, r4;		pxor	RNOT, r3;	\
+	pand	r4, r2;	\
+	pxor	r3, r2;	\
+	\
+	sbox_reg_rename(r0,r1,r2,r3,r4, r0,r1,r4,r2,r3);
+
+#define SBOX6_INVERSE(r0, r1, r2, r3, r4) \
+	pxor	r2, r0;		movdqa	r2, r4;		\
+	pand	r0, r2;		pxor	r3, r4;		\
+	pxor	RNOT, r2;	pxor	r1, r3;		\
+	pxor	r3, r2;		por	r0, r4;		\
+	pxor	r2, r0;		pxor	r4, r3;		\
+	pxor	r1, r4;		pand	r3, r1;		\
+	pxor	r0, r1;		pxor	r3, r0;		\
+	por	r2, r0;		pxor	r1, r3;		\
+	pxor	r0, r4;	\
+	\
+	sbox_reg_rename(r0,r1,r2,r3,r4, r1,r2,r4,r3,r0);
+
+#define SBOX7(r0, r1, r2, r3, r4) \
+	movdqa	r1, r4;		por	r2, r1;		\
+	pxor	r3, r1;		pxor	r2, r4;		\
+	pxor	r1, r2;		por	r4, r3;		\
+	pand	r0, r3;		pxor	r2, r4;		\
+	pxor	r1, r3;		por	r4, r1;		\
+	pxor	r0, r1;		por	r4, r0;		\
+	pxor	r2, r0;		pxor	r4, r1;		\
+	pxor	r1, r2;		pand	r0, r1;		\
+	pxor	r4, r1;		pxor	RNOT, r2;	\
+	por	r0, r2;	\
+	pxor	r2, r4;	\
+	\
+	sbox_reg_rename(r0,r1,r2,r3,r4, r4,r3,r1,r0,r2);
+
+#define SBOX7_INVERSE(r0, r1, r2, r3, r4) \
+	movdqa	r2, r4;		pxor	r0, r2;		\
+	pand	r3, r0;		por	r3, r4;		\
+	pxor	RNOT, r2;	pxor	r1, r3;		\
+	por	r0, r1;		pxor	r2, r0;		\
+	pand	r4, r2;		pand	r4, r3;		\
+	pxor	r2, r1;		pxor	r0, r2;		\
+	por	r2, r0;		pxor	r1, r4;		\
+	pxor	r3, r0;		pxor	r4, r3;		\
+	por	r0, r4;		pxor	r2, r3;		\
+	pxor	r2, r4;	\
+	\
+	sbox_reg_rename(r0,r1,r2,r3,r4, r3,r0,r1,r4,r2);
+
+/* Apply SBOX number WHICH to the block.  */
+#define SBOX(which, r0, r1, r2, r3, r4) \
+	SBOX##which (r0, r1, r2, r3, r4)
+
+/* Apply inverse SBOX number WHICH to the block.  */
+#define SBOX_INVERSE(which, r0, r1, r2, r3, r4) \
+	SBOX##which##_INVERSE (r0, r1, r2, r3, r4)
+
+/* XOR round key into block state in r0,r1,r2,r3. r4 used as temporary.  */
+#define BLOCK_XOR_KEY(r0, r1, r2, r3, r4, round) \
+	pbroadcastd ((ctx_keys + (round) * 16 + 0 * 4)(CTX), r4); \
+	pxor r4, r0; \
+	pbroadcastd ((ctx_keys + (round) * 16 + 1 * 4)(CTX), r4); \
+	pxor r4, r1; \
+	pbroadcastd ((ctx_keys + (round) * 16 + 2 * 4)(CTX), r4); \
+	pxor r4, r2; \
+	pbroadcastd ((ctx_keys + (round) * 16 + 3 * 4)(CTX), r4); \
+	pxor r4, r3;
+
+/* Apply the linear transformation to BLOCK.  */
+#define LINEAR_TRANSFORMATION(r0, r1, r2, r3, r4) \
+	vec_rol(r0, 13, r4);	\
+	vec_rol(r2, 3, r4);	\
+	pxor r0, r1;		\
+	pxor r2, r1;		\
+	movdqa r0, r4;		\
+	pslld $3, r4;		\
+	pxor r2, r3;		\
+	pxor r4, r3;		\
+	vec_rol(r1, 1, r4);	\
+	vec_rol(r3, 7, r4);	\
+	pxor r1, r0;		\
+	pxor r3, r0;		\
+	movdqa r1, r4;		\
+	pslld $7, r4;		\
+	pxor r3, r2;		\
+	pxor r4, r2;		\
+	vec_rol(r0, 5, r4);	\
+	vec_rol(r2, 22, r4);
+
+/* Apply the inverse linear transformation to BLOCK.  */
+#define LINEAR_TRANSFORMATION_INVERSE(r0, r1, r2, r3, r4) \
+	vec_ror(r2, 22, r4);	\
+	vec_ror(r0, 5, r4);	\
+	movdqa r1, r4;		\
+	pslld $7, r4;		\
+	pxor r3, r2;		\
+	pxor r4, r2;		\
+	pxor r1, r0;		\
+	pxor r3, r0;		\
+	vec_ror(r3, 7, r4);	\
+	vec_ror(r1, 1, r4);	\
+	movdqa r0, r4;		\
+	pslld $3, r4;		\
+	pxor r2, r3;		\
+	pxor r4, r3;		\
+	pxor r0, r1;		\
+	pxor r2, r1;		\
+	vec_ror(r2, 3, r4);	\
+	vec_ror(r0, 13, r4);
+
+/* Apply a Serpent round to eight parallel blocks.  This macro increments
+   `round'.  */
+#define ROUND(which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
+	BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round);	\
+	SBOX (which, a0, a1, a2, a3, a4);		\
+		BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round);	\
+		SBOX (which, b0, b1, b2, b3, b4);		\
+	LINEAR_TRANSFORMATION (a0, a1, a2, a3, a4);	\
+		LINEAR_TRANSFORMATION (b0, b1, b2, b3, b4);	\
+	.set round, (round + 1);
+
+/* Apply the last Serpent round to eight parallel blocks.  This macro increments
+   `round'.  */
+#define ROUND_LAST(which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
+	BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round);	\
+	SBOX (which, a0, a1, a2, a3, a4);		\
+		BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round);	\
+		SBOX (which, b0, b1, b2, b3, b4);		\
+	.set round, (round + 1);			\
+	BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round);	\
+		BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round);	\
+	.set round, (round + 1);
+
+/* Apply an inverse Serpent round to eight parallel blocks.  This macro
+   decrements `round'.  */
+#define ROUND_INVERSE(which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
+	LINEAR_TRANSFORMATION_INVERSE (a0, a1, a2, a3, a4);	\
+		LINEAR_TRANSFORMATION_INVERSE (b0, b1, b2, b3, b4);	\
+	SBOX_INVERSE (which, a0, a1, a2, a3, a4);		\
+	BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round);		\
+		SBOX_INVERSE (which, b0, b1, b2, b3, b4);		\
+		BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round);		\
+	.set round, (round - 1);
+
+/* Apply the first inverse Serpent round to eight parallel blocks.  This macro
+   decrements `round'.  */
+#define ROUND_FIRST_INVERSE(which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
+	BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round);	\
+		BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round);	\
+	.set round, (round - 1);			\
+	SBOX_INVERSE (which, a0, a1, a2, a3, a4); 	\
+	BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round);	\
+		SBOX_INVERSE (which, b0, b1, b2, b3, b4); 	\
+		BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round);	\
+	.set round, (round - 1);
+
+.text
+
+.align 8
+.type   __serpent_enc_blk8,@function;
+__serpent_enc_blk8:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel plaintext
+	 *						blocks
+	 * output:
+	 *	RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel
+	 * 						ciphertext blocks
+	 */
+
+	/* record input vector names for __serpent_enc_blk8 */
+	.set enc_in_a0, RA0
+	.set enc_in_a1, RA1
+	.set enc_in_a2, RA2
+	.set enc_in_a3, RA3
+	.set enc_in_b0, RB0
+	.set enc_in_b1, RB1
+	.set enc_in_b2, RB2
+	.set enc_in_b3, RB3
+
+	pcmpeqd RNOT, RNOT;
+
+	transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1);
+	transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1);
+
+	.set round, 0
+	ROUND (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+	ROUND (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+	ROUND (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+	ROUND (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+	ROUND (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+	ROUND (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+	ROUND (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+	ROUND (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+	ROUND (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+	ROUND (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+	ROUND (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+	ROUND (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+	ROUND (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+	ROUND (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+	ROUND (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+	ROUND (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+	ROUND (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+	ROUND (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+	ROUND (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+	ROUND (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+	ROUND (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+	ROUND (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+	ROUND (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+	ROUND (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+	ROUND (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+	ROUND (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+	ROUND (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+	ROUND (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+	ROUND (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+	ROUND (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+	ROUND (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+
+	ROUND_LAST (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+
+	transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1);
+	transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1);
+
+	/* record output vector names for __serpent_enc_blk8 */
+	.set enc_out_a0, RA0
+	.set enc_out_a1, RA1
+	.set enc_out_a2, RA2
+	.set enc_out_a3, RA3
+	.set enc_out_b0, RB0
+	.set enc_out_b1, RB1
+	.set enc_out_b2, RB2
+	.set enc_out_b3, RB3
+
+	ret;
+.size __serpent_enc_blk8,.-__serpent_enc_blk8;
+
+.align 8
+.type   __serpent_dec_blk8,@function;
+__serpent_dec_blk8:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel
+	 * 						ciphertext blocks
+	 * output:
+	 *	RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel plaintext
+	 *						blocks
+	 */
+
+	/* record input vector names for __serpent_dec_blk8 */
+	.set dec_in_a0, RA0
+	.set dec_in_a1, RA1
+	.set dec_in_a2, RA2
+	.set dec_in_a3, RA3
+	.set dec_in_b0, RB0
+	.set dec_in_b1, RB1
+	.set dec_in_b2, RB2
+	.set dec_in_b3, RB3
+
+	pcmpeqd RNOT, RNOT;
+
+	transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1);
+	transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1);
+
+	.set round, 32
+	ROUND_FIRST_INVERSE (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+
+	ROUND_INVERSE (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+	ROUND_INVERSE (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+	ROUND_INVERSE (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+	ROUND_INVERSE (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+	ROUND_INVERSE (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+	ROUND_INVERSE (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+	ROUND_INVERSE (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+	ROUND_INVERSE (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+	ROUND_INVERSE (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+	ROUND_INVERSE (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+	ROUND_INVERSE (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+	ROUND_INVERSE (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+	ROUND_INVERSE (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+	ROUND_INVERSE (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+	ROUND_INVERSE (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+	ROUND_INVERSE (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+	ROUND_INVERSE (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+	ROUND_INVERSE (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+	ROUND_INVERSE (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+	ROUND_INVERSE (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+	ROUND_INVERSE (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+	ROUND_INVERSE (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+	ROUND_INVERSE (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+	ROUND_INVERSE (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+	ROUND_INVERSE (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+	ROUND_INVERSE (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+	ROUND_INVERSE (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+	ROUND_INVERSE (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+	ROUND_INVERSE (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+	ROUND_INVERSE (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+	ROUND_INVERSE (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+
+	transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1);
+	transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1);
+
+	/* record output vector names for __serpent_dec_blk8 */
+	.set dec_out_a0, RA0
+	.set dec_out_a1, RA1
+	.set dec_out_a2, RA2
+	.set dec_out_a3, RA3
+	.set dec_out_b0, RB0
+	.set dec_out_b1, RB1
+	.set dec_out_b2, RB2
+	.set dec_out_b3, RB3
+
+	ret;
+.size __serpent_dec_blk8,.-__serpent_dec_blk8;
+
+.align 8
+.global _gcry_serpent_sse2_ctr_enc
+.type   _gcry_serpent_sse2_ctr_enc,@function;
+_gcry_serpent_sse2_ctr_enc:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (8 blocks)
+	 *	%rdx: src (8 blocks)
+	 *	%rcx: iv (big endian, 128bit)
+	 */
+
+	.set RA0, enc_in_a0
+	.set RA1, enc_in_a1
+	.set RA2, enc_in_a2
+	.set RA3, enc_in_a3
+	.set RB0, enc_in_b0
+	.set RB1, enc_in_b1
+	.set RB2, enc_in_b2
+	.set RB3, enc_in_b3
+
+	/* load IV and byteswap */
+	movdqu (%rcx), RA0;
+	movdqa RA0, RTMP0;
+	pbswap(RTMP0, RTMP1); /* be => le */
+
+	pcmpeqd RNOT, RNOT;
+	psrldq $8, RNOT; /* low: -1, high: 0 */
+	movdqa RNOT, RTMP2;
+	paddq RTMP2, RTMP2; /* low: -2, high: 0 */
+
+	/* construct IVs */
+	movdqa RTMP0, RTMP1;
+	psubq RNOT, RTMP0; /* +1 */
+	movdqa RTMP0, RA1;
+	psubq RTMP2, RTMP1; /* +2 */
+	movdqa RTMP1, RA2;
+	psubq RTMP2, RTMP0; /* +3 */
+	movdqa RTMP0, RA3;
+	psubq RTMP2, RTMP1; /* +4 */
+	movdqa RTMP1, RB0;
+	psubq RTMP2, RTMP0; /* +5 */
+	movdqa RTMP0, RB1;
+	psubq RTMP2, RTMP1; /* +6 */
+	movdqa RTMP1, RB2;
+	psubq RTMP2, RTMP0; /* +7 */
+	movdqa RTMP0, RB3;
+	psubq RTMP2, RTMP1; /* +8 */
+
+	/* check need for handling 64-bit overflow and carry */
+	cmpl $0xffffffff, 8(%rcx);
+	jne .Lno_ctr_carry;
+
+	movl 12(%rcx), %eax;
+	bswapl %eax;
+	cmpl $-8, %eax;
+	jb .Lno_ctr_carry;
+	pslldq $8, RNOT; /* low: 0, high: -1 */
+	je .Lcarry_RTMP0;
+
+	cmpl $-6, %eax;
+	jb .Lcarry_RB3;
+	je .Lcarry_RB2;
+
+	cmpl $-4, %eax;
+	jb .Lcarry_RB1;
+	je .Lcarry_RB0;
+
+	cmpl $-2, %eax;
+	jb .Lcarry_RA3;
+	je .Lcarry_RA2;
+
+	psubq RNOT, RA1;
+.Lcarry_RA2:
+	psubq RNOT, RA2;
+.Lcarry_RA3:
+	psubq RNOT, RA3;
+.Lcarry_RB0:
+	psubq RNOT, RB0;
+.Lcarry_RB1:
+	psubq RNOT, RB1;
+.Lcarry_RB2:
+	psubq RNOT, RB2;
+.Lcarry_RB3:
+	psubq RNOT, RB3;
+.Lcarry_RTMP0:
+	psubq RNOT, RTMP1;
+
+.Lno_ctr_carry:
+	/* le => be */
+	pbswap(RA1, RTMP0);
+	pbswap(RA2, RTMP0);
+	pbswap(RA3, RTMP0);
+	pbswap(RB0, RTMP0);
+	pbswap(RB1, RTMP0);
+	pbswap(RB2, RTMP0);
+	pbswap(RB3, RTMP0);
+	pbswap(RTMP1, RTMP0);
+	/* store new IV */
+	movdqu RTMP1, (%rcx);
+
+	call __serpent_enc_blk8;
+
+	.set RA0, enc_out_a0
+	.set RA1, enc_out_a1
+	.set RA2, enc_out_a2
+	.set RA3, enc_out_a3
+	.set RB0, enc_out_b0
+	.set RB1, enc_out_b1
+	.set RB2, enc_out_b2
+	.set RB3, enc_out_b3
+
+	pxor_u((0 * 16)(%rdx), RA0, RTMP0);
+	pxor_u((1 * 16)(%rdx), RA1, RTMP0);
+	pxor_u((2 * 16)(%rdx), RA2, RTMP0);
+	pxor_u((3 * 16)(%rdx), RA3, RTMP0);
+	pxor_u((4 * 16)(%rdx), RB0, RTMP0);
+	pxor_u((5 * 16)(%rdx), RB1, RTMP0);
+	pxor_u((6 * 16)(%rdx), RB2, RTMP0);
+	pxor_u((7 * 16)(%rdx), RB3, RTMP0);
+
+	movdqu RA0, (0 * 16)(%rsi);
+	movdqu RA1, (1 * 16)(%rsi);
+	movdqu RA2, (2 * 16)(%rsi);
+	movdqu RA3, (3 * 16)(%rsi);
+	movdqu RB0, (4 * 16)(%rsi);
+	movdqu RB1, (5 * 16)(%rsi);
+	movdqu RB2, (6 * 16)(%rsi);
+	movdqu RB3, (7 * 16)(%rsi);
+
+	ret
+.size _gcry_serpent_sse2_ctr_enc,.-_gcry_serpent_sse2_ctr_enc;
+
+.align 8
+.global _gcry_serpent_sse2_cbc_dec
+.type   _gcry_serpent_sse2_cbc_dec,@function;
+_gcry_serpent_sse2_cbc_dec:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (8 blocks)
+	 *	%rdx: src (8 blocks)
+	 *	%rcx: iv
+	 */
+
+	.set RA0, dec_in_a0
+	.set RA1, dec_in_a1
+	.set RA2, dec_in_a2
+	.set RA3, dec_in_a3
+	.set RB0, dec_in_b0
+	.set RB1, dec_in_b1
+	.set RB2, dec_in_b2
+	.set RB3, dec_in_b3
+
+	movdqu (0 * 16)(%rdx), RA0;
+	movdqu (1 * 16)(%rdx), RA1;
+	movdqu (2 * 16)(%rdx), RA2;
+	movdqu (3 * 16)(%rdx), RA3;
+	movdqu (4 * 16)(%rdx), RB0;
+	movdqu (5 * 16)(%rdx), RB1;
+	movdqu (6 * 16)(%rdx), RB2;
+	movdqu (7 * 16)(%rdx), RB3;
+
+	call __serpent_dec_blk8;
+
+	.set RA0, dec_out_a0
+	.set RA1, dec_out_a1
+	.set RA2, dec_out_a2
+	.set RA3, dec_out_a3
+	.set RB0, dec_out_b0
+	.set RB1, dec_out_b1
+	.set RB2, dec_out_b2
+	.set RB3, dec_out_b3
+
+	movdqu (7 * 16)(%rdx), RNOT;
+	pxor_u((%rcx), RA0, RTMP0);
+	pxor_u((0 * 16)(%rdx), RA1, RTMP0);
+	pxor_u((1 * 16)(%rdx), RA2, RTMP0);
+	pxor_u((2 * 16)(%rdx), RA3, RTMP0);
+	pxor_u((3 * 16)(%rdx), RB0, RTMP0);
+	pxor_u((4 * 16)(%rdx), RB1, RTMP0);
+	pxor_u((5 * 16)(%rdx), RB2, RTMP0);
+	pxor_u((6 * 16)(%rdx), RB3, RTMP0);
+	movdqu RNOT, (%rcx); /* store new IV */
+
+	movdqu RA0, (0 * 16)(%rsi);
+	movdqu RA1, (1 * 16)(%rsi);
+	movdqu RA2, (2 * 16)(%rsi);
+	movdqu RA3, (3 * 16)(%rsi);
+	movdqu RB0, (4 * 16)(%rsi);
+	movdqu RB1, (5 * 16)(%rsi);
+	movdqu RB2, (6 * 16)(%rsi);
+	movdqu RB3, (7 * 16)(%rsi);
+
+	ret
+.size _gcry_serpent_sse2_cbc_dec,.-_gcry_serpent_sse2_cbc_dec;
+
+#endif /*defined(USE_SERPENT)*/
+#endif /*__x86_64*/
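
A scalar model of the counter setup in _gcry_serpent_sse2_ctr_enc above may
help (illustrative only, not part of the patch): the IV is treated as a
128-bit big-endian integer; the current IV is the counter for the first
block, the following seven blocks use iv+1 .. iv+7, and iv+8 is written back
as the new IV. The assembly adds on the low 64-bit half (psubq with the
constants -1/-2) and patches the high half only when the cmpl/jb/je chain
detects that the low half wraps.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Add n to a 128-bit big-endian counter, with carry into the high half. */
static void
ctr_add (unsigned char ctr[16], uint64_t n)
{
  uint64_t hi = 0, lo = 0;
  int i;
  for (i = 0; i < 8; i++)
    hi = (hi << 8) | ctr[i];
  for (i = 8; i < 16; i++)
    lo = (lo << 8) | ctr[i];
  hi += (lo + n < lo);            /* carry out of the low 64 bits */
  lo += n;
  for (i = 15; i >= 8; i--) { ctr[i] = lo & 0xff; lo >>= 8; }
  for (i = 7; i >= 0; i--)  { ctr[i] = hi & 0xff; hi >>= 8; }
}

int
main (void)
{
  unsigned char iv[16], counters[8][16];
  int k;

  memset (iv, 0xff, sizeof iv);   /* worst case: every addition carries */
  for (k = 0; k < 8; k++)
    {
      memcpy (counters[k], iv, 16);
      ctr_add (counters[k], k);   /* counter for block k is iv + k */
    }
  ctr_add (iv, 8);                /* new IV, as stored back through %rcx */
  printf ("new IV ends in %02x\n", iv[15]);
  return 0;
}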
diff --git a/cipher/serpent.c b/cipher/serpent.c
index 72840cf..7b82b48 100644
--- a/cipher/serpent.c
+++ b/cipher/serpent.c
@@ -28,6 +28,15 @@
 #include "g10lib.h"
 #include "cipher.h"
 #include "bithelp.h"
+#include "bufhelp.h"
+#include "cipher-selftest.h"
+
+
+/* USE_SSE2 indicates whether to compile with AMD64 SSE2 code. */
+#undef USE_SSE2
+#if defined(__x86_64__)
+# define USE_SSE2 1
+#endif
 
 /* Number of rounds per Serpent encrypt/decrypt operation.  */
 #define ROUNDS 32
@@ -52,6 +61,21 @@ typedef struct serpent_context
 } serpent_context_t;
 
 
+#ifdef USE_SSE2
+/* Assembler implementations of Serpent using SSE2.  Process 8 blocks in
+   parallel.
+ */
+extern void _gcry_serpent_sse2_ctr_enc(serpent_context_t *ctx,
+				       unsigned char *out,
+				       const unsigned char *in,
+				       unsigned char *ctr);
+
+extern void _gcry_serpent_sse2_cbc_dec(serpent_context_t *ctx,
+				       unsigned char *out,
+				       const unsigned char *in,
+				       unsigned char *iv);
+#endif
+
 /* A prototype.  */
 static const char *serpent_test (void);
 
@@ -191,7 +215,7 @@ static const char *serpent_test (void);
     r4 &= r0; r1 ^= r3; \
     r4 ^= r2; r1 |= r0; \
     r1 ^= r2; r0 ^= r3; \
-    r2  = r1; r1 |= r3; \
+    r2 =  r1; r1 |= r3; \
     r1 ^= r0; \
     \
     w = r1; x = r2; y = r3; z = r4; \
@@ -587,10 +611,10 @@ serpent_setkey (void *ctx,
   if (! serpent_init_done)
     {
       /* Execute a self-test the first time Serpent is used.  */
+      serpent_init_done = 1;
       serpent_test_ret = serpent_test ();
       if (serpent_test_ret)
 	log_error ("Serpent test failure: %s\n", serpent_test_ret);
-      serpent_init_done = 1;
     }
 
   if (serpent_test_ret)
@@ -740,6 +764,190 @@ serpent_decrypt (void *ctx, byte *buffer_out, const byte *buffer_in)
 
 

 
+/* Bulk encryption of complete blocks in CTR mode.  This function is only
+   intended for the bulk encryption feature of cipher.c.  CTR is expected to be
+   of size sizeof(serpent_block_t). */
+void
+_gcry_serpent_ctr_enc(void *context, unsigned char *ctr,
+                      void *outbuf_arg, const void *inbuf_arg,
+                      unsigned int nblocks)
+{
+  serpent_context_t *ctx = context;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+  unsigned char tmpbuf[sizeof(serpent_block_t)];
+  int burn_stack_depth = 2 * sizeof (serpent_block_t);
+  int i;
+
+#ifdef USE_SSE2
+  {
+    int did_use_sse2 = 0;
+
+    /* Process data in 8 block chunks. */
+    while (nblocks >= 8)
+      {
+        _gcry_serpent_sse2_ctr_enc(ctx, outbuf, inbuf, ctr);
+
+        nblocks -= 8;
+        outbuf += 8 * sizeof(serpent_block_t);
+        inbuf  += 8 * sizeof(serpent_block_t);
+        did_use_sse2 = 1;
+      }
+
+    if (did_use_sse2)
+      {
+        /* clear SSE2 registers used by serpent-sse2 */
+        asm volatile (
+          "pxor %%xmm0, %%xmm0;\n"
+          "pxor %%xmm1, %%xmm1;\n"
+          "pxor %%xmm2, %%xmm2;\n"
+          "pxor %%xmm3, %%xmm3;\n"
+          "pxor %%xmm4, %%xmm4;\n"
+          "pxor %%xmm5, %%xmm5;\n"
+          "pxor %%xmm6, %%xmm6;\n"
+          "pxor %%xmm7, %%xmm7;\n"
+          "pxor %%xmm10, %%xmm10;\n"
+          "pxor %%xmm11, %%xmm11;\n"
+          "pxor %%xmm12, %%xmm12;\n"
+          "pxor %%xmm13, %%xmm13;\n"
+          :::);
+
+        /* serpent-sse2 assembly code does not use the stack */
+        if (nblocks == 0)
+          burn_stack_depth = 0;
+      }
+
+    /* Use generic code to handle smaller chunks... */
+    /* TODO: use caching instead? */
+  }
+#endif
+
+  for ( ;nblocks; nblocks-- )
+    {
+      /* Encrypt the counter. */
+      serpent_encrypt_internal(ctx, ctr, tmpbuf);
+      /* XOR the input with the encrypted counter and store in output.  */
+      buf_xor(outbuf, tmpbuf, inbuf, sizeof(serpent_block_t));
+      outbuf += sizeof(serpent_block_t);
+      inbuf  += sizeof(serpent_block_t);
+      /* Increment the counter.  */
+      for (i = sizeof(serpent_block_t); i > 0; i--)
+        {
+          ctr[i-1]++;
+          if (ctr[i-1])
+            break;
+        }
+    }
+
+  wipememory(tmpbuf, sizeof(tmpbuf));
+  _gcry_burn_stack(burn_stack_depth);
+}
+
+/* Bulk decryption of complete blocks in CBC mode.  This function is only
+   intended for the bulk decryption feature of cipher.c. */
+void
+_gcry_serpent_cbc_dec(void *context, unsigned char *iv,
+                       void *outbuf_arg, const void *inbuf_arg,
+                       unsigned int nblocks)
+{
+  serpent_context_t *ctx = context;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+  unsigned char savebuf[sizeof(serpent_block_t)];
+  int burn_stack_depth = 2 * sizeof (serpent_block_t);
+
+#ifdef USE_SSE2
+  {
+    int did_use_sse2 = 0;
+
+    /* Process data in 8 block chunks. */
+    while (nblocks >= 8)
+      {
+        _gcry_serpent_sse2_cbc_dec(ctx, outbuf, inbuf, iv);
+
+        nblocks -= 8;
+        outbuf += 8 * sizeof(serpent_block_t);
+        inbuf  += 8 * sizeof(serpent_block_t);
+        did_use_sse2 = 1;
+      }
+
+    if (did_use_sse2)
+      {
+        /* clear SSE2 registers used by serpent-sse2 */
+        asm volatile (
+          "pxor %%xmm0, %%xmm0;\n"
+          "pxor %%xmm1, %%xmm1;\n"
+          "pxor %%xmm2, %%xmm2;\n"
+          "pxor %%xmm3, %%xmm3;\n"
+          "pxor %%xmm4, %%xmm4;\n"
+          "pxor %%xmm5, %%xmm5;\n"
+          "pxor %%xmm6, %%xmm6;\n"
+          "pxor %%xmm7, %%xmm7;\n"
+          "pxor %%xmm10, %%xmm10;\n"
+          "pxor %%xmm11, %%xmm11;\n"
+          "pxor %%xmm12, %%xmm12;\n"
+          "pxor %%xmm13, %%xmm13;\n"
+          :::);
+
+        /* serpent-sse2 assembly code does not use the stack */
+        if (nblocks == 0)
+          burn_stack_depth = 0;
+      }
+
+    /* Use generic code to handle smaller chunks... */
+  }
+#endif
+
+  for ( ;nblocks; nblocks-- )
+    {
+      /* We need to save INBUF away because it may be identical to
+         OUTBUF.  */
+      memcpy(savebuf, inbuf, sizeof(serpent_block_t));
+
+      serpent_decrypt_internal (ctx, inbuf, outbuf);
+
+      buf_xor(outbuf, outbuf, iv, sizeof(serpent_block_t));
+      memcpy(iv, savebuf, sizeof(serpent_block_t));
+      inbuf += sizeof(serpent_block_t);
+      outbuf += sizeof(serpent_block_t);
+    }
+
+  wipememory(savebuf, sizeof(savebuf));
+  _gcry_burn_stack(burn_stack_depth);
+}
+
+

+
+/* Run the self-tests for SERPENT-CTR-128, tests IV increment of bulk CTR
+   encryption.  Returns NULL on success. */
+static const char*
+selftest_ctr_128 (void)
+{
+  const int nblocks = 8+1;
+  const int blocksize = sizeof(serpent_block_t);
+  const int context_size = sizeof(serpent_context_t);
+
+  return _gcry_selftest_helper_ctr_128("SERPENT", &serpent_setkey,
+           &serpent_encrypt, &_gcry_serpent_ctr_enc, nblocks, blocksize,
+	   context_size);
+}
+
+
+/* Run the self-tests for SERPENT-CBC-128, tests bulk CBC decryption.
+   Returns NULL on success. */
+static const char*
+selftest_cbc_128 (void)
+{
+  const int nblocks = 8+2;
+  const int blocksize = sizeof(serpent_block_t);
+  const int context_size = sizeof(serpent_context_t);
+
+  return _gcry_selftest_helper_cbc_128("SERPENT", &serpent_setkey,
+           &serpent_encrypt, &_gcry_serpent_cbc_dec, nblocks, blocksize,
+	   context_size);
+}
+
+
 /* Serpent test.  */
 
 static const char *
@@ -748,6 +956,7 @@ serpent_test (void)
   serpent_context_t context;
   unsigned char scratch[16];
   unsigned int i;
+  const char *r;
 
   static struct test
   {
@@ -819,6 +1028,12 @@ serpent_test (void)
 	}
     }
 
+  if ( (r = selftest_ctr_128 ()) )
+    return r;
+
+  if ( (r = selftest_cbc_128 ()) )
+    return r;
+
   return NULL;
 }
 
diff --git a/configure.ac b/configure.ac
index f14e28a..3fec8bc 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1219,6 +1219,13 @@ LIST_MEMBER(serpent, $enabled_ciphers)
 if test "$found" = "1" ; then
    GCRYPT_CIPHERS="$GCRYPT_CIPHERS serpent.lo"
    AC_DEFINE(USE_SERPENT, 1, [Defined if this module should be included])
+
+   case "${host}" in
+      x86_64-*-*)
+         # Build with the SSE2 implementation
+         GCRYPT_CIPHERS="$GCRYPT_CIPHERS serpent-sse2-amd64.lo"
+      ;;
+   esac
 fi
 
 LIST_MEMBER(rfc2268, $enabled_ciphers)
diff --git a/src/cipher.h b/src/cipher.h
index 17fec6c..4e68487 100644
--- a/src/cipher.h
+++ b/src/cipher.h
@@ -103,6 +103,13 @@ void _gcry_camellia_cbc_dec (void *context, unsigned char *iv,
                              void *outbuf_arg, const void *inbuf_arg,
                              unsigned int nblocks);
 
+/*-- serpent.c --*/
+void _gcry_serpent_ctr_enc (void *context, unsigned char *ctr,
+                            void *outbuf_arg, const void *inbuf_arg,
+                            unsigned int nblocks);
+void _gcry_serpent_cbc_dec (void *context, unsigned char *iv,
+                            void *outbuf_arg, const void *inbuf_arg,
+                            unsigned int nblocks);
 
 /*-- dsa.c --*/
 void _gcry_register_pk_dsa_progress (gcry_handler_progress_t cbc, void *cb_data);
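
For orientation (not part of the commit): the bulk entries registered in
cipher.c above are reached through the normal public API, so a caller along
these lines would exercise the 8-block CTR path (error handling elided):

#include <gcrypt.h>
#include <string.h>

int
main (void)
{
  gcry_cipher_hd_t hd;
  static const unsigned char key[16] = "0123456789abcdef";
  unsigned char ctr[16] = { 0 };
  unsigned char buf[16 * 16];       /* 16 blocks: two 8-block bulk runs */

  memset (buf, 0, sizeof buf);
  gcry_check_version (NULL);
  gcry_cipher_open (&hd, GCRY_CIPHER_SERPENT128, GCRY_CIPHER_MODE_CTR, 0);
  gcry_cipher_setkey (hd, key, sizeof key);
  gcry_cipher_setctr (hd, ctr, sizeof ctr);
  /* In-place encryption; with >= 8 blocks cipher.c can dispatch to the
     registered bulk.ctr_enc, i.e. _gcry_serpent_ctr_enc.  */
  gcry_cipher_encrypt (hd, buf, sizeof buf, NULL, 0);
  gcry_cipher_close (hd);
  return 0;
}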

commit c85501af8222913f0a1e20e77fceb88e93417925
Author: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Date:   Thu May 23 11:04:13 2013 +0300

    Serpent: faster S-box implementation
    
    * cipher/serpent.c (SBOX0, SBOX1, SBOX2, SBOX3, SBOX4, SBOX5, SBOX6)
    (SBOX7, SBOX0_INVERSE, SBOX1_INVERSE, SBOX2_INVERSE, SBOX3_INVERSE)
    (SBOX4_INVERSE, SBOX5_INVERSE, SBOX6_INVERSE, SBOX7_INVERSE): Replace
    with new definitions.
    --
    
    These new S-box definitions are from paper:
     D. A. Osvik, “Speeding up Serpent,” in Third AES Candidate Conference,
     (New York, New York, USA), p. 317–329, National Institute of Standards and
     Technology, 2000. Available at http://www.ii.uib.no/~osvik/pub/aes3.ps.gz
    
    Although these were optimized for two-operand instructions on i386 and for
    old Pentium-1 processors, they are slightly faster on current i386 and
    x86-64 processors. On ARM, the performance of these S-boxes is about the
    same as with the old S-boxes. (Equivalence of the new and old definitions
    is spot-checked in the standalone sketch after this log message.)
    
    new vs old speed ratios (AMD K10, x86-64):
                     ECB/Stream         CBC             CFB             OFB             CTR
                  --------------- --------------- --------------- --------------- ---------------
     SERPENT128     1.06x   1.02x   1.06x   1.02x   1.06x   1.06x   1.06x   1.05x   1.07x   1.07x
    
    new vs old speed ratios (Intel Atom, i486):
                     ECB/Stream         CBC             CFB             OFB             CTR
                  --------------- --------------- --------------- --------------- ---------------
     SERPENT128     1.12x   1.15x   1.12x   1.15x   1.13x   1.11x   1.12x   1.12x   1.12x   1.13x
    
    new vs old speed ratios (ARM Cortex A8):
                     ECB/Stream         CBC             CFB             OFB             CTR
                  --------------- --------------- --------------- --------------- ---------------
     SERPENT128     1.04x   1.02x   1.02x   0.99x   1.02x   1.02x   1.03x   1.03x   1.01x   1.01x
    
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
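
Since the new definitions are drop-in replacements, their equivalence to the
old ones can be spot-checked outside the library. A standalone sketch for
SBOX0 (the _OLD/_NEW names are mine; both macro bodies are copied verbatim
from the diff below):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef uint32_t u32;

/* Old SBOX0, as removed by this commit. */
#define SBOX0_OLD(a, b, c, d, w, x, y, z) \
  { \
    u32 t02, t03, t05, t06, t07, t08, t09; \
    u32 t11, t12, t13, t14, t15, t17, t01; \
    t01 = b   ^ c  ; \
    t02 = a   | d  ; \
    t03 = a   ^ b  ; \
    z   = t02 ^ t01; \
    t05 = c   | z  ; \
    t06 = a   ^ d  ; \
    t07 = b   | c  ; \
    t08 = d   & t05; \
    t09 = t03 & t07; \
    y   = t09 ^ t08; \
    t11 = t09 & y  ; \
    t12 = c   ^ d  ; \
    t13 = t07 ^ t11; \
    t14 = b   & t06; \
    t15 = t06 ^ t13; \
    w   =     ~ t15; \
    t17 = w   ^ t14; \
    x   = t12 ^ t17; \
  }

/* New SBOX0, as added by this commit (note: clobbers r0..r3). */
#define SBOX0_NEW(r0, r1, r2, r3, w, x, y, z) \
  { \
    u32 r4; \
    \
    r3 ^= r0; r4 =  r1; \
    r1 &= r3; r4 ^= r2; \
    r1 ^= r0; r0 |= r3; \
    r0 ^= r4; r4 ^= r3; \
    r3 ^= r2; r2 |= r1; \
    r2 ^= r4; r4 = ~r4; \
    r4 |= r1; r1 ^= r3; \
    r1 ^= r4; r3 |= r0; \
    r1 ^= r3; r4 ^= r3; \
    \
    w = r1; x = r4; y = r2; z = r0; \
  }

int
main (void)
{
  unsigned i;

  srand (42);
  for (i = 0; i < 1000000; i++)
    {
      u32 a = (u32) rand () << 16 ^ (u32) rand ();
      u32 b = (u32) rand () << 16 ^ (u32) rand ();
      u32 c = (u32) rand () << 16 ^ (u32) rand ();
      u32 d = (u32) rand () << 16 ^ (u32) rand ();
      u32 r0 = a, r1 = b, r2 = c, r3 = d;
      u32 w1, x1, y1, z1, w2, x2, y2, z2;

      SBOX0_OLD (a, b, c, d, w1, x1, y1, z1);
      SBOX0_NEW (r0, r1, r2, r3, w2, x2, y2, z2);
      if (w1 != w2 || x1 != x2 || y1 != y2 || z1 != z2)
        {
          printf ("SBOX0 mismatch\n");
          return 1;
        }
    }
  printf ("SBOX0: old and new definitions agree\n");
  return 0;
}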

diff --git a/cipher/serpent.c b/cipher/serpent.c
index ea14c7e..72840cf 100644
--- a/cipher/serpent.c
+++ b/cipher/serpent.c
@@ -61,403 +61,303 @@ static const char *serpent_test (void);
    | (((x) & 0xff000000) >> 24) | (((x) & 0x00ff0000) >>  8) \
    | (((x) & 0x0000ff00) <<  8) | (((x) & 0x000000ff) << 24))
 
-/* These are the S-Boxes of Serpent.  They are copied from Serpents
-   reference implementation (the optimized one, contained in
-   `floppy2') and are therefore:
-
-     Copyright (C) 1998 Ross Anderson, Eli Biham, Lars Knudsen.
-
-  To quote the Serpent homepage
-  (http://www.cl.cam.ac.uk/~rja14/serpent.html):
-
-  "Serpent is now completely in the public domain, and we impose no
-   restrictions on its use.  This was announced on the 21st August at
-   the First AES Candidate Conference. The optimised implementations
-   in the submission package are now under the GNU PUBLIC LICENSE
-   (GPL), although some comments in the code still say otherwise. You
-   are welcome to use Serpent for any application."  */
+/*
+ * These are the S-Boxes of Serpent from following research paper.
+ *
+ *  D. A. Osvik, “Speeding up Serpent,” in Third AES Candidate Conference,
+ *   (New York, New York, USA), p. 317–329, National Institute of Standards and
+ *   Technology, 2000.
+ *
+ * Paper is also available at: http://www.ii.uib.no/~osvik/pub/aes3.pdf
+ *
+ */
 
-#define SBOX0(a, b, c, d, w, x, y, z) \
+#define SBOX0(r0, r1, r2, r3, w, x, y, z) \
   { \
-    u32 t02, t03, t05, t06, t07, t08, t09; \
-    u32 t11, t12, t13, t14, t15, t17, t01; \
-    t01 = b   ^ c  ; \
-    t02 = a   | d  ; \
-    t03 = a   ^ b  ; \
-    z   = t02 ^ t01; \
-    t05 = c   | z  ; \
-    t06 = a   ^ d  ; \
-    t07 = b   | c  ; \
-    t08 = d   & t05; \
-    t09 = t03 & t07; \
-    y   = t09 ^ t08; \
-    t11 = t09 & y  ; \
-    t12 = c   ^ d  ; \
-    t13 = t07 ^ t11; \
-    t14 = b   & t06; \
-    t15 = t06 ^ t13; \
-    w   =     ~ t15; \
-    t17 = w   ^ t14; \
-    x   = t12 ^ t17; \
+    u32 r4; \
+    \
+    r3 ^= r0; r4 =  r1; \
+    r1 &= r3; r4 ^= r2; \
+    r1 ^= r0; r0 |= r3; \
+    r0 ^= r4; r4 ^= r3; \
+    r3 ^= r2; r2 |= r1; \
+    r2 ^= r4; r4 = ~r4; \
+    r4 |= r1; r1 ^= r3; \
+    r1 ^= r4; r3 |= r0; \
+    r1 ^= r3; r4 ^= r3; \
+    \
+    w = r1; x = r4; y = r2; z = r0; \
   }
 
-#define SBOX0_INVERSE(a, b, c, d, w, x, y, z) \
+#define SBOX0_INVERSE(r0, r1, r2, r3, w, x, y, z) \
   { \
-    u32 t02, t03, t04, t05, t06, t08, t09, t10; \
-    u32 t12, t13, t14, t15, t17, t18, t01; \
-    t01 = c   ^ d  ; \
-    t02 = a   | b  ; \
-    t03 = b   | c  ; \
-    t04 = c   & t01; \
-    t05 = t02 ^ t01; \
-    t06 = a   | t04; \
-    y   =     ~ t05; \
-    t08 = b   ^ d  ; \
-    t09 = t03 & t08; \
-    t10 = d   | y  ; \
-    x   = t09 ^ t06; \
-    t12 = a   | t05; \
-    t13 = x   ^ t12; \
-    t14 = t03 ^ t10; \
-    t15 = a   ^ c  ; \
-    z   = t14 ^ t13; \
-    t17 = t05 & t13; \
-    t18 = t14 | t17; \
-    w   = t15 ^ t18; \
+    u32 r4; \
+    \
+    r2 = ~r2; r4 =  r1; \
+    r1 |= r0; r4 = ~r4; \
+    r1 ^= r2; r2 |= r4; \
+    r1 ^= r3; r0 ^= r4; \
+    r2 ^= r0; r0 &= r3; \
+    r4 ^= r0; r0 |= r1; \
+    r0 ^= r2; r3 ^= r4; \
+    r2 ^= r1; r3 ^= r0; \
+    r3 ^= r1; \
+    r2 &= r3; \
+    r4 ^= r2; \
+    \
+    w = r0; x = r4; y = r1; z = r3; \
   }
 
-#define SBOX1(a, b, c, d, w, x, y, z) \
+#define SBOX1(r0, r1, r2, r3, w, x, y, z) \
   { \
-    u32 t02, t03, t04, t05, t06, t07, t08; \
-    u32 t10, t11, t12, t13, t16, t17, t01; \
-    t01 = a   | d  ; \
-    t02 = c   ^ d  ; \
-    t03 =     ~ b  ; \
-    t04 = a   ^ c  ; \
-    t05 = a   | t03; \
-    t06 = d   & t04; \
-    t07 = t01 & t02; \
-    t08 = b   | t06; \
-    y   = t02 ^ t05; \
-    t10 = t07 ^ t08; \
-    t11 = t01 ^ t10; \
-    t12 = y   ^ t11; \
-    t13 = b   & d  ; \
-    z   =     ~ t10; \
-    x   = t13 ^ t12; \
-    t16 = t10 | x  ; \
-    t17 = t05 & t16; \
-    w   = c   ^ t17; \
+    u32 r4; \
+    \
+    r0 = ~r0; r2 = ~r2; \
+    r4 =  r0; r0 &= r1; \
+    r2 ^= r0; r0 |= r3; \
+    r3 ^= r2; r1 ^= r0; \
+    r0 ^= r4; r4 |= r1; \
+    r1 ^= r3; r2 |= r0; \
+    r2 &= r4; r0 ^= r1; \
+    r1 &= r2; \
+    r1 ^= r0; r0 &= r2; \
+    r0 ^= r4; \
+    \
+    w = r2; x = r0; y = r3; z = r1; \
   }
 
-#define SBOX1_INVERSE(a, b, c, d, w, x, y, z) \
+#define SBOX1_INVERSE(r0, r1, r2, r3, w, x, y, z) \
   { \
-    u32 t02, t03, t04, t05, t06, t07, t08; \
-    u32 t09, t10, t11, t14, t15, t17, t01; \
-    t01 = a   ^ b  ; \
-    t02 = b   | d  ; \
-    t03 = a   & c  ; \
-    t04 = c   ^ t02; \
-    t05 = a   | t04; \
-    t06 = t01 & t05; \
-    t07 = d   | t03; \
-    t08 = b   ^ t06; \
-    t09 = t07 ^ t06; \
-    t10 = t04 | t03; \
-    t11 = d   & t08; \
-    y   =     ~ t09; \
-    x   = t10 ^ t11; \
-    t14 = a   | y  ; \
-    t15 = t06 ^ x  ; \
-    z   = t01 ^ t04; \
-    t17 = c   ^ t15; \
-    w   = t14 ^ t17; \
+    u32 r4; \
+    \
+    r4 =  r1; r1 ^= r3; \
+    r3 &= r1; r4 ^= r2; \
+    r3 ^= r0; r0 |= r1; \
+    r2 ^= r3; r0 ^= r4; \
+    r0 |= r2; r1 ^= r3; \
+    r0 ^= r1; r1 |= r3; \
+    r1 ^= r0; r4 = ~r4; \
+    r4 ^= r1; r1 |= r0; \
+    r1 ^= r0; \
+    r1 |= r4; \
+    r3 ^= r1; \
+    \
+    w = r4; x = r0; y = r3; z = r2; \
   }
 
-#define SBOX2(a, b, c, d, w, x, y, z) \
+#define SBOX2(r0, r1, r2, r3, w, x, y, z) \
   { \
-    u32 t02, t03, t05, t06, t07, t08; \
-    u32 t09, t10, t12, t13, t14, t01; \
-    t01 = a   | c  ; \
-    t02 = a   ^ b  ; \
-    t03 = d   ^ t01; \
-    w   = t02 ^ t03; \
-    t05 = c   ^ w  ; \
-    t06 = b   ^ t05; \
-    t07 = b   | t05; \
-    t08 = t01 & t06; \
-    t09 = t03 ^ t07; \
-    t10 = t02 | t09; \
-    x   = t10 ^ t08; \
-    t12 = a   | d  ; \
-    t13 = t09 ^ x  ; \
-    t14 = b   ^ t13; \
-    z   =     ~ t09; \
-    y   = t12 ^ t14; \
+    u32 r4; \
+    \
+    r4 =  r0; r0 &= r2; \
+    r0 ^= r3; r2 ^= r1; \
+    r2 ^= r0; r3 |= r4; \
+    r3 ^= r1; r4 ^= r2; \
+    r1 =  r3; r3 |= r4; \
+    r3 ^= r0; r0 &= r1; \
+    r4 ^= r0; r1 ^= r3; \
+    r1 ^= r4; r4 = ~r4; \
+    \
+    w = r2; x = r3; y = r1; z = r4; \
   }
 
-#define SBOX2_INVERSE(a, b, c, d, w, x, y, z) \
+#define SBOX2_INVERSE(r0, r1, r2, r3, w, x, y, z) \
   { \
-    u32 t02, t03, t04, t06, t07, t08, t09; \
-    u32 t10, t11, t12, t15, t16, t17, t01; \
-    t01 = a   ^ d  ; \
-    t02 = c   ^ d  ; \
-    t03 = a   & c  ; \
-    t04 = b   | t02; \
-    w   = t01 ^ t04; \
-    t06 = a   | c  ; \
-    t07 = d   | w  ; \
-    t08 =     ~ d  ; \
-    t09 = b   & t06; \
-    t10 = t08 | t03; \
-    t11 = b   & t07; \
-    t12 = t06 & t02; \
-    z   = t09 ^ t10; \
-    x   = t12 ^ t11; \
-    t15 = c   & z  ; \
-    t16 = w   ^ x  ; \
-    t17 = t10 ^ t15; \
-    y   = t16 ^ t17; \
+    u32 r4; \
+    \
+    r2 ^= r3; r3 ^= r0; \
+    r4 =  r3; r3 &= r2; \
+    r3 ^= r1; r1 |= r2; \
+    r1 ^= r4; r4 &= r3; \
+    r2 ^= r3; r4 &= r0; \
+    r4 ^= r2; r2 &= r1; \
+    r2 |= r0; r3 = ~r3; \
+    r2 ^= r3; r0 ^= r3; \
+    r0 &= r1; r3 ^= r4; \
+    r3 ^= r0; \
+    \
+    w = r1; x = r4; y = r2; z = r3; \
   }
 
-#define SBOX3(a, b, c, d, w, x, y, z) \
+#define SBOX3(r0, r1, r2, r3, w, x, y, z) \
   { \
-    u32 t02, t03, t04, t05, t06, t07, t08; \
-    u32 t09, t10, t11, t13, t14, t15, t01; \
-    t01 = a   ^ c  ; \
-    t02 = a   | d  ; \
-    t03 = a   & d  ; \
-    t04 = t01 & t02; \
-    t05 = b   | t03; \
-    t06 = a   & b  ; \
-    t07 = d   ^ t04; \
-    t08 = c   | t06; \
-    t09 = b   ^ t07; \
-    t10 = d   & t05; \
-    t11 = t02 ^ t10; \
-    z   = t08 ^ t09; \
-    t13 = d   | z  ; \
-    t14 = a   | t07; \
-    t15 = b   & t13; \
-    y   = t08 ^ t11; \
-    w   = t14 ^ t15; \
-    x   = t05 ^ t04; \
+    u32 r4; \
+    \
+    r4 =  r0; r0 |= r3; \
+    r3 ^= r1; r1 &= r4; \
+    r4 ^= r2; r2 ^= r3; \
+    r3 &= r0; r4 |= r1; \
+    r3 ^= r4; r0 ^= r1; \
+    r4 &= r0; r1 ^= r3; \
+    r4 ^= r2; r1 |= r0; \
+    r1 ^= r2; r0 ^= r3; \
+    r2  = r1; r1 |= r3; \
+    r1 ^= r0; \
+    \
+    w = r1; x = r2; y = r3; z = r4; \
   }
 
-#define SBOX3_INVERSE(a, b, c, d, w, x, y, z) \
+#define SBOX3_INVERSE(r0, r1, r2, r3, w, x, y, z) \
   { \
-    u32 t02, t03, t04, t05, t06, t07, t09; \
-    u32 t11, t12, t13, t14, t16, t01; \
-    t01 = c   | d  ; \
-    t02 = a   | d  ; \
-    t03 = c   ^ t02; \
-    t04 = b   ^ t02; \
-    t05 = a   ^ d  ; \
-    t06 = t04 & t03; \
-    t07 = b   & t01; \
-    y   = t05 ^ t06; \
-    t09 = a   ^ t03; \
-    w   = t07 ^ t03; \
-    t11 = w   | t05; \
-    t12 = t09 & t11; \
-    t13 = a   & y  ; \
-    t14 = t01 ^ t05; \
-    x   = b   ^ t12; \
-    t16 = b   | t13; \
-    z   = t14 ^ t16; \
+    u32 r4; \
+    \
+    r4 =  r2; r2 ^= r1; \
+    r0 ^= r2; r4 &= r2; \
+    r4 ^= r0; r0 &= r1; \
+    r1 ^= r3; r3 |= r4; \
+    r2 ^= r3; r0 ^= r3; \
+    r1 ^= r4; r3 &= r2; \
+    r3 ^= r1; r1 ^= r0; \
+    r1 |= r2; r0 ^= r3; \
+    r1 ^= r4; \
+    r0 ^= r1; \
+    \
+    w = r2; x = r1; y = r3; z = r0; \
   }
 
-#define SBOX4(a, b, c, d, w, x, y, z) \
+#define SBOX4(r0, r1, r2, r3, w, x, y, z) \
   { \
-    u32 t02, t03, t04, t05, t06, t08, t09; \
-    u32 t10, t11, t12, t13, t14, t15, t16, t01; \
-    t01 = a   | b  ; \
-    t02 = b   | c  ; \
-    t03 = a   ^ t02; \
-    t04 = b   ^ d  ; \
-    t05 = d   | t03; \
-    t06 = d   & t01; \
-    z   = t03 ^ t06; \
-    t08 = z   & t04; \
-    t09 = t04 & t05; \
-    t10 = c   ^ t06; \
-    t11 = b   & c  ; \
-    t12 = t04 ^ t08; \
-    t13 = t11 | t03; \
-    t14 = t10 ^ t09; \
-    t15 = a   & t05; \
-    t16 = t11 | t12; \
-    y   = t13 ^ t08; \
-    x   = t15 ^ t16; \
-    w   =     ~ t14; \
+    u32 r4; \
+    \
+    r1 ^= r3; r3 = ~r3; \
+    r2 ^= r3; r3 ^= r0; \
+    r4 =  r1; r1 &= r3; \
+    r1 ^= r2; r4 ^= r3; \
+    r0 ^= r4; r2 &= r4; \
+    r2 ^= r0; r0 &= r1; \
+    r3 ^= r0; r4 |= r1; \
+    r4 ^= r0; r0 |= r3; \
+    r0 ^= r2; r2 &= r3; \
+    r0 = ~r0; r4 ^= r2; \
+    \
+    w = r1; x = r4; y = r0; z = r3; \
   }
 
-#define SBOX4_INVERSE(a, b, c, d, w, x, y, z) \
+#define SBOX4_INVERSE(r0, r1, r2, r3, w, x, y, z) \
   { \
-    u32 t02, t03, t04, t05, t06, t07, t09; \
-    u32 t10, t11, t12, t13, t15, t01; \
-    t01 = b   | d  ; \
-    t02 = c   | d  ; \
-    t03 = a   & t01; \
-    t04 = b   ^ t02; \
-    t05 = c   ^ d  ; \
-    t06 =     ~ t03; \
-    t07 = a   & t04; \
-    x   = t05 ^ t07; \
-    t09 = x   | t06; \
-    t10 = a   ^ t07; \
-    t11 = t01 ^ t09; \
-    t12 = d   ^ t04; \
-    t13 = c   | t10; \
-    z   = t03 ^ t12; \
-    t15 = a   ^ t04; \
-    y   = t11 ^ t13; \
-    w   = t15 ^ t09; \
+    u32 r4; \
+    \
+    r4 =  r2; r2 &= r3; \
+    r2 ^= r1; r1 |= r3; \
+    r1 &= r0; r4 ^= r2; \
+    r4 ^= r1; r1 &= r2; \
+    r0 = ~r0; r3 ^= r4; \
+    r1 ^= r3; r3 &= r0; \
+    r3 ^= r2; r0 ^= r1; \
+    r2 &= r0; r3 ^= r0; \
+    r2 ^= r4; \
+    r2 |= r3; r3 ^= r0; \
+    r2 ^= r1; \
+    \
+    w = r0; x = r3; y = r2; z = r4; \
   }
 
-#define SBOX5(a, b, c, d, w, x, y, z) \
+#define SBOX5(r0, r1, r2, r3, w, x, y, z) \
   { \
-    u32 t02, t03, t04, t05, t07, t08, t09; \
-    u32 t10, t11, t12, t13, t14, t01; \
-    t01 = b   ^ d  ; \
-    t02 = b   | d  ; \
-    t03 = a   & t01; \
-    t04 = c   ^ t02; \
-    t05 = t03 ^ t04; \
-    w   =     ~ t05; \
-    t07 = a   ^ t01; \
-    t08 = d   | w  ; \
-    t09 = b   | t05; \
-    t10 = d   ^ t08; \
-    t11 = b   | t07; \
-    t12 = t03 | w  ; \
-    t13 = t07 | t10; \
-    t14 = t01 ^ t11; \
-    y   = t09 ^ t13; \
-    x   = t07 ^ t08; \
-    z   = t12 ^ t14; \
+    u32 r4; \
+    \
+    r0 ^= r1; r1 ^= r3; \
+    r3 = ~r3; r4 =  r1; \
+    r1 &= r0; r2 ^= r3; \
+    r1 ^= r2; r2 |= r4; \
+    r4 ^= r3; r3 &= r1; \
+    r3 ^= r0; r4 ^= r1; \
+    r4 ^= r2; r2 ^= r0; \
+    r0 &= r3; r2 = ~r2; \
+    r0 ^= r4; r4 |= r3; \
+    r2 ^= r4; \
+    \
+    w = r1; x = r3; y = r0; z = r2; \
   }
 
-#define SBOX5_INVERSE(a, b, c, d, w, x, y, z) \
+#define SBOX5_INVERSE(r0, r1, r2, r3, w, x, y, z) \
   { \
-    u32 t02, t03, t04, t05, t07, t08, t09; \
-    u32 t10, t12, t13, t15, t16, t01; \
-    t01 = a   & d  ; \
-    t02 = c   ^ t01; \
-    t03 = a   ^ d  ; \
-    t04 = b   & t02; \
-    t05 = a   & c  ; \
-    w   = t03 ^ t04; \
-    t07 = a   & w  ; \
-    t08 = t01 ^ w  ; \
-    t09 = b   | t05; \
-    t10 =     ~ b  ; \
-    x   = t08 ^ t09; \
-    t12 = t10 | t07; \
-    t13 = w   | x  ; \
-    z   = t02 ^ t12; \
-    t15 = t02 ^ t13; \
-    t16 = b   ^ d  ; \
-    y   = t16 ^ t15; \
+    u32 r4; \
+    \
+    r1 = ~r1; r4 =  r3; \
+    r2 ^= r1; r3 |= r0; \
+    r3 ^= r2; r2 |= r1; \
+    r2 &= r0; r4 ^= r3; \
+    r2 ^= r4; r4 |= r0; \
+    r4 ^= r1; r1 &= r2; \
+    r1 ^= r3; r4 ^= r2; \
+    r3 &= r4; r4 ^= r1; \
+    r3 ^= r4; r4 = ~r4; \
+    r3 ^= r0; \
+    \
+    w = r1; x = r4; y = r3; z = r2; \
   }
 
-#define SBOX6(a, b, c, d, w, x, y, z) \
+#define SBOX6(r0, r1, r2, r3, w, x, y, z) \
   { \
-    u32 t02, t03, t04, t05, t07, t08, t09, t10; \
-    u32 t11, t12, t13, t15, t17, t18, t01; \
-    t01 = a   & d  ; \
-    t02 = b   ^ c  ; \
-    t03 = a   ^ d  ; \
-    t04 = t01 ^ t02; \
-    t05 = b   | c  ; \
-    x   =     ~ t04; \
-    t07 = t03 & t05; \
-    t08 = b   & x  ; \
-    t09 = a   | c  ; \
-    t10 = t07 ^ t08; \
-    t11 = b   | d  ; \
-    t12 = c   ^ t11; \
-    t13 = t09 ^ t10; \
-    y   =     ~ t13; \
-    t15 = x   & t03; \
-    z   = t12 ^ t07; \
-    t17 = a   ^ b  ; \
-    t18 = y   ^ t15; \
-    w   = t17 ^ t18; \
+    u32 r4; \
+    \
+    r2 = ~r2; r4 =  r3; \
+    r3 &= r0; r0 ^= r4; \
+    r3 ^= r2; r2 |= r4; \
+    r1 ^= r3; r2 ^= r0; \
+    r0 |= r1; r2 ^= r1; \
+    r4 ^= r0; r0 |= r3; \
+    r0 ^= r2; r4 ^= r3; \
+    r4 ^= r0; r3 = ~r3; \
+    r2 &= r4; \
+    r2 ^= r3; \
+    \
+    w = r0; x = r1; y = r4; z = r2; \
   }
 
-#define SBOX6_INVERSE(a, b, c, d, w, x, y, z) \
+#define SBOX6_INVERSE(r0, r1, r2, r3, w, x, y, z) \
   { \
-    u32 t02, t03, t04, t05, t06, t07, t08, t09; \
-    u32 t12, t13, t14, t15, t16, t17, t01; \
-    t01 = a   ^ c  ; \
-    t02 =     ~ c  ; \
-    t03 = b   & t01; \
-    t04 = b   | t02; \
-    t05 = d   | t03; \
-    t06 = b   ^ d  ; \
-    t07 = a   & t04; \
-    t08 = a   | t02; \
-    t09 = t07 ^ t05; \
-    x   = t06 ^ t08; \
-    w   =     ~ t09; \
-    t12 = b   & w  ; \
-    t13 = t01 & t05; \
-    t14 = t01 ^ t12; \
-    t15 = t07 ^ t13; \
-    t16 = d   | t02; \
-    t17 = a   ^ x  ; \
-    z   = t17 ^ t15; \
-    y   = t16 ^ t14; \
+    u32 r4; \
+    \
+    r0 ^= r2; r4 =  r2; \
+    r2 &= r0; r4 ^= r3; \
+    r2 = ~r2; r3 ^= r1; \
+    r2 ^= r3; r4 |= r0; \
+    r0 ^= r2; r3 ^= r4; \
+    r4 ^= r1; r1 &= r3; \
+    r1 ^= r0; r0 ^= r3; \
+    r0 |= r2; r3 ^= r1; \
+    r4 ^= r0; \
+    \
+    w = r1; x = r2; y = r4; z = r3; \
   }
 
-#define SBOX7(a, b, c, d, w, x, y, z) \
+#define SBOX7(r0, r1, r2, r3, w, x, y, z) \
   { \
-    u32 t02, t03, t04, t05, t06, t08, t09, t10; \
-    u32 t11, t13, t14, t15, t16, t17, t01; \
-    t01 = a   & c  ; \
-    t02 =     ~ d  ; \
-    t03 = a   & t02; \
-    t04 = b   | t01; \
-    t05 = a   & b  ; \
-    t06 = c   ^ t04; \
-    z   = t03 ^ t06; \
-    t08 = c   | z  ; \
-    t09 = d   | t05; \
-    t10 = a   ^ t08; \
-    t11 = t04 & z  ; \
-    x   = t09 ^ t10; \
-    t13 = b   ^ x  ; \
-    t14 = t01 ^ x  ; \
-    t15 = c   ^ t05; \
-    t16 = t11 | t13; \
-    t17 = t02 | t14; \
-    w   = t15 ^ t17; \
-    y   = a   ^ t16; \
+    u32 r4; \
+    \
+    r4 =  r1; r1 |= r2; \
+    r1 ^= r3; r4 ^= r2; \
+    r2 ^= r1; r3 |= r4; \
+    r3 &= r0; r4 ^= r2; \
+    r3 ^= r1; r1 |= r4; \
+    r1 ^= r0; r0 |= r4; \
+    r0 ^= r2; r1 ^= r4; \
+    r2 ^= r1; r1 &= r0; \
+    r1 ^= r4; r2 = ~r2; \
+    r2 |= r0; \
+    r4 ^= r2; \
+    \
+    w = r4; x = r3; y = r1; z = r0; \
   }
 
-#define SBOX7_INVERSE(a, b, c, d, w, x, y, z) \
+#define SBOX7_INVERSE(r0, r1, r2, r3, w, x, y, z) \
   { \
-    u32 t02, t03, t04, t06, t07, t08, t09; \
-    u32 t10, t11, t13, t14, t15, t16, t01; \
-    t01 = a   & b  ; \
-    t02 = a   | b  ; \
-    t03 = c   | t01; \
-    t04 = d   & t02; \
-    z   = t03 ^ t04; \
-    t06 = b   ^ t04; \
-    t07 = d   ^ z  ; \
-    t08 =     ~ t07; \
-    t09 = t06 | t08; \
-    t10 = b   ^ d  ; \
-    t11 = a   | d  ; \
-    x   = a   ^ t09; \
-    t13 = c   ^ t06; \
-    t14 = c   & t11; \
-    t15 = d   | x  ; \
-    t16 = t01 | t10; \
-    w   = t13 ^ t15; \
-    y   = t14 ^ t16; \
+    u32 r4; \
+    \
+    r4 =  r2; r2 ^= r0; \
+    r0 &= r3; r4 |= r3; \
+    r2 = ~r2; r3 ^= r1; \
+    r1 |= r0; r0 ^= r2; \
+    r2 &= r4; r3 &= r4; \
+    r1 ^= r2; r2 ^= r0; \
+    r0 |= r2; r4 ^= r1; \
+    r0 ^= r3; r3 ^= r4; \
+    r4 |= r0; r3 ^= r2; \
+    r4 ^= r2; \
+    \
+    w = r3; x = r0; y = r1; z = r4; \
   }
 
 /* XOR BLOCK1 into BLOCK0.  */

-----------------------------------------------------------------------

Summary of changes:
 cipher/Makefile.am          |    2 +-
 cipher/cipher-selftest.c    |   10 +-
 cipher/cipher.c             |    8 +
 cipher/serpent-sse2-amd64.S |  826 +++++++++++++++++++++++++++++++++++++++++++
 cipher/serpent.c            |  817 ++++++++++++++++++++++++------------------
 configure.ac                |    7 +
 src/cipher.h                |    7 +
 7 files changed, 1321 insertions(+), 356 deletions(-)
 create mode 100644 cipher/serpent-sse2-amd64.S


hooks/post-receive
-- 
The GNU crypto library
http://git.gnupg.org



