From jussi.kivilinna at iki.fi Sun May 5 16:51:54 2013 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sun, 05 May 2013 17:51:54 +0300 Subject: New email, new DCO? Message-ID: <5186720A.4090101@iki.fi> Hello, I've changed my email address and started wondering about changing email address in AUTHORS. Do I need to send new signed DCO? -Jussi Libgcrypt Developer's Certificate of Origin. Version 1.0 ========================================================= By making a contribution to the Libgcrypt project, I certify that: (a) The contribution was created in whole or in part by me and I have the right to submit it under the free software license indicated in the file; or (b) The contribution is based upon previous work that, to the best of my knowledge, is covered under an appropriate free software license and I have the right under that license to submit that work with modifications, whether created in whole or in part by me, under the same free software license (unless I am permitted to submit under a different license), as indicated in the file; or (c) The contribution was provided directly to me by some other person who certified (a), (b) or (c) and I have not modified it. (d) I understand and agree that this project and the contribution are public and that a record of the contribution (including all personal information I submit with it, including my sign-off) is maintained indefinitely and may be redistributed consistent with this project or the free software license(s) involved. Signed-off-by: Jussi Kivilinna -------------- next part -------------- A non-text attachment was scrubbed... Name: signature.asc Type: application/pgp-signature Size: 730 bytes Desc: OpenPGP digital signature URL: From jussi.kivilinna at iki.fi Sun May 5 16:36:56 2013 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sun, 05 May 2013 17:36:56 +0300 Subject: [PATCH 2/4] Convert bulk CTR and CBC selftest functions in Camellia to generic selftest helper functions In-Reply-To: <20130505143651.29094.85500.stgit@localhost6.localdomain6> References: <20130505143651.29094.85500.stgit@localhost6.localdomain6> Message-ID: <20130505143656.29094.33625.stgit@localhost6.localdomain6> * cipher/Makefile.am (libcipher_la_SOURCES): Add selftest_help files. * cipher/camellia-glue.c (selftest_ctr_128, selftest_cbc_128): Change to use the new selftest helper functions. * cipher/selftest_help.c: New. * cipher/selftest_help.h: New. -- Convert selftest functions into generic helper functions for code sharing. Signed-off-by: Jussi Kivilinna --- cipher/Makefile.am | 1 cipher/camellia-glue.c | 168 ++---------------------------- cipher/selftest_help.c | 271 ++++++++++++++++++++++++++++++++++++++++++++++++ cipher/selftest_help.h | 54 ++++++++++ 4 files changed, 337 insertions(+), 157 deletions(-) create mode 100644 cipher/selftest_help.c create mode 100644 cipher/selftest_help.h diff --git a/cipher/Makefile.am b/cipher/Makefile.am index c39f627..0b61a27 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -47,6 +47,7 @@ hmac-tests.c \ bithelp.h \ bufhelp.h \ primegen.c \ +selftest_help.c selftest_help.h \ hash-common.c hash-common.h \ rmd.h diff --git a/cipher/camellia-glue.c b/cipher/camellia-glue.c index e9e2bf2..e6c5ecf 100644 --- a/cipher/camellia-glue.c +++ b/cipher/camellia-glue.c @@ -63,6 +63,7 @@ #include "cipher.h" #include "camellia.h" #include "bufhelp.h" +#include "selftest_help.h" /* Helper macro to force alignment to 16 bytes. 
*/ #ifdef HAVE_GCC_ATTRIBUTE_ALIGNED @@ -313,93 +314,12 @@ static const char* selftest_ctr_128 (void) { const int nblocks = 16+1; - CAMELLIA_context ctx ATTR_ALIGNED_16; - unsigned char plaintext[nblocks*16] ATTR_ALIGNED_16; - unsigned char ciphertext[nblocks*16] ATTR_ALIGNED_16; - unsigned char plaintext2[nblocks*16] ATTR_ALIGNED_16; - unsigned char iv[16] ATTR_ALIGNED_16; - unsigned char iv2[16] ATTR_ALIGNED_16; - int i, j, diff; - - static const unsigned char key[16] ATTR_ALIGNED_16 = { - 0x06,0x9A,0x00,0x7F,0xC7,0x6A,0x45,0x9F, - 0x98,0xBA,0xF9,0x17,0xFE,0xDF,0x95,0x21 - }; - static char error_str[128]; - - camellia_setkey (&ctx, key, sizeof (key)); - - /* Test single block code path */ - memset(iv, 0xff, sizeof(iv)); - for (i = 0; i < 16; i++) - plaintext[i] = i; - - /* CTR manually. */ - camellia_encrypt (&ctx, ciphertext, iv); - for (i = 0; i < 16; i++) - ciphertext[i] ^= plaintext[i]; - for (i = 16; i > 0; i--) - { - iv[i-1]++; - if (iv[i-1]) - break; - } - - memset(iv2, 0xff, sizeof(iv2)); - _gcry_camellia_ctr_enc (&ctx, iv2, plaintext2, ciphertext, 1); - - if (memcmp(plaintext2, plaintext, 16)) - return "CAMELLIA-128-CTR test failed (plaintext mismatch)"; - - if (memcmp(iv2, iv, 16)) - return "CAMELLIA-128-CTR test failed (IV mismatch)"; - - /* Test parallelized code paths */ - for (diff = 0; diff < nblocks; diff++) { - memset(iv, 0xff, sizeof(iv)); - iv[15] -= diff; - - for (i = 0; i < sizeof(plaintext); i++) - plaintext[i] = i; - - /* Create CTR ciphertext manually. */ - for (i = 0; i < sizeof(plaintext); i+=16) - { - camellia_encrypt (&ctx, &ciphertext[i], iv); - for (j = 0; j < 16; j++) - ciphertext[i+j] ^= plaintext[i+j]; - for (j = 16; j > 0; j--) - { - iv[j-1]++; - if (iv[j-1]) - break; - } - } - - /* Decrypt using bulk CTR and compare result. */ - memset(iv2, 0xff, sizeof(iv2)); - iv2[15] -= diff; - - _gcry_camellia_ctr_enc (&ctx, iv2, plaintext2, ciphertext, - sizeof(ciphertext) / CAMELLIA_BLOCK_SIZE); - - if (memcmp(plaintext2, plaintext, sizeof(plaintext))) - { - snprintf(error_str, sizeof(error_str), - "CAMELLIA-128-CTR test failed (plaintext mismatch, diff: %d)", - diff); - return error_str; - } - if (memcmp(iv2, iv, sizeof(iv))) - { - snprintf(error_str, sizeof(error_str), - "CAMELLIA-128-CTR test failed (IV mismatch, diff: %d)", - diff); - return error_str; - } - } + const int blocksize = CAMELLIA_BLOCK_SIZE; + const int context_size = sizeof(CAMELLIA_context); - return NULL; + return _gcry_selftest_helper_ctr_128("CAMELLIA", &camellia_setkey, + &camellia_encrypt, &_gcry_camellia_ctr_enc, nblocks, blocksize, + context_size); } /* Run the self-tests for CAMELLIA-CBC-128, tests bulk CBC decryption. @@ -408,78 +328,12 @@ static const char* selftest_cbc_128 (void) { const int nblocks = 16+2; - CAMELLIA_context ctx ATTR_ALIGNED_16; - unsigned char plaintext[nblocks*16] ATTR_ALIGNED_16; - unsigned char ciphertext[nblocks*16] ATTR_ALIGNED_16; - unsigned char plaintext2[nblocks*16] ATTR_ALIGNED_16; - unsigned char iv[16] ATTR_ALIGNED_16; - unsigned char iv2[16] ATTR_ALIGNED_16; - int i, j; - - static const unsigned char key[16] ATTR_ALIGNED_16 = { - 0x66,0x9A,0x00,0x7F,0xC7,0x6A,0x45,0x9F, - 0x98,0xBA,0xF9,0x17,0xFE,0xDF,0x95,0x22 - }; - static char error_str[128]; - - camellia_setkey (&ctx, key, sizeof (key)); - - /* Test single block code path */ - memset(iv, 0x4e, sizeof(iv)); - memset(iv2, 0x4e, sizeof(iv2)); - for (i = 0; i < 16; i++) - plaintext[i] = i; - - /* CBC manually. 
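The CTR self-test that selftest_ctr_128 now delegates to is, at its core, a check that the bulk code increments the big-endian block counter with the same carry behaviour as the generic code; that is why the test starts the counter near 0xff..ff and varies the starting offset. For illustration, here is a standalone sketch of the reference construction the removed code implemented (enc_one() stands in for any single-block encrypt function such as camellia_encrypt; the helper names are hypothetical):

  #include <stddef.h>

  /* Reference CTR: keystream block n is E_K(ctr + n), with the 16-byte
     counter treated as a big-endian integer.  */
  static void
  ctr_add_one (unsigned char *ctr, size_t len)
  {
    size_t i;

    for (i = len; i > 0; i--)
      if (++ctr[i - 1])             /* stop once a byte does not wrap */
        break;
  }

  static void
  ctr_crypt_ref (void *ctx,
                 void (*enc_one)(void *c, unsigned char *out,
                                 const unsigned char *in),
                 unsigned char *ctr, unsigned char *out,
                 const unsigned char *in, size_t nblocks)
  {
    unsigned char ks[16];
    size_t n, i;

    for (n = 0; n < nblocks; n++)
      {
        enc_one (ctx, ks, ctr);                /* ks = E_K(ctr) */
        for (i = 0; i < 16; i++)
          out[n * 16 + i] = in[n * 16 + i] ^ ks[i];
        ctr_add_one (ctr, 16);                 /* big-endian increment */
      }
  }

The selftest builds the expected ciphertext this way and then requires the bulk implementation to reproduce both the plaintext and the final counter value.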
*/ - for (i = 0; i < 16; i++) - ciphertext[i] = iv[i] ^ plaintext[i]; - camellia_encrypt (&ctx, ciphertext, ciphertext); - memcpy(iv, ciphertext, sizeof(iv)); - - /* CBC decrypt. */ - _gcry_camellia_cbc_dec (&ctx, iv2, plaintext2, ciphertext, 1); - - if (memcmp(plaintext2, plaintext, 16)) - return "CAMELLIA-128-CBC test failed (plaintext mismatch)"; + const int blocksize = CAMELLIA_BLOCK_SIZE; + const int context_size = sizeof(CAMELLIA_context); - if (memcmp(iv2, iv, 16)) - return "CAMELLIA-128-CBC test failed (IV mismatch)"; - - /* Test parallelized code paths */ - memset(iv, 0x5f, sizeof(iv)); - memset(iv2, 0x5f, sizeof(iv2)); - - for (i = 0; i < sizeof(plaintext); i++) - plaintext[i] = i; - - /* Create CBC ciphertext manually. */ - for (i = 0; i < sizeof(plaintext); i+=16) - { - for (j = 0; j < 16; j++) - ciphertext[i+j] = iv[j] ^ plaintext[i+j]; - camellia_encrypt (&ctx, &ciphertext[i], &ciphertext[i]); - memcpy(iv, &ciphertext[i], sizeof(iv)); - } - - /* Decrypt using bulk CBC and compare result. */ - _gcry_camellia_cbc_dec (&ctx, iv2, plaintext2, ciphertext, - sizeof(ciphertext) / CAMELLIA_BLOCK_SIZE); - - if (memcmp(plaintext2, plaintext, sizeof(plaintext))) - { - snprintf(error_str, sizeof(error_str), - "CAMELLIA-128-CBC test failed (plaintext mismatch, " - "parallel path)"); - return error_str; - } - if (memcmp(iv2, iv, sizeof(iv))) - { - snprintf(error_str, sizeof(error_str), - "CAMELLIA-128-CBC test failed (IV mismatch, parallel path)"); - return error_str; - } - - return NULL; + return _gcry_selftest_helper_cbc_128("CAMELLIA", &camellia_setkey, + &camellia_encrypt, &_gcry_camellia_cbc_dec, nblocks, blocksize, + context_size); } static const char * diff --git a/cipher/selftest_help.c b/cipher/selftest_help.c new file mode 100644 index 0000000..031f1f0 --- /dev/null +++ b/cipher/selftest_help.c @@ -0,0 +1,271 @@ +/* selftest_help.c - Helper functions for bulk encryption selftests. + * Copyright ? 2013 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser general Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . + */ + +#include +#include "types.h" +#include "g10lib.h" +#include "cipher.h" +#include "bufhelp.h" +#include "selftest_help.h" + +#ifdef HAVE_STDINT_H +# include /* uintptr_t */ +#elif defined(HAVE_INTTYPES_H) +# include +#else +/* In this case, uintptr_t is provided by config.h. */ +#endif + +/* Helper macro to force alignment to 16 bytes. */ +#ifdef HAVE_GCC_ATTRIBUTE_ALIGNED +# define ATTR_ALIGNED_16 __attribute__ ((aligned (16))) +#else +# define ATTR_ALIGNED_16 +#endif + + +/* Run the self-tests for -CBC-128, tests bulk CBC + decryption. Returns NULL on success. 
*/ +const char* +_gcry_selftest_helper_cbc_128 (const char *cipher, gcry_cipher_setkey_t setkey, + gcry_cipher_encrypt_t encrypt_one, + gcry_cipher_bulk_cbc_dec_t bulk_cbc_dec, + const int nblocks, const int blocksize, + const int context_size) +{ + int i, offs; + unsigned char *ctx, *plaintext, *plaintext2, *ciphertext, *iv, *iv2, *mem; + unsigned int ctx_aligned_size, memsize; + + static const unsigned char key[16] ATTR_ALIGNED_16 = { + 0x66,0x9A,0x00,0x7F,0xC7,0x6A,0x45,0x9F, + 0x98,0xBA,0xF9,0x17,0xFE,0xDF,0x95,0x22 + }; + static char error_str[128]; + + /* Allocate buffers, align elements to 16 bytes. */ + ctx_aligned_size = context_size + 15; + ctx_aligned_size -= ctx_aligned_size & 0xf; + + memsize = ctx_aligned_size + (blocksize * 2) + (blocksize * nblocks * 3) + 16; + + mem = gcry_calloc (1, memsize); + if (!mem) + return "failed to allocate memory"; + + offs = (16 - ((uintptr_t)mem & 15)) & 15; + ctx = (void*)(mem + offs); + iv = ctx + ctx_aligned_size; + iv2 = iv + blocksize; + plaintext = iv2 + blocksize; + plaintext2 = plaintext + nblocks * blocksize; + ciphertext = plaintext2 + nblocks * blocksize; + + /* Initialize ctx */ + setkey (ctx, key, sizeof(key)); + + /* Test single block code path */ + memset(iv, 0x4e, blocksize); + memset(iv2, 0x4e, blocksize); + for (i = 0; i < blocksize; i++) + plaintext[i] = i; + + /* CBC manually. */ + buf_xor(ciphertext, iv, plaintext, blocksize); + encrypt_one (ctx, ciphertext, ciphertext); + memcpy(iv, ciphertext, blocksize); + + /* CBC decrypt. */ + bulk_cbc_dec (ctx, iv2, plaintext2, ciphertext, 1); + if (memcmp(plaintext2, plaintext, 16)) + { + snprintf(error_str, sizeof(error_str), + "%s-128-CBC test failed (plaintext mismatch)", cipher); + gcry_free(mem); + return error_str; + } + + if (memcmp(iv2, iv, 16)) + { + snprintf(error_str, sizeof(error_str), + "%s-128-CBC test failed (IV mismatch)", cipher); + gcry_free(mem); + return error_str; + } + + /* Test parallelized code paths */ + memset(iv, 0x5f, blocksize); + memset(iv2, 0x5f, blocksize); + + for (i = 0; i < nblocks * blocksize; i++) + plaintext[i] = i; + + /* Create CBC ciphertext manually. */ + for (i = 0; i < nblocks * blocksize; i+=blocksize) + { + buf_xor(&ciphertext[i], iv, &plaintext[i], blocksize); + encrypt_one (ctx, &ciphertext[i], &ciphertext[i]); + memcpy(iv, &ciphertext[i], blocksize); + } + + /* Decrypt using bulk CBC and compare result. */ + bulk_cbc_dec (ctx, iv2, plaintext2, ciphertext, nblocks); + + if (memcmp(plaintext2, plaintext, nblocks * blocksize)) + { + snprintf(error_str, sizeof(error_str), + "%s-128-CBC test failed (plaintext mismatch, parallel path)", + cipher); + gcry_free(mem); + return error_str; + } + if (memcmp(iv2, iv, blocksize)) + { + snprintf(error_str, sizeof(error_str), + "%s-128-CBC test failed (IV mismatch, parallel path)", + cipher); + gcry_free(mem); + return error_str; + } + + gcry_free(mem); + return NULL; +} + +/* Run the self-tests for -CTR-128, tests IV increment of bulk CTR + encryption. Returns NULL on success. 
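One detail of the helper above worth spelling out: the old per-cipher tests relied on ATTR_ALIGNED_16 stack variables, whereas the helper gets a single heap block from gcry_calloc and aligns the cipher context and the buffers inside it by rounding the pointer up to the next 16-byte boundary. The same idiom in isolation (plain calloc is used here purely for illustration):

  #include <stdint.h>
  #include <stdlib.h>

  /* Return a 16-byte aligned pointer carved out of a fresh allocation.
     The caller keeps *block_out and frees that, not the returned
     pointer.  */
  static void *
  alloc_aligned_16 (size_t size, void **block_out)
  {
    unsigned char *mem;
    size_t offs;

    mem = calloc (1, size + 15);                /* worst-case padding */
    if (!mem)
      return NULL;

    /* 0 if mem is already aligned, otherwise the distance up to the
       next multiple of 16.  */
    offs = (16 - ((uintptr_t)mem & 15)) & 15;

    *block_out = mem;
    return mem + offs;
  }

In the helper the context, the two IVs and the three data buffers are all laid out consecutively in one such block, so the test does not need to know the cipher's context size or block size at compile time.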
*/ +const char* +_gcry_selftest_helper_ctr_128 (const char *cipher, gcry_cipher_setkey_t setkey, + gcry_cipher_encrypt_t encrypt_one, + gcry_cipher_bulk_ctr_enc_t bulk_ctr_enc, + const int nblocks, const int blocksize, + const int context_size) +{ + int i, j, offs, diff; + unsigned char *ctx, *plaintext, *plaintext2, *ciphertext, *iv, *iv2, *mem; + unsigned int ctx_aligned_size, memsize; + + static const unsigned char key[16] ATTR_ALIGNED_16 = { + 0x06,0x9A,0x00,0x7F,0xC7,0x6A,0x45,0x9F, + 0x98,0xBA,0xF9,0x17,0xFE,0xDF,0x95,0x21 + }; + static char error_str[128]; + + /* Allocate buffers, align elements to 16 bytes. */ + ctx_aligned_size = context_size + 15; + ctx_aligned_size -= ctx_aligned_size & 0xf; + + memsize = ctx_aligned_size + (blocksize * 2) + (blocksize * nblocks * 3) + 16; + + mem = gcry_calloc (1, memsize); + if (!mem) + return "failed to allocate memory"; + + offs = (16 - ((uintptr_t)mem & 15)) & 15; + ctx = (void*)(mem + offs); + iv = ctx + ctx_aligned_size; + iv2 = iv + blocksize; + plaintext = iv2 + blocksize; + plaintext2 = plaintext + nblocks * blocksize; + ciphertext = plaintext2 + nblocks * blocksize; + + /* Initialize ctx */ + setkey (ctx, key, sizeof(key)); + + /* Test single block code path */ + memset(iv, 0xff, blocksize); + for (i = 0; i < blocksize; i++) + plaintext[i] = i; + + /* CTR manually. */ + encrypt_one (ctx, ciphertext, iv); + for (i = 0; i < blocksize; i++) + ciphertext[i] ^= plaintext[i]; + for (i = blocksize; i > 0; i--) + { + iv[i-1]++; + if (iv[i-1]) + break; + } + + memset(iv2, 0xff, blocksize); + bulk_ctr_enc (ctx, iv2, plaintext2, ciphertext, 1); + + if (memcmp(plaintext2, plaintext, blocksize)) + { + snprintf(error_str, sizeof(error_str), + "%s-128-CTR test failed (plaintext mismatch)", cipher); + gcry_free(mem); + return error_str; + } + + if (memcmp(iv2, iv, blocksize)) + { + snprintf(error_str, sizeof(error_str), + "%s-128-CTR test failed (IV mismatch)", cipher); + gcry_free(mem); + return error_str; + } + + /* Test parallelized code paths */ + for (diff = 0; diff < nblocks; diff++) { + memset(iv, 0xff, blocksize); + iv[blocksize-1] -= diff; + + for (i = 0; i < blocksize * nblocks; i++) + plaintext[i] = i; + + /* Create CTR ciphertext manually. */ + for (i = 0; i < blocksize * nblocks; i+=blocksize) + { + encrypt_one (ctx, &ciphertext[i], iv); + for (j = 0; j < blocksize; j++) + ciphertext[i+j] ^= plaintext[i+j]; + for (j = blocksize; j > 0; j--) + { + iv[j-1]++; + if (iv[j-1]) + break; + } + } + + /* Decrypt using bulk CTR and compare result. */ + memset(iv2, 0xff, blocksize); + iv2[blocksize-1] -= diff; + + bulk_ctr_enc (ctx, iv2, plaintext2, ciphertext, nblocks); + + if (memcmp(plaintext2, plaintext, blocksize * nblocks)) + { + snprintf(error_str, sizeof(error_str), + "%s-128-CTR test failed (plaintext mismatch, diff: %d)", + cipher, diff); + return error_str; + } + if (memcmp(iv2, iv, blocksize)) + { + snprintf(error_str, sizeof(error_str), + "%s-128-CTR test failed (IV mismatch, diff: %d)", cipher, + diff); + return error_str; + } + } + + return NULL; +} diff --git a/cipher/selftest_help.h b/cipher/selftest_help.h new file mode 100644 index 0000000..368b2c8 --- /dev/null +++ b/cipher/selftest_help.h @@ -0,0 +1,54 @@ +/* selftest_help.h - Helper functions for bulk encryption selftests. + * Copyright ? 2013 Jussi Kivilinna + * + * This file is part of Libgcrypt. 
+ * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser general Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . + */ + +#ifndef G10_SELFTEST_HELP_H +#define G10_SELFTEST_HELP_H + +#include +#include "types.h" +#include "g10lib.h" +#include "cipher.h" + +typedef void (*gcry_cipher_bulk_cbc_dec_t)(void *context, unsigned char *iv, + void *outbuf_arg, + const void *inbuf_arg, + unsigned int nblocks); + +typedef void (*gcry_cipher_bulk_ctr_enc_t)(void *context, unsigned char *iv, + void *outbuf_arg, + const void *inbuf_arg, + unsigned int nblocks); + +/* Helper function for bulk CBC decryption selftest */ +const char* +_gcry_selftest_helper_cbc_128 (const char *cipher, gcry_cipher_setkey_t setkey, + gcry_cipher_encrypt_t encrypt_one, + gcry_cipher_bulk_cbc_dec_t bulk_cbc_dec, + const int nblocks, const int blocksize, + const int context_size); + +/* Helper function for bulk CTR encryption selftest */ +const char* +_gcry_selftest_helper_ctr_128 (const char *cipher, gcry_cipher_setkey_t setkey, + gcry_cipher_encrypt_t encrypt_one, + gcry_cipher_bulk_ctr_enc_t bulk_ctr_enc, + const int nblocks, const int blocksize, + const int context_size); + +#endif /*G10_SELFTEST_HELP_H*/ From jussi.kivilinna at iki.fi Sun May 5 16:37:06 2013 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sun, 05 May 2013 17:37:06 +0300 Subject: [PATCH 4/4] Add AES bulk CBC decryption selftest In-Reply-To: <20130505143651.29094.85500.stgit@localhost6.localdomain6> References: <20130505143651.29094.85500.stgit@localhost6.localdomain6> Message-ID: <20130505143706.29094.31696.stgit@localhost6.localdomain6> * cipher/rinjdael.c (selftest_cbc_128): New. (selftest): Call selftest_cbc_128. -- Signed-off-by: Jussi Kivilinna --- cipher/rijndael.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/cipher/rijndael.c b/cipher/rijndael.c index b8fb3ab..4564258 100644 --- a/cipher/rijndael.c +++ b/cipher/rijndael.c @@ -2020,6 +2020,21 @@ selftest_ctr_128 (void) } +/* Run the self-tests for AES-CBC-128, tests bulk CBC decryption. + Returns NULL on success. */ +static const char* +selftest_cbc_128 (void) +{ + const int nblocks = 8+2; + const int blocksize = BLOCKSIZE; + const int context_size = sizeof(RIJNDAEL_context); + + return _gcry_selftest_helper_cbc_128("AES", &rijndael_setkey, + &rijndael_encrypt, &_gcry_aes_cbc_dec, nblocks, blocksize, + context_size); +} + + /* Run all the self-tests and return NULL on success. This function is used for the on-the-fly self-tests. 
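Taken together, the header above and the camellia/rijndael hunks show the whole pattern a cipher module needs in order to use the new machinery. A condensed, non-diff sketch for a hypothetical cipher follows; every foo_*/FOO_* name is a placeholder, not an existing symbol:

  /* Bulk self-test wrappers for a hypothetical cipher "foo".  */
  static const char *
  selftest_ctr_128 (void)
  {
    return _gcry_selftest_helper_ctr_128 ("FOO", &foo_setkey, &foo_encrypt,
                                          &_gcry_foo_ctr_enc, 8 + 1,
                                          FOO_BLOCKSIZE,
                                          sizeof (FOO_context));
  }

  static const char *
  selftest_cbc_128 (void)
  {
    return _gcry_selftest_helper_cbc_128 ("FOO", &foo_setkey, &foo_encrypt,
                                          &_gcry_foo_cbc_dec, 8 + 2,
                                          FOO_BLOCKSIZE,
                                          sizeof (FOO_context));
  }

  static const char *
  selftest (void)
  {
    const char *r;

    /* ... known-answer tests first ... */
    if ((r = selftest_ctr_128 ()))
      return r;
    if ((r = selftest_cbc_128 ()))
      return r;
    return NULL;
  }

The nblocks arguments (16+1/16+2 for Camellia, 8+1/8+2 for AES) are presumably chosen slightly larger than the widest parallel path, so that the bulk function also has to handle a leftover tail through its generic single-block code.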
*/ static const char * @@ -2035,6 +2050,9 @@ selftest (void) if ( (r = selftest_ctr_128 ()) ) return r; + if ( (r = selftest_cbc_128 ()) ) + return r; + return r; } From jussi.kivilinna at iki.fi Sun May 5 16:37:01 2013 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sun, 05 May 2013 17:37:01 +0300 Subject: [PATCH 3/4] Change AES bulk CTR encryption selftest use new selftest helper function In-Reply-To: <20130505143651.29094.85500.stgit@localhost6.localdomain6> References: <20130505143651.29094.85500.stgit@localhost6.localdomain6> Message-ID: <20130505143701.29094.84131.stgit@localhost6.localdomain6> * cipher/rinjdael.c: (selftest_ctr_128): Change to use new selftest helper function. -- Signed-off-by: Jussi Kivilinna --- cipher/rijndael.c | 93 ++++------------------------------------------------- 1 file changed, 7 insertions(+), 86 deletions(-) diff --git a/cipher/rijndael.c b/cipher/rijndael.c index 8d4036b..b8fb3ab 100644 --- a/cipher/rijndael.c +++ b/cipher/rijndael.c @@ -46,6 +46,7 @@ #include "g10lib.h" #include "cipher.h" #include "bufhelp.h" +#include "selftest_help.h" #define MAXKC (256/32) #define MAXROUNDS 14 @@ -2009,93 +2010,13 @@ selftest_basic_256 (void) static const char* selftest_ctr_128 (void) { - RIJNDAEL_context ctx ATTR_ALIGNED_16; - unsigned char plaintext[7*16] ATTR_ALIGNED_16; - unsigned char ciphertext[7*16] ATTR_ALIGNED_16; - unsigned char plaintext2[7*16] ATTR_ALIGNED_16; - unsigned char iv[16] ATTR_ALIGNED_16; - unsigned char iv2[16] ATTR_ALIGNED_16; - int i, j, diff; - - static const unsigned char key[16] ATTR_ALIGNED_16 = { - 0x06,0x9A,0x00,0x7F,0xC7,0x6A,0x45,0x9F, - 0x98,0xBA,0xF9,0x17,0xFE,0xDF,0x95,0x21 - }; - static char error_str[128]; - - rijndael_setkey (&ctx, key, sizeof (key)); - - /* Test single block code path */ - memset(iv, 0xff, sizeof(iv)); - for (i = 0; i < 16; i++) - plaintext[i] = i; - - /* CTR manually. */ - rijndael_encrypt (&ctx, ciphertext, iv); - for (i = 0; i < 16; i++) - ciphertext[i] ^= plaintext[i]; - for (i = 16; i > 0; i--) - { - iv[i-1]++; - if (iv[i-1]) - break; - } - - memset(iv2, 0xff, sizeof(iv2)); - _gcry_aes_ctr_enc (&ctx, iv2, plaintext2, ciphertext, 1); - - if (memcmp(plaintext2, plaintext, 16)) - return "AES-128-CTR test failed (plaintext mismatch)"; - - if (memcmp(iv2, iv, 16)) - return "AES-128-CTR test failed (IV mismatch)"; - - /* Test parallelized code paths */ - for (diff = 0; diff < 7; diff++) { - memset(iv, 0xff, sizeof(iv)); - iv[15] -= diff; - - for (i = 0; i < sizeof(plaintext); i++) - plaintext[i] = i; + const int nblocks = 8+1; + const int blocksize = BLOCKSIZE; + const int context_size = sizeof(RIJNDAEL_context); - /* Create CTR ciphertext manually. */ - for (i = 0; i < sizeof(plaintext); i+=16) - { - rijndael_encrypt (&ctx, &ciphertext[i], iv); - for (j = 0; j < 16; j++) - ciphertext[i+j] ^= plaintext[i+j]; - for (j = 16; j > 0; j--) - { - iv[j-1]++; - if (iv[j-1]) - break; - } - } - - /* Decrypt using bulk CTR and compare result. 
*/ - memset(iv2, 0xff, sizeof(iv2)); - iv2[15] -= diff; - - _gcry_aes_ctr_enc (&ctx, iv2, plaintext2, ciphertext, - sizeof(ciphertext) / BLOCKSIZE); - - if (memcmp(plaintext2, plaintext, sizeof(plaintext))) - { - snprintf(error_str, sizeof(error_str), - "AES-128-CTR test failed (plaintext mismatch, diff: %d)", - diff); - return error_str; - } - if (memcmp(iv2, iv, sizeof(iv))) - { - snprintf(error_str, sizeof(error_str), - "AES-128-CTR test failed (IV mismatch, diff: %d)", - diff); - return error_str; - } - } - - return NULL; + return _gcry_selftest_helper_ctr_128("AES", &rijndael_setkey, + &rijndael_encrypt, &_gcry_aes_ctr_enc, nblocks, blocksize, + context_size); } From jussi.kivilinna at iki.fi Sun May 5 16:36:51 2013 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sun, 05 May 2013 17:36:51 +0300 Subject: [PATCH 1/4] camellia: add bulk CBC decryption selftest Message-ID: <20130505143651.29094.85500.stgit@localhost6.localdomain6> * cipher/camellia-glue.c: (selftest_cbc_128): New selftest function for bulk CBC decryption. (selftest): Add call to selftest_cbc_128. -- Add selftest for the parallel code paths in bulk CBC decryption. Signed-off-by: Jussi Kivilinna --- cipher/camellia-glue.c | 83 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/cipher/camellia-glue.c b/cipher/camellia-glue.c index dd9206f..e9e2bf2 100644 --- a/cipher/camellia-glue.c +++ b/cipher/camellia-glue.c @@ -402,6 +402,86 @@ selftest_ctr_128 (void) return NULL; } +/* Run the self-tests for CAMELLIA-CBC-128, tests bulk CBC decryption. + Returns NULL on success. */ +static const char* +selftest_cbc_128 (void) +{ + const int nblocks = 16+2; + CAMELLIA_context ctx ATTR_ALIGNED_16; + unsigned char plaintext[nblocks*16] ATTR_ALIGNED_16; + unsigned char ciphertext[nblocks*16] ATTR_ALIGNED_16; + unsigned char plaintext2[nblocks*16] ATTR_ALIGNED_16; + unsigned char iv[16] ATTR_ALIGNED_16; + unsigned char iv2[16] ATTR_ALIGNED_16; + int i, j; + + static const unsigned char key[16] ATTR_ALIGNED_16 = { + 0x66,0x9A,0x00,0x7F,0xC7,0x6A,0x45,0x9F, + 0x98,0xBA,0xF9,0x17,0xFE,0xDF,0x95,0x22 + }; + static char error_str[128]; + + camellia_setkey (&ctx, key, sizeof (key)); + + /* Test single block code path */ + memset(iv, 0x4e, sizeof(iv)); + memset(iv2, 0x4e, sizeof(iv2)); + for (i = 0; i < 16; i++) + plaintext[i] = i; + + /* CBC manually. */ + for (i = 0; i < 16; i++) + ciphertext[i] = iv[i] ^ plaintext[i]; + camellia_encrypt (&ctx, ciphertext, ciphertext); + memcpy(iv, ciphertext, sizeof(iv)); + + /* CBC decrypt. */ + _gcry_camellia_cbc_dec (&ctx, iv2, plaintext2, ciphertext, 1); + + if (memcmp(plaintext2, plaintext, 16)) + return "CAMELLIA-128-CBC test failed (plaintext mismatch)"; + + if (memcmp(iv2, iv, 16)) + return "CAMELLIA-128-CBC test failed (IV mismatch)"; + + /* Test parallelized code paths */ + memset(iv, 0x5f, sizeof(iv)); + memset(iv2, 0x5f, sizeof(iv2)); + + for (i = 0; i < sizeof(plaintext); i++) + plaintext[i] = i; + + /* Create CBC ciphertext manually. */ + for (i = 0; i < sizeof(plaintext); i+=16) + { + for (j = 0; j < 16; j++) + ciphertext[i+j] = iv[j] ^ plaintext[i+j]; + camellia_encrypt (&ctx, &ciphertext[i], &ciphertext[i]); + memcpy(iv, &ciphertext[i], sizeof(iv)); + } + + /* Decrypt using bulk CBC and compare result. 
*/ + _gcry_camellia_cbc_dec (&ctx, iv2, plaintext2, ciphertext, + sizeof(ciphertext) / CAMELLIA_BLOCK_SIZE); + + if (memcmp(plaintext2, plaintext, sizeof(plaintext))) + { + snprintf(error_str, sizeof(error_str), + "CAMELLIA-128-CBC test failed (plaintext mismatch, " + "parallel path)"); + return error_str; + } + if (memcmp(iv2, iv, sizeof(iv))) + { + snprintf(error_str, sizeof(error_str), + "CAMELLIA-128-CBC test failed (IV mismatch, parallel path)"); + return error_str; + } + + return NULL; +} + static const char * selftest(void) { @@ -474,6 +554,9 @@ selftest(void) if ( (r = selftest_ctr_128 ()) ) return r; + if ( (r = selftest_cbc_128 ()) ) + return r; + return NULL; } From jussi.kivilinna at iki.fi Sun May 5 17:55:33 2013 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sun, 05 May 2013 18:55:33 +0300 Subject: [PATCH 1/2] Serpent: faster S-box implementation Message-ID: <20130505155532.16505.3475.stgit@localhost6.localdomain6> * cipher/serpent.c (SBOX0, SBOX1, SBOX2, SBOX3, SBOX4, SBOX5, SBOX6) (SBOX7, SBOX0_INVERSE, SBOX1_INVERSE, SBOX2_INVERSE, SBOX3_INVERSE) (SBOX4_INVERSE, SBOX5_INVERSE, SBOX6_INVERSE, SBOX7_INVERSE): Replace with new definitions. -- These new S-box definitions are from paper: D. A. Osvik, ?Speeding up Serpent,? in Third AES Candidate Conference, (New York, New York, USA), p. 317?329, National Institute of Standards and Technology, 2000. Received from http://www.ii.uib.no/~osvik/pub/aes3.ps.gz Although these were optimized for two-operand instructions on i386 and for old Pentium-1 processors, they are slightly faster on current processors on i386 and x86-64. On ARM, the performance of these S-boxes is about the same as with the old S-boxes. new vs old speed ratios (AMD K10, x86-64): ECB/Stream CBC CFB OFB CTR --------------- --------------- --------------- --------------- --------------- SERPENT128 1.06x 1.02x 1.06x 1.02x 1.06x 1.06x 1.06x 1.05x 1.07x 1.07x new vs old speed ratios (Intel Atom, i486): ECB/Stream CBC CFB OFB CTR --------------- --------------- --------------- --------------- --------------- SERPENT128 1.12x 1.15x 1.12x 1.15x 1.13x 1.11x 1.12x 1.12x 1.12x 1.13x new vs old speed ratios (ARM Cortex A8): ECB/Stream CBC CFB OFB CTR --------------- --------------- --------------- --------------- --------------- SERPENT128 1.04x 1.02x 1.02x 0.99x 1.02x 1.02x 1.03x 1.03x 1.01x 1.01x Signed-off-by: Jussi Kivilinna --- cipher/serpent.c | 600 +++++++++++++++++++++++------------------------------- 1 file changed, 250 insertions(+), 350 deletions(-) diff --git a/cipher/serpent.c b/cipher/serpent.c index ea14c7e..72840cf 100644 --- a/cipher/serpent.c +++ b/cipher/serpent.c @@ -61,403 +61,303 @@ static const char *serpent_test (void); | (((x) & 0xff000000) >> 24) | (((x) & 0x00ff0000) >> 8) \ | (((x) & 0x0000ff00) << 8) | (((x) & 0x000000ff) << 24)) -/* These are the S-Boxes of Serpent. They are copied from Serpents - reference implementation (the optimized one, contained in - `floppy2') and are therefore: - - Copyright (C) 1998 Ross Anderson, Eli Biham, Lars Knudsen. - - To quote the Serpent homepage - (http://www.cl.cam.ac.uk/~rja14/serpent.html): - - "Serpent is now completely in the public domain, and we impose no - restrictions on its use. This was announced on the 21st August at - the First AES Candidate Conference. The optimised implementations - in the submission package are now under the GNU PUBLIC LICENSE - (GPL), although some comments in the code still say otherwise. You - are welcome to use Serpent for any application." 
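Like the definitions they replace, the new macros are bitsliced: each of the four 32-bit words carries one bit of 32 S-box lookups performed in parallel, the first word holding the least significant bit of each nibble. A small standalone harness along these lines can be used to sanity-check such a sequence against the S-box tables; it replicates the new SBOX0 operations introduced below and compares them with the S0 table as given in the Serpent specification (the function names and main() are only scaffolding for illustration):

  #include <stdio.h>
  #include <stdint.h>

  /* The new two-operand SBOX0 sequence, as plain C.  */
  static void
  sbox0_bitsliced (uint32_t *x0, uint32_t *x1, uint32_t *x2, uint32_t *x3)
  {
    uint32_t r0 = *x0, r1 = *x1, r2 = *x2, r3 = *x3, r4;

    r3 ^= r0; r4 = r1;
    r1 &= r3; r4 ^= r2;
    r1 ^= r0; r0 |= r3;
    r0 ^= r4; r4 ^= r3;
    r3 ^= r2; r2 |= r1;
    r2 ^= r4; r4 = ~r4;
    r4 |= r1; r1 ^= r3;
    r1 ^= r4; r3 |= r0;
    r1 ^= r3; r4 ^= r3;

    *x0 = r1; *x1 = r4; *x2 = r2; *x3 = r0;    /* w, x, y, z */
  }

  int
  main (void)
  {
    /* Serpent S0 from the specification.  */
    static const uint32_t s0[16] =
      { 3, 8, 15, 1, 10, 6, 5, 11, 14, 13, 4, 2, 7, 0, 9, 12 };
    uint32_t x0 = 0, x1 = 0, x2 = 0, x3 = 0, out;
    int i, ok = 1;

    /* Pack the 16 possible nibbles into bit positions 0..15; word x0
       holds bit 0 of every nibble, x3 holds bit 3.  */
    for (i = 0; i < 16; i++)
      {
        x0 |= ((uint32_t)(i >> 0) & 1) << i;
        x1 |= ((uint32_t)(i >> 1) & 1) << i;
        x2 |= ((uint32_t)(i >> 2) & 1) << i;
        x3 |= ((uint32_t)(i >> 3) & 1) << i;
      }

    sbox0_bitsliced (&x0, &x1, &x2, &x3);

    for (i = 0; i < 16; i++)
      {
        out = ((x0 >> i) & 1) | (((x1 >> i) & 1) << 1)
            | (((x2 >> i) & 1) << 2) | (((x3 >> i) & 1) << 3);
        if (out != s0[i])
          ok = 0;
      }

    printf ("SBOX0 %s\n", ok ? "matches S0" : "does NOT match S0");
    return !ok;
  }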
*/ +/* + * These are the S-Boxes of Serpent from following research paper. + * + * D. A. Osvik, ?Speeding up Serpent,? in Third AES Candidate Conference, + * (New York, New York, USA), p. 317?329, National Institute of Standards and + * Technology, 2000. + * + * Paper is also available at: http://www.ii.uib.no/~osvik/pub/aes3.pdf + * + */ -#define SBOX0(a, b, c, d, w, x, y, z) \ +#define SBOX0(r0, r1, r2, r3, w, x, y, z) \ { \ - u32 t02, t03, t05, t06, t07, t08, t09; \ - u32 t11, t12, t13, t14, t15, t17, t01; \ - t01 = b ^ c ; \ - t02 = a | d ; \ - t03 = a ^ b ; \ - z = t02 ^ t01; \ - t05 = c | z ; \ - t06 = a ^ d ; \ - t07 = b | c ; \ - t08 = d & t05; \ - t09 = t03 & t07; \ - y = t09 ^ t08; \ - t11 = t09 & y ; \ - t12 = c ^ d ; \ - t13 = t07 ^ t11; \ - t14 = b & t06; \ - t15 = t06 ^ t13; \ - w = ~ t15; \ - t17 = w ^ t14; \ - x = t12 ^ t17; \ + u32 r4; \ + \ + r3 ^= r0; r4 = r1; \ + r1 &= r3; r4 ^= r2; \ + r1 ^= r0; r0 |= r3; \ + r0 ^= r4; r4 ^= r3; \ + r3 ^= r2; r2 |= r1; \ + r2 ^= r4; r4 = ~r4; \ + r4 |= r1; r1 ^= r3; \ + r1 ^= r4; r3 |= r0; \ + r1 ^= r3; r4 ^= r3; \ + \ + w = r1; x = r4; y = r2; z = r0; \ } -#define SBOX0_INVERSE(a, b, c, d, w, x, y, z) \ +#define SBOX0_INVERSE(r0, r1, r2, r3, w, x, y, z) \ { \ - u32 t02, t03, t04, t05, t06, t08, t09, t10; \ - u32 t12, t13, t14, t15, t17, t18, t01; \ - t01 = c ^ d ; \ - t02 = a | b ; \ - t03 = b | c ; \ - t04 = c & t01; \ - t05 = t02 ^ t01; \ - t06 = a | t04; \ - y = ~ t05; \ - t08 = b ^ d ; \ - t09 = t03 & t08; \ - t10 = d | y ; \ - x = t09 ^ t06; \ - t12 = a | t05; \ - t13 = x ^ t12; \ - t14 = t03 ^ t10; \ - t15 = a ^ c ; \ - z = t14 ^ t13; \ - t17 = t05 & t13; \ - t18 = t14 | t17; \ - w = t15 ^ t18; \ + u32 r4; \ + \ + r2 = ~r2; r4 = r1; \ + r1 |= r0; r4 = ~r4; \ + r1 ^= r2; r2 |= r4; \ + r1 ^= r3; r0 ^= r4; \ + r2 ^= r0; r0 &= r3; \ + r4 ^= r0; r0 |= r1; \ + r0 ^= r2; r3 ^= r4; \ + r2 ^= r1; r3 ^= r0; \ + r3 ^= r1; \ + r2 &= r3; \ + r4 ^= r2; \ + \ + w = r0; x = r4; y = r1; z = r3; \ } -#define SBOX1(a, b, c, d, w, x, y, z) \ +#define SBOX1(r0, r1, r2, r3, w, x, y, z) \ { \ - u32 t02, t03, t04, t05, t06, t07, t08; \ - u32 t10, t11, t12, t13, t16, t17, t01; \ - t01 = a | d ; \ - t02 = c ^ d ; \ - t03 = ~ b ; \ - t04 = a ^ c ; \ - t05 = a | t03; \ - t06 = d & t04; \ - t07 = t01 & t02; \ - t08 = b | t06; \ - y = t02 ^ t05; \ - t10 = t07 ^ t08; \ - t11 = t01 ^ t10; \ - t12 = y ^ t11; \ - t13 = b & d ; \ - z = ~ t10; \ - x = t13 ^ t12; \ - t16 = t10 | x ; \ - t17 = t05 & t16; \ - w = c ^ t17; \ + u32 r4; \ + \ + r0 = ~r0; r2 = ~r2; \ + r4 = r0; r0 &= r1; \ + r2 ^= r0; r0 |= r3; \ + r3 ^= r2; r1 ^= r0; \ + r0 ^= r4; r4 |= r1; \ + r1 ^= r3; r2 |= r0; \ + r2 &= r4; r0 ^= r1; \ + r1 &= r2; \ + r1 ^= r0; r0 &= r2; \ + r0 ^= r4; \ + \ + w = r2; x = r0; y = r3; z = r1; \ } -#define SBOX1_INVERSE(a, b, c, d, w, x, y, z) \ +#define SBOX1_INVERSE(r0, r1, r2, r3, w, x, y, z) \ { \ - u32 t02, t03, t04, t05, t06, t07, t08; \ - u32 t09, t10, t11, t14, t15, t17, t01; \ - t01 = a ^ b ; \ - t02 = b | d ; \ - t03 = a & c ; \ - t04 = c ^ t02; \ - t05 = a | t04; \ - t06 = t01 & t05; \ - t07 = d | t03; \ - t08 = b ^ t06; \ - t09 = t07 ^ t06; \ - t10 = t04 | t03; \ - t11 = d & t08; \ - y = ~ t09; \ - x = t10 ^ t11; \ - t14 = a | y ; \ - t15 = t06 ^ x ; \ - z = t01 ^ t04; \ - t17 = c ^ t15; \ - w = t14 ^ t17; \ + u32 r4; \ + \ + r4 = r1; r1 ^= r3; \ + r3 &= r1; r4 ^= r2; \ + r3 ^= r0; r0 |= r1; \ + r2 ^= r3; r0 ^= r4; \ + r0 |= r2; r1 ^= r3; \ + r0 ^= r1; r1 |= r3; \ + r1 ^= r0; r4 = ~r4; \ + r4 ^= r1; r1 |= r0; \ + r1 ^= r0; \ + r1 |= r4; \ + r3 ^= r1; \ + \ 
+ w = r4; x = r0; y = r3; z = r2; \ } -#define SBOX2(a, b, c, d, w, x, y, z) \ +#define SBOX2(r0, r1, r2, r3, w, x, y, z) \ { \ - u32 t02, t03, t05, t06, t07, t08; \ - u32 t09, t10, t12, t13, t14, t01; \ - t01 = a | c ; \ - t02 = a ^ b ; \ - t03 = d ^ t01; \ - w = t02 ^ t03; \ - t05 = c ^ w ; \ - t06 = b ^ t05; \ - t07 = b | t05; \ - t08 = t01 & t06; \ - t09 = t03 ^ t07; \ - t10 = t02 | t09; \ - x = t10 ^ t08; \ - t12 = a | d ; \ - t13 = t09 ^ x ; \ - t14 = b ^ t13; \ - z = ~ t09; \ - y = t12 ^ t14; \ + u32 r4; \ + \ + r4 = r0; r0 &= r2; \ + r0 ^= r3; r2 ^= r1; \ + r2 ^= r0; r3 |= r4; \ + r3 ^= r1; r4 ^= r2; \ + r1 = r3; r3 |= r4; \ + r3 ^= r0; r0 &= r1; \ + r4 ^= r0; r1 ^= r3; \ + r1 ^= r4; r4 = ~r4; \ + \ + w = r2; x = r3; y = r1; z = r4; \ } -#define SBOX2_INVERSE(a, b, c, d, w, x, y, z) \ +#define SBOX2_INVERSE(r0, r1, r2, r3, w, x, y, z) \ { \ - u32 t02, t03, t04, t06, t07, t08, t09; \ - u32 t10, t11, t12, t15, t16, t17, t01; \ - t01 = a ^ d ; \ - t02 = c ^ d ; \ - t03 = a & c ; \ - t04 = b | t02; \ - w = t01 ^ t04; \ - t06 = a | c ; \ - t07 = d | w ; \ - t08 = ~ d ; \ - t09 = b & t06; \ - t10 = t08 | t03; \ - t11 = b & t07; \ - t12 = t06 & t02; \ - z = t09 ^ t10; \ - x = t12 ^ t11; \ - t15 = c & z ; \ - t16 = w ^ x ; \ - t17 = t10 ^ t15; \ - y = t16 ^ t17; \ + u32 r4; \ + \ + r2 ^= r3; r3 ^= r0; \ + r4 = r3; r3 &= r2; \ + r3 ^= r1; r1 |= r2; \ + r1 ^= r4; r4 &= r3; \ + r2 ^= r3; r4 &= r0; \ + r4 ^= r2; r2 &= r1; \ + r2 |= r0; r3 = ~r3; \ + r2 ^= r3; r0 ^= r3; \ + r0 &= r1; r3 ^= r4; \ + r3 ^= r0; \ + \ + w = r1; x = r4; y = r2; z = r3; \ } -#define SBOX3(a, b, c, d, w, x, y, z) \ +#define SBOX3(r0, r1, r2, r3, w, x, y, z) \ { \ - u32 t02, t03, t04, t05, t06, t07, t08; \ - u32 t09, t10, t11, t13, t14, t15, t01; \ - t01 = a ^ c ; \ - t02 = a | d ; \ - t03 = a & d ; \ - t04 = t01 & t02; \ - t05 = b | t03; \ - t06 = a & b ; \ - t07 = d ^ t04; \ - t08 = c | t06; \ - t09 = b ^ t07; \ - t10 = d & t05; \ - t11 = t02 ^ t10; \ - z = t08 ^ t09; \ - t13 = d | z ; \ - t14 = a | t07; \ - t15 = b & t13; \ - y = t08 ^ t11; \ - w = t14 ^ t15; \ - x = t05 ^ t04; \ + u32 r4; \ + \ + r4 = r0; r0 |= r3; \ + r3 ^= r1; r1 &= r4; \ + r4 ^= r2; r2 ^= r3; \ + r3 &= r0; r4 |= r1; \ + r3 ^= r4; r0 ^= r1; \ + r4 &= r0; r1 ^= r3; \ + r4 ^= r2; r1 |= r0; \ + r1 ^= r2; r0 ^= r3; \ + r2 = r1; r1 |= r3; \ + r1 ^= r0; \ + \ + w = r1; x = r2; y = r3; z = r4; \ } -#define SBOX3_INVERSE(a, b, c, d, w, x, y, z) \ +#define SBOX3_INVERSE(r0, r1, r2, r3, w, x, y, z) \ { \ - u32 t02, t03, t04, t05, t06, t07, t09; \ - u32 t11, t12, t13, t14, t16, t01; \ - t01 = c | d ; \ - t02 = a | d ; \ - t03 = c ^ t02; \ - t04 = b ^ t02; \ - t05 = a ^ d ; \ - t06 = t04 & t03; \ - t07 = b & t01; \ - y = t05 ^ t06; \ - t09 = a ^ t03; \ - w = t07 ^ t03; \ - t11 = w | t05; \ - t12 = t09 & t11; \ - t13 = a & y ; \ - t14 = t01 ^ t05; \ - x = b ^ t12; \ - t16 = b | t13; \ - z = t14 ^ t16; \ + u32 r4; \ + \ + r4 = r2; r2 ^= r1; \ + r0 ^= r2; r4 &= r2; \ + r4 ^= r0; r0 &= r1; \ + r1 ^= r3; r3 |= r4; \ + r2 ^= r3; r0 ^= r3; \ + r1 ^= r4; r3 &= r2; \ + r3 ^= r1; r1 ^= r0; \ + r1 |= r2; r0 ^= r3; \ + r1 ^= r4; \ + r0 ^= r1; \ + \ + w = r2; x = r1; y = r3; z = r0; \ } -#define SBOX4(a, b, c, d, w, x, y, z) \ +#define SBOX4(r0, r1, r2, r3, w, x, y, z) \ { \ - u32 t02, t03, t04, t05, t06, t08, t09; \ - u32 t10, t11, t12, t13, t14, t15, t16, t01; \ - t01 = a | b ; \ - t02 = b | c ; \ - t03 = a ^ t02; \ - t04 = b ^ d ; \ - t05 = d | t03; \ - t06 = d & t01; \ - z = t03 ^ t06; \ - t08 = z & t04; \ - t09 = t04 & t05; \ - t10 = c ^ t06; \ - t11 = b & c ; \ - 
t12 = t04 ^ t08; \ - t13 = t11 | t03; \ - t14 = t10 ^ t09; \ - t15 = a & t05; \ - t16 = t11 | t12; \ - y = t13 ^ t08; \ - x = t15 ^ t16; \ - w = ~ t14; \ + u32 r4; \ + \ + r1 ^= r3; r3 = ~r3; \ + r2 ^= r3; r3 ^= r0; \ + r4 = r1; r1 &= r3; \ + r1 ^= r2; r4 ^= r3; \ + r0 ^= r4; r2 &= r4; \ + r2 ^= r0; r0 &= r1; \ + r3 ^= r0; r4 |= r1; \ + r4 ^= r0; r0 |= r3; \ + r0 ^= r2; r2 &= r3; \ + r0 = ~r0; r4 ^= r2; \ + \ + w = r1; x = r4; y = r0; z = r3; \ } -#define SBOX4_INVERSE(a, b, c, d, w, x, y, z) \ +#define SBOX4_INVERSE(r0, r1, r2, r3, w, x, y, z) \ { \ - u32 t02, t03, t04, t05, t06, t07, t09; \ - u32 t10, t11, t12, t13, t15, t01; \ - t01 = b | d ; \ - t02 = c | d ; \ - t03 = a & t01; \ - t04 = b ^ t02; \ - t05 = c ^ d ; \ - t06 = ~ t03; \ - t07 = a & t04; \ - x = t05 ^ t07; \ - t09 = x | t06; \ - t10 = a ^ t07; \ - t11 = t01 ^ t09; \ - t12 = d ^ t04; \ - t13 = c | t10; \ - z = t03 ^ t12; \ - t15 = a ^ t04; \ - y = t11 ^ t13; \ - w = t15 ^ t09; \ + u32 r4; \ + \ + r4 = r2; r2 &= r3; \ + r2 ^= r1; r1 |= r3; \ + r1 &= r0; r4 ^= r2; \ + r4 ^= r1; r1 &= r2; \ + r0 = ~r0; r3 ^= r4; \ + r1 ^= r3; r3 &= r0; \ + r3 ^= r2; r0 ^= r1; \ + r2 &= r0; r3 ^= r0; \ + r2 ^= r4; \ + r2 |= r3; r3 ^= r0; \ + r2 ^= r1; \ + \ + w = r0; x = r3; y = r2; z = r4; \ } -#define SBOX5(a, b, c, d, w, x, y, z) \ +#define SBOX5(r0, r1, r2, r3, w, x, y, z) \ { \ - u32 t02, t03, t04, t05, t07, t08, t09; \ - u32 t10, t11, t12, t13, t14, t01; \ - t01 = b ^ d ; \ - t02 = b | d ; \ - t03 = a & t01; \ - t04 = c ^ t02; \ - t05 = t03 ^ t04; \ - w = ~ t05; \ - t07 = a ^ t01; \ - t08 = d | w ; \ - t09 = b | t05; \ - t10 = d ^ t08; \ - t11 = b | t07; \ - t12 = t03 | w ; \ - t13 = t07 | t10; \ - t14 = t01 ^ t11; \ - y = t09 ^ t13; \ - x = t07 ^ t08; \ - z = t12 ^ t14; \ + u32 r4; \ + \ + r0 ^= r1; r1 ^= r3; \ + r3 = ~r3; r4 = r1; \ + r1 &= r0; r2 ^= r3; \ + r1 ^= r2; r2 |= r4; \ + r4 ^= r3; r3 &= r1; \ + r3 ^= r0; r4 ^= r1; \ + r4 ^= r2; r2 ^= r0; \ + r0 &= r3; r2 = ~r2; \ + r0 ^= r4; r4 |= r3; \ + r2 ^= r4; \ + \ + w = r1; x = r3; y = r0; z = r2; \ } -#define SBOX5_INVERSE(a, b, c, d, w, x, y, z) \ +#define SBOX5_INVERSE(r0, r1, r2, r3, w, x, y, z) \ { \ - u32 t02, t03, t04, t05, t07, t08, t09; \ - u32 t10, t12, t13, t15, t16, t01; \ - t01 = a & d ; \ - t02 = c ^ t01; \ - t03 = a ^ d ; \ - t04 = b & t02; \ - t05 = a & c ; \ - w = t03 ^ t04; \ - t07 = a & w ; \ - t08 = t01 ^ w ; \ - t09 = b | t05; \ - t10 = ~ b ; \ - x = t08 ^ t09; \ - t12 = t10 | t07; \ - t13 = w | x ; \ - z = t02 ^ t12; \ - t15 = t02 ^ t13; \ - t16 = b ^ d ; \ - y = t16 ^ t15; \ + u32 r4; \ + \ + r1 = ~r1; r4 = r3; \ + r2 ^= r1; r3 |= r0; \ + r3 ^= r2; r2 |= r1; \ + r2 &= r0; r4 ^= r3; \ + r2 ^= r4; r4 |= r0; \ + r4 ^= r1; r1 &= r2; \ + r1 ^= r3; r4 ^= r2; \ + r3 &= r4; r4 ^= r1; \ + r3 ^= r4; r4 = ~r4; \ + r3 ^= r0; \ + \ + w = r1; x = r4; y = r3; z = r2; \ } -#define SBOX6(a, b, c, d, w, x, y, z) \ +#define SBOX6(r0, r1, r2, r3, w, x, y, z) \ { \ - u32 t02, t03, t04, t05, t07, t08, t09, t10; \ - u32 t11, t12, t13, t15, t17, t18, t01; \ - t01 = a & d ; \ - t02 = b ^ c ; \ - t03 = a ^ d ; \ - t04 = t01 ^ t02; \ - t05 = b | c ; \ - x = ~ t04; \ - t07 = t03 & t05; \ - t08 = b & x ; \ - t09 = a | c ; \ - t10 = t07 ^ t08; \ - t11 = b | d ; \ - t12 = c ^ t11; \ - t13 = t09 ^ t10; \ - y = ~ t13; \ - t15 = x & t03; \ - z = t12 ^ t07; \ - t17 = a ^ b ; \ - t18 = y ^ t15; \ - w = t17 ^ t18; \ + u32 r4; \ + \ + r2 = ~r2; r4 = r3; \ + r3 &= r0; r0 ^= r4; \ + r3 ^= r2; r2 |= r4; \ + r1 ^= r3; r2 ^= r0; \ + r0 |= r1; r2 ^= r1; \ + r4 ^= r0; r0 |= r3; \ + r0 ^= r2; r4 ^= r3; \ + r4 
^= r0; r3 = ~r3; \ + r2 &= r4; \ + r2 ^= r3; \ + \ + w = r0; x = r1; y = r4; z = r2; \ } -#define SBOX6_INVERSE(a, b, c, d, w, x, y, z) \ +#define SBOX6_INVERSE(r0, r1, r2, r3, w, x, y, z) \ { \ - u32 t02, t03, t04, t05, t06, t07, t08, t09; \ - u32 t12, t13, t14, t15, t16, t17, t01; \ - t01 = a ^ c ; \ - t02 = ~ c ; \ - t03 = b & t01; \ - t04 = b | t02; \ - t05 = d | t03; \ - t06 = b ^ d ; \ - t07 = a & t04; \ - t08 = a | t02; \ - t09 = t07 ^ t05; \ - x = t06 ^ t08; \ - w = ~ t09; \ - t12 = b & w ; \ - t13 = t01 & t05; \ - t14 = t01 ^ t12; \ - t15 = t07 ^ t13; \ - t16 = d | t02; \ - t17 = a ^ x ; \ - z = t17 ^ t15; \ - y = t16 ^ t14; \ + u32 r4; \ + \ + r0 ^= r2; r4 = r2; \ + r2 &= r0; r4 ^= r3; \ + r2 = ~r2; r3 ^= r1; \ + r2 ^= r3; r4 |= r0; \ + r0 ^= r2; r3 ^= r4; \ + r4 ^= r1; r1 &= r3; \ + r1 ^= r0; r0 ^= r3; \ + r0 |= r2; r3 ^= r1; \ + r4 ^= r0; \ + \ + w = r1; x = r2; y = r4; z = r3; \ } -#define SBOX7(a, b, c, d, w, x, y, z) \ +#define SBOX7(r0, r1, r2, r3, w, x, y, z) \ { \ - u32 t02, t03, t04, t05, t06, t08, t09, t10; \ - u32 t11, t13, t14, t15, t16, t17, t01; \ - t01 = a & c ; \ - t02 = ~ d ; \ - t03 = a & t02; \ - t04 = b | t01; \ - t05 = a & b ; \ - t06 = c ^ t04; \ - z = t03 ^ t06; \ - t08 = c | z ; \ - t09 = d | t05; \ - t10 = a ^ t08; \ - t11 = t04 & z ; \ - x = t09 ^ t10; \ - t13 = b ^ x ; \ - t14 = t01 ^ x ; \ - t15 = c ^ t05; \ - t16 = t11 | t13; \ - t17 = t02 | t14; \ - w = t15 ^ t17; \ - y = a ^ t16; \ + u32 r4; \ + \ + r4 = r1; r1 |= r2; \ + r1 ^= r3; r4 ^= r2; \ + r2 ^= r1; r3 |= r4; \ + r3 &= r0; r4 ^= r2; \ + r3 ^= r1; r1 |= r4; \ + r1 ^= r0; r0 |= r4; \ + r0 ^= r2; r1 ^= r4; \ + r2 ^= r1; r1 &= r0; \ + r1 ^= r4; r2 = ~r2; \ + r2 |= r0; \ + r4 ^= r2; \ + \ + w = r4; x = r3; y = r1; z = r0; \ } -#define SBOX7_INVERSE(a, b, c, d, w, x, y, z) \ +#define SBOX7_INVERSE(r0, r1, r2, r3, w, x, y, z) \ { \ - u32 t02, t03, t04, t06, t07, t08, t09; \ - u32 t10, t11, t13, t14, t15, t16, t01; \ - t01 = a & b ; \ - t02 = a | b ; \ - t03 = c | t01; \ - t04 = d & t02; \ - z = t03 ^ t04; \ - t06 = b ^ t04; \ - t07 = d ^ z ; \ - t08 = ~ t07; \ - t09 = t06 | t08; \ - t10 = b ^ d ; \ - t11 = a | d ; \ - x = a ^ t09; \ - t13 = c ^ t06; \ - t14 = c & t11; \ - t15 = d | x ; \ - t16 = t01 | t10; \ - w = t13 ^ t15; \ - y = t14 ^ t16; \ + u32 r4; \ + \ + r4 = r2; r2 ^= r0; \ + r0 &= r3; r4 |= r3; \ + r2 = ~r2; r3 ^= r1; \ + r1 |= r0; r0 ^= r2; \ + r2 &= r4; r3 &= r4; \ + r1 ^= r2; r2 ^= r0; \ + r0 |= r2; r4 ^= r1; \ + r0 ^= r3; r3 ^= r4; \ + r4 |= r0; r3 ^= r2; \ + r4 ^= r2; \ + \ + w = r3; x = r0; y = r1; z = r4; \ } /* XOR BLOCK1 into BLOCK0. */ From jussi.kivilinna at iki.fi Sun May 5 17:55:38 2013 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sun, 05 May 2013 18:55:38 +0300 Subject: [PATCH 2/2] serpent: add AVX accelerated x86-64 implementation In-Reply-To: <20130505155532.16505.3475.stgit@localhost6.localdomain6> References: <20130505155532.16505.3475.stgit@localhost6.localdomain6> Message-ID: <20130505155538.16505.18822.stgit@localhost6.localdomain6> * configure.ac (serpent) [ENABLE_AVX_SUPPORT]: Add 'serpent_avx_x86-64.lo'. * cipher/Makefile.am (EXTRA_libcipher_la_SOURCES): Add 'serpent_avx_x86-64.S'. * cipher/cipher.c (gcry_cipher_open) [USE_SERPENT]: Register bulk functions for CBC-decryption and CTR-mode. * cipher/serpent.c (USE_AVX): New macro. [USE_AVX] (serpent_context_t): Add 'use_avx'. [USE_AVX] (_gcry_serpent_avx_ctr_enc, _gcry_serpent_avx_cbc_dec): New prototypes to assembler functions. (serpent_setkey): Set 'serpent_init_done' before calling serpent_test. 
(serpent_setkey) [USE_AVX]: Enable 'use_avx' is hardware supports AVX. (_gcry_serpent_ctr_enc): New function. (_gcry_serpent_cbc_dec): New function. (selftest_ctr_128): New function. (selftest_cbc_128): New function. (selftest): Call selftest_ctr_128 and selftest_cbc_128. * cipher/serpent_avx_x86-64.S: New file. * src/cipher.h (_gcry_serpent_ctr_enc): New prototype. (_gcry_serpent_cbc_dec): New prototype. -- Patch adds word-sliced AVX implementation of Serpent for x86-64 for speeding up parallelizable workloads (CTR mode, CBC mode decryption). Implementation processes eight blocks in parallel, with two four-block sets interleaved for out-of-order scheduling. Speed old vs. new on Intel Core i5-2450M (Sandy-Bridge): ECB/Stream CBC CFB OFB CTR --------------- --------------- --------------- --------------- --------------- SERPENT128 1.00x 1.00x 1.00x 4.44x 1.00x 0.99x 1.00x 1.00x 4.68x 4.58x SERPENT256 1.00x 0.99x 1.00x 4.42x 1.00x 1.01x 1.00x 1.00x 4.68x 4.68x Signed-off-by: Jussi Kivilinna --- cipher/Makefile.am | 2 cipher/cipher.c | 8 cipher/serpent.c | 209 ++++++++++++ cipher/serpent_avx_x86-64.S | 766 +++++++++++++++++++++++++++++++++++++++++++ configure.ac | 5 src/cipher.h | 7 6 files changed, 994 insertions(+), 3 deletions(-) create mode 100644 cipher/serpent_avx_x86-64.S diff --git a/cipher/Makefile.am b/cipher/Makefile.am index 0b61a27..f9291a8 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -68,7 +68,7 @@ rmd160.c \ rsa.c \ scrypt.c \ seed.c \ -serpent.c \ +serpent.c serpent_avx_x86-64.S \ sha1.c \ sha256.c \ sha512.c \ diff --git a/cipher/cipher.c b/cipher/cipher.c index f1224af..20ac2c7 100644 --- a/cipher/cipher.c +++ b/cipher/cipher.c @@ -726,6 +726,14 @@ gcry_cipher_open (gcry_cipher_hd_t *handle, h->bulk.ctr_enc = _gcry_camellia_ctr_enc; break; #endif /*USE_CAMELLIA*/ +#ifdef USE_SERPENT + case GCRY_CIPHER_SERPENT128: + case GCRY_CIPHER_SERPENT192: + case GCRY_CIPHER_SERPENT256: + h->bulk.cbc_dec = _gcry_serpent_cbc_dec; + h->bulk.ctr_enc = _gcry_serpent_ctr_enc; + break; +#endif /*USE_SERPENT*/ default: break; diff --git a/cipher/serpent.c b/cipher/serpent.c index 72840cf..b38c586 100644 --- a/cipher/serpent.c +++ b/cipher/serpent.c @@ -28,6 +28,17 @@ #include "g10lib.h" #include "cipher.h" #include "bithelp.h" +#include "bufhelp.h" +#include "selftest_help.h" + + +/* USE_AVX indicates whether to compile with Intel AVX code. */ +#undef USE_AVX +#if defined(ENABLE_AVX_SUPPORT) +# if defined(__x86_64__) +# define USE_AVX 1 +# endif +#endif /* Number of rounds per Serpent encrypt/decrypt operation. */ #define ROUNDS 32 @@ -49,9 +60,28 @@ typedef u32 serpent_subkeys_t[ROUNDS + 1][4]; typedef struct serpent_context { serpent_subkeys_t keys; /* Generated subkeys. */ + +#ifdef USE_AVX + int use_avx; /* AVX implementation shall be used. */ +#endif /*USE_AVX*/ } serpent_context_t; +#ifdef USE_AVX +/* Assembler implementations of Serpent using AVX. Process data in 18 block + same time. + */ +extern void _gcry_serpent_avx_ctr_enc(serpent_context_t *ctx, + unsigned char *out, + const unsigned char *in, + unsigned char *ctr); + +extern void _gcry_serpent_avx_cbc_dec(serpent_context_t *ctx, + unsigned char *out, + const unsigned char *in, + unsigned char *iv); +#endif + /* A prototype. 
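One item in the changelog above is easy to miss: serpent_setkey now sets serpent_init_done before it runs serpent_test (the hunk is further down). The ordering matters with this patch because serpent_test now includes the bulk self-tests, and those call serpent_setkey again through the generic helpers; with the old ordering the first setkey would recurse back into the self-test. The guard pattern, reduced to a runnable sketch (foo_setkey and run_selftest are made-up names for illustration):

  #include <stdio.h>

  static int init_done;

  static void foo_setkey (const unsigned char *key, unsigned int keylen);

  static void
  run_selftest (void)
  {
    static const unsigned char test_key[16] = { 0 };

    /* The bulk selftest helpers end up calling the cipher's setkey.  */
    foo_setkey (test_key, sizeof test_key);
    puts ("self-test ran once");
  }

  static void
  foo_setkey (const unsigned char *key, unsigned int keylen)
  {
    (void)key; (void)keylen;
    if (!init_done)
      {
        init_done = 1;      /* must be set *before* the self-test,  */
        run_selftest ();    /* which re-enters foo_setkey           */
      }
    /* ... expand the key schedule here ... */
  }

  int
  main (void)
  {
    static const unsigned char k[16] = { 1 };

    foo_setkey (k, sizeof k);   /* triggers exactly one self-test run */
    return 0;
  }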
*/ static const char *serpent_test (void); @@ -191,7 +221,7 @@ static const char *serpent_test (void); r4 &= r0; r1 ^= r3; \ r4 ^= r2; r1 |= r0; \ r1 ^= r2; r0 ^= r3; \ - r2 = r1; r1 |= r3; \ + r2 = r1; r1 |= r3; \ r1 ^= r0; \ \ w = r1; x = r2; y = r3; z = r4; \ @@ -587,10 +617,10 @@ serpent_setkey (void *ctx, if (! serpent_init_done) { /* Execute a self-test the first time, Serpent is used. */ + serpent_init_done = 1; serpent_test_ret = serpent_test (); if (serpent_test_ret) log_error ("Serpent test failure: %s\n", serpent_test_ret); - serpent_init_done = 1; } if (serpent_test_ret) @@ -601,6 +631,14 @@ serpent_setkey (void *ctx, _gcry_burn_stack (sizeof (serpent_key_t)); } +#ifdef USE_AVX + context->use_avx = 0; + if ((_gcry_get_hw_features () & HWF_INTEL_AVX)) + { + context->use_avx = 1; + } +#endif + return ret; } @@ -740,6 +778,166 @@ serpent_decrypt (void *ctx, byte *buffer_out, const byte *buffer_in) +/* Bulk encryption of complete blocks in CTR mode. This function is only + intended for the bulk encryption feature of cipher.c. CTR is expected to be + of size sizeof(serpent_block_t). */ +void +_gcry_serpent_ctr_enc(void *context, unsigned char *ctr, + void *outbuf_arg, const void *inbuf_arg, + unsigned int nblocks) +{ + serpent_context_t *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + unsigned char tmpbuf[sizeof(serpent_block_t)]; + int burn_stack_depth = 2 * sizeof (serpent_block_t); + int i; + +#ifdef USE_AVX + if (ctx->use_avx) + { + int did_use_avx = 0; + + /* Process data in 8 block chunks. */ + while (nblocks >= 8) + { + _gcry_serpent_avx_ctr_enc(ctx, outbuf, inbuf, ctr); + + nblocks -= 8; + outbuf += 8 * sizeof(serpent_block_t); + inbuf += 8 * sizeof(serpent_block_t); + did_use_avx = 1; + } + + if (did_use_avx) + { + /* clear AVX registers */ + asm volatile ("vzeroall;\n":::); + + /* serpent-avx assembly code does not use stack */ + if (nblocks == 0) + burn_stack_depth = 0; + } + + /* Use generic code to handle smaller chunks... */ + /* TODO: use caching instead? */ + } +#endif + + for ( ;nblocks; nblocks-- ) + { + /* Encrypt the counter. */ + serpent_encrypt_internal(ctx, ctr, tmpbuf); + /* XOR the input with the encrypted counter and store in output. */ + buf_xor(outbuf, tmpbuf, inbuf, sizeof(serpent_block_t)); + outbuf += sizeof(serpent_block_t); + inbuf += sizeof(serpent_block_t); + /* Increment the counter. */ + for (i = sizeof(serpent_block_t); i > 0; i--) + { + ctr[i-1]++; + if (ctr[i-1]) + break; + } + } + + wipememory(tmpbuf, sizeof(tmpbuf)); + _gcry_burn_stack(burn_stack_depth); +} + +/* Bulk decryption of complete blocks in CBC mode. This function is only + intended for the bulk encryption feature of cipher.c. */ +void +_gcry_serpent_cbc_dec(void *context, unsigned char *iv, + void *outbuf_arg, const void *inbuf_arg, + unsigned int nblocks) +{ + serpent_context_t *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + unsigned char savebuf[sizeof(serpent_block_t)]; + int burn_stack_depth = 2 * sizeof (serpent_block_t); + +#ifdef USE_AVX + if (ctx->use_avx) + { + int did_use_avx = 0; + + /* Process data in 8 block chunks. 
*/ + while (nblocks >= 8) + { + _gcry_serpent_avx_cbc_dec(ctx, outbuf, inbuf, iv); + + nblocks -= 8; + outbuf += 8 * sizeof(serpent_block_t); + inbuf += 8 * sizeof(serpent_block_t); + did_use_avx = 1; + } + + if (did_use_avx) + { + /* clear AVX registers */ + asm volatile ("vzeroall;\n":::); + + /* serpent-avx assembly code does not use stack */ + if (nblocks == 0) + burn_stack_depth = 0; + } + + /* Use generic code to handle smaller chunks... */ + } +#endif + + for ( ;nblocks; nblocks-- ) + { + /* We need to save INBUF away because it may be identical to + OUTBUF. */ + memcpy(savebuf, inbuf, sizeof(serpent_block_t)); + + serpent_decrypt_internal (ctx, inbuf, outbuf); + + buf_xor(outbuf, outbuf, iv, sizeof(serpent_block_t)); + memcpy(iv, savebuf, sizeof(serpent_block_t)); + inbuf += sizeof(serpent_block_t); + outbuf += sizeof(serpent_block_t); + } + + wipememory(savebuf, sizeof(savebuf)); + _gcry_burn_stack(burn_stack_depth); +} + + + +/* Run the self-tests for SERPENT-CTR-128, tests IV increment of bulk CTR + encryption. Returns NULL on success. */ +static const char* +selftest_ctr_128 (void) +{ + const int nblocks = 8+1; + const int blocksize = sizeof(serpent_block_t); + const int context_size = sizeof(serpent_context_t); + + return _gcry_selftest_helper_ctr_128("SERPENT", &serpent_setkey, + &serpent_encrypt, &_gcry_serpent_ctr_enc, nblocks, blocksize, + context_size); +} + + +/* Run the self-tests for SERPENT-CBC-128, tests bulk CBC decryption. + Returns NULL on success. */ +static const char* +selftest_cbc_128 (void) +{ + const int nblocks = 8+2; + const int blocksize = sizeof(serpent_block_t); + const int context_size = sizeof(serpent_context_t); + + return _gcry_selftest_helper_cbc_128("SERPENT", &serpent_setkey, + &serpent_encrypt, &_gcry_serpent_cbc_dec, nblocks, blocksize, + context_size); +} + + /* Serpent test. */ static const char * @@ -748,6 +946,7 @@ serpent_test (void) serpent_context_t context; unsigned char scratch[16]; unsigned int i; + const char *r; static struct test { @@ -819,6 +1018,12 @@ serpent_test (void) } } + if ( (r = selftest_ctr_128 ()) ) + return r; + + if ( (r = selftest_cbc_128 ()) ) + return r; + return NULL; } diff --git a/cipher/serpent_avx_x86-64.S b/cipher/serpent_avx_x86-64.S new file mode 100644 index 0000000..0544f67 --- /dev/null +++ b/cipher/serpent_avx_x86-64.S @@ -0,0 +1,766 @@ +/* serpent_avx_x86-64.S - AVX implementation of Serpent cipher + * + * Copyright ? 2013 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . 
+ */ + +#ifdef __x86_64 +#include +#if defined(ENABLE_AVX_SUPPORT) && defined(USE_SERPENT) + +#ifdef __PIC__ +# define RIP (%rip) +#else +# define RIP +#endif + +/* struct serpent_context: */ +#define ctx_keys 0 + +/* register macros */ +#define CTX %rdi + +/* vector registers */ +.set RA0, %xmm0 +.set RA1, %xmm1 +.set RA2, %xmm2 +.set RA3, %xmm3 +.set RA4, %xmm4 + +.set RB0, %xmm5 +.set RB1, %xmm6 +.set RB2, %xmm7 +.set RB3, %xmm8 +.set RB4, %xmm9 + +.set RNOT, %xmm10 +.set RTMP0, %xmm11 +.set RTMP1, %xmm12 +.set RTMP2, %xmm13 + +/********************************************************************** + helper macros + **********************************************************************/ + +/* preprocessor macro for renaming vector registers using GAS macros */ +#define sbox_reg_rename(r0, r1, r2, r3, r4, \ + new_r0, new_r1, new_r2, new_r3, new_r4) \ + .set rename_reg0, new_r0; \ + .set rename_reg1, new_r1; \ + .set rename_reg2, new_r2; \ + .set rename_reg3, new_r3; \ + .set rename_reg4, new_r4; \ + \ + .set r0, rename_reg0; \ + .set r1, rename_reg1; \ + .set r2, rename_reg2; \ + .set r3, rename_reg3; \ + .set r4, rename_reg4; + +/* vector 32-bit rotation to left */ +#define vec_rol(reg, nleft, tmp) \ + vpslld $(nleft), reg, tmp; \ + vpsrld $(32 - (nleft)), reg, reg; \ + vpor reg, tmp, reg; + +/* vector 32-bit rotation to right */ +#define vec_ror(reg, nright, tmp) \ + vec_rol(reg, 32 - nright, tmp) + +/* 4x4 32-bit integer matrix transpose */ +#define transpose_4x4(x0, x1, x2, x3, t1, t2) \ + vpunpckhdq x1, x0, t2; \ + vpunpckldq x1, x0, x0; \ + \ + vpunpckldq x3, x2, t1; \ + vpunpckhdq x3, x2, x2; \ + \ + vpunpckhqdq t1, x0, x1; \ + vpunpcklqdq t1, x0, x0; \ + \ + vpunpckhqdq x2, t2, x3; \ + vpunpcklqdq x2, t2, x2; + +/********************************************************************** + 8-way serpent + **********************************************************************/ + +/* + * These are the S-Boxes of Serpent from following research paper. + * + * D. A. Osvik, ?Speeding up Serpent,? in Third AES Candidate Conference, + * (New York, New York, USA), p. 317?329, National Institute of Standards and + * Technology, 2000. 
+ * + * Paper is also available at: http://www.ii.uib.no/~osvik/pub/aes3.pdf + * + */ +#define SBOX0(r0, r1, r2, r3, r4) \ + vpxor r3, r0, r3; vmovdqa r1, r4; \ + vpand r1, r3, r1; vpxor r4, r2, r4; \ + vpxor r1, r0, r1; vpor r0, r3, r0; \ + vpxor r0, r4, r0; vpxor r4, r3, r4; \ + vpxor r3, r2, r3; vpor r2, r1, r2; \ + vpxor r2, r4, r2; vpxor r4, RNOT, r4; \ + vpor r4, r1, r4; vpxor r1, r3, r1; \ + vpxor r1, r4, r1; vpor r3, r0, r3; \ + vpxor r1, r3, r1; vpxor r4, r3, r4; \ + \ + sbox_reg_rename(r0,r1,r2,r3,r4, r1,r4,r2,r0,r3); + +#define SBOX0_INVERSE(r0, r1, r2, r3, r4) \ + vpxor r2, RNOT, r2; vmovdqa r1, r4; \ + vpor r1, r0, r1; vpxor r4, RNOT, r4; \ + vpxor r1, r2, r1; vpor r2, r4, r2; \ + vpxor r1, r3, r1; vpxor r0, r4, r0; \ + vpxor r2, r0, r2; vpand r0, r3, r0; \ + vpxor r4, r0, r4; vpor r0, r1, r0; \ + vpxor r0, r2, r0; vpxor r3, r4, r3; \ + vpxor r2, r1, r2; vpxor r3, r0, r3; \ + vpxor r3, r1, r3; \ + vpand r2, r3, r2; \ + vpxor r4, r2, r4; \ + \ + sbox_reg_rename(r0,r1,r2,r3,r4, r0,r4,r1,r3,r2); + +#define SBOX1(r0, r1, r2, r3, r4) \ + vpxor r0, RNOT, r0; vpxor r2, RNOT, r2; \ + vmovdqa r0, r4; vpand r0, r1, r0; \ + vpxor r2, r0, r2; vpor r0, r3, r0; \ + vpxor r3, r2, r3; vpxor r1, r0, r1; \ + vpxor r0, r4, r0; vpor r4, r1, r4; \ + vpxor r1, r3, r1; vpor r2, r0, r2; \ + vpand r2, r4, r2; vpxor r0, r1, r0; \ + vpand r1, r2, r1; \ + vpxor r1, r0, r1; vpand r0, r2, r0; \ + vpxor r0, r4, r0; \ + \ + sbox_reg_rename(r0,r1,r2,r3,r4, r2,r0,r3,r1,r4); + +#define SBOX1_INVERSE(r0, r1, r2, r3, r4) \ + vmovdqa r1, r4; vpxor r1, r3, r1; \ + vpand r3, r1, r3; vpxor r4, r2, r4; \ + vpxor r3, r0, r3; vpor r0, r1, r0; \ + vpxor r2, r3, r2; vpxor r0, r4, r0; \ + vpor r0, r2, r0; vpxor r1, r3, r1; \ + vpxor r0, r1, r0; vpor r1, r3, r1; \ + vpxor r1, r0, r1; vpxor r4, RNOT, r4; \ + vpxor r4, r1, r4; vpor r1, r0, r1; \ + vpxor r1, r0, r1; \ + vpor r1, r4, r1; \ + vpxor r3, r1, r3; \ + \ + sbox_reg_rename(r0,r1,r2,r3,r4, r4,r0,r3,r2,r1); + +#define SBOX2(r0, r1, r2, r3, r4) \ + vmovdqa r0, r4; vpand r0, r2, r0; \ + vpxor r0, r3, r0; vpxor r2, r1, r2; \ + vpxor r2, r0, r2; vpor r3, r4, r3; \ + vpxor r3, r1, r3; vpxor r4, r2, r4; \ + vmovdqa r3, r1; vpor r3, r4, r3; \ + vpxor r3, r0, r3; vpand r0, r1, r0; \ + vpxor r4, r0, r4; vpxor r1, r3, r1; \ + vpxor r1, r4, r1; vpxor r4, RNOT, r4; \ + \ + sbox_reg_rename(r0,r1,r2,r3,r4, r2,r3,r1,r4,r0); + +#define SBOX2_INVERSE(r0, r1, r2, r3, r4) \ + vpxor r2, r3, r2; vpxor r3, r0, r3; \ + vmovdqa r3, r4; vpand r3, r2, r3; \ + vpxor r3, r1, r3; vpor r1, r2, r1; \ + vpxor r1, r4, r1; vpand r4, r3, r4; \ + vpxor r2, r3, r2; vpand r4, r0, r4; \ + vpxor r4, r2, r4; vpand r2, r1, r2; \ + vpor r2, r0, r2; vpxor r3, RNOT, r3; \ + vpxor r2, r3, r2; vpxor r0, r3, r0; \ + vpand r0, r1, r0; vpxor r3, r4, r3; \ + vpxor r3, r0, r3; \ + \ + sbox_reg_rename(r0,r1,r2,r3,r4, r1,r4,r2,r3,r0); + +#define SBOX3(r0, r1, r2, r3, r4) \ + vmovdqa r0, r4; vpor r0, r3, r0; \ + vpxor r3, r1, r3; vpand r1, r4, r1; \ + vpxor r4, r2, r4; vpxor r2, r3, r2; \ + vpand r3, r0, r3; vpor r4, r1, r4; \ + vpxor r3, r4, r3; vpxor r0, r1, r0; \ + vpand r4, r0, r4; vpxor r1, r3, r1; \ + vpxor r4, r2, r4; vpor r1, r0, r1; \ + vpxor r1, r2, r1; vpxor r0, r3, r0; \ + vmovdqa r1, r2; vpor r1, r3, r1; \ + vpxor r1, r0, r1; \ + \ + sbox_reg_rename(r0,r1,r2,r3,r4, r1,r2,r3,r4,r0); + +#define SBOX3_INVERSE(r0, r1, r2, r3, r4) \ + vmovdqa r2, r4; vpxor r2, r1, r2; \ + vpxor r0, r2, r0; vpand r4, r2, r4; \ + vpxor r4, r0, r4; vpand r0, r1, r0; \ + vpxor r1, r3, r1; vpor r3, r4, r3; \ + vpxor r2, r3, r2; vpxor r0, 
r3, r0; \ + vpxor r1, r4, r1; vpand r3, r2, r3; \ + vpxor r3, r1, r3; vpxor r1, r0, r1; \ + vpor r1, r2, r1; vpxor r0, r3, r0; \ + vpxor r1, r4, r1; \ + vpxor r0, r1, r0; \ + \ + sbox_reg_rename(r0,r1,r2,r3,r4, r2,r1,r3,r0,r4); + +#define SBOX4(r0, r1, r2, r3, r4) \ + vpxor r1, r3, r1; vpxor r3, RNOT, r3; \ + vpxor r2, r3, r2; vpxor r3, r0, r3; \ + vmovdqa r1, r4; vpand r1, r3, r1; \ + vpxor r1, r2, r1; vpxor r4, r3, r4; \ + vpxor r0, r4, r0; vpand r2, r4, r2; \ + vpxor r2, r0, r2; vpand r0, r1, r0; \ + vpxor r3, r0, r3; vpor r4, r1, r4; \ + vpxor r4, r0, r4; vpor r0, r3, r0; \ + vpxor r0, r2, r0; vpand r2, r3, r2; \ + vpxor r0, RNOT, r0; vpxor r4, r2, r4; \ + \ + sbox_reg_rename(r0,r1,r2,r3,r4, r1,r4,r0,r3,r2); + +#define SBOX4_INVERSE(r0, r1, r2, r3, r4) \ + vmovdqa r2, r4; vpand r2, r3, r2; \ + vpxor r2, r1, r2; vpor r1, r3, r1; \ + vpand r1, r0, r1; vpxor r4, r2, r4; \ + vpxor r4, r1, r4; vpand r1, r2, r1; \ + vpxor r0, RNOT, r0; vpxor r3, r4, r3; \ + vpxor r1, r3, r1; vpand r3, r0, r3; \ + vpxor r3, r2, r3; vpxor r0, r1, r0; \ + vpand r2, r0, r2; vpxor r3, r0, r3; \ + vpxor r2, r4, r2; \ + vpor r2, r3, r2; vpxor r3, r0, r3; \ + vpxor r2, r1, r2; \ + \ + sbox_reg_rename(r0,r1,r2,r3,r4, r0,r3,r2,r4,r1); + +#define SBOX5(r0, r1, r2, r3, r4) \ + vpxor r0, r1, r0; vpxor r1, r3, r1; \ + vpxor r3, RNOT, r3; vmovdqa r1, r4; \ + vpand r1, r0, r1; vpxor r2, r3, r2; \ + vpxor r1, r2, r1; vpor r2, r4, r2; \ + vpxor r4, r3, r4; vpand r3, r1, r3; \ + vpxor r3, r0, r3; vpxor r4, r1, r4; \ + vpxor r4, r2, r4; vpxor r2, r0, r2; \ + vpand r0, r3, r0; vpxor r2, RNOT, r2; \ + vpxor r0, r4, r0; vpor r4, r3, r4; \ + vpxor r2, r4, r2; \ + \ + sbox_reg_rename(r0,r1,r2,r3,r4, r1,r3,r0,r2,r4); + +#define SBOX5_INVERSE(r0, r1, r2, r3, r4) \ + vpxor r1, RNOT, r1; vmovdqa r3, r4; \ + vpxor r2, r1, r2; vpor r3, r0, r3; \ + vpxor r3, r2, r3; vpor r2, r1, r2; \ + vpand r2, r0, r2; vpxor r4, r3, r4; \ + vpxor r2, r4, r2; vpor r4, r0, r4; \ + vpxor r4, r1, r4; vpand r1, r2, r1; \ + vpxor r1, r3, r1; vpxor r4, r2, r4; \ + vpand r3, r4, r3; vpxor r4, r1, r4; \ + vpxor r3, r4, r3; vpxor r4, RNOT, r4; \ + vpxor r3, r0, r3; \ + \ + sbox_reg_rename(r0,r1,r2,r3,r4, r1,r4,r3,r2,r0); + +#define SBOX6(r0, r1, r2, r3, r4) \ + vpxor r2, RNOT, r2; vmovdqa r3, r4; \ + vpand r3, r0, r3; vpxor r0, r4, r0; \ + vpxor r3, r2, r3; vpor r2, r4, r2; \ + vpxor r1, r3, r1; vpxor r2, r0, r2; \ + vpor r0, r1, r0; vpxor r2, r1, r2; \ + vpxor r4, r0, r4; vpor r0, r3, r0; \ + vpxor r0, r2, r0; vpxor r4, r3, r4; \ + vpxor r4, r0, r4; vpxor r3, RNOT, r3; \ + vpand r2, r4, r2; \ + vpxor r2, r3, r2; \ + \ + sbox_reg_rename(r0,r1,r2,r3,r4, r0,r1,r4,r2,r3); + +#define SBOX6_INVERSE(r0, r1, r2, r3, r4) \ + vpxor r0, r2, r0; vmovdqa r2, r4; \ + vpand r2, r0, r2; vpxor r4, r3, r4; \ + vpxor r2, RNOT, r2; vpxor r3, r1, r3; \ + vpxor r2, r3, r2; vpor r4, r0, r4; \ + vpxor r0, r2, r0; vpxor r3, r4, r3; \ + vpxor r4, r1, r4; vpand r1, r3, r1; \ + vpxor r1, r0, r1; vpxor r0, r3, r0; \ + vpor r0, r2, r0; vpxor r3, r1, r3; \ + vpxor r4, r0, r4; \ + \ + sbox_reg_rename(r0,r1,r2,r3,r4, r1,r2,r4,r3,r0); + +#define SBOX7(r0, r1, r2, r3, r4) \ + vmovdqa r1, r4; vpor r1, r2, r1; \ + vpxor r1, r3, r1; vpxor r4, r2, r4; \ + vpxor r2, r1, r2; vpor r3, r4, r3; \ + vpand r3, r0, r3; vpxor r4, r2, r4; \ + vpxor r3, r1, r3; vpor r1, r4, r1; \ + vpxor r1, r0, r1; vpor r0, r4, r0; \ + vpxor r0, r2, r0; vpxor r1, r4, r1; \ + vpxor r2, r1, r2; vpand r1, r0, r1; \ + vpxor r1, r4, r1; vpxor r2, RNOT, r2; \ + vpor r2, r0, r2; \ + vpxor r4, r2, r4; \ + \ + 
sbox_reg_rename(r0,r1,r2,r3,r4, r4,r3,r1,r0,r2); + +#define SBOX7_INVERSE(r0, r1, r2, r3, r4) \ + vmovdqa r2, r4; vpxor r2, r0, r2; \ + vpand r0, r3, r0; vpor r4, r3, r4; \ + vpxor r2, RNOT, r2; vpxor r3, r1, r3; \ + vpor r1, r0, r1; vpxor r0, r2, r0; \ + vpand r2, r4, r2; vpand r3, r4, r3; \ + vpxor r1, r2, r1; vpxor r2, r0, r2; \ + vpor r0, r2, r0; vpxor r4, r1, r4; \ + vpxor r0, r3, r0; vpxor r3, r4, r3; \ + vpor r4, r0, r4; vpxor r3, r2, r3; \ + vpxor r4, r2, r4; \ + \ + sbox_reg_rename(r0,r1,r2,r3,r4, r3,r0,r1,r4,r2); + +/* Apply SBOX number WHICH to to the block. */ +#define SBOX(which, r0, r1, r2, r3, r4) \ + SBOX##which (r0, r1, r2, r3, r4) + +/* Apply inverse SBOX number WHICH to to the block. */ +#define SBOX_INVERSE(which, r0, r1, r2, r3, r4) \ + SBOX##which##_INVERSE (r0, r1, r2, r3, r4) + +/* XOR round key into block state in r0,r1,r2,r3. r4 used as temporary. */ +#define BLOCK_XOR_KEY(r0, r1, r2, r3, r4, round) \ + vbroadcastss (ctx_keys + (round) * 16 + 0 * 4)(CTX), r4; \ + vpxor r0, r4, r0; \ + vbroadcastss (ctx_keys + (round) * 16 + 1 * 4)(CTX), r4; \ + vpxor r1, r4, r1; \ + vbroadcastss (ctx_keys + (round) * 16 + 2 * 4)(CTX), r4; \ + vpxor r2, r4, r2; \ + vbroadcastss (ctx_keys + (round) * 16 + 3 * 4)(CTX), r4; \ + vpxor r3, r4, r3; + +/* Apply the linear transformation to BLOCK. */ +#define LINEAR_TRANSFORMATION(r0, r1, r2, r3, r4) \ + vec_rol(r0, 13, r4); \ + vec_rol(r2, 3, r4); \ + vpxor r1, r0, r1; \ + vpxor r2, r1, r1; \ + vpslld $3, r0, r4; \ + vpxor r3, r2, r3; \ + vpxor r4, r3, r3; \ + vec_rol(r1, 1, r4); \ + vec_rol(r3, 7, r4); \ + vpxor r0, r1, r0; \ + vpxor r3, r0, r0; \ + vpslld $7, r1, r4; \ + vpxor r2, r3, r2; \ + vpxor r4, r2, r2; \ + vec_rol(r0, 5, r4); \ + vec_rol(r2, 22, r4); + +/* Apply the inverse linear transformation to BLOCK. */ +#define LINEAR_TRANSFORMATION_INVERSE(r0, r1, r2, r3, r4) \ + vec_ror(r2, 22, r4); \ + vec_ror(r0, 5, r4); \ + vpslld $7, r1, r4; \ + vpxor r2, r3, r2; \ + vpxor r4, r2, r2; \ + vpxor r0, r1, r0; \ + vpxor r3, r0, r0; \ + vec_ror(r3, 7, r4); \ + vec_ror(r1, 1, r4); \ + vpslld $3, r0, r4; \ + vpxor r3, r2, r3; \ + vpxor r4, r3, r3; \ + vpxor r1, r0, r1; \ + vpxor r2, r1, r1; \ + vec_ror(r2, 3, r4); \ + vec_ror(r0, 13, r4); + +/* Apply a Serpent round to eight parallel blocks. This macro increments + `round'. */ +#define ROUND(which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \ + BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \ + SBOX (which, a0, a1, a2, a3, a4); \ + BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \ + SBOX (which, b0, b1, b2, b3, b4); \ + LINEAR_TRANSFORMATION (a0, a1, a2, a3, a4); \ + LINEAR_TRANSFORMATION (b0, b1, b2, b3, b4); \ + .set round, (round + 1); + +/* Apply the last Serpent round to eight parallel blocks. This macro increments + `round'. */ +#define ROUND_LAST(which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \ + BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \ + SBOX (which, a0, a1, a2, a3, a4); \ + BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \ + SBOX (which, b0, b1, b2, b3, b4); \ + .set round, (round + 1); \ + BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \ + BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \ + .set round, (round + 1); + +/* Apply an inverse Serpent round to eight parallel blocks. This macro + increments `round'. 
*/ +#define ROUND_INVERSE(which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \ + LINEAR_TRANSFORMATION_INVERSE (a0, a1, a2, a3, a4); \ + LINEAR_TRANSFORMATION_INVERSE (b0, b1, b2, b3, b4); \ + SBOX_INVERSE (which, a0, a1, a2, a3, a4); \ + BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \ + SBOX_INVERSE (which, b0, b1, b2, b3, b4); \ + BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \ + .set round, (round - 1); + +/* Apply the first inverse Serpent round to eight parallel blocks. This macro + increments `round'. */ +#define ROUND_FIRST_INVERSE(which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \ + BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \ + BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \ + .set round, (round - 1); \ + SBOX_INVERSE (which, a0, a1, a2, a3, a4); \ + BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \ + SBOX_INVERSE (which, b0, b1, b2, b3, b4); \ + BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \ + .set round, (round - 1); + +.data +.align 16 + +/* For CTR-mode IV byteswap */ +.Lbswap128_mask: + .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 + +.text + +.align 8 +.type __serpent_enc_blk8, at function; +__serpent_enc_blk8: + /* input: + * %rdi: ctx, CTX + * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel plaintext + * blocks + * output: + * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel + * ciphertext blocks + */ + + /* record input vector names for __serpent_enc_blk8 */ + .set enc_in_a0, RA0 + .set enc_in_a1, RA1 + .set enc_in_a2, RA2 + .set enc_in_a3, RA3 + .set enc_in_b0, RB0 + .set enc_in_b1, RB1 + .set enc_in_b2, RB2 + .set enc_in_b3, RB3 + + vpcmpeqd RNOT, RNOT, RNOT; + + transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0); + transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP1); + + .set round, 0 + ROUND (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); 
+ ROUND (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + + ROUND_LAST (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + + transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0); + transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP1); + + /* record output vector names for __serpent_enc_blk8 */ + .set enc_out_a0, RA0 + .set enc_out_a1, RA1 + .set enc_out_a2, RA2 + .set enc_out_a3, RA3 + .set enc_out_b0, RB0 + .set enc_out_b1, RB1 + .set enc_out_b2, RB2 + .set enc_out_b3, RB3 + + ret; +.size __serpent_enc_blk8,.-__serpent_enc_blk8; + +.align 8 +.type __serpent_dec_blk8, at function; +__serpent_dec_blk8: + /* input: + * %rdi: ctx, CTX + * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel + * ciphertext blocks + * output: + * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel plaintext + * blocks + */ + + /* record input vector names for __serpent_dec_blk8 */ + .set dec_in_a0, RA0 + .set dec_in_a1, RA1 + .set dec_in_a2, RA2 + .set dec_in_a3, RA3 + .set dec_in_b0, RB0 + .set dec_in_b1, RB1 + .set dec_in_b2, RB2 + .set dec_in_b3, RB3 + + vpcmpeqd RNOT, RNOT, RNOT; + + transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0); + transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP1); + + .set round, 32 + ROUND_FIRST_INVERSE (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + + ROUND_INVERSE (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND_INVERSE (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND_INVERSE (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND_INVERSE (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND_INVERSE (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND_INVERSE (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND_INVERSE (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND_INVERSE (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND_INVERSE (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND_INVERSE (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND_INVERSE (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND_INVERSE (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND_INVERSE (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND_INVERSE (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND_INVERSE (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND_INVERSE (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND_INVERSE (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND_INVERSE (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND_INVERSE (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND_INVERSE (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND_INVERSE (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND_INVERSE (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND_INVERSE (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND_INVERSE (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND_INVERSE (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND_INVERSE (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND_INVERSE (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND_INVERSE (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND_INVERSE (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND_INVERSE (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND_INVERSE (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + + 
transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0); + transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP1); + + /* record output vector names for __serpent_dec_blk8 */ + .set dec_out_a0, RA0 + .set dec_out_a1, RA1 + .set dec_out_a2, RA2 + .set dec_out_a3, RA3 + .set dec_out_b0, RB0 + .set dec_out_b1, RB1 + .set dec_out_b2, RB2 + .set dec_out_b3, RB3 + + ret; +.size __serpent_dec_blk8,.-__serpent_dec_blk8; + +#define inc_le128(x, minus_one, tmp) \ + vpcmpeqq minus_one, x, tmp; \ + vpsubq minus_one, x, x; \ + vpslldq $8, tmp, tmp; \ + vpsubq tmp, x, x; + +.align 8 +.global _gcry_serpent_avx_ctr_enc +.type _gcry_serpent_avx_ctr_enc, at function; +_gcry_serpent_avx_ctr_enc: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (8 blocks) + * %rdx: src (8 blocks) + * %rcx: iv (big endian, 128bit) + */ + + .set RA0, enc_in_a0 + .set RA1, enc_in_a1 + .set RA2, enc_in_a2 + .set RA3, enc_in_a3 + .set RB0, enc_in_b0 + .set RB1, enc_in_b1 + .set RB2, enc_in_b2 + .set RB3, enc_in_b3 + + vzeroupper; + + vmovdqa .Lbswap128_mask RIP, RTMP1; + + /* load IV and byteswap */ + vmovdqu (%rcx), RA0; + vpshufb RTMP1, RA0, RTMP0; /* be => le */ + + vpcmpeqd RNOT, RNOT, RNOT; + vpsrldq $8, RNOT, RNOT; /* low: -1, high: 0 */ + + /* construct IVs */ + inc_le128(RTMP0, RNOT, RB3); + vpshufb RTMP1, RTMP0, RA1; + inc_le128(RTMP0, RNOT, RB3); + vpshufb RTMP1, RTMP0, RA2; + inc_le128(RTMP0, RNOT, RB3); + vpshufb RTMP1, RTMP0, RA3; + inc_le128(RTMP0, RNOT, RB3); + vpshufb RTMP1, RTMP0, RB0; + inc_le128(RTMP0, RNOT, RB3); + vpshufb RTMP1, RTMP0, RB1; + inc_le128(RTMP0, RNOT, RB3); + vpshufb RTMP1, RTMP0, RB2; + inc_le128(RTMP0, RNOT, RB3); + vpshufb RTMP1, RTMP0, RB3; + inc_le128(RTMP0, RNOT, RTMP2); + vpshufb RTMP1, RTMP0, RTMP0; + vmovdqu RTMP0, (%rcx); /* store new IV */ + + call __serpent_enc_blk8; + + .set RA0, enc_out_a0 + .set RA1, enc_out_a1 + .set RA2, enc_out_a2 + .set RA3, enc_out_a3 + .set RB0, enc_out_b0 + .set RB1, enc_out_b1 + .set RB2, enc_out_b2 + .set RB3, enc_out_b3 + + vmovdqu (0 * 16)(%rdx), RNOT; + vpxor RA0, RNOT, RA0; + vmovdqu (1 * 16)(%rdx), RNOT; + vpxor RA1, RNOT, RA1; + vmovdqu (2 * 16)(%rdx), RNOT; + vpxor RA2, RNOT, RA2; + vmovdqu (3 * 16)(%rdx), RNOT; + vpxor RA3, RNOT, RA3; + vmovdqu (4 * 16)(%rdx), RNOT; + vpxor RB0, RNOT, RB0; + vmovdqu (5 * 16)(%rdx), RNOT; + vpxor RB1, RNOT, RB1; + vmovdqu (6 * 16)(%rdx), RNOT; + vpxor RB2, RNOT, RB2; + vmovdqu (7 * 16)(%rdx), RNOT; + vpxor RB3, RNOT, RB3; + + vmovdqu RA0, (0 * 16)(%rsi); + vmovdqu RA1, (1 * 16)(%rsi); + vmovdqu RA2, (2 * 16)(%rsi); + vmovdqu RA3, (3 * 16)(%rsi); + vmovdqu RB0, (4 * 16)(%rsi); + vmovdqu RB1, (5 * 16)(%rsi); + vmovdqu RB2, (6 * 16)(%rsi); + vmovdqu RB3, (7 * 16)(%rsi); + + ret +.size _gcry_serpent_avx_ctr_enc,.-_gcry_serpent_avx_ctr_enc; + +.align 8 +.global _gcry_serpent_avx_cbc_dec +.type _gcry_serpent_avx_cbc_dec, at function; +_gcry_serpent_avx_cbc_dec: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (8 blocks) + * %rdx: src (8 blocks) + * %rcx: iv + */ + + .set RA0, dec_in_a0 + .set RA1, dec_in_a1 + .set RA2, dec_in_a2 + .set RA3, dec_in_a3 + .set RB0, dec_in_b0 + .set RB1, dec_in_b1 + .set RB2, dec_in_b2 + .set RB3, dec_in_b3 + + vzeroupper; + + vmovdqu (0 * 16)(%rdx), RA0; + vmovdqu (1 * 16)(%rdx), RA1; + vmovdqu (2 * 16)(%rdx), RA2; + vmovdqu (3 * 16)(%rdx), RA3; + vmovdqu (4 * 16)(%rdx), RB0; + vmovdqu (5 * 16)(%rdx), RB1; + vmovdqu (6 * 16)(%rdx), RB2; + vmovdqu (7 * 16)(%rdx), RB3; + + call __serpent_dec_blk8; + + .set RA0, dec_out_a0 + .set RA1, dec_out_a1 + .set RA2, dec_out_a2 + .set RA3, dec_out_a3 + .set RB0, dec_out_b0 + .set 
RB1, dec_out_b1 + .set RB2, dec_out_b2 + .set RB3, dec_out_b3 + + vmovdqu (7 * 16)(%rdx), RNOT; + vpxor (%rcx), RA0, RA0; + vpxor (0 * 16)(%rdx), RA1, RA1; + vpxor (1 * 16)(%rdx), RA2, RA2; + vpxor (2 * 16)(%rdx), RA3, RA3; + vpxor (3 * 16)(%rdx), RB0, RB0; + vpxor (4 * 16)(%rdx), RB1, RB1; + vpxor (5 * 16)(%rdx), RB2, RB2; + vpxor (6 * 16)(%rdx), RB3, RB3; + vmovdqu RNOT, (%rcx); /* store new IV */ + + vmovdqu RA0, (0 * 16)(%rsi); + vmovdqu RA1, (1 * 16)(%rsi); + vmovdqu RA2, (2 * 16)(%rsi); + vmovdqu RA3, (3 * 16)(%rsi); + vmovdqu RB0, (4 * 16)(%rsi); + vmovdqu RB1, (5 * 16)(%rsi); + vmovdqu RB2, (6 * 16)(%rsi); + vmovdqu RB3, (7 * 16)(%rsi); + + ret +.size _gcry_serpent_avx_cbc_dec,.-_gcry_serpent_avx_cbc_dec; + +#endif /*defined(ENABLE_AVX_SUPPORT) && defined(USE_SERPENT)*/ +#endif /*__x86_64*/ diff --git a/configure.ac b/configure.ac index 079951d..5ad2ebb 100644 --- a/configure.ac +++ b/configure.ac @@ -1219,6 +1219,11 @@ LIST_MEMBER(serpent, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS serpent.lo" AC_DEFINE(USE_SERPENT, 1, [Defined if this module should be included]) + + if test x"$avxsupport" = xyes ; then + # Build with the AVX implementation + GCRYPT_CIPHERS="$GCRYPT_CIPHERS serpent_avx_x86-64.lo" + fi fi LIST_MEMBER(rfc2268, $enabled_ciphers) diff --git a/src/cipher.h b/src/cipher.h index 17fec6c..4e68487 100644 --- a/src/cipher.h +++ b/src/cipher.h @@ -103,6 +103,13 @@ void _gcry_camellia_cbc_dec (void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, unsigned int nblocks); +/*-- serpent.c --*/ +void _gcry_serpent_ctr_enc (void *context, unsigned char *ctr, + void *outbuf_arg, const void *inbuf_arg, + unsigned int nblocks); +void _gcry_serpent_cbc_dec (void *context, unsigned char *iv, + void *outbuf_arg, const void *inbuf_arg, + unsigned int nblocks); /*-- dsa.c --*/ void _gcry_register_pk_dsa_progress (gcry_handler_progress_t cbc, void *cb_data); From wk at gnupg.org Tue May 7 11:35:10 2013 From: wk at gnupg.org (Werner Koch) Date: Tue, 07 May 2013 11:35:10 +0200 Subject: New email, new DCO? In-Reply-To: <5186720A.4090101@iki.fi> (Jussi Kivilinna's message of "Sun, 05 May 2013 17:51:54 +0300") References: <5186720A.4090101@iki.fi> Message-ID: <87haifm8tt.fsf@vigenere.g10code.de> On Sun, 5 May 2013 16:51, jussi.kivilinna at iki.fi said: > I've changed my email address and started wondering about changing email address in AUTHORS. Do I need to send new signed DCO? It is easier to do it that way. Thanks. I take care of your patches in the next days. Shalom-Salam, Werner -- Die Gedanken sind frei. Ausnahmen regelt ein Bundesgesetz. From smueller at chronox.de Mon May 13 18:58:55 2013 From: smueller at chronox.de (Stephan Mueller) Date: Mon, 13 May 2013 18:58:55 +0200 Subject: [PATCH][RFC] CPU Jitter random number generator Message-ID: <20130513185855.1105166f@tauon> Hi, [1] patch at http://www.chronox.de/jent/jitterentropy-20130508.tar.bz2 An implementation of a CPU Jitter random number generator is released at http://www.chronox.de/ . The heart of the RNG is about 30 lines of easy to read code. The readme in the main directory explains the different code files. 
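To illustrate the principle only: the following is a simplified sketch of what such a jitter collector does, not the actual jitterentropy code -- the clock source, the fold/rotate constants and the iteration count below are arbitrary choices made just for this example. The collector repeatedly reads a high-resolution clock and folds the variation of the measured execution-time deltas into a small pool:

  /* Simplified illustration of jitter-based entropy collection;
   * NOT the actual jitterentropy implementation.  A real collector
   * adds further noise sources, health tests and a proper output
   * (conditioning) function. */
  #include <stdint.h>
  #include <stdio.h>
  #include <time.h>

  static uint64_t now_ns (void)
  {
    struct timespec ts;
    clock_gettime (CLOCK_MONOTONIC, &ts);
    return (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
  }

  int main (void)
  {
    uint64_t pool = 0;
    uint64_t prev = now_ns ();
    int i;

    for (i = 0; i < 1024; i++)
      {
        uint64_t t = now_ns ();
        uint64_t delta = t - prev;             /* execution-time jitter */
        prev = t;
        pool ^= delta;                         /* fold delta into the pool */
        pool = (pool << 19) | (pool >> 45);    /* rotate to spread the bits */
      }

    printf ("pool: %016llx\n", (unsigned long long) pool);
    return 0;
  }

The real implementation refines this core considerably; the documentation referenced below analyzes where the jitter actually comes from and how much entropy it carries.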
The documentation of the CPU Jitter random number generator (http://www.chronox.de/jent/doc/index.html and PDF at http://www.chronox.de/jent/doc/CPU-Jitter-NPTRNG.pdf -- the graphs and pictures are better in PDF) offers a full analysis of: - the root cause of entropy - a design of the RNG - statistical tests and analyses - entropy assessment and explanation of the flow of entropy The document also explains the core concept to have a fully decentralized entropy collector for every caller in need of entropy. The appendix of the documentation contains example use cases by providing link code to the Linux kernel crypto API, libgcrypt and OpenSSL. These implementations follow the concept of decentralized entropy collection. The man page provided with the source code explains the use of the API of the CPU Jitter random number generator. The test cases used to compile the documentation are available at the web site as well. Ciao Stephan Signed-off-by: Stephan Mueller From jussi.kivilinna at iki.fi Wed May 15 08:08:30 2013 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Wed, 15 May 2013 09:08:30 +0300 Subject: [PATCH][RFC] CPU Jitter random number generator In-Reply-To: <20130513185855.1105166f@tauon> References: <20130513185855.1105166f@tauon> Message-ID: <5193265E.7040308@iki.fi> On 13.05.2013 19:58, Stephan Mueller wrote: > Hi, > > [1] patch at http://www.chronox.de/jent/jitterentropy-20130508.tar.bz2 > > An implementation of a CPU Jitter random number generator is released > at http://www.chronox.de/ . The heart of the RNG is about 30 lines of > easy to read code. The readme in the main directory explains the > different code files. > > The documentation of the CPU Jitter random number generator > (http://www.chronox.de/jent/doc/index.html and PDF at > http://www.chronox.de/jent/doc/CPU-Jitter-NPTRNG.pdf -- the graphs and > pictures are better in PDF) offers a full analysis of: > > - the root cause of entropy > > - a design of the RNG > > - statistical tests and analyses > > - entropy assessment and explanation of the flow of entropy Just want to say that this reminds me of 'haveged' entropy daemon, that uses 'internal volatile hardware states as source of uncertainty'.. http://www.issihosts.com/haveged/ -Jussi > > The document also explains the core concept to have a fully > decentralized entropy collector for every caller in need of entropy. > > The appendix of the documentation contains example use cases by > providing link code to the Linux kernel crypto API, libgcrypt and > OpenSSL. These implementations follow the concept of > decentralized entropy collection. > > The man page provided with the source code explains the use of the API > of the CPU Jitter random number generator. > > The test cases used to compile the documentation are available at the > web site as well. > > Ciao > Stephan > > Signed-off-by: Stephan Mueller > > _______________________________________________ > Gcrypt-devel mailing list > Gcrypt-devel at gnupg.org > http://lists.gnupg.org/mailman/listinfo/gcrypt-devel > -------------- next part -------------- A non-text attachment was scrubbed... 
Name: signature.asc Type: application/pgp-signature Size: 730 bytes Desc: OpenPGP digital signature URL: From smueller at chronox.de Wed May 15 10:05:24 2013 From: smueller at chronox.de (Stephan Mueller) Date: Wed, 15 May 2013 10:05:24 +0200 Subject: [PATCH][RFC] CPU Jitter random number generator In-Reply-To: <5193265E.7040308@iki.fi> References: <20130513185855.1105166f@tauon> <5193265E.7040308@iki.fi> Message-ID: <20130515100524.0b595e1d@tauon> On Wed, 15 May 2013 09:08:30 +0300 Jussi Kivilinna wrote: Hi Jussi, > On 13.05.2013 19:58, Stephan Mueller wrote: > > Hi, > > > > [1] patch at > > http://www.chronox.de/jent/jitterentropy-20130508.tar.bz2 > > > > An implementation of a CPU Jitter random number generator is > > released at http://www.chronox.de/ . The heart of the RNG is about > > 30 lines of easy to read code. The readme in the main directory > > explains the different code files. > > > > The documentation of the CPU Jitter random number generator > > (http://www.chronox.de/jent/doc/index.html and PDF at > > http://www.chronox.de/jent/doc/CPU-Jitter-NPTRNG.pdf -- the graphs > > and pictures are better in PDF) offers a full analysis of: > > > > - the root cause of entropy > > > > - a design of the RNG > > > > - statistical tests and analyses > > > > - entropy assessment and explanation of the flow of entropy > > Just want to say that this reminds me of 'haveged' entropy daemon, > that uses 'internal volatile hardware states as source of > uncertainty'.. http://www.issihosts.com/haveged/ Have you looked into the code of haveged? It is a very large body of code which is also very complicated. Even with the design I did not really understand the code. In particular, look at oneiteration.h. The approach I have tries to cover the heart in about 30 lines of code which is very simple. Moreover, haveged is intended to seed /dev/random. So you have again a central source of entropy. Contrary, the suggested method shall allow for multiple, independent, decentralized entropy gatherers. In essence, every requestor in need of entropy can instantiate its own copy of the entropy collector. Thanks Stephan > > -Jussi > > > > > The document also explains the core concept to have a fully > > decentralized entropy collector for every caller in need of entropy. > > > > The appendix of the documentation contains example use cases by > > providing link code to the Linux kernel crypto API, libgcrypt and > > OpenSSL. These implementations follow the concept of > > decentralized entropy collection. > > > > The man page provided with the source code explains the use of the > > API of the CPU Jitter random number generator. > > > > The test cases used to compile the documentation are available at > > the web site as well. > > > > Ciao > > Stephan > > > > Signed-off-by: Stephan Mueller > > > > _______________________________________________ > > Gcrypt-devel mailing list > > Gcrypt-devel at gnupg.org > > http://lists.gnupg.org/mailman/listinfo/gcrypt-devel > > > > -- | Cui bono? 
| From wk at gnupg.org Thu May 16 11:12:38 2013 From: wk at gnupg.org (Werner Koch) Date: Thu, 16 May 2013 11:12:38 +0200 Subject: yet another tiny feature: deterministic ECDSA In-Reply-To: <51680B1D.1090901@grothoff.org> (Christian Grothoff's message of "Fri, 12 Apr 2013 15:24:45 +0200") References: <516710E9.6090800@grothoff.org> <516714EB.1050408@gmail.com> <51671615.7020308@grothoff.org> <51673DC9.7000703@gmail.com> <5167B9DC.5000302@grothoff.org> <87obdk0xqu.fsf@vigenere.g10code.de> <51680B1D.1090901@grothoff.org> Message-ID: <87y5bfcmpl.fsf@vigenere.g10code.de> On Fri, 12 Apr 2013 15:24, christian at grothoff.org said: > On 04/12/2013 03:16 PM, Tom Ritter wrote: >> >> There is a method to do deterministic DSA safely (as far as anyone >> knows), that's been looked at some: >> http://tools.ietf.org/html/draft-pornin-deterministic-dsa-01 I read the I-D and discussion at cfrg again and agree that it makes sense to have support in Libgcrypt for this scheme. There is no RFC yet, but even though -01 expired it seems to be moving on. > Using this method would be fine by me as well; I can supply 'h1' (the > H(m)) instead of > the exact 'k' value. What I care about is having an option to achieve > determinism. Also, Good that we can avoid a special GNUnet case here. > as in our case 'm' itself is encrypted before being signed, I'd like > to do the hashing myself > as using h1 = H(E(m)) will give the adversary (who doesn't know 'm') Sure, that is how we do it in Libgcrypt anyway. I'll ask Thomas Pornin whether it is okay to implement this draft or whether he intends any update. Shalom-Salam, Werner -- Die Gedanken sind frei. Ausnahmen regelt ein Bundesgesetz. From wk at gnupg.org Tue May 21 14:59:24 2013 From: wk at gnupg.org (Werner Koch) Date: Tue, 21 May 2013 14:59:24 +0200 Subject: [PATCH 2/2] serpent: add AVX accelerated x86-64 implementation In-Reply-To: <20130505155538.16505.18822.stgit@localhost6.localdomain6> (Jussi Kivilinna's message of "Sun, 05 May 2013 18:55:38 +0300") References: <20130505155532.16505.3475.stgit@localhost6.localdomain6> <20130505155538.16505.18822.stgit@localhost6.localdomain6> Message-ID: <87li78a3pv.fsf@vigenere.g10code.de> Hi, some comments on this patch: > 'serpent_avx_x86-64.lo'. Please use serpent-avx-amd64.* for the file names. We generally use a dash as word delimiter in file names. Also we started to use amd64 for this architecture (mpi/amd64) and to be consistent we should keep on using "amd64". You may keep the x86_64 in comments, though. I only now noticed that I didn't catch your camellia_aesni_avx_x86-64 from January. I will eventually fix that file name. > +#include "selftest_help.h" See my other mail. I would have reworked your patch but there is a problem with the selftest patches we need to address first. Salam-Shalom, Werner -- Die Gedanken sind frei. Ausnahmen regelt ein Bundesgesetz. From wk at gnupg.org Tue May 21 15:13:03 2013 From: wk at gnupg.org (Werner Koch) Date: Tue, 21 May 2013 15:13:03 +0200 Subject: [PATCH 1/4] camellia: add bulk CBC decryption selftest In-Reply-To: <20130505143651.29094.85500.stgit@localhost6.localdomain6> (Jussi Kivilinna's message of "Sun, 05 May 2013 17:36:51 +0300") References: <20130505143651.29094.85500.stgit@localhost6.localdomain6> Message-ID: <87hahwa334.fsf@vigenere.g10code.de> On Sun, 5 May 2013 16:36, jussi.kivilinna at iki.fi said: > * cipher/camellia-glue.c: (selftest_cbc_128): New selftest function for > bulk CBC decryption. 
In general I am fine with factoring the selftest code out to a separate module. However there are two problems: The minor one is that selftest_help.c has an underscore in the name, which should be replaced by a dash. For easier grepping of the source tree I also suggest using a different name; for example "cipher-selftest.c" The other problem is that code like static char error_str[128]; [...] snprintf (error_str, sizeof(error_str), "%s-128-CBC test failed (plaintext mismatch)", cipher); [...] return error_str; is not thread-safe and should not be done this way. To avoid the overhead of adding static memory management code, what about this: #ifdef HAVE_SYSLOG syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: " "%s-128-CBC test failed (plaintext mismatch)", cipher) #endif /*HAVE_SYSLOG*/ return "selftest for 128 bit CBC failed - see syslog for details"; Shalom-Salam, Werner -- Die Gedanken sind frei. Ausnahmen regelt ein Bundesgesetz. From jussi.kivilinna at iki.fi Tue May 21 15:40:24 2013 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Tue, 21 May 2013 16:40:24 +0300 Subject: [PATCH 2/2] serpent: add AVX accelerated x86-64 implementation In-Reply-To: <87li78a3pv.fsf@vigenere.g10code.de> References: <20130505155532.16505.3475.stgit@localhost6.localdomain6> <20130505155538.16505.18822.stgit@localhost6.localdomain6> <87li78a3pv.fsf@vigenere.g10code.de> Message-ID: <519B7948.4040607@iki.fi> On 21.05.2013 15:59, Werner Koch wrote: > Hi, > > some comments on this patch: > >> 'serpent_avx_x86-64.lo'. > > Please use serpent-avx-amd64.* for the file names. We generally use a > dash as word delimiter in file names. Also we started to use amd64 for > this architecture (mpi/amd64) and to be consistent we should keep on > using "amd64". You may keep the x86_64 in comments, though. Ok, I'll use amd64 from now on. > > I only now noticed that I didn't catch your camellia_aesni_avx_x86-64 > from January. I will eventually fix that file name. > I can send a patch to do that. > >> +#include "selftest_help.h" > > See my other mail. > > I would have reworked your patch but there is a problem with the > selftest patches we need to address first. > Ok. -Jussi > > Salam-Shalom, > > Werner > From jussi.kivilinna at iki.fi Tue May 21 15:51:37 2013 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Tue, 21 May 2013 16:51:37 +0300 Subject: [PATCH 1/4] camellia: add bulk CBC decryption selftest In-Reply-To: <87hahwa334.fsf@vigenere.g10code.de> References: <20130505143651.29094.85500.stgit@localhost6.localdomain6> <87hahwa334.fsf@vigenere.g10code.de> Message-ID: <519B7BE9.5040802@iki.fi> On 21.05.2013 16:13, Werner Koch wrote: > On Sun, 5 May 2013 16:36, jussi.kivilinna at iki.fi said: >> * cipher/camellia-glue.c: (selftest_cbc_128): New selftest function for >> bulk CBC decryption. > > In general I am fine with factoring the selftest code out to a separate > module. However there are two problems: > > The minor one is that selftest_help.c has an underscore in the name, > which should be replaced by a dash. For easier grepping of the source tree > I also suggest using a different name; for example "cipher-selftest.c" Ok, I'll make the changes. > > The other problem is that code like > > static char error_str[128]; > [...] > snprintf (error_str, sizeof(error_str), > "%s-128-CBC test failed (plaintext mismatch)", cipher); > [...] > return error_str; > Agreed, that's not good. > is not thread-safe and should not be done this way. 
To avoid the > overhead of adding static memory management code, what about this: > > #ifdef HAVE_SYSLOG > syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: " > "%s-128-CBC test failed (plaintext mismatch)", cipher) > #endif /*HAVE_SYSLOG*/ > return "selftest for 128 bit CBC failed - see syslog for details"; I like the idea.. avoids introducing additional overhead for the error-paths. -Jussi > > > Shalom-Salam, > > Werner > > From jussi.kivilinna at iki.fi Wed May 22 11:06:03 2013 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Wed, 22 May 2013 12:06:03 +0300 Subject: [PATCH] camellia: Rename camellia_aesni_avx_x86-64.S to camellia-aesni-avx-amd64.S Message-ID: <20130522090603.9672.49342.stgit@localhost6.localdomain6> * cipher/camellia_aesni_avx_x86-64.S: Remove. * cipher/camellia-aesni-avx-amd64.S: New. * cipher/Makefile.am: Use the new filename. * configure.ac: Use the new filename. -- Signed-off-by: Jussi Kivilinna --- cipher/Makefile.am | 2 cipher/camellia-aesni-avx-amd64.S | 1120 ++++++++++++++++++++++++++++++++++++ cipher/camellia_aesni_avx_x86-64.S | 1120 ------------------------------------ configure.ac | 2 4 files changed, 1122 insertions(+), 1122 deletions(-) create mode 100644 cipher/camellia-aesni-avx-amd64.S delete mode 100644 cipher/camellia_aesni_avx_x86-64.S diff --git a/cipher/Makefile.am b/cipher/Makefile.am index c39f627..00e4429 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -75,7 +75,7 @@ tiger.c \ whirlpool.c \ twofish.c \ rfc2268.c \ -camellia.c camellia.h camellia-glue.c camellia_aesni_avx_x86-64.S +camellia.c camellia.h camellia-glue.c camellia-aesni-avx-amd64.S if ENABLE_O_FLAG_MUNGING o_flag_munging = sed -e 's/-O\([2-9s][2-9s]*\)/-O1/' -e 's/-Ofast/-O1/g' diff --git a/cipher/camellia-aesni-avx-amd64.S b/cipher/camellia-aesni-avx-amd64.S new file mode 100644 index 0000000..2b1df17 --- /dev/null +++ b/cipher/camellia-aesni-avx-amd64.S @@ -0,0 +1,1120 @@ +/* camellia-avx-aesni-amd64.S - AES-NI/AVX implementation of Camellia cipher + * + * Copyright ? 2013 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . 
+ */ + +#ifdef __x86_64 +#include +#if defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX_SUPPORT) + +#ifdef __PIC__ +# define RIP (%rip) +#else +# define RIP +#endif + +#define CAMELLIA_TABLE_BYTE_LEN 272 + +/* struct CAMELLIA_context: */ +#define key_bitlength 0 +#define key_table 4 + +/* register macros */ +#define CTX %rdi +#define RIO %r8 + +/********************************************************************** + helper macros + **********************************************************************/ +#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \ + vpand x, mask4bit, tmp0; \ + vpandn x, mask4bit, x; \ + vpsrld $4, x, x; \ + \ + vpshufb tmp0, lo_t, tmp0; \ + vpshufb x, hi_t, x; \ + vpxor tmp0, x, x; + +/********************************************************************** + 16-way camellia + **********************************************************************/ + +/* + * IN: + * x0..x7: byte-sliced AB state + * mem_cd: register pointer storing CD state + * key: index for key material + * OUT: + * x0..x7: new byte-sliced CD state + */ +#define roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \ + t7, mem_cd, key) \ + /* \ + * S-function with AES subbytes \ + */ \ + vmovdqa .Linv_shift_row RIP, t4; \ + vbroadcastss .L0f0f0f0f RIP, t7; \ + vmovdqa .Lpre_tf_lo_s1 RIP, t0; \ + vmovdqa .Lpre_tf_hi_s1 RIP, t1; \ + \ + /* AES inverse shift rows */ \ + vpshufb t4, x0, x0; \ + vpshufb t4, x7, x7; \ + vpshufb t4, x1, x1; \ + vpshufb t4, x4, x4; \ + vpshufb t4, x2, x2; \ + vpshufb t4, x5, x5; \ + vpshufb t4, x3, x3; \ + vpshufb t4, x6, x6; \ + \ + /* prefilter sboxes 1, 2 and 3 */ \ + vmovdqa .Lpre_tf_lo_s4 RIP, t2; \ + vmovdqa .Lpre_tf_hi_s4 RIP, t3; \ + filter_8bit(x0, t0, t1, t7, t6); \ + filter_8bit(x7, t0, t1, t7, t6); \ + filter_8bit(x1, t0, t1, t7, t6); \ + filter_8bit(x4, t0, t1, t7, t6); \ + filter_8bit(x2, t0, t1, t7, t6); \ + filter_8bit(x5, t0, t1, t7, t6); \ + \ + /* prefilter sbox 4 */ \ + vpxor t4, t4, t4; \ + filter_8bit(x3, t2, t3, t7, t6); \ + filter_8bit(x6, t2, t3, t7, t6); \ + \ + /* AES subbytes + AES shift rows */ \ + vmovdqa .Lpost_tf_lo_s1 RIP, t0; \ + vmovdqa .Lpost_tf_hi_s1 RIP, t1; \ + vaesenclast t4, x0, x0; \ + vaesenclast t4, x7, x7; \ + vaesenclast t4, x1, x1; \ + vaesenclast t4, x4, x4; \ + vaesenclast t4, x2, x2; \ + vaesenclast t4, x5, x5; \ + vaesenclast t4, x3, x3; \ + vaesenclast t4, x6, x6; \ + \ + /* postfilter sboxes 1 and 4 */ \ + vmovdqa .Lpost_tf_lo_s3 RIP, t2; \ + vmovdqa .Lpost_tf_hi_s3 RIP, t3; \ + filter_8bit(x0, t0, t1, t7, t6); \ + filter_8bit(x7, t0, t1, t7, t6); \ + filter_8bit(x3, t0, t1, t7, t6); \ + filter_8bit(x6, t0, t1, t7, t6); \ + \ + /* postfilter sbox 3 */ \ + vmovdqa .Lpost_tf_lo_s2 RIP, t4; \ + vmovdqa .Lpost_tf_hi_s2 RIP, t5; \ + filter_8bit(x2, t2, t3, t7, t6); \ + filter_8bit(x5, t2, t3, t7, t6); \ + \ + vpxor t6, t6, t6; \ + vmovq key, t0; \ + \ + /* postfilter sbox 2 */ \ + filter_8bit(x1, t4, t5, t7, t2); \ + filter_8bit(x4, t4, t5, t7, t2); \ + \ + vpsrldq $5, t0, t5; \ + vpsrldq $1, t0, t1; \ + vpsrldq $2, t0, t2; \ + vpsrldq $3, t0, t3; \ + vpsrldq $4, t0, t4; \ + vpshufb t6, t0, t0; \ + vpshufb t6, t1, t1; \ + vpshufb t6, t2, t2; \ + vpshufb t6, t3, t3; \ + vpshufb t6, t4, t4; \ + vpsrldq $2, t5, t7; \ + vpshufb t6, t7, t7; \ + \ + /* P-function */ \ + vpxor x5, x0, x0; \ + vpxor x6, x1, x1; \ + vpxor x7, x2, x2; \ + vpxor x4, x3, x3; \ + \ + vpxor x2, x4, x4; \ + vpxor x3, x5, x5; \ + vpxor x0, x6, x6; \ + vpxor x1, x7, x7; \ + \ + vpxor x7, x0, x0; \ + vpxor x4, x1, x1; \ + vpxor x5, x2, x2; \ 
+ vpxor x6, x3, x3; \ + \ + vpxor x3, x4, x4; \ + vpxor x0, x5, x5; \ + vpxor x1, x6, x6; \ + vpxor x2, x7, x7; /* note: high and low parts swapped */ \ + \ + /* Add key material and result to CD (x becomes new CD) */ \ + \ + vpxor t3, x4, x4; \ + vpxor 0 * 16(mem_cd), x4, x4; \ + \ + vpxor t2, x5, x5; \ + vpxor 1 * 16(mem_cd), x5, x5; \ + \ + vpsrldq $1, t5, t3; \ + vpshufb t6, t5, t5; \ + vpshufb t6, t3, t6; \ + \ + vpxor t1, x6, x6; \ + vpxor 2 * 16(mem_cd), x6, x6; \ + \ + vpxor t0, x7, x7; \ + vpxor 3 * 16(mem_cd), x7, x7; \ + \ + vpxor t7, x0, x0; \ + vpxor 4 * 16(mem_cd), x0, x0; \ + \ + vpxor t6, x1, x1; \ + vpxor 5 * 16(mem_cd), x1, x1; \ + \ + vpxor t5, x2, x2; \ + vpxor 6 * 16(mem_cd), x2, x2; \ + \ + vpxor t4, x3, x3; \ + vpxor 7 * 16(mem_cd), x3, x3; + +/* + * IN/OUT: + * x0..x7: byte-sliced AB state preloaded + * mem_ab: byte-sliced AB state in memory + * mem_cb: byte-sliced CD state in memory + */ +#define two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd, i, dir, store_ab) \ + roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_cd, (key_table + (i) * 8)(CTX)); \ + \ + vmovdqu x4, 0 * 16(mem_cd); \ + vmovdqu x5, 1 * 16(mem_cd); \ + vmovdqu x6, 2 * 16(mem_cd); \ + vmovdqu x7, 3 * 16(mem_cd); \ + vmovdqu x0, 4 * 16(mem_cd); \ + vmovdqu x1, 5 * 16(mem_cd); \ + vmovdqu x2, 6 * 16(mem_cd); \ + vmovdqu x3, 7 * 16(mem_cd); \ + \ + roundsm16(x4, x5, x6, x7, x0, x1, x2, x3, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, (key_table + ((i) + (dir)) * 8)(CTX)); \ + \ + store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab); + +#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */ + +#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \ + /* Store new AB state */ \ + vmovdqu x0, 0 * 16(mem_ab); \ + vmovdqu x1, 1 * 16(mem_ab); \ + vmovdqu x2, 2 * 16(mem_ab); \ + vmovdqu x3, 3 * 16(mem_ab); \ + vmovdqu x4, 4 * 16(mem_ab); \ + vmovdqu x5, 5 * 16(mem_ab); \ + vmovdqu x6, 6 * 16(mem_ab); \ + vmovdqu x7, 7 * 16(mem_ab); + +#define enc_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd, i) \ + two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \ + two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \ + two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store); + +#define dec_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd, i) \ + two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \ + two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \ + two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store); + +/* + * IN: + * v0..3: byte-sliced 32-bit integers + * OUT: + * v0..3: (IN <<< 1) + */ +#define rol32_1_16(v0, v1, v2, v3, t0, t1, t2, zero) \ + vpcmpgtb v0, zero, t0; \ + vpaddb v0, v0, v0; \ + vpabsb t0, t0; \ + \ + vpcmpgtb v1, zero, t1; \ + vpaddb v1, v1, v1; \ + vpabsb t1, t1; \ + \ + vpcmpgtb v2, zero, t2; \ + vpaddb v2, v2, v2; \ + vpabsb t2, t2; \ + \ + vpor t0, v1, v1; \ + \ + vpcmpgtb v3, zero, t0; \ + vpaddb v3, v3, v3; \ + vpabsb t0, t0; \ + \ + vpor t1, v2, v2; \ + vpor t2, v3, 
v3; \ + vpor t0, v0, v0; + +/* + * IN: + * r: byte-sliced AB state in memory + * l: byte-sliced CD state in memory + * OUT: + * x0..x7: new byte-sliced CD state + */ +#define fls16(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \ + tt1, tt2, tt3, kll, klr, krl, krr) \ + /* \ + * t0 = kll; \ + * t0 &= ll; \ + * lr ^= rol32(t0, 1); \ + */ \ + vpxor tt0, tt0, tt0; \ + vmovd kll, t0; \ + vpshufb tt0, t0, t3; \ + vpsrldq $1, t0, t0; \ + vpshufb tt0, t0, t2; \ + vpsrldq $1, t0, t0; \ + vpshufb tt0, t0, t1; \ + vpsrldq $1, t0, t0; \ + vpshufb tt0, t0, t0; \ + \ + vpand l0, t0, t0; \ + vpand l1, t1, t1; \ + vpand l2, t2, t2; \ + vpand l3, t3, t3; \ + \ + rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \ + \ + vpxor l4, t0, l4; \ + vmovdqu l4, 4 * 16(l); \ + vpxor l5, t1, l5; \ + vmovdqu l5, 5 * 16(l); \ + vpxor l6, t2, l6; \ + vmovdqu l6, 6 * 16(l); \ + vpxor l7, t3, l7; \ + vmovdqu l7, 7 * 16(l); \ + \ + /* \ + * t2 = krr; \ + * t2 |= rr; \ + * rl ^= t2; \ + */ \ + \ + vmovd krr, t0; \ + vpshufb tt0, t0, t3; \ + vpsrldq $1, t0, t0; \ + vpshufb tt0, t0, t2; \ + vpsrldq $1, t0, t0; \ + vpshufb tt0, t0, t1; \ + vpsrldq $1, t0, t0; \ + vpshufb tt0, t0, t0; \ + \ + vpor 4 * 16(r), t0, t0; \ + vpor 5 * 16(r), t1, t1; \ + vpor 6 * 16(r), t2, t2; \ + vpor 7 * 16(r), t3, t3; \ + \ + vpxor 0 * 16(r), t0, t0; \ + vpxor 1 * 16(r), t1, t1; \ + vpxor 2 * 16(r), t2, t2; \ + vpxor 3 * 16(r), t3, t3; \ + vmovdqu t0, 0 * 16(r); \ + vmovdqu t1, 1 * 16(r); \ + vmovdqu t2, 2 * 16(r); \ + vmovdqu t3, 3 * 16(r); \ + \ + /* \ + * t2 = krl; \ + * t2 &= rl; \ + * rr ^= rol32(t2, 1); \ + */ \ + vmovd krl, t0; \ + vpshufb tt0, t0, t3; \ + vpsrldq $1, t0, t0; \ + vpshufb tt0, t0, t2; \ + vpsrldq $1, t0, t0; \ + vpshufb tt0, t0, t1; \ + vpsrldq $1, t0, t0; \ + vpshufb tt0, t0, t0; \ + \ + vpand 0 * 16(r), t0, t0; \ + vpand 1 * 16(r), t1, t1; \ + vpand 2 * 16(r), t2, t2; \ + vpand 3 * 16(r), t3, t3; \ + \ + rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \ + \ + vpxor 4 * 16(r), t0, t0; \ + vpxor 5 * 16(r), t1, t1; \ + vpxor 6 * 16(r), t2, t2; \ + vpxor 7 * 16(r), t3, t3; \ + vmovdqu t0, 4 * 16(r); \ + vmovdqu t1, 5 * 16(r); \ + vmovdqu t2, 6 * 16(r); \ + vmovdqu t3, 7 * 16(r); \ + \ + /* \ + * t0 = klr; \ + * t0 |= lr; \ + * ll ^= t0; \ + */ \ + \ + vmovd klr, t0; \ + vpshufb tt0, t0, t3; \ + vpsrldq $1, t0, t0; \ + vpshufb tt0, t0, t2; \ + vpsrldq $1, t0, t0; \ + vpshufb tt0, t0, t1; \ + vpsrldq $1, t0, t0; \ + vpshufb tt0, t0, t0; \ + \ + vpor l4, t0, t0; \ + vpor l5, t1, t1; \ + vpor l6, t2, t2; \ + vpor l7, t3, t3; \ + \ + vpxor l0, t0, l0; \ + vmovdqu l0, 0 * 16(l); \ + vpxor l1, t1, l1; \ + vmovdqu l1, 1 * 16(l); \ + vpxor l2, t2, l2; \ + vmovdqu l2, 2 * 16(l); \ + vpxor l3, t3, l3; \ + vmovdqu l3, 3 * 16(l); + +#define transpose_4x4(x0, x1, x2, x3, t1, t2) \ + vpunpckhdq x1, x0, t2; \ + vpunpckldq x1, x0, x0; \ + \ + vpunpckldq x3, x2, t1; \ + vpunpckhdq x3, x2, x2; \ + \ + vpunpckhqdq t1, x0, x1; \ + vpunpcklqdq t1, x0, x0; \ + \ + vpunpckhqdq x2, t2, x3; \ + vpunpcklqdq x2, t2, x2; + +#define byteslice_16x16b_fast(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, \ + a3, b3, c3, d3, st0, st1) \ + vmovdqu d2, st0; \ + vmovdqu d3, st1; \ + transpose_4x4(a0, a1, a2, a3, d2, d3); \ + transpose_4x4(b0, b1, b2, b3, d2, d3); \ + vmovdqu st0, d2; \ + vmovdqu st1, d3; \ + \ + vmovdqu a0, st0; \ + vmovdqu a1, st1; \ + transpose_4x4(c0, c1, c2, c3, a0, a1); \ + transpose_4x4(d0, d1, d2, d3, a0, a1); \ + \ + vmovdqu .Lshufb_16x16b RIP, a0; \ + vmovdqu st1, a1; \ + vpshufb a0, a2, a2; \ + vpshufb a0, a3, a3; \ + vpshufb 
a0, b0, b0; \ + vpshufb a0, b1, b1; \ + vpshufb a0, b2, b2; \ + vpshufb a0, b3, b3; \ + vpshufb a0, a1, a1; \ + vpshufb a0, c0, c0; \ + vpshufb a0, c1, c1; \ + vpshufb a0, c2, c2; \ + vpshufb a0, c3, c3; \ + vpshufb a0, d0, d0; \ + vpshufb a0, d1, d1; \ + vpshufb a0, d2, d2; \ + vpshufb a0, d3, d3; \ + vmovdqu d3, st1; \ + vmovdqu st0, d3; \ + vpshufb a0, d3, a0; \ + vmovdqu d2, st0; \ + \ + transpose_4x4(a0, b0, c0, d0, d2, d3); \ + transpose_4x4(a1, b1, c1, d1, d2, d3); \ + vmovdqu st0, d2; \ + vmovdqu st1, d3; \ + \ + vmovdqu b0, st0; \ + vmovdqu b1, st1; \ + transpose_4x4(a2, b2, c2, d2, b0, b1); \ + transpose_4x4(a3, b3, c3, d3, b0, b1); \ + vmovdqu st0, b0; \ + vmovdqu st1, b1; \ + /* does not adjust output bytes inside vectors */ + +#define transpose_8x8b(a, b, c, d, e, f, g, h, t0, t1, t2, t3, t4) \ + vpunpcklbw a, b, t0; \ + vpunpckhbw a, b, b; \ + \ + vpunpcklbw c, d, t1; \ + vpunpckhbw c, d, d; \ + \ + vpunpcklbw e, f, t2; \ + vpunpckhbw e, f, f; \ + \ + vpunpcklbw g, h, t3; \ + vpunpckhbw g, h, h; \ + \ + vpunpcklwd t0, t1, g; \ + vpunpckhwd t0, t1, t0; \ + \ + vpunpcklwd b, d, t1; \ + vpunpckhwd b, d, e; \ + \ + vpunpcklwd t2, t3, c; \ + vpunpckhwd t2, t3, t2; \ + \ + vpunpcklwd f, h, t3; \ + vpunpckhwd f, h, b; \ + \ + vpunpcklwd e, b, t4; \ + vpunpckhwd e, b, b; \ + \ + vpunpcklwd t1, t3, e; \ + vpunpckhwd t1, t3, f; \ + \ + vmovdqa .Ltranspose_8x8_shuf RIP, t3; \ + \ + vpunpcklwd g, c, d; \ + vpunpckhwd g, c, c; \ + \ + vpunpcklwd t0, t2, t1; \ + vpunpckhwd t0, t2, h; \ + \ + vpunpckhqdq b, h, a; \ + vpshufb t3, a, a; \ + vpunpcklqdq b, h, b; \ + vpshufb t3, b, b; \ + \ + vpunpckhqdq e, d, g; \ + vpshufb t3, g, g; \ + vpunpcklqdq e, d, h; \ + vpshufb t3, h, h; \ + \ + vpunpckhqdq f, c, e; \ + vpshufb t3, e, e; \ + vpunpcklqdq f, c, f; \ + vpshufb t3, f, f; \ + \ + vpunpckhqdq t4, t1, c; \ + vpshufb t3, c, c; \ + vpunpcklqdq t4, t1, d; \ + vpshufb t3, d, d; + +/* load blocks to registers and apply pre-whitening */ +#define inpack16_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, rio, key) \ + vmovq key, x0; \ + vpshufb .Lpack_bswap RIP, x0, x0; \ + \ + vpxor 0 * 16(rio), x0, y7; \ + vpxor 1 * 16(rio), x0, y6; \ + vpxor 2 * 16(rio), x0, y5; \ + vpxor 3 * 16(rio), x0, y4; \ + vpxor 4 * 16(rio), x0, y3; \ + vpxor 5 * 16(rio), x0, y2; \ + vpxor 6 * 16(rio), x0, y1; \ + vpxor 7 * 16(rio), x0, y0; \ + vpxor 8 * 16(rio), x0, x7; \ + vpxor 9 * 16(rio), x0, x6; \ + vpxor 10 * 16(rio), x0, x5; \ + vpxor 11 * 16(rio), x0, x4; \ + vpxor 12 * 16(rio), x0, x3; \ + vpxor 13 * 16(rio), x0, x2; \ + vpxor 14 * 16(rio), x0, x1; \ + vpxor 15 * 16(rio), x0, x0; + +/* byteslice pre-whitened blocks and store to temporary memory */ +#define inpack16_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, mem_ab, mem_cd) \ + byteslice_16x16b_fast(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, \ + y4, y5, y6, y7, (mem_ab), (mem_cd)); \ + \ + vmovdqu x0, 0 * 16(mem_ab); \ + vmovdqu x1, 1 * 16(mem_ab); \ + vmovdqu x2, 2 * 16(mem_ab); \ + vmovdqu x3, 3 * 16(mem_ab); \ + vmovdqu x4, 4 * 16(mem_ab); \ + vmovdqu x5, 5 * 16(mem_ab); \ + vmovdqu x6, 6 * 16(mem_ab); \ + vmovdqu x7, 7 * 16(mem_ab); \ + vmovdqu y0, 0 * 16(mem_cd); \ + vmovdqu y1, 1 * 16(mem_cd); \ + vmovdqu y2, 2 * 16(mem_cd); \ + vmovdqu y3, 3 * 16(mem_cd); \ + vmovdqu y4, 4 * 16(mem_cd); \ + vmovdqu y5, 5 * 16(mem_cd); \ + vmovdqu y6, 6 * 16(mem_cd); \ + vmovdqu y7, 7 * 16(mem_cd); + +/* de-byteslice, apply post-whitening and store blocks */ +#define outunpack16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, 
y2, y3, y4, \ + y5, y6, y7, key, stack_tmp0, stack_tmp1) \ + byteslice_16x16b_fast(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, \ + y3, y7, x3, x7, stack_tmp0, stack_tmp1); \ + \ + vmovdqu x0, stack_tmp0; \ + \ + vmovq key, x0; \ + vpshufb .Lpack_bswap RIP, x0, x0; \ + \ + vpxor x0, y7, y7; \ + vpxor x0, y6, y6; \ + vpxor x0, y5, y5; \ + vpxor x0, y4, y4; \ + vpxor x0, y3, y3; \ + vpxor x0, y2, y2; \ + vpxor x0, y1, y1; \ + vpxor x0, y0, y0; \ + vpxor x0, x7, x7; \ + vpxor x0, x6, x6; \ + vpxor x0, x5, x5; \ + vpxor x0, x4, x4; \ + vpxor x0, x3, x3; \ + vpxor x0, x2, x2; \ + vpxor x0, x1, x1; \ + vpxor stack_tmp0, x0, x0; + +#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ + y6, y7, rio) \ + vmovdqu x0, 0 * 16(rio); \ + vmovdqu x1, 1 * 16(rio); \ + vmovdqu x2, 2 * 16(rio); \ + vmovdqu x3, 3 * 16(rio); \ + vmovdqu x4, 4 * 16(rio); \ + vmovdqu x5, 5 * 16(rio); \ + vmovdqu x6, 6 * 16(rio); \ + vmovdqu x7, 7 * 16(rio); \ + vmovdqu y0, 8 * 16(rio); \ + vmovdqu y1, 9 * 16(rio); \ + vmovdqu y2, 10 * 16(rio); \ + vmovdqu y3, 11 * 16(rio); \ + vmovdqu y4, 12 * 16(rio); \ + vmovdqu y5, 13 * 16(rio); \ + vmovdqu y6, 14 * 16(rio); \ + vmovdqu y7, 15 * 16(rio); + +.data +.align 16 + +#define SHUFB_BYTES(idx) \ + 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx) + +.Lshufb_16x16b: + .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3); + +.Lpack_bswap: + .long 0x00010203 + .long 0x04050607 + .long 0x80808080 + .long 0x80808080 + +/* For CTR-mode IV byteswap */ +.Lbswap128_mask: + .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 + +/* + * pre-SubByte transform + * + * pre-lookup for sbox1, sbox2, sbox3: + * swap_bitendianness( + * isom_map_camellia_to_aes( + * camellia_f( + * swap_bitendianess(in) + * ) + * ) + * ) + * + * (note: '? 0xc5' inside camellia_f()) + */ +.Lpre_tf_lo_s1: + .byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86 + .byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88 +.Lpre_tf_hi_s1: + .byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a + .byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23 + +/* + * pre-SubByte transform + * + * pre-lookup for sbox4: + * swap_bitendianness( + * isom_map_camellia_to_aes( + * camellia_f( + * swap_bitendianess(in <<< 1) + * ) + * ) + * ) + * + * (note: '? 0xc5' inside camellia_f()) + */ +.Lpre_tf_lo_s4: + .byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25 + .byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74 +.Lpre_tf_hi_s4: + .byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72 + .byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf + +/* + * post-SubByte transform + * + * post-lookup for sbox1, sbox4: + * swap_bitendianness( + * camellia_h( + * isom_map_aes_to_camellia( + * swap_bitendianness( + * aes_inverse_affine_transform(in) + * ) + * ) + * ) + * ) + * + * (note: '? 0x6e' inside camellia_h()) + */ +.Lpost_tf_lo_s1: + .byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31 + .byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1 +.Lpost_tf_hi_s1: + .byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8 + .byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c + +/* + * post-SubByte transform + * + * post-lookup for sbox2: + * swap_bitendianness( + * camellia_h( + * isom_map_aes_to_camellia( + * swap_bitendianness( + * aes_inverse_affine_transform(in) + * ) + * ) + * ) + * ) <<< 1 + * + * (note: '? 
0x6e' inside camellia_h()) + */ +.Lpost_tf_lo_s2: + .byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62 + .byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3 +.Lpost_tf_hi_s2: + .byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51 + .byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18 + +/* + * post-SubByte transform + * + * post-lookup for sbox3: + * swap_bitendianness( + * camellia_h( + * isom_map_aes_to_camellia( + * swap_bitendianness( + * aes_inverse_affine_transform(in) + * ) + * ) + * ) + * ) >>> 1 + * + * (note: '? 0x6e' inside camellia_h()) + */ +.Lpost_tf_lo_s3: + .byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98 + .byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8 +.Lpost_tf_hi_s3: + .byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54 + .byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06 + +/* For isolating SubBytes from AESENCLAST, inverse shift row */ +.Linv_shift_row: + .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b + .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03 + +/* shuffle mask for 8x8 byte transpose */ +.Ltranspose_8x8_shuf: + .byte 0, 1, 4, 5, 2, 3, 6, 7, 8+0, 8+1, 8+4, 8+5, 8+2, 8+3, 8+6, 8+7 + +.align 4 +/* 4-bit mask */ +.L0f0f0f0f: + .long 0x0f0f0f0f + +.text + +.align 8 +.type __camellia_enc_blk16, at function; + +__camellia_enc_blk16: + /* input: + * %rdi: ctx, CTX + * %rax: temporary storage, 256 bytes + * %xmm0..%xmm15: 16 plaintext blocks + * output: + * %xmm0..%xmm15: 16 encrypted blocks, order swapped: + * 7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 + */ + + leaq 8 * 16(%rax), %rcx; + + inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, %rax, %rcx); + + enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, %rax, %rcx, 0); + + fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, + ((key_table + (8) * 8) + 0)(CTX), + ((key_table + (8) * 8) + 4)(CTX), + ((key_table + (8) * 8) + 8)(CTX), + ((key_table + (8) * 8) + 12)(CTX)); + + enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, %rax, %rcx, 8); + + fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, + ((key_table + (16) * 8) + 0)(CTX), + ((key_table + (16) * 8) + 4)(CTX), + ((key_table + (16) * 8) + 8)(CTX), + ((key_table + (16) * 8) + 12)(CTX)); + + enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, %rax, %rcx, 16); + + movl $24, %r8d; + cmpl $128, key_bitlength(CTX); + jne .Lenc_max32; + +.Lenc_done: + /* load CD for output */ + vmovdqu 0 * 16(%rcx), %xmm8; + vmovdqu 1 * 16(%rcx), %xmm9; + vmovdqu 2 * 16(%rcx), %xmm10; + vmovdqu 3 * 16(%rcx), %xmm11; + vmovdqu 4 * 16(%rcx), %xmm12; + vmovdqu 5 * 16(%rcx), %xmm13; + vmovdqu 6 * 16(%rcx), %xmm14; + vmovdqu 7 * 16(%rcx), %xmm15; + + outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 16(%rax)); + + ret; + +.align 8 +.Lenc_max32: + movl $32, %r8d; + + fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, + ((key_table + (24) * 8) 
+ 0)(CTX), + ((key_table + (24) * 8) + 4)(CTX), + ((key_table + (24) * 8) + 8)(CTX), + ((key_table + (24) * 8) + 12)(CTX)); + + enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, %rax, %rcx, 24); + + jmp .Lenc_done; +.size __camellia_enc_blk16,.-__camellia_enc_blk16; + +.align 8 +.type __camellia_dec_blk16, at function; + +__camellia_dec_blk16: + /* input: + * %rdi: ctx, CTX + * %rax: temporary storage, 256 bytes + * %r8d: 24 for 16 byte key, 32 for larger + * %xmm0..%xmm15: 16 encrypted blocks + * output: + * %xmm0..%xmm15: 16 plaintext blocks, order swapped: + * 7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 + */ + + leaq 8 * 16(%rax), %rcx; + + inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, %rax, %rcx); + + cmpl $32, %r8d; + je .Ldec_max32; + +.Ldec_max24: + dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, %rax, %rcx, 16); + + fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, + ((key_table + (16) * 8) + 8)(CTX), + ((key_table + (16) * 8) + 12)(CTX), + ((key_table + (16) * 8) + 0)(CTX), + ((key_table + (16) * 8) + 4)(CTX)); + + dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, %rax, %rcx, 8); + + fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, + ((key_table + (8) * 8) + 8)(CTX), + ((key_table + (8) * 8) + 12)(CTX), + ((key_table + (8) * 8) + 0)(CTX), + ((key_table + (8) * 8) + 4)(CTX)); + + dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, %rax, %rcx, 0); + + /* load CD for output */ + vmovdqu 0 * 16(%rcx), %xmm8; + vmovdqu 1 * 16(%rcx), %xmm9; + vmovdqu 2 * 16(%rcx), %xmm10; + vmovdqu 3 * 16(%rcx), %xmm11; + vmovdqu 4 * 16(%rcx), %xmm12; + vmovdqu 5 * 16(%rcx), %xmm13; + vmovdqu 6 * 16(%rcx), %xmm14; + vmovdqu 7 * 16(%rcx), %xmm15; + + outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, (key_table)(CTX), (%rax), 1 * 16(%rax)); + + ret; + +.align 8 +.Ldec_max32: + dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, %rax, %rcx, 24); + + fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, + ((key_table + (24) * 8) + 8)(CTX), + ((key_table + (24) * 8) + 12)(CTX), + ((key_table + (24) * 8) + 0)(CTX), + ((key_table + (24) * 8) + 4)(CTX)); + + jmp .Ldec_max24; +.size __camellia_dec_blk16,.-__camellia_dec_blk16; + +#define inc_le128(x, minus_one, tmp) \ + vpcmpeqq minus_one, x, tmp; \ + vpsubq minus_one, x, x; \ + vpslldq $8, tmp, tmp; \ + vpsubq tmp, x, x; + +.align 8 +.global _gcry_camellia_aesni_avx_ctr_enc +.type _gcry_camellia_aesni_avx_ctr_enc, at function; + +_gcry_camellia_aesni_avx_ctr_enc: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (16 blocks) + * %rdx: src (16 blocks) + * %rcx: iv (big endian, 128bit) + */ + + subq $(16 * 16), %rsp; + movq %rsp, %rax; + + vmovdqa .Lbswap128_mask RIP, %xmm14; + + /* load IV and byteswap */ + 
vmovdqu (%rcx), %xmm15; + vmovdqu %xmm15, 15 * 16(%rax); + vpshufb %xmm14, %xmm15, %xmm0; /* be => le */ + + vpcmpeqd %xmm15, %xmm15, %xmm15; + vpsrldq $8, %xmm15, %xmm15; /* low: -1, high: 0 */ + + /* construct IVs */ + inc_le128(%xmm0, %xmm15, %xmm13); + vpshufb %xmm14, %xmm0, %xmm13; + vmovdqu %xmm13, 14 * 16(%rax); + inc_le128(%xmm0, %xmm15, %xmm13); + vpshufb %xmm14, %xmm0, %xmm13; + vmovdqu %xmm13, 13 * 16(%rax); + inc_le128(%xmm0, %xmm15, %xmm13); + vpshufb %xmm14, %xmm0, %xmm12; + inc_le128(%xmm0, %xmm15, %xmm13); + vpshufb %xmm14, %xmm0, %xmm11; + inc_le128(%xmm0, %xmm15, %xmm13); + vpshufb %xmm14, %xmm0, %xmm10; + inc_le128(%xmm0, %xmm15, %xmm13); + vpshufb %xmm14, %xmm0, %xmm9; + inc_le128(%xmm0, %xmm15, %xmm13); + vpshufb %xmm14, %xmm0, %xmm8; + inc_le128(%xmm0, %xmm15, %xmm13); + vpshufb %xmm14, %xmm0, %xmm7; + inc_le128(%xmm0, %xmm15, %xmm13); + vpshufb %xmm14, %xmm0, %xmm6; + inc_le128(%xmm0, %xmm15, %xmm13); + vpshufb %xmm14, %xmm0, %xmm5; + inc_le128(%xmm0, %xmm15, %xmm13); + vpshufb %xmm14, %xmm0, %xmm4; + inc_le128(%xmm0, %xmm15, %xmm13); + vpshufb %xmm14, %xmm0, %xmm3; + inc_le128(%xmm0, %xmm15, %xmm13); + vpshufb %xmm14, %xmm0, %xmm2; + inc_le128(%xmm0, %xmm15, %xmm13); + vpshufb %xmm14, %xmm0, %xmm1; + inc_le128(%xmm0, %xmm15, %xmm13); + vmovdqa %xmm0, %xmm13; + vpshufb %xmm14, %xmm0, %xmm0; + inc_le128(%xmm13, %xmm15, %xmm14); + vpshufb .Lbswap128_mask RIP, %xmm13, %xmm13; /* le => be */ + vmovdqu %xmm13, (%rcx); + + /* inpack16_pre: */ + vmovq (key_table)(CTX), %xmm15; + vpshufb .Lpack_bswap RIP, %xmm15, %xmm15; + vpxor %xmm0, %xmm15, %xmm0; + vpxor %xmm1, %xmm15, %xmm1; + vpxor %xmm2, %xmm15, %xmm2; + vpxor %xmm3, %xmm15, %xmm3; + vpxor %xmm4, %xmm15, %xmm4; + vpxor %xmm5, %xmm15, %xmm5; + vpxor %xmm6, %xmm15, %xmm6; + vpxor %xmm7, %xmm15, %xmm7; + vpxor %xmm8, %xmm15, %xmm8; + vpxor %xmm9, %xmm15, %xmm9; + vpxor %xmm10, %xmm15, %xmm10; + vpxor %xmm11, %xmm15, %xmm11; + vpxor %xmm12, %xmm15, %xmm12; + vpxor 13 * 16(%rax), %xmm15, %xmm13; + vpxor 14 * 16(%rax), %xmm15, %xmm14; + vpxor 15 * 16(%rax), %xmm15, %xmm15; + + call __camellia_enc_blk16; + + addq $(16 * 16), %rsp; + + vpxor 0 * 16(%rdx), %xmm7, %xmm7; + vpxor 1 * 16(%rdx), %xmm6, %xmm6; + vpxor 2 * 16(%rdx), %xmm5, %xmm5; + vpxor 3 * 16(%rdx), %xmm4, %xmm4; + vpxor 4 * 16(%rdx), %xmm3, %xmm3; + vpxor 5 * 16(%rdx), %xmm2, %xmm2; + vpxor 6 * 16(%rdx), %xmm1, %xmm1; + vpxor 7 * 16(%rdx), %xmm0, %xmm0; + vpxor 8 * 16(%rdx), %xmm15, %xmm15; + vpxor 9 * 16(%rdx), %xmm14, %xmm14; + vpxor 10 * 16(%rdx), %xmm13, %xmm13; + vpxor 11 * 16(%rdx), %xmm12, %xmm12; + vpxor 12 * 16(%rdx), %xmm11, %xmm11; + vpxor 13 * 16(%rdx), %xmm10, %xmm10; + vpxor 14 * 16(%rdx), %xmm9, %xmm9; + vpxor 15 * 16(%rdx), %xmm8, %xmm8; + + write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0, + %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9, + %xmm8, %rsi); + + ret; +.size _gcry_camellia_aesni_avx_ctr_enc,.-_gcry_camellia_aesni_avx_ctr_enc; + +.align 8 +.global _gcry_camellia_aesni_avx_cbc_dec +.type _gcry_camellia_aesni_avx_cbc_dec, at function; + +_gcry_camellia_aesni_avx_cbc_dec: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (16 blocks) + * %rdx: src (16 blocks) + * %rcx: iv + */ + + movq %rcx, %r9; + + cmpl $128, key_bitlength(CTX); + movl $32, %r8d; + movl $24, %eax; + cmovel %eax, %r8d; /* max */ + + inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, + %xmm15, %rdx, (key_table)(CTX, %r8, 8)); + + subq $(16 * 16), %rsp; + movq %rsp, %rax; + + call 
__camellia_dec_blk16; + + addq $(16 * 16), %rsp; + + /* XOR output with IV */ + vpxor (%r9), %xmm7, %xmm7; + vpxor (0 * 16)(%rdx), %xmm6, %xmm6; + vpxor (1 * 16)(%rdx), %xmm5, %xmm5; + vpxor (2 * 16)(%rdx), %xmm4, %xmm4; + vpxor (3 * 16)(%rdx), %xmm3, %xmm3; + vpxor (4 * 16)(%rdx), %xmm2, %xmm2; + vpxor (5 * 16)(%rdx), %xmm1, %xmm1; + vpxor (6 * 16)(%rdx), %xmm0, %xmm0; + vpxor (7 * 16)(%rdx), %xmm15, %xmm15; + vpxor (8 * 16)(%rdx), %xmm14, %xmm14; + vpxor (9 * 16)(%rdx), %xmm13, %xmm13; + vpxor (10 * 16)(%rdx), %xmm12, %xmm12; + vpxor (11 * 16)(%rdx), %xmm11, %xmm11; + vpxor (12 * 16)(%rdx), %xmm10, %xmm10; + vpxor (13 * 16)(%rdx), %xmm9, %xmm9; + vpxor (14 * 16)(%rdx), %xmm8, %xmm8; + movq (15 * 16 + 0)(%rdx), %r10; + movq (15 * 16 + 8)(%rdx), %r11; + write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0, + %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9, + %xmm8, %rsi); + + /* store new IV */ + movq %r10, (0)(%r9); + movq %r11, (8)(%r9); + + ret; +.size _gcry_camellia_aesni_avx_cbc_dec,.-_gcry_camellia_aesni_avx_cbc_dec; + +#endif /*defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX_SUPPORT)*/ +#endif /*__x86_64*/ diff --git a/cipher/camellia_aesni_avx_x86-64.S b/cipher/camellia_aesni_avx_x86-64.S deleted file mode 100644 index e25ad8f..0000000 --- a/cipher/camellia_aesni_avx_x86-64.S +++ /dev/null @@ -1,1120 +0,0 @@ -/* camellia_avx_aesni_x86-64.S - AES-NI/AVX implementation of Camellia cipher - * - * Copyright (C) 2013 Jussi Kivilinna - * - * This file is part of Libgcrypt. - * - * Libgcrypt is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as - * published by the Free Software Foundation; either version 2.1 of - * the License, or (at your option) any later version. - * - * Libgcrypt is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this program; if not, see . 
- */ - -#ifdef __x86_64 -#include -#if defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX_SUPPORT) - -#ifdef __PIC__ -# define RIP (%rip) -#else -# define RIP -#endif - -#define CAMELLIA_TABLE_BYTE_LEN 272 - -/* struct CAMELLIA_context: */ -#define key_bitlength 0 -#define key_table 4 - -/* register macros */ -#define CTX %rdi -#define RIO %r8 - -/********************************************************************** - helper macros - **********************************************************************/ -#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \ - vpand x, mask4bit, tmp0; \ - vpandn x, mask4bit, x; \ - vpsrld $4, x, x; \ - \ - vpshufb tmp0, lo_t, tmp0; \ - vpshufb x, hi_t, x; \ - vpxor tmp0, x, x; - -/********************************************************************** - 16-way camellia - **********************************************************************/ - -/* - * IN: - * x0..x7: byte-sliced AB state - * mem_cd: register pointer storing CD state - * key: index for key material - * OUT: - * x0..x7: new byte-sliced CD state - */ -#define roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \ - t7, mem_cd, key) \ - /* \ - * S-function with AES subbytes \ - */ \ - vmovdqa .Linv_shift_row RIP, t4; \ - vbroadcastss .L0f0f0f0f RIP, t7; \ - vmovdqa .Lpre_tf_lo_s1 RIP, t0; \ - vmovdqa .Lpre_tf_hi_s1 RIP, t1; \ - \ - /* AES inverse shift rows */ \ - vpshufb t4, x0, x0; \ - vpshufb t4, x7, x7; \ - vpshufb t4, x1, x1; \ - vpshufb t4, x4, x4; \ - vpshufb t4, x2, x2; \ - vpshufb t4, x5, x5; \ - vpshufb t4, x3, x3; \ - vpshufb t4, x6, x6; \ - \ - /* prefilter sboxes 1, 2 and 3 */ \ - vmovdqa .Lpre_tf_lo_s4 RIP, t2; \ - vmovdqa .Lpre_tf_hi_s4 RIP, t3; \ - filter_8bit(x0, t0, t1, t7, t6); \ - filter_8bit(x7, t0, t1, t7, t6); \ - filter_8bit(x1, t0, t1, t7, t6); \ - filter_8bit(x4, t0, t1, t7, t6); \ - filter_8bit(x2, t0, t1, t7, t6); \ - filter_8bit(x5, t0, t1, t7, t6); \ - \ - /* prefilter sbox 4 */ \ - vpxor t4, t4, t4; \ - filter_8bit(x3, t2, t3, t7, t6); \ - filter_8bit(x6, t2, t3, t7, t6); \ - \ - /* AES subbytes + AES shift rows */ \ - vmovdqa .Lpost_tf_lo_s1 RIP, t0; \ - vmovdqa .Lpost_tf_hi_s1 RIP, t1; \ - vaesenclast t4, x0, x0; \ - vaesenclast t4, x7, x7; \ - vaesenclast t4, x1, x1; \ - vaesenclast t4, x4, x4; \ - vaesenclast t4, x2, x2; \ - vaesenclast t4, x5, x5; \ - vaesenclast t4, x3, x3; \ - vaesenclast t4, x6, x6; \ - \ - /* postfilter sboxes 1 and 4 */ \ - vmovdqa .Lpost_tf_lo_s3 RIP, t2; \ - vmovdqa .Lpost_tf_hi_s3 RIP, t3; \ - filter_8bit(x0, t0, t1, t7, t6); \ - filter_8bit(x7, t0, t1, t7, t6); \ - filter_8bit(x3, t0, t1, t7, t6); \ - filter_8bit(x6, t0, t1, t7, t6); \ - \ - /* postfilter sbox 3 */ \ - vmovdqa .Lpost_tf_lo_s2 RIP, t4; \ - vmovdqa .Lpost_tf_hi_s2 RIP, t5; \ - filter_8bit(x2, t2, t3, t7, t6); \ - filter_8bit(x5, t2, t3, t7, t6); \ - \ - vpxor t6, t6, t6; \ - vmovq key, t0; \ - \ - /* postfilter sbox 2 */ \ - filter_8bit(x1, t4, t5, t7, t2); \ - filter_8bit(x4, t4, t5, t7, t2); \ - \ - vpsrldq $5, t0, t5; \ - vpsrldq $1, t0, t1; \ - vpsrldq $2, t0, t2; \ - vpsrldq $3, t0, t3; \ - vpsrldq $4, t0, t4; \ - vpshufb t6, t0, t0; \ - vpshufb t6, t1, t1; \ - vpshufb t6, t2, t2; \ - vpshufb t6, t3, t3; \ - vpshufb t6, t4, t4; \ - vpsrldq $2, t5, t7; \ - vpshufb t6, t7, t7; \ - \ - /* P-function */ \ - vpxor x5, x0, x0; \ - vpxor x6, x1, x1; \ - vpxor x7, x2, x2; \ - vpxor x4, x3, x3; \ - \ - vpxor x2, x4, x4; \ - vpxor x3, x5, x5; \ - vpxor x0, x6, x6; \ - vpxor x1, x7, x7; \ - \ - vpxor x7, x0, x0; \ - vpxor x4, x1, x1; \ - vpxor x5, x2, x2; \ 
- vpxor x6, x3, x3; \ - \ - vpxor x3, x4, x4; \ - vpxor x0, x5, x5; \ - vpxor x1, x6, x6; \ - vpxor x2, x7, x7; /* note: high and low parts swapped */ \ - \ - /* Add key material and result to CD (x becomes new CD) */ \ - \ - vpxor t3, x4, x4; \ - vpxor 0 * 16(mem_cd), x4, x4; \ - \ - vpxor t2, x5, x5; \ - vpxor 1 * 16(mem_cd), x5, x5; \ - \ - vpsrldq $1, t5, t3; \ - vpshufb t6, t5, t5; \ - vpshufb t6, t3, t6; \ - \ - vpxor t1, x6, x6; \ - vpxor 2 * 16(mem_cd), x6, x6; \ - \ - vpxor t0, x7, x7; \ - vpxor 3 * 16(mem_cd), x7, x7; \ - \ - vpxor t7, x0, x0; \ - vpxor 4 * 16(mem_cd), x0, x0; \ - \ - vpxor t6, x1, x1; \ - vpxor 5 * 16(mem_cd), x1, x1; \ - \ - vpxor t5, x2, x2; \ - vpxor 6 * 16(mem_cd), x2, x2; \ - \ - vpxor t4, x3, x3; \ - vpxor 7 * 16(mem_cd), x3, x3; - -/* - * IN/OUT: - * x0..x7: byte-sliced AB state preloaded - * mem_ab: byte-sliced AB state in memory - * mem_cb: byte-sliced CD state in memory - */ -#define two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ - y6, y7, mem_ab, mem_cd, i, dir, store_ab) \ - roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ - y6, y7, mem_cd, (key_table + (i) * 8)(CTX)); \ - \ - vmovdqu x4, 0 * 16(mem_cd); \ - vmovdqu x5, 1 * 16(mem_cd); \ - vmovdqu x6, 2 * 16(mem_cd); \ - vmovdqu x7, 3 * 16(mem_cd); \ - vmovdqu x0, 4 * 16(mem_cd); \ - vmovdqu x1, 5 * 16(mem_cd); \ - vmovdqu x2, 6 * 16(mem_cd); \ - vmovdqu x3, 7 * 16(mem_cd); \ - \ - roundsm16(x4, x5, x6, x7, x0, x1, x2, x3, y0, y1, y2, y3, y4, y5, \ - y6, y7, mem_ab, (key_table + ((i) + (dir)) * 8)(CTX)); \ - \ - store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab); - -#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */ - -#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \ - /* Store new AB state */ \ - vmovdqu x0, 0 * 16(mem_ab); \ - vmovdqu x1, 1 * 16(mem_ab); \ - vmovdqu x2, 2 * 16(mem_ab); \ - vmovdqu x3, 3 * 16(mem_ab); \ - vmovdqu x4, 4 * 16(mem_ab); \ - vmovdqu x5, 5 * 16(mem_ab); \ - vmovdqu x6, 6 * 16(mem_ab); \ - vmovdqu x7, 7 * 16(mem_ab); - -#define enc_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ - y6, y7, mem_ab, mem_cd, i) \ - two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ - y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \ - two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ - y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \ - two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ - y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store); - -#define dec_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ - y6, y7, mem_ab, mem_cd, i) \ - two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ - y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \ - two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ - y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \ - two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ - y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store); - -/* - * IN: - * v0..3: byte-sliced 32-bit integers - * OUT: - * v0..3: (IN <<< 1) - */ -#define rol32_1_16(v0, v1, v2, v3, t0, t1, t2, zero) \ - vpcmpgtb v0, zero, t0; \ - vpaddb v0, v0, v0; \ - vpabsb t0, t0; \ - \ - vpcmpgtb v1, zero, t1; \ - vpaddb v1, v1, v1; \ - vpabsb t1, t1; \ - \ - vpcmpgtb v2, zero, t2; \ - vpaddb v2, v2, v2; \ - vpabsb t2, t2; \ - \ - vpor t0, v1, v1; \ - \ - vpcmpgtb v3, zero, t0; \ - vpaddb v3, v3, v3; \ - vpabsb t0, t0; \ - \ - vpor t1, v2, v2; \ - vpor t2, v3, 
v3; \ - vpor t0, v0, v0; - -/* - * IN: - * r: byte-sliced AB state in memory - * l: byte-sliced CD state in memory - * OUT: - * x0..x7: new byte-sliced CD state - */ -#define fls16(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \ - tt1, tt2, tt3, kll, klr, krl, krr) \ - /* \ - * t0 = kll; \ - * t0 &= ll; \ - * lr ^= rol32(t0, 1); \ - */ \ - vpxor tt0, tt0, tt0; \ - vmovd kll, t0; \ - vpshufb tt0, t0, t3; \ - vpsrldq $1, t0, t0; \ - vpshufb tt0, t0, t2; \ - vpsrldq $1, t0, t0; \ - vpshufb tt0, t0, t1; \ - vpsrldq $1, t0, t0; \ - vpshufb tt0, t0, t0; \ - \ - vpand l0, t0, t0; \ - vpand l1, t1, t1; \ - vpand l2, t2, t2; \ - vpand l3, t3, t3; \ - \ - rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \ - \ - vpxor l4, t0, l4; \ - vmovdqu l4, 4 * 16(l); \ - vpxor l5, t1, l5; \ - vmovdqu l5, 5 * 16(l); \ - vpxor l6, t2, l6; \ - vmovdqu l6, 6 * 16(l); \ - vpxor l7, t3, l7; \ - vmovdqu l7, 7 * 16(l); \ - \ - /* \ - * t2 = krr; \ - * t2 |= rr; \ - * rl ^= t2; \ - */ \ - \ - vmovd krr, t0; \ - vpshufb tt0, t0, t3; \ - vpsrldq $1, t0, t0; \ - vpshufb tt0, t0, t2; \ - vpsrldq $1, t0, t0; \ - vpshufb tt0, t0, t1; \ - vpsrldq $1, t0, t0; \ - vpshufb tt0, t0, t0; \ - \ - vpor 4 * 16(r), t0, t0; \ - vpor 5 * 16(r), t1, t1; \ - vpor 6 * 16(r), t2, t2; \ - vpor 7 * 16(r), t3, t3; \ - \ - vpxor 0 * 16(r), t0, t0; \ - vpxor 1 * 16(r), t1, t1; \ - vpxor 2 * 16(r), t2, t2; \ - vpxor 3 * 16(r), t3, t3; \ - vmovdqu t0, 0 * 16(r); \ - vmovdqu t1, 1 * 16(r); \ - vmovdqu t2, 2 * 16(r); \ - vmovdqu t3, 3 * 16(r); \ - \ - /* \ - * t2 = krl; \ - * t2 &= rl; \ - * rr ^= rol32(t2, 1); \ - */ \ - vmovd krl, t0; \ - vpshufb tt0, t0, t3; \ - vpsrldq $1, t0, t0; \ - vpshufb tt0, t0, t2; \ - vpsrldq $1, t0, t0; \ - vpshufb tt0, t0, t1; \ - vpsrldq $1, t0, t0; \ - vpshufb tt0, t0, t0; \ - \ - vpand 0 * 16(r), t0, t0; \ - vpand 1 * 16(r), t1, t1; \ - vpand 2 * 16(r), t2, t2; \ - vpand 3 * 16(r), t3, t3; \ - \ - rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \ - \ - vpxor 4 * 16(r), t0, t0; \ - vpxor 5 * 16(r), t1, t1; \ - vpxor 6 * 16(r), t2, t2; \ - vpxor 7 * 16(r), t3, t3; \ - vmovdqu t0, 4 * 16(r); \ - vmovdqu t1, 5 * 16(r); \ - vmovdqu t2, 6 * 16(r); \ - vmovdqu t3, 7 * 16(r); \ - \ - /* \ - * t0 = klr; \ - * t0 |= lr; \ - * ll ^= t0; \ - */ \ - \ - vmovd klr, t0; \ - vpshufb tt0, t0, t3; \ - vpsrldq $1, t0, t0; \ - vpshufb tt0, t0, t2; \ - vpsrldq $1, t0, t0; \ - vpshufb tt0, t0, t1; \ - vpsrldq $1, t0, t0; \ - vpshufb tt0, t0, t0; \ - \ - vpor l4, t0, t0; \ - vpor l5, t1, t1; \ - vpor l6, t2, t2; \ - vpor l7, t3, t3; \ - \ - vpxor l0, t0, l0; \ - vmovdqu l0, 0 * 16(l); \ - vpxor l1, t1, l1; \ - vmovdqu l1, 1 * 16(l); \ - vpxor l2, t2, l2; \ - vmovdqu l2, 2 * 16(l); \ - vpxor l3, t3, l3; \ - vmovdqu l3, 3 * 16(l); - -#define transpose_4x4(x0, x1, x2, x3, t1, t2) \ - vpunpckhdq x1, x0, t2; \ - vpunpckldq x1, x0, x0; \ - \ - vpunpckldq x3, x2, t1; \ - vpunpckhdq x3, x2, x2; \ - \ - vpunpckhqdq t1, x0, x1; \ - vpunpcklqdq t1, x0, x0; \ - \ - vpunpckhqdq x2, t2, x3; \ - vpunpcklqdq x2, t2, x2; - -#define byteslice_16x16b_fast(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, \ - a3, b3, c3, d3, st0, st1) \ - vmovdqu d2, st0; \ - vmovdqu d3, st1; \ - transpose_4x4(a0, a1, a2, a3, d2, d3); \ - transpose_4x4(b0, b1, b2, b3, d2, d3); \ - vmovdqu st0, d2; \ - vmovdqu st1, d3; \ - \ - vmovdqu a0, st0; \ - vmovdqu a1, st1; \ - transpose_4x4(c0, c1, c2, c3, a0, a1); \ - transpose_4x4(d0, d1, d2, d3, a0, a1); \ - \ - vmovdqu .Lshufb_16x16b RIP, a0; \ - vmovdqu st1, a1; \ - vpshufb a0, a2, a2; \ - vpshufb a0, a3, a3; \ - vpshufb 
a0, b0, b0; \ - vpshufb a0, b1, b1; \ - vpshufb a0, b2, b2; \ - vpshufb a0, b3, b3; \ - vpshufb a0, a1, a1; \ - vpshufb a0, c0, c0; \ - vpshufb a0, c1, c1; \ - vpshufb a0, c2, c2; \ - vpshufb a0, c3, c3; \ - vpshufb a0, d0, d0; \ - vpshufb a0, d1, d1; \ - vpshufb a0, d2, d2; \ - vpshufb a0, d3, d3; \ - vmovdqu d3, st1; \ - vmovdqu st0, d3; \ - vpshufb a0, d3, a0; \ - vmovdqu d2, st0; \ - \ - transpose_4x4(a0, b0, c0, d0, d2, d3); \ - transpose_4x4(a1, b1, c1, d1, d2, d3); \ - vmovdqu st0, d2; \ - vmovdqu st1, d3; \ - \ - vmovdqu b0, st0; \ - vmovdqu b1, st1; \ - transpose_4x4(a2, b2, c2, d2, b0, b1); \ - transpose_4x4(a3, b3, c3, d3, b0, b1); \ - vmovdqu st0, b0; \ - vmovdqu st1, b1; \ - /* does not adjust output bytes inside vectors */ - -#define transpose_8x8b(a, b, c, d, e, f, g, h, t0, t1, t2, t3, t4) \ - vpunpcklbw a, b, t0; \ - vpunpckhbw a, b, b; \ - \ - vpunpcklbw c, d, t1; \ - vpunpckhbw c, d, d; \ - \ - vpunpcklbw e, f, t2; \ - vpunpckhbw e, f, f; \ - \ - vpunpcklbw g, h, t3; \ - vpunpckhbw g, h, h; \ - \ - vpunpcklwd t0, t1, g; \ - vpunpckhwd t0, t1, t0; \ - \ - vpunpcklwd b, d, t1; \ - vpunpckhwd b, d, e; \ - \ - vpunpcklwd t2, t3, c; \ - vpunpckhwd t2, t3, t2; \ - \ - vpunpcklwd f, h, t3; \ - vpunpckhwd f, h, b; \ - \ - vpunpcklwd e, b, t4; \ - vpunpckhwd e, b, b; \ - \ - vpunpcklwd t1, t3, e; \ - vpunpckhwd t1, t3, f; \ - \ - vmovdqa .Ltranspose_8x8_shuf RIP, t3; \ - \ - vpunpcklwd g, c, d; \ - vpunpckhwd g, c, c; \ - \ - vpunpcklwd t0, t2, t1; \ - vpunpckhwd t0, t2, h; \ - \ - vpunpckhqdq b, h, a; \ - vpshufb t3, a, a; \ - vpunpcklqdq b, h, b; \ - vpshufb t3, b, b; \ - \ - vpunpckhqdq e, d, g; \ - vpshufb t3, g, g; \ - vpunpcklqdq e, d, h; \ - vpshufb t3, h, h; \ - \ - vpunpckhqdq f, c, e; \ - vpshufb t3, e, e; \ - vpunpcklqdq f, c, f; \ - vpshufb t3, f, f; \ - \ - vpunpckhqdq t4, t1, c; \ - vpshufb t3, c, c; \ - vpunpcklqdq t4, t1, d; \ - vpshufb t3, d, d; - -/* load blocks to registers and apply pre-whitening */ -#define inpack16_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ - y6, y7, rio, key) \ - vmovq key, x0; \ - vpshufb .Lpack_bswap RIP, x0, x0; \ - \ - vpxor 0 * 16(rio), x0, y7; \ - vpxor 1 * 16(rio), x0, y6; \ - vpxor 2 * 16(rio), x0, y5; \ - vpxor 3 * 16(rio), x0, y4; \ - vpxor 4 * 16(rio), x0, y3; \ - vpxor 5 * 16(rio), x0, y2; \ - vpxor 6 * 16(rio), x0, y1; \ - vpxor 7 * 16(rio), x0, y0; \ - vpxor 8 * 16(rio), x0, x7; \ - vpxor 9 * 16(rio), x0, x6; \ - vpxor 10 * 16(rio), x0, x5; \ - vpxor 11 * 16(rio), x0, x4; \ - vpxor 12 * 16(rio), x0, x3; \ - vpxor 13 * 16(rio), x0, x2; \ - vpxor 14 * 16(rio), x0, x1; \ - vpxor 15 * 16(rio), x0, x0; - -/* byteslice pre-whitened blocks and store to temporary memory */ -#define inpack16_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ - y6, y7, mem_ab, mem_cd) \ - byteslice_16x16b_fast(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, \ - y4, y5, y6, y7, (mem_ab), (mem_cd)); \ - \ - vmovdqu x0, 0 * 16(mem_ab); \ - vmovdqu x1, 1 * 16(mem_ab); \ - vmovdqu x2, 2 * 16(mem_ab); \ - vmovdqu x3, 3 * 16(mem_ab); \ - vmovdqu x4, 4 * 16(mem_ab); \ - vmovdqu x5, 5 * 16(mem_ab); \ - vmovdqu x6, 6 * 16(mem_ab); \ - vmovdqu x7, 7 * 16(mem_ab); \ - vmovdqu y0, 0 * 16(mem_cd); \ - vmovdqu y1, 1 * 16(mem_cd); \ - vmovdqu y2, 2 * 16(mem_cd); \ - vmovdqu y3, 3 * 16(mem_cd); \ - vmovdqu y4, 4 * 16(mem_cd); \ - vmovdqu y5, 5 * 16(mem_cd); \ - vmovdqu y6, 6 * 16(mem_cd); \ - vmovdqu y7, 7 * 16(mem_cd); - -/* de-byteslice, apply post-whitening and store blocks */ -#define outunpack16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, 
y2, y3, y4, \ - y5, y6, y7, key, stack_tmp0, stack_tmp1) \ - byteslice_16x16b_fast(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, \ - y3, y7, x3, x7, stack_tmp0, stack_tmp1); \ - \ - vmovdqu x0, stack_tmp0; \ - \ - vmovq key, x0; \ - vpshufb .Lpack_bswap RIP, x0, x0; \ - \ - vpxor x0, y7, y7; \ - vpxor x0, y6, y6; \ - vpxor x0, y5, y5; \ - vpxor x0, y4, y4; \ - vpxor x0, y3, y3; \ - vpxor x0, y2, y2; \ - vpxor x0, y1, y1; \ - vpxor x0, y0, y0; \ - vpxor x0, x7, x7; \ - vpxor x0, x6, x6; \ - vpxor x0, x5, x5; \ - vpxor x0, x4, x4; \ - vpxor x0, x3, x3; \ - vpxor x0, x2, x2; \ - vpxor x0, x1, x1; \ - vpxor stack_tmp0, x0, x0; - -#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ - y6, y7, rio) \ - vmovdqu x0, 0 * 16(rio); \ - vmovdqu x1, 1 * 16(rio); \ - vmovdqu x2, 2 * 16(rio); \ - vmovdqu x3, 3 * 16(rio); \ - vmovdqu x4, 4 * 16(rio); \ - vmovdqu x5, 5 * 16(rio); \ - vmovdqu x6, 6 * 16(rio); \ - vmovdqu x7, 7 * 16(rio); \ - vmovdqu y0, 8 * 16(rio); \ - vmovdqu y1, 9 * 16(rio); \ - vmovdqu y2, 10 * 16(rio); \ - vmovdqu y3, 11 * 16(rio); \ - vmovdqu y4, 12 * 16(rio); \ - vmovdqu y5, 13 * 16(rio); \ - vmovdqu y6, 14 * 16(rio); \ - vmovdqu y7, 15 * 16(rio); - -.data -.align 16 - -#define SHUFB_BYTES(idx) \ - 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx) - -.Lshufb_16x16b: - .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3); - -.Lpack_bswap: - .long 0x00010203 - .long 0x04050607 - .long 0x80808080 - .long 0x80808080 - -/* For CTR-mode IV byteswap */ -.Lbswap128_mask: - .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 - -/* - * pre-SubByte transform - * - * pre-lookup for sbox1, sbox2, sbox3: - * swap_bitendianness( - * isom_map_camellia_to_aes( - * camellia_f( - * swap_bitendianess(in) - * ) - * ) - * ) - * - * (note: '? 0xc5' inside camellia_f()) - */ -.Lpre_tf_lo_s1: - .byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86 - .byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88 -.Lpre_tf_hi_s1: - .byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a - .byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23 - -/* - * pre-SubByte transform - * - * pre-lookup for sbox4: - * swap_bitendianness( - * isom_map_camellia_to_aes( - * camellia_f( - * swap_bitendianess(in <<< 1) - * ) - * ) - * ) - * - * (note: '? 0xc5' inside camellia_f()) - */ -.Lpre_tf_lo_s4: - .byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25 - .byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74 -.Lpre_tf_hi_s4: - .byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72 - .byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf - -/* - * post-SubByte transform - * - * post-lookup for sbox1, sbox4: - * swap_bitendianness( - * camellia_h( - * isom_map_aes_to_camellia( - * swap_bitendianness( - * aes_inverse_affine_transform(in) - * ) - * ) - * ) - * ) - * - * (note: '? 0x6e' inside camellia_h()) - */ -.Lpost_tf_lo_s1: - .byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31 - .byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1 -.Lpost_tf_hi_s1: - .byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8 - .byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c - -/* - * post-SubByte transform - * - * post-lookup for sbox2: - * swap_bitendianness( - * camellia_h( - * isom_map_aes_to_camellia( - * swap_bitendianness( - * aes_inverse_affine_transform(in) - * ) - * ) - * ) - * ) <<< 1 - * - * (note: '? 
0x6e' inside camellia_h()) - */ -.Lpost_tf_lo_s2: - .byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62 - .byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3 -.Lpost_tf_hi_s2: - .byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51 - .byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18 - -/* - * post-SubByte transform - * - * post-lookup for sbox3: - * swap_bitendianness( - * camellia_h( - * isom_map_aes_to_camellia( - * swap_bitendianness( - * aes_inverse_affine_transform(in) - * ) - * ) - * ) - * ) >>> 1 - * - * (note: '? 0x6e' inside camellia_h()) - */ -.Lpost_tf_lo_s3: - .byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98 - .byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8 -.Lpost_tf_hi_s3: - .byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54 - .byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06 - -/* For isolating SubBytes from AESENCLAST, inverse shift row */ -.Linv_shift_row: - .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b - .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03 - -/* shuffle mask for 8x8 byte transpose */ -.Ltranspose_8x8_shuf: - .byte 0, 1, 4, 5, 2, 3, 6, 7, 8+0, 8+1, 8+4, 8+5, 8+2, 8+3, 8+6, 8+7 - -.align 4 -/* 4-bit mask */ -.L0f0f0f0f: - .long 0x0f0f0f0f - -.text - -.align 8 -.type __camellia_enc_blk16, at function; - -__camellia_enc_blk16: - /* input: - * %rdi: ctx, CTX - * %rax: temporary storage, 256 bytes - * %xmm0..%xmm15: 16 plaintext blocks - * output: - * %xmm0..%xmm15: 16 encrypted blocks, order swapped: - * 7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 - */ - - leaq 8 * 16(%rax), %rcx; - - inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, - %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, - %xmm15, %rax, %rcx); - - enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, - %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, - %xmm15, %rax, %rcx, 0); - - fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, - %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, - %xmm15, - ((key_table + (8) * 8) + 0)(CTX), - ((key_table + (8) * 8) + 4)(CTX), - ((key_table + (8) * 8) + 8)(CTX), - ((key_table + (8) * 8) + 12)(CTX)); - - enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, - %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, - %xmm15, %rax, %rcx, 8); - - fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, - %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, - %xmm15, - ((key_table + (16) * 8) + 0)(CTX), - ((key_table + (16) * 8) + 4)(CTX), - ((key_table + (16) * 8) + 8)(CTX), - ((key_table + (16) * 8) + 12)(CTX)); - - enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, - %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, - %xmm15, %rax, %rcx, 16); - - movl $24, %r8d; - cmpl $128, key_bitlength(CTX); - jne .Lenc_max32; - -.Lenc_done: - /* load CD for output */ - vmovdqu 0 * 16(%rcx), %xmm8; - vmovdqu 1 * 16(%rcx), %xmm9; - vmovdqu 2 * 16(%rcx), %xmm10; - vmovdqu 3 * 16(%rcx), %xmm11; - vmovdqu 4 * 16(%rcx), %xmm12; - vmovdqu 5 * 16(%rcx), %xmm13; - vmovdqu 6 * 16(%rcx), %xmm14; - vmovdqu 7 * 16(%rcx), %xmm15; - - outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, - %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, - %xmm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 16(%rax)); - - ret; - -.align 8 -.Lenc_max32: - movl $32, %r8d; - - fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, - %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, - %xmm15, - ((key_table + (24) * 8) 
+ 0)(CTX), - ((key_table + (24) * 8) + 4)(CTX), - ((key_table + (24) * 8) + 8)(CTX), - ((key_table + (24) * 8) + 12)(CTX)); - - enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, - %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, - %xmm15, %rax, %rcx, 24); - - jmp .Lenc_done; -.size __camellia_enc_blk16,.-__camellia_enc_blk16; - -.align 8 -.type __camellia_dec_blk16, at function; - -__camellia_dec_blk16: - /* input: - * %rdi: ctx, CTX - * %rax: temporary storage, 256 bytes - * %r8d: 24 for 16 byte key, 32 for larger - * %xmm0..%xmm15: 16 encrypted blocks - * output: - * %xmm0..%xmm15: 16 plaintext blocks, order swapped: - * 7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 - */ - - leaq 8 * 16(%rax), %rcx; - - inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, - %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, - %xmm15, %rax, %rcx); - - cmpl $32, %r8d; - je .Ldec_max32; - -.Ldec_max24: - dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, - %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, - %xmm15, %rax, %rcx, 16); - - fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, - %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, - %xmm15, - ((key_table + (16) * 8) + 8)(CTX), - ((key_table + (16) * 8) + 12)(CTX), - ((key_table + (16) * 8) + 0)(CTX), - ((key_table + (16) * 8) + 4)(CTX)); - - dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, - %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, - %xmm15, %rax, %rcx, 8); - - fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, - %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, - %xmm15, - ((key_table + (8) * 8) + 8)(CTX), - ((key_table + (8) * 8) + 12)(CTX), - ((key_table + (8) * 8) + 0)(CTX), - ((key_table + (8) * 8) + 4)(CTX)); - - dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, - %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, - %xmm15, %rax, %rcx, 0); - - /* load CD for output */ - vmovdqu 0 * 16(%rcx), %xmm8; - vmovdqu 1 * 16(%rcx), %xmm9; - vmovdqu 2 * 16(%rcx), %xmm10; - vmovdqu 3 * 16(%rcx), %xmm11; - vmovdqu 4 * 16(%rcx), %xmm12; - vmovdqu 5 * 16(%rcx), %xmm13; - vmovdqu 6 * 16(%rcx), %xmm14; - vmovdqu 7 * 16(%rcx), %xmm15; - - outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, - %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, - %xmm15, (key_table)(CTX), (%rax), 1 * 16(%rax)); - - ret; - -.align 8 -.Ldec_max32: - dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, - %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, - %xmm15, %rax, %rcx, 24); - - fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, - %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, - %xmm15, - ((key_table + (24) * 8) + 8)(CTX), - ((key_table + (24) * 8) + 12)(CTX), - ((key_table + (24) * 8) + 0)(CTX), - ((key_table + (24) * 8) + 4)(CTX)); - - jmp .Ldec_max24; -.size __camellia_dec_blk16,.-__camellia_dec_blk16; - -#define inc_le128(x, minus_one, tmp) \ - vpcmpeqq minus_one, x, tmp; \ - vpsubq minus_one, x, x; \ - vpslldq $8, tmp, tmp; \ - vpsubq tmp, x, x; - -.align 8 -.global _gcry_camellia_aesni_avx_ctr_enc -.type _gcry_camellia_aesni_avx_ctr_enc, at function; - -_gcry_camellia_aesni_avx_ctr_enc: - /* input: - * %rdi: ctx, CTX - * %rsi: dst (16 blocks) - * %rdx: src (16 blocks) - * %rcx: iv (big endian, 128bit) - */ - - subq $(16 * 16), %rsp; - movq %rsp, %rax; - - vmovdqa .Lbswap128_mask RIP, %xmm14; - - /* load IV and byteswap */ - 
vmovdqu (%rcx), %xmm15; - vmovdqu %xmm15, 15 * 16(%rax); - vpshufb %xmm14, %xmm15, %xmm0; /* be => le */ - - vpcmpeqd %xmm15, %xmm15, %xmm15; - vpsrldq $8, %xmm15, %xmm15; /* low: -1, high: 0 */ - - /* construct IVs */ - inc_le128(%xmm0, %xmm15, %xmm13); - vpshufb %xmm14, %xmm0, %xmm13; - vmovdqu %xmm13, 14 * 16(%rax); - inc_le128(%xmm0, %xmm15, %xmm13); - vpshufb %xmm14, %xmm0, %xmm13; - vmovdqu %xmm13, 13 * 16(%rax); - inc_le128(%xmm0, %xmm15, %xmm13); - vpshufb %xmm14, %xmm0, %xmm12; - inc_le128(%xmm0, %xmm15, %xmm13); - vpshufb %xmm14, %xmm0, %xmm11; - inc_le128(%xmm0, %xmm15, %xmm13); - vpshufb %xmm14, %xmm0, %xmm10; - inc_le128(%xmm0, %xmm15, %xmm13); - vpshufb %xmm14, %xmm0, %xmm9; - inc_le128(%xmm0, %xmm15, %xmm13); - vpshufb %xmm14, %xmm0, %xmm8; - inc_le128(%xmm0, %xmm15, %xmm13); - vpshufb %xmm14, %xmm0, %xmm7; - inc_le128(%xmm0, %xmm15, %xmm13); - vpshufb %xmm14, %xmm0, %xmm6; - inc_le128(%xmm0, %xmm15, %xmm13); - vpshufb %xmm14, %xmm0, %xmm5; - inc_le128(%xmm0, %xmm15, %xmm13); - vpshufb %xmm14, %xmm0, %xmm4; - inc_le128(%xmm0, %xmm15, %xmm13); - vpshufb %xmm14, %xmm0, %xmm3; - inc_le128(%xmm0, %xmm15, %xmm13); - vpshufb %xmm14, %xmm0, %xmm2; - inc_le128(%xmm0, %xmm15, %xmm13); - vpshufb %xmm14, %xmm0, %xmm1; - inc_le128(%xmm0, %xmm15, %xmm13); - vmovdqa %xmm0, %xmm13; - vpshufb %xmm14, %xmm0, %xmm0; - inc_le128(%xmm13, %xmm15, %xmm14); - vpshufb .Lbswap128_mask RIP, %xmm13, %xmm13; /* le => be */ - vmovdqu %xmm13, (%rcx); - - /* inpack16_pre: */ - vmovq (key_table)(CTX), %xmm15; - vpshufb .Lpack_bswap RIP, %xmm15, %xmm15; - vpxor %xmm0, %xmm15, %xmm0; - vpxor %xmm1, %xmm15, %xmm1; - vpxor %xmm2, %xmm15, %xmm2; - vpxor %xmm3, %xmm15, %xmm3; - vpxor %xmm4, %xmm15, %xmm4; - vpxor %xmm5, %xmm15, %xmm5; - vpxor %xmm6, %xmm15, %xmm6; - vpxor %xmm7, %xmm15, %xmm7; - vpxor %xmm8, %xmm15, %xmm8; - vpxor %xmm9, %xmm15, %xmm9; - vpxor %xmm10, %xmm15, %xmm10; - vpxor %xmm11, %xmm15, %xmm11; - vpxor %xmm12, %xmm15, %xmm12; - vpxor 13 * 16(%rax), %xmm15, %xmm13; - vpxor 14 * 16(%rax), %xmm15, %xmm14; - vpxor 15 * 16(%rax), %xmm15, %xmm15; - - call __camellia_enc_blk16; - - addq $(16 * 16), %rsp; - - vpxor 0 * 16(%rdx), %xmm7, %xmm7; - vpxor 1 * 16(%rdx), %xmm6, %xmm6; - vpxor 2 * 16(%rdx), %xmm5, %xmm5; - vpxor 3 * 16(%rdx), %xmm4, %xmm4; - vpxor 4 * 16(%rdx), %xmm3, %xmm3; - vpxor 5 * 16(%rdx), %xmm2, %xmm2; - vpxor 6 * 16(%rdx), %xmm1, %xmm1; - vpxor 7 * 16(%rdx), %xmm0, %xmm0; - vpxor 8 * 16(%rdx), %xmm15, %xmm15; - vpxor 9 * 16(%rdx), %xmm14, %xmm14; - vpxor 10 * 16(%rdx), %xmm13, %xmm13; - vpxor 11 * 16(%rdx), %xmm12, %xmm12; - vpxor 12 * 16(%rdx), %xmm11, %xmm11; - vpxor 13 * 16(%rdx), %xmm10, %xmm10; - vpxor 14 * 16(%rdx), %xmm9, %xmm9; - vpxor 15 * 16(%rdx), %xmm8, %xmm8; - - write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0, - %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9, - %xmm8, %rsi); - - ret; -.size _gcry_camellia_aesni_avx_ctr_enc,.-_gcry_camellia_aesni_avx_ctr_enc; - -.align 8 -.global _gcry_camellia_aesni_avx_cbc_dec -.type _gcry_camellia_aesni_avx_cbc_dec, at function; - -_gcry_camellia_aesni_avx_cbc_dec: - /* input: - * %rdi: ctx, CTX - * %rsi: dst (16 blocks) - * %rdx: src (16 blocks) - * %rcx: iv - */ - - movq %rcx, %r9; - - cmpl $128, key_bitlength(CTX); - movl $32, %r8d; - movl $24, %eax; - cmovel %eax, %r8d; /* max */ - - inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, - %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, - %xmm15, %rdx, (key_table)(CTX, %r8, 8)); - - subq $(16 * 16), %rsp; - movq %rsp, %rax; - - call 
__camellia_dec_blk16; - - addq $(16 * 16), %rsp; - - /* XOR output with IV */ - vpxor (%r9), %xmm7, %xmm7; - vpxor (0 * 16)(%rdx), %xmm6, %xmm6; - vpxor (1 * 16)(%rdx), %xmm5, %xmm5; - vpxor (2 * 16)(%rdx), %xmm4, %xmm4; - vpxor (3 * 16)(%rdx), %xmm3, %xmm3; - vpxor (4 * 16)(%rdx), %xmm2, %xmm2; - vpxor (5 * 16)(%rdx), %xmm1, %xmm1; - vpxor (6 * 16)(%rdx), %xmm0, %xmm0; - vpxor (7 * 16)(%rdx), %xmm15, %xmm15; - vpxor (8 * 16)(%rdx), %xmm14, %xmm14; - vpxor (9 * 16)(%rdx), %xmm13, %xmm13; - vpxor (10 * 16)(%rdx), %xmm12, %xmm12; - vpxor (11 * 16)(%rdx), %xmm11, %xmm11; - vpxor (12 * 16)(%rdx), %xmm10, %xmm10; - vpxor (13 * 16)(%rdx), %xmm9, %xmm9; - vpxor (14 * 16)(%rdx), %xmm8, %xmm8; - movq (15 * 16 + 0)(%rdx), %r10; - movq (15 * 16 + 8)(%rdx), %r11; - write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0, - %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9, - %xmm8, %rsi); - - /* store new IV */ - movq %r10, (0)(%r9); - movq %r11, (8)(%r9); - - ret; -.size _gcry_camellia_aesni_avx_cbc_dec,.-_gcry_camellia_aesni_avx_cbc_dec; - -#endif /*defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX_SUPPORT)*/ -#endif /*__x86_64*/ diff --git a/configure.ac b/configure.ac index 079951d..f14e28a 100644 --- a/configure.ac +++ b/configure.ac @@ -1241,7 +1241,7 @@ if test "$found" = "1" ; then if test x"$avxsupport" = xyes ; then if test x"$aesnisupport" = xyes ; then # Build with the AES-NI/AVX implementation - GCRYPT_CIPHERS="$GCRYPT_CIPHERS camellia_aesni_avx_x86-64.lo" + GCRYPT_CIPHERS="$GCRYPT_CIPHERS camellia-aesni-avx-amd64.lo" fi fi fi From jussi.kivilinna at iki.fi Wed May 22 13:10:54 2013 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Wed, 22 May 2013 14:10:54 +0300 Subject: [PATCH v2 1/4] camellia: add bulk CBC decryption selftest Message-ID: <20130522111054.828.92672.stgit@localhost6.localdomain6> * cipher/camellia-glue.c: (selftest_cbc_128): New selftest function for bulk CBC decryption. (selftest): Add call to selftest_cbc_128. -- Add selftest for the parallel code paths in bulk CBC decryption. Signed-off-by: Jussi Kivilinna --- cipher/camellia-glue.c | 83 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/cipher/camellia-glue.c b/cipher/camellia-glue.c index dd9206f..e9e2bf2 100644 --- a/cipher/camellia-glue.c +++ b/cipher/camellia-glue.c @@ -402,6 +402,86 @@ selftest_ctr_128 (void) return NULL; } +/* Run the self-tests for CAMELLIA-CBC-128, tests bulk CBC decryption. + Returns NULL on success. */ +static const char* +selftest_cbc_128 (void) +{ + const int nblocks = 16+2; + CAMELLIA_context ctx ATTR_ALIGNED_16; + unsigned char plaintext[nblocks*16] ATTR_ALIGNED_16; + unsigned char ciphertext[nblocks*16] ATTR_ALIGNED_16; + unsigned char plaintext2[nblocks*16] ATTR_ALIGNED_16; + unsigned char iv[16] ATTR_ALIGNED_16; + unsigned char iv2[16] ATTR_ALIGNED_16; + int i, j; + + static const unsigned char key[16] ATTR_ALIGNED_16 = { + 0x66,0x9A,0x00,0x7F,0xC7,0x6A,0x45,0x9F, + 0x98,0xBA,0xF9,0x17,0xFE,0xDF,0x95,0x22 + }; + static char error_str[128]; + + camellia_setkey (&ctx, key, sizeof (key)); + + /* Test single block code path */ + memset(iv, 0x4e, sizeof(iv)); + memset(iv2, 0x4e, sizeof(iv2)); + for (i = 0; i < 16; i++) + plaintext[i] = i; + + /* CBC manually. */ + for (i = 0; i < 16; i++) + ciphertext[i] = iv[i] ^ plaintext[i]; + camellia_encrypt (&ctx, ciphertext, ciphertext); + memcpy(iv, ciphertext, sizeof(iv)); + + /* CBC decrypt. 
*/ + _gcry_camellia_cbc_dec (&ctx, iv2, plaintext2, ciphertext, 1); + + if (memcmp(plaintext2, plaintext, 16)) + return "CAMELLIA-128-CBC test failed (plaintext mismatch)"; + + if (memcmp(iv2, iv, 16)) + return "CAMELLIA-128-CBC test failed (IV mismatch)"; + + /* Test parallelized code paths */ + memset(iv, 0x5f, sizeof(iv)); + memset(iv2, 0x5f, sizeof(iv2)); + + for (i = 0; i < sizeof(plaintext); i++) + plaintext[i] = i; + + /* Create CBC ciphertext manually. */ + for (i = 0; i < sizeof(plaintext); i+=16) + { + for (j = 0; j < 16; j++) + ciphertext[i+j] = iv[j] ^ plaintext[i+j]; + camellia_encrypt (&ctx, &ciphertext[i], &ciphertext[i]); + memcpy(iv, &ciphertext[i], sizeof(iv)); + } + + /* Decrypt using bulk CBC and compare result. */ + _gcry_camellia_cbc_dec (&ctx, iv2, plaintext2, ciphertext, + sizeof(ciphertext) / CAMELLIA_BLOCK_SIZE); + + if (memcmp(plaintext2, plaintext, sizeof(plaintext))) + { + snprintf(error_str, sizeof(error_str), + "CAMELLIA-128-CBC test failed (plaintext mismatch, " + "parallel path)"); + return error_str; + } + if (memcmp(iv2, iv, sizeof(iv))) + { + snprintf(error_str, sizeof(error_str), + "CAMELLIA-128-CBC test failed (IV mismatch, parallel path)"); + return error_str; + } + + return NULL; +} + static const char * selftest(void) { @@ -474,6 +554,9 @@ selftest(void) if ( (r = selftest_ctr_128 ()) ) return r; + if ( (r = selftest_cbc_128 ()) ) + return r; + return NULL; } From jussi.kivilinna at iki.fi Wed May 22 13:11:10 2013 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Wed, 22 May 2013 14:11:10 +0300 Subject: [PATCH v2 4/4] Add AES bulk CBC decryption selftest In-Reply-To: <20130522111054.828.92672.stgit@localhost6.localdomain6> References: <20130522111054.828.92672.stgit@localhost6.localdomain6> Message-ID: <20130522111109.828.30371.stgit@localhost6.localdomain6> * cipher/rinjdael.c (selftest_cbc_128): New. (selftest): Call selftest_cbc_128. -- Signed-off-by: Jussi Kivilinna --- cipher/rijndael.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/cipher/rijndael.c b/cipher/rijndael.c index 6b16ce8..4c81688 100644 --- a/cipher/rijndael.c +++ b/cipher/rijndael.c @@ -2020,6 +2020,21 @@ selftest_ctr_128 (void) } +/* Run the self-tests for AES-CBC-128, tests bulk CBC decryption. + Returns NULL on success. */ +static const char* +selftest_cbc_128 (void) +{ + const int nblocks = 8+2; + const int blocksize = BLOCKSIZE; + const int context_size = sizeof(RIJNDAEL_context); + + return _gcry_selftest_helper_cbc_128("AES", &rijndael_setkey, + &rijndael_encrypt, &_gcry_aes_cbc_dec, nblocks, blocksize, + context_size); +} + + /* Run all the self-tests and return NULL on success. This function is used for the on-the-fly self-tests. */ static const char * @@ -2035,6 +2050,9 @@ selftest (void) if ( (r = selftest_ctr_128 ()) ) return r; + if ( (r = selftest_cbc_128 ()) ) + return r; + return r; } From jussi.kivilinna at iki.fi Wed May 22 13:10:59 2013 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Wed, 22 May 2013 14:10:59 +0300 Subject: [PATCH v2 2/4] Convert bulk CTR and CBC selftest functions in Camellia to generic selftest helper functions In-Reply-To: <20130522111054.828.92672.stgit@localhost6.localdomain6> References: <20130522111054.828.92672.stgit@localhost6.localdomain6> Message-ID: <20130522111059.828.24153.stgit@localhost6.localdomain6> * cipher/Makefile.am (libcipher_la_SOURCES): Add cipher-selftest files. * cipher/camellia-glue.c (selftest_ctr_128, selftest_cbc_128): Change to use the new selftest helper functions. 
* cipher/cipher-selftest.c: New. * cipher/cipher-selftest.h: New. -- Convert selftest functions into generic helper functions for code sharing. [v2]: use syslog for more detailed selftest error messages Signed-off-by: Jussi Kivilinna --- cipher/Makefile.am | 1 cipher/camellia-glue.c | 168 ++------------------------- cipher/cipher-selftest.c | 291 ++++++++++++++++++++++++++++++++++++++++++++++ cipher/cipher-selftest.h | 54 +++++++++ 4 files changed, 357 insertions(+), 157 deletions(-) create mode 100644 cipher/cipher-selftest.c create mode 100644 cipher/cipher-selftest.h diff --git a/cipher/Makefile.am b/cipher/Makefile.am index 00e4429..0808bd2 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -40,6 +40,7 @@ libcipher_la_LIBADD = $(GCRYPT_MODULES) libcipher_la_SOURCES = \ cipher.c cipher-internal.h \ cipher-cbc.c cipher-cfb.c cipher-ofb.c cipher-ctr.c cipher-aeswrap.c \ +cipher-selftest.c cipher-selftest.h \ pubkey.c pubkey-internal.h \ md.c \ kdf.c kdf-internal.h \ diff --git a/cipher/camellia-glue.c b/cipher/camellia-glue.c index e9e2bf2..4c724a9 100644 --- a/cipher/camellia-glue.c +++ b/cipher/camellia-glue.c @@ -63,6 +63,7 @@ #include "cipher.h" #include "camellia.h" #include "bufhelp.h" +#include "cipher-selftest.h" /* Helper macro to force alignment to 16 bytes. */ #ifdef HAVE_GCC_ATTRIBUTE_ALIGNED @@ -313,93 +314,12 @@ static const char* selftest_ctr_128 (void) { const int nblocks = 16+1; - CAMELLIA_context ctx ATTR_ALIGNED_16; - unsigned char plaintext[nblocks*16] ATTR_ALIGNED_16; - unsigned char ciphertext[nblocks*16] ATTR_ALIGNED_16; - unsigned char plaintext2[nblocks*16] ATTR_ALIGNED_16; - unsigned char iv[16] ATTR_ALIGNED_16; - unsigned char iv2[16] ATTR_ALIGNED_16; - int i, j, diff; - - static const unsigned char key[16] ATTR_ALIGNED_16 = { - 0x06,0x9A,0x00,0x7F,0xC7,0x6A,0x45,0x9F, - 0x98,0xBA,0xF9,0x17,0xFE,0xDF,0x95,0x21 - }; - static char error_str[128]; - - camellia_setkey (&ctx, key, sizeof (key)); - - /* Test single block code path */ - memset(iv, 0xff, sizeof(iv)); - for (i = 0; i < 16; i++) - plaintext[i] = i; - - /* CTR manually. */ - camellia_encrypt (&ctx, ciphertext, iv); - for (i = 0; i < 16; i++) - ciphertext[i] ^= plaintext[i]; - for (i = 16; i > 0; i--) - { - iv[i-1]++; - if (iv[i-1]) - break; - } - - memset(iv2, 0xff, sizeof(iv2)); - _gcry_camellia_ctr_enc (&ctx, iv2, plaintext2, ciphertext, 1); - - if (memcmp(plaintext2, plaintext, 16)) - return "CAMELLIA-128-CTR test failed (plaintext mismatch)"; - - if (memcmp(iv2, iv, 16)) - return "CAMELLIA-128-CTR test failed (IV mismatch)"; - - /* Test parallelized code paths */ - for (diff = 0; diff < nblocks; diff++) { - memset(iv, 0xff, sizeof(iv)); - iv[15] -= diff; - - for (i = 0; i < sizeof(plaintext); i++) - plaintext[i] = i; - - /* Create CTR ciphertext manually. */ - for (i = 0; i < sizeof(plaintext); i+=16) - { - camellia_encrypt (&ctx, &ciphertext[i], iv); - for (j = 0; j < 16; j++) - ciphertext[i+j] ^= plaintext[i+j]; - for (j = 16; j > 0; j--) - { - iv[j-1]++; - if (iv[j-1]) - break; - } - } - - /* Decrypt using bulk CTR and compare result. 
*/ - memset(iv2, 0xff, sizeof(iv2)); - iv2[15] -= diff; - - _gcry_camellia_ctr_enc (&ctx, iv2, plaintext2, ciphertext, - sizeof(ciphertext) / CAMELLIA_BLOCK_SIZE); - - if (memcmp(plaintext2, plaintext, sizeof(plaintext))) - { - snprintf(error_str, sizeof(error_str), - "CAMELLIA-128-CTR test failed (plaintext mismatch, diff: %d)", - diff); - return error_str; - } - if (memcmp(iv2, iv, sizeof(iv))) - { - snprintf(error_str, sizeof(error_str), - "CAMELLIA-128-CTR test failed (IV mismatch, diff: %d)", - diff); - return error_str; - } - } + const int blocksize = CAMELLIA_BLOCK_SIZE; + const int context_size = sizeof(CAMELLIA_context); - return NULL; + return _gcry_selftest_helper_ctr_128("CAMELLIA", &camellia_setkey, + &camellia_encrypt, &_gcry_camellia_ctr_enc, nblocks, blocksize, + context_size); } /* Run the self-tests for CAMELLIA-CBC-128, tests bulk CBC decryption. @@ -408,78 +328,12 @@ static const char* selftest_cbc_128 (void) { const int nblocks = 16+2; - CAMELLIA_context ctx ATTR_ALIGNED_16; - unsigned char plaintext[nblocks*16] ATTR_ALIGNED_16; - unsigned char ciphertext[nblocks*16] ATTR_ALIGNED_16; - unsigned char plaintext2[nblocks*16] ATTR_ALIGNED_16; - unsigned char iv[16] ATTR_ALIGNED_16; - unsigned char iv2[16] ATTR_ALIGNED_16; - int i, j; - - static const unsigned char key[16] ATTR_ALIGNED_16 = { - 0x66,0x9A,0x00,0x7F,0xC7,0x6A,0x45,0x9F, - 0x98,0xBA,0xF9,0x17,0xFE,0xDF,0x95,0x22 - }; - static char error_str[128]; - - camellia_setkey (&ctx, key, sizeof (key)); - - /* Test single block code path */ - memset(iv, 0x4e, sizeof(iv)); - memset(iv2, 0x4e, sizeof(iv2)); - for (i = 0; i < 16; i++) - plaintext[i] = i; - - /* CBC manually. */ - for (i = 0; i < 16; i++) - ciphertext[i] = iv[i] ^ plaintext[i]; - camellia_encrypt (&ctx, ciphertext, ciphertext); - memcpy(iv, ciphertext, sizeof(iv)); - - /* CBC decrypt. */ - _gcry_camellia_cbc_dec (&ctx, iv2, plaintext2, ciphertext, 1); - - if (memcmp(plaintext2, plaintext, 16)) - return "CAMELLIA-128-CBC test failed (plaintext mismatch)"; + const int blocksize = CAMELLIA_BLOCK_SIZE; + const int context_size = sizeof(CAMELLIA_context); - if (memcmp(iv2, iv, 16)) - return "CAMELLIA-128-CBC test failed (IV mismatch)"; - - /* Test parallelized code paths */ - memset(iv, 0x5f, sizeof(iv)); - memset(iv2, 0x5f, sizeof(iv2)); - - for (i = 0; i < sizeof(plaintext); i++) - plaintext[i] = i; - - /* Create CBC ciphertext manually. */ - for (i = 0; i < sizeof(plaintext); i+=16) - { - for (j = 0; j < 16; j++) - ciphertext[i+j] = iv[j] ^ plaintext[i+j]; - camellia_encrypt (&ctx, &ciphertext[i], &ciphertext[i]); - memcpy(iv, &ciphertext[i], sizeof(iv)); - } - - /* Decrypt using bulk CBC and compare result. 
*/ - _gcry_camellia_cbc_dec (&ctx, iv2, plaintext2, ciphertext, - sizeof(ciphertext) / CAMELLIA_BLOCK_SIZE); - - if (memcmp(plaintext2, plaintext, sizeof(plaintext))) - { - snprintf(error_str, sizeof(error_str), - "CAMELLIA-128-CBC test failed (plaintext mismatch, " - "parallel path)"); - return error_str; - } - if (memcmp(iv2, iv, sizeof(iv))) - { - snprintf(error_str, sizeof(error_str), - "CAMELLIA-128-CBC test failed (IV mismatch, parallel path)"); - return error_str; - } - - return NULL; + return _gcry_selftest_helper_cbc_128("CAMELLIA", &camellia_setkey, + &camellia_encrypt, &_gcry_camellia_cbc_dec, nblocks, blocksize, + context_size); } static const char * diff --git a/cipher/cipher-selftest.c b/cipher/cipher-selftest.c new file mode 100644 index 0000000..50c7752 --- /dev/null +++ b/cipher/cipher-selftest.c @@ -0,0 +1,291 @@ +/* cipher-selftest.c - Helper functions for bulk encryption selftests. + * Copyright ? 2013 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser general Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . + */ + +#include +#ifdef HAVE_SYSLOG +# include +#endif /*HAVE_SYSLOG*/ + +#include "types.h" +#include "g10lib.h" +#include "cipher.h" +#include "bufhelp.h" +#include "cipher-selftest.h" + +#ifdef HAVE_STDINT_H +# include /* uintptr_t */ +#elif defined(HAVE_INTTYPES_H) +# include +#else +/* In this case, uintptr_t is provided by config.h. */ +#endif + +/* Helper macro to force alignment to 16 bytes. */ +#ifdef HAVE_GCC_ATTRIBUTE_ALIGNED +# define ATTR_ALIGNED_16 __attribute__ ((aligned (16))) +#else +# define ATTR_ALIGNED_16 +#endif + + +/* Run the self-tests for -CBC-128, tests bulk CBC + decryption. Returns NULL on success. */ +const char * +_gcry_selftest_helper_cbc_128 (const char *cipher, gcry_cipher_setkey_t setkey, + gcry_cipher_encrypt_t encrypt_one, + gcry_cipher_bulk_cbc_dec_t bulk_cbc_dec, + const int nblocks, const int blocksize, + const int context_size) +{ + int i, offs; + unsigned char *ctx, *plaintext, *plaintext2, *ciphertext, *iv, *iv2, *mem; + unsigned int ctx_aligned_size, memsize; + + static const unsigned char key[16] ATTR_ALIGNED_16 = { + 0x66,0x9A,0x00,0x7F,0xC7,0x6A,0x45,0x9F, + 0x98,0xBA,0xF9,0x17,0xFE,0xDF,0x95,0x22 + }; + + /* Allocate buffers, align elements to 16 bytes. 
*/ + ctx_aligned_size = context_size + 15; + ctx_aligned_size -= ctx_aligned_size & 0xf; + + memsize = ctx_aligned_size + (blocksize * 2) + (blocksize * nblocks * 3) + 16; + + mem = gcry_calloc (1, memsize); + if (!mem) + return "failed to allocate memory"; + + offs = (16 - ((uintptr_t)mem & 15)) & 15; + ctx = (void*)(mem + offs); + iv = ctx + ctx_aligned_size; + iv2 = iv + blocksize; + plaintext = iv2 + blocksize; + plaintext2 = plaintext + nblocks * blocksize; + ciphertext = plaintext2 + nblocks * blocksize; + + /* Initialize ctx */ + setkey (ctx, key, sizeof(key)); + + /* Test single block code path */ + memset (iv, 0x4e, blocksize); + memset (iv2, 0x4e, blocksize); + for (i = 0; i < blocksize; i++) + plaintext[i] = i; + + /* CBC manually. */ + buf_xor (ciphertext, iv, plaintext, blocksize); + encrypt_one (ctx, ciphertext, ciphertext); + memcpy (iv, ciphertext, blocksize); + + /* CBC decrypt. */ + bulk_cbc_dec (ctx, iv2, plaintext2, ciphertext, 1); + if (memcmp (plaintext2, plaintext, 16)) + { + gcry_free (mem); +#ifdef HAVE_SYSLOG + syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: " + "%s-128-CBC test failed (plaintext mismatch)", cipher); +#endif + return "selftest for 128 bit CBC failed - see syslog for details"; + } + + if (memcmp (iv2, iv, 16)) + { + gcry_free (mem); +#ifdef HAVE_SYSLOG + syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: " + "%s-128-CBC test failed (IV mismatch)", cipher); +#endif + return "selftest for 128 bit CBC failed - see syslog for details"; + } + + /* Test parallelized code paths */ + memset (iv, 0x5f, blocksize); + memset (iv2, 0x5f, blocksize); + + for (i = 0; i < nblocks * blocksize; i++) + plaintext[i] = i; + + /* Create CBC ciphertext manually. */ + for (i = 0; i < nblocks * blocksize; i+=blocksize) + { + buf_xor (&ciphertext[i], iv, &plaintext[i], blocksize); + encrypt_one (ctx, &ciphertext[i], &ciphertext[i]); + memcpy (iv, &ciphertext[i], blocksize); + } + + /* Decrypt using bulk CBC and compare result. */ + bulk_cbc_dec (ctx, iv2, plaintext2, ciphertext, nblocks); + + if (memcmp (plaintext2, plaintext, nblocks * blocksize)) + { + gcry_free (mem); +#ifdef HAVE_SYSLOG + syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: " + "%s-128-CBC test failed (plaintext mismatch, parallel path)", + cipher); +#endif + return "selftest for 128 bit CBC failed - see syslog for details"; + } + if (memcmp (iv2, iv, blocksize)) + { + gcry_free (mem); +#ifdef HAVE_SYSLOG + syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: " + "%s-128-CBC test failed (IV mismatch, parallel path)", + cipher); +#endif + return "selftest for 128 bit CBC failed - see syslog for details"; + } + + gcry_free (mem); + return NULL; +} + +/* Run the self-tests for -CTR-128, tests IV increment of bulk CTR + encryption. Returns NULL on success. */ +const char * +_gcry_selftest_helper_ctr_128 (const char *cipher, gcry_cipher_setkey_t setkey, + gcry_cipher_encrypt_t encrypt_one, + gcry_cipher_bulk_ctr_enc_t bulk_ctr_enc, + const int nblocks, const int blocksize, + const int context_size) +{ + int i, j, offs, diff; + unsigned char *ctx, *plaintext, *plaintext2, *ciphertext, *iv, *iv2, *mem; + unsigned int ctx_aligned_size, memsize; + + static const unsigned char key[16] ATTR_ALIGNED_16 = { + 0x06,0x9A,0x00,0x7F,0xC7,0x6A,0x45,0x9F, + 0x98,0xBA,0xF9,0x17,0xFE,0xDF,0x95,0x21 + }; + + /* Allocate buffers, align elements to 16 bytes. 
*/ + ctx_aligned_size = context_size + 15; + ctx_aligned_size -= ctx_aligned_size & 0xf; + + memsize = ctx_aligned_size + (blocksize * 2) + (blocksize * nblocks * 3) + 16; + + mem = gcry_calloc (1, memsize); + if (!mem) + return "failed to allocate memory"; + + offs = (16 - ((uintptr_t)mem & 15)) & 15; + ctx = (void*)(mem + offs); + iv = ctx + ctx_aligned_size; + iv2 = iv + blocksize; + plaintext = iv2 + blocksize; + plaintext2 = plaintext + nblocks * blocksize; + ciphertext = plaintext2 + nblocks * blocksize; + + /* Initialize ctx */ + setkey (ctx, key, sizeof(key)); + + /* Test single block code path */ + memset (iv, 0xff, blocksize); + for (i = 0; i < blocksize; i++) + plaintext[i] = i; + + /* CTR manually. */ + encrypt_one (ctx, ciphertext, iv); + for (i = 0; i < blocksize; i++) + ciphertext[i] ^= plaintext[i]; + for (i = blocksize; i > 0; i--) + { + iv[i-1]++; + if (iv[i-1]) + break; + } + + memset (iv2, 0xff, blocksize); + bulk_ctr_enc (ctx, iv2, plaintext2, ciphertext, 1); + + if (memcmp (plaintext2, plaintext, blocksize)) + { + gcry_free (mem); +#ifdef HAVE_SYSLOG + syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: " + "%s-128-CTR test failed (plaintext mismatch)", cipher); +#endif + return "selftest for 128 bit CTR failed - see syslog for details"; + } + + if (memcmp (iv2, iv, blocksize)) + { + gcry_free (mem); +#ifdef HAVE_SYSLOG + syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: " + "%s-128-CTR test failed (IV mismatch)", cipher); +#endif + return "selftest for 128 bit CTR failed - see syslog for details"; + } + + /* Test parallelized code paths */ + for (diff = 0; diff < nblocks; diff++) { + memset(iv, 0xff, blocksize); + iv[blocksize-1] -= diff; + + for (i = 0; i < blocksize * nblocks; i++) + plaintext[i] = i; + + /* Create CTR ciphertext manually. */ + for (i = 0; i < blocksize * nblocks; i+=blocksize) + { + encrypt_one (ctx, &ciphertext[i], iv); + for (j = 0; j < blocksize; j++) + ciphertext[i+j] ^= plaintext[i+j]; + for (j = blocksize; j > 0; j--) + { + iv[j-1]++; + if (iv[j-1]) + break; + } + } + + /* Decrypt using bulk CTR and compare result. */ + memset(iv2, 0xff, blocksize); + iv2[blocksize-1] -= diff; + + bulk_ctr_enc (ctx, iv2, plaintext2, ciphertext, nblocks); + + if (memcmp (plaintext2, plaintext, blocksize * nblocks)) + { + gcry_free (mem); +#ifdef HAVE_SYSLOG + syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: " + "%s-128-CTR test failed (plaintext mismatch, diff: %d)", cipher, + diff); +#endif + return "selftest for 128 bit CTR failed - see syslog for details"; + } + if (memcmp(iv2, iv, blocksize)) + { + gcry_free (mem); +#ifdef HAVE_SYSLOG + syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: " + "%s-128-CTR test failed (IV mismatch, diff: %d)", cipher, diff); +#endif + return "selftest for 128 bit CTR failed - see syslog for details"; + } + } + + gcry_free (mem); + return NULL; +} diff --git a/cipher/cipher-selftest.h b/cipher/cipher-selftest.h new file mode 100644 index 0000000..89d79c2 --- /dev/null +++ b/cipher/cipher-selftest.h @@ -0,0 +1,54 @@ +/* cipher-selftest.h - Helper functions for bulk encryption selftests. + * Copyright ? 2013 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser general Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. 
+ * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef G10_SELFTEST_HELP_H +#define G10_SELFTEST_HELP_H + +#include <config.h> +#include "types.h" +#include "g10lib.h" +#include "cipher.h" + +typedef void (*gcry_cipher_bulk_cbc_dec_t)(void *context, unsigned char *iv, + void *outbuf_arg, + const void *inbuf_arg, + unsigned int nblocks); + +typedef void (*gcry_cipher_bulk_ctr_enc_t)(void *context, unsigned char *iv, + void *outbuf_arg, + const void *inbuf_arg, + unsigned int nblocks); + +/* Helper function for bulk CBC decryption selftest */ +const char * +_gcry_selftest_helper_cbc_128 (const char *cipher, gcry_cipher_setkey_t setkey, + gcry_cipher_encrypt_t encrypt_one, + gcry_cipher_bulk_cbc_dec_t bulk_cbc_dec, + const int nblocks, const int blocksize, + const int context_size); + +/* Helper function for bulk CTR encryption selftest */ +const char * +_gcry_selftest_helper_ctr_128 (const char *cipher, gcry_cipher_setkey_t setkey, + gcry_cipher_encrypt_t encrypt_one, + gcry_cipher_bulk_ctr_enc_t bulk_ctr_enc, + const int nblocks, const int blocksize, + const int context_size); + +#endif /*G10_SELFTEST_HELP_H*/
From jussi.kivilinna at iki.fi Wed May 22 13:11:04 2013 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Wed, 22 May 2013 14:11:04 +0300 Subject: [PATCH v2 3/4] Change AES bulk CTR encryption selftest use new selftest helper function In-Reply-To: <20130522111054.828.92672.stgit@localhost6.localdomain6> References: <20130522111054.828.92672.stgit@localhost6.localdomain6> Message-ID: <20130522111104.828.77165.stgit@localhost6.localdomain6>
* cipher/rijndael.c: (selftest_ctr_128): Change to use new selftest helper function. -- Signed-off-by: Jussi Kivilinna --- cipher/rijndael.c | 93 ++++------------------------------------------------- 1 file changed, 7 insertions(+), 86 deletions(-)
diff --git a/cipher/rijndael.c b/cipher/rijndael.c index 8d4036b..6b16ce8 100644 --- a/cipher/rijndael.c +++ b/cipher/rijndael.c @@ -46,6 +46,7 @@ #include "g10lib.h" #include "cipher.h" #include "bufhelp.h" +#include "cipher-selftest.h" #define MAXKC (256/32) #define MAXROUNDS 14 @@ -2009,93 +2010,13 @@ selftest_basic_256 (void) static const char* selftest_ctr_128 (void) { - RIJNDAEL_context ctx ATTR_ALIGNED_16; - unsigned char plaintext[7*16] ATTR_ALIGNED_16; - unsigned char ciphertext[7*16] ATTR_ALIGNED_16; - unsigned char plaintext2[7*16] ATTR_ALIGNED_16; - unsigned char iv[16] ATTR_ALIGNED_16; - unsigned char iv2[16] ATTR_ALIGNED_16; - int i, j, diff; - - static const unsigned char key[16] ATTR_ALIGNED_16 = { - 0x06,0x9A,0x00,0x7F,0xC7,0x6A,0x45,0x9F, - 0x98,0xBA,0xF9,0x17,0xFE,0xDF,0x95,0x21 - }; - static char error_str[128]; - - rijndael_setkey (&ctx, key, sizeof (key)); - - /* Test single block code path */ - memset(iv, 0xff, sizeof(iv)); - for (i = 0; i < 16; i++) - plaintext[i] = i; - - /* CTR manually.
*/ - rijndael_encrypt (&ctx, ciphertext, iv); - for (i = 0; i < 16; i++) - ciphertext[i] ^= plaintext[i]; - for (i = 16; i > 0; i--) - { - iv[i-1]++; - if (iv[i-1]) - break; - } - - memset(iv2, 0xff, sizeof(iv2)); - _gcry_aes_ctr_enc (&ctx, iv2, plaintext2, ciphertext, 1); - - if (memcmp(plaintext2, plaintext, 16)) - return "AES-128-CTR test failed (plaintext mismatch)"; - - if (memcmp(iv2, iv, 16)) - return "AES-128-CTR test failed (IV mismatch)"; - - /* Test parallelized code paths */ - for (diff = 0; diff < 7; diff++) { - memset(iv, 0xff, sizeof(iv)); - iv[15] -= diff; - - for (i = 0; i < sizeof(plaintext); i++) - plaintext[i] = i; + const int nblocks = 8+1; + const int blocksize = BLOCKSIZE; + const int context_size = sizeof(RIJNDAEL_context); - /* Create CTR ciphertext manually. */ - for (i = 0; i < sizeof(plaintext); i+=16) - { - rijndael_encrypt (&ctx, &ciphertext[i], iv); - for (j = 0; j < 16; j++) - ciphertext[i+j] ^= plaintext[i+j]; - for (j = 16; j > 0; j--) - { - iv[j-1]++; - if (iv[j-1]) - break; - } - } - - /* Decrypt using bulk CTR and compare result. */ - memset(iv2, 0xff, sizeof(iv2)); - iv2[15] -= diff; - - _gcry_aes_ctr_enc (&ctx, iv2, plaintext2, ciphertext, - sizeof(ciphertext) / BLOCKSIZE); - - if (memcmp(plaintext2, plaintext, sizeof(plaintext))) - { - snprintf(error_str, sizeof(error_str), - "AES-128-CTR test failed (plaintext mismatch, diff: %d)", - diff); - return error_str; - } - if (memcmp(iv2, iv, sizeof(iv))) - { - snprintf(error_str, sizeof(error_str), - "AES-128-CTR test failed (IV mismatch, diff: %d)", - diff); - return error_str; - } - } - - return NULL; + return _gcry_selftest_helper_ctr_128("AES", &rijndael_setkey, + &rijndael_encrypt, &_gcry_aes_ctr_enc, nblocks, blocksize, + context_size); } From ralph.berg at rheinmetall.com Wed May 22 17:45:14 2013 From: ralph.berg at rheinmetall.com (Berg, Ralph) Date: Wed, 22 May 2013 17:45:14 +0200 Subject: Problem: Buffer to MPI via scan and back with print Message-ID: <6B8A44188A5EE5449B51500C8CC4200E021B43F6@mssrv12.defence-elec.de> Being a newbie I found the following problem: MPI scan and back with print has different results (first three zero bytes omitted!): gcry_mpi_scan( &plain_mpi, GCRYMPI_FMT_USG, inbuf, inlen, NULL ) gcry_mpi_print(GCRYMPI_FMT_USG,(unsigned char*)outbuf,BUFSIZE,&size,plain_mpi) Where is the problem? I would like to encrypt data directly using libgcrypt. 
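For reference, a minimal self-contained sketch of the same round trip (hypothetical buffer sizes, error handling omitted; only the gcry_mpi_scan/gcry_mpi_print calls quoted above are assumed) shows the same effect:

#include <stdio.h>
#include <gcrypt.h>

int
main (void)
{
  /* Example input starting with three zero octets, as in the dump below.  */
  static const unsigned char in[8] =
    { 0x00, 0x00, 0x00, 0x08, 0x00, 0x7a, 0x7b, 0xe5 };
  unsigned char out[64];
  size_t nwritten = 0;
  gcry_mpi_t a = NULL;

  gcry_check_version (NULL);

  /* Parse the buffer as an unsigned big-endian integer ...  */
  gcry_mpi_scan (&a, GCRYMPI_FMT_USG, in, sizeof in, NULL);

  /* ... and write it back in the same format.  */
  gcry_mpi_print (GCRYMPI_FMT_USG, out, sizeof out, &nwritten, a);

  /* Prints "in: 8 bytes, out: 5 bytes".  GCRYMPI_FMT_USG describes an
     unsigned integer value, not an opaque byte string, so the leading
     zero octets are not part of the stored MPI and gcry_mpi_print
     writes the shorter, minimal representation.  */
  printf ("in: %u bytes, out: %u bytes\n",
          (unsigned int) sizeof in, (unsigned int) nwritten);

  gcry_mpi_release (a);
  return 0;
}

This is the same behaviour as the three missing zero bytes in the dumps below: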
unsigned char inbuf[]={ 0x00, 0x00, 0x00, 0x08, 0x00, 0x7a, 0x7b, 0xe5, 0x40, 0x5d, 0xab, 0xb9, 0x8f, 0x6d, 0xf3, 0x02, 0x00, 0x00, 0x4c, 0x0b, 0x00, 0x1b, 0x00, 0x00, 0x00, 0x57, 0x75, 0x70, 0x69, 0x51, 0x75, 0x69, 0x7a, 0x2f, 0x64, 0x65, 0x62, 0x75, 0x67, 0x2f, 0x57, 0x75, 0x70, 0x69, 0x51, 0x75, 0x69, 0x7a, 0x2e, 0x70, 0x64, 0x62, 0xec, 0xdb, 0x09, 0x58, 0x13, 0xc7, 0x02, 0x07, 0xf0, 0x89, 0x80, 0x20, 0x97, 0x09, 0x22, 0x0a, 0x2a, 0x46, 0x8b, 0xa8, 0xd5, 0x70, 0x2a, 0x22, 0x58, 0x01, 0x09, 0x20, 0xca, 0x1d, 0x2e, 0x31, 0x8a, 0x21, 0x09, 0x10, 0x0d, 0x49, 0xcc, 0x21, 0xe2, 0x55, 0x6d, 0xad, 0xda, 0x7a, 0xd4, 0x7a, 0x17, 0x15, 0x0f, 0xb4, 0x55, 0xab, 0xad, 0x47, 0x5f, 0x5b, 0xeb, 0x59, 0xeb, 0x59, 0xe5, 0xd5, 0xb3, 0x1e, 0xad, 0xf5, 0xb5, 0xd5, 0xd6, 0x6a, 0xad, 0xb7, 0xb5, 0x5a, 0xe5, 0xfd, 0x67, 0x77, 0xa3, 0x09, 0x72, 0xb4, 0xa2, 0xed, 0x7b, 0xdf, 0xc7, 0x7e, 0xdf, 0x8f, 0x64, 0x27, 0xb3, 0xb3, 0xb3, 0x33, 0xbb, 0x33, 0xb3, 0xb3, 0x4b, 0xa2, 0x4a, 0xae, 0xd7, 0x1a, 0xb4, 0xf9, 0x46, 0x61, 0xb4, 0x7f, 0x74, 0x97, 0x2e, 0xc2, 0x44, 0x49, 0xac, 0xb0, 0x87, 0x5f, 0x40, 0x80, 0x8b, 0xa3, 0x97, 0x58, 0x42, 0xb0, 0xd8, 0x12, 0xd2, 0x08, 0x1f, 0xc7, 0xf0, 0x67, 0xa6, 0x23, 0x61, 0x96, 0x83, 0x8d, 0x48, 0xc3, 0xd2, 0xb0, 0x34, 0x2c, 0x0d, 0x4b, 0x95, 0x65, 0x5b, 0x65, 0xc3, 0xd2, 0xb0, 0x34, 0x2c, 0xff, 0x3f, 0x4b, 0xa8, 0x5d, 0x35, 0x97, 0xf1, 0xc3, 0xca, 0xfb, 0x95, 0xe6, 0xef, 0xe3, 0xd9, 0x78, 0xf6, 0x7f, 0xe6, 0xf2, 0x1f, 0xdf, 0xe7, 0xf1, 0xf7, 0x6b, 0xff, 0xec, 0x71, 0x35, 0x2c, 0x0d, 0x4b, 0xc3, 0x52, 0xf7, 0x32, 0xc7, 0x2f, 0x90 }; outbuf[]={ 0x08, 0x00, 0x7a, 0x7b, 0xe5, 0x40, 0x5d, 0xab, 0xb9, 0x8f, 0x6d, 0xf3, 0x02, 0x00, 0x00, 0x4c, 0x0b, 0x00, 0x1b, 0x00, 0x00, 0x00, 0x57, 0x75, 0x70, 0x69, 0x51, 0x75, 0x69, 0x7a, 0x2f, 0x64, 0x65, 0x62, 0x75, 0x67, 0x2f, 0x57, 0x75, 0x70, 0x69, 0x51, 0x75, 0x69, 0x7a, 0x2e, 0x70, 0x64, 0x62, 0xec, 0xdb, 0x09, 0x58, 0x13, 0xc7, 0x02, 0x07, 0xf0, 0x89, 0x80, 0x20, 0x97, 0x09, 0x22, 0x0a, 0x2a, 0x46, 0x8b, 0xa8, 0xd5, 0x70, 0x2a, 0x22, 0x58, 0x01, 0x09, 0x20, 0xca, 0x1d, 0x2e, 0x31, 0x8a, 0x21, 0x09, 0x10, 0x0d, 0x49, 0xcc, 0x21, 0xe2, 0x55, 0x6d, 0xad, 0xda, 0x7a, 0xd4, 0x7a, 0x17, 0x15, 0x0f, 0xb4, 0x55, 0xab, 0xad, 0x47, 0x5f, 0x5b, 0xeb, 0x59, 0xeb, 0x59, 0xe5, 0xd5, 0xb3, 0x1e, 0xad, 0xf5, 0xb5, 0xd5, 0xd6, 0x6a, 0xad, 0xb7, 0xb5, 0x5a, 0xe5, 0xfd, 0x67, 0x77, 0xa3, 0x09, 0x72, 0xb4, 0xa2, 0xed, 0x7b, 0xdf, 0xc7, 0x7e, 0xdf, 0x8f, 0x64, 0x27, 0xb3, 0xb3, 0xb3, 0x33, 0xbb, 0x33, 0xb3, 0xb3, 0x4b, 0xa2, 0x4a, 0xae, 0xd7, 0x1a, 0xb4, 0xf9, 0x46, 0x61, 0xb4, 0x7f, 0x74, 0x97, 0x2e, 0xc2, 0x44, 0x49, 0xac, 0xb0, 0x87, 0x5f, 0x40, 0x80, 0x8b, 0xa3, 0x97, 0x58, 0x42, 0xb0, 0xd8, 0x12, 0xd2, 0x08, 0x1f, 0xc7, 0xf0, 0x67, 0xa6, 0x23, 0x61, 0x96, 0x83, 0x8d, 0x48, 0xc3, 0xd2, 0xb0, 0x34, 0x2c, 0x0d, 0x4b, 0x95, 0x65, 0x5b, 0x65, 0xc3, 0xd2, 0xb0, 0x34, 0x2c, 0xff, 0x3f, 0x4b, 0xa8, 0x5d, 0x35, 0x97, 0xf1, 0xc3, 0xca, 0xfb, 0x95, 0xe6, 0xef, 0xe3, 0xd9, 0x78, 0xf6, 0x7f, 0xe6, 0xf2, 0x1f, 0xdf, 0xe7, 0xf1, 0xf7, 0x6b, 0xff, 0xec, 0x71, 0x35, 0x2c, 0x0d, 0x4b, 0xc3, 0x52, 0xf7, 0x32, 0xc7, 0x2f, 0x90, }; Kind regards Ralph Berg ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- Rheinmetall Defence Electronics GmbH Br?ggeweg 54 - D-28309 Bremen - Tel +49 421 457-01 - Fax +49 421 457-2900 - Sitz der Gesellschaft: Bremen - Register: Amtsgericht Bremen, HRB 9659 
Commerzbank AG, Bremen (BLZ 290 400 90) 102213600 - www.rheinmetall-defence.com Aufsichtsratsvorsitzender: Bodo Garbe - Gesch?ftsf?hrung: Luitjen Ennenga, Ulrich Sasse, Thorsten Quade -------------- next part -------------- An HTML attachment was scrubbed... URL: From wk at gnupg.org Wed May 22 18:27:56 2013 From: wk at gnupg.org (Werner Koch) Date: Wed, 22 May 2013 18:27:56 +0200 Subject: [PATCH] camellia: Rename camellia_aesni_avx_x86-64.S to camellia-aesni-avx-amd64.S In-Reply-To: <20130522090603.9672.49342.stgit@localhost6.localdomain6> (Jussi Kivilinna's message of "Wed, 22 May 2013 12:06:03 +0300") References: <20130522090603.9672.49342.stgit@localhost6.localdomain6> Message-ID: <87ip2b6ktv.fsf@vigenere.g10code.de> Hi, all 5 patches pushed. Thanks. Salam-Shalom, Werner -- Die Gedanken sind frei. Ausnahmen regelt ein Bundesgesetz. From wk at gnupg.org Wed May 22 18:33:37 2013 From: wk at gnupg.org (Werner Koch) Date: Wed, 22 May 2013 18:33:37 +0200 Subject: Wrong use of -export-symbols in src/Makefile.am In-Reply-To: <514B22E6.108@gmail.com> (LRN's message of "Thu, 21 Mar 2013 19:10:30 +0400") References: <5149E1C6.2060106@gmail.com> <8738vpvwxs.fsf@vigenere.g10code.de> <514AEEF3.6090609@gmail.com> <87li9gvmtp.fsf@vigenere.g10code.de> <514B22E6.108@gmail.com> Message-ID: <87ehcz6kke.fsf@vigenere.g10code.de> On Thu, 21 Mar 2013 16:10, lrn1986 at gmail.com said: > Both. I've had this problem with 1.5, and i still have it with git > master HEAD. I just pushed a change I did some time ago. This solves the problem for libgcrypt. Needs to be applied to all other libs as well. Libtool is undergoing larger changes thus we can expect that we need to stick to 2.4.2 for quite some time. > * the fig2dev issue does not affect tarballs, since they come with > pre-generated image files, This is still open. Right. > * libdir creation only affects package maintainers (i.e. when > installing into empty staging directory and with -jX)., Will eventually be fixed. > * EXPORTS placement issue is the only one that will screw people > compiling from git AND from tarballs. Fix that, and, hopefully, 1.6.0 > tarballs will come out fine. Done. Shalom-Salam, Werner -- Die Gedanken sind frei. Ausnahmen regelt ein Bundesgesetz. From wk at gnupg.org Wed May 22 18:42:05 2013 From: wk at gnupg.org (Werner Koch) Date: Wed, 22 May 2013 18:42:05 +0200 Subject: Missing libdir when installing the .def file In-Reply-To: <514AE34F.6060402@gmail.com> (LRN's message of "Thu, 21 Mar 2013 14:39:11 +0400") References: <514AE34F.6060402@gmail.com> Message-ID: <87a9nn6k6a.fsf@vigenere.g10code.de> On Thu, 21 Mar 2013 11:39, lrn1986 at gmail.com said: > This is fixed by doing > $(INSTALL) -d $(DESTDIR)$(libdir) > before installing the .def file. That is easy. Pushed. Salam-Shalom, Werner -- Die Gedanken sind frei. Ausnahmen regelt ein Bundesgesetz. From wk at gnupg.org Thu May 23 09:39:54 2013 From: wk at gnupg.org (Werner Koch) Date: Thu, 23 May 2013 09:39:54 +0200 Subject: Problem: Buffer to MPI via scan and back with print In-Reply-To: <6B8A44188A5EE5449B51500C8CC4200E021B43F6@mssrv12.defence-elec.de> (Ralph Berg's message of "Wed, 22 May 2013 17:45:14 +0200") References: <6B8A44188A5EE5449B51500C8CC4200E021B43F6@mssrv12.defence-elec.de> Message-ID: <871u8y6t6d.fsf@vigenere.g10code.de> On Wed, 22 May 2013 17:45, ralph.berg at rheinmetall.com said: > Being a newbie I found the following problem: Don't expect an answer from me; see http://www.leo-kette.de (German only). 
Other subscribers might have a more relaxed attitude towards your business than me. Salam-Shalom, Werner -- Die Gedanken sind frei. Ausnahmen regelt ein Bundesgesetz. From jussi.kivilinna at iki.fi Thu May 23 10:04:13 2013 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Thu, 23 May 2013 11:04:13 +0300 Subject: [PATCH v2 1/2] Serpent: faster S-box implementation Message-ID: <20130523080413.24643.26348.stgit@localhost6.localdomain6> * cipher/serpent.c (SBOX0, SBOX1, SBOX2, SBOX3, SBOX4, SBOX5, SBOX6) (SBOX7, SBOX0_INVERSE, SBOX1_INVERSE, SBOX2_INVERSE, SBOX3_INVERSE) (SBOX4_INVERSE, SBOX5_INVERSE, SBOX6_INVERSE, SBOX7_INVERSE): Replace with new definitions. -- These new S-box definitions are from paper: D. A. Osvik, ?Speeding up Serpent,? in Third AES Candidate Conference, (New York, New York, USA), p. 317?329, National Institute of Standards and Technology, 2000. Available at http://www.ii.uib.no/~osvik/pub/aes3.ps.gz Although these were optimized for two-operand instructions on i386 and for old Pentium-1 processors, they are slightly faster on current processors on i386 and x86-64. On ARM, the performance of these S-boxes is about the same as with the old S-boxes. new vs old speed ratios (AMD K10, x86-64): ECB/Stream CBC CFB OFB CTR --------------- --------------- --------------- --------------- --------------- SERPENT128 1.06x 1.02x 1.06x 1.02x 1.06x 1.06x 1.06x 1.05x 1.07x 1.07x new vs old speed ratios (Intel Atom, i486): ECB/Stream CBC CFB OFB CTR --------------- --------------- --------------- --------------- --------------- SERPENT128 1.12x 1.15x 1.12x 1.15x 1.13x 1.11x 1.12x 1.12x 1.12x 1.13x new vs old speed ratios (ARM Cortex A8): ECB/Stream CBC CFB OFB CTR --------------- --------------- --------------- --------------- --------------- SERPENT128 1.04x 1.02x 1.02x 0.99x 1.02x 1.02x 1.03x 1.03x 1.01x 1.01x Signed-off-by: Jussi Kivilinna --- cipher/serpent.c | 600 +++++++++++++++++++++++------------------------------- 1 file changed, 250 insertions(+), 350 deletions(-) diff --git a/cipher/serpent.c b/cipher/serpent.c index ea14c7e..72840cf 100644 --- a/cipher/serpent.c +++ b/cipher/serpent.c @@ -61,403 +61,303 @@ static const char *serpent_test (void); | (((x) & 0xff000000) >> 24) | (((x) & 0x00ff0000) >> 8) \ | (((x) & 0x0000ff00) << 8) | (((x) & 0x000000ff) << 24)) -/* These are the S-Boxes of Serpent. They are copied from Serpents - reference implementation (the optimized one, contained in - `floppy2') and are therefore: - - Copyright (C) 1998 Ross Anderson, Eli Biham, Lars Knudsen. - - To quote the Serpent homepage - (http://www.cl.cam.ac.uk/~rja14/serpent.html): - - "Serpent is now completely in the public domain, and we impose no - restrictions on its use. This was announced on the 21st August at - the First AES Candidate Conference. The optimised implementations - in the submission package are now under the GNU PUBLIC LICENSE - (GPL), although some comments in the code still say otherwise. You - are welcome to use Serpent for any application." */ +/* + * These are the S-Boxes of Serpent from following research paper. + * + * D. A. Osvik, ?Speeding up Serpent,? in Third AES Candidate Conference, + * (New York, New York, USA), p. 317?329, National Institute of Standards and + * Technology, 2000. 
+ * + * Paper is also available at: http://www.ii.uib.no/~osvik/pub/aes3.pdf + * + */ -#define SBOX0(a, b, c, d, w, x, y, z) \ +#define SBOX0(r0, r1, r2, r3, w, x, y, z) \ { \ - u32 t02, t03, t05, t06, t07, t08, t09; \ - u32 t11, t12, t13, t14, t15, t17, t01; \ - t01 = b ^ c ; \ - t02 = a | d ; \ - t03 = a ^ b ; \ - z = t02 ^ t01; \ - t05 = c | z ; \ - t06 = a ^ d ; \ - t07 = b | c ; \ - t08 = d & t05; \ - t09 = t03 & t07; \ - y = t09 ^ t08; \ - t11 = t09 & y ; \ - t12 = c ^ d ; \ - t13 = t07 ^ t11; \ - t14 = b & t06; \ - t15 = t06 ^ t13; \ - w = ~ t15; \ - t17 = w ^ t14; \ - x = t12 ^ t17; \ + u32 r4; \ + \ + r3 ^= r0; r4 = r1; \ + r1 &= r3; r4 ^= r2; \ + r1 ^= r0; r0 |= r3; \ + r0 ^= r4; r4 ^= r3; \ + r3 ^= r2; r2 |= r1; \ + r2 ^= r4; r4 = ~r4; \ + r4 |= r1; r1 ^= r3; \ + r1 ^= r4; r3 |= r0; \ + r1 ^= r3; r4 ^= r3; \ + \ + w = r1; x = r4; y = r2; z = r0; \ } -#define SBOX0_INVERSE(a, b, c, d, w, x, y, z) \ +#define SBOX0_INVERSE(r0, r1, r2, r3, w, x, y, z) \ { \ - u32 t02, t03, t04, t05, t06, t08, t09, t10; \ - u32 t12, t13, t14, t15, t17, t18, t01; \ - t01 = c ^ d ; \ - t02 = a | b ; \ - t03 = b | c ; \ - t04 = c & t01; \ - t05 = t02 ^ t01; \ - t06 = a | t04; \ - y = ~ t05; \ - t08 = b ^ d ; \ - t09 = t03 & t08; \ - t10 = d | y ; \ - x = t09 ^ t06; \ - t12 = a | t05; \ - t13 = x ^ t12; \ - t14 = t03 ^ t10; \ - t15 = a ^ c ; \ - z = t14 ^ t13; \ - t17 = t05 & t13; \ - t18 = t14 | t17; \ - w = t15 ^ t18; \ + u32 r4; \ + \ + r2 = ~r2; r4 = r1; \ + r1 |= r0; r4 = ~r4; \ + r1 ^= r2; r2 |= r4; \ + r1 ^= r3; r0 ^= r4; \ + r2 ^= r0; r0 &= r3; \ + r4 ^= r0; r0 |= r1; \ + r0 ^= r2; r3 ^= r4; \ + r2 ^= r1; r3 ^= r0; \ + r3 ^= r1; \ + r2 &= r3; \ + r4 ^= r2; \ + \ + w = r0; x = r4; y = r1; z = r3; \ } -#define SBOX1(a, b, c, d, w, x, y, z) \ +#define SBOX1(r0, r1, r2, r3, w, x, y, z) \ { \ - u32 t02, t03, t04, t05, t06, t07, t08; \ - u32 t10, t11, t12, t13, t16, t17, t01; \ - t01 = a | d ; \ - t02 = c ^ d ; \ - t03 = ~ b ; \ - t04 = a ^ c ; \ - t05 = a | t03; \ - t06 = d & t04; \ - t07 = t01 & t02; \ - t08 = b | t06; \ - y = t02 ^ t05; \ - t10 = t07 ^ t08; \ - t11 = t01 ^ t10; \ - t12 = y ^ t11; \ - t13 = b & d ; \ - z = ~ t10; \ - x = t13 ^ t12; \ - t16 = t10 | x ; \ - t17 = t05 & t16; \ - w = c ^ t17; \ + u32 r4; \ + \ + r0 = ~r0; r2 = ~r2; \ + r4 = r0; r0 &= r1; \ + r2 ^= r0; r0 |= r3; \ + r3 ^= r2; r1 ^= r0; \ + r0 ^= r4; r4 |= r1; \ + r1 ^= r3; r2 |= r0; \ + r2 &= r4; r0 ^= r1; \ + r1 &= r2; \ + r1 ^= r0; r0 &= r2; \ + r0 ^= r4; \ + \ + w = r2; x = r0; y = r3; z = r1; \ } -#define SBOX1_INVERSE(a, b, c, d, w, x, y, z) \ +#define SBOX1_INVERSE(r0, r1, r2, r3, w, x, y, z) \ { \ - u32 t02, t03, t04, t05, t06, t07, t08; \ - u32 t09, t10, t11, t14, t15, t17, t01; \ - t01 = a ^ b ; \ - t02 = b | d ; \ - t03 = a & c ; \ - t04 = c ^ t02; \ - t05 = a | t04; \ - t06 = t01 & t05; \ - t07 = d | t03; \ - t08 = b ^ t06; \ - t09 = t07 ^ t06; \ - t10 = t04 | t03; \ - t11 = d & t08; \ - y = ~ t09; \ - x = t10 ^ t11; \ - t14 = a | y ; \ - t15 = t06 ^ x ; \ - z = t01 ^ t04; \ - t17 = c ^ t15; \ - w = t14 ^ t17; \ + u32 r4; \ + \ + r4 = r1; r1 ^= r3; \ + r3 &= r1; r4 ^= r2; \ + r3 ^= r0; r0 |= r1; \ + r2 ^= r3; r0 ^= r4; \ + r0 |= r2; r1 ^= r3; \ + r0 ^= r1; r1 |= r3; \ + r1 ^= r0; r4 = ~r4; \ + r4 ^= r1; r1 |= r0; \ + r1 ^= r0; \ + r1 |= r4; \ + r3 ^= r1; \ + \ + w = r4; x = r0; y = r3; z = r2; \ } -#define SBOX2(a, b, c, d, w, x, y, z) \ +#define SBOX2(r0, r1, r2, r3, w, x, y, z) \ { \ - u32 t02, t03, t05, t06, t07, t08; \ - u32 t09, t10, t12, t13, t14, t01; \ - t01 = a | c ; \ - t02 = a ^ b ; \ - t03 = d ^ 
t01; \ - w = t02 ^ t03; \ - t05 = c ^ w ; \ - t06 = b ^ t05; \ - t07 = b | t05; \ - t08 = t01 & t06; \ - t09 = t03 ^ t07; \ - t10 = t02 | t09; \ - x = t10 ^ t08; \ - t12 = a | d ; \ - t13 = t09 ^ x ; \ - t14 = b ^ t13; \ - z = ~ t09; \ - y = t12 ^ t14; \ + u32 r4; \ + \ + r4 = r0; r0 &= r2; \ + r0 ^= r3; r2 ^= r1; \ + r2 ^= r0; r3 |= r4; \ + r3 ^= r1; r4 ^= r2; \ + r1 = r3; r3 |= r4; \ + r3 ^= r0; r0 &= r1; \ + r4 ^= r0; r1 ^= r3; \ + r1 ^= r4; r4 = ~r4; \ + \ + w = r2; x = r3; y = r1; z = r4; \ } -#define SBOX2_INVERSE(a, b, c, d, w, x, y, z) \ +#define SBOX2_INVERSE(r0, r1, r2, r3, w, x, y, z) \ { \ - u32 t02, t03, t04, t06, t07, t08, t09; \ - u32 t10, t11, t12, t15, t16, t17, t01; \ - t01 = a ^ d ; \ - t02 = c ^ d ; \ - t03 = a & c ; \ - t04 = b | t02; \ - w = t01 ^ t04; \ - t06 = a | c ; \ - t07 = d | w ; \ - t08 = ~ d ; \ - t09 = b & t06; \ - t10 = t08 | t03; \ - t11 = b & t07; \ - t12 = t06 & t02; \ - z = t09 ^ t10; \ - x = t12 ^ t11; \ - t15 = c & z ; \ - t16 = w ^ x ; \ - t17 = t10 ^ t15; \ - y = t16 ^ t17; \ + u32 r4; \ + \ + r2 ^= r3; r3 ^= r0; \ + r4 = r3; r3 &= r2; \ + r3 ^= r1; r1 |= r2; \ + r1 ^= r4; r4 &= r3; \ + r2 ^= r3; r4 &= r0; \ + r4 ^= r2; r2 &= r1; \ + r2 |= r0; r3 = ~r3; \ + r2 ^= r3; r0 ^= r3; \ + r0 &= r1; r3 ^= r4; \ + r3 ^= r0; \ + \ + w = r1; x = r4; y = r2; z = r3; \ } -#define SBOX3(a, b, c, d, w, x, y, z) \ +#define SBOX3(r0, r1, r2, r3, w, x, y, z) \ { \ - u32 t02, t03, t04, t05, t06, t07, t08; \ - u32 t09, t10, t11, t13, t14, t15, t01; \ - t01 = a ^ c ; \ - t02 = a | d ; \ - t03 = a & d ; \ - t04 = t01 & t02; \ - t05 = b | t03; \ - t06 = a & b ; \ - t07 = d ^ t04; \ - t08 = c | t06; \ - t09 = b ^ t07; \ - t10 = d & t05; \ - t11 = t02 ^ t10; \ - z = t08 ^ t09; \ - t13 = d | z ; \ - t14 = a | t07; \ - t15 = b & t13; \ - y = t08 ^ t11; \ - w = t14 ^ t15; \ - x = t05 ^ t04; \ + u32 r4; \ + \ + r4 = r0; r0 |= r3; \ + r3 ^= r1; r1 &= r4; \ + r4 ^= r2; r2 ^= r3; \ + r3 &= r0; r4 |= r1; \ + r3 ^= r4; r0 ^= r1; \ + r4 &= r0; r1 ^= r3; \ + r4 ^= r2; r1 |= r0; \ + r1 ^= r2; r0 ^= r3; \ + r2 = r1; r1 |= r3; \ + r1 ^= r0; \ + \ + w = r1; x = r2; y = r3; z = r4; \ } -#define SBOX3_INVERSE(a, b, c, d, w, x, y, z) \ +#define SBOX3_INVERSE(r0, r1, r2, r3, w, x, y, z) \ { \ - u32 t02, t03, t04, t05, t06, t07, t09; \ - u32 t11, t12, t13, t14, t16, t01; \ - t01 = c | d ; \ - t02 = a | d ; \ - t03 = c ^ t02; \ - t04 = b ^ t02; \ - t05 = a ^ d ; \ - t06 = t04 & t03; \ - t07 = b & t01; \ - y = t05 ^ t06; \ - t09 = a ^ t03; \ - w = t07 ^ t03; \ - t11 = w | t05; \ - t12 = t09 & t11; \ - t13 = a & y ; \ - t14 = t01 ^ t05; \ - x = b ^ t12; \ - t16 = b | t13; \ - z = t14 ^ t16; \ + u32 r4; \ + \ + r4 = r2; r2 ^= r1; \ + r0 ^= r2; r4 &= r2; \ + r4 ^= r0; r0 &= r1; \ + r1 ^= r3; r3 |= r4; \ + r2 ^= r3; r0 ^= r3; \ + r1 ^= r4; r3 &= r2; \ + r3 ^= r1; r1 ^= r0; \ + r1 |= r2; r0 ^= r3; \ + r1 ^= r4; \ + r0 ^= r1; \ + \ + w = r2; x = r1; y = r3; z = r0; \ } -#define SBOX4(a, b, c, d, w, x, y, z) \ +#define SBOX4(r0, r1, r2, r3, w, x, y, z) \ { \ - u32 t02, t03, t04, t05, t06, t08, t09; \ - u32 t10, t11, t12, t13, t14, t15, t16, t01; \ - t01 = a | b ; \ - t02 = b | c ; \ - t03 = a ^ t02; \ - t04 = b ^ d ; \ - t05 = d | t03; \ - t06 = d & t01; \ - z = t03 ^ t06; \ - t08 = z & t04; \ - t09 = t04 & t05; \ - t10 = c ^ t06; \ - t11 = b & c ; \ - t12 = t04 ^ t08; \ - t13 = t11 | t03; \ - t14 = t10 ^ t09; \ - t15 = a & t05; \ - t16 = t11 | t12; \ - y = t13 ^ t08; \ - x = t15 ^ t16; \ - w = ~ t14; \ + u32 r4; \ + \ + r1 ^= r3; r3 = ~r3; \ + r2 ^= r3; r3 ^= r0; \ + r4 = r1; r1 &= r3; \ + r1 ^= r2; 
r4 ^= r3; \ + r0 ^= r4; r2 &= r4; \ + r2 ^= r0; r0 &= r1; \ + r3 ^= r0; r4 |= r1; \ + r4 ^= r0; r0 |= r3; \ + r0 ^= r2; r2 &= r3; \ + r0 = ~r0; r4 ^= r2; \ + \ + w = r1; x = r4; y = r0; z = r3; \ } -#define SBOX4_INVERSE(a, b, c, d, w, x, y, z) \ +#define SBOX4_INVERSE(r0, r1, r2, r3, w, x, y, z) \ { \ - u32 t02, t03, t04, t05, t06, t07, t09; \ - u32 t10, t11, t12, t13, t15, t01; \ - t01 = b | d ; \ - t02 = c | d ; \ - t03 = a & t01; \ - t04 = b ^ t02; \ - t05 = c ^ d ; \ - t06 = ~ t03; \ - t07 = a & t04; \ - x = t05 ^ t07; \ - t09 = x | t06; \ - t10 = a ^ t07; \ - t11 = t01 ^ t09; \ - t12 = d ^ t04; \ - t13 = c | t10; \ - z = t03 ^ t12; \ - t15 = a ^ t04; \ - y = t11 ^ t13; \ - w = t15 ^ t09; \ + u32 r4; \ + \ + r4 = r2; r2 &= r3; \ + r2 ^= r1; r1 |= r3; \ + r1 &= r0; r4 ^= r2; \ + r4 ^= r1; r1 &= r2; \ + r0 = ~r0; r3 ^= r4; \ + r1 ^= r3; r3 &= r0; \ + r3 ^= r2; r0 ^= r1; \ + r2 &= r0; r3 ^= r0; \ + r2 ^= r4; \ + r2 |= r3; r3 ^= r0; \ + r2 ^= r1; \ + \ + w = r0; x = r3; y = r2; z = r4; \ } -#define SBOX5(a, b, c, d, w, x, y, z) \ +#define SBOX5(r0, r1, r2, r3, w, x, y, z) \ { \ - u32 t02, t03, t04, t05, t07, t08, t09; \ - u32 t10, t11, t12, t13, t14, t01; \ - t01 = b ^ d ; \ - t02 = b | d ; \ - t03 = a & t01; \ - t04 = c ^ t02; \ - t05 = t03 ^ t04; \ - w = ~ t05; \ - t07 = a ^ t01; \ - t08 = d | w ; \ - t09 = b | t05; \ - t10 = d ^ t08; \ - t11 = b | t07; \ - t12 = t03 | w ; \ - t13 = t07 | t10; \ - t14 = t01 ^ t11; \ - y = t09 ^ t13; \ - x = t07 ^ t08; \ - z = t12 ^ t14; \ + u32 r4; \ + \ + r0 ^= r1; r1 ^= r3; \ + r3 = ~r3; r4 = r1; \ + r1 &= r0; r2 ^= r3; \ + r1 ^= r2; r2 |= r4; \ + r4 ^= r3; r3 &= r1; \ + r3 ^= r0; r4 ^= r1; \ + r4 ^= r2; r2 ^= r0; \ + r0 &= r3; r2 = ~r2; \ + r0 ^= r4; r4 |= r3; \ + r2 ^= r4; \ + \ + w = r1; x = r3; y = r0; z = r2; \ } -#define SBOX5_INVERSE(a, b, c, d, w, x, y, z) \ +#define SBOX5_INVERSE(r0, r1, r2, r3, w, x, y, z) \ { \ - u32 t02, t03, t04, t05, t07, t08, t09; \ - u32 t10, t12, t13, t15, t16, t01; \ - t01 = a & d ; \ - t02 = c ^ t01; \ - t03 = a ^ d ; \ - t04 = b & t02; \ - t05 = a & c ; \ - w = t03 ^ t04; \ - t07 = a & w ; \ - t08 = t01 ^ w ; \ - t09 = b | t05; \ - t10 = ~ b ; \ - x = t08 ^ t09; \ - t12 = t10 | t07; \ - t13 = w | x ; \ - z = t02 ^ t12; \ - t15 = t02 ^ t13; \ - t16 = b ^ d ; \ - y = t16 ^ t15; \ + u32 r4; \ + \ + r1 = ~r1; r4 = r3; \ + r2 ^= r1; r3 |= r0; \ + r3 ^= r2; r2 |= r1; \ + r2 &= r0; r4 ^= r3; \ + r2 ^= r4; r4 |= r0; \ + r4 ^= r1; r1 &= r2; \ + r1 ^= r3; r4 ^= r2; \ + r3 &= r4; r4 ^= r1; \ + r3 ^= r4; r4 = ~r4; \ + r3 ^= r0; \ + \ + w = r1; x = r4; y = r3; z = r2; \ } -#define SBOX6(a, b, c, d, w, x, y, z) \ +#define SBOX6(r0, r1, r2, r3, w, x, y, z) \ { \ - u32 t02, t03, t04, t05, t07, t08, t09, t10; \ - u32 t11, t12, t13, t15, t17, t18, t01; \ - t01 = a & d ; \ - t02 = b ^ c ; \ - t03 = a ^ d ; \ - t04 = t01 ^ t02; \ - t05 = b | c ; \ - x = ~ t04; \ - t07 = t03 & t05; \ - t08 = b & x ; \ - t09 = a | c ; \ - t10 = t07 ^ t08; \ - t11 = b | d ; \ - t12 = c ^ t11; \ - t13 = t09 ^ t10; \ - y = ~ t13; \ - t15 = x & t03; \ - z = t12 ^ t07; \ - t17 = a ^ b ; \ - t18 = y ^ t15; \ - w = t17 ^ t18; \ + u32 r4; \ + \ + r2 = ~r2; r4 = r3; \ + r3 &= r0; r0 ^= r4; \ + r3 ^= r2; r2 |= r4; \ + r1 ^= r3; r2 ^= r0; \ + r0 |= r1; r2 ^= r1; \ + r4 ^= r0; r0 |= r3; \ + r0 ^= r2; r4 ^= r3; \ + r4 ^= r0; r3 = ~r3; \ + r2 &= r4; \ + r2 ^= r3; \ + \ + w = r0; x = r1; y = r4; z = r2; \ } -#define SBOX6_INVERSE(a, b, c, d, w, x, y, z) \ +#define SBOX6_INVERSE(r0, r1, r2, r3, w, x, y, z) \ { \ - u32 t02, t03, t04, t05, t06, t07, t08, t09; \ - u32 
t12, t13, t14, t15, t16, t17, t01; \ - t01 = a ^ c ; \ - t02 = ~ c ; \ - t03 = b & t01; \ - t04 = b | t02; \ - t05 = d | t03; \ - t06 = b ^ d ; \ - t07 = a & t04; \ - t08 = a | t02; \ - t09 = t07 ^ t05; \ - x = t06 ^ t08; \ - w = ~ t09; \ - t12 = b & w ; \ - t13 = t01 & t05; \ - t14 = t01 ^ t12; \ - t15 = t07 ^ t13; \ - t16 = d | t02; \ - t17 = a ^ x ; \ - z = t17 ^ t15; \ - y = t16 ^ t14; \ + u32 r4; \ + \ + r0 ^= r2; r4 = r2; \ + r2 &= r0; r4 ^= r3; \ + r2 = ~r2; r3 ^= r1; \ + r2 ^= r3; r4 |= r0; \ + r0 ^= r2; r3 ^= r4; \ + r4 ^= r1; r1 &= r3; \ + r1 ^= r0; r0 ^= r3; \ + r0 |= r2; r3 ^= r1; \ + r4 ^= r0; \ + \ + w = r1; x = r2; y = r4; z = r3; \ } -#define SBOX7(a, b, c, d, w, x, y, z) \ +#define SBOX7(r0, r1, r2, r3, w, x, y, z) \ { \ - u32 t02, t03, t04, t05, t06, t08, t09, t10; \ - u32 t11, t13, t14, t15, t16, t17, t01; \ - t01 = a & c ; \ - t02 = ~ d ; \ - t03 = a & t02; \ - t04 = b | t01; \ - t05 = a & b ; \ - t06 = c ^ t04; \ - z = t03 ^ t06; \ - t08 = c | z ; \ - t09 = d | t05; \ - t10 = a ^ t08; \ - t11 = t04 & z ; \ - x = t09 ^ t10; \ - t13 = b ^ x ; \ - t14 = t01 ^ x ; \ - t15 = c ^ t05; \ - t16 = t11 | t13; \ - t17 = t02 | t14; \ - w = t15 ^ t17; \ - y = a ^ t16; \ + u32 r4; \ + \ + r4 = r1; r1 |= r2; \ + r1 ^= r3; r4 ^= r2; \ + r2 ^= r1; r3 |= r4; \ + r3 &= r0; r4 ^= r2; \ + r3 ^= r1; r1 |= r4; \ + r1 ^= r0; r0 |= r4; \ + r0 ^= r2; r1 ^= r4; \ + r2 ^= r1; r1 &= r0; \ + r1 ^= r4; r2 = ~r2; \ + r2 |= r0; \ + r4 ^= r2; \ + \ + w = r4; x = r3; y = r1; z = r0; \ } -#define SBOX7_INVERSE(a, b, c, d, w, x, y, z) \ +#define SBOX7_INVERSE(r0, r1, r2, r3, w, x, y, z) \ { \ - u32 t02, t03, t04, t06, t07, t08, t09; \ - u32 t10, t11, t13, t14, t15, t16, t01; \ - t01 = a & b ; \ - t02 = a | b ; \ - t03 = c | t01; \ - t04 = d & t02; \ - z = t03 ^ t04; \ - t06 = b ^ t04; \ - t07 = d ^ z ; \ - t08 = ~ t07; \ - t09 = t06 | t08; \ - t10 = b ^ d ; \ - t11 = a | d ; \ - x = a ^ t09; \ - t13 = c ^ t06; \ - t14 = c & t11; \ - t15 = d | x ; \ - t16 = t01 | t10; \ - w = t13 ^ t15; \ - y = t14 ^ t16; \ + u32 r4; \ + \ + r4 = r2; r2 ^= r0; \ + r0 &= r3; r4 |= r3; \ + r2 = ~r2; r3 ^= r1; \ + r1 |= r0; r0 ^= r2; \ + r2 &= r4; r3 &= r4; \ + r1 ^= r2; r2 ^= r0; \ + r0 |= r2; r4 ^= r1; \ + r0 ^= r3; r3 ^= r4; \ + r4 |= r0; r3 ^= r2; \ + r4 ^= r2; \ + \ + w = r3; x = r0; y = r1; z = r4; \ } /* XOR BLOCK1 into BLOCK0. */ From jussi.kivilinna at iki.fi Thu May 23 10:04:18 2013 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Thu, 23 May 2013 11:04:18 +0300 Subject: [PATCH v2 2/2] serpent: add SSE2 accelerated amd64 implementation In-Reply-To: <20130523080413.24643.26348.stgit@localhost6.localdomain6> References: <20130523080413.24643.26348.stgit@localhost6.localdomain6> Message-ID: <20130523080418.24643.76202.stgit@localhost6.localdomain6> * configure.ac (serpent): Add 'serpent-sse2-amd64.lo'. * cipher/Makefile.am (EXTRA_libcipher_la_SOURCES): Add 'serpent-sse2-amd64.S'. * cipher/cipher.c (gcry_cipher_open) [USE_SERPENT]: Register bulk functions for CBC-decryption and CTR-mode. * cipher/serpent.c (USE_SSE2): New macro. [USE_SSE2] (_gcry_serpent_sse2_ctr_enc, _gcry_serpent_sse2_cbc_dec): New prototypes to assembler functions. (serpent_setkey): Set 'serpent_init_done' before calling serpent_test. (_gcry_serpent_ctr_enc): New function. (_gcry_serpent_cbc_dec): New function. (selftest_ctr_128): New function. (selftest_cbc_128): New function. (selftest): Call selftest_ctr_128 and selftest_cbc_128. * cipher/serpent-sse2-amd64.S: New file. * src/cipher.h (_gcry_serpent_ctr_enc): New prototype. 
(_gcry_serpent_cbc_dec): New prototype. -- [v2]: Converted to SSE2, to support all amd64 processors (SSE2 is required feature by AMD64 SysV ABI). Patch adds word-sliced SSE2 implementation of Serpent for amd64 for speeding up parallelizable workloads (CTR mode, CBC mode decryption). Implementation processes eight blocks in parallel, with two four-block sets interleaved for out-of-order scheduling. Speed old vs. new on Intel Core i5-2450M (Sandy-Bridge): ECB/Stream CBC CFB OFB CTR --------------- --------------- --------------- --------------- --------------- SERPENT128 1.00x 0.99x 1.00x 3.98x 1.00x 1.01x 1.00x 1.01x 4.04x 4.04x Speed old vs. new on AMD Phenom II X6 1055T: ECB/Stream CBC CFB OFB CTR --------------- --------------- --------------- --------------- --------------- SERPENT128 1.02x 1.01x 1.00x 2.83x 1.00x 1.00x 1.00x 1.00x 2.72x 2.72x Speed old vs. new on Intel Core2 Duo T8100: ECB/Stream CBC CFB OFB CTR --------------- --------------- --------------- --------------- --------------- SERPENT128 1.00x 1.02x 0.97x 4.02x 0.98x 1.01x 0.98x 1.00x 3.82x 3.91x Signed-off-by: Jussi Kivilinna --- cipher/Makefile.am | 2 cipher/cipher.c | 8 cipher/serpent-sse2-amd64.S | 826 +++++++++++++++++++++++++++++++++++++++++++ cipher/serpent.c | 219 +++++++++++ configure.ac | 7 src/cipher.h | 7 6 files changed, 1066 insertions(+), 3 deletions(-) create mode 100644 cipher/serpent-sse2-amd64.S diff --git a/cipher/Makefile.am b/cipher/Makefile.am index 0808bd2..69f1e6d 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -68,7 +68,7 @@ rmd160.c \ rsa.c \ scrypt.c \ seed.c \ -serpent.c \ +serpent.c serpent-sse2-amd64.S \ sha1.c \ sha256.c \ sha512.c \ diff --git a/cipher/cipher.c b/cipher/cipher.c index f1224af..20ac2c7 100644 --- a/cipher/cipher.c +++ b/cipher/cipher.c @@ -726,6 +726,14 @@ gcry_cipher_open (gcry_cipher_hd_t *handle, h->bulk.ctr_enc = _gcry_camellia_ctr_enc; break; #endif /*USE_CAMELLIA*/ +#ifdef USE_SERPENT + case GCRY_CIPHER_SERPENT128: + case GCRY_CIPHER_SERPENT192: + case GCRY_CIPHER_SERPENT256: + h->bulk.cbc_dec = _gcry_serpent_cbc_dec; + h->bulk.ctr_enc = _gcry_serpent_ctr_enc; + break; +#endif /*USE_SERPENT*/ default: break; diff --git a/cipher/serpent-sse2-amd64.S b/cipher/serpent-sse2-amd64.S new file mode 100644 index 0000000..8d8c8dd --- /dev/null +++ b/cipher/serpent-sse2-amd64.S @@ -0,0 +1,826 @@ +/* serpent-sse2-amd64.S - SSE2 implementation of Serpent cipher + * + * Copyright ? 2013 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . 
+ */ + +#ifdef __x86_64 +#include +#if defined(USE_SERPENT) + +#ifdef __PIC__ +# define RIP (%rip) +#else +# define RIP +#endif + +/* struct serpent_context: */ +#define ctx_keys 0 + +/* register macros */ +#define CTX %rdi + +/* vector registers */ +.set RA0, %xmm0 +.set RA1, %xmm1 +.set RA2, %xmm2 +.set RA3, %xmm3 +.set RA4, %xmm4 + +.set RB0, %xmm5 +.set RB1, %xmm6 +.set RB2, %xmm7 +.set RB3, %xmm8 +.set RB4, %xmm9 + +.set RNOT, %xmm10 +.set RTMP0, %xmm11 +.set RTMP1, %xmm12 +.set RTMP2, %xmm13 + +/********************************************************************** + helper macros + **********************************************************************/ + +/* preprocessor macro for renaming vector registers using GAS macros */ +#define sbox_reg_rename(r0, r1, r2, r3, r4, \ + new_r0, new_r1, new_r2, new_r3, new_r4) \ + .set rename_reg0, new_r0; \ + .set rename_reg1, new_r1; \ + .set rename_reg2, new_r2; \ + .set rename_reg3, new_r3; \ + .set rename_reg4, new_r4; \ + \ + .set r0, rename_reg0; \ + .set r1, rename_reg1; \ + .set r2, rename_reg2; \ + .set r3, rename_reg3; \ + .set r4, rename_reg4; + +/* vector 32-bit rotation to left */ +#define vec_rol(reg, nleft, tmp) \ + movdqa reg, tmp; \ + pslld $(nleft), tmp; \ + psrld $(32 - (nleft)), reg; \ + por tmp, reg; + +/* vector 32-bit rotation to right */ +#define vec_ror(reg, nright, tmp) \ + vec_rol(reg, 32 - nright, tmp) + +/* 4x4 32-bit integer matrix transpose */ +#define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \ + movdqa x0, t2; \ + punpckhdq x1, t2; \ + punpckldq x1, x0; \ + \ + movdqa x2, t1; \ + punpckldq x3, t1; \ + punpckhdq x3, x2; \ + \ + movdqa x0, x1; \ + punpckhqdq t1, x1; \ + punpcklqdq t1, x0; \ + \ + movdqa t2, x3; \ + punpckhqdq x2, x3; \ + punpcklqdq x2, t2; \ + movdqa t2, x2; + +/* fill xmm register with 32-bit value from memory */ +#define pbroadcastd(mem32, xreg) \ + movd mem32, xreg; \ + pshufd $0, xreg, xreg; + +/* xor with unaligned memory operand */ +#define pxor_u(umem128, xreg, t) \ + movdqu umem128, t; \ + pxor t, xreg; + +/* 128-bit wide byte swap */ +#define pbswap(xreg, t0) \ + /* reorder 32-bit words, [a,b,c,d] => [d,c,b,a] */ \ + pshufd $0x1b, xreg, xreg; \ + /* reorder high&low 16-bit words, [d0,d1,c0,c1] => [d1,d0,c1,c0] */ \ + pshuflw $0xb1, xreg, xreg; \ + pshufhw $0xb1, xreg, xreg; \ + /* reorder bytes in 16-bit words */ \ + movdqa xreg, t0; \ + psrlw $8, t0; \ + psllw $8, xreg; \ + por t0, xreg; + +/********************************************************************** + 8-way serpent + **********************************************************************/ + +/* + * These are the S-Boxes of Serpent from following research paper. + * + * D. A. Osvik, ?Speeding up Serpent,? in Third AES Candidate Conference, + * (New York, New York, USA), p. 317?329, National Institute of Standards and + * Technology, 2000. 
+ * + * Paper is also available at: http://www.ii.uib.no/~osvik/pub/aes3.pdf + * + */ +#define SBOX0(r0, r1, r2, r3, r4) \ + pxor r0, r3; movdqa r1, r4; \ + pand r3, r1; pxor r2, r4; \ + pxor r0, r1; por r3, r0; \ + pxor r4, r0; pxor r3, r4; \ + pxor r2, r3; por r1, r2; \ + pxor r4, r2; pxor RNOT, r4; \ + por r1, r4; pxor r3, r1; \ + pxor r4, r1; por r0, r3; \ + pxor r3, r1; pxor r3, r4; \ + \ + sbox_reg_rename(r0,r1,r2,r3,r4, r1,r4,r2,r0,r3); + +#define SBOX0_INVERSE(r0, r1, r2, r3, r4) \ + pxor RNOT, r2; movdqa r1, r4; \ + por r0, r1; pxor RNOT, r4; \ + pxor r2, r1; por r4, r2; \ + pxor r3, r1; pxor r4, r0; \ + pxor r0, r2; pand r3, r0; \ + pxor r0, r4; por r1, r0; \ + pxor r2, r0; pxor r4, r3; \ + pxor r1, r2; pxor r0, r3; \ + pxor r1, r3; \ + pand r3, r2; \ + pxor r2, r4; \ + \ + sbox_reg_rename(r0,r1,r2,r3,r4, r0,r4,r1,r3,r2); + +#define SBOX1(r0, r1, r2, r3, r4) \ + pxor RNOT, r0; pxor RNOT, r2; \ + movdqa r0, r4; pand r1, r0; \ + pxor r0, r2; por r3, r0; \ + pxor r2, r3; pxor r0, r1; \ + pxor r4, r0; por r1, r4; \ + pxor r3, r1; por r0, r2; \ + pand r4, r2; pxor r1, r0; \ + pand r2, r1; \ + pxor r0, r1; pand r2, r0; \ + pxor r4, r0; \ + \ + sbox_reg_rename(r0,r1,r2,r3,r4, r2,r0,r3,r1,r4); + +#define SBOX1_INVERSE(r0, r1, r2, r3, r4) \ + movdqa r1, r4; pxor r3, r1; \ + pand r1, r3; pxor r2, r4; \ + pxor r0, r3; por r1, r0; \ + pxor r3, r2; pxor r4, r0; \ + por r2, r0; pxor r3, r1; \ + pxor r1, r0; por r3, r1; \ + pxor r0, r1; pxor RNOT, r4; \ + pxor r1, r4; por r0, r1; \ + pxor r0, r1; \ + por r4, r1; \ + pxor r1, r3; \ + \ + sbox_reg_rename(r0,r1,r2,r3,r4, r4,r0,r3,r2,r1); + +#define SBOX2(r0, r1, r2, r3, r4) \ + movdqa r0, r4; pand r2, r0; \ + pxor r3, r0; pxor r1, r2; \ + pxor r0, r2; por r4, r3; \ + pxor r1, r3; pxor r2, r4; \ + movdqa r3, r1; por r4, r3; \ + pxor r0, r3; pand r1, r0; \ + pxor r0, r4; pxor r3, r1; \ + pxor r4, r1; pxor RNOT, r4; \ + \ + sbox_reg_rename(r0,r1,r2,r3,r4, r2,r3,r1,r4,r0); + +#define SBOX2_INVERSE(r0, r1, r2, r3, r4) \ + pxor r3, r2; pxor r0, r3; \ + movdqa r3, r4; pand r2, r3; \ + pxor r1, r3; por r2, r1; \ + pxor r4, r1; pand r3, r4; \ + pxor r3, r2; pand r0, r4; \ + pxor r2, r4; pand r1, r2; \ + por r0, r2; pxor RNOT, r3; \ + pxor r3, r2; pxor r3, r0; \ + pand r1, r0; pxor r4, r3; \ + pxor r0, r3; \ + \ + sbox_reg_rename(r0,r1,r2,r3,r4, r1,r4,r2,r3,r0); + +#define SBOX3(r0, r1, r2, r3, r4) \ + movdqa r0, r4; por r3, r0; \ + pxor r1, r3; pand r4, r1; \ + pxor r2, r4; pxor r3, r2; \ + pand r0, r3; por r1, r4; \ + pxor r4, r3; pxor r1, r0; \ + pand r0, r4; pxor r3, r1; \ + pxor r2, r4; por r0, r1; \ + pxor r2, r1; pxor r3, r0; \ + movdqa r1, r2; por r3, r1; \ + pxor r0, r1; \ + \ + sbox_reg_rename(r0,r1,r2,r3,r4, r1,r2,r3,r4,r0); + +#define SBOX3_INVERSE(r0, r1, r2, r3, r4) \ + movdqa r2, r4; pxor r1, r2; \ + pxor r2, r0; pand r2, r4; \ + pxor r0, r4; pand r1, r0; \ + pxor r3, r1; por r4, r3; \ + pxor r3, r2; pxor r3, r0; \ + pxor r4, r1; pand r2, r3; \ + pxor r1, r3; pxor r0, r1; \ + por r2, r1; pxor r3, r0; \ + pxor r4, r1; \ + pxor r1, r0; \ + \ + sbox_reg_rename(r0,r1,r2,r3,r4, r2,r1,r3,r0,r4); + +#define SBOX4(r0, r1, r2, r3, r4) \ + pxor r3, r1; pxor RNOT, r3; \ + pxor r3, r2; pxor r0, r3; \ + movdqa r1, r4; pand r3, r1; \ + pxor r2, r1; pxor r3, r4; \ + pxor r4, r0; pand r4, r2; \ + pxor r0, r2; pand r1, r0; \ + pxor r0, r3; por r1, r4; \ + pxor r0, r4; por r3, r0; \ + pxor r2, r0; pand r3, r2; \ + pxor RNOT, r0; pxor r2, r4; \ + \ + sbox_reg_rename(r0,r1,r2,r3,r4, r1,r4,r0,r3,r2); + +#define SBOX4_INVERSE(r0, r1, r2, r3, r4) \ + movdqa r2, r4; 
pand r3, r2; \ + pxor r1, r2; por r3, r1; \ + pand r0, r1; pxor r2, r4; \ + pxor r1, r4; pand r2, r1; \ + pxor RNOT, r0; pxor r4, r3; \ + pxor r3, r1; pand r0, r3; \ + pxor r2, r3; pxor r1, r0; \ + pand r0, r2; pxor r0, r3; \ + pxor r4, r2; \ + por r3, r2; pxor r0, r3; \ + pxor r1, r2; \ + \ + sbox_reg_rename(r0,r1,r2,r3,r4, r0,r3,r2,r4,r1); + +#define SBOX5(r0, r1, r2, r3, r4) \ + pxor r1, r0; pxor r3, r1; \ + pxor RNOT, r3; movdqa r1, r4; \ + pand r0, r1; pxor r3, r2; \ + pxor r2, r1; por r4, r2; \ + pxor r3, r4; pand r1, r3; \ + pxor r0, r3; pxor r1, r4; \ + pxor r2, r4; pxor r0, r2; \ + pand r3, r0; pxor RNOT, r2; \ + pxor r4, r0; por r3, r4; \ + pxor r4, r2; \ + \ + sbox_reg_rename(r0,r1,r2,r3,r4, r1,r3,r0,r2,r4); + +#define SBOX5_INVERSE(r0, r1, r2, r3, r4) \ + pxor RNOT, r1; movdqa r3, r4; \ + pxor r1, r2; por r0, r3; \ + pxor r2, r3; por r1, r2; \ + pand r0, r2; pxor r3, r4; \ + pxor r4, r2; por r0, r4; \ + pxor r1, r4; pand r2, r1; \ + pxor r3, r1; pxor r2, r4; \ + pand r4, r3; pxor r1, r4; \ + pxor r4, r3; pxor RNOT, r4; \ + pxor r0, r3; \ + \ + sbox_reg_rename(r0,r1,r2,r3,r4, r1,r4,r3,r2,r0); + +#define SBOX6(r0, r1, r2, r3, r4) \ + pxor RNOT, r2; movdqa r3, r4; \ + pand r0, r3; pxor r4, r0; \ + pxor r2, r3; por r4, r2; \ + pxor r3, r1; pxor r0, r2; \ + por r1, r0; pxor r1, r2; \ + pxor r0, r4; por r3, r0; \ + pxor r2, r0; pxor r3, r4; \ + pxor r0, r4; pxor RNOT, r3; \ + pand r4, r2; \ + pxor r3, r2; \ + \ + sbox_reg_rename(r0,r1,r2,r3,r4, r0,r1,r4,r2,r3); + +#define SBOX6_INVERSE(r0, r1, r2, r3, r4) \ + pxor r2, r0; movdqa r2, r4; \ + pand r0, r2; pxor r3, r4; \ + pxor RNOT, r2; pxor r1, r3; \ + pxor r3, r2; por r0, r4; \ + pxor r2, r0; pxor r4, r3; \ + pxor r1, r4; pand r3, r1; \ + pxor r0, r1; pxor r3, r0; \ + por r2, r0; pxor r1, r3; \ + pxor r0, r4; \ + \ + sbox_reg_rename(r0,r1,r2,r3,r4, r1,r2,r4,r3,r0); + +#define SBOX7(r0, r1, r2, r3, r4) \ + movdqa r1, r4; por r2, r1; \ + pxor r3, r1; pxor r2, r4; \ + pxor r1, r2; por r4, r3; \ + pand r0, r3; pxor r2, r4; \ + pxor r1, r3; por r4, r1; \ + pxor r0, r1; por r4, r0; \ + pxor r2, r0; pxor r4, r1; \ + pxor r1, r2; pand r0, r1; \ + pxor r4, r1; pxor RNOT, r2; \ + por r0, r2; \ + pxor r2, r4; \ + \ + sbox_reg_rename(r0,r1,r2,r3,r4, r4,r3,r1,r0,r2); + +#define SBOX7_INVERSE(r0, r1, r2, r3, r4) \ + movdqa r2, r4; pxor r0, r2; \ + pand r3, r0; por r3, r4; \ + pxor RNOT, r2; pxor r1, r3; \ + por r0, r1; pxor r2, r0; \ + pand r4, r2; pand r4, r3; \ + pxor r2, r1; pxor r0, r2; \ + por r2, r0; pxor r1, r4; \ + pxor r3, r0; pxor r4, r3; \ + por r0, r4; pxor r2, r3; \ + pxor r2, r4; \ + \ + sbox_reg_rename(r0,r1,r2,r3,r4, r3,r0,r1,r4,r2); + +/* Apply SBOX number WHICH to to the block. */ +#define SBOX(which, r0, r1, r2, r3, r4) \ + SBOX##which (r0, r1, r2, r3, r4) + +/* Apply inverse SBOX number WHICH to to the block. */ +#define SBOX_INVERSE(which, r0, r1, r2, r3, r4) \ + SBOX##which##_INVERSE (r0, r1, r2, r3, r4) + +/* XOR round key into block state in r0,r1,r2,r3. r4 used as temporary. */ +#define BLOCK_XOR_KEY(r0, r1, r2, r3, r4, round) \ + pbroadcastd ((ctx_keys + (round) * 16 + 0 * 4)(CTX), r4); \ + pxor r4, r0; \ + pbroadcastd ((ctx_keys + (round) * 16 + 1 * 4)(CTX), r4); \ + pxor r4, r1; \ + pbroadcastd ((ctx_keys + (round) * 16 + 2 * 4)(CTX), r4); \ + pxor r4, r2; \ + pbroadcastd ((ctx_keys + (round) * 16 + 3 * 4)(CTX), r4); \ + pxor r4, r3; + +/* Apply the linear transformation to BLOCK. 
*/ +#define LINEAR_TRANSFORMATION(r0, r1, r2, r3, r4) \ + vec_rol(r0, 13, r4); \ + vec_rol(r2, 3, r4); \ + pxor r0, r1; \ + pxor r2, r1; \ + movdqa r0, r4; \ + pslld $3, r4; \ + pxor r2, r3; \ + pxor r4, r3; \ + vec_rol(r1, 1, r4); \ + vec_rol(r3, 7, r4); \ + pxor r1, r0; \ + pxor r3, r0; \ + movdqa r1, r4; \ + pslld $7, r4; \ + pxor r3, r2; \ + pxor r4, r2; \ + vec_rol(r0, 5, r4); \ + vec_rol(r2, 22, r4); + +/* Apply the inverse linear transformation to BLOCK. */ +#define LINEAR_TRANSFORMATION_INVERSE(r0, r1, r2, r3, r4) \ + vec_ror(r2, 22, r4); \ + vec_ror(r0, 5, r4); \ + movdqa r1, r4; \ + pslld $7, r4; \ + pxor r3, r2; \ + pxor r4, r2; \ + pxor r1, r0; \ + pxor r3, r0; \ + vec_ror(r3, 7, r4); \ + vec_ror(r1, 1, r4); \ + movdqa r0, r4; \ + pslld $3, r4; \ + pxor r2, r3; \ + pxor r4, r3; \ + pxor r0, r1; \ + pxor r2, r1; \ + vec_ror(r2, 3, r4); \ + vec_ror(r0, 13, r4); + +/* Apply a Serpent round to eight parallel blocks. This macro increments + `round'. */ +#define ROUND(which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \ + BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \ + SBOX (which, a0, a1, a2, a3, a4); \ + BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \ + SBOX (which, b0, b1, b2, b3, b4); \ + LINEAR_TRANSFORMATION (a0, a1, a2, a3, a4); \ + LINEAR_TRANSFORMATION (b0, b1, b2, b3, b4); \ + .set round, (round + 1); + +/* Apply the last Serpent round to eight parallel blocks. This macro increments + `round'. */ +#define ROUND_LAST(which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \ + BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \ + SBOX (which, a0, a1, a2, a3, a4); \ + BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \ + SBOX (which, b0, b1, b2, b3, b4); \ + .set round, (round + 1); \ + BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \ + BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \ + .set round, (round + 1); + +/* Apply an inverse Serpent round to eight parallel blocks. This macro + increments `round'. */ +#define ROUND_INVERSE(which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \ + LINEAR_TRANSFORMATION_INVERSE (a0, a1, a2, a3, a4); \ + LINEAR_TRANSFORMATION_INVERSE (b0, b1, b2, b3, b4); \ + SBOX_INVERSE (which, a0, a1, a2, a3, a4); \ + BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \ + SBOX_INVERSE (which, b0, b1, b2, b3, b4); \ + BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \ + .set round, (round - 1); + +/* Apply the first inverse Serpent round to eight parallel blocks. This macro + increments `round'. 
*/ +#define ROUND_FIRST_INVERSE(which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \ + BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \ + BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \ + .set round, (round - 1); \ + SBOX_INVERSE (which, a0, a1, a2, a3, a4); \ + BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \ + SBOX_INVERSE (which, b0, b1, b2, b3, b4); \ + BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \ + .set round, (round - 1); + +.text + +.align 8 +.type __serpent_enc_blk8, at function; +__serpent_enc_blk8: + /* input: + * %rdi: ctx, CTX + * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel plaintext + * blocks + * output: + * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel + * ciphertext blocks + */ + + /* record input vector names for __serpent_enc_blk8 */ + .set enc_in_a0, RA0 + .set enc_in_a1, RA1 + .set enc_in_a2, RA2 + .set enc_in_a3, RA3 + .set enc_in_b0, RB0 + .set enc_in_b1, RB1 + .set enc_in_b2, RB2 + .set enc_in_b3, RB3 + + pcmpeqd RNOT, RNOT; + + transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1); + transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1); + + .set round, 0 + ROUND (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + + ROUND_LAST (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + + transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1); + transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1); + + /* record output vector names for __serpent_enc_blk8 */ + .set enc_out_a0, RA0 + .set enc_out_a1, RA1 + .set enc_out_a2, RA2 + .set enc_out_a3, RA3 + .set enc_out_b0, RB0 + .set enc_out_b1, RB1 + .set enc_out_b2, RB2 + .set enc_out_b3, RB3 + + ret; +.size __serpent_enc_blk8,.-__serpent_enc_blk8; + +.align 8 
+.type __serpent_dec_blk8, at function; +__serpent_dec_blk8: + /* input: + * %rdi: ctx, CTX + * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel + * ciphertext blocks + * output: + * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel plaintext + * blocks + */ + + /* record input vector names for __serpent_dec_blk8 */ + .set dec_in_a0, RA0 + .set dec_in_a1, RA1 + .set dec_in_a2, RA2 + .set dec_in_a3, RA3 + .set dec_in_b0, RB0 + .set dec_in_b1, RB1 + .set dec_in_b2, RB2 + .set dec_in_b3, RB3 + + pcmpeqd RNOT, RNOT; + + transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1); + transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1); + + .set round, 32 + ROUND_FIRST_INVERSE (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + + ROUND_INVERSE (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND_INVERSE (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND_INVERSE (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND_INVERSE (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND_INVERSE (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND_INVERSE (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND_INVERSE (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND_INVERSE (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND_INVERSE (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND_INVERSE (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND_INVERSE (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND_INVERSE (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND_INVERSE (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND_INVERSE (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND_INVERSE (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND_INVERSE (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND_INVERSE (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND_INVERSE (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND_INVERSE (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND_INVERSE (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND_INVERSE (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND_INVERSE (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND_INVERSE (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND_INVERSE (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND_INVERSE (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND_INVERSE (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND_INVERSE (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND_INVERSE (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND_INVERSE (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND_INVERSE (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + ROUND_INVERSE (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4); + + transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1); + transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1); + + /* record output vector names for __serpent_dec_blk8 */ + .set dec_out_a0, RA0 + .set dec_out_a1, RA1 + .set dec_out_a2, RA2 + .set dec_out_a3, RA3 + .set dec_out_b0, RB0 + .set dec_out_b1, RB1 + .set dec_out_b2, RB2 + .set dec_out_b3, RB3 + + ret; +.size __serpent_dec_blk8,.-__serpent_dec_blk8; + +.align 8 +.global _gcry_serpent_sse2_ctr_enc +.type _gcry_serpent_sse2_ctr_enc, at function; +_gcry_serpent_sse2_ctr_enc: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (8 blocks) + * %rdx: src (8 
blocks) + * %rcx: iv (big endian, 128bit) + */ + + .set RA0, enc_in_a0 + .set RA1, enc_in_a1 + .set RA2, enc_in_a2 + .set RA3, enc_in_a3 + .set RB0, enc_in_b0 + .set RB1, enc_in_b1 + .set RB2, enc_in_b2 + .set RB3, enc_in_b3 + + /* load IV and byteswap */ + movdqu (%rcx), RA0; + movdqa RA0, RTMP0; + pbswap(RTMP0, RTMP1); /* be => le */ + + pcmpeqd RNOT, RNOT; + psrldq $8, RNOT; /* low: -1, high: 0 */ + movdqa RNOT, RTMP2; + paddq RTMP2, RTMP2; /* low: -2, high: 0 */ + + /* construct IVs */ + movdqa RTMP0, RTMP1; + psubq RNOT, RTMP0; /* +1 */ + movdqa RTMP0, RA1; + psubq RTMP2, RTMP1; /* +2 */ + movdqa RTMP1, RA2; + psubq RTMP2, RTMP0; /* +3 */ + movdqa RTMP0, RA3; + psubq RTMP2, RTMP1; /* +4 */ + movdqa RTMP1, RB0; + psubq RTMP2, RTMP0; /* +5 */ + movdqa RTMP0, RB1; + psubq RTMP2, RTMP1; /* +6 */ + movdqa RTMP1, RB2; + psubq RTMP2, RTMP0; /* +7 */ + movdqa RTMP0, RB3; + psubq RTMP2, RTMP1; /* +8 */ + + /* check need for handling 64-bit overflow and carry */ + cmpl $0xffffffff, 8(%rcx); + jne .Lno_ctr_carry; + + movl 12(%rcx), %eax; + bswapl %eax; + cmpl $-8, %eax; + jb .Lno_ctr_carry; + pslldq $8, RNOT; /* low: 0, high: -1 */ + je .Lcarry_RTMP0; + + cmpl $-6, %eax; + jb .Lcarry_RB3; + je .Lcarry_RB2; + + cmpl $-4, %eax; + jb .Lcarry_RB1; + je .Lcarry_RB0; + + cmpl $-2, %eax; + jb .Lcarry_RA3; + je .Lcarry_RA2; + + psubq RNOT, RA1; +.Lcarry_RA2: + psubq RNOT, RA2; +.Lcarry_RA3: + psubq RNOT, RA3; +.Lcarry_RB0: + psubq RNOT, RB0; +.Lcarry_RB1: + psubq RNOT, RB1; +.Lcarry_RB2: + psubq RNOT, RB2; +.Lcarry_RB3: + psubq RNOT, RB3; +.Lcarry_RTMP0: + psubq RNOT, RTMP1; + +.Lno_ctr_carry: + /* le => be */ + pbswap(RA1, RTMP0); + pbswap(RA2, RTMP0); + pbswap(RA3, RTMP0); + pbswap(RB0, RTMP0); + pbswap(RB1, RTMP0); + pbswap(RB2, RTMP0); + pbswap(RB3, RTMP0); + pbswap(RTMP1, RTMP0); + /* store new IV */ + movdqu RTMP1, (%rcx); + + call __serpent_enc_blk8; + + .set RA0, enc_out_a0 + .set RA1, enc_out_a1 + .set RA2, enc_out_a2 + .set RA3, enc_out_a3 + .set RB0, enc_out_b0 + .set RB1, enc_out_b1 + .set RB2, enc_out_b2 + .set RB3, enc_out_b3 + + pxor_u((0 * 16)(%rdx), RA0, RTMP0); + pxor_u((1 * 16)(%rdx), RA1, RTMP0); + pxor_u((2 * 16)(%rdx), RA2, RTMP0); + pxor_u((3 * 16)(%rdx), RA3, RTMP0); + pxor_u((4 * 16)(%rdx), RB0, RTMP0); + pxor_u((5 * 16)(%rdx), RB1, RTMP0); + pxor_u((6 * 16)(%rdx), RB2, RTMP0); + pxor_u((7 * 16)(%rdx), RB3, RTMP0); + + movdqu RA0, (0 * 16)(%rsi); + movdqu RA1, (1 * 16)(%rsi); + movdqu RA2, (2 * 16)(%rsi); + movdqu RA3, (3 * 16)(%rsi); + movdqu RB0, (4 * 16)(%rsi); + movdqu RB1, (5 * 16)(%rsi); + movdqu RB2, (6 * 16)(%rsi); + movdqu RB3, (7 * 16)(%rsi); + + ret +.size _gcry_serpent_sse2_ctr_enc,.-_gcry_serpent_sse2_ctr_enc; + +.align 8 +.global _gcry_serpent_sse2_cbc_dec +.type _gcry_serpent_sse2_cbc_dec, at function; +_gcry_serpent_sse2_cbc_dec: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (8 blocks) + * %rdx: src (8 blocks) + * %rcx: iv + */ + + .set RA0, dec_in_a0 + .set RA1, dec_in_a1 + .set RA2, dec_in_a2 + .set RA3, dec_in_a3 + .set RB0, dec_in_b0 + .set RB1, dec_in_b1 + .set RB2, dec_in_b2 + .set RB3, dec_in_b3 + + movdqu (0 * 16)(%rdx), RA0; + movdqu (1 * 16)(%rdx), RA1; + movdqu (2 * 16)(%rdx), RA2; + movdqu (3 * 16)(%rdx), RA3; + movdqu (4 * 16)(%rdx), RB0; + movdqu (5 * 16)(%rdx), RB1; + movdqu (6 * 16)(%rdx), RB2; + movdqu (7 * 16)(%rdx), RB3; + + call __serpent_dec_blk8; + + .set RA0, dec_out_a0 + .set RA1, dec_out_a1 + .set RA2, dec_out_a2 + .set RA3, dec_out_a3 + .set RB0, dec_out_b0 + .set RB1, dec_out_b1 + .set RB2, dec_out_b2 + .set RB3, dec_out_b3 + + movdqu (7 
* 16)(%rdx), RNOT; + pxor_u((%rcx), RA0, RTMP0); + pxor_u((0 * 16)(%rdx), RA1, RTMP0); + pxor_u((1 * 16)(%rdx), RA2, RTMP0); + pxor_u((2 * 16)(%rdx), RA3, RTMP0); + pxor_u((3 * 16)(%rdx), RB0, RTMP0); + pxor_u((4 * 16)(%rdx), RB1, RTMP0); + pxor_u((5 * 16)(%rdx), RB2, RTMP0); + pxor_u((6 * 16)(%rdx), RB3, RTMP0); + movdqu RNOT, (%rcx); /* store new IV */ + + movdqu RA0, (0 * 16)(%rsi); + movdqu RA1, (1 * 16)(%rsi); + movdqu RA2, (2 * 16)(%rsi); + movdqu RA3, (3 * 16)(%rsi); + movdqu RB0, (4 * 16)(%rsi); + movdqu RB1, (5 * 16)(%rsi); + movdqu RB2, (6 * 16)(%rsi); + movdqu RB3, (7 * 16)(%rsi); + + ret +.size _gcry_serpent_sse2_cbc_dec,.-_gcry_serpent_sse2_cbc_dec; + +#endif /*defined(USE_SERPENT)*/ +#endif /*__x86_64*/ diff --git a/cipher/serpent.c b/cipher/serpent.c index 72840cf..7b82b48 100644 --- a/cipher/serpent.c +++ b/cipher/serpent.c @@ -28,6 +28,15 @@ #include "g10lib.h" #include "cipher.h" #include "bithelp.h" +#include "bufhelp.h" +#include "cipher-selftest.h" + + +/* USE_SSE2 indicates whether to compile with AMD64 SSE2 code. */ +#undef USE_SSE2 +#if defined(__x86_64__) +# define USE_SSE2 1 +#endif /* Number of rounds per Serpent encrypt/decrypt operation. */ #define ROUNDS 32 @@ -52,6 +61,21 @@ typedef struct serpent_context } serpent_context_t; +#ifdef USE_SSE2 +/* Assembler implementations of Serpent using SSE2. Process 8 block in + parallel. + */ +extern void _gcry_serpent_sse2_ctr_enc(serpent_context_t *ctx, + unsigned char *out, + const unsigned char *in, + unsigned char *ctr); + +extern void _gcry_serpent_sse2_cbc_dec(serpent_context_t *ctx, + unsigned char *out, + const unsigned char *in, + unsigned char *iv); +#endif + /* A prototype. */ static const char *serpent_test (void); @@ -191,7 +215,7 @@ static const char *serpent_test (void); r4 &= r0; r1 ^= r3; \ r4 ^= r2; r1 |= r0; \ r1 ^= r2; r0 ^= r3; \ - r2 = r1; r1 |= r3; \ + r2 = r1; r1 |= r3; \ r1 ^= r0; \ \ w = r1; x = r2; y = r3; z = r4; \ @@ -587,10 +611,10 @@ serpent_setkey (void *ctx, if (! serpent_init_done) { /* Execute a self-test the first time, Serpent is used. */ + serpent_init_done = 1; serpent_test_ret = serpent_test (); if (serpent_test_ret) log_error ("Serpent test failure: %s\n", serpent_test_ret); - serpent_init_done = 1; } if (serpent_test_ret) @@ -740,6 +764,190 @@ serpent_decrypt (void *ctx, byte *buffer_out, const byte *buffer_in) +/* Bulk encryption of complete blocks in CTR mode. This function is only + intended for the bulk encryption feature of cipher.c. CTR is expected to be + of size sizeof(serpent_block_t). */ +void +_gcry_serpent_ctr_enc(void *context, unsigned char *ctr, + void *outbuf_arg, const void *inbuf_arg, + unsigned int nblocks) +{ + serpent_context_t *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + unsigned char tmpbuf[sizeof(serpent_block_t)]; + int burn_stack_depth = 2 * sizeof (serpent_block_t); + int i; + +#ifdef USE_SSE2 + { + int did_use_sse2 = 0; + + /* Process data in 8 block chunks. 
*/ + while (nblocks >= 8) + { + _gcry_serpent_sse2_ctr_enc(ctx, outbuf, inbuf, ctr); + + nblocks -= 8; + outbuf += 8 * sizeof(serpent_block_t); + inbuf += 8 * sizeof(serpent_block_t); + did_use_sse2 = 1; + } + + if (did_use_sse2) + { + /* clear SSE2 registers used by serpent-sse2 */ + asm volatile ( + "pxor %%xmm0, %%xmm0;\n" + "pxor %%xmm1, %%xmm1;\n" + "pxor %%xmm2, %%xmm2;\n" + "pxor %%xmm3, %%xmm3;\n" + "pxor %%xmm4, %%xmm4;\n" + "pxor %%xmm5, %%xmm5;\n" + "pxor %%xmm6, %%xmm6;\n" + "pxor %%xmm7, %%xmm7;\n" + "pxor %%xmm10, %%xmm10;\n" + "pxor %%xmm11, %%xmm11;\n" + "pxor %%xmm12, %%xmm12;\n" + "pxor %%xmm13, %%xmm13;\n" + :::); + + /* serpent-sse2 assembly code does not use stack */ + if (nblocks == 0) + burn_stack_depth = 0; + } + + /* Use generic code to handle smaller chunks... */ + /* TODO: use caching instead? */ + } +#endif + + for ( ;nblocks; nblocks-- ) + { + /* Encrypt the counter. */ + serpent_encrypt_internal(ctx, ctr, tmpbuf); + /* XOR the input with the encrypted counter and store in output. */ + buf_xor(outbuf, tmpbuf, inbuf, sizeof(serpent_block_t)); + outbuf += sizeof(serpent_block_t); + inbuf += sizeof(serpent_block_t); + /* Increment the counter. */ + for (i = sizeof(serpent_block_t); i > 0; i--) + { + ctr[i-1]++; + if (ctr[i-1]) + break; + } + } + + wipememory(tmpbuf, sizeof(tmpbuf)); + _gcry_burn_stack(burn_stack_depth); +} + +/* Bulk decryption of complete blocks in CBC mode. This function is only + intended for the bulk encryption feature of cipher.c. */ +void +_gcry_serpent_cbc_dec(void *context, unsigned char *iv, + void *outbuf_arg, const void *inbuf_arg, + unsigned int nblocks) +{ + serpent_context_t *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + unsigned char savebuf[sizeof(serpent_block_t)]; + int burn_stack_depth = 2 * sizeof (serpent_block_t); + +#ifdef USE_SSE2 + { + int did_use_sse2 = 0; + + /* Process data in 8 block chunks. */ + while (nblocks >= 8) + { + _gcry_serpent_sse2_cbc_dec(ctx, outbuf, inbuf, iv); + + nblocks -= 8; + outbuf += 8 * sizeof(serpent_block_t); + inbuf += 8 * sizeof(serpent_block_t); + did_use_sse2 = 1; + } + + if (did_use_sse2) + { + /* clear SSE2 registers used by serpent-sse2 */ + asm volatile ( + "pxor %%xmm0, %%xmm0;\n" + "pxor %%xmm1, %%xmm1;\n" + "pxor %%xmm2, %%xmm2;\n" + "pxor %%xmm3, %%xmm3;\n" + "pxor %%xmm4, %%xmm4;\n" + "pxor %%xmm5, %%xmm5;\n" + "pxor %%xmm6, %%xmm6;\n" + "pxor %%xmm7, %%xmm7;\n" + "pxor %%xmm10, %%xmm10;\n" + "pxor %%xmm11, %%xmm11;\n" + "pxor %%xmm12, %%xmm12;\n" + "pxor %%xmm13, %%xmm13;\n" + :::); + + /* serpent-sse2 assembly code does not use stack */ + if (nblocks == 0) + burn_stack_depth = 0; + } + + /* Use generic code to handle smaller chunks... */ + } +#endif + + for ( ;nblocks; nblocks-- ) + { + /* We need to save INBUF away because it may be identical to + OUTBUF. */ + memcpy(savebuf, inbuf, sizeof(serpent_block_t)); + + serpent_decrypt_internal (ctx, inbuf, outbuf); + + buf_xor(outbuf, outbuf, iv, sizeof(serpent_block_t)); + memcpy(iv, savebuf, sizeof(serpent_block_t)); + inbuf += sizeof(serpent_block_t); + outbuf += sizeof(serpent_block_t); + } + + wipememory(savebuf, sizeof(savebuf)); + _gcry_burn_stack(burn_stack_depth); +} + + + +/* Run the self-tests for SERPENT-CTR-128, tests IV increment of bulk CTR + encryption. Returns NULL on success. 
*/ +static const char* +selftest_ctr_128 (void) +{ + const int nblocks = 8+1; + const int blocksize = sizeof(serpent_block_t); + const int context_size = sizeof(serpent_context_t); + + return _gcry_selftest_helper_ctr_128("SERPENT", &serpent_setkey, + &serpent_encrypt, &_gcry_serpent_ctr_enc, nblocks, blocksize, + context_size); +} + + +/* Run the self-tests for SERPENT-CBC-128, tests bulk CBC decryption. + Returns NULL on success. */ +static const char* +selftest_cbc_128 (void) +{ + const int nblocks = 8+2; + const int blocksize = sizeof(serpent_block_t); + const int context_size = sizeof(serpent_context_t); + + return _gcry_selftest_helper_cbc_128("SERPENT", &serpent_setkey, + &serpent_encrypt, &_gcry_serpent_cbc_dec, nblocks, blocksize, + context_size); +} + + /* Serpent test. */ static const char * @@ -748,6 +956,7 @@ serpent_test (void) serpent_context_t context; unsigned char scratch[16]; unsigned int i; + const char *r; static struct test { @@ -819,6 +1028,12 @@ serpent_test (void) } } + if ( (r = selftest_ctr_128 ()) ) + return r; + + if ( (r = selftest_cbc_128 ()) ) + return r; + return NULL; } diff --git a/configure.ac b/configure.ac index f14e28a..3fec8bc 100644 --- a/configure.ac +++ b/configure.ac @@ -1219,6 +1219,13 @@ LIST_MEMBER(serpent, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS serpent.lo" AC_DEFINE(USE_SERPENT, 1, [Defined if this module should be included]) + + case "${host}" in + x86_64-*-*) + # Build with the SSE2 implementation + GCRYPT_CIPHERS="$GCRYPT_CIPHERS serpent-sse2-amd64.lo" + ;; + esac fi LIST_MEMBER(rfc2268, $enabled_ciphers) diff --git a/src/cipher.h b/src/cipher.h index 17fec6c..4e68487 100644 --- a/src/cipher.h +++ b/src/cipher.h @@ -103,6 +103,13 @@ void _gcry_camellia_cbc_dec (void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, unsigned int nblocks); +/*-- serpent.c --*/ +void _gcry_serpent_ctr_enc (void *context, unsigned char *ctr, + void *outbuf_arg, const void *inbuf_arg, + unsigned int nblocks); +void _gcry_serpent_cbc_dec (void *context, unsigned char *iv, + void *outbuf_arg, const void *inbuf_arg, + unsigned int nblocks); /*-- dsa.c --*/ void _gcry_register_pk_dsa_progress (gcry_handler_progress_t cbc, void *cb_data); From wk at gnupg.org Thu May 23 12:25:41 2013 From: wk at gnupg.org (Werner Koch) Date: Thu, 23 May 2013 12:25:41 +0200 Subject: [PATCH v2 1/2] Serpent: faster S-box implementation In-Reply-To: <20130523080413.24643.26348.stgit@localhost6.localdomain6> (Jussi Kivilinna's message of "Thu, 23 May 2013 11:04:13 +0300") References: <20130523080413.24643.26348.stgit@localhost6.localdomain6> Message-ID: <87obc256xm.fsf@vigenere.g10code.de> Hi, pushed. Thanks. Shalom-Salam, Werner -- Die Gedanken sind frei. Ausnahmen regelt ein Bundesgesetz. From jussi.kivilinna at iki.fi Thu May 23 13:15:41 2013 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Thu, 23 May 2013 14:15:41 +0300 Subject: [PATCH 1/3] rinjdael: add parallel processing for CFB decryption with AES-NI Message-ID: <20130523111541.30047.829.stgit@localhost6.localdomain6> * cipher/cipher-selftest.c (_gcry_selftest_helper_cfb_128): New function for CFB selftests. * cipher/cipher-selftest.h (_gcry_selftest_helper_cfb_128): New prototype. * cipher/rijndael.c [USE_AESNI] (do_aesni_enc_vec4): New function. (_gcry_aes_cfb_dec) [USE_AESNI]: Add parallelized CFB decryption. (selftest_cfb_128): New function. (selftest): Call selftest_cfb_128. -- CFB decryption can be parallelized for additional performance. 
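Why decryption parallelizes while encryption does not: in CFB mode the
plaintext comes out as P[i] = E_k(C[i-1]) ^ C[i], and when decrypting, every
cipher input C[i-1] is already in hand, so the block-cipher calls for the
different blocks are mutually independent. A rough mode-level sketch in plain
C, using a toy stand-in for the block cipher rather than the AES-NI code this
patch adds (names such as toy_encrypt_block and cfb_dec_sketch are
illustrative only):

    #include <string.h>

    #define BLK 16   /* AES block size */

    /* Toy stand-in for the block cipher's encrypt primitive; a real
       implementation would run AES here.  Illustration only. */
    static void
    toy_encrypt_block (const unsigned char key[BLK], unsigned char out[BLK],
                       const unsigned char in[BLK])
    {
      int i;
      for (i = 0; i < BLK; i++)
        out[i] = in[i] ^ key[i];
    }

    /* Generic CFB decryption: every toy_encrypt_block() input below is
       ciphertext that is already available, so several of these calls can
       be issued at once (the patch does four per iteration in xmm1..xmm4). */
    static void
    cfb_dec_sketch (const unsigned char key[BLK], unsigned char iv[BLK],
                    unsigned char *out, const unsigned char *in,
                    unsigned int nblocks)
    {
      unsigned char keystream[BLK];
      unsigned int j;

      for ( ; nblocks; nblocks--)
        {
          toy_encrypt_block (key, keystream, iv);   /* E_k(previous C) */
          memcpy (iv, in, BLK);                     /* next cipher input */
          for (j = 0; j < BLK; j++)
            out[j] = keystream[j] ^ in[j];          /* P = keystream ^ C */
          in  += BLK;
          out += BLK;
        }
    }

Encryption, by contrast, cannot start on block i before C[i-1] exists, so it
stays strictly serial.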
On Intel Sandy-Bridge processor, this change makes CFB decryption 4.6 times faster. Signed-off-by: Jussi Kivilinna --- cipher/cipher-selftest.c | 113 ++++++++++++++++++++++++++++++ cipher/cipher-selftest.h | 13 +++ cipher/rijndael.c | 174 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 299 insertions(+), 1 deletion(-) diff --git a/cipher/cipher-selftest.c b/cipher/cipher-selftest.c index 439f3ae..41eb405 100644 --- a/cipher/cipher-selftest.c +++ b/cipher/cipher-selftest.c @@ -160,6 +160,119 @@ _gcry_selftest_helper_cbc_128 (const char *cipher, return NULL; } +/* Run the self-tests for -CFB-128, tests bulk CFB + decryption. Returns NULL on success. */ +const char * +_gcry_selftest_helper_cfb_128 (const char *cipher, + gcry_cipher_setkey_t setkey_func, + gcry_cipher_encrypt_t encrypt_one, + gcry_cipher_bulk_cfb_dec_t bulk_cfb_dec, + const int nblocks, const int blocksize, + const int context_size) +{ + int i, offs; + unsigned char *ctx, *plaintext, *plaintext2, *ciphertext, *iv, *iv2, *mem; + unsigned int ctx_aligned_size, memsize; + + static const unsigned char key[16] ATTR_ALIGNED_16 = { + 0x11,0x9A,0x00,0x7F,0xC7,0x6A,0x45,0x9F, + 0x98,0xBA,0xF9,0x17,0xFE,0xDF,0x95,0x33 + }; + + /* Allocate buffers, align elements to 16 bytes. */ + ctx_aligned_size = context_size + 15; + ctx_aligned_size -= ctx_aligned_size & 0xf; + + memsize = ctx_aligned_size + (blocksize * 2) + (blocksize * nblocks * 3) + 16; + + mem = gcry_calloc (1, memsize); + if (!mem) + return "failed to allocate memory"; + + offs = (16 - ((uintptr_t)mem & 15)) & 15; + ctx = (void*)(mem + offs); + iv = ctx + ctx_aligned_size; + iv2 = iv + blocksize; + plaintext = iv2 + blocksize; + plaintext2 = plaintext + nblocks * blocksize; + ciphertext = plaintext2 + nblocks * blocksize; + + /* Initialize ctx */ + setkey_func (ctx, key, sizeof(key)); + + /* Test single block code path */ + memset(iv, 0xd3, blocksize); + memset(iv2, 0xd3, blocksize); + for (i = 0; i < blocksize; i++) + plaintext[i] = i; + + /* CFB manually. */ + encrypt_one (ctx, ciphertext, iv); + buf_xor_2dst (iv, ciphertext, plaintext, blocksize); + + /* CFB decrypt. */ + bulk_cfb_dec (ctx, iv2, plaintext2, ciphertext, 1); + if (memcmp(plaintext2, plaintext, blocksize)) + { + gcry_free(mem); +#ifdef HAVE_SYSLOG + syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: " + "%s-128-CFB test failed (plaintext mismatch)", cipher); +#endif + return "selftest for 128 bit CFB failed - see syslog for details"; + } + + if (memcmp(iv2, iv, blocksize)) + { + gcry_free(mem); +#ifdef HAVE_SYSLOG + syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: " + "%s-128-CFB test failed (IV mismatch)", cipher); +#endif + return "selftest for 128 bit CFB failed - see syslog for details"; + } + + /* Test parallelized code paths */ + memset(iv, 0xe6, blocksize); + memset(iv2, 0xe6, blocksize); + + for (i = 0; i < nblocks * blocksize; i++) + plaintext[i] = i; + + /* Create CFB ciphertext manually. */ + for (i = 0; i < nblocks * blocksize; i+=blocksize) + { + encrypt_one (ctx, &ciphertext[i], iv); + buf_xor_2dst (iv, &ciphertext[i], &plaintext[i], blocksize); + } + + /* Decrypt using bulk CBC and compare result. 
*/ + bulk_cfb_dec (ctx, iv2, plaintext2, ciphertext, nblocks); + + if (memcmp(plaintext2, plaintext, nblocks * blocksize)) + { + gcry_free(mem); +#ifdef HAVE_SYSLOG + syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: " + "%s-128-CFB test failed (plaintext mismatch, parallel path)", + cipher); +#endif + return "selftest for 128 bit CFB failed - see syslog for details"; + } + if (memcmp(iv2, iv, blocksize)) + { + gcry_free(mem); +#ifdef HAVE_SYSLOG + syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: " + "%s-128-CFB test failed (IV mismatch, parallel path)", cipher); +#endif + return "selftest for 128 bit CFB failed - see syslog for details"; + } + + gcry_free(mem); + return NULL; +} + /* Run the self-tests for -CTR-128, tests IV increment of bulk CTR encryption. Returns NULL on success. */ const char * diff --git a/cipher/cipher-selftest.h b/cipher/cipher-selftest.h index 89d79c2..30bc251 100644 --- a/cipher/cipher-selftest.h +++ b/cipher/cipher-selftest.h @@ -30,6 +30,11 @@ typedef void (*gcry_cipher_bulk_cbc_dec_t)(void *context, unsigned char *iv, const void *inbuf_arg, unsigned int nblocks); +typedef void (*gcry_cipher_bulk_cfb_dec_t)(void *context, unsigned char *iv, + void *outbuf_arg, + const void *inbuf_arg, + unsigned int nblocks); + typedef void (*gcry_cipher_bulk_ctr_enc_t)(void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, @@ -43,6 +48,14 @@ _gcry_selftest_helper_cbc_128 (const char *cipher, gcry_cipher_setkey_t setkey, const int nblocks, const int blocksize, const int context_size); +/* Helper function for bulk CFB decryption selftest */ +const char * +_gcry_selftest_helper_cfb_128 (const char *cipher, gcry_cipher_setkey_t setkey, + gcry_cipher_encrypt_t encrypt_one, + gcry_cipher_bulk_cfb_dec_t bulk_cfb_dec, + const int nblocks, const int blocksize, + const int context_size); + /* Helper function for bulk CTR encryption selftest */ const char * _gcry_selftest_helper_ctr_128 (const char *cipher, gcry_cipher_setkey_t setkey, diff --git a/cipher/rijndael.c b/cipher/rijndael.c index 4c81688..9f075ff 100644 --- a/cipher/rijndael.c +++ b/cipher/rijndael.c @@ -821,6 +821,115 @@ do_aesni_dec_aligned (const RIJNDAEL_context *ctx, } +/* Encrypt four blocks using the Intel AES-NI instructions. Blocks are input + * and output through SSE registers xmm1 to xmm4. 
*/ +static void +do_aesni_enc_vec4 (const RIJNDAEL_context *ctx) +{ +#define aesenc_xmm0_xmm1 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc8\n\t" +#define aesenc_xmm0_xmm2 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xd0\n\t" +#define aesenc_xmm0_xmm3 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xd8\n\t" +#define aesenc_xmm0_xmm4 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xe0\n\t" +#define aesenclast_xmm0_xmm1 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc8\n\t" +#define aesenclast_xmm0_xmm2 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xd0\n\t" +#define aesenclast_xmm0_xmm3 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xd8\n\t" +#define aesenclast_xmm0_xmm4 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xe0\n\t" + asm volatile ("movdqa (%[key]), %%xmm0\n\t" + "pxor %%xmm0, %%xmm1\n\t" /* xmm1 ^= key[0] */ + "pxor %%xmm0, %%xmm2\n\t" /* xmm2 ^= key[0] */ + "pxor %%xmm0, %%xmm3\n\t" /* xmm3 ^= key[0] */ + "pxor %%xmm0, %%xmm4\n\t" /* xmm4 ^= key[0] */ + "movdqa 0x10(%[key]), %%xmm0\n\t" + aesenc_xmm0_xmm1 + aesenc_xmm0_xmm2 + aesenc_xmm0_xmm3 + aesenc_xmm0_xmm4 + "movdqa 0x20(%[key]), %%xmm0\n\t" + aesenc_xmm0_xmm1 + aesenc_xmm0_xmm2 + aesenc_xmm0_xmm3 + aesenc_xmm0_xmm4 + "movdqa 0x30(%[key]), %%xmm0\n\t" + aesenc_xmm0_xmm1 + aesenc_xmm0_xmm2 + aesenc_xmm0_xmm3 + aesenc_xmm0_xmm4 + "movdqa 0x40(%[key]), %%xmm0\n\t" + aesenc_xmm0_xmm1 + aesenc_xmm0_xmm2 + aesenc_xmm0_xmm3 + aesenc_xmm0_xmm4 + "movdqa 0x50(%[key]), %%xmm0\n\t" + aesenc_xmm0_xmm1 + aesenc_xmm0_xmm2 + aesenc_xmm0_xmm3 + aesenc_xmm0_xmm4 + "movdqa 0x60(%[key]), %%xmm0\n\t" + aesenc_xmm0_xmm1 + aesenc_xmm0_xmm2 + aesenc_xmm0_xmm3 + aesenc_xmm0_xmm4 + "movdqa 0x70(%[key]), %%xmm0\n\t" + aesenc_xmm0_xmm1 + aesenc_xmm0_xmm2 + aesenc_xmm0_xmm3 + aesenc_xmm0_xmm4 + "movdqa 0x80(%[key]), %%xmm0\n\t" + aesenc_xmm0_xmm1 + aesenc_xmm0_xmm2 + aesenc_xmm0_xmm3 + aesenc_xmm0_xmm4 + "movdqa 0x90(%[key]), %%xmm0\n\t" + aesenc_xmm0_xmm1 + aesenc_xmm0_xmm2 + aesenc_xmm0_xmm3 + aesenc_xmm0_xmm4 + "movdqa 0xa0(%[key]), %%xmm0\n\t" + "cmpl $10, %[rounds]\n\t" + "jz .Ldeclast%=\n\t" + aesenc_xmm0_xmm1 + aesenc_xmm0_xmm2 + aesenc_xmm0_xmm3 + aesenc_xmm0_xmm4 + "movdqa 0xb0(%[key]), %%xmm0\n\t" + aesenc_xmm0_xmm1 + aesenc_xmm0_xmm2 + aesenc_xmm0_xmm3 + aesenc_xmm0_xmm4 + "movdqa 0xc0(%[key]), %%xmm0\n\t" + "cmpl $12, %[rounds]\n\t" + "jz .Ldeclast%=\n\t" + aesenc_xmm0_xmm1 + aesenc_xmm0_xmm2 + aesenc_xmm0_xmm3 + aesenc_xmm0_xmm4 + "movdqa 0xd0(%[key]), %%xmm0\n\t" + aesenc_xmm0_xmm1 + aesenc_xmm0_xmm2 + aesenc_xmm0_xmm3 + aesenc_xmm0_xmm4 + "movdqa 0xe0(%[key]), %%xmm0\n" + + ".Ldeclast%=:\n\t" + aesenclast_xmm0_xmm1 + aesenclast_xmm0_xmm2 + aesenclast_xmm0_xmm3 + aesenclast_xmm0_xmm4 + : /* no output */ + : [key] "r" (ctx->keyschenc), + [rounds] "r" (ctx->rounds) + : "cc", "memory"); +#undef aesenc_xmm0_xmm1 +#undef aesenc_xmm0_xmm2 +#undef aesenc_xmm0_xmm3 +#undef aesenc_xmm0_xmm4 +#undef aesenclast_xmm0_xmm1 +#undef aesenclast_xmm0_xmm2 +#undef aesenclast_xmm0_xmm3 +#undef aesenclast_xmm0_xmm4 +} + + /* Decrypt four blocks using the Intel AES-NI instructions. Blocks are input * and output through SSE registers xmm1 to xmm4. */ static void @@ -1685,7 +1794,7 @@ rijndael_decrypt (void *context, byte *b, const byte *a) /* Bulk decryption of complete blocks in CFB mode. Caller needs to - make sure that IV is aligned on an unisgned lonhg boundary. This + make sure that IV is aligned on an unsigned long boundary. This function is only intended for the bulk encryption feature of cipher.c. 
*/ void @@ -1716,6 +1825,50 @@ _gcry_aes_cfb_dec (void *context, unsigned char *iv, else if (ctx->use_aesni) { aesni_prepare (); + + /* CFB decryption can be parallelized */ + for ( ;nblocks >= 4; nblocks -= 4) + { + asm volatile + ("movdqu (%[iv]), %%xmm1\n\t" /* load input blocks */ + "movdqu 0*16(%[inbuf]), %%xmm2\n\t" + "movdqu 1*16(%[inbuf]), %%xmm3\n\t" + "movdqu 2*16(%[inbuf]), %%xmm4\n\t" + + "movdqu 3*16(%[inbuf]), %%xmm0\n\t" /* update IV */ + "movdqu %%xmm0, (%[iv])\n\t" + : /* No output */ + : [inbuf] "r" (inbuf), [iv] "r" (iv) + : "memory"); + + do_aesni_enc_vec4 (ctx); + + asm volatile + ("movdqu 0*16(%[inbuf]), %%xmm5\n\t" + "pxor %%xmm5, %%xmm1\n\t" + "movdqu %%xmm1, 0*16(%[outbuf])\n\t" + + "movdqu 1*16(%[inbuf]), %%xmm5\n\t" + "pxor %%xmm5, %%xmm2\n\t" + "movdqu %%xmm2, 1*16(%[outbuf])\n\t" + + "movdqu 2*16(%[inbuf]), %%xmm5\n\t" + "pxor %%xmm5, %%xmm3\n\t" + "movdqu %%xmm3, 2*16(%[outbuf])\n\t" + + "movdqu 3*16(%[inbuf]), %%xmm5\n\t" + "pxor %%xmm5, %%xmm4\n\t" + "movdqu %%xmm4, 3*16(%[outbuf])\n\t" + + : /* No output */ + : [inbuf] "r" (inbuf), + [outbuf] "r" (outbuf) + : "memory"); + + outbuf += 4*BLOCKSIZE; + inbuf += 4*BLOCKSIZE; + } + for ( ;nblocks; nblocks-- ) { do_aesni_cfb (ctx, 1, iv, outbuf, inbuf); @@ -1723,6 +1876,7 @@ _gcry_aes_cfb_dec (void *context, unsigned char *iv, inbuf += BLOCKSIZE; } aesni_cleanup (); + aesni_cleanup_2_5 (); } #endif /*USE_AESNI*/ else @@ -2035,6 +2189,21 @@ selftest_cbc_128 (void) } +/* Run the self-tests for AES-CFB-128, tests bulk CFB decryption. + Returns NULL on success. */ +static const char* +selftest_cfb_128 (void) +{ + const int nblocks = 8+2; + const int blocksize = BLOCKSIZE; + const int context_size = sizeof(RIJNDAEL_context); + + return _gcry_selftest_helper_cfb_128("AES", &rijndael_setkey, + &rijndael_encrypt, &_gcry_aes_cfb_dec, nblocks, blocksize, + context_size); +} + + /* Run all the self-tests and return NULL on success. This function is used for the on-the-fly self-tests. */ static const char * @@ -2053,6 +2222,9 @@ selftest (void) if ( (r = selftest_cbc_128 ()) ) return r; + if ( (r = selftest_cfb_128 ()) ) + return r; + return r; } From jussi.kivilinna at iki.fi Thu May 23 13:15:51 2013 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Thu, 23 May 2013 14:15:51 +0300 Subject: [PATCH 3/3] serpent: add parallel processing for CFB decryption In-Reply-To: <20130523111541.30047.829.stgit@localhost6.localdomain6> References: <20130523111541.30047.829.stgit@localhost6.localdomain6> Message-ID: <20130523111551.30047.78792.stgit@localhost6.localdomain6> * cipher/cipher.c (gcry_cipher_open): Add bulf CFB decryption function for Serpent. * cipher/serpent-sse2-amd64.S (_gcry_serpent_sse2_cfb_dec): New function. * cipher/serpent.c (_gcry_serpent_sse2_cfb_dec): New prototype. (_gcry_serpent_cfb_dec) New function. (selftest_cfb_128) New function. (selftest) Call selftest_cfb_128. * src/cipher.h (_gcry_serpent_cfb_dec): New prototype. -- Patch makes Serpent-CFB decryption 4.0 times faster on Intel Sandy-Bridge and 2.7 times faster on AMD K10. 
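All of the bulk helpers in this series follow the same dispatch shape: drain
as many 8-block chunks as possible through the assembly routine (16-block
chunks for the AVX Camellia path), then let the existing per-block code
finish the tail. Roughly, with function-pointer placeholders standing in for
the SSE2 routine and the generic path (the real functions call them
directly):

    /* Placeholder types; illustrative only, not the libgcrypt API. */
    typedef void (*bulk8_fn) (void *ctx, unsigned char *out,
                              const unsigned char *in, unsigned char *iv);
    typedef void (*single_fn) (void *ctx, unsigned char *out,
                               const unsigned char *in, unsigned char *iv);

    static void
    bulk_dispatch_sketch (void *ctx, unsigned char *iv,
                          unsigned char *out, const unsigned char *in,
                          unsigned int nblocks, unsigned int blocksize,
                          bulk8_fn fast8, single_fn slow1)
    {
      /* Accelerated path: whole 8-block chunks only. */
      while (nblocks >= 8)
        {
          fast8 (ctx, out, in, iv);
          out += 8 * blocksize;
          in  += 8 * blocksize;
          nblocks -= 8;
        }
      /* Tail: fall back to the per-block code for the remainder. */
      for ( ; nblocks; nblocks--)
        {
          slow1 (ctx, out, in, iv);
          out += blocksize;
          in  += blocksize;
        }
    }

In the real functions the register-clearing asm block and the
burn_stack_depth adjustment sit between those two loops, so key-dependent
data does not linger in the xmm registers or on the stack once the fast path
has run.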
Signed-off-by: Jussi Kivilinna --- cipher/cipher.c | 1 cipher/serpent-sse2-amd64.S | 66 ++++++++++++++++++++++++++++++++ cipher/serpent.c | 88 +++++++++++++++++++++++++++++++++++++++++++ src/cipher.h | 3 + 4 files changed, 158 insertions(+) diff --git a/cipher/cipher.c b/cipher/cipher.c index e9a652f..652d795 100644 --- a/cipher/cipher.c +++ b/cipher/cipher.c @@ -732,6 +732,7 @@ gcry_cipher_open (gcry_cipher_hd_t *handle, case GCRY_CIPHER_SERPENT192: case GCRY_CIPHER_SERPENT256: h->bulk.cbc_dec = _gcry_serpent_cbc_dec; + h->bulk.cfb_dec = _gcry_serpent_cfb_dec; h->bulk.ctr_enc = _gcry_serpent_ctr_enc; break; #endif /*USE_SERPENT*/ diff --git a/cipher/serpent-sse2-amd64.S b/cipher/serpent-sse2-amd64.S index 8d8c8dd..5f9e9d2 100644 --- a/cipher/serpent-sse2-amd64.S +++ b/cipher/serpent-sse2-amd64.S @@ -822,5 +822,71 @@ _gcry_serpent_sse2_cbc_dec: ret .size _gcry_serpent_sse2_cbc_dec,.-_gcry_serpent_sse2_cbc_dec; +.align 8 +.global _gcry_serpent_sse2_cfb_dec +.type _gcry_serpent_sse2_cfb_dec, at function; +_gcry_serpent_sse2_cfb_dec: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (8 blocks) + * %rdx: src (8 blocks) + * %rcx: iv + */ + + .set RA0, enc_in_a0 + .set RA1, enc_in_a1 + .set RA2, enc_in_a2 + .set RA3, enc_in_a3 + .set RB0, enc_in_b0 + .set RB1, enc_in_b1 + .set RB2, enc_in_b2 + .set RB3, enc_in_b3 + + /* Load input */ + movdqu (%rcx), RA0; + movdqu 0 * 16(%rdx), RA1; + movdqu 1 * 16(%rdx), RA2; + movdqu 2 * 16(%rdx), RA3; + movdqu 3 * 16(%rdx), RB0; + movdqu 4 * 16(%rdx), RB1; + movdqu 5 * 16(%rdx), RB2; + movdqu 6 * 16(%rdx), RB3; + + /* Update IV */ + movdqu 7 * 16(%rdx), RNOT; + movdqu RNOT, (%rcx); + + call __serpent_enc_blk8; + + .set RA0, enc_out_a0 + .set RA1, enc_out_a1 + .set RA2, enc_out_a2 + .set RA3, enc_out_a3 + .set RB0, enc_out_b0 + .set RB1, enc_out_b1 + .set RB2, enc_out_b2 + .set RB3, enc_out_b3 + + pxor_u((0 * 16)(%rdx), RA0, RTMP0); + pxor_u((1 * 16)(%rdx), RA1, RTMP0); + pxor_u((2 * 16)(%rdx), RA2, RTMP0); + pxor_u((3 * 16)(%rdx), RA3, RTMP0); + pxor_u((4 * 16)(%rdx), RB0, RTMP0); + pxor_u((5 * 16)(%rdx), RB1, RTMP0); + pxor_u((6 * 16)(%rdx), RB2, RTMP0); + pxor_u((7 * 16)(%rdx), RB3, RTMP0); + + movdqu RA0, (0 * 16)(%rsi); + movdqu RA1, (1 * 16)(%rsi); + movdqu RA2, (2 * 16)(%rsi); + movdqu RA3, (3 * 16)(%rsi); + movdqu RB0, (4 * 16)(%rsi); + movdqu RB1, (5 * 16)(%rsi); + movdqu RB2, (6 * 16)(%rsi); + movdqu RB3, (7 * 16)(%rsi); + + ret +.size _gcry_serpent_sse2_cfb_dec,.-_gcry_serpent_sse2_cfb_dec; + #endif /*defined(USE_SERPENT)*/ #endif /*__x86_64*/ diff --git a/cipher/serpent.c b/cipher/serpent.c index 7b82b48..95ac7c1 100644 --- a/cipher/serpent.c +++ b/cipher/serpent.c @@ -74,6 +74,11 @@ extern void _gcry_serpent_sse2_cbc_dec(serpent_context_t *ctx, unsigned char *out, const unsigned char *in, unsigned char *iv); + +extern void _gcry_serpent_sse2_cfb_dec(serpent_context_t *ctx, + unsigned char *out, + const unsigned char *in, + unsigned char *iv); #endif /* A prototype. */ @@ -916,6 +921,71 @@ _gcry_serpent_cbc_dec(void *context, unsigned char *iv, _gcry_burn_stack(burn_stack_depth); } +/* Bulk decryption of complete blocks in CFB mode. This function is only + intended for the bulk encryption feature of cipher.c. 
*/ +void +_gcry_serpent_cfb_dec(void *context, unsigned char *iv, + void *outbuf_arg, const void *inbuf_arg, + unsigned int nblocks) +{ + serpent_context_t *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + int burn_stack_depth = 2 * sizeof (serpent_block_t); + +#ifdef USE_SSE2 + { + int did_use_sse2 = 0; + + /* Process data in 8 block chunks. */ + while (nblocks >= 8) + { + _gcry_serpent_sse2_cfb_dec(ctx, outbuf, inbuf, iv); + + nblocks -= 8; + outbuf += 8 * sizeof(serpent_block_t); + inbuf += 8 * sizeof(serpent_block_t); + did_use_sse2 = 1; + } + + if (did_use_sse2) + { + /* clear SSE2 registers used by serpent-sse2 */ + asm volatile ( + "pxor %%xmm0, %%xmm0;\n" + "pxor %%xmm1, %%xmm1;\n" + "pxor %%xmm2, %%xmm2;\n" + "pxor %%xmm3, %%xmm3;\n" + "pxor %%xmm4, %%xmm4;\n" + "pxor %%xmm5, %%xmm5;\n" + "pxor %%xmm6, %%xmm6;\n" + "pxor %%xmm7, %%xmm7;\n" + "pxor %%xmm10, %%xmm10;\n" + "pxor %%xmm11, %%xmm11;\n" + "pxor %%xmm12, %%xmm12;\n" + "pxor %%xmm13, %%xmm13;\n" + :::); + + /* serpent-sse2 assembly code does not use stack */ + if (nblocks == 0) + burn_stack_depth = 0; + } + + /* Use generic code to handle smaller chunks... */ + } +#endif + + for ( ;nblocks; nblocks-- ) + { + serpent_encrypt_internal(ctx, iv, iv); + buf_xor_n_copy(outbuf, iv, inbuf, sizeof(serpent_block_t)); + outbuf += sizeof(serpent_block_t); + inbuf += sizeof(serpent_block_t); + } + + _gcry_burn_stack(burn_stack_depth); +} + /* Run the self-tests for SERPENT-CTR-128, tests IV increment of bulk CTR @@ -948,6 +1018,21 @@ selftest_cbc_128 (void) } +/* Run the self-tests for SERPENT-CBC-128, tests bulk CBC decryption. + Returns NULL on success. */ +static const char* +selftest_cfb_128 (void) +{ + const int nblocks = 8+2; + const int blocksize = sizeof(serpent_block_t); + const int context_size = sizeof(serpent_context_t); + + return _gcry_selftest_helper_cfb_128("SERPENT", &serpent_setkey, + &serpent_encrypt, &_gcry_serpent_cfb_dec, nblocks, blocksize, + context_size); +} + + /* Serpent test. */ static const char * @@ -1034,6 +1119,9 @@ serpent_test (void) if ( (r = selftest_cbc_128 ()) ) return r; + if ( (r = selftest_cfb_128 ()) ) + return r; + return NULL; } diff --git a/src/cipher.h b/src/cipher.h index f28990d..9d6cc01 100644 --- a/src/cipher.h +++ b/src/cipher.h @@ -113,6 +113,9 @@ void _gcry_serpent_ctr_enc (void *context, unsigned char *ctr, void _gcry_serpent_cbc_dec (void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, unsigned int nblocks); +void _gcry_serpent_cfb_dec (void *context, unsigned char *iv, + void *outbuf_arg, const void *inbuf_arg, + unsigned int nblocks); /*-- dsa.c --*/ void _gcry_register_pk_dsa_progress (gcry_handler_progress_t cbc, void *cb_data); From jussi.kivilinna at iki.fi Thu May 23 13:15:46 2013 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Thu, 23 May 2013 14:15:46 +0300 Subject: [PATCH 2/3] camellia: add parallel processing for CFB decryption In-Reply-To: <20130523111541.30047.829.stgit@localhost6.localdomain6> References: <20130523111541.30047.829.stgit@localhost6.localdomain6> Message-ID: <20130523111546.30047.48369.stgit@localhost6.localdomain6> * cipher/camellia-aesni-avx-amd64.S (_gcry_camellia_aesni_avx_cfb_dec): New function. * cipher/camellia-glue.c (_gcry_camellia_aesni_avx_cfb_dec): New prototype. (_gcry_camellia_cfb_dec): New function. (selftest_cfb_128): New function. (selftest): Call selftest_cfb_128. * cipher/cipher.c (gry_cipher_open): Add bulk CFB decryption function for Camellia. 
* src/cipher.h (_gcry_camellia_cfb_dec): New prototype. -- Patch makes Camellia-CFB decryption 4.7 times faster on Intel Sandy-Bridge. Signed-off-by: Jussi Kivilinna --- cipher/camellia-aesni-avx-amd64.S | 65 +++++++++++++++++++++++++++++++++ cipher/camellia-glue.c | 74 +++++++++++++++++++++++++++++++++++++ cipher/cipher.c | 1 + src/cipher.h | 3 ++ 4 files changed, 143 insertions(+) diff --git a/cipher/camellia-aesni-avx-amd64.S b/cipher/camellia-aesni-avx-amd64.S index 2b1df17..95c96b8 100644 --- a/cipher/camellia-aesni-avx-amd64.S +++ b/cipher/camellia-aesni-avx-amd64.S @@ -1116,5 +1116,70 @@ _gcry_camellia_aesni_avx_cbc_dec: ret; .size _gcry_camellia_aesni_avx_cbc_dec,.-_gcry_camellia_aesni_avx_cbc_dec; +.align 8 +.global _gcry_camellia_aesni_avx_cfb_dec +.type _gcry_camellia_aesni_avx_cfb_dec, at function; + +_gcry_camellia_aesni_avx_cfb_dec: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (16 blocks) + * %rdx: src (16 blocks) + * %rcx: iv + */ + + subq $(16 * 16), %rsp; + movq %rsp, %rax; + + /* inpack16_pre: */ + vmovq (key_table)(CTX), %xmm0; + vpshufb .Lpack_bswap RIP, %xmm0, %xmm0; + vpxor (%rcx), %xmm0, %xmm15; + vmovdqu 15 * 16(%rdx), %xmm1; + vmovdqu %xmm1, (%rcx); /* store new IV */ + vpxor 0 * 16(%rdx), %xmm0, %xmm14; + vpxor 1 * 16(%rdx), %xmm0, %xmm13; + vpxor 2 * 16(%rdx), %xmm0, %xmm12; + vpxor 3 * 16(%rdx), %xmm0, %xmm11; + vpxor 4 * 16(%rdx), %xmm0, %xmm10; + vpxor 5 * 16(%rdx), %xmm0, %xmm9; + vpxor 6 * 16(%rdx), %xmm0, %xmm8; + vpxor 7 * 16(%rdx), %xmm0, %xmm7; + vpxor 8 * 16(%rdx), %xmm0, %xmm6; + vpxor 9 * 16(%rdx), %xmm0, %xmm5; + vpxor 10 * 16(%rdx), %xmm0, %xmm4; + vpxor 11 * 16(%rdx), %xmm0, %xmm3; + vpxor 12 * 16(%rdx), %xmm0, %xmm2; + vpxor 13 * 16(%rdx), %xmm0, %xmm1; + vpxor 14 * 16(%rdx), %xmm0, %xmm0; + + call __camellia_enc_blk16; + + addq $(16 * 16), %rsp; + + vpxor 0 * 16(%rdx), %xmm7, %xmm7; + vpxor 1 * 16(%rdx), %xmm6, %xmm6; + vpxor 2 * 16(%rdx), %xmm5, %xmm5; + vpxor 3 * 16(%rdx), %xmm4, %xmm4; + vpxor 4 * 16(%rdx), %xmm3, %xmm3; + vpxor 5 * 16(%rdx), %xmm2, %xmm2; + vpxor 6 * 16(%rdx), %xmm1, %xmm1; + vpxor 7 * 16(%rdx), %xmm0, %xmm0; + vpxor 8 * 16(%rdx), %xmm15, %xmm15; + vpxor 9 * 16(%rdx), %xmm14, %xmm14; + vpxor 10 * 16(%rdx), %xmm13, %xmm13; + vpxor 11 * 16(%rdx), %xmm12, %xmm12; + vpxor 12 * 16(%rdx), %xmm11, %xmm11; + vpxor 13 * 16(%rdx), %xmm10, %xmm10; + vpxor 14 * 16(%rdx), %xmm9, %xmm9; + vpxor 15 * 16(%rdx), %xmm8, %xmm8; + + write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0, + %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9, + %xmm8, %rsi); + + ret; +.size _gcry_camellia_aesni_avx_cfb_dec,.-_gcry_camellia_aesni_avx_cfb_dec; + #endif /*defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX_SUPPORT)*/ #endif /*__x86_64*/ diff --git a/cipher/camellia-glue.c b/cipher/camellia-glue.c index 4c724a9..f9bbb33 100644 --- a/cipher/camellia-glue.c +++ b/cipher/camellia-glue.c @@ -102,6 +102,11 @@ extern void _gcry_camellia_aesni_avx_cbc_dec(CAMELLIA_context *ctx, unsigned char *out, const unsigned char *in, unsigned char *iv); + +extern void _gcry_camellia_aesni_avx_cfb_dec(CAMELLIA_context *ctx, + unsigned char *out, + const unsigned char *in, + unsigned char *iv); #endif static const char *selftest(void); @@ -308,6 +313,58 @@ _gcry_camellia_cbc_dec(void *context, unsigned char *iv, _gcry_burn_stack(burn_stack_depth); } +/* Bulk decryption of complete blocks in CFB mode. This function is only + intended for the bulk encryption feature of cipher.c. 
*/ +void +_gcry_camellia_cfb_dec(void *context, unsigned char *iv, + void *outbuf_arg, const void *inbuf_arg, + unsigned int nblocks) +{ + CAMELLIA_context *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + int burn_stack_depth = CAMELLIA_decrypt_stack_burn_size; + +#ifdef USE_AESNI_AVX + if (ctx->use_aesni_avx) + { + int did_use_aesni_avx = 0; + + /* Process data in 16 block chunks. */ + while (nblocks >= 16) + { + _gcry_camellia_aesni_avx_cfb_dec(ctx, outbuf, inbuf, iv); + + nblocks -= 16; + outbuf += 16 * CAMELLIA_BLOCK_SIZE; + inbuf += 16 * CAMELLIA_BLOCK_SIZE; + did_use_aesni_avx = 1; + } + + if (did_use_aesni_avx) + { + /* clear AVX registers */ + asm volatile ("vzeroall;\n":::); + + if (burn_stack_depth < 16 * CAMELLIA_BLOCK_SIZE + 2 * sizeof(void *)) + burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE + 2 * sizeof(void *); + } + + /* Use generic code to handle smaller chunks... */ + } +#endif + + for ( ;nblocks; nblocks-- ) + { + Camellia_EncryptBlock(ctx->keybitlength, iv, ctx->keytable, iv); + buf_xor_n_copy(outbuf, iv, inbuf, CAMELLIA_BLOCK_SIZE); + outbuf += CAMELLIA_BLOCK_SIZE; + inbuf += CAMELLIA_BLOCK_SIZE; + } + + _gcry_burn_stack(burn_stack_depth); +} + /* Run the self-tests for CAMELLIA-CTR-128, tests IV increment of bulk CTR encryption. Returns NULL on success. */ static const char* @@ -336,6 +393,20 @@ selftest_cbc_128 (void) context_size); } +/* Run the self-tests for CAMELLIA-CFB-128, tests bulk CFB decryption. + Returns NULL on success. */ +static const char* +selftest_cfb_128 (void) +{ + const int nblocks = 16+2; + const int blocksize = CAMELLIA_BLOCK_SIZE; + const int context_size = sizeof(CAMELLIA_context); + + return _gcry_selftest_helper_cfb_128("CAMELLIA", &camellia_setkey, + &camellia_encrypt, &_gcry_camellia_cfb_dec, nblocks, blocksize, + context_size); +} + static const char * selftest(void) { @@ -411,6 +482,9 @@ selftest(void) if ( (r = selftest_cbc_128 ()) ) return r; + if ( (r = selftest_cfb_128 ()) ) + return r; + return NULL; } diff --git a/cipher/cipher.c b/cipher/cipher.c index 20ac2c7..e9a652f 100644 --- a/cipher/cipher.c +++ b/cipher/cipher.c @@ -723,6 +723,7 @@ gcry_cipher_open (gcry_cipher_hd_t *handle, case GCRY_CIPHER_CAMELLIA192: case GCRY_CIPHER_CAMELLIA256: h->bulk.cbc_dec = _gcry_camellia_cbc_dec; + h->bulk.cfb_dec = _gcry_camellia_cfb_dec; h->bulk.ctr_enc = _gcry_camellia_ctr_enc; break; #endif /*USE_CAMELLIA*/ diff --git a/src/cipher.h b/src/cipher.h index 4e68487..f28990d 100644 --- a/src/cipher.h +++ b/src/cipher.h @@ -102,6 +102,9 @@ void _gcry_camellia_ctr_enc (void *context, unsigned char *ctr, void _gcry_camellia_cbc_dec (void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, unsigned int nblocks); +void _gcry_camellia_cfb_dec (void *context, unsigned char *iv, + void *outbuf_arg, const void *inbuf_arg, + unsigned int nblocks); /*-- serpent.c --*/ void _gcry_serpent_ctr_enc (void *context, unsigned char *ctr, From wk at gnupg.org Thu May 23 17:38:45 2013 From: wk at gnupg.org (Werner Koch) Date: Thu, 23 May 2013 17:38:45 +0200 Subject: [PATCH 1/3] rinjdael: add parallel processing for CFB decryption with AES-NI In-Reply-To: <20130523111541.30047.829.stgit@localhost6.localdomain6> (Jussi Kivilinna's message of "Thu, 23 May 2013 14:15:41 +0300") References: <20130523111541.30047.829.stgit@localhost6.localdomain6> Message-ID: <8738td670a.fsf@vigenere.g10code.de> On Thu, 23 May 2013 13:15, jussi.kivilinna at iki.fi said: > CFB decryption can be parallelized for additional 
performance. On Intel > Sandy-Bridge processor, this change makes CFB decryption 4.6 times faster. That is a nice improvement for OpenPGP. All 3 pushed. Salam-Shalom, Werner -- Die Gedanken sind frei. Ausnahmen regelt ein Bundesgesetz. From jussi.kivilinna at iki.fi Fri May 24 11:43:24 2013 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Fri, 24 May 2013 12:43:24 +0300 Subject: [PATCH 1/2] cipher-selftest: make selftest work with any block-size Message-ID: <20130524094324.15701.68533.stgit@localhost6.localdomain6> * cipher/cipher-selftest.c (_gcry_selftest_helper_cbc_128) (_gcry_selftest_helper_cfb_128, _gcry_selftest_helper_ctr_128): Renamed functions from '_128' to ''. (_gcry_selftest_helper_cbc, _gcry_selftest_helper_cfb) (_gcry_selftest_helper_ctr): Make work with different block sizes. * cipher/cipher-selftest.h (_gcry_selftest_helper_cbc_128) (_gcry_selftest_helper_cfb_128, _gcry_selftest_helper_ctr_128): Renamed prototypes from '_128' to ''. * cipher/camellia-glue.c (selftest_ctr_128, selftest_cfb_128) (selftest_ctr_128): Change to use new function names. * cipher/rijndael.c (selftest_ctr_128, selftest_cfb_128) (selftest_ctr_128): Change to use new function names. * cipher/serpent.c (selftest_ctr_128, selftest_cfb_128) (selftest_ctr_128): Change to use new function names. -- Signed-off-by: Jussi Kivilinna --- cipher/camellia-glue.c | 6 +- cipher/cipher-selftest.c | 116 ++++++++++++++++++++++++---------------------- cipher/cipher-selftest.h | 30 ++++++------ cipher/rijndael.c | 6 +- cipher/serpent.c | 6 +- 5 files changed, 85 insertions(+), 79 deletions(-) diff --git a/cipher/camellia-glue.c b/cipher/camellia-glue.c index f9bbb33..4163e82 100644 --- a/cipher/camellia-glue.c +++ b/cipher/camellia-glue.c @@ -374,7 +374,7 @@ selftest_ctr_128 (void) const int blocksize = CAMELLIA_BLOCK_SIZE; const int context_size = sizeof(CAMELLIA_context); - return _gcry_selftest_helper_ctr_128("CAMELLIA", &camellia_setkey, + return _gcry_selftest_helper_ctr("CAMELLIA", &camellia_setkey, &camellia_encrypt, &_gcry_camellia_ctr_enc, nblocks, blocksize, context_size); } @@ -388,7 +388,7 @@ selftest_cbc_128 (void) const int blocksize = CAMELLIA_BLOCK_SIZE; const int context_size = sizeof(CAMELLIA_context); - return _gcry_selftest_helper_cbc_128("CAMELLIA", &camellia_setkey, + return _gcry_selftest_helper_cbc("CAMELLIA", &camellia_setkey, &camellia_encrypt, &_gcry_camellia_cbc_dec, nblocks, blocksize, context_size); } @@ -402,7 +402,7 @@ selftest_cfb_128 (void) const int blocksize = CAMELLIA_BLOCK_SIZE; const int context_size = sizeof(CAMELLIA_context); - return _gcry_selftest_helper_cfb_128("CAMELLIA", &camellia_setkey, + return _gcry_selftest_helper_cfb("CAMELLIA", &camellia_setkey, &camellia_encrypt, &_gcry_camellia_cfb_dec, nblocks, blocksize, context_size); } diff --git a/cipher/cipher-selftest.c b/cipher/cipher-selftest.c index 41eb405..17742e3 100644 --- a/cipher/cipher-selftest.c +++ b/cipher/cipher-selftest.c @@ -44,15 +44,14 @@ #endif -/* Run the self-tests for -CBC-128, tests bulk CBC +/* Run the self-tests for -CBC-, tests bulk CBC decryption. Returns NULL on success. 
*/ const char * -_gcry_selftest_helper_cbc_128 (const char *cipher, - gcry_cipher_setkey_t setkey_func, - gcry_cipher_encrypt_t encrypt_one, - gcry_cipher_bulk_cbc_dec_t bulk_cbc_dec, - const int nblocks, const int blocksize, - const int context_size) +_gcry_selftest_helper_cbc (const char *cipher, gcry_cipher_setkey_t setkey_func, + gcry_cipher_encrypt_t encrypt_one, + gcry_cipher_bulk_cbc_dec_t bulk_cbc_dec, + const int nblocks, const int blocksize, + const int context_size) { int i, offs; unsigned char *ctx, *plaintext, *plaintext2, *ciphertext, *iv, *iv2, *mem; @@ -63,7 +62,8 @@ _gcry_selftest_helper_cbc_128 (const char *cipher, 0x98,0xBA,0xF9,0x17,0xFE,0xDF,0x95,0x22 }; - /* Allocate buffers, align elements to 16 bytes. */ + /* Allocate buffers, align first two elements to 16 bytes and latter to + block size. */ ctx_aligned_size = context_size + 15; ctx_aligned_size -= ctx_aligned_size & 0xf; @@ -97,24 +97,25 @@ _gcry_selftest_helper_cbc_128 (const char *cipher, /* CBC decrypt. */ bulk_cbc_dec (ctx, iv2, plaintext2, ciphertext, 1); - if (memcmp (plaintext2, plaintext, 16)) + if (memcmp (plaintext2, plaintext, blocksize)) { gcry_free (mem); #ifdef HAVE_SYSLOG syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: " - "%s-128-CBC test failed (plaintext mismatch)", cipher); + "%s-CBC-%d test failed (plaintext mismatch)", cipher, + blocksize * 8); #endif - return "selftest for 128 bit CBC failed - see syslog for details"; + return "selftest for CBC failed - see syslog for details"; } - if (memcmp (iv2, iv, 16)) + if (memcmp (iv2, iv, blocksize)) { gcry_free (mem); #ifdef HAVE_SYSLOG syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: " - "%s-128-CBC test failed (IV mismatch)", cipher); + "%s-CBC-%d test failed (IV mismatch)", cipher, blocksize * 8); #endif - return "selftest for 128 bit CBC failed - see syslog for details"; + return "selftest for CBC failed - see syslog for details"; } /* Test parallelized code paths */ @@ -140,35 +141,34 @@ _gcry_selftest_helper_cbc_128 (const char *cipher, gcry_free (mem); #ifdef HAVE_SYSLOG syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: " - "%s-128-CBC test failed (plaintext mismatch, parallel path)", - cipher); + "%s-CBC-%d test failed (plaintext mismatch, parallel path)", + cipher, blocksize * 8); #endif - return "selftest for 128 bit CBC failed - see syslog for details"; + return "selftest for CBC failed - see syslog for details"; } if (memcmp (iv2, iv, blocksize)) { gcry_free (mem); #ifdef HAVE_SYSLOG syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: " - "%s-128-CBC test failed (IV mismatch, parallel path)", - cipher); + "%s-CBC-%d test failed (IV mismatch, parallel path)", + cipher, blocksize * 8); #endif - return "selftest for 128 bit CBC failed - see syslog for details"; + return "selftest for CBC failed - see syslog for details"; } gcry_free (mem); return NULL; } -/* Run the self-tests for -CFB-128, tests bulk CFB +/* Run the self-tests for -CFB-, tests bulk CFB decryption. Returns NULL on success. 
*/ const char * -_gcry_selftest_helper_cfb_128 (const char *cipher, - gcry_cipher_setkey_t setkey_func, - gcry_cipher_encrypt_t encrypt_one, - gcry_cipher_bulk_cfb_dec_t bulk_cfb_dec, - const int nblocks, const int blocksize, - const int context_size) +_gcry_selftest_helper_cfb (const char *cipher, gcry_cipher_setkey_t setkey_func, + gcry_cipher_encrypt_t encrypt_one, + gcry_cipher_bulk_cfb_dec_t bulk_cfb_dec, + const int nblocks, const int blocksize, + const int context_size) { int i, offs; unsigned char *ctx, *plaintext, *plaintext2, *ciphertext, *iv, *iv2, *mem; @@ -179,7 +179,8 @@ _gcry_selftest_helper_cfb_128 (const char *cipher, 0x98,0xBA,0xF9,0x17,0xFE,0xDF,0x95,0x33 }; - /* Allocate buffers, align elements to 16 bytes. */ + /* Allocate buffers, align first two elements to 16 bytes and latter to + block size. */ ctx_aligned_size = context_size + 15; ctx_aligned_size -= ctx_aligned_size & 0xf; @@ -217,9 +218,10 @@ _gcry_selftest_helper_cfb_128 (const char *cipher, gcry_free(mem); #ifdef HAVE_SYSLOG syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: " - "%s-128-CFB test failed (plaintext mismatch)", cipher); + "%s-CFB-%d test failed (plaintext mismatch)", cipher, + blocksize * 8); #endif - return "selftest for 128 bit CFB failed - see syslog for details"; + return "selftest for CFB failed - see syslog for details"; } if (memcmp(iv2, iv, blocksize)) @@ -227,9 +229,9 @@ _gcry_selftest_helper_cfb_128 (const char *cipher, gcry_free(mem); #ifdef HAVE_SYSLOG syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: " - "%s-128-CFB test failed (IV mismatch)", cipher); + "%s-CFB-%d test failed (IV mismatch)", cipher, blocksize * 8); #endif - return "selftest for 128 bit CFB failed - see syslog for details"; + return "selftest for CFB failed - see syslog for details"; } /* Test parallelized code paths */ @@ -254,34 +256,34 @@ _gcry_selftest_helper_cfb_128 (const char *cipher, gcry_free(mem); #ifdef HAVE_SYSLOG syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: " - "%s-128-CFB test failed (plaintext mismatch, parallel path)", - cipher); + "%s-CFB-%d test failed (plaintext mismatch, parallel path)", + cipher, blocksize * 8); #endif - return "selftest for 128 bit CFB failed - see syslog for details"; + return "selftest for CFB failed - see syslog for details"; } if (memcmp(iv2, iv, blocksize)) { gcry_free(mem); #ifdef HAVE_SYSLOG syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: " - "%s-128-CFB test failed (IV mismatch, parallel path)", cipher); + "%s-CFB-%d test failed (IV mismatch, parallel path)", cipher, + blocksize * 8); #endif - return "selftest for 128 bit CFB failed - see syslog for details"; + return "selftest for CFB failed - see syslog for details"; } gcry_free(mem); return NULL; } -/* Run the self-tests for -CTR-128, tests IV increment of bulk CTR - encryption. Returns NULL on success. */ +/* Run the self-tests for -CTR-, tests IV increment + of bulk CTR encryption. Returns NULL on success. 
*/ const char * -_gcry_selftest_helper_ctr_128 (const char *cipher, - gcry_cipher_setkey_t setkey_func, - gcry_cipher_encrypt_t encrypt_one, - gcry_cipher_bulk_ctr_enc_t bulk_ctr_enc, - const int nblocks, const int blocksize, - const int context_size) +_gcry_selftest_helper_ctr (const char *cipher, gcry_cipher_setkey_t setkey_func, + gcry_cipher_encrypt_t encrypt_one, + gcry_cipher_bulk_ctr_enc_t bulk_ctr_enc, + const int nblocks, const int blocksize, + const int context_size) { int i, j, offs, diff; unsigned char *ctx, *plaintext, *plaintext2, *ciphertext, *iv, *iv2, *mem; @@ -292,7 +294,8 @@ _gcry_selftest_helper_ctr_128 (const char *cipher, 0x98,0xBA,0xF9,0x17,0xFE,0xDF,0x95,0x21 }; - /* Allocate buffers, align elements to 16 bytes. */ + /* Allocate buffers, align first two elements to 16 bytes and latter to + block size. */ ctx_aligned_size = context_size + 15; ctx_aligned_size -= ctx_aligned_size & 0xf; @@ -337,9 +340,10 @@ _gcry_selftest_helper_ctr_128 (const char *cipher, gcry_free (mem); #ifdef HAVE_SYSLOG syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: " - "%s-128-CTR test failed (plaintext mismatch)", cipher); + "%s-CTR-%d test failed (plaintext mismatch)", cipher, + blocksize * 8); #endif - return "selftest for 128 bit CTR failed - see syslog for details"; + return "selftest for CTR failed - see syslog for details"; } if (memcmp (iv2, iv, blocksize)) @@ -347,9 +351,10 @@ _gcry_selftest_helper_ctr_128 (const char *cipher, gcry_free (mem); #ifdef HAVE_SYSLOG syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: " - "%s-128-CTR test failed (IV mismatch)", cipher); + "%s-CTR-%d test failed (IV mismatch)", cipher, + blocksize * 8); #endif - return "selftest for 128 bit CTR failed - see syslog for details"; + return "selftest for CTR failed - see syslog for details"; } /* Test parallelized code paths */ @@ -385,19 +390,20 @@ _gcry_selftest_helper_ctr_128 (const char *cipher, gcry_free (mem); #ifdef HAVE_SYSLOG syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: " - "%s-128-CTR test failed (plaintext mismatch, diff: %d)", cipher, - diff); + "%s-CTR-%d test failed (plaintext mismatch, diff: %d)", cipher, + blocksize * 8, diff); #endif - return "selftest for 128 bit CTR failed - see syslog for details"; + return "selftest for CTR failed - see syslog for details"; } if (memcmp(iv2, iv, blocksize)) { gcry_free (mem); #ifdef HAVE_SYSLOG syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: " - "%s-128-CTR test failed (IV mismatch, diff: %d)", cipher, diff); + "%s-CTR-%d test failed (IV mismatch, diff: %d)", cipher, + blocksize * 8, diff); #endif - return "selftest for 128 bit CTR failed - see syslog for details"; + return "selftest for CTR failed - see syslog for details"; } } diff --git a/cipher/cipher-selftest.h b/cipher/cipher-selftest.h index 30bc251..41d77af 100644 --- a/cipher/cipher-selftest.h +++ b/cipher/cipher-selftest.h @@ -42,26 +42,26 @@ typedef void (*gcry_cipher_bulk_ctr_enc_t)(void *context, unsigned char *iv, /* Helper function for bulk CBC decryption selftest */ const char * -_gcry_selftest_helper_cbc_128 (const char *cipher, gcry_cipher_setkey_t setkey, - gcry_cipher_encrypt_t encrypt_one, - gcry_cipher_bulk_cbc_dec_t bulk_cbc_dec, - const int nblocks, const int blocksize, - const int context_size); +_gcry_selftest_helper_cbc (const char *cipher, gcry_cipher_setkey_t setkey, + gcry_cipher_encrypt_t encrypt_one, + gcry_cipher_bulk_cbc_dec_t bulk_cbc_dec, + const int nblocks, const int blocksize, + const int context_size); /* Helper function for bulk CFB decryption selftest */ 
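These helpers all carve a single gcry_calloc()'d region into a 16-byte
aligned context followed by two IVs and three text buffers; the
(16 - ((uintptr_t)mem & 15)) & 15 step is the usual round-up-to-alignment
trick. A standalone sketch of that layout, using plain calloc() and
illustrative names rather than the libgcrypt wrappers:

    #include <stdint.h>
    #include <stdlib.h>

    /* Sketch of the scratch-buffer layout used by the selftest helpers:
       one allocation, context aligned to 16 bytes, then iv/iv2 and the
       plaintext/plaintext2/ciphertext areas.  Caller frees MEM. */
    static unsigned char *
    selftest_layout_sketch (size_t context_size, size_t blocksize,
                            size_t nblocks, unsigned char **ctx,
                            unsigned char **iv, unsigned char **iv2,
                            unsigned char **pt, unsigned char **pt2,
                            unsigned char **ct)
    {
      size_t ctx_aligned = (context_size + 15) & ~(size_t)15;
      size_t memsize = ctx_aligned + 2 * blocksize
                       + 3 * nblocks * blocksize + 16;
      unsigned char *mem = calloc (1, memsize);
      size_t offs;

      if (!mem)
        return NULL;
      offs = (16 - ((uintptr_t)mem & 15)) & 15; /* bytes to reach alignment */
      *ctx = mem + offs;
      *iv  = *ctx + ctx_aligned;
      *iv2 = *iv  + blocksize;
      *pt  = *iv2 + blocksize;
      *pt2 = *pt  + nblocks * blocksize;
      *ct  = *pt2 + nblocks * blocksize;
      return mem;
    }

Freeing the one MEM pointer at every exit releases everything at once, which
is why the helpers never track the individual buffers separately.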
const char * -_gcry_selftest_helper_cfb_128 (const char *cipher, gcry_cipher_setkey_t setkey, - gcry_cipher_encrypt_t encrypt_one, - gcry_cipher_bulk_cfb_dec_t bulk_cfb_dec, - const int nblocks, const int blocksize, - const int context_size); +_gcry_selftest_helper_cfb (const char *cipher, gcry_cipher_setkey_t setkey, + gcry_cipher_encrypt_t encrypt_one, + gcry_cipher_bulk_cfb_dec_t bulk_cfb_dec, + const int nblocks, const int blocksize, + const int context_size); /* Helper function for bulk CTR encryption selftest */ const char * -_gcry_selftest_helper_ctr_128 (const char *cipher, gcry_cipher_setkey_t setkey, - gcry_cipher_encrypt_t encrypt_one, - gcry_cipher_bulk_ctr_enc_t bulk_ctr_enc, - const int nblocks, const int blocksize, - const int context_size); +_gcry_selftest_helper_ctr (const char *cipher, gcry_cipher_setkey_t setkey, + gcry_cipher_encrypt_t encrypt_one, + gcry_cipher_bulk_ctr_enc_t bulk_ctr_enc, + const int nblocks, const int blocksize, + const int context_size); #endif /*G10_SELFTEST_HELP_H*/ diff --git a/cipher/rijndael.c b/cipher/rijndael.c index 9f075ff..ac2fcfb 100644 --- a/cipher/rijndael.c +++ b/cipher/rijndael.c @@ -2168,7 +2168,7 @@ selftest_ctr_128 (void) const int blocksize = BLOCKSIZE; const int context_size = sizeof(RIJNDAEL_context); - return _gcry_selftest_helper_ctr_128("AES", &rijndael_setkey, + return _gcry_selftest_helper_ctr("AES", &rijndael_setkey, &rijndael_encrypt, &_gcry_aes_ctr_enc, nblocks, blocksize, context_size); } @@ -2183,7 +2183,7 @@ selftest_cbc_128 (void) const int blocksize = BLOCKSIZE; const int context_size = sizeof(RIJNDAEL_context); - return _gcry_selftest_helper_cbc_128("AES", &rijndael_setkey, + return _gcry_selftest_helper_cbc("AES", &rijndael_setkey, &rijndael_encrypt, &_gcry_aes_cbc_dec, nblocks, blocksize, context_size); } @@ -2198,7 +2198,7 @@ selftest_cfb_128 (void) const int blocksize = BLOCKSIZE; const int context_size = sizeof(RIJNDAEL_context); - return _gcry_selftest_helper_cfb_128("AES", &rijndael_setkey, + return _gcry_selftest_helper_cfb("AES", &rijndael_setkey, &rijndael_encrypt, &_gcry_aes_cfb_dec, nblocks, blocksize, context_size); } diff --git a/cipher/serpent.c b/cipher/serpent.c index 95ac7c1..c72951e 100644 --- a/cipher/serpent.c +++ b/cipher/serpent.c @@ -997,7 +997,7 @@ selftest_ctr_128 (void) const int blocksize = sizeof(serpent_block_t); const int context_size = sizeof(serpent_context_t); - return _gcry_selftest_helper_ctr_128("SERPENT", &serpent_setkey, + return _gcry_selftest_helper_ctr("SERPENT", &serpent_setkey, &serpent_encrypt, &_gcry_serpent_ctr_enc, nblocks, blocksize, context_size); } @@ -1012,7 +1012,7 @@ selftest_cbc_128 (void) const int blocksize = sizeof(serpent_block_t); const int context_size = sizeof(serpent_context_t); - return _gcry_selftest_helper_cbc_128("SERPENT", &serpent_setkey, + return _gcry_selftest_helper_cbc("SERPENT", &serpent_setkey, &serpent_encrypt, &_gcry_serpent_cbc_dec, nblocks, blocksize, context_size); } @@ -1027,7 +1027,7 @@ selftest_cfb_128 (void) const int blocksize = sizeof(serpent_block_t); const int context_size = sizeof(serpent_context_t); - return _gcry_selftest_helper_cfb_128("SERPENT", &serpent_setkey, + return _gcry_selftest_helper_cfb("SERPENT", &serpent_setkey, &serpent_encrypt, &_gcry_serpent_cfb_dec, nblocks, blocksize, context_size); } From jussi.kivilinna at iki.fi Fri May 24 11:43:29 2013 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Fri, 24 May 2013 12:43:29 +0300 Subject: [PATCH 2/2] cast5: add amd64 assembly implementation In-Reply-To: 
<20130524094324.15701.68533.stgit@localhost6.localdomain6> References: <20130524094324.15701.68533.stgit@localhost6.localdomain6> Message-ID: <20130524094329.15701.71759.stgit@localhost6.localdomain6> * cipher/Makefile.am: Add 'cast5-amd64.S'. * cipher/cast5-amd64.S: New file. * cipher/cast5.c (USE_AMD64_ASM): New macro. (_gcry_cast5_s1tos4): Merge arrays s1, s2, s3, s4 to single array to simplify access from assembly implementation. (s1, s2, s3, s4): New macros pointing to subarrays in _gcry_cast5_s1tos4. [USE_AMD64_ASM] (_gcry_cast5_amd64_encrypt_block) (_gcry_cast5_amd64_decrypt_block, _gcry_cast5_amd64_ctr_enc) (_gcry_cast5_amd64_cbc_dec, _gcry_cast5_amd64_cfb_dec): New prototypes. [USE_AMD64_ASM] (do_encrypt_block, do_decrypt_block, encrypt_block) (decrypt_block): New functions. (_gcry_cast5_ctr_enc, _gcry_cast5_cbc_dec, _gcry_cast5_cfb_dec) (selftest_ctr, selftest_cbc, selftest_cfb): New functions. (selftest): Call new bulk selftests. * cipher/cipher.c (gcry_cipher_open) [USE_CAST5]: Register CAST5 bulk functions for ctr-enc, cbc-dec and cfb-dec. * configure.ac (cast5) [x86_64]: Add 'cast5-amd64.lo'. * src/cipher.h (_gcry_cast5_ctr_enc, _gcry_cast5_cbc_dec) (gcry_cast5_cfb_dec): New prototypes. -- Provides non-parallel implementations for small speed-up and 4-way parallel implementations that gets accelerated on `out-of-order' CPUs. Speed old vs. new on AMD Phenom II X6 1055T: ECB/Stream CBC CFB OFB CTR --------------- --------------- --------------- --------------- --------------- CAST5 1.23x 1.22x 1.21x 2.86x 1.21x 2.83x 1.22x 1.17x 2.73x 2.73x Speed old vs. new on Intel Core i5-2450M (Sandy-Bridge): ECB/Stream CBC CFB OFB CTR --------------- --------------- --------------- --------------- --------------- CAST5 1.00x 1.04x 1.06x 2.56x 1.06x 2.37x 1.03x 1.01x 2.43x 2.41x Signed-off-by: Jussi Kivilinna --- cipher/Makefile.am | 2 cipher/cast5-amd64.S | 587 ++++++++++++++++++++++++++++++++++++++++++++++++++ cipher/cast5.c | 278 +++++++++++++++++++++++- cipher/cipher.c | 7 + configure.ac | 7 + src/cipher.h | 13 + 6 files changed, 885 insertions(+), 9 deletions(-) create mode 100644 cipher/cast5-amd64.S diff --git a/cipher/Makefile.am b/cipher/Makefile.am index 69f1e6d..1e2696f 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -54,7 +54,7 @@ rmd.h EXTRA_libcipher_la_SOURCES = \ arcfour.c \ blowfish.c \ -cast5.c \ +cast5.c cast5-amd64.S \ crc.c \ des.c \ dsa.c \ diff --git a/cipher/cast5-amd64.S b/cipher/cast5-amd64.S new file mode 100644 index 0000000..c3007d3 --- /dev/null +++ b/cipher/cast5-amd64.S @@ -0,0 +1,587 @@ +/* cast5-amd64.S - AMD64 assembly implementation of CAST5 cipher + * + * Copyright ? 2013 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . 
+ */ + +#ifdef __x86_64 +#include +#if defined(USE_CAST5) + +#ifdef __PIC__ +# define RIP %rip +# define GET_EXTERN_POINTER(name, reg) movq name at GOTPCREL(%rip), reg +#else +# define RIP +# define GET_EXTERN_POINTER(name, reg) leaq name, reg +#endif + +.text + +.extern _gcry_cast5_s1to4; + +#define s1 0 +#define s2 (s1 + (4 * 256)) +#define s3 (s2 + (4 * 256)) +#define s4 (s3 + (4 * 256)) + +/* structure of CAST5_context: */ +#define Km 0 +#define Kr (Km + (16 * 4)) + +/* register macros */ +#define CTX %rdi +#define RIO %rsi +#define RTAB %r8 + +#define RLR0 %r9 +#define RLR1 %r10 +#define RLR2 %r11 +#define RLR3 %r12 + +#define RLR0d %r9d +#define RLR1d %r10d +#define RLR2d %r11d +#define RLR3d %r12d + +#define RX0 %rax +#define RX1 %rbx +#define RX2 %rdx + +#define RX0d %eax +#define RX1d %ebx +#define RX2d %edx + +#define RX0bl %al +#define RX1bl %bl +#define RX2bl %dl + +#define RX0bh %ah +#define RX1bh %bh +#define RX2bh %dh + +#define RKR %rcx +#define RKRd %ecx +#define RKRbl %cl + +#define RT0 %rbp +#define RT1 %rsi + +#define RT0d %ebp +#define RT1d %esi + +#define RKM0d %r13d +#define RKM1d %r14d + +/*********************************************************************** + * 1-way cast5 + ***********************************************************************/ +#define dummy(x) + +#define shr_kr(none) \ + shrq $8, RKR; + +#define F(km, load_next_kr, op0, op1, op2, op3) \ + op0 ## l RLR0d, km ## d; \ + roll RKRbl, km ## d; \ + rorq $32, RLR0; \ + movzbl km ## bh, RT0d; \ + movzbl km ## bl, RT1d; \ + roll $16, km ## d; \ + movl s1(RTAB,RT0,4), RT0d; \ + op1 ## l s2(RTAB,RT1,4), RT0d; \ + load_next_kr(kr_next); \ + movzbl km ## bh, RT1d; \ + movzbl km ## bl, km ## d; \ + op2 ## l s3(RTAB,RT1,4), RT0d; \ + op3 ## l s4(RTAB,km,4), RT0d; \ + xorq RT0, RLR0; + +#define F1(km, load_next_kr) \ + F(##km, load_next_kr, add, xor, sub, add) +#define F2(km, load_next_kr) \ + F(##km, load_next_kr, xor, sub, add, xor) +#define F3(km, load_next_kr) \ + F(##km, load_next_kr, sub, add, xor, sub) + +#define get_round_km(n, km) \ + movl Km+4*(n)(CTX), km; + +#define get_round_kr_enc(n) \ + movq $0x1010101010101010, RKR; \ + \ + /* merge rorl rk and rorl $16 */ \ + xorq Kr+(n)(CTX), RKR; + +#define get_round_kr_dec(n) \ + movq $0x1010101010101010, RKR; \ + \ + /* merge rorl rk and rorl $16 */ \ + xorq Kr+(n - 7)(CTX), RKR; \ + bswapq RKR; + +#define round_enc(n, FA, FB, fn1, fn2) \ + get_round_km(n + 1, RX2d); \ + FA(RX0, fn1); \ + get_round_km(n + 2, RX0d); \ + FB(RX2, fn2); + +#define round_enc_last(n, FXA, FXB) \ + get_round_km(n + 1, RX2d); \ + \ + FXA(RX0, shr_kr); \ + FXB(RX2, dummy); + +#define round_enc_1(n, FA, FB) \ + round_enc(n, FA, FB, shr_kr, shr_kr) + +#define round_enc_2(n, FA, FB) \ + round_enc(n, FA, FB, shr_kr, dummy) + +#define round_dec(n, FA, FB, fn1, fn2) \ + get_round_km(n - 1, RX2d); \ + FA(RX0, fn1); \ + get_round_km(n - 2, RX0d); \ + FB(RX2, fn2); + +#define round_dec_last(n, FXA, FXB) \ + get_round_km(n - 1, RX2d); \ + FXA(RX0, shr_kr); \ + FXB(RX2, dummy); + +#define round_dec_1(n, FA, FB) \ + round_dec(n, FA, FB, shr_kr, shr_kr) + +#define round_dec_2(n, FA, FB) \ + round_dec(n, FA, FB, shr_kr, dummy) + +#define read_block() \ + movq (RIO), RLR0; \ + bswapq RLR0; + +#define write_block() \ + bswapq RLR0; \ + rorq $32, RLR0; \ + movq RLR0, (RIO); + +.align 8 +.global _gcry_cast5_amd64_encrypt_block +.type _gcry_cast5_amd64_encrypt_block, at function; + +_gcry_cast5_amd64_encrypt_block: + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + */ + pushq %rbp; + pushq 
%rbx; + + movq %rsi, %r10; + + GET_EXTERN_POINTER(_gcry_cast5_s1to4, RTAB); + + movq %rdx, RIO; + read_block(); + + get_round_km(0, RX0d); + get_round_kr_enc(0); + round_enc_1(0, F1, F2); + round_enc_1(2, F3, F1); + round_enc_1(4, F2, F3); + round_enc_2(6, F1, F2); + get_round_kr_enc(8); + round_enc_1(8, F3, F1); + round_enc_1(10, F2, F3); + round_enc_1(12, F1, F2); + round_enc_last(14, F3, F1); + + movq %r10, RIO; + write_block(); + + popq %rbx; + popq %rbp; + ret; +.size _gcry_cast5_amd64_encrypt_block,.-_gcry_cast5_amd64_encrypt_block; + +.align 8 +.global _gcry_cast5_amd64_decrypt_block +.type _gcry_cast5_amd64_decrypt_block, at function; + +_gcry_cast5_amd64_decrypt_block: + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + */ + pushq %rbp; + pushq %rbx; + + movq %rsi, %r10; + + GET_EXTERN_POINTER(_gcry_cast5_s1to4, RTAB); + + movq %rdx, RIO; + read_block(); + + get_round_km(15, RX0d); + get_round_kr_dec(15); + round_dec_1(15, F1, F3); + round_dec_1(13, F2, F1); + round_dec_1(11, F3, F2); + round_dec_2(9, F1, F3); + get_round_kr_dec(7); + round_dec_1(7, F2, F1); + round_dec_1(5, F3, F2); + round_dec_1(3, F1, F3); + round_dec_last(1, F2, F1); + + movq %r10, RIO; + write_block(); + + popq %rbx; + popq %rbp; + ret; +.size _gcry_cast5_amd64_decrypt_block,.-_gcry_cast5_amd64_decrypt_block; + +/********************************************************************** + 4-way cast5, four blocks parallel + **********************************************************************/ +#define F_tail(rlr, rx, op1, op2, op3) \ + movzbl rx ## bh, RT0d; \ + movzbl rx ## bl, RT1d; \ + roll $16, rx ## d; \ + movl s1(RTAB,RT0,4), RT0d; \ + op1 ## l s2(RTAB,RT1,4), RT0d; \ + movzbl rx ## bh, RT1d; \ + movzbl rx ## bl, rx ## d; \ + op2 ## l s3(RTAB,RT1,4), RT0d; \ + op3 ## l s4(RTAB,rx,4), RT0d; \ + xorq RT0, rlr; + +#define F4(km, load_next_kr, op0, op1, op2, op3) \ + movl km, RX0d; \ + op0 ## l RLR0d, RX0d; \ + roll RKRbl, RX0d; \ + rorq $32, RLR0; \ + \ + movl km, RX1d; \ + op0 ## l RLR1d, RX1d; \ + roll RKRbl, RX1d; \ + rorq $32, RLR1; \ + \ + movl km, RX2d; \ + op0 ## l RLR2d, RX2d; \ + roll RKRbl, RX2d; \ + rorq $32, RLR2; \ + \ + F_tail(RLR0, RX0, op1, op2, op3); \ + F_tail(RLR1, RX1, op1, op2, op3); \ + F_tail(RLR2, RX2, op1, op2, op3); \ + \ + movl km, RX0d; \ + op0 ## l RLR3d, RX0d; \ + roll RKRbl, RX0d; \ + load_next_kr(); \ + rorq $32, RLR3; \ + \ + F_tail(RLR3, RX0, op1, op2, op3); + +#define F4_1(km, load_next_kr) \ + F4(km, load_next_kr, add, xor, sub, add) +#define F4_2(km, load_next_kr) \ + F4(km, load_next_kr, xor, sub, add, xor) +#define F4_3(km, load_next_kr) \ + F4(km, load_next_kr, sub, add, xor, sub) + +#define round_enc4(n, FA, FB, fn1, fn2) \ + get_round_km(n + 1, RKM1d); \ + FA(RKM0d, fn1); \ + get_round_km(n + 2, RKM0d); \ + FB(RKM1d, fn2); + +#define round_enc_last4(n, FXA, FXB) \ + get_round_km(n + 1, RKM1d); \ + FXA(RKM0d, shr_kr); \ + FXB(RKM1d, dummy); + +#define round_enc4_1(n, FA, FB) \ + round_enc4(n, FA, FB, shr_kr, shr_kr); + +#define round_enc4_2(n, FA, FB) \ + round_enc4(n, FA, FB, shr_kr, dummy); + +#define round_dec4(n, FA, FB, fn1, fn2) \ + get_round_km(n - 1, RKM1d); \ + FA(RKM0d, fn1); \ + get_round_km(n - 2, RKM0d); \ + FB(RKM1d, fn2); + +#define round_dec_last4(n, FXA, FXB) \ + get_round_km(n - 1, RKM1d); \ + FXA(RKM0d, shr_kr); \ + FXB(RKM1d, dummy); + +#define round_dec4_1(n, FA, FB) \ + round_dec4(n, FA, FB, shr_kr, shr_kr); + +#define round_dec4_2(n, FA, FB) \ + round_dec4(n, FA, FB, shr_kr, dummy); + +#define inbswap_block4(a, b, c, d) \ + bswapq a; \ 
+ bswapq b; \ + bswapq c; \ + bswapq d; + +#define outbswap_block4(a, b, c, d) \ + bswapq a; \ + bswapq b; \ + bswapq c; \ + bswapq d; \ + rorq $32, a; \ + rorq $32, b; \ + rorq $32, c; \ + rorq $32, d; + +.align 8 +.type __cast5_enc_blk4, at function; + +__cast5_enc_blk4: + /* input: + * %rdi: ctx, CTX + * RLR0,RLR1,RLR2,RLR3: four input plaintext blocks + * output: + * RLR0,RLR1,RLR2,RLR3: four output ciphertext blocks + */ + GET_EXTERN_POINTER(_gcry_cast5_s1to4, RTAB); + + get_round_km(0, RKM0d); + get_round_kr_enc(0); + round_enc4_1(0, F4_1, F4_2); + round_enc4_1(2, F4_3, F4_1); + round_enc4_1(4, F4_2, F4_3); + round_enc4_2(6, F4_1, F4_2); + get_round_kr_enc(8); + round_enc4_1(8, F4_3, F4_1); + round_enc4_1(10, F4_2, F4_3); + round_enc4_1(12, F4_1, F4_2); + round_enc_last4(14, F4_3, F4_1); + + outbswap_block4(RLR0, RLR1, RLR2, RLR3); + ret; +.size __cast5_enc_blk4,.-__cast5_enc_blk4; + +.align 8 +.type __cast5_dec_blk4, at function; + +__cast5_dec_blk4: + /* input: + * %rdi: ctx, CTX + * RLR0,RLR1,RLR2,RLR3: four input ciphertext blocks + * output: + * RLR0,RLR1,RLR2,RLR3: four output plaintext blocks + */ + GET_EXTERN_POINTER(_gcry_cast5_s1to4, RTAB); + + inbswap_block4(RLR0, RLR1, RLR2, RLR3); + + get_round_km(15, RKM0d); + get_round_kr_dec(15); + round_dec4_1(15, F4_1, F4_3); + round_dec4_1(13, F4_2, F4_1); + round_dec4_1(11, F4_3, F4_2); + round_dec4_2(9, F4_1, F4_3); + get_round_kr_dec(7); + round_dec4_1(7, F4_2, F4_1); + round_dec4_1(5, F4_3, F4_2); + round_dec4_1(3, F4_1, F4_3); + round_dec_last4(1, F4_2, F4_1); + + outbswap_block4(RLR0, RLR1, RLR2, RLR3); + ret; +.size __cast5_dec_blk4,.-__cast5_dec_blk4; + +.align 8 +.global _gcry_cast5_amd64_ctr_enc +.type _gcry_cast5_amd64_ctr_enc, at function; +_gcry_cast5_amd64_ctr_enc: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (8 blocks) + * %rdx: src (8 blocks) + * %rcx: iv (big endian, 64bit) + */ + + pushq %rbp; + pushq %rbx; + pushq %r12; + pushq %r13; + pushq %r14; + + pushq %rsi; + pushq %rdx; + + /* load IV and byteswap */ + movq (%rcx), RX0; + bswapq RX0; + movq RX0, RLR0; + + /* construct IVs */ + leaq 1(RX0), RLR1; + leaq 2(RX0), RLR2; + leaq 3(RX0), RLR3; + leaq 4(RX0), RX0; + bswapq RX0; + + /* store new IV */ + movq RX0, (%rcx); + + call __cast5_enc_blk4; + + popq %r14; /*src*/ + popq %r13; /*dst*/ + + /* XOR key-stream with plaintext */ + xorq 0 * 8(%r14), RLR0; + xorq 1 * 8(%r14), RLR1; + xorq 2 * 8(%r14), RLR2; + xorq 3 * 8(%r14), RLR3; + movq RLR0, 0 * 8(%r13); + movq RLR1, 1 * 8(%r13); + movq RLR2, 2 * 8(%r13); + movq RLR3, 3 * 8(%r13); + + popq %r14; + popq %r13; + popq %r12; + popq %rbx; + popq %rbp; + ret +.size _gcry_cast5_amd64_ctr_enc,.-_gcry_cast5_amd64_ctr_enc; + +.align 8 +.global _gcry_cast5_amd64_cbc_dec +.type _gcry_cast5_amd64_cbc_dec, at function; +_gcry_cast5_amd64_cbc_dec: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (8 blocks) + * %rdx: src (8 blocks) + * %rcx: iv (64bit) + */ + + pushq %rbp; + pushq %rbx; + pushq %r12; + pushq %r13; + pushq %r14; + + pushq %rcx; + pushq %rsi; + pushq %rdx; + + /* load input */ + movq 0 * 8(%rdx), RLR0; + movq 1 * 8(%rdx), RLR1; + movq 2 * 8(%rdx), RLR2; + movq 3 * 8(%rdx), RLR3; + + call __cast5_dec_blk4; + + popq RX0; /*src*/ + popq RX1; /*dst*/ + popq RX2; /*iv*/ + + movq 3 * 8(RX0), %r14; + xorq (RX2), RLR0; + xorq 0 * 8(RX0), RLR1; + xorq 1 * 8(RX0), RLR2; + xorq 2 * 8(RX0), RLR3; + movq %r14, (RX2); /* store new IV */ + + movq RLR0, 0 * 8(RX1); + movq RLR1, 1 * 8(RX1); + movq RLR2, 2 * 8(RX1); + movq RLR3, 3 * 8(RX1); + + popq %r14; + popq %r13; + popq %r12; + 
popq %rbx; + popq %rbp; + ret; + +.size _gcry_cast5_amd64_cbc_dec,.-_gcry_cast5_amd64_cbc_dec; + +.align 8 +.global _gcry_cast5_amd64_cfb_dec +.type _gcry_cast5_amd64_cfb_dec, at function; +_gcry_cast5_amd64_cfb_dec: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (8 blocks) + * %rdx: src (8 blocks) + * %rcx: iv (64bit) + */ + + pushq %rbp; + pushq %rbx; + pushq %r12; + pushq %r13; + pushq %r14; + + pushq %rsi; + pushq %rdx; + + /* Load input */ + movq (%rcx), RLR0; + movq 0 * 8(%rdx), RLR1; + movq 1 * 8(%rdx), RLR2; + movq 2 * 8(%rdx), RLR3; + + inbswap_block4(RLR0, RLR1, RLR2, RLR3); + + /* Update IV */ + movq 3 * 8(%rdx), %rdx; + movq %rdx, (%rcx); + + call __cast5_enc_blk4; + + popq %rdx; /*src*/ + popq %rcx; /*dst*/ + + xorq 0 * 8(%rdx), RLR0; + xorq 1 * 8(%rdx), RLR1; + xorq 2 * 8(%rdx), RLR2; + xorq 3 * 8(%rdx), RLR3; + movq RLR0, 0 * 8(%rcx); + movq RLR1, 1 * 8(%rcx); + movq RLR2, 2 * 8(%rcx); + movq RLR3, 3 * 8(%rcx); + + popq %r14; + popq %r13; + popq %r12; + popq %rbx; + popq %rbp; + ret; + +.size _gcry_cast5_amd64_cfb_dec,.-_gcry_cast5_amd64_cfb_dec; + +#endif /*defined(USE_CAST5)*/ +#endif /*__x86_64*/ diff --git a/cipher/cast5.c b/cipher/cast5.c index 9905f5c..e3d7bc5 100644 --- a/cipher/cast5.c +++ b/cipher/cast5.c @@ -42,6 +42,14 @@ #include "g10lib.h" #include "types.h" #include "cipher.h" +#include "bufhelp.h" +#include "cipher-selftest.h" + +/* USE_AMD64_ASM indicates whether to use AMD64 assembly code. */ +#undef USE_AMD64_ASM +#if defined(__x86_64__) +# define USE_AMD64_ASM 1 +#endif #define CAST5_BLOCKSIZE 8 @@ -56,8 +64,12 @@ static void decrypt_block (void *c, byte *outbuf, const byte *inbuf); +#define s1 _gcry_cast5_s1to4[0] +#define s2 _gcry_cast5_s1to4[1] +#define s3 _gcry_cast5_s1to4[2] +#define s4 _gcry_cast5_s1to4[3] -static const u32 s1[256] = { +const u32 _gcry_cast5_s1to4[4][256] = { { 0x30fb40d4, 0x9fa0ff0b, 0x6beccd2f, 0x3f258c7a, 0x1e213f2f, 0x9c004dd3, 0x6003e540, 0xcf9fc949, 0xbfd4af27, 0x88bbbdb5, 0xe2034090, 0x98d09675, 0x6e63a0e0, 0x15c361d2, 0xc2e7661d, 0x22d4ff8e, 0x28683b6f, 0xc07fd059, 0xff2379c8, 0x775f50e2, 0x43c340d3, 0xdf2f8656, 0x887ca41a, 0xa2d2bd2d, @@ -90,8 +102,7 @@ static const u32 s1[256] = { 0x474d6ad7, 0x7c0c5e5c, 0xd1231959, 0x381b7298, 0xf5d2f4db, 0xab838653, 0x6e2f1e23, 0x83719c9e, 0xbd91e046, 0x9a56456e, 0xdc39200c, 0x20c8c571, 0x962bda1c, 0xe1e696ff, 0xb141ab08, 0x7cca89b9, 0x1a69e783, 0x02cc4843, 0xa2f7c579, 0x429ef47d, 0x427b169c, 0x5ac9f049, 0xdd8f0f00, 0x5c8165bf -}; -static const u32 s2[256] = { +}, { 0x1f201094, 0xef0ba75b, 0x69e3cf7e, 0x393f4380, 0xfe61cf7a, 0xeec5207a, 0x55889c94, 0x72fc0651, 0xada7ef79, 0x4e1d7235, 0xd55a63ce, 0xde0436ba, 0x99c430ef, 0x5f0c0794, 0x18dcdb7d, 0xa1d6eff3, 0xa0b52f7b, 0x59e83605, 0xee15b094, 0xe9ffd909, 0xdc440086, 0xef944459, 0xba83ccb3, 0xe0c3cdfb, @@ -124,8 +135,7 @@ static const u32 s2[256] = { 0xb284600c, 0xd835731d, 0xdcb1c647, 0xac4c56ea, 0x3ebd81b3, 0x230eabb0, 0x6438bc87, 0xf0b5b1fa, 0x8f5ea2b3, 0xfc184642, 0x0a036b7a, 0x4fb089bd, 0x649da589, 0xa345415e, 0x5c038323, 0x3e5d3bb9, 0x43d79572, 0x7e6dd07c, 0x06dfdf1e, 0x6c6cc4ef, 0x7160a539, 0x73bfbe70, 0x83877605, 0x4523ecf1 -}; -static const u32 s3[256] = { +}, { 0x8defc240, 0x25fa5d9f, 0xeb903dbf, 0xe810c907, 0x47607fff, 0x369fe44b, 0x8c1fc644, 0xaececa90, 0xbeb1f9bf, 0xeefbcaea, 0xe8cf1950, 0x51df07ae, 0x920e8806, 0xf0ad0548, 0xe13c8d83, 0x927010d5, 0x11107d9f, 0x07647db9, 0xb2e3e4d4, 0x3d4f285e, 0xb9afa820, 0xfade82e0, 0xa067268b, 0x8272792e, @@ -158,8 +168,7 @@ static const u32 s3[256] = { 0x5727c148, 0x2be98a1d, 0x8ab41738, 
0x20e1be24, 0xaf96da0f, 0x68458425, 0x99833be5, 0x600d457d, 0x282f9350, 0x8334b362, 0xd91d1120, 0x2b6d8da0, 0x642b1e31, 0x9c305a00, 0x52bce688, 0x1b03588a, 0xf7baefd5, 0x4142ed9c, 0xa4315c11, 0x83323ec5, 0xdfef4636, 0xa133c501, 0xe9d3531c, 0xee353783 -}; -static const u32 s4[256] = { +}, { 0x9db30420, 0x1fb6e9de, 0xa7be7bef, 0xd273a298, 0x4a4f7bdb, 0x64ad8c57, 0x85510443, 0xfa020ed1, 0x7e287aff, 0xe60fb663, 0x095f35a1, 0x79ebf120, 0xfd059d43, 0x6497b7b1, 0xf3641f63, 0x241e4adf, 0x28147f5f, 0x4fa2b8cd, 0xc9430040, 0x0cc32220, 0xfdd30b30, 0xc0a5374f, 0x1d2d00d9, 0x24147b15, @@ -192,7 +201,7 @@ static const u32 s4[256] = { 0xb5676e69, 0x9bd3ddda, 0xdf7e052f, 0xdb25701c, 0x1b5e51ee, 0xf65324e6, 0x6afce36c, 0x0316cc04, 0x8644213e, 0xb7dc59d0, 0x7965291f, 0xccd6fd43, 0x41823979, 0x932bcdf6, 0xb657c34d, 0x4edfd282, 0x7ae5290c, 0x3cb9536b, 0x851e20fe, 0x9833557e, 0x13ecf0b0, 0xd3ffb372, 0x3f85c5c1, 0x0aef7ed2 -}; +} }; static const u32 s5[256] = { 0x7ec90c04, 0x2c6e74b9, 0x9b0e66df, 0xa6337911, 0xb86a7fff, 0x1dd358f5, 0x44dd9d44, 0x1731167f, 0x08fbf1fa, 0xe7f511cc, 0xd2051b00, 0x735aba00, 0x2ab722d8, 0x386381cb, 0xacf6243a, 0x69befd7a, @@ -331,6 +340,53 @@ static const u32 s8[256] = { }; +#ifdef USE_AMD64_ASM + +/* Assembly implementations of CAST5. */ +extern void _gcry_cast5_amd64_encrypt_block(CAST5_context *c, byte *outbuf, + const byte *inbuf); + +extern void _gcry_cast5_amd64_decrypt_block(CAST5_context *c, byte *outbuf, + const byte *inbuf); + +/* These assembly implementations process four blocks in parallel. */ +extern void _gcry_cast5_amd64_ctr_enc(CAST5_context *ctx, byte *out, + const byte *in, byte *ctr); + +extern void _gcry_cast5_amd64_cbc_dec(CAST5_context *ctx, byte *out, + const byte *in, byte *iv); + +extern void _gcry_cast5_amd64_cfb_dec(CAST5_context *ctx, byte *out, + const byte *in, byte *iv); + +static void +do_encrypt_block (CAST5_context *context, byte *outbuf, const byte *inbuf) +{ + _gcry_cast5_amd64_encrypt_block (context, outbuf, inbuf); +} + +static void +do_decrypt_block (CAST5_context *context, byte *outbuf, const byte *inbuf) +{ + _gcry_cast5_amd64_decrypt_block (context, outbuf, inbuf); +} + +static void encrypt_block (void *context , byte *outbuf, const byte *inbuf) +{ + CAST5_context *c = (CAST5_context *) context; + do_encrypt_block (c, outbuf, inbuf); + _gcry_burn_stack (2*8); +} + +static void decrypt_block (void *context, byte *outbuf, const byte *inbuf) +{ + CAST5_context *c = (CAST5_context *) context; + _gcry_cast5_amd64_decrypt_block (c, outbuf, inbuf); + _gcry_burn_stack (2*8); +} + +#else /*USE_AMD64_ASM*/ + #if defined(__GNUC__) && defined(__i386__) static inline u32 rol(int n, u32 x) @@ -463,6 +519,201 @@ decrypt_block (void *context, byte *outbuf, const byte *inbuf) _gcry_burn_stack (20+4*sizeof(void*)); } +#endif /*!USE_AMD64_ASM*/ + + +/* Bulk encryption of complete blocks in CTR mode. This function is only + intended for the bulk encryption feature of cipher.c. CTR is expected to be + of size CAST5_BLOCKSIZE. */ +void +_gcry_cast5_ctr_enc(void *context, unsigned char *ctr, void *outbuf_arg, + const void *inbuf_arg, unsigned int nblocks) +{ + CAST5_context *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + unsigned char tmpbuf[CAST5_BLOCKSIZE]; + int burn_stack_depth = (20 + 4 * sizeof(void*)) + 2 * CAST5_BLOCKSIZE; + + int i; + +#ifdef USE_AMD64_ASM + { + if (nblocks >= 4) + burn_stack_depth += 8 * sizeof(void*); + + /* Process data in 4 block chunks. 
*/ + while (nblocks >= 4) + { + _gcry_cast5_amd64_ctr_enc(ctx, outbuf, inbuf, ctr); + + nblocks -= 4; + outbuf += 4 * CAST5_BLOCKSIZE; + inbuf += 4 * CAST5_BLOCKSIZE; + } + + /* Use generic code to handle smaller chunks... */ + /* TODO: use caching instead? */ + } +#endif + + for ( ;nblocks; nblocks-- ) + { + /* Encrypt the counter. */ + do_encrypt_block(ctx, tmpbuf, ctr); + /* XOR the input with the encrypted counter and store in output. */ + buf_xor(outbuf, tmpbuf, inbuf, CAST5_BLOCKSIZE); + outbuf += CAST5_BLOCKSIZE; + inbuf += CAST5_BLOCKSIZE; + /* Increment the counter. */ + for (i = CAST5_BLOCKSIZE; i > 0; i--) + { + ctr[i-1]++; + if (ctr[i-1]) + break; + } + } + + wipememory(tmpbuf, sizeof(tmpbuf)); + _gcry_burn_stack(burn_stack_depth); +} + + +/* Bulk decryption of complete blocks in CBC mode. This function is only + intended for the bulk encryption feature of cipher.c. */ +void +_gcry_cast5_cbc_dec(void *context, unsigned char *iv, void *outbuf_arg, + const void *inbuf_arg, unsigned int nblocks) +{ + CAST5_context *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + unsigned char savebuf[CAST5_BLOCKSIZE]; + int burn_stack_depth = (20 + 4 * sizeof(void*)) + 2 * CAST5_BLOCKSIZE; + +#ifdef USE_AMD64_ASM + { + if (nblocks >= 4) + burn_stack_depth += 8 * sizeof(void*); + + /* Process data in 4 block chunks. */ + while (nblocks >= 4) + { + _gcry_cast5_amd64_cbc_dec(ctx, outbuf, inbuf, iv); + + nblocks -= 4; + outbuf += 4 * CAST5_BLOCKSIZE; + inbuf += 4 * CAST5_BLOCKSIZE; + } + + /* Use generic code to handle smaller chunks... */ + } +#endif + + for ( ;nblocks; nblocks-- ) + { + /* We need to save INBUF away because it may be identical to + OUTBUF. */ + memcpy(savebuf, inbuf, CAST5_BLOCKSIZE); + + do_decrypt_block (ctx, outbuf, inbuf); + + buf_xor(outbuf, outbuf, iv, CAST5_BLOCKSIZE); + memcpy(iv, savebuf, CAST5_BLOCKSIZE); + inbuf += CAST5_BLOCKSIZE; + outbuf += CAST5_BLOCKSIZE; + } + + wipememory(savebuf, sizeof(savebuf)); + _gcry_burn_stack(burn_stack_depth); +} + +/* Bulk decryption of complete blocks in CFB mode. This function is only + intended for the bulk encryption feature of cipher.c. */ +void +_gcry_cast5_cfb_dec(void *context, unsigned char *iv, void *outbuf_arg, + const void *inbuf_arg, unsigned int nblocks) +{ + CAST5_context *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + int burn_stack_depth = (20 + 4 * sizeof(void*)) + 2 * CAST5_BLOCKSIZE; + +#ifdef USE_AMD64_ASM + { + if (nblocks >= 4) + burn_stack_depth += 8 * sizeof(void*); + + /* Process data in 4 block chunks. */ + while (nblocks >= 4) + { + _gcry_cast5_amd64_cfb_dec(ctx, outbuf, inbuf, iv); + + nblocks -= 4; + outbuf += 4 * CAST5_BLOCKSIZE; + inbuf += 4 * CAST5_BLOCKSIZE; + } + + /* Use generic code to handle smaller chunks... */ + } +#endif + + for ( ;nblocks; nblocks-- ) + { + do_encrypt_block(ctx, iv, iv); + buf_xor_n_copy(outbuf, iv, inbuf, CAST5_BLOCKSIZE); + outbuf += CAST5_BLOCKSIZE; + inbuf += CAST5_BLOCKSIZE; + } + + _gcry_burn_stack(burn_stack_depth); +} + + +/* Run the self-tests for CAST5-CTR, tests IV increment of bulk CTR + encryption. Returns NULL on success. 
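With the bulk hooks registered in cipher.c (later in this patch), callers reach the 4-way CTR code automatically through the ordinary libgcrypt API; the assembly kicks in inside _gcry_cast5_ctr_enc once at least four full blocks remain. A minimal usage sketch, with a placeholder key and counter, no error checking, and an illustrative wrapper name:

  #include <gcrypt.h>

  void
  encrypt_with_cast5_ctr (unsigned char *buf, size_t len)  /* len: multiple of 8 */
  {
    gcry_cipher_hd_t hd;
    unsigned char key[16] = { 0 };   /* placeholder 128-bit key */
    unsigned char ctr[8]  = { 0 };   /* CAST5 block size is 8 bytes */

    gcry_cipher_open (&hd, GCRY_CIPHER_CAST5, GCRY_CIPHER_MODE_CTR, 0);
    gcry_cipher_setkey (hd, key, sizeof key);
    gcry_cipher_setctr (hd, ctr, sizeof ctr);
    gcry_cipher_encrypt (hd, buf, len, NULL, 0);  /* in-place encryption */
    gcry_cipher_close (hd);
  }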
*/ +static const char * +selftest_ctr (void) +{ + const int nblocks = 4+1; + const int blocksize = CAST5_BLOCKSIZE; + const int context_size = sizeof(CAST5_context); + + return _gcry_selftest_helper_ctr("CAST5", &cast_setkey, + &encrypt_block, &_gcry_cast5_ctr_enc, nblocks, blocksize, + context_size); +} + + +/* Run the self-tests for CAST5-CBC, tests bulk CBC decryption. + Returns NULL on success. */ +static const char * +selftest_cbc (void) +{ + const int nblocks = 4+2; + const int blocksize = CAST5_BLOCKSIZE; + const int context_size = sizeof(CAST5_context); + + return _gcry_selftest_helper_cbc("CAST5", &cast_setkey, + &encrypt_block, &_gcry_cast5_cbc_dec, nblocks, blocksize, + context_size); +} + + +/* Run the self-tests for CAST5-CFB, tests bulk CBC decryption. + Returns NULL on success. */ +static const char * +selftest_cfb (void) +{ + const int nblocks = 4+2; + const int blocksize = CAST5_BLOCKSIZE; + const int context_size = sizeof(CAST5_context); + + return _gcry_selftest_helper_cfb("CAST5", &cast_setkey, + &encrypt_block, &_gcry_cast5_cfb_dec, nblocks, blocksize, + context_size); +} + static const char* selftest(void) @@ -473,6 +724,7 @@ selftest(void) byte plain[8] = { 0x01, 0x23, 0x45, 0x67, 0x89, 0xAB, 0xCD, 0xEF }; byte cipher[8]= { 0x23, 0x8B, 0x4F, 0xE5, 0x84, 0x7E, 0x44, 0xB2 }; byte buffer[8]; + const char *r; cast_setkey( &c, key, 16 ); encrypt_block( &c, buffer, plain ); @@ -507,6 +759,16 @@ selftest(void) } #endif + + if ( (r = selftest_cbc ()) ) + return r; + + if ( (r = selftest_cfb ()) ) + return r; + + if ( (r = selftest_ctr ()) ) + return r; + return NULL; } diff --git a/cipher/cipher.c b/cipher/cipher.c index 652d795..79ca755 100644 --- a/cipher/cipher.c +++ b/cipher/cipher.c @@ -718,6 +718,13 @@ gcry_cipher_open (gcry_cipher_hd_t *handle, h->bulk.ctr_enc = _gcry_aes_ctr_enc; break; #endif /*USE_AES*/ +#ifdef USE_CAST5 + case GCRY_CIPHER_CAST5: + h->bulk.cfb_dec = _gcry_cast5_cfb_dec; + h->bulk.cbc_dec = _gcry_cast5_cbc_dec; + h->bulk.ctr_enc = _gcry_cast5_ctr_enc; + break; +#endif /*USE_CAMELLIA*/ #ifdef USE_CAMELLIA case GCRY_CIPHER_CAMELLIA128: case GCRY_CIPHER_CAMELLIA192: diff --git a/configure.ac b/configure.ac index 3fec8bc..113c71f 100644 --- a/configure.ac +++ b/configure.ac @@ -1195,6 +1195,13 @@ LIST_MEMBER(cast5, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS cast5.lo" AC_DEFINE(USE_CAST5, 1, [Defined if this module should be included]) + + case "${host}" in + x86_64-*-*) + # Build with the assembly implementation + GCRYPT_CIPHERS="$GCRYPT_CIPHERS cast5-amd64.lo" + ;; + esac fi LIST_MEMBER(des, $enabled_ciphers) diff --git a/src/cipher.h b/src/cipher.h index 9d6cc01..1742003 100644 --- a/src/cipher.h +++ b/src/cipher.h @@ -95,6 +95,19 @@ void _gcry_aes_ctr_enc (void *context, unsigned char *ctr, void *outbuf_arg, const void *inbuf_arg, unsigned int nblocks); +/*-- cast5.c --*/ +void _gcry_cast5_cfb_dec (void *context, unsigned char *iv, + void *outbuf_arg, const void *inbuf_arg, + unsigned int nblocks); + +void _gcry_cast5_cbc_dec (void *context, unsigned char *iv, + void *outbuf_arg, const void *inbuf_arg, + unsigned int nblocks); + +void _gcry_cast5_ctr_enc (void *context, unsigned char *ctr, + void *outbuf_arg, const void *inbuf_arg, + unsigned int nblocks); + /*-- camellia-glue.c --*/ void _gcry_camellia_ctr_enc (void *context, unsigned char *ctr, void *outbuf_arg, const void *inbuf_arg, From wk at gnupg.org Fri May 24 14:10:53 2013 From: wk at gnupg.org (Werner Koch) Date: Fri, 24 May 2013 14:10:53 +0200 Subject: 
[PATCH 2/2] cast5: add amd64 assembly implementation In-Reply-To: <20130524094329.15701.71759.stgit@localhost6.localdomain6> (Jussi Kivilinna's message of "Fri, 24 May 2013 12:43:29 +0300") References: <20130524094324.15701.68533.stgit@localhost6.localdomain6> <20130524094329.15701.71759.stgit@localhost6.localdomain6> Message-ID: <87li744lyq.fsf@vigenere.g10code.de> > Provides non-parallel implementations for small speed-up and 4-way parallel > implementations that gets accelerated on `out-of-order' CPUs. Both pushed. [Do you think anyone is using CAST5 in CTR mode ?] Shalom-Salam, Werner -- Die Gedanken sind frei. Ausnahmen regelt ein Bundesgesetz. From jussi.kivilinna at iki.fi Fri May 24 14:50:25 2013 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Fri, 24 May 2013 15:50:25 +0300 Subject: [PATCH 2/2] cast5: add amd64 assembly implementation In-Reply-To: <87li744lyq.fsf@vigenere.g10code.de> References: <20130524094324.15701.68533.stgit@localhost6.localdomain6> <20130524094329.15701.71759.stgit@localhost6.localdomain6> <87li744lyq.fsf@vigenere.g10code.de> Message-ID: <519F6211.7020401@iki.fi> On 24.05.2013 15:10, Werner Koch wrote: > >> Provides non-parallel implementations for small speed-up and 4-way parallel >> implementations that gets accelerated on `out-of-order' CPUs. > > Both pushed. [Do you think anyone is using CAST5 in CTR mode ?] > Thanks. At least RFC4344 describes cast128-ctr for ssh. -Jussi > > Shalom-Salam, > > Werner > -------------- next part -------------- A non-text attachment was scrubbed... Name: signature.asc Type: application/pgp-signature Size: 730 bytes Desc: OpenPGP digital signature URL: From jussi.kivilinna at iki.fi Wed May 29 15:48:09 2013 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Wed, 29 May 2013 16:48:09 +0300 Subject: [PATCH 1/2] blowfish: add amd64 assembly implementation Message-ID: <20130529134809.17165.30961.stgit@localhost6.localdomain6> * cipher/Makefile.am: Add 'blowfish-amd64.S'. * cipher/blowfish-amd64.S: New file. * cipher/blowfish.c (USE_AMD64_ASM): New macro. [USE_AMD64_ASM] (_gcry_blowfish_amd64_do_encrypt) (_gcry_blowfish_amd64_encrypt_block) (_gcry_blowfish_amd64_decrypt_block, _gcry_blowfish_amd64_ctr_enc) (_gcry_blowfish_amd64_cbc_dec, _gcry_blowfish_amd64_cfb_dec): New prototypes. [USE_AMD64_ASM] (do_encrypt, do_encrypt_block, do_decrypt_block) (encrypt_block, decrypt_block): New functions. (_gcry_blowfish_ctr_enc, _gcry_blowfish_cbc_dec) (_gcry_blowfish_cfb_dec, selftest_ctr, selftest_cbc, selftest_cfb): New functions. (selftest): Call new bulk selftests. * cipher/cipher.c (gcry_cipher_open) [USE_BLOWFISH]: Register Blowfish bulk functions for ctr-enc, cbc-dec and cfb-dec. * configure.ac (blowfish) [x86_64]: Add 'blowfish-amd64.lo'. * src/cipher.h (_gcry_blowfish_ctr_enc, _gcry_blowfish_cbc_dec) (gcry_blowfish_cfb_dec): New prototypes. -- Add non-parallel functions for small speed-up and 4-way parallel functions for modes of operation that support parallel processing. Speed old vs. new on AMD Phenom II X6 1055T: ECB/Stream CBC CFB OFB CTR --------------- --------------- --------------- --------------- --------------- BLOWFISH 1.21x 1.12x 1.17x 3.52x 1.18x 3.34x 1.16x 1.15x 3.38x 3.47x Speed old vs. 
new on Intel Core i5-2450M (Sandy-Bridge): ECB/Stream CBC CFB OFB CTR --------------- --------------- --------------- --------------- --------------- BLOWFISH 1.16x 1.10x 1.17x 2.98x 1.18x 2.88x 1.16x 1.15x 3.00x 3.02x Signed-off-by: Jussi Kivilinna --- cipher/Makefile.am | 2 cipher/blowfish-amd64.S | 533 +++++++++++++++++++++++++++++++++++++++++++++++ cipher/blowfish.c | 271 ++++++++++++++++++++++++ cipher/cipher.c | 7 + configure.ac | 7 + src/cipher.h | 13 + 6 files changed, 832 insertions(+), 1 deletion(-) create mode 100644 cipher/blowfish-amd64.S diff --git a/cipher/Makefile.am b/cipher/Makefile.am index 687c599..c0a7593 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -54,7 +54,7 @@ rmd.h EXTRA_libcipher_la_SOURCES = \ arcfour.c \ -blowfish.c \ +blowfish.c blowfish-amd64.S \ cast5.c cast5-amd64.S \ crc.c \ des.c \ diff --git a/cipher/blowfish-amd64.S b/cipher/blowfish-amd64.S new file mode 100644 index 0000000..1008387 --- /dev/null +++ b/cipher/blowfish-amd64.S @@ -0,0 +1,533 @@ +/* blowfish-amd64.S - AMD64 assembly implementation of Blowfish cipher + * + * Copyright ? 2013 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . 
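The s0/s1/s2/s3/p offsets defined just below hard-code the layout of BLOWFISH_context from blowfish.c (four u32[256] S-boxes followed by the P-array), so the two must stay in sync. A hypothetical compile-time check, not part of the patch, could be dropped into blowfish.c to catch a mismatch:

  #include <stddef.h>

  /* Fails to compile if the P-array does not start at byte offset 4096,
     i.e. right after the four 1024-byte S-boxes, as the assembly assumes. */
  typedef char assert_p_offset[offsetof (BLOWFISH_context, p) == 4 * 256 * 4 ? 1 : -1];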
+ */ + +#ifdef __x86_64 +#include +#if defined(USE_BLOWFISH) + +.text + +/* structure of BLOWFISH_context: */ +#define s0 0 +#define s1 ((s0) + 256 * 4) +#define s2 ((s1) + 256 * 4) +#define s3 ((s2) + 256 * 4) +#define p ((s3) + 256 * 4) + +/* register macros */ +#define CTX %rdi +#define RIO %rsi + +#define RX0 %rax +#define RX1 %rbx +#define RX2 %rcx +#define RX3 %rdx + +#define RX0d %eax +#define RX1d %ebx +#define RX2d %ecx +#define RX3d %edx + +#define RX0bl %al +#define RX1bl %bl +#define RX2bl %cl +#define RX3bl %dl + +#define RX0bh %ah +#define RX1bh %bh +#define RX2bh %ch +#define RX3bh %dh + +#define RT0 %rbp +#define RT1 %rsi +#define RT2 %r8 +#define RT3 %r9 + +#define RT0d %ebp +#define RT1d %esi +#define RT2d %r8d +#define RT3d %r9d + +#define RKEY %r10 + +/*********************************************************************** + * 1-way blowfish + ***********************************************************************/ +#define F() \ + movzbl RX0bh, RT1d; \ + movzbl RX0bl, RT3d; \ + rorq $16, RX0; \ + movzbl RX0bh, RT0d; \ + movzbl RX0bl, RT2d; \ + rorq $16, RX0; \ + movl s0(CTX,RT0,4), RT0d; \ + addl s1(CTX,RT2,4), RT0d; \ + xorl s2(CTX,RT1,4), RT0d; \ + addl s3(CTX,RT3,4), RT0d; \ + xorq RT0, RX0; + +#define load_roundkey_enc(n) \ + movq p+4*(n)(CTX), RX3; + +#define add_roundkey_enc() \ + xorq RX3, RX0; + +#define round_enc(n) \ + add_roundkey_enc(); \ + load_roundkey_enc(n); \ + \ + F(); \ + F(); + +#define load_roundkey_dec(n) \ + movq p+4*(n-1)(CTX), RX3; \ + rorq $32, RX3; + +#define add_roundkey_dec() \ + xorq RX3, RX0; + +#define round_dec(n) \ + add_roundkey_dec(); \ + load_roundkey_dec(n); \ + \ + F(); \ + F(); + +#define read_block() \ + movq (RIO), RX0; \ + rorq $32, RX0; \ + bswapq RX0; + +#define write_block() \ + bswapq RX0; \ + movq RX0, (RIO); + +.align 8 +.type __blowfish_enc_blk1, at function; + +__blowfish_enc_blk1: + /* input: + * %rdi: ctx, CTX + * RX0: input plaintext block + * output: + * RX0: output plaintext block + */ + movq %rbp, %r11; + + load_roundkey_enc(0); + round_enc(2); + round_enc(4); + round_enc(6); + round_enc(8); + round_enc(10); + round_enc(12); + round_enc(14); + round_enc(16); + add_roundkey_enc(); + + movq %r11, %rbp; + + ret; +.size __blowfish_enc_blk1,.-__blowfish_enc_blk1; + +.align 8 +.globl _gcry_blowfish_amd64_do_encrypt +.type _gcry_blowfish_amd64_do_encrypt, at function; + +_gcry_blowfish_amd64_do_encrypt: + /* input: + * %rdi: ctx, CTX + * %rsi: u32 *ret_xl + * %rdx: u32 *ret_xr + */ + movl (%rdx), RX0d; + shlq $32, RX0; + movl (%rsi), RT3d; + movq %rdx, %r10; + orq RT3, RX0; + movq %rsi, RX2; + + call __blowfish_enc_blk1; + + movl RX0d, (%r10); + shrq $32, RX0; + movl RX0d, (RX2); + + ret; +.size _gcry_blowfish_amd64_do_encrypt,.-_gcry_blowfish_amd64_do_encrypt; + +.align 8 +.globl _gcry_blowfish_amd64_encrypt_block +.type _gcry_blowfish_amd64_encrypt_block, at function; + +_gcry_blowfish_amd64_encrypt_block: + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + */ + + movq %rsi, %r10; + + movq %rdx, RIO; + read_block(); + + call __blowfish_enc_blk1; + + movq %r10, RIO; + write_block(); + + ret; +.size _gcry_blowfish_amd64_encrypt_block,.-_gcry_blowfish_amd64_encrypt_block; + +.align 8 +.globl _gcry_blowfish_amd64_decrypt_block +.type _gcry_blowfish_amd64_decrypt_block, at function; + +_gcry_blowfish_amd64_decrypt_block: + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + */ + movq %rbp, %r11; + + movq %rsi, %r10; + movq %rdx, RIO; + + read_block(); + + load_roundkey_dec(17); + round_dec(15); + 
round_dec(13); + round_dec(11); + round_dec(9); + round_dec(7); + round_dec(5); + round_dec(3); + round_dec(1); + add_roundkey_dec(); + + movq %r10, RIO; + write_block(); + + movq %r11, %rbp; + + ret; +.size _gcry_blowfish_amd64_decrypt_block,.-_gcry_blowfish_amd64_decrypt_block; + +/********************************************************************** + 4-way blowfish, four blocks parallel + **********************************************************************/ +#define F4(x) \ + movzbl x ## bh, RT1d; \ + movzbl x ## bl, RT3d; \ + rorq $16, x; \ + movzbl x ## bh, RT0d; \ + movzbl x ## bl, RT2d; \ + rorq $16, x; \ + movl s0(CTX,RT0,4), RT0d; \ + addl s1(CTX,RT2,4), RT0d; \ + xorl s2(CTX,RT1,4), RT0d; \ + addl s3(CTX,RT3,4), RT0d; \ + xorq RT0, x; + +#define add_preloaded_roundkey4() \ + xorq RKEY, RX0; \ + xorq RKEY, RX1; \ + xorq RKEY, RX2; \ + xorq RKEY, RX3; + +#define preload_roundkey_enc(n) \ + movq p+4*(n)(CTX), RKEY; + +#define add_roundkey_enc4(n) \ + add_preloaded_roundkey4(); \ + preload_roundkey_enc(n + 2); + +#define round_enc4(n) \ + add_roundkey_enc4(n); \ + \ + F4(RX0); \ + F4(RX1); \ + F4(RX2); \ + F4(RX3); \ + \ + F4(RX0); \ + F4(RX1); \ + F4(RX2); \ + F4(RX3); + +#define preload_roundkey_dec(n) \ + movq p+4*((n)-1)(CTX), RKEY; \ + rorq $32, RKEY; + +#define add_roundkey_dec4(n) \ + add_preloaded_roundkey4(); \ + preload_roundkey_dec(n - 2); + +#define round_dec4(n) \ + add_roundkey_dec4(n); \ + \ + F4(RX0); \ + F4(RX1); \ + F4(RX2); \ + F4(RX3); \ + \ + F4(RX0); \ + F4(RX1); \ + F4(RX2); \ + F4(RX3); + +#define inbswap_block4() \ + rorq $32, RX0; \ + bswapq RX0; \ + rorq $32, RX1; \ + bswapq RX1; \ + rorq $32, RX2; \ + bswapq RX2; \ + rorq $32, RX3; \ + bswapq RX3; + +#define inctrswap_block4() \ + rorq $32, RX0; \ + rorq $32, RX1; \ + rorq $32, RX2; \ + rorq $32, RX3; + +#define outbswap_block4() \ + bswapq RX0; \ + bswapq RX1; \ + bswapq RX2; \ + bswapq RX3; + +.align 8 +.type __blowfish_enc_blk4, at function; + +__blowfish_enc_blk4: + /* input: + * %rdi: ctx, CTX + * RX0,RX1,RX2,RX3: four input inbswapped plaintext blocks + * output: + * RX0,RX1,RX2,RX3: four output ciphertext blocks + */ + preload_roundkey_enc(0); + + round_enc4(0); + round_enc4(2); + round_enc4(4); + round_enc4(6); + round_enc4(8); + round_enc4(10); + round_enc4(12); + round_enc4(14); + add_preloaded_roundkey4(); + + outbswap_block4(); + + ret; +.size __blowfish_enc_blk4,.-__blowfish_enc_blk4; + +.align 8 +.type __blowfish_dec_blk4, at function; + +__blowfish_dec_blk4: + /* input: + * %rdi: ctx, CTX + * RX0,RX1,RX2,RX3: four input ciphertext blocks + * output: + * RX0,RX1,RX2,RX3: four output plaintext blocks + */ + preload_roundkey_dec(17); + + inbswap_block4(); + + round_dec4(17); + round_dec4(15); + round_dec4(13); + round_dec4(11); + round_dec4(9); + round_dec4(7); + round_dec4(5); + round_dec4(3); + add_preloaded_roundkey4(); + + outbswap_block4(); + + ret; +.size __blowfish_dec_blk4,.-__blowfish_dec_blk4; + +.align 8 +.globl _gcry_blowfish_amd64_ctr_enc +.type _gcry_blowfish_amd64_ctr_enc, at function; +_gcry_blowfish_amd64_ctr_enc: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (4 blocks) + * %rdx: src (4 blocks) + * %rcx: iv (big endian, 64bit) + */ + pushq %rbp; + pushq %rbx; + pushq %r12; + pushq %r13; + + /* %r11-%r13 are not used by __blowfish_enc_blk4 */ + movq %rcx, %r13; /*iv*/ + movq %rdx, %r12; /*src*/ + movq %rsi, %r11; /*dst*/ + + /* load IV and byteswap */ + movq (%r13), RT0; + bswapq RT0; + movq RT0, RX0; + + /* construct IVs */ + leaq 1(RT0), RX1; + leaq 2(RT0), RX2; + leaq 
3(RT0), RX3; + leaq 4(RT0), RT0; + bswapq RT0; + + inctrswap_block4(); + + /* store new IV */ + movq RT0, (%r13); + + call __blowfish_enc_blk4; + + /* XOR key-stream with plaintext */ + xorq 0 * 8(%r12), RX0; + xorq 1 * 8(%r12), RX1; + xorq 2 * 8(%r12), RX2; + xorq 3 * 8(%r12), RX3; + movq RX0, 0 * 8(%r11); + movq RX1, 1 * 8(%r11); + movq RX2, 2 * 8(%r11); + movq RX3, 3 * 8(%r11); + + popq %r13; + popq %r12; + popq %rbx; + popq %rbp; + + ret; +.size _gcry_blowfish_amd64_ctr_enc,.-_gcry_blowfish_amd64_ctr_enc; + +.align 8 +.globl _gcry_blowfish_amd64_cbc_dec +.type _gcry_blowfish_amd64_cbc_dec, at function; +_gcry_blowfish_amd64_cbc_dec: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (4 blocks) + * %rdx: src (4 blocks) + * %rcx: iv (64bit) + */ + pushq %rbp; + pushq %rbx; + pushq %r12; + pushq %r13; + + /* %r11-%r13 are not used by __blowfish_dec_blk4 */ + movq %rsi, %r11; /*dst*/ + movq %rdx, %r12; /*src*/ + movq %rcx, %r13; /*iv*/ + + /* load input */ + movq 0 * 8(%r12), RX0; + movq 1 * 8(%r12), RX1; + movq 2 * 8(%r12), RX2; + movq 3 * 8(%r12), RX3; + + call __blowfish_dec_blk4; + + movq 3 * 8(%r12), RT0; + xorq (%r13), RX0; + xorq 0 * 8(%r12), RX1; + xorq 1 * 8(%r12), RX2; + xorq 2 * 8(%r12), RX3; + movq RT0, (%r13); /* store new IV */ + + movq RX0, 0 * 8(%r11); + movq RX1, 1 * 8(%r11); + movq RX2, 2 * 8(%r11); + movq RX3, 3 * 8(%r11); + + popq %r13; + popq %r12; + popq %rbx; + popq %rbp; + + ret; +.size _gcry_blowfish_amd64_cbc_dec,.-_gcry_blowfish_amd64_cbc_dec; + +.align 8 +.globl _gcry_blowfish_amd64_cfb_dec +.type _gcry_blowfish_amd64_cfb_dec, at function; +_gcry_blowfish_amd64_cfb_dec: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (4 blocks) + * %rdx: src (4 blocks) + * %rcx: iv (64bit) + */ + pushq %rbp; + pushq %rbx; + pushq %r12; + pushq %r13; + + /* %r11-%r13 are not used by __blowfish_enc_blk4 */ + movq %rcx, %r13; /*iv*/ + movq %rdx, %r12; /*src*/ + movq %rsi, %r11; /*dst*/ + + /* Load input */ + movq (%r13), RX0; + movq 0 * 8(%r12), RX1; + movq 1 * 8(%r12), RX2; + movq 2 * 8(%r12), RX3; + + inbswap_block4(); + + /* Update IV */ + movq 3 * 8(%r12), RT0; + movq RT0, (%r13); + + call __blowfish_enc_blk4; + + xorq 0 * 8(%r12), RX0; + xorq 1 * 8(%r12), RX1; + xorq 2 * 8(%r12), RX2; + xorq 3 * 8(%r12), RX3; + movq RX0, 0 * 8(%r11); + movq RX1, 1 * 8(%r11); + movq RX2, 2 * 8(%r11); + movq RX3, 3 * 8(%r11); + + popq %r13; + popq %r12; + popq %rbx; + popq %rbp; + ret; +.size _gcry_blowfish_amd64_cfb_dec,.-_gcry_blowfish_amd64_cfb_dec; + +#endif /*defined(USE_BLOWFISH)*/ +#endif /*__x86_64*/ diff --git a/cipher/blowfish.c b/cipher/blowfish.c index b4d2b9c..39d4051 100644 --- a/cipher/blowfish.c +++ b/cipher/blowfish.c @@ -36,10 +36,20 @@ #include "types.h" #include "g10lib.h" #include "cipher.h" +#include "bufhelp.h" +#include "cipher-selftest.h" #define BLOWFISH_BLOCKSIZE 8 #define BLOWFISH_ROUNDS 16 + +/* USE_AMD64_ASM indicates whether to use AMD64 assembly code. */ +#undef USE_AMD64_ASM +#if defined(__x86_64__) && (BLOWFISH_ROUNDS == 16) +# define USE_AMD64_ASM 1 +#endif + + typedef struct { u32 s0[256]; u32 s1[256]; @@ -240,6 +250,61 @@ static const u32 ps[BLOWFISH_ROUNDS+2] = { 0xC0AC29B7,0xC97C50DD,0x3F84D5B5,0xB5470917,0x9216D5D9,0x8979FB1B }; +#ifdef USE_AMD64_ASM + +/* Assembly implementations of Blowfish. 
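For reference, the F() and F4() macros in the assembly above compute the textbook Blowfish round function over the context's S-boxes; roughly, in C (sketch only, the function name is illustrative):

  static u32
  blowfish_F (const BLOWFISH_context *bc, u32 x)
  {
    return ((bc->s0[(x >> 24) & 0xff] + bc->s1[(x >> 16) & 0xff])
            ^ bc->s2[(x >> 8) & 0xff]) + bc->s3[x & 0xff];
  }
  /* One encryption round then is:  xl ^= p[i];  xr ^= blowfish_F (bc, xl);  swap xl/xr. */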
*/ +extern void _gcry_blowfish_amd64_do_encrypt(BLOWFISH_context *c, u32 *ret_xl, + u32 *ret_xr); + +extern void _gcry_blowfish_amd64_encrypt_block(BLOWFISH_context *c, byte *out, + const byte *in); + +extern void _gcry_blowfish_amd64_decrypt_block(BLOWFISH_context *c, byte *out, + const byte *in); + +/* These assembly implementations process four blocks in parallel. */ +extern void _gcry_blowfish_amd64_ctr_enc(BLOWFISH_context *ctx, byte *out, + const byte *in, byte *ctr); + +extern void _gcry_blowfish_amd64_cbc_dec(BLOWFISH_context *ctx, byte *out, + const byte *in, byte *iv); + +extern void _gcry_blowfish_amd64_cfb_dec(BLOWFISH_context *ctx, byte *out, + const byte *in, byte *iv); + +static void +do_encrypt ( BLOWFISH_context *bc, u32 *ret_xl, u32 *ret_xr ) +{ + _gcry_blowfish_amd64_do_encrypt (bc, ret_xl, ret_xr); +} + +static void +do_encrypt_block (BLOWFISH_context *context, byte *outbuf, const byte *inbuf) +{ + _gcry_blowfish_amd64_encrypt_block (context, outbuf, inbuf); +} + +static void +do_decrypt_block (BLOWFISH_context *context, byte *outbuf, const byte *inbuf) +{ + _gcry_blowfish_amd64_decrypt_block (context, outbuf, inbuf); +} + +static void encrypt_block (void *context , byte *outbuf, const byte *inbuf) +{ + BLOWFISH_context *c = (BLOWFISH_context *) context; + do_encrypt_block (c, outbuf, inbuf); + _gcry_burn_stack (2*8); +} + +static void decrypt_block (void *context, byte *outbuf, const byte *inbuf) +{ + BLOWFISH_context *c = (BLOWFISH_context *) context; + do_decrypt_block (c, outbuf, inbuf); + _gcry_burn_stack (2*8); +} + +#else /*USE_AMD64_ASM*/ #if BLOWFISH_ROUNDS != 16 static inline u32 @@ -461,6 +526,201 @@ decrypt_block (void *context, byte *outbuf, const byte *inbuf) _gcry_burn_stack (64); } +#endif /*!USE_AMD64_ASM*/ + + +/* Bulk encryption of complete blocks in CTR mode. This function is only + intended for the bulk encryption feature of cipher.c. CTR is expected to be + of size BLOWFISH_BLOCKSIZE. */ +void +_gcry_blowfish_ctr_enc(void *context, unsigned char *ctr, void *outbuf_arg, + const void *inbuf_arg, unsigned int nblocks) +{ + BLOWFISH_context *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + unsigned char tmpbuf[BLOWFISH_BLOCKSIZE]; + int burn_stack_depth = (64) + 2 * BLOWFISH_BLOCKSIZE; + int i; + +#ifdef USE_AMD64_ASM + { + if (nblocks >= 4) + burn_stack_depth += 5 * sizeof(void*); + + /* Process data in 4 block chunks. */ + while (nblocks >= 4) + { + _gcry_blowfish_amd64_ctr_enc(ctx, outbuf, inbuf, ctr); + + nblocks -= 4; + outbuf += 4 * BLOWFISH_BLOCKSIZE; + inbuf += 4 * BLOWFISH_BLOCKSIZE; + } + + /* Use generic code to handle smaller chunks... */ + /* TODO: use caching instead? */ + } +#endif + + for ( ;nblocks; nblocks-- ) + { + /* Encrypt the counter. */ + do_encrypt_block(ctx, tmpbuf, ctr); + /* XOR the input with the encrypted counter and store in output. */ + buf_xor(outbuf, tmpbuf, inbuf, BLOWFISH_BLOCKSIZE); + outbuf += BLOWFISH_BLOCKSIZE; + inbuf += BLOWFISH_BLOCKSIZE; + /* Increment the counter. */ + for (i = BLOWFISH_BLOCKSIZE; i > 0; i--) + { + ctr[i-1]++; + if (ctr[i-1]) + break; + } + } + + wipememory(tmpbuf, sizeof(tmpbuf)); + _gcry_burn_stack(burn_stack_depth); +} + + +/* Bulk decryption of complete blocks in CBC mode. This function is only + intended for the bulk encryption feature of cipher.c. 
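A detail the CTR selftest below specifically exercises: the counter lives in memory big-endian, so the 4-way assembly byte-swaps it once, derives the four consecutive values with plain adds (the leaq 1/2/3/4 steps), and swaps the result back when storing the new IV. In C terms this is roughly the following sketch (not part of the patch; the helper name is illustrative, u64 is libgcrypt's 64-bit type):

  static void
  ctr_iv_step4 (unsigned char *ctr, u64 blk[4])
  {
    u64 c = 0;
    int i;

    for (i = 0; i < 8; i++)            /* load big-endian counter (movq + bswapq) */
      c = (c << 8) | ctr[i];
    blk[0] = c;  blk[1] = c + 1;       /* counters for the four parallel blocks */
    blk[2] = c + 2;  blk[3] = c + 3;
    c += 4;                            /* four blocks consumed */
    for (i = 7; i >= 0; i--)           /* store the new IV back big-endian */
      {
        ctr[i] = c & 0xff;
        c >>= 8;
      }
  }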
*/ +void +_gcry_blowfish_cbc_dec(void *context, unsigned char *iv, void *outbuf_arg, + const void *inbuf_arg, unsigned int nblocks) +{ + BLOWFISH_context *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + unsigned char savebuf[BLOWFISH_BLOCKSIZE]; + int burn_stack_depth = (64) + 2 * BLOWFISH_BLOCKSIZE; + +#ifdef USE_AMD64_ASM + { + if (nblocks >= 4) + burn_stack_depth += 5 * sizeof(void*); + + /* Process data in 4 block chunks. */ + while (nblocks >= 4) + { + _gcry_blowfish_amd64_cbc_dec(ctx, outbuf, inbuf, iv); + + nblocks -= 4; + outbuf += 4 * BLOWFISH_BLOCKSIZE; + inbuf += 4 * BLOWFISH_BLOCKSIZE; + } + + /* Use generic code to handle smaller chunks... */ + } +#endif + + for ( ;nblocks; nblocks-- ) + { + /* We need to save INBUF away because it may be identical to + OUTBUF. */ + memcpy(savebuf, inbuf, BLOWFISH_BLOCKSIZE); + + do_decrypt_block (ctx, outbuf, inbuf); + + buf_xor(outbuf, outbuf, iv, BLOWFISH_BLOCKSIZE); + memcpy(iv, savebuf, BLOWFISH_BLOCKSIZE); + inbuf += BLOWFISH_BLOCKSIZE; + outbuf += BLOWFISH_BLOCKSIZE; + } + + wipememory(savebuf, sizeof(savebuf)); + _gcry_burn_stack(burn_stack_depth); +} + + +/* Bulk decryption of complete blocks in CFB mode. This function is only + intended for the bulk encryption feature of cipher.c. */ +void +_gcry_blowfish_cfb_dec(void *context, unsigned char *iv, void *outbuf_arg, + const void *inbuf_arg, unsigned int nblocks) +{ + BLOWFISH_context *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + int burn_stack_depth = (64) + 2 * BLOWFISH_BLOCKSIZE; + +#ifdef USE_AMD64_ASM + { + if (nblocks >= 4) + burn_stack_depth += 5 * sizeof(void*); + + /* Process data in 4 block chunks. */ + while (nblocks >= 4) + { + _gcry_blowfish_amd64_cfb_dec(ctx, outbuf, inbuf, iv); + + nblocks -= 4; + outbuf += 4 * BLOWFISH_BLOCKSIZE; + inbuf += 4 * BLOWFISH_BLOCKSIZE; + } + + /* Use generic code to handle smaller chunks... */ + } +#endif + + for ( ;nblocks; nblocks-- ) + { + do_encrypt_block(ctx, iv, iv); + buf_xor_n_copy(outbuf, iv, inbuf, BLOWFISH_BLOCKSIZE); + outbuf += BLOWFISH_BLOCKSIZE; + inbuf += BLOWFISH_BLOCKSIZE; + } + + _gcry_burn_stack(burn_stack_depth); +} + + +/* Run the self-tests for BLOWFISH-CTR, tests IV increment of bulk CTR + encryption. Returns NULL on success. */ +static const char * +selftest_ctr (void) +{ + const int nblocks = 4+1; + const int blocksize = BLOWFISH_BLOCKSIZE; + const int context_size = sizeof(BLOWFISH_context); + + return _gcry_selftest_helper_ctr("BLOWFISH", &bf_setkey, + &encrypt_block, &_gcry_blowfish_ctr_enc, nblocks, blocksize, + context_size); +} + + +/* Run the self-tests for BLOWFISH-CBC, tests bulk CBC decryption. + Returns NULL on success. */ +static const char * +selftest_cbc (void) +{ + const int nblocks = 4+2; + const int blocksize = BLOWFISH_BLOCKSIZE; + const int context_size = sizeof(BLOWFISH_context); + + return _gcry_selftest_helper_cbc("BLOWFISH", &bf_setkey, + &encrypt_block, &_gcry_blowfish_cbc_dec, nblocks, blocksize, + context_size); +} + + +/* Run the self-tests for BLOWFISH-CFB, tests bulk CBC decryption. + Returns NULL on success. 
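Note that CFB decryption, like CTR, only ever runs Blowfish in the forward direction: each plaintext block is E_K(previous ciphertext block) XOR current ciphertext block. That is why the fallback loop above calls do_encrypt_block() and the 4-way path reuses __blowfish_enc_blk4. A sketch of one per-block step, with E() standing in for the block encryption (E is illustrative, not a real libgcrypt symbol):

  static void
  cfb_dec_one_block (void *ctx, unsigned char iv[BLOWFISH_BLOCKSIZE],
                     unsigned char *out, const unsigned char *in)
  {
    unsigned char keystream[BLOWFISH_BLOCKSIZE];
    int j;

    E (ctx, keystream, iv);            /* keystream = E_K(previous ciphertext / IV) */
    for (j = 0; j < BLOWFISH_BLOCKSIZE; j++)
      {
        unsigned char c = in[j];
        out[j] = keystream[j] ^ c;     /* P[i] = E_K(C[i-1]) ^ C[i] */
        iv[j] = c;                     /* current ciphertext feeds the next block */
      }
  }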
*/ +static const char * +selftest_cfb (void) +{ + const int nblocks = 4+2; + const int blocksize = BLOWFISH_BLOCKSIZE; + const int context_size = sizeof(BLOWFISH_context); + + return _gcry_selftest_helper_cfb("BLOWFISH", &bf_setkey, + &encrypt_block, &_gcry_blowfish_cfb_dec, nblocks, blocksize, + context_size); +} + static const char* selftest(void) @@ -471,6 +731,7 @@ selftest(void) byte plain3[] = { 0xFE, 0xDC, 0xBA, 0x98, 0x76, 0x54, 0x32, 0x10 }; byte key3[] = { 0x41, 0x79, 0x6E, 0xA0, 0x52, 0x61, 0x6E, 0xE4 }; byte cipher3[] = { 0xE1, 0x13, 0xF4, 0x10, 0x2C, 0xFC, 0xCE, 0x43 }; + const char *r; bf_setkey( (void *) &c, (const unsigned char*)"abcdefghijklmnopqrstuvwxyz", 26 ); @@ -488,6 +749,16 @@ selftest(void) decrypt_block( (void *) &c, buffer, buffer ); if( memcmp( buffer, plain3, 8 ) ) return "Blowfish selftest failed (4)."; + + if ( (r = selftest_cbc ()) ) + return r; + + if ( (r = selftest_cfb ()) ) + return r; + + if ( (r = selftest_ctr ()) ) + return r; + return NULL; } diff --git a/cipher/cipher.c b/cipher/cipher.c index 79ca755..508f26f 100644 --- a/cipher/cipher.c +++ b/cipher/cipher.c @@ -718,6 +718,13 @@ gcry_cipher_open (gcry_cipher_hd_t *handle, h->bulk.ctr_enc = _gcry_aes_ctr_enc; break; #endif /*USE_AES*/ +#ifdef USE_BLOWFISH + case GCRY_CIPHER_BLOWFISH: + h->bulk.cfb_dec = _gcry_blowfish_cfb_dec; + h->bulk.cbc_dec = _gcry_blowfish_cbc_dec; + h->bulk.ctr_enc = _gcry_blowfish_ctr_enc; + break; +#endif /*USE_BLOWFISH*/ #ifdef USE_CAST5 case GCRY_CIPHER_CAST5: h->bulk.cfb_dec = _gcry_cast5_cfb_dec; diff --git a/configure.ac b/configure.ac index 113c71f..f13a91b 100644 --- a/configure.ac +++ b/configure.ac @@ -1189,6 +1189,13 @@ LIST_MEMBER(blowfish, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS blowfish.lo" AC_DEFINE(USE_BLOWFISH, 1, [Defined if this module should be included]) + + case "${host}" in + x86_64-*-*) + # Build with the assembly implementation + GCRYPT_CIPHERS="$GCRYPT_CIPHERS blowfish-amd64.lo" + ;; + esac fi LIST_MEMBER(cast5, $enabled_ciphers) diff --git a/src/cipher.h b/src/cipher.h index 1742003..ca595b0 100644 --- a/src/cipher.h +++ b/src/cipher.h @@ -95,6 +95,19 @@ void _gcry_aes_ctr_enc (void *context, unsigned char *ctr, void *outbuf_arg, const void *inbuf_arg, unsigned int nblocks); +/*-- blowfish.c --*/ +void _gcry_blowfish_cfb_dec (void *context, unsigned char *iv, + void *outbuf_arg, const void *inbuf_arg, + unsigned int nblocks); + +void _gcry_blowfish_cbc_dec (void *context, unsigned char *iv, + void *outbuf_arg, const void *inbuf_arg, + unsigned int nblocks); + +void _gcry_blowfish_ctr_enc (void *context, unsigned char *ctr, + void *outbuf_arg, const void *inbuf_arg, + unsigned int nblocks); + /*-- cast5.c --*/ void _gcry_cast5_cfb_dec (void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, From jussi.kivilinna at iki.fi Wed May 29 15:48:14 2013 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Wed, 29 May 2013 16:48:14 +0300 Subject: [PATCH 2/2] rinjdael: add amd64 assembly implementation In-Reply-To: <20130529134809.17165.30961.stgit@localhost6.localdomain6> References: <20130529134809.17165.30961.stgit@localhost6.localdomain6> Message-ID: <20130529134814.17165.93557.stgit@localhost6.localdomain6> * cipher/Makefile.am: Add 'rijndael-amd64.S'. * cipher/rijndael-amd64.S: New file. * cipher/rijndael.c (USE_AMD64_ASM): New macro. [USE_AMD64_ASM] (_gcry_aes_amd64_encrypt_block) (_gcry_aes_amd64_decrypt_block): New prototypes. 
(do_encrypt_aligned) [USE_AMD64_ASM]: Use amd64 assembly function. (do_encrypt): Disable input/output alignment when USE_AMD64_ASM is set. (do_decrypt_aligned) [USE_AMD64_ASM]: Use amd64 assembly function. (do_decrypt): Disable input/output alignment when USE_AMD64_AES is set. * configure.ac (aes) [x86-64]: Add 'rijndael-amd64.lo'. -- Add optimized amd64 assembly implementation for AES. Old vs new, on AMD Phenom II: ECB/Stream CBC CFB OFB CTR --------------- --------------- --------------- --------------- --------------- AES 1.74x 1.72x 1.81x 1.85x 1.82x 1.76x 1.67x 1.64x 1.79x 1.81x AES192 1.77x 1.77x 1.79x 1.88x 1.90x 1.80x 1.69x 1.69x 1.85x 1.81x AES256 1.79x 1.81x 1.83x 1.89x 1.88x 1.82x 1.72x 1.70x 1.87x 1.89x Old vs new, on Intel Core2: ECB/Stream CBC CFB OFB CTR --------------- --------------- --------------- --------------- --------------- AES 1.77x 1.75x 1.78x 1.76x 1.76x 1.77x 1.75x 1.76x 1.76x 1.82x AES192 1.80x 1.73x 1.81x 1.76x 1.79x 1.85x 1.77x 1.76x 1.80x 1.85x AES256 1.81x 1.77x 1.81x 1.77x 1.80x 1.79x 1.78x 1.77x 1.81x 1.85x Signed-off-by: Jussi Kivilinna --- cipher/Makefile.am | 2 cipher/rijndael-amd64.S | 1416 +++++++++++++++++++++++++++++++++++++++++++++++ cipher/rijndael.c | 32 + configure.ac | 7 4 files changed, 1456 insertions(+), 1 deletion(-) create mode 100644 cipher/rijndael-amd64.S diff --git a/cipher/Makefile.am b/cipher/Makefile.am index c0a7593..7439cc9 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -64,7 +64,7 @@ ecc.c \ idea.c \ md4.c \ md5.c \ -rijndael.c rijndael-tables.h \ +rijndael.c rijndael-tables.h rijndael-amd64.S \ rmd160.c \ rsa.c \ scrypt.c \ diff --git a/cipher/rijndael-amd64.S b/cipher/rijndael-amd64.S new file mode 100644 index 0000000..6921f31 --- /dev/null +++ b/cipher/rijndael-amd64.S @@ -0,0 +1,1416 @@ +/* rinjdael-amd64.S - AMD64 assembly implementation of AES cipher + * + * Copyright ? 2013 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . 
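The code that follows is a classic table-driven ("T-table") AES: each 32-bit word of a round output is built from four byte-indexed lookups plus the round key, which is what the do_encround/do_lastencround macros below unroll, and the Esize/Dsize stride of 8 comes from interleaving each round-table entry with the plain S-box word used by the final round. In the textbook big-endian formulation one round column looks roughly like the sketch below (Te0..Te3, rk and the function name are generic, not the labels used in this file, and the byte ordering differs from the little-endian loads the assembly performs):

  /* Sketch of one column of a T-table AES encryption round. */
  static u32
  aes_tround_column (const u32 Te0[256], const u32 Te1[256],
                     const u32 Te2[256], const u32 Te3[256],
                     u32 s0, u32 s1, u32 s2, u32 s3, u32 rk)
  {
    return Te0[(s0 >> 24) & 0xff] ^ Te1[(s1 >> 16) & 0xff]
         ^ Te2[(s2 >>  8) & 0xff] ^ Te3[ s3        & 0xff] ^ rk;
  }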
+ */ + +#ifdef __x86_64 +#include +#if defined(USE_AES) + +#ifdef __PIC__ +# define RIP %rip +#else +# define RIP +#endif + +.text + +/* table macros */ +#define E0 (.LtableE0-.LtableE0) +#define E1 (.LtableE1-.LtableE0) +#define E2 (.LtableE2-.LtableE0) +#define E3 (.LtableE3-.LtableE0) +#define Es0 (.LtableEs0-.LtableE0) +#define Es1 (.LtableEs1-.LtableE0) +#define Es2 (.LtableEs2-.LtableE0) +#define Es3 (.LtableEs3-.LtableE0) +#define Esize 8 + +#define D0 (.LtableD0-.LtableD0) +#define D1 (.LtableD1-.LtableD0) +#define D2 (.LtableD2-.LtableD0) +#define D3 (.LtableD3-.LtableD0) +#define Ds0 (.LtableDs0-.LtableD0) +#define Ds1 (.LtableDs1-.LtableD0) +#define Ds2 (.LtableDs2-.LtableD0) +#define Ds3 (.LtableDs3-.LtableD0) +#define Dsize 8 + +/* register macros */ +#define CTX %rdi +#define RTAB %r12 + +#define RA %rax +#define RB %rbx +#define RC %rcx +#define RD %rdx + +#define RAd %eax +#define RBd %ebx +#define RCd %ecx +#define RDd %edx + +#define RAbl %al +#define RBbl %bl +#define RCbl %cl +#define RDbl %dl + +#define RAbh %ah +#define RBbh %bh +#define RCbh %ch +#define RDbh %dh + +#define RNA %r8 +#define RNB %r9 +#define RNC %r10 +#define RND %r11 + +#define RNAd %r8d +#define RNBd %r9d +#define RNCd %r10d +#define RNDd %r11d + +#define RT0 %rbp +#define RT1 %rsi + +#define RT0d %ebp +#define RT1d %esi + +/* helper macros */ +#define do16bit(op, source, tablemul, table1, dest1, table2, dest2, t0, t1) \ + movzbl source ## bh, t1 ## d; \ + movzbl source ## bl, t0 ## d; \ + op ## l table1(RTAB,t0,tablemul), dest1 ## d; \ + op ## l table2(RTAB,t1,tablemul), dest2 ## d; + +#define do16bit_shr(shf, op, source, tablemul, table1, dest1, table2, dest2, t0, t1) \ + movzbl source ## bl, t0 ## d; \ + movzbl source ## bh, t1 ## d; \ + shrq $(shf), source; \ + op ## l table1(RTAB,t0,tablemul), dest1 ## d; \ + op ## l table2(RTAB,t1,tablemul), dest2 ## d; + +/*********************************************************************** + * AMD64 assembly implementation of the AES cipher + ***********************************************************************/ +#define addroundkey(round, ra, rb, rc, rd) \ + xorl (((round) * 16) + 0 * 4)(CTX), ra ## d; \ + xorl (((round) * 16) + 1 * 4)(CTX), rb ## d; \ + xorl (((round) * 16) + 2 * 4)(CTX), rc ## d; \ + xorl (((round) * 16) + 3 * 4)(CTX), rd ## d; + +#define do_encround(next_r) \ + do16bit_shr(16, mov, RA, Esize, E0, RNA, E1, RND, RT0, RT1); \ + do16bit( mov, RA, Esize, E2, RNC, E3, RNB, RT0, RT1); \ + movl (((next_r) * 16) + 0 * 4)(CTX), RAd; \ + xorl RNAd, RAd; \ + \ + do16bit_shr(16, xor, RD, Esize, E0, RND, E1, RNC, RT0, RT1); \ + do16bit( xor, RD, Esize, E2, RNB, E3, RA, RT0, RT1); \ + movl (((next_r) * 16) + 3 * 4)(CTX), RDd; \ + xorl RNDd, RDd; \ + \ + do16bit_shr(16, xor, RC, Esize, E0, RNC, E1, RNB, RT0, RT1); \ + do16bit( xor, RC, Esize, E2, RA, E3, RD, RT0, RT1); \ + movl (((next_r) * 16) + 2 * 4)(CTX), RCd; \ + xorl RNCd, RCd; \ + \ + do16bit_shr(16, xor, RB, Esize, E0, RNB, E1, RA, RT0, RT1); \ + do16bit( xor, RB, Esize, E2, RD, E3, RC, RT0, RT1); \ + movl (((next_r) * 16) + 1 * 4)(CTX), RBd; \ + xorl RNBd, RBd; + +#define do_lastencround(next_r) \ + do16bit_shr(16, mov, RA, Esize, Es0, RNA, Es1, RND, RT0, RT1); \ + do16bit( mov, RA, Esize, Es2, RNC, Es3, RNB, RT0, RT1); \ + \ + do16bit_shr(16, or, RB, Esize, Es0, RNB, Es1, RNA, RT0, RT1); \ + do16bit( or, RB, Esize, Es2, RND, Es3, RNC, RT0, RT1); \ + \ + do16bit_shr(16, or, RC, Esize, Es0, RNC, Es1, RNB, RT0, RT1); \ + do16bit( or, RC, Esize, Es2, RNA, Es3, RND, RT0, RT1); \ + \ + 
do16bit_shr(16, or, RD, Esize, Es0, RND, Es1, RNC, RT0, RT1); \ + do16bit( or, RD, Esize, Es2, RNB, Es3, RNA, RT0, RT1); + +#define firstencround(round) \ + addroundkey(round, RA, RB, RC, RD); \ + do_encround((round) + 1); + +#define encround(round) \ + do_encround((round) + 1); + +#define lastencround(round) \ + do_lastencround(); \ + addroundkey((round) + 1, RNA, RNB, RNC, RND); + +.align 8 +.global _gcry_aes_amd64_encrypt_block +.type _gcry_aes_amd64_encrypt_block, at function; + +_gcry_aes_amd64_encrypt_block: + /* input: + * %rdi: keysched, CTX + * %rsi: dst + * %rdx: src + * %ecx: number of rounds.. 10, 12 or 14 + */ + subq $(5 * 8), %rsp; + movq %rsi, (0 * 8)(%rsp); + movl %ecx, (1 * 8)(%rsp); + movq %rbp, (2 * 8)(%rsp); + movq %rbx, (3 * 8)(%rsp); + movq %r12, (4 * 8)(%rsp); + + leaq .LtableE0(RIP), RTAB; + + /* read input block */ + movl 0 * 4(%rdx), RAd; + movl 1 * 4(%rdx), RBd; + movl 2 * 4(%rdx), RCd; + movl 3 * 4(%rdx), RDd; + + firstencround(0); + encround(1); + encround(2); + encround(3); + encround(4); + encround(5); + encround(6); + encround(7); + encround(8); + cmpl $12, (1 * 8)(%rsp); + jnb .Lenc_not_128; + lastencround(9); + +.align 4 +.Lenc_done: + /* write output block */ + movq (0 * 8)(%rsp), %rsi; + movl RNAd, 0 * 4(%rsi); + movl RNBd, 1 * 4(%rsi); + movl RNCd, 2 * 4(%rsi); + movl RNDd, 3 * 4(%rsi); + + movq (4 * 8)(%rsp), %r12; + movq (3 * 8)(%rsp), %rbx; + movq (2 * 8)(%rsp), %rbp; + addq $(5 * 8), %rsp; + + ret; + +.align 4 +.Lenc_not_128: + je .Lenc_192 + + encround(9); + encround(10); + encround(11); + encround(12); + lastencround(13); + + jmp .Lenc_done; + +.align 4 +.Lenc_192: + encround(9); + encround(10); + lastencround(11); + + jmp .Lenc_done; +.size _gcry_aes_amd64_encrypt_block,.-_gcry_aes_amd64_encrypt_block; + +#define do_decround(next_r) \ + do16bit_shr(16, mov, RA, Dsize, D0, RNA, D1, RNB, RT0, RT1); \ + do16bit( mov, RA, Dsize, D2, RNC, D3, RND, RT0, RT1); \ + movl (((next_r) * 16) + 0 * 4)(CTX), RAd; \ + xorl RNAd, RAd; \ + \ + do16bit_shr(16, xor, RB, Dsize, D0, RNB, D1, RNC, RT0, RT1); \ + do16bit( xor, RB, Dsize, D2, RND, D3, RA, RT0, RT1); \ + movl (((next_r) * 16) + 1 * 4)(CTX), RBd; \ + xorl RNBd, RBd; \ + \ + do16bit_shr(16, xor, RC, Dsize, D0, RNC, D1, RND, RT0, RT1); \ + do16bit( xor, RC, Dsize, D2, RA, D3, RB, RT0, RT1); \ + movl (((next_r) * 16) + 2 * 4)(CTX), RCd; \ + xorl RNCd, RCd; \ + \ + do16bit_shr(16, xor, RD, Dsize, D0, RND, D1, RA, RT0, RT1); \ + do16bit( xor, RD, Dsize, D2, RB, D3, RC, RT0, RT1); \ + movl (((next_r) * 16) + 3 * 4)(CTX), RDd; \ + xorl RNDd, RDd; \ + +#define do_lastdecround() \ + do16bit_shr(16, mov, RA, Dsize, Ds0, RNA, Ds1, RNB, RT0, RT1); \ + do16bit( mov, RA, Dsize, Ds2, RNC, Ds3, RND, RT0, RT1); \ + \ + do16bit_shr(16, or, RB, Dsize, Ds0, RNB, Ds1, RNC, RT0, RT1); \ + do16bit( or, RB, Dsize, Ds2, RND, Ds3, RNA, RT0, RT1); \ + \ + do16bit_shr(16, or, RC, Dsize, Ds0, RNC, Ds1, RND, RT0, RT1); \ + do16bit( or, RC, Dsize, Ds2, RNA, Ds3, RNB, RT0, RT1); \ + \ + do16bit_shr(16, or, RD, Dsize, Ds0, RND, Ds1, RNA, RT0, RT1); \ + do16bit( or, RD, Dsize, Ds2, RNB, Ds3, RNC, RT0, RT1); + +#define firstdecround(round) \ + addroundkey((round + 1), RA, RB, RC, RD); \ + do_decround(round); + +#define decround(round) \ + do_decround(round); + +#define lastdecround(round) \ + do_lastdecround(); \ + addroundkey(round, RNA, RNB, RNC, RND); + +.align 8 +.global _gcry_aes_amd64_decrypt_block +.type _gcry_aes_amd64_decrypt_block, at function; + +_gcry_aes_amd64_decrypt_block: + /* input: + * %rdi: keysched, CTX + * %rsi: dst + * 
%rdx: src + * %ecx: number of rounds.. 10, 12 or 14 + */ + subq $(5 * 8), %rsp; + movq %rsi, (0 * 8)(%rsp); + movl %ecx, (1 * 8)(%rsp); + movq %rbp, (2 * 8)(%rsp); + movq %rbx, (3 * 8)(%rsp); + movq %r12, (4 * 8)(%rsp); + + leaq .LtableD0(RIP), RTAB; + + /* read input block */ + movl 0 * 4(%rdx), RAd; + movl 1 * 4(%rdx), RBd; + movl 2 * 4(%rdx), RCd; + movl 3 * 4(%rdx), RDd; + + cmpl $12, (1 * 8)(%rsp); + jnb .Ldec_256; + + firstdecround(9); +.align 4 +.Ldec_tail: + decround(8); + decround(7); + decround(6); + decround(5); + decround(4); + decround(3); + decround(2); + decround(1); + lastdecround(0); + + /* write output block */ + movq (0 * 8)(%rsp), %rsi; + movl RNAd, 0 * 4(%rsi); + movl RNBd, 1 * 4(%rsi); + movl RNCd, 2 * 4(%rsi); + movl RNDd, 3 * 4(%rsi); + + movq (4 * 8)(%rsp), %r12; + movq (3 * 8)(%rsp), %rbx; + movq (2 * 8)(%rsp), %rbp; + addq $(5 * 8), %rsp; + + ret; + +.align 4 +.Ldec_256: + je .Ldec_192; + + firstdecround(13); + decround(12); + decround(11); + decround(10); + decround(9); + + jmp .Ldec_tail; + +.align 4 +.Ldec_192: + firstdecround(11); + decround(10); + decround(9); + + jmp .Ldec_tail; +.size _gcry_aes_amd64_encrypt_block,.-_gcry_aes_amd64_encrypt_block; + +.data +.align 16 + +/* Encryption tables */ +.LtableE0: +.long 0xa56363c6 +.LtableEs0: +.long 0x00000063, 0x847c7cf8, 0x0000007c +.long 0x997777ee, 0x00000077, 0x8d7b7bf6, 0x0000007b +.long 0x0df2f2ff, 0x000000f2, 0xbd6b6bd6, 0x0000006b +.long 0xb16f6fde, 0x0000006f, 0x54c5c591, 0x000000c5 +.long 0x50303060, 0x00000030, 0x03010102, 0x00000001 +.long 0xa96767ce, 0x00000067, 0x7d2b2b56, 0x0000002b +.long 0x19fefee7, 0x000000fe, 0x62d7d7b5, 0x000000d7 +.long 0xe6abab4d, 0x000000ab, 0x9a7676ec, 0x00000076 +.long 0x45caca8f, 0x000000ca, 0x9d82821f, 0x00000082 +.long 0x40c9c989, 0x000000c9, 0x877d7dfa, 0x0000007d +.long 0x15fafaef, 0x000000fa, 0xeb5959b2, 0x00000059 +.long 0xc947478e, 0x00000047, 0x0bf0f0fb, 0x000000f0 +.long 0xecadad41, 0x000000ad, 0x67d4d4b3, 0x000000d4 +.long 0xfda2a25f, 0x000000a2, 0xeaafaf45, 0x000000af +.long 0xbf9c9c23, 0x0000009c, 0xf7a4a453, 0x000000a4 +.long 0x967272e4, 0x00000072, 0x5bc0c09b, 0x000000c0 +.long 0xc2b7b775, 0x000000b7, 0x1cfdfde1, 0x000000fd +.long 0xae93933d, 0x00000093, 0x6a26264c, 0x00000026 +.long 0x5a36366c, 0x00000036, 0x413f3f7e, 0x0000003f +.long 0x02f7f7f5, 0x000000f7, 0x4fcccc83, 0x000000cc +.long 0x5c343468, 0x00000034, 0xf4a5a551, 0x000000a5 +.long 0x34e5e5d1, 0x000000e5, 0x08f1f1f9, 0x000000f1 +.long 0x937171e2, 0x00000071, 0x73d8d8ab, 0x000000d8 +.long 0x53313162, 0x00000031, 0x3f15152a, 0x00000015 +.long 0x0c040408, 0x00000004, 0x52c7c795, 0x000000c7 +.long 0x65232346, 0x00000023, 0x5ec3c39d, 0x000000c3 +.long 0x28181830, 0x00000018, 0xa1969637, 0x00000096 +.long 0x0f05050a, 0x00000005, 0xb59a9a2f, 0x0000009a +.long 0x0907070e, 0x00000007, 0x36121224, 0x00000012 +.long 0x9b80801b, 0x00000080, 0x3de2e2df, 0x000000e2 +.long 0x26ebebcd, 0x000000eb, 0x6927274e, 0x00000027 +.long 0xcdb2b27f, 0x000000b2, 0x9f7575ea, 0x00000075 +.long 0x1b090912, 0x00000009, 0x9e83831d, 0x00000083 +.long 0x742c2c58, 0x0000002c, 0x2e1a1a34, 0x0000001a +.long 0x2d1b1b36, 0x0000001b, 0xb26e6edc, 0x0000006e +.long 0xee5a5ab4, 0x0000005a, 0xfba0a05b, 0x000000a0 +.long 0xf65252a4, 0x00000052, 0x4d3b3b76, 0x0000003b +.long 0x61d6d6b7, 0x000000d6, 0xceb3b37d, 0x000000b3 +.long 0x7b292952, 0x00000029, 0x3ee3e3dd, 0x000000e3 +.long 0x712f2f5e, 0x0000002f, 0x97848413, 0x00000084 +.long 0xf55353a6, 0x00000053, 0x68d1d1b9, 0x000000d1 +.long 0x00000000, 0x00000000, 0x2cededc1, 0x000000ed +.long 
0x60202040, 0x00000020, 0x1ffcfce3, 0x000000fc +.long 0xc8b1b179, 0x000000b1, 0xed5b5bb6, 0x0000005b +.long 0xbe6a6ad4, 0x0000006a, 0x46cbcb8d, 0x000000cb +.long 0xd9bebe67, 0x000000be, 0x4b393972, 0x00000039 +.long 0xde4a4a94, 0x0000004a, 0xd44c4c98, 0x0000004c +.long 0xe85858b0, 0x00000058, 0x4acfcf85, 0x000000cf +.long 0x6bd0d0bb, 0x000000d0, 0x2aefefc5, 0x000000ef +.long 0xe5aaaa4f, 0x000000aa, 0x16fbfbed, 0x000000fb +.long 0xc5434386, 0x00000043, 0xd74d4d9a, 0x0000004d +.long 0x55333366, 0x00000033, 0x94858511, 0x00000085 +.long 0xcf45458a, 0x00000045, 0x10f9f9e9, 0x000000f9 +.long 0x06020204, 0x00000002, 0x817f7ffe, 0x0000007f +.long 0xf05050a0, 0x00000050, 0x443c3c78, 0x0000003c +.long 0xba9f9f25, 0x0000009f, 0xe3a8a84b, 0x000000a8 +.long 0xf35151a2, 0x00000051, 0xfea3a35d, 0x000000a3 +.long 0xc0404080, 0x00000040, 0x8a8f8f05, 0x0000008f +.long 0xad92923f, 0x00000092, 0xbc9d9d21, 0x0000009d +.long 0x48383870, 0x00000038, 0x04f5f5f1, 0x000000f5 +.long 0xdfbcbc63, 0x000000bc, 0xc1b6b677, 0x000000b6 +.long 0x75dadaaf, 0x000000da, 0x63212142, 0x00000021 +.long 0x30101020, 0x00000010, 0x1affffe5, 0x000000ff +.long 0x0ef3f3fd, 0x000000f3, 0x6dd2d2bf, 0x000000d2 +.long 0x4ccdcd81, 0x000000cd, 0x140c0c18, 0x0000000c +.long 0x35131326, 0x00000013, 0x2fececc3, 0x000000ec +.long 0xe15f5fbe, 0x0000005f, 0xa2979735, 0x00000097 +.long 0xcc444488, 0x00000044, 0x3917172e, 0x00000017 +.long 0x57c4c493, 0x000000c4, 0xf2a7a755, 0x000000a7 +.long 0x827e7efc, 0x0000007e, 0x473d3d7a, 0x0000003d +.long 0xac6464c8, 0x00000064, 0xe75d5dba, 0x0000005d +.long 0x2b191932, 0x00000019, 0x957373e6, 0x00000073 +.long 0xa06060c0, 0x00000060, 0x98818119, 0x00000081 +.long 0xd14f4f9e, 0x0000004f, 0x7fdcdca3, 0x000000dc +.long 0x66222244, 0x00000022, 0x7e2a2a54, 0x0000002a +.long 0xab90903b, 0x00000090, 0x8388880b, 0x00000088 +.long 0xca46468c, 0x00000046, 0x29eeeec7, 0x000000ee +.long 0xd3b8b86b, 0x000000b8, 0x3c141428, 0x00000014 +.long 0x79dedea7, 0x000000de, 0xe25e5ebc, 0x0000005e +.long 0x1d0b0b16, 0x0000000b, 0x76dbdbad, 0x000000db +.long 0x3be0e0db, 0x000000e0, 0x56323264, 0x00000032 +.long 0x4e3a3a74, 0x0000003a, 0x1e0a0a14, 0x0000000a +.long 0xdb494992, 0x00000049, 0x0a06060c, 0x00000006 +.long 0x6c242448, 0x00000024, 0xe45c5cb8, 0x0000005c +.long 0x5dc2c29f, 0x000000c2, 0x6ed3d3bd, 0x000000d3 +.long 0xefacac43, 0x000000ac, 0xa66262c4, 0x00000062 +.long 0xa8919139, 0x00000091, 0xa4959531, 0x00000095 +.long 0x37e4e4d3, 0x000000e4, 0x8b7979f2, 0x00000079 +.long 0x32e7e7d5, 0x000000e7, 0x43c8c88b, 0x000000c8 +.long 0x5937376e, 0x00000037, 0xb76d6dda, 0x0000006d +.long 0x8c8d8d01, 0x0000008d, 0x64d5d5b1, 0x000000d5 +.long 0xd24e4e9c, 0x0000004e, 0xe0a9a949, 0x000000a9 +.long 0xb46c6cd8, 0x0000006c, 0xfa5656ac, 0x00000056 +.long 0x07f4f4f3, 0x000000f4, 0x25eaeacf, 0x000000ea +.long 0xaf6565ca, 0x00000065, 0x8e7a7af4, 0x0000007a +.long 0xe9aeae47, 0x000000ae, 0x18080810, 0x00000008 +.long 0xd5baba6f, 0x000000ba, 0x887878f0, 0x00000078 +.long 0x6f25254a, 0x00000025, 0x722e2e5c, 0x0000002e +.long 0x241c1c38, 0x0000001c, 0xf1a6a657, 0x000000a6 +.long 0xc7b4b473, 0x000000b4, 0x51c6c697, 0x000000c6 +.long 0x23e8e8cb, 0x000000e8, 0x7cdddda1, 0x000000dd +.long 0x9c7474e8, 0x00000074, 0x211f1f3e, 0x0000001f +.long 0xdd4b4b96, 0x0000004b, 0xdcbdbd61, 0x000000bd +.long 0x868b8b0d, 0x0000008b, 0x858a8a0f, 0x0000008a +.long 0x907070e0, 0x00000070, 0x423e3e7c, 0x0000003e +.long 0xc4b5b571, 0x000000b5, 0xaa6666cc, 0x00000066 +.long 0xd8484890, 0x00000048, 0x05030306, 0x00000003 +.long 0x01f6f6f7, 0x000000f6, 0x120e0e1c, 
0x0000000e +.long 0xa36161c2, 0x00000061, 0x5f35356a, 0x00000035 +.long 0xf95757ae, 0x00000057, 0xd0b9b969, 0x000000b9 +.long 0x91868617, 0x00000086, 0x58c1c199, 0x000000c1 +.long 0x271d1d3a, 0x0000001d, 0xb99e9e27, 0x0000009e +.long 0x38e1e1d9, 0x000000e1, 0x13f8f8eb, 0x000000f8 +.long 0xb398982b, 0x00000098, 0x33111122, 0x00000011 +.long 0xbb6969d2, 0x00000069, 0x70d9d9a9, 0x000000d9 +.long 0x898e8e07, 0x0000008e, 0xa7949433, 0x00000094 +.long 0xb69b9b2d, 0x0000009b, 0x221e1e3c, 0x0000001e +.long 0x92878715, 0x00000087, 0x20e9e9c9, 0x000000e9 +.long 0x49cece87, 0x000000ce, 0xff5555aa, 0x00000055 +.long 0x78282850, 0x00000028, 0x7adfdfa5, 0x000000df +.long 0x8f8c8c03, 0x0000008c, 0xf8a1a159, 0x000000a1 +.long 0x80898909, 0x00000089, 0x170d0d1a, 0x0000000d +.long 0xdabfbf65, 0x000000bf, 0x31e6e6d7, 0x000000e6 +.long 0xc6424284, 0x00000042, 0xb86868d0, 0x00000068 +.long 0xc3414182, 0x00000041, 0xb0999929, 0x00000099 +.long 0x772d2d5a, 0x0000002d, 0x110f0f1e, 0x0000000f +.long 0xcbb0b07b, 0x000000b0, 0xfc5454a8, 0x00000054 +.long 0xd6bbbb6d, 0x000000bb, 0x3a16162c, 0x00000016 +.LtableE1: +.long 0x6363c6a5 +.LtableEs1: +.long 0x00006300, 0x7c7cf884, 0x00007c00 +.long 0x7777ee99, 0x00007700, 0x7b7bf68d, 0x00007b00 +.long 0xf2f2ff0d, 0x0000f200, 0x6b6bd6bd, 0x00006b00 +.long 0x6f6fdeb1, 0x00006f00, 0xc5c59154, 0x0000c500 +.long 0x30306050, 0x00003000, 0x01010203, 0x00000100 +.long 0x6767cea9, 0x00006700, 0x2b2b567d, 0x00002b00 +.long 0xfefee719, 0x0000fe00, 0xd7d7b562, 0x0000d700 +.long 0xabab4de6, 0x0000ab00, 0x7676ec9a, 0x00007600 +.long 0xcaca8f45, 0x0000ca00, 0x82821f9d, 0x00008200 +.long 0xc9c98940, 0x0000c900, 0x7d7dfa87, 0x00007d00 +.long 0xfafaef15, 0x0000fa00, 0x5959b2eb, 0x00005900 +.long 0x47478ec9, 0x00004700, 0xf0f0fb0b, 0x0000f000 +.long 0xadad41ec, 0x0000ad00, 0xd4d4b367, 0x0000d400 +.long 0xa2a25ffd, 0x0000a200, 0xafaf45ea, 0x0000af00 +.long 0x9c9c23bf, 0x00009c00, 0xa4a453f7, 0x0000a400 +.long 0x7272e496, 0x00007200, 0xc0c09b5b, 0x0000c000 +.long 0xb7b775c2, 0x0000b700, 0xfdfde11c, 0x0000fd00 +.long 0x93933dae, 0x00009300, 0x26264c6a, 0x00002600 +.long 0x36366c5a, 0x00003600, 0x3f3f7e41, 0x00003f00 +.long 0xf7f7f502, 0x0000f700, 0xcccc834f, 0x0000cc00 +.long 0x3434685c, 0x00003400, 0xa5a551f4, 0x0000a500 +.long 0xe5e5d134, 0x0000e500, 0xf1f1f908, 0x0000f100 +.long 0x7171e293, 0x00007100, 0xd8d8ab73, 0x0000d800 +.long 0x31316253, 0x00003100, 0x15152a3f, 0x00001500 +.long 0x0404080c, 0x00000400, 0xc7c79552, 0x0000c700 +.long 0x23234665, 0x00002300, 0xc3c39d5e, 0x0000c300 +.long 0x18183028, 0x00001800, 0x969637a1, 0x00009600 +.long 0x05050a0f, 0x00000500, 0x9a9a2fb5, 0x00009a00 +.long 0x07070e09, 0x00000700, 0x12122436, 0x00001200 +.long 0x80801b9b, 0x00008000, 0xe2e2df3d, 0x0000e200 +.long 0xebebcd26, 0x0000eb00, 0x27274e69, 0x00002700 +.long 0xb2b27fcd, 0x0000b200, 0x7575ea9f, 0x00007500 +.long 0x0909121b, 0x00000900, 0x83831d9e, 0x00008300 +.long 0x2c2c5874, 0x00002c00, 0x1a1a342e, 0x00001a00 +.long 0x1b1b362d, 0x00001b00, 0x6e6edcb2, 0x00006e00 +.long 0x5a5ab4ee, 0x00005a00, 0xa0a05bfb, 0x0000a000 +.long 0x5252a4f6, 0x00005200, 0x3b3b764d, 0x00003b00 +.long 0xd6d6b761, 0x0000d600, 0xb3b37dce, 0x0000b300 +.long 0x2929527b, 0x00002900, 0xe3e3dd3e, 0x0000e300 +.long 0x2f2f5e71, 0x00002f00, 0x84841397, 0x00008400 +.long 0x5353a6f5, 0x00005300, 0xd1d1b968, 0x0000d100 +.long 0x00000000, 0x00000000, 0xededc12c, 0x0000ed00 +.long 0x20204060, 0x00002000, 0xfcfce31f, 0x0000fc00 +.long 0xb1b179c8, 0x0000b100, 0x5b5bb6ed, 0x00005b00 +.long 0x6a6ad4be, 0x00006a00, 0xcbcb8d46, 0x0000cb00 
+.long 0xbebe67d9, 0x0000be00, 0x3939724b, 0x00003900 +.long 0x4a4a94de, 0x00004a00, 0x4c4c98d4, 0x00004c00 +.long 0x5858b0e8, 0x00005800, 0xcfcf854a, 0x0000cf00 +.long 0xd0d0bb6b, 0x0000d000, 0xefefc52a, 0x0000ef00 +.long 0xaaaa4fe5, 0x0000aa00, 0xfbfbed16, 0x0000fb00 +.long 0x434386c5, 0x00004300, 0x4d4d9ad7, 0x00004d00 +.long 0x33336655, 0x00003300, 0x85851194, 0x00008500 +.long 0x45458acf, 0x00004500, 0xf9f9e910, 0x0000f900 +.long 0x02020406, 0x00000200, 0x7f7ffe81, 0x00007f00 +.long 0x5050a0f0, 0x00005000, 0x3c3c7844, 0x00003c00 +.long 0x9f9f25ba, 0x00009f00, 0xa8a84be3, 0x0000a800 +.long 0x5151a2f3, 0x00005100, 0xa3a35dfe, 0x0000a300 +.long 0x404080c0, 0x00004000, 0x8f8f058a, 0x00008f00 +.long 0x92923fad, 0x00009200, 0x9d9d21bc, 0x00009d00 +.long 0x38387048, 0x00003800, 0xf5f5f104, 0x0000f500 +.long 0xbcbc63df, 0x0000bc00, 0xb6b677c1, 0x0000b600 +.long 0xdadaaf75, 0x0000da00, 0x21214263, 0x00002100 +.long 0x10102030, 0x00001000, 0xffffe51a, 0x0000ff00 +.long 0xf3f3fd0e, 0x0000f300, 0xd2d2bf6d, 0x0000d200 +.long 0xcdcd814c, 0x0000cd00, 0x0c0c1814, 0x00000c00 +.long 0x13132635, 0x00001300, 0xececc32f, 0x0000ec00 +.long 0x5f5fbee1, 0x00005f00, 0x979735a2, 0x00009700 +.long 0x444488cc, 0x00004400, 0x17172e39, 0x00001700 +.long 0xc4c49357, 0x0000c400, 0xa7a755f2, 0x0000a700 +.long 0x7e7efc82, 0x00007e00, 0x3d3d7a47, 0x00003d00 +.long 0x6464c8ac, 0x00006400, 0x5d5dbae7, 0x00005d00 +.long 0x1919322b, 0x00001900, 0x7373e695, 0x00007300 +.long 0x6060c0a0, 0x00006000, 0x81811998, 0x00008100 +.long 0x4f4f9ed1, 0x00004f00, 0xdcdca37f, 0x0000dc00 +.long 0x22224466, 0x00002200, 0x2a2a547e, 0x00002a00 +.long 0x90903bab, 0x00009000, 0x88880b83, 0x00008800 +.long 0x46468cca, 0x00004600, 0xeeeec729, 0x0000ee00 +.long 0xb8b86bd3, 0x0000b800, 0x1414283c, 0x00001400 +.long 0xdedea779, 0x0000de00, 0x5e5ebce2, 0x00005e00 +.long 0x0b0b161d, 0x00000b00, 0xdbdbad76, 0x0000db00 +.long 0xe0e0db3b, 0x0000e000, 0x32326456, 0x00003200 +.long 0x3a3a744e, 0x00003a00, 0x0a0a141e, 0x00000a00 +.long 0x494992db, 0x00004900, 0x06060c0a, 0x00000600 +.long 0x2424486c, 0x00002400, 0x5c5cb8e4, 0x00005c00 +.long 0xc2c29f5d, 0x0000c200, 0xd3d3bd6e, 0x0000d300 +.long 0xacac43ef, 0x0000ac00, 0x6262c4a6, 0x00006200 +.long 0x919139a8, 0x00009100, 0x959531a4, 0x00009500 +.long 0xe4e4d337, 0x0000e400, 0x7979f28b, 0x00007900 +.long 0xe7e7d532, 0x0000e700, 0xc8c88b43, 0x0000c800 +.long 0x37376e59, 0x00003700, 0x6d6ddab7, 0x00006d00 +.long 0x8d8d018c, 0x00008d00, 0xd5d5b164, 0x0000d500 +.long 0x4e4e9cd2, 0x00004e00, 0xa9a949e0, 0x0000a900 +.long 0x6c6cd8b4, 0x00006c00, 0x5656acfa, 0x00005600 +.long 0xf4f4f307, 0x0000f400, 0xeaeacf25, 0x0000ea00 +.long 0x6565caaf, 0x00006500, 0x7a7af48e, 0x00007a00 +.long 0xaeae47e9, 0x0000ae00, 0x08081018, 0x00000800 +.long 0xbaba6fd5, 0x0000ba00, 0x7878f088, 0x00007800 +.long 0x25254a6f, 0x00002500, 0x2e2e5c72, 0x00002e00 +.long 0x1c1c3824, 0x00001c00, 0xa6a657f1, 0x0000a600 +.long 0xb4b473c7, 0x0000b400, 0xc6c69751, 0x0000c600 +.long 0xe8e8cb23, 0x0000e800, 0xdddda17c, 0x0000dd00 +.long 0x7474e89c, 0x00007400, 0x1f1f3e21, 0x00001f00 +.long 0x4b4b96dd, 0x00004b00, 0xbdbd61dc, 0x0000bd00 +.long 0x8b8b0d86, 0x00008b00, 0x8a8a0f85, 0x00008a00 +.long 0x7070e090, 0x00007000, 0x3e3e7c42, 0x00003e00 +.long 0xb5b571c4, 0x0000b500, 0x6666ccaa, 0x00006600 +.long 0x484890d8, 0x00004800, 0x03030605, 0x00000300 +.long 0xf6f6f701, 0x0000f600, 0x0e0e1c12, 0x00000e00 +.long 0x6161c2a3, 0x00006100, 0x35356a5f, 0x00003500 +.long 0x5757aef9, 0x00005700, 0xb9b969d0, 0x0000b900 +.long 0x86861791, 0x00008600, 0xc1c19958, 
0x0000c100 +.long 0x1d1d3a27, 0x00001d00, 0x9e9e27b9, 0x00009e00 +.long 0xe1e1d938, 0x0000e100, 0xf8f8eb13, 0x0000f800 +.long 0x98982bb3, 0x00009800, 0x11112233, 0x00001100 +.long 0x6969d2bb, 0x00006900, 0xd9d9a970, 0x0000d900 +.long 0x8e8e0789, 0x00008e00, 0x949433a7, 0x00009400 +.long 0x9b9b2db6, 0x00009b00, 0x1e1e3c22, 0x00001e00 +.long 0x87871592, 0x00008700, 0xe9e9c920, 0x0000e900 +.long 0xcece8749, 0x0000ce00, 0x5555aaff, 0x00005500 +.long 0x28285078, 0x00002800, 0xdfdfa57a, 0x0000df00 +.long 0x8c8c038f, 0x00008c00, 0xa1a159f8, 0x0000a100 +.long 0x89890980, 0x00008900, 0x0d0d1a17, 0x00000d00 +.long 0xbfbf65da, 0x0000bf00, 0xe6e6d731, 0x0000e600 +.long 0x424284c6, 0x00004200, 0x6868d0b8, 0x00006800 +.long 0x414182c3, 0x00004100, 0x999929b0, 0x00009900 +.long 0x2d2d5a77, 0x00002d00, 0x0f0f1e11, 0x00000f00 +.long 0xb0b07bcb, 0x0000b000, 0x5454a8fc, 0x00005400 +.long 0xbbbb6dd6, 0x0000bb00, 0x16162c3a, 0x00001600 +.LtableE2: +.long 0x63c6a563 +.LtableEs2: +.long 0x00630000, 0x7cf8847c, 0x007c0000 +.long 0x77ee9977, 0x00770000, 0x7bf68d7b, 0x007b0000 +.long 0xf2ff0df2, 0x00f20000, 0x6bd6bd6b, 0x006b0000 +.long 0x6fdeb16f, 0x006f0000, 0xc59154c5, 0x00c50000 +.long 0x30605030, 0x00300000, 0x01020301, 0x00010000 +.long 0x67cea967, 0x00670000, 0x2b567d2b, 0x002b0000 +.long 0xfee719fe, 0x00fe0000, 0xd7b562d7, 0x00d70000 +.long 0xab4de6ab, 0x00ab0000, 0x76ec9a76, 0x00760000 +.long 0xca8f45ca, 0x00ca0000, 0x821f9d82, 0x00820000 +.long 0xc98940c9, 0x00c90000, 0x7dfa877d, 0x007d0000 +.long 0xfaef15fa, 0x00fa0000, 0x59b2eb59, 0x00590000 +.long 0x478ec947, 0x00470000, 0xf0fb0bf0, 0x00f00000 +.long 0xad41ecad, 0x00ad0000, 0xd4b367d4, 0x00d40000 +.long 0xa25ffda2, 0x00a20000, 0xaf45eaaf, 0x00af0000 +.long 0x9c23bf9c, 0x009c0000, 0xa453f7a4, 0x00a40000 +.long 0x72e49672, 0x00720000, 0xc09b5bc0, 0x00c00000 +.long 0xb775c2b7, 0x00b70000, 0xfde11cfd, 0x00fd0000 +.long 0x933dae93, 0x00930000, 0x264c6a26, 0x00260000 +.long 0x366c5a36, 0x00360000, 0x3f7e413f, 0x003f0000 +.long 0xf7f502f7, 0x00f70000, 0xcc834fcc, 0x00cc0000 +.long 0x34685c34, 0x00340000, 0xa551f4a5, 0x00a50000 +.long 0xe5d134e5, 0x00e50000, 0xf1f908f1, 0x00f10000 +.long 0x71e29371, 0x00710000, 0xd8ab73d8, 0x00d80000 +.long 0x31625331, 0x00310000, 0x152a3f15, 0x00150000 +.long 0x04080c04, 0x00040000, 0xc79552c7, 0x00c70000 +.long 0x23466523, 0x00230000, 0xc39d5ec3, 0x00c30000 +.long 0x18302818, 0x00180000, 0x9637a196, 0x00960000 +.long 0x050a0f05, 0x00050000, 0x9a2fb59a, 0x009a0000 +.long 0x070e0907, 0x00070000, 0x12243612, 0x00120000 +.long 0x801b9b80, 0x00800000, 0xe2df3de2, 0x00e20000 +.long 0xebcd26eb, 0x00eb0000, 0x274e6927, 0x00270000 +.long 0xb27fcdb2, 0x00b20000, 0x75ea9f75, 0x00750000 +.long 0x09121b09, 0x00090000, 0x831d9e83, 0x00830000 +.long 0x2c58742c, 0x002c0000, 0x1a342e1a, 0x001a0000 +.long 0x1b362d1b, 0x001b0000, 0x6edcb26e, 0x006e0000 +.long 0x5ab4ee5a, 0x005a0000, 0xa05bfba0, 0x00a00000 +.long 0x52a4f652, 0x00520000, 0x3b764d3b, 0x003b0000 +.long 0xd6b761d6, 0x00d60000, 0xb37dceb3, 0x00b30000 +.long 0x29527b29, 0x00290000, 0xe3dd3ee3, 0x00e30000 +.long 0x2f5e712f, 0x002f0000, 0x84139784, 0x00840000 +.long 0x53a6f553, 0x00530000, 0xd1b968d1, 0x00d10000 +.long 0x00000000, 0x00000000, 0xedc12ced, 0x00ed0000 +.long 0x20406020, 0x00200000, 0xfce31ffc, 0x00fc0000 +.long 0xb179c8b1, 0x00b10000, 0x5bb6ed5b, 0x005b0000 +.long 0x6ad4be6a, 0x006a0000, 0xcb8d46cb, 0x00cb0000 +.long 0xbe67d9be, 0x00be0000, 0x39724b39, 0x00390000 +.long 0x4a94de4a, 0x004a0000, 0x4c98d44c, 0x004c0000 +.long 0x58b0e858, 0x00580000, 0xcf854acf, 0x00cf0000 
+.long 0xd0bb6bd0, 0x00d00000, 0xefc52aef, 0x00ef0000 +.long 0xaa4fe5aa, 0x00aa0000, 0xfbed16fb, 0x00fb0000 +.long 0x4386c543, 0x00430000, 0x4d9ad74d, 0x004d0000 +.long 0x33665533, 0x00330000, 0x85119485, 0x00850000 +.long 0x458acf45, 0x00450000, 0xf9e910f9, 0x00f90000 +.long 0x02040602, 0x00020000, 0x7ffe817f, 0x007f0000 +.long 0x50a0f050, 0x00500000, 0x3c78443c, 0x003c0000 +.long 0x9f25ba9f, 0x009f0000, 0xa84be3a8, 0x00a80000 +.long 0x51a2f351, 0x00510000, 0xa35dfea3, 0x00a30000 +.long 0x4080c040, 0x00400000, 0x8f058a8f, 0x008f0000 +.long 0x923fad92, 0x00920000, 0x9d21bc9d, 0x009d0000 +.long 0x38704838, 0x00380000, 0xf5f104f5, 0x00f50000 +.long 0xbc63dfbc, 0x00bc0000, 0xb677c1b6, 0x00b60000 +.long 0xdaaf75da, 0x00da0000, 0x21426321, 0x00210000 +.long 0x10203010, 0x00100000, 0xffe51aff, 0x00ff0000 +.long 0xf3fd0ef3, 0x00f30000, 0xd2bf6dd2, 0x00d20000 +.long 0xcd814ccd, 0x00cd0000, 0x0c18140c, 0x000c0000 +.long 0x13263513, 0x00130000, 0xecc32fec, 0x00ec0000 +.long 0x5fbee15f, 0x005f0000, 0x9735a297, 0x00970000 +.long 0x4488cc44, 0x00440000, 0x172e3917, 0x00170000 +.long 0xc49357c4, 0x00c40000, 0xa755f2a7, 0x00a70000 +.long 0x7efc827e, 0x007e0000, 0x3d7a473d, 0x003d0000 +.long 0x64c8ac64, 0x00640000, 0x5dbae75d, 0x005d0000 +.long 0x19322b19, 0x00190000, 0x73e69573, 0x00730000 +.long 0x60c0a060, 0x00600000, 0x81199881, 0x00810000 +.long 0x4f9ed14f, 0x004f0000, 0xdca37fdc, 0x00dc0000 +.long 0x22446622, 0x00220000, 0x2a547e2a, 0x002a0000 +.long 0x903bab90, 0x00900000, 0x880b8388, 0x00880000 +.long 0x468cca46, 0x00460000, 0xeec729ee, 0x00ee0000 +.long 0xb86bd3b8, 0x00b80000, 0x14283c14, 0x00140000 +.long 0xdea779de, 0x00de0000, 0x5ebce25e, 0x005e0000 +.long 0x0b161d0b, 0x000b0000, 0xdbad76db, 0x00db0000 +.long 0xe0db3be0, 0x00e00000, 0x32645632, 0x00320000 +.long 0x3a744e3a, 0x003a0000, 0x0a141e0a, 0x000a0000 +.long 0x4992db49, 0x00490000, 0x060c0a06, 0x00060000 +.long 0x24486c24, 0x00240000, 0x5cb8e45c, 0x005c0000 +.long 0xc29f5dc2, 0x00c20000, 0xd3bd6ed3, 0x00d30000 +.long 0xac43efac, 0x00ac0000, 0x62c4a662, 0x00620000 +.long 0x9139a891, 0x00910000, 0x9531a495, 0x00950000 +.long 0xe4d337e4, 0x00e40000, 0x79f28b79, 0x00790000 +.long 0xe7d532e7, 0x00e70000, 0xc88b43c8, 0x00c80000 +.long 0x376e5937, 0x00370000, 0x6ddab76d, 0x006d0000 +.long 0x8d018c8d, 0x008d0000, 0xd5b164d5, 0x00d50000 +.long 0x4e9cd24e, 0x004e0000, 0xa949e0a9, 0x00a90000 +.long 0x6cd8b46c, 0x006c0000, 0x56acfa56, 0x00560000 +.long 0xf4f307f4, 0x00f40000, 0xeacf25ea, 0x00ea0000 +.long 0x65caaf65, 0x00650000, 0x7af48e7a, 0x007a0000 +.long 0xae47e9ae, 0x00ae0000, 0x08101808, 0x00080000 +.long 0xba6fd5ba, 0x00ba0000, 0x78f08878, 0x00780000 +.long 0x254a6f25, 0x00250000, 0x2e5c722e, 0x002e0000 +.long 0x1c38241c, 0x001c0000, 0xa657f1a6, 0x00a60000 +.long 0xb473c7b4, 0x00b40000, 0xc69751c6, 0x00c60000 +.long 0xe8cb23e8, 0x00e80000, 0xdda17cdd, 0x00dd0000 +.long 0x74e89c74, 0x00740000, 0x1f3e211f, 0x001f0000 +.long 0x4b96dd4b, 0x004b0000, 0xbd61dcbd, 0x00bd0000 +.long 0x8b0d868b, 0x008b0000, 0x8a0f858a, 0x008a0000 +.long 0x70e09070, 0x00700000, 0x3e7c423e, 0x003e0000 +.long 0xb571c4b5, 0x00b50000, 0x66ccaa66, 0x00660000 +.long 0x4890d848, 0x00480000, 0x03060503, 0x00030000 +.long 0xf6f701f6, 0x00f60000, 0x0e1c120e, 0x000e0000 +.long 0x61c2a361, 0x00610000, 0x356a5f35, 0x00350000 +.long 0x57aef957, 0x00570000, 0xb969d0b9, 0x00b90000 +.long 0x86179186, 0x00860000, 0xc19958c1, 0x00c10000 +.long 0x1d3a271d, 0x001d0000, 0x9e27b99e, 0x009e0000 +.long 0xe1d938e1, 0x00e10000, 0xf8eb13f8, 0x00f80000 +.long 0x982bb398, 0x00980000, 0x11223311, 
0x00110000 +.long 0x69d2bb69, 0x00690000, 0xd9a970d9, 0x00d90000 +.long 0x8e07898e, 0x008e0000, 0x9433a794, 0x00940000 +.long 0x9b2db69b, 0x009b0000, 0x1e3c221e, 0x001e0000 +.long 0x87159287, 0x00870000, 0xe9c920e9, 0x00e90000 +.long 0xce8749ce, 0x00ce0000, 0x55aaff55, 0x00550000 +.long 0x28507828, 0x00280000, 0xdfa57adf, 0x00df0000 +.long 0x8c038f8c, 0x008c0000, 0xa159f8a1, 0x00a10000 +.long 0x89098089, 0x00890000, 0x0d1a170d, 0x000d0000 +.long 0xbf65dabf, 0x00bf0000, 0xe6d731e6, 0x00e60000 +.long 0x4284c642, 0x00420000, 0x68d0b868, 0x00680000 +.long 0x4182c341, 0x00410000, 0x9929b099, 0x00990000 +.long 0x2d5a772d, 0x002d0000, 0x0f1e110f, 0x000f0000 +.long 0xb07bcbb0, 0x00b00000, 0x54a8fc54, 0x00540000 +.long 0xbb6dd6bb, 0x00bb0000, 0x162c3a16, 0x00160000 +.LtableE3: +.long 0xc6a56363 +.LtableEs3: +.long 0x63000000, 0xf8847c7c, 0x7c000000 +.long 0xee997777, 0x77000000, 0xf68d7b7b, 0x7b000000 +.long 0xff0df2f2, 0xf2000000, 0xd6bd6b6b, 0x6b000000 +.long 0xdeb16f6f, 0x6f000000, 0x9154c5c5, 0xc5000000 +.long 0x60503030, 0x30000000, 0x02030101, 0x01000000 +.long 0xcea96767, 0x67000000, 0x567d2b2b, 0x2b000000 +.long 0xe719fefe, 0xfe000000, 0xb562d7d7, 0xd7000000 +.long 0x4de6abab, 0xab000000, 0xec9a7676, 0x76000000 +.long 0x8f45caca, 0xca000000, 0x1f9d8282, 0x82000000 +.long 0x8940c9c9, 0xc9000000, 0xfa877d7d, 0x7d000000 +.long 0xef15fafa, 0xfa000000, 0xb2eb5959, 0x59000000 +.long 0x8ec94747, 0x47000000, 0xfb0bf0f0, 0xf0000000 +.long 0x41ecadad, 0xad000000, 0xb367d4d4, 0xd4000000 +.long 0x5ffda2a2, 0xa2000000, 0x45eaafaf, 0xaf000000 +.long 0x23bf9c9c, 0x9c000000, 0x53f7a4a4, 0xa4000000 +.long 0xe4967272, 0x72000000, 0x9b5bc0c0, 0xc0000000 +.long 0x75c2b7b7, 0xb7000000, 0xe11cfdfd, 0xfd000000 +.long 0x3dae9393, 0x93000000, 0x4c6a2626, 0x26000000 +.long 0x6c5a3636, 0x36000000, 0x7e413f3f, 0x3f000000 +.long 0xf502f7f7, 0xf7000000, 0x834fcccc, 0xcc000000 +.long 0x685c3434, 0x34000000, 0x51f4a5a5, 0xa5000000 +.long 0xd134e5e5, 0xe5000000, 0xf908f1f1, 0xf1000000 +.long 0xe2937171, 0x71000000, 0xab73d8d8, 0xd8000000 +.long 0x62533131, 0x31000000, 0x2a3f1515, 0x15000000 +.long 0x080c0404, 0x04000000, 0x9552c7c7, 0xc7000000 +.long 0x46652323, 0x23000000, 0x9d5ec3c3, 0xc3000000 +.long 0x30281818, 0x18000000, 0x37a19696, 0x96000000 +.long 0x0a0f0505, 0x05000000, 0x2fb59a9a, 0x9a000000 +.long 0x0e090707, 0x07000000, 0x24361212, 0x12000000 +.long 0x1b9b8080, 0x80000000, 0xdf3de2e2, 0xe2000000 +.long 0xcd26ebeb, 0xeb000000, 0x4e692727, 0x27000000 +.long 0x7fcdb2b2, 0xb2000000, 0xea9f7575, 0x75000000 +.long 0x121b0909, 0x09000000, 0x1d9e8383, 0x83000000 +.long 0x58742c2c, 0x2c000000, 0x342e1a1a, 0x1a000000 +.long 0x362d1b1b, 0x1b000000, 0xdcb26e6e, 0x6e000000 +.long 0xb4ee5a5a, 0x5a000000, 0x5bfba0a0, 0xa0000000 +.long 0xa4f65252, 0x52000000, 0x764d3b3b, 0x3b000000 +.long 0xb761d6d6, 0xd6000000, 0x7dceb3b3, 0xb3000000 +.long 0x527b2929, 0x29000000, 0xdd3ee3e3, 0xe3000000 +.long 0x5e712f2f, 0x2f000000, 0x13978484, 0x84000000 +.long 0xa6f55353, 0x53000000, 0xb968d1d1, 0xd1000000 +.long 0x00000000, 0x00000000, 0xc12ceded, 0xed000000 +.long 0x40602020, 0x20000000, 0xe31ffcfc, 0xfc000000 +.long 0x79c8b1b1, 0xb1000000, 0xb6ed5b5b, 0x5b000000 +.long 0xd4be6a6a, 0x6a000000, 0x8d46cbcb, 0xcb000000 +.long 0x67d9bebe, 0xbe000000, 0x724b3939, 0x39000000 +.long 0x94de4a4a, 0x4a000000, 0x98d44c4c, 0x4c000000 +.long 0xb0e85858, 0x58000000, 0x854acfcf, 0xcf000000 +.long 0xbb6bd0d0, 0xd0000000, 0xc52aefef, 0xef000000 +.long 0x4fe5aaaa, 0xaa000000, 0xed16fbfb, 0xfb000000 +.long 0x86c54343, 0x43000000, 0x9ad74d4d, 0x4d000000 
+.long 0x66553333, 0x33000000, 0x11948585, 0x85000000 +.long 0x8acf4545, 0x45000000, 0xe910f9f9, 0xf9000000 +.long 0x04060202, 0x02000000, 0xfe817f7f, 0x7f000000 +.long 0xa0f05050, 0x50000000, 0x78443c3c, 0x3c000000 +.long 0x25ba9f9f, 0x9f000000, 0x4be3a8a8, 0xa8000000 +.long 0xa2f35151, 0x51000000, 0x5dfea3a3, 0xa3000000 +.long 0x80c04040, 0x40000000, 0x058a8f8f, 0x8f000000 +.long 0x3fad9292, 0x92000000, 0x21bc9d9d, 0x9d000000 +.long 0x70483838, 0x38000000, 0xf104f5f5, 0xf5000000 +.long 0x63dfbcbc, 0xbc000000, 0x77c1b6b6, 0xb6000000 +.long 0xaf75dada, 0xda000000, 0x42632121, 0x21000000 +.long 0x20301010, 0x10000000, 0xe51affff, 0xff000000 +.long 0xfd0ef3f3, 0xf3000000, 0xbf6dd2d2, 0xd2000000 +.long 0x814ccdcd, 0xcd000000, 0x18140c0c, 0x0c000000 +.long 0x26351313, 0x13000000, 0xc32fecec, 0xec000000 +.long 0xbee15f5f, 0x5f000000, 0x35a29797, 0x97000000 +.long 0x88cc4444, 0x44000000, 0x2e391717, 0x17000000 +.long 0x9357c4c4, 0xc4000000, 0x55f2a7a7, 0xa7000000 +.long 0xfc827e7e, 0x7e000000, 0x7a473d3d, 0x3d000000 +.long 0xc8ac6464, 0x64000000, 0xbae75d5d, 0x5d000000 +.long 0x322b1919, 0x19000000, 0xe6957373, 0x73000000 +.long 0xc0a06060, 0x60000000, 0x19988181, 0x81000000 +.long 0x9ed14f4f, 0x4f000000, 0xa37fdcdc, 0xdc000000 +.long 0x44662222, 0x22000000, 0x547e2a2a, 0x2a000000 +.long 0x3bab9090, 0x90000000, 0x0b838888, 0x88000000 +.long 0x8cca4646, 0x46000000, 0xc729eeee, 0xee000000 +.long 0x6bd3b8b8, 0xb8000000, 0x283c1414, 0x14000000 +.long 0xa779dede, 0xde000000, 0xbce25e5e, 0x5e000000 +.long 0x161d0b0b, 0x0b000000, 0xad76dbdb, 0xdb000000 +.long 0xdb3be0e0, 0xe0000000, 0x64563232, 0x32000000 +.long 0x744e3a3a, 0x3a000000, 0x141e0a0a, 0x0a000000 +.long 0x92db4949, 0x49000000, 0x0c0a0606, 0x06000000 +.long 0x486c2424, 0x24000000, 0xb8e45c5c, 0x5c000000 +.long 0x9f5dc2c2, 0xc2000000, 0xbd6ed3d3, 0xd3000000 +.long 0x43efacac, 0xac000000, 0xc4a66262, 0x62000000 +.long 0x39a89191, 0x91000000, 0x31a49595, 0x95000000 +.long 0xd337e4e4, 0xe4000000, 0xf28b7979, 0x79000000 +.long 0xd532e7e7, 0xe7000000, 0x8b43c8c8, 0xc8000000 +.long 0x6e593737, 0x37000000, 0xdab76d6d, 0x6d000000 +.long 0x018c8d8d, 0x8d000000, 0xb164d5d5, 0xd5000000 +.long 0x9cd24e4e, 0x4e000000, 0x49e0a9a9, 0xa9000000 +.long 0xd8b46c6c, 0x6c000000, 0xacfa5656, 0x56000000 +.long 0xf307f4f4, 0xf4000000, 0xcf25eaea, 0xea000000 +.long 0xcaaf6565, 0x65000000, 0xf48e7a7a, 0x7a000000 +.long 0x47e9aeae, 0xae000000, 0x10180808, 0x08000000 +.long 0x6fd5baba, 0xba000000, 0xf0887878, 0x78000000 +.long 0x4a6f2525, 0x25000000, 0x5c722e2e, 0x2e000000 +.long 0x38241c1c, 0x1c000000, 0x57f1a6a6, 0xa6000000 +.long 0x73c7b4b4, 0xb4000000, 0x9751c6c6, 0xc6000000 +.long 0xcb23e8e8, 0xe8000000, 0xa17cdddd, 0xdd000000 +.long 0xe89c7474, 0x74000000, 0x3e211f1f, 0x1f000000 +.long 0x96dd4b4b, 0x4b000000, 0x61dcbdbd, 0xbd000000 +.long 0x0d868b8b, 0x8b000000, 0x0f858a8a, 0x8a000000 +.long 0xe0907070, 0x70000000, 0x7c423e3e, 0x3e000000 +.long 0x71c4b5b5, 0xb5000000, 0xccaa6666, 0x66000000 +.long 0x90d84848, 0x48000000, 0x06050303, 0x03000000 +.long 0xf701f6f6, 0xf6000000, 0x1c120e0e, 0x0e000000 +.long 0xc2a36161, 0x61000000, 0x6a5f3535, 0x35000000 +.long 0xaef95757, 0x57000000, 0x69d0b9b9, 0xb9000000 +.long 0x17918686, 0x86000000, 0x9958c1c1, 0xc1000000 +.long 0x3a271d1d, 0x1d000000, 0x27b99e9e, 0x9e000000 +.long 0xd938e1e1, 0xe1000000, 0xeb13f8f8, 0xf8000000 +.long 0x2bb39898, 0x98000000, 0x22331111, 0x11000000 +.long 0xd2bb6969, 0x69000000, 0xa970d9d9, 0xd9000000 +.long 0x07898e8e, 0x8e000000, 0x33a79494, 0x94000000 +.long 0x2db69b9b, 0x9b000000, 0x3c221e1e, 
0x1e000000 +.long 0x15928787, 0x87000000, 0xc920e9e9, 0xe9000000 +.long 0x8749cece, 0xce000000, 0xaaff5555, 0x55000000 +.long 0x50782828, 0x28000000, 0xa57adfdf, 0xdf000000 +.long 0x038f8c8c, 0x8c000000, 0x59f8a1a1, 0xa1000000 +.long 0x09808989, 0x89000000, 0x1a170d0d, 0x0d000000 +.long 0x65dabfbf, 0xbf000000, 0xd731e6e6, 0xe6000000 +.long 0x84c64242, 0x42000000, 0xd0b86868, 0x68000000 +.long 0x82c34141, 0x41000000, 0x29b09999, 0x99000000 +.long 0x5a772d2d, 0x2d000000, 0x1e110f0f, 0x0f000000 +.long 0x7bcbb0b0, 0xb0000000, 0xa8fc5454, 0x54000000 +.long 0x6dd6bbbb, 0xbb000000, 0x2c3a1616, 0x16000000 + +/* Decryption tables */ +.LtableD0: +.long 0x50a7f451 +.LtableDs0: +.long 0x00000052, 0x5365417e, 0x00000009 +.long 0xc3a4171a, 0x0000006a, 0x965e273a, 0x000000d5 +.long 0xcb6bab3b, 0x00000030, 0xf1459d1f, 0x00000036 +.long 0xab58faac, 0x000000a5, 0x9303e34b, 0x00000038 +.long 0x55fa3020, 0x000000bf, 0xf66d76ad, 0x00000040 +.long 0x9176cc88, 0x000000a3, 0x254c02f5, 0x0000009e +.long 0xfcd7e54f, 0x00000081, 0xd7cb2ac5, 0x000000f3 +.long 0x80443526, 0x000000d7, 0x8fa362b5, 0x000000fb +.long 0x495ab1de, 0x0000007c, 0x671bba25, 0x000000e3 +.long 0x980eea45, 0x00000039, 0xe1c0fe5d, 0x00000082 +.long 0x02752fc3, 0x0000009b, 0x12f04c81, 0x0000002f +.long 0xa397468d, 0x000000ff, 0xc6f9d36b, 0x00000087 +.long 0xe75f8f03, 0x00000034, 0x959c9215, 0x0000008e +.long 0xeb7a6dbf, 0x00000043, 0xda595295, 0x00000044 +.long 0x2d83bed4, 0x000000c4, 0xd3217458, 0x000000de +.long 0x2969e049, 0x000000e9, 0x44c8c98e, 0x000000cb +.long 0x6a89c275, 0x00000054, 0x78798ef4, 0x0000007b +.long 0x6b3e5899, 0x00000094, 0xdd71b927, 0x00000032 +.long 0xb64fe1be, 0x000000a6, 0x17ad88f0, 0x000000c2 +.long 0x66ac20c9, 0x00000023, 0xb43ace7d, 0x0000003d +.long 0x184adf63, 0x000000ee, 0x82311ae5, 0x0000004c +.long 0x60335197, 0x00000095, 0x457f5362, 0x0000000b +.long 0xe07764b1, 0x00000042, 0x84ae6bbb, 0x000000fa +.long 0x1ca081fe, 0x000000c3, 0x942b08f9, 0x0000004e +.long 0x58684870, 0x00000008, 0x19fd458f, 0x0000002e +.long 0x876cde94, 0x000000a1, 0xb7f87b52, 0x00000066 +.long 0x23d373ab, 0x00000028, 0xe2024b72, 0x000000d9 +.long 0x578f1fe3, 0x00000024, 0x2aab5566, 0x000000b2 +.long 0x0728ebb2, 0x00000076, 0x03c2b52f, 0x0000005b +.long 0x9a7bc586, 0x000000a2, 0xa50837d3, 0x00000049 +.long 0xf2872830, 0x0000006d, 0xb2a5bf23, 0x0000008b +.long 0xba6a0302, 0x000000d1, 0x5c8216ed, 0x00000025 +.long 0x2b1ccf8a, 0x00000072, 0x92b479a7, 0x000000f8 +.long 0xf0f207f3, 0x000000f6, 0xa1e2694e, 0x00000064 +.long 0xcdf4da65, 0x00000086, 0xd5be0506, 0x00000068 +.long 0x1f6234d1, 0x00000098, 0x8afea6c4, 0x00000016 +.long 0x9d532e34, 0x000000d4, 0xa055f3a2, 0x000000a4 +.long 0x32e18a05, 0x0000005c, 0x75ebf6a4, 0x000000cc +.long 0x39ec830b, 0x0000005d, 0xaaef6040, 0x00000065 +.long 0x069f715e, 0x000000b6, 0x51106ebd, 0x00000092 +.long 0xf98a213e, 0x0000006c, 0x3d06dd96, 0x00000070 +.long 0xae053edd, 0x00000048, 0x46bde64d, 0x00000050 +.long 0xb58d5491, 0x000000fd, 0x055dc471, 0x000000ed +.long 0x6fd40604, 0x000000b9, 0xff155060, 0x000000da +.long 0x24fb9819, 0x0000005e, 0x97e9bdd6, 0x00000015 +.long 0xcc434089, 0x00000046, 0x779ed967, 0x00000057 +.long 0xbd42e8b0, 0x000000a7, 0x888b8907, 0x0000008d +.long 0x385b19e7, 0x0000009d, 0xdbeec879, 0x00000084 +.long 0x470a7ca1, 0x00000090, 0xe90f427c, 0x000000d8 +.long 0xc91e84f8, 0x000000ab, 0x00000000, 0x00000000 +.long 0x83868009, 0x0000008c, 0x48ed2b32, 0x000000bc +.long 0xac70111e, 0x000000d3, 0x4e725a6c, 0x0000000a +.long 0xfbff0efd, 0x000000f7, 0x5638850f, 0x000000e4 +.long 0x1ed5ae3d, 
0x00000058, 0x27392d36, 0x00000005 +.long 0x64d90f0a, 0x000000b8, 0x21a65c68, 0x000000b3 +.long 0xd1545b9b, 0x00000045, 0x3a2e3624, 0x00000006 +.long 0xb1670a0c, 0x000000d0, 0x0fe75793, 0x0000002c +.long 0xd296eeb4, 0x0000001e, 0x9e919b1b, 0x0000008f +.long 0x4fc5c080, 0x000000ca, 0xa220dc61, 0x0000003f +.long 0x694b775a, 0x0000000f, 0x161a121c, 0x00000002 +.long 0x0aba93e2, 0x000000c1, 0xe52aa0c0, 0x000000af +.long 0x43e0223c, 0x000000bd, 0x1d171b12, 0x00000003 +.long 0x0b0d090e, 0x00000001, 0xadc78bf2, 0x00000013 +.long 0xb9a8b62d, 0x0000008a, 0xc8a91e14, 0x0000006b +.long 0x8519f157, 0x0000003a, 0x4c0775af, 0x00000091 +.long 0xbbdd99ee, 0x00000011, 0xfd607fa3, 0x00000041 +.long 0x9f2601f7, 0x0000004f, 0xbcf5725c, 0x00000067 +.long 0xc53b6644, 0x000000dc, 0x347efb5b, 0x000000ea +.long 0x7629438b, 0x00000097, 0xdcc623cb, 0x000000f2 +.long 0x68fcedb6, 0x000000cf, 0x63f1e4b8, 0x000000ce +.long 0xcadc31d7, 0x000000f0, 0x10856342, 0x000000b4 +.long 0x40229713, 0x000000e6, 0x2011c684, 0x00000073 +.long 0x7d244a85, 0x00000096, 0xf83dbbd2, 0x000000ac +.long 0x1132f9ae, 0x00000074, 0x6da129c7, 0x00000022 +.long 0x4b2f9e1d, 0x000000e7, 0xf330b2dc, 0x000000ad +.long 0xec52860d, 0x00000035, 0xd0e3c177, 0x00000085 +.long 0x6c16b32b, 0x000000e2, 0x99b970a9, 0x000000f9 +.long 0xfa489411, 0x00000037, 0x2264e947, 0x000000e8 +.long 0xc48cfca8, 0x0000001c, 0x1a3ff0a0, 0x00000075 +.long 0xd82c7d56, 0x000000df, 0xef903322, 0x0000006e +.long 0xc74e4987, 0x00000047, 0xc1d138d9, 0x000000f1 +.long 0xfea2ca8c, 0x0000001a, 0x360bd498, 0x00000071 +.long 0xcf81f5a6, 0x0000001d, 0x28de7aa5, 0x00000029 +.long 0x268eb7da, 0x000000c5, 0xa4bfad3f, 0x00000089 +.long 0xe49d3a2c, 0x0000006f, 0x0d927850, 0x000000b7 +.long 0x9bcc5f6a, 0x00000062, 0x62467e54, 0x0000000e +.long 0xc2138df6, 0x000000aa, 0xe8b8d890, 0x00000018 +.long 0x5ef7392e, 0x000000be, 0xf5afc382, 0x0000001b +.long 0xbe805d9f, 0x000000fc, 0x7c93d069, 0x00000056 +.long 0xa92dd56f, 0x0000003e, 0xb31225cf, 0x0000004b +.long 0x3b99acc8, 0x000000c6, 0xa77d1810, 0x000000d2 +.long 0x6e639ce8, 0x00000079, 0x7bbb3bdb, 0x00000020 +.long 0x097826cd, 0x0000009a, 0xf418596e, 0x000000db +.long 0x01b79aec, 0x000000c0, 0xa89a4f83, 0x000000fe +.long 0x656e95e6, 0x00000078, 0x7ee6ffaa, 0x000000cd +.long 0x08cfbc21, 0x0000005a, 0xe6e815ef, 0x000000f4 +.long 0xd99be7ba, 0x0000001f, 0xce366f4a, 0x000000dd +.long 0xd4099fea, 0x000000a8, 0xd67cb029, 0x00000033 +.long 0xafb2a431, 0x00000088, 0x31233f2a, 0x00000007 +.long 0x3094a5c6, 0x000000c7, 0xc066a235, 0x00000031 +.long 0x37bc4e74, 0x000000b1, 0xa6ca82fc, 0x00000012 +.long 0xb0d090e0, 0x00000010, 0x15d8a733, 0x00000059 +.long 0x4a9804f1, 0x00000027, 0xf7daec41, 0x00000080 +.long 0x0e50cd7f, 0x000000ec, 0x2ff69117, 0x0000005f +.long 0x8dd64d76, 0x00000060, 0x4db0ef43, 0x00000051 +.long 0x544daacc, 0x0000007f, 0xdf0496e4, 0x000000a9 +.long 0xe3b5d19e, 0x00000019, 0x1b886a4c, 0x000000b5 +.long 0xb81f2cc1, 0x0000004a, 0x7f516546, 0x0000000d +.long 0x04ea5e9d, 0x0000002d, 0x5d358c01, 0x000000e5 +.long 0x737487fa, 0x0000007a, 0x2e410bfb, 0x0000009f +.long 0x5a1d67b3, 0x00000093, 0x52d2db92, 0x000000c9 +.long 0x335610e9, 0x0000009c, 0x1347d66d, 0x000000ef +.long 0x8c61d79a, 0x000000a0, 0x7a0ca137, 0x000000e0 +.long 0x8e14f859, 0x0000003b, 0x893c13eb, 0x0000004d +.long 0xee27a9ce, 0x000000ae, 0x35c961b7, 0x0000002a +.long 0xede51ce1, 0x000000f5, 0x3cb1477a, 0x000000b0 +.long 0x59dfd29c, 0x000000c8, 0x3f73f255, 0x000000eb +.long 0x79ce1418, 0x000000bb, 0xbf37c773, 0x0000003c +.long 0xeacdf753, 0x00000083, 0x5baafd5f, 0x00000053 +.long 
0x146f3ddf, 0x00000099, 0x86db4478, 0x00000061 +.long 0x81f3afca, 0x00000017, 0x3ec468b9, 0x0000002b +.long 0x2c342438, 0x00000004, 0x5f40a3c2, 0x0000007e +.long 0x72c31d16, 0x000000ba, 0x0c25e2bc, 0x00000077 +.long 0x8b493c28, 0x000000d6, 0x41950dff, 0x00000026 +.long 0x7101a839, 0x000000e1, 0xdeb30c08, 0x00000069 +.long 0x9ce4b4d8, 0x00000014, 0x90c15664, 0x00000063 +.long 0x6184cb7b, 0x00000055, 0x70b632d5, 0x00000021 +.long 0x745c6c48, 0x0000000c, 0x4257b8d0, 0x0000007d +.LtableD1: +.long 0xa7f45150 +.LtableDs1: +.long 0x00005200, 0x65417e53, 0x00000900 +.long 0xa4171ac3, 0x00006a00, 0x5e273a96, 0x0000d500 +.long 0x6bab3bcb, 0x00003000, 0x459d1ff1, 0x00003600 +.long 0x58faacab, 0x0000a500, 0x03e34b93, 0x00003800 +.long 0xfa302055, 0x0000bf00, 0x6d76adf6, 0x00004000 +.long 0x76cc8891, 0x0000a300, 0x4c02f525, 0x00009e00 +.long 0xd7e54ffc, 0x00008100, 0xcb2ac5d7, 0x0000f300 +.long 0x44352680, 0x0000d700, 0xa362b58f, 0x0000fb00 +.long 0x5ab1de49, 0x00007c00, 0x1bba2567, 0x0000e300 +.long 0x0eea4598, 0x00003900, 0xc0fe5de1, 0x00008200 +.long 0x752fc302, 0x00009b00, 0xf04c8112, 0x00002f00 +.long 0x97468da3, 0x0000ff00, 0xf9d36bc6, 0x00008700 +.long 0x5f8f03e7, 0x00003400, 0x9c921595, 0x00008e00 +.long 0x7a6dbfeb, 0x00004300, 0x595295da, 0x00004400 +.long 0x83bed42d, 0x0000c400, 0x217458d3, 0x0000de00 +.long 0x69e04929, 0x0000e900, 0xc8c98e44, 0x0000cb00 +.long 0x89c2756a, 0x00005400, 0x798ef478, 0x00007b00 +.long 0x3e58996b, 0x00009400, 0x71b927dd, 0x00003200 +.long 0x4fe1beb6, 0x0000a600, 0xad88f017, 0x0000c200 +.long 0xac20c966, 0x00002300, 0x3ace7db4, 0x00003d00 +.long 0x4adf6318, 0x0000ee00, 0x311ae582, 0x00004c00 +.long 0x33519760, 0x00009500, 0x7f536245, 0x00000b00 +.long 0x7764b1e0, 0x00004200, 0xae6bbb84, 0x0000fa00 +.long 0xa081fe1c, 0x0000c300, 0x2b08f994, 0x00004e00 +.long 0x68487058, 0x00000800, 0xfd458f19, 0x00002e00 +.long 0x6cde9487, 0x0000a100, 0xf87b52b7, 0x00006600 +.long 0xd373ab23, 0x00002800, 0x024b72e2, 0x0000d900 +.long 0x8f1fe357, 0x00002400, 0xab55662a, 0x0000b200 +.long 0x28ebb207, 0x00007600, 0xc2b52f03, 0x00005b00 +.long 0x7bc5869a, 0x0000a200, 0x0837d3a5, 0x00004900 +.long 0x872830f2, 0x00006d00, 0xa5bf23b2, 0x00008b00 +.long 0x6a0302ba, 0x0000d100, 0x8216ed5c, 0x00002500 +.long 0x1ccf8a2b, 0x00007200, 0xb479a792, 0x0000f800 +.long 0xf207f3f0, 0x0000f600, 0xe2694ea1, 0x00006400 +.long 0xf4da65cd, 0x00008600, 0xbe0506d5, 0x00006800 +.long 0x6234d11f, 0x00009800, 0xfea6c48a, 0x00001600 +.long 0x532e349d, 0x0000d400, 0x55f3a2a0, 0x0000a400 +.long 0xe18a0532, 0x00005c00, 0xebf6a475, 0x0000cc00 +.long 0xec830b39, 0x00005d00, 0xef6040aa, 0x00006500 +.long 0x9f715e06, 0x0000b600, 0x106ebd51, 0x00009200 +.long 0x8a213ef9, 0x00006c00, 0x06dd963d, 0x00007000 +.long 0x053eddae, 0x00004800, 0xbde64d46, 0x00005000 +.long 0x8d5491b5, 0x0000fd00, 0x5dc47105, 0x0000ed00 +.long 0xd406046f, 0x0000b900, 0x155060ff, 0x0000da00 +.long 0xfb981924, 0x00005e00, 0xe9bdd697, 0x00001500 +.long 0x434089cc, 0x00004600, 0x9ed96777, 0x00005700 +.long 0x42e8b0bd, 0x0000a700, 0x8b890788, 0x00008d00 +.long 0x5b19e738, 0x00009d00, 0xeec879db, 0x00008400 +.long 0x0a7ca147, 0x00009000, 0x0f427ce9, 0x0000d800 +.long 0x1e84f8c9, 0x0000ab00, 0x00000000, 0x00000000 +.long 0x86800983, 0x00008c00, 0xed2b3248, 0x0000bc00 +.long 0x70111eac, 0x0000d300, 0x725a6c4e, 0x00000a00 +.long 0xff0efdfb, 0x0000f700, 0x38850f56, 0x0000e400 +.long 0xd5ae3d1e, 0x00005800, 0x392d3627, 0x00000500 +.long 0xd90f0a64, 0x0000b800, 0xa65c6821, 0x0000b300 +.long 0x545b9bd1, 0x00004500, 0x2e36243a, 0x00000600 +.long 0x670a0cb1, 
0x0000d000, 0xe757930f, 0x00002c00 +.long 0x96eeb4d2, 0x00001e00, 0x919b1b9e, 0x00008f00 +.long 0xc5c0804f, 0x0000ca00, 0x20dc61a2, 0x00003f00 +.long 0x4b775a69, 0x00000f00, 0x1a121c16, 0x00000200 +.long 0xba93e20a, 0x0000c100, 0x2aa0c0e5, 0x0000af00 +.long 0xe0223c43, 0x0000bd00, 0x171b121d, 0x00000300 +.long 0x0d090e0b, 0x00000100, 0xc78bf2ad, 0x00001300 +.long 0xa8b62db9, 0x00008a00, 0xa91e14c8, 0x00006b00 +.long 0x19f15785, 0x00003a00, 0x0775af4c, 0x00009100 +.long 0xdd99eebb, 0x00001100, 0x607fa3fd, 0x00004100 +.long 0x2601f79f, 0x00004f00, 0xf5725cbc, 0x00006700 +.long 0x3b6644c5, 0x0000dc00, 0x7efb5b34, 0x0000ea00 +.long 0x29438b76, 0x00009700, 0xc623cbdc, 0x0000f200 +.long 0xfcedb668, 0x0000cf00, 0xf1e4b863, 0x0000ce00 +.long 0xdc31d7ca, 0x0000f000, 0x85634210, 0x0000b400 +.long 0x22971340, 0x0000e600, 0x11c68420, 0x00007300 +.long 0x244a857d, 0x00009600, 0x3dbbd2f8, 0x0000ac00 +.long 0x32f9ae11, 0x00007400, 0xa129c76d, 0x00002200 +.long 0x2f9e1d4b, 0x0000e700, 0x30b2dcf3, 0x0000ad00 +.long 0x52860dec, 0x00003500, 0xe3c177d0, 0x00008500 +.long 0x16b32b6c, 0x0000e200, 0xb970a999, 0x0000f900 +.long 0x489411fa, 0x00003700, 0x64e94722, 0x0000e800 +.long 0x8cfca8c4, 0x00001c00, 0x3ff0a01a, 0x00007500 +.long 0x2c7d56d8, 0x0000df00, 0x903322ef, 0x00006e00 +.long 0x4e4987c7, 0x00004700, 0xd138d9c1, 0x0000f100 +.long 0xa2ca8cfe, 0x00001a00, 0x0bd49836, 0x00007100 +.long 0x81f5a6cf, 0x00001d00, 0xde7aa528, 0x00002900 +.long 0x8eb7da26, 0x0000c500, 0xbfad3fa4, 0x00008900 +.long 0x9d3a2ce4, 0x00006f00, 0x9278500d, 0x0000b700 +.long 0xcc5f6a9b, 0x00006200, 0x467e5462, 0x00000e00 +.long 0x138df6c2, 0x0000aa00, 0xb8d890e8, 0x00001800 +.long 0xf7392e5e, 0x0000be00, 0xafc382f5, 0x00001b00 +.long 0x805d9fbe, 0x0000fc00, 0x93d0697c, 0x00005600 +.long 0x2dd56fa9, 0x00003e00, 0x1225cfb3, 0x00004b00 +.long 0x99acc83b, 0x0000c600, 0x7d1810a7, 0x0000d200 +.long 0x639ce86e, 0x00007900, 0xbb3bdb7b, 0x00002000 +.long 0x7826cd09, 0x00009a00, 0x18596ef4, 0x0000db00 +.long 0xb79aec01, 0x0000c000, 0x9a4f83a8, 0x0000fe00 +.long 0x6e95e665, 0x00007800, 0xe6ffaa7e, 0x0000cd00 +.long 0xcfbc2108, 0x00005a00, 0xe815efe6, 0x0000f400 +.long 0x9be7bad9, 0x00001f00, 0x366f4ace, 0x0000dd00 +.long 0x099fead4, 0x0000a800, 0x7cb029d6, 0x00003300 +.long 0xb2a431af, 0x00008800, 0x233f2a31, 0x00000700 +.long 0x94a5c630, 0x0000c700, 0x66a235c0, 0x00003100 +.long 0xbc4e7437, 0x0000b100, 0xca82fca6, 0x00001200 +.long 0xd090e0b0, 0x00001000, 0xd8a73315, 0x00005900 +.long 0x9804f14a, 0x00002700, 0xdaec41f7, 0x00008000 +.long 0x50cd7f0e, 0x0000ec00, 0xf691172f, 0x00005f00 +.long 0xd64d768d, 0x00006000, 0xb0ef434d, 0x00005100 +.long 0x4daacc54, 0x00007f00, 0x0496e4df, 0x0000a900 +.long 0xb5d19ee3, 0x00001900, 0x886a4c1b, 0x0000b500 +.long 0x1f2cc1b8, 0x00004a00, 0x5165467f, 0x00000d00 +.long 0xea5e9d04, 0x00002d00, 0x358c015d, 0x0000e500 +.long 0x7487fa73, 0x00007a00, 0x410bfb2e, 0x00009f00 +.long 0x1d67b35a, 0x00009300, 0xd2db9252, 0x0000c900 +.long 0x5610e933, 0x00009c00, 0x47d66d13, 0x0000ef00 +.long 0x61d79a8c, 0x0000a000, 0x0ca1377a, 0x0000e000 +.long 0x14f8598e, 0x00003b00, 0x3c13eb89, 0x00004d00 +.long 0x27a9ceee, 0x0000ae00, 0xc961b735, 0x00002a00 +.long 0xe51ce1ed, 0x0000f500, 0xb1477a3c, 0x0000b000 +.long 0xdfd29c59, 0x0000c800, 0x73f2553f, 0x0000eb00 +.long 0xce141879, 0x0000bb00, 0x37c773bf, 0x00003c00 +.long 0xcdf753ea, 0x00008300, 0xaafd5f5b, 0x00005300 +.long 0x6f3ddf14, 0x00009900, 0xdb447886, 0x00006100 +.long 0xf3afca81, 0x00001700, 0xc468b93e, 0x00002b00 +.long 0x3424382c, 0x00000400, 0x40a3c25f, 0x00007e00 +.long 
0xc31d1672, 0x0000ba00, 0x25e2bc0c, 0x00007700 +.long 0x493c288b, 0x0000d600, 0x950dff41, 0x00002600 +.long 0x01a83971, 0x0000e100, 0xb30c08de, 0x00006900 +.long 0xe4b4d89c, 0x00001400, 0xc1566490, 0x00006300 +.long 0x84cb7b61, 0x00005500, 0xb632d570, 0x00002100 +.long 0x5c6c4874, 0x00000c00, 0x57b8d042, 0x00007d00 +.LtableD2: +.long 0xf45150a7 +.LtableDs2: +.long 0x00520000, 0x417e5365, 0x00090000 +.long 0x171ac3a4, 0x006a0000, 0x273a965e, 0x00d50000 +.long 0xab3bcb6b, 0x00300000, 0x9d1ff145, 0x00360000 +.long 0xfaacab58, 0x00a50000, 0xe34b9303, 0x00380000 +.long 0x302055fa, 0x00bf0000, 0x76adf66d, 0x00400000 +.long 0xcc889176, 0x00a30000, 0x02f5254c, 0x009e0000 +.long 0xe54ffcd7, 0x00810000, 0x2ac5d7cb, 0x00f30000 +.long 0x35268044, 0x00d70000, 0x62b58fa3, 0x00fb0000 +.long 0xb1de495a, 0x007c0000, 0xba25671b, 0x00e30000 +.long 0xea45980e, 0x00390000, 0xfe5de1c0, 0x00820000 +.long 0x2fc30275, 0x009b0000, 0x4c8112f0, 0x002f0000 +.long 0x468da397, 0x00ff0000, 0xd36bc6f9, 0x00870000 +.long 0x8f03e75f, 0x00340000, 0x9215959c, 0x008e0000 +.long 0x6dbfeb7a, 0x00430000, 0x5295da59, 0x00440000 +.long 0xbed42d83, 0x00c40000, 0x7458d321, 0x00de0000 +.long 0xe0492969, 0x00e90000, 0xc98e44c8, 0x00cb0000 +.long 0xc2756a89, 0x00540000, 0x8ef47879, 0x007b0000 +.long 0x58996b3e, 0x00940000, 0xb927dd71, 0x00320000 +.long 0xe1beb64f, 0x00a60000, 0x88f017ad, 0x00c20000 +.long 0x20c966ac, 0x00230000, 0xce7db43a, 0x003d0000 +.long 0xdf63184a, 0x00ee0000, 0x1ae58231, 0x004c0000 +.long 0x51976033, 0x00950000, 0x5362457f, 0x000b0000 +.long 0x64b1e077, 0x00420000, 0x6bbb84ae, 0x00fa0000 +.long 0x81fe1ca0, 0x00c30000, 0x08f9942b, 0x004e0000 +.long 0x48705868, 0x00080000, 0x458f19fd, 0x002e0000 +.long 0xde94876c, 0x00a10000, 0x7b52b7f8, 0x00660000 +.long 0x73ab23d3, 0x00280000, 0x4b72e202, 0x00d90000 +.long 0x1fe3578f, 0x00240000, 0x55662aab, 0x00b20000 +.long 0xebb20728, 0x00760000, 0xb52f03c2, 0x005b0000 +.long 0xc5869a7b, 0x00a20000, 0x37d3a508, 0x00490000 +.long 0x2830f287, 0x006d0000, 0xbf23b2a5, 0x008b0000 +.long 0x0302ba6a, 0x00d10000, 0x16ed5c82, 0x00250000 +.long 0xcf8a2b1c, 0x00720000, 0x79a792b4, 0x00f80000 +.long 0x07f3f0f2, 0x00f60000, 0x694ea1e2, 0x00640000 +.long 0xda65cdf4, 0x00860000, 0x0506d5be, 0x00680000 +.long 0x34d11f62, 0x00980000, 0xa6c48afe, 0x00160000 +.long 0x2e349d53, 0x00d40000, 0xf3a2a055, 0x00a40000 +.long 0x8a0532e1, 0x005c0000, 0xf6a475eb, 0x00cc0000 +.long 0x830b39ec, 0x005d0000, 0x6040aaef, 0x00650000 +.long 0x715e069f, 0x00b60000, 0x6ebd5110, 0x00920000 +.long 0x213ef98a, 0x006c0000, 0xdd963d06, 0x00700000 +.long 0x3eddae05, 0x00480000, 0xe64d46bd, 0x00500000 +.long 0x5491b58d, 0x00fd0000, 0xc471055d, 0x00ed0000 +.long 0x06046fd4, 0x00b90000, 0x5060ff15, 0x00da0000 +.long 0x981924fb, 0x005e0000, 0xbdd697e9, 0x00150000 +.long 0x4089cc43, 0x00460000, 0xd967779e, 0x00570000 +.long 0xe8b0bd42, 0x00a70000, 0x8907888b, 0x008d0000 +.long 0x19e7385b, 0x009d0000, 0xc879dbee, 0x00840000 +.long 0x7ca1470a, 0x00900000, 0x427ce90f, 0x00d80000 +.long 0x84f8c91e, 0x00ab0000, 0x00000000, 0x00000000 +.long 0x80098386, 0x008c0000, 0x2b3248ed, 0x00bc0000 +.long 0x111eac70, 0x00d30000, 0x5a6c4e72, 0x000a0000 +.long 0x0efdfbff, 0x00f70000, 0x850f5638, 0x00e40000 +.long 0xae3d1ed5, 0x00580000, 0x2d362739, 0x00050000 +.long 0x0f0a64d9, 0x00b80000, 0x5c6821a6, 0x00b30000 +.long 0x5b9bd154, 0x00450000, 0x36243a2e, 0x00060000 +.long 0x0a0cb167, 0x00d00000, 0x57930fe7, 0x002c0000 +.long 0xeeb4d296, 0x001e0000, 0x9b1b9e91, 0x008f0000 +.long 0xc0804fc5, 0x00ca0000, 0xdc61a220, 0x003f0000 +.long 0x775a694b, 
0x000f0000, 0x121c161a, 0x00020000 +.long 0x93e20aba, 0x00c10000, 0xa0c0e52a, 0x00af0000 +.long 0x223c43e0, 0x00bd0000, 0x1b121d17, 0x00030000 +.long 0x090e0b0d, 0x00010000, 0x8bf2adc7, 0x00130000 +.long 0xb62db9a8, 0x008a0000, 0x1e14c8a9, 0x006b0000 +.long 0xf1578519, 0x003a0000, 0x75af4c07, 0x00910000 +.long 0x99eebbdd, 0x00110000, 0x7fa3fd60, 0x00410000 +.long 0x01f79f26, 0x004f0000, 0x725cbcf5, 0x00670000 +.long 0x6644c53b, 0x00dc0000, 0xfb5b347e, 0x00ea0000 +.long 0x438b7629, 0x00970000, 0x23cbdcc6, 0x00f20000 +.long 0xedb668fc, 0x00cf0000, 0xe4b863f1, 0x00ce0000 +.long 0x31d7cadc, 0x00f00000, 0x63421085, 0x00b40000 +.long 0x97134022, 0x00e60000, 0xc6842011, 0x00730000 +.long 0x4a857d24, 0x00960000, 0xbbd2f83d, 0x00ac0000 +.long 0xf9ae1132, 0x00740000, 0x29c76da1, 0x00220000 +.long 0x9e1d4b2f, 0x00e70000, 0xb2dcf330, 0x00ad0000 +.long 0x860dec52, 0x00350000, 0xc177d0e3, 0x00850000 +.long 0xb32b6c16, 0x00e20000, 0x70a999b9, 0x00f90000 +.long 0x9411fa48, 0x00370000, 0xe9472264, 0x00e80000 +.long 0xfca8c48c, 0x001c0000, 0xf0a01a3f, 0x00750000 +.long 0x7d56d82c, 0x00df0000, 0x3322ef90, 0x006e0000 +.long 0x4987c74e, 0x00470000, 0x38d9c1d1, 0x00f10000 +.long 0xca8cfea2, 0x001a0000, 0xd498360b, 0x00710000 +.long 0xf5a6cf81, 0x001d0000, 0x7aa528de, 0x00290000 +.long 0xb7da268e, 0x00c50000, 0xad3fa4bf, 0x00890000 +.long 0x3a2ce49d, 0x006f0000, 0x78500d92, 0x00b70000 +.long 0x5f6a9bcc, 0x00620000, 0x7e546246, 0x000e0000 +.long 0x8df6c213, 0x00aa0000, 0xd890e8b8, 0x00180000 +.long 0x392e5ef7, 0x00be0000, 0xc382f5af, 0x001b0000 +.long 0x5d9fbe80, 0x00fc0000, 0xd0697c93, 0x00560000 +.long 0xd56fa92d, 0x003e0000, 0x25cfb312, 0x004b0000 +.long 0xacc83b99, 0x00c60000, 0x1810a77d, 0x00d20000 +.long 0x9ce86e63, 0x00790000, 0x3bdb7bbb, 0x00200000 +.long 0x26cd0978, 0x009a0000, 0x596ef418, 0x00db0000 +.long 0x9aec01b7, 0x00c00000, 0x4f83a89a, 0x00fe0000 +.long 0x95e6656e, 0x00780000, 0xffaa7ee6, 0x00cd0000 +.long 0xbc2108cf, 0x005a0000, 0x15efe6e8, 0x00f40000 +.long 0xe7bad99b, 0x001f0000, 0x6f4ace36, 0x00dd0000 +.long 0x9fead409, 0x00a80000, 0xb029d67c, 0x00330000 +.long 0xa431afb2, 0x00880000, 0x3f2a3123, 0x00070000 +.long 0xa5c63094, 0x00c70000, 0xa235c066, 0x00310000 +.long 0x4e7437bc, 0x00b10000, 0x82fca6ca, 0x00120000 +.long 0x90e0b0d0, 0x00100000, 0xa73315d8, 0x00590000 +.long 0x04f14a98, 0x00270000, 0xec41f7da, 0x00800000 +.long 0xcd7f0e50, 0x00ec0000, 0x91172ff6, 0x005f0000 +.long 0x4d768dd6, 0x00600000, 0xef434db0, 0x00510000 +.long 0xaacc544d, 0x007f0000, 0x96e4df04, 0x00a90000 +.long 0xd19ee3b5, 0x00190000, 0x6a4c1b88, 0x00b50000 +.long 0x2cc1b81f, 0x004a0000, 0x65467f51, 0x000d0000 +.long 0x5e9d04ea, 0x002d0000, 0x8c015d35, 0x00e50000 +.long 0x87fa7374, 0x007a0000, 0x0bfb2e41, 0x009f0000 +.long 0x67b35a1d, 0x00930000, 0xdb9252d2, 0x00c90000 +.long 0x10e93356, 0x009c0000, 0xd66d1347, 0x00ef0000 +.long 0xd79a8c61, 0x00a00000, 0xa1377a0c, 0x00e00000 +.long 0xf8598e14, 0x003b0000, 0x13eb893c, 0x004d0000 +.long 0xa9ceee27, 0x00ae0000, 0x61b735c9, 0x002a0000 +.long 0x1ce1ede5, 0x00f50000, 0x477a3cb1, 0x00b00000 +.long 0xd29c59df, 0x00c80000, 0xf2553f73, 0x00eb0000 +.long 0x141879ce, 0x00bb0000, 0xc773bf37, 0x003c0000 +.long 0xf753eacd, 0x00830000, 0xfd5f5baa, 0x00530000 +.long 0x3ddf146f, 0x00990000, 0x447886db, 0x00610000 +.long 0xafca81f3, 0x00170000, 0x68b93ec4, 0x002b0000 +.long 0x24382c34, 0x00040000, 0xa3c25f40, 0x007e0000 +.long 0x1d1672c3, 0x00ba0000, 0xe2bc0c25, 0x00770000 +.long 0x3c288b49, 0x00d60000, 0x0dff4195, 0x00260000 +.long 0xa8397101, 0x00e10000, 0x0c08deb3, 0x00690000 +.long 
0xb4d89ce4, 0x00140000, 0x566490c1, 0x00630000 +.long 0xcb7b6184, 0x00550000, 0x32d570b6, 0x00210000 +.long 0x6c48745c, 0x000c0000, 0xb8d04257, 0x007d0000 +.LtableD3: +.long 0x5150a7f4 +.LtableDs3: +.long 0x52000000, 0x7e536541, 0x09000000 +.long 0x1ac3a417, 0x6a000000, 0x3a965e27, 0xd5000000 +.long 0x3bcb6bab, 0x30000000, 0x1ff1459d, 0x36000000 +.long 0xacab58fa, 0xa5000000, 0x4b9303e3, 0x38000000 +.long 0x2055fa30, 0xbf000000, 0xadf66d76, 0x40000000 +.long 0x889176cc, 0xa3000000, 0xf5254c02, 0x9e000000 +.long 0x4ffcd7e5, 0x81000000, 0xc5d7cb2a, 0xf3000000 +.long 0x26804435, 0xd7000000, 0xb58fa362, 0xfb000000 +.long 0xde495ab1, 0x7c000000, 0x25671bba, 0xe3000000 +.long 0x45980eea, 0x39000000, 0x5de1c0fe, 0x82000000 +.long 0xc302752f, 0x9b000000, 0x8112f04c, 0x2f000000 +.long 0x8da39746, 0xff000000, 0x6bc6f9d3, 0x87000000 +.long 0x03e75f8f, 0x34000000, 0x15959c92, 0x8e000000 +.long 0xbfeb7a6d, 0x43000000, 0x95da5952, 0x44000000 +.long 0xd42d83be, 0xc4000000, 0x58d32174, 0xde000000 +.long 0x492969e0, 0xe9000000, 0x8e44c8c9, 0xcb000000 +.long 0x756a89c2, 0x54000000, 0xf478798e, 0x7b000000 +.long 0x996b3e58, 0x94000000, 0x27dd71b9, 0x32000000 +.long 0xbeb64fe1, 0xa6000000, 0xf017ad88, 0xc2000000 +.long 0xc966ac20, 0x23000000, 0x7db43ace, 0x3d000000 +.long 0x63184adf, 0xee000000, 0xe582311a, 0x4c000000 +.long 0x97603351, 0x95000000, 0x62457f53, 0x0b000000 +.long 0xb1e07764, 0x42000000, 0xbb84ae6b, 0xfa000000 +.long 0xfe1ca081, 0xc3000000, 0xf9942b08, 0x4e000000 +.long 0x70586848, 0x08000000, 0x8f19fd45, 0x2e000000 +.long 0x94876cde, 0xa1000000, 0x52b7f87b, 0x66000000 +.long 0xab23d373, 0x28000000, 0x72e2024b, 0xd9000000 +.long 0xe3578f1f, 0x24000000, 0x662aab55, 0xb2000000 +.long 0xb20728eb, 0x76000000, 0x2f03c2b5, 0x5b000000 +.long 0x869a7bc5, 0xa2000000, 0xd3a50837, 0x49000000 +.long 0x30f28728, 0x6d000000, 0x23b2a5bf, 0x8b000000 +.long 0x02ba6a03, 0xd1000000, 0xed5c8216, 0x25000000 +.long 0x8a2b1ccf, 0x72000000, 0xa792b479, 0xf8000000 +.long 0xf3f0f207, 0xf6000000, 0x4ea1e269, 0x64000000 +.long 0x65cdf4da, 0x86000000, 0x06d5be05, 0x68000000 +.long 0xd11f6234, 0x98000000, 0xc48afea6, 0x16000000 +.long 0x349d532e, 0xd4000000, 0xa2a055f3, 0xa4000000 +.long 0x0532e18a, 0x5c000000, 0xa475ebf6, 0xcc000000 +.long 0x0b39ec83, 0x5d000000, 0x40aaef60, 0x65000000 +.long 0x5e069f71, 0xb6000000, 0xbd51106e, 0x92000000 +.long 0x3ef98a21, 0x6c000000, 0x963d06dd, 0x70000000 +.long 0xddae053e, 0x48000000, 0x4d46bde6, 0x50000000 +.long 0x91b58d54, 0xfd000000, 0x71055dc4, 0xed000000 +.long 0x046fd406, 0xb9000000, 0x60ff1550, 0xda000000 +.long 0x1924fb98, 0x5e000000, 0xd697e9bd, 0x15000000 +.long 0x89cc4340, 0x46000000, 0x67779ed9, 0x57000000 +.long 0xb0bd42e8, 0xa7000000, 0x07888b89, 0x8d000000 +.long 0xe7385b19, 0x9d000000, 0x79dbeec8, 0x84000000 +.long 0xa1470a7c, 0x90000000, 0x7ce90f42, 0xd8000000 +.long 0xf8c91e84, 0xab000000, 0x00000000, 0x00000000 +.long 0x09838680, 0x8c000000, 0x3248ed2b, 0xbc000000 +.long 0x1eac7011, 0xd3000000, 0x6c4e725a, 0x0a000000 +.long 0xfdfbff0e, 0xf7000000, 0x0f563885, 0xe4000000 +.long 0x3d1ed5ae, 0x58000000, 0x3627392d, 0x05000000 +.long 0x0a64d90f, 0xb8000000, 0x6821a65c, 0xb3000000 +.long 0x9bd1545b, 0x45000000, 0x243a2e36, 0x06000000 +.long 0x0cb1670a, 0xd0000000, 0x930fe757, 0x2c000000 +.long 0xb4d296ee, 0x1e000000, 0x1b9e919b, 0x8f000000 +.long 0x804fc5c0, 0xca000000, 0x61a220dc, 0x3f000000 +.long 0x5a694b77, 0x0f000000, 0x1c161a12, 0x02000000 +.long 0xe20aba93, 0xc1000000, 0xc0e52aa0, 0xaf000000 +.long 0x3c43e022, 0xbd000000, 0x121d171b, 0x03000000 +.long 0x0e0b0d09, 
0x01000000, 0xf2adc78b, 0x13000000 +.long 0x2db9a8b6, 0x8a000000, 0x14c8a91e, 0x6b000000 +.long 0x578519f1, 0x3a000000, 0xaf4c0775, 0x91000000 +.long 0xeebbdd99, 0x11000000, 0xa3fd607f, 0x41000000 +.long 0xf79f2601, 0x4f000000, 0x5cbcf572, 0x67000000 +.long 0x44c53b66, 0xdc000000, 0x5b347efb, 0xea000000 +.long 0x8b762943, 0x97000000, 0xcbdcc623, 0xf2000000 +.long 0xb668fced, 0xcf000000, 0xb863f1e4, 0xce000000 +.long 0xd7cadc31, 0xf0000000, 0x42108563, 0xb4000000 +.long 0x13402297, 0xe6000000, 0x842011c6, 0x73000000 +.long 0x857d244a, 0x96000000, 0xd2f83dbb, 0xac000000 +.long 0xae1132f9, 0x74000000, 0xc76da129, 0x22000000 +.long 0x1d4b2f9e, 0xe7000000, 0xdcf330b2, 0xad000000 +.long 0x0dec5286, 0x35000000, 0x77d0e3c1, 0x85000000 +.long 0x2b6c16b3, 0xe2000000, 0xa999b970, 0xf9000000 +.long 0x11fa4894, 0x37000000, 0x472264e9, 0xe8000000 +.long 0xa8c48cfc, 0x1c000000, 0xa01a3ff0, 0x75000000 +.long 0x56d82c7d, 0xdf000000, 0x22ef9033, 0x6e000000 +.long 0x87c74e49, 0x47000000, 0xd9c1d138, 0xf1000000 +.long 0x8cfea2ca, 0x1a000000, 0x98360bd4, 0x71000000 +.long 0xa6cf81f5, 0x1d000000, 0xa528de7a, 0x29000000 +.long 0xda268eb7, 0xc5000000, 0x3fa4bfad, 0x89000000 +.long 0x2ce49d3a, 0x6f000000, 0x500d9278, 0xb7000000 +.long 0x6a9bcc5f, 0x62000000, 0x5462467e, 0x0e000000 +.long 0xf6c2138d, 0xaa000000, 0x90e8b8d8, 0x18000000 +.long 0x2e5ef739, 0xbe000000, 0x82f5afc3, 0x1b000000 +.long 0x9fbe805d, 0xfc000000, 0x697c93d0, 0x56000000 +.long 0x6fa92dd5, 0x3e000000, 0xcfb31225, 0x4b000000 +.long 0xc83b99ac, 0xc6000000, 0x10a77d18, 0xd2000000 +.long 0xe86e639c, 0x79000000, 0xdb7bbb3b, 0x20000000 +.long 0xcd097826, 0x9a000000, 0x6ef41859, 0xdb000000 +.long 0xec01b79a, 0xc0000000, 0x83a89a4f, 0xfe000000 +.long 0xe6656e95, 0x78000000, 0xaa7ee6ff, 0xcd000000 +.long 0x2108cfbc, 0x5a000000, 0xefe6e815, 0xf4000000 +.long 0xbad99be7, 0x1f000000, 0x4ace366f, 0xdd000000 +.long 0xead4099f, 0xa8000000, 0x29d67cb0, 0x33000000 +.long 0x31afb2a4, 0x88000000, 0x2a31233f, 0x07000000 +.long 0xc63094a5, 0xc7000000, 0x35c066a2, 0x31000000 +.long 0x7437bc4e, 0xb1000000, 0xfca6ca82, 0x12000000 +.long 0xe0b0d090, 0x10000000, 0x3315d8a7, 0x59000000 +.long 0xf14a9804, 0x27000000, 0x41f7daec, 0x80000000 +.long 0x7f0e50cd, 0xec000000, 0x172ff691, 0x5f000000 +.long 0x768dd64d, 0x60000000, 0x434db0ef, 0x51000000 +.long 0xcc544daa, 0x7f000000, 0xe4df0496, 0xa9000000 +.long 0x9ee3b5d1, 0x19000000, 0x4c1b886a, 0xb5000000 +.long 0xc1b81f2c, 0x4a000000, 0x467f5165, 0x0d000000 +.long 0x9d04ea5e, 0x2d000000, 0x015d358c, 0xe5000000 +.long 0xfa737487, 0x7a000000, 0xfb2e410b, 0x9f000000 +.long 0xb35a1d67, 0x93000000, 0x9252d2db, 0xc9000000 +.long 0xe9335610, 0x9c000000, 0x6d1347d6, 0xef000000 +.long 0x9a8c61d7, 0xa0000000, 0x377a0ca1, 0xe0000000 +.long 0x598e14f8, 0x3b000000, 0xeb893c13, 0x4d000000 +.long 0xceee27a9, 0xae000000, 0xb735c961, 0x2a000000 +.long 0xe1ede51c, 0xf5000000, 0x7a3cb147, 0xb0000000 +.long 0x9c59dfd2, 0xc8000000, 0x553f73f2, 0xeb000000 +.long 0x1879ce14, 0xbb000000, 0x73bf37c7, 0x3c000000 +.long 0x53eacdf7, 0x83000000, 0x5f5baafd, 0x53000000 +.long 0xdf146f3d, 0x99000000, 0x7886db44, 0x61000000 +.long 0xca81f3af, 0x17000000, 0xb93ec468, 0x2b000000 +.long 0x382c3424, 0x04000000, 0xc25f40a3, 0x7e000000 +.long 0x1672c31d, 0xba000000, 0xbc0c25e2, 0x77000000 +.long 0x288b493c, 0xd6000000, 0xff41950d, 0x26000000 +.long 0x397101a8, 0xe1000000, 0x08deb30c, 0x69000000 +.long 0xd89ce4b4, 0x14000000, 0x6490c156, 0x63000000 +.long 0x7b6184cb, 0x55000000, 0xd570b632, 0x21000000 +.long 0x48745c6c, 0x0c000000, 0xd04257b8, 0x7d000000 + +#endif 
/*USE_AES*/ +#endif /*__x86_64*/ diff --git a/cipher/rijndael.c b/cipher/rijndael.c index ac2fcfb..4a89e32 100644 --- a/cipher/rijndael.c +++ b/cipher/rijndael.c @@ -61,6 +61,12 @@ #endif +/* USE_AMD64_ASM indicates whether to use AMD64 assembly code. */ +#undef USE_AMD64_ASM +#if defined(__x86_64__) +# define USE_AMD64_ASM 1 +#endif + /* USE_PADLOCK indicates whether to compile the padlock specific code. */ #undef USE_PADLOCK @@ -96,6 +102,20 @@ typedef u32 u32_a_t; #endif +#ifdef USE_AMD64_ASM +/* AMD64 assembly implementations of AES */ +extern void _gcry_aes_amd64_encrypt_block(const void *keysched_enc, + unsigned char *out, + const unsigned char *in, + int rounds); + +extern void _gcry_aes_amd64_decrypt_block(const void *keysched_dec, + unsigned char *out, + const unsigned char *in, + int rounds); +#endif /*USE_AMD64_ASM*/ + + /* Our context object. */ typedef struct @@ -524,6 +544,9 @@ static void do_encrypt_aligned (const RIJNDAEL_context *ctx, unsigned char *b, const unsigned char *a) { +#ifdef USE_AMD64_ASM + _gcry_aes_amd64_encrypt_block(ctx->keyschenc, b, a, ctx->rounds); +#else /*!USE_AMD64_ASM*/ #define rk (ctx->keyschenc) int rounds = ctx->rounds; int r; @@ -605,6 +628,7 @@ do_encrypt_aligned (const RIJNDAEL_context *ctx, *((u32_a_t*)(b+ 8)) ^= *((u32_a_t*)rk[rounds][2]); *((u32_a_t*)(b+12)) ^= *((u32_a_t*)rk[rounds][3]); #undef rk +#endif /*!USE_AMD64_ASM*/ } @@ -612,6 +636,7 @@ static void do_encrypt (const RIJNDAEL_context *ctx, unsigned char *bx, const unsigned char *ax) { +#ifndef USE_AMD64_ASM /* BX and AX are not necessary correctly aligned. Thus we might need to copy them here. We try to align to a 16 bytes. */ if (((size_t)ax & 0x0f) || ((size_t)bx & 0x0f)) @@ -632,6 +657,7 @@ do_encrypt (const RIJNDAEL_context *ctx, memcpy (bx, b.b, 16); } else +#endif /*!USE_AMD64_ASM*/ { do_encrypt_aligned (ctx, bx, ax); } @@ -1639,6 +1665,9 @@ static void do_decrypt_aligned (RIJNDAEL_context *ctx, unsigned char *b, const unsigned char *a) { +#ifdef USE_AMD64_ASM + _gcry_aes_amd64_decrypt_block(ctx->keyschdec, b, a, ctx->rounds); +#else /*!USE_AMD64_ASM*/ #define rk (ctx->keyschdec) int rounds = ctx->rounds; int r; @@ -1721,6 +1750,7 @@ do_decrypt_aligned (RIJNDAEL_context *ctx, *((u32_a_t*)(b+ 8)) ^= *((u32_a_t*)rk[0][2]); *((u32_a_t*)(b+12)) ^= *((u32_a_t*)rk[0][3]); #undef rk +#endif /*!USE_AMD64_ASM*/ } @@ -1735,6 +1765,7 @@ do_decrypt (RIJNDAEL_context *ctx, byte *bx, const byte *ax) ctx->decryption_prepared = 1; } +#ifndef USE_AMD64_ASM /* BX and AX are not necessary correctly aligned. Thus we might need to copy them here. We try to align to a 16 bytes. */ if (((size_t)ax & 0x0f) || ((size_t)bx & 0x0f)) @@ -1755,6 +1786,7 @@ do_decrypt (RIJNDAEL_context *ctx, byte *bx, const byte *ax) memcpy (bx, b.b, 16); } else +#endif /*!USE_AMD64_ASM*/ { do_decrypt_aligned (ctx, bx, ax); } diff --git a/configure.ac b/configure.ac index f13a91b..c33c950 100644 --- a/configure.ac +++ b/configure.ac @@ -1221,6 +1221,13 @@ LIST_MEMBER(aes, $enabled_ciphers) if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael.lo" AC_DEFINE(USE_AES, 1, [Defined if this module should be included]) + + case "${host}" in + x86_64-*-*) + # Build with the assembly implementation + GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-amd64.lo" + ;; + esac fi LIST_MEMBER(twofish, $enabled_ciphers)
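
For reference, the interleaved .LtableE0/.LtableEs0 words above are AES T-table data: each pair seems to hold the MixColumns-premultiplied column word followed by the plain S-box byte, emitted as little-endian .long values so either form can be fetched with a single movl. A minimal C sketch, assuming that layout and using only the standard S-box value S[0x00] = 0x63, reproduces the first pair (0xa56363c6 and 0x00000063):

#include <stdint.h>
#include <stdio.h>

/* Multiply by 2 in GF(2^8) with the AES reduction polynomial. */
static uint8_t xtime (uint8_t b)
{
  return (uint8_t)((b << 1) ^ ((b & 0x80) ? 0x1b : 0x00));
}

int main (void)
{
  uint8_t s  = 0x63;                /* S-box value for input 0x00 */
  uint8_t s2 = xtime (s);           /* 2*s = 0xc6 */
  uint8_t s3 = (uint8_t)(s2 ^ s);   /* 3*s = 0xa5 */

  /* Assumed memory layout of the .long entry: { 2*s, s, s, 3*s },
     i.e. the value 0xa56363c6 on a little-endian machine.  */
  uint32_t te0  = ((uint32_t)s3 << 24) | ((uint32_t)s << 16)
                | ((uint32_t)s  <<  8) |  (uint32_t)s2;
  uint32_t tes0 = s;                /* bare S-box word */

  printf ("0x%08x 0x%08x\n", te0, tes0);  /* 0xa56363c6 0x00000063 */
  return 0;
}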
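
The new assembly is only reached through do_encrypt_aligned()/do_decrypt_aligned(), so callers keep using the public cipher API unchanged. Roughly the following, assuming libgcrypt is built from this tree on an x86_64 host, should already run AES-128 ECB through _gcry_aes_amd64_encrypt_block:

#include <stdio.h>
#include <gcrypt.h>

int main (void)
{
  gcry_cipher_hd_t hd;
  unsigned char key[16] = { 0 };    /* demo key only */
  unsigned char in[16]  = { 0 };
  unsigned char out[16];
  int i;

  if (!gcry_check_version (NULL))
    return 1;
  gcry_control (GCRYCTL_INITIALIZATION_FINISHED, 0);

  if (gcry_cipher_open (&hd, GCRY_CIPHER_AES128, GCRY_CIPHER_MODE_ECB, 0))
    return 1;
  if (gcry_cipher_setkey (hd, key, sizeof key)
      || gcry_cipher_encrypt (hd, out, sizeof out, in, sizeof in))
    return 1;

  for (i = 0; i < 16; i++)
    printf ("%02x", out[i]);
  putchar ('\n');

  gcry_cipher_close (hd);
  return 0;
}

The output must be identical with and without the configure.ac change; the in-tree tests (e.g. tests/basic) should catch any mismatch between the C and assembly paths.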