[git] GCRYPT - branch, master, updated. libgcrypt-1.5.0-84-g63ac3ba

by Jussi Kivilinna cvs at cvs.gnupg.org
Tue Feb 19 11:33:38 CET 2013


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "The GNU crypto library".

The branch, master has been updated
       via  63ac3ba07dba82fde040d31b90b4eff627bd92b9 (commit)
       via  4de62d80644228fc5db2a9f9c94a7eb633d8de2e (commit)
       via  537f12ce072d568f9fa344c447d32b2e0efffbe8 (commit)
      from  09ac5d87d11aa0b1fa0e0a4184ab03b3671a73e2 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
commit 63ac3ba07dba82fde040d31b90b4eff627bd92b9
Author: Jussi Kivilinna <jussi.kivilinna at mbnet.fi>
Date:   Wed Jan 23 11:55:13 2013 +0200

    Add AES-NI/AVX accelerated Camellia implementation
    
    * configure.ac: Add option --disable-avx-support.
    (HAVE_GCC_INLINE_ASM_AVX): New.
    (ENABLE_AVX_SUPPORT): New.
    (camellia) [ENABLE_AVX_SUPPORT, ENABLE_AESNI_SUPPORT]: Add
    camellia_aesni_avx_x86-64.lo.
    * cipher/Makefile.am (AM_CCASFLAGS): Add.
    (EXTRA_libcipher_la_SOURCES): Add camellia_aesni_avx_x86-64.S.
    * cipher/camellia-glue.c [ENABLE_AESNI_SUPPORT, ENABLE_AVX_SUPPORT]
    [__x86_64__] (USE_AESNI_AVX): Add macro.
    (struct Camellia_context) [USE_AESNI_AVX]: Add use_aesni_avx.
    [USE_AESNI_AVX] (_gcry_camellia_aesni_avx_ctr_enc)
    (_gcry_camellia_aesni_avx_cbc_dec): New prototypes to assembly
    functions.
    (camellia_setkey) [USE_AESNI_AVX]: Enable AES-NI/AVX if the hardware
    supports both.
    (_gcry_camellia_ctr_enc) [USE_AESNI_AVX]: Add AES-NI/AVX code.
    (_gcry_camellia_cbc_dec) [USE_AESNI_AVX]: Add AES-NI/AVX code.
    * cipher/camellia_aesni_avx_x86-64.S: New.
    * src/g10lib.h (HWF_INTEL_AVX): New.
    * src/global.c (hwflist): Add HWF_INTEL_AVX.
    * src/hwf-x86.c (detect_x86_gnuc) [ENABLE_AVX_SUPPORT]: Add detection
    for AVX.
    --
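
    The new configure switch can be used to build without the AVX code
    path:

      ./configure --disable-avx-support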
    
    Before:
     Running each test 250 times.
                     ECB/Stream         CBC             CFB             OFB             CTR
                  --------------- --------------- --------------- --------------- ---------------
     CAMELLIA128   2210ms  2200ms  2300ms  2050ms  2240ms  2250ms  2290ms  2270ms  2070ms  2070ms
     CAMELLIA256   2810ms  2800ms  2920ms  2670ms  2840ms  2850ms  2910ms  2890ms  2660ms  2640ms
    
    After:
     Running each test 250 times.
                     ECB/Stream         CBC             CFB             OFB             CTR
                  --------------- --------------- --------------- --------------- ---------------
     CAMELLIA128   2200ms  2220ms  2290ms   470ms  2240ms  2270ms  2270ms  2290ms   480ms   480ms
     CAMELLIA256   2820ms  2820ms  2900ms   600ms  2860ms  2860ms  2900ms  2920ms   620ms   620ms
    
    The AES-NI/AVX implementation works by processing 16 blocks in parallel
    (256 bytes).  It is a byte-sliced implementation that uses the AES-NI
    SubBytes instruction for the Camellia s-boxes, with the help of
    prefiltering/postfiltering; a scalar sketch of the s-box trick is shown
    below.  For smaller data sets the generic C implementation is used.
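
    As an illustration only (not part of the patch), one Camellia s-box can
    be modelled in C as two affine filters around the AES S-box.  The arrays
    are hypothetical stand-ins for the .Lpre_tf_*_s1/.Lpost_tf_*_s1 tables
    and the AES S-box; in the assembly the nibble lookups are done 16 bytes
    at a time with vpshufb, and the S-box with AESENCLAST (whose ShiftRows
    is cancelled by a prior inverse-ShiftRows shuffle):

        /* Hypothetical C views of the lookup tables used below. */
        extern const unsigned char pre_tf_lo_s1[16], pre_tf_hi_s1[16];
        extern const unsigned char post_tf_lo_s1[16], post_tf_hi_s1[16];
        extern const unsigned char aes_sbox[256];

        /* One vpshufb-style lookup: XOR of two nibble-indexed tables. */
        static unsigned char filter_8bit (unsigned char x,
                                          const unsigned char lo_t[16],
                                          const unsigned char hi_t[16])
        {
          return lo_t[x & 0xf] ^ hi_t[x >> 4];
        }

        /* Camellia sbox1 = postfilter(aes_sbox(prefilter(x))). */
        static unsigned char camellia_sbox1 (unsigned char x)
        {
          x = filter_8bit (x, pre_tf_lo_s1, pre_tf_hi_s1);      /* prefilter  */
          x = aes_sbox[x];                                      /* AESENCLAST */
          return filter_8bit (x, post_tf_lo_s1, post_tf_hi_s1); /* postfilter */
        }

    Sboxes 2-4 use the same scheme, with their extra byte rotations folded
    into the filter tables (see the table comments in the new assembly file).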
    
    Speed-up for CBC decryption and CTR mode (large data): about 4.3x (e.g.
    CAMELLIA128 CBC decryption drops from 2050ms to 470ms in the tables above)
    
    Tests were run on: Intel Core i5-2450M
    
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna at mbnet.fi>
    
    (license boiler plate update by wk)

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index e8050e3..fcb9be5 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -26,6 +26,8 @@ EXTRA_DIST = Manifest
 AM_CPPFLAGS = -I../src -I$(top_srcdir)/src
 AM_CFLAGS = $(GPG_ERROR_CFLAGS)
 
+AM_CCASFLAGS = $(NOEXECSTACK_FLAGS)
+
 
 noinst_LTLIBRARIES = libcipher.la
 
@@ -69,7 +71,7 @@ tiger.c \
 whirlpool.c \
 twofish.c \
 rfc2268.c \
-camellia.c camellia.h camellia-glue.c
+camellia.c camellia.h camellia-glue.c camellia_aesni_avx_x86-64.S
 
 if ENABLE_O_FLAG_MUNGING
 o_flag_munging = sed -e 's/-O[2-9s]*/-O1/g'
diff --git a/cipher/camellia-glue.c b/cipher/camellia-glue.c
index ba8aa28..dd9206f 100644
--- a/cipher/camellia-glue.c
+++ b/cipher/camellia-glue.c
@@ -71,12 +71,38 @@
 # define ATTR_ALIGNED_16
 #endif
 
+/* USE_AESNI_AVX indicates whether to compile with Intel AES-NI/AVX code. */
+#undef USE_AESNI_AVX
+#if defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX_SUPPORT)
+# if defined(__x86_64__)
+#  define USE_AESNI_AVX 1
+# endif
+#endif
+
 typedef struct
 {
   int keybitlength;
   KEY_TABLE_TYPE keytable;
+#ifdef USE_AESNI_AVX
+  int use_aesni_avx;		/* AES-NI/AVX implementation shall be used.  */
+#endif /*USE_AESNI_AVX*/
 } CAMELLIA_context;
 
+#ifdef USE_AESNI_AVX
+/* Assembler implementations of Camellia using AES-NI and AVX.  Process data
+   in 16 blocks at a time.
+ */
+extern void _gcry_camellia_aesni_avx_ctr_enc(CAMELLIA_context *ctx,
+					     unsigned char *out,
+					     const unsigned char *in,
+					     unsigned char *ctr);
+
+extern void _gcry_camellia_aesni_avx_cbc_dec(CAMELLIA_context *ctx,
+					     unsigned char *out,
+					     const unsigned char *in,
+					     unsigned char *iv);
+#endif
+
 static const char *selftest(void);
 
 static gcry_err_code_t
@@ -109,6 +135,15 @@ camellia_setkey(void *c, const byte *key, unsigned keylen)
      +3*2*sizeof(void*)                     /* Function calls.  */
      );
 
+#ifdef USE_AESNI_AVX
+  ctx->use_aesni_avx = 0;
+  if ((_gcry_get_hw_features () & HWF_INTEL_AESNI) &&
+      (_gcry_get_hw_features () & HWF_INTEL_AVX))
+    {
+      ctx->use_aesni_avx = 1;
+    }
+#endif
+
   return 0;
 }
 
@@ -158,8 +193,39 @@ _gcry_camellia_ctr_enc(void *context, unsigned char *ctr,
   unsigned char *outbuf = outbuf_arg;
   const unsigned char *inbuf = inbuf_arg;
   unsigned char tmpbuf[CAMELLIA_BLOCK_SIZE];
+  int burn_stack_depth = CAMELLIA_encrypt_stack_burn_size;
   int i;
 
+#ifdef USE_AESNI_AVX
+  if (ctx->use_aesni_avx)
+    {
+      int did_use_aesni_avx = 0;
+
+      /* Process data in 16-block chunks. */
+      while (nblocks >= 16)
+        {
+          _gcry_camellia_aesni_avx_ctr_enc(ctx, outbuf, inbuf, ctr);
+
+          nblocks -= 16;
+          outbuf += 16 * CAMELLIA_BLOCK_SIZE;
+          inbuf  += 16 * CAMELLIA_BLOCK_SIZE;
+          did_use_aesni_avx = 1;
+        }
+
+      if (did_use_aesni_avx)
+        {
+          /* clear AVX registers */
+          asm volatile ("vzeroall;\n":::);
+
+          if (burn_stack_depth < 16 * CAMELLIA_BLOCK_SIZE + 2 * sizeof(void *))
+            burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE + 2 * sizeof(void *);
+        }
+
+      /* Use generic code to handle smaller chunks... */
+      /* TODO: use caching instead? */
+    }
+#endif
+
   for ( ;nblocks; nblocks-- )
     {
       /* Encrypt the counter. */
@@ -178,7 +244,7 @@ _gcry_camellia_ctr_enc(void *context, unsigned char *ctr,
     }
 
   wipememory(tmpbuf, sizeof(tmpbuf));
-  _gcry_burn_stack(CAMELLIA_encrypt_stack_burn_size);
+  _gcry_burn_stack(burn_stack_depth);
 }
 
 /* Bulk decryption of complete blocks in CBC mode.  This function is only
@@ -192,6 +258,36 @@ _gcry_camellia_cbc_dec(void *context, unsigned char *iv,
   unsigned char *outbuf = outbuf_arg;
   const unsigned char *inbuf = inbuf_arg;
   unsigned char savebuf[CAMELLIA_BLOCK_SIZE];
+  int burn_stack_depth = CAMELLIA_decrypt_stack_burn_size;
+
+#ifdef USE_AESNI_AVX
+  if (ctx->use_aesni_avx)
+    {
+      int did_use_aesni_avx = 0;
+
+      /* Process data in 16-block chunks. */
+      while (nblocks >= 16)
+        {
+          _gcry_camellia_aesni_avx_cbc_dec(ctx, outbuf, inbuf, iv);
+
+          nblocks -= 16;
+          outbuf += 16 * CAMELLIA_BLOCK_SIZE;
+          inbuf  += 16 * CAMELLIA_BLOCK_SIZE;
+          did_use_aesni_avx = 1;
+        }
+
+      if (did_use_aesni_avx)
+        {
+          /* clear AVX registers */
+          asm volatile ("vzeroall;\n":::);
+
+          if (burn_stack_depth < 16 * CAMELLIA_BLOCK_SIZE + 2 * sizeof(void *))
+            burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE + 2 * sizeof(void *);
+        }
+
+      /* Use generic code to handle smaller chunks... */
+    }
+#endif
 
   for ( ;nblocks; nblocks-- )
     {
@@ -208,7 +304,7 @@ _gcry_camellia_cbc_dec(void *context, unsigned char *iv,
     }
 
   wipememory(savebuf, sizeof(savebuf));
-  _gcry_burn_stack(CAMELLIA_decrypt_stack_burn_size);
+  _gcry_burn_stack(burn_stack_depth);
 }
 
 /* Run the self-tests for CAMELLIA-CTR-128, tests IV increment of bulk CTR
diff --git a/cipher/camellia.c b/cipher/camellia.c
index 42a9b73..cd46885 100644
--- a/cipher/camellia.c
+++ b/cipher/camellia.c
@@ -14,8 +14,7 @@
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
  */
 
 /*
@@ -985,7 +984,7 @@ void camellia_decrypt128(const u32 *subkey, u32 *blocks)
     io[1] = blocks[1];
     io[2] = blocks[2];
     io[3] = blocks[3];
- 
+
     /* pre whitening but absorb kw2*/
     io[0] ^= CamelliaSubkeyL(24);
     io[1] ^= CamelliaSubkeyR(24);
diff --git a/cipher/camellia_aesni_avx_x86-64.S b/cipher/camellia_aesni_avx_x86-64.S
new file mode 100644
index 0000000..e25ad8f
--- /dev/null
+++ b/cipher/camellia_aesni_avx_x86-64.S
@@ -0,0 +1,1120 @@
+/* camellia_aesni_avx_x86-64.S  -  AES-NI/AVX implementation of Camellia cipher
+ *
+ * Copyright (C) 2013 Jussi Kivilinna <jussi.kivilinna at mbnet.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifdef __x86_64
+#include <config.h>
+#if defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX_SUPPORT)
+
+#ifdef __PIC__
+#  define RIP (%rip)
+#else
+#  define RIP
+#endif
+
+#define CAMELLIA_TABLE_BYTE_LEN 272
+
+/* struct CAMELLIA_context: */
+#define key_bitlength 0
+#define key_table 4
+
+/* register macros */
+#define CTX %rdi
+#define RIO %r8
+
+/**********************************************************************
+  helper macros
+ **********************************************************************/
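+
+/* filter_8bit applies an 8-bit transform as the XOR of two 16-entry
+ * vpshufb lookups: the low nibble of each byte indexes lo_t and the
+ * high nibble indexes hi_t.  This is the usual way to evaluate an
+ * 8-bit affine transform with byte shuffles, 16 bytes at a time.  */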
+#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
+	vpand x, mask4bit, tmp0; \
+	vpandn x, mask4bit, x; \
+	vpsrld $4, x, x; \
+	\
+	vpshufb tmp0, lo_t, tmp0; \
+	vpshufb x, hi_t, x; \
+	vpxor tmp0, x, x;
+
+/**********************************************************************
+  16-way camellia
+ **********************************************************************/
+
+/*
+ * IN:
+ *   x0..x7: byte-sliced AB state
+ *   mem_cd: register pointer storing CD state
+ *   key: index for key material
+ * OUT:
+ *   x0..x7: new byte-sliced CD state
+ */
+#define roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
+		  t7, mem_cd, key) \
+	/* \
+	 * S-function with AES subbytes \
+	 */ \
+	vmovdqa .Linv_shift_row RIP, t4; \
+	vbroadcastss .L0f0f0f0f RIP, t7; \
+	vmovdqa .Lpre_tf_lo_s1 RIP, t0; \
+	vmovdqa .Lpre_tf_hi_s1 RIP, t1; \
+	\
+	/* AES inverse shift rows */ \
+	vpshufb t4, x0, x0; \
+	vpshufb t4, x7, x7; \
+	vpshufb t4, x1, x1; \
+	vpshufb t4, x4, x4; \
+	vpshufb t4, x2, x2; \
+	vpshufb t4, x5, x5; \
+	vpshufb t4, x3, x3; \
+	vpshufb t4, x6, x6; \
+	\
+	/* prefilter sboxes 1, 2 and 3 */ \
+	vmovdqa .Lpre_tf_lo_s4 RIP, t2; \
+	vmovdqa .Lpre_tf_hi_s4 RIP, t3; \
+	filter_8bit(x0, t0, t1, t7, t6); \
+	filter_8bit(x7, t0, t1, t7, t6); \
+	filter_8bit(x1, t0, t1, t7, t6); \
+	filter_8bit(x4, t0, t1, t7, t6); \
+	filter_8bit(x2, t0, t1, t7, t6); \
+	filter_8bit(x5, t0, t1, t7, t6); \
+	\
+	/* prefilter sbox 4 */ \
+	vpxor t4, t4, t4; \
+	filter_8bit(x3, t2, t3, t7, t6); \
+	filter_8bit(x6, t2, t3, t7, t6); \
+	\
+	/* AES subbytes + AES shift rows */ \
+	vmovdqa .Lpost_tf_lo_s1 RIP, t0; \
+	vmovdqa .Lpost_tf_hi_s1 RIP, t1; \
+	vaesenclast t4, x0, x0; \
+	vaesenclast t4, x7, x7; \
+	vaesenclast t4, x1, x1; \
+	vaesenclast t4, x4, x4; \
+	vaesenclast t4, x2, x2; \
+	vaesenclast t4, x5, x5; \
+	vaesenclast t4, x3, x3; \
+	vaesenclast t4, x6, x6; \
+	\
+	/* postfilter sboxes 1 and 4 */ \
+	vmovdqa .Lpost_tf_lo_s3 RIP, t2; \
+	vmovdqa .Lpost_tf_hi_s3 RIP, t3; \
+	filter_8bit(x0, t0, t1, t7, t6); \
+	filter_8bit(x7, t0, t1, t7, t6); \
+	filter_8bit(x3, t0, t1, t7, t6); \
+	filter_8bit(x6, t0, t1, t7, t6); \
+	\
+	/* postfilter sbox 3 */ \
+	vmovdqa .Lpost_tf_lo_s2 RIP, t4; \
+	vmovdqa .Lpost_tf_hi_s2 RIP, t5; \
+	filter_8bit(x2, t2, t3, t7, t6); \
+	filter_8bit(x5, t2, t3, t7, t6); \
+	\
+	vpxor t6, t6, t6; \
+	vmovq key, t0; \
+	\
+	/* postfilter sbox 2 */ \
+	filter_8bit(x1, t4, t5, t7, t2); \
+	filter_8bit(x4, t4, t5, t7, t2); \
+	\
+	vpsrldq $5, t0, t5; \
+	vpsrldq $1, t0, t1; \
+	vpsrldq $2, t0, t2; \
+	vpsrldq $3, t0, t3; \
+	vpsrldq $4, t0, t4; \
+	vpshufb t6, t0, t0; \
+	vpshufb t6, t1, t1; \
+	vpshufb t6, t2, t2; \
+	vpshufb t6, t3, t3; \
+	vpshufb t6, t4, t4; \
+	vpsrldq $2, t5, t7; \
+	vpshufb t6, t7, t7; \
+	\
+	/* P-function */ \
+	vpxor x5, x0, x0; \
+	vpxor x6, x1, x1; \
+	vpxor x7, x2, x2; \
+	vpxor x4, x3, x3; \
+	\
+	vpxor x2, x4, x4; \
+	vpxor x3, x5, x5; \
+	vpxor x0, x6, x6; \
+	vpxor x1, x7, x7; \
+	\
+	vpxor x7, x0, x0; \
+	vpxor x4, x1, x1; \
+	vpxor x5, x2, x2; \
+	vpxor x6, x3, x3; \
+	\
+	vpxor x3, x4, x4; \
+	vpxor x0, x5, x5; \
+	vpxor x1, x6, x6; \
+	vpxor x2, x7, x7; /* note: high and low parts swapped */ \
+	\
+	/* Add key material and result to CD (x becomes new CD) */ \
+	\
+	vpxor t3, x4, x4; \
+	vpxor 0 * 16(mem_cd), x4, x4; \
+	\
+	vpxor t2, x5, x5; \
+	vpxor 1 * 16(mem_cd), x5, x5; \
+	\
+	vpsrldq $1, t5, t3; \
+	vpshufb t6, t5, t5; \
+	vpshufb t6, t3, t6; \
+	\
+	vpxor t1, x6, x6; \
+	vpxor 2 * 16(mem_cd), x6, x6; \
+	\
+	vpxor t0, x7, x7; \
+	vpxor 3 * 16(mem_cd), x7, x7; \
+	\
+	vpxor t7, x0, x0; \
+	vpxor 4 * 16(mem_cd), x0, x0; \
+	\
+	vpxor t6, x1, x1; \
+	vpxor 5 * 16(mem_cd), x1, x1; \
+	\
+	vpxor t5, x2, x2; \
+	vpxor 6 * 16(mem_cd), x2, x2; \
+	\
+	vpxor t4, x3, x3; \
+	vpxor 7 * 16(mem_cd), x3, x3;
+
+/*
+ * IN/OUT:
+ *  x0..x7: byte-sliced AB state preloaded
+ *  mem_ab: byte-sliced AB state in memory
+ *  mem_cd: byte-sliced CD state in memory
+ */
+#define two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+		      y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
+	roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+		  y6, y7, mem_cd, (key_table + (i) * 8)(CTX)); \
+	\
+	vmovdqu x4, 0 * 16(mem_cd); \
+	vmovdqu x5, 1 * 16(mem_cd); \
+	vmovdqu x6, 2 * 16(mem_cd); \
+	vmovdqu x7, 3 * 16(mem_cd); \
+	vmovdqu x0, 4 * 16(mem_cd); \
+	vmovdqu x1, 5 * 16(mem_cd); \
+	vmovdqu x2, 6 * 16(mem_cd); \
+	vmovdqu x3, 7 * 16(mem_cd); \
+	\
+	roundsm16(x4, x5, x6, x7, x0, x1, x2, x3, y0, y1, y2, y3, y4, y5, \
+		  y6, y7, mem_ab, (key_table + ((i) + (dir)) * 8)(CTX)); \
+	\
+	store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);
+
+#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */
+
+#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
+	/* Store new AB state */ \
+	vmovdqu x0, 0 * 16(mem_ab); \
+	vmovdqu x1, 1 * 16(mem_ab); \
+	vmovdqu x2, 2 * 16(mem_ab); \
+	vmovdqu x3, 3 * 16(mem_ab); \
+	vmovdqu x4, 4 * 16(mem_ab); \
+	vmovdqu x5, 5 * 16(mem_ab); \
+	vmovdqu x6, 6 * 16(mem_ab); \
+	vmovdqu x7, 7 * 16(mem_ab);
+
+#define enc_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+		      y6, y7, mem_ab, mem_cd, i) \
+	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+		      y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
+	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+		      y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
+	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+		      y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);
+
+#define dec_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+		      y6, y7, mem_ab, mem_cd, i) \
+	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+		      y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
+	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+		      y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
+	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+		      y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);
+
+/*
+ * IN:
+ *  v0..3: byte-sliced 32-bit integers
+ * OUT:
+ *  v0..3: (IN <<< 1)
+ */
+#define rol32_1_16(v0, v1, v2, v3, t0, t1, t2, zero) \
+	vpcmpgtb v0, zero, t0; \
+	vpaddb v0, v0, v0; \
+	vpabsb t0, t0; \
+	\
+	vpcmpgtb v1, zero, t1; \
+	vpaddb v1, v1, v1; \
+	vpabsb t1, t1; \
+	\
+	vpcmpgtb v2, zero, t2; \
+	vpaddb v2, v2, v2; \
+	vpabsb t2, t2; \
+	\
+	vpor t0, v1, v1; \
+	\
+	vpcmpgtb v3, zero, t0; \
+	vpaddb v3, v3, v3; \
+	vpabsb t0, t0; \
+	\
+	vpor t1, v2, v2; \
+	vpor t2, v3, v3; \
+	vpor t0, v0, v0;
+
+/*
+ * IN:
+ *   r: byte-sliced AB state in memory
+ *   l: byte-sliced CD state in memory
+ * OUT:
+ *   x0..x7: new byte-sliced CD state
+ */
+#define fls16(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
+	      tt1, tt2, tt3, kll, klr, krl, krr) \
+	/* \
+	 * t0 = kll; \
+	 * t0 &= ll; \
+	 * lr ^= rol32(t0, 1); \
+	 */ \
+	vpxor tt0, tt0, tt0; \
+	vmovd kll, t0; \
+	vpshufb tt0, t0, t3; \
+	vpsrldq $1, t0, t0; \
+	vpshufb tt0, t0, t2; \
+	vpsrldq $1, t0, t0; \
+	vpshufb tt0, t0, t1; \
+	vpsrldq $1, t0, t0; \
+	vpshufb tt0, t0, t0; \
+	\
+	vpand l0, t0, t0; \
+	vpand l1, t1, t1; \
+	vpand l2, t2, t2; \
+	vpand l3, t3, t3; \
+	\
+	rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
+	\
+	vpxor l4, t0, l4; \
+	vmovdqu l4, 4 * 16(l); \
+	vpxor l5, t1, l5; \
+	vmovdqu l5, 5 * 16(l); \
+	vpxor l6, t2, l6; \
+	vmovdqu l6, 6 * 16(l); \
+	vpxor l7, t3, l7; \
+	vmovdqu l7, 7 * 16(l); \
+	\
+	/* \
+	 * t2 = krr; \
+	 * t2 |= rr; \
+	 * rl ^= t2; \
+	 */ \
+	\
+	vmovd krr, t0; \
+	vpshufb tt0, t0, t3; \
+	vpsrldq $1, t0, t0; \
+	vpshufb tt0, t0, t2; \
+	vpsrldq $1, t0, t0; \
+	vpshufb tt0, t0, t1; \
+	vpsrldq $1, t0, t0; \
+	vpshufb tt0, t0, t0; \
+	\
+	vpor 4 * 16(r), t0, t0; \
+	vpor 5 * 16(r), t1, t1; \
+	vpor 6 * 16(r), t2, t2; \
+	vpor 7 * 16(r), t3, t3; \
+	\
+	vpxor 0 * 16(r), t0, t0; \
+	vpxor 1 * 16(r), t1, t1; \
+	vpxor 2 * 16(r), t2, t2; \
+	vpxor 3 * 16(r), t3, t3; \
+	vmovdqu t0, 0 * 16(r); \
+	vmovdqu t1, 1 * 16(r); \
+	vmovdqu t2, 2 * 16(r); \
+	vmovdqu t3, 3 * 16(r); \
+	\
+	/* \
+	 * t2 = krl; \
+	 * t2 &= rl; \
+	 * rr ^= rol32(t2, 1); \
+	 */ \
+	vmovd krl, t0; \
+	vpshufb tt0, t0, t3; \
+	vpsrldq $1, t0, t0; \
+	vpshufb tt0, t0, t2; \
+	vpsrldq $1, t0, t0; \
+	vpshufb tt0, t0, t1; \
+	vpsrldq $1, t0, t0; \
+	vpshufb tt0, t0, t0; \
+	\
+	vpand 0 * 16(r), t0, t0; \
+	vpand 1 * 16(r), t1, t1; \
+	vpand 2 * 16(r), t2, t2; \
+	vpand 3 * 16(r), t3, t3; \
+	\
+	rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
+	\
+	vpxor 4 * 16(r), t0, t0; \
+	vpxor 5 * 16(r), t1, t1; \
+	vpxor 6 * 16(r), t2, t2; \
+	vpxor 7 * 16(r), t3, t3; \
+	vmovdqu t0, 4 * 16(r); \
+	vmovdqu t1, 5 * 16(r); \
+	vmovdqu t2, 6 * 16(r); \
+	vmovdqu t3, 7 * 16(r); \
+	\
+	/* \
+	 * t0 = klr; \
+	 * t0 |= lr; \
+	 * ll ^= t0; \
+	 */ \
+	\
+	vmovd klr, t0; \
+	vpshufb tt0, t0, t3; \
+	vpsrldq $1, t0, t0; \
+	vpshufb tt0, t0, t2; \
+	vpsrldq $1, t0, t0; \
+	vpshufb tt0, t0, t1; \
+	vpsrldq $1, t0, t0; \
+	vpshufb tt0, t0, t0; \
+	\
+	vpor l4, t0, t0; \
+	vpor l5, t1, t1; \
+	vpor l6, t2, t2; \
+	vpor l7, t3, t3; \
+	\
+	vpxor l0, t0, l0; \
+	vmovdqu l0, 0 * 16(l); \
+	vpxor l1, t1, l1; \
+	vmovdqu l1, 1 * 16(l); \
+	vpxor l2, t2, l2; \
+	vmovdqu l2, 2 * 16(l); \
+	vpxor l3, t3, l3; \
+	vmovdqu l3, 3 * 16(l);
+
+#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
+	vpunpckhdq x1, x0, t2; \
+	vpunpckldq x1, x0, x0; \
+	\
+	vpunpckldq x3, x2, t1; \
+	vpunpckhdq x3, x2, x2; \
+	\
+	vpunpckhqdq t1, x0, x1; \
+	vpunpcklqdq t1, x0, x0; \
+	\
+	vpunpckhqdq x2, t2, x3; \
+	vpunpcklqdq x2, t2, x2;
+
+#define byteslice_16x16b_fast(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, \
+			      a3, b3, c3, d3, st0, st1) \
+	vmovdqu d2, st0; \
+	vmovdqu d3, st1; \
+	transpose_4x4(a0, a1, a2, a3, d2, d3); \
+	transpose_4x4(b0, b1, b2, b3, d2, d3); \
+	vmovdqu st0, d2; \
+	vmovdqu st1, d3; \
+	\
+	vmovdqu a0, st0; \
+	vmovdqu a1, st1; \
+	transpose_4x4(c0, c1, c2, c3, a0, a1); \
+	transpose_4x4(d0, d1, d2, d3, a0, a1); \
+	\
+	vmovdqu .Lshufb_16x16b RIP, a0; \
+	vmovdqu st1, a1; \
+	vpshufb a0, a2, a2; \
+	vpshufb a0, a3, a3; \
+	vpshufb a0, b0, b0; \
+	vpshufb a0, b1, b1; \
+	vpshufb a0, b2, b2; \
+	vpshufb a0, b3, b3; \
+	vpshufb a0, a1, a1; \
+	vpshufb a0, c0, c0; \
+	vpshufb a0, c1, c1; \
+	vpshufb a0, c2, c2; \
+	vpshufb a0, c3, c3; \
+	vpshufb a0, d0, d0; \
+	vpshufb a0, d1, d1; \
+	vpshufb a0, d2, d2; \
+	vpshufb a0, d3, d3; \
+	vmovdqu d3, st1; \
+	vmovdqu st0, d3; \
+	vpshufb a0, d3, a0; \
+	vmovdqu d2, st0; \
+	\
+	transpose_4x4(a0, b0, c0, d0, d2, d3); \
+	transpose_4x4(a1, b1, c1, d1, d2, d3); \
+	vmovdqu st0, d2; \
+	vmovdqu st1, d3; \
+	\
+	vmovdqu b0, st0; \
+	vmovdqu b1, st1; \
+	transpose_4x4(a2, b2, c2, d2, b0, b1); \
+	transpose_4x4(a3, b3, c3, d3, b0, b1); \
+	vmovdqu st0, b0; \
+	vmovdqu st1, b1; \
+	/* does not adjust output bytes inside vectors */
+
+#define transpose_8x8b(a, b, c, d, e, f, g, h, t0, t1, t2, t3, t4) \
+	vpunpcklbw a, b, t0; \
+	vpunpckhbw a, b, b; \
+	\
+	vpunpcklbw c, d, t1; \
+	vpunpckhbw c, d, d; \
+	\
+	vpunpcklbw e, f, t2; \
+	vpunpckhbw e, f, f; \
+	\
+	vpunpcklbw g, h, t3; \
+	vpunpckhbw g, h, h; \
+	\
+	vpunpcklwd t0, t1, g; \
+	vpunpckhwd t0, t1, t0; \
+	\
+	vpunpcklwd b, d, t1; \
+	vpunpckhwd b, d, e; \
+	\
+	vpunpcklwd t2, t3, c; \
+	vpunpckhwd t2, t3, t2; \
+	\
+	vpunpcklwd f, h, t3; \
+	vpunpckhwd f, h, b; \
+	\
+	vpunpcklwd e, b, t4; \
+	vpunpckhwd e, b, b; \
+	\
+	vpunpcklwd t1, t3, e; \
+	vpunpckhwd t1, t3, f; \
+	\
+	vmovdqa .Ltranspose_8x8_shuf RIP, t3; \
+	\
+	vpunpcklwd g, c, d; \
+	vpunpckhwd g, c, c; \
+	\
+	vpunpcklwd t0, t2, t1; \
+	vpunpckhwd t0, t2, h; \
+	\
+	vpunpckhqdq b, h, a; \
+	vpshufb t3, a, a; \
+	vpunpcklqdq b, h, b; \
+	vpshufb t3, b, b; \
+	\
+	vpunpckhqdq e, d, g; \
+	vpshufb t3, g, g; \
+	vpunpcklqdq e, d, h; \
+	vpshufb t3, h, h; \
+	\
+	vpunpckhqdq f, c, e; \
+	vpshufb t3, e, e; \
+	vpunpcklqdq f, c, f; \
+	vpshufb t3, f, f; \
+	\
+	vpunpckhqdq t4, t1, c; \
+	vpshufb t3, c, c; \
+	vpunpcklqdq t4, t1, d; \
+	vpshufb t3, d, d;
+
+/* load blocks to registers and apply pre-whitening */
+#define inpack16_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+		     y6, y7, rio, key) \
+	vmovq key, x0; \
+	vpshufb .Lpack_bswap RIP, x0, x0; \
+	\
+	vpxor 0 * 16(rio), x0, y7; \
+	vpxor 1 * 16(rio), x0, y6; \
+	vpxor 2 * 16(rio), x0, y5; \
+	vpxor 3 * 16(rio), x0, y4; \
+	vpxor 4 * 16(rio), x0, y3; \
+	vpxor 5 * 16(rio), x0, y2; \
+	vpxor 6 * 16(rio), x0, y1; \
+	vpxor 7 * 16(rio), x0, y0; \
+	vpxor 8 * 16(rio), x0, x7; \
+	vpxor 9 * 16(rio), x0, x6; \
+	vpxor 10 * 16(rio), x0, x5; \
+	vpxor 11 * 16(rio), x0, x4; \
+	vpxor 12 * 16(rio), x0, x3; \
+	vpxor 13 * 16(rio), x0, x2; \
+	vpxor 14 * 16(rio), x0, x1; \
+	vpxor 15 * 16(rio), x0, x0;
+
+/* byteslice pre-whitened blocks and store to temporary memory */
+#define inpack16_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+		      y6, y7, mem_ab, mem_cd) \
+	byteslice_16x16b_fast(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, \
+			      y4, y5, y6, y7, (mem_ab), (mem_cd)); \
+	\
+	vmovdqu x0, 0 * 16(mem_ab); \
+	vmovdqu x1, 1 * 16(mem_ab); \
+	vmovdqu x2, 2 * 16(mem_ab); \
+	vmovdqu x3, 3 * 16(mem_ab); \
+	vmovdqu x4, 4 * 16(mem_ab); \
+	vmovdqu x5, 5 * 16(mem_ab); \
+	vmovdqu x6, 6 * 16(mem_ab); \
+	vmovdqu x7, 7 * 16(mem_ab); \
+	vmovdqu y0, 0 * 16(mem_cd); \
+	vmovdqu y1, 1 * 16(mem_cd); \
+	vmovdqu y2, 2 * 16(mem_cd); \
+	vmovdqu y3, 3 * 16(mem_cd); \
+	vmovdqu y4, 4 * 16(mem_cd); \
+	vmovdqu y5, 5 * 16(mem_cd); \
+	vmovdqu y6, 6 * 16(mem_cd); \
+	vmovdqu y7, 7 * 16(mem_cd);
+
+/* de-byteslice, apply post-whitening and store blocks */
+#define outunpack16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
+		    y5, y6, y7, key, stack_tmp0, stack_tmp1) \
+	byteslice_16x16b_fast(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, \
+			      y3, y7, x3, x7, stack_tmp0, stack_tmp1); \
+	\
+	vmovdqu x0, stack_tmp0; \
+	\
+	vmovq key, x0; \
+	vpshufb .Lpack_bswap RIP, x0, x0; \
+	\
+	vpxor x0, y7, y7; \
+	vpxor x0, y6, y6; \
+	vpxor x0, y5, y5; \
+	vpxor x0, y4, y4; \
+	vpxor x0, y3, y3; \
+	vpxor x0, y2, y2; \
+	vpxor x0, y1, y1; \
+	vpxor x0, y0, y0; \
+	vpxor x0, x7, x7; \
+	vpxor x0, x6, x6; \
+	vpxor x0, x5, x5; \
+	vpxor x0, x4, x4; \
+	vpxor x0, x3, x3; \
+	vpxor x0, x2, x2; \
+	vpxor x0, x1, x1; \
+	vpxor stack_tmp0, x0, x0;
+
+#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+		     y6, y7, rio) \
+	vmovdqu x0, 0 * 16(rio); \
+	vmovdqu x1, 1 * 16(rio); \
+	vmovdqu x2, 2 * 16(rio); \
+	vmovdqu x3, 3 * 16(rio); \
+	vmovdqu x4, 4 * 16(rio); \
+	vmovdqu x5, 5 * 16(rio); \
+	vmovdqu x6, 6 * 16(rio); \
+	vmovdqu x7, 7 * 16(rio); \
+	vmovdqu y0, 8 * 16(rio); \
+	vmovdqu y1, 9 * 16(rio); \
+	vmovdqu y2, 10 * 16(rio); \
+	vmovdqu y3, 11 * 16(rio); \
+	vmovdqu y4, 12 * 16(rio); \
+	vmovdqu y5, 13 * 16(rio); \
+	vmovdqu y6, 14 * 16(rio); \
+	vmovdqu y7, 15 * 16(rio);
+
+.data
+.align 16
+
+#define SHUFB_BYTES(idx) \
+	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
+
+.Lshufb_16x16b:
+	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3);
+
+.Lpack_bswap:
+	.long 0x00010203
+	.long 0x04050607
+	.long 0x80808080
+	.long 0x80808080
+
+/* For CTR-mode IV byteswap */
+.Lbswap128_mask:
+	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+/*
+ * pre-SubByte transform
+ *
+ * pre-lookup for sbox1, sbox2, sbox3:
+ *   swap_bitendianness(
+ *       isom_map_camellia_to_aes(
+ *           camellia_f(
+ *               swap_bitendianness(in)
+ *           )
+ *       )
+ *   )
+ *
+ * (note: '⊕ 0xc5' inside camellia_f())
+ */
+.Lpre_tf_lo_s1:
+	.byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
+	.byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
+.Lpre_tf_hi_s1:
+	.byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
+	.byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23
+
+/*
+ * pre-SubByte transform
+ *
+ * pre-lookup for sbox4:
+ *   swap_bitendianness(
+ *       isom_map_camellia_to_aes(
+ *           camellia_f(
+ *               swap_bitendianness(in <<< 1)
+ *           )
+ *       )
+ *   )
+ *
+ * (note: '⊕ 0xc5' inside camellia_f())
+ */
+.Lpre_tf_lo_s4:
+	.byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
+	.byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
+.Lpre_tf_hi_s4:
+	.byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
+	.byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf
+
+/*
+ * post-SubByte transform
+ *
+ * post-lookup for sbox1, sbox4:
+ *  swap_bitendianness(
+ *      camellia_h(
+ *          isom_map_aes_to_camellia(
+ *              swap_bitendianness(
+ *                  aes_inverse_affine_transform(in)
+ *              )
+ *          )
+ *      )
+ *  )
+ *
+ * (note: '⊕ 0x6e' inside camellia_h())
+ */
+.Lpost_tf_lo_s1:
+	.byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
+	.byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
+.Lpost_tf_hi_s1:
+	.byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
+	.byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c
+
+/*
+ * post-SubByte transform
+ *
+ * post-lookup for sbox2:
+ *  swap_bitendianness(
+ *      camellia_h(
+ *          isom_map_aes_to_camellia(
+ *              swap_bitendianness(
+ *                  aes_inverse_affine_transform(in)
+ *              )
+ *          )
+ *      )
+ *  ) <<< 1
+ *
+ * (note: '⊕ 0x6e' inside camellia_h())
+ */
+.Lpost_tf_lo_s2:
+	.byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
+	.byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
+.Lpost_tf_hi_s2:
+	.byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
+	.byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18
+
+/*
+ * post-SubByte transform
+ *
+ * post-lookup for sbox3:
+ *  swap_bitendianness(
+ *      camellia_h(
+ *          isom_map_aes_to_camellia(
+ *              swap_bitendianness(
+ *                  aes_inverse_affine_transform(in)
+ *              )
+ *          )
+ *      )
+ *  ) >>> 1
+ *
+ * (note: '⊕ 0x6e' inside camellia_h())
+ */
+.Lpost_tf_lo_s3:
+	.byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
+	.byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
+.Lpost_tf_hi_s3:
+	.byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
+	.byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06
+
+/* For isolating SubBytes from AESENCLAST, inverse shift row */
+.Linv_shift_row:
+	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
+	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
+
+/* shuffle mask for 8x8 byte transpose */
+.Ltranspose_8x8_shuf:
+	.byte 0, 1, 4, 5, 2, 3, 6, 7, 8+0, 8+1, 8+4, 8+5, 8+2, 8+3, 8+6, 8+7
+
+.align 4
+/* 4-bit mask */
+.L0f0f0f0f:
+	.long 0x0f0f0f0f
+
+.text
+
+.align 8
+.type   __camellia_enc_blk16, at function;
+
+__camellia_enc_blk16:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rax: temporary storage, 256 bytes
+	 *	%xmm0..%xmm15: 16 plaintext blocks
+	 * output:
+	 *	%xmm0..%xmm15: 16 encrypted blocks, order swapped:
+	 *       7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
+	 */
+
+	leaq 8 * 16(%rax), %rcx;
+
+	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+		      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+		      %xmm15, %rax, %rcx);
+
+	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+		     %xmm15, %rax, %rcx, 0);
+
+	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+	      %xmm15,
+	      ((key_table + (8) * 8) + 0)(CTX),
+	      ((key_table + (8) * 8) + 4)(CTX),
+	      ((key_table + (8) * 8) + 8)(CTX),
+	      ((key_table + (8) * 8) + 12)(CTX));
+
+	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+		     %xmm15, %rax, %rcx, 8);
+
+	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+	      %xmm15,
+	      ((key_table + (16) * 8) + 0)(CTX),
+	      ((key_table + (16) * 8) + 4)(CTX),
+	      ((key_table + (16) * 8) + 8)(CTX),
+	      ((key_table + (16) * 8) + 12)(CTX));
+
+	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+		     %xmm15, %rax, %rcx, 16);
+
+	movl $24, %r8d;
+	cmpl $128, key_bitlength(CTX);
+	jne .Lenc_max32;
+
+.Lenc_done:
+	/* load CD for output */
+	vmovdqu 0 * 16(%rcx), %xmm8;
+	vmovdqu 1 * 16(%rcx), %xmm9;
+	vmovdqu 2 * 16(%rcx), %xmm10;
+	vmovdqu 3 * 16(%rcx), %xmm11;
+	vmovdqu 4 * 16(%rcx), %xmm12;
+	vmovdqu 5 * 16(%rcx), %xmm13;
+	vmovdqu 6 * 16(%rcx), %xmm14;
+	vmovdqu 7 * 16(%rcx), %xmm15;
+
+	outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+		    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+		    %xmm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 16(%rax));
+
+	ret;
+
+.align 8
+.Lenc_max32:
+	movl $32, %r8d;
+
+	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+	      %xmm15,
+	      ((key_table + (24) * 8) + 0)(CTX),
+	      ((key_table + (24) * 8) + 4)(CTX),
+	      ((key_table + (24) * 8) + 8)(CTX),
+	      ((key_table + (24) * 8) + 12)(CTX));
+
+	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+		     %xmm15, %rax, %rcx, 24);
+
+	jmp .Lenc_done;
+.size __camellia_enc_blk16,.-__camellia_enc_blk16;
+
+.align 8
+.type   __camellia_dec_blk16, at function;
+
+__camellia_dec_blk16:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rax: temporary storage, 256 bytes
+	 *	%r8d: 24 for 16-byte key, 32 for larger
+	 *	%xmm0..%xmm15: 16 encrypted blocks
+	 * output:
+	 *	%xmm0..%xmm15: 16 plaintext blocks, order swapped:
+	 *       7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
+	 */
+
+	leaq 8 * 16(%rax), %rcx;
+
+	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+		      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+		      %xmm15, %rax, %rcx);
+
+	cmpl $32, %r8d;
+	je .Ldec_max32;
+
+.Ldec_max24:
+	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+		     %xmm15, %rax, %rcx, 16);
+
+	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+	      %xmm15,
+	      ((key_table + (16) * 8) + 8)(CTX),
+	      ((key_table + (16) * 8) + 12)(CTX),
+	      ((key_table + (16) * 8) + 0)(CTX),
+	      ((key_table + (16) * 8) + 4)(CTX));
+
+	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+		     %xmm15, %rax, %rcx, 8);
+
+	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+	      %xmm15,
+	      ((key_table + (8) * 8) + 8)(CTX),
+	      ((key_table + (8) * 8) + 12)(CTX),
+	      ((key_table + (8) * 8) + 0)(CTX),
+	      ((key_table + (8) * 8) + 4)(CTX));
+
+	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+		     %xmm15, %rax, %rcx, 0);
+
+	/* load CD for output */
+	vmovdqu 0 * 16(%rcx), %xmm8;
+	vmovdqu 1 * 16(%rcx), %xmm9;
+	vmovdqu 2 * 16(%rcx), %xmm10;
+	vmovdqu 3 * 16(%rcx), %xmm11;
+	vmovdqu 4 * 16(%rcx), %xmm12;
+	vmovdqu 5 * 16(%rcx), %xmm13;
+	vmovdqu 6 * 16(%rcx), %xmm14;
+	vmovdqu 7 * 16(%rcx), %xmm15;
+
+	outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+		    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+		    %xmm15, (key_table)(CTX), (%rax), 1 * 16(%rax));
+
+	ret;
+
+.align 8
+.Ldec_max32:
+	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+		     %xmm15, %rax, %rcx, 24);
+
+	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+	      %xmm15,
+	      ((key_table + (24) * 8) + 8)(CTX),
+	      ((key_table + (24) * 8) + 12)(CTX),
+	      ((key_table + (24) * 8) + 0)(CTX),
+	      ((key_table + (24) * 8) + 4)(CTX));
+
+	jmp .Ldec_max24;
+.size __camellia_dec_blk16,.-__camellia_dec_blk16;
+
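+/* inc_le128 adds one to a little-endian 128-bit value held in an XMM
+ * register; in C terms: lo += 1; if (lo == 0) hi += 1;.  The minus_one
+ * register must hold {low: -1, high: 0}: vpcmpeqq flags a low qword
+ * that is about to wrap, vpsubq minus_one adds one to the low qword
+ * (x - (-1)), and the byte-shifted compare mask then carries one into
+ * the high qword.  */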
+#define inc_le128(x, minus_one, tmp) \
+	vpcmpeqq minus_one, x, tmp; \
+	vpsubq minus_one, x, x; \
+	vpslldq $8, tmp, tmp; \
+	vpsubq tmp, x, x;
+
+.align 8
+.global _gcry_camellia_aesni_avx_ctr_enc
+.type   _gcry_camellia_aesni_avx_ctr_enc, at function;
+
+_gcry_camellia_aesni_avx_ctr_enc:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (16 blocks)
+	 *	%rdx: src (16 blocks)
+	 *	%rcx: iv (big endian, 128bit)
+	 */
+
+	subq $(16 * 16), %rsp;
+	movq %rsp, %rax;
+
+	vmovdqa .Lbswap128_mask RIP, %xmm14;
+
+	/* load IV and byteswap */
+	vmovdqu (%rcx), %xmm15;
+	vmovdqu %xmm15, 15 * 16(%rax);
+	vpshufb %xmm14, %xmm15, %xmm0; /* be => le */
+
+	vpcmpeqd %xmm15, %xmm15, %xmm15;
+	vpsrldq $8, %xmm15, %xmm15; /* low: -1, high: 0 */
+
+	/* construct IVs */
+	inc_le128(%xmm0, %xmm15, %xmm13);
+	vpshufb %xmm14, %xmm0, %xmm13;
+	vmovdqu %xmm13, 14 * 16(%rax);
+	inc_le128(%xmm0, %xmm15, %xmm13);
+	vpshufb %xmm14, %xmm0, %xmm13;
+	vmovdqu %xmm13, 13 * 16(%rax);
+	inc_le128(%xmm0, %xmm15, %xmm13);
+	vpshufb %xmm14, %xmm0, %xmm12;
+	inc_le128(%xmm0, %xmm15, %xmm13);
+	vpshufb %xmm14, %xmm0, %xmm11;
+	inc_le128(%xmm0, %xmm15, %xmm13);
+	vpshufb %xmm14, %xmm0, %xmm10;
+	inc_le128(%xmm0, %xmm15, %xmm13);
+	vpshufb %xmm14, %xmm0, %xmm9;
+	inc_le128(%xmm0, %xmm15, %xmm13);
+	vpshufb %xmm14, %xmm0, %xmm8;
+	inc_le128(%xmm0, %xmm15, %xmm13);
+	vpshufb %xmm14, %xmm0, %xmm7;
+	inc_le128(%xmm0, %xmm15, %xmm13);
+	vpshufb %xmm14, %xmm0, %xmm6;
+	inc_le128(%xmm0, %xmm15, %xmm13);
+	vpshufb %xmm14, %xmm0, %xmm5;
+	inc_le128(%xmm0, %xmm15, %xmm13);
+	vpshufb %xmm14, %xmm0, %xmm4;
+	inc_le128(%xmm0, %xmm15, %xmm13);
+	vpshufb %xmm14, %xmm0, %xmm3;
+	inc_le128(%xmm0, %xmm15, %xmm13);
+	vpshufb %xmm14, %xmm0, %xmm2;
+	inc_le128(%xmm0, %xmm15, %xmm13);
+	vpshufb %xmm14, %xmm0, %xmm1;
+	inc_le128(%xmm0, %xmm15, %xmm13);
+	vmovdqa %xmm0, %xmm13;
+	vpshufb %xmm14, %xmm0, %xmm0;
+	inc_le128(%xmm13, %xmm15, %xmm14);
+	vpshufb .Lbswap128_mask RIP, %xmm13, %xmm13; /* le => be */
+	vmovdqu %xmm13, (%rcx);
+
+	/* inpack16_pre: */
+	vmovq (key_table)(CTX), %xmm15;
+	vpshufb .Lpack_bswap RIP, %xmm15, %xmm15;
+	vpxor %xmm0, %xmm15, %xmm0;
+	vpxor %xmm1, %xmm15, %xmm1;
+	vpxor %xmm2, %xmm15, %xmm2;
+	vpxor %xmm3, %xmm15, %xmm3;
+	vpxor %xmm4, %xmm15, %xmm4;
+	vpxor %xmm5, %xmm15, %xmm5;
+	vpxor %xmm6, %xmm15, %xmm6;
+	vpxor %xmm7, %xmm15, %xmm7;
+	vpxor %xmm8, %xmm15, %xmm8;
+	vpxor %xmm9, %xmm15, %xmm9;
+	vpxor %xmm10, %xmm15, %xmm10;
+	vpxor %xmm11, %xmm15, %xmm11;
+	vpxor %xmm12, %xmm15, %xmm12;
+	vpxor 13 * 16(%rax), %xmm15, %xmm13;
+	vpxor 14 * 16(%rax), %xmm15, %xmm14;
+	vpxor 15 * 16(%rax), %xmm15, %xmm15;
+
+	call __camellia_enc_blk16;
+
+	addq $(16 * 16), %rsp;
+
+	vpxor 0 * 16(%rdx), %xmm7, %xmm7;
+	vpxor 1 * 16(%rdx), %xmm6, %xmm6;
+	vpxor 2 * 16(%rdx), %xmm5, %xmm5;
+	vpxor 3 * 16(%rdx), %xmm4, %xmm4;
+	vpxor 4 * 16(%rdx), %xmm3, %xmm3;
+	vpxor 5 * 16(%rdx), %xmm2, %xmm2;
+	vpxor 6 * 16(%rdx), %xmm1, %xmm1;
+	vpxor 7 * 16(%rdx), %xmm0, %xmm0;
+	vpxor 8 * 16(%rdx), %xmm15, %xmm15;
+	vpxor 9 * 16(%rdx), %xmm14, %xmm14;
+	vpxor 10 * 16(%rdx), %xmm13, %xmm13;
+	vpxor 11 * 16(%rdx), %xmm12, %xmm12;
+	vpxor 12 * 16(%rdx), %xmm11, %xmm11;
+	vpxor 13 * 16(%rdx), %xmm10, %xmm10;
+	vpxor 14 * 16(%rdx), %xmm9, %xmm9;
+	vpxor 15 * 16(%rdx), %xmm8, %xmm8;
+
+	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
+		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
+		     %xmm8, %rsi);
+
+	ret;
+.size _gcry_camellia_aesni_avx_ctr_enc,.-_gcry_camellia_aesni_avx_ctr_enc;
+
+.align 8
+.global _gcry_camellia_aesni_avx_cbc_dec
+.type   _gcry_camellia_aesni_avx_cbc_dec, at function;
+
+_gcry_camellia_aesni_avx_cbc_dec:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (16 blocks)
+	 *	%rdx: src (16 blocks)
+	 *	%rcx: iv
+	 */
+
+	movq %rcx, %r9;
+
+	cmpl $128, key_bitlength(CTX);
+	movl $32, %r8d;
+	movl $24, %eax;
+	cmovel %eax, %r8d; /* max */
+
+	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+		     %xmm15, %rdx, (key_table)(CTX, %r8, 8));
+
+	subq $(16 * 16), %rsp;
+	movq %rsp, %rax;
+
+	call __camellia_dec_blk16;
+
+	addq $(16 * 16), %rsp;
+
+	/* XOR output with IV */
+	vpxor (%r9), %xmm7, %xmm7;
+	vpxor (0 * 16)(%rdx), %xmm6, %xmm6;
+	vpxor (1 * 16)(%rdx), %xmm5, %xmm5;
+	vpxor (2 * 16)(%rdx), %xmm4, %xmm4;
+	vpxor (3 * 16)(%rdx), %xmm3, %xmm3;
+	vpxor (4 * 16)(%rdx), %xmm2, %xmm2;
+	vpxor (5 * 16)(%rdx), %xmm1, %xmm1;
+	vpxor (6 * 16)(%rdx), %xmm0, %xmm0;
+	vpxor (7 * 16)(%rdx), %xmm15, %xmm15;
+	vpxor (8 * 16)(%rdx), %xmm14, %xmm14;
+	vpxor (9 * 16)(%rdx), %xmm13, %xmm13;
+	vpxor (10 * 16)(%rdx), %xmm12, %xmm12;
+	vpxor (11 * 16)(%rdx), %xmm11, %xmm11;
+	vpxor (12 * 16)(%rdx), %xmm10, %xmm10;
+	vpxor (13 * 16)(%rdx), %xmm9, %xmm9;
+	vpxor (14 * 16)(%rdx), %xmm8, %xmm8;
+	movq (15 * 16 + 0)(%rdx), %r10;
+	movq (15 * 16 + 8)(%rdx), %r11;
+	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
+		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
+		     %xmm8, %rsi);
+
+	/* store new IV */
+	movq %r10, (0)(%r9);
+	movq %r11, (8)(%r9);
+
+	ret;
+.size _gcry_camellia_aesni_avx_cbc_dec,.-_gcry_camellia_aesni_avx_cbc_dec;
+
+#endif /*defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX_SUPPORT)*/
+#endif /*__x86_64*/
diff --git a/configure.ac b/configure.ac
index 5e57868..4a4a2aa 100644
--- a/configure.ac
+++ b/configure.ac
@@ -557,6 +557,14 @@ if test x"$drngsupport" = xyes ; then
             [Enable support for Intel DRNG (RDRAND instruction).])
 fi
 
+# Implementation of the --disable-avx-support switch.
+AC_MSG_CHECKING([whether AVX support is requested])
+AC_ARG_ENABLE(avx-support,
+              AC_HELP_STRING([--disable-avx-support],
+                 [Disable support for the Intel AVX instructions]),
+	      avxsupport=$enableval,avxsupport=yes)
+AC_MSG_RESULT($avxsupport)
+
 # Implementation of the --disable-O-flag-munging switch.
 AC_MSG_CHECKING([whether a -O flag munging is requested])
 AC_ARG_ENABLE([O-flag-munging],
@@ -842,6 +850,32 @@ if test "$gcry_cv_have_asm" = "no" ; then
 fi
 
 
+#
+# Check whether GCC inline assembler supports AVX instructions
+#
+AC_CACHE_CHECK([whether GCC inline assembler supports AVX instructions],
+       [gcry_cv_gcc_inline_asm_avx],
+       [gcry_cv_gcc_inline_asm_avx=no
+        AC_COMPILE_IFELSE([AC_LANG_SOURCE(
+          [[void a(void) {
+              __asm__("vaesdeclast (%[mem]),%%xmm0,%%xmm7\n\t"::[mem]"r"(0):);
+            }]])],
+          [gcry_cv_gcc_inline_asm_avx=yes])])
+if test "$gcry_cv_gcc_inline_asm_avx" = "yes" ; then
+   AC_DEFINE(HAVE_GCC_INLINE_ASM_AVX,1,
+     [Defined if inline assembler supports AVX instructions])
+
+   if test x"$avxsupport" = xyes ; then
+      AC_DEFINE(ENABLE_AVX_SUPPORT,1,
+        [Enable support for Intel AVX instructions.])
+   fi
+else
+   if test x"$avxsupport" = xyes ; then
+      avxsupport="no (unsupported by compiler)"
+   fi
+fi
+
+
 #######################################
 #### Checks for library functions. ####
 #######################################
@@ -1139,6 +1173,13 @@ LIST_MEMBER(camellia, $enabled_ciphers)
 if test "$found" = "1" ; then
    GCRYPT_CIPHERS="$GCRYPT_CIPHERS camellia.lo camellia-glue.lo"
    AC_DEFINE(USE_CAMELLIA, 1, [Defined if this module should be included])
+
+   if test x"$avxsupport" = xyes ; then
+      if test x"$aesnisupport" = xyes ; then
+        # Build with the AES-NI/AVX implementation
+        GCRYPT_CIPHERS="$GCRYPT_CIPHERS camellia_aesni_avx_x86-64.lo"
+      fi
+   fi
 fi
 
 LIST_MEMBER(idea, $enabled_ciphers)
@@ -1358,6 +1399,7 @@ echo "
         Try using Padlock crypto:  $padlocksupport
         Try using AES-NI crypto:   $aesnisupport
         Try using DRNG (RDRAND):   $drngsupport
+        Try using Intel AVX:       $avxsupport
 "
 
 if test "$print_egd_notice" = "yes"; then
diff --git a/src/g10lib.h b/src/g10lib.h
index 5e99c46..da76c7b 100644
--- a/src/g10lib.h
+++ b/src/g10lib.h
@@ -152,6 +152,7 @@ int _gcry_log_verbosity( int level );
 
 #define HWF_INTEL_AESNI  256
 #define HWF_INTEL_RDRAND 512
+#define HWF_INTEL_AVX    1024
 
 
 unsigned int _gcry_get_hw_features (void);
diff --git a/src/global.c b/src/global.c
index b701dfc..a1a83e9 100644
--- a/src/global.c
+++ b/src/global.c
@@ -67,6 +67,7 @@ static struct
     { HWF_PADLOCK_MMUL,"padlock-mmul"},
     { HWF_INTEL_AESNI, "intel-aesni" },
     { HWF_INTEL_RDRAND,"intel-rdrand" },
+    { HWF_INTEL_AVX,   "intel-avx" },
     { 0, NULL}
   };
 
diff --git a/src/hwf-x86.c b/src/hwf-x86.c
index b50b4b0..1e6ec94 100644
--- a/src/hwf-x86.c
+++ b/src/hwf-x86.c
@@ -204,6 +204,11 @@ detect_x86_gnuc (void)
   if (features & 0x02000000)
      result |= HWF_INTEL_AESNI;
 #endif /*ENABLE_AESNI_SUPPORT*/
+#ifdef ENABLE_AVX_SUPPORT
+  /* Test bit 28 for AVX.  */
+  if (features & 0x10000000)
+     result |= HWF_INTEL_AVX;
+#endif /*ENABLE_AVX_SUPPORT*/
 #ifdef ENABLE_DRNG_SUPPORT
   /* Test bit 30 for RDRAND.  */
   if (features & 0x40000000)

commit 4de62d80644228fc5db2a9f9c94a7eb633d8de2e
Author: Jussi Kivilinna <jussi.kivilinna at mbnet.fi>
Date:   Wed Jan 23 11:55:08 2013 +0200

    camellia.c: Prepare for AES-NI/AVX implementation
    
    * cipher/camellia-glue.c (CAMELLIA_encrypt_stack_burn_size)
    (CAMELLIA_decrypt_stack_burn_size): Increase stack burn size.
    * cipher/camellia.c (CAMELLIA_ROUNDSM): Move key-material mixing in
    the front.
    (camellia_setup128, camellia_setup256): Remove now unneeded
    key-material mangling.
    (camellia_encrypt128, camellia_decrypt128, camellia_encrypt256)
    (camellia_decrypt256): Copy block to stack, so that compiler can
    optimize it for register usage.
    --
    
    The Camellia implementation needs to be modified slightly for compatibility
    with the AES-NI/AVX version.  Copying the block into a local u32 io[4]
    array lets the compiler keep it in registers; that extra array is also why
    the stack burn sizes grow by 4*sizeof(u32).
    
    Before:
    Running each test 100 times.
                    ECB/Stream         CBC             CFB             OFB             CTR
                 --------------- --------------- --------------- --------------- ---------------
    CAMELLIA128    800ms   790ms   840ms   730ms   810ms   800ms   820ms   820ms   730ms   740ms
    CAMELLIA192   1040ms  1040ms  1030ms   930ms  1000ms  1000ms  1020ms  1020ms   940ms   930ms
    CAMELLIA256   1000ms   980ms  1040ms   930ms  1010ms   990ms  1040ms  1040ms   940ms   930ms
    
    After:
    Running each test 100 times.
                    ECB/Stream         CBC             CFB             OFB             CTR
                 --------------- --------------- --------------- --------------- ---------------
    CAMELLIA128    780ms   750ms   810ms   690ms   780ms   770ms   810ms   810ms   700ms   690ms
    CAMELLIA192   1020ms   990ms  1000ms   890ms   970ms   970ms   990ms  1000ms   890ms   900ms
    CAMELLIA256    960ms   960ms  1000ms   900ms   970ms   970ms   990ms  1010ms   900ms   890ms
    
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna at mbnet.fi>

diff --git a/cipher/camellia-glue.c b/cipher/camellia-glue.c
index 686262a..ba8aa28 100644
--- a/cipher/camellia-glue.c
+++ b/cipher/camellia-glue.c
@@ -121,7 +121,7 @@ camellia_encrypt(void *c, byte *outbuf, const byte *inbuf)
 
 #define CAMELLIA_encrypt_stack_burn_size \
   (sizeof(int)+2*sizeof(unsigned char *)+sizeof(void*/*KEY_TABLE_TYPE*/) \
-     +4*sizeof(u32) \
+     +4*sizeof(u32)+4*sizeof(u32) \
      +2*sizeof(u32*)+4*sizeof(u32) \
      +2*2*sizeof(void*) /* Function calls.  */ \
     )
@@ -138,7 +138,7 @@ camellia_decrypt(void *c, byte *outbuf, const byte *inbuf)
 
 #define CAMELLIA_decrypt_stack_burn_size \
     (sizeof(int)+2*sizeof(unsigned char *)+sizeof(void*/*KEY_TABLE_TYPE*/) \
-     +4*sizeof(u32) \
+     +4*sizeof(u32)+4*sizeof(u32) \
      +2*sizeof(u32*)+4*sizeof(u32) \
      +2*2*sizeof(void*) /* Function calls.  */ \
     )
diff --git a/cipher/camellia.c b/cipher/camellia.c
index 2e28bce..42a9b73 100644
--- a/cipher/camellia.c
+++ b/cipher/camellia.c
@@ -151,6 +151,8 @@ typedef unsigned char u8;
 
 #define CAMELLIA_ROUNDSM(xl, xr, kl, kr, yl, yr, il, ir, t0, t1)	\
     do {								\
+	yl ^= kl;							\
+	yr ^= kr;							\
 	ir = CAMELLIA_SP1110(xr & 0xff)					\
 	    ^ CAMELLIA_SP0222((xr >> 24) & 0xff)			\
 	    ^ CAMELLIA_SP3033((xr >> 16) & 0xff)			\
@@ -159,8 +161,6 @@ typedef unsigned char u8;
 	    ^ CAMELLIA_SP0222((xl >> 16) & 0xff)			\
 	    ^ CAMELLIA_SP3033((xl >> 8) & 0xff)				\
 	    ^ CAMELLIA_SP4404(xl & 0xff);				\
-	il ^= kl;							\
-	ir ^= kr;							\
 	ir ^= il;							\
 	il = CAMELLIA_RR8(il);						\
 	il ^= ir;							\
@@ -614,44 +614,6 @@ void camellia_setup128(const unsigned char *key, u32 *subkey)
     CamelliaSubkeyL(24) = subl(24) ^ subl(23);
     CamelliaSubkeyR(24) = subr(24) ^ subr(23);
 
-    /* apply the inverse of the last half of P-function */
-    dw = CamelliaSubkeyL(2) ^ CamelliaSubkeyR(2), dw = CAMELLIA_RL8(dw);
-    CamelliaSubkeyR(2) = CamelliaSubkeyL(2) ^ dw, CamelliaSubkeyL(2) = dw;
-    dw = CamelliaSubkeyL(3) ^ CamelliaSubkeyR(3), dw = CAMELLIA_RL8(dw);
-    CamelliaSubkeyR(3) = CamelliaSubkeyL(3) ^ dw, CamelliaSubkeyL(3) = dw;
-    dw = CamelliaSubkeyL(4) ^ CamelliaSubkeyR(4), dw = CAMELLIA_RL8(dw);
-    CamelliaSubkeyR(4) = CamelliaSubkeyL(4) ^ dw, CamelliaSubkeyL(4) = dw;
-    dw = CamelliaSubkeyL(5) ^ CamelliaSubkeyR(5), dw = CAMELLIA_RL8(dw);
-    CamelliaSubkeyR(5) = CamelliaSubkeyL(5) ^ dw, CamelliaSubkeyL(5) = dw;
-    dw = CamelliaSubkeyL(6) ^ CamelliaSubkeyR(6), dw = CAMELLIA_RL8(dw);
-    CamelliaSubkeyR(6) = CamelliaSubkeyL(6) ^ dw, CamelliaSubkeyL(6) = dw;
-    dw = CamelliaSubkeyL(7) ^ CamelliaSubkeyR(7), dw = CAMELLIA_RL8(dw);
-    CamelliaSubkeyR(7) = CamelliaSubkeyL(7) ^ dw, CamelliaSubkeyL(7) = dw;
-    dw = CamelliaSubkeyL(10) ^ CamelliaSubkeyR(10), dw = CAMELLIA_RL8(dw);
-    CamelliaSubkeyR(10) = CamelliaSubkeyL(10) ^ dw, CamelliaSubkeyL(10) = dw;
-    dw = CamelliaSubkeyL(11) ^ CamelliaSubkeyR(11), dw = CAMELLIA_RL8(dw);
-    CamelliaSubkeyR(11) = CamelliaSubkeyL(11) ^ dw, CamelliaSubkeyL(11) = dw;
-    dw = CamelliaSubkeyL(12) ^ CamelliaSubkeyR(12), dw = CAMELLIA_RL8(dw);
-    CamelliaSubkeyR(12) = CamelliaSubkeyL(12) ^ dw, CamelliaSubkeyL(12) = dw;
-    dw = CamelliaSubkeyL(13) ^ CamelliaSubkeyR(13), dw = CAMELLIA_RL8(dw);
-    CamelliaSubkeyR(13) = CamelliaSubkeyL(13) ^ dw, CamelliaSubkeyL(13) = dw;
-    dw = CamelliaSubkeyL(14) ^ CamelliaSubkeyR(14), dw = CAMELLIA_RL8(dw);
-    CamelliaSubkeyR(14) = CamelliaSubkeyL(14) ^ dw, CamelliaSubkeyL(14) = dw;
-    dw = CamelliaSubkeyL(15) ^ CamelliaSubkeyR(15), dw = CAMELLIA_RL8(dw);
-    CamelliaSubkeyR(15) = CamelliaSubkeyL(15) ^ dw, CamelliaSubkeyL(15) = dw;
-    dw = CamelliaSubkeyL(18) ^ CamelliaSubkeyR(18), dw = CAMELLIA_RL8(dw);
-    CamelliaSubkeyR(18) = CamelliaSubkeyL(18) ^ dw, CamelliaSubkeyL(18) = dw;
-    dw = CamelliaSubkeyL(19) ^ CamelliaSubkeyR(19), dw = CAMELLIA_RL8(dw);
-    CamelliaSubkeyR(19) = CamelliaSubkeyL(19) ^ dw, CamelliaSubkeyL(19) = dw;
-    dw = CamelliaSubkeyL(20) ^ CamelliaSubkeyR(20), dw = CAMELLIA_RL8(dw);
-    CamelliaSubkeyR(20) = CamelliaSubkeyL(20) ^ dw, CamelliaSubkeyL(20) = dw;
-    dw = CamelliaSubkeyL(21) ^ CamelliaSubkeyR(21), dw = CAMELLIA_RL8(dw);
-    CamelliaSubkeyR(21) = CamelliaSubkeyL(21) ^ dw, CamelliaSubkeyL(21) = dw;
-    dw = CamelliaSubkeyL(22) ^ CamelliaSubkeyR(22), dw = CAMELLIA_RL8(dw);
-    CamelliaSubkeyR(22) = CamelliaSubkeyL(22) ^ dw, CamelliaSubkeyL(22) = dw;
-    dw = CamelliaSubkeyL(23) ^ CamelliaSubkeyR(23), dw = CAMELLIA_RL8(dw);
-    CamelliaSubkeyR(23) = CamelliaSubkeyL(23) ^ dw, CamelliaSubkeyL(23) = dw;
-
     return;
 }
 
@@ -888,56 +850,6 @@ void camellia_setup256(const unsigned char *key, u32 *subkey)
     CamelliaSubkeyL(32) = subl(32) ^ subl(31);
     CamelliaSubkeyR(32) = subr(32) ^ subr(31);
 
-    /* apply the inverse of the last half of P-function */
-    dw = CamelliaSubkeyL(2) ^ CamelliaSubkeyR(2), dw = CAMELLIA_RL8(dw);
-    CamelliaSubkeyR(2) = CamelliaSubkeyL(2) ^ dw, CamelliaSubkeyL(2) = dw;
-    dw = CamelliaSubkeyL(3) ^ CamelliaSubkeyR(3), dw = CAMELLIA_RL8(dw);
-    CamelliaSubkeyR(3) = CamelliaSubkeyL(3) ^ dw, CamelliaSubkeyL(3) = dw;
-    dw = CamelliaSubkeyL(4) ^ CamelliaSubkeyR(4), dw = CAMELLIA_RL8(dw);
-    CamelliaSubkeyR(4) = CamelliaSubkeyL(4) ^ dw, CamelliaSubkeyL(4) = dw;
-    dw = CamelliaSubkeyL(5) ^ CamelliaSubkeyR(5), dw = CAMELLIA_RL8(dw);
-    CamelliaSubkeyR(5) = CamelliaSubkeyL(5) ^ dw, CamelliaSubkeyL(5) = dw;
-    dw = CamelliaSubkeyL(6) ^ CamelliaSubkeyR(6), dw = CAMELLIA_RL8(dw);
-    CamelliaSubkeyR(6) = CamelliaSubkeyL(6) ^ dw, CamelliaSubkeyL(6) = dw;
-    dw = CamelliaSubkeyL(7) ^ CamelliaSubkeyR(7), dw = CAMELLIA_RL8(dw);
-    CamelliaSubkeyR(7) = CamelliaSubkeyL(7) ^ dw, CamelliaSubkeyL(7) = dw;
-    dw = CamelliaSubkeyL(10) ^ CamelliaSubkeyR(10), dw = CAMELLIA_RL8(dw);
-    CamelliaSubkeyR(10) = CamelliaSubkeyL(10) ^ dw, CamelliaSubkeyL(10) = dw;
-    dw = CamelliaSubkeyL(11) ^ CamelliaSubkeyR(11), dw = CAMELLIA_RL8(dw);
-    CamelliaSubkeyR(11) = CamelliaSubkeyL(11) ^ dw, CamelliaSubkeyL(11) = dw;
-    dw = CamelliaSubkeyL(12) ^ CamelliaSubkeyR(12), dw = CAMELLIA_RL8(dw);
-    CamelliaSubkeyR(12) = CamelliaSubkeyL(12) ^ dw, CamelliaSubkeyL(12) = dw;
-    dw = CamelliaSubkeyL(13) ^ CamelliaSubkeyR(13), dw = CAMELLIA_RL8(dw);
-    CamelliaSubkeyR(13) = CamelliaSubkeyL(13) ^ dw, CamelliaSubkeyL(13) = dw;
-    dw = CamelliaSubkeyL(14) ^ CamelliaSubkeyR(14), dw = CAMELLIA_RL8(dw);
-    CamelliaSubkeyR(14) = CamelliaSubkeyL(14) ^ dw, CamelliaSubkeyL(14) = dw;
-    dw = CamelliaSubkeyL(15) ^ CamelliaSubkeyR(15), dw = CAMELLIA_RL8(dw);
-    CamelliaSubkeyR(15) = CamelliaSubkeyL(15) ^ dw, CamelliaSubkeyL(15) = dw;
-    dw = CamelliaSubkeyL(18) ^ CamelliaSubkeyR(18), dw = CAMELLIA_RL8(dw);
-    CamelliaSubkeyR(18) = CamelliaSubkeyL(18) ^ dw, CamelliaSubkeyL(18) = dw;
-    dw = CamelliaSubkeyL(19) ^ CamelliaSubkeyR(19), dw = CAMELLIA_RL8(dw);
-    CamelliaSubkeyR(19) = CamelliaSubkeyL(19) ^ dw, CamelliaSubkeyL(19) = dw;
-    dw = CamelliaSubkeyL(20) ^ CamelliaSubkeyR(20), dw = CAMELLIA_RL8(dw);
-    CamelliaSubkeyR(20) = CamelliaSubkeyL(20) ^ dw, CamelliaSubkeyL(20) = dw;
-    dw = CamelliaSubkeyL(21) ^ CamelliaSubkeyR(21), dw = CAMELLIA_RL8(dw);
-    CamelliaSubkeyR(21) = CamelliaSubkeyL(21) ^ dw, CamelliaSubkeyL(21) = dw;
-    dw = CamelliaSubkeyL(22) ^ CamelliaSubkeyR(22), dw = CAMELLIA_RL8(dw);
-    CamelliaSubkeyR(22) = CamelliaSubkeyL(22) ^ dw, CamelliaSubkeyL(22) = dw;
-    dw = CamelliaSubkeyL(23) ^ CamelliaSubkeyR(23), dw = CAMELLIA_RL8(dw);
-    CamelliaSubkeyR(23) = CamelliaSubkeyL(23) ^ dw, CamelliaSubkeyL(23) = dw;
-    dw = CamelliaSubkeyL(26) ^ CamelliaSubkeyR(26), dw = CAMELLIA_RL8(dw);
-    CamelliaSubkeyR(26) = CamelliaSubkeyL(26) ^ dw, CamelliaSubkeyL(26) = dw;
-    dw = CamelliaSubkeyL(27) ^ CamelliaSubkeyR(27), dw = CAMELLIA_RL8(dw);
-    CamelliaSubkeyR(27) = CamelliaSubkeyL(27) ^ dw, CamelliaSubkeyL(27) = dw;
-    dw = CamelliaSubkeyL(28) ^ CamelliaSubkeyR(28), dw = CAMELLIA_RL8(dw);
-    CamelliaSubkeyR(28) = CamelliaSubkeyL(28) ^ dw, CamelliaSubkeyL(28) = dw;
-    dw = CamelliaSubkeyL(29) ^ CamelliaSubkeyR(29), dw = CAMELLIA_RL8(dw);
-    CamelliaSubkeyR(29) = CamelliaSubkeyL(29) ^ dw, CamelliaSubkeyL(29) = dw;
-    dw = CamelliaSubkeyL(30) ^ CamelliaSubkeyR(30), dw = CAMELLIA_RL8(dw);
-    CamelliaSubkeyR(30) = CamelliaSubkeyL(30) ^ dw, CamelliaSubkeyL(30) = dw;
-    dw = CamelliaSubkeyL(31) ^ CamelliaSubkeyR(31), dw = CAMELLIA_RL8(dw);
-    CamelliaSubkeyR(31) = CamelliaSubkeyL(31) ^ dw,CamelliaSubkeyL(31) = dw;
-
     return;
 }
 
@@ -963,9 +875,15 @@ void camellia_setup192(const unsigned char *key, u32 *subkey)
  *
  * "io" must be 4-byte aligned and big-endian data.
  */
-void camellia_encrypt128(const u32 *subkey, u32 *io)
+void camellia_encrypt128(const u32 *subkey, u32 *blocks)
 {
     u32 il, ir, t0, t1;
+    u32 io[4];
+
+    io[0] = blocks[0];
+    io[1] = blocks[1];
+    io[2] = blocks[2];
+    io[3] = blocks[3];
 
     /* pre whitening but absorb kw2*/
     io[0] ^= CamelliaSubkeyL(0);
@@ -1050,13 +968,24 @@ void camellia_encrypt128(const u32 *subkey, u32 *io)
     io[2] = t0;
     io[3] = t1;
 
+    blocks[0] = io[0];
+    blocks[1] = io[1];
+    blocks[2] = io[2];
+    blocks[3] = io[3];
+
     return;
 }
 
-void camellia_decrypt128(const u32 *subkey, u32 *io)
+void camellia_decrypt128(const u32 *subkey, u32 *blocks)
 {
     u32 il,ir,t0,t1;               /* temporary variables */
+    u32 io[4];
 
+    io[0] = blocks[0];
+    io[1] = blocks[1];
+    io[2] = blocks[2];
+    io[3] = blocks[3];
+ 
     /* pre whitening but absorb kw2*/
     io[0] ^= CamelliaSubkeyL(24);
     io[1] ^= CamelliaSubkeyR(24);
@@ -1140,15 +1069,26 @@ void camellia_decrypt128(const u32 *subkey, u32 *io)
     io[2] = t0;
     io[3] = t1;
 
+    blocks[0] = io[0];
+    blocks[1] = io[1];
+    blocks[2] = io[2];
+    blocks[3] = io[3];
+
     return;
 }
 
 /**
  * stuff for 192 and 256bit encryption/decryption
  */
-void camellia_encrypt256(const u32 *subkey, u32 *io)
+void camellia_encrypt256(const u32 *subkey, u32 *blocks)
 {
     u32 il,ir,t0,t1;           /* temporary variables */
+    u32 io[4];
+
+    io[0] = blocks[0];
+    io[1] = blocks[1];
+    io[2] = blocks[2];
+    io[3] = blocks[3];
 
     /* pre whitening but absorb kw2*/
     io[0] ^= CamelliaSubkeyL(0);
@@ -1257,12 +1197,23 @@ void camellia_encrypt256(const u32 *subkey, u32 *io)
     io[2] = t0;
     io[3] = t1;
 
+    blocks[0] = io[0];
+    blocks[1] = io[1];
+    blocks[2] = io[2];
+    blocks[3] = io[3];
+
     return;
 }
 
-void camellia_decrypt256(const u32 *subkey, u32 *io)
+void camellia_decrypt256(const u32 *subkey, u32 *blocks)
 {
     u32 il,ir,t0,t1;           /* temporary variables */
+    u32 io[4];
+
+    io[0] = blocks[0];
+    io[1] = blocks[1];
+    io[2] = blocks[2];
+    io[3] = blocks[3];
 
     /* pre whitening but absorb kw2 */
     io[0] ^= CamelliaSubkeyL(32);
@@ -1371,6 +1322,11 @@ void camellia_decrypt256(const u32 *subkey, u32 *io)
     io[2] = t0;
     io[3] = t1;
 
+    blocks[0] = io[0];
+    blocks[1] = io[1];
+    blocks[2] = io[2];
+    blocks[3] = io[3];
+
     return;
 }
 

commit 537f12ce072d568f9fa344c447d32b2e0efffbe8
Author: Jussi Kivilinna <jussi.kivilinna at mbnet.fi>
Date:   Wed Jan 23 11:55:03 2013 +0200

    Camellia, prepare glue code for AES-NI/AVX implementation
    
    * cipher/camellia-glue.c (ATTR_ALIGNED_16): Add macro.
    (CAMELLIA_encrypt_stack_burn_size): Add macro.
    (camellia_encrypt): Use macro above for stack burn size.
    (CAMELLIA_decrypt_stack_burn_size): Add macro.
    (camellia_decrypt): Use macro above for stack burn size.
    (_gcry_camellia_ctr_enc): New function.
    (_gcry_camellia_cbc_dec): New function.
    (selftest_ctr_128): New function.
    (selftest): Call function above.
    * cipher/cipher.c (gcry_cipher_open) [USE_CAMELLIA]: Register bulk
    functions for CBC-decryption and CTR-mode.
    * src/cipher.h (_gcry_camellia_ctr_enc): New prototype.
    (_gcry_camellia_cbc_dec): New prototype.
    --
    
    The AES-NI/AVX implementation needs multi-block input, so prepare the glue
    code for it by adding bulk functions for CBC decryption and CTR mode.
    
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna at mbnet.fi>
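
    The bulk path is reached through the normal public API; nothing changes
    for callers.  A minimal usage sketch (hypothetical ctr_example() wrapper,
    demo all-zero key, error handling omitted) of a CTR encryption that
    cipher.c can hand to _gcry_camellia_ctr_enc in a single call:

        #include <string.h>
        #include <gcrypt.h>

        static void
        ctr_example (void)
        {
          gcry_cipher_hd_t hd;
          unsigned char key[16] = { 0 };   /* demo key only */
          unsigned char ctr[16] = { 0 };
          unsigned char buf[16 * 16];      /* 16 complete blocks */

          memset (buf, 0, sizeof buf);
          gcry_cipher_open (&hd, GCRY_CIPHER_CAMELLIA128,
                            GCRY_CIPHER_MODE_CTR, 0);
          gcry_cipher_setkey (hd, key, sizeof key);
          gcry_cipher_setctr (hd, ctr, sizeof ctr);
          /* In-place encryption of the whole buffer in one call.  */
          gcry_cipher_encrypt (hd, buf, sizeof buf, NULL, 0);
          gcry_cipher_close (hd);
        }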

diff --git a/cipher/camellia-glue.c b/cipher/camellia-glue.c
index c5019d0..686262a 100644
--- a/cipher/camellia-glue.c
+++ b/cipher/camellia-glue.c
@@ -62,6 +62,14 @@
 #include "g10lib.h"
 #include "cipher.h"
 #include "camellia.h"
+#include "bufhelp.h"
+
+/* Helper macro to force alignment to 16 bytes.  */
+#ifdef HAVE_GCC_ATTRIBUTE_ALIGNED
+# define ATTR_ALIGNED_16  __attribute__ ((aligned (16)))
+#else
+# define ATTR_ALIGNED_16
+#endif
 
 typedef struct
 {
@@ -110,12 +118,15 @@ camellia_encrypt(void *c, byte *outbuf, const byte *inbuf)
   CAMELLIA_context *ctx=c;
 
   Camellia_EncryptBlock(ctx->keybitlength,inbuf,ctx->keytable,outbuf);
-  _gcry_burn_stack
-    (sizeof(int)+2*sizeof(unsigned char *)+sizeof(void*/*KEY_TABLE_TYPE*/)
-     +4*sizeof(u32)
-     +2*sizeof(u32*)+4*sizeof(u32)
-     +2*2*sizeof(void*) /* Function calls.  */
-    );
+
+#define CAMELLIA_encrypt_stack_burn_size \
+  (sizeof(int)+2*sizeof(unsigned char *)+sizeof(void*/*KEY_TABLE_TYPE*/) \
+     +4*sizeof(u32) \
+     +2*sizeof(u32*)+4*sizeof(u32) \
+     +2*2*sizeof(void*) /* Function calls.  */ \
+    )
+
+  _gcry_burn_stack(CAMELLIA_encrypt_stack_burn_size);
 }
 
 static void
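
The stack-burn size above estimates how many bytes of stack the encryption
path may have touched with sensitive intermediates; turning the expression
into a macro lets the new bulk functions below reuse it.  As a worked
example (the target is an assumption, not part of the commit), on an LP64
platform with sizeof(int) == 4, pointers of 8 bytes and u32 of 4 bytes the
macro evaluates to:

    4 + 2*8 + 8        (int, two byte pointers, key-table pointer)
    + 4*4              (four u32 temporaries)
    + 2*8 + 4*4        (two u32 pointers, four more u32s)
    + 2*2*8            (two levels of function-call overhead)
    = 108 bytes overwritten by _gcry_burn_stack.
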
@@ -124,12 +135,175 @@ camellia_decrypt(void *c, byte *outbuf, const byte *inbuf)
   CAMELLIA_context *ctx=c;
 
   Camellia_DecryptBlock(ctx->keybitlength,inbuf,ctx->keytable,outbuf);
-  _gcry_burn_stack
-    (sizeof(int)+2*sizeof(unsigned char *)+sizeof(void*/*KEY_TABLE_TYPE*/)
-     +4*sizeof(u32)
-     +2*sizeof(u32*)+4*sizeof(u32)
-     +2*2*sizeof(void*) /* Function calls.  */
-    );
+
+#define CAMELLIA_decrypt_stack_burn_size \
+    (sizeof(int)+2*sizeof(unsigned char *)+sizeof(void*/*KEY_TABLE_TYPE*/) \
+     +4*sizeof(u32) \
+     +2*sizeof(u32*)+4*sizeof(u32) \
+     +2*2*sizeof(void*) /* Function calls.  */ \
+    )
+
+  _gcry_burn_stack(CAMELLIA_decrypt_stack_burn_size);
+}
+
+/* Bulk encryption of complete blocks in CTR mode.  This function is only
+   intended for the bulk encryption feature of cipher.c.  CTR is expected to be
+   of size CAMELLIA_BLOCK_SIZE. */
+void
+_gcry_camellia_ctr_enc(void *context, unsigned char *ctr,
+                       void *outbuf_arg, const void *inbuf_arg,
+                       unsigned int nblocks)
+{
+  CAMELLIA_context *ctx = context;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+  unsigned char tmpbuf[CAMELLIA_BLOCK_SIZE];
+  int i;
+
+  for ( ;nblocks; nblocks-- )
+    {
+      /* Encrypt the counter. */
+      Camellia_EncryptBlock(ctx->keybitlength, ctr, ctx->keytable, tmpbuf);
+      /* XOR the input with the encrypted counter and store in output.  */
+      buf_xor(outbuf, tmpbuf, inbuf, CAMELLIA_BLOCK_SIZE);
+      outbuf += CAMELLIA_BLOCK_SIZE;
+      inbuf  += CAMELLIA_BLOCK_SIZE;
+      /* Increment the counter.  */
+      for (i = CAMELLIA_BLOCK_SIZE; i > 0; i--)
+        {
+          ctr[i-1]++;
+          if (ctr[i-1])
+            break;
+        }
+    }
+
+  wipememory(tmpbuf, sizeof(tmpbuf));
+  _gcry_burn_stack(CAMELLIA_encrypt_stack_burn_size);
+}
+
+/* Bulk decryption of complete blocks in CBC mode.  This function is only
+   intended for the bulk encryption feature of cipher.c. */
+void
+_gcry_camellia_cbc_dec(void *context, unsigned char *iv,
+                       void *outbuf_arg, const void *inbuf_arg,
+                       unsigned int nblocks)
+{
+  CAMELLIA_context *ctx = context;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+  unsigned char savebuf[CAMELLIA_BLOCK_SIZE];
+
+  for ( ;nblocks; nblocks-- )
+    {
+      /* We need to save INBUF away because it may be identical to
+         OUTBUF.  */
+      memcpy(savebuf, inbuf, CAMELLIA_BLOCK_SIZE);
+
+      Camellia_DecryptBlock(ctx->keybitlength, inbuf, ctx->keytable, outbuf);
+
+      buf_xor(outbuf, outbuf, iv, CAMELLIA_BLOCK_SIZE);
+      memcpy(iv, savebuf, CAMELLIA_BLOCK_SIZE);
+      inbuf += CAMELLIA_BLOCK_SIZE;
+      outbuf += CAMELLIA_BLOCK_SIZE;
+    }
+
+  wipememory(savebuf, sizeof(savebuf));
+  _gcry_burn_stack(CAMELLIA_decrypt_stack_burn_size);
+}
+
+/* Run the self-tests for CAMELLIA-CTR-128; tests the IV increment of the
+   bulk CTR encryption.  Returns NULL on success. */
+static const char*
+selftest_ctr_128 (void)
+{
+  const int nblocks = 16+1;
+  CAMELLIA_context ctx ATTR_ALIGNED_16;
+  unsigned char plaintext[nblocks*16] ATTR_ALIGNED_16;
+  unsigned char ciphertext[nblocks*16] ATTR_ALIGNED_16;
+  unsigned char plaintext2[nblocks*16] ATTR_ALIGNED_16;
+  unsigned char iv[16] ATTR_ALIGNED_16;
+  unsigned char iv2[16] ATTR_ALIGNED_16;
+  int i, j, diff;
+
+  static const unsigned char key[16] ATTR_ALIGNED_16 = {
+      0x06,0x9A,0x00,0x7F,0xC7,0x6A,0x45,0x9F,
+      0x98,0xBA,0xF9,0x17,0xFE,0xDF,0x95,0x21
+    };
+  static char error_str[128];
+
+  camellia_setkey (&ctx, key, sizeof (key));
+
+  /* Test single block code path */
+  memset(iv, 0xff, sizeof(iv));
+  for (i = 0; i < 16; i++)
+    plaintext[i] = i;
+
+  /* CTR manually.  */
+  camellia_encrypt (&ctx, ciphertext, iv);
+  for (i = 0; i < 16; i++)
+    ciphertext[i] ^= plaintext[i];
+  for (i = 16; i > 0; i--)
+    {
+      iv[i-1]++;
+      if (iv[i-1])
+        break;
+    }
+
+  memset(iv2, 0xff, sizeof(iv2));
+  _gcry_camellia_ctr_enc (&ctx, iv2, plaintext2, ciphertext, 1);
+
+  if (memcmp(plaintext2, plaintext, 16))
+    return "CAMELLIA-128-CTR test failed (plaintext mismatch)";
+
+  if (memcmp(iv2, iv, 16))
+    return "CAMELLIA-128-CTR test failed (IV mismatch)";
+
+  /* Test parallelized code paths */
+  for (diff = 0; diff < nblocks; diff++) {
+    memset(iv, 0xff, sizeof(iv));
+    iv[15] -= diff;
+
+    for (i = 0; i < sizeof(plaintext); i++)
+      plaintext[i] = i;
+
+    /* Create CTR ciphertext manually.  */
+    for (i = 0; i < sizeof(plaintext); i+=16)
+      {
+        camellia_encrypt (&ctx, &ciphertext[i], iv);
+        for (j = 0; j < 16; j++)
+          ciphertext[i+j] ^= plaintext[i+j];
+        for (j = 16; j > 0; j--)
+          {
+            iv[j-1]++;
+            if (iv[j-1])
+              break;
+          }
+      }
+
+    /* Decrypt using bulk CTR and compare result.  */
+    memset(iv2, 0xff, sizeof(iv2));
+    iv2[15] -= diff;
+
+    _gcry_camellia_ctr_enc (&ctx, iv2, plaintext2, ciphertext,
+                            sizeof(ciphertext) / CAMELLIA_BLOCK_SIZE);
+
+    if (memcmp(plaintext2, plaintext, sizeof(plaintext)))
+      {
+        snprintf(error_str, sizeof(error_str),
+                 "CAMELLIA-128-CTR test failed (plaintext mismatch, diff: %d)",
+                 diff);
+        return error_str;
+      }
+    if (memcmp(iv2, iv, sizeof(iv)))
+      {
+        snprintf(error_str, sizeof(error_str),
+                 "CAMELLIA-128-CTR test failed (IV mismatch, diff: %d)",
+                 diff);
+        return error_str;
+      }
+  }
+
+  return NULL;
 }
 
 static const char *
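
The counter update in _gcry_camellia_ctr_enc above treats the 16-byte
counter as one big-endian integer: increment the last byte and propagate
the carry towards the front.  A standalone sketch of the same loop, under a
hypothetical ctr_increment() name:

    #include <stddef.h>

    /* Increment a LEN-byte big-endian counter by one.  */
    static void
    ctr_increment (unsigned char *ctr, size_t len)
    {
      size_t i;

      for (i = len; i > 0; i--)
        {
          ctr[i-1]++;
          if (ctr[i-1])   /* no wrap to zero: carry absorbed, stop */
            break;
        }
    }

For example, a counter of all 0xff bytes wraps around to all zeros; the
"diff" loop in selftest_ctr_128 starts the counter diff steps before that
wrap precisely to exercise the carry handling of the bulk code path.
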
@@ -137,6 +311,7 @@ selftest(void)
 {
   CAMELLIA_context ctx;
   byte scratch[16];
+  const char *r;
 
   /* These test vectors are from RFC-3713 */
   const byte plaintext[]=
@@ -200,6 +375,9 @@ selftest(void)
   if(memcmp(scratch,plaintext,sizeof(plaintext))!=0)
     return "CAMELLIA-256 test decryption failed.";
 
+  if ( (r = selftest_ctr_128 ()) )
+    return r;
+
   return NULL;
 }
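
The savebuf copy in _gcry_camellia_cbc_dec above is what makes in-place
operation safe: decrypting into outbuf would otherwise destroy the
ciphertext block that must be carried forward as the IV for the next block.
A minimal per-block sketch with hypothetical names; decrypt_block() is an
identity stand-in for Camellia_DecryptBlock:

    #include <string.h>

    #define BLOCKSIZE 16

    /* Identity stand-in for the real block decryption.  */
    static void
    decrypt_block (unsigned char *out, const unsigned char *in)
    {
      memmove (out, in, BLOCKSIZE);
    }

    /* CBC: P[i] = D(C[i]) ^ C[i-1]; IV carries C[i-1] across calls.  */
    static void
    cbc_dec_one (unsigned char *iv, unsigned char *out,
                 const unsigned char *in)
    {
      unsigned char save[BLOCKSIZE];
      int i;

      memcpy (save, in, BLOCKSIZE);  /* IN may alias OUT */
      decrypt_block (out, in);
      for (i = 0; i < BLOCKSIZE; i++)
        out[i] ^= iv[i];
      memcpy (iv, save, BLOCKSIZE);  /* chain into the next block */
    }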
 
diff --git a/cipher/cipher.c b/cipher/cipher.c
index 389bf7a..f1224af 100644
--- a/cipher/cipher.c
+++ b/cipher/cipher.c
@@ -718,6 +718,14 @@ gcry_cipher_open (gcry_cipher_hd_t *handle,
               h->bulk.ctr_enc = _gcry_aes_ctr_enc;
               break;
 #endif /*USE_AES*/
+#ifdef USE_CAMELLIA
+	    case GCRY_CIPHER_CAMELLIA128:
+	    case GCRY_CIPHER_CAMELLIA192:
+	    case GCRY_CIPHER_CAMELLIA256:
+              h->bulk.cbc_dec = _gcry_camellia_cbc_dec;
+              h->bulk.ctr_enc = _gcry_camellia_ctr_enc;
+              break;
+#endif /*USE_CAMELLIA*/
 
             default:
               break;
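
The cases added above populate per-handle function pointers; the mode
implementations then take the registered fast path when the pointer is
non-NULL and otherwise fall back to the generic block-by-block loop.  A
reduced sketch of that dispatch idea (hypothetical types and names, not the
actual cipher.c internals):

    typedef void (*bulk_ctr_enc_t) (void *ctx, unsigned char *ctr,
                                    void *outbuf, const void *inbuf,
                                    unsigned int nblocks);

    struct handle
    {
      void *ctx;
      bulk_ctr_enc_t bulk_ctr_enc;   /* NULL when no fast path exists */
    };

    static void
    do_ctr_encrypt (struct handle *h, unsigned char *ctr,
                    unsigned char *outbuf, const unsigned char *inbuf,
                    unsigned int nblocks)
    {
      if (h->bulk_ctr_enc)
        h->bulk_ctr_enc (h->ctx, ctr, outbuf, inbuf, nblocks);
      else
        {
          /* Generic fallback: encrypt the counter and XOR it in,
             one block at a time.  */
        }
    }
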
diff --git a/src/cipher.h b/src/cipher.h
index 48eeeda..17fec6c 100644
--- a/src/cipher.h
+++ b/src/cipher.h
@@ -95,6 +95,14 @@ void _gcry_aes_ctr_enc (void *context, unsigned char *ctr,
                         void *outbuf_arg, const void *inbuf_arg,
                         unsigned int nblocks);
 
+/*-- camellia-glue.c --*/
+void _gcry_camellia_ctr_enc (void *context, unsigned char *ctr,
+                             void *outbuf_arg, const void *inbuf_arg,
+                             unsigned int nblocks);
+void _gcry_camellia_cbc_dec (void *context, unsigned char *iv,
+                             void *outbuf_arg, const void *inbuf_arg,
+                             unsigned int nblocks);
+
 
 /*-- dsa.c --*/
 void _gcry_register_pk_dsa_progress (gcry_handler_progress_t cbc, void *cb_data);

-----------------------------------------------------------------------

Summary of changes:
 cipher/Makefile.am                 |    4 +-
 cipher/camellia-glue.c             |  298 +++++++++-
 cipher/camellia.c                  |  147 ++---
 cipher/camellia_aesni_avx_x86-64.S | 1120 ++++++++++++++++++++++++++++++++++++
 cipher/cipher.c                    |    8 +
 configure.ac                       |   42 ++
 src/cipher.h                       |    8 +
 src/g10lib.h                       |    1 +
 src/global.c                       |    1 +
 src/hwf-x86.c                      |    5 +
 10 files changed, 1525 insertions(+), 109 deletions(-)
 create mode 100644 cipher/camellia_aesni_avx_x86-64.S


hooks/post-receive
-- 
The GNU crypto library
http://git.gnupg.org