[PATCH] Add Intel SSSE3 based vector permutation AES implementation
Jussi Kivilinna
jussi.kivilinna@iki.fi
Sun Dec 28 15:03:10 CET 2014
* cipher/Makefile.am: Add 'rijndael-ssse3-amd64.c'.
* cipher/rijndael-internal.h (USE_SSSE3): New.
(RIJNDAEL_context_s) [USE_SSSE3]: Add 'use_ssse3'.
* cipher/rijndael-ssse3-amd64.c: New.
* cipher/rijndael.c [USE_SSSE3] (_gcry_aes_ssse3_do_setkey)
(_gcry_aes_ssse3_prepare_decryption, _gcry_aes_ssse3_encrypt)
(_gcry_aes_ssse3_decrypt, _gcry_aes_ssse3_cfb_enc)
(_gcry_aes_ssse3_cbc_enc, _gcry_aes_ssse3_ctr_enc)
(_gcry_aes_ssse3_cfb_dec, _gcry_aes_ssse3_cbc_dec): New.
(do_setkey): Add HWF check for SSSE3 and setup for SSSE3
implementation.
(prepare_decryption, _gcry_aes_cfb_enc, _gcry_aes_cbc_enc)
(_gcry_aes_ctr_enc, _gcry_aes_cfb_dec, _gcry_aes_cbc_dec): Add
selection for SSSE3 implementation.
* configure.ac [host=x86_64]: Add 'rijndael-ssse3-amd64.lo'.
--
This patch adds an "AES with vector permutations" implementation by
Mike Hamburg. The public-domain source code is available at:
http://crypto.stanford.edu/vpaes/
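Hamburg's construction evaluates the AES S-box with PSHUFB-based 16-byte
table lookups on the nibbles of the state instead of data-dependent loads
from large in-memory tables, so the core runs in constant time. As a rough,
illustrative sketch of the building block (SSSE3 intrinsics rather than the
inline assembly actually used below; the function name is made up for the
example):

  #include <tmmintrin.h>   /* SSSE3: _mm_shuffle_epi8 (PSHUFB) */

  /* Apply 16-entry lookup tables to the low and high nibble of every
   * byte of X and combine the results; the vpaes round function is
   * built from such parallel nibble lookups. */
  static __m128i
  nibble_lookup_sketch (__m128i x, __m128i lo_tab, __m128i hi_tab)
  {
    const __m128i m0f = _mm_set1_epi8 (0x0f);
    __m128i lo = _mm_and_si128 (x, m0f);                      /* low nibbles  */
    __m128i hi = _mm_and_si128 (_mm_srli_epi32 (x, 4), m0f);  /* high nibbles */
    return _mm_xor_si128 (_mm_shuffle_epi8 (lo_tab, lo),
                          _mm_shuffle_epi8 (hi_tab, hi));
  }

This mirrors the 'pand %xmm9 / psrld $4 / pshufb' pattern used against the
.Lk_* constant tables in _aes_encrypt_core.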
Benchmark on Intel Core2 T8100 (2.1 GHz, no turbo):
Old:
AES | nanosecs/byte mebibytes/sec cycles/byte
ECB enc | 8.80 ns/B 108.4 MiB/s 18.48 c/B
ECB dec | 9.07 ns/B 105.2 MiB/s 19.04 c/B
CBC enc | 7.78 ns/B 122.7 MiB/s 16.33 c/B
CBC dec | 7.71 ns/B 123.8 MiB/s 16.18 c/B
CFB enc | 7.89 ns/B 120.9 MiB/s 16.56 c/B
CFB dec | 7.56 ns/B 126.1 MiB/s 15.88 c/B
OFB enc | 9.04 ns/B 105.5 MiB/s 18.99 c/B
OFB dec | 9.01 ns/B 105.9 MiB/s 18.91 c/B
CTR enc | 7.79 ns/B 122.3 MiB/s 16.37 c/B
CTR dec | 7.94 ns/B 120.2 MiB/s 16.67 c/B
New:
AES | nanosecs/byte mebibytes/sec cycles/byte
ECB enc | 5.92 ns/B 161.1 MiB/s 12.43 c/B
ECB dec | 7.19 ns/B 132.6 MiB/s 15.11 c/B
CBC enc | 5.30 ns/B 180.0 MiB/s 11.12 c/B
CBC dec | 6.54 ns/B 145.9 MiB/s 13.73 c/B
CFB enc | 5.22 ns/B 182.9 MiB/s 10.95 c/B
CFB dec | 5.29 ns/B 180.2 MiB/s 11.11 c/B
OFB enc | 6.20 ns/B 153.8 MiB/s 13.02 c/B
OFB dec | 6.19 ns/B 154.2 MiB/s 12.99 c/B
CTR enc | 5.30 ns/B 179.8 MiB/s 11.14 c/B
CTR dec | 5.31 ns/B 179.7 MiB/s 11.14 c/B
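(cycles/byte is just nanosecs/byte multiplied by the 2.1 GHz clock, e.g.
5.92 ns/B * 2.1 = 12.43 c/B. The new code is thus roughly 1.5x faster in
the modes that use the AES encryption core and about 1.2-1.3x faster for
ECB/CBC decryption.)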
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
---
cipher/Makefile.am | 2
cipher/rijndael-internal.h | 9
cipher/rijndael-ssse3-amd64.c | 1219 +++++++++++++++++++++++++++++++++++++++++
cipher/rijndael.c | 96 +++
configure.ac | 3
5 files changed, 1326 insertions(+), 3 deletions(-)
create mode 100644 cipher/rijndael-ssse3-amd64.c
diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 98142ed..7dd626c 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -75,7 +75,7 @@ md4.c \
md5.c \
poly1305-sse2-amd64.S poly1305-avx2-amd64.S poly1305-armv7-neon.S \
rijndael.c rijndael-internal.h rijndael-tables.h rijndael-aesni.c \
- rijndael-padlock.c rijndael-amd64.S rijndael-arm.S \
+ rijndael-padlock.c rijndael-amd64.S rijndael-arm.S rijndael-ssse3-amd64.c \
rmd160.c \
rsa.c \
salsa20.c salsa20-amd64.S salsa20-armv7-neon.S \
diff --git a/cipher/rijndael-internal.h b/cipher/rijndael-internal.h
index 7ff8660..854980b 100644
--- a/cipher/rijndael-internal.h
+++ b/cipher/rijndael-internal.h
@@ -43,6 +43,12 @@
# define USE_AMD64_ASM 1
#endif
+/* USE_SSSE3 indicates whether to use SSSE3 code. */
+#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \
+ defined(HAVE_GCC_INLINE_ASM_SSSE3)
+# define USE_SSSE3 1
+#endif
+
/* USE_ARM_ASM indicates whether to use ARM assembly code. */
#undef USE_ARM_ASM
#if defined(__ARMEL__)
@@ -116,6 +122,9 @@ typedef struct RIJNDAEL_context_s
#ifdef USE_AESNI
unsigned int use_aesni:1; /* AES-NI shall be used. */
#endif /*USE_AESNI*/
+#ifdef USE_SSSE3
+ unsigned int use_ssse3:1; /* SSSE3 shall be used. */
+#endif /*USE_SSSE3*/
rijndael_cryptfn_t encrypt_fn;
rijndael_cryptfn_t decrypt_fn;
rijndael_prefetchfn_t prefetch_enc_fn;
diff --git a/cipher/rijndael-ssse3-amd64.c b/cipher/rijndael-ssse3-amd64.c
new file mode 100644
index 0000000..080e54e
--- /dev/null
+++ b/cipher/rijndael-ssse3-amd64.c
@@ -0,0 +1,1219 @@
+/* SSSE3 vector permutation AES for Libgcrypt
+ * Copyright (C) 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h> /* for memcmp() */
+
+#include "types.h" /* for byte and u32 typedefs */
+#include "g10lib.h"
+#include "cipher.h"
+#include "bufhelp.h"
+#include "cipher-selftest.h"
+#include "rijndael-internal.h"
+
+
+#ifdef USE_SSSE3
+
+
+/* Two macros to be called prior to and after the use of SSSE3
+ instructions. There should be no external function calls between
+ the use of these macros. Their purpose is to make sure that the
+ SSE registers are cleared and won't reveal any information about
+ the key or the data. */
+#define vpaes_ssse3_prepare(const_ptr) do { \
+ asm volatile ("call _aes_preheat\n\t" \
+ "movq %%r10, %q0\n\t" \
+ : "=c" (const_ptr) \
+ : \
+ : "r10", "memory", "cc" ); \
+ } while (0)
+#define vpaes_ssse3_cleanup() do { \
+ asm volatile ("call _aes_cleanup\n\t" ::: "memory", "cc" ); \
+ } while (0)
+
+
+void
+_gcry_aes_ssse3_do_setkey (RIJNDAEL_context *ctx, const byte *key)
+{
+ unsigned int keybits = (ctx->rounds - 10) * 32 + 128;
+
+ asm volatile ("leaq %q[key], %%rdi" "\n\t"
+ "movl %[bits], %%esi" "\n\t"
+ "leaq %[buf], %%rdx" "\n\t"
+ "movl %[dir], %%ecx" "\n\t"
+ "movl %[rotoffs], %%r8d" "\n\t"
+ "call _aes_schedule_core" "\n\t"
+ :
+ : [key] "m" (*key),
+ [bits] "g" (keybits),
+ [buf] "m" (ctx->keyschenc32[0][0]),
+ [dir] "g" (0),
+ [rotoffs] "g" (48)
+ : "r8", "r9", "r10", "r11", "rax", "rcx", "rdx", "rdi", "rsi",
+ "cc", "memory");
+
+ /* Save key for setting up decryption. */
+ memcpy(&ctx->keyschdec32[0][0], key, keybits / 8);
+}
+
+
+/* Make a decryption key from an encryption key. */
+void
+_gcry_aes_ssse3_prepare_decryption (RIJNDAEL_context *ctx)
+{
+ unsigned int keybits = (ctx->rounds - 10) * 32 + 128;
+
+ asm volatile ("leaq %q[key], %%rdi" "\n\t"
+ "movl %[bits], %%esi" "\n\t"
+ "leaq %[buf], %%rdx" "\n\t"
+ "movl %[dir], %%ecx" "\n\t"
+ "movl %[rotoffs], %%r8d" "\n\t"
+ "call _aes_schedule_core" "\n\t"
+ :
+ : [key] "m" (ctx->keyschdec32[0][0]),
+ [bits] "g" (keybits),
+ [buf] "m" (ctx->keyschdec32[ctx->rounds][0]),
+ [dir] "g" (1),
+ [rotoffs] "g" ((keybits == 192) ? 0 : 32)
+ : "r8", "r9", "r10", "r11", "rax", "rcx", "rdx", "rdi", "rsi",
+ "cc", "memory");
+}
+
+
+/* Encrypt one block using the Intel SSSE3 instructions. Block is input
+ * and output through SSE register xmm0. */
+static inline void
+do_vpaes_ssse3_enc (const RIJNDAEL_context *ctx, unsigned int nrounds,
+ const void *aes_const_ptr)
+{
+ unsigned int middle_rounds = nrounds - 1;
+
+ asm volatile ("call _aes_encrypt_core" "\n\t"
+ : "+a" (middle_rounds)
+ : "d" (ctx->keyschenc32), "c" (aes_const_ptr)
+ : "r9", "r11", "cc");
+}
+
+
+/* Decrypt one block using the Intel SSSE3 instructions. Block is input
+ * and output through SSE register xmm0. */
+static inline void
+do_vpaes_ssse3_dec (const RIJNDAEL_context *ctx, unsigned int nrounds,
+ const void *aes_const_ptr)
+{
+ unsigned int middle_rounds = nrounds - 1;
+
+ asm volatile ("call _aes_decrypt_core" "\n\t"
+ : "+a" (middle_rounds)
+ : "d" (ctx->keyschdec32), "c" (aes_const_ptr)
+ : "r9", "r11", "cc");
+}
+
+
+
+unsigned int
+_gcry_aes_ssse3_encrypt (const RIJNDAEL_context *ctx, unsigned char *dst,
+ const unsigned char *src)
+{
+ unsigned int nrounds = ctx->rounds;
+ const void *aes_const_ptr;
+
+ vpaes_ssse3_prepare (aes_const_ptr);
+ asm volatile ("movdqu %[src], %%xmm0\n\t"
+ :
+ : [src] "m" (*src)
+ : "memory" );
+ do_vpaes_ssse3_enc (ctx, nrounds, aes_const_ptr);
+ asm volatile ("movdqu %%xmm0, %[dst]\n\t"
+ : [dst] "=m" (*dst)
+ :
+ : "memory" );
+ vpaes_ssse3_cleanup ();
+ return 0;
+}
+
+
+void
+_gcry_aes_ssse3_cfb_enc (RIJNDAEL_context *ctx, unsigned char *outbuf,
+ const unsigned char *inbuf, unsigned char *iv,
+ size_t nblocks)
+{
+ unsigned int nrounds = ctx->rounds;
+ const void *aes_const_ptr;
+
+ vpaes_ssse3_prepare (aes_const_ptr);
+
+ asm volatile ("movdqu %[iv], %%xmm0\n\t"
+ : /* No output */
+ : [iv] "m" (*iv)
+ : "memory" );
+
+ for ( ;nblocks; nblocks-- )
+ {
+ do_vpaes_ssse3_enc (ctx, nrounds, aes_const_ptr);
+
+ asm volatile ("movdqu %[inbuf], %%xmm1\n\t"
+ "pxor %%xmm1, %%xmm0\n\t"
+ "movdqu %%xmm0, %[outbuf]\n\t"
+ : [outbuf] "=m" (*outbuf)
+ : [inbuf] "m" (*inbuf)
+ : "memory" );
+
+ outbuf += BLOCKSIZE;
+ inbuf += BLOCKSIZE;
+ }
+
+ asm volatile ("movdqu %%xmm0, %[iv]\n\t"
+ : [iv] "=m" (*iv)
+ :
+ : "memory" );
+
+ vpaes_ssse3_cleanup ();
+}
+
+
+void
+_gcry_aes_ssse3_cbc_enc (RIJNDAEL_context *ctx, unsigned char *outbuf,
+ const unsigned char *inbuf, unsigned char *iv,
+ size_t nblocks, int cbc_mac)
+{
+ unsigned int nrounds = ctx->rounds;
+ const void *aes_const_ptr;
+
+ vpaes_ssse3_prepare (aes_const_ptr);
+
+ asm volatile ("movdqu %[iv], %%xmm7\n\t"
+ : /* No output */
+ : [iv] "m" (*iv)
+ : "memory" );
+
+ for ( ;nblocks; nblocks-- )
+ {
+ asm volatile ("movdqu %[inbuf], %%xmm0\n\t"
+ "pxor %%xmm7, %%xmm0\n\t"
+ : /* No output */
+ : [inbuf] "m" (*inbuf)
+ : "memory" );
+
+ do_vpaes_ssse3_enc (ctx, nrounds, aes_const_ptr);
+
+ asm volatile ("movdqa %%xmm0, %%xmm7\n\t"
+ "movdqu %%xmm0, %[outbuf]\n\t"
+ : [outbuf] "=m" (*outbuf)
+ :
+ : "memory" );
+
+ inbuf += BLOCKSIZE;
+ if (!cbc_mac)
+ outbuf += BLOCKSIZE;
+ }
+
+ asm volatile ("movdqu %%xmm7, %[iv]\n\t"
+ : [iv] "=m" (*iv)
+ :
+ : "memory" );
+
+ vpaes_ssse3_cleanup ();
+}
+
+
+void
+_gcry_aes_ssse3_ctr_enc (RIJNDAEL_context *ctx, unsigned char *outbuf,
+ const unsigned char *inbuf, unsigned char *ctr,
+ size_t nblocks)
+{
+ unsigned int nrounds = ctx->rounds;
+ const void *aes_const_ptr;
+ static const unsigned char be_mask[16] __attribute__ ((aligned (16))) =
+ { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+
+ vpaes_ssse3_prepare (aes_const_ptr);
+
+ asm volatile ("movdqa %[mask], %%xmm6\n\t" /* Preload mask */
+ "movdqa %[ctr], %%xmm7\n\t" /* Preload CTR */
+ : /* No output */
+ : [mask] "m" (*be_mask),
+ [ctr] "m" (*ctr)
+ : "memory");
+
+ for ( ;nblocks; nblocks-- )
+ {
+ asm volatile ("movdqa %%xmm7, %%xmm0\n\t" /* xmm0 := CTR (xmm7) */
+ "pcmpeqd %%xmm1, %%xmm1\n\t"
+ "psrldq $8, %%xmm1\n\t" /* xmm1 = -1 */
+
+ "pshufb %%xmm6, %%xmm7\n\t"
+ "psubq %%xmm1, %%xmm7\n\t" /* xmm7++ (big endian) */
+
+ /* detect if 64-bit carry handling is needed */
+ "cmpl $0xffffffff, 8(%[ctr])\n\t"
+ "jne .Lno_carry%=\n\t"
+ "cmpl $0xffffffff, 12(%[ctr])\n\t"
+ "jne .Lno_carry%=\n\t"
+
+ "pslldq $8, %%xmm1\n\t" /* move lower 64-bit to high */
+ "psubq %%xmm1, %%xmm7\n\t" /* add carry to upper 64bits */
+
+ ".Lno_carry%=:\n\t"
+
+ "pshufb %%xmm6, %%xmm7\n\t"
+ "movdqa %%xmm7, (%[ctr])\n\t" /* Update CTR (mem). */
+ :
+ : [ctr] "r" (ctr)
+ : "cc", "memory");
+
+ do_vpaes_ssse3_enc (ctx, nrounds, aes_const_ptr);
+
+ asm volatile ("movdqu %[src], %%xmm1\n\t" /* xmm1 := input */
+ "pxor %%xmm1, %%xmm0\n\t" /* EncCTR ^= input */
+ "movdqu %%xmm0, %[dst]" /* Store EncCTR. */
+ : [dst] "=m" (*outbuf)
+ : [src] "m" (*inbuf)
+ : "memory");
+
+ outbuf += BLOCKSIZE;
+ inbuf += BLOCKSIZE;
+ }
+
+ vpaes_ssse3_cleanup ();
+}
+
+
+unsigned int
+_gcry_aes_ssse3_decrypt (const RIJNDAEL_context *ctx, unsigned char *dst,
+ const unsigned char *src)
+{
+ unsigned int nrounds = ctx->rounds;
+ const void *aes_const_ptr;
+
+ vpaes_ssse3_prepare (aes_const_ptr);
+ asm volatile ("movdqu %[src], %%xmm0\n\t"
+ :
+ : [src] "m" (*src)
+ : "memory" );
+ do_vpaes_ssse3_dec (ctx, nrounds, aes_const_ptr);
+ asm volatile ("movdqu %%xmm0, %[dst]\n\t"
+ : [dst] "=m" (*dst)
+ :
+ : "memory" );
+ vpaes_ssse3_cleanup ();
+ return 0;
+}
+
+
+void
+_gcry_aes_ssse3_cfb_dec (RIJNDAEL_context *ctx, unsigned char *outbuf,
+ const unsigned char *inbuf, unsigned char *iv,
+ size_t nblocks)
+{
+ unsigned int nrounds = ctx->rounds;
+ const void *aes_const_ptr;
+
+ vpaes_ssse3_prepare (aes_const_ptr);
+
+ asm volatile ("movdqu %[iv], %%xmm0\n\t"
+ : /* No output */
+ : [iv] "m" (*iv)
+ : "memory" );
+
+ for ( ;nblocks; nblocks-- )
+ {
+ do_vpaes_ssse3_enc (ctx, nrounds, aes_const_ptr);
+
+ asm volatile ("movdqa %%xmm0, %%xmm6\n\t"
+ "movdqu %[inbuf], %%xmm0\n\t"
+ "pxor %%xmm0, %%xmm6\n\t"
+ "movdqu %%xmm6, %[outbuf]\n\t"
+ : [outbuf] "=m" (*outbuf)
+ : [inbuf] "m" (*inbuf)
+ : "memory" );
+
+ outbuf += BLOCKSIZE;
+ inbuf += BLOCKSIZE;
+ }
+
+ asm volatile ("movdqu %%xmm0, %[iv]\n\t"
+ : [iv] "=m" (*iv)
+ :
+ : "memory" );
+
+ vpaes_ssse3_cleanup ();
+}
+
+
+void
+_gcry_aes_ssse3_cbc_dec (RIJNDAEL_context *ctx, unsigned char *outbuf,
+ const unsigned char *inbuf, unsigned char *iv,
+ size_t nblocks)
+{
+ unsigned int nrounds = ctx->rounds;
+ const void *aes_const_ptr;
+
+ vpaes_ssse3_prepare (aes_const_ptr);
+
+ asm volatile
+ ("movdqu %[iv], %%xmm7\n\t" /* use xmm7 as fast IV storage */
+ : /* No output */
+ : [iv] "m" (*iv)
+ : "memory");
+
+ for ( ;nblocks; nblocks-- )
+ {
+ asm volatile
+ ("movdqu %[inbuf], %%xmm0\n\t"
+ "movdqa %%xmm0, %%xmm6\n\t" /* use xmm6 as savebuf */
+ : /* No output */
+ : [inbuf] "m" (*inbuf)
+ : "memory");
+
+ do_vpaes_ssse3_dec (ctx, nrounds, aes_const_ptr);
+
+ asm volatile
+ ("pxor %%xmm7, %%xmm0\n\t" /* xor IV with output */
+ "movdqu %%xmm0, %[outbuf]\n\t"
+ "movdqu %%xmm6, %%xmm7\n\t" /* store savebuf as new IV */
+ : [outbuf] "=m" (*outbuf)
+ :
+ : "memory");
+
+ outbuf += BLOCKSIZE;
+ inbuf += BLOCKSIZE;
+ }
+
+ asm volatile
+ ("movdqu %%xmm7, %[iv]\n\t" /* store IV */
+ : /* No output */
+ : [iv] "m" (*iv)
+ : "memory");
+
+ vpaes_ssse3_cleanup ();
+}
+
+
+
+asm (
+ "\n\t" "##"
+ "\n\t" "## Constant-time SSSE3 AES core implementation."
+ "\n\t" "## version 0.1"
+ "\n\t" "##"
+ "\n\t" "## By Mike Hamburg (Stanford University), 2009"
+ "\n\t" "## Public domain."
+ "\n\t" "##"
+
+ "\n\t" ".text"
+
+ "\n\t" "##"
+ "\n\t" "## _aes_preheat"
+ "\n\t" "##"
+ "\n\t" "## Fills register %r10 -> .Laes_consts (so you can -fPIC)"
+ "\n\t" "## and %xmm8-%xmm15 as specified below."
+ "\n\t" "##"
+ "\n\t" ".align 16"
+ "\n\t" ".type _aes_preheat,@function"
+ "\n\t" "_aes_preheat:"
+ "\n\t" " lea .Laes_consts(%rip), %r10"
+ "\n\t" " movdqa (%r10), %xmm9 # 0F"
+ "\n\t" " movdqa .Lk_inv (%r10), %xmm10 # inv"
+ "\n\t" " movdqa .Lk_inv+16(%r10), %xmm11 # inva"
+ "\n\t" " movdqa .Lk_sb1 (%r10), %xmm13 # sb1u"
+ "\n\t" " movdqa .Lk_sb1+16(%r10), %xmm12 # sb1t"
+ "\n\t" " movdqa .Lk_sb2 (%r10), %xmm15 # sb2u"
+ "\n\t" " movdqa .Lk_sb2+16(%r10), %xmm14 # sb2t"
+ "\n\t" " ret"
+ "\n\t" ".size _aes_preheat,.-_aes_preheat"
+
+ "\n\t" "##"
+ "\n\t" "## _aes_encrypt_core"
+ "\n\t" "##"
+ "\n\t" "## AES-encrypt %xmm0."
+ "\n\t" "##"
+ "\n\t" "## Inputs:"
+ "\n\t" "## %xmm0 = input"
+ "\n\t" "## %xmm9-%xmm15 as in .Laes_preheat"
+ "\n\t" "## %rcx = .Laes_consts"
+ "\n\t" "## (%rdx) = scheduled keys"
+ "\n\t" "## %rax = nrounds - 1"
+ "\n\t" "##"
+ "\n\t" "## Output in %xmm0"
+ "\n\t" "## Clobbers %xmm1-%xmm4, %r9, %r11, %rax"
+ "\n\t" "## Preserves %xmm5 - %xmm8 so you get some local vectors"
+ "\n\t" "##"
+ "\n\t" "##"
+ "\n\t" ".align 16"
+ "\n\t" ".type _aes_encrypt_core,@function"
+ "\n\t" "_aes_encrypt_core:"
+ "\n\t" " lea -16(%rdx), %r9"
+ "\n\t" " mov $16, %r11"
+ "\n\t" " movdqa .Lk_ipt (%rcx), %xmm2 # iptlo"
+ "\n\t" " movdqa %xmm9, %xmm1"
+ "\n\t" " pandn %xmm0, %xmm1"
+ "\n\t" " psrld $4, %xmm1"
+ "\n\t" " pand %xmm9, %xmm0"
+ "\n\t" " pshufb %xmm0, %xmm2"
+ "\n\t" " movdqa .Lk_ipt+16(%rcx), %xmm0 # ipthi"
+ "\n\t" " pshufb %xmm1, %xmm0"
+ "\n\t" " pxor 16(%r9),%xmm2"
+ "\n\t" " pxor %xmm2, %xmm0"
+ "\n\t" " add $32, %r9"
+ "\n\t" " jmp .Laes_entry"
+
+ "\n\t" ".align 16"
+ "\n\t" ".Laes_loop:"
+ "\n\t" " # middle of middle round"
+ "\n\t" " movdqa %xmm13, %xmm4 # 4 : sb1u"
+ "\n\t" " pshufb %xmm2, %xmm4 # 4 = sb1u"
+ "\n\t" " pxor (%r9), %xmm4 # 4 = sb1u + k"
+ "\n\t" " movdqa %xmm12, %xmm0 # 0 : sb1t"
+ "\n\t" " pshufb %xmm3, %xmm0 # 0 = sb1t"
+ "\n\t" " pxor %xmm4, %xmm0 # 0 = A"
+ "\n\t" " movdqa %xmm15, %xmm4 # 4 : sb2u"
+ "\n\t" " pshufb %xmm2, %xmm4 # 4 = sb2u"
+ "\n\t" " movdqa .Lk_mc_forward(%r11,%rcx), %xmm1"
+ "\n\t" " movdqa %xmm14, %xmm2 # 2 : sb2t"
+ "\n\t" " pshufb %xmm3, %xmm2 # 2 = sb2t"
+ "\n\t" " pxor %xmm4, %xmm2 # 2 = 2A"
+ "\n\t" " movdqa %xmm0, %xmm3 # 3 = A"
+ "\n\t" " pshufb %xmm1, %xmm0 # 0 = B"
+ "\n\t" " pxor %xmm2, %xmm0 # 0 = 2A+B"
+ "\n\t" " pshufb .Lk_mc_backward(%r11,%rcx), %xmm3 # 3 = D"
+ "\n\t" " pxor %xmm0, %xmm3 # 3 = 2A+B+D"
+ "\n\t" " add $16, %r9 # next key"
+ "\n\t" " pshufb %xmm1, %xmm0 # 0 = 2B+C"
+ "\n\t" " pxor %xmm3, %xmm0 # 0 = 2A+3B+C+D"
+ "\n\t" " add $16, %r11 # next mc"
+ "\n\t" " and $48, %r11 # ... mod 4"
+ "\n\t" " dec %rax # nr--"
+
+ "\n\t" ".Laes_entry:"
+ "\n\t" " # top of round"
+ "\n\t" " movdqa %xmm9, %xmm1 # 1 : i"
+ "\n\t" " pandn %xmm0, %xmm1 # 1 = i<<4"
+ "\n\t" " psrld $4, %xmm1 # 1 = i"
+ "\n\t" " pand %xmm9, %xmm0 # 0 = k"
+ "\n\t" " movaps %xmm11, %xmm2 # 2 : a/k"
+ "\n\t" " pshufb %xmm0, %xmm2 # 2 = a/k"
+ "\n\t" " pxor %xmm1, %xmm0 # 0 = j"
+ "\n\t" " movaps %xmm10, %xmm3 # 3 : 1/i"
+ "\n\t" " pshufb %xmm1, %xmm3 # 3 = 1/i"
+ "\n\t" " pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k"
+ "\n\t" " movaps %xmm10, %xmm4 # 4 : 1/j"
+ "\n\t" " pshufb %xmm0, %xmm4 # 4 = 1/j"
+ "\n\t" " pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k"
+ "\n\t" " movaps %xmm10, %xmm2 # 2 : 1/iak"
+ "\n\t" " pshufb %xmm3, %xmm2 # 2 = 1/iak"
+ "\n\t" " pxor %xmm0, %xmm2 # 2 = io"
+ "\n\t" " movaps %xmm10, %xmm3 # 3 : 1/jak"
+ "\n\t" " pshufb %xmm4, %xmm3 # 3 = 1/jak"
+ "\n\t" " pxor %xmm1, %xmm3 # 3 = jo"
+ "\n\t" " jnz .Laes_loop"
+
+ "\n\t" " # middle of last round"
+ "\n\t" " movdqa .Lk_sbo(%rcx), %xmm4 # 3 : sbou"
+ "\n\t" " pshufb %xmm2, %xmm4 # 4 = sbou"
+ "\n\t" " pxor (%r9), %xmm4 # 4 = sb1u + k"
+ "\n\t" " movdqa .Lk_sbo+16(%rcx), %xmm0 # 0 : sbot"
+ "\n\t" " pshufb %xmm3, %xmm0 # 0 = sb1t"
+ "\n\t" " pxor %xmm4, %xmm0 # 0 = A"
+ "\n\t" " pshufb .Lk_sr(%r11,%rcx), %xmm0"
+ "\n\t" " ret"
+ "\n\t" ".size _aes_encrypt_core,.-_aes_encrypt_core"
+
+ "\n\t" "##"
+ "\n\t" "## .Laes_cleanup"
+ "\n\t" "##"
+ "\n\t" "## Erases sensitive registers %xmm0-%xmm8"
+ "\n\t" "##"
+ "\n\t" ".align 16"
+ "\n\t" ".type _aes_cleanup,@function"
+ "\n\t" "_aes_cleanup:"
+ "\n\t" " pxor %xmm0, %xmm0"
+ "\n\t" " pxor %xmm1, %xmm1"
+ "\n\t" " pxor %xmm2, %xmm2"
+ "\n\t" " pxor %xmm3, %xmm3"
+ "\n\t" " pxor %xmm4, %xmm4"
+ "\n\t" " pxor %xmm5, %xmm5"
+ "\n\t" " pxor %xmm6, %xmm6"
+ "\n\t" " pxor %xmm7, %xmm7"
+ "\n\t" " pxor %xmm8, %xmm8"
+ "\n\t" " ret"
+ "\n\t" ".size _aes_cleanup,.-_aes_cleanup"
+
+ "\n\t" "##"
+ "\n\t" "## Decryption core"
+ "\n\t" "##"
+ "\n\t" "## Same API as encryption core, except that it clobbers"
+ "\n\t" "## %xmm5. It's actually not clear that this is worthwhile."
+ "\n\t" "##"
+ "\n\t" ".align 16"
+ "\n\t" ".type _aes_decrypt_core,@function"
+ "\n\t" "_aes_decrypt_core:"
+ "\n\t" " lea -16(%rdx), %r9 # load key"
+ "\n\t" " mov %rax, %r11"
+ "\n\t" " shl $4, %r11"
+ "\n\t" " xor $48, %r11"
+ "\n\t" " and $48, %r11"
+ "\n\t" " movdqa .Lk_dipt (%rcx), %xmm2 # iptlo"
+ "\n\t" " movdqa %xmm9, %xmm1"
+ "\n\t" " pandn %xmm0, %xmm1"
+ "\n\t" " psrld $4, %xmm1"
+ "\n\t" " pand %xmm9, %xmm0"
+ "\n\t" " pshufb %xmm0, %xmm2"
+ "\n\t" " movdqa .Lk_dipt+16(%rcx), %xmm0 # ipthi"
+ "\n\t" " pshufb %xmm1, %xmm0"
+ "\n\t" " pxor 16(%r9),%xmm2"
+ "\n\t" " pxor %xmm2, %xmm0"
+ "\n\t" " add $32, %r9"
+ "\n\t" " movdqa .Lk_mc_forward+48(%rcx), %xmm5"
+ "\n\t" " jmp .Laes_dec_entry"
+
+ "\n\t" ".align 16"
+ "\n\t" ".Laes_dec_loop:"
+ "\n\t" "##"
+ "\n\t" "## Inverse mix columns"
+ "\n\t" "##"
+ "\n\t" " movdqa (%r9), %xmm0"
+ "\n\t" " movdqa .Lk_dsb9(%rcx),%xmm4 # 4 : sb9u"
+ "\n\t" " pshufb %xmm2, %xmm4 # 4 = sb9u"
+ "\n\t" " pxor %xmm0, %xmm4"
+ "\n\t" " movdqa .Lk_dsb9+16(%rcx),%xmm0 # 0 : sb9t"
+ "\n\t" " pshufb %xmm3, %xmm0 # 0 = sb9t"
+ "\n\t" " pxor %xmm4, %xmm0 # 0 = ch"
+ "\n\t" " add $16, %r9 # next round key"
+
+ "\n\t" " pshufb %xmm5, %xmm0 # MC ch"
+ "\n\t" " movdqa .Lk_dsbd(%rcx),%xmm4 # 4 : sbdu"
+ "\n\t" " pshufb %xmm2, %xmm4 # 4 = sbdu"
+ "\n\t" " pxor %xmm0, %xmm4 # 4 = ch"
+ "\n\t" " movdqa .Lk_dsbd+16(%rcx),%xmm0 # 0 : sbdt"
+ "\n\t" " pshufb %xmm3, %xmm0 # 0 = sbdt"
+ "\n\t" " pxor %xmm4, %xmm0 # 0 = ch"
+ "\n\t" " dec %rax # nr--"
+
+ "\n\t" " pshufb %xmm5, %xmm0 # MC ch"
+ "\n\t" " movdqa .Lk_dsbb(%rcx),%xmm4 # 4 : sbbu"
+ "\n\t" " pshufb %xmm2, %xmm4 # 4 = sbbu"
+ "\n\t" " pxor %xmm0, %xmm4 # 4 = ch"
+ "\n\t" " movdqa .Lk_dsbb+16(%rcx),%xmm0 # 0 : sbbt"
+ "\n\t" " pshufb %xmm3, %xmm0 # 0 = sbbt"
+ "\n\t" " pxor %xmm4, %xmm0 # 0 = ch"
+
+ "\n\t" " pshufb %xmm5, %xmm0 # MC ch"
+ "\n\t" " movdqa .Lk_dsbe(%rcx),%xmm4 # 4 : sbeu"
+ "\n\t" " pshufb %xmm2, %xmm4 # 4 = sbeu"
+ "\n\t" " pxor %xmm0, %xmm4 # 4 = ch"
+ "\n\t" " movdqa .Lk_dsbe+16(%rcx),%xmm0 # 0 : sbet"
+ "\n\t" " pshufb %xmm3, %xmm0 # 0 = sbet"
+ "\n\t" " pxor %xmm4, %xmm0 # 0 = ch"
+
+ "\n\t" " palignr $12, %xmm5, %xmm5"
+
+ "\n\t" ".Laes_dec_entry:"
+ "\n\t" " # top of round"
+ "\n\t" " movdqa %xmm9, %xmm1 # 1 : i"
+ "\n\t" " pandn %xmm0, %xmm1 # 1 = i<<4"
+ "\n\t" " psrld $4, %xmm1 # 1 = i"
+ "\n\t" " pand %xmm9, %xmm0 # 0 = k"
+ "\n\t" " movaps %xmm11, %xmm2 # 2 : a/k"
+ "\n\t" " pshufb %xmm0, %xmm2 # 2 = a/k"
+ "\n\t" " pxor %xmm1, %xmm0 # 0 = j"
+ "\n\t" " movaps %xmm10, %xmm3 # 3 : 1/i"
+ "\n\t" " pshufb %xmm1, %xmm3 # 3 = 1/i"
+ "\n\t" " pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k"
+ "\n\t" " movaps %xmm10, %xmm4 # 4 : 1/j"
+ "\n\t" " pshufb %xmm0, %xmm4 # 4 = 1/j"
+ "\n\t" " pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k"
+ "\n\t" " movaps %xmm10, %xmm2 # 2 : 1/iak"
+ "\n\t" " pshufb %xmm3, %xmm2 # 2 = 1/iak"
+ "\n\t" " pxor %xmm0, %xmm2 # 2 = io"
+ "\n\t" " movaps %xmm10, %xmm3 # 3 : 1/jak"
+ "\n\t" " pshufb %xmm4, %xmm3 # 3 = 1/jak"
+ "\n\t" " pxor %xmm1, %xmm3 # 3 = jo"
+ "\n\t" " jnz .Laes_dec_loop"
+
+ "\n\t" " # middle of last round"
+ "\n\t" " movdqa .Lk_dsbo(%rcx), %xmm4 # 3 : sbou"
+ "\n\t" " pshufb %xmm2, %xmm4 # 4 = sbou"
+ "\n\t" " pxor (%r9), %xmm4 # 4 = sb1u + k"
+ "\n\t" " movdqa .Lk_dsbo+16(%rcx), %xmm0 # 0 : sbot"
+ "\n\t" " pshufb %xmm3, %xmm0 # 0 = sb1t"
+ "\n\t" " pxor %xmm4, %xmm0 # 0 = A"
+ "\n\t" " pshufb .Lk_sr(%r11,%rcx), %xmm0"
+ "\n\t" " ret"
+ "\n\t" ".size _aes_decrypt_core,.-_aes_decrypt_core"
+
+ "\n\t" "########################################################"
+ "\n\t" "## ##"
+ "\n\t" "## AES key schedule ##"
+ "\n\t" "## ##"
+ "\n\t" "########################################################"
+
+ "\n\t" ".align 16"
+ "\n\t" ".type _aes_schedule_core,@function"
+ "\n\t" "_aes_schedule_core:"
+ "\n\t" " # rdi = key"
+ "\n\t" " # rsi = size in bits"
+ "\n\t" " # rdx = buffer"
+ "\n\t" " # rcx = direction. 0=encrypt, 1=decrypt"
+
+ "\n\t" " call _aes_preheat # load the tables"
+ "\n\t" " movdqa .Lk_rcon(%r10), %xmm8 # load rcon"
+ "\n\t" " movdqu (%rdi), %xmm0 # load key (unaligned)"
+
+ "\n\t" " # input transform"
+ "\n\t" " movdqu %xmm0, %xmm3"
+ "\n\t" " lea .Lk_ipt(%r10), %r11"
+ "\n\t" " call .Laes_schedule_transform"
+ "\n\t" " movdqu %xmm0, %xmm7"
+
+ "\n\t" " test %rcx, %rcx"
+ "\n\t" " jnz .Laes_schedule_am_decrypting"
+
+ "\n\t" " # encrypting, output zeroth round key after transform"
+ "\n\t" " movdqa %xmm0, (%rdx)"
+ "\n\t" " jmp .Laes_schedule_go"
+
+ "\n\t" ".Laes_schedule_am_decrypting:"
+ "\n\t" " # decrypting, output zeroth round key after shiftrows"
+ "\n\t" " pshufb .Lk_sr(%r8,%r10),%xmm3"
+ "\n\t" " movdqa %xmm3, (%rdx)"
+ "\n\t" " xor $48, %r8"
+
+ "\n\t" ".Laes_schedule_go:"
+ "\n\t" " cmp $192, %rsi"
+ "\n\t" " je .Laes_schedule_192"
+ "\n\t" " cmp $256, %rsi"
+ "\n\t" " je .Laes_schedule_256"
+ "\n\t" " # 128: fall through"
+
+ "\n\t" "##"
+ "\n\t" "## .Laes_schedule_128"
+ "\n\t" "##"
+ "\n\t" "## 128-bit specific part of key schedule."
+ "\n\t" "##"
+ "\n\t" "## This schedule is really simple, because all its parts"
+ "\n\t" "## are accomplished by the subroutines."
+ "\n\t" "##"
+ "\n\t" ".Laes_schedule_128:"
+ "\n\t" " mov $10, %rsi"
+
+ "\n\t" ".Laes_schedule_128_L:"
+ "\n\t" " call .Laes_schedule_round"
+ "\n\t" " dec %rsi"
+ "\n\t" " jz .Laes_schedule_mangle_last"
+ "\n\t" " call .Laes_schedule_mangle # write output"
+ "\n\t" " jmp .Laes_schedule_128_L"
+
+ "\n\t" "##"
+ "\n\t" "## .Laes_schedule_192"
+ "\n\t" "##"
+ "\n\t" "## 192-bit specific part of key schedule."
+ "\n\t" "##"
+ "\n\t" "## The main body of this schedule is the same as the 128-bit"
+ "\n\t" "## schedule, but with more smearing. The long, high side is"
+ "\n\t" "## stored in %xmm7 as before, and the short, low side is in"
+ "\n\t" "## the high bits of %xmm6."
+ "\n\t" "##"
+ "\n\t" "## This schedule is somewhat nastier, however, because each"
+ "\n\t" "## round produces 192 bits of key material, or 1.5 round keys."
+ "\n\t" "## Therefore, on each cycle we do 2 rounds and produce 3 round"
+ "\n\t" "## keys."
+ "\n\t" "##"
+ "\n\t" ".Laes_schedule_192:"
+ "\n\t" " movdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)"
+ "\n\t" " call .Laes_schedule_transform # input transform"
+ "\n\t" " movdqa %xmm0, %xmm6 # save short part"
+ "\n\t" " pxor %xmm4, %xmm4 # clear 4"
+ "\n\t" " movhlps %xmm4, %xmm6 # clobber low side with zeros"
+ "\n\t" " mov $4, %rsi"
+
+ "\n\t" ".Laes_schedule_192_L:"
+ "\n\t" " call .Laes_schedule_round"
+ "\n\t" " palignr $8,%xmm6,%xmm0 "
+ "\n\t" " call .Laes_schedule_mangle # save key n"
+ "\n\t" " call .Laes_schedule_192_smear"
+ "\n\t" " call .Laes_schedule_mangle # save key n+1"
+ "\n\t" " call .Laes_schedule_round"
+ "\n\t" " dec %rsi"
+ "\n\t" " jz .Laes_schedule_mangle_last"
+ "\n\t" " call .Laes_schedule_mangle # save key n+2"
+ "\n\t" " call .Laes_schedule_192_smear"
+ "\n\t" " jmp .Laes_schedule_192_L"
+
+ "\n\t" "##"
+ "\n\t" "## .Laes_schedule_192_smear"
+ "\n\t" "##"
+ "\n\t" "## Smear the short, low side in the 192-bit key schedule."
+ "\n\t" "##"
+ "\n\t" "## Inputs:"
+ "\n\t" "## %xmm7: high side, b a x y"
+ "\n\t" "## %xmm6: low side, d c 0 0"
+ "\n\t" "## %xmm13: 0"
+ "\n\t" "##"
+ "\n\t" "## Outputs:"
+ "\n\t" "## %xmm6: b+c+d b+c 0 0"
+ "\n\t" "## %xmm0: b+c+d b+c b a"
+ "\n\t" "##"
+ "\n\t" ".Laes_schedule_192_smear:"
+ "\n\t" " pshufd $0x80, %xmm6, %xmm0 # d c 0 0 -> c 0 0 0"
+ "\n\t" " pxor %xmm0, %xmm6 # -> c+d c 0 0"
+ "\n\t" " pshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a"
+ "\n\t" " pxor %xmm0, %xmm6 # -> b+c+d b+c b a"
+ "\n\t" " movdqa %xmm6, %xmm0"
+ "\n\t" " pxor %xmm1, %xmm1"
+ "\n\t" " movhlps %xmm1, %xmm6 # clobber low side with zeros"
+ "\n\t" " ret"
+
+ "\n\t" "##"
+ "\n\t" "## .Laes_schedule_256"
+ "\n\t" "##"
+ "\n\t" "## 256-bit specific part of key schedule."
+ "\n\t" "##"
+ "\n\t" "## The structure here is very similar to the 128-bit"
+ "\n\t" "## schedule, but with an additional 'low side' in"
+ "\n\t" "## %xmm6. The low side's rounds are the same as the"
+ "\n\t" "## high side's, except no rcon and no rotation."
+ "\n\t" "##"
+ "\n\t" ".Laes_schedule_256:"
+ "\n\t" " movdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)"
+ "\n\t" " call .Laes_schedule_transform # input transform"
+ "\n\t" " mov $7, %rsi"
+
+ "\n\t" ".Laes_schedule_256_L:"
+ "\n\t" " call .Laes_schedule_mangle # output low result"
+ "\n\t" " movdqa %xmm0, %xmm6 # save cur_lo in xmm6"
+
+ "\n\t" " # high round"
+ "\n\t" " call .Laes_schedule_round"
+ "\n\t" " dec %rsi"
+ "\n\t" " jz .Laes_schedule_mangle_last"
+ "\n\t" " call .Laes_schedule_mangle "
+
+ "\n\t" " # low round. swap xmm7 and xmm6"
+ "\n\t" " pshufd $0xFF, %xmm0, %xmm0"
+ "\n\t" " movdqa %xmm7, %xmm5"
+ "\n\t" " movdqa %xmm6, %xmm7"
+ "\n\t" " call .Laes_schedule_low_round"
+ "\n\t" " movdqa %xmm5, %xmm7"
+
+ "\n\t" " jmp .Laes_schedule_256_L"
+
+ "\n\t" "##"
+ "\n\t" "## .Laes_schedule_round"
+ "\n\t" "##"
+ "\n\t" "## Runs one main round of the key schedule on %xmm0, %xmm7"
+ "\n\t" "##"
+ "\n\t" "## Specifically, runs subbytes on the high dword of %xmm0"
+ "\n\t" "## then rotates it by one byte and xors into the low dword of"
+ "\n\t" "## %xmm7."
+ "\n\t" "##"
+ "\n\t" "## Adds rcon from low byte of %xmm8, then rotates %xmm8 for"
+ "\n\t" "## next rcon."
+ "\n\t" "##"
+ "\n\t" "## Smears the dwords of %xmm7 by xoring the low into the"
+ "\n\t" "## second low, result into third, result into highest."
+ "\n\t" "##"
+ "\n\t" "## Returns results in %xmm7 = %xmm0."
+ "\n\t" "## Clobbers %xmm1-%xmm4, %r11."
+ "\n\t" "##"
+ "\n\t" ".Laes_schedule_round:"
+ "\n\t" " # extract rcon from xmm8"
+ "\n\t" " pxor %xmm1, %xmm1"
+ "\n\t" " palignr $15, %xmm8, %xmm1"
+ "\n\t" " palignr $15, %xmm8, %xmm8"
+ "\n\t" " pxor %xmm1, %xmm7"
+
+ "\n\t" " # rotate"
+ "\n\t" " pshufd $0xFF, %xmm0, %xmm0"
+ "\n\t" " palignr $1, %xmm0, %xmm0"
+
+ "\n\t" " # fall through..."
+
+ "\n\t" " # low round: same as high round, but no rotation and no rcon."
+ "\n\t" ".Laes_schedule_low_round:"
+ "\n\t" " # smear xmm7"
+ "\n\t" " movdqa %xmm7, %xmm1"
+ "\n\t" " pslldq $4, %xmm7"
+ "\n\t" " pxor %xmm1, %xmm7"
+ "\n\t" " movdqa %xmm7, %xmm1"
+ "\n\t" " pslldq $8, %xmm7"
+ "\n\t" " pxor %xmm1, %xmm7"
+ "\n\t" " pxor .Lk_s63(%r10), %xmm7"
+
+ "\n\t" " # subbytes"
+ "\n\t" " movdqa %xmm9, %xmm1"
+ "\n\t" " pandn %xmm0, %xmm1"
+ "\n\t" " psrld $4, %xmm1 # 1 = i"
+ "\n\t" " pand %xmm9, %xmm0 # 0 = k"
+ "\n\t" " movaps %xmm11, %xmm2 # 2 : a/k"
+ "\n\t" " pshufb %xmm0, %xmm2 # 2 = a/k"
+ "\n\t" " pxor %xmm1, %xmm0 # 0 = j"
+ "\n\t" " movaps %xmm10, %xmm3 # 3 : 1/i"
+ "\n\t" " pshufb %xmm1, %xmm3 # 3 = 1/i"
+ "\n\t" " pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k"
+ "\n\t" " movaps %xmm10, %xmm4 # 4 : 1/j"
+ "\n\t" " pshufb %xmm0, %xmm4 # 4 = 1/j"
+ "\n\t" " pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k"
+ "\n\t" " movaps %xmm10, %xmm2 # 2 : 1/iak"
+ "\n\t" " pshufb %xmm3, %xmm2 # 2 = 1/iak"
+ "\n\t" " pxor %xmm0, %xmm2 # 2 = io"
+ "\n\t" " movaps %xmm10, %xmm3 # 3 : 1/jak"
+ "\n\t" " pshufb %xmm4, %xmm3 # 3 = 1/jak"
+ "\n\t" " pxor %xmm1, %xmm3 # 3 = jo"
+ "\n\t" " movdqa .Lk_sb1(%r10), %xmm4 # 4 : sbou"
+ "\n\t" " pshufb %xmm2, %xmm4 # 4 = sbou"
+ "\n\t" " movdqa .Lk_sb1+16(%r10), %xmm0 # 0 : sbot"
+ "\n\t" " pshufb %xmm3, %xmm0 # 0 = sb1t"
+ "\n\t" " pxor %xmm4, %xmm0 # 0 = sbox output"
+
+ "\n\t" " # add in smeared stuff"
+ "\n\t" " pxor %xmm7, %xmm0 "
+ "\n\t" " movdqa %xmm0, %xmm7"
+ "\n\t" " ret"
+
+ "\n\t" "##"
+ "\n\t" "## .Laes_schedule_transform"
+ "\n\t" "##"
+ "\n\t" "## Linear-transform %xmm0 according to tables at (%r11)"
+ "\n\t" "##"
+ "\n\t" "## Requires that %xmm9 = 0x0F0F... as in preheat"
+ "\n\t" "## Output in %xmm0"
+ "\n\t" "## Clobbers %xmm1, %xmm2"
+ "\n\t" "##"
+ "\n\t" ".Laes_schedule_transform:"
+ "\n\t" " movdqa %xmm9, %xmm1"
+ "\n\t" " pandn %xmm0, %xmm1"
+ "\n\t" " psrld $4, %xmm1"
+ "\n\t" " pand %xmm9, %xmm0"
+ "\n\t" " movaps (%r11), %xmm2 # lo"
+ "\n\t" " pshufb %xmm0, %xmm2"
+ "\n\t" " movaps 16(%r11), %xmm0 # hi"
+ "\n\t" " pshufb %xmm1, %xmm0"
+ "\n\t" " pxor %xmm2, %xmm0"
+ "\n\t" " ret"
+
+ "\n\t" "##"
+ "\n\t" "## .Laes_schedule_mangle"
+ "\n\t" "##"
+ "\n\t" "## Mangle xmm0 from (basis-transformed) standard version"
+ "\n\t" "## to our version."
+ "\n\t" "##"
+ "\n\t" "## On encrypt,"
+ "\n\t" "## xor with 0x63"
+ "\n\t" "## multiply by circulant 0,1,1,1"
+ "\n\t" "## apply shiftrows transform"
+ "\n\t" "##"
+ "\n\t" "## On decrypt,"
+ "\n\t" "## xor with 0x63"
+ "\n\t" "## multiply by 'inverse mixcolumns' circulant E,B,D,9"
+ "\n\t" "## deskew"
+ "\n\t" "## apply shiftrows transform"
+ "\n\t" "##"
+ "\n\t" "##"
+ "\n\t" "## Writes out to (%rdx), and increments or decrements it"
+ "\n\t" "## Keeps track of round number mod 4 in %r8"
+ "\n\t" "## Preserves xmm0"
+ "\n\t" "## Clobbers xmm1-xmm5"
+ "\n\t" "##"
+ "\n\t" ".Laes_schedule_mangle:"
+ "\n\t" " movdqa %xmm0, %xmm4 # save xmm0 for later"
+ "\n\t" " movdqa .Lk_mc_forward(%r10),%xmm5"
+ "\n\t" " test %rcx, %rcx"
+ "\n\t" " jnz .Laes_schedule_mangle_dec"
+
+ "\n\t" " # encrypting"
+ "\n\t" " add $16, %rdx"
+ "\n\t" " pxor .Lk_s63(%r10),%xmm4"
+ "\n\t" " pshufb %xmm5, %xmm4"
+ "\n\t" " movdqa %xmm4, %xmm3"
+ "\n\t" " pshufb %xmm5, %xmm4"
+ "\n\t" " pxor %xmm4, %xmm3"
+ "\n\t" " pshufb %xmm5, %xmm4"
+ "\n\t" " pxor %xmm4, %xmm3"
+
+ "\n\t" " jmp .Laes_schedule_mangle_both"
+
+ "\n\t" ".Laes_schedule_mangle_dec:"
+ "\n\t" " lea .Lk_dks_1(%r10), %r11 # first table: *9"
+ "\n\t" " call .Laes_schedule_transform"
+ "\n\t" " movdqa %xmm0, %xmm3"
+ "\n\t" " pshufb %xmm5, %xmm3"
+
+ "\n\t" " add $32, %r11 # next table: *B"
+ "\n\t" " call .Laes_schedule_transform"
+ "\n\t" " pxor %xmm0, %xmm3"
+ "\n\t" " pshufb %xmm5, %xmm3"
+
+ "\n\t" " add $32, %r11 # next table: *D"
+ "\n\t" " call .Laes_schedule_transform"
+ "\n\t" " pxor %xmm0, %xmm3"
+ "\n\t" " pshufb %xmm5, %xmm3"
+
+ "\n\t" " add $32, %r11 # next table: *E"
+ "\n\t" " call .Laes_schedule_transform"
+ "\n\t" " pxor %xmm0, %xmm3"
+ "\n\t" " pshufb %xmm5, %xmm3"
+
+ "\n\t" " movdqa %xmm4, %xmm0 # restore %xmm0"
+ "\n\t" " add $-16, %rdx"
+
+ "\n\t" ".Laes_schedule_mangle_both:"
+ "\n\t" " pshufb .Lk_sr(%r8,%r10),%xmm3"
+ "\n\t" " add $-16, %r8"
+ "\n\t" " and $48, %r8"
+ "\n\t" " movdqa %xmm3, (%rdx)"
+ "\n\t" " ret"
+
+ "\n\t" "##"
+ "\n\t" "## .Laes_schedule_mangle_last"
+ "\n\t" "##"
+ "\n\t" "## Mangler for last round of key schedule"
+ "\n\t" "## Mangles %xmm0"
+ "\n\t" "## when encrypting, outputs out(%xmm0) ^ 63"
+ "\n\t" "## when decrypting, outputs unskew(%xmm0)"
+ "\n\t" "##"
+ "\n\t" "## Always called right before return... jumps to cleanup and exits"
+ "\n\t" "##"
+ "\n\t" ".Laes_schedule_mangle_last:"
+ "\n\t" " # schedule last round key from xmm0"
+ "\n\t" " lea .Lk_deskew(%r10),%r11 # prepare to deskew"
+ "\n\t" " test %rcx, %rcx"
+ "\n\t" " jnz .Laes_schedule_mangle_last_dec"
+
+ "\n\t" " # encrypting"
+ "\n\t" " pshufb .Lk_sr(%r8,%r10),%xmm0 # output permute"
+ "\n\t" " lea .Lk_opt(%r10), %r11 # prepare to output transform"
+ "\n\t" " add $32, %rdx"
+
+ "\n\t" ".Laes_schedule_mangle_last_dec:"
+ "\n\t" " add $-16, %rdx"
+ "\n\t" " pxor .Lk_s63(%r10), %xmm0"
+ "\n\t" " call .Laes_schedule_transform # output transform"
+ "\n\t" " movdqa %xmm0, (%rdx) # save last key"
+ "\n\t" " jmp _aes_cleanup"
+ "\n\t" ".size _aes_schedule_core,.-_aes_schedule_core"
+
+ "\n\t" "########################################################"
+ "\n\t" "## ##"
+ "\n\t" "## Constants ##"
+ "\n\t" "## ##"
+ "\n\t" "########################################################"
+
+ "\n\t" ".align 16"
+ "\n\t" ".type _aes_consts,@object"
+ "\n\t" ".Laes_consts:"
+ "\n\t" "_aes_consts:"
+ "\n\t" " # s0F"
+ "\n\t" " .Lk_s0F = .-.Laes_consts"
+ "\n\t" " .quad 0x0F0F0F0F0F0F0F0F"
+ "\n\t" " .quad 0x0F0F0F0F0F0F0F0F"
+
+ "\n\t" " # input transform (lo, hi)"
+ "\n\t" " .Lk_ipt = .-.Laes_consts"
+ "\n\t" " .quad 0xC2B2E8985A2A7000"
+ "\n\t" " .quad 0xCABAE09052227808"
+ "\n\t" " .quad 0x4C01307D317C4D00"
+ "\n\t" " .quad 0xCD80B1FCB0FDCC81"
+
+ "\n\t" " # inv, inva"
+ "\n\t" " .Lk_inv = .-.Laes_consts"
+ "\n\t" " .quad 0x0E05060F0D080180"
+ "\n\t" " .quad 0x040703090A0B0C02"
+ "\n\t" " .quad 0x01040A060F0B0780"
+ "\n\t" " .quad 0x030D0E0C02050809"
+
+ "\n\t" " # sb1u, sb1t"
+ "\n\t" " .Lk_sb1 = .-.Laes_consts"
+ "\n\t" " .quad 0xB19BE18FCB503E00"
+ "\n\t" " .quad 0xA5DF7A6E142AF544"
+ "\n\t" " .quad 0x3618D415FAE22300"
+ "\n\t" " .quad 0x3BF7CCC10D2ED9EF"
+
+
+ "\n\t" " # sb2u, sb2t"
+ "\n\t" " .Lk_sb2 = .-.Laes_consts"
+ "\n\t" " .quad 0xE27A93C60B712400"
+ "\n\t" " .quad 0x5EB7E955BC982FCD"
+ "\n\t" " .quad 0x69EB88400AE12900"
+ "\n\t" " .quad 0xC2A163C8AB82234A"
+
+ "\n\t" " # sbou, sbot"
+ "\n\t" " .Lk_sbo = .-.Laes_consts"
+ "\n\t" " .quad 0xD0D26D176FBDC700"
+ "\n\t" " .quad 0x15AABF7AC502A878"
+ "\n\t" " .quad 0xCFE474A55FBB6A00"
+ "\n\t" " .quad 0x8E1E90D1412B35FA"
+
+ "\n\t" " # mc_forward"
+ "\n\t" " .Lk_mc_forward = .-.Laes_consts"
+ "\n\t" " .quad 0x0407060500030201"
+ "\n\t" " .quad 0x0C0F0E0D080B0A09"
+ "\n\t" " .quad 0x080B0A0904070605"
+ "\n\t" " .quad 0x000302010C0F0E0D"
+ "\n\t" " .quad 0x0C0F0E0D080B0A09"
+ "\n\t" " .quad 0x0407060500030201"
+ "\n\t" " .quad 0x000302010C0F0E0D"
+ "\n\t" " .quad 0x080B0A0904070605"
+
+ "\n\t" " # mc_backward"
+ "\n\t" " .Lk_mc_backward = .-.Laes_consts"
+ "\n\t" " .quad 0x0605040702010003"
+ "\n\t" " .quad 0x0E0D0C0F0A09080B"
+ "\n\t" " .quad 0x020100030E0D0C0F"
+ "\n\t" " .quad 0x0A09080B06050407"
+ "\n\t" " .quad 0x0E0D0C0F0A09080B"
+ "\n\t" " .quad 0x0605040702010003"
+ "\n\t" " .quad 0x0A09080B06050407"
+ "\n\t" " .quad 0x020100030E0D0C0F"
+
+ "\n\t" " # sr"
+ "\n\t" " .Lk_sr = .-.Laes_consts"
+ "\n\t" " .quad 0x0706050403020100"
+ "\n\t" " .quad 0x0F0E0D0C0B0A0908"
+ "\n\t" " .quad 0x030E09040F0A0500"
+ "\n\t" " .quad 0x0B06010C07020D08"
+ "\n\t" " .quad 0x0F060D040B020900"
+ "\n\t" " .quad 0x070E050C030A0108"
+ "\n\t" " .quad 0x0B0E0104070A0D00"
+ "\n\t" " .quad 0x0306090C0F020508"
+
+ "\n\t" " # rcon"
+ "\n\t" " .Lk_rcon = .-.Laes_consts"
+ "\n\t" " .quad 0x1F8391B9AF9DEEB6"
+ "\n\t" " .quad 0x702A98084D7C7D81"
+
+ "\n\t" " # s63: all equal to 0x63 transformed"
+ "\n\t" " .Lk_s63 = .-.Laes_consts"
+ "\n\t" " .quad 0x5B5B5B5B5B5B5B5B"
+ "\n\t" " .quad 0x5B5B5B5B5B5B5B5B"
+
+ "\n\t" " # output transform"
+ "\n\t" " .Lk_opt = .-.Laes_consts"
+ "\n\t" " .quad 0xFF9F4929D6B66000"
+ "\n\t" " .quad 0xF7974121DEBE6808"
+ "\n\t" " .quad 0x01EDBD5150BCEC00"
+ "\n\t" " .quad 0xE10D5DB1B05C0CE0"
+
+ "\n\t" " # deskew tables: inverts the sbox's 'skew'"
+ "\n\t" " .Lk_deskew = .-.Laes_consts"
+ "\n\t" " .quad 0x07E4A34047A4E300"
+ "\n\t" " .quad 0x1DFEB95A5DBEF91A"
+ "\n\t" " .quad 0x5F36B5DC83EA6900"
+ "\n\t" " .quad 0x2841C2ABF49D1E77"
+
+ "\n\t" "##"
+ "\n\t" "## Decryption stuff"
+ "\n\t" "## Key schedule constants"
+ "\n\t" "##"
+ "\n\t" " # decryption key schedule: x -> invskew x*9"
+ "\n\t" " .Lk_dks_1 = .-.Laes_consts"
+ "\n\t" " .quad 0xB6116FC87ED9A700"
+ "\n\t" " .quad 0x4AED933482255BFC"
+ "\n\t" " .quad 0x4576516227143300"
+ "\n\t" " .quad 0x8BB89FACE9DAFDCE"
+
+ "\n\t" " # decryption key schedule: invskew x*9 -> invskew x*D"
+ "\n\t" " .Lk_dks_2 = .-.Laes_consts"
+ "\n\t" " .quad 0x27438FEBCCA86400"
+ "\n\t" " .quad 0x4622EE8AADC90561"
+ "\n\t" " .quad 0x815C13CE4F92DD00"
+ "\n\t" " .quad 0x73AEE13CBD602FF2"
+
+ "\n\t" " # decryption key schedule: invskew x*D -> invskew x*B"
+ "\n\t" " .Lk_dks_3 = .-.Laes_consts"
+ "\n\t" " .quad 0x03C4C50201C6C700"
+ "\n\t" " .quad 0xF83F3EF9FA3D3CFB"
+ "\n\t" " .quad 0xEE1921D638CFF700"
+ "\n\t" " .quad 0xA5526A9D7384BC4B"
+
+ "\n\t" " # decryption key schedule: invskew x*B -> invskew x*E + 0x63"
+ "\n\t" " .Lk_dks_4 = .-.Laes_consts"
+ "\n\t" " .quad 0xE3C390B053732000"
+ "\n\t" " .quad 0xA080D3F310306343"
+ "\n\t" " .quad 0xA0CA214B036982E8"
+ "\n\t" " .quad 0x2F45AEC48CE60D67"
+
+ "\n\t" "##"
+ "\n\t" "## Decryption stuff"
+ "\n\t" "## Round function constants"
+ "\n\t" "##"
+ "\n\t" " # decryption input transform"
+ "\n\t" " .Lk_dipt = .-.Laes_consts"
+ "\n\t" " .quad 0x0F505B040B545F00"
+ "\n\t" " .quad 0x154A411E114E451A"
+ "\n\t" " .quad 0x86E383E660056500"
+ "\n\t" " .quad 0x12771772F491F194"
+
+ "\n\t" " # decryption sbox output *9*u, *9*t"
+ "\n\t" " .Lk_dsb9 = .-.Laes_consts"
+ "\n\t" " .quad 0x851C03539A86D600"
+ "\n\t" " .quad 0xCAD51F504F994CC9"
+ "\n\t" " .quad 0xC03B1789ECD74900"
+ "\n\t" " .quad 0x725E2C9EB2FBA565"
+
+ "\n\t" " # decryption sbox output *D*u, *D*t"
+ "\n\t" " .Lk_dsbd = .-.Laes_consts"
+ "\n\t" " .quad 0x7D57CCDFE6B1A200"
+ "\n\t" " .quad 0xF56E9B13882A4439"
+ "\n\t" " .quad 0x3CE2FAF724C6CB00"
+ "\n\t" " .quad 0x2931180D15DEEFD3"
+
+ "\n\t" " # decryption sbox output *B*u, *B*t"
+ "\n\t" " .Lk_dsbb = .-.Laes_consts"
+ "\n\t" " .quad 0xD022649296B44200"
+ "\n\t" " .quad 0x602646F6B0F2D404"
+ "\n\t" " .quad 0xC19498A6CD596700"
+ "\n\t" " .quad 0xF3FF0C3E3255AA6B"
+
+ "\n\t" " # decryption sbox output *E*u, *E*t"
+ "\n\t" " .Lk_dsbe = .-.Laes_consts"
+ "\n\t" " .quad 0x46F2929626D4D000"
+ "\n\t" " .quad 0x2242600464B4F6B0"
+ "\n\t" " .quad 0x0C55A6CDFFAAC100"
+ "\n\t" " .quad 0x9467F36B98593E32"
+
+ "\n\t" " # decryption sbox final output"
+ "\n\t" " .Lk_dsbo = .-.Laes_consts"
+ "\n\t" " .quad 0x1387EA537EF94000"
+ "\n\t" " .quad 0xC7AA6DB9D4943E2D"
+ "\n\t" " .quad 0x12D7560F93441D00"
+ "\n\t" " .quad 0xCA4B8159D8C58E9C"
+
+ "\n\t" " .Lk_ctr_mask = .-.Laes_consts"
+ "\n\t" " .quad 0x0000000000000000"
+ "\n\t" " .quad 0xFFFFFFFFFFFFFFFF"
+
+ "\n\t" " .Lk_ctr_one = .-.Laes_consts"
+ "\n\t" " .quad 0x0000000000000001"
+ "\n\t" " .quad 0x0000000000000000"
+
+ "\n\t" " # ocb mode"
+ "\n\t" " .Lk_ocb_mask = .-.Laes_consts"
+ "\n\t" " .quad 0x0100000001000000"
+ "\n\t" " .quad 0x8680000001000000"
+
+ "\n\t" " # padding"
+ "\n\t" " .Lk_reverse_id = .-.Laes_consts"
+ "\n\t" " .quad 0x08090A0B0C0D0E0F"
+ "\n\t" " .quad 0x0001020304050607"
+ "\n\t" ".size _aes_consts,.-_aes_consts"
+);
+
+#endif /* USE_SSSE3 */
diff --git a/cipher/rijndael.c b/cipher/rijndael.c
index 7a83718..51c36c7 100644
--- a/cipher/rijndael.c
+++ b/cipher/rijndael.c
@@ -99,6 +99,40 @@ extern void _gcry_aes_aesni_cbc_dec (RIJNDAEL_context *ctx,
unsigned char *iv, size_t nblocks);
#endif
+#ifdef USE_SSSE3
+/* SSSE3 (AMD64) vector permutation implementation of AES */
+extern void _gcry_aes_ssse3_do_setkey(RIJNDAEL_context *ctx, const byte *key);
+extern void _gcry_aes_ssse3_prepare_decryption(RIJNDAEL_context *ctx);
+
+extern unsigned int _gcry_aes_ssse3_encrypt (const RIJNDAEL_context *ctx,
+ unsigned char *dst,
+ const unsigned char *src);
+extern unsigned int _gcry_aes_ssse3_decrypt (const RIJNDAEL_context *ctx,
+ unsigned char *dst,
+ const unsigned char *src);
+extern void _gcry_aes_ssse3_cfb_enc (RIJNDAEL_context *ctx,
+ unsigned char *outbuf,
+ const unsigned char *inbuf,
+ unsigned char *iv, size_t nblocks);
+extern void _gcry_aes_ssse3_cbc_enc (RIJNDAEL_context *ctx,
+ unsigned char *outbuf,
+ const unsigned char *inbuf,
+ unsigned char *iv, size_t nblocks,
+ int cbc_mac);
+extern void _gcry_aes_ssse3_ctr_enc (RIJNDAEL_context *ctx,
+ unsigned char *outbuf,
+ const unsigned char *inbuf,
+ unsigned char *ctr, size_t nblocks);
+extern void _gcry_aes_ssse3_cfb_dec (RIJNDAEL_context *ctx,
+ unsigned char *outbuf,
+ const unsigned char *inbuf,
+ unsigned char *iv, size_t nblocks);
+extern void _gcry_aes_ssse3_cbc_dec (RIJNDAEL_context *ctx,
+ unsigned char *outbuf,
+ const unsigned char *inbuf,
+ unsigned char *iv, size_t nblocks);
+#endif
+
#ifdef USE_PADLOCK
extern unsigned int _gcry_aes_padlock_encrypt (const RIJNDAEL_context *ctx,
unsigned char *bx,
@@ -182,7 +216,7 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen)
int rounds;
int i,j, r, t, rconpointer = 0;
int KC;
-#if defined(USE_AESNI) || defined(USE_PADLOCK)
+#if defined(USE_AESNI) || defined(USE_PADLOCK) || defined(USE_SSSE3)
unsigned int hwfeatures;
#endif
@@ -223,7 +257,7 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen)
ctx->rounds = rounds;
-#if defined(USE_AESNI) || defined(USE_PADLOCK)
+#if defined(USE_AESNI) || defined(USE_PADLOCK) || defined(USE_SSSE3)
hwfeatures = _gcry_get_hw_features ();
#endif
@@ -234,6 +268,9 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen)
#ifdef USE_AESNI
ctx->use_aesni = 0;
#endif
+#ifdef USE_SSSE3
+ ctx->use_ssse3 = 0;
+#endif
if (0)
{
@@ -260,6 +297,16 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen)
memcpy (ctx->padlockkey, key, keylen);
}
#endif
+#ifdef USE_SSSE3
+ else if (hwfeatures & HWF_INTEL_SSSE3)
+ {
+ ctx->encrypt_fn = _gcry_aes_ssse3_encrypt;
+ ctx->decrypt_fn = _gcry_aes_ssse3_decrypt;
+ ctx->prefetch_enc_fn = NULL;
+ ctx->prefetch_dec_fn = NULL;
+ ctx->use_ssse3 = 1;
+ }
+#endif
else
{
ctx->encrypt_fn = do_encrypt;
@@ -278,6 +325,10 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen)
else if (ctx->use_aesni)
_gcry_aes_aesni_do_setkey (ctx, key);
#endif
+#ifdef USE_SSSE3
+ else if (ctx->use_ssse3)
+ _gcry_aes_ssse3_do_setkey (ctx, key);
+#endif
else
{
const byte *sbox = ((const byte *)encT) + 1;
@@ -403,6 +454,12 @@ prepare_decryption( RIJNDAEL_context *ctx )
_gcry_aes_aesni_prepare_decryption (ctx);
}
#endif /*USE_AESNI*/
+#ifdef USE_SSSE3
+ else if (ctx->use_ssse3)
+ {
+ _gcry_aes_ssse3_prepare_decryption (ctx);
+ }
+#endif /*USE_SSSE3*/
#ifdef USE_PADLOCK
else if (ctx->use_padlock)
{
@@ -650,6 +707,13 @@ _gcry_aes_cfb_enc (void *context, unsigned char *iv,
burn_depth = 0;
}
#endif /*USE_AESNI*/
+#ifdef USE_SSSE3
+ else if (ctx->use_ssse3)
+ {
+ _gcry_aes_ssse3_cfb_enc (ctx, outbuf, inbuf, iv, nblocks);
+ burn_depth = 0;
+ }
+#endif /*USE_SSSE3*/
else
{
rijndael_cryptfn_t encrypt_fn = ctx->encrypt_fn;
@@ -697,6 +761,13 @@ _gcry_aes_cbc_enc (void *context, unsigned char *iv,
burn_depth = 0;
}
#endif /*USE_AESNI*/
+#ifdef USE_SSSE3
+ else if (ctx->use_ssse3)
+ {
+ _gcry_aes_ssse3_cbc_enc (ctx, outbuf, inbuf, iv, nblocks, cbc_mac);
+ burn_depth = 0;
+ }
+#endif /*USE_SSSE3*/
else
{
rijndael_cryptfn_t encrypt_fn = ctx->encrypt_fn;
@@ -752,6 +823,13 @@ _gcry_aes_ctr_enc (void *context, unsigned char *ctr,
burn_depth = 0;
}
#endif /*USE_AESNI*/
+#ifdef USE_SSSE3
+ else if (ctx->use_ssse3)
+ {
+ _gcry_aes_ssse3_ctr_enc (ctx, outbuf, inbuf, ctr, nblocks);
+ burn_depth = 0;
+ }
+#endif /*USE_SSSE3*/
else
{
union { unsigned char x1[16] ATTR_ALIGNED_16; u32 x32[4]; } tmp;
@@ -986,6 +1064,13 @@ _gcry_aes_cfb_dec (void *context, unsigned char *iv,
burn_depth = 0;
}
#endif /*USE_AESNI*/
+#ifdef USE_SSSE3
+ else if (ctx->use_ssse3)
+ {
+ _gcry_aes_ssse3_cfb_dec (ctx, outbuf, inbuf, iv, nblocks);
+ burn_depth = 0;
+ }
+#endif /*USE_SSSE3*/
else
{
rijndael_cryptfn_t encrypt_fn = ctx->encrypt_fn;
@@ -1032,6 +1117,13 @@ _gcry_aes_cbc_dec (void *context, unsigned char *iv,
burn_depth = 0;
}
#endif /*USE_AESNI*/
+#ifdef USE_SSSE3
+ else if (ctx->use_ssse3)
+ {
+ _gcry_aes_ssse3_cbc_dec (ctx, outbuf, inbuf, iv, nblocks);
+ burn_depth = 0;
+ }
+#endif /*USE_SSSE3*/
else
{
unsigned char savebuf[BLOCKSIZE] ATTR_ALIGNED_16;
diff --git a/configure.ac b/configure.ac
index a4ea990..71c50c0 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1692,6 +1692,9 @@ if test "$found" = "1" ; then
x86_64-*-*)
# Build with the assembly implementation
GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-amd64.lo"
+
+ # Build with the SSSE3 implementation
+ GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-ssse3-amd64.lo"
;;
arm*-*-*)
# Build with the assembly implementation