[PATCH] rijndael-ssse3: move assembly functions to separate source-file
Jussi Kivilinna
jussi.kivilinna at iki.fi
Wed Jan 4 23:35:42 CET 2017
* cipher/Makefile.am: Add 'rijndael-ssse3-amd64-asm.S'.
* cipher/rijndael-ssse3-amd64-asm.S: Moved assembly functions
here ...
* cipher/rijndael-ssse3-amd64.c: ... from this file.
(_gcry_aes_ssse3_enc_preload, _gcry_aes_ssse3_dec_preload)
(_gcry_aes_ssse3_schedule_core, _gcry_aes_ssse3_encrypt_core)
(_gcry_aes_ssse3_decrypt_core): New.
(vpaes_ssse3_prepare_enc, vpaes_ssse3_prepare_dec)
(_gcry_aes_ssse3_do_setkey, _gcry_aes_ssse3_prepare_decryption)
(do_vpaes_ssse3_enc, do_vpaes_ssse3_dec): Update to use external
assembly functions; remove 'aes_const_ptr' variable usage.
(_gcry_aes_ssse3_encrypt, _gcry_aes_ssse3_decrypt)
(_gcry_aes_ssse3_cfb_enc, _gcry_aes_ssse3_cbc_enc)
(_gcry_aes_ssse3_ctr_enc, _gcry_aes_ssse3_cfb_dec)
(_gcry_aes_ssse3_cbc_dec, ssse3_ocb_enc, ssse3_ocb_dec)
(_gcry_aes_ssse3_ocb_auth): Remove 'aes_const_ptr' variable usage.
* configure.ac: Add 'rijndael-ssse3-amd64-asm.lo'.
--
After this change, libgcrypt can be compiled with -flto optimization
enabled on x86-64.
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
---
0 files changed
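
For reference, a minimal sketch of the call pattern this patch switches to (illustrative only; the wrapper name below is made up, but the inline-asm body mirrors do_vpaes_ssse3_enc() in the diff). The SSSE3 cores become ordinary external assembly symbols that are called indirectly through a register operand, instead of file-local labels named inside inline asm, which is presumably what lets the references survive -flto partitioning:

  extern void _gcry_aes_ssse3_encrypt_core(void);

  /* Hypothetical wrapper, for illustration only.  The core uses a custom
   * calling convention: %rax = nrounds - 1, %rdx = key schedule, and the
   * block plus the preloaded constants already sit in XMM registers.  */
  static inline void
  example_call_enc_core (unsigned int middle_rounds, const void *keysched)
  {
    asm volatile ("call *%[core]" "\n\t"
                  : "+a" (middle_rounds), "+d" (keysched)
                  : [core] "r" (_gcry_aes_ssse3_encrypt_core)
                  : "rcx", "rsi", "rdi", "cc", "memory");
  }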
diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 8c9fc0e..fb0b7d2 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -80,7 +80,8 @@ md4.c \
md5.c \
poly1305-sse2-amd64.S poly1305-avx2-amd64.S poly1305-armv7-neon.S \
rijndael.c rijndael-internal.h rijndael-tables.h rijndael-aesni.c \
- rijndael-padlock.c rijndael-amd64.S rijndael-arm.S rijndael-ssse3-amd64.c \
+ rijndael-padlock.c rijndael-amd64.S rijndael-arm.S \
+ rijndael-ssse3-amd64.c rijndael-ssse3-amd64-asm.S \
rijndael-armv8-ce.c rijndael-armv8-aarch32-ce.S rijndael-armv8-aarch64-ce.S \
rijndael-aarch64.S \
rmd160.c \
diff --git a/cipher/rijndael-ssse3-amd64-asm.S b/cipher/rijndael-ssse3-amd64-asm.S
new file mode 100644
index 0000000..3ae55e8
--- /dev/null
+++ b/cipher/rijndael-ssse3-amd64-asm.S
@@ -0,0 +1,853 @@
+/* SSSE3 vector permutation AES for Libgcrypt
+ * Copyright (C) 2014-2017 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ *
+ * The code is based on the public domain library libvpaes version 0.5
+ * available at http://crypto.stanford.edu/vpaes/ and which carries
+ * this notice:
+ *
+ * libvpaes: constant-time SSSE3 AES encryption and decryption.
+ * version 0.5
+ *
+ * By Mike Hamburg, Stanford University, 2009. Public domain.
+ * I wrote essentially all of this code. I did not write the test
+ * vectors; they are the NIST known answer tests. I hereby release all
+ * the code and documentation here that I wrote into the public domain.
+ *
+ * This is an implementation of AES following my paper,
+ * "Accelerating AES with Vector Permute Instructions"
+ * CHES 2009; http://shiftleft.org/papers/vector_aes/
+ */
+
+#if defined(__x86_64__)
+#include <config.h>
+#if defined(HAVE_GCC_INLINE_ASM_SSSE3) && \
+ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+
+#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+# define ELF(...)
+#else
+# define ELF(...) __VA_ARGS__
+#endif
+
+.text
+
+##
+## _gcry_aes_ssse3_enc_preload
+##
+ELF(.type _gcry_aes_ssse3_enc_preload,@function)
+.globl _gcry_aes_ssse3_enc_preload
+_gcry_aes_ssse3_enc_preload:
+ lea .Laes_consts(%rip), %rax
+ movdqa (%rax), %xmm9 # 0F
+ movdqa .Lk_inv (%rax), %xmm10 # inv
+ movdqa .Lk_inv+16(%rax), %xmm11 # inva
+ movdqa .Lk_sb1 (%rax), %xmm13 # sb1u
+ movdqa .Lk_sb1+16(%rax), %xmm12 # sb1t
+ movdqa .Lk_sb2 (%rax), %xmm15 # sb2u
+ movdqa .Lk_sb2+16(%rax), %xmm14 # sb2t
+ ret
+ELF(.size _gcry_aes_ssse3_enc_preload,.-_gcry_aes_ssse3_enc_preload)
+
+##
+## _gcry_aes_ssse3_dec_preload
+##
+ELF(.type _gcry_aes_ssse3_dec_preload,@function)
+.globl _gcry_aes_ssse3_dec_preload
+_gcry_aes_ssse3_dec_preload:
+ lea .Laes_consts(%rip), %rax
+ movdqa (%rax), %xmm9 # 0F
+ movdqa .Lk_inv (%rax), %xmm10 # inv
+ movdqa .Lk_inv+16(%rax), %xmm11 # inva
+ movdqa .Lk_dsb9 (%rax), %xmm13 # sb9u
+ movdqa .Lk_dsb9+16(%rax), %xmm12 # sb9t
+ movdqa .Lk_dsbd (%rax), %xmm15 # sbdu
+ movdqa .Lk_dsbb (%rax), %xmm14 # sbbu
+ movdqa .Lk_dsbe (%rax), %xmm8 # sbeu
+ ret
+ELF(.size _gcry_aes_ssse3_dec_preload,.-_gcry_aes_ssse3_dec_preload)
+
+##
+## Constant-time SSSE3 AES core implementation.
+##
+## By Mike Hamburg (Stanford University), 2009
+## Public domain.
+##
+
+##
+## _aes_encrypt_core
+##
+## AES-encrypt %xmm0.
+##
+## Inputs:
+## %xmm0 = input
+## %xmm9-%xmm15 as in .Laes_preheat
+## (%rdx) = scheduled keys
+## %rax = nrounds - 1
+##
+## Output in %xmm0
+## Clobbers %xmm1-%xmm4, %r9, %r11, %rax, %rcx
+## Preserves %xmm6 - %xmm7 so you get some local vectors
+##
+##
+.align 16
+ELF(.type _gcry_aes_ssse3_encrypt_core,@function)
+.globl _gcry_aes_ssse3_encrypt_core
+_gcry_aes_ssse3_encrypt_core:
+_aes_encrypt_core:
+ lea .Laes_consts(%rip), %rcx
+ leaq .Lk_mc_backward(%rcx), %rdi
+ mov $16, %rsi
+ movdqa .Lk_ipt (%rcx), %xmm2 # iptlo
+ movdqa %xmm9, %xmm1
+ pandn %xmm0, %xmm1
+ psrld $4, %xmm1
+ pand %xmm9, %xmm0
+ pshufb %xmm0, %xmm2
+ movdqa .Lk_ipt+16(%rcx), %xmm0 # ipthi
+ pshufb %xmm1, %xmm0
+ pxor (%rdx),%xmm2
+ pxor %xmm2, %xmm0
+ add $16, %rdx
+ jmp .Laes_entry
+
+.align 8
+.Laes_loop:
+ # middle of middle round
+ movdqa %xmm13, %xmm4 # 4 : sb1u
+ pshufb %xmm2, %xmm4 # 4 = sb1u
+ pxor (%rdx), %xmm4 # 4 = sb1u + k
+ movdqa %xmm12, %xmm0 # 0 : sb1t
+ pshufb %xmm3, %xmm0 # 0 = sb1t
+ pxor %xmm4, %xmm0 # 0 = A
+ movdqa %xmm15, %xmm4 # 4 : sb2u
+ pshufb %xmm2, %xmm4 # 4 = sb2u
+ movdqa .Lk_mc_forward-.Lk_mc_backward(%rsi,%rdi), %xmm1
+ movdqa %xmm14, %xmm2 # 2 : sb2t
+ pshufb %xmm3, %xmm2 # 2 = sb2t
+ pxor %xmm4, %xmm2 # 2 = 2A
+ movdqa %xmm0, %xmm3 # 3 = A
+ pshufb %xmm1, %xmm0 # 0 = B
+ pxor %xmm2, %xmm0 # 0 = 2A+B
+ pshufb (%rsi,%rdi), %xmm3 # 3 = D
+ lea 16(%esi),%esi # next mc
+ pxor %xmm0, %xmm3 # 3 = 2A+B+D
+ lea 16(%rdx),%rdx # next key
+ pshufb %xmm1, %xmm0 # 0 = 2B+C
+ pxor %xmm3, %xmm0 # 0 = 2A+3B+C+D
+ and $48, %rsi # ... mod 4
+ dec %rax # nr--
+
+.Laes_entry:
+ # top of round
+ movdqa %xmm9, %xmm1 # 1 : i
+ pandn %xmm0, %xmm1 # 1 = i<<4
+ psrld $4, %xmm1 # 1 = i
+ pand %xmm9, %xmm0 # 0 = k
+ movdqa %xmm11, %xmm2 # 2 : a/k
+ pshufb %xmm0, %xmm2 # 2 = a/k
+ pxor %xmm1, %xmm0 # 0 = j
+ movdqa %xmm10, %xmm3 # 3 : 1/i
+ pshufb %xmm1, %xmm3 # 3 = 1/i
+ pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k
+ movdqa %xmm10, %xmm4 # 4 : 1/j
+ pshufb %xmm0, %xmm4 # 4 = 1/j
+ pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k
+ movdqa %xmm10, %xmm2 # 2 : 1/iak
+ pshufb %xmm3, %xmm2 # 2 = 1/iak
+ pxor %xmm0, %xmm2 # 2 = io
+ movdqa %xmm10, %xmm3 # 3 : 1/jak
+ pshufb %xmm4, %xmm3 # 3 = 1/jak
+ pxor %xmm1, %xmm3 # 3 = jo
+ jnz .Laes_loop
+
+ # middle of last round
+ movdqa .Lk_sbo(%rcx), %xmm4 # 3 : sbou
+ pshufb %xmm2, %xmm4 # 4 = sbou
+ pxor (%rdx), %xmm4 # 4 = sb1u + k
+ movdqa .Lk_sbo+16(%rcx), %xmm0 # 0 : sbot
+ pshufb %xmm3, %xmm0 # 0 = sb1t
+ pxor %xmm4, %xmm0 # 0 = A
+ pshufb .Lk_sr(%rsi,%rcx), %xmm0
+ ret
+ELF(.size _aes_encrypt_core,.-_aes_encrypt_core)
+
+##
+## Decryption core
+##
+## Same API as encryption core.
+##
+.align 16
+.globl _gcry_aes_ssse3_decrypt_core
+ELF(.type _gcry_aes_ssse3_decrypt_core,@function)
+_gcry_aes_ssse3_decrypt_core:
+_aes_decrypt_core:
+ lea .Laes_consts(%rip), %rcx
+ movl %eax, %esi
+ shll $4, %esi
+ xorl $48, %esi
+ andl $48, %esi
+ movdqa .Lk_dipt (%rcx), %xmm2 # iptlo
+ movdqa %xmm9, %xmm1
+ pandn %xmm0, %xmm1
+ psrld $4, %xmm1
+ pand %xmm9, %xmm0
+ pshufb %xmm0, %xmm2
+ movdqa .Lk_dipt+16(%rcx), %xmm0 # ipthi
+ pshufb %xmm1, %xmm0
+ pxor (%rdx), %xmm2
+ pxor %xmm2, %xmm0
+ movdqa .Lk_mc_forward+48(%rcx), %xmm5
+ lea 16(%rdx), %rdx
+ neg %rax
+ jmp .Laes_dec_entry
+
+.align 16
+.Laes_dec_loop:
+##
+## Inverse mix columns
+##
+ movdqa %xmm13, %xmm4 # 4 : sb9u
+ pshufb %xmm2, %xmm4 # 4 = sb9u
+ pxor (%rdx), %xmm4
+ movdqa %xmm12, %xmm0 # 0 : sb9t
+ pshufb %xmm3, %xmm0 # 0 = sb9t
+ movdqa .Lk_dsbd+16(%rcx),%xmm1 # 1 : sbdt
+ pxor %xmm4, %xmm0 # 0 = ch
+ lea 16(%rdx), %rdx # next round key
+
+ pshufb %xmm5, %xmm0 # MC ch
+ movdqa %xmm15, %xmm4 # 4 : sbdu
+ pshufb %xmm2, %xmm4 # 4 = sbdu
+ pxor %xmm0, %xmm4 # 4 = ch
+ pshufb %xmm3, %xmm1 # 1 = sbdt
+ pxor %xmm4, %xmm1 # 1 = ch
+
+ pshufb %xmm5, %xmm1 # MC ch
+ movdqa %xmm14, %xmm4 # 4 : sbbu
+ pshufb %xmm2, %xmm4 # 4 = sbbu
+ inc %rax # nr--
+ pxor %xmm1, %xmm4 # 4 = ch
+ movdqa .Lk_dsbb+16(%rcx),%xmm0 # 0 : sbbt
+ pshufb %xmm3, %xmm0 # 0 = sbbt
+ pxor %xmm4, %xmm0 # 0 = ch
+
+ pshufb %xmm5, %xmm0 # MC ch
+ movdqa %xmm8, %xmm4 # 4 : sbeu
+ pshufb %xmm2, %xmm4 # 4 = sbeu
+ pshufd $0x93, %xmm5, %xmm5
+ pxor %xmm0, %xmm4 # 4 = ch
+ movdqa .Lk_dsbe+16(%rcx),%xmm0 # 0 : sbet
+ pshufb %xmm3, %xmm0 # 0 = sbet
+ pxor %xmm4, %xmm0 # 0 = ch
+
+.Laes_dec_entry:
+ # top of round
+ movdqa %xmm9, %xmm1 # 1 : i
+ pandn %xmm0, %xmm1 # 1 = i<<4
+ psrld $4, %xmm1 # 1 = i
+ pand %xmm9, %xmm0 # 0 = k
+ movdqa %xmm11, %xmm2 # 2 : a/k
+ pshufb %xmm0, %xmm2 # 2 = a/k
+ pxor %xmm1, %xmm0 # 0 = j
+ movdqa %xmm10, %xmm3 # 3 : 1/i
+ pshufb %xmm1, %xmm3 # 3 = 1/i
+ pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k
+ movdqa %xmm10, %xmm4 # 4 : 1/j
+ pshufb %xmm0, %xmm4 # 4 = 1/j
+ pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k
+ movdqa %xmm10, %xmm2 # 2 : 1/iak
+ pshufb %xmm3, %xmm2 # 2 = 1/iak
+ pxor %xmm0, %xmm2 # 2 = io
+ movdqa %xmm10, %xmm3 # 3 : 1/jak
+ pshufb %xmm4, %xmm3 # 3 = 1/jak
+ pxor %xmm1, %xmm3 # 3 = jo
+ jnz .Laes_dec_loop
+
+ # middle of last round
+ movdqa .Lk_dsbo(%rcx), %xmm4 # 3 : sbou
+ pshufb %xmm2, %xmm4 # 4 = sbou
+ pxor (%rdx), %xmm4 # 4 = sb1u + k
+ movdqa .Lk_dsbo+16(%rcx), %xmm0 # 0 : sbot
+ pshufb %xmm3, %xmm0 # 0 = sb1t
+ pxor %xmm4, %xmm0 # 0 = A
+ pshufb .Lk_sr(%rsi,%rcx), %xmm0
+ ret
+ELF(.size _aes_decrypt_core,.-_aes_decrypt_core)
+
+########################################################
+## ##
+## AES key schedule ##
+## ##
+########################################################
+
+.align 16
+.globl _gcry_aes_ssse3_schedule_core
+ELF(.type _gcry_aes_ssse3_schedule_core,@function)
+_gcry_aes_ssse3_schedule_core:
+_aes_schedule_core:
+ # rdi = key
+ # rsi = size in bits
+ # rdx = buffer
+ # rcx = direction. 0=encrypt, 1=decrypt
+
+ # load the tables
+ lea .Laes_consts(%rip), %r10
+ movdqa (%r10), %xmm9 # 0F
+ movdqa .Lk_inv (%r10), %xmm10 # inv
+ movdqa .Lk_inv+16(%r10), %xmm11 # inva
+ movdqa .Lk_sb1 (%r10), %xmm13 # sb1u
+ movdqa .Lk_sb1+16(%r10), %xmm12 # sb1t
+ movdqa .Lk_sb2 (%r10), %xmm15 # sb2u
+ movdqa .Lk_sb2+16(%r10), %xmm14 # sb2t
+
+ movdqa .Lk_rcon(%r10), %xmm8 # load rcon
+ movdqu (%rdi), %xmm0 # load key (unaligned)
+
+ # input transform
+ movdqu %xmm0, %xmm3
+ lea .Lk_ipt(%r10), %r11
+ call .Laes_schedule_transform
+ movdqu %xmm0, %xmm7
+
+ test %rcx, %rcx
+ jnz .Laes_schedule_am_decrypting
+
+ # encrypting, output zeroth round key after transform
+ movdqa %xmm0, (%rdx)
+ jmp .Laes_schedule_go
+
+.Laes_schedule_am_decrypting:
+ # decrypting, output zeroth round key after shiftrows
+ pshufb .Lk_sr(%r8,%r10),%xmm3
+ movdqa %xmm3, (%rdx)
+ xor $48, %r8
+
+.Laes_schedule_go:
+ cmp $192, %rsi
+ je .Laes_schedule_192
+ cmp $256, %rsi
+ je .Laes_schedule_256
+ # 128: fall through
+
+##
+## .Laes_schedule_128
+##
+## 128-bit specific part of key schedule.
+##
+## This schedule is really simple, because all its parts
+## are accomplished by the subroutines.
+##
+.Laes_schedule_128:
+ mov $10, %rsi
+
+.Laes_schedule_128_L:
+ call .Laes_schedule_round
+ dec %rsi
+ jz .Laes_schedule_mangle_last
+ call .Laes_schedule_mangle # write output
+ jmp .Laes_schedule_128_L
+
+##
+## .Laes_schedule_192
+##
+## 192-bit specific part of key schedule.
+##
+## The main body of this schedule is the same as the 128-bit
+## schedule, but with more smearing. The long, high side is
+## stored in %xmm7 as before, and the short, low side is in
+## the high bits of %xmm6.
+##
+## This schedule is somewhat nastier, however, because each
+## round produces 192 bits of key material, or 1.5 round keys.
+## Therefore, on each cycle we do 2 rounds and produce 3 round
+## keys.
+##
+.Laes_schedule_192:
+ movdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
+ call .Laes_schedule_transform # input transform
+ pshufd $0x0E, %xmm0, %xmm6
+ pslldq $8, %xmm6 # clobber low side with zeros
+ mov $4, %rsi
+
+.Laes_schedule_192_L:
+ call .Laes_schedule_round
+ palignr $8,%xmm6,%xmm0
+ call .Laes_schedule_mangle # save key n
+ call .Laes_schedule_192_smear
+ call .Laes_schedule_mangle # save key n+1
+ call .Laes_schedule_round
+ dec %rsi
+ jz .Laes_schedule_mangle_last
+ call .Laes_schedule_mangle # save key n+2
+ call .Laes_schedule_192_smear
+ jmp .Laes_schedule_192_L
+
+##
+## .Laes_schedule_192_smear
+##
+## Smear the short, low side in the 192-bit key schedule.
+##
+## Inputs:
+## %xmm7: high side, b a x y
+## %xmm6: low side, d c 0 0
+## %xmm13: 0
+##
+## Outputs:
+## %xmm6: b+c+d b+c 0 0
+## %xmm0: b+c+d b+c b a
+##
+.Laes_schedule_192_smear:
+ pshufd $0x80, %xmm6, %xmm0 # d c 0 0 -> c 0 0 0
+ pxor %xmm0, %xmm6 # -> c+d c 0 0
+ pshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
+ pxor %xmm6, %xmm0 # -> b+c+d b+c b a
+ pshufd $0x0E, %xmm0, %xmm6
+ pslldq $8, %xmm6 # clobber low side with zeros
+ ret
+
+##
+## .Laes_schedule_256
+##
+## 256-bit specific part of key schedule.
+##
+## The structure here is very similar to the 128-bit
+## schedule, but with an additional 'low side' in
+## %xmm6. The low side's rounds are the same as the
+## high side's, except no rcon and no rotation.
+##
+.Laes_schedule_256:
+ movdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
+ call .Laes_schedule_transform # input transform
+ mov $7, %rsi
+
+.Laes_schedule_256_L:
+ call .Laes_schedule_mangle # output low result
+ movdqa %xmm0, %xmm6 # save cur_lo in xmm6
+
+ # high round
+ call .Laes_schedule_round
+ dec %rsi
+ jz .Laes_schedule_mangle_last
+ call .Laes_schedule_mangle
+
+ # low round. swap xmm7 and xmm6
+ pshufd $0xFF, %xmm0, %xmm0
+ movdqa %xmm7, %xmm5
+ movdqa %xmm6, %xmm7
+ call .Laes_schedule_low_round
+ movdqa %xmm5, %xmm7
+
+ jmp .Laes_schedule_256_L
+
+##
+## .Laes_schedule_round
+##
+## Runs one main round of the key schedule on %xmm0, %xmm7
+##
+## Specifically, runs subbytes on the high dword of %xmm0
+## then rotates it by one byte and xors into the low dword of
+## %xmm7.
+##
+## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
+## next rcon.
+##
+## Smears the dwords of %xmm7 by xoring the low into the
+## second low, result into third, result into highest.
+##
+## Returns results in %xmm7 = %xmm0.
+## Clobbers %xmm1-%xmm4, %r11.
+##
+.Laes_schedule_round:
+ # extract rcon from xmm8
+ pxor %xmm1, %xmm1
+ palignr $15, %xmm8, %xmm1
+ palignr $15, %xmm8, %xmm8
+ pxor %xmm1, %xmm7
+
+ # rotate
+ pshufd $0xFF, %xmm0, %xmm0
+ palignr $1, %xmm0, %xmm0
+
+ # fall through...
+
+ # low round: same as high round, but no rotation and no rcon.
+.Laes_schedule_low_round:
+ # smear xmm7
+ movdqa %xmm7, %xmm1
+ pslldq $4, %xmm7
+ pxor %xmm1, %xmm7
+ movdqa %xmm7, %xmm1
+ pslldq $8, %xmm7
+ pxor %xmm1, %xmm7
+ pxor .Lk_s63(%r10), %xmm7
+
+ # subbytes
+ movdqa %xmm9, %xmm1
+ pandn %xmm0, %xmm1
+ psrld $4, %xmm1 # 1 = i
+ pand %xmm9, %xmm0 # 0 = k
+ movdqa %xmm11, %xmm2 # 2 : a/k
+ pshufb %xmm0, %xmm2 # 2 = a/k
+ pxor %xmm1, %xmm0 # 0 = j
+ movdqa %xmm10, %xmm3 # 3 : 1/i
+ pshufb %xmm1, %xmm3 # 3 = 1/i
+ pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k
+ movdqa %xmm10, %xmm4 # 4 : 1/j
+ pshufb %xmm0, %xmm4 # 4 = 1/j
+ pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k
+ movdqa %xmm10, %xmm2 # 2 : 1/iak
+ pshufb %xmm3, %xmm2 # 2 = 1/iak
+ pxor %xmm0, %xmm2 # 2 = io
+ movdqa %xmm10, %xmm3 # 3 : 1/jak
+ pshufb %xmm4, %xmm3 # 3 = 1/jak
+ pxor %xmm1, %xmm3 # 3 = jo
+ movdqa .Lk_sb1(%r10), %xmm4 # 4 : sbou
+ pshufb %xmm2, %xmm4 # 4 = sbou
+ movdqa .Lk_sb1+16(%r10), %xmm0 # 0 : sbot
+ pshufb %xmm3, %xmm0 # 0 = sb1t
+ pxor %xmm4, %xmm0 # 0 = sbox output
+
+ # add in smeared stuff
+ pxor %xmm7, %xmm0
+ movdqa %xmm0, %xmm7
+ ret
+
+##
+## .Laes_schedule_transform
+##
+## Linear-transform %xmm0 according to tables at (%r11)
+##
+## Requires that %xmm9 = 0x0F0F... as in preheat
+## Output in %xmm0
+## Clobbers %xmm1, %xmm2
+##
+.Laes_schedule_transform:
+ movdqa %xmm9, %xmm1
+ pandn %xmm0, %xmm1
+ psrld $4, %xmm1
+ pand %xmm9, %xmm0
+ movdqa (%r11), %xmm2 # lo
+ pshufb %xmm0, %xmm2
+ movdqa 16(%r11), %xmm0 # hi
+ pshufb %xmm1, %xmm0
+ pxor %xmm2, %xmm0
+ ret
+
+##
+## .Laes_schedule_mangle
+##
+## Mangle xmm0 from (basis-transformed) standard version
+## to our version.
+##
+## On encrypt,
+## xor with 0x63
+## multiply by circulant 0,1,1,1
+## apply shiftrows transform
+##
+## On decrypt,
+## xor with 0x63
+## multiply by 'inverse mixcolumns' circulant E,B,D,9
+## deskew
+## apply shiftrows transform
+##
+##
+## Writes out to (%rdx), and increments or decrements it
+## Keeps track of round number mod 4 in %r8
+## Preserves xmm0
+## Clobbers xmm1-xmm5
+##
+.Laes_schedule_mangle:
+ movdqa %xmm0, %xmm4 # save xmm0 for later
+ movdqa .Lk_mc_forward(%r10),%xmm5
+ test %rcx, %rcx
+ jnz .Laes_schedule_mangle_dec
+
+ # encrypting
+ add $16, %rdx
+ pxor .Lk_s63(%r10),%xmm4
+ pshufb %xmm5, %xmm4
+ movdqa %xmm4, %xmm3
+ pshufb %xmm5, %xmm4
+ pxor %xmm4, %xmm3
+ pshufb %xmm5, %xmm4
+ pxor %xmm4, %xmm3
+
+ jmp .Laes_schedule_mangle_both
+
+.Laes_schedule_mangle_dec:
+ lea .Lk_dks_1(%r10), %r11 # first table: *9
+ call .Laes_schedule_transform
+ movdqa %xmm0, %xmm3
+ pshufb %xmm5, %xmm3
+
+ add $32, %r11 # next table: *B
+ call .Laes_schedule_transform
+ pxor %xmm0, %xmm3
+ pshufb %xmm5, %xmm3
+
+ add $32, %r11 # next table: *D
+ call .Laes_schedule_transform
+ pxor %xmm0, %xmm3
+ pshufb %xmm5, %xmm3
+
+ add $32, %r11 # next table: *E
+ call .Laes_schedule_transform
+ pxor %xmm0, %xmm3
+ pshufb %xmm5, %xmm3
+
+ movdqa %xmm4, %xmm0 # restore %xmm0
+ add $-16, %rdx
+
+.Laes_schedule_mangle_both:
+ pshufb .Lk_sr(%r8,%r10),%xmm3
+ add $-16, %r8
+ and $48, %r8
+ movdqa %xmm3, (%rdx)
+ ret
+
+##
+## .Laes_schedule_mangle_last
+##
+## Mangler for last round of key schedule
+## Mangles %xmm0
+## when encrypting, outputs out(%xmm0) ^ 63
+## when decrypting, outputs unskew(%xmm0)
+##
+## Always called right before return... jumps to cleanup and exits
+##
+.Laes_schedule_mangle_last:
+ # schedule last round key from xmm0
+ lea .Lk_deskew(%r10),%r11 # prepare to deskew
+ test %rcx, %rcx
+ jnz .Laes_schedule_mangle_last_dec
+
+ # encrypting
+ pshufb .Lk_sr(%r8,%r10),%xmm0 # output permute
+ lea .Lk_opt(%r10), %r11 # prepare to output transform
+ add $32, %rdx
+
+.Laes_schedule_mangle_last_dec:
+ add $-16, %rdx
+ pxor .Lk_s63(%r10), %xmm0
+ call .Laes_schedule_transform # output transform
+ movdqa %xmm0, (%rdx) # save last key
+
+ #_aes_cleanup
+ pxor %xmm0, %xmm0
+ pxor %xmm1, %xmm1
+ pxor %xmm2, %xmm2
+ pxor %xmm3, %xmm3
+ pxor %xmm4, %xmm4
+ pxor %xmm5, %xmm5
+ pxor %xmm6, %xmm6
+ pxor %xmm7, %xmm7
+ pxor %xmm8, %xmm8
+ ret
+ELF(.size _aes_schedule_core,.-_aes_schedule_core)
+
+########################################################
+## ##
+## Constants ##
+## ##
+########################################################
+
+.align 16
+ELF(.type _aes_consts,@object)
+.Laes_consts:
+_aes_consts:
+ # s0F
+ .Lk_s0F = .-.Laes_consts
+ .quad 0x0F0F0F0F0F0F0F0F
+ .quad 0x0F0F0F0F0F0F0F0F
+
+ # input transform (lo, hi)
+ .Lk_ipt = .-.Laes_consts
+ .quad 0xC2B2E8985A2A7000
+ .quad 0xCABAE09052227808
+ .quad 0x4C01307D317C4D00
+ .quad 0xCD80B1FCB0FDCC81
+
+ # inv, inva
+ .Lk_inv = .-.Laes_consts
+ .quad 0x0E05060F0D080180
+ .quad 0x040703090A0B0C02
+ .quad 0x01040A060F0B0780
+ .quad 0x030D0E0C02050809
+
+ # sb1u, sb1t
+ .Lk_sb1 = .-.Laes_consts
+ .quad 0xB19BE18FCB503E00
+ .quad 0xA5DF7A6E142AF544
+ .quad 0x3618D415FAE22300
+ .quad 0x3BF7CCC10D2ED9EF
+
+
+ # sb2u, sb2t
+ .Lk_sb2 = .-.Laes_consts
+ .quad 0xE27A93C60B712400
+ .quad 0x5EB7E955BC982FCD
+ .quad 0x69EB88400AE12900
+ .quad 0xC2A163C8AB82234A
+
+ # sbou, sbot
+ .Lk_sbo = .-.Laes_consts
+ .quad 0xD0D26D176FBDC700
+ .quad 0x15AABF7AC502A878
+ .quad 0xCFE474A55FBB6A00
+ .quad 0x8E1E90D1412B35FA
+
+ # mc_forward
+ .Lk_mc_forward = .-.Laes_consts
+ .quad 0x0407060500030201
+ .quad 0x0C0F0E0D080B0A09
+ .quad 0x080B0A0904070605
+ .quad 0x000302010C0F0E0D
+ .quad 0x0C0F0E0D080B0A09
+ .quad 0x0407060500030201
+ .quad 0x000302010C0F0E0D
+ .quad 0x080B0A0904070605
+
+ # mc_backward
+ .Lk_mc_backward = .-.Laes_consts
+ .quad 0x0605040702010003
+ .quad 0x0E0D0C0F0A09080B
+ .quad 0x020100030E0D0C0F
+ .quad 0x0A09080B06050407
+ .quad 0x0E0D0C0F0A09080B
+ .quad 0x0605040702010003
+ .quad 0x0A09080B06050407
+ .quad 0x020100030E0D0C0F
+
+ # sr
+ .Lk_sr = .-.Laes_consts
+ .quad 0x0706050403020100
+ .quad 0x0F0E0D0C0B0A0908
+ .quad 0x030E09040F0A0500
+ .quad 0x0B06010C07020D08
+ .quad 0x0F060D040B020900
+ .quad 0x070E050C030A0108
+ .quad 0x0B0E0104070A0D00
+ .quad 0x0306090C0F020508
+
+ # rcon
+ .Lk_rcon = .-.Laes_consts
+ .quad 0x1F8391B9AF9DEEB6
+ .quad 0x702A98084D7C7D81
+
+ # s63: all equal to 0x63 transformed
+ .Lk_s63 = .-.Laes_consts
+ .quad 0x5B5B5B5B5B5B5B5B
+ .quad 0x5B5B5B5B5B5B5B5B
+
+ # output transform
+ .Lk_opt = .-.Laes_consts
+ .quad 0xFF9F4929D6B66000
+ .quad 0xF7974121DEBE6808
+ .quad 0x01EDBD5150BCEC00
+ .quad 0xE10D5DB1B05C0CE0
+
+ # deskew tables: inverts the sbox's 'skew'
+ .Lk_deskew = .-.Laes_consts
+ .quad 0x07E4A34047A4E300
+ .quad 0x1DFEB95A5DBEF91A
+ .quad 0x5F36B5DC83EA6900
+ .quad 0x2841C2ABF49D1E77
+
+##
+## Decryption stuff
+## Key schedule constants
+##
+ # decryption key schedule: x -> invskew x*9
+ .Lk_dks_1 = .-.Laes_consts
+ .quad 0xB6116FC87ED9A700
+ .quad 0x4AED933482255BFC
+ .quad 0x4576516227143300
+ .quad 0x8BB89FACE9DAFDCE
+
+ # decryption key schedule: invskew x*9 -> invskew x*D
+ .Lk_dks_2 = .-.Laes_consts
+ .quad 0x27438FEBCCA86400
+ .quad 0x4622EE8AADC90561
+ .quad 0x815C13CE4F92DD00
+ .quad 0x73AEE13CBD602FF2
+
+ # decryption key schedule: invskew x*D -> invskew x*B
+ .Lk_dks_3 = .-.Laes_consts
+ .quad 0x03C4C50201C6C700
+ .quad 0xF83F3EF9FA3D3CFB
+ .quad 0xEE1921D638CFF700
+ .quad 0xA5526A9D7384BC4B
+
+ # decryption key schedule: invskew x*B -> invskew x*E + 0x63
+ .Lk_dks_4 = .-.Laes_consts
+ .quad 0xE3C390B053732000
+ .quad 0xA080D3F310306343
+ .quad 0xA0CA214B036982E8
+ .quad 0x2F45AEC48CE60D67
+
+##
+## Decryption stuff
+## Round function constants
+##
+ # decryption input transform
+ .Lk_dipt = .-.Laes_consts
+ .quad 0x0F505B040B545F00
+ .quad 0x154A411E114E451A
+ .quad 0x86E383E660056500
+ .quad 0x12771772F491F194
+
+ # decryption sbox output *9*u, *9*t
+ .Lk_dsb9 = .-.Laes_consts
+ .quad 0x851C03539A86D600
+ .quad 0xCAD51F504F994CC9
+ .quad 0xC03B1789ECD74900
+ .quad 0x725E2C9EB2FBA565
+
+ # decryption sbox output *D*u, *D*t
+ .Lk_dsbd = .-.Laes_consts
+ .quad 0x7D57CCDFE6B1A200
+ .quad 0xF56E9B13882A4439
+ .quad 0x3CE2FAF724C6CB00
+ .quad 0x2931180D15DEEFD3
+
+ # decryption sbox output *B*u, *B*t
+ .Lk_dsbb = .-.Laes_consts
+ .quad 0xD022649296B44200
+ .quad 0x602646F6B0F2D404
+ .quad 0xC19498A6CD596700
+ .quad 0xF3FF0C3E3255AA6B
+
+ # decryption sbox output *E*u, *E*t
+ .Lk_dsbe = .-.Laes_consts
+ .quad 0x46F2929626D4D000
+ .quad 0x2242600464B4F6B0
+ .quad 0x0C55A6CDFFAAC100
+ .quad 0x9467F36B98593E32
+
+ # decryption sbox final output
+ .Lk_dsbo = .-.Laes_consts
+ .quad 0x1387EA537EF94000
+ .quad 0xC7AA6DB9D4943E2D
+ .quad 0x12D7560F93441D00
+ .quad 0xCA4B8159D8C58E9C
+ELF(.size _aes_consts,.-_aes_consts)
+
+#endif
+#endif
diff --git a/cipher/rijndael-ssse3-amd64.c b/cipher/rijndael-ssse3-amd64.c
index 2adb73f..25d1849 100644
--- a/cipher/rijndael-ssse3-amd64.c
+++ b/cipher/rijndael-ssse3-amd64.c
@@ -1,5 +1,5 @@
/* SSSE3 vector permutation AES for Libgcrypt
- * Copyright (C) 2014-2015 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ * Copyright (C) 2014-2017 Jussi Kivilinna <jussi.kivilinna@iki.fi>
*
* This file is part of Libgcrypt.
*
@@ -57,11 +57,22 @@
#endif
+/* Assembly functions in rijndael-ssse3-amd64-asm.S. Note that these
+ have a custom calling convention and need to be called from assembly
+ blocks, not directly. */
+extern void _gcry_aes_ssse3_enc_preload(void);
+extern void _gcry_aes_ssse3_dec_preload(void);
+extern void _gcry_aes_ssse3_schedule_core(void);
+extern void _gcry_aes_ssse3_encrypt_core(void);
+extern void _gcry_aes_ssse3_decrypt_core(void);
+
+
+
/* Two macros to be called prior and after the use of SSSE3
- instructions. There should be no external function calls between
- the use of these macros. There purpose is to make sure that the
- SSE registers are cleared and won't reveal any information about
- the key or the data. */
+ instructions. There should be no external function calls between
+ the use of these macros. Their purpose is to make sure that the
+ SSE registers are cleared and won't reveal any information about
+ the key or the data. */
#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
# define SSSE3_STATE_SIZE (16 * 10)
/* XMM6-XMM15 are callee-saved registers on WIN64. */
@@ -115,34 +126,19 @@
::: "memory" )
#endif
-#define vpaes_ssse3_prepare_enc(const_ptr) \
+#define vpaes_ssse3_prepare_enc() \
vpaes_ssse3_prepare(); \
- asm volatile ("lea .Laes_consts(%%rip), %q0 \n\t" \
- "movdqa (%q0), %%xmm9 # 0F \n\t" \
- "movdqa .Lk_inv (%q0), %%xmm10 # inv \n\t" \
- "movdqa .Lk_inv+16(%q0), %%xmm11 # inva \n\t" \
- "movdqa .Lk_sb1 (%q0), %%xmm13 # sb1u \n\t" \
- "movdqa .Lk_sb1+16(%q0), %%xmm12 # sb1t \n\t" \
- "movdqa .Lk_sb2 (%q0), %%xmm15 # sb2u \n\t" \
- "movdqa .Lk_sb2+16(%q0), %%xmm14 # sb2t \n\t" \
- : "=c" (const_ptr) \
+ asm volatile ("call *%[core] \n\t" \
: \
- : "memory" )
+ : [core] "r" (_gcry_aes_ssse3_enc_preload) \
+ : "rax", "cc", "memory" )
-#define vpaes_ssse3_prepare_dec(const_ptr) \
+#define vpaes_ssse3_prepare_dec() \
vpaes_ssse3_prepare(); \
- asm volatile ("lea .Laes_consts(%%rip), %q0 \n\t" \
- "movdqa (%q0), %%xmm9 # 0F \n\t" \
- "movdqa .Lk_inv (%q0), %%xmm10 # inv \n\t" \
- "movdqa .Lk_inv+16(%q0), %%xmm11 # inva \n\t" \
- "movdqa .Lk_dsb9 (%q0), %%xmm13 # sb9u \n\t" \
- "movdqa .Lk_dsb9+16(%q0), %%xmm12 # sb9t \n\t" \
- "movdqa .Lk_dsbd (%q0), %%xmm15 # sbdu \n\t" \
- "movdqa .Lk_dsbb (%q0), %%xmm14 # sbbu \n\t" \
- "movdqa .Lk_dsbe (%q0), %%xmm8 # sbeu \n\t" \
- : "=c" (const_ptr) \
+ asm volatile ("call *%[core] \n\t" \
: \
- : "memory" )
+ : [core] "r" (_gcry_aes_ssse3_dec_preload) \
+ : "rax", "cc", "memory" )
@@ -159,9 +155,10 @@ _gcry_aes_ssse3_do_setkey (RIJNDAEL_context *ctx, const byte *key)
"leaq %[buf], %%rdx" "\n\t"
"movl %[dir], %%ecx" "\n\t"
"movl %[rotoffs], %%r8d" "\n\t"
- "call _aes_schedule_core" "\n\t"
+ "call *%[core]" "\n\t"
:
- : [key] "m" (*key),
+ : [core] "r" (&_gcry_aes_ssse3_schedule_core),
+ [key] "m" (*key),
[bits] "g" (keybits),
[buf] "m" (ctx->keyschenc32[0][0]),
[dir] "g" (0),
@@ -169,10 +166,31 @@ _gcry_aes_ssse3_do_setkey (RIJNDAEL_context *ctx, const byte *key)
: "r8", "r9", "r10", "r11", "rax", "rcx", "rdx", "rdi", "rsi",
"cc", "memory");
- vpaes_ssse3_cleanup();
-
/* Save key for setting up decryption. */
- memcpy(&ctx->keyschdec32[0][0], key, keybits / 8);
+ if (keybits > 192)
+ asm volatile ("movdqu (%[src]), %%xmm0\n\t"
+ "movdqu 16(%[src]), %%xmm1\n\t"
+ "movdqu %%xmm0, (%[dst])\n\t"
+ "movdqu %%xmm1, 16(%[dst])\n\t"
+ : /* No output */
+ : [dst] "r" (&ctx->keyschdec32[0][0]), [src] "r" (key)
+ : "memory" );
+ else if (keybits == 192)
+ asm volatile ("movdqu (%[src]), %%xmm0\n\t"
+ "movq 16(%[src]), %%xmm1\n\t"
+ "movdqu %%xmm0, (%[dst])\n\t"
+ "movq %%xmm1, 16(%[dst])\n\t"
+ : /* No output */
+ : [dst] "r" (&ctx->keyschdec32[0][0]), [src] "r" (key)
+ : "memory" );
+ else
+ asm volatile ("movdqu (%[src]), %%xmm0\n\t"
+ "movdqu %%xmm0, (%[dst])\n\t"
+ : /* No output */
+ : [dst] "r" (&ctx->keyschdec32[0][0]), [src] "r" (key)
+ : "memory" );
+
+ vpaes_ssse3_cleanup();
}
@@ -190,9 +208,10 @@ _gcry_aes_ssse3_prepare_decryption (RIJNDAEL_context *ctx)
"leaq %[buf], %%rdx" "\n\t"
"movl %[dir], %%ecx" "\n\t"
"movl %[rotoffs], %%r8d" "\n\t"
- "call _aes_schedule_core" "\n\t"
+ "call *%[core]" "\n\t"
:
- : [key] "m" (ctx->keyschdec32[0][0]),
+ : [core] "r" (_gcry_aes_ssse3_schedule_core),
+ [key] "m" (ctx->keyschdec32[0][0]),
[bits] "g" (keybits),
[buf] "m" (ctx->keyschdec32[ctx->rounds][0]),
[dir] "g" (1),
@@ -207,32 +226,30 @@ _gcry_aes_ssse3_prepare_decryption (RIJNDAEL_context *ctx)
/* Encrypt one block using the Intel SSSE3 instructions. Block is input
* and output through SSE register xmm0. */
static inline void
-do_vpaes_ssse3_enc (const RIJNDAEL_context *ctx, unsigned int nrounds,
- const void *aes_const_ptr)
+do_vpaes_ssse3_enc (const RIJNDAEL_context *ctx, unsigned int nrounds)
{
unsigned int middle_rounds = nrounds - 1;
const void *keysched = ctx->keyschenc32;
- asm volatile ("call _aes_encrypt_core" "\n\t"
- : "+a" (middle_rounds), "+d" (keysched)
- : "c" (aes_const_ptr)
- : "rdi", "rsi", "cc", "memory");
+ asm volatile ("call *%[core]" "\n\t"
+ : "+a" (middle_rounds), "+d" (keysched)
+ : [core] "r" (_gcry_aes_ssse3_encrypt_core)
+ : "rcx", "rsi", "rdi", "cc", "memory");
}
/* Decrypt one block using the Intel SSSE3 instructions. Block is input
* and output through SSE register xmm0. */
static inline void
-do_vpaes_ssse3_dec (const RIJNDAEL_context *ctx, unsigned int nrounds,
- const void *aes_const_ptr)
+do_vpaes_ssse3_dec (const RIJNDAEL_context *ctx, unsigned int nrounds)
{
unsigned int middle_rounds = nrounds - 1;
const void *keysched = ctx->keyschdec32;
- asm volatile ("call _aes_decrypt_core" "\n\t"
+ asm volatile ("call *%[core]" "\n\t"
: "+a" (middle_rounds), "+d" (keysched)
- : "c" (aes_const_ptr)
- : "rsi", "cc", "memory");
+ : [core] "r" (_gcry_aes_ssse3_decrypt_core)
+ : "rcx", "rsi", "cc", "memory");
}
@@ -241,15 +258,14 @@ _gcry_aes_ssse3_encrypt (const RIJNDAEL_context *ctx, unsigned char *dst,
const unsigned char *src)
{
unsigned int nrounds = ctx->rounds;
- const void *aes_const_ptr;
byte ssse3_state[SSSE3_STATE_SIZE];
- vpaes_ssse3_prepare_enc (aes_const_ptr);
+ vpaes_ssse3_prepare_enc ();
asm volatile ("movdqu %[src], %%xmm0\n\t"
:
: [src] "m" (*src)
: "memory" );
- do_vpaes_ssse3_enc (ctx, nrounds, aes_const_ptr);
+ do_vpaes_ssse3_enc (ctx, nrounds);
asm volatile ("movdqu %%xmm0, %[dst]\n\t"
: [dst] "=m" (*dst)
:
@@ -265,10 +281,9 @@ _gcry_aes_ssse3_cfb_enc (RIJNDAEL_context *ctx, unsigned char *outbuf,
size_t nblocks)
{
unsigned int nrounds = ctx->rounds;
- const void *aes_const_ptr;
byte ssse3_state[SSSE3_STATE_SIZE];
- vpaes_ssse3_prepare_enc (aes_const_ptr);
+ vpaes_ssse3_prepare_enc ();
asm volatile ("movdqu %[iv], %%xmm0\n\t"
: /* No output */
@@ -277,7 +292,7 @@ _gcry_aes_ssse3_cfb_enc (RIJNDAEL_context *ctx, unsigned char *outbuf,
for ( ;nblocks; nblocks-- )
{
- do_vpaes_ssse3_enc (ctx, nrounds, aes_const_ptr);
+ do_vpaes_ssse3_enc (ctx, nrounds);
asm volatile ("movdqu %[inbuf], %%xmm1\n\t"
"pxor %%xmm1, %%xmm0\n\t"
@@ -305,10 +320,9 @@ _gcry_aes_ssse3_cbc_enc (RIJNDAEL_context *ctx, unsigned char *outbuf,
size_t nblocks, int cbc_mac)
{
unsigned int nrounds = ctx->rounds;
- const void *aes_const_ptr;
byte ssse3_state[SSSE3_STATE_SIZE];
- vpaes_ssse3_prepare_enc (aes_const_ptr);
+ vpaes_ssse3_prepare_enc ();
asm volatile ("movdqu %[iv], %%xmm7\n\t"
: /* No output */
@@ -323,7 +337,7 @@ _gcry_aes_ssse3_cbc_enc (RIJNDAEL_context *ctx, unsigned char *outbuf,
: [inbuf] "m" (*inbuf)
: "memory" );
- do_vpaes_ssse3_enc (ctx, nrounds, aes_const_ptr);
+ do_vpaes_ssse3_enc (ctx, nrounds);
asm volatile ("movdqa %%xmm0, %%xmm7\n\t"
"movdqu %%xmm0, %[outbuf]\n\t"
@@ -353,11 +367,10 @@ _gcry_aes_ssse3_ctr_enc (RIJNDAEL_context *ctx, unsigned char *outbuf,
static const unsigned char be_mask[16] __attribute__ ((aligned (16))) =
{ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
unsigned int nrounds = ctx->rounds;
- const void *aes_const_ptr;
byte ssse3_state[SSSE3_STATE_SIZE];
u64 ctrlow;
- vpaes_ssse3_prepare_enc (aes_const_ptr);
+ vpaes_ssse3_prepare_enc ();
asm volatile ("movdqa %[mask], %%xmm6\n\t" /* Preload mask */
"movdqa (%[ctr]), %%xmm7\n\t" /* Preload CTR */
@@ -388,10 +401,10 @@ _gcry_aes_ssse3_ctr_enc (RIJNDAEL_context *ctx, unsigned char *outbuf,
"pshufb %%xmm6, %%xmm7\n\t"
: [ctrlow] "+r" (ctrlow)
- : [ctr] "r" (ctr)
+ :
: "cc", "memory");
- do_vpaes_ssse3_enc (ctx, nrounds, aes_const_ptr);
+ do_vpaes_ssse3_enc (ctx, nrounds);
asm volatile ("movdqu %[src], %%xmm1\n\t" /* xmm1 := input */
"pxor %%xmm1, %%xmm0\n\t" /* EncCTR ^= input */
@@ -418,15 +431,14 @@ _gcry_aes_ssse3_decrypt (const RIJNDAEL_context *ctx, unsigned char *dst,
const unsigned char *src)
{
unsigned int nrounds = ctx->rounds;
- const void *aes_const_ptr;
byte ssse3_state[SSSE3_STATE_SIZE];
- vpaes_ssse3_prepare_dec (aes_const_ptr);
+ vpaes_ssse3_prepare_dec ();
asm volatile ("movdqu %[src], %%xmm0\n\t"
:
: [src] "m" (*src)
: "memory" );
- do_vpaes_ssse3_dec (ctx, nrounds, aes_const_ptr);
+ do_vpaes_ssse3_dec (ctx, nrounds);
asm volatile ("movdqu %%xmm0, %[dst]\n\t"
: [dst] "=m" (*dst)
:
@@ -442,10 +454,9 @@ _gcry_aes_ssse3_cfb_dec (RIJNDAEL_context *ctx, unsigned char *outbuf,
size_t nblocks)
{
unsigned int nrounds = ctx->rounds;
- const void *aes_const_ptr;
byte ssse3_state[SSSE3_STATE_SIZE];
- vpaes_ssse3_prepare_enc (aes_const_ptr);
+ vpaes_ssse3_prepare_enc ();
asm volatile ("movdqu %[iv], %%xmm0\n\t"
: /* No output */
@@ -454,7 +465,7 @@ _gcry_aes_ssse3_cfb_dec (RIJNDAEL_context *ctx, unsigned char *outbuf,
for ( ;nblocks; nblocks-- )
{
- do_vpaes_ssse3_enc (ctx, nrounds, aes_const_ptr);
+ do_vpaes_ssse3_enc (ctx, nrounds);
asm volatile ("movdqa %%xmm0, %%xmm6\n\t"
"movdqu %[inbuf], %%xmm0\n\t"
@@ -483,45 +494,40 @@ _gcry_aes_ssse3_cbc_dec (RIJNDAEL_context *ctx, unsigned char *outbuf,
size_t nblocks)
{
unsigned int nrounds = ctx->rounds;
- const void *aes_const_ptr;
byte ssse3_state[SSSE3_STATE_SIZE];
- vpaes_ssse3_prepare_dec (aes_const_ptr);
+ vpaes_ssse3_prepare_dec ();
- asm volatile
- ("movdqu %[iv], %%xmm7\n\t" /* use xmm7 as fast IV storage */
- : /* No output */
- : [iv] "m" (*iv)
- : "memory");
+ asm volatile ("movdqu %[iv], %%xmm7\n\t" /* use xmm7 as fast IV storage */
+ : /* No output */
+ : [iv] "m" (*iv)
+ : "memory");
for ( ;nblocks; nblocks-- )
{
- asm volatile
- ("movdqu %[inbuf], %%xmm0\n\t"
- "movdqa %%xmm0, %%xmm6\n\t" /* use xmm6 as savebuf */
- : /* No output */
- : [inbuf] "m" (*inbuf)
- : "memory");
-
- do_vpaes_ssse3_dec (ctx, nrounds, aes_const_ptr);
-
- asm volatile
- ("pxor %%xmm7, %%xmm0\n\t" /* xor IV with output */
- "movdqu %%xmm0, %[outbuf]\n\t"
- "movdqu %%xmm6, %%xmm7\n\t" /* store savebuf as new IV */
- : [outbuf] "=m" (*outbuf)
- :
- : "memory");
+ asm volatile ("movdqu %[inbuf], %%xmm0\n\t"
+ "movdqa %%xmm0, %%xmm6\n\t" /* use xmm6 as savebuf */
+ : /* No output */
+ : [inbuf] "m" (*inbuf)
+ : "memory");
+
+ do_vpaes_ssse3_dec (ctx, nrounds);
+
+ asm volatile ("pxor %%xmm7, %%xmm0\n\t" /* xor IV with output */
+ "movdqu %%xmm0, %[outbuf]\n\t"
+ "movdqu %%xmm6, %%xmm7\n\t" /* store savebuf as new IV */
+ : [outbuf] "=m" (*outbuf)
+ :
+ : "memory");
outbuf += BLOCKSIZE;
inbuf += BLOCKSIZE;
}
- asm volatile
- ("movdqu %%xmm7, %[iv]\n\t" /* store IV */
- : /* No output */
- : [iv] "m" (*iv)
- : "memory");
+ asm volatile ("movdqu %%xmm7, %[iv]\n\t" /* store IV */
+ : /* No output */
+ : [iv] "m" (*iv)
+ : "memory");
vpaes_ssse3_cleanup ();
}
@@ -536,10 +542,9 @@ ssse3_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
const unsigned char *inbuf = inbuf_arg;
u64 n = c->u_mode.ocb.data_nblocks;
unsigned int nrounds = ctx->rounds;
- const void *aes_const_ptr;
byte ssse3_state[SSSE3_STATE_SIZE];
- vpaes_ssse3_prepare_enc (aes_const_ptr);
+ vpaes_ssse3_prepare_enc ();
/* Preload Offset and Checksum */
asm volatile ("movdqu %[iv], %%xmm7\n\t"
@@ -568,7 +573,7 @@ ssse3_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
[inbuf] "m" (*inbuf)
: "memory" );
- do_vpaes_ssse3_enc (ctx, nrounds, aes_const_ptr);
+ do_vpaes_ssse3_enc (ctx, nrounds);
asm volatile ("pxor %%xmm7, %%xmm0\n\t"
"movdqu %%xmm0, %[outbuf]\n\t"
@@ -600,10 +605,9 @@ ssse3_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
const unsigned char *inbuf = inbuf_arg;
u64 n = c->u_mode.ocb.data_nblocks;
unsigned int nrounds = ctx->rounds;
- const void *aes_const_ptr;
byte ssse3_state[SSSE3_STATE_SIZE];
- vpaes_ssse3_prepare_dec (aes_const_ptr);
+ vpaes_ssse3_prepare_dec ();
/* Preload Offset and Checksum */
asm volatile ("movdqu %[iv], %%xmm7\n\t"
@@ -631,7 +635,7 @@ ssse3_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
[inbuf] "m" (*inbuf)
: "memory" );
- do_vpaes_ssse3_dec (ctx, nrounds, aes_const_ptr);
+ do_vpaes_ssse3_dec (ctx, nrounds);
asm volatile ("pxor %%xmm7, %%xmm0\n\t"
"pxor %%xmm0, %%xmm6\n\t"
@@ -675,10 +679,9 @@ _gcry_aes_ssse3_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
const unsigned char *abuf = abuf_arg;
u64 n = c->u_mode.ocb.aad_nblocks;
unsigned int nrounds = ctx->rounds;
- const void *aes_const_ptr;
byte ssse3_state[SSSE3_STATE_SIZE];
- vpaes_ssse3_prepare_enc (aes_const_ptr);
+ vpaes_ssse3_prepare_enc ();
/* Preload Offset and Sum */
asm volatile ("movdqu %[iv], %%xmm7\n\t"
@@ -705,7 +708,7 @@ _gcry_aes_ssse3_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
[abuf] "m" (*abuf)
: "memory" );
- do_vpaes_ssse3_enc (ctx, nrounds, aes_const_ptr);
+ do_vpaes_ssse3_enc (ctx, nrounds);
asm volatile ("pxor %%xmm0, %%xmm6\n\t"
:
@@ -726,774 +729,4 @@ _gcry_aes_ssse3_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
vpaes_ssse3_cleanup ();
}
-
-#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
-# define X(...)
-#else
-# define X(...) __VA_ARGS__
-#endif
-
-asm (
- "\n\t" "##"
- "\n\t" "## Constant-time SSSE3 AES core implementation."
- "\n\t" "##"
- "\n\t" "## By Mike Hamburg (Stanford University), 2009"
- "\n\t" "## Public domain."
- "\n\t" "##"
-
- "\n\t" ".text"
-
- "\n\t" "##"
- "\n\t" "## _aes_encrypt_core"
- "\n\t" "##"
- "\n\t" "## AES-encrypt %xmm0."
- "\n\t" "##"
- "\n\t" "## Inputs:"
- "\n\t" "## %xmm0 = input"
- "\n\t" "## %xmm9-%xmm15 as in .Laes_preheat"
- "\n\t" "## %rcx = .Laes_consts"
- "\n\t" "## (%rdx) = scheduled keys"
- "\n\t" "## %rax = nrounds - 1"
- "\n\t" "##"
- "\n\t" "## Output in %xmm0"
- "\n\t" "## Clobbers %xmm1-%xmm4, %r9, %r11, %rax"
- "\n\t" "## Preserves %xmm6 - %xmm7 so you get some local vectors"
- "\n\t" "##"
- "\n\t" "##"
- "\n\t" ".align 16"
-X("\n\t" ".type _aes_encrypt_core,@function")
- "\n\t" "_aes_encrypt_core:"
- "\n\t" " leaq .Lk_mc_backward(%rcx), %rdi"
- "\n\t" " mov $16, %rsi"
- "\n\t" " movdqa .Lk_ipt (%rcx), %xmm2 # iptlo"
- "\n\t" " movdqa %xmm9, %xmm1"
- "\n\t" " pandn %xmm0, %xmm1"
- "\n\t" " psrld $4, %xmm1"
- "\n\t" " pand %xmm9, %xmm0"
- "\n\t" " pshufb %xmm0, %xmm2"
- "\n\t" " movdqa .Lk_ipt+16(%rcx), %xmm0 # ipthi"
- "\n\t" " pshufb %xmm1, %xmm0"
- "\n\t" " pxor (%rdx),%xmm2"
- "\n\t" " pxor %xmm2, %xmm0"
- "\n\t" " add $16, %rdx"
- "\n\t" " jmp .Laes_entry"
-
- "\n\t" ".align 8"
- "\n\t" ".Laes_loop:"
- "\n\t" " # middle of middle round"
- "\n\t" " movdqa %xmm13, %xmm4 # 4 : sb1u"
- "\n\t" " pshufb %xmm2, %xmm4 # 4 = sb1u"
- "\n\t" " pxor (%rdx), %xmm4 # 4 = sb1u + k"
- "\n\t" " movdqa %xmm12, %xmm0 # 0 : sb1t"
- "\n\t" " pshufb %xmm3, %xmm0 # 0 = sb1t"
- "\n\t" " pxor %xmm4, %xmm0 # 0 = A"
- "\n\t" " movdqa %xmm15, %xmm4 # 4 : sb2u"
- "\n\t" " pshufb %xmm2, %xmm4 # 4 = sb2u"
- "\n\t" " movdqa .Lk_mc_forward-.Lk_mc_backward(%rsi,%rdi), %xmm1"
- "\n\t" " movdqa %xmm14, %xmm2 # 2 : sb2t"
- "\n\t" " pshufb %xmm3, %xmm2 # 2 = sb2t"
- "\n\t" " pxor %xmm4, %xmm2 # 2 = 2A"
- "\n\t" " movdqa %xmm0, %xmm3 # 3 = A"
- "\n\t" " pshufb %xmm1, %xmm0 # 0 = B"
- "\n\t" " pxor %xmm2, %xmm0 # 0 = 2A+B"
- "\n\t" " pshufb (%rsi,%rdi), %xmm3 # 3 = D"
- "\n\t" " lea 16(%esi),%esi # next mc"
- "\n\t" " pxor %xmm0, %xmm3 # 3 = 2A+B+D"
- "\n\t" " lea 16(%rdx),%rdx # next key"
- "\n\t" " pshufb %xmm1, %xmm0 # 0 = 2B+C"
- "\n\t" " pxor %xmm3, %xmm0 # 0 = 2A+3B+C+D"
- "\n\t" " and $48, %rsi # ... mod 4"
- "\n\t" " dec %rax # nr--"
-
- "\n\t" ".Laes_entry:"
- "\n\t" " # top of round"
- "\n\t" " movdqa %xmm9, %xmm1 # 1 : i"
- "\n\t" " pandn %xmm0, %xmm1 # 1 = i<<4"
- "\n\t" " psrld $4, %xmm1 # 1 = i"
- "\n\t" " pand %xmm9, %xmm0 # 0 = k"
- "\n\t" " movdqa %xmm11, %xmm2 # 2 : a/k"
- "\n\t" " pshufb %xmm0, %xmm2 # 2 = a/k"
- "\n\t" " pxor %xmm1, %xmm0 # 0 = j"
- "\n\t" " movdqa %xmm10, %xmm3 # 3 : 1/i"
- "\n\t" " pshufb %xmm1, %xmm3 # 3 = 1/i"
- "\n\t" " pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k"
- "\n\t" " movdqa %xmm10, %xmm4 # 4 : 1/j"
- "\n\t" " pshufb %xmm0, %xmm4 # 4 = 1/j"
- "\n\t" " pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k"
- "\n\t" " movdqa %xmm10, %xmm2 # 2 : 1/iak"
- "\n\t" " pshufb %xmm3, %xmm2 # 2 = 1/iak"
- "\n\t" " pxor %xmm0, %xmm2 # 2 = io"
- "\n\t" " movdqa %xmm10, %xmm3 # 3 : 1/jak"
- "\n\t" " pshufb %xmm4, %xmm3 # 3 = 1/jak"
- "\n\t" " pxor %xmm1, %xmm3 # 3 = jo"
- "\n\t" " jnz .Laes_loop"
-
- "\n\t" " # middle of last round"
- "\n\t" " movdqa .Lk_sbo(%rcx), %xmm4 # 3 : sbou"
- "\n\t" " pshufb %xmm2, %xmm4 # 4 = sbou"
- "\n\t" " pxor (%rdx), %xmm4 # 4 = sb1u + k"
- "\n\t" " movdqa .Lk_sbo+16(%rcx), %xmm0 # 0 : sbot"
- "\n\t" " pshufb %xmm3, %xmm0 # 0 = sb1t"
- "\n\t" " pxor %xmm4, %xmm0 # 0 = A"
- "\n\t" " pshufb .Lk_sr(%rsi,%rcx), %xmm0"
- "\n\t" " ret"
-X("\n\t" ".size _aes_encrypt_core,.-_aes_encrypt_core")
-
- "\n\t" "##"
- "\n\t" "## Decryption core"
- "\n\t" "##"
- "\n\t" "## Same API as encryption core."
- "\n\t" "##"
- "\n\t" ".align 16"
-X("\n\t" ".type _aes_decrypt_core,@function")
- "\n\t" "_aes_decrypt_core:"
- "\n\t" " movl %eax, %esi"
- "\n\t" " shll $4, %esi"
- "\n\t" " xorl $48, %esi"
- "\n\t" " andl $48, %esi"
- "\n\t" " movdqa .Lk_dipt (%rcx), %xmm2 # iptlo"
- "\n\t" " movdqa %xmm9, %xmm1"
- "\n\t" " pandn %xmm0, %xmm1"
- "\n\t" " psrld $4, %xmm1"
- "\n\t" " pand %xmm9, %xmm0"
- "\n\t" " pshufb %xmm0, %xmm2"
- "\n\t" " movdqa .Lk_dipt+16(%rcx), %xmm0 # ipthi"
- "\n\t" " pshufb %xmm1, %xmm0"
- "\n\t" " pxor (%rdx), %xmm2"
- "\n\t" " pxor %xmm2, %xmm0"
- "\n\t" " movdqa .Lk_mc_forward+48(%rcx), %xmm5"
- "\n\t" " lea 16(%rdx), %rdx"
- "\n\t" " neg %rax"
- "\n\t" " jmp .Laes_dec_entry"
-
- "\n\t" ".align 16"
- "\n\t" ".Laes_dec_loop:"
- "\n\t" "##"
- "\n\t" "## Inverse mix columns"
- "\n\t" "##"
- "\n\t" " movdqa %xmm13, %xmm4 # 4 : sb9u"
- "\n\t" " pshufb %xmm2, %xmm4 # 4 = sb9u"
- "\n\t" " pxor (%rdx), %xmm4"
- "\n\t" " movdqa %xmm12, %xmm0 # 0 : sb9t"
- "\n\t" " pshufb %xmm3, %xmm0 # 0 = sb9t"
- "\n\t" " movdqa .Lk_dsbd+16(%rcx),%xmm1 # 1 : sbdt"
- "\n\t" " pxor %xmm4, %xmm0 # 0 = ch"
- "\n\t" " lea 16(%rdx), %rdx # next round key"
-
- "\n\t" " pshufb %xmm5, %xmm0 # MC ch"
- "\n\t" " movdqa %xmm15, %xmm4 # 4 : sbdu"
- "\n\t" " pshufb %xmm2, %xmm4 # 4 = sbdu"
- "\n\t" " pxor %xmm0, %xmm4 # 4 = ch"
- "\n\t" " pshufb %xmm3, %xmm1 # 1 = sbdt"
- "\n\t" " pxor %xmm4, %xmm1 # 1 = ch"
-
- "\n\t" " pshufb %xmm5, %xmm1 # MC ch"
- "\n\t" " movdqa %xmm14, %xmm4 # 4 : sbbu"
- "\n\t" " pshufb %xmm2, %xmm4 # 4 = sbbu"
- "\n\t" " inc %rax # nr--"
- "\n\t" " pxor %xmm1, %xmm4 # 4 = ch"
- "\n\t" " movdqa .Lk_dsbb+16(%rcx),%xmm0 # 0 : sbbt"
- "\n\t" " pshufb %xmm3, %xmm0 # 0 = sbbt"
- "\n\t" " pxor %xmm4, %xmm0 # 0 = ch"
-
- "\n\t" " pshufb %xmm5, %xmm0 # MC ch"
- "\n\t" " movdqa %xmm8, %xmm4 # 4 : sbeu"
- "\n\t" " pshufb %xmm2, %xmm4 # 4 = sbeu"
- "\n\t" " pshufd $0x93, %xmm5, %xmm5"
- "\n\t" " pxor %xmm0, %xmm4 # 4 = ch"
- "\n\t" " movdqa .Lk_dsbe+16(%rcx),%xmm0 # 0 : sbet"
- "\n\t" " pshufb %xmm3, %xmm0 # 0 = sbet"
- "\n\t" " pxor %xmm4, %xmm0 # 0 = ch"
-
- "\n\t" ".Laes_dec_entry:"
- "\n\t" " # top of round"
- "\n\t" " movdqa %xmm9, %xmm1 # 1 : i"
- "\n\t" " pandn %xmm0, %xmm1 # 1 = i<<4"
- "\n\t" " psrld $4, %xmm1 # 1 = i"
- "\n\t" " pand %xmm9, %xmm0 # 0 = k"
- "\n\t" " movdqa %xmm11, %xmm2 # 2 : a/k"
- "\n\t" " pshufb %xmm0, %xmm2 # 2 = a/k"
- "\n\t" " pxor %xmm1, %xmm0 # 0 = j"
- "\n\t" " movdqa %xmm10, %xmm3 # 3 : 1/i"
- "\n\t" " pshufb %xmm1, %xmm3 # 3 = 1/i"
- "\n\t" " pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k"
- "\n\t" " movdqa %xmm10, %xmm4 # 4 : 1/j"
- "\n\t" " pshufb %xmm0, %xmm4 # 4 = 1/j"
- "\n\t" " pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k"
- "\n\t" " movdqa %xmm10, %xmm2 # 2 : 1/iak"
- "\n\t" " pshufb %xmm3, %xmm2 # 2 = 1/iak"
- "\n\t" " pxor %xmm0, %xmm2 # 2 = io"
- "\n\t" " movdqa %xmm10, %xmm3 # 3 : 1/jak"
- "\n\t" " pshufb %xmm4, %xmm3 # 3 = 1/jak"
- "\n\t" " pxor %xmm1, %xmm3 # 3 = jo"
- "\n\t" " jnz .Laes_dec_loop"
-
- "\n\t" " # middle of last round"
- "\n\t" " movdqa .Lk_dsbo(%rcx), %xmm4 # 3 : sbou"
- "\n\t" " pshufb %xmm2, %xmm4 # 4 = sbou"
- "\n\t" " pxor (%rdx), %xmm4 # 4 = sb1u + k"
- "\n\t" " movdqa .Lk_dsbo+16(%rcx), %xmm0 # 0 : sbot"
- "\n\t" " pshufb %xmm3, %xmm0 # 0 = sb1t"
- "\n\t" " pxor %xmm4, %xmm0 # 0 = A"
- "\n\t" " pshufb .Lk_sr(%rsi,%rcx), %xmm0"
- "\n\t" " ret"
-X("\n\t" ".size _aes_decrypt_core,.-_aes_decrypt_core")
-
- "\n\t" "########################################################"
- "\n\t" "## ##"
- "\n\t" "## AES key schedule ##"
- "\n\t" "## ##"
- "\n\t" "########################################################"
-
- "\n\t" ".align 16"
-X("\n\t" ".type _aes_schedule_core,@function")
- "\n\t" "_aes_schedule_core:"
- "\n\t" " # rdi = key"
- "\n\t" " # rsi = size in bits"
- "\n\t" " # rdx = buffer"
- "\n\t" " # rcx = direction. 0=encrypt, 1=decrypt"
-
- "\n\t" " # load the tables"
- "\n\t" " lea .Laes_consts(%rip), %r10"
- "\n\t" " movdqa (%r10), %xmm9 # 0F"
- "\n\t" " movdqa .Lk_inv (%r10), %xmm10 # inv"
- "\n\t" " movdqa .Lk_inv+16(%r10), %xmm11 # inva"
- "\n\t" " movdqa .Lk_sb1 (%r10), %xmm13 # sb1u"
- "\n\t" " movdqa .Lk_sb1+16(%r10), %xmm12 # sb1t"
- "\n\t" " movdqa .Lk_sb2 (%r10), %xmm15 # sb2u"
- "\n\t" " movdqa .Lk_sb2+16(%r10), %xmm14 # sb2t"
-
- "\n\t" " movdqa .Lk_rcon(%r10), %xmm8 # load rcon"
- "\n\t" " movdqu (%rdi), %xmm0 # load key (unaligned)"
-
- "\n\t" " # input transform"
- "\n\t" " movdqu %xmm0, %xmm3"
- "\n\t" " lea .Lk_ipt(%r10), %r11"
- "\n\t" " call .Laes_schedule_transform"
- "\n\t" " movdqu %xmm0, %xmm7"
-
- "\n\t" " test %rcx, %rcx"
- "\n\t" " jnz .Laes_schedule_am_decrypting"
-
- "\n\t" " # encrypting, output zeroth round key after transform"
- "\n\t" " movdqa %xmm0, (%rdx)"
- "\n\t" " jmp .Laes_schedule_go"
-
- "\n\t" ".Laes_schedule_am_decrypting:"
- "\n\t" " # decrypting, output zeroth round key after shiftrows"
- "\n\t" " pshufb .Lk_sr(%r8,%r10),%xmm3"
- "\n\t" " movdqa %xmm3, (%rdx)"
- "\n\t" " xor $48, %r8"
-
- "\n\t" ".Laes_schedule_go:"
- "\n\t" " cmp $192, %rsi"
- "\n\t" " je .Laes_schedule_192"
- "\n\t" " cmp $256, %rsi"
- "\n\t" " je .Laes_schedule_256"
- "\n\t" " # 128: fall though"
-
- "\n\t" "##"
- "\n\t" "## .Laes_schedule_128"
- "\n\t" "##"
- "\n\t" "## 128-bit specific part of key schedule."
- "\n\t" "##"
- "\n\t" "## This schedule is really simple, because all its parts"
- "\n\t" "## are accomplished by the subroutines."
- "\n\t" "##"
- "\n\t" ".Laes_schedule_128:"
- "\n\t" " mov $10, %rsi"
-
- "\n\t" ".Laes_schedule_128_L:"
- "\n\t" " call .Laes_schedule_round"
- "\n\t" " dec %rsi"
- "\n\t" " jz .Laes_schedule_mangle_last"
- "\n\t" " call .Laes_schedule_mangle # write output"
- "\n\t" " jmp .Laes_schedule_128_L"
-
- "\n\t" "##"
- "\n\t" "## .Laes_schedule_192"
- "\n\t" "##"
- "\n\t" "## 192-bit specific part of key schedule."
- "\n\t" "##"
- "\n\t" "## The main body of this schedule is the same as the 128-bit"
- "\n\t" "## schedule, but with more smearing. The long, high side is"
- "\n\t" "## stored in %xmm7 as before, and the short, low side is in"
- "\n\t" "## the high bits of %xmm6."
- "\n\t" "##"
- "\n\t" "## This schedule is somewhat nastier, however, because each"
- "\n\t" "## round produces 192 bits of key material, or 1.5 round keys."
- "\n\t" "## Therefore, on each cycle we do 2 rounds and produce 3 round"
- "\n\t" "## keys."
- "\n\t" "##"
- "\n\t" ".Laes_schedule_192:"
- "\n\t" " movdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)"
- "\n\t" " call .Laes_schedule_transform # input transform"
- "\n\t" " pshufd $0x0E, %xmm0, %xmm6"
- "\n\t" " pslldq $8, %xmm6 # clobber low side with zeros"
- "\n\t" " mov $4, %rsi"
-
- "\n\t" ".Laes_schedule_192_L:"
- "\n\t" " call .Laes_schedule_round"
- "\n\t" " palignr $8,%xmm6,%xmm0 "
- "\n\t" " call .Laes_schedule_mangle # save key n"
- "\n\t" " call .Laes_schedule_192_smear"
- "\n\t" " call .Laes_schedule_mangle # save key n+1"
- "\n\t" " call .Laes_schedule_round"
- "\n\t" " dec %rsi"
- "\n\t" " jz .Laes_schedule_mangle_last"
- "\n\t" " call .Laes_schedule_mangle # save key n+2"
- "\n\t" " call .Laes_schedule_192_smear"
- "\n\t" " jmp .Laes_schedule_192_L"
-
- "\n\t" "##"
- "\n\t" "## .Laes_schedule_192_smear"
- "\n\t" "##"
- "\n\t" "## Smear the short, low side in the 192-bit key schedule."
- "\n\t" "##"
- "\n\t" "## Inputs:"
- "\n\t" "## %xmm7: high side, b a x y"
- "\n\t" "## %xmm6: low side, d c 0 0"
- "\n\t" "## %xmm13: 0"
- "\n\t" "##"
- "\n\t" "## Outputs:"
- "\n\t" "## %xmm6: b+c+d b+c 0 0"
- "\n\t" "## %xmm0: b+c+d b+c b a"
- "\n\t" "##"
- "\n\t" ".Laes_schedule_192_smear:"
- "\n\t" " pshufd $0x80, %xmm6, %xmm0 # d c 0 0 -> c 0 0 0"
- "\n\t" " pxor %xmm0, %xmm6 # -> c+d c 0 0"
- "\n\t" " pshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a"
- "\n\t" " pxor %xmm6, %xmm0 # -> b+c+d b+c b a"
- "\n\t" " pshufd $0x0E, %xmm0, %xmm6"
- "\n\t" " pslldq $8, %xmm6 # clobber low side with zeros"
- "\n\t" " ret"
-
- "\n\t" "##"
- "\n\t" "## .Laes_schedule_256"
- "\n\t" "##"
- "\n\t" "## 256-bit specific part of key schedule."
- "\n\t" "##"
- "\n\t" "## The structure here is very similar to the 128-bit"
- "\n\t" "## schedule, but with an additional 'low side' in"
- "\n\t" "## %xmm6. The low side's rounds are the same as the"
- "\n\t" "## high side's, except no rcon and no rotation."
- "\n\t" "##"
- "\n\t" ".Laes_schedule_256:"
- "\n\t" " movdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)"
- "\n\t" " call .Laes_schedule_transform # input transform"
- "\n\t" " mov $7, %rsi"
-
- "\n\t" ".Laes_schedule_256_L:"
- "\n\t" " call .Laes_schedule_mangle # output low result"
- "\n\t" " movdqa %xmm0, %xmm6 # save cur_lo in xmm6"
-
- "\n\t" " # high round"
- "\n\t" " call .Laes_schedule_round"
- "\n\t" " dec %rsi"
- "\n\t" " jz .Laes_schedule_mangle_last"
- "\n\t" " call .Laes_schedule_mangle "
-
- "\n\t" " # low round. swap xmm7 and xmm6"
- "\n\t" " pshufd $0xFF, %xmm0, %xmm0"
- "\n\t" " movdqa %xmm7, %xmm5"
- "\n\t" " movdqa %xmm6, %xmm7"
- "\n\t" " call .Laes_schedule_low_round"
- "\n\t" " movdqa %xmm5, %xmm7"
-
- "\n\t" " jmp .Laes_schedule_256_L"
-
- "\n\t" "##"
- "\n\t" "## .Laes_schedule_round"
- "\n\t" "##"
- "\n\t" "## Runs one main round of the key schedule on %xmm0, %xmm7"
- "\n\t" "##"
- "\n\t" "## Specifically, runs subbytes on the high dword of %xmm0"
- "\n\t" "## then rotates it by one byte and xors into the low dword of"
- "\n\t" "## %xmm7."
- "\n\t" "##"
- "\n\t" "## Adds rcon from low byte of %xmm8, then rotates %xmm8 for"
- "\n\t" "## next rcon."
- "\n\t" "##"
- "\n\t" "## Smears the dwords of %xmm7 by xoring the low into the"
- "\n\t" "## second low, result into third, result into highest."
- "\n\t" "##"
- "\n\t" "## Returns results in %xmm7 = %xmm0."
- "\n\t" "## Clobbers %xmm1-%xmm4, %r11."
- "\n\t" "##"
- "\n\t" ".Laes_schedule_round:"
- "\n\t" " # extract rcon from xmm8"
- "\n\t" " pxor %xmm1, %xmm1"
- "\n\t" " palignr $15, %xmm8, %xmm1"
- "\n\t" " palignr $15, %xmm8, %xmm8"
- "\n\t" " pxor %xmm1, %xmm7"
-
- "\n\t" " # rotate"
- "\n\t" " pshufd $0xFF, %xmm0, %xmm0"
- "\n\t" " palignr $1, %xmm0, %xmm0"
-
- "\n\t" " # fall through..."
-
- "\n\t" " # low round: same as high round, but no rotation and no rcon."
- "\n\t" ".Laes_schedule_low_round:"
- "\n\t" " # smear xmm7"
- "\n\t" " movdqa %xmm7, %xmm1"
- "\n\t" " pslldq $4, %xmm7"
- "\n\t" " pxor %xmm1, %xmm7"
- "\n\t" " movdqa %xmm7, %xmm1"
- "\n\t" " pslldq $8, %xmm7"
- "\n\t" " pxor %xmm1, %xmm7"
- "\n\t" " pxor .Lk_s63(%r10), %xmm7"
-
- "\n\t" " # subbytes"
- "\n\t" " movdqa %xmm9, %xmm1"
- "\n\t" " pandn %xmm0, %xmm1"
- "\n\t" " psrld $4, %xmm1 # 1 = i"
- "\n\t" " pand %xmm9, %xmm0 # 0 = k"
- "\n\t" " movdqa %xmm11, %xmm2 # 2 : a/k"
- "\n\t" " pshufb %xmm0, %xmm2 # 2 = a/k"
- "\n\t" " pxor %xmm1, %xmm0 # 0 = j"
- "\n\t" " movdqa %xmm10, %xmm3 # 3 : 1/i"
- "\n\t" " pshufb %xmm1, %xmm3 # 3 = 1/i"
- "\n\t" " pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k"
- "\n\t" " movdqa %xmm10, %xmm4 # 4 : 1/j"
- "\n\t" " pshufb %xmm0, %xmm4 # 4 = 1/j"
- "\n\t" " pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k"
- "\n\t" " movdqa %xmm10, %xmm2 # 2 : 1/iak"
- "\n\t" " pshufb %xmm3, %xmm2 # 2 = 1/iak"
- "\n\t" " pxor %xmm0, %xmm2 # 2 = io"
- "\n\t" " movdqa %xmm10, %xmm3 # 3 : 1/jak"
- "\n\t" " pshufb %xmm4, %xmm3 # 3 = 1/jak"
- "\n\t" " pxor %xmm1, %xmm3 # 3 = jo"
- "\n\t" " movdqa .Lk_sb1(%r10), %xmm4 # 4 : sbou"
- "\n\t" " pshufb %xmm2, %xmm4 # 4 = sbou"
- "\n\t" " movdqa .Lk_sb1+16(%r10), %xmm0 # 0 : sbot"
- "\n\t" " pshufb %xmm3, %xmm0 # 0 = sb1t"
- "\n\t" " pxor %xmm4, %xmm0 # 0 = sbox output"
-
- "\n\t" " # add in smeared stuff"
- "\n\t" " pxor %xmm7, %xmm0 "
- "\n\t" " movdqa %xmm0, %xmm7"
- "\n\t" " ret"
-
- "\n\t" "##"
- "\n\t" "## .Laes_schedule_transform"
- "\n\t" "##"
- "\n\t" "## Linear-transform %xmm0 according to tables at (%r11)"
- "\n\t" "##"
- "\n\t" "## Requires that %xmm9 = 0x0F0F... as in preheat"
- "\n\t" "## Output in %xmm0"
- "\n\t" "## Clobbers %xmm1, %xmm2"
- "\n\t" "##"
- "\n\t" ".Laes_schedule_transform:"
- "\n\t" " movdqa %xmm9, %xmm1"
- "\n\t" " pandn %xmm0, %xmm1"
- "\n\t" " psrld $4, %xmm1"
- "\n\t" " pand %xmm9, %xmm0"
- "\n\t" " movdqa (%r11), %xmm2 # lo"
- "\n\t" " pshufb %xmm0, %xmm2"
- "\n\t" " movdqa 16(%r11), %xmm0 # hi"
- "\n\t" " pshufb %xmm1, %xmm0"
- "\n\t" " pxor %xmm2, %xmm0"
- "\n\t" " ret"
-
- "\n\t" "##"
- "\n\t" "## .Laes_schedule_mangle"
- "\n\t" "##"
- "\n\t" "## Mangle xmm0 from (basis-transformed) standard version"
- "\n\t" "## to our version."
- "\n\t" "##"
- "\n\t" "## On encrypt,"
- "\n\t" "## xor with 0x63"
- "\n\t" "## multiply by circulant 0,1,1,1"
- "\n\t" "## apply shiftrows transform"
- "\n\t" "##"
- "\n\t" "## On decrypt,"
- "\n\t" "## xor with 0x63"
- "\n\t" "## multiply by 'inverse mixcolumns' circulant E,B,D,9"
- "\n\t" "## deskew"
- "\n\t" "## apply shiftrows transform"
- "\n\t" "##"
- "\n\t" "##"
- "\n\t" "## Writes out to (%rdx), and increments or decrements it"
- "\n\t" "## Keeps track of round number mod 4 in %r8"
- "\n\t" "## Preserves xmm0"
- "\n\t" "## Clobbers xmm1-xmm5"
- "\n\t" "##"
- "\n\t" ".Laes_schedule_mangle:"
- "\n\t" " movdqa %xmm0, %xmm4 # save xmm0 for later"
- "\n\t" " movdqa .Lk_mc_forward(%r10),%xmm5"
- "\n\t" " test %rcx, %rcx"
- "\n\t" " jnz .Laes_schedule_mangle_dec"
-
- "\n\t" " # encrypting"
- "\n\t" " add $16, %rdx"
- "\n\t" " pxor .Lk_s63(%r10),%xmm4"
- "\n\t" " pshufb %xmm5, %xmm4"
- "\n\t" " movdqa %xmm4, %xmm3"
- "\n\t" " pshufb %xmm5, %xmm4"
- "\n\t" " pxor %xmm4, %xmm3"
- "\n\t" " pshufb %xmm5, %xmm4"
- "\n\t" " pxor %xmm4, %xmm3"
-
- "\n\t" " jmp .Laes_schedule_mangle_both"
-
- "\n\t" ".Laes_schedule_mangle_dec:"
- "\n\t" " lea .Lk_dks_1(%r10), %r11 # first table: *9"
- "\n\t" " call .Laes_schedule_transform"
- "\n\t" " movdqa %xmm0, %xmm3"
- "\n\t" " pshufb %xmm5, %xmm3"
-
- "\n\t" " add $32, %r11 # next table: *B"
- "\n\t" " call .Laes_schedule_transform"
- "\n\t" " pxor %xmm0, %xmm3"
- "\n\t" " pshufb %xmm5, %xmm3"
-
- "\n\t" " add $32, %r11 # next table: *D"
- "\n\t" " call .Laes_schedule_transform"
- "\n\t" " pxor %xmm0, %xmm3"
- "\n\t" " pshufb %xmm5, %xmm3"
-
- "\n\t" " add $32, %r11 # next table: *E"
- "\n\t" " call .Laes_schedule_transform"
- "\n\t" " pxor %xmm0, %xmm3"
- "\n\t" " pshufb %xmm5, %xmm3"
-
- "\n\t" " movdqa %xmm4, %xmm0 # restore %xmm0"
- "\n\t" " add $-16, %rdx"
-
- "\n\t" ".Laes_schedule_mangle_both:"
- "\n\t" " pshufb .Lk_sr(%r8,%r10),%xmm3"
- "\n\t" " add $-16, %r8"
- "\n\t" " and $48, %r8"
- "\n\t" " movdqa %xmm3, (%rdx)"
- "\n\t" " ret"
-
- "\n\t" "##"
- "\n\t" "## .Laes_schedule_mangle_last"
- "\n\t" "##"
- "\n\t" "## Mangler for last round of key schedule"
- "\n\t" "## Mangles %xmm0"
- "\n\t" "## when encrypting, outputs out(%xmm0) ^ 63"
- "\n\t" "## when decrypting, outputs unskew(%xmm0)"
- "\n\t" "##"
- "\n\t" "## Always called right before return... jumps to cleanup and exits"
- "\n\t" "##"
- "\n\t" ".Laes_schedule_mangle_last:"
- "\n\t" " # schedule last round key from xmm0"
- "\n\t" " lea .Lk_deskew(%r10),%r11 # prepare to deskew"
- "\n\t" " test %rcx, %rcx"
- "\n\t" " jnz .Laes_schedule_mangle_last_dec"
-
- "\n\t" " # encrypting"
- "\n\t" " pshufb .Lk_sr(%r8,%r10),%xmm0 # output permute"
- "\n\t" " lea .Lk_opt(%r10), %r11 # prepare to output transform"
- "\n\t" " add $32, %rdx"
-
- "\n\t" ".Laes_schedule_mangle_last_dec:"
- "\n\t" " add $-16, %rdx"
- "\n\t" " pxor .Lk_s63(%r10), %xmm0"
- "\n\t" " call .Laes_schedule_transform # output transform"
- "\n\t" " movdqa %xmm0, (%rdx) # save last key"
-
- "\n\t" " #_aes_cleanup"
- "\n\t" " pxor %xmm0, %xmm0"
- "\n\t" " pxor %xmm1, %xmm1"
- "\n\t" " pxor %xmm2, %xmm2"
- "\n\t" " pxor %xmm3, %xmm3"
- "\n\t" " pxor %xmm4, %xmm4"
- "\n\t" " pxor %xmm5, %xmm5"
- "\n\t" " pxor %xmm6, %xmm6"
- "\n\t" " pxor %xmm7, %xmm7"
- "\n\t" " pxor %xmm8, %xmm8"
- "\n\t" " ret"
-X("\n\t" ".size _aes_schedule_core,.-_aes_schedule_core")
-
- "\n\t" "########################################################"
- "\n\t" "## ##"
- "\n\t" "## Constants ##"
- "\n\t" "## ##"
- "\n\t" "########################################################"
-
- "\n\t" ".align 16"
-X("\n\t" ".type _aes_consts, at object")
- "\n\t" ".Laes_consts:"
- "\n\t" "_aes_consts:"
- "\n\t" " # s0F"
- "\n\t" " .Lk_s0F = .-.Laes_consts"
- "\n\t" " .quad 0x0F0F0F0F0F0F0F0F"
- "\n\t" " .quad 0x0F0F0F0F0F0F0F0F"
-
- "\n\t" " # input transform (lo, hi)"
- "\n\t" " .Lk_ipt = .-.Laes_consts"
- "\n\t" " .quad 0xC2B2E8985A2A7000"
- "\n\t" " .quad 0xCABAE09052227808"
- "\n\t" " .quad 0x4C01307D317C4D00"
- "\n\t" " .quad 0xCD80B1FCB0FDCC81"
-
- "\n\t" " # inv, inva"
- "\n\t" " .Lk_inv = .-.Laes_consts"
- "\n\t" " .quad 0x0E05060F0D080180"
- "\n\t" " .quad 0x040703090A0B0C02"
- "\n\t" " .quad 0x01040A060F0B0780"
- "\n\t" " .quad 0x030D0E0C02050809"
-
- "\n\t" " # sb1u, sb1t"
- "\n\t" " .Lk_sb1 = .-.Laes_consts"
- "\n\t" " .quad 0xB19BE18FCB503E00"
- "\n\t" " .quad 0xA5DF7A6E142AF544"
- "\n\t" " .quad 0x3618D415FAE22300"
- "\n\t" " .quad 0x3BF7CCC10D2ED9EF"
-
-
- "\n\t" " # sb2u, sb2t"
- "\n\t" " .Lk_sb2 = .-.Laes_consts"
- "\n\t" " .quad 0xE27A93C60B712400"
- "\n\t" " .quad 0x5EB7E955BC982FCD"
- "\n\t" " .quad 0x69EB88400AE12900"
- "\n\t" " .quad 0xC2A163C8AB82234A"
-
- "\n\t" " # sbou, sbot"
- "\n\t" " .Lk_sbo = .-.Laes_consts"
- "\n\t" " .quad 0xD0D26D176FBDC700"
- "\n\t" " .quad 0x15AABF7AC502A878"
- "\n\t" " .quad 0xCFE474A55FBB6A00"
- "\n\t" " .quad 0x8E1E90D1412B35FA"
-
- "\n\t" " # mc_forward"
- "\n\t" " .Lk_mc_forward = .-.Laes_consts"
- "\n\t" " .quad 0x0407060500030201"
- "\n\t" " .quad 0x0C0F0E0D080B0A09"
- "\n\t" " .quad 0x080B0A0904070605"
- "\n\t" " .quad 0x000302010C0F0E0D"
- "\n\t" " .quad 0x0C0F0E0D080B0A09"
- "\n\t" " .quad 0x0407060500030201"
- "\n\t" " .quad 0x000302010C0F0E0D"
- "\n\t" " .quad 0x080B0A0904070605"
-
- "\n\t" " # mc_backward"
- "\n\t" " .Lk_mc_backward = .-.Laes_consts"
- "\n\t" " .quad 0x0605040702010003"
- "\n\t" " .quad 0x0E0D0C0F0A09080B"
- "\n\t" " .quad 0x020100030E0D0C0F"
- "\n\t" " .quad 0x0A09080B06050407"
- "\n\t" " .quad 0x0E0D0C0F0A09080B"
- "\n\t" " .quad 0x0605040702010003"
- "\n\t" " .quad 0x0A09080B06050407"
- "\n\t" " .quad 0x020100030E0D0C0F"
-
- "\n\t" " # sr"
- "\n\t" " .Lk_sr = .-.Laes_consts"
- "\n\t" " .quad 0x0706050403020100"
- "\n\t" " .quad 0x0F0E0D0C0B0A0908"
- "\n\t" " .quad 0x030E09040F0A0500"
- "\n\t" " .quad 0x0B06010C07020D08"
- "\n\t" " .quad 0x0F060D040B020900"
- "\n\t" " .quad 0x070E050C030A0108"
- "\n\t" " .quad 0x0B0E0104070A0D00"
- "\n\t" " .quad 0x0306090C0F020508"
-
- "\n\t" " # rcon"
- "\n\t" " .Lk_rcon = .-.Laes_consts"
- "\n\t" " .quad 0x1F8391B9AF9DEEB6"
- "\n\t" " .quad 0x702A98084D7C7D81"
-
- "\n\t" " # s63: all equal to 0x63 transformed"
- "\n\t" " .Lk_s63 = .-.Laes_consts"
- "\n\t" " .quad 0x5B5B5B5B5B5B5B5B"
- "\n\t" " .quad 0x5B5B5B5B5B5B5B5B"
-
- "\n\t" " # output transform"
- "\n\t" " .Lk_opt = .-.Laes_consts"
- "\n\t" " .quad 0xFF9F4929D6B66000"
- "\n\t" " .quad 0xF7974121DEBE6808"
- "\n\t" " .quad 0x01EDBD5150BCEC00"
- "\n\t" " .quad 0xE10D5DB1B05C0CE0"
-
- "\n\t" " # deskew tables: inverts the sbox's 'skew'"
- "\n\t" " .Lk_deskew = .-.Laes_consts"
- "\n\t" " .quad 0x07E4A34047A4E300"
- "\n\t" " .quad 0x1DFEB95A5DBEF91A"
- "\n\t" " .quad 0x5F36B5DC83EA6900"
- "\n\t" " .quad 0x2841C2ABF49D1E77"
-
- "\n\t" "##"
- "\n\t" "## Decryption stuff"
- "\n\t" "## Key schedule constants"
- "\n\t" "##"
- "\n\t" " # decryption key schedule: x -> invskew x*9"
- "\n\t" " .Lk_dks_1 = .-.Laes_consts"
- "\n\t" " .quad 0xB6116FC87ED9A700"
- "\n\t" " .quad 0x4AED933482255BFC"
- "\n\t" " .quad 0x4576516227143300"
- "\n\t" " .quad 0x8BB89FACE9DAFDCE"
-
- "\n\t" " # decryption key schedule: invskew x*9 -> invskew x*D"
- "\n\t" " .Lk_dks_2 = .-.Laes_consts"
- "\n\t" " .quad 0x27438FEBCCA86400"
- "\n\t" " .quad 0x4622EE8AADC90561"
- "\n\t" " .quad 0x815C13CE4F92DD00"
- "\n\t" " .quad 0x73AEE13CBD602FF2"
-
- "\n\t" " # decryption key schedule: invskew x*D -> invskew x*B"
- "\n\t" " .Lk_dks_3 = .-.Laes_consts"
- "\n\t" " .quad 0x03C4C50201C6C700"
- "\n\t" " .quad 0xF83F3EF9FA3D3CFB"
- "\n\t" " .quad 0xEE1921D638CFF700"
- "\n\t" " .quad 0xA5526A9D7384BC4B"
-
- "\n\t" " # decryption key schedule: invskew x*B -> invskew x*E + 0x63"
- "\n\t" " .Lk_dks_4 = .-.Laes_consts"
- "\n\t" " .quad 0xE3C390B053732000"
- "\n\t" " .quad 0xA080D3F310306343"
- "\n\t" " .quad 0xA0CA214B036982E8"
- "\n\t" " .quad 0x2F45AEC48CE60D67"
-
- "\n\t" "##"
- "\n\t" "## Decryption stuff"
- "\n\t" "## Round function constants"
- "\n\t" "##"
- "\n\t" " # decryption input transform"
- "\n\t" " .Lk_dipt = .-.Laes_consts"
- "\n\t" " .quad 0x0F505B040B545F00"
- "\n\t" " .quad 0x154A411E114E451A"
- "\n\t" " .quad 0x86E383E660056500"
- "\n\t" " .quad 0x12771772F491F194"
-
- "\n\t" " # decryption sbox output *9*u, *9*t"
- "\n\t" " .Lk_dsb9 = .-.Laes_consts"
- "\n\t" " .quad 0x851C03539A86D600"
- "\n\t" " .quad 0xCAD51F504F994CC9"
- "\n\t" " .quad 0xC03B1789ECD74900"
- "\n\t" " .quad 0x725E2C9EB2FBA565"
-
- "\n\t" " # decryption sbox output *D*u, *D*t"
- "\n\t" " .Lk_dsbd = .-.Laes_consts"
- "\n\t" " .quad 0x7D57CCDFE6B1A200"
- "\n\t" " .quad 0xF56E9B13882A4439"
- "\n\t" " .quad 0x3CE2FAF724C6CB00"
- "\n\t" " .quad 0x2931180D15DEEFD3"
-
- "\n\t" " # decryption sbox output *B*u, *B*t"
- "\n\t" " .Lk_dsbb = .-.Laes_consts"
- "\n\t" " .quad 0xD022649296B44200"
- "\n\t" " .quad 0x602646F6B0F2D404"
- "\n\t" " .quad 0xC19498A6CD596700"
- "\n\t" " .quad 0xF3FF0C3E3255AA6B"
-
- "\n\t" " # decryption sbox output *E*u, *E*t"
- "\n\t" " .Lk_dsbe = .-.Laes_consts"
- "\n\t" " .quad 0x46F2929626D4D000"
- "\n\t" " .quad 0x2242600464B4F6B0"
- "\n\t" " .quad 0x0C55A6CDFFAAC100"
- "\n\t" " .quad 0x9467F36B98593E32"
-
- "\n\t" " # decryption sbox final output"
- "\n\t" " .Lk_dsbo = .-.Laes_consts"
- "\n\t" " .quad 0x1387EA537EF94000"
- "\n\t" " .quad 0xC7AA6DB9D4943E2D"
- "\n\t" " .quad 0x12D7560F93441D00"
- "\n\t" " .quad 0xCA4B8159D8C58E9C"
-X("\n\t" ".size _aes_consts,.-_aes_consts")
-);
-
#endif /* USE_SSSE3 */
diff --git a/configure.ac b/configure.ac
index 4932786..31c0d55 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2031,6 +2031,7 @@ if test "$found" = "1" ; then
# Build with the SSSE3 implementation
GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-ssse3-amd64.lo"
+ GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-ssse3-amd64-asm.lo"
;;
arm*-*-*)
# Build with the assembly implementation