[PATCH] rijndael-ssse3: move assembly functions to separate source-file
Jussi Kivilinna
jussi.kivilinna at iki.fi
Wed Jan 4 23:35:42 CET 2017
* cipher/Makefile.am: Add 'rijndael-ssse3-amd64-asm.S'.
* cipher/rijndael-ssse3-amd64-asm.S: Moved assembly functions
here ...
* cipher/rijndael-ssse3-amd64.c: ... from this file.
(_gcry_aes_ssse3_enc_preload, _gcry_aes_ssse3_dec_preload)
(_gcry_aes_ssse3_schedule_core, _gcry_aes_ssse3_encrypt_core)
(_gcry_aes_ssse3_decrypt_core): New.
(vpaes_ssse3_prepare_enc, vpaes_ssse3_prepare_dec)
(_gcry_aes_ssse3_do_setkey, _gcry_aes_ssse3_prepare_decryption)
(do_vpaes_ssse3_enc, do_vpaes_ssse3_dec): Update to use external
assembly functions; remove 'aes_const_ptr' variable usage.
(_gcry_aes_ssse3_encrypt, _gcry_aes_ssse3_decrypt)
(_gcry_aes_ssse3_cfb_enc, _gcry_aes_ssse3_cbc_enc)
(_gcry_aes_ssse3_ctr_enc, _gcry_aes_ssse3_cfb_dec)
(_gcry_aes_ssse3_cbc_dec, ssse3_ocb_enc, ssse3_ocb_dec)
(_gcry_aes_ssse3_ocb_auth): Remove 'aes_const_ptr' variable usage.
* configure.ac: Add 'rijndael-ssse3-amd64-asm.lo'.
--
After this change, libgcrypt can be compiled with -flto optimization
enabled on x86-64.
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
---
0 files changed
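
For reference, a minimal sketch of the call pattern this patch switches to (illustrative only; the wrapper name below is made up, but the inline-asm body mirrors do_vpaes_ssse3_enc() in the diff). The SSSE3 cores become ordinary external assembly symbols that are called indirectly through a register operand, instead of file-local labels named inside inline asm, which is presumably what lets the references survive -flto partitioning:

  extern void _gcry_aes_ssse3_encrypt_core(void);

  /* Hypothetical wrapper, for illustration only.  The core uses a custom
   * calling convention: %rax = nrounds - 1, %rdx = key schedule, and the
   * block plus the preloaded constants already sit in XMM registers.  */
  static inline void
  example_call_enc_core (unsigned int middle_rounds, const void *keysched)
  {
    asm volatile ("call *%[core]" "\n\t"
                  : "+a" (middle_rounds), "+d" (keysched)
                  : [core] "r" (_gcry_aes_ssse3_encrypt_core)
                  : "rcx", "rsi", "rdi", "cc", "memory");
  }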
diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 8c9fc0e..fb0b7d2 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -80,7 +80,8 @@ md4.c \
md5.c \
poly1305-sse2-amd64.S poly1305-avx2-amd64.S poly1305-armv7-neon.S \
rijndael.c rijndael-internal.h rijndael-tables.h rijndael-aesni.c \
- rijndael-padlock.c rijndael-amd64.S rijndael-arm.S rijndael-ssse3-amd64.c \
+ rijndael-padlock.c rijndael-amd64.S rijndael-arm.S \
+ rijndael-ssse3-amd64.c rijndael-ssse3-amd64-asm.S \
rijndael-armv8-ce.c rijndael-armv8-aarch32-ce.S rijndael-armv8-aarch64-ce.S \
rijndael-aarch64.S \
rmd160.c \
diff --git a/cipher/rijndael-ssse3-amd64-asm.S b/cipher/rijndael-ssse3-amd64-asm.S
new file mode 100644
index 0000000..3ae55e8
--- /dev/null
+++ b/cipher/rijndael-ssse3-amd64-asm.S
@@ -0,0 +1,853 @@
+/* SSSE3 vector permutation AES for Libgcrypt
+ * Copyright (C) 2014-2017 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ *
+ * The code is based on the public domain library libvpaes version 0.5
+ * available at http://crypto.stanford.edu/vpaes/ and which carries
+ * this notice:
+ *
+ * libvpaes: constant-time SSSE3 AES encryption and decryption.
+ * version 0.5
+ *
+ * By Mike Hamburg, Stanford University, 2009. Public domain.
+ * I wrote essentially all of this code. I did not write the test
+ * vectors; they are the NIST known answer tests. I hereby release all
+ * the code and documentation here that I wrote into the public domain.
+ *
+ * This is an implementation of AES following my paper,
+ * "Accelerating AES with Vector Permute Instructions"
+ * CHES 2009; http://shiftleft.org/papers/vector_aes/
+ */
+
+#if defined(__x86_64__)
+#include <config.h>
+#if defined(HAVE_GCC_INLINE_ASM_SSSE3) && \
+ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+
+#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+# define ELF(...)
+#else
+# define ELF(...) __VA_ARGS__
+#endif
+
+.text
+
+##
+## _gcry_aes_ssse3_enc_preload
+##
+ELF(.type _gcry_aes_ssse3_enc_preload,@function)
+.globl _gcry_aes_ssse3_enc_preload
+_gcry_aes_ssse3_enc_preload:
+ lea .Laes_consts(%rip), %rax
+ movdqa (%rax), %xmm9 # 0F
+ movdqa .Lk_inv (%rax), %xmm10 # inv
+ movdqa .Lk_inv+16(%rax), %xmm11 # inva
+ movdqa .Lk_sb1 (%rax), %xmm13 # sb1u
+ movdqa .Lk_sb1+16(%rax), %xmm12 # sb1t
+ movdqa .Lk_sb2 (%rax), %xmm15 # sb2u
+ movdqa .Lk_sb2+16(%rax), %xmm14 # sb2t
+ ret
+ELF(.size _gcry_aes_ssse3_enc_preload,.-_gcry_aes_ssse3_enc_preload)
+
+##
+## _gcry_aes_ssse3_dec_preload
+##
+ELF(.type _gcry_aes_ssse3_dec_preload,@function)
+.globl _gcry_aes_ssse3_dec_preload
+_gcry_aes_ssse3_dec_preload:
+ lea .Laes_consts(%rip), %rax
+ movdqa (%rax), %xmm9 # 0F
+ movdqa .Lk_inv (%rax), %xmm10 # inv
+ movdqa .Lk_inv+16(%rax), %xmm11 # inva
+ movdqa .Lk_dsb9 (%rax), %xmm13 # sb9u
+ movdqa .Lk_dsb9+16(%rax), %xmm12 # sb9t
+ movdqa .Lk_dsbd (%rax), %xmm15 # sbdu
+ movdqa .Lk_dsbb (%rax), %xmm14 # sbbu
+ movdqa .Lk_dsbe (%rax), %xmm8 # sbeu
+ ret
+ELF(.size _gcry_aes_ssse3_dec_preload,.-_gcry_aes_ssse3_dec_preload)
+
+##
+## Constant-time SSSE3 AES core implementation.
+##
+## By Mike Hamburg (Stanford University), 2009
+## Public domain.
+##
+
+##
+## _aes_encrypt_core
+##
+## AES-encrypt %xmm0.
+##
+## Inputs:
+## %xmm0 = input
+## %xmm9-%xmm15 as in .Laes_preheat
+## (%rdx) = scheduled keys
+## %rax = nrounds - 1
+##
+## Output in %xmm0
+## Clobbers %xmm1-%xmm4, %r9, %r11, %rax, %rcx
+## Preserves %xmm6 - %xmm7 so you get some local vectors
+##
+##
+.align 16
+ELF(.type _gcry_aes_ssse3_encrypt_core,@function)
+.globl _gcry_aes_ssse3_encrypt_core
+_gcry_aes_ssse3_encrypt_core:
+_aes_encrypt_core:
+ lea .Laes_consts(%rip), %rcx
+ leaq .Lk_mc_backward(%rcx), %rdi
+ mov $16, %rsi
+ movdqa .Lk_ipt (%rcx), %xmm2 # iptlo
+ movdqa %xmm9, %xmm1
+ pandn %xmm0, %xmm1
+ psrld $4, %xmm1
+ pand %xmm9, %xmm0
+ pshufb %xmm0, %xmm2
+ movdqa .Lk_ipt+16(%rcx), %xmm0 # ipthi
+ pshufb %xmm1, %xmm0
+ pxor (%rdx),%xmm2
+ pxor %xmm2, %xmm0
+ add $16, %rdx
+ jmp .Laes_entry
+
+.align 8
+.Laes_loop:
+ # middle of middle round
+ movdqa %xmm13, %xmm4 # 4 : sb1u
+ pshufb %xmm2, %xmm4 # 4 = sb1u
+ pxor (%rdx), %xmm4 # 4 = sb1u + k
+ movdqa %xmm12, %xmm0 # 0 : sb1t
+ pshufb %xmm3, %xmm0 # 0 = sb1t
+ pxor %xmm4, %xmm0 # 0 = A
+ movdqa %xmm15, %xmm4 # 4 : sb2u
+ pshufb %xmm2, %xmm4 # 4 = sb2u
+ movdqa .Lk_mc_forward-.Lk_mc_backward(%rsi,%rdi), %xmm1
+ movdqa %xmm14, %xmm2 # 2 : sb2t
+ pshufb %xmm3, %xmm2 # 2 = sb2t
+ pxor %xmm4, %xmm2 # 2 = 2A
+ movdqa %xmm0, %xmm3 # 3 = A
+ pshufb %xmm1, %xmm0 # 0 = B
+ pxor %xmm2, %xmm0 # 0 = 2A+B
+ pshufb (%rsi,%rdi), %xmm3 # 3 = D
+ lea 16(%esi),%esi # next mc
+ pxor %xmm0, %xmm3 # 3 = 2A+B+D
+ lea 16(%rdx),%rdx # next key
+ pshufb %xmm1, %xmm0 # 0 = 2B+C
+ pxor %xmm3, %xmm0 # 0 = 2A+3B+C+D
+ and $48, %rsi # ... mod 4
+ dec %rax # nr--
+
+.Laes_entry:
+ # top of round
+ movdqa %xmm9, %xmm1 # 1 : i
+ pandn %xmm0, %xmm1 # 1 = i<<4
+ psrld $4, %xmm1 # 1 = i
+ pand %xmm9, %xmm0 # 0 = k
+ movdqa %xmm11, %xmm2 # 2 : a/k
+ pshufb %xmm0, %xmm2 # 2 = a/k
+ pxor %xmm1, %xmm0 # 0 = j
+ movdqa %xmm10, %xmm3 # 3 : 1/i
+ pshufb %xmm1, %xmm3 # 3 = 1/i
+ pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k
+ movdqa %xmm10, %xmm4 # 4 : 1/j
+ pshufb %xmm0, %xmm4 # 4 = 1/j
+ pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k
+ movdqa %xmm10, %xmm2 # 2 : 1/iak
+ pshufb %xmm3, %xmm2 # 2 = 1/iak
+ pxor %xmm0, %xmm2 # 2 = io
+ movdqa %xmm10, %xmm3 # 3 : 1/jak
+ pshufb %xmm4, %xmm3 # 3 = 1/jak
+ pxor %xmm1, %xmm3 # 3 = jo
+ jnz .Laes_loop
+
+ # middle of last round
+ movdqa .Lk_sbo(%rcx), %xmm4 # 3 : sbou
+ pshufb %xmm2, %xmm4 # 4 = sbou
+ pxor (%rdx), %xmm4 # 4 = sb1u + k
+ movdqa .Lk_sbo+16(%rcx), %xmm0 # 0 : sbot
+ pshufb %xmm3, %xmm0 # 0 = sb1t
+ pxor %xmm4, %xmm0 # 0 = A
+ pshufb .Lk_sr(%rsi,%rcx), %xmm0
+ ret
+ELF(.size _aes_encrypt_core,.-_aes_encrypt_core)
+
+##
+## Decryption core
+##
+## Same API as encryption core.
+##
+.align 16
+.globl _gcry_aes_ssse3_decrypt_core
+ELF(.type _gcry_aes_ssse3_decrypt_core,@function)
+_gcry_aes_ssse3_decrypt_core:
+_aes_decrypt_core:
+ lea .Laes_consts(%rip), %rcx
+ movl %eax, %esi
+ shll $4, %esi
+ xorl $48, %esi
+ andl $48, %esi
+ movdqa .Lk_dipt (%rcx), %xmm2 # iptlo
+ movdqa %xmm9, %xmm1
+ pandn %xmm0, %xmm1
+ psrld $4, %xmm1
+ pand %xmm9, %xmm0
+ pshufb %xmm0, %xmm2
+ movdqa .Lk_dipt+16(%rcx), %xmm0 # ipthi
+ pshufb %xmm1, %xmm0
+ pxor (%rdx), %xmm2
+ pxor %xmm2, %xmm0
+ movdqa .Lk_mc_forward+48(%rcx), %xmm5
+ lea 16(%rdx), %rdx
+ neg %rax
+ jmp .Laes_dec_entry
+
+.align 16
+.Laes_dec_loop:
+##
+## Inverse mix columns
+##
+ movdqa %xmm13, %xmm4 # 4 : sb9u
+ pshufb %xmm2, %xmm4 # 4 = sb9u
+ pxor (%rdx), %xmm4
+ movdqa %xmm12, %xmm0 # 0 : sb9t
+ pshufb %xmm3, %xmm0 # 0 = sb9t
+ movdqa .Lk_dsbd+16(%rcx),%xmm1 # 1 : sbdt
+ pxor %xmm4, %xmm0 # 0 = ch
+ lea 16(%rdx), %rdx # next round key
+
+ pshufb %xmm5, %xmm0 # MC ch
+ movdqa %xmm15, %xmm4 # 4 : sbdu
+ pshufb %xmm2, %xmm4 # 4 = sbdu
+ pxor %xmm0, %xmm4 # 4 = ch
+ pshufb %xmm3, %xmm1 # 1 = sbdt
+ pxor %xmm4, %xmm1 # 1 = ch
+
+ pshufb %xmm5, %xmm1 # MC ch
+ movdqa %xmm14, %xmm4 # 4 : sbbu
+ pshufb %xmm2, %xmm4 # 4 = sbbu
+ inc %rax # nr--
+ pxor %xmm1, %xmm4 # 4 = ch
+ movdqa .Lk_dsbb+16(%rcx),%xmm0 # 0 : sbbt
+ pshufb %xmm3, %xmm0 # 0 = sbbt
+ pxor %xmm4, %xmm0 # 0 = ch
+
+ pshufb %xmm5, %xmm0 # MC ch
+ movdqa %xmm8, %xmm4 # 4 : sbeu
+ pshufb %xmm2, %xmm4 # 4 = sbeu
+ pshufd $0x93, %xmm5, %xmm5
+ pxor %xmm0, %xmm4 # 4 = ch
+ movdqa .Lk_dsbe+16(%rcx),%xmm0 # 0 : sbet
+ pshufb %xmm3, %xmm0 # 0 = sbet
+ pxor %xmm4, %xmm0 # 0 = ch
+
+.Laes_dec_entry:
+ # top of round
+ movdqa %xmm9, %xmm1 # 1 : i
+ pandn %xmm0, %xmm1 # 1 = i<<4
+ psrld $4, %xmm1 # 1 = i
+ pand %xmm9, %xmm0 # 0 = k
+ movdqa %xmm11, %xmm2 # 2 : a/k
+ pshufb %xmm0, %xmm2 # 2 = a/k
+ pxor %xmm1, %xmm0 # 0 = j
+ movdqa %xmm10, %xmm3 # 3 : 1/i
+ pshufb %xmm1, %xmm3 # 3 = 1/i
+ pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k
+ movdqa %xmm10, %xmm4 # 4 : 1/j
+ pshufb %xmm0, %xmm4 # 4 = 1/j
+ pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k
+ movdqa %xmm10, %xmm2 # 2 : 1/iak
+ pshufb %xmm3, %xmm2 # 2 = 1/iak
+ pxor %xmm0, %xmm2 # 2 = io
+ movdqa %xmm10, %xmm3 # 3 : 1/jak
+ pshufb %xmm4, %xmm3 # 3 = 1/jak
+ pxor %xmm1, %xmm3 # 3 = jo
+ jnz .Laes_dec_loop
+
+ # middle of last round
+ movdqa .Lk_dsbo(%rcx), %xmm4 # 3 : sbou
+ pshufb %xmm2, %xmm4 # 4 = sbou
+ pxor (%rdx), %xmm4 # 4 = sb1u + k
+ movdqa .Lk_dsbo+16(%rcx), %xmm0 # 0 : sbot
+ pshufb %xmm3, %xmm0 # 0 = sb1t
+ pxor %xmm4, %xmm0 # 0 = A
+ pshufb .Lk_sr(%rsi,%rcx), %xmm0
+ ret
+ELF(.size _aes_decrypt_core,.-_aes_decrypt_core)
+
+########################################################
+## ##
+## AES key schedule ##
+## ##
+########################################################
+
+.align 16
+.globl _gcry_aes_ssse3_schedule_core
+ELF(.type _gcry_aes_ssse3_schedule_core,@function)
+_gcry_aes_ssse3_schedule_core:
+_aes_schedule_core:
+ # rdi = key
+ # rsi = size in bits
+ # rdx = buffer
+ # rcx = direction. 0=encrypt, 1=decrypt
+
+ # load the tables
+ lea .Laes_consts(%rip), %r10
+ movdqa (%r10), %xmm9 # 0F
+ movdqa .Lk_inv (%r10), %xmm10 # inv
+ movdqa .Lk_inv+16(%r10), %xmm11 # inva
+ movdqa .Lk_sb1 (%r10), %xmm13 # sb1u
+ movdqa .Lk_sb1+16(%r10), %xmm12 # sb1t
+ movdqa .Lk_sb2 (%r10), %xmm15 # sb2u
+ movdqa .Lk_sb2+16(%r10), %xmm14 # sb2t
+
+ movdqa .Lk_rcon(%r10), %xmm8 # load rcon
+ movdqu (%rdi), %xmm0 # load key (unaligned)
+
+ # input transform
+ movdqu %xmm0, %xmm3
+ lea .Lk_ipt(%r10), %r11
+ call .Laes_schedule_transform
+ movdqu %xmm0, %xmm7
+
+ test %rcx, %rcx
+ jnz .Laes_schedule_am_decrypting
+
+ # encrypting, output zeroth round key after transform
+ movdqa %xmm0, (%rdx)
+ jmp .Laes_schedule_go
+
+.Laes_schedule_am_decrypting:
+ # decrypting, output zeroth round key after shiftrows
+ pshufb .Lk_sr(%r8,%r10),%xmm3
+ movdqa %xmm3, (%rdx)
+ xor $48, %r8
+
+.Laes_schedule_go:
+ cmp $192, %rsi
+ je .Laes_schedule_192
+ cmp $256, %rsi
+ je .Laes_schedule_256
+ # 128: fall through
+
+##
+## .Laes_schedule_128
+##
+## 128-bit specific part of key schedule.
+##
+## This schedule is really simple, because all its parts
+## are accomplished by the subroutines.
+##
+.Laes_schedule_128:
+ mov $10, %rsi
+
+.Laes_schedule_128_L:
+ call .Laes_schedule_round
+ dec %rsi
+ jz .Laes_schedule_mangle_last
+ call .Laes_schedule_mangle # write output
+ jmp .Laes_schedule_128_L
+
+##
+## .Laes_schedule_192
+##
+## 192-bit specific part of key schedule.
+##
+## The main body of this schedule is the same as the 128-bit
+## schedule, but with more smearing. The long, high side is
+## stored in %xmm7 as before, and the short, low side is in
+## the high bits of %xmm6.
+##
+## This schedule is somewhat nastier, however, because each
+## round produces 192 bits of key material, or 1.5 round keys.
+## Therefore, on each cycle we do 2 rounds and produce 3 round
+## keys.
+##
+.Laes_schedule_192:
+ movdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
+ call .Laes_schedule_transform # input transform
+ pshufd $0x0E, %xmm0, %xmm6
+ pslldq $8, %xmm6 # clobber low side with zeros
+ mov $4, %rsi
+
+.Laes_schedule_192_L:
+ call .Laes_schedule_round
+ palignr $8,%xmm6,%xmm0
+ call .Laes_schedule_mangle # save key n
+ call .Laes_schedule_192_smear
+ call .Laes_schedule_mangle # save key n+1
+ call .Laes_schedule_round
+ dec %rsi
+ jz .Laes_schedule_mangle_last
+ call .Laes_schedule_mangle # save key n+2
+ call .Laes_schedule_192_smear
+ jmp .Laes_schedule_192_L
+
+##
+## .Laes_schedule_192_smear
+##
+## Smear the short, low side in the 192-bit key schedule.
+##
+## Inputs:
+## %xmm7: high side, b a x y
+## %xmm6: low side, d c 0 0
+## %xmm13: 0
+##
+## Outputs:
+## %xmm6: b+c+d b+c 0 0
+## %xmm0: b+c+d b+c b a
+##
+.Laes_schedule_192_smear:
+ pshufd $0x80, %xmm6, %xmm0 # d c 0 0 -> c 0 0 0
+ pxor %xmm0, %xmm6 # -> c+d c 0 0
+ pshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
+ pxor %xmm6, %xmm0 # -> b+c+d b+c b a
+ pshufd $0x0E, %xmm0, %xmm6
+ pslldq $8, %xmm6 # clobber low side with zeros
+ ret
+
+##
+## .Laes_schedule_256
+##
+## 256-bit specific part of key schedule.
+##
+## The structure here is very similar to the 128-bit
+## schedule, but with an additional 'low side' in
+## %xmm6. The low side's rounds are the same as the
+## high side's, except no rcon and no rotation.
+##
+.Laes_schedule_256:
+ movdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
+ call .Laes_schedule_transform # input transform
+ mov $7, %rsi
+
+.Laes_schedule_256_L:
+ call .Laes_schedule_mangle # output low result
+ movdqa %xmm0, %xmm6 # save cur_lo in xmm6
+
+ # high round
+ call .Laes_schedule_round
+ dec %rsi
+ jz .Laes_schedule_mangle_last
+ call .Laes_schedule_mangle
+
+ # low round. swap xmm7 and xmm6
+ pshufd $0xFF, %xmm0, %xmm0
+ movdqa %xmm7, %xmm5
+ movdqa %xmm6, %xmm7
+ call .Laes_schedule_low_round
+ movdqa %xmm5, %xmm7
+
+ jmp .Laes_schedule_256_L
+
+##
+## .Laes_schedule_round
+##
+## Runs one main round of the key schedule on %xmm0, %xmm7
+##
+## Specifically, runs subbytes on the high dword of %xmm0
+## then rotates it by one byte and xors into the low dword of
+## %xmm7.
+##
+## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
+## next rcon.
+##
+## Smears the dwords of %xmm7 by xoring the low into the
+## second low, result into third, result into highest.
+##
+## Returns results in %xmm7 = %xmm0.
+## Clobbers %xmm1-%xmm4, %r11.
+##
+.Laes_schedule_round:
+ # extract rcon from xmm8
+ pxor %xmm1, %xmm1
+ palignr $15, %xmm8, %xmm1
+ palignr $15, %xmm8, %xmm8
+ pxor %xmm1, %xmm7
+
+ # rotate
+ pshufd $0xFF, %xmm0, %xmm0
+ palignr $1, %xmm0, %xmm0
+
+ # fall through...
+
+ # low round: same as high round, but no rotation and no rcon.
+.Laes_schedule_low_round:
+ # smear xmm7
+ movdqa %xmm7, %xmm1
+ pslldq $4, %xmm7
+ pxor %xmm1, %xmm7
+ movdqa %xmm7, %xmm1
+ pslldq $8, %xmm7
+ pxor %xmm1, %xmm7
+ pxor .Lk_s63(%r10), %xmm7
+
+ # subbytes
+ movdqa %xmm9, %xmm1
+ pandn %xmm0, %xmm1
+ psrld $4, %xmm1 # 1 = i
+ pand %xmm9, %xmm0 # 0 = k
+ movdqa %xmm11, %xmm2 # 2 : a/k
+ pshufb %xmm0, %xmm2 # 2 = a/k
+ pxor %xmm1, %xmm0 # 0 = j
+ movdqa %xmm10, %xmm3 # 3 : 1/i
+ pshufb %xmm1, %xmm3 # 3 = 1/i
+ pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k
+ movdqa %xmm10, %xmm4 # 4 : 1/j
+ pshufb %xmm0, %xmm4 # 4 = 1/j
+ pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k
+ movdqa %xmm10, %xmm2 # 2 : 1/iak
+ pshufb %xmm3, %xmm2 # 2 = 1/iak
+ pxor %xmm0, %xmm2 # 2 = io
+ movdqa %xmm10, %xmm3 # 3 : 1/jak
+ pshufb %xmm4, %xmm3 # 3 = 1/jak
+ pxor %xmm1, %xmm3 # 3 = jo
+ movdqa .Lk_sb1(%r10), %xmm4 # 4 : sbou
+ pshufb %xmm2, %xmm4 # 4 = sbou
+ movdqa .Lk_sb1+16(%r10), %xmm0 # 0 : sbot
+ pshufb %xmm3, %xmm0 # 0 = sb1t
+ pxor %xmm4, %xmm0 # 0 = sbox output
+
+ # add in smeared stuff
+ pxor %xmm7, %xmm0
+ movdqa %xmm0, %xmm7
+ ret
+
+##
+## .Laes_schedule_transform
+##
+## Linear-transform %xmm0 according to tables at (%r11)
+##
+## Requires that %xmm9 = 0x0F0F... as in preheat
+## Output in %xmm0
+## Clobbers %xmm1, %xmm2
+##
+.Laes_schedule_transform:
+ movdqa %xmm9, %xmm1
+ pandn %xmm0, %xmm1
+ psrld $4, %xmm1
+ pand %xmm9, %xmm0
+ movdqa (%r11), %xmm2 # lo
+ pshufb %xmm0, %xmm2
+ movdqa 16(%r11), %xmm0 # hi
+ pshufb %xmm1, %xmm0
+ pxor %xmm2, %xmm0
+ ret
+
+##
+## .Laes_schedule_mangle
+##
+## Mangle xmm0 from (basis-transformed) standard version
+## to our version.
+##
+## On encrypt,
+## xor with 0x63
+## multiply by circulant 0,1,1,1
+## apply shiftrows transform
+##
+## On decrypt,
+## xor with 0x63
+## multiply by 'inverse mixcolumns' circulant E,B,D,9
+## deskew
+## apply shiftrows transform
+##
+##
+## Writes out to (%rdx), and increments or decrements it
+## Keeps track of round number mod 4 in %r8
+## Preserves xmm0
+## Clobbers xmm1-xmm5
+##
+.Laes_schedule_mangle:
+ movdqa %xmm0, %xmm4 # save xmm0 for later
+ movdqa .Lk_mc_forward(%r10),%xmm5
+ test %rcx, %rcx
+ jnz .Laes_schedule_mangle_dec
+
+ # encrypting
+ add $16, %rdx
+ pxor .Lk_s63(%r10),%xmm4
+ pshufb %xmm5, %xmm4
+ movdqa %xmm4, %xmm3
+ pshufb %xmm5, %xmm4
+ pxor %xmm4, %xmm3
+ pshufb %xmm5, %xmm4
+ pxor %xmm4, %xmm3
+
+ jmp .Laes_schedule_mangle_both
+
+.Laes_schedule_mangle_dec:
+ lea .Lk_dks_1(%r10), %r11 # first table: *9
+ call .Laes_schedule_transform
+ movdqa %xmm0, %xmm3
+ pshufb %xmm5, %xmm3
+
+ add $32, %r11 # next table: *B
+ call .Laes_schedule_transform
+ pxor %xmm0, %xmm3
+ pshufb %xmm5, %xmm3
+
+ add $32, %r11 # next table: *D
+ call .Laes_schedule_transform
+ pxor %xmm0, %xmm3
+ pshufb %xmm5, %xmm3
+
+ add $32, %r11 # next table: *E
+ call .Laes_schedule_transform
+ pxor %xmm0, %xmm3
+ pshufb %xmm5, %xmm3
+
+ movdqa %xmm4, %xmm0 # restore %xmm0
+ add $-16, %rdx
+
+.Laes_schedule_mangle_both:
+ pshufb .Lk_sr(%r8,%r10),%xmm3
+ add $-16, %r8
+ and $48, %r8
+ movdqa %xmm3, (%rdx)
+ ret
+
+##
+## .Laes_schedule_mangle_last
+##
+## Mangler for last round of key schedule
+## Mangles %xmm0
+## when encrypting, outputs out(%xmm0) ^ 63
+## when decrypting, outputs unskew(%xmm0)
+##
+## Always called right before return... jumps to cleanup and exits
+##
+.Laes_schedule_mangle_last:
+ # schedule last round key from xmm0
+ lea .Lk_deskew(%r10),%r11 # prepare to deskew
+ test %rcx, %rcx
+ jnz .Laes_schedule_mangle_last_dec
+
+ # encrypting
+ pshufb .Lk_sr(%r8,%r10),%xmm0 # output permute
+ lea .Lk_opt(%r10), %r11 # prepare to output transform
+ add $32, %rdx
+
+.Laes_schedule_mangle_last_dec:
+ add $-16, %rdx
+ pxor .Lk_s63(%r10), %xmm0
+ call .Laes_schedule_transform # output transform
+ movdqa %xmm0, (%rdx) # save last key
+
+ #_aes_cleanup
+ pxor %xmm0, %xmm0
+ pxor %xmm1, %xmm1
+ pxor %xmm2, %xmm2
+ pxor %xmm3, %xmm3
+ pxor %xmm4, %xmm4
+ pxor %xmm5, %xmm5
+ pxor %xmm6, %xmm6
+ pxor %xmm7, %xmm7
+ pxor %xmm8, %xmm8
+ ret
+ELF(.size _aes_schedule_core,.-_aes_schedule_core)
+
+########################################################
+## ##
+## Constants ##
+## ##
+########################################################
+
+.align 16
+ELF(.type _aes_consts,@object)
+.Laes_consts:
+_aes_consts:
+ # s0F
+ .Lk_s0F = .-.Laes_consts
+ .quad 0x0F0F0F0F0F0F0F0F
+ .quad 0x0F0F0F0F0F0F0F0F
+
+ # input transform (lo, hi)
+ .Lk_ipt = .-.Laes_consts
+ .quad 0xC2B2E8985A2A7000
+ .quad 0xCABAE09052227808
+ .quad 0x4C01307D317C4D00
+ .quad 0xCD80B1FCB0FDCC81
+
+ # inv, inva
+ .Lk_inv = .-.Laes_consts
+ .quad 0x0E05060F0D080180
+ .quad 0x040703090A0B0C02
+ .quad 0x01040A060F0B0780
+ .quad 0x030D0E0C02050809
+
+ # sb1u, sb1t
+ .Lk_sb1 = .-.Laes_consts
+ .quad 0xB19BE18FCB503E00
+ .quad 0xA5DF7A6E142AF544
+ .quad 0x3618D415FAE22300
+ .quad 0x3BF7CCC10D2ED9EF
+
+
+ # sb2u, sb2t
+ .Lk_sb2 = .-.Laes_consts
+ .quad 0xE27A93C60B712400
+ .quad 0x5EB7E955BC982FCD
+ .quad 0x69EB88400AE12900
+ .quad 0xC2A163C8AB82234A
+
+ # sbou, sbot
+ .Lk_sbo = .-.Laes_consts
+ .quad 0xD0D26D176FBDC700
+ .quad 0x15AABF7AC502A878
+ .quad 0xCFE474A55FBB6A00
+ .quad 0x8E1E90D1412B35FA
+
+ # mc_forward
+ .Lk_mc_forward = .-.Laes_consts
+ .quad 0x0407060500030201
+ .quad 0x0C0F0E0D080B0A09
+ .quad 0x080B0A0904070605
+ .quad 0x000302010C0F0E0D
+ .quad 0x0C0F0E0D080B0A09
+ .quad 0x0407060500030201
+ .quad 0x000302010C0F0E0D
+ .quad 0x080B0A0904070605
+
+ # mc_backward
+ .Lk_mc_backward = .-.Laes_consts
+ .quad 0x0605040702010003
+ .quad 0x0E0D0C0F0A09080B
+ .quad 0x020100030E0D0C0F
+ .quad 0x0A09080B06050407
+ .quad 0x0E0D0C0F0A09080B
+ .quad 0x0605040702010003
+ .quad 0x0A09080B06050407
+ .quad 0x020100030E0D0C0F
+
+ # sr
+ .Lk_sr = .-.Laes_consts
+ .quad 0x0706050403020100
+ .quad 0x0F0E0D0C0B0A0908
+ .quad 0x030E09040F0A0500
+ .quad 0x0B06010C07020D08
+ .quad 0x0F060D040B020900
+ .quad 0x070E050C030A0108
+ .quad 0x0B0E0104070A0D00
+ .quad 0x0306090C0F020508
+
+ # rcon
+ .Lk_rcon = .-.Laes_consts
+ .quad 0x1F8391B9AF9DEEB6
+ .quad 0x702A98084D7C7D81
+
+ # s63: all equal to 0x63 transformed
+ .Lk_s63 = .-.Laes_consts
+ .quad 0x5B5B5B5B5B5B5B5B
+ .quad 0x5B5B5B5B5B5B5B5B
+
+ # output transform
+ .Lk_opt = .-.Laes_consts
+ .quad 0xFF9F4929D6B66000
+ .quad 0xF7974121DEBE6808
+ .quad 0x01EDBD5150BCEC00
+ .quad 0xE10D5DB1B05C0CE0
+
+ # deskew tables: inverts the sbox's 'skew'
+ .Lk_deskew = .-.Laes_consts
+ .quad 0x07E4A34047A4E300
+ .quad 0x1DFEB95A5DBEF91A
+ .quad 0x5F36B5DC83EA6900
+ .quad 0x2841C2ABF49D1E77
+
+##
+## Decryption stuff
+## Key schedule constants
+##
+ # decryption key schedule: x -> invskew x*9
+ .Lk_dks_1 = .-.Laes_consts
+ .quad 0xB6116FC87ED9A700
+ .quad 0x4AED933482255BFC
+ .quad 0x4576516227143300
+ .quad 0x8BB89FACE9DAFDCE
+
+ # decryption key schedule: invskew x*9 -> invskew x*D
+ .Lk_dks_2 = .-.Laes_consts
+ .quad 0x27438FEBCCA86400
+ .quad 0x4622EE8AADC90561
+ .quad 0x815C13CE4F92DD00
+ .quad 0x73AEE13CBD602FF2
+
+ # decryption key schedule: invskew x*D -> invskew x*B
+ .Lk_dks_3 = .-.Laes_consts
+ .quad 0x03C4C50201C6C700
+ .quad 0xF83F3EF9FA3D3CFB
+ .quad 0xEE1921D638CFF700
+ .quad 0xA5526A9D7384BC4B
+
+ # decryption key schedule: invskew x*B -> invskew x*E + 0x63
+ .Lk_dks_4 = .-.Laes_consts
+ .quad 0xE3C390B053732000
+ .quad 0xA080D3F310306343
+ .quad 0xA0CA214B036982E8
+ .quad 0x2F45AEC48CE60D67
+
+##
+## Decryption stuff
+## Round function constants
+##
+ # decryption input transform
+ .Lk_dipt = .-.Laes_consts
+ .quad 0x0F505B040B545F00
+ .quad 0x154A411E114E451A
+ .quad 0x86E383E660056500
+ .quad 0x12771772F491F194
+
+ # decryption sbox output *9*u, *9*t
+ .Lk_dsb9 = .-.Laes_consts
+ .quad 0x851C03539A86D600
+ .quad 0xCAD51F504F994CC9
+ .quad 0xC03B1789ECD74900
+ .quad 0x725E2C9EB2FBA565
+
+ # decryption sbox output *D*u, *D*t
+ .Lk_dsbd = .-.Laes_consts
+ .quad 0x7D57CCDFE6B1A200
+ .quad 0xF56E9B13882A4439
+ .quad 0x3CE2FAF724C6CB00
+ .quad 0x2931180D15DEEFD3
+
+ # decryption sbox output *B*u, *B*t
+ .Lk_dsbb = .-.Laes_consts
+ .quad 0xD022649296B44200
+ .quad 0x602646F6B0F2D404
+ .quad 0xC19498A6CD596700
+ .quad 0xF3FF0C3E3255AA6B
+
+ # decryption sbox output *E*u, *E*t
+ .Lk_dsbe = .-.Laes_consts
+ .quad 0x46F2929626D4D000
+ .quad 0x2242600464B4F6B0
+ .quad 0x0C55A6CDFFAAC100
+ .quad 0x9467F36B98593E32
+
+ # decryption sbox final output
+ .Lk_dsbo = .-.Laes_consts
+ .quad 0x1387EA537EF94000
+ .quad 0xC7AA6DB9D4943E2D
+ .quad 0x12D7560F93441D00
+ .quad 0xCA4B8159D8C58E9C
+ELF(.size _aes_consts,.-_aes_consts)
+
+#endif
+#endif
diff --git a/cipher/rijndael-ssse3-amd64.c b/cipher/rijndael-ssse3-amd64.c
index 2adb73f..25d1849 100644
--- a/cipher/rijndael-ssse3-amd64.c
+++ b/cipher/rijndael-ssse3-amd64.c
@@ -1,5 +1,5 @@
/* SSSE3 vector permutation AES for Libgcrypt
- * Copyright (C) 2014-2015 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ * Copyright (C) 2014-2017 Jussi Kivilinna <jussi.kivilinna@iki.fi>
*
* This file is part of Libgcrypt.
*
@@ -57,11 +57,22 @@
#endif
+/* Assembly functions in rijndael-ssse3-amd64-asm.S. Note that these
+ have a custom calling convention and need to be called from assembly
+ blocks, not directly. */
+extern void _gcry_aes_ssse3_enc_preload(void);
+extern void _gcry_aes_ssse3_dec_preload(void);
+extern void _gcry_aes_ssse3_schedule_core(void);
+extern void _gcry_aes_ssse3_encrypt_core(void);
+extern void _gcry_aes_ssse3_decrypt_core(void);
+
+
+
/* Two macros to be called prior and after the use of SSSE3
- instructions. There should be no external function calls between
- the use of these macros. There purpose is to make sure that the
- SSE registers are cleared and won't reveal any information about
- the key or the data. */
+ instructions. There should be no external function calls between
+ the use of these macros. Their purpose is to make sure that the
+ SSE registers are cleared and won't reveal any information about
+ the key or the data. */
#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
# define SSSE3_STATE_SIZE (16 * 10)
/* XMM6-XMM15 are callee-saved registers on WIN64. */
@@ -115,34 +126,19 @@
::: "memory" )
#endif
-#define vpaes_ssse3_prepare_enc(const_ptr) \
+#define vpaes_ssse3_prepare_enc() \
vpaes_ssse3_prepare(); \
- asm volatile ("lea .Laes_consts(%%rip), %q0 \n\t" \
- "movdqa (%q0), %%xmm9 # 0F \n\t" \
- "movdqa .Lk_inv (%q0), %%xmm10 # inv \n\t" \
- "movdqa .Lk_inv+16(%q0), %%xmm11 # inva \n\t" \
- "movdqa .Lk_sb1 (%q0), %%xmm13 # sb1u \n\t" \
- "movdqa .Lk_sb1+16(%q0), %%xmm12 # sb1t \n\t" \
- "movdqa .Lk_sb2 (%q0), %%xmm15 # sb2u \n\t" \
- "movdqa .Lk_sb2+16(%q0), %%xmm14 # sb2t \n\t" \
- : "=c" (const_ptr) \
+ asm volatile ("call *%[core] \n\t" \
: \
- : "memory" )
+ : [core] "r" (_gcry_aes_ssse3_enc_preload) \
+ : "rax", "cc", "memory" )
-#define vpaes_ssse3_prepare_dec(const_ptr) \
+#define vpaes_ssse3_prepare_dec() \
vpaes_ssse3_prepare(); \
- asm volatile ("lea .Laes_consts(%%rip), %q0 \n\t" \
- "movdqa (%q0), %%xmm9 # 0F \n\t" \
- "movdqa .Lk_inv (%q0), %%xmm10 # inv \n\t" \
- "movdqa .Lk_inv+16(%q0), %%xmm11 # inva \n\t" \
- "movdqa .Lk_dsb9 (%q0), %%xmm13 # sb9u \n\t" \
- "movdqa .Lk_dsb9+16(%q0), %%xmm12 # sb9t \n\t" \
- "movdqa .Lk_dsbd (%q0), %%xmm15 # sbdu \n\t" \
- "movdqa .Lk_dsbb (%q0), %%xmm14 # sbbu \n\t" \
- "movdqa .Lk_dsbe (%q0), %%xmm8 # sbeu \n\t" \
- : "=c" (const_ptr) \
+ asm volatile ("call *%[core] \n\t" \
: \
- : "memory" )
+ : [core] "r" (_gcry_aes_ssse3_dec_preload) \
+ : "rax", "cc", "memory" )
@@ -159,9 +155,10 @@ _gcry_aes_ssse3_do_setkey (RIJNDAEL_context *ctx, const byte *key)
"leaq %[buf], %%rdx" "\n\t"
"movl %[dir], %%ecx" "\n\t"
"movl %[rotoffs], %%r8d" "\n\t"
- "call _aes_schedule_core" "\n\t"
+ "call *%[core]" "\n\t"
:
- : [key] "m" (*key),
+ : [core] "r" (&_gcry_aes_ssse3_schedule_core),
+ [key] "m" (*key),
[bits] "g" (keybits),
[buf] "m" (ctx->keyschenc32[0][0]),
[dir] "g" (0),
@@ -169,10 +166,31 @@ _gcry_aes_ssse3_do_setkey (RIJNDAEL_context *ctx, const byte *key)
: "r8", "r9", "r10", "r11", "rax", "rcx", "rdx", "rdi", "rsi",
"cc", "memory");
- vpaes_ssse3_cleanup();
-
/* Save key for setting up decryption. */
- memcpy(&ctx->keyschdec32[0][0], key, keybits / 8);
+ if (keybits > 192)
+ asm volatile ("movdqu (%[src]), %%xmm0\n\t"
+ "movdqu 16(%[src]), %%xmm1\n\t"
+ "movdqu %%xmm0, (%[dst])\n\t"
+ "movdqu %%xmm1, 16(%[dst])\n\t"
+ : /* No output */
+ : [dst] "r" (&ctx->keyschdec32[0][0]), [src] "r" (key)
+ : "memory" );
+ else if (keybits == 192)
+ asm volatile ("movdqu (%[src]), %%xmm0\n\t"
+ "movq 16(%[src]), %%xmm1\n\t"
+ "movdqu %%xmm0, (%[dst])\n\t"
+ "movq %%xmm1, 16(%[dst])\n\t"
+ : /* No output */
+ : [dst] "r" (&ctx->keyschdec32[0][0]), [src] "r" (key)
+ : "memory" );
+ else
+ asm volatile ("movdqu (%[src]), %%xmm0\n\t"
+ "movdqu %%xmm0, (%[dst])\n\t"
+ : /* No output */
+ : [dst] "r" (&ctx->keyschdec32[0][0]), [src] "r" (key)
+ : "memory" );
+
+ vpaes_ssse3_cleanup();
}
@@ -190,9 +208,10 @@ _gcry_aes_ssse3_prepare_decryption (RIJNDAEL_context *ctx)
"leaq %[buf], %%rdx" "\n\t"
"movl %[dir], %%ecx" "\n\t"
"movl %[rotoffs], %%r8d" "\n\t"
- "call _aes_schedule_core" "\n\t"
+ "call *%[core]" "\n\t"
:
- : [key] "m" (ctx->keyschdec32[0][0]),
+ : [core] "r" (_gcry_aes_ssse3_schedule_core),
+ [key] "m" (ctx->keyschdec32[0][0]),
[bits] "g" (keybits),
[buf] "m" (ctx->keyschdec32[ctx->rounds][0]),
[dir] "g" (1),
@@ -207,32 +226,30 @@ _gcry_aes_ssse3_prepare_decryption (RIJNDAEL_context *ctx)
/* Encrypt one block using the Intel SSSE3 instructions. Block is input
* and output through SSE register xmm0. */
static inline void
-do_vpaes_ssse3_enc (const RIJNDAEL_context *ctx, unsigned int nrounds,
- const void *aes_const_ptr)
+do_vpaes_ssse3_enc (const RIJNDAEL_context *ctx, unsigned int nrounds)
{
unsigned int middle_rounds = nrounds - 1;
const void *keysched = ctx->keyschenc32;
- asm volatile ("call _aes_encrypt_core" "\n\t"
- : "+a" (middle_rounds), "+d" (keysched)
- : "c" (aes_const_ptr)
- : "rdi", "rsi", "cc", "memory");
+ asm volatile ("call *%[core]" "\n\t"
+ : "+a" (middle_rounds), "+d" (keysched)
+ : [core] "r" (_gcry_aes_ssse3_encrypt_core)
+ : "rcx", "rsi", "rdi", "cc", "memory");
}
/* Decrypt one block using the Intel SSSE3 instructions. Block is input
* and output through SSE register xmm0. */
static inline void
-do_vpaes_ssse3_dec (const RIJNDAEL_context *ctx, unsigned int nrounds,
- const void *aes_const_ptr)
+do_vpaes_ssse3_dec (const RIJNDAEL_context *ctx, unsigned int nrounds)
{
unsigned int middle_rounds = nrounds - 1;
const void *keysched = ctx->keyschdec32;
- asm volatile ("call _aes_decrypt_core" "\n\t"
+ asm volatile ("call *%[core]" "\n\t"
: "+a" (middle_rounds), "+d" (keysched)
- : "c" (aes_const_ptr)
- : "rsi", "cc", "memory");
+ : [core] "r" (_gcry_aes_ssse3_decrypt_core)
+ : "rcx", "rsi", "cc", "memory");
}
@@ -241,15 +258,14 @@ _gcry_aes_ssse3_encrypt (const RIJNDAEL_context *ctx, unsigned char *dst,
const unsigned char *src)
{
unsigned int nrounds = ctx->rounds;
- const void *aes_const_ptr;
byte ssse3_state[SSSE3_STATE_SIZE];
- vpaes_ssse3_prepare_enc (aes_const_ptr);
+ vpaes_ssse3_prepare_enc ();
asm volatile ("movdqu %[src], %%xmm0\n\t"
:
: [src] "m" (*src)
: "memory" );
- do_vpaes_ssse3_enc (ctx, nrounds, aes_const_ptr);
+ do_vpaes_ssse3_enc (ctx, nrounds);
asm volatile ("movdqu %%xmm0, %[dst]\n\t"
: [dst] "=m" (*dst)
:
@@ -265,10 +281,9 @@ _gcry_aes_ssse3_cfb_enc (RIJNDAEL_context *ctx, unsigned char *outbuf,
size_t nblocks)
{
unsigned int nrounds = ctx->rounds;
- const void *aes_const_ptr;
byte ssse3_state[SSSE3_STATE_SIZE];
- vpaes_ssse3_prepare_enc (aes_const_ptr);
+ vpaes_ssse3_prepare_enc ();
asm volatile ("movdqu %[iv], %%xmm0\n\t"
: /* No output */
@@ -277,7 +292,7 @@ _gcry_aes_ssse3_cfb_enc (RIJNDAEL_context *ctx, unsigned char *outbuf,
for ( ;nblocks; nblocks-- )
{
- do_vpaes_ssse3_enc (ctx, nrounds, aes_const_ptr);
+ do_vpaes_ssse3_enc (ctx, nrounds);
asm volatile ("movdqu %[inbuf], %%xmm1\n\t"
"pxor %%xmm1, %%xmm0\n\t"
@@ -305,10 +320,9 @@ _gcry_aes_ssse3_cbc_enc (RIJNDAEL_context *ctx, unsigned char *outbuf,
size_t nblocks, int cbc_mac)
{
unsigned int nrounds = ctx->rounds;
- const void *aes_const_ptr;
byte ssse3_state[SSSE3_STATE_SIZE];
- vpaes_ssse3_prepare_enc (aes_const_ptr);
+ vpaes_ssse3_prepare_enc ();
asm volatile ("movdqu %[iv], %%xmm7\n\t"
: /* No output */
@@ -323,7 +337,7 @@ _gcry_aes_ssse3_cbc_enc (RIJNDAEL_context *ctx, unsigned char *outbuf,
: [inbuf] "m" (*inbuf)
: "memory" );
- do_vpaes_ssse3_enc (ctx, nrounds, aes_const_ptr);
+ do_vpaes_ssse3_enc (ctx, nrounds);
asm volatile ("movdqa %%xmm0, %%xmm7\n\t"
"movdqu %%xmm0, %[outbuf]\n\t"
@@ -353,11 +367,10 @@ _gcry_aes_ssse3_ctr_enc (RIJNDAEL_context *ctx, unsigned char *outbuf,
static const unsigned char be_mask[16] __attribute__ ((aligned (16))) =
{ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
unsigned int nrounds = ctx->rounds;
- const void *aes_const_ptr;
byte ssse3_state[SSSE3_STATE_SIZE];
u64 ctrlow;
- vpaes_ssse3_prepare_enc (aes_const_ptr);
+ vpaes_ssse3_prepare_enc ();
asm volatile ("movdqa %[mask], %%xmm6\n\t" /* Preload mask */
"movdqa (%[ctr]), %%xmm7\n\t" /* Preload CTR */
@@ -388,10 +401,10 @@ _gcry_aes_ssse3_ctr_enc (RIJNDAEL_context *ctx, unsigned char *outbuf,
"pshufb %%xmm6, %%xmm7\n\t"
: [ctrlow] "+r" (ctrlow)
- : [ctr] "r" (ctr)
+ :
: "cc", "memory");
- do_vpaes_ssse3_enc (ctx, nrounds, aes_const_ptr);
+ do_vpaes_ssse3_enc (ctx, nrounds);
asm volatile ("movdqu %[src], %%xmm1\n\t" /* xmm1 := input */
"pxor %%xmm1, %%xmm0\n\t" /* EncCTR ^= input */
@@ -418,15 +431,14 @@ _gcry_aes_ssse3_decrypt (const RIJNDAEL_context *ctx, unsigned char *dst,
const unsigned char *src)
{
unsigned int nrounds = ctx->rounds;
- const void *aes_const_ptr;
byte ssse3_state[SSSE3_STATE_SIZE];
- vpaes_ssse3_prepare_dec (aes_const_ptr);
+ vpaes_ssse3_prepare_dec ();
asm volatile ("movdqu %[src], %%xmm0\n\t"
:
: [src] "m" (*src)
: "memory" );
- do_vpaes_ssse3_dec (ctx, nrounds, aes_const_ptr);
+ do_vpaes_ssse3_dec (ctx, nrounds);
asm volatile ("movdqu %%xmm0, %[dst]\n\t"
: [dst] "=m" (*dst)
:
@@ -442,10 +454,9 @@ _gcry_aes_ssse3_cfb_dec (RIJNDAEL_context *ctx, unsigned char *outbuf,
size_t nblocks)
{
unsigned int nrounds = ctx->rounds;
- const void *aes_const_ptr;
byte ssse3_state[SSSE3_STATE_SIZE];
- vpaes_ssse3_prepare_enc (aes_const_ptr);
+ vpaes_ssse3_prepare_enc ();
asm volatile ("movdqu %[iv], %%xmm0\n\t"
: /* No output */
@@ -454,7 +465,7 @@ _gcry_aes_ssse3_cfb_dec (RIJNDAEL_context *ctx, unsigned char *outbuf,
for ( ;nblocks; nblocks-- )
{
- do_vpaes_ssse3_enc (ctx, nrounds, aes_const_ptr);
+ do_vpaes_ssse3_enc (ctx, nrounds);
asm volatile ("movdqa %%xmm0, %%xmm6\n\t"
"movdqu %[inbuf], %%xmm0\n\t"
@@ -483,45 +494,40 @@ _gcry_aes_ssse3_cbc_dec (RIJNDAEL_context *ctx, unsigned char *outbuf,
size_t nblocks)
{
unsigned int nrounds = ctx->rounds;
- const void *aes_const_ptr;
byte ssse3_state[SSSE3_STATE_SIZE];
- vpaes_ssse3_prepare_dec (aes_const_ptr);
+ vpaes_ssse3_prepare_dec ();
- asm volatile
- ("movdqu %[iv], %%xmm7\n\t" /* use xmm7 as fast IV storage */
- : /* No output */
- : [iv] "m" (*iv)
- : "memory");
+ asm volatile ("movdqu %[iv], %%xmm7\n\t" /* use xmm7 as fast IV storage */
+ : /* No output */
+ : [iv] "m" (*iv)
+ : "memory");
for ( ;nblocks; nblocks-- )
{
- asm volatile
- ("movdqu %[inbuf], %%xmm0\n\t"
- "movdqa %%xmm0, %%xmm6\n\t" /* use xmm6 as savebuf */
- : /* No output */
- : [inbuf] "m" (*inbuf)
- : "memory");
-
- do_vpaes_ssse3_dec (ctx, nrounds, aes_const_ptr);
-
- asm volatile
- ("pxor %%xmm7, %%xmm0\n\t" /* xor IV with output */
- "movdqu %%xmm0, %[outbuf]\n\t"
- "movdqu %%xmm6, %%xmm7\n\t" /* store savebuf as new IV */
- : [outbuf] "=m" (*outbuf)
- :
- : "memory");
+ asm volatile ("movdqu %[inbuf], %%xmm0\n\t"
+ "movdqa %%xmm0, %%xmm6\n\t" /* use xmm6 as savebuf */
+ : /* No output */
+ : [inbuf] "m" (*inbuf)
+ : "memory");
+
+ do_vpaes_ssse3_dec (ctx, nrounds);
+
+ asm volatile ("pxor %%xmm7, %%xmm0\n\t" /* xor IV with output */
+ "movdqu %%xmm0, %[outbuf]\n\t"
+ "movdqu %%xmm6, %%xmm7\n\t" /* store savebuf as new IV */
+ : [outbuf] "=m" (*outbuf)
+ :
+ : "memory");
outbuf += BLOCKSIZE;
inbuf += BLOCKSIZE;
}
- asm volatile
- ("movdqu %%xmm7, %[iv]\n\t" /* store IV */
- : /* No output */
- : [iv] "m" (*iv)
- : "memory");
+ asm volatile ("movdqu %%xmm7, %[iv]\n\t" /* store IV */
+ : /* No output */
+ : [iv] "m" (*iv)
+ : "memory");
vpaes_ssse3_cleanup ();
}
@@ -536,10 +542,9 @@ ssse3_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
const unsigned char *inbuf = inbuf_arg;
u64 n = c->u_mode.ocb.data_nblocks;
unsigned int nrounds = ctx->rounds;
- const void *aes_const_ptr;
byte ssse3_state[SSSE3_STATE_SIZE];
- vpaes_ssse3_prepare_enc (aes_const_ptr);
+ vpaes_ssse3_prepare_enc ();
/* Preload Offset and Checksum */
asm volatile ("movdqu %[iv], %%xmm7\n\t"
@@ -568,7 +573,7 @@ ssse3_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
[inbuf] "m" (*inbuf)
: "memory" );
- do_vpaes_ssse3_enc (ctx, nrounds, aes_const_ptr);
+ do_vpaes_ssse3_enc (ctx, nrounds);
asm volatile ("pxor %%xmm7, %%xmm0\n\t"
"movdqu %%xmm0, %[outbuf]\n\t"
@@ -600,10 +605,9 @@ ssse3_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
const unsigned char *inbuf = inbuf_arg;
u64 n = c->u_mode.ocb.data_nblocks;
unsigned int nrounds = ctx->rounds;
- const void *aes_const_ptr;
byte ssse3_state[SSSE3_STATE_SIZE];
- vpaes_ssse3_prepare_dec (aes_const_ptr);
+ vpaes_ssse3_prepare_dec ();
/* Preload Offset and Checksum */
asm volatile ("movdqu %[iv], %%xmm7\n\t"
@@ -631,7 +635,7 @@ ssse3_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
[inbuf] "m" (*inbuf)
: "memory" );
- do_vpaes_ssse3_dec (ctx, nrounds, aes_const_ptr);
+ do_vpaes_ssse3_dec (ctx, nrounds);
asm volatile ("pxor %%xmm7, %%xmm0\n\t"
"pxor %%xmm0, %%xmm6\n\t"
@@ -675,10 +679,9 @@ _gcry_aes_ssse3_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
const unsigned char *abuf = abuf_arg;
u64 n = c->u_mode.ocb.aad_nblocks;
unsigned int nrounds = ctx->rounds;
- const void *aes_const_ptr;
byte ssse3_state[SSSE3_STATE_SIZE];
- vpaes_ssse3_prepare_enc (aes_const_ptr);
+ vpaes_ssse3_prepare_enc ();
/* Preload Offset and Sum */
asm volatile ("movdqu %[iv], %%xmm7\n\t"
@@ -705,7 +708,7 @@ _gcry_aes_ssse3_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
[abuf] "m" (*abuf)
: "memory" );
- do_vpaes_ssse3_enc (ctx, nrounds, aes_const_ptr);
+ do_vpaes_ssse3_enc (ctx, nrounds);
asm volatile ("pxor %%xmm0, %%xmm6\n\t"
:
@@ -726,774 +729,4 @@ _gcry_aes_ssse3_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
vpaes_ssse3_cleanup ();
}
-
-#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
-# define X(...)
-#else
-# define X(...) __VA_ARGS__
-#endif
-
-asm (
- "\n\t" "##"
- "\n\t" "## Constant-time SSSE3 AES core implementation."
- "\n\t" "##"
- "\n\t" "## By Mike Hamburg (Stanford University), 2009"
- "\n\t" "## Public domain."
- "\n\t" "##"
-
- "\n\t" ".text"
-
- "\n\t" "##"
- "\n\t" "## _aes_encrypt_core"
- "\n\t" "##"
- "\n\t" "## AES-encrypt %xmm0."
- "\n\t" "##"
- "\n\t" "## Inputs:"
- "\n\t" "## %xmm0 = input"
- "\n\t" "## %xmm9-%xmm15 as in .Laes_preheat"
- "\n\t" "## %rcx = .Laes_consts"
- "\n\t" "## (%rdx) = scheduled keys"
- "\n\t" "## %rax = nrounds - 1"
- "\n\t" "##"
- "\n\t" "## Output in %xmm0"
- "\n\t" "## Clobbers %xmm1-%xmm4, %r9, %r11, %rax"
- "\n\t" "## Preserves %xmm6 - %xmm7 so you get some local vectors"
- "\n\t" "##"
- "\n\t" "##"
- "\n\t" ".align 16"
-X("\n\t" ".type _aes_encrypt_core,@function")
- "\n\t" "_aes_encrypt_core:"
- "\n\t" " leaq .Lk_mc_backward(%rcx), %rdi"
- "\n\t" " mov $16, %rsi"
- "\n\t" " movdqa .Lk_ipt (%rcx), %xmm2 # iptlo"
- "\n\t" " movdqa %xmm9, %xmm1"
- "\n\t" " pandn %xmm0, %xmm1"
- "\n\t" " psrld $4, %xmm1"
- "\n\t" " pand %xmm9, %xmm0"
- "\n\t" " pshufb %xmm0, %xmm2"
- "\n\t" " movdqa .Lk_ipt+16(%rcx), %xmm0 # ipthi"
- "\n\t" " pshufb %xmm1, %xmm0"
- "\n\t" " pxor (%rdx),%xmm2"
- "\n\t" " pxor %xmm2, %xmm0"
- "\n\t" " add $16, %rdx"
- "\n\t" " jmp .Laes_entry"
-
- "\n\t" ".align 8"
- "\n\t" ".Laes_loop:"
- "\n\t" " # middle of middle round"
- "\n\t" " movdqa %xmm13, %xmm4 # 4 : sb1u"
- "\n\t" " pshufb %xmm2, %xmm4 # 4 = sb1u"
- "\n\t" " pxor (%rdx), %xmm4 # 4 = sb1u + k"
- "\n\t" " movdqa %xmm12, %xmm0 # 0 : sb1t"
- "\n\t" " pshufb %xmm3, %xmm0 # 0 = sb1t"
- "\n\t" " pxor %xmm4, %xmm0 # 0 = A"
- "\n\t" " movdqa %xmm15, %xmm4 # 4 : sb2u"
- "\n\t" " pshufb %xmm2, %xmm4 # 4 = sb2u"
- "\n\t" " movdqa .Lk_mc_forward-.Lk_mc_backward(%rsi,%rdi), %xmm1"
- "\n\t" " movdqa %xmm14, %xmm2 # 2 : sb2t"
- "\n\t" " pshufb %xmm3, %xmm2 # 2 = sb2t"
- "\n\t" " pxor %xmm4, %xmm2 # 2 = 2A"
- "\n\t" " movdqa %xmm0, %xmm3 # 3 = A"
- "\n\t" " pshufb %xmm1, %xmm0 # 0 = B"
- "\n\t" " pxor %xmm2, %xmm0 # 0 = 2A+B"
- "\n\t" " pshufb (%rsi,%rdi), %xmm3 # 3 = D"
- "\n\t" " lea 16(%esi),%esi # next mc"
- "\n\t" " pxor %xmm0, %xmm3 # 3 = 2A+B+D"
- "\n\t" " lea 16(%rdx),%rdx # next key"
- "\n\t" " pshufb %xmm1, %xmm0 # 0 = 2B+C"
- "\n\t" " pxor %xmm3, %xmm0 # 0 = 2A+3B+C+D"
- "\n\t" " and $48, %rsi # ... mod 4"
- "\n\t" " dec %rax # nr--"
-
- "\n\t" ".Laes_entry:"
- "\n\t" " # top of round"
- "\n\t" " movdqa %xmm9, %xmm1 # 1 : i"
- "\n\t" " pandn %xmm0, %xmm1 # 1 = i<<4"
- "\n\t" " psrld $4, %xmm1 # 1 = i"
- "\n\t" " pand %xmm9, %xmm0 # 0 = k"
- "\n\t" " movdqa %xmm11, %xmm2 # 2 : a/k"
- "\n\t" " pshufb %xmm0, %xmm2 # 2 = a/k"
- "\n\t" " pxor %xmm1, %xmm0 # 0 = j"
- "\n\t" " movdqa %xmm10, %xmm3 # 3 : 1/i"
- "\n\t" " pshufb %xmm1, %xmm3 # 3 = 1/i"
- "\n\t" " pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k"
- "\n\t" " movdqa %xmm10, %xmm4 # 4 : 1/j"
- "\n\t" " pshufb %xmm0, %xmm4 # 4 = 1/j"
- "\n\t" " pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k"
- "\n\t" " movdqa %xmm10, %xmm2 # 2 : 1/iak"
- "\n\t" " pshufb %xmm3, %xmm2 # 2 = 1/iak"
- "\n\t" " pxor %xmm0, %xmm2 # 2 = io"
- "\n\t" " movdqa %xmm10, %xmm3 # 3 : 1/jak"
- "\n\t" " pshufb %xmm4, %xmm3 # 3 = 1/jak"
- "\n\t" " pxor %xmm1, %xmm3 # 3 = jo"
- "\n\t" " jnz .Laes_loop"
-
- "\n\t" " # middle of last round"
- "\n\t" " movdqa .Lk_sbo(%rcx), %xmm4 # 3 : sbou"
- "\n\t" " pshufb %xmm2, %xmm4 # 4 = sbou"
- "\n\t" " pxor (%rdx), %xmm4 # 4 = sb1u + k"
- "\n\t" " movdqa .Lk_sbo+16(%rcx), %xmm0 # 0 : sbot"
- "\n\t" " pshufb %xmm3, %xmm0 # 0 = sb1t"
- "\n\t" " pxor %xmm4, %xmm0 # 0 = A"
- "\n\t" " pshufb .Lk_sr(%rsi,%rcx), %xmm0"
- "\n\t" " ret"
-X("\n\t" ".size _aes_encrypt_core,.-_aes_encrypt_core")
-
- "\n\t" "##"
- "\n\t" "## Decryption core"
- "\n\t" "##"
- "\n\t" "## Same API as encryption core."
- "\n\t" "##"
- "\n\t" ".align 16"
-X("\n\t" ".type _aes_decrypt_core,@function")
- "\n\t" "_aes_decrypt_core:"
- "\n\t" " movl %eax, %esi"
- "\n\t" " shll $4, %esi"
- "\n\t" " xorl $48, %esi"
- "\n\t" " andl $48, %esi"
- "\n\t" " movdqa .Lk_dipt (%rcx), %xmm2 # iptlo"
- "\n\t" " movdqa %xmm9, %xmm1"
- "\n\t" " pandn %xmm0, %xmm1"
- "\n\t" " psrld $4, %xmm1"
- "\n\t" " pand %xmm9, %xmm0"
- "\n\t" " pshufb %xmm0, %xmm2"
- "\n\t" " movdqa .Lk_dipt+16(%rcx), %xmm0 # ipthi"
- "\n\t" " pshufb %xmm1, %xmm0"
- "\n\t" " pxor (%rdx), %xmm2"
- "\n\t" " pxor %xmm2, %xmm0"
- "\n\t" " movdqa .Lk_mc_forward+48(%rcx), %xmm5"
- "\n\t" " lea 16(%rdx), %rdx"
- "\n\t" " neg %rax"
- "\n\t" " jmp .Laes_dec_entry"
-
- "\n\t" ".align 16"
- "\n\t" ".Laes_dec_loop:"
- "\n\t" "##"
- "\n\t" "## Inverse mix columns"
- "\n\t" "##"
- "\n\t" " movdqa %xmm13, %xmm4 # 4 : sb9u"
- "\n\t" " pshufb %xmm2, %xmm4 # 4 = sb9u"
- "\n\t" " pxor (%rdx), %xmm4"
- "\n\t" " movdqa %xmm12, %xmm0 # 0 : sb9t"
- "\n\t" " pshufb %xmm3, %xmm0 # 0 = sb9t"
- "\n\t" " movdqa .Lk_dsbd+16(%rcx),%xmm1 # 1 : sbdt"
- "\n\t" " pxor %xmm4, %xmm0 # 0 = ch"
- "\n\t" " lea 16(%rdx), %rdx # next round key"
-
- "\n\t" " pshufb %xmm5, %xmm0 # MC ch"
- "\n\t" " movdqa %xmm15, %xmm4 # 4 : sbdu"
- "\n\t" " pshufb %xmm2, %xmm4 # 4 = sbdu"
- "\n\t" " pxor %xmm0, %xmm4 # 4 = ch"
- "\n\t" " pshufb %xmm3, %xmm1 # 1 = sbdt"
- "\n\t" " pxor %xmm4, %xmm1 # 1 = ch"
-
- "\n\t" " pshufb %xmm5, %xmm1 # MC ch"
- "\n\t" " movdqa %xmm14, %xmm4 # 4 : sbbu"
- "\n\t" " pshufb %xmm2, %xmm4 # 4 = sbbu"
- "\n\t" " inc %rax # nr--"
- "\n\t" " pxor %xmm1, %xmm4 # 4 = ch"
- "\n\t" " movdqa .Lk_dsbb+16(%rcx),%xmm0 # 0 : sbbt"
- "\n\t" " pshufb %xmm3, %xmm0 # 0 = sbbt"
- "\n\t" " pxor %xmm4, %xmm0 # 0 = ch"
-
- "\n\t" " pshufb %xmm5, %xmm0 # MC ch"
- "\n\t" " movdqa %xmm8, %xmm4 # 4 : sbeu"
- "\n\t" " pshufb %xmm2, %xmm4 # 4 = sbeu"
- "\n\t" " pshufd $0x93, %xmm5, %xmm5"
- "\n\t" " pxor %xmm0, %xmm4 # 4 = ch"
- "\n\t" " movdqa .Lk_dsbe+16(%rcx),%xmm0 # 0 : sbet"
- "\n\t" " pshufb %xmm3, %xmm0 # 0 = sbet"
- "\n\t" " pxor %xmm4, %xmm0 # 0 = ch"
-
- "\n\t" ".Laes_dec_entry:"
- "\n\t" " # top of round"
- "\n\t" " movdqa %xmm9, %xmm1 # 1 : i"
- "\n\t" " pandn %xmm0, %xmm1 # 1 = i<<4"
- "\n\t" " psrld $4, %xmm1 # 1 = i"
- "\n\t" " pand %xmm9, %xmm0 # 0 = k"
- "\n\t" " movdqa %xmm11, %xmm2 # 2 : a/k"
- "\n\t" " pshufb %xmm0, %xmm2 # 2 = a/k"
- "\n\t" " pxor %xmm1, %xmm0 # 0 = j"
- "\n\t" " movdqa %xmm10, %xmm3 # 3 : 1/i"
- "\n\t" " pshufb %xmm1, %xmm3 # 3 = 1/i"
- "\n\t" " pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k"
- "\n\t" " movdqa %xmm10, %xmm4 # 4 : 1/j"
- "\n\t" " pshufb %xmm0, %xmm4 # 4 = 1/j"
- "\n\t" " pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k"
- "\n\t" " movdqa %xmm10, %xmm2 # 2 : 1/iak"
- "\n\t" " pshufb %xmm3, %xmm2 # 2 = 1/iak"
- "\n\t" " pxor %xmm0, %xmm2 # 2 = io"
- "\n\t" " movdqa %xmm10, %xmm3 # 3 : 1/jak"
- "\n\t" " pshufb %xmm4, %xmm3 # 3 = 1/jak"
- "\n\t" " pxor %xmm1, %xmm3 # 3 = jo"
- "\n\t" " jnz .Laes_dec_loop"
-
- "\n\t" " # middle of last round"
- "\n\t" " movdqa .Lk_dsbo(%rcx), %xmm4 # 3 : sbou"
- "\n\t" " pshufb %xmm2, %xmm4 # 4 = sbou"
- "\n\t" " pxor (%rdx), %xmm4 # 4 = sb1u + k"
- "\n\t" " movdqa .Lk_dsbo+16(%rcx), %xmm0 # 0 : sbot"
- "\n\t" " pshufb %xmm3, %xmm0 # 0 = sb1t"
- "\n\t" " pxor %xmm4, %xmm0 # 0 = A"
- "\n\t" " pshufb .Lk_sr(%rsi,%rcx), %xmm0"
- "\n\t" " ret"
-X("\n\t" ".size _aes_decrypt_core,.-_aes_decrypt_core")
-
- "\n\t" "########################################################"
- "\n\t" "## ##"
- "\n\t" "## AES key schedule ##"
- "\n\t" "## ##"
- "\n\t" "########################################################"
-
- "\n\t" ".align 16"
-X("\n\t" ".type _aes_schedule_core,@function")
- "\n\t" "_aes_schedule_core:"
- "\n\t" " # rdi = key"
- "\n\t" " # rsi = size in bits"
- "\n\t" " # rdx = buffer"
- "\n\t" " # rcx = direction. 0=encrypt, 1=decrypt"
-
- "\n\t" " # load the tables"
- "\n\t" " lea .Laes_consts(%rip), %r10"
- "\n\t" " movdqa (%r10), %xmm9 # 0F"
- "\n\t" " movdqa .Lk_inv (%r10), %xmm10 # inv"
- "\n\t" " movdqa .Lk_inv+16(%r10), %xmm11 # inva"
- "\n\t" " movdqa .Lk_sb1 (%r10), %xmm13 # sb1u"
- "\n\t" " movdqa .Lk_sb1+16(%r10), %xmm12 # sb1t"
- "\n\t" " movdqa .Lk_sb2 (%r10), %xmm15 # sb2u"
- "\n\t" " movdqa .Lk_sb2+16(%r10), %xmm14 # sb2t"
-
- "\n\t" " movdqa .Lk_rcon(%r10), %xmm8 # load rcon"
- "\n\t" " movdqu (%rdi), %xmm0 # load key (unaligned)"
-
- "\n\t" " # input transform"
- "\n\t" " movdqu %xmm0, %xmm3"
- "\n\t" " lea .Lk_ipt(%r10), %r11"
- "\n\t" " call .Laes_schedule_transform"
- "\n\t" " movdqu %xmm0, %xmm7"
-
- "\n\t" " test %rcx, %rcx"
- "\n\t" " jnz .Laes_schedule_am_decrypting"
-
- "\n\t" " # encrypting, output zeroth round key after transform"
- "\n\t" " movdqa %xmm0, (%rdx)"
- "\n\t" " jmp .Laes_schedule_go"
-
- "\n\t" ".Laes_schedule_am_decrypting:"
- "\n\t" " # decrypting, output zeroth round key after shiftrows"
- "\n\t" " pshufb .Lk_sr(%r8,%r10),%xmm3"
- "\n\t" " movdqa %xmm3, (%rdx)"
- "\n\t" " xor $48, %r8"
-
- "\n\t" ".Laes_schedule_go:"
- "\n\t" " cmp $192, %rsi"
- "\n\t" " je .Laes_schedule_192"
- "\n\t" " cmp $256, %rsi"
- "\n\t" " je .Laes_schedule_256"
- "\n\t" " # 128: fall though"
-
- "\n\t" "##"
- "\n\t" "## .Laes_schedule_128"
- "\n\t" "##"
- "\n\t" "## 128-bit specific part of key schedule."
- "\n\t" "##"
- "\n\t" "## This schedule is really simple, because all its parts"
- "\n\t" "## are accomplished by the subroutines."
- "\n\t" "##"
- "\n\t" ".Laes_schedule_128:"
- "\n\t" " mov $10, %rsi"
-
- "\n\t" ".Laes_schedule_128_L:"
- "\n\t" " call .Laes_schedule_round"
- "\n\t" " dec %rsi"
- "\n\t" " jz .Laes_schedule_mangle_last"
- "\n\t" " call .Laes_schedule_mangle # write output"
- "\n\t" " jmp .Laes_schedule_128_L"
-
- "\n\t" "##"
- "\n\t" "## .Laes_schedule_192"
- "\n\t" "##"
- "\n\t" "## 192-bit specific part of key schedule."
- "\n\t" "##"
- "\n\t" "## The main body of this schedule is the same as the 128-bit"
- "\n\t" "## schedule, but with more smearing. The long, high side is"
- "\n\t" "## stored in %xmm7 as before, and the short, low side is in"
- "\n\t" "## the high bits of %xmm6."
- "\n\t" "##"
- "\n\t" "## This schedule is somewhat nastier, however, because each"
- "\n\t" "## round produces 192 bits of key material, or 1.5 round keys."
- "\n\t" "## Therefore, on each cycle we do 2 rounds and produce 3 round"
- "\n\t" "## keys."
- "\n\t" "##"
- "\n\t" ".Laes_schedule_192:"
- "\n\t" " movdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)"
- "\n\t" " call .Laes_schedule_transform # input transform"
- "\n\t" " pshufd $0x0E, %xmm0, %xmm6"
- "\n\t" " pslldq $8, %xmm6 # clobber low side with zeros"
- "\n\t" " mov $4, %rsi"
-
- "\n\t" ".Laes_schedule_192_L:"
- "\n\t" " call .Laes_schedule_round"
- "\n\t" " palignr $8,%xmm6,%xmm0 "
- "\n\t" " call .Laes_schedule_mangle # save key n"
- "\n\t" " call .Laes_schedule_192_smear"
- "\n\t" " call .Laes_schedule_mangle # save key n+1"
- "\n\t" " call .Laes_schedule_round"
- "\n\t" " dec %rsi"
- "\n\t" " jz .Laes_schedule_mangle_last"
- "\n\t" " call .Laes_schedule_mangle # save key n+2"
- "\n\t" " call .Laes_schedule_192_smear"
- "\n\t" " jmp .Laes_schedule_192_L"
-
- "\n\t" "##"
- "\n\t" "## .Laes_schedule_192_smear"
- "\n\t" "##"
- "\n\t" "## Smear the short, low side in the 192-bit key schedule."
- "\n\t" "##"
- "\n\t" "## Inputs:"
- "\n\t" "## %xmm7: high side, b a x y"
- "\n\t" "## %xmm6: low side, d c 0 0"
- "\n\t" "## %xmm13: 0"
- "\n\t" "##"
- "\n\t" "## Outputs:"
- "\n\t" "## %xmm6: b+c+d b+c 0 0"
- "\n\t" "## %xmm0: b+c+d b+c b a"
- "\n\t" "##"
- "\n\t" ".Laes_schedule_192_smear:"
- "\n\t" " pshufd $0x80, %xmm6, %xmm0 # d c 0 0 -> c 0 0 0"
- "\n\t" " pxor %xmm0, %xmm6 # -> c+d c 0 0"
- "\n\t" " pshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a"
- "\n\t" " pxor %xmm6, %xmm0 # -> b+c+d b+c b a"
- "\n\t" " pshufd $0x0E, %xmm0, %xmm6"
- "\n\t" " pslldq $8, %xmm6 # clobber low side with zeros"
- "\n\t" " ret"
-
- "\n\t" "##"
- "\n\t" "## .Laes_schedule_256"
- "\n\t" "##"
- "\n\t" "## 256-bit specific part of key schedule."
- "\n\t" "##"
- "\n\t" "## The structure here is very similar to the 128-bit"
- "\n\t" "## schedule, but with an additional 'low side' in"
- "\n\t" "## %xmm6. The low side's rounds are the same as the"
- "\n\t" "## high side's, except no rcon and no rotation."
- "\n\t" "##"
- "\n\t" ".Laes_schedule_256:"
- "\n\t" " movdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)"
- "\n\t" " call .Laes_schedule_transform # input transform"
- "\n\t" " mov $7, %rsi"
-
- "\n\t" ".Laes_schedule_256_L:"
- "\n\t" " call .Laes_schedule_mangle # output low result"
- "\n\t" " movdqa %xmm0, %xmm6 # save cur_lo in xmm6"
-
- "\n\t" " # high round"
- "\n\t" " call .Laes_schedule_round"
- "\n\t" " dec %rsi"
- "\n\t" " jz .Laes_schedule_mangle_last"
- "\n\t" " call .Laes_schedule_mangle "
-
- "\n\t" " # low round. swap xmm7 and xmm6"
- "\n\t" " pshufd $0xFF, %xmm0, %xmm0"
- "\n\t" " movdqa %xmm7, %xmm5"
- "\n\t" " movdqa %xmm6, %xmm7"
- "\n\t" " call .Laes_schedule_low_round"
- "\n\t" " movdqa %xmm5, %xmm7"
-
- "\n\t" " jmp .Laes_schedule_256_L"
-
- "\n\t" "##"
- "\n\t" "## .Laes_schedule_round"
- "\n\t" "##"
- "\n\t" "## Runs one main round of the key schedule on %xmm0, %xmm7"
- "\n\t" "##"
- "\n\t" "## Specifically, runs subbytes on the high dword of %xmm0"
- "\n\t" "## then rotates it by one byte and xors into the low dword of"
- "\n\t" "## %xmm7."
- "\n\t" "##"
- "\n\t" "## Adds rcon from low byte of %xmm8, then rotates %xmm8 for"
- "\n\t" "## next rcon."
- "\n\t" "##"
- "\n\t" "## Smears the dwords of %xmm7 by xoring the low into the"
- "\n\t" "## second low, result into third, result into highest."
- "\n\t" "##"
- "\n\t" "## Returns results in %xmm7 = %xmm0."
- "\n\t" "## Clobbers %xmm1-%xmm4, %r11."
- "\n\t" "##"
- "\n\t" ".Laes_schedule_round:"
- "\n\t" " # extract rcon from xmm8"
- "\n\t" " pxor %xmm1, %xmm1"
- "\n\t" " palignr $15, %xmm8, %xmm1"
- "\n\t" " palignr $15, %xmm8, %xmm8"
- "\n\t" " pxor %xmm1, %xmm7"
-
- "\n\t" " # rotate"
- "\n\t" " pshufd $0xFF, %xmm0, %xmm0"
- "\n\t" " palignr $1, %xmm0, %xmm0"
-
- "\n\t" " # fall through..."
-
- "\n\t" " # low round: same as high round, but no rotation and no rcon."
- "\n\t" ".Laes_schedule_low_round:"
- "\n\t" " # smear xmm7"
- "\n\t" " movdqa %xmm7, %xmm1"
- "\n\t" " pslldq $4, %xmm7"
- "\n\t" " pxor %xmm1, %xmm7"
- "\n\t" " movdqa %xmm7, %xmm1"
- "\n\t" " pslldq $8, %xmm7"
- "\n\t" " pxor %xmm1, %xmm7"
- "\n\t" " pxor .Lk_s63(%r10), %xmm7"
-
- "\n\t" " # subbytes"
- "\n\t" " movdqa %xmm9, %xmm1"
- "\n\t" " pandn %xmm0, %xmm1"
- "\n\t" " psrld $4, %xmm1 # 1 = i"
- "\n\t" " pand %xmm9, %xmm0 # 0 = k"
- "\n\t" " movdqa %xmm11, %xmm2 # 2 : a/k"
- "\n\t" " pshufb %xmm0, %xmm2 # 2 = a/k"
- "\n\t" " pxor %xmm1, %xmm0 # 0 = j"
- "\n\t" " movdqa %xmm10, %xmm3 # 3 : 1/i"
- "\n\t" " pshufb %xmm1, %xmm3 # 3 = 1/i"
- "\n\t" " pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k"
- "\n\t" " movdqa %xmm10, %xmm4 # 4 : 1/j"
- "\n\t" " pshufb %xmm0, %xmm4 # 4 = 1/j"
- "\n\t" " pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k"
- "\n\t" " movdqa %xmm10, %xmm2 # 2 : 1/iak"
- "\n\t" " pshufb %xmm3, %xmm2 # 2 = 1/iak"
- "\n\t" " pxor %xmm0, %xmm2 # 2 = io"
- "\n\t" " movdqa %xmm10, %xmm3 # 3 : 1/jak"
- "\n\t" " pshufb %xmm4, %xmm3 # 3 = 1/jak"
- "\n\t" " pxor %xmm1, %xmm3 # 3 = jo"
- "\n\t" " movdqa .Lk_sb1(%r10), %xmm4 # 4 : sbou"
- "\n\t" " pshufb %xmm2, %xmm4 # 4 = sbou"
- "\n\t" " movdqa .Lk_sb1+16(%r10), %xmm0 # 0 : sbot"
- "\n\t" " pshufb %xmm3, %xmm0 # 0 = sb1t"
- "\n\t" " pxor %xmm4, %xmm0 # 0 = sbox output"
-
- "\n\t" " # add in smeared stuff"
- "\n\t" " pxor %xmm7, %xmm0 "
- "\n\t" " movdqa %xmm0, %xmm7"
- "\n\t" " ret"
-
- "\n\t" "##"
- "\n\t" "## .Laes_schedule_transform"
- "\n\t" "##"
- "\n\t" "## Linear-transform %xmm0 according to tables at (%r11)"
- "\n\t" "##"
- "\n\t" "## Requires that %xmm9 = 0x0F0F... as in preheat"
- "\n\t" "## Output in %xmm0"
- "\n\t" "## Clobbers %xmm1, %xmm2"
- "\n\t" "##"
- "\n\t" ".Laes_schedule_transform:"
- "\n\t" " movdqa %xmm9, %xmm1"
- "\n\t" " pandn %xmm0, %xmm1"
- "\n\t" " psrld $4, %xmm1"
- "\n\t" " pand %xmm9, %xmm0"
- "\n\t" " movdqa (%r11), %xmm2 # lo"
- "\n\t" " pshufb %xmm0, %xmm2"
- "\n\t" " movdqa 16(%r11), %xmm0 # hi"
- "\n\t" " pshufb %xmm1, %xmm0"
- "\n\t" " pxor %xmm2, %xmm0"
- "\n\t" " ret"
-
- "\n\t" "##"
- "\n\t" "## .Laes_schedule_mangle"
- "\n\t" "##"
- "\n\t" "## Mangle xmm0 from (basis-transformed) standard version"
- "\n\t" "## to our version."
- "\n\t" "##"
- "\n\t" "## On encrypt,"
- "\n\t" "## xor with 0x63"
- "\n\t" "## multiply by circulant 0,1,1,1"
- "\n\t" "## apply shiftrows transform"
- "\n\t" "##"
- "\n\t" "## On decrypt,"
- "\n\t" "## xor with 0x63"
- "\n\t" "## multiply by 'inverse mixcolumns' circulant E,B,D,9"
- "\n\t" "## deskew"
- "\n\t" "## apply shiftrows transform"
- "\n\t" "##"
- "\n\t" "##"
- "\n\t" "## Writes out to (%rdx), and increments or decrements it"
- "\n\t" "## Keeps track of round number mod 4 in %r8"
- "\n\t" "## Preserves xmm0"
- "\n\t" "## Clobbers xmm1-xmm5"
- "\n\t" "##"
- "\n\t" ".Laes_schedule_mangle:"
- "\n\t" " movdqa %xmm0, %xmm4 # save xmm0 for later"
- "\n\t" " movdqa .Lk_mc_forward(%r10),%xmm5"
- "\n\t" " test %rcx, %rcx"
- "\n\t" " jnz .Laes_schedule_mangle_dec"
-
- "\n\t" " # encrypting"
- "\n\t" " add $16, %rdx"
- "\n\t" " pxor .Lk_s63(%r10),%xmm4"
- "\n\t" " pshufb %xmm5, %xmm4"
- "\n\t" " movdqa %xmm4, %xmm3"
- "\n\t" " pshufb %xmm5, %xmm4"
- "\n\t" " pxor %xmm4, %xmm3"
- "\n\t" " pshufb %xmm5, %xmm4"
- "\n\t" " pxor %xmm4, %xmm3"
-
- "\n\t" " jmp .Laes_schedule_mangle_both"
-
- "\n\t" ".Laes_schedule_mangle_dec:"
- "\n\t" " lea .Lk_dks_1(%r10), %r11 # first table: *9"
- "\n\t" " call .Laes_schedule_transform"
- "\n\t" " movdqa %xmm0, %xmm3"
- "\n\t" " pshufb %xmm5, %xmm3"
-
- "\n\t" " add $32, %r11 # next table: *B"
- "\n\t" " call .Laes_schedule_transform"
- "\n\t" " pxor %xmm0, %xmm3"
- "\n\t" " pshufb %xmm5, %xmm3"
-
- "\n\t" " add $32, %r11 # next table: *D"
- "\n\t" " call .Laes_schedule_transform"
- "\n\t" " pxor %xmm0, %xmm3"
- "\n\t" " pshufb %xmm5, %xmm3"
-
- "\n\t" " add $32, %r11 # next table: *E"
- "\n\t" " call .Laes_schedule_transform"
- "\n\t" " pxor %xmm0, %xmm3"
- "\n\t" " pshufb %xmm5, %xmm3"
-
- "\n\t" " movdqa %xmm4, %xmm0 # restore %xmm0"
- "\n\t" " add $-16, %rdx"
-
- "\n\t" ".Laes_schedule_mangle_both:"
- "\n\t" " pshufb .Lk_sr(%r8,%r10),%xmm3"
- "\n\t" " add $-16, %r8"
- "\n\t" " and $48, %r8"
- "\n\t" " movdqa %xmm3, (%rdx)"
- "\n\t" " ret"
-
- "\n\t" "##"
- "\n\t" "## .Laes_schedule_mangle_last"
- "\n\t" "##"
- "\n\t" "## Mangler for last round of key schedule"
- "\n\t" "## Mangles %xmm0"
- "\n\t" "## when encrypting, outputs out(%xmm0) ^ 63"
- "\n\t" "## when decrypting, outputs unskew(%xmm0)"
- "\n\t" "##"
- "\n\t" "## Always called right before return... jumps to cleanup and exits"
- "\n\t" "##"
- "\n\t" ".Laes_schedule_mangle_last:"
- "\n\t" " # schedule last round key from xmm0"
- "\n\t" " lea .Lk_deskew(%r10),%r11 # prepare to deskew"
- "\n\t" " test %rcx, %rcx"
- "\n\t" " jnz .Laes_schedule_mangle_last_dec"
-
- "\n\t" " # encrypting"
- "\n\t" " pshufb .Lk_sr(%r8,%r10),%xmm0 # output permute"
- "\n\t" " lea .Lk_opt(%r10), %r11 # prepare to output transform"
- "\n\t" " add $32, %rdx"
-
- "\n\t" ".Laes_schedule_mangle_last_dec:"
- "\n\t" " add $-16, %rdx"
- "\n\t" " pxor .Lk_s63(%r10), %xmm0"
- "\n\t" " call .Laes_schedule_transform # output transform"
- "\n\t" " movdqa %xmm0, (%rdx) # save last key"
-
- "\n\t" " #_aes_cleanup"
- "\n\t" " pxor %xmm0, %xmm0"
- "\n\t" " pxor %xmm1, %xmm1"
- "\n\t" " pxor %xmm2, %xmm2"
- "\n\t" " pxor %xmm3, %xmm3"
- "\n\t" " pxor %xmm4, %xmm4"
- "\n\t" " pxor %xmm5, %xmm5"
- "\n\t" " pxor %xmm6, %xmm6"
- "\n\t" " pxor %xmm7, %xmm7"
- "\n\t" " pxor %xmm8, %xmm8"
- "\n\t" " ret"
-X("\n\t" ".size _aes_schedule_core,.-_aes_schedule_core")
-
- "\n\t" "########################################################"
- "\n\t" "## ##"
- "\n\t" "## Constants ##"
- "\n\t" "## ##"
- "\n\t" "########################################################"
-
- "\n\t" ".align 16"
-X("\n\t" ".type _aes_consts, at object")
- "\n\t" ".Laes_consts:"
- "\n\t" "_aes_consts:"
- "\n\t" " # s0F"
- "\n\t" " .Lk_s0F = .-.Laes_consts"
- "\n\t" " .quad 0x0F0F0F0F0F0F0F0F"
- "\n\t" " .quad 0x0F0F0F0F0F0F0F0F"
-
- "\n\t" " # input transform (lo, hi)"
- "\n\t" " .Lk_ipt = .-.Laes_consts"
- "\n\t" " .quad 0xC2B2E8985A2A7000"
- "\n\t" " .quad 0xCABAE09052227808"
- "\n\t" " .quad 0x4C01307D317C4D00"
- "\n\t" " .quad 0xCD80B1FCB0FDCC81"
-
- "\n\t" " # inv, inva"
- "\n\t" " .Lk_inv = .-.Laes_consts"
- "\n\t" " .quad 0x0E05060F0D080180"
- "\n\t" " .quad 0x040703090A0B0C02"
- "\n\t" " .quad 0x01040A060F0B0780"
- "\n\t" " .quad 0x030D0E0C02050809"
-
- "\n\t" " # sb1u, sb1t"
- "\n\t" " .Lk_sb1 = .-.Laes_consts"
- "\n\t" " .quad 0xB19BE18FCB503E00"
- "\n\t" " .quad 0xA5DF7A6E142AF544"
- "\n\t" " .quad 0x3618D415FAE22300"
- "\n\t" " .quad 0x3BF7CCC10D2ED9EF"
-
-
- "\n\t" " # sb2u, sb2t"
- "\n\t" " .Lk_sb2 = .-.Laes_consts"
- "\n\t" " .quad 0xE27A93C60B712400"
- "\n\t" " .quad 0x5EB7E955BC982FCD"
- "\n\t" " .quad 0x69EB88400AE12900"
- "\n\t" " .quad 0xC2A163C8AB82234A"
-
- "\n\t" " # sbou, sbot"
- "\n\t" " .Lk_sbo = .-.Laes_consts"
- "\n\t" " .quad 0xD0D26D176FBDC700"
- "\n\t" " .quad 0x15AABF7AC502A878"
- "\n\t" " .quad 0xCFE474A55FBB6A00"
- "\n\t" " .quad 0x8E1E90D1412B35FA"
-
- "\n\t" " # mc_forward"
- "\n\t" " .Lk_mc_forward = .-.Laes_consts"
- "\n\t" " .quad 0x0407060500030201"
- "\n\t" " .quad 0x0C0F0E0D080B0A09"
- "\n\t" " .quad 0x080B0A0904070605"
- "\n\t" " .quad 0x000302010C0F0E0D"
- "\n\t" " .quad 0x0C0F0E0D080B0A09"
- "\n\t" " .quad 0x0407060500030201"
- "\n\t" " .quad 0x000302010C0F0E0D"
- "\n\t" " .quad 0x080B0A0904070605"
-
- "\n\t" " # mc_backward"
- "\n\t" " .Lk_mc_backward = .-.Laes_consts"
- "\n\t" " .quad 0x0605040702010003"
- "\n\t" " .quad 0x0E0D0C0F0A09080B"
- "\n\t" " .quad 0x020100030E0D0C0F"
- "\n\t" " .quad 0x0A09080B06050407"
- "\n\t" " .quad 0x0E0D0C0F0A09080B"
- "\n\t" " .quad 0x0605040702010003"
- "\n\t" " .quad 0x0A09080B06050407"
- "\n\t" " .quad 0x020100030E0D0C0F"
-
- "\n\t" " # sr"
- "\n\t" " .Lk_sr = .-.Laes_consts"
- "\n\t" " .quad 0x0706050403020100"
- "\n\t" " .quad 0x0F0E0D0C0B0A0908"
- "\n\t" " .quad 0x030E09040F0A0500"
- "\n\t" " .quad 0x0B06010C07020D08"
- "\n\t" " .quad 0x0F060D040B020900"
- "\n\t" " .quad 0x070E050C030A0108"
- "\n\t" " .quad 0x0B0E0104070A0D00"
- "\n\t" " .quad 0x0306090C0F020508"
-
- "\n\t" " # rcon"
- "\n\t" " .Lk_rcon = .-.Laes_consts"
- "\n\t" " .quad 0x1F8391B9AF9DEEB6"
- "\n\t" " .quad 0x702A98084D7C7D81"
-
- "\n\t" " # s63: all equal to 0x63 transformed"
- "\n\t" " .Lk_s63 = .-.Laes_consts"
- "\n\t" " .quad 0x5B5B5B5B5B5B5B5B"
- "\n\t" " .quad 0x5B5B5B5B5B5B5B5B"
-
- "\n\t" " # output transform"
- "\n\t" " .Lk_opt = .-.Laes_consts"
- "\n\t" " .quad 0xFF9F4929D6B66000"
- "\n\t" " .quad 0xF7974121DEBE6808"
- "\n\t" " .quad 0x01EDBD5150BCEC00"
- "\n\t" " .quad 0xE10D5DB1B05C0CE0"
-
- "\n\t" " # deskew tables: inverts the sbox's 'skew'"
- "\n\t" " .Lk_deskew = .-.Laes_consts"
- "\n\t" " .quad 0x07E4A34047A4E300"
- "\n\t" " .quad 0x1DFEB95A5DBEF91A"
- "\n\t" " .quad 0x5F36B5DC83EA6900"
- "\n\t" " .quad 0x2841C2ABF49D1E77"
-
- "\n\t" "##"
- "\n\t" "## Decryption stuff"
- "\n\t" "## Key schedule constants"
- "\n\t" "##"
- "\n\t" " # decryption key schedule: x -> invskew x*9"
- "\n\t" " .Lk_dks_1 = .-.Laes_consts"
- "\n\t" " .quad 0xB6116FC87ED9A700"
- "\n\t" " .quad 0x4AED933482255BFC"
- "\n\t" " .quad 0x4576516227143300"
- "\n\t" " .quad 0x8BB89FACE9DAFDCE"
-
- "\n\t" " # decryption key schedule: invskew x*9 -> invskew x*D"
- "\n\t" " .Lk_dks_2 = .-.Laes_consts"
- "\n\t" " .quad 0x27438FEBCCA86400"
- "\n\t" " .quad 0x4622EE8AADC90561"
- "\n\t" " .quad 0x815C13CE4F92DD00"
- "\n\t" " .quad 0x73AEE13CBD602FF2"
-
- "\n\t" " # decryption key schedule: invskew x*D -> invskew x*B"
- "\n\t" " .Lk_dks_3 = .-.Laes_consts"
- "\n\t" " .quad 0x03C4C50201C6C700"
- "\n\t" " .quad 0xF83F3EF9FA3D3CFB"
- "\n\t" " .quad 0xEE1921D638CFF700"
- "\n\t" " .quad 0xA5526A9D7384BC4B"
-
- "\n\t" " # decryption key schedule: invskew x*B -> invskew x*E + 0x63"
- "\n\t" " .Lk_dks_4 = .-.Laes_consts"
- "\n\t" " .quad 0xE3C390B053732000"
- "\n\t" " .quad 0xA080D3F310306343"
- "\n\t" " .quad 0xA0CA214B036982E8"
- "\n\t" " .quad 0x2F45AEC48CE60D67"
-
- "\n\t" "##"
- "\n\t" "## Decryption stuff"
- "\n\t" "## Round function constants"
- "\n\t" "##"
- "\n\t" " # decryption input transform"
- "\n\t" " .Lk_dipt = .-.Laes_consts"
- "\n\t" " .quad 0x0F505B040B545F00"
- "\n\t" " .quad 0x154A411E114E451A"
- "\n\t" " .quad 0x86E383E660056500"
- "\n\t" " .quad 0x12771772F491F194"
-
- "\n\t" " # decryption sbox output *9*u, *9*t"
- "\n\t" " .Lk_dsb9 = .-.Laes_consts"
- "\n\t" " .quad 0x851C03539A86D600"
- "\n\t" " .quad 0xCAD51F504F994CC9"
- "\n\t" " .quad 0xC03B1789ECD74900"
- "\n\t" " .quad 0x725E2C9EB2FBA565"
-
- "\n\t" " # decryption sbox output *D*u, *D*t"
- "\n\t" " .Lk_dsbd = .-.Laes_consts"
- "\n\t" " .quad 0x7D57CCDFE6B1A200"
- "\n\t" " .quad 0xF56E9B13882A4439"
- "\n\t" " .quad 0x3CE2FAF724C6CB00"
- "\n\t" " .quad 0x2931180D15DEEFD3"
-
- "\n\t" " # decryption sbox output *B*u, *B*t"
- "\n\t" " .Lk_dsbb = .-.Laes_consts"
- "\n\t" " .quad 0xD022649296B44200"
- "\n\t" " .quad 0x602646F6B0F2D404"
- "\n\t" " .quad 0xC19498A6CD596700"
- "\n\t" " .quad 0xF3FF0C3E3255AA6B"
-
- "\n\t" " # decryption sbox output *E*u, *E*t"
- "\n\t" " .Lk_dsbe = .-.Laes_consts"
- "\n\t" " .quad 0x46F2929626D4D000"
- "\n\t" " .quad 0x2242600464B4F6B0"
- "\n\t" " .quad 0x0C55A6CDFFAAC100"
- "\n\t" " .quad 0x9467F36B98593E32"
-
- "\n\t" " # decryption sbox final output"
- "\n\t" " .Lk_dsbo = .-.Laes_consts"
- "\n\t" " .quad 0x1387EA537EF94000"
- "\n\t" " .quad 0xC7AA6DB9D4943E2D"
- "\n\t" " .quad 0x12D7560F93441D00"
- "\n\t" " .quad 0xCA4B8159D8C58E9C"
-X("\n\t" ".size _aes_consts,.-_aes_consts")
-);
-
#endif /* USE_SSSE3 */
diff --git a/configure.ac b/configure.ac
index 4932786..31c0d55 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2031,6 +2031,7 @@ if test "$found" = "1" ; then
# Build with the SSSE3 implementation
GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-ssse3-amd64.lo"
+ GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-ssse3-amd64-asm.lo"
;;
arm*-*-*)
# Build with the assembly implementation