[PATCH 2/2] rijndael-ssse3: call assembly functions directly
Jussi Kivilinna
jussi.kivilinna at iki.fi
Tue Jan 9 18:25:17 CET 2018
* cipher/rijndael-ssse3-amd64-asm.S (_gcry_aes_ssse3_enc_preload)
(_gcry_aes_ssse3_dec_preload, _gcry_aes_ssse3_encrypt_core)
(_gcry_aes_ssse3_decrypt_core, _gcry_aes_schedule_core): Add
ENTER_SYSV_FUNC_PARAMS_* at function entry and EXIT_SYSV_FUNC at exit.
(_gcry_aes_ssse3_encrypt_core, _gcry_aes_ssse3_decrypt_core): Change
input parameters to RDI and RSI registers.
* cipher/rijndael-ssse3-amd64.c (_gcry_aes_ssse3_encrypt_core)
(_gcry_aes_ssse3_decrypt_core, _gcry_aes_schedule_core): Add parameters
for function prototypes.
(PUSH_STACK_PTR, POP_STACK_PTR): Remove.
(vpaes_ssse3_prepare_enc, vpaes_ssse3_prepare_dec)
(_gcry_aes_ssse3_do_setkey, _gcry_aes_ssse3_prepare_decryption)
(do_vpaes_ssse3_enc, do_vpaes_ssse3_dec): Remove inline assembly to
call functions, and call directly instead.
--
Instead of using inline assembly to call the assembly functions in the
AES SSSE3 implementation, change the assembly functions so that they
can be called directly.
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
cipher/rijndael-ssse3-amd64-asm.S | 31 +++++++++----
cipher/rijndael-ssse3-amd64.c | 91 ++++++-------------------------------
2 files changed, 35 insertions(+), 87 deletions(-)
diff --git a/cipher/rijndael-ssse3-amd64-asm.S b/cipher/rijndael-ssse3-amd64-asm.S
index 3ae55e8b6..ffce5df2f 100644
--- a/cipher/rijndael-ssse3-amd64-asm.S
+++ b/cipher/rijndael-ssse3-amd64-asm.S
@@ -40,11 +40,7 @@
(defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
-#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
-# define ELF(...)
-#else
-# define ELF(...) __VA_ARGS__
-#endif
+#include "asm-common-amd64.h"
.text
@@ -54,6 +50,7 @@
ELF(.type _gcry_aes_ssse3_enc_preload,@function)
.globl _gcry_aes_ssse3_enc_preload
_gcry_aes_ssse3_enc_preload:
+ ENTER_SYSV_FUNC_PARAMS_0_4
lea .Laes_consts(%rip), %rax
movdqa (%rax), %xmm9 # 0F
movdqa .Lk_inv (%rax), %xmm10 # inv
@@ -62,6 +59,7 @@ _gcry_aes_ssse3_enc_preload:
movdqa .Lk_sb1+16(%rax), %xmm12 # sb1t
movdqa .Lk_sb2 (%rax), %xmm15 # sb2u
movdqa .Lk_sb2+16(%rax), %xmm14 # sb2t
+ EXIT_SYSV_FUNC
ret
ELF(.size _gcry_aes_ssse3_enc_preload,.-_gcry_aes_ssse3_enc_preload)
@@ -71,6 +69,7 @@ ELF(.size _gcry_aes_ssse3_enc_preload,.-_gcry_aes_ssse3_enc_preload)
ELF(.type _gcry_aes_ssse3_dec_preload,@function)
.globl _gcry_aes_ssse3_dec_preload
_gcry_aes_ssse3_dec_preload:
+ ENTER_SYSV_FUNC_PARAMS_0_4
lea .Laes_consts(%rip), %rax
movdqa (%rax), %xmm9 # 0F
movdqa .Lk_inv (%rax), %xmm10 # inv
@@ -80,6 +79,7 @@ _gcry_aes_ssse3_dec_preload:
movdqa .Lk_dsbd (%rax), %xmm15 # sbdu
movdqa .Lk_dsbb (%rax), %xmm14 # sbbu
movdqa .Lk_dsbe (%rax), %xmm8 # sbeu
+ EXIT_SYSV_FUNC
ret
ELF(.size _gcry_aes_ssse3_dec_preload,.-_gcry_aes_ssse3_dec_preload)
@@ -98,11 +98,11 @@ ELF(.size _gcry_aes_ssse3_dec_preload,.-_gcry_aes_ssse3_dec_preload)
## Inputs:
## %xmm0 = input
## %xmm9-%xmm15 as in .Laes_preheat
-## (%rdx) = scheduled keys
-## %rax = nrounds - 1
+## (%rdi) = scheduled keys
+## %rsi = nrounds
##
## Output in %xmm0
-## Clobbers %xmm1-%xmm4, %r9, %r11, %rax, %rcx
+## Clobbers %xmm1-%xmm4, %r9, %r11, %rax, %rcx, %rdx
## Preserves %xmm6 - %xmm7 so you get some local vectors
##
##
@@ -111,6 +111,9 @@ ELF(.type _gcry_aes_ssse3_encrypt_core,@function)
.globl _gcry_aes_ssse3_encrypt_core
_gcry_aes_ssse3_encrypt_core:
_aes_encrypt_core:
+ ENTER_SYSV_FUNC_PARAMS_0_4
+ mov %rdi, %rdx
+ leaq -1(%rsi), %rax
lea .Laes_consts(%rip), %rcx
leaq .Lk_mc_backward(%rcx), %rdi
mov $16, %rsi
@@ -185,6 +188,7 @@ _aes_encrypt_core:
pshufb %xmm3, %xmm0 # 0 = sb1t
pxor %xmm4, %xmm0 # 0 = A
pshufb .Lk_sr(%rsi,%rcx), %xmm0
+ EXIT_SYSV_FUNC
ret
ELF(.size _aes_encrypt_core,.-_aes_encrypt_core)
@@ -198,8 +202,11 @@ ELF(.size _aes_encrypt_core,.-_aes_encrypt_core)
ELF(.type _gcry_aes_ssse3_decrypt_core,@function)
_gcry_aes_ssse3_decrypt_core:
_aes_decrypt_core:
+ ENTER_SYSV_FUNC_PARAMS_0_4
+ mov %rdi, %rdx
lea .Laes_consts(%rip), %rcx
- movl %eax, %esi
+ subl $1, %esi
+ movl %esi, %eax
shll $4, %esi
xorl $48, %esi
andl $48, %esi
@@ -288,6 +295,7 @@ _aes_decrypt_core:
pshufb %xmm3, %xmm0 # 0 = sb1t
pxor %xmm4, %xmm0 # 0 = A
pshufb .Lk_sr(%rsi,%rcx), %xmm0
+ EXIT_SYSV_FUNC
ret
ELF(.size _aes_decrypt_core,.-_aes_decrypt_core)
@@ -306,6 +314,8 @@ _aes_schedule_core:
# rsi = size in bits
# rdx = buffer
# rcx = direction. 0=encrypt, 1=decrypt
+ # r8 = rotoffs
+ ENTER_SYSV_FUNC_PARAMS_5
# load the tables
lea .Laes_consts(%rip), %r10
@@ -659,8 +669,9 @@ _aes_schedule_core:
pxor %xmm6, %xmm6
pxor %xmm7, %xmm7
pxor %xmm8, %xmm8
+ EXIT_SYSV_FUNC
ret
-ELF(.size _aes_schedule_core,.-_aes_schedule_core)
+ELF(.size _gcry_aes_ssse3_schedule_core,.-_gcry_aes_ssse3_schedule_core)
########################################################
## ##
diff --git a/cipher/rijndael-ssse3-amd64.c b/cipher/rijndael-ssse3-amd64.c
index da5339e36..98660ecc8 100644
--- a/cipher/rijndael-ssse3-amd64.c
+++ b/cipher/rijndael-ssse3-amd64.c
@@ -58,13 +58,14 @@
/* Assembly functions in rijndael-ssse3-amd64-asm.S. Note that these
- have custom calling convention and need to be called from assembly
- blocks, not directly. */
+ have custom calling convention (additional XMM parameters). */
extern void _gcry_aes_ssse3_enc_preload(void);
extern void _gcry_aes_ssse3_dec_preload(void);
-extern void _gcry_aes_ssse3_schedule_core(void);
-extern void _gcry_aes_ssse3_encrypt_core(void);
-extern void _gcry_aes_ssse3_decrypt_core(void);
+extern void _gcry_aes_ssse3_schedule_core(const void *key, u64 keybits,
+ void *buffer, u64 decrypt,
+ u64 rotoffs);
+extern void _gcry_aes_ssse3_encrypt_core(const void *key, u64 nrounds);
+extern void _gcry_aes_ssse3_decrypt_core(const void *key, u64 nrounds);
@@ -110,8 +111,6 @@ extern void _gcry_aes_ssse3_decrypt_core(void);
: \
: "r" (ssse3_state) \
: "memory" )
-# define PUSH_STACK_PTR
-# define POP_STACK_PTR
#else
# define SSSE3_STATE_SIZE 1
# define vpaes_ssse3_prepare() (void)ssse3_state
@@ -126,31 +125,15 @@ extern void _gcry_aes_ssse3_decrypt_core(void);
"pxor %%xmm7, %%xmm7 \n\t" \
"pxor %%xmm8, %%xmm8 \n\t" \
::: "memory" )
-/* Old GCC versions use red-zone of AMD64 SYSV ABI and stack pointer is
- * not properly adjusted for assembly block. Therefore stack pointer
- * needs to be manually corrected. */
-# define PUSH_STACK_PTR "subq $128, %%rsp;\n\t"
-# define POP_STACK_PTR "addq $128, %%rsp;\n\t"
#endif
#define vpaes_ssse3_prepare_enc() \
vpaes_ssse3_prepare(); \
- asm volatile (PUSH_STACK_PTR \
- "callq *%q[core] \n\t" \
- POP_STACK_PTR \
- : \
- : [core] "r" (_gcry_aes_ssse3_enc_preload) \
- : "rax", "cc", "memory" )
+ _gcry_aes_ssse3_enc_preload();
#define vpaes_ssse3_prepare_dec() \
vpaes_ssse3_prepare(); \
- asm volatile (PUSH_STACK_PTR \
- "callq *%q[core] \n\t" \
- POP_STACK_PTR \
- : \
- : [core] "r" (_gcry_aes_ssse3_dec_preload) \
- : "rax", "cc", "memory" )
-
+ _gcry_aes_ssse3_dec_preload();
void
@@ -161,23 +144,7 @@ _gcry_aes_ssse3_do_setkey (RIJNDAEL_context *ctx, const byte *key)
vpaes_ssse3_prepare();
- asm volatile ("leaq %q[key], %%rdi" "\n\t"
- "movl %[bits], %%esi" "\n\t"
- "leaq %[buf], %%rdx" "\n\t"
- "movl %[dir], %%ecx" "\n\t"
- "movl %[rotoffs], %%r8d" "\n\t"
- PUSH_STACK_PTR
- "callq *%q[core]" "\n\t"
- POP_STACK_PTR
- :
- : [core] "r" (&_gcry_aes_ssse3_schedule_core),
- [key] "m" (*key),
- [bits] "g" (keybits),
- [buf] "m" (ctx->keyschenc32[0][0]),
- [dir] "g" (0),
- [rotoffs] "g" (48)
- : "r8", "r9", "r10", "r11", "rax", "rcx", "rdx", "rdi", "rsi",
- "cc", "memory");
+ _gcry_aes_ssse3_schedule_core(key, keybits, &ctx->keyschenc32[0][0], 0, 48);
/* Save key for setting up decryption. */
if (keybits > 192)
@@ -216,23 +183,9 @@ _gcry_aes_ssse3_prepare_decryption (RIJNDAEL_context *ctx)
vpaes_ssse3_prepare();
- asm volatile ("leaq %q[key], %%rdi" "\n\t"
- "movl %[bits], %%esi" "\n\t"
- "leaq %[buf], %%rdx" "\n\t"
- "movl %[dir], %%ecx" "\n\t"
- "movl %[rotoffs], %%r8d" "\n\t"
- PUSH_STACK_PTR
- "callq *%q[core]" "\n\t"
- POP_STACK_PTR
- :
- : [core] "r" (_gcry_aes_ssse3_schedule_core),
- [key] "m" (ctx->keyschdec32[0][0]),
- [bits] "g" (keybits),
- [buf] "m" (ctx->keyschdec32[ctx->rounds][0]),
- [dir] "g" (1),
- [rotoffs] "g" ((keybits == 192) ? 0 : 32)
- : "r8", "r9", "r10", "r11", "rax", "rcx", "rdx", "rdi", "rsi",
- "cc", "memory");
+ _gcry_aes_ssse3_schedule_core(&ctx->keyschdec32[0][0], keybits,
+ &ctx->keyschdec32[ctx->rounds][0], 1,
+ (keybits == 192) ? 0 : 32);
vpaes_ssse3_cleanup();
}
@@ -243,15 +196,7 @@ _gcry_aes_ssse3_prepare_decryption (RIJNDAEL_context *ctx)
static inline void
do_vpaes_ssse3_enc (const RIJNDAEL_context *ctx, unsigned int nrounds)
{
- unsigned int middle_rounds = nrounds - 1;
- const void *keysched = ctx->keyschenc32;
-
- asm volatile (PUSH_STACK_PTR
- "callq *%q[core]" "\n\t"
- POP_STACK_PTR
- : "+a" (middle_rounds), "+d" (keysched)
- : [core] "r" (_gcry_aes_ssse3_encrypt_core)
- : "rcx", "rsi", "rdi", "cc", "memory");
+ _gcry_aes_ssse3_encrypt_core(ctx->keyschenc32, nrounds);
}
@@ -260,15 +205,7 @@ do_vpaes_ssse3_enc (const RIJNDAEL_context *ctx, unsigned int nrounds)
static inline void
do_vpaes_ssse3_dec (const RIJNDAEL_context *ctx, unsigned int nrounds)
{
- unsigned int middle_rounds = nrounds - 1;
- const void *keysched = ctx->keyschdec32;
-
- asm volatile (PUSH_STACK_PTR
- "callq *%q[core]" "\n\t"
- POP_STACK_PTR
- : "+a" (middle_rounds), "+d" (keysched)
- : [core] "r" (_gcry_aes_ssse3_decrypt_core)
- : "rcx", "rsi", "cc", "memory");
+ _gcry_aes_ssse3_decrypt_core(ctx->keyschdec32, nrounds);
}
More information about the Gcrypt-devel
mailing list