[PATCH 2/2] rijndael-ssse3: call assembly functions directly

Jussi Kivilinna jussi.kivilinna at iki.fi
Tue Jan 9 18:25:17 CET 2018


* cipher/rijndael-ssse3-amd64-asm.S (_gcry_aes_ssse3_enc_preload)
(_gcry_aes_ssse3_dec_preload, _gcry_aes_ssse3_encrypt_core)
(_gcry_aes_ssse3_decrypt_core, _gcry_aes_schedule_core): Add
ENTER_SYSV_FUNC_PARAMS_* at function entry and EXIT_SYSV_FUNC at exit.
(_gcry_aes_ssse3_encrypt_core, _gcry_aes_ssse3_decrypt_core): Change
to input parameters to RDI and RSI registers.
* cipher/rijndael-ssse3-amd64.c (_gcry_aes_ssse3_encrypt_core)
(_gcry_aes_ssse3_decrypt_core, _gcry_aes_schedule_core): Add parameters
for function prototypes.
(PUSH_STACK_PTR, POP_STACK_PTR): Remove.
(vpaes_ssse3_prepare_enc, vpaes_ssse3_prepare_dec)
(_gcry_aes_ssse3_do_setkey, _gcry_aes_ssse3_prepare_decryption)
(do_vpaes_ssse3_enc, do_vpaes_ssse3_dec): Remove inline assembly to
call functions, and call directly instead.
--

Instead of using inline assembly to call assembly functions in
AES SSSE3 implementation, change assembly functions so that they
can be called directly instead.

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 cipher/rijndael-ssse3-amd64-asm.S |   31 +++++++++----
 cipher/rijndael-ssse3-amd64.c     |   91 ++++++-------------------------------
 2 files changed, 35 insertions(+), 87 deletions(-)

diff --git a/cipher/rijndael-ssse3-amd64-asm.S b/cipher/rijndael-ssse3-amd64-asm.S
index 3ae55e8b6..ffce5df2f 100644
--- a/cipher/rijndael-ssse3-amd64-asm.S
+++ b/cipher/rijndael-ssse3-amd64-asm.S
@@ -40,11 +40,7 @@
     (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
      defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
 
-#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
-# define ELF(...)
-#else
-# define ELF(...) __VA_ARGS__
-#endif
+#include "asm-common-amd64.h"
 
 .text
 
@@ -54,6 +50,7 @@
 ELF(.type _gcry_aes_ssse3_enc_preload, at function)
 .globl _gcry_aes_ssse3_enc_preload
 _gcry_aes_ssse3_enc_preload:
+	ENTER_SYSV_FUNC_PARAMS_0_4
 	lea	.Laes_consts(%rip), %rax
 	movdqa	          (%rax), %xmm9  # 0F
 	movdqa	.Lk_inv   (%rax), %xmm10 # inv
@@ -62,6 +59,7 @@ _gcry_aes_ssse3_enc_preload:
 	movdqa	.Lk_sb1+16(%rax), %xmm12 # sb1t
 	movdqa	.Lk_sb2   (%rax), %xmm15 # sb2u
 	movdqa	.Lk_sb2+16(%rax), %xmm14 # sb2t
+	EXIT_SYSV_FUNC
 	ret
 ELF(.size _gcry_aes_ssse3_enc_preload,.-_gcry_aes_ssse3_enc_preload)
 
@@ -71,6 +69,7 @@ ELF(.size _gcry_aes_ssse3_enc_preload,.-_gcry_aes_ssse3_enc_preload)
 ELF(.type _gcry_aes_ssse3_dec_preload, at function)
 .globl _gcry_aes_ssse3_dec_preload
 _gcry_aes_ssse3_dec_preload:
+	ENTER_SYSV_FUNC_PARAMS_0_4
 	lea	.Laes_consts(%rip), %rax
 	movdqa	          (%rax), %xmm9   # 0F
 	movdqa	.Lk_inv   (%rax), %xmm10  # inv
@@ -80,6 +79,7 @@ _gcry_aes_ssse3_dec_preload:
 	movdqa	.Lk_dsbd   (%rax), %xmm15 # sbdu
 	movdqa	.Lk_dsbb   (%rax), %xmm14 # sbbu
 	movdqa	.Lk_dsbe   (%rax), %xmm8  # sbeu
+	EXIT_SYSV_FUNC
 	ret
 ELF(.size _gcry_aes_ssse3_dec_preload,.-_gcry_aes_ssse3_dec_preload)
 
@@ -98,11 +98,11 @@ ELF(.size _gcry_aes_ssse3_dec_preload,.-_gcry_aes_ssse3_dec_preload)
 ##  Inputs:
 ##     %xmm0 = input
 ##     %xmm9-%xmm15 as in .Laes_preheat
-##    (%rdx) = scheduled keys
-##     %rax  = nrounds - 1
+##    (%rdi) = scheduled keys
+##     %rsi  = nrounds
 ##
 ##  Output in %xmm0
-##  Clobbers  %xmm1-%xmm4, %r9, %r11, %rax, %rcx
+##  Clobbers  %xmm1-%xmm4, %r9, %r11, %rax, %rcx, %rdx
 ##  Preserves %xmm6 - %xmm7 so you get some local vectors
 ##
 ##
@@ -111,6 +111,9 @@ ELF(.type _gcry_aes_ssse3_encrypt_core, at function)
 .globl _gcry_aes_ssse3_encrypt_core
 _gcry_aes_ssse3_encrypt_core:
 _aes_encrypt_core:
+	ENTER_SYSV_FUNC_PARAMS_0_4
+	mov	%rdi,	%rdx
+	leaq	-1(%rsi), %rax
 	lea	.Laes_consts(%rip), %rcx
 	leaq	.Lk_mc_backward(%rcx), %rdi
 	mov	$16,	%rsi
@@ -185,6 +188,7 @@ _aes_encrypt_core:
 	pshufb  %xmm3,	%xmm0	# 0 = sb1t
 	pxor	%xmm4,	%xmm0	# 0 = A
 	pshufb	.Lk_sr(%rsi,%rcx), %xmm0
+	EXIT_SYSV_FUNC
 	ret
 ELF(.size _aes_encrypt_core,.-_aes_encrypt_core)
 
@@ -198,8 +202,11 @@ ELF(.size _aes_encrypt_core,.-_aes_encrypt_core)
 ELF(.type _gcry_aes_ssse3_decrypt_core, at function)
 _gcry_aes_ssse3_decrypt_core:
 _aes_decrypt_core:
+	ENTER_SYSV_FUNC_PARAMS_0_4
+	mov	%rdi,	%rdx
 	lea	.Laes_consts(%rip), %rcx
-	movl	%eax,	%esi
+	subl	$1,	%esi
+	movl	%esi,   %eax
 	shll	$4,	%esi
 	xorl	$48,	%esi
 	andl	$48,	%esi
@@ -288,6 +295,7 @@ _aes_decrypt_core:
 	pshufb  %xmm3,	%xmm0	# 0 = sb1t
 	pxor	%xmm4,	%xmm0	# 0 = A
 	pshufb	.Lk_sr(%rsi,%rcx), %xmm0
+	EXIT_SYSV_FUNC
 	ret
 ELF(.size _aes_decrypt_core,.-_aes_decrypt_core)
 
@@ -306,6 +314,8 @@ _aes_schedule_core:
 	# rsi = size in bits
 	# rdx = buffer
 	# rcx = direction.  0=encrypt, 1=decrypt
+	# r8 = rotoffs
+	ENTER_SYSV_FUNC_PARAMS_5
 
 	# load the tables
 	lea	.Laes_consts(%rip), %r10
@@ -659,8 +669,9 @@ _aes_schedule_core:
 	pxor	%xmm6,  %xmm6
 	pxor	%xmm7,  %xmm7
 	pxor	%xmm8,  %xmm8
+	EXIT_SYSV_FUNC
 	ret
-ELF(.size _aes_schedule_core,.-_aes_schedule_core)
+ELF(.size _gcry_aes_ssse3_schedule_core,.-_gcry_aes_ssse3_schedule_core)
 
 ########################################################
 ##                                                    ##
diff --git a/cipher/rijndael-ssse3-amd64.c b/cipher/rijndael-ssse3-amd64.c
index da5339e36..98660ecc8 100644
--- a/cipher/rijndael-ssse3-amd64.c
+++ b/cipher/rijndael-ssse3-amd64.c
@@ -58,13 +58,14 @@
 
 
 /* Assembly functions in rijndael-ssse3-amd64-asm.S. Note that these
-   have custom calling convention and need to be called from assembly
-   blocks, not directly. */
+   have custom calling convention (additional XMM parameters). */
 extern void _gcry_aes_ssse3_enc_preload(void);
 extern void _gcry_aes_ssse3_dec_preload(void);
-extern void _gcry_aes_ssse3_schedule_core(void);
-extern void _gcry_aes_ssse3_encrypt_core(void);
-extern void _gcry_aes_ssse3_decrypt_core(void);
+extern void _gcry_aes_ssse3_schedule_core(const void *key, u64 keybits,
+					  void *buffer, u64 decrypt,
+					  u64 rotoffs);
+extern void _gcry_aes_ssse3_encrypt_core(const void *key, u64 nrounds);
+extern void _gcry_aes_ssse3_decrypt_core(const void *key, u64 nrounds);
 
 
 
@@ -110,8 +111,6 @@ extern void _gcry_aes_ssse3_decrypt_core(void);
                   : \
                   : "r" (ssse3_state) \
                   : "memory" )
-# define PUSH_STACK_PTR
-# define POP_STACK_PTR
 #else
 # define SSSE3_STATE_SIZE 1
 # define vpaes_ssse3_prepare() (void)ssse3_state
@@ -126,31 +125,15 @@ extern void _gcry_aes_ssse3_decrypt_core(void);
                   "pxor	%%xmm7,  %%xmm7 \n\t" \
                   "pxor	%%xmm8,  %%xmm8 \n\t" \
                   ::: "memory" )
-/* Old GCC versions use red-zone of AMD64 SYSV ABI and stack pointer is
- * not properly adjusted for assembly block. Therefore stack pointer
- * needs to be manually corrected. */
-# define PUSH_STACK_PTR "subq $128, %%rsp;\n\t"
-# define POP_STACK_PTR  "addq $128, %%rsp;\n\t"
 #endif
 
 #define vpaes_ssse3_prepare_enc() \
     vpaes_ssse3_prepare(); \
-    asm volatile (PUSH_STACK_PTR \
-                  "callq *%q[core] \n\t" \
-                  POP_STACK_PTR \
-                  : \
-                  : [core] "r" (_gcry_aes_ssse3_enc_preload) \
-                  : "rax", "cc", "memory" )
+    _gcry_aes_ssse3_enc_preload();
 
 #define vpaes_ssse3_prepare_dec() \
     vpaes_ssse3_prepare(); \
-    asm volatile (PUSH_STACK_PTR \
-                  "callq *%q[core] \n\t" \
-                  POP_STACK_PTR \
-                  : \
-                  : [core] "r" (_gcry_aes_ssse3_dec_preload) \
-                  : "rax", "cc", "memory" )
-
+    _gcry_aes_ssse3_dec_preload();
 
 
 void
@@ -161,23 +144,7 @@ _gcry_aes_ssse3_do_setkey (RIJNDAEL_context *ctx, const byte *key)
 
   vpaes_ssse3_prepare();
 
-  asm volatile ("leaq %q[key], %%rdi"			"\n\t"
-                "movl %[bits], %%esi"			"\n\t"
-                "leaq %[buf], %%rdx"			"\n\t"
-                "movl %[dir], %%ecx"			"\n\t"
-                "movl %[rotoffs], %%r8d"		"\n\t"
-                PUSH_STACK_PTR
-                "callq *%q[core]"			"\n\t"
-                POP_STACK_PTR
-                :
-                : [core] "r" (&_gcry_aes_ssse3_schedule_core),
-                  [key] "m" (*key),
-                  [bits] "g" (keybits),
-                  [buf] "m" (ctx->keyschenc32[0][0]),
-                  [dir] "g" (0),
-                  [rotoffs] "g" (48)
-                : "r8", "r9", "r10", "r11", "rax", "rcx", "rdx", "rdi", "rsi",
-                  "cc", "memory");
+  _gcry_aes_ssse3_schedule_core(key, keybits, &ctx->keyschenc32[0][0], 0, 48);
 
   /* Save key for setting up decryption. */
   if (keybits > 192)
@@ -216,23 +183,9 @@ _gcry_aes_ssse3_prepare_decryption (RIJNDAEL_context *ctx)
 
   vpaes_ssse3_prepare();
 
-  asm volatile ("leaq %q[key], %%rdi"			"\n\t"
-                "movl %[bits], %%esi"			"\n\t"
-                "leaq %[buf], %%rdx"			"\n\t"
-                "movl %[dir], %%ecx"			"\n\t"
-                "movl %[rotoffs], %%r8d"		"\n\t"
-                PUSH_STACK_PTR
-                "callq *%q[core]"			"\n\t"
-                POP_STACK_PTR
-                :
-                : [core] "r" (_gcry_aes_ssse3_schedule_core),
-                  [key] "m" (ctx->keyschdec32[0][0]),
-                  [bits] "g" (keybits),
-                  [buf] "m" (ctx->keyschdec32[ctx->rounds][0]),
-                  [dir] "g" (1),
-                  [rotoffs] "g" ((keybits == 192) ? 0 : 32)
-                : "r8", "r9", "r10", "r11", "rax", "rcx", "rdx", "rdi", "rsi",
-                  "cc", "memory");
+  _gcry_aes_ssse3_schedule_core(&ctx->keyschdec32[0][0], keybits,
+				&ctx->keyschdec32[ctx->rounds][0], 1,
+				(keybits == 192) ? 0 : 32);
 
   vpaes_ssse3_cleanup();
 }
@@ -243,15 +196,7 @@ _gcry_aes_ssse3_prepare_decryption (RIJNDAEL_context *ctx)
 static inline void
 do_vpaes_ssse3_enc (const RIJNDAEL_context *ctx, unsigned int nrounds)
 {
-  unsigned int middle_rounds = nrounds - 1;
-  const void *keysched = ctx->keyschenc32;
-
-  asm volatile (PUSH_STACK_PTR
-		"callq *%q[core]"			"\n\t"
-		POP_STACK_PTR
-		: "+a" (middle_rounds), "+d" (keysched)
-		: [core] "r" (_gcry_aes_ssse3_encrypt_core)
-		: "rcx", "rsi", "rdi", "cc", "memory");
+  _gcry_aes_ssse3_encrypt_core(ctx->keyschenc32, nrounds);
 }
 
 
@@ -260,15 +205,7 @@ do_vpaes_ssse3_enc (const RIJNDAEL_context *ctx, unsigned int nrounds)
 static inline void
 do_vpaes_ssse3_dec (const RIJNDAEL_context *ctx, unsigned int nrounds)
 {
-  unsigned int middle_rounds = nrounds - 1;
-  const void *keysched = ctx->keyschdec32;
-
-  asm volatile (PUSH_STACK_PTR
-		"callq *%q[core]"			"\n\t"
-		POP_STACK_PTR
-		: "+a" (middle_rounds), "+d" (keysched)
-		: [core] "r" (_gcry_aes_ssse3_decrypt_core)
-		: "rcx", "rsi", "cc", "memory");
+  _gcry_aes_ssse3_decrypt_core(ctx->keyschdec32, nrounds);
 }
 
 




More information about the Gcrypt-devel mailing list