[PATCH 05/10] Enable AMD64 ChaCha20 implementations on WIN64

Jussi Kivilinna jussi.kivilinna at iki.fi
Thu May 14 13:11:24 CEST 2015


* cipher/chacha20-avx2-amd64.S: Enable when
HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS defined.
(ELF): New macro to mask lines with ELF specific commands.
* cipher/chacha20-sse2-amd64.S: Enable when
HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS defined.
(ELF): New macro to mask lines with ELF specific commands.
* cipher/chacha20-ssse3-amd64.S: Enable when
HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS defined.
(ELF): New macro to mask lines with ELF specific commands.
* cipher/chacha20.c (USE_SSE2, USE_SSSE3, USE_AVX2): Enable when
HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS defined.
(ASM_FUNC_ABI, ASM_EXTRA_STACK): New.
(chacha20_blocks_t, _gcry_chacha20_amd64_sse2_blocks)
(_gcry_chacha20_amd64_ssse3_blocks, _gcry_chacha20_amd64_avx2_blocks)
(_gcry_chacha20_armv7_neon_blocks, chacha20_blocks): Add ASM_FUNC_ABI.
(chacha20_core): Add ASM_EXTRA_STACK.
--

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
---
 cipher/chacha20-avx2-amd64.S  |   13 ++++++++++--
 cipher/chacha20-sse2-amd64.S  |   13 ++++++++++--
 cipher/chacha20-ssse3-amd64.S |   13 ++++++++++--
 cipher/chacha20.c             |   43 +++++++++++++++++++++++++++++++----------
 4 files changed, 63 insertions(+), 19 deletions(-)

diff --git a/cipher/chacha20-avx2-amd64.S b/cipher/chacha20-avx2-amd64.S
index 1f33de8..12bed35 100644
--- a/cipher/chacha20-avx2-amd64.S
+++ b/cipher/chacha20-avx2-amd64.S
@@ -26,7 +26,8 @@
 #ifdef __x86_64__
 #include <config.h>
 
-#if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
     defined(ENABLE_AVX2_SUPPORT) && USE_CHACHA20
 
 #ifdef __PIC__
@@ -35,11 +36,17 @@
 #  define RIP
 #endif
 
+#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
+# define ELF(...) __VA_ARGS__
+#else
+# define ELF(...) /*_*/
+#endif
+
 .text
 
 .align 8
 .globl _gcry_chacha20_amd64_avx2_blocks
-.type  _gcry_chacha20_amd64_avx2_blocks,@function;
+ELF(.type  _gcry_chacha20_amd64_avx2_blocks,@function;)
 _gcry_chacha20_amd64_avx2_blocks:
 .Lchacha_blocks_avx2_local:
 	vzeroupper
@@ -938,7 +945,7 @@ _gcry_chacha20_amd64_avx2_blocks:
 	vzeroall
 	movl $(63 + 512), %eax
 	ret
-.size _gcry_chacha20_amd64_avx2_blocks,.-_gcry_chacha20_amd64_avx2_blocks;
+ELF(.size _gcry_chacha20_amd64_avx2_blocks,.-_gcry_chacha20_amd64_avx2_blocks;)
 
 .data
 .align 16
diff --git a/cipher/chacha20-sse2-amd64.S b/cipher/chacha20-sse2-amd64.S
index 4811f40..2b9842c 100644
--- a/cipher/chacha20-sse2-amd64.S
+++ b/cipher/chacha20-sse2-amd64.S
@@ -26,13 +26,20 @@
 #ifdef __x86_64__
 #include <config.h>
 
-#if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && USE_CHACHA20
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && USE_CHACHA20
+
+#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
+# define ELF(...) __VA_ARGS__
+#else
+# define ELF(...) /*_*/
+#endif
 
 .text
 
 .align 8
 .globl _gcry_chacha20_amd64_sse2_blocks
-.type  _gcry_chacha20_amd64_sse2_blocks,@function;
+ELF(.type  _gcry_chacha20_amd64_sse2_blocks,@function;)
 _gcry_chacha20_amd64_sse2_blocks:
 .Lchacha_blocks_sse2_local:
 	pushq %rbx
@@ -646,7 +653,7 @@ _gcry_chacha20_amd64_sse2_blocks:
 	pxor %xmm8, %xmm8
 	pxor %xmm0, %xmm0
 	ret
-.size _gcry_chacha20_amd64_sse2_blocks,.-_gcry_chacha20_amd64_sse2_blocks;
+ELF(.size _gcry_chacha20_amd64_sse2_blocks,.-_gcry_chacha20_amd64_sse2_blocks;)
 
 #endif /*defined(USE_CHACHA20)*/
 #endif /*__x86_64*/
diff --git a/cipher/chacha20-ssse3-amd64.S b/cipher/chacha20-ssse3-amd64.S
index 50c2ff8..a1a843f 100644
--- a/cipher/chacha20-ssse3-amd64.S
+++ b/cipher/chacha20-ssse3-amd64.S
@@ -26,7 +26,8 @@
 #ifdef __x86_64__
 #include <config.h>
 
-#if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
     defined(HAVE_GCC_INLINE_ASM_SSSE3) && USE_CHACHA20
 
 #ifdef __PIC__
@@ -35,11 +36,17 @@
 #  define RIP
 #endif
 
+#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
+# define ELF(...) __VA_ARGS__
+#else
+# define ELF(...) /*_*/
+#endif
+
 .text
 
 .align 8
 .globl _gcry_chacha20_amd64_ssse3_blocks
-.type  _gcry_chacha20_amd64_ssse3_blocks,@function;
+ELF(.type  _gcry_chacha20_amd64_ssse3_blocks,@function;)
 _gcry_chacha20_amd64_ssse3_blocks:
 .Lchacha_blocks_ssse3_local:
 	pushq %rbx
@@ -614,7 +621,7 @@ _gcry_chacha20_amd64_ssse3_blocks:
 	pxor %xmm8, %xmm8
 	pxor %xmm0, %xmm0
 	ret
-.size _gcry_chacha20_amd64_ssse3_blocks,.-_gcry_chacha20_amd64_ssse3_blocks;
+ELF(.size _gcry_chacha20_amd64_ssse3_blocks,.-_gcry_chacha20_amd64_ssse3_blocks;)
 
 .data
 .align 16;
diff --git a/cipher/chacha20.c b/cipher/chacha20.c
index 2eaeffd..e25e239 100644
--- a/cipher/chacha20.c
+++ b/cipher/chacha20.c
@@ -50,20 +50,23 @@
 
 /* USE_SSE2 indicates whether to compile with Intel SSE2 code. */
 #undef USE_SSE2
-#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)
+#if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
 # define USE_SSE2 1
 #endif
 
 /* USE_SSSE3 indicates whether to compile with Intel SSSE3 code. */
 #undef USE_SSSE3
-#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \
+#if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
     defined(HAVE_GCC_INLINE_ASM_SSSE3)
 # define USE_SSSE3 1
 #endif
 
 /* USE_AVX2 indicates whether to compile with Intel AVX2 code. */
 #undef USE_AVX2
-#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \
+#if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
     defined(ENABLE_AVX2_SUPPORT)
 # define USE_AVX2 1
 #endif
@@ -82,8 +85,23 @@
 struct CHACHA20_context_s;
 
 
+/* Assembly implementations use SystemV ABI, ABI conversion and additional
+ * stack to store XMM6-XMM15 needed on Win64. */
+#undef ASM_FUNC_ABI
+#undef ASM_EXTRA_STACK
+#if (defined(USE_SSE2) || defined(USE_SSSE3) || defined(USE_AVX2)) && \
+    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)
+# define ASM_FUNC_ABI __attribute__((sysv_abi))
+# define ASM_EXTRA_STACK (10 * 16)
+#else
+# define ASM_FUNC_ABI
+# define ASM_EXTRA_STACK 0
+#endif
+
+
 typedef unsigned int (* chacha20_blocks_t)(u32 *state, const byte *src,
-                                           byte *dst, size_t bytes);
+                                           byte *dst,
+                                           size_t bytes) ASM_FUNC_ABI;
 
 typedef struct CHACHA20_context_s
 {
@@ -97,28 +115,32 @@ typedef struct CHACHA20_context_s
 #ifdef USE_SSE2
 
 unsigned int _gcry_chacha20_amd64_sse2_blocks(u32 *state, const byte *in,
-                                              byte *out, size_t bytes);
+                                              byte *out,
+                                              size_t bytes) ASM_FUNC_ABI;
 
 #endif /* USE_SSE2 */
 
 #ifdef USE_SSSE3
 
 unsigned int _gcry_chacha20_amd64_ssse3_blocks(u32 *state, const byte *in,
-                                               byte *out, size_t bytes);
+                                               byte *out,
+                                               size_t bytes) ASM_FUNC_ABI;
 
 #endif /* USE_SSSE3 */
 
 #ifdef USE_AVX2
 
 unsigned int _gcry_chacha20_amd64_avx2_blocks(u32 *state, const byte *in,
-                                              byte *out, size_t bytes);
+                                              byte *out,
+                                              size_t bytes) ASM_FUNC_ABI;
 
 #endif /* USE_AVX2 */
 
 #ifdef USE_NEON
 
 unsigned int _gcry_chacha20_armv7_neon_blocks(u32 *state, const byte *in,
-                                              byte *out, size_t bytes);
+                                              byte *out,
+                                              size_t bytes) ASM_FUNC_ABI;
 
 #endif /* USE_NEON */
 
@@ -141,7 +163,7 @@ static const char *selftest (void);
 
 
 #ifndef USE_SSE2
-static unsigned int
+ASM_FUNC_ABI static unsigned int
 chacha20_blocks (u32 *state, const byte *src, byte *dst, size_t bytes)
 {
   u32 pad[CHACHA20_INPUT_LENGTH];
@@ -269,7 +291,8 @@ chacha20_blocks (u32 *state, const byte *src, byte *dst, size_t bytes)
 static unsigned int
 chacha20_core(u32 *dst, struct CHACHA20_context_s *ctx)
 {
-  return ctx->blocks(ctx->input, NULL, (byte *)dst, CHACHA20_BLOCK_SIZE);
+  return ctx->blocks(ctx->input, NULL, (byte *)dst, CHACHA20_BLOCK_SIZE)
+         + ASM_EXTRA_STACK;
 }
 
 




More information about the Gcrypt-devel mailing list