[git] GCRYPT - branch, master, updated. libgcrypt-1.6.0-226-g9b0c6c8

by Jussi Kivilinna cvs at cvs.gnupg.org
Sun May 17 15:17:44 CEST 2015


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "The GNU crypto library".

The branch, master has been updated
       via  9b0c6c8141ae9bd056392a3f6b5704b505fc8501 (commit)
       via  eb0ed576893b6c7990dbcb568510f831d246cea6 (commit)
       via  12bc93ca8187b8061c2e705427ef22f5a71d29b0 (commit)
       via  8d7de4dbf7732c6eb9e9853ad7c19c89075ace6f (commit)
       via  b65e9e71d5ee992db5c96793c6af999545daad28 (commit)
       via  9597cfddf03c467825da152be5ca0d12a8c30d88 (commit)
       via  6a6646df80386204675d8b149ab60e74d7ca124c (commit)
       via  9a4fb3709864bf3e3918800d44ff576590cd4e92 (commit)
       via  e05682093ffb003b589a697428d918d755ac631d (commit)
       via  c46b015bedba7ce0db68929bd33a86a54ab3d919 (commit)
       via  ee8fc4edcb3466b03246c8720b90731bf274ff1d (commit)
      from  bac42c68b069f17abcca810a21439c7233815747 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
commit 9b0c6c8141ae9bd056392a3f6b5704b505fc8501
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date:   Thu May 14 13:07:34 2015 +0300

    Enable AMD64 Twofish implementation on WIN64
    
    * cipher/twofish-amd64.S: Enable when
    HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS defined.
    (ELF): New macro to mask lines with ELF specific commands.
    * cipher/twofish.c (USE_AMD64_ASM): Enable when
    HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS defined.
    [HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS] (call_sysv_fn): New.
    (twofish_amd64_encrypt_block, twofish_amd64_decrypt_block)
    (twofish_amd64_ctr_enc, twofish_amd64_cbc_dec)
    (twofish_amd64_cfb_dec): New wrapper functions for AMD64
    assembly functions.
    --
    
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>

diff --git a/cipher/twofish-amd64.S b/cipher/twofish-amd64.S
index a225307..ea88b94 100644
--- a/cipher/twofish-amd64.S
+++ b/cipher/twofish-amd64.S
@@ -20,7 +20,14 @@
 
 #ifdef __x86_64
 #include <config.h>
-#if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && defined(USE_TWOFISH)
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && defined(USE_TWOFISH)
+
+#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
+# define ELF(...) __VA_ARGS__
+#else
+# define ELF(...) /*_*/
+#endif
 
 #ifdef __PIC__
 #  define RIP %rip
@@ -166,7 +173,7 @@
 
 .align 8
 .globl _gcry_twofish_amd64_encrypt_block
-.type   _gcry_twofish_amd64_encrypt_block,@function;
+ELF(.type   _gcry_twofish_amd64_encrypt_block,@function;)
 
 _gcry_twofish_amd64_encrypt_block:
 	/* input:
@@ -205,11 +212,11 @@ _gcry_twofish_amd64_encrypt_block:
 	addq $(3 * 8), %rsp;
 
 	ret;
-.size _gcry_twofish_amd64_encrypt_block,.-_gcry_twofish_amd64_encrypt_block;
+ELF(.size _gcry_twofish_amd64_encrypt_block,.-_gcry_twofish_amd64_encrypt_block;)
 
 .align 8
 .globl _gcry_twofish_amd64_decrypt_block
-.type   _gcry_twofish_amd64_decrypt_block,@function;
+ELF(.type   _gcry_twofish_amd64_decrypt_block,@function;)
 
 _gcry_twofish_amd64_decrypt_block:
 	/* input:
@@ -248,7 +255,7 @@ _gcry_twofish_amd64_decrypt_block:
 	addq $(3 * 8), %rsp;
 
 	ret;
-.size _gcry_twofish_amd64_encrypt_block,.-_gcry_twofish_amd64_encrypt_block;
+ELF(.size _gcry_twofish_amd64_decrypt_block,.-_gcry_twofish_amd64_decrypt_block;)
 
 #undef CTX
 
@@ -462,7 +469,7 @@ _gcry_twofish_amd64_decrypt_block:
 	outunpack3(RAB, 2);
 
 .align 8
-.type __twofish_enc_blk3,@function;
+ELF(.type __twofish_enc_blk3,@function;)
 
 __twofish_enc_blk3:
 	/* input:
@@ -485,10 +492,10 @@ __twofish_enc_blk3:
 	outunpack_enc3();
 
 	ret;
-.size __twofish_enc_blk3,.-__twofish_enc_blk3;
+ELF(.size __twofish_enc_blk3,.-__twofish_enc_blk3;)
 
 .align 8
-.type  __twofish_dec_blk3,@function;
+ELF(.type  __twofish_dec_blk3,@function;)
 
 __twofish_dec_blk3:
 	/* input:
@@ -511,11 +518,11 @@ __twofish_dec_blk3:
 	outunpack_dec3();
 
 	ret;
-.size __twofish_dec_blk3,.-__twofish_dec_blk3;
+ELF(.size __twofish_dec_blk3,.-__twofish_dec_blk3;)
 
 .align 8
 .globl _gcry_twofish_amd64_ctr_enc
-.type   _gcry_twofish_amd64_ctr_enc,@function;
+ELF(.type   _gcry_twofish_amd64_ctr_enc,@function;)
 _gcry_twofish_amd64_ctr_enc:
 	/* input:
 	 *	%rdi: ctx, CTX
@@ -593,11 +600,11 @@ _gcry_twofish_amd64_ctr_enc:
 	addq $(8 * 8), %rsp;
 
 	ret;
-.size _gcry_twofish_amd64_ctr_enc,.-_gcry_twofish_amd64_ctr_enc;
+ELF(.size _gcry_twofish_amd64_ctr_enc,.-_gcry_twofish_amd64_ctr_enc;)
 
 .align 8
 .globl _gcry_twofish_amd64_cbc_dec
-.type   _gcry_twofish_amd64_cbc_dec,@function;
+ELF(.type   _gcry_twofish_amd64_cbc_dec,@function;)
 _gcry_twofish_amd64_cbc_dec:
 	/* input:
 	 *	%rdi: ctx, CTX
@@ -659,11 +666,11 @@ _gcry_twofish_amd64_cbc_dec:
 	addq $(9 * 8), %rsp;
 
 	ret;
-.size _gcry_twofish_amd64_cbc_dec,.-_gcry_twofish_amd64_cbc_dec;
+ELF(.size _gcry_twofish_amd64_cbc_dec,.-_gcry_twofish_amd64_cbc_dec;)
 
 .align 8
 .globl _gcry_twofish_amd64_cfb_dec
-.type   _gcry_twofish_amd64_cfb_dec,@function;
+ELF(.type   _gcry_twofish_amd64_cfb_dec,@function;)
 _gcry_twofish_amd64_cfb_dec:
 	/* input:
 	 *	%rdi: ctx, CTX
@@ -725,7 +732,7 @@ _gcry_twofish_amd64_cfb_dec:
 	addq $(8 * 8), %rsp;
 
 	ret;
-.size _gcry_twofish_amd64_cfb_dec,.-_gcry_twofish_amd64_cfb_dec;
+ELF(.size _gcry_twofish_amd64_cfb_dec,.-_gcry_twofish_amd64_cfb_dec;)
 
 #endif /*USE_TWOFISH*/
 #endif /*__x86_64*/
diff --git a/cipher/twofish.c b/cipher/twofish.c
index ecd76e3..ce83fad 100644
--- a/cipher/twofish.c
+++ b/cipher/twofish.c
@@ -53,7 +53,8 @@
 
 /* USE_AMD64_ASM indicates whether to use AMD64 assembly code. */
 #undef USE_AMD64_ASM
-#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)
+#if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
 # define USE_AMD64_ASM 1
 #endif
 
@@ -754,6 +755,77 @@ extern void _gcry_twofish_amd64_cbc_dec(const TWOFISH_context *c, byte *out,
 extern void _gcry_twofish_amd64_cfb_dec(const TWOFISH_context *c, byte *out,
 					const byte *in, byte *iv);
 
+#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+static inline void
+call_sysv_fn (const void *fn, const void *arg1, const void *arg2,
+              const void *arg3, const void *arg4)
+{
+  /* Call SystemV ABI function without storing non-volatile XMM registers,
+   * as target function does not use vector instruction sets. */
+  asm volatile ("callq *%0\n\t"
+                : "+a" (fn),
+                  "+D" (arg1),
+                  "+S" (arg2),
+                  "+d" (arg3),
+                  "+c" (arg4)
+                :
+                : "cc", "memory", "r8", "r9", "r10", "r11");
+}
+#endif
+
+static inline void
+twofish_amd64_encrypt_block(const TWOFISH_context *c, byte *out, const byte *in)
+{
+#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+  call_sysv_fn(_gcry_twofish_amd64_encrypt_block, c, out, in, NULL);
+#else
+  _gcry_twofish_amd64_encrypt_block(c, out, in);
+#endif
+}
+
+static inline void
+twofish_amd64_decrypt_block(const TWOFISH_context *c, byte *out, const byte *in)
+{
+#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+  call_sysv_fn(_gcry_twofish_amd64_decrypt_block, c, out, in, NULL);
+#else
+  _gcry_twofish_amd64_decrypt_block(c, out, in);
+#endif
+}
+
+static inline void
+twofish_amd64_ctr_enc(const TWOFISH_context *c, byte *out, const byte *in,
+                      byte *ctr)
+{
+#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+  call_sysv_fn(_gcry_twofish_amd64_ctr_enc, c, out, in, ctr);
+#else
+  _gcry_twofish_amd64_ctr_enc(c, out, in, ctr);
+#endif
+}
+
+static inline void
+twofish_amd64_cbc_dec(const TWOFISH_context *c, byte *out, const byte *in,
+                      byte *iv)
+{
+#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+  call_sysv_fn(_gcry_twofish_amd64_cbc_dec, c, out, in, iv);
+#else
+  _gcry_twofish_amd64_cbc_dec(c, out, in, iv);
+#endif
+}
+
+static inline void
+twofish_amd64_cfb_dec(const TWOFISH_context *c, byte *out, const byte *in,
+                      byte *iv)
+{
+#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+  call_sysv_fn(_gcry_twofish_amd64_cfb_dec, c, out, in, iv);
+#else
+  _gcry_twofish_amd64_cfb_dec(c, out, in, iv);
+#endif
+}
+
 #elif defined(USE_ARM_ASM)
 
 /* Assembly implementations of Twofish. */
@@ -833,7 +905,7 @@ static unsigned int
 twofish_encrypt (void *context, byte *out, const byte *in)
 {
   TWOFISH_context *ctx = context;
-  _gcry_twofish_amd64_encrypt_block(ctx, out, in);
+  twofish_amd64_encrypt_block(ctx, out, in);
   return /*burn_stack*/ (4*sizeof (void*));
 }
 
@@ -900,7 +972,7 @@ static unsigned int
 twofish_decrypt (void *context, byte *out, const byte *in)
 {
   TWOFISH_context *ctx = context;
-  _gcry_twofish_amd64_decrypt_block(ctx, out, in);
+  twofish_amd64_decrypt_block(ctx, out, in);
   return /*burn_stack*/ (4*sizeof (void*));
 }
 
@@ -980,7 +1052,7 @@ _gcry_twofish_ctr_enc(void *context, unsigned char *ctr, void *outbuf_arg,
     /* Process data in 3 block chunks. */
     while (nblocks >= 3)
       {
-        _gcry_twofish_amd64_ctr_enc(ctx, outbuf, inbuf, ctr);
+        twofish_amd64_ctr_enc(ctx, outbuf, inbuf, ctr);
 
         nblocks -= 3;
         outbuf += 3 * TWOFISH_BLOCKSIZE;
@@ -1038,7 +1110,7 @@ _gcry_twofish_cbc_dec(void *context, unsigned char *iv, void *outbuf_arg,
     /* Process data in 3 block chunks. */
     while (nblocks >= 3)
       {
-        _gcry_twofish_amd64_cbc_dec(ctx, outbuf, inbuf, iv);
+        twofish_amd64_cbc_dec(ctx, outbuf, inbuf, iv);
 
         nblocks -= 3;
         outbuf += 3 * TWOFISH_BLOCKSIZE;
@@ -1087,7 +1159,7 @@ _gcry_twofish_cfb_dec(void *context, unsigned char *iv, void *outbuf_arg,
     /* Process data in 3 block chunks. */
     while (nblocks >= 3)
       {
-        _gcry_twofish_amd64_cfb_dec(ctx, outbuf, inbuf, iv);
+        twofish_amd64_cfb_dec(ctx, outbuf, inbuf, iv);
 
         nblocks -= 3;
         outbuf += 3 * TWOFISH_BLOCKSIZE;

commit eb0ed576893b6c7990dbcb568510f831d246cea6
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date:   Thu May 14 13:07:48 2015 +0300

    Enable AMD64 Serpent implementations on WIN64
    
    * cipher/serpent-avx2-amd64.S: Enable when
    HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS defined.
    (ELF): New macro to mask lines with ELF specific commands.
    * cipher/serpent-sse2-amd64.S: Enable when
    HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS defined.
    (ELF): New macro to mask lines with ELF specific commands.
    * cipher/serpent.c (USE_SSE2, USE_AVX2): Enable when
    HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS defined.
    [USE_SSE2 || USE_AVX2] (ASM_FUNC_ABI): New.
    (_gcry_serpent_sse2_ctr_enc, _gcry_serpent_sse2_cbc_dec)
    (_gcry_serpent_sse2_cfb_dec, _gcry_serpent_avx2_ctr_enc)
    (_gcry_serpent_avx2_cbc_dec, _gcry_serpent_avx2_cfb_dec): Add
    ASM_FUNC_ABI.
    --
    
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>

diff --git a/cipher/serpent-avx2-amd64.S b/cipher/serpent-avx2-amd64.S
index 03d29ae..3f59f06 100644
--- a/cipher/serpent-avx2-amd64.S
+++ b/cipher/serpent-avx2-amd64.S
@@ -20,9 +20,16 @@
 
 #ifdef __x86_64
 #include <config.h>
-#if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && defined(USE_SERPENT) && \
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && defined(USE_SERPENT) && \
     defined(ENABLE_AVX2_SUPPORT)
 
+#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
+# define ELF(...) __VA_ARGS__
+#else
+# define ELF(...) /*_*/
+#endif
+
 #ifdef __PIC__
 #  define RIP (%rip)
 #else
@@ -404,7 +411,7 @@
 .text
 
 .align 8
-.type   __serpent_enc_blk16,@function;
+ELF(.type   __serpent_enc_blk16,@function;)
 __serpent_enc_blk16:
 	/* input:
 	 *	%rdi: ctx, CTX
@@ -489,10 +496,10 @@ __serpent_enc_blk16:
 	transpose_4x4(RB4, RB1, RB2, RB0, RB3, RTMP0, RTMP1);
 
 	ret;
-.size __serpent_enc_blk16,.-__serpent_enc_blk16;
+ELF(.size __serpent_enc_blk16,.-__serpent_enc_blk16;)
 
 .align 8
-.type   __serpent_dec_blk16,@function;
+ELF(.type   __serpent_dec_blk16,@function;)
 __serpent_dec_blk16:
 	/* input:
 	 *	%rdi: ctx, CTX
@@ -579,7 +586,7 @@ __serpent_dec_blk16:
 	transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1);
 
 	ret;
-.size __serpent_dec_blk16,.-__serpent_dec_blk16;
+ELF(.size __serpent_dec_blk16,.-__serpent_dec_blk16;)
 
 #define inc_le128(x, minus_one, tmp) \
 	vpcmpeqq minus_one, x, tmp; \
@@ -589,7 +596,7 @@ __serpent_dec_blk16:
 
 .align 8
 .globl _gcry_serpent_avx2_ctr_enc
-.type   _gcry_serpent_avx2_ctr_enc,@function;
+ELF(.type   _gcry_serpent_avx2_ctr_enc,@function;)
 _gcry_serpent_avx2_ctr_enc:
 	/* input:
 	 *	%rdi: ctx, CTX
@@ -695,11 +702,11 @@ _gcry_serpent_avx2_ctr_enc:
 	vzeroall;
 
 	ret
-.size _gcry_serpent_avx2_ctr_enc,.-_gcry_serpent_avx2_ctr_enc;
+ELF(.size _gcry_serpent_avx2_ctr_enc,.-_gcry_serpent_avx2_ctr_enc;)
 
 .align 8
 .globl _gcry_serpent_avx2_cbc_dec
-.type   _gcry_serpent_avx2_cbc_dec,@function;
+ELF(.type   _gcry_serpent_avx2_cbc_dec,@function;)
 _gcry_serpent_avx2_cbc_dec:
 	/* input:
 	 *	%rdi: ctx, CTX
@@ -746,11 +753,11 @@ _gcry_serpent_avx2_cbc_dec:
 	vzeroall;
 
 	ret
-.size _gcry_serpent_avx2_cbc_dec,.-_gcry_serpent_avx2_cbc_dec;
+ELF(.size _gcry_serpent_avx2_cbc_dec,.-_gcry_serpent_avx2_cbc_dec;)
 
 .align 8
 .globl _gcry_serpent_avx2_cfb_dec
-.type   _gcry_serpent_avx2_cfb_dec,@function;
+ELF(.type   _gcry_serpent_avx2_cfb_dec,@function;)
 _gcry_serpent_avx2_cfb_dec:
 	/* input:
 	 *	%rdi: ctx, CTX
@@ -799,7 +806,7 @@ _gcry_serpent_avx2_cfb_dec:
 	vzeroall;
 
 	ret
-.size _gcry_serpent_avx2_cfb_dec,.-_gcry_serpent_avx2_cfb_dec;
+ELF(.size _gcry_serpent_avx2_cfb_dec,.-_gcry_serpent_avx2_cfb_dec;)
 
 .data
 .align 16
diff --git a/cipher/serpent-sse2-amd64.S b/cipher/serpent-sse2-amd64.S
index 395f660..adbf4e2 100644
--- a/cipher/serpent-sse2-amd64.S
+++ b/cipher/serpent-sse2-amd64.S
@@ -20,7 +20,14 @@
 
 #ifdef __x86_64
 #include <config.h>
-#if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && defined(USE_SERPENT)
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && defined(USE_SERPENT)
+
+#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
+# define ELF(...) __VA_ARGS__
+#else
+# define ELF(...) /*_*/
+#endif
 
 #ifdef __PIC__
 #  define RIP (%rip)
@@ -427,7 +434,7 @@
 .text
 
 .align 8
-.type   __serpent_enc_blk8,@function;
+ELF(.type   __serpent_enc_blk8,@function;)
 __serpent_enc_blk8:
 	/* input:
 	 *	%rdi: ctx, CTX
@@ -512,10 +519,10 @@ __serpent_enc_blk8:
 	transpose_4x4(RB4, RB1, RB2, RB0, RB3, RTMP0, RTMP1);
 
 	ret;
-.size __serpent_enc_blk8,.-__serpent_enc_blk8;
+ELF(.size __serpent_enc_blk8,.-__serpent_enc_blk8;)
 
 .align 8
-.type   __serpent_dec_blk8,@function;
+ELF(.type   __serpent_dec_blk8,@function;)
 __serpent_dec_blk8:
 	/* input:
 	 *	%rdi: ctx, CTX
@@ -602,11 +609,11 @@ __serpent_dec_blk8:
 	transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1);
 
 	ret;
-.size __serpent_dec_blk8,.-__serpent_dec_blk8;
+ELF(.size __serpent_dec_blk8,.-__serpent_dec_blk8;)
 
 .align 8
 .globl _gcry_serpent_sse2_ctr_enc
-.type   _gcry_serpent_sse2_ctr_enc,@function;
+ELF(.type   _gcry_serpent_sse2_ctr_enc,@function;)
 _gcry_serpent_sse2_ctr_enc:
 	/* input:
 	 *	%rdi: ctx, CTX
@@ -732,11 +739,11 @@ _gcry_serpent_sse2_ctr_enc:
 	pxor RNOT, RNOT;
 
 	ret
-.size _gcry_serpent_sse2_ctr_enc,.-_gcry_serpent_sse2_ctr_enc;
+ELF(.size _gcry_serpent_sse2_ctr_enc,.-_gcry_serpent_sse2_ctr_enc;)
 
 .align 8
 .globl _gcry_serpent_sse2_cbc_dec
-.type   _gcry_serpent_sse2_cbc_dec,@function;
+ELF(.type   _gcry_serpent_sse2_cbc_dec,@function;)
 _gcry_serpent_sse2_cbc_dec:
 	/* input:
 	 *	%rdi: ctx, CTX
@@ -793,11 +800,11 @@ _gcry_serpent_sse2_cbc_dec:
 	pxor RNOT, RNOT;
 
 	ret
-.size _gcry_serpent_sse2_cbc_dec,.-_gcry_serpent_sse2_cbc_dec;
+ELF(.size _gcry_serpent_sse2_cbc_dec,.-_gcry_serpent_sse2_cbc_dec;)
 
 .align 8
 .globl _gcry_serpent_sse2_cfb_dec
-.type   _gcry_serpent_sse2_cfb_dec,@function;
+ELF(.type   _gcry_serpent_sse2_cfb_dec,@function;)
 _gcry_serpent_sse2_cfb_dec:
 	/* input:
 	 *	%rdi: ctx, CTX
@@ -857,7 +864,7 @@ _gcry_serpent_sse2_cfb_dec:
 	pxor RNOT, RNOT;
 
 	ret
-.size _gcry_serpent_sse2_cfb_dec,.-_gcry_serpent_sse2_cfb_dec;
+ELF(.size _gcry_serpent_sse2_cfb_dec,.-_gcry_serpent_sse2_cfb_dec;)
 
 #endif /*defined(USE_SERPENT)*/
 #endif /*__x86_64*/
diff --git a/cipher/serpent.c b/cipher/serpent.c
index 0be49da..7d0e112 100644
--- a/cipher/serpent.c
+++ b/cipher/serpent.c
@@ -34,13 +34,15 @@
 
 /* USE_SSE2 indicates whether to compile with AMD64 SSE2 code. */
 #undef USE_SSE2
-#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)
+#if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
 # define USE_SSE2 1
 #endif
 
 /* USE_AVX2 indicates whether to compile with AMD64 AVX2 code. */
 #undef USE_AVX2
-#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)
+#if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
 # if defined(ENABLE_AVX2_SUPPORT)
 #  define USE_AVX2 1
 # endif
@@ -86,6 +88,18 @@ typedef struct serpent_context
 } serpent_context_t;
 
 
+/* Assembly implementations use SystemV ABI, ABI conversion and additional
+ * stack to store XMM6-XMM15 needed on Win64. */
+#undef ASM_FUNC_ABI
+#if defined(USE_SSE2) || defined(USE_AVX2)
+# ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+#  define ASM_FUNC_ABI __attribute__((sysv_abi))
+# else
+#  define ASM_FUNC_ABI
+# endif
+#endif
+
+
 #ifdef USE_SSE2
 /* Assembler implementations of Serpent using SSE2.  Process 8 block in
    parallel.
@@ -93,17 +107,17 @@ typedef struct serpent_context
 extern void _gcry_serpent_sse2_ctr_enc(serpent_context_t *ctx,
 				       unsigned char *out,
 				       const unsigned char *in,
-				       unsigned char *ctr);
+				       unsigned char *ctr) ASM_FUNC_ABI;
 
 extern void _gcry_serpent_sse2_cbc_dec(serpent_context_t *ctx,
 				       unsigned char *out,
 				       const unsigned char *in,
-				       unsigned char *iv);
+				       unsigned char *iv) ASM_FUNC_ABI;
 
 extern void _gcry_serpent_sse2_cfb_dec(serpent_context_t *ctx,
 				       unsigned char *out,
 				       const unsigned char *in,
-				       unsigned char *iv);
+				       unsigned char *iv) ASM_FUNC_ABI;
 #endif
 
 #ifdef USE_AVX2
@@ -113,17 +127,17 @@ extern void _gcry_serpent_sse2_cfb_dec(serpent_context_t *ctx,
 extern void _gcry_serpent_avx2_ctr_enc(serpent_context_t *ctx,
 				       unsigned char *out,
 				       const unsigned char *in,
-				       unsigned char *ctr);
+				       unsigned char *ctr) ASM_FUNC_ABI;
 
 extern void _gcry_serpent_avx2_cbc_dec(serpent_context_t *ctx,
 				       unsigned char *out,
 				       const unsigned char *in,
-				       unsigned char *iv);
+				       unsigned char *iv) ASM_FUNC_ABI;
 
 extern void _gcry_serpent_avx2_cfb_dec(serpent_context_t *ctx,
 				       unsigned char *out,
 				       const unsigned char *in,
-				       unsigned char *iv);
+				       unsigned char *iv) ASM_FUNC_ABI;
 #endif
 
 #ifdef USE_NEON

commit 12bc93ca8187b8061c2e705427ef22f5a71d29b0
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date:   Thu May 14 12:37:21 2015 +0300

    Enable AMD64 Salsa20 implementation on WIN64
    
    * cipher/salsa20-amd64.S: Enable when
    HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS defined.
    (ELF): New macro to mask lines with ELF specific commands.
    * cipher/salsa20.c (USE_AMD64): Enable when
    HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS defined.
    [USE_AMD64] (ASM_FUNC_ABI, ASM_EXTRA_STACK): New.
    (_gcry_salsa20_amd64_keysetup, _gcry_salsa20_amd64_ivsetup)
    (_gcry_salsa20_amd64_encrypt_blocks): Add ASM_FUNC_ABI.
    [USE_AMD64] (salsa20_core): Add ASM_EXTRA_STACK.
    (salsa20_do_encrypt_stream) [USE_AMD64]: Add ASM_EXTRA_STACK.
    --
    
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>

diff --git a/cipher/salsa20-amd64.S b/cipher/salsa20-amd64.S
index 7046dbb..470c32a 100644
--- a/cipher/salsa20-amd64.S
+++ b/cipher/salsa20-amd64.S
@@ -25,13 +25,20 @@
 
 #ifdef __x86_64
 #include <config.h>
-#if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && defined(USE_SALSA20)
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && defined(USE_SALSA20)
+
+#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
+# define ELF(...) __VA_ARGS__
+#else
+# define ELF(...) /*_*/
+#endif
 
 .text
 
 .align 8
 .globl _gcry_salsa20_amd64_keysetup
-.type  _gcry_salsa20_amd64_keysetup,@function;
+ELF(.type  _gcry_salsa20_amd64_keysetup,@function;)
 _gcry_salsa20_amd64_keysetup:
 	movl   0(%rsi),%r8d
 	movl   4(%rsi),%r9d
@@ -83,7 +90,7 @@ _gcry_salsa20_amd64_keysetup:
 
 .align 8
 .globl _gcry_salsa20_amd64_ivsetup
-.type  _gcry_salsa20_amd64_ivsetup,@function;
+ELF(.type  _gcry_salsa20_amd64_ivsetup,@function;)
 _gcry_salsa20_amd64_ivsetup:
 	movl   0(%rsi),%r8d
 	movl   4(%rsi),%esi
@@ -97,7 +104,7 @@ _gcry_salsa20_amd64_ivsetup:
 
 .align 8
 .globl _gcry_salsa20_amd64_encrypt_blocks
-.type  _gcry_salsa20_amd64_encrypt_blocks,@function;
+ELF(.type  _gcry_salsa20_amd64_encrypt_blocks,@function;)
 _gcry_salsa20_amd64_encrypt_blocks:
 	/*
 	 * Modifications to original implementation:
@@ -918,7 +925,7 @@ _gcry_salsa20_amd64_encrypt_blocks:
 	add  $64,%rdi
 	add  $64,%rsi
 	jmp .L_bytes_are_64_128_or_192
-.size _gcry_salsa20_amd64_encrypt_blocks,.-_gcry_salsa20_amd64_encrypt_blocks;
+ELF(.size _gcry_salsa20_amd64_encrypt_blocks,.-_gcry_salsa20_amd64_encrypt_blocks;)
 
 #endif /*defined(USE_SALSA20)*/
 #endif /*__x86_64*/
diff --git a/cipher/salsa20.c b/cipher/salsa20.c
index d75fe51..fa3d23b 100644
--- a/cipher/salsa20.c
+++ b/cipher/salsa20.c
@@ -43,7 +43,8 @@
 
 /* USE_AMD64 indicates whether to compile with AMD64 code. */
 #undef USE_AMD64
-#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)
+#if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
 # define USE_AMD64 1
 #endif
 
@@ -118,12 +119,25 @@ static const char *selftest (void);
 
 
 #ifdef USE_AMD64
+
+/* Assembly implementations use SystemV ABI, ABI conversion and additional
+ * stack to store XMM6-XMM15 needed on Win64. */
+#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+# define ASM_FUNC_ABI __attribute__((sysv_abi))
+# define ASM_EXTRA_STACK (10 * 16)
+#else
+# define ASM_FUNC_ABI
+# define ASM_EXTRA_STACK 0
+#endif
+
 /* AMD64 assembly implementations of Salsa20. */
-void _gcry_salsa20_amd64_keysetup(u32 *ctxinput, const void *key, int keybits);
-void _gcry_salsa20_amd64_ivsetup(u32 *ctxinput, const void *iv);
+void _gcry_salsa20_amd64_keysetup(u32 *ctxinput, const void *key, int keybits)
+                                 ASM_FUNC_ABI;
+void _gcry_salsa20_amd64_ivsetup(u32 *ctxinput, const void *iv)
+                                ASM_FUNC_ABI;
 unsigned int
 _gcry_salsa20_amd64_encrypt_blocks(u32 *ctxinput, const void *src, void *dst,
-                                   size_t len, int rounds);
+                                   size_t len, int rounds) ASM_FUNC_ABI;
 
 static void
 salsa20_keysetup(SALSA20_context_t *ctx, const byte *key, int keylen)
@@ -141,7 +155,8 @@ static unsigned int
 salsa20_core (u32 *dst, SALSA20_context_t *ctx, unsigned int rounds)
 {
   memset(dst, 0, SALSA20_BLOCK_SIZE);
-  return _gcry_salsa20_amd64_encrypt_blocks(ctx->input, dst, dst, 1, rounds);
+  return _gcry_salsa20_amd64_encrypt_blocks(ctx->input, dst, dst, 1, rounds)
+         + ASM_EXTRA_STACK;
 }
 
 #else /* USE_AMD64 */
@@ -418,6 +433,7 @@ salsa20_do_encrypt_stream (SALSA20_context_t *ctx,
       size_t nblocks = length / SALSA20_BLOCK_SIZE;
       burn = _gcry_salsa20_amd64_encrypt_blocks(ctx->input, inbuf, outbuf,
                                                 nblocks, rounds);
+      burn += ASM_EXTRA_STACK;
       length -= SALSA20_BLOCK_SIZE * nblocks;
       outbuf += SALSA20_BLOCK_SIZE * nblocks;
       inbuf  += SALSA20_BLOCK_SIZE * nblocks;

commit 8d7de4dbf7732c6eb9e9853ad7c19c89075ace6f
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date:   Thu May 14 12:39:39 2015 +0300

    Enable AMD64 Poly1305 implementations on WIN64
    
    * cipher/poly1305-avx2-amd64.S: Enable when
    HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS defined.
    (ELF): New macro to mask lines with ELF specific commands.
    * cipher/poly1305-sse2-amd64.S: Enable when
    HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS defined.
    (ELF): New macro to mask lines with ELF specific commands.
    * cipher/poly1305-internal.h (POLY1305_SYSV_FUNC_ABI): New.
    (POLY1305_USE_SSE2, POLY1305_USE_AVX2): Enable when
    HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS defined.
    (OPS_FUNC_ABI): New.
    (poly1305_ops_t): Use OPS_FUNC_ABI.
    * cipher/poly1305.c (_gcry_poly1305_amd64_sse2_init_ext)
    (_gcry_poly1305_amd64_sse2_finish_ext)
    (_gcry_poly1305_amd64_sse2_blocks, _gcry_poly1305_amd64_avx2_init_ext)
    (_gcry_poly1305_amd64_avx2_finish_ext)
    (_gcry_poly1305_amd64_avx2_blocks, _gcry_poly1305_armv7_neon_init_ext)
    (_gcry_poly1305_armv7_neon_finish_ext)
    (_gcry_poly1305_armv7_neon_blocks, poly1305_init_ext_ref32)
    (poly1305_blocks_ref32, poly1305_finish_ext_ref32)
    (poly1305_init_ext_ref8, poly1305_blocks_ref8)
    (poly1305_finish_ext_ref8): Use OPS_FUNC_ABI.
    --
    
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>

diff --git a/cipher/poly1305-avx2-amd64.S b/cipher/poly1305-avx2-amd64.S
index 0ba7e76..9362a5a 100644
--- a/cipher/poly1305-avx2-amd64.S
+++ b/cipher/poly1305-avx2-amd64.S
@@ -25,15 +25,23 @@
 
 #include <config.h>
 
-#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \
+#if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
     defined(ENABLE_AVX2_SUPPORT)
 
+#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
+# define ELF(...) __VA_ARGS__
+#else
+# define ELF(...) /*_*/
+#endif
+
+
 .text
 
 
 .align 8
 .globl _gcry_poly1305_amd64_avx2_init_ext
-.type  _gcry_poly1305_amd64_avx2_init_ext,@function;
+ELF(.type  _gcry_poly1305_amd64_avx2_init_ext,@function;)
 _gcry_poly1305_amd64_avx2_init_ext:
 .Lpoly1305_init_ext_avx2_local:
 	xor %edx, %edx
@@ -391,12 +399,12 @@ _gcry_poly1305_amd64_avx2_init_ext:
 	popq %r13
 	popq %r12
 	ret
-.size _gcry_poly1305_amd64_avx2_init_ext,.-_gcry_poly1305_amd64_avx2_init_ext;
+ELF(.size _gcry_poly1305_amd64_avx2_init_ext,.-_gcry_poly1305_amd64_avx2_init_ext;)
 
 
 .align 8
 .globl _gcry_poly1305_amd64_avx2_blocks
-.type  _gcry_poly1305_amd64_avx2_blocks,@function;
+ELF(.type  _gcry_poly1305_amd64_avx2_blocks,@function;)
 _gcry_poly1305_amd64_avx2_blocks:
 .Lpoly1305_blocks_avx2_local:
 	vzeroupper
@@ -717,12 +725,12 @@ _gcry_poly1305_amd64_avx2_blocks:
 	leave
 	addq $8, %rax
 	ret
-.size _gcry_poly1305_amd64_avx2_blocks,.-_gcry_poly1305_amd64_avx2_blocks;
+ELF(.size _gcry_poly1305_amd64_avx2_blocks,.-_gcry_poly1305_amd64_avx2_blocks;)
 
 
 .align 8
 .globl _gcry_poly1305_amd64_avx2_finish_ext
-.type  _gcry_poly1305_amd64_avx2_finish_ext,@function;
+ELF(.type  _gcry_poly1305_amd64_avx2_finish_ext,@function;)
 _gcry_poly1305_amd64_avx2_finish_ext:
 .Lpoly1305_finish_ext_avx2_local:
 	vzeroupper
@@ -949,6 +957,6 @@ _gcry_poly1305_amd64_avx2_finish_ext:
 	popq %rbp
 	addq $(8*5), %rax
 ret
-.size _gcry_poly1305_amd64_avx2_finish_ext,.-_gcry_poly1305_amd64_avx2_finish_ext;
+ELF(.size _gcry_poly1305_amd64_avx2_finish_ext,.-_gcry_poly1305_amd64_avx2_finish_ext;)
 
 #endif
diff --git a/cipher/poly1305-internal.h b/cipher/poly1305-internal.h
index dfc0c04..bcbe5df 100644
--- a/cipher/poly1305-internal.h
+++ b/cipher/poly1305-internal.h
@@ -44,24 +44,30 @@
 #define POLY1305_REF_ALIGNMENT sizeof(void *)
 
 
+#undef POLY1305_SYSV_FUNC_ABI
+
 /* POLY1305_USE_SSE2 indicates whether to compile with AMD64 SSE2 code. */
 #undef POLY1305_USE_SSE2
-#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)
+#if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
 # define POLY1305_USE_SSE2 1
 # define POLY1305_SSE2_BLOCKSIZE 32
 # define POLY1305_SSE2_STATESIZE 248
 # define POLY1305_SSE2_ALIGNMENT 16
+# define POLY1305_SYSV_FUNC_ABI 1
 #endif
 
 
 /* POLY1305_USE_AVX2 indicates whether to compile with AMD64 AVX2 code. */
 #undef POLY1305_USE_AVX2
-#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \
+#if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
     defined(ENABLE_AVX2_SUPPORT)
 # define POLY1305_USE_AVX2 1
 # define POLY1305_AVX2_BLOCKSIZE 64
 # define POLY1305_AVX2_STATESIZE 328
 # define POLY1305_AVX2_ALIGNMENT 32
+# define POLY1305_SYSV_FUNC_ABI 1
 #endif
 
 
@@ -112,6 +118,17 @@
 #endif
 
 
+/* Assembly implementations use SystemV ABI, ABI conversion and additional
+ * stack to store XMM6-XMM15 needed on Win64. */
+#undef OPS_FUNC_ABI
+#if defined(POLY1305_SYSV_FUNC_ABI) && \
+    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)
+# define OPS_FUNC_ABI __attribute__((sysv_abi))
+#else
+# define OPS_FUNC_ABI
+#endif
+
+
 typedef struct poly1305_key_s
 {
   byte b[POLY1305_KEYLEN];
@@ -121,10 +138,10 @@ typedef struct poly1305_key_s
 typedef struct poly1305_ops_s
 {
   size_t block_size;
-  void (*init_ext) (void *ctx, const poly1305_key_t * key);
-  unsigned int (*blocks) (void *ctx, const byte * m, size_t bytes);
+  void (*init_ext) (void *ctx, const poly1305_key_t * key) OPS_FUNC_ABI;
+  unsigned int (*blocks) (void *ctx, const byte * m, size_t bytes) OPS_FUNC_ABI;
   unsigned int (*finish_ext) (void *ctx, const byte * m, size_t remaining,
-			      byte mac[POLY1305_TAGLEN]);
+			      byte mac[POLY1305_TAGLEN]) OPS_FUNC_ABI;
 } poly1305_ops_t;
 
 
diff --git a/cipher/poly1305-sse2-amd64.S b/cipher/poly1305-sse2-amd64.S
index 106b119..219eb07 100644
--- a/cipher/poly1305-sse2-amd64.S
+++ b/cipher/poly1305-sse2-amd64.S
@@ -25,14 +25,22 @@
 
 #include <config.h>
 
-#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)
+#if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+
+#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
+# define ELF(...) __VA_ARGS__
+#else
+# define ELF(...) /*_*/
+#endif
+
 
 .text
 
 
 .align 8
 .globl _gcry_poly1305_amd64_sse2_init_ext
-.type  _gcry_poly1305_amd64_sse2_init_ext,@function;
+ELF(.type  _gcry_poly1305_amd64_sse2_init_ext,@function;)
 _gcry_poly1305_amd64_sse2_init_ext:
 .Lpoly1305_init_ext_x86_local:
 	xor %edx, %edx
@@ -273,12 +281,12 @@ _gcry_poly1305_amd64_sse2_init_ext:
 	popq %r13
 	popq %r12
 	ret
-.size _gcry_poly1305_amd64_sse2_init_ext,.-_gcry_poly1305_amd64_sse2_init_ext;
+ELF(.size _gcry_poly1305_amd64_sse2_init_ext,.-_gcry_poly1305_amd64_sse2_init_ext;)
 
 
 .align 8
 .globl _gcry_poly1305_amd64_sse2_finish_ext
-.type  _gcry_poly1305_amd64_sse2_finish_ext,@function;
+ELF(.type  _gcry_poly1305_amd64_sse2_finish_ext,@function;)
 _gcry_poly1305_amd64_sse2_finish_ext:
 .Lpoly1305_finish_ext_x86_local:
 	pushq %rbp
@@ -424,12 +432,12 @@ _gcry_poly1305_amd64_sse2_finish_ext:
 	popq %rbp
 	addq $8, %rax
 	ret
-.size _gcry_poly1305_amd64_sse2_finish_ext,.-_gcry_poly1305_amd64_sse2_finish_ext;
+ELF(.size _gcry_poly1305_amd64_sse2_finish_ext,.-_gcry_poly1305_amd64_sse2_finish_ext;)
 
 
 .align 8
 .globl _gcry_poly1305_amd64_sse2_blocks
-.type  _gcry_poly1305_amd64_sse2_blocks,@function;
+ELF(.type  _gcry_poly1305_amd64_sse2_blocks,@function;)
 _gcry_poly1305_amd64_sse2_blocks:
 .Lpoly1305_blocks_x86_local:
 	pushq %rbp
@@ -1030,6 +1038,6 @@ _gcry_poly1305_amd64_sse2_blocks:
 	pxor %xmm8, %xmm8
 	pxor %xmm0, %xmm0
 	ret
-.size _gcry_poly1305_amd64_sse2_blocks,.-_gcry_poly1305_amd64_sse2_blocks;
+ELF(.size _gcry_poly1305_amd64_sse2_blocks,.-_gcry_poly1305_amd64_sse2_blocks;)
 
 #endif
diff --git a/cipher/poly1305.c b/cipher/poly1305.c
index 28dbbf8..1adf0e7 100644
--- a/cipher/poly1305.c
+++ b/cipher/poly1305.c
@@ -40,12 +40,13 @@ static const char *selftest (void);
 
 #ifdef POLY1305_USE_SSE2
 
-void _gcry_poly1305_amd64_sse2_init_ext(void *state, const poly1305_key_t *key);
+void _gcry_poly1305_amd64_sse2_init_ext(void *state, const poly1305_key_t *key)
+                                       OPS_FUNC_ABI;
 unsigned int _gcry_poly1305_amd64_sse2_finish_ext(void *state, const byte *m,
 						  size_t remaining,
-						  byte mac[16]);
+						  byte mac[16]) OPS_FUNC_ABI;
 unsigned int _gcry_poly1305_amd64_sse2_blocks(void *ctx, const byte *m,
-					      size_t bytes);
+					      size_t bytes) OPS_FUNC_ABI;
 
 static const poly1305_ops_t poly1305_amd64_sse2_ops = {
   POLY1305_SSE2_BLOCKSIZE,
@@ -59,12 +60,13 @@ static const poly1305_ops_t poly1305_amd64_sse2_ops = {
 
 #ifdef POLY1305_USE_AVX2
 
-void _gcry_poly1305_amd64_avx2_init_ext(void *state, const poly1305_key_t *key);
+void _gcry_poly1305_amd64_avx2_init_ext(void *state, const poly1305_key_t *key)
+                                       OPS_FUNC_ABI;
 unsigned int _gcry_poly1305_amd64_avx2_finish_ext(void *state, const byte *m,
 						  size_t remaining,
-						  byte mac[16]);
+						  byte mac[16]) OPS_FUNC_ABI;
 unsigned int _gcry_poly1305_amd64_avx2_blocks(void *ctx, const byte *m,
-					      size_t bytes);
+					      size_t bytes) OPS_FUNC_ABI;
 
 static const poly1305_ops_t poly1305_amd64_avx2_ops = {
   POLY1305_AVX2_BLOCKSIZE,
@@ -78,12 +80,13 @@ static const poly1305_ops_t poly1305_amd64_avx2_ops = {
 
 #ifdef POLY1305_USE_NEON
 
-void _gcry_poly1305_armv7_neon_init_ext(void *state, const poly1305_key_t *key);
+void _gcry_poly1305_armv7_neon_init_ext(void *state, const poly1305_key_t *key)
+                                       OPS_FUNC_ABI;
 unsigned int _gcry_poly1305_armv7_neon_finish_ext(void *state, const byte *m,
 						  size_t remaining,
-						  byte mac[16]);
+						  byte mac[16]) OPS_FUNC_ABI;
 unsigned int _gcry_poly1305_armv7_neon_blocks(void *ctx, const byte *m,
-					      size_t bytes);
+					      size_t bytes) OPS_FUNC_ABI;
 
 static const poly1305_ops_t poly1305_armv7_neon_ops = {
   POLY1305_NEON_BLOCKSIZE,
@@ -110,7 +113,7 @@ typedef struct poly1305_state_ref32_s
 } poly1305_state_ref32_t;
 
 
-static void
+static OPS_FUNC_ABI void
 poly1305_init_ext_ref32 (void *state, const poly1305_key_t * key)
 {
   poly1305_state_ref32_t *st = (poly1305_state_ref32_t *) state;
@@ -142,7 +145,7 @@ poly1305_init_ext_ref32 (void *state, const poly1305_key_t * key)
 }
 
 
-static unsigned int
+static OPS_FUNC_ABI unsigned int
 poly1305_blocks_ref32 (void *state, const byte * m, size_t bytes)
 {
   poly1305_state_ref32_t *st = (poly1305_state_ref32_t *) state;
@@ -230,7 +233,7 @@ poly1305_blocks_ref32 (void *state, const byte * m, size_t bytes)
 }
 
 
-static unsigned int
+static OPS_FUNC_ABI unsigned int
 poly1305_finish_ext_ref32 (void *state, const byte * m,
 			   size_t remaining, byte mac[POLY1305_TAGLEN])
 {
@@ -370,7 +373,7 @@ typedef struct poly1305_state_ref8_t
 } poly1305_state_ref8_t;
 
 
-static void
+static OPS_FUNC_ABI void
 poly1305_init_ext_ref8 (void *state, const poly1305_key_t * key)
 {
   poly1305_state_ref8_t *st = (poly1305_state_ref8_t *) state;
@@ -471,7 +474,7 @@ poly1305_freeze_ref8 (byte h[17])
 }
 
 
-static unsigned int
+static OPS_FUNC_ABI unsigned int
 poly1305_blocks_ref8 (void *state, const byte * m, size_t bytes)
 {
   poly1305_state_ref8_t *st = (poly1305_state_ref8_t *) state;
@@ -519,7 +522,7 @@ poly1305_blocks_ref8 (void *state, const byte * m, size_t bytes)
 }
 
 
-static unsigned int
+static OPS_FUNC_ABI unsigned int
 poly1305_finish_ext_ref8 (void *state, const byte * m, size_t remaining,
 			  byte mac[POLY1305_TAGLEN])
 {

commit b65e9e71d5ee992db5c96793c6af999545daad28
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date:   Thu May 14 10:31:18 2015 +0300

    Enable AMD64 3DES implementation on WIN64
    
    * cipher/des-amd64.S: Enable when
    HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS defined.
    (ELF): New macro to mask lines with ELF specific commands.
    * cipher/des.c (USE_AMD64_ASM): Enable when
    HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS defined.
    [HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS] (call_sysv_fn): New.
    (tripledes_ecb_crypt) [HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS]: Call
    assembly function through 'call_sysv_fn'.
    (tripledes_amd64_ctr_enc, tripledes_amd64_cbc_dec)
    (tripledes_amd64_cfb_dec): New wrapper functions for bulk
    assembly functions.
    --
    
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>

diff --git a/cipher/des-amd64.S b/cipher/des-amd64.S
index e8b2c56..307d211 100644
--- a/cipher/des-amd64.S
+++ b/cipher/des-amd64.S
@@ -20,7 +20,8 @@
 
 #ifdef __x86_64
 #include <config.h>
-#if defined(USE_DES) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)
+#if defined(USE_DES) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
 
 #ifdef __PIC__
 #  define RIP (%rip)
@@ -28,6 +29,12 @@
 #  define RIP
 #endif
 
+#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
+# define ELF(...) __VA_ARGS__
+#else
+# define ELF(...) /*_*/
+#endif
+
 .text
 
 #define s1 0
@@ -185,7 +192,7 @@
 
 .align 8
 .globl _gcry_3des_amd64_crypt_block
-.type  _gcry_3des_amd64_crypt_block,@function;
+ELF(.type  _gcry_3des_amd64_crypt_block,@function;)
 
 _gcry_3des_amd64_crypt_block:
 	/* input:
@@ -271,7 +278,7 @@ _gcry_3des_amd64_crypt_block:
 	popq %rbp;
 
 	ret;
-.size _gcry_3des_amd64_crypt_block,.-_gcry_3des_amd64_crypt_block;
+ELF(.size _gcry_3des_amd64_crypt_block,.-_gcry_3des_amd64_crypt_block;)
 
 /***********************************************************************
  * 3-way 3DES
@@ -458,7 +465,7 @@ _gcry_3des_amd64_crypt_block:
 	movl   right##d, 4(io);
 
 .align 8
-.type  _gcry_3des_amd64_crypt_blk3,@function;
+ELF(.type  _gcry_3des_amd64_crypt_blk3,@function;)
 _gcry_3des_amd64_crypt_blk3:
 	/* input:
 	 *  %rdi: round keys, CTX
@@ -528,11 +535,11 @@ _gcry_3des_amd64_crypt_blk3:
 	final_permutation3(RR, RL);
 
 	ret;
-.size _gcry_3des_amd64_crypt_blk3,.-_gcry_3des_amd64_crypt_blk3;
+ELF(.size _gcry_3des_amd64_crypt_blk3,.-_gcry_3des_amd64_crypt_blk3;)
 
 .align 8
 .globl  _gcry_3des_amd64_cbc_dec
-.type   _gcry_3des_amd64_cbc_dec,@function;
+ELF(.type   _gcry_3des_amd64_cbc_dec,@function;)
 _gcry_3des_amd64_cbc_dec:
 	/* input:
 	 *	%rdi: ctx, CTX
@@ -604,11 +611,11 @@ _gcry_3des_amd64_cbc_dec:
 	popq %rbp;
 
 	ret;
-.size _gcry_3des_amd64_cbc_dec,.-_gcry_3des_amd64_cbc_dec;
+ELF(.size _gcry_3des_amd64_cbc_dec,.-_gcry_3des_amd64_cbc_dec;)
 
 .align 8
 .globl  _gcry_3des_amd64_ctr_enc
-.type   _gcry_3des_amd64_ctr_enc,@function;
+ELF(.type   _gcry_3des_amd64_ctr_enc,@function;)
 _gcry_3des_amd64_ctr_enc:
 	/* input:
 	 *	%rdi: ctx, CTX
@@ -682,11 +689,11 @@ _gcry_3des_amd64_ctr_enc:
 	popq %rbp;
 
 	ret;
-.size _gcry_3des_amd64_cbc_dec,.-_gcry_3des_amd64_cbc_dec;
+ELF(.size _gcry_3des_amd64_ctr_enc,.-_gcry_3des_amd64_ctr_enc;)
 
 .align 8
 .globl  _gcry_3des_amd64_cfb_dec
-.type   _gcry_3des_amd64_cfb_dec,@function;
+ELF(.type   _gcry_3des_amd64_cfb_dec,@function;)
 _gcry_3des_amd64_cfb_dec:
 	/* input:
 	 *	%rdi: ctx, CTX
@@ -757,7 +764,7 @@ _gcry_3des_amd64_cfb_dec:
 	popq %rbx;
 	popq %rbp;
 	ret;
-.size _gcry_3des_amd64_cfb_dec,.-_gcry_3des_amd64_cfb_dec;
+ELF(.size _gcry_3des_amd64_cfb_dec,.-_gcry_3des_amd64_cfb_dec;)
 
 .data
 .align 16
diff --git a/cipher/des.c b/cipher/des.c
index d4863d1..be62763 100644
--- a/cipher/des.c
+++ b/cipher/des.c
@@ -127,7 +127,8 @@
 
 /* USE_AMD64_ASM indicates whether to use AMD64 assembly code. */
 #undef USE_AMD64_ASM
-#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)
+#if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
 # define USE_AMD64_ASM 1
 #endif
 
@@ -771,6 +772,24 @@ extern void _gcry_3des_amd64_cfb_dec(const void *keys, byte *out,
 
 #define TRIPLEDES_ECB_BURN_STACK (8 * sizeof(void *))
 
+#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+static inline void
+call_sysv_fn (const void *fn, const void *arg1, const void *arg2,
+              const void *arg3, const void *arg4)
+{
+  /* Call SystemV ABI function without storing non-volatile XMM registers,
+   * as target function does not use vector instruction sets. */
+  asm volatile ("callq *%0\n\t"
+                : "+a" (fn),
+                  "+D" (arg1),
+                  "+S" (arg2),
+                  "+d" (arg3),
+                  "+c" (arg4)
+                :
+                : "cc", "memory", "r8", "r9", "r10", "r11");
+}
+#endif
+
 /*
  * Electronic Codebook Mode Triple-DES encryption/decryption of data
  * according to 'mode'.  Sometimes this mode is named 'EDE' mode
@@ -784,11 +803,45 @@ tripledes_ecb_crypt (struct _tripledes_ctx *ctx, const byte * from,
 
   keys = mode ? ctx->decrypt_subkeys : ctx->encrypt_subkeys;
 
+#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+  call_sysv_fn (_gcry_3des_amd64_crypt_block, keys, to, from, NULL);
+#else
   _gcry_3des_amd64_crypt_block(keys, to, from);
+#endif
 
   return 0;
 }
 
+static inline void
+tripledes_amd64_ctr_enc(const void *keys, byte *out, const byte *in, byte *ctr)
+{
+#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+  call_sysv_fn (_gcry_3des_amd64_ctr_enc, keys, out, in, ctr);
+#else
+  _gcry_3des_amd64_ctr_enc(keys, out, in, ctr);
+#endif
+}
+
+static inline void
+tripledes_amd64_cbc_dec(const void *keys, byte *out, const byte *in, byte *iv)
+{
+#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+  call_sysv_fn (_gcry_3des_amd64_cbc_dec, keys, out, in, iv);
+#else
+  _gcry_3des_amd64_cbc_dec(keys, out, in, iv);
+#endif
+}
+
+static inline void
+tripledes_amd64_cfb_dec(const void *keys, byte *out, const byte *in, byte *iv)
+{
+#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+  call_sysv_fn (_gcry_3des_amd64_cfb_dec, keys, out, in, iv);
+#else
+  _gcry_3des_amd64_cfb_dec(keys, out, in, iv);
+#endif
+}
+
 #else /*USE_AMD64_ASM*/
 
 #define TRIPLEDES_ECB_BURN_STACK 32
@@ -871,7 +924,7 @@ _gcry_3des_ctr_enc(void *context, unsigned char *ctr, void *outbuf_arg,
     /* Process data in 3 block chunks. */
     while (nblocks >= 3)
       {
-        _gcry_3des_amd64_ctr_enc(ctx->encrypt_subkeys, outbuf, inbuf, ctr);
+        tripledes_amd64_ctr_enc(ctx->encrypt_subkeys, outbuf, inbuf, ctr);
 
         nblocks -= 3;
         outbuf += 3 * DES_BLOCKSIZE;
@@ -926,7 +979,7 @@ _gcry_3des_cbc_dec(void *context, unsigned char *iv, void *outbuf_arg,
     /* Process data in 3 block chunks. */
     while (nblocks >= 3)
       {
-        _gcry_3des_amd64_cbc_dec(ctx->decrypt_subkeys, outbuf, inbuf, iv);
+        tripledes_amd64_cbc_dec(ctx->decrypt_subkeys, outbuf, inbuf, iv);
 
         nblocks -= 3;
         outbuf += 3 * DES_BLOCKSIZE;
@@ -974,7 +1027,7 @@ _gcry_3des_cfb_dec(void *context, unsigned char *iv, void *outbuf_arg,
     /* Process data in 3 block chunks. */
     while (nblocks >= 3)
       {
-        _gcry_3des_amd64_cfb_dec(ctx->encrypt_subkeys, outbuf, inbuf, iv);
+        tripledes_amd64_cfb_dec(ctx->encrypt_subkeys, outbuf, inbuf, iv);
 
         nblocks -= 3;
         outbuf += 3 * DES_BLOCKSIZE;

commit 9597cfddf03c467825da152be5ca0d12a8c30d88
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date:   Tue May 5 21:02:43 2015 +0300

    Enable AMD64 ChaCha20 implementations on WIN64
    
    * cipher/chacha20-avx2-amd64.S: Enable when
    HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS defined.
    (ELF): New macro to mask lines with ELF specific commands.
    * cipher/chacha20-sse2-amd64.S: Enable when
    HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS defined.
    (ELF): New macro to mask lines with ELF specific commands.
    * cipher/chacha20-ssse3-amd64.S: Enable when
    HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS defined.
    (ELF): New macro to mask lines with ELF specific commands.
    * cipher/chacha20.c (USE_SSE2, USE_SSSE3, USE_AVX2): Enable when
    HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS defined.
    (ASM_FUNC_ABI, ASM_EXTRA_STACK): New.
    (chacha20_blocks_t, _gcry_chacha20_amd64_sse2_blocks)
    (_gcry_chacha20_amd64_ssse3_blocks, _gcry_chacha20_amd64_avx2_blocks)
    (_gcry_chacha20_armv7_neon_blocks, chacha20_blocks): Add ASM_FUNC_ABI.
    (chacha20_core): Add ASM_EXTRA_STACK.
    --
    
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>

diff --git a/cipher/chacha20-avx2-amd64.S b/cipher/chacha20-avx2-amd64.S
index 1f33de8..12bed35 100644
--- a/cipher/chacha20-avx2-amd64.S
+++ b/cipher/chacha20-avx2-amd64.S
@@ -26,7 +26,8 @@
 #ifdef __x86_64__
 #include <config.h>
 
-#if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
     defined(ENABLE_AVX2_SUPPORT) && USE_CHACHA20
 
 #ifdef __PIC__
@@ -35,11 +36,17 @@
 #  define RIP
 #endif
 
+#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
+# define ELF(...) __VA_ARGS__
+#else
+# define ELF(...) /*_*/
+#endif
+
 .text
 
 .align 8
 .globl _gcry_chacha20_amd64_avx2_blocks
-.type  _gcry_chacha20_amd64_avx2_blocks,@function;
+ELF(.type  _gcry_chacha20_amd64_avx2_blocks,@function;)
 _gcry_chacha20_amd64_avx2_blocks:
 .Lchacha_blocks_avx2_local:
 	vzeroupper
@@ -938,7 +945,7 @@ _gcry_chacha20_amd64_avx2_blocks:
 	vzeroall
 	movl $(63 + 512), %eax
 	ret
-.size _gcry_chacha20_amd64_avx2_blocks,.-_gcry_chacha20_amd64_avx2_blocks;
+ELF(.size _gcry_chacha20_amd64_avx2_blocks,.-_gcry_chacha20_amd64_avx2_blocks;)
 
 .data
 .align 16
diff --git a/cipher/chacha20-sse2-amd64.S b/cipher/chacha20-sse2-amd64.S
index 4811f40..2b9842c 100644
--- a/cipher/chacha20-sse2-amd64.S
+++ b/cipher/chacha20-sse2-amd64.S
@@ -26,13 +26,20 @@
 #ifdef __x86_64__
 #include <config.h>
 
-#if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && USE_CHACHA20
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && USE_CHACHA20
+
+#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
+# define ELF(...) __VA_ARGS__
+#else
+# define ELF(...) /*_*/
+#endif
 
 .text
 
 .align 8
 .globl _gcry_chacha20_amd64_sse2_blocks
-.type  _gcry_chacha20_amd64_sse2_blocks,@function;
+ELF(.type  _gcry_chacha20_amd64_sse2_blocks,@function;)
 _gcry_chacha20_amd64_sse2_blocks:
 .Lchacha_blocks_sse2_local:
 	pushq %rbx
@@ -646,7 +653,7 @@ _gcry_chacha20_amd64_sse2_blocks:
 	pxor %xmm8, %xmm8
 	pxor %xmm0, %xmm0
 	ret
-.size _gcry_chacha20_amd64_sse2_blocks,.-_gcry_chacha20_amd64_sse2_blocks;
+ELF(.size _gcry_chacha20_amd64_sse2_blocks,.-_gcry_chacha20_amd64_sse2_blocks;)
 
 #endif /*defined(USE_CHACHA20)*/
 #endif /*__x86_64*/
diff --git a/cipher/chacha20-ssse3-amd64.S b/cipher/chacha20-ssse3-amd64.S
index 50c2ff8..a1a843f 100644
--- a/cipher/chacha20-ssse3-amd64.S
+++ b/cipher/chacha20-ssse3-amd64.S
@@ -26,7 +26,8 @@
 #ifdef __x86_64__
 #include <config.h>
 
-#if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
     defined(HAVE_GCC_INLINE_ASM_SSSE3) && USE_CHACHA20
 
 #ifdef __PIC__
@@ -35,11 +36,17 @@
 #  define RIP
 #endif
 
+#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
+# define ELF(...) __VA_ARGS__
+#else
+# define ELF(...) /*_*/
+#endif
+
 .text
 
 .align 8
 .globl _gcry_chacha20_amd64_ssse3_blocks
-.type  _gcry_chacha20_amd64_ssse3_blocks,@function;
+ELF(.type  _gcry_chacha20_amd64_ssse3_blocks,@function;)
 _gcry_chacha20_amd64_ssse3_blocks:
 .Lchacha_blocks_ssse3_local:
 	pushq %rbx
@@ -614,7 +621,7 @@ _gcry_chacha20_amd64_ssse3_blocks:
 	pxor %xmm8, %xmm8
 	pxor %xmm0, %xmm0
 	ret
-.size _gcry_chacha20_amd64_ssse3_blocks,.-_gcry_chacha20_amd64_ssse3_blocks;
+ELF(.size _gcry_chacha20_amd64_ssse3_blocks,.-_gcry_chacha20_amd64_ssse3_blocks;)
 
 .data
 .align 16;
diff --git a/cipher/chacha20.c b/cipher/chacha20.c
index 2eaeffd..e25e239 100644
--- a/cipher/chacha20.c
+++ b/cipher/chacha20.c
@@ -50,20 +50,23 @@
 
 /* USE_SSE2 indicates whether to compile with Intel SSE2 code. */
 #undef USE_SSE2
-#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)
+#if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
 # define USE_SSE2 1
 #endif
 
 /* USE_SSSE3 indicates whether to compile with Intel SSSE3 code. */
 #undef USE_SSSE3
-#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \
+#if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
     defined(HAVE_GCC_INLINE_ASM_SSSE3)
 # define USE_SSSE3 1
 #endif
 
 /* USE_AVX2 indicates whether to compile with Intel AVX2 code. */
 #undef USE_AVX2
-#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \
+#if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
     defined(ENABLE_AVX2_SUPPORT)
 # define USE_AVX2 1
 #endif
@@ -82,8 +85,23 @@
 struct CHACHA20_context_s;
 
 
+/* Assembly implementations use SystemV ABI, ABI conversion and additional
+ * stack to store XMM6-XMM15 needed on Win64. */
+#undef ASM_FUNC_ABI
+#undef ASM_EXTRA_STACK
+#if (defined(USE_SSE2) || defined(USE_SSSE3) || defined(USE_AVX2)) && \
+    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)
+# define ASM_FUNC_ABI __attribute__((sysv_abi))
+# define ASM_EXTRA_STACK (10 * 16)
+#else
+# define ASM_FUNC_ABI
+# define ASM_EXTRA_STACK 0
+#endif
+
+
 typedef unsigned int (* chacha20_blocks_t)(u32 *state, const byte *src,
-                                           byte *dst, size_t bytes);
+                                           byte *dst,
+                                           size_t bytes) ASM_FUNC_ABI;
 
 typedef struct CHACHA20_context_s
 {
@@ -97,28 +115,32 @@ typedef struct CHACHA20_context_s
 #ifdef USE_SSE2
 
 unsigned int _gcry_chacha20_amd64_sse2_blocks(u32 *state, const byte *in,
-                                              byte *out, size_t bytes);
+                                              byte *out,
+                                              size_t bytes) ASM_FUNC_ABI;
 
 #endif /* USE_SSE2 */
 
 #ifdef USE_SSSE3
 
 unsigned int _gcry_chacha20_amd64_ssse3_blocks(u32 *state, const byte *in,
-                                               byte *out, size_t bytes);
+                                               byte *out,
+                                               size_t bytes) ASM_FUNC_ABI;
 
 #endif /* USE_SSSE3 */
 
 #ifdef USE_AVX2
 
 unsigned int _gcry_chacha20_amd64_avx2_blocks(u32 *state, const byte *in,
-                                              byte *out, size_t bytes);
+                                              byte *out,
+                                              size_t bytes) ASM_FUNC_ABI;
 
 #endif /* USE_AVX2 */
 
 #ifdef USE_NEON
 
 unsigned int _gcry_chacha20_armv7_neon_blocks(u32 *state, const byte *in,
-                                              byte *out, size_t bytes);
+                                              byte *out,
+                                              size_t bytes) ASM_FUNC_ABI;
 
 #endif /* USE_NEON */
 
@@ -141,7 +163,7 @@ static const char *selftest (void);
 
 
 #ifndef USE_SSE2
-static unsigned int
+ASM_FUNC_ABI static unsigned int
 chacha20_blocks (u32 *state, const byte *src, byte *dst, size_t bytes)
 {
   u32 pad[CHACHA20_INPUT_LENGTH];
@@ -269,7 +291,8 @@ chacha20_blocks (u32 *state, const byte *src, byte *dst, size_t bytes)
 static unsigned int
 chacha20_core(u32 *dst, struct CHACHA20_context_s *ctx)
 {
-  return ctx->blocks(ctx->input, NULL, (byte *)dst, CHACHA20_BLOCK_SIZE);
+  return ctx->blocks(ctx->input, NULL, (byte *)dst, CHACHA20_BLOCK_SIZE)
+         + ASM_EXTRA_STACK;
 }
 
 

commit 6a6646df80386204675d8b149ab60e74d7ca124c
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date:   Tue May 5 20:46:10 2015 +0300

    Enable AMD64 CAST5 implementation on WIN64
    
    * cipher/cast5-amd64.S: Enable when
    HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS defined.
    (RIP): Remove.
    (GET_EXTERN_POINTER): Use 'leaq' version on WIN64.
    (ELF): New macro to mask lines with ELF specific commands.
    * cipher/cast5.c (USE_AMD64_ASM): Enable when
    HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS defined.
    [HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS] (call_sysv_fn): New.
    (do_encrypt_block, do_decrypt_block)
    [HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS]: Call assembly
    function through 'call_sysv_fn'.
    (cast5_amd64_ctr_enc, cast5_amd64_cbc_dec)
    (cast5_amd64_cfb_dec): New wrapper functions for bulk
    assembly functions.
    --
    
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>

diff --git a/cipher/cast5-amd64.S b/cipher/cast5-amd64.S
index 41fbb74..a5f078e 100644
--- a/cipher/cast5-amd64.S
+++ b/cipher/cast5-amd64.S
@@ -20,14 +20,19 @@
 
 #ifdef __x86_64
 #include <config.h>
-#if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && defined(USE_CAST5)
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && defined(USE_CAST5)
 
-#ifdef __PIC__
-#  define RIP %rip
+#if defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS) || !defined(__PIC__)
+#  define GET_EXTERN_POINTER(name, reg) leaq name, reg
+#else
 #  define GET_EXTERN_POINTER(name, reg) movq name@GOTPCREL(%rip), reg
+#endif
+
+#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
+# define ELF(...) __VA_ARGS__
 #else
-#  define RIP
-#  define GET_EXTERN_POINTER(name, reg) leaq name, reg
+# define ELF(...) /*_*/
 #endif
 
 .text
@@ -180,7 +185,7 @@
 
 .align 8
 .globl _gcry_cast5_amd64_encrypt_block
-.type   _gcry_cast5_amd64_encrypt_block,@function;
+ELF(.type   _gcry_cast5_amd64_encrypt_block,@function;)
 
 _gcry_cast5_amd64_encrypt_block:
 	/* input:
@@ -216,11 +221,11 @@ _gcry_cast5_amd64_encrypt_block:
 	popq %rbx;
 	popq %rbp;
 	ret;
-.size _gcry_cast5_amd64_encrypt_block,.-_gcry_cast5_amd64_encrypt_block;
+ELF(.size _gcry_cast5_amd64_encrypt_block,.-_gcry_cast5_amd64_encrypt_block;)
 
 .align 8
 .globl _gcry_cast5_amd64_decrypt_block
-.type   _gcry_cast5_amd64_decrypt_block,@function;
+ELF(.type   _gcry_cast5_amd64_decrypt_block,@function;)
 
 _gcry_cast5_amd64_decrypt_block:
 	/* input:
@@ -256,7 +261,7 @@ _gcry_cast5_amd64_decrypt_block:
 	popq %rbx;
 	popq %rbp;
 	ret;
-.size _gcry_cast5_amd64_decrypt_block,.-_gcry_cast5_amd64_decrypt_block;
+ELF(.size _gcry_cast5_amd64_decrypt_block,.-_gcry_cast5_amd64_decrypt_block;)
 
 /**********************************************************************
   4-way cast5, four blocks parallel
@@ -359,7 +364,7 @@ _gcry_cast5_amd64_decrypt_block:
 	rorq $32,		d;
 
 .align 8
-.type   __cast5_enc_blk4,@function;
+ELF(.type   __cast5_enc_blk4,@function;)
 
 __cast5_enc_blk4:
 	/* input:
@@ -384,10 +389,10 @@ __cast5_enc_blk4:
 
 	outbswap_block4(RLR0, RLR1, RLR2, RLR3);
 	ret;
-.size __cast5_enc_blk4,.-__cast5_enc_blk4;
+ELF(.size __cast5_enc_blk4,.-__cast5_enc_blk4;)
 
 .align 8
-.type   __cast5_dec_blk4,@function;
+ELF(.type   __cast5_dec_blk4,@function;)
 
 __cast5_dec_blk4:
 	/* input:
@@ -414,11 +419,11 @@ __cast5_dec_blk4:
 
 	outbswap_block4(RLR0, RLR1, RLR2, RLR3);
 	ret;
-.size __cast5_dec_blk4,.-__cast5_dec_blk4;
+ELF(.size __cast5_dec_blk4,.-__cast5_dec_blk4;)
 
 .align 8
 .globl _gcry_cast5_amd64_ctr_enc
-.type   _gcry_cast5_amd64_ctr_enc,@function;
+ELF(.type   _gcry_cast5_amd64_ctr_enc,@function;)
 _gcry_cast5_amd64_ctr_enc:
 	/* input:
 	 *	%rdi: ctx, CTX
@@ -472,11 +477,11 @@ _gcry_cast5_amd64_ctr_enc:
 	popq %rbx;
 	popq %rbp;
 	ret
-.size _gcry_cast5_amd64_ctr_enc,.-_gcry_cast5_amd64_ctr_enc;
+ELF(.size _gcry_cast5_amd64_ctr_enc,.-_gcry_cast5_amd64_ctr_enc;)
 
 .align 8
 .globl _gcry_cast5_amd64_cbc_dec
-.type   _gcry_cast5_amd64_cbc_dec,@function;
+ELF(.type   _gcry_cast5_amd64_cbc_dec,@function;)
 _gcry_cast5_amd64_cbc_dec:
 	/* input:
 	 *	%rdi: ctx, CTX
@@ -526,11 +531,11 @@ _gcry_cast5_amd64_cbc_dec:
 	popq %rbp;
 	ret;
 
-.size _gcry_cast5_amd64_cbc_dec,.-_gcry_cast5_amd64_cbc_dec;
+ELF(.size _gcry_cast5_amd64_cbc_dec,.-_gcry_cast5_amd64_cbc_dec;)
 
 .align 8
 .globl _gcry_cast5_amd64_cfb_dec
-.type   _gcry_cast5_amd64_cfb_dec,@function;
+ELF(.type   _gcry_cast5_amd64_cfb_dec,@function;)
 _gcry_cast5_amd64_cfb_dec:
 	/* input:
 	 *	%rdi: ctx, CTX
@@ -581,7 +586,7 @@ _gcry_cast5_amd64_cfb_dec:
 	popq %rbp;
 	ret;
 
-.size _gcry_cast5_amd64_cfb_dec,.-_gcry_cast5_amd64_cfb_dec;
+ELF(.size _gcry_cast5_amd64_cfb_dec,.-_gcry_cast5_amd64_cfb_dec;)
 
 #endif /*defined(USE_CAST5)*/
 #endif /*__x86_64*/
diff --git a/cipher/cast5.c b/cipher/cast5.c
index 115e1e6..94dcee7 100644
--- a/cipher/cast5.c
+++ b/cipher/cast5.c
@@ -48,7 +48,8 @@
 
 /* USE_AMD64_ASM indicates whether to use AMD64 assembly code. */
 #undef USE_AMD64_ASM
-#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)
+#if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
 # define USE_AMD64_ASM 1
 #endif
 
@@ -372,16 +373,72 @@ extern void _gcry_cast5_amd64_cbc_dec(CAST5_context *ctx, byte *out,
 extern void _gcry_cast5_amd64_cfb_dec(CAST5_context *ctx, byte *out,
 				      const byte *in, byte *iv);
 
+#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+static inline void
+call_sysv_fn (const void *fn, const void *arg1, const void *arg2,
+              const void *arg3, const void *arg4)
+{
+  /* Call SystemV ABI function without storing non-volatile XMM registers,
+   * as target function does not use vector instruction sets. */
+  asm volatile ("callq *%0\n\t"
+                : "+a" (fn),
+                  "+D" (arg1),
+                  "+S" (arg2),
+                  "+d" (arg3),
+                  "+c" (arg4)
+                :
+                : "cc", "memory", "r8", "r9", "r10", "r11");
+}
+#endif
+
 static void
 do_encrypt_block (CAST5_context *context, byte *outbuf, const byte *inbuf)
 {
+#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+  call_sysv_fn (_gcry_cast5_amd64_encrypt_block, context, outbuf, inbuf, NULL);
+#else
   _gcry_cast5_amd64_encrypt_block (context, outbuf, inbuf);
+#endif
 }
 
 static void
 do_decrypt_block (CAST5_context *context, byte *outbuf, const byte *inbuf)
 {
+#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+  call_sysv_fn (_gcry_cast5_amd64_decrypt_block, context, outbuf, inbuf, NULL);
+#else
   _gcry_cast5_amd64_decrypt_block (context, outbuf, inbuf);
+#endif
+}
+
+static void
+cast5_amd64_ctr_enc(CAST5_context *ctx, byte *out, const byte *in, byte *ctr)
+{
+#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+  call_sysv_fn (_gcry_cast5_amd64_ctr_enc, ctx, out, in, ctr);
+#else
+  _gcry_cast5_amd64_ctr_enc (ctx, out, in, ctr);
+#endif
+}
+
+static void
+cast5_amd64_cbc_dec(CAST5_context *ctx, byte *out, const byte *in, byte *iv)
+{
+#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+  call_sysv_fn (_gcry_cast5_amd64_cbc_dec, ctx, out, in, iv);
+#else
+  _gcry_cast5_amd64_cbc_dec (ctx, out, in, iv);
+#endif
+}
+
+static void
+cast5_amd64_cfb_dec(CAST5_context *ctx, byte *out, const byte *in, byte *iv)
+{
+#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+  call_sysv_fn (_gcry_cast5_amd64_cfb_dec, ctx, out, in, iv);
+#else
+  _gcry_cast5_amd64_cfb_dec (ctx, out, in, iv);
+#endif
 }
 
 static unsigned int
@@ -396,7 +453,7 @@ static unsigned int
 decrypt_block (void *context, byte *outbuf, const byte *inbuf)
 {
   CAST5_context *c = (CAST5_context *) context;
-  _gcry_cast5_amd64_decrypt_block (c, outbuf, inbuf);
+  do_decrypt_block (c, outbuf, inbuf);
   return /*burn_stack*/ (2*8);
 }
 
@@ -582,7 +639,7 @@ _gcry_cast5_ctr_enc(void *context, unsigned char *ctr, void *outbuf_arg,
     /* Process data in 4 block chunks. */
     while (nblocks >= 4)
       {
-        _gcry_cast5_amd64_ctr_enc(ctx, outbuf, inbuf, ctr);
+        cast5_amd64_ctr_enc(ctx, outbuf, inbuf, ctr);
 
         nblocks -= 4;
         outbuf += 4 * CAST5_BLOCKSIZE;
@@ -651,7 +708,7 @@ _gcry_cast5_cbc_dec(void *context, unsigned char *iv, void *outbuf_arg,
     /* Process data in 4 block chunks. */
     while (nblocks >= 4)
       {
-        _gcry_cast5_amd64_cbc_dec(ctx, outbuf, inbuf, iv);
+        cast5_amd64_cbc_dec(ctx, outbuf, inbuf, iv);
 
         nblocks -= 4;
         outbuf += 4 * CAST5_BLOCKSIZE;
@@ -710,7 +767,7 @@ _gcry_cast5_cfb_dec(void *context, unsigned char *iv, void *outbuf_arg,
     /* Process data in 4 block chunks. */
     while (nblocks >= 4)
       {
-        _gcry_cast5_amd64_cfb_dec(ctx, outbuf, inbuf, iv);
+        cast5_amd64_cfb_dec(ctx, outbuf, inbuf, iv);
 
         nblocks -= 4;
         outbuf += 4 * CAST5_BLOCKSIZE;

commit 9a4fb3709864bf3e3918800d44ff576590cd4e92
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date:   Thu May 14 13:33:07 2015 +0300

    Enable AMD64 Camellia implementations on WIN64
    
    * cipher/camellia-aesni-avx-amd64.S: Enable when
    HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS defined.
    (ELF): New macro to mask lines with ELF specific commands.
    * cipher/camellia-aesni-avx2-amd64.S: Enable when
    HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS defined.
    (ELF): New macro to mask lines with ELF specific commands.
    * cipher/camellia-glue.c (USE_AESNI_AVX, USE_AESNI_AVX2): Enable when
    HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS defined.
    [USE_AESNI_AVX || USE_AESNI_AVX2] (ASM_FUNC_ABI, ASM_EXTRA_STACK): New.
    (_gcry_camellia_aesni_avx_ctr_enc, _gcry_camellia_aesni_avx_cbc_dec)
    (_gcry_camellia_aesni_avx_cfb_dec, _gcry_camellia_aesni_avx_keygen)
    (_gcry_camellia_aesni_avx2_ctr_enc, _gcry_camellia_aesni_avx2_cbc_dec)
    (_gcry_camellia_aesni_avx2_cfb_dec): Add ASM_FUNC_ABI.
    --
    
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>

diff --git a/cipher/camellia-aesni-avx-amd64.S b/cipher/camellia-aesni-avx-amd64.S
index 6d157a7..c047a21 100644
--- a/cipher/camellia-aesni-avx-amd64.S
+++ b/cipher/camellia-aesni-avx-amd64.S
@@ -20,7 +20,8 @@
 
 #ifdef __x86_64
 #include <config.h>
-#if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
     defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX_SUPPORT)
 
 #ifdef __PIC__
@@ -29,6 +30,12 @@
 #  define RIP
 #endif
 
+#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
+# define ELF(...) __VA_ARGS__
+#else
+# define ELF(...) /*_*/
+#endif
+
 #define CAMELLIA_TABLE_BYTE_LEN 272
 
 /* struct CAMELLIA_context: */
@@ -769,7 +776,7 @@
 .text
 
 .align 8
-.type   __camellia_enc_blk16,@function;
+ELF(.type   __camellia_enc_blk16,@function;)
 
 __camellia_enc_blk16:
 	/* input:
@@ -853,10 +860,10 @@ __camellia_enc_blk16:
 		     %xmm15, %rax, %rcx, 24);
 
 	jmp .Lenc_done;
-.size __camellia_enc_blk16,.-__camellia_enc_blk16;
+ELF(.size __camellia_enc_blk16,.-__camellia_enc_blk16;)
 
 .align 8
-.type   __camellia_dec_blk16,@function;
+ELF(.type   __camellia_dec_blk16,@function;)
 
 __camellia_dec_blk16:
 	/* input:
@@ -938,7 +945,7 @@ __camellia_dec_blk16:
 	      ((key_table + (24) * 8) + 4)(CTX));
 
 	jmp .Ldec_max24;
-.size __camellia_dec_blk16,.-__camellia_dec_blk16;
+ELF(.size __camellia_dec_blk16,.-__camellia_dec_blk16;)
 
 #define inc_le128(x, minus_one, tmp) \
 	vpcmpeqq minus_one, x, tmp; \
@@ -948,7 +955,7 @@ __camellia_dec_blk16:
 
 .align 8
 .globl _gcry_camellia_aesni_avx_ctr_enc
-.type   _gcry_camellia_aesni_avx_ctr_enc,@function;
+ELF(.type   _gcry_camellia_aesni_avx_ctr_enc,@function;)
 
 _gcry_camellia_aesni_avx_ctr_enc:
 	/* input:
@@ -1062,11 +1069,11 @@ _gcry_camellia_aesni_avx_ctr_enc:
 
 	leave;
 	ret;
-.size _gcry_camellia_aesni_avx_ctr_enc,.-_gcry_camellia_aesni_avx_ctr_enc;
+ELF(.size _gcry_camellia_aesni_avx_ctr_enc,.-_gcry_camellia_aesni_avx_ctr_enc;)
 
 .align 8
 .globl _gcry_camellia_aesni_avx_cbc_dec
-.type   _gcry_camellia_aesni_avx_cbc_dec,@function;
+ELF(.type   _gcry_camellia_aesni_avx_cbc_dec,@function;)
 
 _gcry_camellia_aesni_avx_cbc_dec:
 	/* input:
@@ -1130,11 +1137,11 @@ _gcry_camellia_aesni_avx_cbc_dec:
 
 	leave;
 	ret;
-.size _gcry_camellia_aesni_avx_cbc_dec,.-_gcry_camellia_aesni_avx_cbc_dec;
+ELF(.size _gcry_camellia_aesni_avx_cbc_dec,.-_gcry_camellia_aesni_avx_cbc_dec;)
 
 .align 8
 .globl _gcry_camellia_aesni_avx_cfb_dec
-.type   _gcry_camellia_aesni_avx_cfb_dec,@function;
+ELF(.type   _gcry_camellia_aesni_avx_cfb_dec,@function;)
 
 _gcry_camellia_aesni_avx_cfb_dec:
 	/* input:
@@ -1202,7 +1209,7 @@ _gcry_camellia_aesni_avx_cfb_dec:
 
 	leave;
 	ret;
-.size _gcry_camellia_aesni_avx_cfb_dec,.-_gcry_camellia_aesni_avx_cfb_dec;
+ELF(.size _gcry_camellia_aesni_avx_cfb_dec,.-_gcry_camellia_aesni_avx_cfb_dec;)
 
 /*
  * IN:
@@ -1309,7 +1316,7 @@ _gcry_camellia_aesni_avx_cfb_dec:
 .text
 
 .align 8
-.type  __camellia_avx_setup128,@function;
+ELF(.type  __camellia_avx_setup128,@function;)
 __camellia_avx_setup128:
 	/* input:
 	 *	%rdi: ctx, CTX; subkey storage at key_table(CTX)
@@ -1650,10 +1657,10 @@ __camellia_avx_setup128:
 	vzeroall;
 
 	ret;
-.size __camellia_avx_setup128,.-__camellia_avx_setup128;
+ELF(.size __camellia_avx_setup128,.-__camellia_avx_setup128;)
 
 .align 8
-.type  __camellia_avx_setup256,@function;
+ELF(.type  __camellia_avx_setup256,@function;)
 
 __camellia_avx_setup256:
 	/* input:
@@ -2127,11 +2134,11 @@ __camellia_avx_setup256:
 	vzeroall;
 
 	ret;
-.size __camellia_avx_setup256,.-__camellia_avx_setup256;
+ELF(.size __camellia_avx_setup256,.-__camellia_avx_setup256;)
 
 .align 8
 .globl _gcry_camellia_aesni_avx_keygen
-.type  _gcry_camellia_aesni_avx_keygen,@function;
+ELF(.type  _gcry_camellia_aesni_avx_keygen,@function;)
 
 _gcry_camellia_aesni_avx_keygen:
 	/* input:
@@ -2159,7 +2166,7 @@ _gcry_camellia_aesni_avx_keygen:
 	vpor %xmm2, %xmm1, %xmm1;
 
 	jmp __camellia_avx_setup256;
-.size _gcry_camellia_aesni_avx_keygen,.-_gcry_camellia_aesni_avx_keygen;
+ELF(.size _gcry_camellia_aesni_avx_keygen,.-_gcry_camellia_aesni_avx_keygen;)
 
 #endif /*defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX_SUPPORT)*/
 #endif /*__x86_64*/
diff --git a/cipher/camellia-aesni-avx2-amd64.S b/cipher/camellia-aesni-avx2-amd64.S
index 25f48bc..a3fa229 100644
--- a/cipher/camellia-aesni-avx2-amd64.S
+++ b/cipher/camellia-aesni-avx2-amd64.S
@@ -20,7 +20,8 @@
 
 #ifdef __x86_64
 #include <config.h>
-#if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
     defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT)
 
 #ifdef __PIC__
@@ -29,6 +30,12 @@
 #  define RIP
 #endif
 
+#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
+# define ELF(...) __VA_ARGS__
+#else
+# define ELF(...) /*_*/
+#endif
+
 #define CAMELLIA_TABLE_BYTE_LEN 272
 
 /* struct CAMELLIA_context: */
@@ -748,7 +755,7 @@
 .text
 
 .align 8
-.type   __camellia_enc_blk32,@function;
+ELF(.type   __camellia_enc_blk32,@function;)
 
 __camellia_enc_blk32:
 	/* input:
@@ -832,10 +839,10 @@ __camellia_enc_blk32:
 		     %ymm15, %rax, %rcx, 24);
 
 	jmp .Lenc_done;
-.size __camellia_enc_blk32,.-__camellia_enc_blk32;
+ELF(.size __camellia_enc_blk32,.-__camellia_enc_blk32;)
 
 .align 8
-.type   __camellia_dec_blk32,@function;
+ELF(.type   __camellia_dec_blk32,@function;)
 
 __camellia_dec_blk32:
 	/* input:
@@ -917,7 +924,7 @@ __camellia_dec_blk32:
 	      ((key_table + (24) * 8) + 4)(CTX));
 
 	jmp .Ldec_max24;
-.size __camellia_dec_blk32,.-__camellia_dec_blk32;
+ELF(.size __camellia_dec_blk32,.-__camellia_dec_blk32;)
 
 #define inc_le128(x, minus_one, tmp) \
 	vpcmpeqq minus_one, x, tmp; \
@@ -927,7 +934,7 @@ __camellia_dec_blk32:
 
 .align 8
 .globl _gcry_camellia_aesni_avx2_ctr_enc
-.type   _gcry_camellia_aesni_avx2_ctr_enc,@function;
+ELF(.type   _gcry_camellia_aesni_avx2_ctr_enc,@function;)
 
 _gcry_camellia_aesni_avx2_ctr_enc:
 	/* input:
@@ -1111,11 +1118,11 @@ _gcry_camellia_aesni_avx2_ctr_enc:
 
 	leave;
 	ret;
-.size _gcry_camellia_aesni_avx2_ctr_enc,.-_gcry_camellia_aesni_avx2_ctr_enc;
+ELF(.size _gcry_camellia_aesni_avx2_ctr_enc,.-_gcry_camellia_aesni_avx2_ctr_enc;)
 
 .align 8
 .globl _gcry_camellia_aesni_avx2_cbc_dec
-.type   _gcry_camellia_aesni_avx2_cbc_dec,@function;
+ELF(.type   _gcry_camellia_aesni_avx2_cbc_dec,@function;)
 
 _gcry_camellia_aesni_avx2_cbc_dec:
 	/* input:
@@ -1183,11 +1190,11 @@ _gcry_camellia_aesni_avx2_cbc_dec:
 
 	leave;
 	ret;
-.size _gcry_camellia_aesni_avx2_cbc_dec,.-_gcry_camellia_aesni_avx2_cbc_dec;
+ELF(.size _gcry_camellia_aesni_avx2_cbc_dec,.-_gcry_camellia_aesni_avx2_cbc_dec;)
 
 .align 8
 .globl _gcry_camellia_aesni_avx2_cfb_dec
-.type   _gcry_camellia_aesni_avx2_cfb_dec,@function;
+ELF(.type   _gcry_camellia_aesni_avx2_cfb_dec,@function;)
 
 _gcry_camellia_aesni_avx2_cfb_dec:
 	/* input:
@@ -1257,7 +1264,7 @@ _gcry_camellia_aesni_avx2_cfb_dec:
 
 	leave;
 	ret;
-.size _gcry_camellia_aesni_avx2_cfb_dec,.-_gcry_camellia_aesni_avx2_cfb_dec;
+ELF(.size _gcry_camellia_aesni_avx2_cfb_dec,.-_gcry_camellia_aesni_avx2_cfb_dec;)
 
 #endif /*defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT)*/
 #endif /*__x86_64*/
diff --git a/cipher/camellia-glue.c b/cipher/camellia-glue.c
index f18d135..5032321 100644
--- a/cipher/camellia-glue.c
+++ b/cipher/camellia-glue.c
@@ -75,7 +75,8 @@
 /* USE_AESNI inidicates whether to compile with Intel AES-NI/AVX code. */
 #undef USE_AESNI_AVX
 #if defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX_SUPPORT)
-# if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)
+# if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
 #  define USE_AESNI_AVX 1
 # endif
 #endif
@@ -83,7 +84,8 @@
 /* USE_AESNI_AVX2 inidicates whether to compile with Intel AES-NI/AVX2 code. */
 #undef USE_AESNI_AVX2
 #if defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT)
-# if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)
+# if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
 #  define USE_AESNI_AVX2 1
 # endif
 #endif
@@ -100,6 +102,20 @@ typedef struct
 #endif /*USE_AESNI_AVX2*/
 } CAMELLIA_context;
 
+/* Assembly implementations use SystemV ABI, ABI conversion and additional
+ * stack to store XMM6-XMM15 needed on Win64. */
+#undef ASM_FUNC_ABI
+#undef ASM_EXTRA_STACK
+#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2)
+# ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+#  define ASM_FUNC_ABI __attribute__((sysv_abi))
+#  define ASM_EXTRA_STACK (10 * 16)
+# else
+#  define ASM_FUNC_ABI
+#  define ASM_EXTRA_STACK 0
+# endif
+#endif
+
 #ifdef USE_AESNI_AVX
 /* Assembler implementations of Camellia using AES-NI and AVX.  Process data
    in 16 block same time.
@@ -107,21 +123,21 @@ typedef struct
 extern void _gcry_camellia_aesni_avx_ctr_enc(CAMELLIA_context *ctx,
 					     unsigned char *out,
 					     const unsigned char *in,
-					     unsigned char *ctr);
+					     unsigned char *ctr) ASM_FUNC_ABI;
 
 extern void _gcry_camellia_aesni_avx_cbc_dec(CAMELLIA_context *ctx,
 					     unsigned char *out,
 					     const unsigned char *in,
-					     unsigned char *iv);
+					     unsigned char *iv) ASM_FUNC_ABI;
 
 extern void _gcry_camellia_aesni_avx_cfb_dec(CAMELLIA_context *ctx,
 					     unsigned char *out,
 					     const unsigned char *in,
-					     unsigned char *iv);
+					     unsigned char *iv) ASM_FUNC_ABI;
 
 extern void _gcry_camellia_aesni_avx_keygen(CAMELLIA_context *ctx,
 					    const unsigned char *key,
-					    unsigned int keylen);
+					    unsigned int keylen) ASM_FUNC_ABI;
 #endif
 
 #ifdef USE_AESNI_AVX2
@@ -131,17 +147,17 @@ extern void _gcry_camellia_aesni_avx_keygen(CAMELLIA_context *ctx,
 extern void _gcry_camellia_aesni_avx2_ctr_enc(CAMELLIA_context *ctx,
 					      unsigned char *out,
 					      const unsigned char *in,
-					      unsigned char *ctr);
+					      unsigned char *ctr) ASM_FUNC_ABI;
 
 extern void _gcry_camellia_aesni_avx2_cbc_dec(CAMELLIA_context *ctx,
 					      unsigned char *out,
 					      const unsigned char *in,
-					      unsigned char *iv);
+					      unsigned char *iv) ASM_FUNC_ABI;
 
 extern void _gcry_camellia_aesni_avx2_cfb_dec(CAMELLIA_context *ctx,
 					      unsigned char *out,
 					      const unsigned char *in,
-					      unsigned char *iv);
+					      unsigned char *iv) ASM_FUNC_ABI;
 #endif
 
 static const char *selftest(void);
@@ -318,7 +334,7 @@ _gcry_camellia_ctr_enc(void *context, unsigned char *ctr,
       if (did_use_aesni_avx2)
         {
           int avx2_burn_stack_depth = 32 * CAMELLIA_BLOCK_SIZE + 16 +
-                                        2 * sizeof(void *);
+                                        2 * sizeof(void *) + ASM_EXTRA_STACK;
 
           if (burn_stack_depth < avx2_burn_stack_depth)
             burn_stack_depth = avx2_burn_stack_depth;
@@ -347,8 +363,11 @@ _gcry_camellia_ctr_enc(void *context, unsigned char *ctr,
 
       if (did_use_aesni_avx)
         {
-          if (burn_stack_depth < 16 * CAMELLIA_BLOCK_SIZE + 2 * sizeof(void *))
-            burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE + 2 * sizeof(void *);
+          int avx_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE +
+                                       2 * sizeof(void *) + ASM_EXTRA_STACK;
+
+          if (burn_stack_depth < avx_burn_stack_depth)
+            burn_stack_depth = avx_burn_stack_depth;
         }
 
       /* Use generic code to handle smaller chunks... */
@@ -409,7 +428,7 @@ _gcry_camellia_cbc_dec(void *context, unsigned char *iv,
       if (did_use_aesni_avx2)
         {
           int avx2_burn_stack_depth = 32 * CAMELLIA_BLOCK_SIZE + 16 +
-                                        2 * sizeof(void *);
+                                        2 * sizeof(void *) + ASM_EXTRA_STACK;;
 
           if (burn_stack_depth < avx2_burn_stack_depth)
             burn_stack_depth = avx2_burn_stack_depth;
@@ -437,8 +456,11 @@ _gcry_camellia_cbc_dec(void *context, unsigned char *iv,
 
       if (did_use_aesni_avx)
         {
-          if (burn_stack_depth < 16 * CAMELLIA_BLOCK_SIZE + 2 * sizeof(void *))
-            burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE + 2 * sizeof(void *);
+          int avx_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE +
+                                       2 * sizeof(void *) + ASM_EXTRA_STACK;
+
+          if (burn_stack_depth < avx_burn_stack_depth)
+            burn_stack_depth = avx_burn_stack_depth;
         }
 
       /* Use generic code to handle smaller chunks... */
@@ -491,7 +513,7 @@ _gcry_camellia_cfb_dec(void *context, unsigned char *iv,
       if (did_use_aesni_avx2)
         {
           int avx2_burn_stack_depth = 32 * CAMELLIA_BLOCK_SIZE + 16 +
-                                        2 * sizeof(void *);
+                                        2 * sizeof(void *) + ASM_EXTRA_STACK;
 
           if (burn_stack_depth < avx2_burn_stack_depth)
             burn_stack_depth = avx2_burn_stack_depth;
@@ -519,8 +541,11 @@ _gcry_camellia_cfb_dec(void *context, unsigned char *iv,
 
       if (did_use_aesni_avx)
         {
-          if (burn_stack_depth < 16 * CAMELLIA_BLOCK_SIZE + 2 * sizeof(void *))
-            burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE + 2 * sizeof(void *);
+          int avx_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE +
+                                       2 * sizeof(void *) + ASM_EXTRA_STACK;
+
+          if (burn_stack_depth < avx_burn_stack_depth)
+            burn_stack_depth = avx_burn_stack_depth;
         }
 
       /* Use generic code to handle smaller chunks... */

commit e05682093ffb003b589a697428d918d755ac631d
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date:   Sun May 3 17:28:40 2015 +0300

    Enable AMD64 Blowfish implementation on WIN64
    
    * cipher/blowfish-amd64.S: Enable when
    HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS defined.
    (ELF): New macro to mask lines with ELF specific commands.
    * cipher/blowfish.c (USE_AMD64_ASM): Enable when
    HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS defined.
    [HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS] (call_sysv_fn): New.
    (do_encrypt, do_encrypt_block, do_decrypt_block)
    [HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS]: Call assembly
    function through 'call_sysv_fn'.
    (blowfish_amd64_ctr_enc, blowfish_amd64_cbc_dec)
    (blowfish_amd64_cfb_dec): New wrapper functions for bulk
    assembly functions.
    --
    
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>

diff --git a/cipher/blowfish-amd64.S b/cipher/blowfish-amd64.S
index 87b676f..21b63fc 100644
--- a/cipher/blowfish-amd64.S
+++ b/cipher/blowfish-amd64.S
@@ -20,7 +20,15 @@
 
 #ifdef __x86_64
 #include <config.h>
-#if defined(USE_BLOWFISH) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)
+#if defined(USE_BLOWFISH) && \
+    (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+
+#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
+# define ELF(...) __VA_ARGS__
+#else
+# define ELF(...) /*_*/
+#endif
 
 .text
 
@@ -120,7 +128,7 @@
 	movq RX0, 		(RIO);
 
 .align 8
-.type   __blowfish_enc_blk1,@function;
+ELF(.type   __blowfish_enc_blk1,@function;)
 
 __blowfish_enc_blk1:
 	/* input:
@@ -145,11 +153,11 @@ __blowfish_enc_blk1:
 	movq %r11, %rbp;
 
 	ret;
-.size __blowfish_enc_blk1,.-__blowfish_enc_blk1;
+ELF(.size __blowfish_enc_blk1,.-__blowfish_enc_blk1;)
 
 .align 8
 .globl  _gcry_blowfish_amd64_do_encrypt
-.type   _gcry_blowfish_amd64_do_encrypt,@function;
+ELF(.type   _gcry_blowfish_amd64_do_encrypt,@function;)
 
 _gcry_blowfish_amd64_do_encrypt:
 	/* input:
@@ -171,11 +179,11 @@ _gcry_blowfish_amd64_do_encrypt:
 	movl RX0d, (RX2);
 
 	ret;
-.size _gcry_blowfish_amd64_do_encrypt,.-_gcry_blowfish_amd64_do_encrypt;
+ELF(.size _gcry_blowfish_amd64_do_encrypt,.-_gcry_blowfish_amd64_do_encrypt;)
 
 .align 8
 .globl  _gcry_blowfish_amd64_encrypt_block
-.type   _gcry_blowfish_amd64_encrypt_block,@function;
+ELF(.type   _gcry_blowfish_amd64_encrypt_block,@function;)
 
 _gcry_blowfish_amd64_encrypt_block:
 	/* input:
@@ -195,11 +203,11 @@ _gcry_blowfish_amd64_encrypt_block:
 	write_block();
 
 	ret;
-.size _gcry_blowfish_amd64_encrypt_block,.-_gcry_blowfish_amd64_encrypt_block;
+ELF(.size _gcry_blowfish_amd64_encrypt_block,.-_gcry_blowfish_amd64_encrypt_block;)
 
 .align 8
 .globl  _gcry_blowfish_amd64_decrypt_block
-.type   _gcry_blowfish_amd64_decrypt_block,@function;
+ELF(.type   _gcry_blowfish_amd64_decrypt_block,@function;)
 
 _gcry_blowfish_amd64_decrypt_block:
 	/* input:
@@ -231,7 +239,7 @@ _gcry_blowfish_amd64_decrypt_block:
 	movq %r11, %rbp;
 
 	ret;
-.size _gcry_blowfish_amd64_decrypt_block,.-_gcry_blowfish_amd64_decrypt_block;
+ELF(.size _gcry_blowfish_amd64_decrypt_block,.-_gcry_blowfish_amd64_decrypt_block;)
 
 /**********************************************************************
   4-way blowfish, four blocks parallel
@@ -319,7 +327,7 @@ _gcry_blowfish_amd64_decrypt_block:
 	bswapq 			RX3;
 
 .align 8
-.type   __blowfish_enc_blk4,@function;
+ELF(.type   __blowfish_enc_blk4,@function;)
 
 __blowfish_enc_blk4:
 	/* input:
@@ -343,10 +351,10 @@ __blowfish_enc_blk4:
 	outbswap_block4();
 
 	ret;
-.size __blowfish_enc_blk4,.-__blowfish_enc_blk4;
+ELF(.size __blowfish_enc_blk4,.-__blowfish_enc_blk4;)
 
 .align 8
-.type   __blowfish_dec_blk4,@function;
+ELF(.type   __blowfish_dec_blk4,@function;)
 
 __blowfish_dec_blk4:
 	/* input:
@@ -372,11 +380,11 @@ __blowfish_dec_blk4:
 	outbswap_block4();
 
 	ret;
-.size __blowfish_dec_blk4,.-__blowfish_dec_blk4;
+ELF(.size __blowfish_dec_blk4,.-__blowfish_dec_blk4;)
 
 .align 8
 .globl  _gcry_blowfish_amd64_ctr_enc
-.type   _gcry_blowfish_amd64_ctr_enc,@function;
+ELF(.type   _gcry_blowfish_amd64_ctr_enc,@function;)
 _gcry_blowfish_amd64_ctr_enc:
 	/* input:
 	 *	%rdi: ctx, CTX
@@ -429,11 +437,11 @@ _gcry_blowfish_amd64_ctr_enc:
 	popq %rbp;
 
 	ret;
-.size _gcry_blowfish_amd64_ctr_enc,.-_gcry_blowfish_amd64_ctr_enc;
+ELF(.size _gcry_blowfish_amd64_ctr_enc,.-_gcry_blowfish_amd64_ctr_enc;)
 
 .align 8
 .globl  _gcry_blowfish_amd64_cbc_dec
-.type   _gcry_blowfish_amd64_cbc_dec,@function;
+ELF(.type   _gcry_blowfish_amd64_cbc_dec,@function;)
 _gcry_blowfish_amd64_cbc_dec:
 	/* input:
 	 *	%rdi: ctx, CTX
@@ -477,11 +485,11 @@ _gcry_blowfish_amd64_cbc_dec:
 	popq %rbp;
 
 	ret;
-.size _gcry_blowfish_amd64_cbc_dec,.-_gcry_blowfish_amd64_cbc_dec;
+ELF(.size _gcry_blowfish_amd64_cbc_dec,.-_gcry_blowfish_amd64_cbc_dec;)
 
 .align 8
 .globl  _gcry_blowfish_amd64_cfb_dec
-.type   _gcry_blowfish_amd64_cfb_dec,@function;
+ELF(.type   _gcry_blowfish_amd64_cfb_dec,@function;)
 _gcry_blowfish_amd64_cfb_dec:
 	/* input:
 	 *	%rdi: ctx, CTX
@@ -527,7 +535,7 @@ _gcry_blowfish_amd64_cfb_dec:
 	popq %rbx;
 	popq %rbp;
 	ret;
-.size _gcry_blowfish_amd64_cfb_dec,.-_gcry_blowfish_amd64_cfb_dec;
+ELF(.size _gcry_blowfish_amd64_cfb_dec,.-_gcry_blowfish_amd64_cfb_dec;)
 
 #endif /*defined(USE_BLOWFISH)*/
 #endif /*__x86_64*/
diff --git a/cipher/blowfish.c b/cipher/blowfish.c
index ae470d8..a3fc26c 100644
--- a/cipher/blowfish.c
+++ b/cipher/blowfish.c
@@ -45,7 +45,8 @@
 
 /* USE_AMD64_ASM indicates whether to use AMD64 assembly code. */
 #undef USE_AMD64_ASM
-#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \
+#if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
     (BLOWFISH_ROUNDS == 16)
 # define USE_AMD64_ASM 1
 #endif
@@ -280,22 +281,87 @@ extern void _gcry_blowfish_amd64_cbc_dec(BLOWFISH_context *ctx, byte *out,
 extern void _gcry_blowfish_amd64_cfb_dec(BLOWFISH_context *ctx, byte *out,
 					 const byte *in, byte *iv);
 
+#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+static inline void
+call_sysv_fn (const void *fn, const void *arg1, const void *arg2,
+              const void *arg3, const void *arg4)
+{
+  /* Call SystemV ABI function without storing non-volatile XMM registers,
+   * as target function does not use vector instruction sets. */
+  asm volatile ("callq *%0\n\t"
+                : "+a" (fn),
+                  "+D" (arg1),
+                  "+S" (arg2),
+                  "+d" (arg3),
+                  "+c" (arg4)
+                :
+                : "cc", "memory", "r8", "r9", "r10", "r11");
+}
+#endif
+
 static void
 do_encrypt ( BLOWFISH_context *bc, u32 *ret_xl, u32 *ret_xr )
 {
+#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+  call_sysv_fn (_gcry_blowfish_amd64_do_encrypt, bc, ret_xl, ret_xr, NULL);
+#else
   _gcry_blowfish_amd64_do_encrypt (bc, ret_xl, ret_xr);
+#endif
 }
 
 static void
 do_encrypt_block (BLOWFISH_context *context, byte *outbuf, const byte *inbuf)
 {
+#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+  call_sysv_fn (_gcry_blowfish_amd64_encrypt_block, context, outbuf, inbuf,
+                NULL);
+#else
   _gcry_blowfish_amd64_encrypt_block (context, outbuf, inbuf);
+#endif
 }
 
 static void
 do_decrypt_block (BLOWFISH_context *context, byte *outbuf, const byte *inbuf)
 {
+#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+  call_sysv_fn (_gcry_blowfish_amd64_decrypt_block, context, outbuf, inbuf,
+                NULL);
+#else
   _gcry_blowfish_amd64_decrypt_block (context, outbuf, inbuf);
+#endif
+}
+
+static inline void
+blowfish_amd64_ctr_enc(BLOWFISH_context *ctx, byte *out, const byte *in,
+                       byte *ctr)
+{
+#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+  call_sysv_fn (_gcry_blowfish_amd64_ctr_enc, ctx, out, in, ctr);
+#else
+  _gcry_blowfish_amd64_ctr_enc(ctx, out, in, ctr);
+#endif
+}
+
+static inline void
+blowfish_amd64_cbc_dec(BLOWFISH_context *ctx, byte *out, const byte *in,
+                       byte *iv)
+{
+#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+  call_sysv_fn (_gcry_blowfish_amd64_cbc_dec, ctx, out, in, iv);
+#else
+  _gcry_blowfish_amd64_cbc_dec(ctx, out, in, iv);
+#endif
+}
+
+static inline void
+blowfish_amd64_cfb_dec(BLOWFISH_context *ctx, byte *out, const byte *in,
+                       byte *iv)
+{
+#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+  call_sysv_fn (_gcry_blowfish_amd64_cfb_dec, ctx, out, in, iv);
+#else
+  _gcry_blowfish_amd64_cfb_dec(ctx, out, in, iv);
+#endif
 }
 
 static unsigned int
@@ -605,7 +671,7 @@ _gcry_blowfish_ctr_enc(void *context, unsigned char *ctr, void *outbuf_arg,
     /* Process data in 4 block chunks. */
     while (nblocks >= 4)
       {
-        _gcry_blowfish_amd64_ctr_enc(ctx, outbuf, inbuf, ctr);
+        blowfish_amd64_ctr_enc(ctx, outbuf, inbuf, ctr);
 
         nblocks -= 4;
         outbuf += 4 * BLOWFISH_BLOCKSIZE;
@@ -674,7 +740,7 @@ _gcry_blowfish_cbc_dec(void *context, unsigned char *iv, void *outbuf_arg,
     /* Process data in 4 block chunks. */
     while (nblocks >= 4)
       {
-        _gcry_blowfish_amd64_cbc_dec(ctx, outbuf, inbuf, iv);
+        blowfish_amd64_cbc_dec(ctx, outbuf, inbuf, iv);
 
         nblocks -= 4;
         outbuf += 4 * BLOWFISH_BLOCKSIZE;
@@ -734,7 +800,7 @@ _gcry_blowfish_cfb_dec(void *context, unsigned char *iv, void *outbuf_arg,
     /* Process data in 4 block chunks. */
     while (nblocks >= 4)
       {
-        _gcry_blowfish_amd64_cfb_dec(ctx, outbuf, inbuf, iv);
+        blowfish_amd64_cfb_dec(ctx, outbuf, inbuf, iv);
 
         nblocks -= 4;
         outbuf += 4 * BLOWFISH_BLOCKSIZE;

commit c46b015bedba7ce0db68929bd33a86a54ab3d919
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date:   Sun May 3 17:06:56 2015 +0300

    Enable AMD64 arcfour implementation on WIN64
    
    * cipher/arcfour-amd64.S: Enable when
    HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS defined.
    (ELF): New macro to mask lines with ELF specific commands.
    * cipher/arcfour.c (USE_AMD64_ASM): Enable when
    HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS defined.
    (do_encrypt, do_decrypt) [HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS]: Use
    assembly block to call AMD64 assembly function.
    --
    
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>

diff --git a/cipher/arcfour-amd64.S b/cipher/arcfour-amd64.S
index 8b8031a..2e52ea0 100644
--- a/cipher/arcfour-amd64.S
+++ b/cipher/arcfour-amd64.S
@@ -15,12 +15,19 @@
 
 #ifdef __x86_64__
 #include <config.h>
-#if defined(USE_ARCFOUR) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)
+#if defined(USE_ARCFOUR) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+
+#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
+# define ELF(...) __VA_ARGS__
+#else
+# define ELF(...) /*_*/
+#endif
 
 .text
 .align 16
 .globl _gcry_arcfour_amd64
-.type _gcry_arcfour_amd64,@function
+ELF(.type _gcry_arcfour_amd64,@function)
 _gcry_arcfour_amd64:
 	push	%rbp
 	push	%rbx
@@ -91,7 +98,7 @@ _gcry_arcfour_amd64:
 	pop	%rbp
 	ret
 .L__gcry_arcfour_amd64_end:
-.size _gcry_arcfour_amd64,.L__gcry_arcfour_amd64_end-_gcry_arcfour_amd64
+ELF(.size _gcry_arcfour_amd64,.L__gcry_arcfour_amd64_end-_gcry_arcfour_amd64)
 
 #endif
 #endif
diff --git a/cipher/arcfour.c b/cipher/arcfour.c
index 27537bf..44e8ef4 100644
--- a/cipher/arcfour.c
+++ b/cipher/arcfour.c
@@ -33,7 +33,8 @@
 
 /* USE_AMD64_ASM indicates whether to use AMD64 assembly code. */
 #undef USE_AMD64_ASM
-#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)
+#if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
 # define USE_AMD64_ASM 1
 #endif
 
@@ -53,7 +54,21 @@ static void
 encrypt_stream (void *context,
                 byte *outbuf, const byte *inbuf, size_t length)
 {
+#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+  const void *fn = _gcry_arcfour_amd64;
+  /* Call SystemV ABI function without storing non-volatile XMM registers,
+   * as target function does not use vector instruction sets. */
+  asm volatile ("callq *%0\n\t"
+                : "+a" (fn),
+                  "+D" (context),
+                  "+S" (length),
+                  "+d" (inbuf),
+                  "+c" (outbuf)
+                :
+                : "cc", "memory", "r8", "r9", "r10", "r11");
+#else
   _gcry_arcfour_amd64 (context, length, inbuf, outbuf );
+#endif
 }
 
 #else /*!USE_AMD64_ASM*/

commit ee8fc4edcb3466b03246c8720b90731bf274ff1d
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date:   Thu May 14 10:02:51 2015 +0300

    Update documentation for Poly1305-ChaCha20 AEAD, RFC-7539
    
    * cipher/cipher-poly1305.c: Add RFC-7539 to header.
    * doc/gcrypt.texi: Update Poly1305 AEAD documentation with mention of
    RFC-7539; Drop Salsa from supported stream ciphers for Poly1305 AEAD.
    --
    
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>

diff --git a/cipher/cipher-poly1305.c b/cipher/cipher-poly1305.c
index f283333..965a7b6 100644
--- a/cipher/cipher-poly1305.c
+++ b/cipher/cipher-poly1305.c
@@ -1,4 +1,4 @@
-/* cipher-pol1305.c  -  Poly1305 based AEAD cipher mode
+/* cipher-poly1305.c  -  Poly1305 based AEAD cipher mode, RFC-7539
  * Copyright (C) 2014 Jussi Kivilinna <jussi.kivilinna at iki.fi>
  *
  * This file is part of Libgcrypt.
diff --git a/doc/gcrypt.texi b/doc/gcrypt.texi
index 8683ca8..ab4f685 100644
--- a/doc/gcrypt.texi
+++ b/doc/gcrypt.texi
@@ -1643,9 +1643,10 @@ Associated Data (AEAD) block cipher mode, which is specified in
 'NIST Special Publication 800-38D'.
 
 @item  GCRY_CIPHER_MODE_POLY1305
-@cindex Poly1305 based AEAD mode
-Poly1305 is an Authenticated Encryption with Associated Data (AEAD)
-mode, which can be used with ChaCha20 and Salsa20 stream ciphers.
+@cindex Poly1305 based AEAD mode with ChaCha20
+This mode implements the Poly1305 Authenticated Encryption with Associated
+Data (AEAD) mode according to RFC-7539. This mode can be used with ChaCha20
+stream cipher.
 
 @item  GCRY_CIPHER_MODE_OCB
 @cindex OCB, OCB3
@@ -1687,7 +1688,7 @@ and the according constants.  Note that some modes are incompatible
 with some algorithms - in particular, stream mode
 (@code{GCRY_CIPHER_MODE_STREAM}) only works with stream ciphers.
 Poly1305 AEAD mode (@code{GCRY_CIPHER_MODE_POLY1305}) only works with
-ChaCha and Salsa stream ciphers. The block cipher modes
+ChaCha20 stream cipher. The block cipher modes
 (@code{GCRY_CIPHER_MODE_ECB}, @code{GCRY_CIPHER_MODE_CBC},
 @code{GCRY_CIPHER_MODE_CFB}, @code{GCRY_CIPHER_MODE_OFB} and
 @code{GCRY_CIPHER_MODE_CTR}) will work with any block cipher

-----------------------------------------------------------------------

Summary of changes:
 cipher/arcfour-amd64.S             | 13 ++++--
 cipher/arcfour.c                   | 17 +++++++-
 cipher/blowfish-amd64.S            | 46 ++++++++++++---------
 cipher/blowfish.c                  | 74 +++++++++++++++++++++++++++++++--
 cipher/camellia-aesni-avx-amd64.S  | 41 +++++++++++--------
 cipher/camellia-aesni-avx2-amd64.S | 29 ++++++++-----
 cipher/camellia-glue.c             | 61 +++++++++++++++++++--------
 cipher/cast5-amd64.S               | 43 ++++++++++---------
 cipher/cast5.c                     | 67 +++++++++++++++++++++++++++---
 cipher/chacha20-avx2-amd64.S       | 13 ++++--
 cipher/chacha20-sse2-amd64.S       | 13 ++++--
 cipher/chacha20-ssse3-amd64.S      | 13 ++++--
 cipher/chacha20.c                  | 43 ++++++++++++++-----
 cipher/cipher-poly1305.c           |  2 +-
 cipher/des-amd64.S                 | 29 ++++++++-----
 cipher/des.c                       | 61 +++++++++++++++++++++++++--
 cipher/poly1305-avx2-amd64.S       | 22 ++++++----
 cipher/poly1305-internal.h         | 27 +++++++++---
 cipher/poly1305-sse2-amd64.S       | 22 ++++++----
 cipher/poly1305.c                  | 33 ++++++++-------
 cipher/salsa20-amd64.S             | 17 +++++---
 cipher/salsa20.c                   | 26 +++++++++---
 cipher/serpent-avx2-amd64.S        | 29 ++++++++-----
 cipher/serpent-sse2-amd64.S        | 29 ++++++++-----
 cipher/serpent.c                   | 30 ++++++++++----
 cipher/twofish-amd64.S             | 37 ++++++++++-------
 cipher/twofish.c                   | 84 +++++++++++++++++++++++++++++++++++---
 doc/gcrypt.texi                    |  9 ++--
 28 files changed, 699 insertions(+), 231 deletions(-)


hooks/post-receive
-- 
The GNU crypto library
http://git.gnupg.org




More information about the Gnupg-commits mailing list