[PATCH 2/2] sha512/sha256: remove assembler macros from AMD64 implementations

Jussi Kivilinna jussi.kivilinna at iki.fi
Fri Jan 22 18:41:43 CET 2021


* configure.ac (gcry_cv_gcc_platform_as_ok_for_intel_syntax): Remove
assembler macro check from Intel syntax assembly support check.
* cipher/sha256-avx-amd64.S: Replace assembler macros with C
preprocessor counterparts.
* cipher/sha256-avx2-bmi2-amd64.S: Ditto.
* cipher/sha256-ssse3-amd64.S: Ditto.
* cipher/sha512-avx-amd64.S: Ditto.
* cipher/sha512-avx2-bmi2-amd64.S: Ditto.
* cipher/sha512-ssse3-amd64.S: Ditto.
--

Removing GNU assembler macros allows building these implementations with
clang.
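
The conversion follows the same pattern in each file; as a short
illustration (adapted from the sha256-avx-amd64.S hunk below), a GNU
assembler macro such as

    .macro ROR p1 p2
            shld    \p1, \p1, (32 - \p2)
    .endm

becomes a C preprocessor macro with ';' statement separators:

    #define ROR(p1, p2) \
            shld    p1, p1, (32 - p2);

Because the preprocessor cannot re-assign symbols the way the old
rotate_Xs/ROTATE_ARGS helpers did, the round macros now take the state
registers (and message vectors) as explicit parameters, and the
rotation is expressed by permuting the arguments at each call site:

    DO_ROUND(0, a, b, c, d, e, f, g, h)
    DO_ROUND(1, h, a, b, c, d, e, f, g)
    DO_ROUND(2, g, h, a, b, c, d, e, f)
    DO_ROUND(3, f, g, h, a, b, c, d, e)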

GnuPG-bug-id: 5255
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 cipher/sha256-avx-amd64.S       | 516 +++++++++++++++----------------
 cipher/sha256-avx2-bmi2-amd64.S | 421 +++++++++++--------------
 cipher/sha256-ssse3-amd64.S     | 529 +++++++++++++++-----------------
 cipher/sha512-avx-amd64.S       | 456 ++++++++++++++-------------
 cipher/sha512-avx2-bmi2-amd64.S | 498 +++++++++++++-----------------
 cipher/sha512-ssse3-amd64.S     | 455 ++++++++++++++-------------
 configure.ac                    |  20 +-
 7 files changed, 1387 insertions(+), 1508 deletions(-)

diff --git a/cipher/sha256-avx-amd64.S b/cipher/sha256-avx-amd64.S
index 77143ff0..ec945f84 100644
--- a/cipher/sha256-avx-amd64.S
+++ b/cipher/sha256-avx-amd64.S
@@ -65,67 +65,64 @@
 
 #define	VMOVDQ vmovdqu /* assume buffers not aligned */
 
-.macro ROR p1 p2
-	/* shld is faster than ror on Intel Sandybridge */
-	shld	\p1, \p1, (32 - \p2)
-.endm
+#define ROR(p1, p2) \
+	/* shld is faster than ror on Intel Sandybridge */ \
+	shld	p1, p1, (32 - p2);
 
 /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros*/
 
 /* addm [mem], reg
  * Add reg to mem using reg-mem add and store */
-.macro addm p1 p2
-	add	\p2, \p1
-	mov	\p1, \p2
-.endm
+#define addm(p1, p2) \
+	add	p2, p1; \
+	mov	p1, p2;
 
 /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
 
 /* COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
  * Load xmm with mem and byte swap each dword */
-.macro COPY_XMM_AND_BSWAP p1 p2 p3
-	VMOVDQ \p1, \p2
-	vpshufb \p1, \p1, \p3
-.endm
+#define COPY_XMM_AND_BSWAP(p1, p2, p3) \
+	VMOVDQ p1, p2; \
+	vpshufb p1, p1, p3;
 
 /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
 
-X0 = xmm4
-X1 = xmm5
-X2 = xmm6
-X3 = xmm7
+#define X0 xmm4
+#define X1 xmm5
+#define X2 xmm6
+#define X3 xmm7
 
-XTMP0 = xmm0
-XTMP1 = xmm1
-XTMP2 = xmm2
-XTMP3 = xmm3
-XTMP4 = xmm8
-XFER  = xmm9
+#define XTMP0 xmm0
+#define XTMP1 xmm1
+#define XTMP2 xmm2
+#define XTMP3 xmm3
+#define XTMP4 xmm8
+#define XFER xmm9
 
-SHUF_00BA = xmm10 /* shuffle xBxA -> 00BA */
-SHUF_DC00 = xmm11 /* shuffle xDxC -> DC00 */
-BYTE_FLIP_MASK = xmm12
+#define SHUF_00BA xmm10 /* shuffle xBxA -> 00BA */
+#define SHUF_DC00 xmm11 /* shuffle xDxC -> DC00 */
+#define BYTE_FLIP_MASK xmm12
 
-NUM_BLKS = rdx	/* 3rd arg */
-CTX = rsi	/* 2nd arg */
-INP = rdi	/* 1st arg */
+#define NUM_BLKS rdx	/* 3rd arg */
+#define CTX rsi	/* 2nd arg */
+#define INP rdi	/* 1st arg */
 
-SRND = rdi	/* clobbers INP */
-c = ecx
-d = r8d
-e = edx
+#define SRND rdi	/* clobbers INP */
+#define c ecx
+#define d r8d
+#define e edx
 
-TBL = rbp
-a = eax
-b = ebx
+#define TBL rbp
+#define a eax
+#define b ebx
 
-f = r9d
-g = r10d
-h = r11d
+#define f r9d
+#define g r10d
+#define h r11d
 
-y0 = r13d
-y1 = r14d
-y2 = r15d
+#define y0 r13d
+#define y1 r14d
+#define y2 r15d
 
 
 
@@ -142,220 +139,197 @@ y2 = r15d
 #define _XMM_SAVE	(_XFER     + _XFER_SIZE + _ALIGN_SIZE)
 #define STACK_SIZE	(_XMM_SAVE + _XMM_SAVE_SIZE)
 
-/* rotate_Xs
- * Rotate values of symbols X0...X3 */
-.macro rotate_Xs
-X_ = X0
-X0 = X1
-X1 = X2
-X2 = X3
-X3 = X_
-.endm
-
-/* ROTATE_ARGS
- * Rotate values of symbols a...h */
-.macro ROTATE_ARGS
-TMP_ = h
-h = g
-g = f
-f = e
-e = d
-d = c
-c = b
-b = a
-a = TMP_
-.endm
-
-.macro FOUR_ROUNDS_AND_SCHED
-		/* compute s0 four at a time and s1 two at a time
-		 * compute W[-16] + W[-7] 4 at a time */
-	mov	y0, e		/* y0 = e */
-	ROR	y0, (25-11)	/* y0 = e >> (25-11) */
-	mov	y1, a		/* y1 = a */
-		vpalignr	XTMP0, X3, X2, 4	/* XTMP0 = W[-7] */
-	ROR	y1, (22-13)	/* y1 = a >> (22-13) */
-	xor	y0, e		/* y0 = e ^ (e >> (25-11)) */
-	mov	y2, f		/* y2 = f */
-	ROR	y0, (11-6)	/* y0 = (e >> (11-6)) ^ (e >> (25-6)) */
-	xor	y1, a		/* y1 = a ^ (a >> (22-13) */
-	xor	y2, g		/* y2 = f^g */
-		vpaddd	XTMP0, XTMP0, X0	/* XTMP0 = W[-7] + W[-16] */
-	xor	y0, e		/* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */
-	and	y2, e		/* y2 = (f^g)&e */
-	ROR	y1, (13-2)	/* y1 = (a >> (13-2)) ^ (a >> (22-2)) */
-		/* compute s0 */
-		vpalignr	XTMP1, X1, X0, 4	/* XTMP1 = W[-15] */
-	xor	y1, a		/* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */
-	ROR	y0, 6		/* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */
-	xor	y2, g		/* y2 = CH = ((f^g)&e)^g */
-	ROR	y1, 2		/* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */
-	add	y2, y0		/* y2 = S1 + CH */
-	add	y2, [rsp + _XFER + 0*4]	/* y2 = k + w + S1 + CH */
-	mov	y0, a		/* y0 = a */
-	add	h, y2		/* h = h + S1 + CH + k + w */
-	mov	y2, a		/* y2 = a */
-		vpslld	XTMP2, XTMP1, (32-7)
-	or	y0, c		/* y0 = a|c */
-	add	d, h		/* d = d + h + S1 + CH + k + w */
-	and	y2, c		/* y2 = a&c */
-		vpsrld	XTMP3, XTMP1, 7
-	and	y0, b		/* y0 = (a|c)&b */
-	add	h, y1		/* h = h + S1 + CH + k + w + S0 */
-		vpor	XTMP3, XTMP3, XTMP2	/* XTMP1 = W[-15] ror 7 */
-	or	y0, y2		/* y0 = MAJ = (a|c)&b)|(a&c) */
+
+#define FOUR_ROUNDS_AND_SCHED_0(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
+		/* compute s0 four at a time and s1 two at a time */; \
+		/* compute W[-16] + W[-7] 4 at a time */; \
+	mov	y0, e		/* y0 = e */; \
+	ROR(	y0, (25-11))	/* y0 = e >> (25-11) */; \
+	mov	y1, a		/* y1 = a */; \
+		vpalignr	XTMP0, X3, X2, 4	/* XTMP0 = W[-7] */; \
+	ROR(	y1, (22-13))	/* y1 = a >> (22-13) */; \
+	xor	y0, e		/* y0 = e ^ (e >> (25-11)) */; \
+	mov	y2, f		/* y2 = f */; \
+	ROR(	y0, (11-6))	/* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
+	xor	y1, a		/* y1 = a ^ (a >> (22-13) */; \
+	xor	y2, g		/* y2 = f^g */; \
+		vpaddd	XTMP0, XTMP0, X0	/* XTMP0 = W[-7] + W[-16] */; \
+	xor	y0, e		/* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
+	and	y2, e		/* y2 = (f^g)&e */; \
+	ROR(	y1, (13-2))	/* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
+		/* compute s0 */; \
+		vpalignr	XTMP1, X1, X0, 4	/* XTMP1 = W[-15] */; \
+	xor	y1, a		/* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
+	ROR(	y0, 6)		/* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \
+	xor	y2, g		/* y2 = CH = ((f^g)&e)^g */; \
+	ROR(	y1, 2)		/* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
+	add	y2, y0		/* y2 = S1 + CH */; \
+	add	y2, [rsp + _XFER + 0*4]	/* y2 = k + w + S1 + CH */; \
+	mov	y0, a		/* y0 = a */; \
+	add	h, y2		/* h = h + S1 + CH + k + w */; \
+	mov	y2, a		/* y2 = a */; \
+		vpslld	XTMP2, XTMP1, (32-7); \
+	or	y0, c		/* y0 = a|c */; \
+	add	d, h		/* d = d + h + S1 + CH + k + w */; \
+	and	y2, c		/* y2 = a&c */; \
+		vpsrld	XTMP3, XTMP1, 7; \
+	and	y0, b		/* y0 = (a|c)&b */; \
+	add	h, y1		/* h = h + S1 + CH + k + w + S0 */; \
+		vpor	XTMP3, XTMP3, XTMP2	/* XTMP1 = W[-15] ror 7 */; \
+	or	y0, y2		/* y0 = MAJ = (a|c)&b)|(a&c) */; \
 	lea	h, [h + y0]	/* h = h + S1 + CH + k + w + S0 + MAJ */
 
-ROTATE_ARGS
-	mov	y0, e		/* y0 = e */
-	mov	y1, a		/* y1 = a */
-	ROR	y0, (25-11)	/* y0 = e >> (25-11) */
-	xor	y0, e		/* y0 = e ^ (e >> (25-11)) */
-	mov	y2, f		/* y2 = f */
-	ROR	y1, (22-13)	/* y1 = a >> (22-13) */
-		vpslld	XTMP2, XTMP1, (32-18)
-	xor	y1, a		/* y1 = a ^ (a >> (22-13) */
-	ROR	y0, (11-6)	/* y0 = (e >> (11-6)) ^ (e >> (25-6)) */
-	xor	y2, g		/* y2 = f^g */
-		vpsrld	XTMP4, XTMP1, 18
-	ROR	y1, (13-2)	/* y1 = (a >> (13-2)) ^ (a >> (22-2)) */
-	xor	y0, e		/* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */
-	and	y2, e		/* y2 = (f^g)&e */
-	ROR	y0, 6		/* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */
-		vpxor	XTMP4, XTMP4, XTMP3
-	xor	y1, a		/* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */
-	xor	y2, g		/* y2 = CH = ((f^g)&e)^g */
-		vpsrld	XTMP1, XTMP1, 3	/* XTMP4 = W[-15] >> 3 */
-	add	y2, y0		/* y2 = S1 + CH */
-	add	y2, [rsp + _XFER + 1*4]	/* y2 = k + w + S1 + CH */
-	ROR	y1, 2		/* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */
-		vpxor	XTMP1, XTMP1, XTMP2	/* XTMP1 = W[-15] ror 7 ^ W[-15] ror 18 */
-	mov	y0, a		/* y0 = a */
-	add	h, y2		/* h = h + S1 + CH + k + w */
-	mov	y2, a		/* y2 = a */
-		vpxor	XTMP1, XTMP1, XTMP4	/* XTMP1 = s0 */
-	or	y0, c		/* y0 = a|c */
-	add	d, h		/* d = d + h + S1 + CH + k + w */
-	and	y2, c		/* y2 = a&c */
-		/* compute low s1 */
-		vpshufd	XTMP2, X3, 0b11111010	/* XTMP2 = W[-2] {BBAA} */
-	and	y0, b		/* y0 = (a|c)&b */
-	add	h, y1		/* h = h + S1 + CH + k + w + S0 */
-		vpaddd	XTMP0, XTMP0, XTMP1	/* XTMP0 = W[-16] + W[-7] + s0 */
-	or	y0, y2		/* y0 = MAJ = (a|c)&b)|(a&c) */
+#define FOUR_ROUNDS_AND_SCHED_1(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
+	mov	y0, e		/* y0 = e */; \
+	mov	y1, a		/* y1 = a */; \
+	ROR(	y0, (25-11))	/* y0 = e >> (25-11) */; \
+	xor	y0, e		/* y0 = e ^ (e >> (25-11)) */; \
+	mov	y2, f		/* y2 = f */; \
+	ROR(	y1, (22-13))	/* y1 = a >> (22-13) */; \
+		vpslld	XTMP2, XTMP1, (32-18); \
+	xor	y1, a		/* y1 = a ^ (a >> (22-13) */; \
+	ROR(	y0, (11-6))	/* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
+	xor	y2, g		/* y2 = f^g */; \
+		vpsrld	XTMP4, XTMP1, 18; \
+	ROR(	y1, (13-2))	/* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
+	xor	y0, e		/* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
+	and	y2, e		/* y2 = (f^g)&e */; \
+	ROR(	y0, 6)		/* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \
+		vpxor	XTMP4, XTMP4, XTMP3; \
+	xor	y1, a		/* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
+	xor	y2, g		/* y2 = CH = ((f^g)&e)^g */; \
+		vpsrld	XTMP1, XTMP1, 3	/* XTMP4 = W[-15] >> 3 */; \
+	add	y2, y0		/* y2 = S1 + CH */; \
+	add	y2, [rsp + _XFER + 1*4]	/* y2 = k + w + S1 + CH */; \
+	ROR(	y1, 2)		/* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
+		vpxor	XTMP1, XTMP1, XTMP2	/* XTMP1 = W[-15] ror 7 ^ W[-15] ror 18 */; \
+	mov	y0, a		/* y0 = a */; \
+	add	h, y2		/* h = h + S1 + CH + k + w */; \
+	mov	y2, a		/* y2 = a */; \
+		vpxor	XTMP1, XTMP1, XTMP4	/* XTMP1 = s0 */; \
+	or	y0, c		/* y0 = a|c */; \
+	add	d, h		/* d = d + h + S1 + CH + k + w */; \
+	and	y2, c		/* y2 = a&c */; \
+		/* compute low s1 */; \
+		vpshufd	XTMP2, X3, 0b11111010	/* XTMP2 = W[-2] {BBAA} */; \
+	and	y0, b		/* y0 = (a|c)&b */; \
+	add	h, y1		/* h = h + S1 + CH + k + w + S0 */; \
+		vpaddd	XTMP0, XTMP0, XTMP1	/* XTMP0 = W[-16] + W[-7] + s0 */; \
+	or	y0, y2		/* y0 = MAJ = (a|c)&b)|(a&c) */; \
 	lea	h, [h + y0]	/* h = h + S1 + CH + k + w + S0 + MAJ */
 
-ROTATE_ARGS
-	mov	y0, e		/* y0 = e */
-	mov	y1, a		/* y1 = a */
-	ROR	y0, (25-11)	/* y0 = e >> (25-11) */
-	xor	y0, e		/* y0 = e ^ (e >> (25-11)) */
-	ROR	y1, (22-13)	/* y1 = a >> (22-13) */
-	mov	y2, f		/* y2 = f */
-	xor	y1, a		/* y1 = a ^ (a >> (22-13) */
-	ROR	y0, (11-6)	/* y0 = (e >> (11-6)) ^ (e >> (25-6)) */
-		vpsrlq	XTMP3, XTMP2, 17	/* XTMP2 = W[-2] ror 17 {xBxA} */
-	xor	y2, g		/* y2 = f^g */
-		vpsrlq	XTMP4, XTMP2, 19	/* XTMP3 = W[-2] ror 19 {xBxA} */
-	xor	y0, e		/* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */
-	and	y2, e		/* y2 = (f^g)&e */
-		vpsrld	XTMP2, XTMP2, 10	/* XTMP4 = W[-2] >> 10 {BBAA} */
-	ROR	y1, (13-2)	/* y1 = (a >> (13-2)) ^ (a >> (22-2)) */
-	xor	y1, a		/* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */
-	xor	y2, g		/* y2 = CH = ((f^g)&e)^g */
-	ROR	y0, 6		/* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */
-		vpxor	XTMP2, XTMP2, XTMP3
-	add	y2, y0		/* y2 = S1 + CH */
-	ROR	y1, 2		/* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */
-	add	y2, [rsp + _XFER + 2*4]	/* y2 = k + w + S1 + CH */
-		vpxor	XTMP4, XTMP4, XTMP2	/* XTMP4 = s1 {xBxA} */
-	mov	y0, a		/* y0 = a */
-	add	h, y2		/* h = h + S1 + CH + k + w */
-	mov	y2, a		/* y2 = a */
-		vpshufb	XTMP4, XTMP4, SHUF_00BA	/* XTMP4 = s1 {00BA} */
-	or	y0, c		/* y0 = a|c */
-	add	d, h		/* d = d + h + S1 + CH + k + w */
-	and	y2, c		/* y2 = a&c */
-		vpaddd	XTMP0, XTMP0, XTMP4	/* XTMP0 = {..., ..., W[1], W[0]} */
-	and	y0, b		/* y0 = (a|c)&b */
-	add	h, y1		/* h = h + S1 + CH + k + w + S0 */
-		/* compute high s1 */
-		vpshufd	XTMP2, XTMP0, 0b01010000 /* XTMP2 = W[-2] {DDCC} */
-	or	y0, y2		/* y0 = MAJ = (a|c)&b)|(a&c) */
+#define FOUR_ROUNDS_AND_SCHED_2(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
+	mov	y0, e		/* y0 = e */; \
+	mov	y1, a		/* y1 = a */; \
+	ROR(	y0, (25-11))	/* y0 = e >> (25-11) */; \
+	xor	y0, e		/* y0 = e ^ (e >> (25-11)) */; \
+	ROR(	y1, (22-13))	/* y1 = a >> (22-13) */; \
+	mov	y2, f		/* y2 = f */; \
+	xor	y1, a		/* y1 = a ^ (a >> (22-13) */; \
+	ROR(	y0, (11-6))	/* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
+		vpsrlq	XTMP3, XTMP2, 17	/* XTMP2 = W[-2] ror 17 {xBxA} */; \
+	xor	y2, g		/* y2 = f^g */; \
+		vpsrlq	XTMP4, XTMP2, 19	/* XTMP3 = W[-2] ror 19 {xBxA} */; \
+	xor	y0, e		/* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
+	and	y2, e		/* y2 = (f^g)&e */; \
+		vpsrld	XTMP2, XTMP2, 10	/* XTMP4 = W[-2] >> 10 {BBAA} */; \
+	ROR(	y1, (13-2))	/* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
+	xor	y1, a		/* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
+	xor	y2, g		/* y2 = CH = ((f^g)&e)^g */; \
+	ROR(	y0, 6)		/* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \
+		vpxor	XTMP2, XTMP2, XTMP3; \
+	add	y2, y0		/* y2 = S1 + CH */; \
+	ROR(	y1, 2)		/* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
+	add	y2, [rsp + _XFER + 2*4]	/* y2 = k + w + S1 + CH */; \
+		vpxor	XTMP4, XTMP4, XTMP2	/* XTMP4 = s1 {xBxA} */; \
+	mov	y0, a		/* y0 = a */; \
+	add	h, y2		/* h = h + S1 + CH + k + w */; \
+	mov	y2, a		/* y2 = a */; \
+		vpshufb	XTMP4, XTMP4, SHUF_00BA	/* XTMP4 = s1 {00BA} */; \
+	or	y0, c		/* y0 = a|c */; \
+	add	d, h		/* d = d + h + S1 + CH + k + w */; \
+	and	y2, c		/* y2 = a&c */; \
+		vpaddd	XTMP0, XTMP0, XTMP4	/* XTMP0 = {..., ..., W[1], W[0]} */; \
+	and	y0, b		/* y0 = (a|c)&b */; \
+	add	h, y1		/* h = h + S1 + CH + k + w + S0 */; \
+		/* compute high s1 */; \
+		vpshufd	XTMP2, XTMP0, 0b01010000 /* XTMP2 = W[-2] {DDCC} */; \
+	or	y0, y2		/* y0 = MAJ = (a|c)&b)|(a&c) */; \
 	lea	h, [h + y0]	/* h = h + S1 + CH + k + w + S0 + MAJ */
 
-ROTATE_ARGS
-	mov	y0, e		/* y0 = e */
-	ROR	y0, (25-11)	/* y0 = e >> (25-11) */
-	mov	y1, a		/* y1 = a */
-	ROR	y1, (22-13)	/* y1 = a >> (22-13) */
-	xor	y0, e		/* y0 = e ^ (e >> (25-11)) */
-	mov	y2, f		/* y2 = f */
-	ROR	y0, (11-6)	/* y0 = (e >> (11-6)) ^ (e >> (25-6)) */
-		vpsrlq	XTMP3, XTMP2, 17	/* XTMP2 = W[-2] ror 17 {xDxC} */
-	xor	y1, a		/* y1 = a ^ (a >> (22-13) */
-	xor	y2, g		/* y2 = f^g */
-		vpsrlq	X0, XTMP2, 19	/* XTMP3 = W[-2] ror 19 {xDxC} */
-	xor	y0, e		/* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */
-	and	y2, e		/* y2 = (f^g)&e */
-	ROR	y1, (13-2)	/* y1 = (a >> (13-2)) ^ (a >> (22-2)) */
-		vpsrld	XTMP2, XTMP2,    10	/* X0 = W[-2] >> 10 {DDCC} */
-	xor	y1, a		/* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */
-	ROR	y0, 6		/* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */
-	xor	y2, g		/* y2 = CH = ((f^g)&e)^g */
-		vpxor	XTMP2, XTMP2, XTMP3
-	ROR	y1, 2		/* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */
-	add	y2, y0		/* y2 = S1 + CH */
-	add	y2, [rsp + _XFER + 3*4]	/* y2 = k + w + S1 + CH */
-		vpxor	X0, X0, XTMP2	/* X0 = s1 {xDxC} */
-	mov	y0, a		/* y0 = a */
-	add	h, y2		/* h = h + S1 + CH + k + w */
-	mov	y2, a		/* y2 = a */
-		vpshufb	X0, X0, SHUF_DC00	/* X0 = s1 {DC00} */
-	or	y0, c		/* y0 = a|c */
-	add	d, h		/* d = d + h + S1 + CH + k + w */
-	and	y2, c		/* y2 = a&c */
-		vpaddd	X0, X0, XTMP0	/* X0 = {W[3], W[2], W[1], W[0]} */
-	and	y0, b		/* y0 = (a|c)&b */
-	add	h, y1		/* h = h + S1 + CH + k + w + S0 */
-	or	y0, y2		/* y0 = MAJ = (a|c)&b)|(a&c) */
+#define FOUR_ROUNDS_AND_SCHED_3(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
+	mov	y0, e		/* y0 = e */; \
+	ROR(	y0, (25-11))	/* y0 = e >> (25-11) */; \
+	mov	y1, a		/* y1 = a */; \
+	ROR(	y1, (22-13))	/* y1 = a >> (22-13) */; \
+	xor	y0, e		/* y0 = e ^ (e >> (25-11)) */; \
+	mov	y2, f		/* y2 = f */; \
+	ROR(	y0, (11-6))	/* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
+		vpsrlq	XTMP3, XTMP2, 17	/* XTMP2 = W[-2] ror 17 {xDxC} */; \
+	xor	y1, a		/* y1 = a ^ (a >> (22-13) */; \
+	xor	y2, g		/* y2 = f^g */; \
+		vpsrlq	X0, XTMP2, 19	/* XTMP3 = W[-2] ror 19 {xDxC} */; \
+	xor	y0, e		/* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
+	and	y2, e		/* y2 = (f^g)&e */; \
+	ROR(	y1, (13-2))	/* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
+		vpsrld	XTMP2, XTMP2,    10	/* X0 = W[-2] >> 10 {DDCC} */; \
+	xor	y1, a		/* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
+	ROR(	y0, 6)		/* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \
+	xor	y2, g		/* y2 = CH = ((f^g)&e)^g */; \
+		vpxor	XTMP2, XTMP2, XTMP3; \
+	ROR(	y1, 2)		/* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
+	add	y2, y0		/* y2 = S1 + CH */; \
+	add	y2, [rsp + _XFER + 3*4]	/* y2 = k + w + S1 + CH */; \
+		vpxor	X0, X0, XTMP2	/* X0 = s1 {xDxC} */; \
+	mov	y0, a		/* y0 = a */; \
+	add	h, y2		/* h = h + S1 + CH + k + w */; \
+	mov	y2, a		/* y2 = a */; \
+		vpshufb	X0, X0, SHUF_DC00	/* X0 = s1 {DC00} */; \
+	or	y0, c		/* y0 = a|c */; \
+	add	d, h		/* d = d + h + S1 + CH + k + w */; \
+	and	y2, c		/* y2 = a&c */; \
+		vpaddd	X0, X0, XTMP0	/* X0 = {W[3], W[2], W[1], W[0]} */; \
+	and	y0, b		/* y0 = (a|c)&b */; \
+	add	h, y1		/* h = h + S1 + CH + k + w + S0 */; \
+	or	y0, y2		/* y0 = MAJ = (a|c)&b)|(a&c) */; \
 	lea	h, [h + y0]	/* h = h + S1 + CH + k + w + S0 + MAJ */
 
-ROTATE_ARGS
-rotate_Xs
-.endm
+#define FOUR_ROUNDS_AND_SCHED(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
+	FOUR_ROUNDS_AND_SCHED_0(X0, X1, X2, X3, a, b, c, d, e, f, g, h); \
+	FOUR_ROUNDS_AND_SCHED_1(X0, X1, X2, X3, h, a, b, c, d, e, f, g); \
+	FOUR_ROUNDS_AND_SCHED_2(X0, X1, X2, X3, g, h, a, b, c, d, e, f); \
+	FOUR_ROUNDS_AND_SCHED_3(X0, X1, X2, X3, f, g, h, a, b, c, d, e);
 
 /* input is [rsp + _XFER + %1 * 4] */
-.macro DO_ROUND i1
-	mov	y0, e		/* y0 = e */
-	ROR	y0, (25-11)	/* y0 = e >> (25-11) */
-	mov	y1, a		/* y1 = a */
-	xor	y0, e		/* y0 = e ^ (e >> (25-11)) */
-	ROR	y1, (22-13)	/* y1 = a >> (22-13) */
-	mov	y2, f		/* y2 = f */
-	xor	y1, a		/* y1 = a ^ (a >> (22-13) */
-	ROR	y0, (11-6)	/* y0 = (e >> (11-6)) ^ (e >> (25-6)) */
-	xor	y2, g		/* y2 = f^g */
-	xor	y0, e		/* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */
-	ROR	y1, (13-2)	/* y1 = (a >> (13-2)) ^ (a >> (22-2)) */
-	and	y2, e		/* y2 = (f^g)&e */
-	xor	y1, a		/* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */
-	ROR	y0, 6		/* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */
-	xor	y2, g		/* y2 = CH = ((f^g)&e)^g */
-	add	y2, y0		/* y2 = S1 + CH */
-	ROR	y1, 2		/* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */
-	add	y2, [rsp + _XFER + \i1 * 4]	/* y2 = k + w + S1 + CH */
-	mov	y0, a		/* y0 = a */
-	add	h, y2		/* h = h + S1 + CH + k + w */
-	mov	y2, a		/* y2 = a */
-	or	y0, c		/* y0 = a|c */
-	add	d, h		/* d = d + h + S1 + CH + k + w */
-	and	y2, c		/* y2 = a&c */
-	and	y0, b		/* y0 = (a|c)&b */
-	add	h, y1		/* h = h + S1 + CH + k + w + S0 */
-	or	y0, y2		/* y0 = MAJ = (a|c)&b)|(a&c) */
+#define DO_ROUND(i1, a, b, c, d, e, f, g, h) \
+	mov	y0, e		/* y0 = e */; \
+	ROR(	y0, (25-11))	/* y0 = e >> (25-11) */; \
+	mov	y1, a		/* y1 = a */; \
+	xor	y0, e		/* y0 = e ^ (e >> (25-11)) */; \
+	ROR(	y1, (22-13))	/* y1 = a >> (22-13) */; \
+	mov	y2, f		/* y2 = f */; \
+	xor	y1, a		/* y1 = a ^ (a >> (22-13) */; \
+	ROR(	y0, (11-6))	/* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
+	xor	y2, g		/* y2 = f^g */; \
+	xor	y0, e		/* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
+	ROR(	y1, (13-2))	/* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
+	and	y2, e		/* y2 = (f^g)&e */; \
+	xor	y1, a		/* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
+	ROR(	y0, 6)		/* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \
+	xor	y2, g		/* y2 = CH = ((f^g)&e)^g */; \
+	add	y2, y0		/* y2 = S1 + CH */; \
+	ROR(	y1, 2)		/* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
+	add	y2, [rsp + _XFER + i1 * 4]	/* y2 = k + w + S1 + CH */; \
+	mov	y0, a		/* y0 = a */; \
+	add	h, y2		/* h = h + S1 + CH + k + w */; \
+	mov	y2, a		/* y2 = a */; \
+	or	y0, c		/* y0 = a|c */; \
+	add	d, h		/* d = d + h + S1 + CH + k + w */; \
+	and	y2, c		/* y2 = a&c */; \
+	and	y0, b		/* y0 = (a|c)&b */; \
+	add	h, y1		/* h = h + S1 + CH + k + w + S0 */; \
+	or	y0, y2		/* y0 = MAJ = (a|c)&b)|(a&c) */; \
 	lea	h, [h + y0]	/* h = h + S1 + CH + k + w + S0 + MAJ */
-	ROTATE_ARGS
-.endm
 
 /*
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -410,10 +384,10 @@ _gcry_sha256_transform_amd64_avx:
 	lea	TBL, [.LK256 ADD_RIP]
 
 	/* byte swap first 16 dwords */
-	COPY_XMM_AND_BSWAP	X0, [INP + 0*16], BYTE_FLIP_MASK
-	COPY_XMM_AND_BSWAP	X1, [INP + 1*16], BYTE_FLIP_MASK
-	COPY_XMM_AND_BSWAP	X2, [INP + 2*16], BYTE_FLIP_MASK
-	COPY_XMM_AND_BSWAP	X3, [INP + 3*16], BYTE_FLIP_MASK
+	COPY_XMM_AND_BSWAP(X0, [INP + 0*16], BYTE_FLIP_MASK)
+	COPY_XMM_AND_BSWAP(X1, [INP + 1*16], BYTE_FLIP_MASK)
+	COPY_XMM_AND_BSWAP(X2, [INP + 2*16], BYTE_FLIP_MASK)
+	COPY_XMM_AND_BSWAP(X3, [INP + 3*16], BYTE_FLIP_MASK)
 
 	mov	[rsp + _INP], INP
 
@@ -423,20 +397,20 @@ _gcry_sha256_transform_amd64_avx:
 .Loop1:
 	vpaddd	XFER, X0, [TBL + 0*16]
 	vmovdqa	[rsp + _XFER], XFER
-	FOUR_ROUNDS_AND_SCHED
+	FOUR_ROUNDS_AND_SCHED(X0, X1, X2, X3, a, b, c, d, e, f, g, h)
 
-	vpaddd	XFER, X0, [TBL + 1*16]
+	vpaddd	XFER, X1, [TBL + 1*16]
 	vmovdqa	[rsp + _XFER], XFER
-	FOUR_ROUNDS_AND_SCHED
+	FOUR_ROUNDS_AND_SCHED(X1, X2, X3, X0, e, f, g, h, a, b, c, d)
 
-	vpaddd	XFER, X0, [TBL + 2*16]
+	vpaddd	XFER, X2, [TBL + 2*16]
 	vmovdqa	[rsp + _XFER], XFER
-	FOUR_ROUNDS_AND_SCHED
+	FOUR_ROUNDS_AND_SCHED(X2, X3, X0, X1, a, b, c, d, e, f, g, h)
 
-	vpaddd	XFER, X0, [TBL + 3*16]
+	vpaddd	XFER, X3, [TBL + 3*16]
 	vmovdqa	[rsp + _XFER], XFER
 	add	TBL, 4*16
-	FOUR_ROUNDS_AND_SCHED
+	FOUR_ROUNDS_AND_SCHED(X3, X0, X1, X2, e, f, g, h, a, b, c, d)
 
 	sub	SRND, 1
 	jne	.Loop1
@@ -445,17 +419,17 @@ _gcry_sha256_transform_amd64_avx:
 .Loop2:
 	vpaddd	X0, X0, [TBL + 0*16]
 	vmovdqa	[rsp + _XFER], X0
-	DO_ROUND	0
-	DO_ROUND	1
-	DO_ROUND	2
-	DO_ROUND	3
+	DO_ROUND(0, a, b, c, d, e, f, g, h)
+	DO_ROUND(1, h, a, b, c, d, e, f, g)
+	DO_ROUND(2, g, h, a, b, c, d, e, f)
+	DO_ROUND(3, f, g, h, a, b, c, d, e)
 	vpaddd	X1, X1, [TBL + 1*16]
 	vmovdqa	[rsp + _XFER], X1
 	add	TBL, 2*16
-	DO_ROUND	0
-	DO_ROUND	1
-	DO_ROUND	2
-	DO_ROUND	3
+	DO_ROUND(0, e, f, g, h, a, b, c, d)
+	DO_ROUND(1, d, e, f, g, h, a, b, c)
+	DO_ROUND(2, c, d, e, f, g, h, a, b)
+	DO_ROUND(3, b, c, d, e, f, g, h, a)
 
 	vmovdqa	X0, X2
 	vmovdqa	X1, X3
@@ -463,14 +437,14 @@ _gcry_sha256_transform_amd64_avx:
 	sub	SRND, 1
 	jne	.Loop2
 
-	addm	[4*0 + CTX],a
-	addm	[4*1 + CTX],b
-	addm	[4*2 + CTX],c
-	addm	[4*3 + CTX],d
-	addm	[4*4 + CTX],e
-	addm	[4*5 + CTX],f
-	addm	[4*6 + CTX],g
-	addm	[4*7 + CTX],h
+	addm([4*0 + CTX],a)
+	addm([4*1 + CTX],b)
+	addm([4*2 + CTX],c)
+	addm([4*3 + CTX],d)
+	addm([4*4 + CTX],e)
+	addm([4*5 + CTX],f)
+	addm([4*6 + CTX],g)
+	addm([4*7 + CTX],h)
 
 	mov	INP, [rsp + _INP]
 	add	INP, 64
diff --git a/cipher/sha256-avx2-bmi2-amd64.S b/cipher/sha256-avx2-bmi2-amd64.S
index 52be1a07..faefba17 100644
--- a/cipher/sha256-avx2-bmi2-amd64.S
+++ b/cipher/sha256-avx2-bmi2-amd64.S
@@ -70,226 +70,171 @@
 
 /*  addm [mem], reg */
 /*  Add reg to mem using reg-mem add and store */
-.macro addm p1 p2
-	add	\p2, \p1
-	mov	\p1, \p2
-.endm
+#define addm(p1, p2) \
+	add	p2, p1; \
+	mov	p1, p2;
 
 /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */
 
-X0 = ymm4
-X1 = ymm5
-X2 = ymm6
-X3 = ymm7
+#define X0 ymm4
+#define X1 ymm5
+#define X2 ymm6
+#define X3 ymm7
 
 /*  XMM versions of above */
-XWORD0 = xmm4
-XWORD1 = xmm5
-XWORD2 = xmm6
-XWORD3 = xmm7
-
-XTMP0 = ymm0
-XTMP1 = ymm1
-XTMP2 = ymm2
-XTMP3 = ymm3
-XTMP4 = ymm8
-XFER =  ymm9
-XTMP5 = ymm11
-
-SHUF_00BA = ymm10 /*  shuffle xBxA -> 00BA */
-SHUF_DC00 = ymm12 /*  shuffle xDxC -> DC00 */
-BYTE_FLIP_MASK = ymm13
-
-X_BYTE_FLIP_MASK = xmm13 /*  XMM version of BYTE_FLIP_MASK */
-
-NUM_BLKS = rdx	/*  3rd arg */
-CTX =	rsi   	/*  2nd arg */
-INP =	rdi	/*  1st arg */
-c =	ecx
-d =	r8d
-e =	edx	/*  clobbers NUM_BLKS */
-y3 =	edi	/*  clobbers INP */
-
-TBL =	rbp
-SRND =	CTX	/*  SRND is same register as CTX */
-
-a =	eax
-b =	ebx
-f =	r9d
-g =	r10d
-h =	r11d
-old_h =	r11d
-
-T1 = r12d
-y0 = r13d
-y1 = r14d
-y2 = r15d
-
-
-_XFER_SIZE	= 2*64*4	/*  2 blocks, 64 rounds, 4 bytes/round */
-_XMM_SAVE_SIZE  = 0
-_INP_END_SIZE	= 8
-_INP_SIZE	= 8
-_CTX_SIZE	= 8
-_RSP_SIZE	= 8
-
-_XFER		= 0
-_XMM_SAVE	= _XFER     + _XFER_SIZE
-_INP_END	= _XMM_SAVE + _XMM_SAVE_SIZE
-_INP 		= _INP_END  + _INP_END_SIZE
-_CTX		= _INP      + _INP_SIZE
-_RSP		= _CTX      + _CTX_SIZE
-STACK_SIZE	= _RSP      + _RSP_SIZE
-
-/*  rotate_Xs */
-/*  Rotate values of symbols X0...X3 */
-.macro rotate_Xs
-X_ = X0
-X0 = X1
-X1 = X2
-X2 = X3
-X3 = X_
-.endm
-
-/*  ROTATE_ARGS */
-/*  Rotate values of symbols a...h */
-.macro ROTATE_ARGS
-old_h = h
-TMP_ = h
-h = g
-g = f
-f = e
-e = d
-d = c
-c = b
-b = a
-a = TMP_
-.endm
-
-.macro ONE_ROUND_PART1 XFER
-	/* h += Sum1 (e) + Ch (e, f, g) + (k[t] + w[0]);
-	 * d += h;
-	 * h += Sum0 (a) + Maj (a, b, c);
-	 *
-	 * Ch(x, y, z) => ((x & y) + (~x & z))
-	 * Maj(x, y, z) => ((x & y) + (z & (x ^ y)))
-	 */
-
-	mov y3, e
-	add h, [\XFER]
-	and y3, f
-	rorx y0, e, 25
-	rorx y1, e, 11
+#define XWORD0 xmm4
+#define XWORD1 xmm5
+#define XWORD2 xmm6
+#define XWORD3 xmm7
+
+#define XTMP0 ymm0
+#define XTMP1 ymm1
+#define XTMP2 ymm2
+#define XTMP3 ymm3
+#define XTMP4 ymm8
+#define XFER ymm9
+#define XTMP5 ymm11
+
+#define SHUF_00BA ymm10 /*  shuffle xBxA -> 00BA */
+#define SHUF_DC00 ymm12 /*  shuffle xDxC -> DC00 */
+#define BYTE_FLIP_MASK ymm13
+
+#define X_BYTE_FLIP_MASK xmm13 /*  XMM version of BYTE_FLIP_MASK */
+
+#define NUM_BLKS rdx /*  3rd arg */
+#define CTX rsi      /*  2nd arg */
+#define INP rdi      /*  1st arg */
+#define c ecx
+#define d r8d
+#define e edx        /*  clobbers NUM_BLKS */
+#define y3 edi       /*  clobbers INP */
+
+#define TBL rbp
+#define SRND CTX     /*  SRND is same register as CTX */
+
+#define a eax
+#define b ebx
+#define f r9d
+#define g r10d
+#define h r11d
+#define old_h r11d
+
+#define T1 r12d
+#define y0 r13d
+#define y1 r14d
+#define y2 r15d
+
+
+#define _XFER_SIZE 2*64*4	/*  2 blocks, 64 rounds, 4 bytes/round */
+#define _XMM_SAVE_SIZE 0
+#define _INP_END_SIZE 8
+#define _INP_SIZE 8
+#define _CTX_SIZE 8
+#define _RSP_SIZE 8
+
+#define _XFER 0
+#define _XMM_SAVE  _XFER     + _XFER_SIZE
+#define _INP_END   _XMM_SAVE + _XMM_SAVE_SIZE
+#define _INP       _INP_END  + _INP_END_SIZE
+#define _CTX       _INP      + _INP_SIZE
+#define _RSP       _CTX      + _CTX_SIZE
+#define STACK_SIZE _RSP      + _RSP_SIZE
+
+#define ONE_ROUND_PART1(XFERIN, a, b, c, d, e, f, g, h) \
+	/* h += Sum1 (e) + Ch (e, f, g) + (k[t] + w[0]); */ \
+	/* d += h; */ \
+	/* h += Sum0 (a) + Maj (a, b, c); */ \
+	\
+	/* Ch(x, y, z) => ((x & y) + (~x & z)) */ \
+	/* Maj(x, y, z) => ((x & y) + (z & (x ^ y))) */ \
+	\
+	mov y3, e; \
+	add h, [XFERIN]; \
+	and y3, f; \
+	rorx y0, e, 25; \
+	rorx y1, e, 11; \
+	lea h, [h + y3]; \
+	andn y3, e, g; \
+	rorx T1, a, 13; \
+	xor y0, y1; \
 	lea h, [h + y3]
-	andn y3, e, g
-	rorx T1, a, 13
-	xor y0, y1
-	lea h, [h + y3]
-.endm
-.macro ONE_ROUND_PART2
-	rorx y2, a, 22
-	rorx y1, e, 6
-	mov y3, a
-	xor T1, y2
-	xor y0, y1
-	xor y3, b
-	lea h, [h + y0]
-	mov y0, a
-	rorx y2, a, 2
-	add d, h
-	and y3, c
-	xor T1, y2
-	lea h, [h + y3]
-	lea h, [h + T1]
-	and y0, b
-	lea h, [h + y0]
-.endm
-
-.macro ONE_ROUND XFER
-	ONE_ROUND_PART1 \XFER
-	ONE_ROUND_PART2
-.endm
-
-.macro FOUR_ROUNDS_AND_SCHED XFER, XFEROUT
-/* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */
-
-		vpalignr	XTMP0, X3, X2, 4	/*  XTMP0 = W[-7] */
-		vpaddd	XTMP0, XTMP0, X0	/*  XTMP0 = W[-7] + W[-16]; y1 = (e >> 6); S1 */
-		vpalignr	XTMP1, X1, X0, 4	/*  XTMP1 = W[-15] */
-		vpsrld	XTMP2, XTMP1, 7
-		vpslld	XTMP3, XTMP1, (32-7)
-		vpor	XTMP3, XTMP3, XTMP2	/*  XTMP3 = W[-15] ror 7 */
-		vpsrld	XTMP2, XTMP1,18
-
-	ONE_ROUND 0*4+\XFER
-	ROTATE_ARGS
-
-/* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */
-
-		vpsrld	XTMP4, XTMP1, 3	/*  XTMP4 = W[-15] >> 3 */
-		vpslld	XTMP1, XTMP1, (32-18)
-		vpxor	XTMP3, XTMP3, XTMP1
-		vpxor	XTMP3, XTMP3, XTMP2	/*  XTMP3 = W[-15] ror 7 ^ W[-15] ror 18 */
-		vpxor	XTMP1, XTMP3, XTMP4	/*  XTMP1 = s0 */
-		vpshufd	XTMP2, X3, 0b11111010	/*  XTMP2 = W[-2] {BBAA} */
-		vpaddd	XTMP0, XTMP0, XTMP1	/*  XTMP0 = W[-16] + W[-7] + s0 */
-		vpsrld	XTMP4, XTMP2, 10	/*  XTMP4 = W[-2] >> 10 {BBAA} */
-
-	ONE_ROUND 1*4+\XFER
-	ROTATE_ARGS
-
-/* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */
 
-		vpsrlq	XTMP3, XTMP2, 19	/*  XTMP3 = W[-2] ror 19 {xBxA} */
-		vpsrlq	XTMP2, XTMP2, 17	/*  XTMP2 = W[-2] ror 17 {xBxA} */
-		vpxor	XTMP2, XTMP2, XTMP3
-		vpxor	XTMP4, XTMP4, XTMP2	/*  XTMP4 = s1 {xBxA} */
-		vpshufb	XTMP4, XTMP4, SHUF_00BA	/*  XTMP4 = s1 {00BA} */
-		vpaddd	XTMP0, XTMP0, XTMP4	/*  XTMP0 = {..., ..., W[1], W[0]} */
-		vpshufd	XTMP2, XTMP0, 0b1010000	/*  XTMP2 = W[-2] {DDCC} */
-
-	ONE_ROUND 2*4+\XFER
-	ROTATE_ARGS
-
-/* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */
-
-		vpsrld	XTMP5, XTMP2,   10	/*  XTMP5 = W[-2] >> 10 {DDCC} */
-		vpsrlq	XTMP3, XTMP2, 19	/*  XTMP3 = W[-2] ror 19 {xDxC} */
-		vpsrlq	XTMP2, XTMP2, 17	/*  XTMP2 = W[-2] ror 17 {xDxC} */
-		vpxor	XTMP2, XTMP2, XTMP3
-		vpxor	XTMP5, XTMP5, XTMP2	/*  XTMP5 = s1 {xDxC} */
-		vpshufb	XTMP5, XTMP5, SHUF_DC00	/*  XTMP5 = s1 {DC00} */
-		vpaddd	X0, XTMP5, XTMP0	/*  X0 = {W[3], W[2], W[1], W[0]} */
-		vpaddd	XFER, X0, [TBL + \XFEROUT]
-
-	ONE_ROUND_PART1 3*4+\XFER
-		vmovdqa [rsp + _XFER + \XFEROUT], XFER
-	ONE_ROUND_PART2
-	ROTATE_ARGS
-	rotate_Xs
-.endm
-
-.macro DO_4ROUNDS XFER
-/* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;; */
-
-	ONE_ROUND 0*4+\XFER
-	ROTATE_ARGS
-
-/* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;; */
-
-	ONE_ROUND 1*4+\XFER
-	ROTATE_ARGS
-
-/* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */
-
-	ONE_ROUND 2*4+\XFER
-	ROTATE_ARGS
-
-/* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;; */
+#define ONE_ROUND_PART2(a, b, c, d, e, f, g, h) \
+	rorx y2, a, 22; \
+	rorx y1, e, 6; \
+	mov y3, a; \
+	xor T1, y2; \
+	xor y0, y1; \
+	xor y3, b; \
+	lea h, [h + y0]; \
+	mov y0, a; \
+	rorx y2, a, 2; \
+	add d, h; \
+	and y3, c; \
+	xor T1, y2; \
+	lea h, [h + y3]; \
+	lea h, [h + T1]; \
+	and y0, b; \
+	lea h, [h + y0]
 
-	ONE_ROUND 3*4+\XFER
-	ROTATE_ARGS
-.endm
+#define ONE_ROUND(XFER, a, b, c, d, e, f, g, h) \
+	ONE_ROUND_PART1(XFER, a, b, c, d, e, f, g, h); \
+	ONE_ROUND_PART2(a, b, c, d, e, f, g, h)
+
+#define FOUR_ROUNDS_AND_SCHED(XFERIN, XFEROUT, X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
+	/* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \
+		vpalignr	XTMP0, X3, X2, 4	/*  XTMP0 = W[-7] */; \
+		vpaddd	XTMP0, XTMP0, X0	/*  XTMP0 = W[-7] + W[-16]; y1 = (e >> 6); S1 */; \
+		vpalignr	XTMP1, X1, X0, 4	/*  XTMP1 = W[-15] */; \
+		vpsrld	XTMP2, XTMP1, 7; \
+		vpslld	XTMP3, XTMP1, (32-7); \
+		vpor	XTMP3, XTMP3, XTMP2	/*  XTMP3 = W[-15] ror 7 */; \
+		vpsrld	XTMP2, XTMP1,18; \
+	\
+	ONE_ROUND(0*4+XFERIN, a, b, c, d, e, f, g, h); \
+	\
+	/* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \
+		vpsrld	XTMP4, XTMP1, 3	/*  XTMP4 = W[-15] >> 3 */; \
+		vpslld	XTMP1, XTMP1, (32-18); \
+		vpxor	XTMP3, XTMP3, XTMP1; \
+		vpxor	XTMP3, XTMP3, XTMP2	/*  XTMP3 = W[-15] ror 7 ^ W[-15] ror 18 */; \
+		vpxor	XTMP1, XTMP3, XTMP4	/*  XTMP1 = s0 */; \
+		vpshufd	XTMP2, X3, 0b11111010	/*  XTMP2 = W[-2] {BBAA} */; \
+		vpaddd	XTMP0, XTMP0, XTMP1	/*  XTMP0 = W[-16] + W[-7] + s0 */; \
+		vpsrld	XTMP4, XTMP2, 10	/*  XTMP4 = W[-2] >> 10 {BBAA} */; \
+	\
+	ONE_ROUND(1*4+XFERIN, h, a, b, c, d, e, f, g); \
+	\
+	/* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \
+		vpsrlq	XTMP3, XTMP2, 19	/*  XTMP3 = W[-2] ror 19 {xBxA} */; \
+		vpsrlq	XTMP2, XTMP2, 17	/*  XTMP2 = W[-2] ror 17 {xBxA} */; \
+		vpxor	XTMP2, XTMP2, XTMP3; \
+		vpxor	XTMP4, XTMP4, XTMP2	/*  XTMP4 = s1 {xBxA} */; \
+		vpshufb	XTMP4, XTMP4, SHUF_00BA	/*  XTMP4 = s1 {00BA} */; \
+		vpaddd	XTMP0, XTMP0, XTMP4	/*  XTMP0 = {..., ..., W[1], W[0]} */; \
+		vpshufd	XTMP2, XTMP0, 0b1010000	/*  XTMP2 = W[-2] {DDCC} */; \
+	\
+	ONE_ROUND(2*4+XFERIN, g, h, a, b, c, d, e, f); \
+	\
+	/* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \
+		vpsrld	XTMP5, XTMP2,   10	/*  XTMP5 = W[-2] >> 10 {DDCC} */; \
+		vpsrlq	XTMP3, XTMP2, 19	/*  XTMP3 = W[-2] ror 19 {xDxC} */; \
+		vpsrlq	XTMP2, XTMP2, 17	/*  XTMP2 = W[-2] ror 17 {xDxC} */; \
+		vpxor	XTMP2, XTMP2, XTMP3; \
+		vpxor	XTMP5, XTMP5, XTMP2	/*  XTMP5 = s1 {xDxC} */; \
+		vpshufb	XTMP5, XTMP5, SHUF_DC00	/*  XTMP5 = s1 {DC00} */; \
+		vpaddd	X0, XTMP5, XTMP0	/*  X0 = {W[3], W[2], W[1], W[0]} */; \
+		vpaddd	XFER, X0, [TBL + XFEROUT]; \
+	\
+	ONE_ROUND_PART1(3*4+XFERIN, f, g, h, a, b, c, d, e); \
+		vmovdqa [rsp + _XFER + XFEROUT], XFER; \
+	ONE_ROUND_PART2(f, g, h, a, b, c, d, e);
+
+#define DO_4ROUNDS(XFERIN, a, b, c, d, e, f, g, h) \
+	ONE_ROUND(0*4+XFERIN, a, b, c, d, e, f, g, h); \
+	ONE_ROUND(1*4+XFERIN, h, a, b, c, d, e, f, g); \
+	ONE_ROUND(2*4+XFERIN, g, h, a, b, c, d, e, f); \
+	ONE_ROUND(3*4+XFERIN, f, g, h, a, b, c, d, e)
 
 /*
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -391,32 +336,32 @@ _gcry_sha256_transform_amd64_avx2:
 
 .align 16
 .Loop1:
-	FOUR_ROUNDS_AND_SCHED	rsp + _XFER + SRND + 0*32, SRND + 4*32
-	FOUR_ROUNDS_AND_SCHED	rsp + _XFER + SRND + 1*32, SRND + 5*32
-	FOUR_ROUNDS_AND_SCHED	rsp + _XFER + SRND + 2*32, SRND + 6*32
-	FOUR_ROUNDS_AND_SCHED	rsp + _XFER + SRND + 3*32, SRND + 7*32
+	FOUR_ROUNDS_AND_SCHED(rsp + _XFER + SRND + 0*32, SRND + 4*32, X0, X1, X2, X3, a, b, c, d, e, f, g, h)
+	FOUR_ROUNDS_AND_SCHED(rsp + _XFER + SRND + 1*32, SRND + 5*32, X1, X2, X3, X0, e, f, g, h, a, b, c, d)
+	FOUR_ROUNDS_AND_SCHED(rsp + _XFER + SRND + 2*32, SRND + 6*32, X2, X3, X0, X1, a, b, c, d, e, f, g, h)
+	FOUR_ROUNDS_AND_SCHED(rsp + _XFER + SRND + 3*32, SRND + 7*32, X3, X0, X1, X2, e, f, g, h, a, b, c, d)
 
 	add	SRND, 4*32
 	cmp	SRND, 3 * 4*32
 	jb	.Loop1
 
 	/* ; Do last 16 rounds with no scheduling */
-	DO_4ROUNDS	rsp + _XFER + (3*4*32 + 0*32)
-	DO_4ROUNDS	rsp + _XFER + (3*4*32 + 1*32)
-	DO_4ROUNDS	rsp + _XFER + (3*4*32 + 2*32)
-	DO_4ROUNDS	rsp + _XFER + (3*4*32 + 3*32)
+	DO_4ROUNDS(rsp + _XFER + (3*4*32 + 0*32), a, b, c, d, e, f, g, h)
+	DO_4ROUNDS(rsp + _XFER + (3*4*32 + 1*32), e, f, g, h, a, b, c, d)
+	DO_4ROUNDS(rsp + _XFER + (3*4*32 + 2*32), a, b, c, d, e, f, g, h)
+	DO_4ROUNDS(rsp + _XFER + (3*4*32 + 3*32), e, f, g, h, a, b, c, d)
 
 	mov	CTX, [rsp + _CTX]
 	mov	INP, [rsp + _INP]
 
-	addm	[4*0 + CTX],a
-	addm	[4*1 + CTX],b
-	addm	[4*2 + CTX],c
-	addm	[4*3 + CTX],d
-	addm	[4*4 + CTX],e
-	addm	[4*5 + CTX],f
-	addm	[4*6 + CTX],g
-	addm	[4*7 + CTX],h
+	addm([4*0 + CTX],a)
+	addm([4*1 + CTX],b)
+	addm([4*2 + CTX],c)
+	addm([4*3 + CTX],d)
+	addm([4*4 + CTX],e)
+	addm([4*5 + CTX],f)
+	addm([4*6 + CTX],g)
+	addm([4*7 + CTX],h)
 
 	cmp	INP, [rsp + _INP_END]
 	ja	.Ldone_hash
@@ -425,8 +370,8 @@ _gcry_sha256_transform_amd64_avx2:
 	xor	SRND, SRND
 .align 16
 .Loop3:
-	DO_4ROUNDS	rsp + _XFER + SRND + 0*32 + 16
-	DO_4ROUNDS	rsp + _XFER + SRND + 1*32 + 16
+	DO_4ROUNDS(rsp + _XFER + SRND + 0*32 + 16, a, b, c, d, e, f, g, h)
+	DO_4ROUNDS(rsp + _XFER + SRND + 1*32 + 16, e, f, g, h, a, b, c, d)
 	add	SRND, 2*32
 	cmp	SRND, 4 * 4*32
 	jb .Loop3
@@ -435,14 +380,14 @@ _gcry_sha256_transform_amd64_avx2:
 	mov	INP, [rsp + _INP]
 	add	INP, 64
 
-	addm	[4*0 + CTX],a
-	addm	[4*1 + CTX],b
-	addm	[4*2 + CTX],c
-	addm	[4*3 + CTX],d
-	addm	[4*4 + CTX],e
-	addm	[4*5 + CTX],f
-	addm	[4*6 + CTX],g
-	addm	[4*7 + CTX],h
+	addm([4*0 + CTX],a)
+	addm([4*1 + CTX],b)
+	addm([4*2 + CTX],c)
+	addm([4*3 + CTX],d)
+	addm([4*4 + CTX],e)
+	addm([4*5 + CTX],f)
+	addm([4*6 + CTX],g)
+	addm([4*7 + CTX],h)
 
 	cmp	INP, [rsp + _INP_END]
 	jb	.Loop0
diff --git a/cipher/sha256-ssse3-amd64.S b/cipher/sha256-ssse3-amd64.S
index 0fb94c1b..098b0eb6 100644
--- a/cipher/sha256-ssse3-amd64.S
+++ b/cipher/sha256-ssse3-amd64.S
@@ -70,58 +70,56 @@
 
 /* addm [mem], reg
  * Add reg to mem using reg-mem add and store */
-.macro addm p1 p2
-	add	\p2, \p1
-	mov	\p1, \p2
-.endm
+#define addm(p1, p2) \
+	add	p2, p1; \
+	mov	p1, p2;
 
 /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
 
 /* COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
  * Load xmm with mem and byte swap each dword */
-.macro COPY_XMM_AND_BSWAP p1 p2 p3
-	MOVDQ \p1, \p2
-	pshufb \p1, \p3
-.endm
+#define COPY_XMM_AND_BSWAP(p1, p2, p3) \
+	MOVDQ p1, p2; \
+	pshufb p1, p3;
 
 /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/
 
-X0 = xmm4
-X1 = xmm5
-X2 = xmm6
-X3 = xmm7
+#define X0 xmm4
+#define X1 xmm5
+#define X2 xmm6
+#define X3 xmm7
 
-XTMP0 = xmm0
-XTMP1 = xmm1
-XTMP2 = xmm2
-XTMP3 = xmm3
-XTMP4 = xmm8
-XFER  = xmm9
+#define XTMP0 xmm0
+#define XTMP1 xmm1
+#define XTMP2 xmm2
+#define XTMP3 xmm3
+#define XTMP4 xmm8
+#define XFER xmm9
 
-SHUF_00BA = xmm10 /* shuffle xBxA -> 00BA */
-SHUF_DC00 = xmm11 /* shuffle xDxC -> DC00 */
-BYTE_FLIP_MASK = xmm12
+#define SHUF_00BA xmm10 /* shuffle xBxA -> 00BA */
+#define SHUF_DC00 xmm11 /* shuffle xDxC -> DC00 */
+#define BYTE_FLIP_MASK xmm12
 
-NUM_BLKS = rdx	/* 3rd arg */
-CTX = rsi	/* 2nd arg */
-INP = rdi	/* 1st arg */
+#define NUM_BLKS rdx	/* 3rd arg */
+#define CTX rsi	/* 2nd arg */
+#define INP rdi	/* 1st arg */
 
-SRND = rdi	/* clobbers INP */
-c = ecx
-d = r8d
-e = edx
+#define SRND rdi	/* clobbers INP */
+#define c ecx
+#define d r8d
+#define e edx
 
-TBL = rbp
-a = eax
-b = ebx
+#define TBL rbp
+#define a eax
+#define b ebx
 
-f = r9d
-g = r10d
-h = r11d
+#define f r9d
+#define g r10d
+#define h r11d
 
-y0 = r13d
-y1 = r14d
-y2 = r15d
+#define y0 r13d
+#define y1 r14d
+#define y2 r15d
 
 
 
@@ -138,230 +136,207 @@ y2 = r15d
 #define _XMM_SAVE	(_XFER     + _XFER_SIZE + _ALIGN_SIZE)
 #define STACK_SIZE	(_XMM_SAVE + _XMM_SAVE_SIZE)
 
-/* rotate_Xs
- * Rotate values of symbols X0...X3 */
-.macro rotate_Xs
-X_ = X0
-X0 = X1
-X1 = X2
-X2 = X3
-X3 = X_
-.endm
-
-/* ROTATE_ARGS
- * Rotate values of symbols a...h */
-.macro ROTATE_ARGS
-TMP_ = h
-h = g
-g = f
-f = e
-e = d
-d = c
-c = b
-b = a
-a = TMP_
-.endm
-
-.macro FOUR_ROUNDS_AND_SCHED
-		/* compute s0 four at a time and s1 two at a time
-		 * compute W[-16] + W[-7] 4 at a time */
-		movdqa	XTMP0, X3
-	mov	y0, e		/* y0 = e */
-	ror	y0, (25-11)	/* y0 = e >> (25-11) */
-	mov	y1, a		/* y1 = a */
-		palignr	XTMP0, X2, 4	/* XTMP0 = W[-7] */
-	ror	y1, (22-13)	/* y1 = a >> (22-13) */
-	xor	y0, e		/* y0 = e ^ (e >> (25-11)) */
-	mov	y2, f		/* y2 = f */
-	ror	y0, (11-6)	/* y0 = (e >> (11-6)) ^ (e >> (25-6)) */
-		movdqa	XTMP1, X1
-	xor	y1, a		/* y1 = a ^ (a >> (22-13) */
-	xor	y2, g		/* y2 = f^g */
-		paddd	XTMP0, X0	/* XTMP0 = W[-7] + W[-16] */
-	xor	y0, e		/* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */
-	and	y2, e		/* y2 = (f^g)&e */
-	ror	y1, (13-2)	/* y1 = (a >> (13-2)) ^ (a >> (22-2)) */
-		/* compute s0 */
-		palignr	XTMP1, X0, 4	/* XTMP1 = W[-15] */
-	xor	y1, a		/* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */
-	ror	y0, 6		/* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */
-	xor	y2, g		/* y2 = CH = ((f^g)&e)^g */
-		movdqa	XTMP2, XTMP1	/* XTMP2 = W[-15] */
-	ror	y1, 2		/* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */
-	add	y2, y0		/* y2 = S1 + CH */
-	add	y2, [rsp + _XFER + 0*4]	/* y2 = k + w + S1 + CH */
-		movdqa	XTMP3, XTMP1	/* XTMP3 = W[-15] */
-	mov	y0, a		/* y0 = a */
-	add	h, y2		/* h = h + S1 + CH + k + w */
-	mov	y2, a		/* y2 = a */
-		pslld	XTMP1, (32-7)
-	or	y0, c		/* y0 = a|c */
-	add	d, h		/* d = d + h + S1 + CH + k + w */
-	and	y2, c		/* y2 = a&c */
-		psrld	XTMP2, 7
-	and	y0, b		/* y0 = (a|c)&b */
-	add	h, y1		/* h = h + S1 + CH + k + w + S0 */
-		por	XTMP1, XTMP2	/* XTMP1 = W[-15] ror 7 */
-	or	y0, y2		/* y0 = MAJ = (a|c)&b)|(a&c) */
+
+#define FOUR_ROUNDS_AND_SCHED_0(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
+		/* compute s0 four at a time and s1 two at a time */; \
+		/* compute W[-16] + W[-7] 4 at a time */; \
+		movdqa	XTMP0, X3; \
+	mov	y0, e		/* y0 = e */; \
+	ror	y0, (25-11)	/* y0 = e >> (25-11) */; \
+	mov	y1, a		/* y1 = a */; \
+		palignr	XTMP0, X2, 4	/* XTMP0 = W[-7] */; \
+	ror	y1, (22-13)	/* y1 = a >> (22-13) */; \
+	xor	y0, e		/* y0 = e ^ (e >> (25-11)) */; \
+	mov	y2, f		/* y2 = f */; \
+	ror	y0, (11-6)	/* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
+		movdqa	XTMP1, X1; \
+	xor	y1, a		/* y1 = a ^ (a >> (22-13) */; \
+	xor	y2, g		/* y2 = f^g */; \
+		paddd	XTMP0, X0	/* XTMP0 = W[-7] + W[-16] */; \
+	xor	y0, e		/* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
+	and	y2, e		/* y2 = (f^g)&e */; \
+	ror	y1, (13-2)	/* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
+		/* compute s0 */; \
+		palignr	XTMP1, X0, 4	/* XTMP1 = W[-15] */; \
+	xor	y1, a		/* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
+	ror	y0, 6		/* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \
+	xor	y2, g		/* y2 = CH = ((f^g)&e)^g */; \
+		movdqa	XTMP2, XTMP1	/* XTMP2 = W[-15] */; \
+	ror	y1, 2		/* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
+	add	y2, y0		/* y2 = S1 + CH */; \
+	add	y2, [rsp + _XFER + 0*4]	/* y2 = k + w + S1 + CH */; \
+		movdqa	XTMP3, XTMP1	/* XTMP3 = W[-15] */; \
+	mov	y0, a		/* y0 = a */; \
+	add	h, y2		/* h = h + S1 + CH + k + w */; \
+	mov	y2, a		/* y2 = a */; \
+		pslld	XTMP1, (32-7); \
+	or	y0, c		/* y0 = a|c */; \
+	add	d, h		/* d = d + h + S1 + CH + k + w */; \
+	and	y2, c		/* y2 = a&c */; \
+		psrld	XTMP2, 7; \
+	and	y0, b		/* y0 = (a|c)&b */; \
+	add	h, y1		/* h = h + S1 + CH + k + w + S0 */; \
+		por	XTMP1, XTMP2	/* XTMP1 = W[-15] ror 7 */; \
+	or	y0, y2		/* y0 = MAJ = (a|c)&b)|(a&c) */; \
 	lea	h, [h + y0]	/* h = h + S1 + CH + k + w + S0 + MAJ */
 
-ROTATE_ARGS
-		movdqa	XTMP2, XTMP3	/* XTMP2 = W[-15] */
-	mov	y0, e		/* y0 = e */
-	mov	y1, a		/* y1 = a */
-		movdqa	XTMP4, XTMP3	/* XTMP4 = W[-15] */
-	ror	y0, (25-11)	/* y0 = e >> (25-11) */
-	xor	y0, e		/* y0 = e ^ (e >> (25-11)) */
-	mov	y2, f		/* y2 = f */
-	ror	y1, (22-13)	/* y1 = a >> (22-13) */
-		pslld	XTMP3, (32-18)
-	xor	y1, a		/* y1 = a ^ (a >> (22-13) */
-	ror	y0, (11-6)	/* y0 = (e >> (11-6)) ^ (e >> (25-6)) */
-	xor	y2, g		/* y2 = f^g */
-		psrld	XTMP2, 18
-	ror	y1, (13-2)	/* y1 = (a >> (13-2)) ^ (a >> (22-2)) */
-	xor	y0, e		/* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */
-	and	y2, e		/* y2 = (f^g)&e */
-	ror	y0, 6		/* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */
-		pxor	XTMP1, XTMP3
-	xor	y1, a		/* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */
-	xor	y2, g		/* y2 = CH = ((f^g)&e)^g */
-		psrld	XTMP4, 3	/* XTMP4 = W[-15] >> 3 */
-	add	y2, y0		/* y2 = S1 + CH */
-	add	y2, [rsp + _XFER + 1*4]	/* y2 = k + w + S1 + CH */
-	ror	y1, 2		/* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */
-		pxor	XTMP1, XTMP2	/* XTMP1 = W[-15] ror 7 ^ W[-15] ror 18 */
-	mov	y0, a		/* y0 = a */
-	add	h, y2		/* h = h + S1 + CH + k + w */
-	mov	y2, a		/* y2 = a */
-		pxor	XTMP1, XTMP4	/* XTMP1 = s0 */
-	or	y0, c		/* y0 = a|c */
-	add	d, h		/* d = d + h + S1 + CH + k + w */
-	and	y2, c		/* y2 = a&c */
-		/* compute low s1 */
-		pshufd	XTMP2, X3, 0b11111010	/* XTMP2 = W[-2] {BBAA} */
-	and	y0, b		/* y0 = (a|c)&b */
-	add	h, y1		/* h = h + S1 + CH + k + w + S0 */
-		paddd	XTMP0, XTMP1	/* XTMP0 = W[-16] + W[-7] + s0 */
-	or	y0, y2		/* y0 = MAJ = (a|c)&b)|(a&c) */
+#define FOUR_ROUNDS_AND_SCHED_1(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
+		movdqa	XTMP2, XTMP3	/* XTMP2 = W[-15] */; \
+	mov	y0, e		/* y0 = e */; \
+	mov	y1, a		/* y1 = a */; \
+		movdqa	XTMP4, XTMP3	/* XTMP4 = W[-15] */; \
+	ror	y0, (25-11)	/* y0 = e >> (25-11) */; \
+	xor	y0, e		/* y0 = e ^ (e >> (25-11)) */; \
+	mov	y2, f		/* y2 = f */; \
+	ror	y1, (22-13)	/* y1 = a >> (22-13) */; \
+		pslld	XTMP3, (32-18); \
+	xor	y1, a		/* y1 = a ^ (a >> (22-13) */; \
+	ror	y0, (11-6)	/* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
+	xor	y2, g		/* y2 = f^g */; \
+		psrld	XTMP2, 18; \
+	ror	y1, (13-2)	/* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
+	xor	y0, e		/* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
+	and	y2, e		/* y2 = (f^g)&e */; \
+	ror	y0, 6		/* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \
+		pxor	XTMP1, XTMP3; \
+	xor	y1, a		/* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
+	xor	y2, g		/* y2 = CH = ((f^g)&e)^g */; \
+		psrld	XTMP4, 3	/* XTMP4 = W[-15] >> 3 */; \
+	add	y2, y0		/* y2 = S1 + CH */; \
+	add	y2, [rsp + _XFER + 1*4]	/* y2 = k + w + S1 + CH */; \
+	ror	y1, 2		/* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
+		pxor	XTMP1, XTMP2	/* XTMP1 = W[-15] ror 7 ^ W[-15] ror 18 */; \
+	mov	y0, a		/* y0 = a */; \
+	add	h, y2		/* h = h + S1 + CH + k + w */; \
+	mov	y2, a		/* y2 = a */; \
+		pxor	XTMP1, XTMP4	/* XTMP1 = s0 */; \
+	or	y0, c		/* y0 = a|c */; \
+	add	d, h		/* d = d + h + S1 + CH + k + w */; \
+	and	y2, c		/* y2 = a&c */; \
+		/* compute low s1 */; \
+		pshufd	XTMP2, X3, 0b11111010	/* XTMP2 = W[-2] {BBAA} */; \
+	and	y0, b		/* y0 = (a|c)&b */; \
+	add	h, y1		/* h = h + S1 + CH + k + w + S0 */; \
+		paddd	XTMP0, XTMP1	/* XTMP0 = W[-16] + W[-7] + s0 */; \
+	or	y0, y2		/* y0 = MAJ = (a|c)&b)|(a&c) */; \
 	lea	h, [h + y0]	/* h = h + S1 + CH + k + w + S0 + MAJ */
 
-ROTATE_ARGS
-		movdqa	XTMP3, XTMP2	/* XTMP3 = W[-2] {BBAA} */
-	mov	y0, e		/* y0 = e */
-	mov	y1, a		/* y1 = a */
-	ror	y0, (25-11)	/* y0 = e >> (25-11) */
-		movdqa	XTMP4, XTMP2	/* XTMP4 = W[-2] {BBAA} */
-	xor	y0, e		/* y0 = e ^ (e >> (25-11)) */
-	ror	y1, (22-13)	/* y1 = a >> (22-13) */
-	mov	y2, f		/* y2 = f */
-	xor	y1, a		/* y1 = a ^ (a >> (22-13) */
-	ror	y0, (11-6)	/* y0 = (e >> (11-6)) ^ (e >> (25-6)) */
-		psrlq	XTMP2, 17	/* XTMP2 = W[-2] ror 17 {xBxA} */
-	xor	y2, g		/* y2 = f^g */
-		psrlq	XTMP3, 19	/* XTMP3 = W[-2] ror 19 {xBxA} */
-	xor	y0, e		/* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */
-	and	y2, e		/* y2 = (f^g)&e */
-		psrld	XTMP4, 10	/* XTMP4 = W[-2] >> 10 {BBAA} */
-	ror	y1, (13-2)	/* y1 = (a >> (13-2)) ^ (a >> (22-2)) */
-	xor	y1, a		/* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */
-	xor	y2, g		/* y2 = CH = ((f^g)&e)^g */
-	ror	y0, 6		/* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */
-		pxor	XTMP2, XTMP3
-	add	y2, y0		/* y2 = S1 + CH */
-	ror	y1, 2		/* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */
-	add	y2, [rsp + _XFER + 2*4]	/* y2 = k + w + S1 + CH */
-		pxor	XTMP4, XTMP2	/* XTMP4 = s1 {xBxA} */
-	mov	y0, a		/* y0 = a */
-	add	h, y2		/* h = h + S1 + CH + k + w */
-	mov	y2, a		/* y2 = a */
-		pshufb	XTMP4, SHUF_00BA	/* XTMP4 = s1 {00BA} */
-	or	y0, c		/* y0 = a|c */
-	add	d, h		/* d = d + h + S1 + CH + k + w */
-	and	y2, c		/* y2 = a&c */
-		paddd	XTMP0, XTMP4	/* XTMP0 = {..., ..., W[1], W[0]} */
-	and	y0, b		/* y0 = (a|c)&b */
-	add	h, y1		/* h = h + S1 + CH + k + w + S0 */
-		/* compute high s1 */
-		pshufd	XTMP2, XTMP0, 0b01010000 /* XTMP2 = W[-2] {DDCC} */
-	or	y0, y2		/* y0 = MAJ = (a|c)&b)|(a&c) */
+#define FOUR_ROUNDS_AND_SCHED_2(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
+		movdqa	XTMP3, XTMP2	/* XTMP3 = W[-2] {BBAA} */; \
+	mov	y0, e		/* y0 = e */; \
+	mov	y1, a		/* y1 = a */; \
+	ror	y0, (25-11)	/* y0 = e >> (25-11) */; \
+		movdqa	XTMP4, XTMP2	/* XTMP4 = W[-2] {BBAA} */; \
+	xor	y0, e		/* y0 = e ^ (e >> (25-11)) */; \
+	ror	y1, (22-13)	/* y1 = a >> (22-13) */; \
+	mov	y2, f		/* y2 = f */; \
+	xor	y1, a		/* y1 = a ^ (a >> (22-13) */; \
+	ror	y0, (11-6)	/* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
+		psrlq	XTMP2, 17	/* XTMP2 = W[-2] ror 17 {xBxA} */; \
+	xor	y2, g		/* y2 = f^g */; \
+		psrlq	XTMP3, 19	/* XTMP3 = W[-2] ror 19 {xBxA} */; \
+	xor	y0, e		/* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
+	and	y2, e		/* y2 = (f^g)&e */; \
+		psrld	XTMP4, 10	/* XTMP4 = W[-2] >> 10 {BBAA} */; \
+	ror	y1, (13-2)	/* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
+	xor	y1, a		/* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
+	xor	y2, g		/* y2 = CH = ((f^g)&e)^g */; \
+	ror	y0, 6		/* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \
+		pxor	XTMP2, XTMP3; \
+	add	y2, y0		/* y2 = S1 + CH */; \
+	ror	y1, 2		/* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
+	add	y2, [rsp + _XFER + 2*4]	/* y2 = k + w + S1 + CH */; \
+		pxor	XTMP4, XTMP2	/* XTMP4 = s1 {xBxA} */; \
+	mov	y0, a		/* y0 = a */; \
+	add	h, y2		/* h = h + S1 + CH + k + w */; \
+	mov	y2, a		/* y2 = a */; \
+		pshufb	XTMP4, SHUF_00BA	/* XTMP4 = s1 {00BA} */; \
+	or	y0, c		/* y0 = a|c */; \
+	add	d, h		/* d = d + h + S1 + CH + k + w */; \
+	and	y2, c		/* y2 = a&c */; \
+		paddd	XTMP0, XTMP4	/* XTMP0 = {..., ..., W[1], W[0]} */; \
+	and	y0, b		/* y0 = (a|c)&b */; \
+	add	h, y1		/* h = h + S1 + CH + k + w + S0 */; \
+		/* compute high s1 */; \
+		pshufd	XTMP2, XTMP0, 0b01010000 /* XTMP2 = W[-2] {DDCC} */; \
+	or	y0, y2		/* y0 = MAJ = (a|c)&b)|(a&c) */; \
 	lea	h, [h + y0]	/* h = h + S1 + CH + k + w + S0 + MAJ */
 
-ROTATE_ARGS
-		movdqa	XTMP3, XTMP2	/* XTMP3 = W[-2] {DDCC} */
-	mov	y0, e		/* y0 = e */
-	ror	y0, (25-11)	/* y0 = e >> (25-11) */
-	mov	y1, a		/* y1 = a */
-		movdqa	X0,    XTMP2	/* X0    = W[-2] {DDCC} */
-	ror	y1, (22-13)	/* y1 = a >> (22-13) */
-	xor	y0, e		/* y0 = e ^ (e >> (25-11)) */
-	mov	y2, f		/* y2 = f */
-	ror	y0, (11-6)	/* y0 = (e >> (11-6)) ^ (e >> (25-6)) */
-		psrlq	XTMP2, 17	/* XTMP2 = W[-2] ror 17 {xDxC} */
-	xor	y1, a		/* y1 = a ^ (a >> (22-13) */
-	xor	y2, g		/* y2 = f^g */
-		psrlq	XTMP3, 19	/* XTMP3 = W[-2] ror 19 {xDxC} */
-	xor	y0, e		/* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */
-	and	y2, e		/* y2 = (f^g)&e */
-	ror	y1, (13-2)	/* y1 = (a >> (13-2)) ^ (a >> (22-2)) */
-		psrld	X0,    10	/* X0 = W[-2] >> 10 {DDCC} */
-	xor	y1, a		/* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */
-	ror	y0, 6		/* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */
-	xor	y2, g		/* y2 = CH = ((f^g)&e)^g */
-		pxor	XTMP2, XTMP3
-	ror	y1, 2		/* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */
-	add	y2, y0		/* y2 = S1 + CH */
-	add	y2, [rsp + _XFER + 3*4]	/* y2 = k + w + S1 + CH */
-		pxor	X0, XTMP2	/* X0 = s1 {xDxC} */
-	mov	y0, a		/* y0 = a */
-	add	h, y2		/* h = h + S1 + CH + k + w */
-	mov	y2, a		/* y2 = a */
-		pshufb	X0, SHUF_DC00	/* X0 = s1 {DC00} */
-	or	y0, c		/* y0 = a|c */
-	add	d, h		/* d = d + h + S1 + CH + k + w */
-	and	y2, c		/* y2 = a&c */
-		paddd	X0, XTMP0	/* X0 = {W[3], W[2], W[1], W[0]} */
-	and	y0, b		/* y0 = (a|c)&b */
-	add	h, y1		/* h = h + S1 + CH + k + w + S0 */
-	or	y0, y2		/* y0 = MAJ = (a|c)&b)|(a&c) */
+#define FOUR_ROUNDS_AND_SCHED_3(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
+		movdqa	XTMP3, XTMP2	/* XTMP3 = W[-2] {DDCC} */; \
+	mov	y0, e		/* y0 = e */; \
+	ror	y0, (25-11)	/* y0 = e >> (25-11) */; \
+	mov	y1, a		/* y1 = a */; \
+		movdqa	X0,    XTMP2	/* X0    = W[-2] {DDCC} */; \
+	ror	y1, (22-13)	/* y1 = a >> (22-13) */; \
+	xor	y0, e		/* y0 = e ^ (e >> (25-11)) */; \
+	mov	y2, f		/* y2 = f */; \
+	ror	y0, (11-6)	/* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
+		psrlq	XTMP2, 17	/* XTMP2 = W[-2] ror 17 {xDxC} */; \
+	xor	y1, a		/* y1 = a ^ (a >> (22-13) */; \
+	xor	y2, g		/* y2 = f^g */; \
+		psrlq	XTMP3, 19	/* XTMP3 = W[-2] ror 19 {xDxC} */; \
+	xor	y0, e		/* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
+	and	y2, e		/* y2 = (f^g)&e */; \
+	ror	y1, (13-2)	/* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
+		psrld	X0,    10	/* X0 = W[-2] >> 10 {DDCC} */; \
+	xor	y1, a		/* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
+	ror	y0, 6		/* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \
+	xor	y2, g		/* y2 = CH = ((f^g)&e)^g */; \
+		pxor	XTMP2, XTMP3; \
+	ror	y1, 2		/* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
+	add	y2, y0		/* y2 = S1 + CH */; \
+	add	y2, [rsp + _XFER + 3*4]	/* y2 = k + w + S1 + CH */; \
+		pxor	X0, XTMP2	/* X0 = s1 {xDxC} */; \
+	mov	y0, a		/* y0 = a */; \
+	add	h, y2		/* h = h + S1 + CH + k + w */; \
+	mov	y2, a		/* y2 = a */; \
+		pshufb	X0, SHUF_DC00	/* X0 = s1 {DC00} */; \
+	or	y0, c		/* y0 = a|c */; \
+	add	d, h		/* d = d + h + S1 + CH + k + w */; \
+	and	y2, c		/* y2 = a&c */; \
+		paddd	X0, XTMP0	/* X0 = {W[3], W[2], W[1], W[0]} */; \
+	and	y0, b		/* y0 = (a|c)&b */; \
+	add	h, y1		/* h = h + S1 + CH + k + w + S0 */; \
+	or	y0, y2		/* y0 = MAJ = (a|c)&b)|(a&c) */; \
 	lea	h, [h + y0]	/* h = h + S1 + CH + k + w + S0 + MAJ */
 
-ROTATE_ARGS
-rotate_Xs
-.endm
+#define FOUR_ROUNDS_AND_SCHED(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
+	FOUR_ROUNDS_AND_SCHED_0(X0, X1, X2, X3, a, b, c, d, e, f, g, h); \
+	FOUR_ROUNDS_AND_SCHED_1(X0, X1, X2, X3, h, a, b, c, d, e, f, g); \
+	FOUR_ROUNDS_AND_SCHED_2(X0, X1, X2, X3, g, h, a, b, c, d, e, f); \
+	FOUR_ROUNDS_AND_SCHED_3(X0, X1, X2, X3, f, g, h, a, b, c, d, e);
 
 /* input is [rsp + _XFER + %1 * 4] */
-.macro DO_ROUND i1
-	mov	y0, e		/* y0 = e */
-	ror	y0, (25-11)	/* y0 = e >> (25-11) */
-	mov	y1, a		/* y1 = a */
-	xor	y0, e		/* y0 = e ^ (e >> (25-11)) */
-	ror	y1, (22-13)	/* y1 = a >> (22-13) */
-	mov	y2, f		/* y2 = f */
-	xor	y1, a		/* y1 = a ^ (a >> (22-13) */
-	ror	y0, (11-6)	/* y0 = (e >> (11-6)) ^ (e >> (25-6)) */
-	xor	y2, g		/* y2 = f^g */
-	xor	y0, e		/* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */
-	ror	y1, (13-2)	/* y1 = (a >> (13-2)) ^ (a >> (22-2)) */
-	and	y2, e		/* y2 = (f^g)&e */
-	xor	y1, a		/* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */
-	ror	y0, 6		/* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */
-	xor	y2, g		/* y2 = CH = ((f^g)&e)^g */
-	add	y2, y0		/* y2 = S1 + CH */
-	ror	y1, 2		/* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */
-	add	y2, [rsp + _XFER + \i1 * 4]	/* y2 = k + w + S1 + CH */
-	mov	y0, a		/* y0 = a */
-	add	h, y2		/* h = h + S1 + CH + k + w */
-	mov	y2, a		/* y2 = a */
-	or	y0, c		/* y0 = a|c */
-	add	d, h		/* d = d + h + S1 + CH + k + w */
-	and	y2, c		/* y2 = a&c */
-	and	y0, b		/* y0 = (a|c)&b */
-	add	h, y1		/* h = h + S1 + CH + k + w + S0 */
-	or	y0, y2		/* y0 = MAJ = (a|c)&b)|(a&c) */
+#define DO_ROUND(i1, a, b, c, d, e, f, g, h) \
+	mov	y0, e		/* y0 = e */; \
+	ror	y0, (25-11)	/* y0 = e >> (25-11) */; \
+	mov	y1, a		/* y1 = a */; \
+	xor	y0, e		/* y0 = e ^ (e >> (25-11)) */; \
+	ror	y1, (22-13)	/* y1 = a >> (22-13) */; \
+	mov	y2, f		/* y2 = f */; \
+	xor	y1, a		/* y1 = a ^ (a >> (22-13) */; \
+	ror	y0, (11-6)	/* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
+	xor	y2, g		/* y2 = f^g */; \
+	xor	y0, e		/* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
+	ror	y1, (13-2)	/* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
+	and	y2, e		/* y2 = (f^g)&e */; \
+	xor	y1, a		/* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
+	ror	y0, 6		/* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */; \
+	xor	y2, g		/* y2 = CH = ((f^g)&e)^g */; \
+	add	y2, y0		/* y2 = S1 + CH */; \
+	ror	y1, 2		/* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
+	add	y2, [rsp + _XFER + i1 * 4]	/* y2 = k + w + S1 + CH */; \
+	mov	y0, a		/* y0 = a */; \
+	add	h, y2		/* h = h + S1 + CH + k + w */; \
+	mov	y2, a		/* y2 = a */; \
+	or	y0, c		/* y0 = a|c */; \
+	add	d, h		/* d = d + h + S1 + CH + k + w */; \
+	and	y2, c		/* y2 = a&c */; \
+	and	y0, b		/* y0 = (a|c)&b */; \
+	add	h, y1		/* h = h + S1 + CH + k + w + S0 */; \
+	or	y0, y2		/* y0 = MAJ = (a|c)&b)|(a&c) */; \
 	lea	h, [h + y0]	/* h = h + S1 + CH + k + w + S0 + MAJ */
-	ROTATE_ARGS
-.endm
 
 /*
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -414,10 +389,10 @@ _gcry_sha256_transform_amd64_ssse3:
 	lea	TBL, [.LK256 ADD_RIP]
 
 	/* byte swap first 16 dwords */
-	COPY_XMM_AND_BSWAP	X0, [INP + 0*16], BYTE_FLIP_MASK
-	COPY_XMM_AND_BSWAP	X1, [INP + 1*16], BYTE_FLIP_MASK
-	COPY_XMM_AND_BSWAP	X2, [INP + 2*16], BYTE_FLIP_MASK
-	COPY_XMM_AND_BSWAP	X3, [INP + 3*16], BYTE_FLIP_MASK
+	COPY_XMM_AND_BSWAP(X0, [INP + 0*16], BYTE_FLIP_MASK)
+	COPY_XMM_AND_BSWAP(X1, [INP + 1*16], BYTE_FLIP_MASK)
+	COPY_XMM_AND_BSWAP(X2, [INP + 2*16], BYTE_FLIP_MASK)
+	COPY_XMM_AND_BSWAP(X3, [INP + 3*16], BYTE_FLIP_MASK)
 
 	mov	[rsp + _INP], INP
 
@@ -428,23 +403,23 @@ _gcry_sha256_transform_amd64_ssse3:
 	movdqa	XFER, [TBL + 0*16]
 	paddd	XFER, X0
 	movdqa	[rsp + _XFER], XFER
-	FOUR_ROUNDS_AND_SCHED
+	FOUR_ROUNDS_AND_SCHED(X0, X1, X2, X3, a, b, c, d, e, f, g, h)
 
 	movdqa	XFER, [TBL + 1*16]
-	paddd	XFER, X0
+	paddd	XFER, X1
 	movdqa	[rsp + _XFER], XFER
-	FOUR_ROUNDS_AND_SCHED
+	FOUR_ROUNDS_AND_SCHED(X1, X2, X3, X0, e, f, g, h, a, b, c, d)
 
 	movdqa	XFER, [TBL + 2*16]
-	paddd	XFER, X0
+	paddd	XFER, X2
 	movdqa	[rsp + _XFER], XFER
-	FOUR_ROUNDS_AND_SCHED
+	FOUR_ROUNDS_AND_SCHED(X2, X3, X0, X1, a, b, c, d, e, f, g, h)
 
 	movdqa	XFER, [TBL + 3*16]
-	paddd	XFER, X0
+	paddd	XFER, X3
 	movdqa	[rsp + _XFER], XFER
 	add	TBL, 4*16
-	FOUR_ROUNDS_AND_SCHED
+	FOUR_ROUNDS_AND_SCHED(X3, X0, X1, X2, e, f, g, h, a, b, c, d)
 
 	sub	SRND, 1
 	jne	.Loop1
@@ -453,17 +428,17 @@ _gcry_sha256_transform_amd64_ssse3:
 .Loop2:
 	paddd	X0, [TBL + 0*16]
 	movdqa	[rsp + _XFER], X0
-	DO_ROUND	0
-	DO_ROUND	1
-	DO_ROUND	2
-	DO_ROUND	3
+	DO_ROUND(0, a, b, c, d, e, f, g, h)
+	DO_ROUND(1, h, a, b, c, d, e, f, g)
+	DO_ROUND(2, g, h, a, b, c, d, e, f)
+	DO_ROUND(3, f, g, h, a, b, c, d, e)
 	paddd	X1, [TBL + 1*16]
 	movdqa	[rsp + _XFER], X1
 	add	TBL, 2*16
-	DO_ROUND	0
-	DO_ROUND	1
-	DO_ROUND	2
-	DO_ROUND	3
+	DO_ROUND(0, e, f, g, h, a, b, c, d)
+	DO_ROUND(1, d, e, f, g, h, a, b, c)
+	DO_ROUND(2, c, d, e, f, g, h, a, b)
+	DO_ROUND(3, b, c, d, e, f, g, h, a)
 
 	movdqa	X0, X2
 	movdqa	X1, X3
@@ -471,14 +446,14 @@ _gcry_sha256_transform_amd64_ssse3:
 	sub	SRND, 1
 	jne	.Loop2
 
-	addm	[4*0 + CTX],a
-	addm	[4*1 + CTX],b
-	addm	[4*2 + CTX],c
-	addm	[4*3 + CTX],d
-	addm	[4*4 + CTX],e
-	addm	[4*5 + CTX],f
-	addm	[4*6 + CTX],g
-	addm	[4*7 + CTX],h
+	addm([4*0 + CTX],a)
+	addm([4*1 + CTX],b)
+	addm([4*2 + CTX],c)
+	addm([4*3 + CTX],d)
+	addm([4*4 + CTX],e)
+	addm([4*5 + CTX],f)
+	addm([4*6 + CTX],g)
+	addm([4*7 + CTX],h)
 
 	mov	INP, [rsp + _INP]
 	add	INP, 64
diff --git a/cipher/sha512-avx-amd64.S b/cipher/sha512-avx-amd64.S
index 991fd639..75f7b070 100644
--- a/cipher/sha512-avx-amd64.S
+++ b/cipher/sha512-avx-amd64.S
@@ -53,32 +53,32 @@
 .text
 
 /* Virtual Registers */
-msg = rdi /* ARG1 */
-digest = rsi /* ARG2 */
-msglen = rdx /* ARG3 */
-T1 = rcx
-T2 = r8
-a_64 = r9
-b_64 = r10
-c_64 = r11
-d_64 = r12
-e_64 = r13
-f_64 = r14
-g_64 = r15
-h_64 = rbx
-tmp0 = rax
+#define msg rdi /* ARG1 */
+#define digest rsi /* ARG2 */
+#define msglen rdx /* ARG3 */
+#define T1 rcx
+#define T2 r8
+#define a_64 r9
+#define b_64 r10
+#define c_64 r11
+#define d_64 r12
+#define e_64 r13
+#define f_64 r14
+#define g_64 r15
+#define h_64 rbx
+#define tmp0 rax
 
 /*
 ; Local variables (stack frame)
 ; Note: frame_size must be an odd multiple of 8 bytes to XMM align RSP
 */
-frame_W      = 0 /* Message Schedule */
-frame_W_size = (80 * 8)
-frame_WK      = ((frame_W) + (frame_W_size)) /* W[t] + K[t] | W[t+1] + K[t+1] */
-frame_WK_size = (2 * 8)
-frame_GPRSAVE      = ((frame_WK) + (frame_WK_size))
-frame_GPRSAVE_size = (5 * 8)
-frame_size = ((frame_GPRSAVE) + (frame_GPRSAVE_size))
+#define frame_W 0 /* Message Schedule */
+#define frame_W_size (80 * 8)
+#define frame_WK ((frame_W) + (frame_W_size)) /* W[t] + K[t] | W[t+1] + K[t+1] */
+#define frame_WK_size (2 * 8)
+#define frame_GPRSAVE ((frame_WK) + (frame_WK_size))
+#define frame_GPRSAVE_size (5 * 8)
+#define frame_size ((frame_GPRSAVE) + (frame_GPRSAVE_size))
 
 
 /* Useful QWORD "arrays" for simpler memory references */
@@ -90,162 +90,151 @@ frame_size = ((frame_GPRSAVE) + (frame_GPRSAVE_size))
 /* MSG, DIGEST, K_t, W_t are arrays */
 /* WK_2(t) points to 1 of 2 qwords at frame.WK depdending on t being odd/even */
 
-.macro RotateState
-	/* Rotate symbles a..h right */
-	__TMP = h_64
-	h_64 =  g_64
-	g_64 =  f_64
-	f_64 =  e_64
-	e_64 =  d_64
-	d_64 =  c_64
-	c_64 =  b_64
-	b_64 =  a_64
-	a_64 =  __TMP
-.endm
-
-.macro RORQ p1 p2
-	/* shld is faster than ror on Intel Sandybridge */
-	shld	\p1, \p1, (64 - \p2)
-.endm
-
-.macro SHA512_Round t
-	/* Compute Round %%t */
-	mov	T1,   f_64        /* T1 = f */
-	mov	tmp0, e_64        /* tmp = e */
-	xor	T1,   g_64        /* T1 = f ^ g */
-	RORQ	tmp0, 23 /* 41     ; tmp = e ror 23 */
-	and	T1,   e_64        /* T1 = (f ^ g) & e */
-	xor	tmp0, e_64        /* tmp = (e ror 23) ^ e */
-	xor	T1,   g_64        /* T1 = ((f ^ g) & e) ^ g = CH(e,f,g) */
-	add	T1,   [WK_2(\t)] /* W[t] + K[t] from message scheduler */
-	RORQ	tmp0, 4 /* 18      ; tmp = ((e ror 23) ^ e) ror 4 */
-	xor	tmp0, e_64        /* tmp = (((e ror 23) ^ e) ror 4) ^ e */
-	mov	T2,   a_64        /* T2 = a */
-	add	T1,   h_64        /* T1 = CH(e,f,g) + W[t] + K[t] + h */
-	RORQ	tmp0, 14 /* 14     ; tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e) */
-	add	T1,   tmp0        /* T1 = CH(e,f,g) + W[t] + K[t] + S1(e) */
-	mov	tmp0, a_64        /* tmp = a */
-	xor	T2,   c_64        /* T2 = a ^ c */
-	and	tmp0, c_64        /* tmp = a & c */
-	and	T2,   b_64        /* T2 = (a ^ c) & b */
-	xor	T2,   tmp0        /* T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c) */
-	mov	tmp0, a_64        /* tmp = a */
-	RORQ	tmp0, 5 /* 39      ; tmp = a ror 5 */
-	xor	tmp0, a_64        /* tmp = (a ror 5) ^ a */
-	add	d_64, T1          /* e(next_state) = d + T1  */
-	RORQ	tmp0, 6 /* 34      ; tmp = ((a ror 5) ^ a) ror 6 */
-	xor	tmp0, a_64        /* tmp = (((a ror 5) ^ a) ror 6) ^ a */
-	lea	h_64, [T1 + T2]   /* a(next_state) = T1 + Maj(a,b,c) */
-	RORQ	tmp0, 28 /* 28     ; tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a) */
-	add	h_64, tmp0        /* a(next_state) = T1 + Maj(a,b,c) S0(a) */
-	RotateState
-.endm
-
-.macro SHA512_2Sched_2Round_avx t
-/*	; Compute rounds %%t-2 and %%t-1
-	; Compute message schedule QWORDS %%t and %%t+1
-
-	;   Two rounds are computed based on the values for K[t-2]+W[t-2] and
-	; K[t-1]+W[t-1] which were previously stored at WK_2 by the message
-	; scheduler.
-	;   The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)].
-	; They are then added to their respective SHA512 constants at
-	; [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)]
-	;   For brievity, the comments following vectored instructions only refer to
-	; the first of a pair of QWORDS.
-	; Eg. XMM4=W[t-2] really means XMM4={W[t-2]|W[t-1]}
-	;   The computation of the message schedule and the rounds are tightly
-	; stitched to take advantage of instruction-level parallelism.
-	; For clarity, integer instructions (for the rounds calculation) are indented
-	; by one tab. Vectored instructions (for the message scheduler) are indented
-	; by two tabs. */
-
-		vmovdqa	xmm4, [W_t(\t-2)]   /* XMM4 = W[t-2] */
-		vmovdqu	xmm5, [W_t(\t-15)]  /* XMM5 = W[t-15] */
-	mov	T1,   f_64
-		vpsrlq	xmm0, xmm4, 61       /* XMM0 = W[t-2]>>61 */
-	mov	tmp0, e_64
-		vpsrlq	xmm6, xmm5, 1        /* XMM6 = W[t-15]>>1 */
-	xor	T1,   g_64
-	RORQ	tmp0, 23 /* 41 */
-		vpsrlq	xmm1, xmm4, 19       /* XMM1 = W[t-2]>>19 */
-	and	T1,   e_64
-	xor	tmp0, e_64
-		vpxor	xmm0, xmm0, xmm1           /* XMM0 = W[t-2]>>61 ^ W[t-2]>>19 */
-	xor	T1,   g_64
-	add	T1,   [WK_2(\t)];
-		vpsrlq	xmm7, xmm5, 8        /* XMM7 = W[t-15]>>8 */
-	RORQ	tmp0, 4 /* 18 */
-		vpsrlq	xmm2, xmm4, 6        /* XMM2 = W[t-2]>>6 */
-	xor	tmp0, e_64
-	mov	T2,   a_64
-	add	T1,   h_64
-		vpxor	xmm6, xmm6, xmm7           /* XMM6 = W[t-15]>>1 ^ W[t-15]>>8 */
-	RORQ	tmp0, 14 /* 14 */
-	add	T1,   tmp0
-		vpsrlq	xmm8, xmm5, 7        /* XMM8 = W[t-15]>>7 */
-	mov 	tmp0, a_64
-	xor	T2,   c_64
-		vpsllq	xmm3, xmm4, (64-61)  /* XMM3 = W[t-2]<<3 */
-	and	tmp0, c_64
-	and	T2,   b_64
-		vpxor	xmm2, xmm2, xmm3           /* XMM2 = W[t-2]>>6 ^ W[t-2]<<3 */
-	xor	T2,   tmp0
-	mov	tmp0, a_64
-		vpsllq	xmm9, xmm5, (64-1)   /* XMM9 = W[t-15]<<63 */
-	RORQ	tmp0, 5 /* 39 */
-		vpxor	xmm8, xmm8, xmm9           /* XMM8 = W[t-15]>>7 ^ W[t-15]<<63 */
-	xor	tmp0, a_64
-	add	d_64, T1
-	RORQ	tmp0, 6 /* 34 */
-	xor	tmp0, a_64
-		vpxor	xmm6, xmm6, xmm8           /* XMM6 = W[t-15]>>1 ^ W[t-15]>>8 ^ W[t-15]>>7 ^ W[t-15]<<63 */
-	lea	h_64, [T1 + T2]
-	RORQ 	tmp0, 28 /* 28 */
-		vpsllq	xmm4, xmm4, (64-19)        /* XMM4 = W[t-2]<<25 */
-	add	h_64, tmp0
-	RotateState
-		vpxor	xmm0, xmm0, xmm4           /* XMM0 = W[t-2]>>61 ^ W[t-2]>>19 ^ W[t-2]<<25 */
-	mov	T1, f_64
-		vpxor	xmm0, xmm0, xmm2           /* XMM0 = s1(W[t-2]) */
-	mov	tmp0, e_64
-	xor	T1,   g_64
-		vpaddq	xmm0, xmm0, [W_t(\t-16)]  /* XMM0 = s1(W[t-2]) + W[t-16] */
-		vmovdqu	xmm1, [W_t(\t- 7)]  /* XMM1 = W[t-7] */
-	RORQ	tmp0, 23 /* 41 */
-	and	T1,   e_64
-	xor	tmp0, e_64
-	xor	T1,   g_64
-		vpsllq	xmm5, xmm5, (64-8)         /* XMM5 = W[t-15]<<56 */
-	add	T1,   [WK_2(\t+1)]
-		vpxor	xmm6, xmm6, xmm5           /* XMM6 = s0(W[t-15]) */
-	RORQ	tmp0, 4 /* 18 */
-		vpaddq	xmm0, xmm0, xmm6           /* XMM0 = s1(W[t-2]) + W[t-16] + s0(W[t-15]) */
-	xor	tmp0, e_64
-		vpaddq	xmm0, xmm0, xmm1           /* XMM0 = W[t] = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16] */
-	mov	T2,   a_64
-	add	T1,   h_64
-	RORQ	tmp0, 14 /* 14 */
-	add	T1,   tmp0
-		vmovdqa	[W_t(\t)], xmm0      /* Store W[t] */
-		vpaddq	xmm0, xmm0, [K_t(t)]        /* Compute W[t]+K[t] */
-		vmovdqa	[WK_2(t)], xmm0       /* Store W[t]+K[t] for next rounds */
-	mov	tmp0, a_64
-	xor	T2,   c_64
-	and	tmp0, c_64
-	and	T2,   b_64
-	xor	T2,   tmp0
-	mov	tmp0, a_64
-	RORQ	tmp0, 5 /* 39 */
-	xor	tmp0, a_64
-	add	d_64, T1
-	RORQ	tmp0, 6 /* 34 */
-	xor	tmp0, a_64
-	lea	h_64, [T1 + T2]
-	RORQ	tmp0, 28 /* 28 */
-	add	h_64, tmp0
-	RotateState
-.endm
+#define RORQ(p1, p2) \
+	/* shld is faster than ror on Intel Sandybridge */ \
+	shld	p1, p1, (64 - p2)
+
+#define SHA512_Round(t, a, b, c, d, e, f, g, h) \
+	/* Compute Round %%t */; \
+	mov	T1,   f        /* T1 = f */; \
+	mov	tmp0, e        /* tmp = e */; \
+	xor	T1,   g        /* T1 = f ^ g */; \
+	RORQ(	tmp0, 23) /* 41     ; tmp = e ror 23 */; \
+	and	T1,   e        /* T1 = (f ^ g) & e */; \
+	xor	tmp0, e        /* tmp = (e ror 23) ^ e */; \
+	xor	T1,   g        /* T1 = ((f ^ g) & e) ^ g = CH(e,f,g) */; \
+	add	T1,   [WK_2(t)] /* W[t] + K[t] from message scheduler */; \
+	RORQ(	tmp0, 4) /* 18      ; tmp = ((e ror 23) ^ e) ror 4 */; \
+	xor	tmp0, e        /* tmp = (((e ror 23) ^ e) ror 4) ^ e */; \
+	mov	T2,   a        /* T2 = a */; \
+	add	T1,   h        /* T1 = CH(e,f,g) + W[t] + K[t] + h */; \
+	RORQ(	tmp0, 14) /* 14     ; tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e) */; \
+	add	T1,   tmp0        /* T1 = CH(e,f,g) + W[t] + K[t] + S1(e) */; \
+	mov	tmp0, a        /* tmp = a */; \
+	xor	T2,   c        /* T2 = a ^ c */; \
+	and	tmp0, c        /* tmp = a & c */; \
+	and	T2,   b        /* T2 = (a ^ c) & b */; \
+	xor	T2,   tmp0        /* T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c) */; \
+	mov	tmp0, a        /* tmp = a */; \
+	RORQ(	tmp0, 5) /* 39      ; tmp = a ror 5 */; \
+	xor	tmp0, a        /* tmp = (a ror 5) ^ a */; \
+	add	d, T1          /* e(next_state) = d + T1  */; \
+	RORQ(	tmp0, 6) /* 34      ; tmp = ((a ror 5) ^ a) ror 6 */; \
+	xor	tmp0, a        /* tmp = (((a ror 5) ^ a) ror 6) ^ a */; \
+	lea	h, [T1 + T2]   /* a(next_state) = T1 + Maj(a,b,c) */; \
+	RORQ(	tmp0, 28) /* 28     ; tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a) */; \
+	add	h, tmp0        /* a(next_state) = T1 + Maj(a,b,c) S0(a) */
+
+#define SHA512_2Sched_2Round_avx_PART1(t, a, b, c, d, e, f, g, h) \
+	/* \
+	; Compute rounds %%t-2 and %%t-1 \
+	; Compute message schedule QWORDS %%t and %%t+1 \
+	; \
+	;   Two rounds are computed based on the values for K[t-2]+W[t-2] and \
+	; K[t-1]+W[t-1] which were previously stored at WK_2 by the message \
+	; scheduler. \
+	;   The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)]. \
+	; They are then added to their respective SHA512 constants at \
+	; [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)] \
+	;   For brievity, the comments following vectored instructions only refer to \
+	; the first of a pair of QWORDS. \
+	; Eg. XMM4=W[t-2] really means XMM4={W[t-2]|W[t-1]} \
+	;   The computation of the message schedule and the rounds are tightly \
+	; stitched to take advantage of instruction-level parallelism. \
+	; For clarity, integer instructions (for the rounds calculation) are indented \
+	; by one tab. Vectored instructions (for the message scheduler) are indented \
+	; by two tabs. \
+	*/ \
+	\
+		vmovdqa	xmm4, [W_t(t-2)]   /* XMM4 = W[t-2] */; \
+		vmovdqu	xmm5, [W_t(t-15)]  /* XMM5 = W[t-15] */; \
+	mov	T1,   f; \
+		vpsrlq	xmm0, xmm4, 61       /* XMM0 = W[t-2]>>61 */; \
+	mov	tmp0, e; \
+		vpsrlq	xmm6, xmm5, 1        /* XMM6 = W[t-15]>>1 */; \
+	xor	T1,   g; \
+	RORQ(	tmp0, 23) /* 41 */; \
+		vpsrlq	xmm1, xmm4, 19       /* XMM1 = W[t-2]>>19 */; \
+	and	T1,   e; \
+	xor	tmp0, e; \
+		vpxor	xmm0, xmm0, xmm1           /* XMM0 = W[t-2]>>61 ^ W[t-2]>>19 */; \
+	xor	T1,   g; \
+	add	T1,   [WK_2(t)]; \
+		vpsrlq	xmm7, xmm5, 8        /* XMM7 = W[t-15]>>8 */; \
+	RORQ(	tmp0, 4) /* 18 */; \
+		vpsrlq	xmm2, xmm4, 6        /* XMM2 = W[t-2]>>6 */; \
+	xor	tmp0, e; \
+	mov	T2,   a; \
+	add	T1,   h; \
+		vpxor	xmm6, xmm6, xmm7           /* XMM6 = W[t-15]>>1 ^ W[t-15]>>8 */; \
+	RORQ(	tmp0, 14) /* 14 */; \
+	add	T1,   tmp0; \
+		vpsrlq	xmm8, xmm5, 7        /* XMM8 = W[t-15]>>7 */; \
+	mov 	tmp0, a; \
+	xor	T2,   c; \
+		vpsllq	xmm3, xmm4, (64-61)  /* XMM3 = W[t-2]<<3 */; \
+	and	tmp0, c; \
+	and	T2,   b; \
+		vpxor	xmm2, xmm2, xmm3           /* XMM2 = W[t-2]>>6 ^ W[t-2]<<3 */; \
+	xor	T2,   tmp0; \
+	mov	tmp0, a; \
+		vpsllq	xmm9, xmm5, (64-1)   /* XMM9 = W[t-15]<<63 */; \
+	RORQ(	tmp0, 5) /* 39 */; \
+		vpxor	xmm8, xmm8, xmm9           /* XMM8 = W[t-15]>>7 ^ W[t-15]<<63 */; \
+	xor	tmp0, a; \
+	add	d, T1; \
+	RORQ(	tmp0, 6) /* 34 */; \
+	xor	tmp0, a; \
+		vpxor	xmm6, xmm6, xmm8           /* XMM6 = W[t-15]>>1 ^ W[t-15]>>8 ^ W[t-15]>>7 ^ W[t-15]<<63 */; \
+	lea	h, [T1 + T2]; \
+	RORQ(	tmp0, 28) /* 28 */; \
+		vpsllq	xmm4, xmm4, (64-19)        /* XMM4 = W[t-2]<<25 */; \
+	add	h, tmp0
+
+#define SHA512_2Sched_2Round_avx_PART2(t, a, b, c, d, e, f, g, h) \
+		vpxor	xmm0, xmm0, xmm4           /* XMM0 = W[t-2]>>61 ^ W[t-2]>>19 ^ W[t-2]<<25 */; \
+	mov	T1, f; \
+		vpxor	xmm0, xmm0, xmm2           /* XMM0 = s1(W[t-2]) */; \
+	mov	tmp0, e; \
+	xor	T1,   g; \
+		vpaddq	xmm0, xmm0, [W_t(t-16)]  /* XMM0 = s1(W[t-2]) + W[t-16] */; \
+		vmovdqu	xmm1, [W_t(t- 7)]  /* XMM1 = W[t-7] */; \
+	RORQ(	tmp0, 23) /* 41 */; \
+	and	T1,   e; \
+	xor	tmp0, e; \
+	xor	T1,   g; \
+		vpsllq	xmm5, xmm5, (64-8)         /* XMM5 = W[t-15]<<56 */; \
+	add	T1,   [WK_2(t+1)]; \
+		vpxor	xmm6, xmm6, xmm5           /* XMM6 = s0(W[t-15]) */; \
+	RORQ(	tmp0, 4) /* 18 */; \
+		vpaddq	xmm0, xmm0, xmm6           /* XMM0 = s1(W[t-2]) + W[t-16] + s0(W[t-15]) */; \
+	xor	tmp0, e; \
+		vpaddq	xmm0, xmm0, xmm1           /* XMM0 = W[t] = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16] */; \
+	mov	T2,   a; \
+	add	T1,   h; \
+	RORQ(	tmp0, 14) /* 14 */; \
+	add	T1,   tmp0; \
+		vmovdqa	[W_t(t)], xmm0      /* Store W[t] */; \
+		vpaddq	xmm0, xmm0, [K_t(t)]        /* Compute W[t]+K[t] */; \
+		vmovdqa	[WK_2(t)], xmm0       /* Store W[t]+K[t] for next rounds */; \
+	mov	tmp0, a; \
+	xor	T2,   c; \
+	and	tmp0, c; \
+	and	T2,   b; \
+	xor	T2,   tmp0; \
+	mov	tmp0, a; \
+	RORQ(	tmp0, 5) /* 39 */; \
+	xor	tmp0, a; \
+	add	d, T1; \
+	RORQ(	tmp0, 6) /* 34 */; \
+	xor	tmp0, a; \
+	lea	h, [T1 + T2]; \
+	RORQ(	tmp0, 28) /* 28 */; \
+	add	h, tmp0
+
+#define SHA512_2Sched_2Round_avx(t, a, b, c, d, e, f, g, h) \
+	SHA512_2Sched_2Round_avx_PART1(t, a, b, c, d, e, f, g, h); \
+	SHA512_2Sched_2Round_avx_PART2(t, h, a, b, c, d, e, f, g)
 
 /*
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -295,37 +284,77 @@ _gcry_sha512_transform_amd64_avx:
 	mov	g_64, [DIGEST(6)]
 	mov	h_64, [DIGEST(7)]
 
-	t = 0
-	.rept 80/2 + 1
-	/* (80 rounds) / (2 rounds/iteration) + (1 iteration) */
-	/* +1 iteration because the scheduler leads hashing by 1 iteration */
-		.if t < 2
-			/* BSWAP 2 QWORDS */
-			vmovdqa	xmm1, [.LXMM_QWORD_BSWAP ADD_RIP]
-			vmovdqu	xmm0, [MSG(t)]
-			vpshufb	xmm0, xmm0, xmm1     /* BSWAP */
-			vmovdqa	[W_t(t)], xmm0       /* Store Scheduled Pair */
-			vpaddq	xmm0, xmm0, [K_t(t)] /* Compute W[t]+K[t] */
-			vmovdqa	[WK_2(t)], xmm0      /* Store into WK for rounds */
-		.elseif t < 16
-			/* BSWAP 2 QWORDS, Compute 2 Rounds */
-			vmovdqu	xmm0, [MSG(t)]
-			vpshufb	xmm0, xmm0, xmm1     /* BSWAP */
-			SHA512_Round (t - 2)         /* Round t-2 */
-			vmovdqa	[W_t(t)], xmm0       /* Store Scheduled Pair */
-			vpaddq	xmm0, xmm0, [K_t(t)] /* Compute W[t]+K[t] */
-			SHA512_Round (t - 1)         /* Round t-1 */
-			vmovdqa	[WK_2(t)], xmm0      /* W[t]+K[t] into WK */
-		.elseif t < 79
-			/* Schedule 2 QWORDS; Compute 2 Rounds */
-			SHA512_2Sched_2Round_avx t
-		.else
-			/* Compute 2 Rounds */
-			SHA512_Round (t - 2)
-			SHA512_Round (t - 1)
-		.endif
-		t = ((t)+2)
-	.endr
+	/* BSWAP 2 QWORDS */
+	vmovdqa	xmm1, [.LXMM_QWORD_BSWAP ADD_RIP]
+	vmovdqu	xmm0, [MSG(0)]
+	vpshufb	xmm0, xmm0, xmm1     /* BSWAP */
+	vmovdqa	[W_t(0)], xmm0       /* Store Scheduled Pair */
+	vpaddq	xmm0, xmm0, [K_t(0)] /* Compute W[t]+K[t] */
+	vmovdqa	[WK_2(0)], xmm0      /* Store into WK for rounds */
+
+	#define T_2_14(t, a, b, c, d, e, f, g, h) \
+		/* BSWAP 2 QWORDS, Compute 2 Rounds */; \
+		vmovdqu	xmm0, [MSG(t)]; \
+		vpshufb	xmm0, xmm0, xmm1     /* BSWAP */; \
+		SHA512_Round(((t) - 2), a##_64, b##_64, c##_64, d##_64, \
+				        e##_64, f##_64, g##_64, h##_64); \
+		vmovdqa	[W_t(t)], xmm0       /* Store Scheduled Pair */; \
+		vpaddq	xmm0, xmm0, [K_t(t)] /* Compute W[t]+K[t] */; \
+		SHA512_Round(((t) - 1), h##_64, a##_64, b##_64, c##_64, \
+				        d##_64, e##_64, f##_64, g##_64); \
+		vmovdqa	[WK_2(t)], xmm0      /* W[t]+K[t] into WK */
+
+	#define T_16_78(t, a, b, c, d, e, f, g, h) \
+		SHA512_2Sched_2Round_avx((t), a##_64, b##_64, c##_64, d##_64, \
+					      e##_64, f##_64, g##_64, h##_64)
+
+	#define T_80(t, a, b, c, d, e, f, g, h) \
+		/* Compute 2 Rounds */; \
+		SHA512_Round((t - 2), a##_64, b##_64, c##_64, d##_64, \
+				      e##_64, f##_64, g##_64, h##_64); \
+		SHA512_Round((t - 1), h##_64, a##_64, b##_64, c##_64, \
+				      d##_64, e##_64, f##_64, g##_64)
+
+	T_2_14(2, a, b, c, d, e, f, g, h)
+	T_2_14(4, g, h, a, b, c, d, e, f)
+	T_2_14(6, e, f, g, h, a, b, c, d)
+	T_2_14(8, c, d, e, f, g, h, a, b)
+	T_2_14(10, a, b, c, d, e, f, g, h)
+	T_2_14(12, g, h, a, b, c, d, e, f)
+	T_2_14(14, e, f, g, h, a, b, c, d)
+	T_16_78(16, c, d, e, f, g, h, a, b)
+	T_16_78(18, a, b, c, d, e, f, g, h)
+	T_16_78(20, g, h, a, b, c, d, e, f)
+	T_16_78(22, e, f, g, h, a, b, c, d)
+	T_16_78(24, c, d, e, f, g, h, a, b)
+	T_16_78(26, a, b, c, d, e, f, g, h)
+	T_16_78(28, g, h, a, b, c, d, e, f)
+	T_16_78(30, e, f, g, h, a, b, c, d)
+	T_16_78(32, c, d, e, f, g, h, a, b)
+	T_16_78(34, a, b, c, d, e, f, g, h)
+	T_16_78(36, g, h, a, b, c, d, e, f)
+	T_16_78(38, e, f, g, h, a, b, c, d)
+	T_16_78(40, c, d, e, f, g, h, a, b)
+	T_16_78(42, a, b, c, d, e, f, g, h)
+	T_16_78(44, g, h, a, b, c, d, e, f)
+	T_16_78(46, e, f, g, h, a, b, c, d)
+	T_16_78(48, c, d, e, f, g, h, a, b)
+	T_16_78(50, a, b, c, d, e, f, g, h)
+	T_16_78(52, g, h, a, b, c, d, e, f)
+	T_16_78(54, e, f, g, h, a, b, c, d)
+	T_16_78(56, c, d, e, f, g, h, a, b)
+	T_16_78(58, a, b, c, d, e, f, g, h)
+	T_16_78(60, g, h, a, b, c, d, e, f)
+	T_16_78(62, e, f, g, h, a, b, c, d)
+	T_16_78(64, c, d, e, f, g, h, a, b)
+	T_16_78(66, a, b, c, d, e, f, g, h)
+	T_16_78(68, g, h, a, b, c, d, e, f)
+	T_16_78(70, e, f, g, h, a, b, c, d)
+	T_16_78(72, c, d, e, f, g, h, a, b)
+	T_16_78(74, a, b, c, d, e, f, g, h)
+	T_16_78(76, g, h, a, b, c, d, e, f)
+	T_16_78(78, e, f, g, h, a, b, c, d)
+	T_80(80, c, d, e, f, g, h, a, b)
 
 	/* Update digest */
 	add	[DIGEST(0)], a_64
@@ -357,11 +386,12 @@ _gcry_sha512_transform_amd64_avx:
 	vzeroall
 
 	/* Burn stack */
-	t = 0
-	.rept frame_W_size / 32
-		vmovups [rsp + frame_W + (t) * 32], ymm0
-		t = ((t)+1)
-	.endr
+	mov eax, 0
+.Lerase_stack:
+	vmovdqu [rsp + rax], ymm0
+	add eax, 32
+	cmp eax, frame_W_size
+	jne .Lerase_stack
 	vmovdqu [rsp + frame_WK], xmm0
 	xor     eax, eax
 
diff --git a/cipher/sha512-avx2-bmi2-amd64.S b/cipher/sha512-avx2-bmi2-amd64.S
index 3b28ab6c..7f119e6c 100644
--- a/cipher/sha512-avx2-bmi2-amd64.S
+++ b/cipher/sha512-avx2-bmi2-amd64.S
@@ -56,46 +56,45 @@
 .text
 
 /* Virtual Registers */
-Y_0 = ymm4
-Y_1 = ymm5
-Y_2 = ymm6
-Y_3 = ymm7
-
-YTMP0 = ymm0
-YTMP1 = ymm1
-YTMP2 = ymm2
-YTMP3 = ymm3
-YTMP4 = ymm8
-XFER =  YTMP0
-
-BYTE_FLIP_MASK =  ymm9
-MASK_YMM_LO    =  ymm10
-MASK_YMM_LOx   =  xmm10
-
-INP =         rdi /* 1st arg */
-CTX =         rsi /* 2nd arg */
-NUM_BLKS =    rdx /* 3rd arg */
-c =           rcx
-d =           r8
-e =           rdx
-y3 =          rdi
-
-TBL =   rbp
-
-a =     rax
-b =     rbx
-
-f =     r9
-g =     r10
-h =     r11
-old_h = rax
-
-T1 =    r12
-y0 =    r13
-y1 =    r14
-y2 =    r15
-
-y4 =    r12
+#define Y_0 ymm4
+#define Y_1 ymm5
+#define Y_2 ymm6
+#define Y_3 ymm7
+
+#define YTMP0 ymm0
+#define YTMP1 ymm1
+#define YTMP2 ymm2
+#define YTMP3 ymm3
+#define YTMP4 ymm8
+#define XFER YTMP0
+
+#define BYTE_FLIP_MASK ymm9
+#define MASK_YMM_LO ymm10
+#define MASK_YMM_LOx xmm10
+
+#define INP rdi /* 1st arg */
+#define CTX rsi /* 2nd arg */
+#define NUM_BLKS rdx /* 3rd arg */
+#define c rcx
+#define d r8
+#define e rdx
+#define y3 rdi
+
+#define TBL rbp
+
+#define a rax
+#define b rbx
+
+#define f r9
+#define g r10
+#define h r11
+
+#define T1 r12
+#define y0 r13
+#define y1 r14
+#define y2 r15
+
+#define y4 r12
 
 /* Local variables (stack frame) */
 #define frame_XFER      0
@@ -116,218 +115,153 @@ y4 =    r12
 
 /* addm [mem], reg */
 /* Add reg to mem using reg-mem add and store */
-.macro addm p1 p2
-	add	\p2, \p1
-	mov	\p1, \p2
-.endm
+#define addm(p1, p2) \
+	add	p2, p1; \
+	mov	p1, p2;
 
 
 /* COPY_YMM_AND_BSWAP ymm, [mem], byte_flip_mask */
 /* Load ymm with mem and byte swap each dword */
-.macro COPY_YMM_AND_BSWAP p1 p2 p3
-	VMOVDQ \p1, \p2
-	vpshufb \p1, \p1, \p3
-.endm
-/* rotate_Ys */
-/* Rotate values of symbols Y0...Y3 */
-.macro rotate_Ys
-	__Y_ = Y_0
-	Y_0 = Y_1
-	Y_1 = Y_2
-	Y_2 = Y_3
-	Y_3 = __Y_
-.endm
-
-/* RotateState */
-.macro RotateState
-	/* Rotate symbles a..h right */
-	old_h =  h
-	__TMP_ = h
-	h =      g
-	g =      f
-	f =      e
-	e =      d
-	d =      c
-	c =      b
-	b =      a
-	a =      __TMP_
-.endm
+#define COPY_YMM_AND_BSWAP(p1, p2, p3) \
+	VMOVDQ p1, p2; \
+	vpshufb p1, p1, p3
 
 /* %macro MY_VPALIGNR	YDST, YSRC1, YSRC2, RVAL */
 /* YDST = {YSRC1, YSRC2} >> RVAL*8 */
-.macro MY_VPALIGNR YDST, YSRC1, YSRC2, RVAL
-	vperm2f128 	\YDST, \YSRC1, \YSRC2, 0x3	/* YDST = {YS1_LO, YS2_HI} */
-	vpalignr 	\YDST, \YDST, \YSRC2, \RVAL	/* YDST = {YDS1, YS2} >> RVAL*8 */
-.endm
-
-.macro ONE_ROUND_PART1 XFER
-	/* h += Sum1 (e) + Ch (e, f, g) + (k[t] + w[0]);
-	 * d += h;
-	 * h += Sum0 (a) + Maj (a, b, c);
-	 *
-	 * Ch(x, y, z) => ((x & y) + (~x & z))
-	 * Maj(x, y, z) => ((x & y) + (z & (x ^ y)))
-	 */
-
-	mov y3, e
-	add h, [\XFER]
-	and y3, f
-	rorx y0, e, 41
-	rorx y1, e, 18
+#define MY_VPALIGNR(YDST, YSRC1, YSRC2, RVAL) \
+	vperm2i128 YDST, YSRC1, YSRC2, 0x3 /* YDST = {YS1_LO, YS2_HI} */; \
+	vpalignr   YDST, YDST, YSRC2, RVAL /* YDST = {YDS1, YS2} >> RVAL*8 */
+
+#define ONE_ROUND_PART1(XFERIN, a, b, c, d, e, f, g, h) \
+	/* h += Sum1 (e) + Ch (e, f, g) + (k[t] + w[0]); \
+	 * d += h; \
+	 * h += Sum0 (a) + Maj (a, b, c); \
+	 * \
+	 * Ch(x, y, z) => ((x & y) + (~x & z)) \
+	 * Maj(x, y, z) => ((x & y) + (z & (x ^ y))) \
+	 */ \
+	\
+	mov y3, e; \
+	add h, [XFERIN]; \
+	and y3, f; \
+	rorx y0, e, 41; \
+	rorx y1, e, 18; \
+	lea h, [h + y3]; \
+	andn y3, e, g; \
+	rorx T1, a, 34; \
+	xor y0, y1; \
 	lea h, [h + y3]
-	andn y3, e, g
-	rorx T1, a, 34
-	xor y0, y1
-	lea h, [h + y3]
-.endm
-.macro ONE_ROUND_PART2
-	rorx y2, a, 39
-	rorx y1, e, 14
-	mov y3, a
-	xor T1, y2
-	xor y0, y1
-	xor y3, b
-	lea h, [h + y0]
-	mov y0, a
-	rorx y2, a, 28
-	add d, h
-	and y3, c
-	xor T1, y2
-	lea h, [h + y3]
-	lea h, [h + T1]
-	and y0, b
-	lea h, [h + y0]
-.endm
-
-.macro ONE_ROUND XFER
-	ONE_ROUND_PART1 \XFER
-	ONE_ROUND_PART2
-.endm
-
-.macro FOUR_ROUNDS_AND_SCHED X
-/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */
-
-		/* Extract w[t-7] */
-		MY_VPALIGNR	YTMP0, Y_3, Y_2, 8		/* YTMP0 = W[-7] */
-		/* Calculate w[t-16] + w[t-7] */
-		vpaddq		YTMP0, YTMP0, Y_0		/* YTMP0 = W[-7] + W[-16] */
-		/* Extract w[t-15] */
-		MY_VPALIGNR	YTMP1, Y_1, Y_0, 8		/* YTMP1 = W[-15] */
-
-		/* Calculate sigma0 */
-
-		/* Calculate w[t-15] ror 1 */
-		vpsrlq		YTMP2, YTMP1, 1
-		vpsllq		YTMP3, YTMP1, (64-1)
-		vpor		YTMP3, YTMP3, YTMP2		/* YTMP3 = W[-15] ror 1 */
-		/* Calculate w[t-15] shr 7 */
-		vpsrlq		YTMP4, YTMP1, 7			/* YTMP4 = W[-15] >> 7 */
-
-	ONE_ROUND rsp+frame_XFER+0*8+\X*32
-	RotateState
-
-/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */
-
-/*;;;;;;;;;;;;;;;;;;;;;;;;; */
-
-		/* Calculate w[t-15] ror 8 */
-		vpsrlq		YTMP2, YTMP1, 8
-		vpsllq		YTMP1, YTMP1, (64-8)
-		vpor		YTMP1, YTMP1, YTMP2		/* YTMP1 = W[-15] ror 8 */
-		/* XOR the three components */
-		vpxor		YTMP3, YTMP3, YTMP4		/* YTMP3 = W[-15] ror 1 ^ W[-15] >> 7 */
-		vpxor		YTMP1, YTMP3, YTMP1		/* YTMP1 = s0 */
-
-
-		/* Add three components, w[t-16], w[t-7] and sigma0 */
-		vpaddq		YTMP0, YTMP0, YTMP1		/* YTMP0 = W[-16] + W[-7] + s0 */
-		/* Move to appropriate lanes for calculating w[16] and w[17] */
-		vperm2f128	Y_0, YTMP0, YTMP0, 0x0		/* Y_0 = W[-16] + W[-7] + s0 {BABA} */
-		/* Move to appropriate lanes for calculating w[18] and w[19] */
-		vpand		YTMP0, YTMP0, MASK_YMM_LO	/* YTMP0 = W[-16] + W[-7] + s0 {DC00} */
-
-		/* Calculate w[16] and w[17] in both 128 bit lanes */
-
-		/* Calculate sigma1 for w[16] and w[17] on both 128 bit lanes */
-		vperm2f128	YTMP2, Y_3, Y_3, 0x11		/* YTMP2 = W[-2] {BABA} */
-		vpsrlq		YTMP4, YTMP2, 6			/* YTMP4 = W[-2] >> 6 {BABA} */
-
-	ONE_ROUND rsp+frame_XFER+1*8+\X*32
-	RotateState
-
-/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */
-
-/*;;;;;;;;;;;;;;;;;;;;;;;;; */
 
+#define ONE_ROUND_PART2(a, b, c, d, e, f, g, h) \
+	rorx y2, a, 39; \
+	rorx y1, e, 14; \
+	mov y3, a; \
+	xor T1, y2; \
+	xor y0, y1; \
+	xor y3, b; \
+	lea h, [h + y0]; \
+	mov y0, a; \
+	rorx y2, a, 28; \
+	add d, h; \
+	and y3, c; \
+	xor T1, y2; \
+	lea h, [h + y3]; \
+	lea h, [h + T1]; \
+	and y0, b; \
+	lea h, [h + y0]
 
-		vpsrlq		YTMP3, YTMP2, 19		/* YTMP3 = W[-2] >> 19 {BABA} */
-		vpsllq		YTMP1, YTMP2, (64-19)		/* YTMP1 = W[-2] << 19 {BABA} */
-		vpor		YTMP3, YTMP3, YTMP1		/* YTMP3 = W[-2] ror 19 {BABA} */
-		vpxor		YTMP4, YTMP4, YTMP3		/* YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {BABA} */
-		vpsrlq		YTMP3, YTMP2, 61		/* YTMP3 = W[-2] >> 61 {BABA} */
-		vpsllq		YTMP1, YTMP2, (64-61)		/* YTMP1 = W[-2] << 61 {BABA} */
-		vpor		YTMP3, YTMP3, YTMP1		/* YTMP3 = W[-2] ror 61 {BABA} */
-		vpxor		YTMP4, YTMP4, YTMP3		/* YTMP4 = s1 = (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6) {BABA} */
-
-		/* Add sigma1 to the other compunents to get w[16] and w[17] */
-		vpaddq		Y_0, Y_0, YTMP4			/* Y_0 = {W[1], W[0], W[1], W[0]} */
-
-		/* Calculate sigma1 for w[18] and w[19] for upper 128 bit lane */
-		vpsrlq		YTMP4, Y_0, 6			/* YTMP4 = W[-2] >> 6 {DC--} */
-
-	ONE_ROUND rsp+frame_XFER+2*8+\X*32
-	RotateState
-
-/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */
-
-/*;;;;;;;;;;;;;;;;;;;;;;;;; */
-
-		vpsrlq		YTMP3, Y_0, 19			/* YTMP3 = W[-2] >> 19 {DC--} */
-		vpsllq		YTMP1, Y_0, (64-19)		/* YTMP1 = W[-2] << 19 {DC--} */
-		vpor		YTMP3, YTMP3, YTMP1		/* YTMP3 = W[-2] ror 19 {DC--} */
-		vpxor		YTMP4, YTMP4, YTMP3		/* YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {DC--} */
-		vpsrlq		YTMP3, Y_0, 61			/* YTMP3 = W[-2] >> 61 {DC--} */
-		vpsllq		YTMP1, Y_0, (64-61)		/* YTMP1 = W[-2] << 61 {DC--} */
-		vpor		YTMP3, YTMP3, YTMP1		/* YTMP3 = W[-2] ror 61 {DC--} */
-		vpxor		YTMP4, YTMP4, YTMP3		/* YTMP4 = s1 = (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6) {DC--} */
-
-		/* Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19] to newly calculated sigma1 to get w[18] and w[19] */
-		vpaddq		YTMP2, YTMP0, YTMP4		/* YTMP2 = {W[3], W[2], --, --} */
-
-		/* Form w[19, w[18], w17], w[16] */
-		vpblendd		Y_0, Y_0, YTMP2, 0xF0		/* Y_0 = {W[3], W[2], W[1], W[0]} */
-
-	ONE_ROUND_PART1 rsp+frame_XFER+3*8+\X*32
-		vpaddq		XFER, Y_0, [TBL + (4+\X)*32]
-		vmovdqa		[rsp + frame_XFER + \X*32], XFER
-	ONE_ROUND_PART2
-	RotateState
-	rotate_Ys
-.endm
-
-.macro DO_4ROUNDS X
-
-/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */
-
-	ONE_ROUND rsp+frame_XFER+0*8+\X*32
-	RotateState
-
-/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */
-
-	ONE_ROUND rsp+frame_XFER+1*8+\X*32
-	RotateState
-
-/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */
-
-	ONE_ROUND rsp+frame_XFER+2*8+\X*32
-	RotateState
-
-/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */
-
-	ONE_ROUND rsp+frame_XFER+3*8+\X*32
-	RotateState
-
-.endm
+#define ONE_ROUND(XFERIN, a, b, c, d, e, f, g, h) \
+	ONE_ROUND_PART1(XFERIN, a, b, c, d, e, f, g, h); \
+	ONE_ROUND_PART2(a, b, c, d, e, f, g, h)
+
+#define FOUR_ROUNDS_AND_SCHED(X, Y_0, Y_1, Y_2, Y_3, a, b, c, d, e, f, g, h) \
+	/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \
+		/* Extract w[t-7] */; \
+		MY_VPALIGNR(	YTMP0, Y_3, Y_2, 8)		/* YTMP0 = W[-7] */; \
+		/* Calculate w[t-16] + w[t-7] */; \
+		vpaddq		YTMP0, YTMP0, Y_0		/* YTMP0 = W[-7] + W[-16] */; \
+		/* Extract w[t-15] */; \
+		MY_VPALIGNR(	YTMP1, Y_1, Y_0, 8)		/* YTMP1 = W[-15] */; \
+		\
+		/* Calculate sigma0 */; \
+		\
+		/* Calculate w[t-15] ror 1 */; \
+		vpsrlq		YTMP2, YTMP1, 1; \
+		vpsllq		YTMP3, YTMP1, (64-1); \
+		vpor		YTMP3, YTMP3, YTMP2		/* YTMP3 = W[-15] ror 1 */; \
+		/* Calculate w[t-15] shr 7 */; \
+		vpsrlq		YTMP4, YTMP1, 7			/* YTMP4 = W[-15] >> 7 */; \
+	\
+	ONE_ROUND(rsp+frame_XFER+0*8+X*32, a, b, c, d, e, f, g, h); \
+	\
+	/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \
+		/* Calculate w[t-15] ror 8 */; \
+		vpsrlq		YTMP2, YTMP1, 8; \
+		vpsllq		YTMP1, YTMP1, (64-8); \
+		vpor		YTMP1, YTMP1, YTMP2		/* YTMP1 = W[-15] ror 8 */; \
+		/* XOR the three components */; \
+		vpxor		YTMP3, YTMP3, YTMP4		/* YTMP3 = W[-15] ror 1 ^ W[-15] >> 7 */; \
+		vpxor		YTMP1, YTMP3, YTMP1		/* YTMP1 = s0 */; \
+		\
+		/* Add three components, w[t-16], w[t-7] and sigma0 */; \
+		vpaddq		YTMP0, YTMP0, YTMP1		/* YTMP0 = W[-16] + W[-7] + s0 */; \
+		/* Move to appropriate lanes for calculating w[16] and w[17] */; \
+		vperm2i128	Y_0, YTMP0, YTMP0, 0x0		/* Y_0 = W[-16] + W[-7] + s0 {BABA} */; \
+		/* Move to appropriate lanes for calculating w[18] and w[19] */; \
+		vpand		YTMP0, YTMP0, MASK_YMM_LO	/* YTMP0 = W[-16] + W[-7] + s0 {DC00} */; \
+		\
+		/* Calculate w[16] and w[17] in both 128 bit lanes */; \
+		\
+		/* Calculate sigma1 for w[16] and w[17] on both 128 bit lanes */; \
+		vperm2i128	YTMP2, Y_3, Y_3, 0x11		/* YTMP2 = W[-2] {BABA} */; \
+		vpsrlq		YTMP4, YTMP2, 6			/* YTMP4 = W[-2] >> 6 {BABA} */; \
+	\
+	ONE_ROUND(rsp+frame_XFER+1*8+X*32, h, a, b, c, d, e, f, g); \
+	\
+	/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \
+		vpsrlq		YTMP3, YTMP2, 19		/* YTMP3 = W[-2] >> 19 {BABA} */; \
+		vpsllq		YTMP1, YTMP2, (64-19)		/* YTMP1 = W[-2] << 19 {BABA} */; \
+		vpor		YTMP3, YTMP3, YTMP1		/* YTMP3 = W[-2] ror 19 {BABA} */; \
+		vpxor		YTMP4, YTMP4, YTMP3		/* YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {BABA} */; \
+		vpsrlq		YTMP3, YTMP2, 61		/* YTMP3 = W[-2] >> 61 {BABA} */; \
+		vpsllq		YTMP1, YTMP2, (64-61)		/* YTMP1 = W[-2] << 61 {BABA} */; \
+		vpor		YTMP3, YTMP3, YTMP1		/* YTMP3 = W[-2] ror 61 {BABA} */; \
+		vpxor		YTMP4, YTMP4, YTMP3		/* YTMP4 = s1 = (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6) {BABA} */; \
+		\
+		/* Add sigma1 to the other compunents to get w[16] and w[17] */; \
+		vpaddq		Y_0, Y_0, YTMP4			/* Y_0 = {W[1], W[0], W[1], W[0]} */; \
+		\
+		/* Calculate sigma1 for w[18] and w[19] for upper 128 bit lane */; \
+		vpsrlq		YTMP4, Y_0, 6			/* YTMP4 = W[-2] >> 6 {DC--} */; \
+	\
+	ONE_ROUND(rsp+frame_XFER+2*8+X*32, g, h, a, b, c, d, e, f); \
+	\
+	/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */; \
+		vpsrlq		YTMP3, Y_0, 19			/* YTMP3 = W[-2] >> 19 {DC--} */; \
+		vpsllq		YTMP1, Y_0, (64-19)		/* YTMP1 = W[-2] << 19 {DC--} */; \
+		vpor		YTMP3, YTMP3, YTMP1		/* YTMP3 = W[-2] ror 19 {DC--} */; \
+		vpxor		YTMP4, YTMP4, YTMP3		/* YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {DC--} */; \
+		vpsrlq		YTMP3, Y_0, 61			/* YTMP3 = W[-2] >> 61 {DC--} */; \
+		vpsllq		YTMP1, Y_0, (64-61)		/* YTMP1 = W[-2] << 61 {DC--} */; \
+		vpor		YTMP3, YTMP3, YTMP1		/* YTMP3 = W[-2] ror 61 {DC--} */; \
+		vpxor		YTMP4, YTMP4, YTMP3		/* YTMP4 = s1 = (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6) {DC--} */; \
+		\
+		/* Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19] to newly calculated sigma1 to get w[18] and w[19] */; \
+		vpaddq		YTMP2, YTMP0, YTMP4		/* YTMP2 = {W[3], W[2], --, --} */; \
+		\
+		/* Form w[19, w[18], w17], w[16] */; \
+		vpblendd	Y_0, Y_0, YTMP2, 0xF0		/* Y_0 = {W[3], W[2], W[1], W[0]} */; \
+	\
+	ONE_ROUND_PART1(rsp+frame_XFER+3*8+X*32, f, g, h, a, b, c, d, e); \
+		vpaddq		XFER, Y_0, [TBL + (4+X)*32]; \
+		vmovdqa		[rsp + frame_XFER + X*32], XFER; \
+	ONE_ROUND_PART2(f, g, h, a, b, c, d, e)
+
+#define DO_4ROUNDS(X, a, b, c, d, e, f, g, h) \
+	ONE_ROUND(rsp+frame_XFER+0*8+X*32, a, b, c, d, e, f, g, h); \
+	ONE_ROUND(rsp+frame_XFER+1*8+X*32, h, a, b, c, d, e, f, g); \
+	ONE_ROUND(rsp+frame_XFER+2*8+X*32, g, h, a, b, c, d, e, f); \
+	ONE_ROUND(rsp+frame_XFER+3*8+X*32, f, g, h, a, b, c, d, e)
 
 /*
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -390,10 +324,10 @@ _gcry_sha512_transform_amd64_avx2:
 	lea	TBL,[.LK512 ADD_RIP]
 
 	/*; byte swap first 16 dwords */
-	COPY_YMM_AND_BSWAP	Y_0, [INP + 0*32], BYTE_FLIP_MASK
-	COPY_YMM_AND_BSWAP	Y_1, [INP + 1*32], BYTE_FLIP_MASK
-	COPY_YMM_AND_BSWAP	Y_2, [INP + 2*32], BYTE_FLIP_MASK
-	COPY_YMM_AND_BSWAP	Y_3, [INP + 3*32], BYTE_FLIP_MASK
+	COPY_YMM_AND_BSWAP(Y_0, [INP + 0*32], BYTE_FLIP_MASK)
+	COPY_YMM_AND_BSWAP(Y_1, [INP + 1*32], BYTE_FLIP_MASK)
+	COPY_YMM_AND_BSWAP(Y_2, [INP + 2*32], BYTE_FLIP_MASK)
+	COPY_YMM_AND_BSWAP(Y_3, [INP + 3*32], BYTE_FLIP_MASK)
 
 	add	INP, 128
 	mov	[rsp + frame_INP], INP
@@ -408,20 +342,20 @@ _gcry_sha512_transform_amd64_avx2:
 	vmovdqa [rsp + frame_XFER + 3*32], XFER
 
 	/*; schedule 64 input dwords, by doing 12 rounds of 4 each */
-	movq	[rsp + frame_SRND],4
+	mov	qword ptr [rsp + frame_SRND], 4
 
 .align 16
 .Loop0:
-	FOUR_ROUNDS_AND_SCHED 0
-	FOUR_ROUNDS_AND_SCHED 1
-	FOUR_ROUNDS_AND_SCHED 2
-	FOUR_ROUNDS_AND_SCHED 3
+	FOUR_ROUNDS_AND_SCHED(0, Y_0, Y_1, Y_2, Y_3, a, b, c, d, e, f, g, h)
+	FOUR_ROUNDS_AND_SCHED(1, Y_1, Y_2, Y_3, Y_0, e, f, g, h, a, b, c, d)
+	FOUR_ROUNDS_AND_SCHED(2, Y_2, Y_3, Y_0, Y_1, a, b, c, d, e, f, g, h)
+	FOUR_ROUNDS_AND_SCHED(3, Y_3, Y_0, Y_1, Y_2, e, f, g, h, a, b, c, d)
 	add	TBL, 4*32
 
-	subq	[rsp + frame_SRND], 1
+	sub	qword ptr [rsp + frame_SRND], 1
 	jne	.Loop0
 
-	subq	[rsp + frame_NBLKS], 1
+	sub	qword ptr [rsp + frame_NBLKS], 1
 	je	.Ldone_hash
 
 	mov	INP, [rsp + frame_INP]
@@ -429,62 +363,62 @@ _gcry_sha512_transform_amd64_avx2:
 	lea	TBL,[.LK512 ADD_RIP]
 
 	/* load next block and byte swap */
-	COPY_YMM_AND_BSWAP	Y_0, [INP + 0*32], BYTE_FLIP_MASK
-	COPY_YMM_AND_BSWAP	Y_1, [INP + 1*32], BYTE_FLIP_MASK
-	COPY_YMM_AND_BSWAP	Y_2, [INP + 2*32], BYTE_FLIP_MASK
-	COPY_YMM_AND_BSWAP	Y_3, [INP + 3*32], BYTE_FLIP_MASK
+	COPY_YMM_AND_BSWAP(Y_0, [INP + 0*32], BYTE_FLIP_MASK)
+	COPY_YMM_AND_BSWAP(Y_1, [INP + 1*32], BYTE_FLIP_MASK)
+	COPY_YMM_AND_BSWAP(Y_2, [INP + 2*32], BYTE_FLIP_MASK)
+	COPY_YMM_AND_BSWAP(Y_3, [INP + 3*32], BYTE_FLIP_MASK)
 
 	add	INP, 128
 	mov	[rsp + frame_INP], INP
 
-	DO_4ROUNDS 0
+	DO_4ROUNDS(0, a, b, c, d, e, f, g, h)
 	vpaddq	XFER, Y_0, [TBL + 0*32]
 	vmovdqa [rsp + frame_XFER + 0*32], XFER
-	DO_4ROUNDS 1
+	DO_4ROUNDS(1, e, f, g, h, a, b, c, d)
 	vpaddq	XFER, Y_1, [TBL + 1*32]
 	vmovdqa [rsp + frame_XFER + 1*32], XFER
-	DO_4ROUNDS 2
+	DO_4ROUNDS(2, a, b, c, d, e, f, g, h)
 	vpaddq	XFER, Y_2, [TBL + 2*32]
 	vmovdqa [rsp + frame_XFER + 2*32], XFER
-	DO_4ROUNDS 3
+	DO_4ROUNDS(3, e, f, g, h, a, b, c, d)
 	vpaddq	XFER, Y_3, [TBL + 3*32]
 	vmovdqa [rsp + frame_XFER + 3*32], XFER
 
-	addm	[8*0 + CTX],a
-	addm	[8*1 + CTX],b
-	addm	[8*2 + CTX],c
-	addm	[8*3 + CTX],d
-	addm	[8*4 + CTX],e
-	addm	[8*5 + CTX],f
-	addm	[8*6 + CTX],g
-	addm	[8*7 + CTX],h
+	addm([8*0 + CTX],a)
+	addm([8*1 + CTX],b)
+	addm([8*2 + CTX],c)
+	addm([8*3 + CTX],d)
+	addm([8*4 + CTX],e)
+	addm([8*5 + CTX],f)
+	addm([8*6 + CTX],g)
+	addm([8*7 + CTX],h)
 
 	/*; schedule 64 input dwords, by doing 12 rounds of 4 each */
-	movq	[rsp + frame_SRND],4
+	mov	qword ptr [rsp + frame_SRND],4
 
 	jmp	.Loop0
 
 .Ldone_hash:
 	vzeroall
 
-	DO_4ROUNDS 0
+	DO_4ROUNDS(0, a, b, c, d, e, f, g, h)
 	vmovdqa	[rsp + frame_XFER + 0*32], ymm0 /* burn stack */
-	DO_4ROUNDS 1
+	DO_4ROUNDS(1, e, f, g, h, a, b, c, d)
 	vmovdqa	[rsp + frame_XFER + 1*32], ymm0 /* burn stack */
-	DO_4ROUNDS 2
+	DO_4ROUNDS(2, a, b, c, d, e, f, g, h)
 	vmovdqa	[rsp + frame_XFER + 2*32], ymm0 /* burn stack */
-	DO_4ROUNDS 3
+	DO_4ROUNDS(3, e, f, g, h, a, b, c, d)
 	vmovdqa	[rsp + frame_XFER + 3*32], ymm0 /* burn stack */
 
-	addm	[8*0 + CTX],a
+	addm([8*0 + CTX],a)
 	xor     eax, eax /* burn stack */
-	addm	[8*1 + CTX],b
-	addm	[8*2 + CTX],c
-	addm	[8*3 + CTX],d
-	addm	[8*4 + CTX],e
-	addm	[8*5 + CTX],f
-	addm	[8*6 + CTX],g
-	addm	[8*7 + CTX],h
+	addm([8*1 + CTX],b)
+	addm([8*2 + CTX],c)
+	addm([8*3 + CTX],d)
+	addm([8*4 + CTX],e)
+	addm([8*5 + CTX],f)
+	addm([8*6 + CTX],g)
+	addm([8*7 + CTX],h)
 
 	/* Restore GPRs */
 	mov	rbp, [rsp + frame_GPRSAVE + 8 * 0]
diff --git a/cipher/sha512-ssse3-amd64.S b/cipher/sha512-ssse3-amd64.S
index 39bfe362..6a1328a6 100644
--- a/cipher/sha512-ssse3-amd64.S
+++ b/cipher/sha512-ssse3-amd64.S
@@ -56,32 +56,32 @@
 .text
 
 /* Virtual Registers */
-msg = rdi /* ARG1 */
-digest = rsi /* ARG2 */
-msglen = rdx /* ARG3 */
-T1 = rcx
-T2 = r8
-a_64 = r9
-b_64 = r10
-c_64 = r11
-d_64 = r12
-e_64 = r13
-f_64 = r14
-g_64 = r15
-h_64 = rbx
-tmp0 = rax
+#define msg rdi /* ARG1 */
+#define digest rsi /* ARG2 */
+#define msglen rdx /* ARG3 */
+#define T1 rcx
+#define T2 r8
+#define a_64 r9
+#define b_64 r10
+#define c_64 r11
+#define d_64 r12
+#define e_64 r13
+#define f_64 r14
+#define g_64 r15
+#define h_64 rbx
+#define tmp0 rax
 
 /*
 ; Local variables (stack frame)
 ; Note: frame_size must be an odd multiple of 8 bytes to XMM align RSP
 */
-frame_W      = 0 /* Message Schedule */
-frame_W_size = (80 * 8)
-frame_WK      = ((frame_W) + (frame_W_size)) /* W[t] + K[t] | W[t+1] + K[t+1] */
-frame_WK_size = (2 * 8)
-frame_GPRSAVE      = ((frame_WK) + (frame_WK_size))
-frame_GPRSAVE_size = (5 * 8)
-frame_size = ((frame_GPRSAVE) + (frame_GPRSAVE_size))
+#define frame_W 0 /* Message Schedule */
+#define frame_W_size (80 * 8)
+#define frame_WK ((frame_W) + (frame_W_size)) /* W[t] + K[t] | W[t+1] + K[t+1] */
+#define frame_WK_size (2 * 8)
+#define frame_GPRSAVE ((frame_WK) + (frame_WK_size))
+#define frame_GPRSAVE_size (5 * 8)
+#define frame_size ((frame_GPRSAVE) + (frame_GPRSAVE_size))
 
 
 /* Useful QWORD "arrays" for simpler memory references */
@@ -93,161 +93,151 @@ frame_size = ((frame_GPRSAVE) + (frame_GPRSAVE_size))
 /* MSG, DIGEST, K_t, W_t are arrays */
 /* WK_2(t) points to 1 of 2 qwords at frame.WK depdending on t being odd/even */
 
-.macro RotateState
-	/* Rotate symbles a..h right */
-	__TMP = h_64
-	h_64 =  g_64
-	g_64 =  f_64
-	f_64 =  e_64
-	e_64 =  d_64
-	d_64 =  c_64
-	c_64 =  b_64
-	b_64 =  a_64
-	a_64 =  __TMP
-.endm
-
-.macro SHA512_Round t
-	/* Compute Round %%t */
-	mov	T1,   f_64        /* T1 = f */
-	mov	tmp0, e_64        /* tmp = e */
-	xor	T1,   g_64        /* T1 = f ^ g */
-	ror	tmp0, 23 /* 41     ; tmp = e ror 23 */
-	and	T1,   e_64        /* T1 = (f ^ g) & e */
-	xor	tmp0, e_64        /* tmp = (e ror 23) ^ e */
-	xor	T1,   g_64        /* T1 = ((f ^ g) & e) ^ g = CH(e,f,g) */
-	add	T1,   [WK_2(\t)] /* W[t] + K[t] from message scheduler */
-	ror	tmp0, 4 /* 18      ; tmp = ((e ror 23) ^ e) ror 4 */
-	xor	tmp0, e_64        /* tmp = (((e ror 23) ^ e) ror 4) ^ e */
-	mov	T2,   a_64        /* T2 = a */
-	add	T1,   h_64        /* T1 = CH(e,f,g) + W[t] + K[t] + h */
-	ror	tmp0, 14 /* 14     ; tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e) */
-	add	T1,   tmp0        /* T1 = CH(e,f,g) + W[t] + K[t] + S1(e) */
-	mov	tmp0, a_64        /* tmp = a */
-	xor	T2,   c_64        /* T2 = a ^ c */
-	and	tmp0, c_64        /* tmp = a & c */
-	and	T2,   b_64        /* T2 = (a ^ c) & b */
-	xor	T2,   tmp0        /* T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c) */
-	mov	tmp0, a_64        /* tmp = a */
-	ror	tmp0, 5 /* 39      ; tmp = a ror 5 */
-	xor	tmp0, a_64        /* tmp = (a ror 5) ^ a */
-	add	d_64, T1          /* e(next_state) = d + T1  */
-	ror	tmp0, 6 /* 34      ; tmp = ((a ror 5) ^ a) ror 6 */
-	xor	tmp0, a_64        /* tmp = (((a ror 5) ^ a) ror 6) ^ a */
-	lea	h_64, [T1 + T2]   /* a(next_state) = T1 + Maj(a,b,c) */
-	ror	tmp0, 28 /* 28     ; tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a) */
-	add	h_64, tmp0        /* a(next_state) = T1 + Maj(a,b,c) S0(a) */
-	RotateState
-.endm
-
-.macro SHA512_2Sched_2Round_sse t
-/*	; Compute rounds %%t-2 and %%t-1
-	; Compute message schedule QWORDS %%t and %%t+1
-
-	;   Two rounds are computed based on the values for K[t-2]+W[t-2] and
-	; K[t-1]+W[t-1] which were previously stored at WK_2 by the message
-	; scheduler.
-	;   The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)].
-	; They are then added to their respective SHA512 constants at
-	; [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)]
-	;   For brievity, the comments following vectored instructions only refer to
-	; the first of a pair of QWORDS.
-	; Eg. XMM2=W[t-2] really means XMM2={W[t-2]|W[t-1]}
-	;   The computation of the message schedule and the rounds are tightly
-	; stitched to take advantage of instruction-level parallelism.
-	; For clarity, integer instructions (for the rounds calculation) are indented
-	; by one tab. Vectored instructions (for the message scheduler) are indented
-	; by two tabs. */
-
-	mov	T1, f_64
-		movdqa	xmm2, [W_t(\t-2)]  /* XMM2 = W[t-2] */
-	xor	T1,   g_64
-	and	T1,   e_64
-		movdqa	xmm0, xmm2          /* XMM0 = W[t-2] */
-	xor	T1,   g_64
-	add	T1,   [WK_2(\t)]
-		movdqu	xmm5, [W_t(\t-15)] /* XMM5 = W[t-15] */
-	mov	tmp0, e_64
-	ror	tmp0, 23 /* 41 */
-		movdqa	xmm3, xmm5          /* XMM3 = W[t-15] */
-	xor	tmp0, e_64
-	ror	tmp0, 4 /* 18 */
-		psrlq	xmm0, 61 - 19       /* XMM0 = W[t-2] >> 42 */
-	xor	tmp0, e_64
-	ror	tmp0, 14 /* 14 */
-		psrlq	xmm3, (8 - 7)       /* XMM3 = W[t-15] >> 1 */
-	add	T1,   tmp0
-	add	T1,   h_64
-		pxor	xmm0, xmm2          /* XMM0 = (W[t-2] >> 42) ^ W[t-2] */
-	mov	T2,   a_64
-	xor	T2,   c_64
-		pxor	xmm3, xmm5          /* XMM3 = (W[t-15] >> 1) ^ W[t-15] */
-	and	T2,   b_64
-	mov	tmp0, a_64
-		psrlq	xmm0, 19 - 6        /* XMM0 = ((W[t-2]>>42)^W[t-2])>>13 */
-	and	tmp0, c_64
-	xor	T2,   tmp0
-		psrlq	xmm3, (7 - 1)       /* XMM3 = ((W[t-15]>>1)^W[t-15])>>6 */
-	mov	tmp0, a_64
-	ror	tmp0, 5 /* 39 */
-		pxor	xmm0, xmm2          /* XMM0 = (((W[t-2]>>42)^W[t-2])>>13)^W[t-2] */
-	xor	tmp0, a_64
-	ror	tmp0, 6 /* 34 */
-		pxor	xmm3, xmm5          /* XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15] */
-	xor	tmp0, a_64
-	ror	tmp0, 28 /* 28 */
-		psrlq	xmm0, 6             /* XMM0 = ((((W[t-2]>>42)^W[t-2])>>13)^W[t-2])>>6 */
-	add	T2,   tmp0
-	add	d_64, T1
-		psrlq	xmm3, 1             /* XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]>>1 */
-	lea	h_64, [T1 + T2]
-	RotateState
-		movdqa	xmm1, xmm2          /* XMM1 = W[t-2] */
-	mov	T1, f_64
-	xor	T1,   g_64
-		movdqa	xmm4, xmm5          /* XMM4 = W[t-15] */
-	and	T1,   e_64
-	xor	T1,   g_64
-		psllq	xmm1, (64 - 19) - (64 - 61) /* XMM1 = W[t-2] << 42 */
-	add	T1,   [WK_2(\t+1)]
-	mov	tmp0, e_64
-		psllq	xmm4, (64 - 1) - (64 - 8) /* XMM4 = W[t-15] << 7 */
-	ror	tmp0, 23 /* 41 */
-	xor	tmp0, e_64
-		pxor	xmm1, xmm2          /* XMM1 = (W[t-2] << 42)^W[t-2] */
-	ror	tmp0, 4 /* 18 */
-	xor	tmp0, e_64
-		pxor	xmm4, xmm5          /* XMM4 = (W[t-15]<<7)^W[t-15] */
-	ror	tmp0, 14 /* 14 */
-	add	T1,   tmp0
-		psllq	xmm1, (64 - 61)     /* XMM1 = ((W[t-2] << 42)^W[t-2])<<3 */
-	add	T1,   h_64
-	mov	T2,   a_64
-		psllq	xmm4, (64 - 8)      /* XMM4 = ((W[t-15]<<7)^W[t-15])<<56 */
-	xor	T2,   c_64
-	and	T2,   b_64
-		pxor	xmm0, xmm1          /* XMM0 = s1(W[t-2]) */
-	mov	tmp0, a_64
-	and	tmp0, c_64
-		movdqu	xmm1, [W_t(\t- 7)] /* XMM1 = W[t-7] */
-	xor	T2,   tmp0
-		pxor	xmm3, xmm4          /* XMM3 = s0(W[t-15]) */
-	mov	tmp0, a_64
-		paddq	xmm0, xmm3          /* XMM0 = s1(W[t-2]) + s0(W[t-15]) */
-	ror	tmp0, 5 /* 39 */
-		paddq	xmm0, [W_t(\t-16)] /* XMM0 = s1(W[t-2]) + s0(W[t-15]) + W[t-16] */
-	xor	tmp0, a_64
-		paddq	xmm0, xmm1          /* XMM0 = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16] */
-	ror	tmp0, 6 /* 34 */
-		movdqa	[W_t(\t)], xmm0     /* Store scheduled qwords */
-	xor	tmp0, a_64
-		paddq	xmm0, [K_t(t)]      /* Compute W[t]+K[t] */
-	ror	tmp0, 28 /* 28 */
-		movdqa	[WK_2(t)], xmm0     /* Store W[t]+K[t] for next rounds */
-	add	T2,   tmp0
-	add	d_64, T1
-	lea	h_64, [T1 + T2]
-	RotateState
-.endm
+#define SHA512_Round(t, a, b, c, d, e, f, g, h) \
+	/* Compute Round %%t */; \
+	mov	T1,   f        /* T1 = f */; \
+	mov	tmp0, e        /* tmp = e */; \
+	xor	T1,   g        /* T1 = f ^ g */; \
+	ror	tmp0, 23 /* 41     ; tmp = e ror 23 */; \
+	and	T1,   e        /* T1 = (f ^ g) & e */; \
+	xor	tmp0, e        /* tmp = (e ror 23) ^ e */; \
+	xor	T1,   g        /* T1 = ((f ^ g) & e) ^ g = CH(e,f,g) */; \
+	add	T1,   [WK_2(t)] /* W[t] + K[t] from message scheduler */; \
+	ror	tmp0, 4 /* 18      ; tmp = ((e ror 23) ^ e) ror 4 */; \
+	xor	tmp0, e        /* tmp = (((e ror 23) ^ e) ror 4) ^ e */; \
+	mov	T2,   a        /* T2 = a */; \
+	add	T1,   h        /* T1 = CH(e,f,g) + W[t] + K[t] + h */; \
+	ror	tmp0, 14 /* 14     ; tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e) */; \
+	add	T1,   tmp0     /* T1 = CH(e,f,g) + W[t] + K[t] + S1(e) */; \
+	mov	tmp0, a        /* tmp = a */; \
+	xor	T2,   c        /* T2 = a ^ c */; \
+	and	tmp0, c        /* tmp = a & c */; \
+	and	T2,   b        /* T2 = (a ^ c) & b */; \
+	xor	T2,   tmp0     /* T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c) */; \
+	mov	tmp0, a        /* tmp = a */; \
+	ror	tmp0, 5 /* 39      ; tmp = a ror 5 */; \
+	xor	tmp0, a        /* tmp = (a ror 5) ^ a */; \
+	add	d, T1          /* e(next_state) = d + T1  */; \
+	ror	tmp0, 6 /* 34      ; tmp = ((a ror 5) ^ a) ror 6 */; \
+	xor	tmp0, a        /* tmp = (((a ror 5) ^ a) ror 6) ^ a */; \
+	lea	h, [T1 + T2]   /* a(next_state) = T1 + Maj(a,b,c) */; \
+	ror	tmp0, 28 /* 28     ; tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a) */; \
+	add	h, tmp0        /* a(next_state) = T1 + Maj(a,b,c) S0(a) */
+
+#define SHA512_2Sched_2Round_sse_PART1(t, a, b, c, d, e, f, g, h) \
+	/* \
+	; Compute rounds %%t-2 and %%t-1 \
+	; Compute message schedule QWORDS %%t and %%t+1 \
+	; \
+	;   Two rounds are computed based on the values for K[t-2]+W[t-2] and \
+	; K[t-1]+W[t-1] which were previously stored at WK_2 by the message \
+	; scheduler. \
+	;   The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)]. \
+	; They are then added to their respective SHA512 constants at \
+	; [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)] \
+	;   For brievity, the comments following vectored instructions only refer to \
+	; the first of a pair of QWORDS. \
+	; Eg. XMM2=W[t-2] really means XMM2={W[t-2]|W[t-1]} \
+	;   The computation of the message schedule and the rounds are tightly \
+	; stitched to take advantage of instruction-level parallelism. \
+	; For clarity, integer instructions (for the rounds calculation) are indented \
+	; by one tab. Vectored instructions (for the message scheduler) are indented \
+	; by two tabs. \
+	*/ \
+	\
+	mov	T1, f; \
+		movdqa	xmm2, [W_t(t-2)]  /* XMM2 = W[t-2] */; \
+	xor	T1,   g; \
+	and	T1,   e; \
+		movdqa	xmm0, xmm2          /* XMM0 = W[t-2] */; \
+	xor	T1,   g; \
+	add	T1,   [WK_2(t)]; \
+		movdqu	xmm5, [W_t(t-15)] /* XMM5 = W[t-15] */; \
+	mov	tmp0, e; \
+	ror	tmp0, 23 /* 41 */; \
+		movdqa	xmm3, xmm5          /* XMM3 = W[t-15] */; \
+	xor	tmp0, e; \
+	ror	tmp0, 4 /* 18 */; \
+		psrlq	xmm0, 61 - 19       /* XMM0 = W[t-2] >> 42 */; \
+	xor	tmp0, e; \
+	ror	tmp0, 14 /* 14 */; \
+		psrlq	xmm3, (8 - 7)       /* XMM3 = W[t-15] >> 1 */; \
+	add	T1,   tmp0; \
+	add	T1,   h; \
+		pxor	xmm0, xmm2          /* XMM0 = (W[t-2] >> 42) ^ W[t-2] */; \
+	mov	T2,   a; \
+	xor	T2,   c; \
+		pxor	xmm3, xmm5          /* XMM3 = (W[t-15] >> 1) ^ W[t-15] */; \
+	and	T2,   b; \
+	mov	tmp0, a; \
+		psrlq	xmm0, 19 - 6        /* XMM0 = ((W[t-2]>>42)^W[t-2])>>13 */; \
+	and	tmp0, c; \
+	xor	T2,   tmp0; \
+		psrlq	xmm3, (7 - 1)       /* XMM3 = ((W[t-15]>>1)^W[t-15])>>6 */; \
+	mov	tmp0, a; \
+	ror	tmp0, 5 /* 39 */; \
+		pxor	xmm0, xmm2          /* XMM0 = (((W[t-2]>>42)^W[t-2])>>13)^W[t-2] */; \
+	xor	tmp0, a; \
+	ror	tmp0, 6 /* 34 */; \
+		pxor	xmm3, xmm5          /* XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15] */; \
+	xor	tmp0, a; \
+	ror	tmp0, 28 /* 28 */; \
+		psrlq	xmm0, 6             /* XMM0 = ((((W[t-2]>>42)^W[t-2])>>13)^W[t-2])>>6 */; \
+	add	T2,   tmp0; \
+	add	d, T1; \
+		psrlq	xmm3, 1             /* XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]>>1 */; \
+	lea	h, [T1 + T2]
+
+#define SHA512_2Sched_2Round_sse_PART2(t, a, b, c, d, e, f, g, h) \
+		movdqa	xmm1, xmm2          /* XMM1 = W[t-2] */; \
+	mov	T1,   f; \
+	xor	T1,   g; \
+		movdqa	xmm4, xmm5          /* XMM4 = W[t-15] */; \
+	and	T1,   e; \
+	xor	T1,   g; \
+		psllq	xmm1, (64 - 19) - (64 - 61) /* XMM1 = W[t-2] << 42 */; \
+	add	T1,   [WK_2(t+1)]; \
+	mov	tmp0, e; \
+		psllq	xmm4, (64 - 1) - (64 - 8) /* XMM4 = W[t-15] << 7 */; \
+	ror	tmp0, 23 /* 41 */; \
+	xor	tmp0, e; \
+		pxor	xmm1, xmm2          /* XMM1 = (W[t-2] << 42)^W[t-2] */; \
+	ror	tmp0, 4 /* 18 */; \
+	xor	tmp0, e; \
+		pxor	xmm4, xmm5          /* XMM4 = (W[t-15]<<7)^W[t-15] */; \
+	ror	tmp0, 14 /* 14 */; \
+	add	T1,   tmp0; \
+		psllq	xmm1, (64 - 61)     /* XMM1 = ((W[t-2] << 42)^W[t-2])<<3 */; \
+	add	T1,   h; \
+	mov	T2,   a; \
+		psllq	xmm4, (64 - 8)      /* XMM4 = ((W[t-15]<<7)^W[t-15])<<56 */; \
+	xor	T2,   c; \
+	and	T2,   b; \
+		pxor	xmm0, xmm1          /* XMM0 = s1(W[t-2]) */; \
+	mov	tmp0, a; \
+	and	tmp0, c; \
+		movdqu	xmm1, [W_t(t- 7)] /* XMM1 = W[t-7] */; \
+	xor	T2,   tmp0; \
+		pxor	xmm3, xmm4          /* XMM3 = s0(W[t-15]) */; \
+	mov	tmp0, a; \
+		paddq	xmm0, xmm3          /* XMM0 = s1(W[t-2]) + s0(W[t-15]) */; \
+	ror	tmp0, 5 /* 39 */; \
+		paddq	xmm0, [W_t(t-16)] /* XMM0 = s1(W[t-2]) + s0(W[t-15]) + W[t-16] */; \
+	xor	tmp0, a; \
+		paddq	xmm0, xmm1          /* XMM0 = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16] */; \
+	ror	tmp0, 6 /* 34 */; \
+		movdqa	[W_t(t)], xmm0     /* Store scheduled qwords */; \
+	xor	tmp0, a; \
+		paddq	xmm0, [K_t(t)]      /* Compute W[t]+K[t] */; \
+	ror	tmp0, 28 /* 28 */; \
+		movdqa	[WK_2(t)], xmm0     /* Store W[t]+K[t] for next rounds */; \
+	add	T2,   tmp0; \
+	add	d, T1; \
+	lea	h, [T1 + T2]
+
+#define SHA512_2Sched_2Round_sse(t, a, b, c, d, e, f, g, h) \
+	SHA512_2Sched_2Round_sse_PART1(t, a, b, c, d, e, f, g, h); \
+	SHA512_2Sched_2Round_sse_PART2(t, h, a, b, c, d, e, f, g)
 
 /*
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -295,37 +285,77 @@ _gcry_sha512_transform_amd64_ssse3:
 	mov	g_64, [DIGEST(6)]
 	mov	h_64, [DIGEST(7)]
 
-	t = 0
-	.rept 80/2 + 1
-	/* (80 rounds) / (2 rounds/iteration) + (1 iteration) */
-	/* +1 iteration because the scheduler leads hashing by 1 iteration */
-		.if t < 2
-			/* BSWAP 2 QWORDS */
-			movdqa	xmm1, [.LXMM_QWORD_BSWAP ADD_RIP]
-			movdqu	xmm0, [MSG(t)]
-			pshufb	xmm0, xmm1      /* BSWAP */
-			movdqa	[W_t(t)], xmm0  /* Store Scheduled Pair */
-			paddq	xmm0, [K_t(t)]  /* Compute W[t]+K[t] */
-			movdqa	[WK_2(t)], xmm0 /* Store into WK for rounds */
-		.elseif t < 16
-			/* BSWAP 2 QWORDS; Compute 2 Rounds */
-			movdqu	xmm0, [MSG(t)]
-			pshufb	xmm0, xmm1      /* BSWAP */
-			SHA512_Round (t - 2)    /* Round t-2 */
-			movdqa	[W_t(t)], xmm0  /* Store Scheduled Pair */
-			paddq	xmm0, [K_t(t)]  /* Compute W[t]+K[t] */
-			SHA512_Round (t - 1)    /* Round t-1 */
-			movdqa	[WK_2(t)], xmm0 /* Store W[t]+K[t] into WK */
-		.elseif t < 79
-			/* Schedule 2 QWORDS; Compute 2 Rounds */
-			SHA512_2Sched_2Round_sse t
-		.else
-			/* Compute 2 Rounds */
-			SHA512_Round (t - 2)
-			SHA512_Round (t - 1)
-		.endif
-		t = (t)+2
-	.endr
+	/* BSWAP 2 QWORDS */
+	movdqa	xmm1, [.LXMM_QWORD_BSWAP ADD_RIP]
+	movdqu	xmm0, [MSG(0)]
+	pshufb	xmm0, xmm1      /* BSWAP */
+	movdqa	[W_t(0)], xmm0  /* Store Scheduled Pair */
+	paddq	xmm0, [K_t(0)]  /* Compute W[t]+K[t] */
+	movdqa	[WK_2(0)], xmm0 /* Store into WK for rounds */
+
+	#define T_2_14(t, a, b, c, d, e, f, g, h) \
+		/* BSWAP 2 QWORDS; Compute 2 Rounds */; \
+		movdqu	xmm0, [MSG(t)]; \
+		pshufb	xmm0, xmm1      /* BSWAP */; \
+		SHA512_Round(((t) - 2), a##_64, b##_64, c##_64, d##_64, \
+				        e##_64, f##_64, g##_64, h##_64); \
+		movdqa	[W_t(t)], xmm0  /* Store Scheduled Pair */; \
+		paddq	xmm0, [K_t(t)]  /* Compute W[t]+K[t] */; \
+		SHA512_Round(((t) - 1), h##_64, a##_64, b##_64, c##_64, \
+				        d##_64, e##_64, f##_64, g##_64); \
+		movdqa	[WK_2(t)], xmm0 /* Store W[t]+K[t] into WK */
+
+	#define T_16_78(t, a, b, c, d, e, f, g, h) \
+		SHA512_2Sched_2Round_sse((t), a##_64, b##_64, c##_64, d##_64, \
+					      e##_64, f##_64, g##_64, h##_64)
+
+	#define T_80(t, a, b, c, d, e, f, g, h) \
+		/* Compute 2 Rounds */; \
+		SHA512_Round((t - 2), a##_64, b##_64, c##_64, d##_64, \
+				      e##_64, f##_64, g##_64, h##_64); \
+		SHA512_Round((t - 1), h##_64, a##_64, b##_64, c##_64, \
+				      d##_64, e##_64, f##_64, g##_64)
+
+	T_2_14(2, a, b, c, d, e, f, g, h)
+	T_2_14(4, g, h, a, b, c, d, e, f)
+	T_2_14(6, e, f, g, h, a, b, c, d)
+	T_2_14(8, c, d, e, f, g, h, a, b)
+	T_2_14(10, a, b, c, d, e, f, g, h)
+	T_2_14(12, g, h, a, b, c, d, e, f)
+	T_2_14(14, e, f, g, h, a, b, c, d)
+	T_16_78(16, c, d, e, f, g, h, a, b)
+	T_16_78(18, a, b, c, d, e, f, g, h)
+	T_16_78(20, g, h, a, b, c, d, e, f)
+	T_16_78(22, e, f, g, h, a, b, c, d)
+	T_16_78(24, c, d, e, f, g, h, a, b)
+	T_16_78(26, a, b, c, d, e, f, g, h)
+	T_16_78(28, g, h, a, b, c, d, e, f)
+	T_16_78(30, e, f, g, h, a, b, c, d)
+	T_16_78(32, c, d, e, f, g, h, a, b)
+	T_16_78(34, a, b, c, d, e, f, g, h)
+	T_16_78(36, g, h, a, b, c, d, e, f)
+	T_16_78(38, e, f, g, h, a, b, c, d)
+	T_16_78(40, c, d, e, f, g, h, a, b)
+	T_16_78(42, a, b, c, d, e, f, g, h)
+	T_16_78(44, g, h, a, b, c, d, e, f)
+	T_16_78(46, e, f, g, h, a, b, c, d)
+	T_16_78(48, c, d, e, f, g, h, a, b)
+	T_16_78(50, a, b, c, d, e, f, g, h)
+	T_16_78(52, g, h, a, b, c, d, e, f)
+	T_16_78(54, e, f, g, h, a, b, c, d)
+	T_16_78(56, c, d, e, f, g, h, a, b)
+	T_16_78(58, a, b, c, d, e, f, g, h)
+	T_16_78(60, g, h, a, b, c, d, e, f)
+	T_16_78(62, e, f, g, h, a, b, c, d)
+	T_16_78(64, c, d, e, f, g, h, a, b)
+	T_16_78(66, a, b, c, d, e, f, g, h)
+	T_16_78(68, g, h, a, b, c, d, e, f)
+	T_16_78(70, e, f, g, h, a, b, c, d)
+	T_16_78(72, c, d, e, f, g, h, a, b)
+	T_16_78(74, a, b, c, d, e, f, g, h)
+	T_16_78(76, g, h, a, b, c, d, e, f)
+	T_16_78(78, e, f, g, h, a, b, c, d)
+	T_80(80, c, d, e, f, g, h, a, b)
 
 	/* Update digest */
 	add	[DIGEST(0)], a_64
@@ -362,11 +392,12 @@ _gcry_sha512_transform_amd64_ssse3:
 	pxor	xmm5, xmm5
 
 	/* Burn stack */
-	t = 0
-	.rept frame_W_size / 16
-		movdqu [rsp + frame_W + (t) * 16], xmm0
-		t = ((t)+1)
-	.endr
+	mov eax, 0
+.Lerase_stack:
+	movdqu [rsp + rax], xmm0
+	add eax, 16
+	cmp eax, frame_W_size
+	jne .Lerase_stack
 	movdqu [rsp + frame_WK], xmm0
 	xor     eax, eax
 
diff --git a/configure.ac b/configure.ac
index f7339a3e..e4a10b78 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1741,21 +1741,11 @@ AC_CACHE_CHECK([whether GCC assembler is compatible for Intel syntax assembly im
                 ".text\n\t"
                 "actest:\n\t"
                 "pxor xmm1, xmm7;\n\t"
-                /* Intel syntax implementation also use GAS macros, so check
-                 * for them here. */
-                "VAL_A = xmm4\n\t"
-                "VAL_B = xmm2\n\t"
-                ".macro SET_VAL_A p1\n\t"
-                "  VAL_A = \\\\p1 \n\t"
-                ".endm\n\t"
-                ".macro SET_VAL_B p1\n\t"
-                "  VAL_B = \\\\p1 \n\t"
-                ".endm\n\t"
-                "vmovdqa VAL_A, VAL_B;\n\t"
-                "SET_VAL_A eax\n\t"
-                "SET_VAL_B ebp\n\t"
-                "add VAL_A, VAL_B;\n\t"
-                "add VAL_B, 0b10101;\n\t"
+                "vperm2i128 ymm2, ymm3, ymm0, 1;\n\t"
+                "add eax, ebp;\n\t"
+                "rorx eax, ebp, 1;\n\t"
+                "sub eax, [esp + 4];\n\t"
+                "add dword ptr [esp + eax], 0b10101;\n\t"
                 ".att_syntax prefix\n\t"
             );]], [ actest(); ])],
           [gcry_cv_gcc_platform_as_ok_for_intel_syntax=yes])
-- 
2.27.0



