[PATCH] serpent-amd64: do not use GAS macros

Jussi Kivilinna jussi.kivilinna at iki.fi
Sun Oct 20 20:01:42 CEST 2013


* cipher/serpent-avx2-amd64.S: Remove use of GAS macros.
* cipher/serpent-sse2-amd64.S: Ditto.
* configure.ac [HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS]: Do not check
for GAS macros.
--

This improves portability; for example, when compiling with clang on x86-64,
the assembly implementations are now enabled and working.
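
In short: the old code created register aliases with GAS ".set" and re-pointed
them after every sbox, which is exactly the ".set" usage the removed configure
test probed for and which not every assembler accepts.  A simplified
before/after sketch (register names and macro arguments are taken from the
AVX2 file; the SSE2 file follows the same pattern):

    /* before: assembler-level aliasing, re-bound after each sbox */
    .set RA0, %ymm0
    sbox_reg_rename(r0,r1,r2,r3,r4, r1,r4,r2,r0,r3);

    /* after: plain preprocessor defines; the per-round register
       permutation is passed explicitly to each ROUND invocation */
    #define RA0 %ymm0
    ROUND (0, 0, RA0, RA1, RA2, RA3, RA4, RA1, RA4, RA2, RA0, RA3,
                 RB0, RB1, RB2, RB3, RB4, RB1, RB4, RB2, RB0, RB3);

Only the C preprocessor is needed for this, so the assembler no longer has to
support symbol redefinition via ".set".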

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 cipher/serpent-avx2-amd64.S |  519 ++++++++++++++++++-------------------------
 cipher/serpent-sse2-amd64.S |  507 ++++++++++++++++++------------------------
 configure.ac                |    6 
 3 files changed, 439 insertions(+), 593 deletions(-)

diff --git a/cipher/serpent-avx2-amd64.S b/cipher/serpent-avx2-amd64.S
index c726e7b..8a76ab1 100644
--- a/cipher/serpent-avx2-amd64.S
+++ b/cipher/serpent-avx2-amd64.S
@@ -36,51 +36,36 @@
 #define CTX %rdi
 
 /* vector registers */
-.set RA0, %ymm0
-.set RA1, %ymm1
-.set RA2, %ymm2
-.set RA3, %ymm3
-.set RA4, %ymm4
-
-.set RB0, %ymm5
-.set RB1, %ymm6
-.set RB2, %ymm7
-.set RB3, %ymm8
-.set RB4, %ymm9
-
-.set RNOT, %ymm10
-.set RTMP0, %ymm11
-.set RTMP1, %ymm12
-.set RTMP2, %ymm13
-.set RTMP3, %ymm14
-.set RTMP4, %ymm15
-
-.set RNOTx, %xmm10
-.set RTMP0x, %xmm11
-.set RTMP1x, %xmm12
-.set RTMP2x, %xmm13
-.set RTMP3x, %xmm14
-.set RTMP4x, %xmm15
+#define RA0 %ymm0
+#define RA1 %ymm1
+#define RA2 %ymm2
+#define RA3 %ymm3
+#define RA4 %ymm4
+
+#define RB0 %ymm5
+#define RB1 %ymm6
+#define RB2 %ymm7
+#define RB3 %ymm8
+#define RB4 %ymm9
+
+#define RNOT %ymm10
+#define RTMP0 %ymm11
+#define RTMP1 %ymm12
+#define RTMP2 %ymm13
+#define RTMP3 %ymm14
+#define RTMP4 %ymm15
+
+#define RNOTx %xmm10
+#define RTMP0x %xmm11
+#define RTMP1x %xmm12
+#define RTMP2x %xmm13
+#define RTMP3x %xmm14
+#define RTMP4x %xmm15
 
 /**********************************************************************
   helper macros
  **********************************************************************/
 
-/* preprocessor macro for renaming vector registers using GAS macros */
-#define sbox_reg_rename(r0, r1, r2, r3, r4, \
-			new_r0, new_r1, new_r2, new_r3, new_r4) \
-	.set rename_reg0, new_r0; \
-	.set rename_reg1, new_r1; \
-	.set rename_reg2, new_r2; \
-	.set rename_reg3, new_r3; \
-	.set rename_reg4, new_r4; \
-	\
-	.set r0, rename_reg0; \
-	.set r1, rename_reg1; \
-	.set r2, rename_reg2; \
-	.set r3, rename_reg3; \
-	.set r4, rename_reg4;
-
 /* vector 32-bit rotation to left */
 #define vec_rol(reg, nleft, tmp) \
 	vpslld $(nleft), reg, tmp;		\
@@ -128,9 +113,7 @@
 	vpxor	r4, r2, r2;		vpxor	RNOT, r4, r4;		\
 	vpor	r1, r4, r4;		vpxor	r3, r1, r1;		\
 	vpxor	r4, r1, r1;		vpor	r0, r3, r3;		\
-	vpxor	r3, r1, r1;		vpxor	r3, r4, r4;		\
-	\
-	sbox_reg_rename(r0,r1,r2,r3,r4, r1,r4,r2,r0,r3);
+	vpxor	r3, r1, r1;		vpxor	r3, r4, r4;
 
 #define SBOX0_INVERSE(r0, r1, r2, r3, r4) \
 	vpxor	RNOT, r2, r2;		vmovdqa	r1, r4;			\
@@ -143,9 +126,7 @@
 	vpxor	r1, r2, r2;		vpxor	r0, r3, r3;		\
 	vpxor	r1, r3, r3;	\
 	vpand	r3, r2, r2;	\
-	vpxor	r2, r4, r4;	\
-	\
-	sbox_reg_rename(r0,r1,r2,r3,r4, r0,r4,r1,r3,r2);
+	vpxor	r2, r4, r4;
 
 #define SBOX1(r0, r1, r2, r3, r4) \
 	vpxor	RNOT, r0, r0;		vpxor	RNOT, r2, r2;		\
@@ -157,9 +138,7 @@
 	vpand	r4, r2, r2;		vpxor	r1, r0, r0;		\
 	vpand	r2, r1, r1;	\
 	vpxor	r0, r1, r1;		vpand	r2, r0, r0;		\
-	vpxor	r4, r0, r0;	\
-	\
-	sbox_reg_rename(r0,r1,r2,r3,r4, r2,r0,r3,r1,r4);
+	vpxor	r4, r0, r0;
 
 #define SBOX1_INVERSE(r0, r1, r2, r3, r4) \
 	vmovdqa	r1, r4;			vpxor	r3, r1, r1;		\
@@ -172,9 +151,7 @@
 	vpxor	r1, r4, r4;		vpor	r0, r1, r1;		\
 	vpxor	r0, r1, r1;	\
 	vpor	r4, r1, r1;	\
-	vpxor	r1, r3, r3;	\
-	\
-	sbox_reg_rename(r0,r1,r2,r3,r4, r4,r0,r3,r2,r1);
+	vpxor	r1, r3, r3;
 
 #define SBOX2(r0, r1, r2, r3, r4) \
 	vmovdqa	r0, r4;			vpand	r2, r0, r0;		\
@@ -184,9 +161,7 @@
 	vmovdqa	r3, r1;			vpor	r4, r3, r3;		\
 	vpxor	r0, r3, r3;		vpand	r1, r0, r0;		\
 	vpxor	r0, r4, r4;		vpxor	r3, r1, r1;		\
-	vpxor	r4, r1, r1;		vpxor	RNOT, r4, r4;		\
-	\
-	sbox_reg_rename(r0,r1,r2,r3,r4, r2,r3,r1,r4,r0);
+	vpxor	r4, r1, r1;		vpxor	RNOT, r4, r4;
 
 #define SBOX2_INVERSE(r0, r1, r2, r3, r4) \
 	vpxor	r3, r2, r2;		vpxor	r0, r3, r3;		\
@@ -198,9 +173,7 @@
 	vpor	r0, r2, r2;		vpxor	RNOT, r3, r3;		\
 	vpxor	r3, r2, r2;		vpxor	r3, r0, r0;		\
 	vpand	r1, r0, r0;		vpxor	r4, r3, r3;		\
-	vpxor	r0, r3, r3;	\
-	\
-	sbox_reg_rename(r0,r1,r2,r3,r4, r1,r4,r2,r3,r0);
+	vpxor	r0, r3, r3;
 
 #define SBOX3(r0, r1, r2, r3, r4) \
 	vmovdqa	r0, r4;			vpor	r3, r0, r0;		\
@@ -212,9 +185,7 @@
 	vpxor	r2, r4, r4;		vpor	r0, r1, r1;		\
 	vpxor	r2, r1, r1;		vpxor	r3, r0, r0;		\
 	vmovdqa	r1, r2;			vpor	r3, r1, r1;		\
-	vpxor	r0, r1, r1;	\
-	\
-	sbox_reg_rename(r0,r1,r2,r3,r4, r1,r2,r3,r4,r0);
+	vpxor	r0, r1, r1;
 
 #define SBOX3_INVERSE(r0, r1, r2, r3, r4) \
 	vmovdqa	r2, r4;			vpxor	r1, r2, r2;		\
@@ -226,9 +197,7 @@
 	vpxor	r1, r3, r3;		vpxor	r0, r1, r1;		\
 	vpor	r2, r1, r1;		vpxor	r3, r0, r0;		\
 	vpxor	r4, r1, r1;	\
-	vpxor	r1, r0, r0;	\
-	\
-	sbox_reg_rename(r0,r1,r2,r3,r4, r2,r1,r3,r0,r4);
+	vpxor	r1, r0, r0;
 
 #define SBOX4(r0, r1, r2, r3, r4) \
 	vpxor	r3, r1, r1;		vpxor	RNOT, r3, r3;		\
@@ -240,9 +209,7 @@
 	vpxor	r0, r3, r3;		vpor	r1, r4, r4;		\
 	vpxor	r0, r4, r4;		vpor	r3, r0, r0;		\
 	vpxor	r2, r0, r0;		vpand	r3, r2, r2;		\
-	vpxor	RNOT, r0, r0;		vpxor	r2, r4, r4;		\
-	\
-	sbox_reg_rename(r0,r1,r2,r3,r4, r1,r4,r0,r3,r2);
+	vpxor	RNOT, r0, r0;		vpxor	r2, r4, r4;
 
 #define SBOX4_INVERSE(r0, r1, r2, r3, r4) \
 	vmovdqa	r2, r4;			vpand	r3, r2, r2;		\
@@ -255,9 +222,7 @@
 	vpand	r0, r2, r2;		vpxor	r0, r3, r3;		\
 	vpxor	r4, r2, r2;	\
 	vpor	r3, r2, r2;		vpxor	r0, r3, r3;		\
-	vpxor	r1, r2, r2;	\
-	\
-	sbox_reg_rename(r0,r1,r2,r3,r4, r0,r3,r2,r4,r1);
+	vpxor	r1, r2, r2;
 
 #define SBOX5(r0, r1, r2, r3, r4) \
 	vpxor	r1, r0, r0;		vpxor	r3, r1, r1;		\
@@ -269,9 +234,7 @@
 	vpxor	r2, r4, r4;		vpxor	r0, r2, r2;		\
 	vpand	r3, r0, r0;		vpxor	RNOT, r2, r2;		\
 	vpxor	r4, r0, r0;		vpor	r3, r4, r4;		\
-	vpxor	r4, r2, r2;	\
-	\
-	sbox_reg_rename(r0,r1,r2,r3,r4, r1,r3,r0,r2,r4);
+	vpxor	r4, r2, r2;
 
 #define SBOX5_INVERSE(r0, r1, r2, r3, r4) \
 	vpxor	RNOT, r1, r1;		vmovdqa	r3, r4;			\
@@ -283,9 +246,7 @@
 	vpxor	r3, r1, r1;		vpxor	r2, r4, r4;		\
 	vpand	r4, r3, r3;		vpxor	r1, r4, r4;		\
 	vpxor	r4, r3, r3;		vpxor	RNOT, r4, r4;		\
-	vpxor	r0, r3, r3;	\
-	\
-	sbox_reg_rename(r0,r1,r2,r3,r4, r1,r4,r3,r2,r0);
+	vpxor	r0, r3, r3;
 
 #define SBOX6(r0, r1, r2, r3, r4) \
 	vpxor	RNOT, r2, r2;		vmovdqa	r3, r4;			\
@@ -297,9 +258,7 @@
 	vpxor	r2, r0, r0;		vpxor	r3, r4, r4;		\
 	vpxor	r0, r4, r4;		vpxor	RNOT, r3, r3;		\
 	vpand	r4, r2, r2;	\
-	vpxor	r3, r2, r2;	\
-	\
-	sbox_reg_rename(r0,r1,r2,r3,r4, r0,r1,r4,r2,r3);
+	vpxor	r3, r2, r2;
 
 #define SBOX6_INVERSE(r0, r1, r2, r3, r4) \
 	vpxor	r2, r0, r0;		vmovdqa	r2, r4;			\
@@ -310,9 +269,7 @@
 	vpxor	r1, r4, r4;		vpand	r3, r1, r1;		\
 	vpxor	r0, r1, r1;		vpxor	r3, r0, r0;		\
 	vpor	r2, r0, r0;		vpxor	r1, r3, r3;		\
-	vpxor	r0, r4, r4;	\
-	\
-	sbox_reg_rename(r0,r1,r2,r3,r4, r1,r2,r4,r3,r0);
+	vpxor	r0, r4, r4;
 
 #define SBOX7(r0, r1, r2, r3, r4) \
 	vmovdqa	r1, r4;			vpor	r2, r1, r1;		\
@@ -325,9 +282,7 @@
 	vpxor	r1, r2, r2;		vpand	r0, r1, r1;		\
 	vpxor	r4, r1, r1;		vpxor	RNOT, r2, r2;		\
 	vpor	r0, r2, r2;	\
-	vpxor	r2, r4, r4;	\
-	\
-	sbox_reg_rename(r0,r1,r2,r3,r4, r4,r3,r1,r0,r2);
+	vpxor	r2, r4, r4;
 
 #define SBOX7_INVERSE(r0, r1, r2, r3, r4) \
 	vmovdqa	r2, r4;			vpxor	r0, r2, r2;		\
@@ -339,9 +294,7 @@
 	vpor	r2, r0, r0;		vpxor	r1, r4, r4;		\
 	vpxor	r3, r0, r0;		vpxor	r4, r3, r3;		\
 	vpor	r0, r4, r4;		vpxor	r2, r3, r3;		\
-	vpxor	r2, r4, r4;	\
-	\
-	sbox_reg_rename(r0,r1,r2,r3,r4, r3,r0,r1,r4,r2);
+	vpxor	r2, r4, r4;
 
 /* Apply SBOX number WHICH to to the block.  */
 #define SBOX(which, r0, r1, r2, r3, r4) \
@@ -402,49 +355,51 @@
 
 /* Apply a Serpent round to sixteen parallel blocks.  This macro increments
    `round'.  */
-#define ROUND(which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
-	BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round);	\
-	SBOX (which, a0, a1, a2, a3, a4);		\
-		BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round);	\
-		SBOX (which, b0, b1, b2, b3, b4);		\
-	LINEAR_TRANSFORMATION (a0, a1, a2, a3, a4);	\
-		LINEAR_TRANSFORMATION (b0, b1, b2, b3, b4);	\
-	.set round, (round + 1);
+#define ROUND(round, which, a0, a1, a2, a3, a4, na0, na1, na2, na3, na4, \
+			    b0, b1, b2, b3, b4, nb0, nb1, nb2, nb3, nb4) \
+	BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round);		\
+	SBOX (which, a0, a1, a2, a3, a4);			\
+		BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round);		\
+		SBOX (which, b0, b1, b2, b3, b4);			\
+	LINEAR_TRANSFORMATION (na0, na1, na2, na3, na4);	\
+		LINEAR_TRANSFORMATION (nb0, nb1, nb2, nb3, nb4);
 
 /* Apply the last Serpent round to sixteen parallel blocks.  This macro
    increments `round'.  */
-#define ROUND_LAST(which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
-	BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round);	\
-	SBOX (which, a0, a1, a2, a3, a4);		\
-		BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round);	\
-		SBOX (which, b0, b1, b2, b3, b4);		\
-	.set round, (round + 1);			\
-	BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round);	\
-		BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round);	\
-	.set round, (round + 1);
+#define ROUND_LAST(round, which, a0, a1, a2, a3, a4, na0, na1, na2, na3, na4, \
+				 b0, b1, b2, b3, b4, nb0, nb1, nb2, nb3, nb4) \
+	BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round);		\
+	SBOX (which, a0, a1, a2, a3, a4);			\
+		BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round);		\
+		SBOX (which, b0, b1, b2, b3, b4);			\
+	BLOCK_XOR_KEY (na0, na1, na2, na3, na4, ((round) + 1));		\
+		BLOCK_XOR_KEY (nb0, nb1, nb2, nb3, nb4, ((round) + 1));
 
 /* Apply an inverse Serpent round to sixteen parallel blocks.  This macro
    increments `round'.  */
-#define ROUND_INVERSE(which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
+#define ROUND_INVERSE(round, which, a0, a1, a2, a3, a4, \
+				    na0, na1, na2, na3, na4, \
+				    b0, b1, b2, b3, b4, \
+				    nb0, nb1, nb2, nb3, nb4) \
 	LINEAR_TRANSFORMATION_INVERSE (a0, a1, a2, a3, a4);	\
 		LINEAR_TRANSFORMATION_INVERSE (b0, b1, b2, b3, b4);	\
 	SBOX_INVERSE (which, a0, a1, a2, a3, a4);		\
-	BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round);		\
+	BLOCK_XOR_KEY (na0, na1, na2, na3, na4, round);		\
 		SBOX_INVERSE (which, b0, b1, b2, b3, b4);		\
-		BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round);		\
-	.set round, (round - 1);
+		BLOCK_XOR_KEY (nb0, nb1, nb2, nb3, nb4, round);
 
 /* Apply the first inverse Serpent round to sixteen parallel blocks.  This macro
    increments `round'.  */
-#define ROUND_FIRST_INVERSE(which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
-	BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round);	\
-		BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round);	\
-	.set round, (round - 1);			\
+#define ROUND_FIRST_INVERSE(round, which, a0, a1, a2, a3, a4, \
+					  na0, na1, na2, na3, na4, \
+					  b0, b1, b2, b3, b4, \
+					  nb0, nb1, nb2, nb3, nb4) \
+	BLOCK_XOR_KEY (a0, a1, a2, a3, a4, ((round) + 1));	\
+		BLOCK_XOR_KEY (b0, b1, b2, b3, b4, ((round) + 1));	\
 	SBOX_INVERSE (which, a0, a1, a2, a3, a4); 	\
-	BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round);	\
+	BLOCK_XOR_KEY (na0, na1, na2, na3, na4, round);	\
 		SBOX_INVERSE (which, b0, b1, b2, b3, b4); 	\
-		BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round);	\
-	.set round, (round - 1);
+		BLOCK_XOR_KEY (nb0, nb1, nb2, nb3, nb4, round);
 
 .text
 
@@ -456,72 +411,82 @@ __serpent_enc_blk16:
 	 *	RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: sixteen parallel
 	 *						plaintext blocks
 	 * output:
-	 *	RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: sixteen parallel
+	 *	RA4, RA1, RA2, RA0, RB4, RB1, RB2, RB0: sixteen parallel
 	 * 						ciphertext blocks
 	 */
 
-	/* record input vector names for __serpent_enc_blk16 */
-	.set enc_in_a0, RA0
-	.set enc_in_a1, RA1
-	.set enc_in_a2, RA2
-	.set enc_in_a3, RA3
-	.set enc_in_b0, RB0
-	.set enc_in_b1, RB1
-	.set enc_in_b2, RB2
-	.set enc_in_b3, RB3
-
 	vpcmpeqd RNOT, RNOT, RNOT;
 
 	transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1);
 	transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1);
 
-	.set round, 0
-	ROUND (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-
-	ROUND_LAST (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-
-	transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1);
-	transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1);
-
-	/* record output vector names for __serpent_enc_blk16 */
-	.set enc_out_a0, RA0
-	.set enc_out_a1, RA1
-	.set enc_out_a2, RA2
-	.set enc_out_a3, RA3
-	.set enc_out_b0, RB0
-	.set enc_out_b1, RB1
-	.set enc_out_b2, RB2
-	.set enc_out_b3, RB3
+	ROUND (0, 0, RA0, RA1, RA2, RA3, RA4, RA1, RA4, RA2, RA0, RA3,
+		     RB0, RB1, RB2, RB3, RB4, RB1, RB4, RB2, RB0, RB3);
+	ROUND (1, 1, RA1, RA4, RA2, RA0, RA3, RA2, RA1, RA0, RA4, RA3,
+		     RB1, RB4, RB2, RB0, RB3, RB2, RB1, RB0, RB4, RB3);
+	ROUND (2, 2, RA2, RA1, RA0, RA4, RA3, RA0, RA4, RA1, RA3, RA2,
+		     RB2, RB1, RB0, RB4, RB3, RB0, RB4, RB1, RB3, RB2);
+	ROUND (3, 3, RA0, RA4, RA1, RA3, RA2, RA4, RA1, RA3, RA2, RA0,
+		     RB0, RB4, RB1, RB3, RB2, RB4, RB1, RB3, RB2, RB0);
+	ROUND (4, 4, RA4, RA1, RA3, RA2, RA0, RA1, RA0, RA4, RA2, RA3,
+		     RB4, RB1, RB3, RB2, RB0, RB1, RB0, RB4, RB2, RB3);
+	ROUND (5, 5, RA1, RA0, RA4, RA2, RA3, RA0, RA2, RA1, RA4, RA3,
+		     RB1, RB0, RB4, RB2, RB3, RB0, RB2, RB1, RB4, RB3);
+	ROUND (6, 6, RA0, RA2, RA1, RA4, RA3, RA0, RA2, RA3, RA1, RA4,
+		     RB0, RB2, RB1, RB4, RB3, RB0, RB2, RB3, RB1, RB4);
+	ROUND (7, 7, RA0, RA2, RA3, RA1, RA4, RA4, RA1, RA2, RA0, RA3,
+		     RB0, RB2, RB3, RB1, RB4, RB4, RB1, RB2, RB0, RB3);
+	ROUND (8, 0, RA4, RA1, RA2, RA0, RA3, RA1, RA3, RA2, RA4, RA0,
+		     RB4, RB1, RB2, RB0, RB3, RB1, RB3, RB2, RB4, RB0);
+	ROUND (9, 1, RA1, RA3, RA2, RA4, RA0, RA2, RA1, RA4, RA3, RA0,
+		     RB1, RB3, RB2, RB4, RB0, RB2, RB1, RB4, RB3, RB0);
+	ROUND (10, 2, RA2, RA1, RA4, RA3, RA0, RA4, RA3, RA1, RA0, RA2,
+		      RB2, RB1, RB4, RB3, RB0, RB4, RB3, RB1, RB0, RB2);
+	ROUND (11, 3, RA4, RA3, RA1, RA0, RA2, RA3, RA1, RA0, RA2, RA4,
+		      RB4, RB3, RB1, RB0, RB2, RB3, RB1, RB0, RB2, RB4);
+	ROUND (12, 4, RA3, RA1, RA0, RA2, RA4, RA1, RA4, RA3, RA2, RA0,
+		      RB3, RB1, RB0, RB2, RB4, RB1, RB4, RB3, RB2, RB0);
+	ROUND (13, 5, RA1, RA4, RA3, RA2, RA0, RA4, RA2, RA1, RA3, RA0,
+		      RB1, RB4, RB3, RB2, RB0, RB4, RB2, RB1, RB3, RB0);
+	ROUND (14, 6, RA4, RA2, RA1, RA3, RA0, RA4, RA2, RA0, RA1, RA3,
+		      RB4, RB2, RB1, RB3, RB0, RB4, RB2, RB0, RB1, RB3);
+	ROUND (15, 7, RA4, RA2, RA0, RA1, RA3, RA3, RA1, RA2, RA4, RA0,
+		      RB4, RB2, RB0, RB1, RB3, RB3, RB1, RB2, RB4, RB0);
+	ROUND (16, 0, RA3, RA1, RA2, RA4, RA0, RA1, RA0, RA2, RA3, RA4,
+		      RB3, RB1, RB2, RB4, RB0, RB1, RB0, RB2, RB3, RB4);
+	ROUND (17, 1, RA1, RA0, RA2, RA3, RA4, RA2, RA1, RA3, RA0, RA4,
+		      RB1, RB0, RB2, RB3, RB4, RB2, RB1, RB3, RB0, RB4);
+	ROUND (18, 2, RA2, RA1, RA3, RA0, RA4, RA3, RA0, RA1, RA4, RA2,
+		      RB2, RB1, RB3, RB0, RB4, RB3, RB0, RB1, RB4, RB2);
+	ROUND (19, 3, RA3, RA0, RA1, RA4, RA2, RA0, RA1, RA4, RA2, RA3,
+		      RB3, RB0, RB1, RB4, RB2, RB0, RB1, RB4, RB2, RB3);
+	ROUND (20, 4, RA0, RA1, RA4, RA2, RA3, RA1, RA3, RA0, RA2, RA4,
+		      RB0, RB1, RB4, RB2, RB3, RB1, RB3, RB0, RB2, RB4);
+	ROUND (21, 5, RA1, RA3, RA0, RA2, RA4, RA3, RA2, RA1, RA0, RA4,
+		      RB1, RB3, RB0, RB2, RB4, RB3, RB2, RB1, RB0, RB4);
+	ROUND (22, 6, RA3, RA2, RA1, RA0, RA4, RA3, RA2, RA4, RA1, RA0,
+		      RB3, RB2, RB1, RB0, RB4, RB3, RB2, RB4, RB1, RB0);
+	ROUND (23, 7, RA3, RA2, RA4, RA1, RA0, RA0, RA1, RA2, RA3, RA4,
+		      RB3, RB2, RB4, RB1, RB0, RB0, RB1, RB2, RB3, RB4);
+	ROUND (24, 0, RA0, RA1, RA2, RA3, RA4, RA1, RA4, RA2, RA0, RA3,
+		      RB0, RB1, RB2, RB3, RB4, RB1, RB4, RB2, RB0, RB3);
+	ROUND (25, 1, RA1, RA4, RA2, RA0, RA3, RA2, RA1, RA0, RA4, RA3,
+		      RB1, RB4, RB2, RB0, RB3, RB2, RB1, RB0, RB4, RB3);
+	ROUND (26, 2, RA2, RA1, RA0, RA4, RA3, RA0, RA4, RA1, RA3, RA2,
+		      RB2, RB1, RB0, RB4, RB3, RB0, RB4, RB1, RB3, RB2);
+	ROUND (27, 3, RA0, RA4, RA1, RA3, RA2, RA4, RA1, RA3, RA2, RA0,
+		      RB0, RB4, RB1, RB3, RB2, RB4, RB1, RB3, RB2, RB0);
+	ROUND (28, 4, RA4, RA1, RA3, RA2, RA0, RA1, RA0, RA4, RA2, RA3,
+		      RB4, RB1, RB3, RB2, RB0, RB1, RB0, RB4, RB2, RB3);
+	ROUND (29, 5, RA1, RA0, RA4, RA2, RA3, RA0, RA2, RA1, RA4, RA3,
+		      RB1, RB0, RB4, RB2, RB3, RB0, RB2, RB1, RB4, RB3);
+	ROUND (30, 6, RA0, RA2, RA1, RA4, RA3, RA0, RA2, RA3, RA1, RA4,
+		      RB0, RB2, RB1, RB4, RB3, RB0, RB2, RB3, RB1, RB4);
+	ROUND_LAST (31, 7, RA0, RA2, RA3, RA1, RA4, RA4, RA1, RA2, RA0, RA3,
+		           RB0, RB2, RB3, RB1, RB4, RB4, RB1, RB2, RB0, RB3);
+
+	transpose_4x4(RA4, RA1, RA2, RA0, RA3, RTMP0, RTMP1);
+	transpose_4x4(RB4, RB1, RB2, RB0, RB3, RTMP0, RTMP1);
 
 	ret;
 .size __serpent_enc_blk16,.-__serpent_enc_blk16;
@@ -538,69 +503,81 @@ __serpent_dec_blk16:
 	 *						plaintext blocks
 	 */
 
-	/* record input vector names for __serpent_dec_blk16 */
-	.set dec_in_a0, RA0
-	.set dec_in_a1, RA1
-	.set dec_in_a2, RA2
-	.set dec_in_a3, RA3
-	.set dec_in_b0, RB0
-	.set dec_in_b1, RB1
-	.set dec_in_b2, RB2
-	.set dec_in_b3, RB3
-
 	vpcmpeqd RNOT, RNOT, RNOT;
 
 	transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1);
 	transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1);
 
-	.set round, 32
-	ROUND_FIRST_INVERSE (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-
-	ROUND_INVERSE (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND_INVERSE (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND_INVERSE (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND_INVERSE (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND_INVERSE (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND_INVERSE (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND_INVERSE (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND_INVERSE (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND_INVERSE (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND_INVERSE (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND_INVERSE (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND_INVERSE (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND_INVERSE (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND_INVERSE (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND_INVERSE (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND_INVERSE (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND_INVERSE (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND_INVERSE (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND_INVERSE (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND_INVERSE (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND_INVERSE (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND_INVERSE (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND_INVERSE (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND_INVERSE (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND_INVERSE (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND_INVERSE (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND_INVERSE (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND_INVERSE (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND_INVERSE (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND_INVERSE (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND_INVERSE (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+	ROUND_FIRST_INVERSE (31, 7, RA0, RA1, RA2, RA3, RA4,
+				    RA3, RA0, RA1, RA4, RA2,
+				    RB0, RB1, RB2, RB3, RB4,
+				    RB3, RB0, RB1, RB4, RB2);
+	ROUND_INVERSE (30, 6, RA3, RA0, RA1, RA4, RA2, RA0, RA1, RA2, RA4, RA3,
+		              RB3, RB0, RB1, RB4, RB2, RB0, RB1, RB2, RB4, RB3);
+	ROUND_INVERSE (29, 5, RA0, RA1, RA2, RA4, RA3, RA1, RA3, RA4, RA2, RA0,
+		              RB0, RB1, RB2, RB4, RB3, RB1, RB3, RB4, RB2, RB0);
+	ROUND_INVERSE (28, 4, RA1, RA3, RA4, RA2, RA0, RA1, RA2, RA4, RA0, RA3,
+		              RB1, RB3, RB4, RB2, RB0, RB1, RB2, RB4, RB0, RB3);
+	ROUND_INVERSE (27, 3, RA1, RA2, RA4, RA0, RA3, RA4, RA2, RA0, RA1, RA3,
+		              RB1, RB2, RB4, RB0, RB3, RB4, RB2, RB0, RB1, RB3);
+	ROUND_INVERSE (26, 2, RA4, RA2, RA0, RA1, RA3, RA2, RA3, RA0, RA1, RA4,
+		              RB4, RB2, RB0, RB1, RB3, RB2, RB3, RB0, RB1, RB4);
+	ROUND_INVERSE (25, 1, RA2, RA3, RA0, RA1, RA4, RA4, RA2, RA1, RA0, RA3,
+		              RB2, RB3, RB0, RB1, RB4, RB4, RB2, RB1, RB0, RB3);
+	ROUND_INVERSE (24, 0, RA4, RA2, RA1, RA0, RA3, RA4, RA3, RA2, RA0, RA1,
+		              RB4, RB2, RB1, RB0, RB3, RB4, RB3, RB2, RB0, RB1);
+	ROUND_INVERSE (23, 7, RA4, RA3, RA2, RA0, RA1, RA0, RA4, RA3, RA1, RA2,
+		              RB4, RB3, RB2, RB0, RB1, RB0, RB4, RB3, RB1, RB2);
+	ROUND_INVERSE (22, 6, RA0, RA4, RA3, RA1, RA2, RA4, RA3, RA2, RA1, RA0,
+		              RB0, RB4, RB3, RB1, RB2, RB4, RB3, RB2, RB1, RB0);
+	ROUND_INVERSE (21, 5, RA4, RA3, RA2, RA1, RA0, RA3, RA0, RA1, RA2, RA4,
+		              RB4, RB3, RB2, RB1, RB0, RB3, RB0, RB1, RB2, RB4);
+	ROUND_INVERSE (20, 4, RA3, RA0, RA1, RA2, RA4, RA3, RA2, RA1, RA4, RA0,
+		              RB3, RB0, RB1, RB2, RB4, RB3, RB2, RB1, RB4, RB0);
+	ROUND_INVERSE (19, 3, RA3, RA2, RA1, RA4, RA0, RA1, RA2, RA4, RA3, RA0,
+		              RB3, RB2, RB1, RB4, RB0, RB1, RB2, RB4, RB3, RB0);
+	ROUND_INVERSE (18, 2, RA1, RA2, RA4, RA3, RA0, RA2, RA0, RA4, RA3, RA1,
+		              RB1, RB2, RB4, RB3, RB0, RB2, RB0, RB4, RB3, RB1);
+	ROUND_INVERSE (17, 1, RA2, RA0, RA4, RA3, RA1, RA1, RA2, RA3, RA4, RA0,
+		              RB2, RB0, RB4, RB3, RB1, RB1, RB2, RB3, RB4, RB0);
+	ROUND_INVERSE (16, 0, RA1, RA2, RA3, RA4, RA0, RA1, RA0, RA2, RA4, RA3,
+		              RB1, RB2, RB3, RB4, RB0, RB1, RB0, RB2, RB4, RB3);
+	ROUND_INVERSE (15, 7, RA1, RA0, RA2, RA4, RA3, RA4, RA1, RA0, RA3, RA2,
+		              RB1, RB0, RB2, RB4, RB3, RB4, RB1, RB0, RB3, RB2);
+	ROUND_INVERSE (14, 6, RA4, RA1, RA0, RA3, RA2, RA1, RA0, RA2, RA3, RA4,
+		              RB4, RB1, RB0, RB3, RB2, RB1, RB0, RB2, RB3, RB4);
+	ROUND_INVERSE (13, 5, RA1, RA0, RA2, RA3, RA4, RA0, RA4, RA3, RA2, RA1,
+		              RB1, RB0, RB2, RB3, RB4, RB0, RB4, RB3, RB2, RB1);
+	ROUND_INVERSE (12, 4, RA0, RA4, RA3, RA2, RA1, RA0, RA2, RA3, RA1, RA4,
+		              RB0, RB4, RB3, RB2, RB1, RB0, RB2, RB3, RB1, RB4);
+	ROUND_INVERSE (11, 3, RA0, RA2, RA3, RA1, RA4, RA3, RA2, RA1, RA0, RA4,
+		              RB0, RB2, RB3, RB1, RB4, RB3, RB2, RB1, RB0, RB4);
+	ROUND_INVERSE (10, 2, RA3, RA2, RA1, RA0, RA4, RA2, RA4, RA1, RA0, RA3,
+		              RB3, RB2, RB1, RB0, RB4, RB2, RB4, RB1, RB0, RB3);
+	ROUND_INVERSE (9, 1, RA2, RA4, RA1, RA0, RA3, RA3, RA2, RA0, RA1, RA4,
+		             RB2, RB4, RB1, RB0, RB3, RB3, RB2, RB0, RB1, RB4);
+	ROUND_INVERSE (8, 0, RA3, RA2, RA0, RA1, RA4, RA3, RA4, RA2, RA1, RA0,
+		             RB3, RB2, RB0, RB1, RB4, RB3, RB4, RB2, RB1, RB0);
+	ROUND_INVERSE (7, 7, RA3, RA4, RA2, RA1, RA0, RA1, RA3, RA4, RA0, RA2,
+		             RB3, RB4, RB2, RB1, RB0, RB1, RB3, RB4, RB0, RB2);
+	ROUND_INVERSE (6, 6, RA1, RA3, RA4, RA0, RA2, RA3, RA4, RA2, RA0, RA1,
+		             RB1, RB3, RB4, RB0, RB2, RB3, RB4, RB2, RB0, RB1);
+	ROUND_INVERSE (5, 5, RA3, RA4, RA2, RA0, RA1, RA4, RA1, RA0, RA2, RA3,
+		             RB3, RB4, RB2, RB0, RB1, RB4, RB1, RB0, RB2, RB3);
+	ROUND_INVERSE (4, 4, RA4, RA1, RA0, RA2, RA3, RA4, RA2, RA0, RA3, RA1,
+		             RB4, RB1, RB0, RB2, RB3, RB4, RB2, RB0, RB3, RB1);
+	ROUND_INVERSE (3, 3, RA4, RA2, RA0, RA3, RA1, RA0, RA2, RA3, RA4, RA1,
+		             RB4, RB2, RB0, RB3, RB1, RB0, RB2, RB3, RB4, RB1);
+	ROUND_INVERSE (2, 2, RA0, RA2, RA3, RA4, RA1, RA2, RA1, RA3, RA4, RA0,
+		             RB0, RB2, RB3, RB4, RB1, RB2, RB1, RB3, RB4, RB0);
+	ROUND_INVERSE (1, 1, RA2, RA1, RA3, RA4, RA0, RA0, RA2, RA4, RA3, RA1,
+		             RB2, RB1, RB3, RB4, RB0, RB0, RB2, RB4, RB3, RB1);
+	ROUND_INVERSE (0, 0, RA0, RA2, RA4, RA3, RA1, RA0, RA1, RA2, RA3, RA4,
+		             RB0, RB2, RB4, RB3, RB1, RB0, RB1, RB2, RB3, RB4);
 
 	transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1);
 	transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1);
 
-	/* record output vector names for __serpent_dec_blk16 */
-	.set dec_out_a0, RA0
-	.set dec_out_a1, RA1
-	.set dec_out_a2, RA2
-	.set dec_out_a3, RA3
-	.set dec_out_b0, RB0
-	.set dec_out_b1, RB1
-	.set dec_out_b2, RB2
-	.set dec_out_b3, RB3
-
 	ret;
 .size __serpent_dec_blk16,.-__serpent_dec_blk16;
 
@@ -623,15 +600,6 @@ _gcry_serpent_avx2_ctr_enc:
 
 	vzeroupper;
 
-	.set RA0, enc_in_a0
-	.set RA1, enc_in_a1
-	.set RA2, enc_in_a2
-	.set RA3, enc_in_a3
-	.set RB0, enc_in_b0
-	.set RB1, enc_in_b1
-	.set RB2, enc_in_b2
-	.set RB3, enc_in_b3
-
 	vbroadcasti128 .Lbswap128_mask RIP, RTMP3;
 	vpcmpeqd RNOT, RNOT, RNOT;
 	vpsrldq $8, RNOT, RNOT;   /* ab: -1:0 ; cd: -1:0 */
@@ -703,32 +671,23 @@ _gcry_serpent_avx2_ctr_enc:
 
 	call __serpent_enc_blk16;
 
-	.set RA0, enc_out_a0
-	.set RA1, enc_out_a1
-	.set RA2, enc_out_a2
-	.set RA3, enc_out_a3
-	.set RB0, enc_out_b0
-	.set RB1, enc_out_b1
-	.set RB2, enc_out_b2
-	.set RB3, enc_out_b3
-
-	vpxor (0 * 32)(%rdx), RA0, RA0;
+	vpxor (0 * 32)(%rdx), RA4, RA4;
 	vpxor (1 * 32)(%rdx), RA1, RA1;
 	vpxor (2 * 32)(%rdx), RA2, RA2;
-	vpxor (3 * 32)(%rdx), RA3, RA3;
-	vpxor (4 * 32)(%rdx), RB0, RB0;
+	vpxor (3 * 32)(%rdx), RA0, RA0;
+	vpxor (4 * 32)(%rdx), RB4, RB4;
 	vpxor (5 * 32)(%rdx), RB1, RB1;
 	vpxor (6 * 32)(%rdx), RB2, RB2;
-	vpxor (7 * 32)(%rdx), RB3, RB3;
+	vpxor (7 * 32)(%rdx), RB0, RB0;
 
-	vmovdqu RA0, (0 * 32)(%rsi);
+	vmovdqu RA4, (0 * 32)(%rsi);
 	vmovdqu RA1, (1 * 32)(%rsi);
 	vmovdqu RA2, (2 * 32)(%rsi);
-	vmovdqu RA3, (3 * 32)(%rsi);
-	vmovdqu RB0, (4 * 32)(%rsi);
+	vmovdqu RA0, (3 * 32)(%rsi);
+	vmovdqu RB4, (4 * 32)(%rsi);
 	vmovdqu RB1, (5 * 32)(%rsi);
 	vmovdqu RB2, (6 * 32)(%rsi);
-	vmovdqu RB3, (7 * 32)(%rsi);
+	vmovdqu RB0, (7 * 32)(%rsi);
 
 	vzeroall;
 
@@ -748,15 +707,6 @@ _gcry_serpent_avx2_cbc_dec:
 
 	vzeroupper;
 
-	.set RA0, dec_in_a0
-	.set RA1, dec_in_a1
-	.set RA2, dec_in_a2
-	.set RA3, dec_in_a3
-	.set RB0, dec_in_b0
-	.set RB1, dec_in_b1
-	.set RB2, dec_in_b2
-	.set RB3, dec_in_b3
-
 	vmovdqu (0 * 32)(%rdx), RA0;
 	vmovdqu (1 * 32)(%rdx), RA1;
 	vmovdqu (2 * 32)(%rdx), RA2;
@@ -768,15 +718,6 @@ _gcry_serpent_avx2_cbc_dec:
 
 	call __serpent_dec_blk16;
 
-	.set RA0, dec_out_a0
-	.set RA1, dec_out_a1
-	.set RA2, dec_out_a2
-	.set RA3, dec_out_a3
-	.set RB0, dec_out_b0
-	.set RB1, dec_out_b1
-	.set RB2, dec_out_b2
-	.set RB3, dec_out_b3
-
 	vmovdqu (%rcx), RNOTx;
 	vinserti128 $1, (%rdx), RNOT, RNOT;
 	vpxor RNOT, RA0, RA0;
@@ -817,15 +758,6 @@ _gcry_serpent_avx2_cfb_dec:
 
 	vzeroupper;
 
-	.set RA0, enc_in_a0
-	.set RA1, enc_in_a1
-	.set RA2, enc_in_a2
-	.set RA3, enc_in_a3
-	.set RB0, enc_in_b0
-	.set RB1, enc_in_b1
-	.set RB2, enc_in_b2
-	.set RB3, enc_in_b3
-
 	/* Load input */
 	vmovdqu (%rcx), RNOTx;
 	vinserti128 $1, (%rdx), RNOT, RA0;
@@ -843,32 +775,23 @@ _gcry_serpent_avx2_cfb_dec:
 
 	call __serpent_enc_blk16;
 
-	.set RA0, enc_out_a0
-	.set RA1, enc_out_a1
-	.set RA2, enc_out_a2
-	.set RA3, enc_out_a3
-	.set RB0, enc_out_b0
-	.set RB1, enc_out_b1
-	.set RB2, enc_out_b2
-	.set RB3, enc_out_b3
-
-	vpxor (0 * 32)(%rdx), RA0, RA0;
+	vpxor (0 * 32)(%rdx), RA4, RA4;
 	vpxor (1 * 32)(%rdx), RA1, RA1;
 	vpxor (2 * 32)(%rdx), RA2, RA2;
-	vpxor (3 * 32)(%rdx), RA3, RA3;
-	vpxor (4 * 32)(%rdx), RB0, RB0;
+	vpxor (3 * 32)(%rdx), RA0, RA0;
+	vpxor (4 * 32)(%rdx), RB4, RB4;
 	vpxor (5 * 32)(%rdx), RB1, RB1;
 	vpxor (6 * 32)(%rdx), RB2, RB2;
-	vpxor (7 * 32)(%rdx), RB3, RB3;
+	vpxor (7 * 32)(%rdx), RB0, RB0;
 
-	vmovdqu RA0, (0 * 32)(%rsi);
+	vmovdqu RA4, (0 * 32)(%rsi);
 	vmovdqu RA1, (1 * 32)(%rsi);
 	vmovdqu RA2, (2 * 32)(%rsi);
-	vmovdqu RA3, (3 * 32)(%rsi);
-	vmovdqu RB0, (4 * 32)(%rsi);
+	vmovdqu RA0, (3 * 32)(%rsi);
+	vmovdqu RB4, (4 * 32)(%rsi);
 	vmovdqu RB1, (5 * 32)(%rsi);
 	vmovdqu RB2, (6 * 32)(%rsi);
-	vmovdqu RB3, (7 * 32)(%rsi);
+	vmovdqu RB0, (7 * 32)(%rsi);
 
 	vzeroall;
 
diff --git a/cipher/serpent-sse2-amd64.S b/cipher/serpent-sse2-amd64.S
index a5cf353..516126b 100644
--- a/cipher/serpent-sse2-amd64.S
+++ b/cipher/serpent-sse2-amd64.S
@@ -35,42 +35,27 @@
 #define CTX %rdi
 
 /* vector registers */
-.set RA0, %xmm0
-.set RA1, %xmm1
-.set RA2, %xmm2
-.set RA3, %xmm3
-.set RA4, %xmm4
-
-.set RB0, %xmm5
-.set RB1, %xmm6
-.set RB2, %xmm7
-.set RB3, %xmm8
-.set RB4, %xmm9
-
-.set RNOT, %xmm10
-.set RTMP0, %xmm11
-.set RTMP1, %xmm12
-.set RTMP2, %xmm13
+#define RA0 %xmm0
+#define RA1 %xmm1
+#define RA2 %xmm2
+#define RA3 %xmm3
+#define RA4 %xmm4
+
+#define RB0 %xmm5
+#define RB1 %xmm6
+#define RB2 %xmm7
+#define RB3 %xmm8
+#define RB4 %xmm9
+
+#define RNOT %xmm10
+#define RTMP0 %xmm11
+#define RTMP1 %xmm12
+#define RTMP2 %xmm13
 
 /**********************************************************************
   helper macros
  **********************************************************************/
 
-/* preprocessor macro for renaming vector registers using GAS macros */
-#define sbox_reg_rename(r0, r1, r2, r3, r4, \
-			new_r0, new_r1, new_r2, new_r3, new_r4) \
-	.set rename_reg0, new_r0; \
-	.set rename_reg1, new_r1; \
-	.set rename_reg2, new_r2; \
-	.set rename_reg3, new_r3; \
-	.set rename_reg4, new_r4; \
-	\
-	.set r0, rename_reg0; \
-	.set r1, rename_reg1; \
-	.set r2, rename_reg2; \
-	.set r3, rename_reg3; \
-	.set r4, rename_reg4;
-
 /* vector 32-bit rotation to left */
 #define vec_rol(reg, nleft, tmp) \
 	movdqa reg, tmp; 		\
@@ -147,9 +132,7 @@
 	pxor	r4, r2;		pxor	RNOT, r4;	\
 	por	r1, r4;		pxor	r3, r1;		\
 	pxor	r4, r1;		por	r0, r3;		\
-	pxor	r3, r1;		pxor	r3, r4;		\
-	\
-	sbox_reg_rename(r0,r1,r2,r3,r4, r1,r4,r2,r0,r3);
+	pxor	r3, r1;		pxor	r3, r4;
 
 #define SBOX0_INVERSE(r0, r1, r2, r3, r4) \
 	pxor	RNOT, r2;	movdqa	r1, r4;		\
@@ -162,9 +145,7 @@
 	pxor	r1, r2;		pxor	r0, r3;		\
 	pxor	r1, r3;	\
 	pand	r3, r2;	\
-	pxor	r2, r4;	\
-	\
-	sbox_reg_rename(r0,r1,r2,r3,r4, r0,r4,r1,r3,r2);
+	pxor	r2, r4;
 
 #define SBOX1(r0, r1, r2, r3, r4) \
 	pxor	RNOT, r0;	pxor	RNOT, r2;	\
@@ -176,9 +157,7 @@
 	pand	r4, r2;		pxor	r1, r0;		\
 	pand	r2, r1;	\
 	pxor	r0, r1;		pand	r2, r0;		\
-	pxor	r4, r0;	\
-	\
-	sbox_reg_rename(r0,r1,r2,r3,r4, r2,r0,r3,r1,r4);
+	pxor	r4, r0;
 
 #define SBOX1_INVERSE(r0, r1, r2, r3, r4) \
 	movdqa	r1, r4;		pxor	r3, r1;		\
@@ -191,9 +170,7 @@
 	pxor	r1, r4;		por	r0, r1;		\
 	pxor	r0, r1;	\
 	por	r4, r1;	\
-	pxor	r1, r3;	\
-	\
-	sbox_reg_rename(r0,r1,r2,r3,r4, r4,r0,r3,r2,r1);
+	pxor	r1, r3;
 
 #define SBOX2(r0, r1, r2, r3, r4) \
 	movdqa	r0, r4;		pand	r2, r0;		\
@@ -203,9 +180,7 @@
 	movdqa	r3, r1;		por	r4, r3;		\
 	pxor	r0, r3;		pand	r1, r0;		\
 	pxor	r0, r4;		pxor	r3, r1;		\
-	pxor	r4, r1;		pxor	RNOT, r4;	\
-	\
-	sbox_reg_rename(r0,r1,r2,r3,r4, r2,r3,r1,r4,r0);
+	pxor	r4, r1;		pxor	RNOT, r4;
 
 #define SBOX2_INVERSE(r0, r1, r2, r3, r4) \
 	pxor	r3, r2;		pxor	r0, r3;		\
@@ -217,9 +192,7 @@
 	por	r0, r2;		pxor	RNOT, r3;	\
 	pxor	r3, r2;		pxor	r3, r0;		\
 	pand	r1, r0;		pxor	r4, r3;		\
-	pxor	r0, r3;	\
-	\
-	sbox_reg_rename(r0,r1,r2,r3,r4, r1,r4,r2,r3,r0);
+	pxor	r0, r3;
 
 #define SBOX3(r0, r1, r2, r3, r4) \
 	movdqa	r0, r4;		por	r3, r0;		\
@@ -231,9 +204,7 @@
 	pxor	r2, r4;		por	r0, r1;		\
 	pxor	r2, r1;		pxor	r3, r0;		\
 	movdqa	r1, r2;		por	r3, r1;		\
-	pxor	r0, r1;	\
-	\
-	sbox_reg_rename(r0,r1,r2,r3,r4, r1,r2,r3,r4,r0);
+	pxor	r0, r1;
 
 #define SBOX3_INVERSE(r0, r1, r2, r3, r4) \
 	movdqa	r2, r4;		pxor	r1, r2;		\
@@ -245,9 +216,7 @@
 	pxor	r1, r3;		pxor	r0, r1;		\
 	por	r2, r1;		pxor	r3, r0;		\
 	pxor	r4, r1;	\
-	pxor	r1, r0;	\
-	\
-	sbox_reg_rename(r0,r1,r2,r3,r4, r2,r1,r3,r0,r4);
+	pxor	r1, r0;
 
 #define SBOX4(r0, r1, r2, r3, r4) \
 	pxor	r3, r1;		pxor	RNOT, r3;	\
@@ -259,9 +228,7 @@
 	pxor	r0, r3;		por	r1, r4;		\
 	pxor	r0, r4;		por	r3, r0;		\
 	pxor	r2, r0;		pand	r3, r2;		\
-	pxor	RNOT, r0;	pxor	r2, r4;		\
-	\
-	sbox_reg_rename(r0,r1,r2,r3,r4, r1,r4,r0,r3,r2);
+	pxor	RNOT, r0;	pxor	r2, r4;
 
 #define SBOX4_INVERSE(r0, r1, r2, r3, r4) \
 	movdqa	r2, r4;		pand	r3, r2;		\
@@ -274,9 +241,7 @@
 	pand	r0, r2;		pxor	r0, r3;		\
 	pxor	r4, r2;	\
 	por	r3, r2;		pxor	r0, r3;		\
-	pxor	r1, r2;	\
-	\
-	sbox_reg_rename(r0,r1,r2,r3,r4, r0,r3,r2,r4,r1);
+	pxor	r1, r2;
 
 #define SBOX5(r0, r1, r2, r3, r4) \
 	pxor	r1, r0;		pxor	r3, r1;		\
@@ -288,9 +253,7 @@
 	pxor	r2, r4;		pxor	r0, r2;		\
 	pand	r3, r0;		pxor	RNOT, r2;	\
 	pxor	r4, r0;		por	r3, r4;		\
-	pxor	r4, r2;	\
-	\
-	sbox_reg_rename(r0,r1,r2,r3,r4, r1,r3,r0,r2,r4);
+	pxor	r4, r2;
 
 #define SBOX5_INVERSE(r0, r1, r2, r3, r4) \
 	pxor	RNOT, r1;	movdqa	r3, r4;		\
@@ -302,9 +265,7 @@
 	pxor	r3, r1;		pxor	r2, r4;		\
 	pand	r4, r3;		pxor	r1, r4;		\
 	pxor	r4, r3;		pxor	RNOT, r4;	\
-	pxor	r0, r3;	\
-	\
-	sbox_reg_rename(r0,r1,r2,r3,r4, r1,r4,r3,r2,r0);
+	pxor	r0, r3;
 
 #define SBOX6(r0, r1, r2, r3, r4) \
 	pxor	RNOT, r2;	movdqa	r3, r4;		\
@@ -316,9 +277,7 @@
 	pxor	r2, r0;		pxor	r3, r4;		\
 	pxor	r0, r4;		pxor	RNOT, r3;	\
 	pand	r4, r2;	\
-	pxor	r3, r2;	\
-	\
-	sbox_reg_rename(r0,r1,r2,r3,r4, r0,r1,r4,r2,r3);
+	pxor	r3, r2;
 
 #define SBOX6_INVERSE(r0, r1, r2, r3, r4) \
 	pxor	r2, r0;		movdqa	r2, r4;		\
@@ -329,9 +288,7 @@
 	pxor	r1, r4;		pand	r3, r1;		\
 	pxor	r0, r1;		pxor	r3, r0;		\
 	por	r2, r0;		pxor	r1, r3;		\
-	pxor	r0, r4;	\
-	\
-	sbox_reg_rename(r0,r1,r2,r3,r4, r1,r2,r4,r3,r0);
+	pxor	r0, r4;
 
 #define SBOX7(r0, r1, r2, r3, r4) \
 	movdqa	r1, r4;		por	r2, r1;		\
@@ -344,9 +301,7 @@
 	pxor	r1, r2;		pand	r0, r1;		\
 	pxor	r4, r1;		pxor	RNOT, r2;	\
 	por	r0, r2;	\
-	pxor	r2, r4;	\
-	\
-	sbox_reg_rename(r0,r1,r2,r3,r4, r4,r3,r1,r0,r2);
+	pxor	r2, r4;
 
 #define SBOX7_INVERSE(r0, r1, r2, r3, r4) \
 	movdqa	r2, r4;		pxor	r0, r2;		\
@@ -358,9 +313,7 @@
 	por	r2, r0;		pxor	r1, r4;		\
 	pxor	r3, r0;		pxor	r4, r3;		\
 	por	r0, r4;		pxor	r2, r3;		\
-	pxor	r2, r4;	\
-	\
-	sbox_reg_rename(r0,r1,r2,r3,r4, r3,r0,r1,r4,r2);
+	pxor	r2, r4;
 
 /* Apply SBOX number WHICH to to the block.  */
 #define SBOX(which, r0, r1, r2, r3, r4) \
@@ -425,49 +378,51 @@
 
 /* Apply a Serpent round to eight parallel blocks.  This macro increments
    `round'.  */
-#define ROUND(which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
-	BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round);	\
-	SBOX (which, a0, a1, a2, a3, a4);		\
-		BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round);	\
-		SBOX (which, b0, b1, b2, b3, b4);		\
-	LINEAR_TRANSFORMATION (a0, a1, a2, a3, a4);	\
-		LINEAR_TRANSFORMATION (b0, b1, b2, b3, b4);	\
-	.set round, (round + 1);
+#define ROUND(round, which, a0, a1, a2, a3, a4, na0, na1, na2, na3, na4, \
+			    b0, b1, b2, b3, b4, nb0, nb1, nb2, nb3, nb4) \
+	BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round);		\
+	SBOX (which, a0, a1, a2, a3, a4);			\
+		BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round);		\
+		SBOX (which, b0, b1, b2, b3, b4);			\
+	LINEAR_TRANSFORMATION (na0, na1, na2, na3, na4);	\
+		LINEAR_TRANSFORMATION (nb0, nb1, nb2, nb3, nb4);
 
 /* Apply the last Serpent round to eight parallel blocks.  This macro increments
    `round'.  */
-#define ROUND_LAST(which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
-	BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round);	\
-	SBOX (which, a0, a1, a2, a3, a4);		\
-		BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round);	\
-		SBOX (which, b0, b1, b2, b3, b4);		\
-	.set round, (round + 1);			\
-	BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round);	\
-		BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round);	\
-	.set round, (round + 1);
+#define ROUND_LAST(round, which, a0, a1, a2, a3, a4, na0, na1, na2, na3, na4, \
+				 b0, b1, b2, b3, b4, nb0, nb1, nb2, nb3, nb4) \
+	BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round);		\
+	SBOX (which, a0, a1, a2, a3, a4);			\
+		BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round);		\
+		SBOX (which, b0, b1, b2, b3, b4);			\
+	BLOCK_XOR_KEY (na0, na1, na2, na3, na4, ((round) + 1));		\
+		BLOCK_XOR_KEY (nb0, nb1, nb2, nb3, nb4, ((round) + 1));
 
 /* Apply an inverse Serpent round to eight parallel blocks.  This macro
    increments `round'.  */
-#define ROUND_INVERSE(which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
+#define ROUND_INVERSE(round, which, a0, a1, a2, a3, a4, \
+				    na0, na1, na2, na3, na4, \
+				    b0, b1, b2, b3, b4, \
+				    nb0, nb1, nb2, nb3, nb4) \
 	LINEAR_TRANSFORMATION_INVERSE (a0, a1, a2, a3, a4);	\
 		LINEAR_TRANSFORMATION_INVERSE (b0, b1, b2, b3, b4);	\
 	SBOX_INVERSE (which, a0, a1, a2, a3, a4);		\
-	BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round);		\
+	BLOCK_XOR_KEY (na0, na1, na2, na3, na4, round);		\
 		SBOX_INVERSE (which, b0, b1, b2, b3, b4);		\
-		BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round);		\
-	.set round, (round - 1);
+		BLOCK_XOR_KEY (nb0, nb1, nb2, nb3, nb4, round);
 
 /* Apply the first inverse Serpent round to eight parallel blocks.  This macro
    increments `round'.  */
-#define ROUND_FIRST_INVERSE(which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
-	BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round);	\
-		BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round);	\
-	.set round, (round - 1);			\
+#define ROUND_FIRST_INVERSE(round, which, a0, a1, a2, a3, a4, \
+					  na0, na1, na2, na3, na4, \
+					  b0, b1, b2, b3, b4, \
+					  nb0, nb1, nb2, nb3, nb4) \
+	BLOCK_XOR_KEY (a0, a1, a2, a3, a4, ((round) + 1));	\
+		BLOCK_XOR_KEY (b0, b1, b2, b3, b4, ((round) + 1));	\
 	SBOX_INVERSE (which, a0, a1, a2, a3, a4); 	\
-	BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round);	\
+	BLOCK_XOR_KEY (na0, na1, na2, na3, na4, round);	\
 		SBOX_INVERSE (which, b0, b1, b2, b3, b4); 	\
-		BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round);	\
-	.set round, (round - 1);
+		BLOCK_XOR_KEY (nb0, nb1, nb2, nb3, nb4, round);
 
 .text
 
@@ -479,72 +434,82 @@ __serpent_enc_blk8:
 	 *	RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel plaintext
 	 *						blocks
 	 * output:
-	 *	RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel
+	 *	RA4, RA1, RA2, RA0, RB4, RB1, RB2, RB0: eight parallel
 	 * 						ciphertext blocks
 	 */
 
-	/* record input vector names for __serpent_enc_blk8 */
-	.set enc_in_a0, RA0
-	.set enc_in_a1, RA1
-	.set enc_in_a2, RA2
-	.set enc_in_a3, RA3
-	.set enc_in_b0, RB0
-	.set enc_in_b1, RB1
-	.set enc_in_b2, RB2
-	.set enc_in_b3, RB3
-
 	pcmpeqd RNOT, RNOT;
 
 	transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1);
 	transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1);
 
-	.set round, 0
-	ROUND (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-
-	ROUND_LAST (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-
-	transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1);
-	transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1);
-
-	/* record output vector names for __serpent_enc_blk8 */
-	.set enc_out_a0, RA0
-	.set enc_out_a1, RA1
-	.set enc_out_a2, RA2
-	.set enc_out_a3, RA3
-	.set enc_out_b0, RB0
-	.set enc_out_b1, RB1
-	.set enc_out_b2, RB2
-	.set enc_out_b3, RB3
+	ROUND (0, 0, RA0, RA1, RA2, RA3, RA4, RA1, RA4, RA2, RA0, RA3,
+		     RB0, RB1, RB2, RB3, RB4, RB1, RB4, RB2, RB0, RB3);
+	ROUND (1, 1, RA1, RA4, RA2, RA0, RA3, RA2, RA1, RA0, RA4, RA3,
+		     RB1, RB4, RB2, RB0, RB3, RB2, RB1, RB0, RB4, RB3);
+	ROUND (2, 2, RA2, RA1, RA0, RA4, RA3, RA0, RA4, RA1, RA3, RA2,
+		     RB2, RB1, RB0, RB4, RB3, RB0, RB4, RB1, RB3, RB2);
+	ROUND (3, 3, RA0, RA4, RA1, RA3, RA2, RA4, RA1, RA3, RA2, RA0,
+		     RB0, RB4, RB1, RB3, RB2, RB4, RB1, RB3, RB2, RB0);
+	ROUND (4, 4, RA4, RA1, RA3, RA2, RA0, RA1, RA0, RA4, RA2, RA3,
+		     RB4, RB1, RB3, RB2, RB0, RB1, RB0, RB4, RB2, RB3);
+	ROUND (5, 5, RA1, RA0, RA4, RA2, RA3, RA0, RA2, RA1, RA4, RA3,
+		     RB1, RB0, RB4, RB2, RB3, RB0, RB2, RB1, RB4, RB3);
+	ROUND (6, 6, RA0, RA2, RA1, RA4, RA3, RA0, RA2, RA3, RA1, RA4,
+		     RB0, RB2, RB1, RB4, RB3, RB0, RB2, RB3, RB1, RB4);
+	ROUND (7, 7, RA0, RA2, RA3, RA1, RA4, RA4, RA1, RA2, RA0, RA3,
+		     RB0, RB2, RB3, RB1, RB4, RB4, RB1, RB2, RB0, RB3);
+	ROUND (8, 0, RA4, RA1, RA2, RA0, RA3, RA1, RA3, RA2, RA4, RA0,
+		     RB4, RB1, RB2, RB0, RB3, RB1, RB3, RB2, RB4, RB0);
+	ROUND (9, 1, RA1, RA3, RA2, RA4, RA0, RA2, RA1, RA4, RA3, RA0,
+		     RB1, RB3, RB2, RB4, RB0, RB2, RB1, RB4, RB3, RB0);
+	ROUND (10, 2, RA2, RA1, RA4, RA3, RA0, RA4, RA3, RA1, RA0, RA2,
+		      RB2, RB1, RB4, RB3, RB0, RB4, RB3, RB1, RB0, RB2);
+	ROUND (11, 3, RA4, RA3, RA1, RA0, RA2, RA3, RA1, RA0, RA2, RA4,
+		      RB4, RB3, RB1, RB0, RB2, RB3, RB1, RB0, RB2, RB4);
+	ROUND (12, 4, RA3, RA1, RA0, RA2, RA4, RA1, RA4, RA3, RA2, RA0,
+		      RB3, RB1, RB0, RB2, RB4, RB1, RB4, RB3, RB2, RB0);
+	ROUND (13, 5, RA1, RA4, RA3, RA2, RA0, RA4, RA2, RA1, RA3, RA0,
+		      RB1, RB4, RB3, RB2, RB0, RB4, RB2, RB1, RB3, RB0);
+	ROUND (14, 6, RA4, RA2, RA1, RA3, RA0, RA4, RA2, RA0, RA1, RA3,
+		      RB4, RB2, RB1, RB3, RB0, RB4, RB2, RB0, RB1, RB3);
+	ROUND (15, 7, RA4, RA2, RA0, RA1, RA3, RA3, RA1, RA2, RA4, RA0,
+		      RB4, RB2, RB0, RB1, RB3, RB3, RB1, RB2, RB4, RB0);
+	ROUND (16, 0, RA3, RA1, RA2, RA4, RA0, RA1, RA0, RA2, RA3, RA4,
+		      RB3, RB1, RB2, RB4, RB0, RB1, RB0, RB2, RB3, RB4);
+	ROUND (17, 1, RA1, RA0, RA2, RA3, RA4, RA2, RA1, RA3, RA0, RA4,
+		      RB1, RB0, RB2, RB3, RB4, RB2, RB1, RB3, RB0, RB4);
+	ROUND (18, 2, RA2, RA1, RA3, RA0, RA4, RA3, RA0, RA1, RA4, RA2,
+		      RB2, RB1, RB3, RB0, RB4, RB3, RB0, RB1, RB4, RB2);
+	ROUND (19, 3, RA3, RA0, RA1, RA4, RA2, RA0, RA1, RA4, RA2, RA3,
+		      RB3, RB0, RB1, RB4, RB2, RB0, RB1, RB4, RB2, RB3);
+	ROUND (20, 4, RA0, RA1, RA4, RA2, RA3, RA1, RA3, RA0, RA2, RA4,
+		      RB0, RB1, RB4, RB2, RB3, RB1, RB3, RB0, RB2, RB4);
+	ROUND (21, 5, RA1, RA3, RA0, RA2, RA4, RA3, RA2, RA1, RA0, RA4,
+		      RB1, RB3, RB0, RB2, RB4, RB3, RB2, RB1, RB0, RB4);
+	ROUND (22, 6, RA3, RA2, RA1, RA0, RA4, RA3, RA2, RA4, RA1, RA0,
+		      RB3, RB2, RB1, RB0, RB4, RB3, RB2, RB4, RB1, RB0);
+	ROUND (23, 7, RA3, RA2, RA4, RA1, RA0, RA0, RA1, RA2, RA3, RA4,
+		      RB3, RB2, RB4, RB1, RB0, RB0, RB1, RB2, RB3, RB4);
+	ROUND (24, 0, RA0, RA1, RA2, RA3, RA4, RA1, RA4, RA2, RA0, RA3,
+		      RB0, RB1, RB2, RB3, RB4, RB1, RB4, RB2, RB0, RB3);
+	ROUND (25, 1, RA1, RA4, RA2, RA0, RA3, RA2, RA1, RA0, RA4, RA3,
+		      RB1, RB4, RB2, RB0, RB3, RB2, RB1, RB0, RB4, RB3);
+	ROUND (26, 2, RA2, RA1, RA0, RA4, RA3, RA0, RA4, RA1, RA3, RA2,
+		      RB2, RB1, RB0, RB4, RB3, RB0, RB4, RB1, RB3, RB2);
+	ROUND (27, 3, RA0, RA4, RA1, RA3, RA2, RA4, RA1, RA3, RA2, RA0,
+		      RB0, RB4, RB1, RB3, RB2, RB4, RB1, RB3, RB2, RB0);
+	ROUND (28, 4, RA4, RA1, RA3, RA2, RA0, RA1, RA0, RA4, RA2, RA3,
+		      RB4, RB1, RB3, RB2, RB0, RB1, RB0, RB4, RB2, RB3);
+	ROUND (29, 5, RA1, RA0, RA4, RA2, RA3, RA0, RA2, RA1, RA4, RA3,
+		      RB1, RB0, RB4, RB2, RB3, RB0, RB2, RB1, RB4, RB3);
+	ROUND (30, 6, RA0, RA2, RA1, RA4, RA3, RA0, RA2, RA3, RA1, RA4,
+		      RB0, RB2, RB1, RB4, RB3, RB0, RB2, RB3, RB1, RB4);
+	ROUND_LAST (31, 7, RA0, RA2, RA3, RA1, RA4, RA4, RA1, RA2, RA0, RA3,
+		           RB0, RB2, RB3, RB1, RB4, RB4, RB1, RB2, RB0, RB3);
+
+	transpose_4x4(RA4, RA1, RA2, RA0, RA3, RTMP0, RTMP1);
+	transpose_4x4(RB4, RB1, RB2, RB0, RB3, RTMP0, RTMP1);
 
 	ret;
 .size __serpent_enc_blk8,.-__serpent_enc_blk8;
@@ -561,69 +526,81 @@ __serpent_dec_blk8:
 	 *						blocks
 	 */
 
-	/* record input vector names for __serpent_dec_blk8 */
-	.set dec_in_a0, RA0
-	.set dec_in_a1, RA1
-	.set dec_in_a2, RA2
-	.set dec_in_a3, RA3
-	.set dec_in_b0, RB0
-	.set dec_in_b1, RB1
-	.set dec_in_b2, RB2
-	.set dec_in_b3, RB3
-
 	pcmpeqd RNOT, RNOT;
 
 	transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1);
 	transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1);
 
-	.set round, 32
-	ROUND_FIRST_INVERSE (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-
-	ROUND_INVERSE (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND_INVERSE (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND_INVERSE (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND_INVERSE (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND_INVERSE (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND_INVERSE (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND_INVERSE (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND_INVERSE (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND_INVERSE (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND_INVERSE (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND_INVERSE (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND_INVERSE (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND_INVERSE (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND_INVERSE (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND_INVERSE (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND_INVERSE (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND_INVERSE (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND_INVERSE (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND_INVERSE (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND_INVERSE (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND_INVERSE (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND_INVERSE (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND_INVERSE (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND_INVERSE (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND_INVERSE (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND_INVERSE (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND_INVERSE (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND_INVERSE (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND_INVERSE (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND_INVERSE (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-	ROUND_INVERSE (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+	ROUND_FIRST_INVERSE (31, 7, RA0, RA1, RA2, RA3, RA4,
+				    RA3, RA0, RA1, RA4, RA2,
+				    RB0, RB1, RB2, RB3, RB4,
+				    RB3, RB0, RB1, RB4, RB2);
+	ROUND_INVERSE (30, 6, RA3, RA0, RA1, RA4, RA2, RA0, RA1, RA2, RA4, RA3,
+		              RB3, RB0, RB1, RB4, RB2, RB0, RB1, RB2, RB4, RB3);
+	ROUND_INVERSE (29, 5, RA0, RA1, RA2, RA4, RA3, RA1, RA3, RA4, RA2, RA0,
+		              RB0, RB1, RB2, RB4, RB3, RB1, RB3, RB4, RB2, RB0);
+	ROUND_INVERSE (28, 4, RA1, RA3, RA4, RA2, RA0, RA1, RA2, RA4, RA0, RA3,
+		              RB1, RB3, RB4, RB2, RB0, RB1, RB2, RB4, RB0, RB3);
+	ROUND_INVERSE (27, 3, RA1, RA2, RA4, RA0, RA3, RA4, RA2, RA0, RA1, RA3,
+		              RB1, RB2, RB4, RB0, RB3, RB4, RB2, RB0, RB1, RB3);
+	ROUND_INVERSE (26, 2, RA4, RA2, RA0, RA1, RA3, RA2, RA3, RA0, RA1, RA4,
+		              RB4, RB2, RB0, RB1, RB3, RB2, RB3, RB0, RB1, RB4);
+	ROUND_INVERSE (25, 1, RA2, RA3, RA0, RA1, RA4, RA4, RA2, RA1, RA0, RA3,
+		              RB2, RB3, RB0, RB1, RB4, RB4, RB2, RB1, RB0, RB3);
+	ROUND_INVERSE (24, 0, RA4, RA2, RA1, RA0, RA3, RA4, RA3, RA2, RA0, RA1,
+		              RB4, RB2, RB1, RB0, RB3, RB4, RB3, RB2, RB0, RB1);
+	ROUND_INVERSE (23, 7, RA4, RA3, RA2, RA0, RA1, RA0, RA4, RA3, RA1, RA2,
+		              RB4, RB3, RB2, RB0, RB1, RB0, RB4, RB3, RB1, RB2);
+	ROUND_INVERSE (22, 6, RA0, RA4, RA3, RA1, RA2, RA4, RA3, RA2, RA1, RA0,
+		              RB0, RB4, RB3, RB1, RB2, RB4, RB3, RB2, RB1, RB0);
+	ROUND_INVERSE (21, 5, RA4, RA3, RA2, RA1, RA0, RA3, RA0, RA1, RA2, RA4,
+		              RB4, RB3, RB2, RB1, RB0, RB3, RB0, RB1, RB2, RB4);
+	ROUND_INVERSE (20, 4, RA3, RA0, RA1, RA2, RA4, RA3, RA2, RA1, RA4, RA0,
+		              RB3, RB0, RB1, RB2, RB4, RB3, RB2, RB1, RB4, RB0);
+	ROUND_INVERSE (19, 3, RA3, RA2, RA1, RA4, RA0, RA1, RA2, RA4, RA3, RA0,
+		              RB3, RB2, RB1, RB4, RB0, RB1, RB2, RB4, RB3, RB0);
+	ROUND_INVERSE (18, 2, RA1, RA2, RA4, RA3, RA0, RA2, RA0, RA4, RA3, RA1,
+		              RB1, RB2, RB4, RB3, RB0, RB2, RB0, RB4, RB3, RB1);
+	ROUND_INVERSE (17, 1, RA2, RA0, RA4, RA3, RA1, RA1, RA2, RA3, RA4, RA0,
+		              RB2, RB0, RB4, RB3, RB1, RB1, RB2, RB3, RB4, RB0);
+	ROUND_INVERSE (16, 0, RA1, RA2, RA3, RA4, RA0, RA1, RA0, RA2, RA4, RA3,
+		              RB1, RB2, RB3, RB4, RB0, RB1, RB0, RB2, RB4, RB3);
+	ROUND_INVERSE (15, 7, RA1, RA0, RA2, RA4, RA3, RA4, RA1, RA0, RA3, RA2,
+		              RB1, RB0, RB2, RB4, RB3, RB4, RB1, RB0, RB3, RB2);
+	ROUND_INVERSE (14, 6, RA4, RA1, RA0, RA3, RA2, RA1, RA0, RA2, RA3, RA4,
+		              RB4, RB1, RB0, RB3, RB2, RB1, RB0, RB2, RB3, RB4);
+	ROUND_INVERSE (13, 5, RA1, RA0, RA2, RA3, RA4, RA0, RA4, RA3, RA2, RA1,
+		              RB1, RB0, RB2, RB3, RB4, RB0, RB4, RB3, RB2, RB1);
+	ROUND_INVERSE (12, 4, RA0, RA4, RA3, RA2, RA1, RA0, RA2, RA3, RA1, RA4,
+		              RB0, RB4, RB3, RB2, RB1, RB0, RB2, RB3, RB1, RB4);
+	ROUND_INVERSE (11, 3, RA0, RA2, RA3, RA1, RA4, RA3, RA2, RA1, RA0, RA4,
+		              RB0, RB2, RB3, RB1, RB4, RB3, RB2, RB1, RB0, RB4);
+	ROUND_INVERSE (10, 2, RA3, RA2, RA1, RA0, RA4, RA2, RA4, RA1, RA0, RA3,
+		              RB3, RB2, RB1, RB0, RB4, RB2, RB4, RB1, RB0, RB3);
+	ROUND_INVERSE (9, 1, RA2, RA4, RA1, RA0, RA3, RA3, RA2, RA0, RA1, RA4,
+		             RB2, RB4, RB1, RB0, RB3, RB3, RB2, RB0, RB1, RB4);
+	ROUND_INVERSE (8, 0, RA3, RA2, RA0, RA1, RA4, RA3, RA4, RA2, RA1, RA0,
+		             RB3, RB2, RB0, RB1, RB4, RB3, RB4, RB2, RB1, RB0);
+	ROUND_INVERSE (7, 7, RA3, RA4, RA2, RA1, RA0, RA1, RA3, RA4, RA0, RA2,
+		             RB3, RB4, RB2, RB1, RB0, RB1, RB3, RB4, RB0, RB2);
+	ROUND_INVERSE (6, 6, RA1, RA3, RA4, RA0, RA2, RA3, RA4, RA2, RA0, RA1,
+		             RB1, RB3, RB4, RB0, RB2, RB3, RB4, RB2, RB0, RB1);
+	ROUND_INVERSE (5, 5, RA3, RA4, RA2, RA0, RA1, RA4, RA1, RA0, RA2, RA3,
+		             RB3, RB4, RB2, RB0, RB1, RB4, RB1, RB0, RB2, RB3);
+	ROUND_INVERSE (4, 4, RA4, RA1, RA0, RA2, RA3, RA4, RA2, RA0, RA3, RA1,
+		             RB4, RB1, RB0, RB2, RB3, RB4, RB2, RB0, RB3, RB1);
+	ROUND_INVERSE (3, 3, RA4, RA2, RA0, RA3, RA1, RA0, RA2, RA3, RA4, RA1,
+		             RB4, RB2, RB0, RB3, RB1, RB0, RB2, RB3, RB4, RB1);
+	ROUND_INVERSE (2, 2, RA0, RA2, RA3, RA4, RA1, RA2, RA1, RA3, RA4, RA0,
+		             RB0, RB2, RB3, RB4, RB1, RB2, RB1, RB3, RB4, RB0);
+	ROUND_INVERSE (1, 1, RA2, RA1, RA3, RA4, RA0, RA0, RA2, RA4, RA3, RA1,
+		             RB2, RB1, RB3, RB4, RB0, RB0, RB2, RB4, RB3, RB1);
+	ROUND_INVERSE (0, 0, RA0, RA2, RA4, RA3, RA1, RA0, RA1, RA2, RA3, RA4,
+		             RB0, RB2, RB4, RB3, RB1, RB0, RB1, RB2, RB3, RB4);
 
 	transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1);
 	transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1);
 
-	/* record output vector names for __serpent_dec_blk8 */
-	.set dec_out_a0, RA0
-	.set dec_out_a1, RA1
-	.set dec_out_a2, RA2
-	.set dec_out_a3, RA3
-	.set dec_out_b0, RB0
-	.set dec_out_b1, RB1
-	.set dec_out_b2, RB2
-	.set dec_out_b3, RB3
-
 	ret;
 .size __serpent_dec_blk8,.-__serpent_dec_blk8;
 
@@ -638,15 +615,6 @@ _gcry_serpent_sse2_ctr_enc:
 	 *	%rcx: iv (big endian, 128bit)
 	 */
 
-	.set RA0, enc_in_a0
-	.set RA1, enc_in_a1
-	.set RA2, enc_in_a2
-	.set RA3, enc_in_a3
-	.set RB0, enc_in_b0
-	.set RB1, enc_in_b1
-	.set RB2, enc_in_b2
-	.set RB3, enc_in_b3
-
 	/* load IV and byteswap */
 	movdqu (%rcx), RA0;
 	movdqa RA0, RTMP0;
@@ -729,42 +697,35 @@ _gcry_serpent_sse2_ctr_enc:
 
 	call __serpent_enc_blk8;
 
-	.set RA0, enc_out_a0
-	.set RA1, enc_out_a1
-	.set RA2, enc_out_a2
-	.set RA3, enc_out_a3
-	.set RB0, enc_out_b0
-	.set RB1, enc_out_b1
-	.set RB2, enc_out_b2
-	.set RB3, enc_out_b3
-
-	pxor_u((0 * 16)(%rdx), RA0, RTMP0);
+	pxor_u((0 * 16)(%rdx), RA4, RTMP0);
 	pxor_u((1 * 16)(%rdx), RA1, RTMP0);
 	pxor_u((2 * 16)(%rdx), RA2, RTMP0);
-	pxor_u((3 * 16)(%rdx), RA3, RTMP0);
-	pxor_u((4 * 16)(%rdx), RB0, RTMP0);
+	pxor_u((3 * 16)(%rdx), RA0, RTMP0);
+	pxor_u((4 * 16)(%rdx), RB4, RTMP0);
 	pxor_u((5 * 16)(%rdx), RB1, RTMP0);
 	pxor_u((6 * 16)(%rdx), RB2, RTMP0);
-	pxor_u((7 * 16)(%rdx), RB3, RTMP0);
+	pxor_u((7 * 16)(%rdx), RB0, RTMP0);
 
-	movdqu RA0, (0 * 16)(%rsi);
+	movdqu RA4, (0 * 16)(%rsi);
 	movdqu RA1, (1 * 16)(%rsi);
 	movdqu RA2, (2 * 16)(%rsi);
-	movdqu RA3, (3 * 16)(%rsi);
-	movdqu RB0, (4 * 16)(%rsi);
+	movdqu RA0, (3 * 16)(%rsi);
+	movdqu RB4, (4 * 16)(%rsi);
 	movdqu RB1, (5 * 16)(%rsi);
 	movdqu RB2, (6 * 16)(%rsi);
-	movdqu RB3, (7 * 16)(%rsi);
+	movdqu RB0, (7 * 16)(%rsi);
 
 	/* clear the used registers */
 	pxor RA0, RA0;
 	pxor RA1, RA1;
 	pxor RA2, RA2;
 	pxor RA3, RA3;
+	pxor RA4, RA4;
 	pxor RB0, RB0;
 	pxor RB1, RB1;
 	pxor RB2, RB2;
 	pxor RB3, RB3;
+	pxor RB4, RB4;
 	pxor RTMP0, RTMP0;
 	pxor RTMP1, RTMP1;
 	pxor RTMP2, RTMP2;
@@ -784,15 +745,6 @@ _gcry_serpent_sse2_cbc_dec:
 	 *	%rcx: iv
 	 */
 
-	.set RA0, dec_in_a0
-	.set RA1, dec_in_a1
-	.set RA2, dec_in_a2
-	.set RA3, dec_in_a3
-	.set RB0, dec_in_b0
-	.set RB1, dec_in_b1
-	.set RB2, dec_in_b2
-	.set RB3, dec_in_b3
-
 	movdqu (0 * 16)(%rdx), RA0;
 	movdqu (1 * 16)(%rdx), RA1;
 	movdqu (2 * 16)(%rdx), RA2;
@@ -804,15 +756,6 @@ _gcry_serpent_sse2_cbc_dec:
 
 	call __serpent_dec_blk8;
 
-	.set RA0, dec_out_a0
-	.set RA1, dec_out_a1
-	.set RA2, dec_out_a2
-	.set RA3, dec_out_a3
-	.set RB0, dec_out_b0
-	.set RB1, dec_out_b1
-	.set RB2, dec_out_b2
-	.set RB3, dec_out_b3
-
 	movdqu (7 * 16)(%rdx), RNOT;
 	pxor_u((%rcx), RA0, RTMP0);
 	pxor_u((0 * 16)(%rdx), RA1, RTMP0);
@@ -838,10 +781,12 @@ _gcry_serpent_sse2_cbc_dec:
 	pxor RA1, RA1;
 	pxor RA2, RA2;
 	pxor RA3, RA3;
+	pxor RA4, RA4;
 	pxor RB0, RB0;
 	pxor RB1, RB1;
 	pxor RB2, RB2;
 	pxor RB3, RB3;
+	pxor RB4, RB4;
 	pxor RTMP0, RTMP0;
 	pxor RTMP1, RTMP1;
 	pxor RTMP2, RTMP2;
@@ -861,15 +806,6 @@ _gcry_serpent_sse2_cfb_dec:
 	 *	%rcx: iv
 	 */
 
-	.set RA0, enc_in_a0
-	.set RA1, enc_in_a1
-	.set RA2, enc_in_a2
-	.set RA3, enc_in_a3
-	.set RB0, enc_in_b0
-	.set RB1, enc_in_b1
-	.set RB2, enc_in_b2
-	.set RB3, enc_in_b3
-
 	/* Load input */
 	movdqu (%rcx), RA0;
 	movdqu 0 * 16(%rdx), RA1;
@@ -886,42 +822,35 @@ _gcry_serpent_sse2_cfb_dec:
 
 	call __serpent_enc_blk8;
 
-	.set RA0, enc_out_a0
-	.set RA1, enc_out_a1
-	.set RA2, enc_out_a2
-	.set RA3, enc_out_a3
-	.set RB0, enc_out_b0
-	.set RB1, enc_out_b1
-	.set RB2, enc_out_b2
-	.set RB3, enc_out_b3
-
-	pxor_u((0 * 16)(%rdx), RA0, RTMP0);
+	pxor_u((0 * 16)(%rdx), RA4, RTMP0);
 	pxor_u((1 * 16)(%rdx), RA1, RTMP0);
 	pxor_u((2 * 16)(%rdx), RA2, RTMP0);
-	pxor_u((3 * 16)(%rdx), RA3, RTMP0);
-	pxor_u((4 * 16)(%rdx), RB0, RTMP0);
+	pxor_u((3 * 16)(%rdx), RA0, RTMP0);
+	pxor_u((4 * 16)(%rdx), RB4, RTMP0);
 	pxor_u((5 * 16)(%rdx), RB1, RTMP0);
 	pxor_u((6 * 16)(%rdx), RB2, RTMP0);
-	pxor_u((7 * 16)(%rdx), RB3, RTMP0);
+	pxor_u((7 * 16)(%rdx), RB0, RTMP0);
 
-	movdqu RA0, (0 * 16)(%rsi);
+	movdqu RA4, (0 * 16)(%rsi);
 	movdqu RA1, (1 * 16)(%rsi);
 	movdqu RA2, (2 * 16)(%rsi);
-	movdqu RA3, (3 * 16)(%rsi);
-	movdqu RB0, (4 * 16)(%rsi);
+	movdqu RA0, (3 * 16)(%rsi);
+	movdqu RB4, (4 * 16)(%rsi);
 	movdqu RB1, (5 * 16)(%rsi);
 	movdqu RB2, (6 * 16)(%rsi);
-	movdqu RB3, (7 * 16)(%rsi);
+	movdqu RB0, (7 * 16)(%rsi);
 
 	/* clear the used registers */
 	pxor RA0, RA0;
 	pxor RA1, RA1;
 	pxor RA2, RA2;
 	pxor RA3, RA3;
+	pxor RA4, RA4;
 	pxor RB0, RB0;
 	pxor RB1, RB1;
 	pxor RB2, RB2;
 	pxor RB3, RB3;
+	pxor RB4, RB4;
 	pxor RTMP0, RTMP0;
 	pxor RTMP1, RTMP1;
 	pxor RTMP2, RTMP2;
diff --git a/configure.ac b/configure.ac
index 1460dfd..a803b5f 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1034,12 +1034,6 @@ if test $amd64_as_feature_detection = yes; then
        [gcry_cv_gcc_amd64_platform_as_ok=no
         AC_COMPILE_IFELSE([AC_LANG_SOURCE(
           [[__asm__(
-                /* Test if '.set' is supported by underlying assembler.  */
-                ".set a0, %rax\n\t"
-                ".set b0, %rdx\n\t"
-                "asmfunc:\n\t"
-                "movq a0, b0;\n\t" /* Fails here if .set ignored by as.  */
-
                 /* Test if '.type' and '.size' are supported.  */
                 /* These work only on ELF targets. */
 		/* TODO: add COFF (mingw64, cygwin64) support to assembly



