[git] GCRYPT - branch, master, updated. libgcrypt-1.5.0-324-g98674fd
by Jussi Kivilinna
cvs at cvs.gnupg.org
Tue Oct 22 19:20:55 CEST 2013
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "The GNU crypto library".
The branch, master has been updated
via 98674fdaa30ab22a3ac86ca05d688b5b6112895d (commit)
via e67c67321ce240c93dd0fa2b21c649c0a8e233f7 (commit)
via c7efaa5fe0ee92e321a7b49d56752cc12eb75fe0 (commit)
from 335d9bf7b035815750b63a3a8334d6ce44dc4449 (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
- Log -----------------------------------------------------------------
commit 98674fdaa30ab22a3ac86ca05d688b5b6112895d
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date: Tue Oct 22 17:07:53 2013 +0300
twofish: add ARMv6 assembly implementation
* cipher/Makefile.am: Add 'twofish-armv6.S'.
* cipher/twofish-armv6.S: New.
* cipher/twofish.c (USE_ARMV6_ASM): New macro.
[USE_ARMV6_ASM] (_gcry_twofish_armv6_encrypt_block)
(_gcry_twofish_armv6_decrypt_block): New prototypes.
[USE_ARMV6_ASM] (twofish_encrypt, twofish_decrypt): Add.
[USE_AMD64_ASM] (do_twofish_encrypt, do_twofish_decrypt): Remove.
(_gcry_twofish_ctr_enc, _gcry_twofish_cfb_dec): Use 'twofish_encrypt'
instead of 'do_twofish_encrypt'.
(_gcry_twofish_cbc_dec): Use 'twofish_decrypt' instead of
'do_twofish_decrypt'.
* configure.ac [arm]: Add 'twofish-armv6.lo'.
--
Add optimized ARMv6 assembly implementation for Twofish. Implementation is tuned
for Cortex-A8. Unaligned access handling is done in assembly part.
For now, only enable this on little-endian systems as big-endian correctness
has not been tested yet.
Old (gcc-4.8) vs new (twofish-asm), Cortex-A8 (on armhf):
ECB/Stream CBC CFB OFB CTR CCM
--------------- --------------- --------------- --------------- --------------- ---------------
TWOFISH 1.23x 1.25x 1.16x 1.26x 1.16x 1.30x 1.18x 1.17x 1.23x 1.23x 1.22x 1.22x
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index b0efd89..3d8149a 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -80,7 +80,7 @@ sha512.c sha512-armv7-neon.S \
stribog.c \
tiger.c \
whirlpool.c \
-twofish.c twofish-amd64.S \
+twofish.c twofish-amd64.S twofish-armv6.S \
rfc2268.c \
camellia.c camellia.h camellia-glue.c camellia-aesni-avx-amd64.S \
camellia-aesni-avx2-amd64.S camellia-armv6.S
diff --git a/cipher/twofish-armv6.S b/cipher/twofish-armv6.S
new file mode 100644
index 0000000..b76ab37
--- /dev/null
+++ b/cipher/twofish-armv6.S
@@ -0,0 +1,365 @@
+/* twofish-armv6.S - ARM assembly implementation of Twofish cipher
+ *
+ * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__)
+#ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS
+
+.text
+
+.syntax unified
+.arm
+
+/* structure of TWOFISH_context (byte offsets of the members used by this file): */
+#define s0 0 /* first lookup table; each table spans 4 * 256 bytes */
+#define s1 ((s0) + 4 * 256)
+#define s2 ((s1) + 4 * 256)
+#define s3 ((s2) + 4 * 256)
+#define w ((s3) + 4 * 256) /* 4 * 8 bytes; loaded for input/output whitening */
+#define k ((w) + 4 * 8) /* read at k + 8 * n (+ 4) by the round macros */
+
+/* register macros (fixed register allocation shared by all macros below) */
+#define CTX %r0
+#define CTXs0 %r0 /* ctx argument; base for lookups at offset s0 */
+#define CTXs1 %r1 /* set to ctx + s1; reuses the dst argument register */
+#define CTXs3 %r7 /* set to ctx + s3 */
+
+#define RA %r3
+#define RB %r4
+#define RC %r5
+#define RD %r6
+
+#define RX %r2 /* reuses the src argument register once the input is loaded */
+#define RY %ip
+
+#define RMASK %lr /* both entry points set this to 0xff << 2 */
+
+#define RT0 %r8
+#define RT1 %r9
+#define RT2 %r10
+#define RT3 %r11
+
+/* helper macros: byte-wise little-endian 32-bit load/store at [reg + offs]; the rtmp registers are clobbered */
+#define ldr_unaligned_le(rout, rsrc, offs, rtmp) \
+ ldrb rout, [rsrc, #((offs) + 0)]; \
+ ldrb rtmp, [rsrc, #((offs) + 1)]; \
+ orr rout, rout, rtmp, lsl #8; \
+ ldrb rtmp, [rsrc, #((offs) + 2)]; \
+ orr rout, rout, rtmp, lsl #16; \
+ ldrb rtmp, [rsrc, #((offs) + 3)]; \
+ orr rout, rout, rtmp, lsl #24;
+
+#define str_unaligned_le(rin, rdst, offs, rtmp0, rtmp1) \
+ mov rtmp0, rin, lsr #8; \
+ strb rin, [rdst, #((offs) + 0)]; \
+ mov rtmp1, rin, lsr #16; \
+ strb rtmp0, [rdst, #((offs) + 1)]; \
+ mov rtmp0, rin, lsr #24; \
+ strb rtmp1, [rdst, #((offs) + 2)]; \
+ strb rtmp0, [rdst, #((offs) + 3)];
+
+#ifndef __ARMEL__
+ /* bswap on big-endian (NOTE(review): not reachable yet, the file-level guard requires __ARMEL__) */
+ #define host_to_le(reg) \
+ rev reg, reg;
+ #define le_to_host(reg) \
+ rev reg, reg;
+#else
+ /* nop on little-endian: both macros expand to nothing */
+ #define host_to_le(reg) /*_*/
+ #define le_to_host(reg) /*_*/
+#endif
+
+#define ldr_input_aligned_le(rin, a, b, c, d) /* load words at rin+0/4/8/12 into a..d, then le_to_host each */ \
+ ldr a, [rin, #0]; \
+ ldr b, [rin, #4]; \
+ le_to_host(a); \
+ ldr c, [rin, #8]; \
+ le_to_host(b); \
+ ldr d, [rin, #12]; \
+ le_to_host(c); \
+ le_to_host(d);
+
+#define str_output_aligned_le(rout, a, b, c, d) /* host_to_le a..d in place, then store them at rout+0/4/8/12 */ \
+ host_to_le(a); \
+ host_to_le(b); \
+ str a, [rout, #0]; \
+ host_to_le(c); \
+ str b, [rout, #4]; \
+ host_to_le(d); \
+ str c, [rout, #8]; \
+ str d, [rout, #12];
+
+#ifdef __ARM_FEATURE_UNALIGNED
+ /* unaligned word reads/writes allowed: forward to the word-access macros; the rtmp arguments are unused */
+ #define ldr_input_le(rin, ra, rb, rc, rd, rtmp) \
+ ldr_input_aligned_le(rin, ra, rb, rc, rd)
+
+ #define str_output_le(rout, ra, rb, rc, rd, rtmp0, rtmp1) \
+ str_output_aligned_le(rout, ra, rb, rc, rd)
+#else
+ /* need to handle unaligned reads/writes by byte reads: tst of the low two address bits picks the path; numeric labels 1 and 2 are used */
+ #define ldr_input_le(rin, ra, rb, rc, rd, rtmp0) \
+ tst rin, #3; \
+ beq 1f; \
+ ldr_unaligned_le(ra, rin, 0, rtmp0); \
+ ldr_unaligned_le(rb, rin, 4, rtmp0); \
+ ldr_unaligned_le(rc, rin, 8, rtmp0); \
+ ldr_unaligned_le(rd, rin, 12, rtmp0); \
+ b 2f; \
+ 1:;\
+ ldr_input_aligned_le(rin, ra, rb, rc, rd); \
+ 2:;
+
+ #define str_output_le(rout, ra, rb, rc, rd, rtmp0, rtmp1) \
+ tst rout, #3; \
+ beq 1f; \
+ str_unaligned_le(ra, rout, 0, rtmp0, rtmp1); \
+ str_unaligned_le(rb, rout, 4, rtmp0, rtmp1); \
+ str_unaligned_le(rc, rout, 8, rtmp0, rtmp1); \
+ str_unaligned_le(rd, rout, 12, rtmp0, rtmp1); \
+ b 2f; \
+ 1:;\
+ str_output_aligned_le(rout, ra, rb, rc, rd); \
+ 2:;
+#endif
+
+/**********************************************************************
+ 1-way twofish (round macros clobber RX, RY and RT0-RT3)
+ **********************************************************************/
+#define encrypt_round(a, b, rc, rd, n, ror_a, adj_a) \
+ and RT0, RMASK, b, lsr#(8 - 2); \
+ and RY, RMASK, b, lsr#(16 - 2); \
+ add RT0, RT0, #(s2 - s1); \
+ and RT1, RMASK, b, lsr#(24 - 2); \
+ ldr RY, [CTXs3, RY]; \
+ and RT2, RMASK, b, lsl#(2); \
+ ldr RT0, [CTXs1, RT0]; \
+ and RT3, RMASK, a, lsr#(16 - 2 + (adj_a)); \
+ ldr RT1, [CTXs0, RT1]; \
+ and RX, RMASK, a, lsr#(8 - 2 + (adj_a)); \
+ ldr RT2, [CTXs1, RT2]; \
+ add RT3, RT3, #(s2 - s1); \
+ ldr RX, [CTXs1, RX]; \
+ ror_a(a); \
+ /* indices taken from a above add adj_a to the shift; those below do not */ \
+ eor RY, RY, RT0; \
+ ldr RT3, [CTXs1, RT3]; \
+ and RT0, RMASK, a, lsl#(2); \
+ eor RY, RY, RT1; \
+ and RT1, RMASK, a, lsr#(24 - 2); \
+ eor RY, RY, RT2; \
+ ldr RT0, [CTXs0, RT0]; \
+ eor RX, RX, RT3; \
+ ldr RT1, [CTXs3, RT1]; \
+ eor RX, RX, RT0; \
+ /* RY = xor of four lookups indexed by the bytes of b; RX likewise for a */ \
+ ldr RT3, [CTXs3, #(k - s3 + 8 * (n) + 4)]; \
+ eor RX, RX, RT1; \
+ ldr RT2, [CTXs3, #(k - s3 + 8 * (n))]; \
+ /* RT0 = RX + 2*RY + word at k+8n+4; RX = RX + RY + word at k+8n */ \
+ add RT0, RX, RY, lsl #1; \
+ add RX, RX, RY; \
+ add RT0, RT0, RT3; \
+ add RX, RX, RT2; \
+ eor rd, RT0, rd, ror #31; \
+ eor rc, rc, RX;
+
+#define dummy(x) /* expands to nothing; passed as ror_a/ror_b when no rotation is wanted */
+
+#define ror1(r) /* rotate r right by one bit, in place */ \
+ ror r, r, #1;
+
+#define decrypt_round(a, b, rc, rd, n, ror_b, adj_b) \
+ and RT3, RMASK, b, lsl#(2 - (adj_b)); \
+ and RT1, RMASK, b, lsr#(8 - 2 + (adj_b)); \
+ ror_b(b); \
+ and RT2, RMASK, a, lsl#(2); \
+ and RT0, RMASK, a, lsr#(8 - 2); \
+ /* indices taken from b before ror_b use adj_b in the shift; later ones do not */ \
+ ldr RY, [CTXs1, RT3]; \
+ add RT1, RT1, #(s2 - s1); \
+ ldr RX, [CTXs0, RT2]; \
+ and RT3, RMASK, b, lsr#(16 - 2); \
+ ldr RT1, [CTXs1, RT1]; \
+ and RT2, RMASK, a, lsr#(16 - 2); \
+ ldr RT0, [CTXs1, RT0]; \
+ \
+ add RT2, RT2, #(s2 - s1); \
+ ldr RT3, [CTXs3, RT3]; \
+ eor RY, RY, RT1; \
+ \
+ and RT1, RMASK, b, lsr#(24 - 2); \
+ eor RX, RX, RT0; \
+ ldr RT2, [CTXs1, RT2]; \
+ and RT0, RMASK, a, lsr#(24 - 2); \
+ \
+ ldr RT1, [CTXs0, RT1]; \
+ \
+ eor RY, RY, RT3; \
+ ldr RT0, [CTXs3, RT0]; \
+ eor RX, RX, RT2; \
+ eor RY, RY, RT1; \
+ /* RY = xor of four lookups indexed by the bytes of b; RX likewise for a */ \
+ ldr RT1, [CTXs3, #(k - s3 + 8 * (n) + 4)]; \
+ eor RX, RX, RT0; \
+ ldr RT2, [CTXs3, #(k - s3 + 8 * (n))]; \
+ /* RT0 = RX + 2*RY + word at k+8n+4; RX = RX + RY + word at k+8n */ \
+ add RT0, RX, RY, lsl #1; \
+ add RX, RX, RY; \
+ add RT0, RT0, RT1; \
+ add RX, RX, RT2; \
+ eor rd, rd, RT0; \
+ eor rc, RX, rc, ror #31;
+
+#define first_encrypt_cycle(nc) /* rounds 2*nc and 2*nc+1; the first passes dummy/0 (no ror_a, no shift adjustment) */ \
+ encrypt_round(RA, RB, RC, RD, (nc) * 2, dummy, 0); \
+ encrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, ror1, 1);
+
+#define encrypt_cycle(nc) /* rounds 2*nc and 2*nc+1, both with ror1/1 */ \
+ encrypt_round(RA, RB, RC, RD, (nc) * 2, ror1, 1); \
+ encrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, ror1, 1);
+
+#define last_encrypt_cycle(nc) /* as encrypt_cycle, followed by a final ror1(RA) */ \
+ encrypt_round(RA, RB, RC, RD, (nc) * 2, ror1, 1); \
+ encrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, ror1, 1); \
+ ror1(RA);
+
+#define first_decrypt_cycle(nc) /* rounds 2*nc+1 then 2*nc; the first passes dummy/0 (no ror_b, no shift adjustment) */ \
+ decrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, dummy, 0); \
+ decrypt_round(RA, RB, RC, RD, (nc) * 2, ror1, 1);
+
+#define decrypt_cycle(nc) /* rounds 2*nc+1 then 2*nc, both with ror1/1 */ \
+ decrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, ror1, 1); \
+ decrypt_round(RA, RB, RC, RD, (nc) * 2, ror1, 1);
+
+#define last_decrypt_cycle(nc) /* as decrypt_cycle, followed by a final ror1(RD) */ \
+ decrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, ror1, 1); \
+ decrypt_round(RA, RB, RC, RD, (nc) * 2, ror1, 1); \
+ ror1(RD);
+
+.align 3
+.global _gcry_twofish_armv6_encrypt_block
+.type _gcry_twofish_armv6_encrypt_block,%function;
+
+_gcry_twofish_armv6_encrypt_block:
+ /* input:
+ * %r0: ctx
+ * %r1: dst (pushed below and popped before the output stage, since %r1 doubles as CTXs1)
+ * %r2: src (read only by ldr_input_le; %r2 then serves as RX)
+ */
+ push {%r1, %r4-%r11, %ip, %lr};
+
+ add RY, CTXs0, #w; /* RY = ctx + w */
+
+ ldr_input_le(%r2, RA, RB, RC, RD, RT0);
+
+ /* Input whitening: xor the four words at ctx + w into RA..RD */
+ ldm RY, {RT0, RT1, RT2, RT3};
+ add CTXs3, CTXs0, #(s3 - s0);
+ add CTXs1, CTXs0, #(s1 - s0); /* dst in %r1 is overwritten here */
+ mov RMASK, #(0xff << 2);
+ eor RA, RA, RT0;
+ eor RB, RB, RT1;
+ eor RC, RC, RT2;
+ eor RD, RD, RT3;
+
+ first_encrypt_cycle(0);
+ encrypt_cycle(1);
+ encrypt_cycle(2);
+ encrypt_cycle(3);
+ encrypt_cycle(4);
+ encrypt_cycle(5);
+ encrypt_cycle(6);
+ last_encrypt_cycle(7);
+
+ add RY, CTXs3, #(w + 4*4 - s3); /* RY = ctx + w + 16 */
+ pop {%r1}; /* dst */
+
+ /* Output whitening: xor the four words at ctx + w + 16 into RC, RD, RA, RB */
+ ldm RY, {RT0, RT1, RT2, RT3};
+ eor RC, RC, RT0;
+ eor RD, RD, RT1;
+ eor RA, RA, RT2;
+ eor RB, RB, RT3;
+
+ str_output_le(%r1, RC, RD, RA, RB, RT0, RT1); /* stored in the order RC, RD, RA, RB */
+
+ pop {%r4-%r11, %ip, %lr};
+ bx %lr;
+.ltorg
+.size _gcry_twofish_armv6_encrypt_block,.-_gcry_twofish_armv6_encrypt_block;
+
+.align 3
+.global _gcry_twofish_armv6_decrypt_block
+.type _gcry_twofish_armv6_decrypt_block,%function;
+
+_gcry_twofish_armv6_decrypt_block:
+ /* input:
+ * %r0: ctx
+ * %r1: dst (pushed below and popped before the output stage, since %r1 doubles as CTXs1)
+ * %r2: src (read only by ldr_input_le; %r2 then serves as RX)
+ */
+ push {%r1, %r4-%r11, %ip, %lr};
+
+ add CTXs3, CTXs0, #(s3 - s0);
+
+ ldr_input_le(%r2, RC, RD, RA, RB, RT0);
+
+ /* CTXs3 (set above) is not touched by ldr_input_le, so it is not recomputed. */
+ add RY, CTXs3, #(w + 4*4 - s3); /* RY = ctx + w + 16 */
+
+ /* Input whitening: xor the four words at ctx + w + 16 into RC, RD, RA, RB */
+ ldm RY, {RT0, RT1, RT2, RT3};
+ add CTXs1, CTXs0, #(s1 - s0); /* dst in %r1 is overwritten here */
+ mov RMASK, #(0xff << 2);
+ eor RC, RC, RT0;
+ eor RD, RD, RT1;
+ eor RA, RA, RT2;
+ eor RB, RB, RT3;
+
+ first_decrypt_cycle(7);
+ decrypt_cycle(6);
+ decrypt_cycle(5);
+ decrypt_cycle(4);
+ decrypt_cycle(3);
+ decrypt_cycle(2);
+ decrypt_cycle(1);
+ last_decrypt_cycle(0);
+
+ add RY, CTXs0, #w; /* RY = ctx + w */
+ pop {%r1}; /* dst */
+
+ /* Output whitening: xor the four words at ctx + w into RA..RD */
+ ldm RY, {RT0, RT1, RT2, RT3};
+ eor RA, RA, RT0;
+ eor RB, RB, RT1;
+ eor RC, RC, RT2;
+ eor RD, RD, RT3;
+
+ str_output_le(%r1, RA, RB, RC, RD, RT0, RT1);
+
+ pop {%r4-%r11, %ip, %lr};
+ bx %lr;
+.size _gcry_twofish_armv6_decrypt_block,.-_gcry_twofish_armv6_decrypt_block;
+
+#endif /*HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS*/
+#endif /*HAVE_ARM_ARCH_V6 && __ARMEL__*/
diff --git a/cipher/twofish.c b/cipher/twofish.c
index 993ad0f..d2cabbe 100644
--- a/cipher/twofish.c
+++ b/cipher/twofish.c
@@ -57,6 +57,14 @@
# define USE_AMD64_ASM 1
#endif
+/* USE_ARMV6_ASM indicates whether to use ARMv6 assembly code. */
+#undef USE_ARMV6_ASM
+#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__)
+# if defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS)
+# define USE_ARMV6_ASM 1
+# endif
+#endif
+
/* Prototype for the self-test function. */
static const char *selftest(void);
@@ -746,7 +754,16 @@ extern void _gcry_twofish_amd64_cbc_dec(const TWOFISH_context *c, byte *out,
extern void _gcry_twofish_amd64_cfb_dec(const TWOFISH_context *c, byte *out,
const byte *in, byte *iv);
-#else /*!USE_AMD64_ASM*/
+#elif defined(USE_ARMV6_ASM)
+
+/* Assembly implementations of Twofish. */
+extern void _gcry_twofish_armv6_encrypt_block(const TWOFISH_context *c,
+ byte *out, const byte *in);
+
+extern void _gcry_twofish_armv6_decrypt_block(const TWOFISH_context *c,
+ byte *out, const byte *in);
+
+#else /*!USE_AMD64_ASM && !USE_ARMV6_ASM*/
/* Macros to compute the g() function in the encryption and decryption
* rounds. G1 is the straight g() function; G2 includes the 8-bit
@@ -812,21 +829,25 @@ extern void _gcry_twofish_amd64_cfb_dec(const TWOFISH_context *c, byte *out,
#ifdef USE_AMD64_ASM
-static void
-do_twofish_encrypt (const TWOFISH_context *ctx, byte *out, const byte *in)
+static unsigned int
+twofish_encrypt (void *context, byte *out, const byte *in)
{
+ TWOFISH_context *ctx = context;
_gcry_twofish_amd64_encrypt_block(ctx, out, in);
+ return /*burn_stack*/ (4*sizeof (void*));
}
+#elif defined(USE_ARMV6_ASM)
+
static unsigned int
twofish_encrypt (void *context, byte *out, const byte *in)
{
TWOFISH_context *ctx = context;
- _gcry_twofish_amd64_encrypt_block(ctx, out, in);
+ _gcry_twofish_armv6_encrypt_block(ctx, out, in);
return /*burn_stack*/ (4*sizeof (void*));
}
-#else /*!USE_AMD64_ASM*/
+#else /*!USE_AMD64_ASM && !USE_ARMV6_ASM*/
static void
do_twofish_encrypt (const TWOFISH_context *ctx, byte *out, const byte *in)
@@ -868,28 +889,32 @@ twofish_encrypt (void *context, byte *out, const byte *in)
return /*burn_stack*/ (24+3*sizeof (void*));
}
-#endif /*!USE_AMD64_ASM*/
+#endif /*!USE_AMD64_ASM && !USE_ARMV6_ASM*/
/* Decrypt one block. in and out may be the same. */
#ifdef USE_AMD64_ASM
-static void
-do_twofish_decrypt (const TWOFISH_context *ctx, byte *out, const byte *in)
+static unsigned int
+twofish_decrypt (void *context, byte *out, const byte *in)
{
+ TWOFISH_context *ctx = context;
_gcry_twofish_amd64_decrypt_block(ctx, out, in);
+ return /*burn_stack*/ (4*sizeof (void*));
}
+#elif defined(USE_ARMV6_ASM)
+
static unsigned int
twofish_decrypt (void *context, byte *out, const byte *in)
{
TWOFISH_context *ctx = context;
- _gcry_twofish_amd64_decrypt_block(ctx, out, in);
+ _gcry_twofish_armv6_decrypt_block(ctx, out, in);
return /*burn_stack*/ (4*sizeof (void*));
}
-#else /*!USE_AMD64_ASM*/
+#else /*!USE_AMD64_ASM && !USE_ARMV6_ASM*/
static void
do_twofish_decrypt (const TWOFISH_context *ctx, byte *out, const byte *in)
@@ -932,7 +957,7 @@ twofish_decrypt (void *context, byte *out, const byte *in)
return /*burn_stack*/ (24+3*sizeof (void*));
}
-#endif /*!USE_AMD64_ASM*/
+#endif /*!USE_AMD64_ASM && !USE_ARMV6_ASM*/
@@ -947,14 +972,11 @@ _gcry_twofish_ctr_enc(void *context, unsigned char *ctr, void *outbuf_arg,
unsigned char *outbuf = outbuf_arg;
const unsigned char *inbuf = inbuf_arg;
unsigned char tmpbuf[TWOFISH_BLOCKSIZE];
- int burn_stack_depth = 24 + 3 * sizeof (void*);
+ unsigned int burn, burn_stack_depth = 0;
int i;
#ifdef USE_AMD64_ASM
{
- if (nblocks >= 3 && burn_stack_depth < 8 * sizeof(void*))
- burn_stack_depth = 8 * sizeof(void*);
-
/* Process data in 3 block chunks. */
while (nblocks >= 3)
{
@@ -963,6 +985,10 @@ _gcry_twofish_ctr_enc(void *context, unsigned char *ctr, void *outbuf_arg,
nblocks -= 3;
outbuf += 3 * TWOFISH_BLOCKSIZE;
inbuf += 3 * TWOFISH_BLOCKSIZE;
+
+ burn = 8 * sizeof(void*);
+ if (burn > burn_stack_depth)
+ burn_stack_depth = burn;
}
/* Use generic code to handle smaller chunks... */
@@ -973,7 +999,10 @@ _gcry_twofish_ctr_enc(void *context, unsigned char *ctr, void *outbuf_arg,
for ( ;nblocks; nblocks-- )
{
/* Encrypt the counter. */
- do_twofish_encrypt(ctx, tmpbuf, ctr);
+ burn = twofish_encrypt(ctx, tmpbuf, ctr);
+ if (burn > burn_stack_depth)
+ burn_stack_depth = burn;
+
/* XOR the input with the encrypted counter and store in output. */
buf_xor(outbuf, tmpbuf, inbuf, TWOFISH_BLOCKSIZE);
outbuf += TWOFISH_BLOCKSIZE;
@@ -1002,13 +1031,10 @@ _gcry_twofish_cbc_dec(void *context, unsigned char *iv, void *outbuf_arg,
unsigned char *outbuf = outbuf_arg;
const unsigned char *inbuf = inbuf_arg;
unsigned char savebuf[TWOFISH_BLOCKSIZE];
- int burn_stack_depth = 24 + 3 * sizeof (void*);
+ unsigned int burn, burn_stack_depth = 0;
#ifdef USE_AMD64_ASM
{
- if (nblocks >= 3 && burn_stack_depth < 9 * sizeof(void*))
- burn_stack_depth = 9 * sizeof(void*);
-
/* Process data in 3 block chunks. */
while (nblocks >= 3)
{
@@ -1017,6 +1043,10 @@ _gcry_twofish_cbc_dec(void *context, unsigned char *iv, void *outbuf_arg,
nblocks -= 3;
outbuf += 3 * TWOFISH_BLOCKSIZE;
inbuf += 3 * TWOFISH_BLOCKSIZE;
+
+ burn = 9 * sizeof(void*);
+ if (burn > burn_stack_depth)
+ burn_stack_depth = burn;
}
/* Use generic code to handle smaller chunks... */
@@ -1029,7 +1059,9 @@ _gcry_twofish_cbc_dec(void *context, unsigned char *iv, void *outbuf_arg,
OUTBUF. */
memcpy(savebuf, inbuf, TWOFISH_BLOCKSIZE);
- do_twofish_decrypt (ctx, outbuf, inbuf);
+ burn = twofish_decrypt (ctx, outbuf, inbuf);
+ if (burn > burn_stack_depth)
+ burn_stack_depth = burn;
buf_xor(outbuf, outbuf, iv, TWOFISH_BLOCKSIZE);
memcpy(iv, savebuf, TWOFISH_BLOCKSIZE);
@@ -1051,13 +1083,10 @@ _gcry_twofish_cfb_dec(void *context, unsigned char *iv, void *outbuf_arg,
TWOFISH_context *ctx = context;
unsigned char *outbuf = outbuf_arg;
const unsigned char *inbuf = inbuf_arg;
- int burn_stack_depth = 24 + 3 * sizeof (void*);
+ unsigned int burn, burn_stack_depth = 0;
#ifdef USE_AMD64_ASM
{
- if (nblocks >= 3 && burn_stack_depth < 8 * sizeof(void*))
- burn_stack_depth = 8 * sizeof(void*);
-
/* Process data in 3 block chunks. */
while (nblocks >= 3)
{
@@ -1066,6 +1095,10 @@ _gcry_twofish_cfb_dec(void *context, unsigned char *iv, void *outbuf_arg,
nblocks -= 3;
outbuf += 3 * TWOFISH_BLOCKSIZE;
inbuf += 3 * TWOFISH_BLOCKSIZE;
+
+ burn = 8 * sizeof(void*);
+ if (burn > burn_stack_depth)
+ burn_stack_depth = burn;
}
/* Use generic code to handle smaller chunks... */
@@ -1074,7 +1107,10 @@ _gcry_twofish_cfb_dec(void *context, unsigned char *iv, void *outbuf_arg,
for ( ;nblocks; nblocks-- )
{
- do_twofish_encrypt(ctx, iv, iv);
+ burn = twofish_encrypt(ctx, iv, iv);
+ if (burn > burn_stack_depth)
+ burn_stack_depth = burn;
+
buf_xor_n_copy(outbuf, iv, inbuf, TWOFISH_BLOCKSIZE);
outbuf += TWOFISH_BLOCKSIZE;
inbuf += TWOFISH_BLOCKSIZE;
diff --git a/configure.ac b/configure.ac
index 8fb14e2..739a650 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1480,6 +1480,10 @@ if test "$found" = "1" ; then
# Build with the assembly implementation
GCRYPT_CIPHERS="$GCRYPT_CIPHERS twofish-amd64.lo"
;;
+ arm*-*-*)
+ # Build with the assembly implementation
+ GCRYPT_CIPHERS="$GCRYPT_CIPHERS twofish-armv6.lo"
+ ;;
esac
fi
commit e67c67321ce240c93dd0fa2b21c649c0a8e233f7
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date: Tue Oct 22 17:07:53 2013 +0300
mpi: allow building with clang on ARM
* mpi/longlong.h [__arm__] (add_ssaaaa, sub_ddmmss, umul_ppmm)
(count_leading_zeros): Do not cast assembly output arguments.
[__arm__] (umul_ppmm): Remove the extra '%' ahead of assembly comment.
[_ARM_ARCH >= 4] (umul_ppmm): Use correct inputs and outputs instead of
registers.
--
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
diff --git a/mpi/longlong.h b/mpi/longlong.h
index c2ab9c5..8c8260e 100644
--- a/mpi/longlong.h
+++ b/mpi/longlong.h
@@ -213,8 +213,8 @@ extern UDItype __udiv_qrnnd ();
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
__asm__ ("adds %1, %4, %5\n" \
"adc %0, %2, %3" \
- : "=r" ((USItype)(sh)), \
- "=&r" ((USItype)(sl)) \
+ : "=r" ((sh)), \
+ "=&r" ((sl)) \
: "%r" ((USItype)(ah)), \
"rI" ((USItype)(bh)), \
"%r" ((USItype)(al)), \
@@ -222,15 +222,15 @@ extern UDItype __udiv_qrnnd ();
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
__asm__ ("subs %1, %4, %5\n" \
"sbc %0, %2, %3" \
- : "=r" ((USItype)(sh)), \
- "=&r" ((USItype)(sl)) \
+ : "=r" ((sh)), \
+ "=&r" ((sl)) \
: "r" ((USItype)(ah)), \
"rI" ((USItype)(bh)), \
"r" ((USItype)(al)), \
"rI" ((USItype)(bl)) __CLOBBER_CC)
#if (defined __ARM_ARCH && __ARM_ARCH <= 3)
#define umul_ppmm(xh, xl, a, b) \
- __asm__ ("%@ Inlined umul_ppmm\n" \
+ __asm__ ("@ Inlined umul_ppmm\n" \
"mov %|r0, %2, lsr #16 @ AAAA\n" \
"mov %|r2, %3, lsr #16 @ BBBB\n" \
"bic %|r1, %2, %|r0, lsl #16 @ aaaa\n" \
@@ -243,27 +243,26 @@ extern UDItype __udiv_qrnnd ();
"addcs %|r2, %|r2, #65536\n" \
"adds %1, %|r1, %|r0, lsl #16\n" \
"adc %0, %|r2, %|r0, lsr #16" \
- : "=&r" ((USItype)(xh)), \
- "=r" ((USItype)(xl)) \
+ : "=&r" ((xh)), \
+ "=r" ((xl)) \
: "r" ((USItype)(a)), \
"r" ((USItype)(b)) \
: "r0", "r1", "r2" __CLOBBER_CC)
#else /* __ARM_ARCH >= 4 */
#define umul_ppmm(xh, xl, a, b) \
- __asm__ ("%@ Inlined umul_ppmm\n" \
- "umull %r1, %r0, %r2, %r3" \
- : "=&r" ((USItype)(xh)), \
- "=r" ((USItype)(xl)) \
+ __asm__ ("@ Inlined umul_ppmm\n" \
+ "umull %1, %0, %2, %3" \
+ : "=&r" ((xh)), \
+ "=r" ((xl)) \
: "r" ((USItype)(a)), \
- "r" ((USItype)(b)) \
- : "r0", "r1")
+ "r" ((USItype)(b)))
#endif /* __ARM_ARCH >= 4 */
#define UMUL_TIME 20
#define UDIV_TIME 100
#if (defined __ARM_ARCH && __ARM_ARCH >= 5)
#define count_leading_zeros(count, x) \
__asm__ ("clz %0, %1" \
- : "=r" ((USItype)(count)) \
+ : "=r" ((count)) \
: "r" ((USItype)(x)))
#endif /* __ARM_ARCH >= 5 */
#endif /* __arm__ */
commit c7efaa5fe0ee92e321a7b49d56752cc12eb75fe0
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date: Tue Oct 22 17:07:53 2013 +0300
serpent-amd64: do not use GAS macros
* cipher/serpent-avx2-amd64.S: Remove use of GAS macros.
* cipher/serpent-sse2-amd64.S: Ditto.
* configure.ac [HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS]: Do not check
for GAS macros.
--
This way we have better portability; for example, when compiling with clang
on x86-64, the assembly implementations are now enabled and working.
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
diff --git a/cipher/serpent-avx2-amd64.S b/cipher/serpent-avx2-amd64.S
index c726e7b..8a76ab1 100644
--- a/cipher/serpent-avx2-amd64.S
+++ b/cipher/serpent-avx2-amd64.S
@@ -36,51 +36,36 @@
#define CTX %rdi
/* vector registers */
-.set RA0, %ymm0
-.set RA1, %ymm1
-.set RA2, %ymm2
-.set RA3, %ymm3
-.set RA4, %ymm4
-
-.set RB0, %ymm5
-.set RB1, %ymm6
-.set RB2, %ymm7
-.set RB3, %ymm8
-.set RB4, %ymm9
-
-.set RNOT, %ymm10
-.set RTMP0, %ymm11
-.set RTMP1, %ymm12
-.set RTMP2, %ymm13
-.set RTMP3, %ymm14
-.set RTMP4, %ymm15
-
-.set RNOTx, %xmm10
-.set RTMP0x, %xmm11
-.set RTMP1x, %xmm12
-.set RTMP2x, %xmm13
-.set RTMP3x, %xmm14
-.set RTMP4x, %xmm15
+#define RA0 %ymm0
+#define RA1 %ymm1
+#define RA2 %ymm2
+#define RA3 %ymm3
+#define RA4 %ymm4
+
+#define RB0 %ymm5
+#define RB1 %ymm6
+#define RB2 %ymm7
+#define RB3 %ymm8
+#define RB4 %ymm9
+
+#define RNOT %ymm10
+#define RTMP0 %ymm11
+#define RTMP1 %ymm12
+#define RTMP2 %ymm13
+#define RTMP3 %ymm14
+#define RTMP4 %ymm15
+
+#define RNOTx %xmm10
+#define RTMP0x %xmm11
+#define RTMP1x %xmm12
+#define RTMP2x %xmm13
+#define RTMP3x %xmm14
+#define RTMP4x %xmm15
/**********************************************************************
helper macros
**********************************************************************/
-/* preprocessor macro for renaming vector registers using GAS macros */
-#define sbox_reg_rename(r0, r1, r2, r3, r4, \
- new_r0, new_r1, new_r2, new_r3, new_r4) \
- .set rename_reg0, new_r0; \
- .set rename_reg1, new_r1; \
- .set rename_reg2, new_r2; \
- .set rename_reg3, new_r3; \
- .set rename_reg4, new_r4; \
- \
- .set r0, rename_reg0; \
- .set r1, rename_reg1; \
- .set r2, rename_reg2; \
- .set r3, rename_reg3; \
- .set r4, rename_reg4;
-
/* vector 32-bit rotation to left */
#define vec_rol(reg, nleft, tmp) \
vpslld $(nleft), reg, tmp; \
@@ -128,9 +113,7 @@
vpxor r4, r2, r2; vpxor RNOT, r4, r4; \
vpor r1, r4, r4; vpxor r3, r1, r1; \
vpxor r4, r1, r1; vpor r0, r3, r3; \
- vpxor r3, r1, r1; vpxor r3, r4, r4; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r1,r4,r2,r0,r3);
+ vpxor r3, r1, r1; vpxor r3, r4, r4;
#define SBOX0_INVERSE(r0, r1, r2, r3, r4) \
vpxor RNOT, r2, r2; vmovdqa r1, r4; \
@@ -143,9 +126,7 @@
vpxor r1, r2, r2; vpxor r0, r3, r3; \
vpxor r1, r3, r3; \
vpand r3, r2, r2; \
- vpxor r2, r4, r4; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r0,r4,r1,r3,r2);
+ vpxor r2, r4, r4;
#define SBOX1(r0, r1, r2, r3, r4) \
vpxor RNOT, r0, r0; vpxor RNOT, r2, r2; \
@@ -157,9 +138,7 @@
vpand r4, r2, r2; vpxor r1, r0, r0; \
vpand r2, r1, r1; \
vpxor r0, r1, r1; vpand r2, r0, r0; \
- vpxor r4, r0, r0; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r2,r0,r3,r1,r4);
+ vpxor r4, r0, r0;
#define SBOX1_INVERSE(r0, r1, r2, r3, r4) \
vmovdqa r1, r4; vpxor r3, r1, r1; \
@@ -172,9 +151,7 @@
vpxor r1, r4, r4; vpor r0, r1, r1; \
vpxor r0, r1, r1; \
vpor r4, r1, r1; \
- vpxor r1, r3, r3; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r4,r0,r3,r2,r1);
+ vpxor r1, r3, r3;
#define SBOX2(r0, r1, r2, r3, r4) \
vmovdqa r0, r4; vpand r2, r0, r0; \
@@ -184,9 +161,7 @@
vmovdqa r3, r1; vpor r4, r3, r3; \
vpxor r0, r3, r3; vpand r1, r0, r0; \
vpxor r0, r4, r4; vpxor r3, r1, r1; \
- vpxor r4, r1, r1; vpxor RNOT, r4, r4; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r2,r3,r1,r4,r0);
+ vpxor r4, r1, r1; vpxor RNOT, r4, r4;
#define SBOX2_INVERSE(r0, r1, r2, r3, r4) \
vpxor r3, r2, r2; vpxor r0, r3, r3; \
@@ -198,9 +173,7 @@
vpor r0, r2, r2; vpxor RNOT, r3, r3; \
vpxor r3, r2, r2; vpxor r3, r0, r0; \
vpand r1, r0, r0; vpxor r4, r3, r3; \
- vpxor r0, r3, r3; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r1,r4,r2,r3,r0);
+ vpxor r0, r3, r3;
#define SBOX3(r0, r1, r2, r3, r4) \
vmovdqa r0, r4; vpor r3, r0, r0; \
@@ -212,9 +185,7 @@
vpxor r2, r4, r4; vpor r0, r1, r1; \
vpxor r2, r1, r1; vpxor r3, r0, r0; \
vmovdqa r1, r2; vpor r3, r1, r1; \
- vpxor r0, r1, r1; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r1,r2,r3,r4,r0);
+ vpxor r0, r1, r1;
#define SBOX3_INVERSE(r0, r1, r2, r3, r4) \
vmovdqa r2, r4; vpxor r1, r2, r2; \
@@ -226,9 +197,7 @@
vpxor r1, r3, r3; vpxor r0, r1, r1; \
vpor r2, r1, r1; vpxor r3, r0, r0; \
vpxor r4, r1, r1; \
- vpxor r1, r0, r0; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r2,r1,r3,r0,r4);
+ vpxor r1, r0, r0;
#define SBOX4(r0, r1, r2, r3, r4) \
vpxor r3, r1, r1; vpxor RNOT, r3, r3; \
@@ -240,9 +209,7 @@
vpxor r0, r3, r3; vpor r1, r4, r4; \
vpxor r0, r4, r4; vpor r3, r0, r0; \
vpxor r2, r0, r0; vpand r3, r2, r2; \
- vpxor RNOT, r0, r0; vpxor r2, r4, r4; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r1,r4,r0,r3,r2);
+ vpxor RNOT, r0, r0; vpxor r2, r4, r4;
#define SBOX4_INVERSE(r0, r1, r2, r3, r4) \
vmovdqa r2, r4; vpand r3, r2, r2; \
@@ -255,9 +222,7 @@
vpand r0, r2, r2; vpxor r0, r3, r3; \
vpxor r4, r2, r2; \
vpor r3, r2, r2; vpxor r0, r3, r3; \
- vpxor r1, r2, r2; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r0,r3,r2,r4,r1);
+ vpxor r1, r2, r2;
#define SBOX5(r0, r1, r2, r3, r4) \
vpxor r1, r0, r0; vpxor r3, r1, r1; \
@@ -269,9 +234,7 @@
vpxor r2, r4, r4; vpxor r0, r2, r2; \
vpand r3, r0, r0; vpxor RNOT, r2, r2; \
vpxor r4, r0, r0; vpor r3, r4, r4; \
- vpxor r4, r2, r2; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r1,r3,r0,r2,r4);
+ vpxor r4, r2, r2;
#define SBOX5_INVERSE(r0, r1, r2, r3, r4) \
vpxor RNOT, r1, r1; vmovdqa r3, r4; \
@@ -283,9 +246,7 @@
vpxor r3, r1, r1; vpxor r2, r4, r4; \
vpand r4, r3, r3; vpxor r1, r4, r4; \
vpxor r4, r3, r3; vpxor RNOT, r4, r4; \
- vpxor r0, r3, r3; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r1,r4,r3,r2,r0);
+ vpxor r0, r3, r3;
#define SBOX6(r0, r1, r2, r3, r4) \
vpxor RNOT, r2, r2; vmovdqa r3, r4; \
@@ -297,9 +258,7 @@
vpxor r2, r0, r0; vpxor r3, r4, r4; \
vpxor r0, r4, r4; vpxor RNOT, r3, r3; \
vpand r4, r2, r2; \
- vpxor r3, r2, r2; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r0,r1,r4,r2,r3);
+ vpxor r3, r2, r2;
#define SBOX6_INVERSE(r0, r1, r2, r3, r4) \
vpxor r2, r0, r0; vmovdqa r2, r4; \
@@ -310,9 +269,7 @@
vpxor r1, r4, r4; vpand r3, r1, r1; \
vpxor r0, r1, r1; vpxor r3, r0, r0; \
vpor r2, r0, r0; vpxor r1, r3, r3; \
- vpxor r0, r4, r4; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r1,r2,r4,r3,r0);
+ vpxor r0, r4, r4;
#define SBOX7(r0, r1, r2, r3, r4) \
vmovdqa r1, r4; vpor r2, r1, r1; \
@@ -325,9 +282,7 @@
vpxor r1, r2, r2; vpand r0, r1, r1; \
vpxor r4, r1, r1; vpxor RNOT, r2, r2; \
vpor r0, r2, r2; \
- vpxor r2, r4, r4; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r4,r3,r1,r0,r2);
+ vpxor r2, r4, r4;
#define SBOX7_INVERSE(r0, r1, r2, r3, r4) \
vmovdqa r2, r4; vpxor r0, r2, r2; \
@@ -339,9 +294,7 @@
vpor r2, r0, r0; vpxor r1, r4, r4; \
vpxor r3, r0, r0; vpxor r4, r3, r3; \
vpor r0, r4, r4; vpxor r2, r3, r3; \
- vpxor r2, r4, r4; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r3,r0,r1,r4,r2);
+ vpxor r2, r4, r4;
/* Apply SBOX number WHICH to to the block. */
#define SBOX(which, r0, r1, r2, r3, r4) \
@@ -402,49 +355,51 @@
/* Apply a Serpent round to sixteen parallel blocks. This macro increments
`round'. */
-#define ROUND(which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
- BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \
- SBOX (which, a0, a1, a2, a3, a4); \
- BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \
- SBOX (which, b0, b1, b2, b3, b4); \
- LINEAR_TRANSFORMATION (a0, a1, a2, a3, a4); \
- LINEAR_TRANSFORMATION (b0, b1, b2, b3, b4); \
- .set round, (round + 1);
+#define ROUND(round, which, a0, a1, a2, a3, a4, na0, na1, na2, na3, na4, \
+ b0, b1, b2, b3, b4, nb0, nb1, nb2, nb3, nb4) \
+ BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \
+ SBOX (which, a0, a1, a2, a3, a4); \
+ BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \
+ SBOX (which, b0, b1, b2, b3, b4); \
+ LINEAR_TRANSFORMATION (na0, na1, na2, na3, na4); \
+ LINEAR_TRANSFORMATION (nb0, nb1, nb2, nb3, nb4);
/* Apply the last Serpent round to sixteen parallel blocks. This macro
increments `round'. */
-#define ROUND_LAST(which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
- BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \
- SBOX (which, a0, a1, a2, a3, a4); \
- BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \
- SBOX (which, b0, b1, b2, b3, b4); \
- .set round, (round + 1); \
- BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \
- BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \
- .set round, (round + 1);
+#define ROUND_LAST(round, which, a0, a1, a2, a3, a4, na0, na1, na2, na3, na4, \
+ b0, b1, b2, b3, b4, nb0, nb1, nb2, nb3, nb4) \
+ BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \
+ SBOX (which, a0, a1, a2, a3, a4); \
+ BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \
+ SBOX (which, b0, b1, b2, b3, b4); \
+ BLOCK_XOR_KEY (na0, na1, na2, na3, na4, ((round) + 1)); \
+ BLOCK_XOR_KEY (nb0, nb1, nb2, nb3, nb4, ((round) + 1));
/* Apply an inverse Serpent round to sixteen parallel blocks. This macro
increments `round'. */
-#define ROUND_INVERSE(which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
+#define ROUND_INVERSE(round, which, a0, a1, a2, a3, a4, \
+ na0, na1, na2, na3, na4, \
+ b0, b1, b2, b3, b4, \
+ nb0, nb1, nb2, nb3, nb4) \
LINEAR_TRANSFORMATION_INVERSE (a0, a1, a2, a3, a4); \
LINEAR_TRANSFORMATION_INVERSE (b0, b1, b2, b3, b4); \
SBOX_INVERSE (which, a0, a1, a2, a3, a4); \
- BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \
+ BLOCK_XOR_KEY (na0, na1, na2, na3, na4, round); \
SBOX_INVERSE (which, b0, b1, b2, b3, b4); \
- BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \
- .set round, (round - 1);
+ BLOCK_XOR_KEY (nb0, nb1, nb2, nb3, nb4, round);
/* Apply the first inverse Serpent round to sixteen parallel blocks. This macro
increments `round'. */
-#define ROUND_FIRST_INVERSE(which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
- BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \
- BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \
- .set round, (round - 1); \
+#define ROUND_FIRST_INVERSE(round, which, a0, a1, a2, a3, a4, \
+ na0, na1, na2, na3, na4, \
+ b0, b1, b2, b3, b4, \
+ nb0, nb1, nb2, nb3, nb4) \
+ BLOCK_XOR_KEY (a0, a1, a2, a3, a4, ((round) + 1)); \
+ BLOCK_XOR_KEY (b0, b1, b2, b3, b4, ((round) + 1)); \
SBOX_INVERSE (which, a0, a1, a2, a3, a4); \
- BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \
+ BLOCK_XOR_KEY (na0, na1, na2, na3, na4, round); \
SBOX_INVERSE (which, b0, b1, b2, b3, b4); \
- BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \
- .set round, (round - 1);
+ BLOCK_XOR_KEY (nb0, nb1, nb2, nb3, nb4, round);
.text
@@ -456,72 +411,82 @@ __serpent_enc_blk16:
* RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: sixteen parallel
* plaintext blocks
* output:
- * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: sixteen parallel
+ * RA4, RA1, RA2, RA0, RB4, RB1, RB2, RB0: sixteen parallel
* ciphertext blocks
*/
- /* record input vector names for __serpent_enc_blk16 */
- .set enc_in_a0, RA0
- .set enc_in_a1, RA1
- .set enc_in_a2, RA2
- .set enc_in_a3, RA3
- .set enc_in_b0, RB0
- .set enc_in_b1, RB1
- .set enc_in_b2, RB2
- .set enc_in_b3, RB3
-
vpcmpeqd RNOT, RNOT, RNOT;
transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1);
transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1);
- .set round, 0
- ROUND (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-
- ROUND_LAST (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-
- transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1);
- transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1);
-
- /* record output vector names for __serpent_enc_blk16 */
- .set enc_out_a0, RA0
- .set enc_out_a1, RA1
- .set enc_out_a2, RA2
- .set enc_out_a3, RA3
- .set enc_out_b0, RB0
- .set enc_out_b1, RB1
- .set enc_out_b2, RB2
- .set enc_out_b3, RB3
+ ROUND (0, 0, RA0, RA1, RA2, RA3, RA4, RA1, RA4, RA2, RA0, RA3,
+ RB0, RB1, RB2, RB3, RB4, RB1, RB4, RB2, RB0, RB3);
+ ROUND (1, 1, RA1, RA4, RA2, RA0, RA3, RA2, RA1, RA0, RA4, RA3,
+ RB1, RB4, RB2, RB0, RB3, RB2, RB1, RB0, RB4, RB3);
+ ROUND (2, 2, RA2, RA1, RA0, RA4, RA3, RA0, RA4, RA1, RA3, RA2,
+ RB2, RB1, RB0, RB4, RB3, RB0, RB4, RB1, RB3, RB2);
+ ROUND (3, 3, RA0, RA4, RA1, RA3, RA2, RA4, RA1, RA3, RA2, RA0,
+ RB0, RB4, RB1, RB3, RB2, RB4, RB1, RB3, RB2, RB0);
+ ROUND (4, 4, RA4, RA1, RA3, RA2, RA0, RA1, RA0, RA4, RA2, RA3,
+ RB4, RB1, RB3, RB2, RB0, RB1, RB0, RB4, RB2, RB3);
+ ROUND (5, 5, RA1, RA0, RA4, RA2, RA3, RA0, RA2, RA1, RA4, RA3,
+ RB1, RB0, RB4, RB2, RB3, RB0, RB2, RB1, RB4, RB3);
+ ROUND (6, 6, RA0, RA2, RA1, RA4, RA3, RA0, RA2, RA3, RA1, RA4,
+ RB0, RB2, RB1, RB4, RB3, RB0, RB2, RB3, RB1, RB4);
+ ROUND (7, 7, RA0, RA2, RA3, RA1, RA4, RA4, RA1, RA2, RA0, RA3,
+ RB0, RB2, RB3, RB1, RB4, RB4, RB1, RB2, RB0, RB3);
+ ROUND (8, 0, RA4, RA1, RA2, RA0, RA3, RA1, RA3, RA2, RA4, RA0,
+ RB4, RB1, RB2, RB0, RB3, RB1, RB3, RB2, RB4, RB0);
+ ROUND (9, 1, RA1, RA3, RA2, RA4, RA0, RA2, RA1, RA4, RA3, RA0,
+ RB1, RB3, RB2, RB4, RB0, RB2, RB1, RB4, RB3, RB0);
+ ROUND (10, 2, RA2, RA1, RA4, RA3, RA0, RA4, RA3, RA1, RA0, RA2,
+ RB2, RB1, RB4, RB3, RB0, RB4, RB3, RB1, RB0, RB2);
+ ROUND (11, 3, RA4, RA3, RA1, RA0, RA2, RA3, RA1, RA0, RA2, RA4,
+ RB4, RB3, RB1, RB0, RB2, RB3, RB1, RB0, RB2, RB4);
+ ROUND (12, 4, RA3, RA1, RA0, RA2, RA4, RA1, RA4, RA3, RA2, RA0,
+ RB3, RB1, RB0, RB2, RB4, RB1, RB4, RB3, RB2, RB0);
+ ROUND (13, 5, RA1, RA4, RA3, RA2, RA0, RA4, RA2, RA1, RA3, RA0,
+ RB1, RB4, RB3, RB2, RB0, RB4, RB2, RB1, RB3, RB0);
+ ROUND (14, 6, RA4, RA2, RA1, RA3, RA0, RA4, RA2, RA0, RA1, RA3,
+ RB4, RB2, RB1, RB3, RB0, RB4, RB2, RB0, RB1, RB3);
+ ROUND (15, 7, RA4, RA2, RA0, RA1, RA3, RA3, RA1, RA2, RA4, RA0,
+ RB4, RB2, RB0, RB1, RB3, RB3, RB1, RB2, RB4, RB0);
+ ROUND (16, 0, RA3, RA1, RA2, RA4, RA0, RA1, RA0, RA2, RA3, RA4,
+ RB3, RB1, RB2, RB4, RB0, RB1, RB0, RB2, RB3, RB4);
+ ROUND (17, 1, RA1, RA0, RA2, RA3, RA4, RA2, RA1, RA3, RA0, RA4,
+ RB1, RB0, RB2, RB3, RB4, RB2, RB1, RB3, RB0, RB4);
+ ROUND (18, 2, RA2, RA1, RA3, RA0, RA4, RA3, RA0, RA1, RA4, RA2,
+ RB2, RB1, RB3, RB0, RB4, RB3, RB0, RB1, RB4, RB2);
+ ROUND (19, 3, RA3, RA0, RA1, RA4, RA2, RA0, RA1, RA4, RA2, RA3,
+ RB3, RB0, RB1, RB4, RB2, RB0, RB1, RB4, RB2, RB3);
+ ROUND (20, 4, RA0, RA1, RA4, RA2, RA3, RA1, RA3, RA0, RA2, RA4,
+ RB0, RB1, RB4, RB2, RB3, RB1, RB3, RB0, RB2, RB4);
+ ROUND (21, 5, RA1, RA3, RA0, RA2, RA4, RA3, RA2, RA1, RA0, RA4,
+ RB1, RB3, RB0, RB2, RB4, RB3, RB2, RB1, RB0, RB4);
+ ROUND (22, 6, RA3, RA2, RA1, RA0, RA4, RA3, RA2, RA4, RA1, RA0,
+ RB3, RB2, RB1, RB0, RB4, RB3, RB2, RB4, RB1, RB0);
+ ROUND (23, 7, RA3, RA2, RA4, RA1, RA0, RA0, RA1, RA2, RA3, RA4,
+ RB3, RB2, RB4, RB1, RB0, RB0, RB1, RB2, RB3, RB4);
+ ROUND (24, 0, RA0, RA1, RA2, RA3, RA4, RA1, RA4, RA2, RA0, RA3,
+ RB0, RB1, RB2, RB3, RB4, RB1, RB4, RB2, RB0, RB3);
+ ROUND (25, 1, RA1, RA4, RA2, RA0, RA3, RA2, RA1, RA0, RA4, RA3,
+ RB1, RB4, RB2, RB0, RB3, RB2, RB1, RB0, RB4, RB3);
+ ROUND (26, 2, RA2, RA1, RA0, RA4, RA3, RA0, RA4, RA1, RA3, RA2,
+ RB2, RB1, RB0, RB4, RB3, RB0, RB4, RB1, RB3, RB2);
+ ROUND (27, 3, RA0, RA4, RA1, RA3, RA2, RA4, RA1, RA3, RA2, RA0,
+ RB0, RB4, RB1, RB3, RB2, RB4, RB1, RB3, RB2, RB0);
+ ROUND (28, 4, RA4, RA1, RA3, RA2, RA0, RA1, RA0, RA4, RA2, RA3,
+ RB4, RB1, RB3, RB2, RB0, RB1, RB0, RB4, RB2, RB3);
+ ROUND (29, 5, RA1, RA0, RA4, RA2, RA3, RA0, RA2, RA1, RA4, RA3,
+ RB1, RB0, RB4, RB2, RB3, RB0, RB2, RB1, RB4, RB3);
+ ROUND (30, 6, RA0, RA2, RA1, RA4, RA3, RA0, RA2, RA3, RA1, RA4,
+ RB0, RB2, RB1, RB4, RB3, RB0, RB2, RB3, RB1, RB4);
+ ROUND_LAST (31, 7, RA0, RA2, RA3, RA1, RA4, RA4, RA1, RA2, RA0, RA3,
+ RB0, RB2, RB3, RB1, RB4, RB4, RB1, RB2, RB0, RB3);
+
+ transpose_4x4(RA4, RA1, RA2, RA0, RA3, RTMP0, RTMP1);
+ transpose_4x4(RB4, RB1, RB2, RB0, RB3, RTMP0, RTMP1);
ret;
.size __serpent_enc_blk16,.-__serpent_enc_blk16;
@@ -538,69 +503,81 @@ __serpent_dec_blk16:
* plaintext blocks
*/
- /* record input vector names for __serpent_dec_blk16 */
- .set dec_in_a0, RA0
- .set dec_in_a1, RA1
- .set dec_in_a2, RA2
- .set dec_in_a3, RA3
- .set dec_in_b0, RB0
- .set dec_in_b1, RB1
- .set dec_in_b2, RB2
- .set dec_in_b3, RB3
-
vpcmpeqd RNOT, RNOT, RNOT;
transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1);
transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1);
- .set round, 32
- ROUND_FIRST_INVERSE (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-
- ROUND_INVERSE (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+ ROUND_FIRST_INVERSE (31, 7, RA0, RA1, RA2, RA3, RA4,
+ RA3, RA0, RA1, RA4, RA2,
+ RB0, RB1, RB2, RB3, RB4,
+ RB3, RB0, RB1, RB4, RB2);
+ ROUND_INVERSE (30, 6, RA3, RA0, RA1, RA4, RA2, RA0, RA1, RA2, RA4, RA3,
+ RB3, RB0, RB1, RB4, RB2, RB0, RB1, RB2, RB4, RB3);
+ ROUND_INVERSE (29, 5, RA0, RA1, RA2, RA4, RA3, RA1, RA3, RA4, RA2, RA0,
+ RB0, RB1, RB2, RB4, RB3, RB1, RB3, RB4, RB2, RB0);
+ ROUND_INVERSE (28, 4, RA1, RA3, RA4, RA2, RA0, RA1, RA2, RA4, RA0, RA3,
+ RB1, RB3, RB4, RB2, RB0, RB1, RB2, RB4, RB0, RB3);
+ ROUND_INVERSE (27, 3, RA1, RA2, RA4, RA0, RA3, RA4, RA2, RA0, RA1, RA3,
+ RB1, RB2, RB4, RB0, RB3, RB4, RB2, RB0, RB1, RB3);
+ ROUND_INVERSE (26, 2, RA4, RA2, RA0, RA1, RA3, RA2, RA3, RA0, RA1, RA4,
+ RB4, RB2, RB0, RB1, RB3, RB2, RB3, RB0, RB1, RB4);
+ ROUND_INVERSE (25, 1, RA2, RA3, RA0, RA1, RA4, RA4, RA2, RA1, RA0, RA3,
+ RB2, RB3, RB0, RB1, RB4, RB4, RB2, RB1, RB0, RB3);
+ ROUND_INVERSE (24, 0, RA4, RA2, RA1, RA0, RA3, RA4, RA3, RA2, RA0, RA1,
+ RB4, RB2, RB1, RB0, RB3, RB4, RB3, RB2, RB0, RB1);
+ ROUND_INVERSE (23, 7, RA4, RA3, RA2, RA0, RA1, RA0, RA4, RA3, RA1, RA2,
+ RB4, RB3, RB2, RB0, RB1, RB0, RB4, RB3, RB1, RB2);
+ ROUND_INVERSE (22, 6, RA0, RA4, RA3, RA1, RA2, RA4, RA3, RA2, RA1, RA0,
+ RB0, RB4, RB3, RB1, RB2, RB4, RB3, RB2, RB1, RB0);
+ ROUND_INVERSE (21, 5, RA4, RA3, RA2, RA1, RA0, RA3, RA0, RA1, RA2, RA4,
+ RB4, RB3, RB2, RB1, RB0, RB3, RB0, RB1, RB2, RB4);
+ ROUND_INVERSE (20, 4, RA3, RA0, RA1, RA2, RA4, RA3, RA2, RA1, RA4, RA0,
+ RB3, RB0, RB1, RB2, RB4, RB3, RB2, RB1, RB4, RB0);
+ ROUND_INVERSE (19, 3, RA3, RA2, RA1, RA4, RA0, RA1, RA2, RA4, RA3, RA0,
+ RB3, RB2, RB1, RB4, RB0, RB1, RB2, RB4, RB3, RB0);
+ ROUND_INVERSE (18, 2, RA1, RA2, RA4, RA3, RA0, RA2, RA0, RA4, RA3, RA1,
+ RB1, RB2, RB4, RB3, RB0, RB2, RB0, RB4, RB3, RB1);
+ ROUND_INVERSE (17, 1, RA2, RA0, RA4, RA3, RA1, RA1, RA2, RA3, RA4, RA0,
+ RB2, RB0, RB4, RB3, RB1, RB1, RB2, RB3, RB4, RB0);
+ ROUND_INVERSE (16, 0, RA1, RA2, RA3, RA4, RA0, RA1, RA0, RA2, RA4, RA3,
+ RB1, RB2, RB3, RB4, RB0, RB1, RB0, RB2, RB4, RB3);
+ ROUND_INVERSE (15, 7, RA1, RA0, RA2, RA4, RA3, RA4, RA1, RA0, RA3, RA2,
+ RB1, RB0, RB2, RB4, RB3, RB4, RB1, RB0, RB3, RB2);
+ ROUND_INVERSE (14, 6, RA4, RA1, RA0, RA3, RA2, RA1, RA0, RA2, RA3, RA4,
+ RB4, RB1, RB0, RB3, RB2, RB1, RB0, RB2, RB3, RB4);
+ ROUND_INVERSE (13, 5, RA1, RA0, RA2, RA3, RA4, RA0, RA4, RA3, RA2, RA1,
+ RB1, RB0, RB2, RB3, RB4, RB0, RB4, RB3, RB2, RB1);
+ ROUND_INVERSE (12, 4, RA0, RA4, RA3, RA2, RA1, RA0, RA2, RA3, RA1, RA4,
+ RB0, RB4, RB3, RB2, RB1, RB0, RB2, RB3, RB1, RB4);
+ ROUND_INVERSE (11, 3, RA0, RA2, RA3, RA1, RA4, RA3, RA2, RA1, RA0, RA4,
+ RB0, RB2, RB3, RB1, RB4, RB3, RB2, RB1, RB0, RB4);
+ ROUND_INVERSE (10, 2, RA3, RA2, RA1, RA0, RA4, RA2, RA4, RA1, RA0, RA3,
+ RB3, RB2, RB1, RB0, RB4, RB2, RB4, RB1, RB0, RB3);
+ ROUND_INVERSE (9, 1, RA2, RA4, RA1, RA0, RA3, RA3, RA2, RA0, RA1, RA4,
+ RB2, RB4, RB1, RB0, RB3, RB3, RB2, RB0, RB1, RB4);
+ ROUND_INVERSE (8, 0, RA3, RA2, RA0, RA1, RA4, RA3, RA4, RA2, RA1, RA0,
+ RB3, RB2, RB0, RB1, RB4, RB3, RB4, RB2, RB1, RB0);
+ ROUND_INVERSE (7, 7, RA3, RA4, RA2, RA1, RA0, RA1, RA3, RA4, RA0, RA2,
+ RB3, RB4, RB2, RB1, RB0, RB1, RB3, RB4, RB0, RB2);
+ ROUND_INVERSE (6, 6, RA1, RA3, RA4, RA0, RA2, RA3, RA4, RA2, RA0, RA1,
+ RB1, RB3, RB4, RB0, RB2, RB3, RB4, RB2, RB0, RB1);
+ ROUND_INVERSE (5, 5, RA3, RA4, RA2, RA0, RA1, RA4, RA1, RA0, RA2, RA3,
+ RB3, RB4, RB2, RB0, RB1, RB4, RB1, RB0, RB2, RB3);
+ ROUND_INVERSE (4, 4, RA4, RA1, RA0, RA2, RA3, RA4, RA2, RA0, RA3, RA1,
+ RB4, RB1, RB0, RB2, RB3, RB4, RB2, RB0, RB3, RB1);
+ ROUND_INVERSE (3, 3, RA4, RA2, RA0, RA3, RA1, RA0, RA2, RA3, RA4, RA1,
+ RB4, RB2, RB0, RB3, RB1, RB0, RB2, RB3, RB4, RB1);
+ ROUND_INVERSE (2, 2, RA0, RA2, RA3, RA4, RA1, RA2, RA1, RA3, RA4, RA0,
+ RB0, RB2, RB3, RB4, RB1, RB2, RB1, RB3, RB4, RB0);
+ ROUND_INVERSE (1, 1, RA2, RA1, RA3, RA4, RA0, RA0, RA2, RA4, RA3, RA1,
+ RB2, RB1, RB3, RB4, RB0, RB0, RB2, RB4, RB3, RB1);
+ ROUND_INVERSE (0, 0, RA0, RA2, RA4, RA3, RA1, RA0, RA1, RA2, RA3, RA4,
+ RB0, RB2, RB4, RB3, RB1, RB0, RB1, RB2, RB3, RB4);
transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1);
transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1);
- /* record output vector names for __serpent_dec_blk16 */
- .set dec_out_a0, RA0
- .set dec_out_a1, RA1
- .set dec_out_a2, RA2
- .set dec_out_a3, RA3
- .set dec_out_b0, RB0
- .set dec_out_b1, RB1
- .set dec_out_b2, RB2
- .set dec_out_b3, RB3
-
ret;
.size __serpent_dec_blk16,.-__serpent_dec_blk16;
@@ -623,15 +600,6 @@ _gcry_serpent_avx2_ctr_enc:
vzeroupper;
- .set RA0, enc_in_a0
- .set RA1, enc_in_a1
- .set RA2, enc_in_a2
- .set RA3, enc_in_a3
- .set RB0, enc_in_b0
- .set RB1, enc_in_b1
- .set RB2, enc_in_b2
- .set RB3, enc_in_b3
-
vbroadcasti128 .Lbswap128_mask RIP, RTMP3;
vpcmpeqd RNOT, RNOT, RNOT;
vpsrldq $8, RNOT, RNOT; /* ab: -1:0 ; cd: -1:0 */
@@ -703,32 +671,23 @@ _gcry_serpent_avx2_ctr_enc:
call __serpent_enc_blk16;
- .set RA0, enc_out_a0
- .set RA1, enc_out_a1
- .set RA2, enc_out_a2
- .set RA3, enc_out_a3
- .set RB0, enc_out_b0
- .set RB1, enc_out_b1
- .set RB2, enc_out_b2
- .set RB3, enc_out_b3
-
- vpxor (0 * 32)(%rdx), RA0, RA0;
+ vpxor (0 * 32)(%rdx), RA4, RA4;
vpxor (1 * 32)(%rdx), RA1, RA1;
vpxor (2 * 32)(%rdx), RA2, RA2;
- vpxor (3 * 32)(%rdx), RA3, RA3;
- vpxor (4 * 32)(%rdx), RB0, RB0;
+ vpxor (3 * 32)(%rdx), RA0, RA0;
+ vpxor (4 * 32)(%rdx), RB4, RB4;
vpxor (5 * 32)(%rdx), RB1, RB1;
vpxor (6 * 32)(%rdx), RB2, RB2;
- vpxor (7 * 32)(%rdx), RB3, RB3;
+ vpxor (7 * 32)(%rdx), RB0, RB0;
- vmovdqu RA0, (0 * 32)(%rsi);
+ vmovdqu RA4, (0 * 32)(%rsi);
vmovdqu RA1, (1 * 32)(%rsi);
vmovdqu RA2, (2 * 32)(%rsi);
- vmovdqu RA3, (3 * 32)(%rsi);
- vmovdqu RB0, (4 * 32)(%rsi);
+ vmovdqu RA0, (3 * 32)(%rsi);
+ vmovdqu RB4, (4 * 32)(%rsi);
vmovdqu RB1, (5 * 32)(%rsi);
vmovdqu RB2, (6 * 32)(%rsi);
- vmovdqu RB3, (7 * 32)(%rsi);
+ vmovdqu RB0, (7 * 32)(%rsi);
vzeroall;
@@ -748,15 +707,6 @@ _gcry_serpent_avx2_cbc_dec:
vzeroupper;
- .set RA0, dec_in_a0
- .set RA1, dec_in_a1
- .set RA2, dec_in_a2
- .set RA3, dec_in_a3
- .set RB0, dec_in_b0
- .set RB1, dec_in_b1
- .set RB2, dec_in_b2
- .set RB3, dec_in_b3
-
vmovdqu (0 * 32)(%rdx), RA0;
vmovdqu (1 * 32)(%rdx), RA1;
vmovdqu (2 * 32)(%rdx), RA2;
@@ -768,15 +718,6 @@ _gcry_serpent_avx2_cbc_dec:
call __serpent_dec_blk16;
- .set RA0, dec_out_a0
- .set RA1, dec_out_a1
- .set RA2, dec_out_a2
- .set RA3, dec_out_a3
- .set RB0, dec_out_b0
- .set RB1, dec_out_b1
- .set RB2, dec_out_b2
- .set RB3, dec_out_b3
-
vmovdqu (%rcx), RNOTx;
vinserti128 $1, (%rdx), RNOT, RNOT;
vpxor RNOT, RA0, RA0;
@@ -817,15 +758,6 @@ _gcry_serpent_avx2_cfb_dec:
vzeroupper;
- .set RA0, enc_in_a0
- .set RA1, enc_in_a1
- .set RA2, enc_in_a2
- .set RA3, enc_in_a3
- .set RB0, enc_in_b0
- .set RB1, enc_in_b1
- .set RB2, enc_in_b2
- .set RB3, enc_in_b3
-
/* Load input */
vmovdqu (%rcx), RNOTx;
vinserti128 $1, (%rdx), RNOT, RA0;
@@ -843,32 +775,23 @@ _gcry_serpent_avx2_cfb_dec:
call __serpent_enc_blk16;
- .set RA0, enc_out_a0
- .set RA1, enc_out_a1
- .set RA2, enc_out_a2
- .set RA3, enc_out_a3
- .set RB0, enc_out_b0
- .set RB1, enc_out_b1
- .set RB2, enc_out_b2
- .set RB3, enc_out_b3
-
- vpxor (0 * 32)(%rdx), RA0, RA0;
+ vpxor (0 * 32)(%rdx), RA4, RA4;
vpxor (1 * 32)(%rdx), RA1, RA1;
vpxor (2 * 32)(%rdx), RA2, RA2;
- vpxor (3 * 32)(%rdx), RA3, RA3;
- vpxor (4 * 32)(%rdx), RB0, RB0;
+ vpxor (3 * 32)(%rdx), RA0, RA0;
+ vpxor (4 * 32)(%rdx), RB4, RB4;
vpxor (5 * 32)(%rdx), RB1, RB1;
vpxor (6 * 32)(%rdx), RB2, RB2;
- vpxor (7 * 32)(%rdx), RB3, RB3;
+ vpxor (7 * 32)(%rdx), RB0, RB0;
- vmovdqu RA0, (0 * 32)(%rsi);
+ vmovdqu RA4, (0 * 32)(%rsi);
vmovdqu RA1, (1 * 32)(%rsi);
vmovdqu RA2, (2 * 32)(%rsi);
- vmovdqu RA3, (3 * 32)(%rsi);
- vmovdqu RB0, (4 * 32)(%rsi);
+ vmovdqu RA0, (3 * 32)(%rsi);
+ vmovdqu RB4, (4 * 32)(%rsi);
vmovdqu RB1, (5 * 32)(%rsi);
vmovdqu RB2, (6 * 32)(%rsi);
- vmovdqu RB3, (7 * 32)(%rsi);
+ vmovdqu RB0, (7 * 32)(%rsi);
vzeroall;
diff --git a/cipher/serpent-sse2-amd64.S b/cipher/serpent-sse2-amd64.S
index a5cf353..516126b 100644
--- a/cipher/serpent-sse2-amd64.S
+++ b/cipher/serpent-sse2-amd64.S
@@ -35,42 +35,27 @@
#define CTX %rdi
/* vector registers */
-.set RA0, %xmm0
-.set RA1, %xmm1
-.set RA2, %xmm2
-.set RA3, %xmm3
-.set RA4, %xmm4
-
-.set RB0, %xmm5
-.set RB1, %xmm6
-.set RB2, %xmm7
-.set RB3, %xmm8
-.set RB4, %xmm9
-
-.set RNOT, %xmm10
-.set RTMP0, %xmm11
-.set RTMP1, %xmm12
-.set RTMP2, %xmm13
+#define RA0 %xmm0
+#define RA1 %xmm1
+#define RA2 %xmm2
+#define RA3 %xmm3
+#define RA4 %xmm4
+
+#define RB0 %xmm5
+#define RB1 %xmm6
+#define RB2 %xmm7
+#define RB3 %xmm8
+#define RB4 %xmm9
+
+#define RNOT %xmm10
+#define RTMP0 %xmm11
+#define RTMP1 %xmm12
+#define RTMP2 %xmm13
/**********************************************************************
helper macros
**********************************************************************/
-/* preprocessor macro for renaming vector registers using GAS macros */
-#define sbox_reg_rename(r0, r1, r2, r3, r4, \
- new_r0, new_r1, new_r2, new_r3, new_r4) \
- .set rename_reg0, new_r0; \
- .set rename_reg1, new_r1; \
- .set rename_reg2, new_r2; \
- .set rename_reg3, new_r3; \
- .set rename_reg4, new_r4; \
- \
- .set r0, rename_reg0; \
- .set r1, rename_reg1; \
- .set r2, rename_reg2; \
- .set r3, rename_reg3; \
- .set r4, rename_reg4;
-
/* vector 32-bit rotation to left */
#define vec_rol(reg, nleft, tmp) \
movdqa reg, tmp; \
@@ -147,9 +132,7 @@
pxor r4, r2; pxor RNOT, r4; \
por r1, r4; pxor r3, r1; \
pxor r4, r1; por r0, r3; \
- pxor r3, r1; pxor r3, r4; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r1,r4,r2,r0,r3);
+ pxor r3, r1; pxor r3, r4;
#define SBOX0_INVERSE(r0, r1, r2, r3, r4) \
pxor RNOT, r2; movdqa r1, r4; \
@@ -162,9 +145,7 @@
pxor r1, r2; pxor r0, r3; \
pxor r1, r3; \
pand r3, r2; \
- pxor r2, r4; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r0,r4,r1,r3,r2);
+ pxor r2, r4;
#define SBOX1(r0, r1, r2, r3, r4) \
pxor RNOT, r0; pxor RNOT, r2; \
@@ -176,9 +157,7 @@
pand r4, r2; pxor r1, r0; \
pand r2, r1; \
pxor r0, r1; pand r2, r0; \
- pxor r4, r0; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r2,r0,r3,r1,r4);
+ pxor r4, r0;
#define SBOX1_INVERSE(r0, r1, r2, r3, r4) \
movdqa r1, r4; pxor r3, r1; \
@@ -191,9 +170,7 @@
pxor r1, r4; por r0, r1; \
pxor r0, r1; \
por r4, r1; \
- pxor r1, r3; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r4,r0,r3,r2,r1);
+ pxor r1, r3;
#define SBOX2(r0, r1, r2, r3, r4) \
movdqa r0, r4; pand r2, r0; \
@@ -203,9 +180,7 @@
movdqa r3, r1; por r4, r3; \
pxor r0, r3; pand r1, r0; \
pxor r0, r4; pxor r3, r1; \
- pxor r4, r1; pxor RNOT, r4; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r2,r3,r1,r4,r0);
+ pxor r4, r1; pxor RNOT, r4;
#define SBOX2_INVERSE(r0, r1, r2, r3, r4) \
pxor r3, r2; pxor r0, r3; \
@@ -217,9 +192,7 @@
por r0, r2; pxor RNOT, r3; \
pxor r3, r2; pxor r3, r0; \
pand r1, r0; pxor r4, r3; \
- pxor r0, r3; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r1,r4,r2,r3,r0);
+ pxor r0, r3;
#define SBOX3(r0, r1, r2, r3, r4) \
movdqa r0, r4; por r3, r0; \
@@ -231,9 +204,7 @@
pxor r2, r4; por r0, r1; \
pxor r2, r1; pxor r3, r0; \
movdqa r1, r2; por r3, r1; \
- pxor r0, r1; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r1,r2,r3,r4,r0);
+ pxor r0, r1;
#define SBOX3_INVERSE(r0, r1, r2, r3, r4) \
movdqa r2, r4; pxor r1, r2; \
@@ -245,9 +216,7 @@
pxor r1, r3; pxor r0, r1; \
por r2, r1; pxor r3, r0; \
pxor r4, r1; \
- pxor r1, r0; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r2,r1,r3,r0,r4);
+ pxor r1, r0;
#define SBOX4(r0, r1, r2, r3, r4) \
pxor r3, r1; pxor RNOT, r3; \
@@ -259,9 +228,7 @@
pxor r0, r3; por r1, r4; \
pxor r0, r4; por r3, r0; \
pxor r2, r0; pand r3, r2; \
- pxor RNOT, r0; pxor r2, r4; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r1,r4,r0,r3,r2);
+ pxor RNOT, r0; pxor r2, r4;
#define SBOX4_INVERSE(r0, r1, r2, r3, r4) \
movdqa r2, r4; pand r3, r2; \
@@ -274,9 +241,7 @@
pand r0, r2; pxor r0, r3; \
pxor r4, r2; \
por r3, r2; pxor r0, r3; \
- pxor r1, r2; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r0,r3,r2,r4,r1);
+ pxor r1, r2;
#define SBOX5(r0, r1, r2, r3, r4) \
pxor r1, r0; pxor r3, r1; \
@@ -288,9 +253,7 @@
pxor r2, r4; pxor r0, r2; \
pand r3, r0; pxor RNOT, r2; \
pxor r4, r0; por r3, r4; \
- pxor r4, r2; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r1,r3,r0,r2,r4);
+ pxor r4, r2;
#define SBOX5_INVERSE(r0, r1, r2, r3, r4) \
pxor RNOT, r1; movdqa r3, r4; \
@@ -302,9 +265,7 @@
pxor r3, r1; pxor r2, r4; \
pand r4, r3; pxor r1, r4; \
pxor r4, r3; pxor RNOT, r4; \
- pxor r0, r3; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r1,r4,r3,r2,r0);
+ pxor r0, r3;
#define SBOX6(r0, r1, r2, r3, r4) \
pxor RNOT, r2; movdqa r3, r4; \
@@ -316,9 +277,7 @@
pxor r2, r0; pxor r3, r4; \
pxor r0, r4; pxor RNOT, r3; \
pand r4, r2; \
- pxor r3, r2; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r0,r1,r4,r2,r3);
+ pxor r3, r2;
#define SBOX6_INVERSE(r0, r1, r2, r3, r4) \
pxor r2, r0; movdqa r2, r4; \
@@ -329,9 +288,7 @@
pxor r1, r4; pand r3, r1; \
pxor r0, r1; pxor r3, r0; \
por r2, r0; pxor r1, r3; \
- pxor r0, r4; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r1,r2,r4,r3,r0);
+ pxor r0, r4;
#define SBOX7(r0, r1, r2, r3, r4) \
movdqa r1, r4; por r2, r1; \
@@ -344,9 +301,7 @@
pxor r1, r2; pand r0, r1; \
pxor r4, r1; pxor RNOT, r2; \
por r0, r2; \
- pxor r2, r4; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r4,r3,r1,r0,r2);
+ pxor r2, r4;
#define SBOX7_INVERSE(r0, r1, r2, r3, r4) \
movdqa r2, r4; pxor r0, r2; \
@@ -358,9 +313,7 @@
por r2, r0; pxor r1, r4; \
pxor r3, r0; pxor r4, r3; \
por r0, r4; pxor r2, r3; \
- pxor r2, r4; \
- \
- sbox_reg_rename(r0,r1,r2,r3,r4, r3,r0,r1,r4,r2);
+ pxor r2, r4;
/* Apply SBOX number WHICH to the block. */
#define SBOX(which, r0, r1, r2, r3, r4) \
@@ -425,49 +378,51 @@
/* Apply a Serpent round to eight parallel blocks. The round number is
passed in as `round'; the macro does not modify it. */
-#define ROUND(which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
- BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \
- SBOX (which, a0, a1, a2, a3, a4); \
- BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \
- SBOX (which, b0, b1, b2, b3, b4); \
- LINEAR_TRANSFORMATION (a0, a1, a2, a3, a4); \
- LINEAR_TRANSFORMATION (b0, b1, b2, b3, b4); \
- .set round, (round + 1);
+#define ROUND(round, which, a0, a1, a2, a3, a4, na0, na1, na2, na3, na4, \
+ b0, b1, b2, b3, b4, nb0, nb1, nb2, nb3, nb4) \
+ BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \
+ SBOX (which, a0, a1, a2, a3, a4); \
+ BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \
+ SBOX (which, b0, b1, b2, b3, b4); \
+ LINEAR_TRANSFORMATION (na0, na1, na2, na3, na4); \
+ LINEAR_TRANSFORMATION (nb0, nb1, nb2, nb3, nb4);
/* Apply the last Serpent round to eight parallel blocks. Passes `round' and
then `round' + 1 to BLOCK_XOR_KEY; `round' itself is not modified. */
-#define ROUND_LAST(which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
- BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \
- SBOX (which, a0, a1, a2, a3, a4); \
- BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \
- SBOX (which, b0, b1, b2, b3, b4); \
- .set round, (round + 1); \
- BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \
- BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \
- .set round, (round + 1);
+#define ROUND_LAST(round, which, a0, a1, a2, a3, a4, na0, na1, na2, na3, na4, \
+ b0, b1, b2, b3, b4, nb0, nb1, nb2, nb3, nb4) \
+ BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \
+ SBOX (which, a0, a1, a2, a3, a4); \
+ BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \
+ SBOX (which, b0, b1, b2, b3, b4); \
+ BLOCK_XOR_KEY (na0, na1, na2, na3, na4, ((round) + 1)); \
+ BLOCK_XOR_KEY (nb0, nb1, nb2, nb3, nb4, ((round) + 1));
/* Apply an inverse Serpent round to eight parallel blocks. The round
number is passed in as `round'; the macro does not modify it. */
-#define ROUND_INVERSE(which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
+#define ROUND_INVERSE(round, which, a0, a1, a2, a3, a4, \
+ na0, na1, na2, na3, na4, \
+ b0, b1, b2, b3, b4, \
+ nb0, nb1, nb2, nb3, nb4) \
LINEAR_TRANSFORMATION_INVERSE (a0, a1, a2, a3, a4); \
LINEAR_TRANSFORMATION_INVERSE (b0, b1, b2, b3, b4); \
SBOX_INVERSE (which, a0, a1, a2, a3, a4); \
- BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \
+ BLOCK_XOR_KEY (na0, na1, na2, na3, na4, round); \
SBOX_INVERSE (which, b0, b1, b2, b3, b4); \
- BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \
- .set round, (round - 1);
+ BLOCK_XOR_KEY (nb0, nb1, nb2, nb3, nb4, round);
/* Apply the first inverse Serpent round to eight parallel blocks. Passes
`round' + 1 and then `round' to BLOCK_XOR_KEY; `round' is not modified. */
-#define ROUND_FIRST_INVERSE(which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
- BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \
- BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \
- .set round, (round - 1); \
+#define ROUND_FIRST_INVERSE(round, which, a0, a1, a2, a3, a4, \
+ na0, na1, na2, na3, na4, \
+ b0, b1, b2, b3, b4, \
+ nb0, nb1, nb2, nb3, nb4) \
+ BLOCK_XOR_KEY (a0, a1, a2, a3, a4, ((round) + 1)); \
+ BLOCK_XOR_KEY (b0, b1, b2, b3, b4, ((round) + 1)); \
SBOX_INVERSE (which, a0, a1, a2, a3, a4); \
- BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \
+ BLOCK_XOR_KEY (na0, na1, na2, na3, na4, round); \
SBOX_INVERSE (which, b0, b1, b2, b3, b4); \
- BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \
- .set round, (round - 1);
+ BLOCK_XOR_KEY (nb0, nb1, nb2, nb3, nb4, round);
.text
@@ -479,72 +434,82 @@ __serpent_enc_blk8:
* RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel plaintext
* blocks
* output:
- * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel
+ * RA4, RA1, RA2, RA0, RB4, RB1, RB2, RB0: eight parallel
* ciphertext blocks
*/
- /* record input vector names for __serpent_enc_blk8 */
- .set enc_in_a0, RA0
- .set enc_in_a1, RA1
- .set enc_in_a2, RA2
- .set enc_in_a3, RA3
- .set enc_in_b0, RB0
- .set enc_in_b1, RB1
- .set enc_in_b2, RB2
- .set enc_in_b3, RB3
-
pcmpeqd RNOT, RNOT;
transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1);
transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1);
- .set round, 0
- ROUND (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-
- ROUND_LAST (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-
- transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1);
- transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1);
-
- /* record output vector names for __serpent_enc_blk8 */
- .set enc_out_a0, RA0
- .set enc_out_a1, RA1
- .set enc_out_a2, RA2
- .set enc_out_a3, RA3
- .set enc_out_b0, RB0
- .set enc_out_b1, RB1
- .set enc_out_b2, RB2
- .set enc_out_b3, RB3
+ ROUND (0, 0, RA0, RA1, RA2, RA3, RA4, RA1, RA4, RA2, RA0, RA3,
+ RB0, RB1, RB2, RB3, RB4, RB1, RB4, RB2, RB0, RB3);
+ ROUND (1, 1, RA1, RA4, RA2, RA0, RA3, RA2, RA1, RA0, RA4, RA3,
+ RB1, RB4, RB2, RB0, RB3, RB2, RB1, RB0, RB4, RB3);
+ ROUND (2, 2, RA2, RA1, RA0, RA4, RA3, RA0, RA4, RA1, RA3, RA2,
+ RB2, RB1, RB0, RB4, RB3, RB0, RB4, RB1, RB3, RB2);
+ ROUND (3, 3, RA0, RA4, RA1, RA3, RA2, RA4, RA1, RA3, RA2, RA0,
+ RB0, RB4, RB1, RB3, RB2, RB4, RB1, RB3, RB2, RB0);
+ ROUND (4, 4, RA4, RA1, RA3, RA2, RA0, RA1, RA0, RA4, RA2, RA3,
+ RB4, RB1, RB3, RB2, RB0, RB1, RB0, RB4, RB2, RB3);
+ ROUND (5, 5, RA1, RA0, RA4, RA2, RA3, RA0, RA2, RA1, RA4, RA3,
+ RB1, RB0, RB4, RB2, RB3, RB0, RB2, RB1, RB4, RB3);
+ ROUND (6, 6, RA0, RA2, RA1, RA4, RA3, RA0, RA2, RA3, RA1, RA4,
+ RB0, RB2, RB1, RB4, RB3, RB0, RB2, RB3, RB1, RB4);
+ ROUND (7, 7, RA0, RA2, RA3, RA1, RA4, RA4, RA1, RA2, RA0, RA3,
+ RB0, RB2, RB3, RB1, RB4, RB4, RB1, RB2, RB0, RB3);
+ ROUND (8, 0, RA4, RA1, RA2, RA0, RA3, RA1, RA3, RA2, RA4, RA0,
+ RB4, RB1, RB2, RB0, RB3, RB1, RB3, RB2, RB4, RB0);
+ ROUND (9, 1, RA1, RA3, RA2, RA4, RA0, RA2, RA1, RA4, RA3, RA0,
+ RB1, RB3, RB2, RB4, RB0, RB2, RB1, RB4, RB3, RB0);
+ ROUND (10, 2, RA2, RA1, RA4, RA3, RA0, RA4, RA3, RA1, RA0, RA2,
+ RB2, RB1, RB4, RB3, RB0, RB4, RB3, RB1, RB0, RB2);
+ ROUND (11, 3, RA4, RA3, RA1, RA0, RA2, RA3, RA1, RA0, RA2, RA4,
+ RB4, RB3, RB1, RB0, RB2, RB3, RB1, RB0, RB2, RB4);
+ ROUND (12, 4, RA3, RA1, RA0, RA2, RA4, RA1, RA4, RA3, RA2, RA0,
+ RB3, RB1, RB0, RB2, RB4, RB1, RB4, RB3, RB2, RB0);
+ ROUND (13, 5, RA1, RA4, RA3, RA2, RA0, RA4, RA2, RA1, RA3, RA0,
+ RB1, RB4, RB3, RB2, RB0, RB4, RB2, RB1, RB3, RB0);
+ ROUND (14, 6, RA4, RA2, RA1, RA3, RA0, RA4, RA2, RA0, RA1, RA3,
+ RB4, RB2, RB1, RB3, RB0, RB4, RB2, RB0, RB1, RB3);
+ ROUND (15, 7, RA4, RA2, RA0, RA1, RA3, RA3, RA1, RA2, RA4, RA0,
+ RB4, RB2, RB0, RB1, RB3, RB3, RB1, RB2, RB4, RB0);
+ ROUND (16, 0, RA3, RA1, RA2, RA4, RA0, RA1, RA0, RA2, RA3, RA4,
+ RB3, RB1, RB2, RB4, RB0, RB1, RB0, RB2, RB3, RB4);
+ ROUND (17, 1, RA1, RA0, RA2, RA3, RA4, RA2, RA1, RA3, RA0, RA4,
+ RB1, RB0, RB2, RB3, RB4, RB2, RB1, RB3, RB0, RB4);
+ ROUND (18, 2, RA2, RA1, RA3, RA0, RA4, RA3, RA0, RA1, RA4, RA2,
+ RB2, RB1, RB3, RB0, RB4, RB3, RB0, RB1, RB4, RB2);
+ ROUND (19, 3, RA3, RA0, RA1, RA4, RA2, RA0, RA1, RA4, RA2, RA3,
+ RB3, RB0, RB1, RB4, RB2, RB0, RB1, RB4, RB2, RB3);
+ ROUND (20, 4, RA0, RA1, RA4, RA2, RA3, RA1, RA3, RA0, RA2, RA4,
+ RB0, RB1, RB4, RB2, RB3, RB1, RB3, RB0, RB2, RB4);
+ ROUND (21, 5, RA1, RA3, RA0, RA2, RA4, RA3, RA2, RA1, RA0, RA4,
+ RB1, RB3, RB0, RB2, RB4, RB3, RB2, RB1, RB0, RB4);
+ ROUND (22, 6, RA3, RA2, RA1, RA0, RA4, RA3, RA2, RA4, RA1, RA0,
+ RB3, RB2, RB1, RB0, RB4, RB3, RB2, RB4, RB1, RB0);
+ ROUND (23, 7, RA3, RA2, RA4, RA1, RA0, RA0, RA1, RA2, RA3, RA4,
+ RB3, RB2, RB4, RB1, RB0, RB0, RB1, RB2, RB3, RB4);
+ ROUND (24, 0, RA0, RA1, RA2, RA3, RA4, RA1, RA4, RA2, RA0, RA3,
+ RB0, RB1, RB2, RB3, RB4, RB1, RB4, RB2, RB0, RB3);
+ ROUND (25, 1, RA1, RA4, RA2, RA0, RA3, RA2, RA1, RA0, RA4, RA3,
+ RB1, RB4, RB2, RB0, RB3, RB2, RB1, RB0, RB4, RB3);
+ ROUND (26, 2, RA2, RA1, RA0, RA4, RA3, RA0, RA4, RA1, RA3, RA2,
+ RB2, RB1, RB0, RB4, RB3, RB0, RB4, RB1, RB3, RB2);
+ ROUND (27, 3, RA0, RA4, RA1, RA3, RA2, RA4, RA1, RA3, RA2, RA0,
+ RB0, RB4, RB1, RB3, RB2, RB4, RB1, RB3, RB2, RB0);
+ ROUND (28, 4, RA4, RA1, RA3, RA2, RA0, RA1, RA0, RA4, RA2, RA3,
+ RB4, RB1, RB3, RB2, RB0, RB1, RB0, RB4, RB2, RB3);
+ ROUND (29, 5, RA1, RA0, RA4, RA2, RA3, RA0, RA2, RA1, RA4, RA3,
+ RB1, RB0, RB4, RB2, RB3, RB0, RB2, RB1, RB4, RB3);
+ ROUND (30, 6, RA0, RA2, RA1, RA4, RA3, RA0, RA2, RA3, RA1, RA4,
+ RB0, RB2, RB1, RB4, RB3, RB0, RB2, RB3, RB1, RB4);
+ ROUND_LAST (31, 7, RA0, RA2, RA3, RA1, RA4, RA4, RA1, RA2, RA0, RA3,
+ RB0, RB2, RB3, RB1, RB4, RB4, RB1, RB2, RB0, RB3);
+
+ transpose_4x4(RA4, RA1, RA2, RA0, RA3, RTMP0, RTMP1);
+ transpose_4x4(RB4, RB1, RB2, RB0, RB3, RTMP0, RTMP1);
ret;
.size __serpent_enc_blk8,.-__serpent_enc_blk8;
@@ -561,69 +526,81 @@ __serpent_dec_blk8:
* blocks
*/
- /* record input vector names for __serpent_dec_blk8 */
- .set dec_in_a0, RA0
- .set dec_in_a1, RA1
- .set dec_in_a2, RA2
- .set dec_in_a3, RA3
- .set dec_in_b0, RB0
- .set dec_in_b1, RB1
- .set dec_in_b2, RB2
- .set dec_in_b3, RB3
-
pcmpeqd RNOT, RNOT;
transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1);
transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1);
- .set round, 32
- ROUND_FIRST_INVERSE (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
-
- ROUND_INVERSE (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (7, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (6, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (5, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (4, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (3, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (2, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (1, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
- ROUND_INVERSE (0, RA0, RA1, RA2, RA3, RA4, RB0, RB1, RB2, RB3, RB4);
+ ROUND_FIRST_INVERSE (31, 7, RA0, RA1, RA2, RA3, RA4,
+ RA3, RA0, RA1, RA4, RA2,
+ RB0, RB1, RB2, RB3, RB4,
+ RB3, RB0, RB1, RB4, RB2);
+ ROUND_INVERSE (30, 6, RA3, RA0, RA1, RA4, RA2, RA0, RA1, RA2, RA4, RA3,
+ RB3, RB0, RB1, RB4, RB2, RB0, RB1, RB2, RB4, RB3);
+ ROUND_INVERSE (29, 5, RA0, RA1, RA2, RA4, RA3, RA1, RA3, RA4, RA2, RA0,
+ RB0, RB1, RB2, RB4, RB3, RB1, RB3, RB4, RB2, RB0);
+ ROUND_INVERSE (28, 4, RA1, RA3, RA4, RA2, RA0, RA1, RA2, RA4, RA0, RA3,
+ RB1, RB3, RB4, RB2, RB0, RB1, RB2, RB4, RB0, RB3);
+ ROUND_INVERSE (27, 3, RA1, RA2, RA4, RA0, RA3, RA4, RA2, RA0, RA1, RA3,
+ RB1, RB2, RB4, RB0, RB3, RB4, RB2, RB0, RB1, RB3);
+ ROUND_INVERSE (26, 2, RA4, RA2, RA0, RA1, RA3, RA2, RA3, RA0, RA1, RA4,
+ RB4, RB2, RB0, RB1, RB3, RB2, RB3, RB0, RB1, RB4);
+ ROUND_INVERSE (25, 1, RA2, RA3, RA0, RA1, RA4, RA4, RA2, RA1, RA0, RA3,
+ RB2, RB3, RB0, RB1, RB4, RB4, RB2, RB1, RB0, RB3);
+ ROUND_INVERSE (24, 0, RA4, RA2, RA1, RA0, RA3, RA4, RA3, RA2, RA0, RA1,
+ RB4, RB2, RB1, RB0, RB3, RB4, RB3, RB2, RB0, RB1);
+ ROUND_INVERSE (23, 7, RA4, RA3, RA2, RA0, RA1, RA0, RA4, RA3, RA1, RA2,
+ RB4, RB3, RB2, RB0, RB1, RB0, RB4, RB3, RB1, RB2);
+ ROUND_INVERSE (22, 6, RA0, RA4, RA3, RA1, RA2, RA4, RA3, RA2, RA1, RA0,
+ RB0, RB4, RB3, RB1, RB2, RB4, RB3, RB2, RB1, RB0);
+ ROUND_INVERSE (21, 5, RA4, RA3, RA2, RA1, RA0, RA3, RA0, RA1, RA2, RA4,
+ RB4, RB3, RB2, RB1, RB0, RB3, RB0, RB1, RB2, RB4);
+ ROUND_INVERSE (20, 4, RA3, RA0, RA1, RA2, RA4, RA3, RA2, RA1, RA4, RA0,
+ RB3, RB0, RB1, RB2, RB4, RB3, RB2, RB1, RB4, RB0);
+ ROUND_INVERSE (19, 3, RA3, RA2, RA1, RA4, RA0, RA1, RA2, RA4, RA3, RA0,
+ RB3, RB2, RB1, RB4, RB0, RB1, RB2, RB4, RB3, RB0);
+ ROUND_INVERSE (18, 2, RA1, RA2, RA4, RA3, RA0, RA2, RA0, RA4, RA3, RA1,
+ RB1, RB2, RB4, RB3, RB0, RB2, RB0, RB4, RB3, RB1);
+ ROUND_INVERSE (17, 1, RA2, RA0, RA4, RA3, RA1, RA1, RA2, RA3, RA4, RA0,
+ RB2, RB0, RB4, RB3, RB1, RB1, RB2, RB3, RB4, RB0);
+ ROUND_INVERSE (16, 0, RA1, RA2, RA3, RA4, RA0, RA1, RA0, RA2, RA4, RA3,
+ RB1, RB2, RB3, RB4, RB0, RB1, RB0, RB2, RB4, RB3);
+ ROUND_INVERSE (15, 7, RA1, RA0, RA2, RA4, RA3, RA4, RA1, RA0, RA3, RA2,
+ RB1, RB0, RB2, RB4, RB3, RB4, RB1, RB0, RB3, RB2);
+ ROUND_INVERSE (14, 6, RA4, RA1, RA0, RA3, RA2, RA1, RA0, RA2, RA3, RA4,
+ RB4, RB1, RB0, RB3, RB2, RB1, RB0, RB2, RB3, RB4);
+ ROUND_INVERSE (13, 5, RA1, RA0, RA2, RA3, RA4, RA0, RA4, RA3, RA2, RA1,
+ RB1, RB0, RB2, RB3, RB4, RB0, RB4, RB3, RB2, RB1);
+ ROUND_INVERSE (12, 4, RA0, RA4, RA3, RA2, RA1, RA0, RA2, RA3, RA1, RA4,
+ RB0, RB4, RB3, RB2, RB1, RB0, RB2, RB3, RB1, RB4);
+ ROUND_INVERSE (11, 3, RA0, RA2, RA3, RA1, RA4, RA3, RA2, RA1, RA0, RA4,
+ RB0, RB2, RB3, RB1, RB4, RB3, RB2, RB1, RB0, RB4);
+ ROUND_INVERSE (10, 2, RA3, RA2, RA1, RA0, RA4, RA2, RA4, RA1, RA0, RA3,
+ RB3, RB2, RB1, RB0, RB4, RB2, RB4, RB1, RB0, RB3);
+ ROUND_INVERSE (9, 1, RA2, RA4, RA1, RA0, RA3, RA3, RA2, RA0, RA1, RA4,
+ RB2, RB4, RB1, RB0, RB3, RB3, RB2, RB0, RB1, RB4);
+ ROUND_INVERSE (8, 0, RA3, RA2, RA0, RA1, RA4, RA3, RA4, RA2, RA1, RA0,
+ RB3, RB2, RB0, RB1, RB4, RB3, RB4, RB2, RB1, RB0);
+ ROUND_INVERSE (7, 7, RA3, RA4, RA2, RA1, RA0, RA1, RA3, RA4, RA0, RA2,
+ RB3, RB4, RB2, RB1, RB0, RB1, RB3, RB4, RB0, RB2);
+ ROUND_INVERSE (6, 6, RA1, RA3, RA4, RA0, RA2, RA3, RA4, RA2, RA0, RA1,
+ RB1, RB3, RB4, RB0, RB2, RB3, RB4, RB2, RB0, RB1);
+ ROUND_INVERSE (5, 5, RA3, RA4, RA2, RA0, RA1, RA4, RA1, RA0, RA2, RA3,
+ RB3, RB4, RB2, RB0, RB1, RB4, RB1, RB0, RB2, RB3);
+ ROUND_INVERSE (4, 4, RA4, RA1, RA0, RA2, RA3, RA4, RA2, RA0, RA3, RA1,
+ RB4, RB1, RB0, RB2, RB3, RB4, RB2, RB0, RB3, RB1);
+ ROUND_INVERSE (3, 3, RA4, RA2, RA0, RA3, RA1, RA0, RA2, RA3, RA4, RA1,
+ RB4, RB2, RB0, RB3, RB1, RB0, RB2, RB3, RB4, RB1);
+ ROUND_INVERSE (2, 2, RA0, RA2, RA3, RA4, RA1, RA2, RA1, RA3, RA4, RA0,
+ RB0, RB2, RB3, RB4, RB1, RB2, RB1, RB3, RB4, RB0);
+ ROUND_INVERSE (1, 1, RA2, RA1, RA3, RA4, RA0, RA0, RA2, RA4, RA3, RA1,
+ RB2, RB1, RB3, RB4, RB0, RB0, RB2, RB4, RB3, RB1);
+ ROUND_INVERSE (0, 0, RA0, RA2, RA4, RA3, RA1, RA0, RA1, RA2, RA3, RA4,
+ RB0, RB2, RB4, RB3, RB1, RB0, RB1, RB2, RB3, RB4);
transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1);
transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1);
- /* record output vector names for __serpent_dec_blk8 */
- .set dec_out_a0, RA0
- .set dec_out_a1, RA1
- .set dec_out_a2, RA2
- .set dec_out_a3, RA3
- .set dec_out_b0, RB0
- .set dec_out_b1, RB1
- .set dec_out_b2, RB2
- .set dec_out_b3, RB3
-
ret;
.size __serpent_dec_blk8,.-__serpent_dec_blk8;
@@ -638,15 +615,6 @@ _gcry_serpent_sse2_ctr_enc:
* %rcx: iv (big endian, 128bit)
*/
- .set RA0, enc_in_a0
- .set RA1, enc_in_a1
- .set RA2, enc_in_a2
- .set RA3, enc_in_a3
- .set RB0, enc_in_b0
- .set RB1, enc_in_b1
- .set RB2, enc_in_b2
- .set RB3, enc_in_b3
-
/* load IV and byteswap */
movdqu (%rcx), RA0;
movdqa RA0, RTMP0;
@@ -729,42 +697,35 @@ _gcry_serpent_sse2_ctr_enc:
call __serpent_enc_blk8;
- .set RA0, enc_out_a0
- .set RA1, enc_out_a1
- .set RA2, enc_out_a2
- .set RA3, enc_out_a3
- .set RB0, enc_out_b0
- .set RB1, enc_out_b1
- .set RB2, enc_out_b2
- .set RB3, enc_out_b3
-
- pxor_u((0 * 16)(%rdx), RA0, RTMP0);
+ pxor_u((0 * 16)(%rdx), RA4, RTMP0);
pxor_u((1 * 16)(%rdx), RA1, RTMP0);
pxor_u((2 * 16)(%rdx), RA2, RTMP0);
- pxor_u((3 * 16)(%rdx), RA3, RTMP0);
- pxor_u((4 * 16)(%rdx), RB0, RTMP0);
+ pxor_u((3 * 16)(%rdx), RA0, RTMP0);
+ pxor_u((4 * 16)(%rdx), RB4, RTMP0);
pxor_u((5 * 16)(%rdx), RB1, RTMP0);
pxor_u((6 * 16)(%rdx), RB2, RTMP0);
- pxor_u((7 * 16)(%rdx), RB3, RTMP0);
+ pxor_u((7 * 16)(%rdx), RB0, RTMP0);
- movdqu RA0, (0 * 16)(%rsi);
+ movdqu RA4, (0 * 16)(%rsi);
movdqu RA1, (1 * 16)(%rsi);
movdqu RA2, (2 * 16)(%rsi);
- movdqu RA3, (3 * 16)(%rsi);
- movdqu RB0, (4 * 16)(%rsi);
+ movdqu RA0, (3 * 16)(%rsi);
+ movdqu RB4, (4 * 16)(%rsi);
movdqu RB1, (5 * 16)(%rsi);
movdqu RB2, (6 * 16)(%rsi);
- movdqu RB3, (7 * 16)(%rsi);
+ movdqu RB0, (7 * 16)(%rsi);
/* clear the used registers */
pxor RA0, RA0;
pxor RA1, RA1;
pxor RA2, RA2;
pxor RA3, RA3;
+ pxor RA4, RA4;
pxor RB0, RB0;
pxor RB1, RB1;
pxor RB2, RB2;
pxor RB3, RB3;
+ pxor RB4, RB4;
pxor RTMP0, RTMP0;
pxor RTMP1, RTMP1;
pxor RTMP2, RTMP2;
@@ -784,15 +745,6 @@ _gcry_serpent_sse2_cbc_dec:
* %rcx: iv
*/
- .set RA0, dec_in_a0
- .set RA1, dec_in_a1
- .set RA2, dec_in_a2
- .set RA3, dec_in_a3
- .set RB0, dec_in_b0
- .set RB1, dec_in_b1
- .set RB2, dec_in_b2
- .set RB3, dec_in_b3
-
movdqu (0 * 16)(%rdx), RA0;
movdqu (1 * 16)(%rdx), RA1;
movdqu (2 * 16)(%rdx), RA2;
@@ -804,15 +756,6 @@ _gcry_serpent_sse2_cbc_dec:
call __serpent_dec_blk8;
- .set RA0, dec_out_a0
- .set RA1, dec_out_a1
- .set RA2, dec_out_a2
- .set RA3, dec_out_a3
- .set RB0, dec_out_b0
- .set RB1, dec_out_b1
- .set RB2, dec_out_b2
- .set RB3, dec_out_b3
-
movdqu (7 * 16)(%rdx), RNOT;
pxor_u((%rcx), RA0, RTMP0);
pxor_u((0 * 16)(%rdx), RA1, RTMP0);
@@ -838,10 +781,12 @@ _gcry_serpent_sse2_cbc_dec:
pxor RA1, RA1;
pxor RA2, RA2;
pxor RA3, RA3;
+ pxor RA4, RA4;
pxor RB0, RB0;
pxor RB1, RB1;
pxor RB2, RB2;
pxor RB3, RB3;
+ pxor RB4, RB4;
pxor RTMP0, RTMP0;
pxor RTMP1, RTMP1;
pxor RTMP2, RTMP2;
@@ -861,15 +806,6 @@ _gcry_serpent_sse2_cfb_dec:
* %rcx: iv
*/
- .set RA0, enc_in_a0
- .set RA1, enc_in_a1
- .set RA2, enc_in_a2
- .set RA3, enc_in_a3
- .set RB0, enc_in_b0
- .set RB1, enc_in_b1
- .set RB2, enc_in_b2
- .set RB3, enc_in_b3
-
/* Load input */
movdqu (%rcx), RA0;
movdqu 0 * 16(%rdx), RA1;
@@ -886,42 +822,35 @@ _gcry_serpent_sse2_cfb_dec:
call __serpent_enc_blk8;
- .set RA0, enc_out_a0
- .set RA1, enc_out_a1
- .set RA2, enc_out_a2
- .set RA3, enc_out_a3
- .set RB0, enc_out_b0
- .set RB1, enc_out_b1
- .set RB2, enc_out_b2
- .set RB3, enc_out_b3
-
- pxor_u((0 * 16)(%rdx), RA0, RTMP0);
+ pxor_u((0 * 16)(%rdx), RA4, RTMP0);
pxor_u((1 * 16)(%rdx), RA1, RTMP0);
pxor_u((2 * 16)(%rdx), RA2, RTMP0);
- pxor_u((3 * 16)(%rdx), RA3, RTMP0);
- pxor_u((4 * 16)(%rdx), RB0, RTMP0);
+ pxor_u((3 * 16)(%rdx), RA0, RTMP0);
+ pxor_u((4 * 16)(%rdx), RB4, RTMP0);
pxor_u((5 * 16)(%rdx), RB1, RTMP0);
pxor_u((6 * 16)(%rdx), RB2, RTMP0);
- pxor_u((7 * 16)(%rdx), RB3, RTMP0);
+ pxor_u((7 * 16)(%rdx), RB0, RTMP0);
- movdqu RA0, (0 * 16)(%rsi);
+ movdqu RA4, (0 * 16)(%rsi);
movdqu RA1, (1 * 16)(%rsi);
movdqu RA2, (2 * 16)(%rsi);
- movdqu RA3, (3 * 16)(%rsi);
- movdqu RB0, (4 * 16)(%rsi);
+ movdqu RA0, (3 * 16)(%rsi);
+ movdqu RB4, (4 * 16)(%rsi);
movdqu RB1, (5 * 16)(%rsi);
movdqu RB2, (6 * 16)(%rsi);
- movdqu RB3, (7 * 16)(%rsi);
+ movdqu RB0, (7 * 16)(%rsi);
/* clear the used registers */
pxor RA0, RA0;
pxor RA1, RA1;
pxor RA2, RA2;
pxor RA3, RA3;
+ pxor RA4, RA4;
pxor RB0, RB0;
pxor RB1, RB1;
pxor RB2, RB2;
pxor RB3, RB3;
+ pxor RB4, RB4;
pxor RTMP0, RTMP0;
pxor RTMP1, RTMP1;
pxor RTMP2, RTMP2;
diff --git a/configure.ac b/configure.ac
index 1460dfd..8fb14e2 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1034,17 +1034,12 @@ if test $amd64_as_feature_detection = yes; then
[gcry_cv_gcc_amd64_platform_as_ok=no
AC_COMPILE_IFELSE([AC_LANG_SOURCE(
[[__asm__(
- /* Test if '.set' is supported by underlying assembler. */
- ".set a0, %rax\n\t"
- ".set b0, %rdx\n\t"
- "asmfunc:\n\t"
- "movq a0, b0;\n\t" /* Fails here if .set ignored by as. */
-
/* Test if '.type' and '.size' are supported. */
/* These work only on ELF targets. */
/* TODO: add COFF (mingw64, cygwin64) support to assembly
* implementations. Mingw64/cygwin64 also require additional
* work because they use different calling convention. */
+ "asmfunc:\n\t"
".size asmfunc,.-asmfunc;\n\t"
".type asmfunc,@function;\n\t"
);]])],
-----------------------------------------------------------------------
Summary of changes:
cipher/Makefile.am | 2 +-
cipher/serpent-avx2-amd64.S | 519 ++++++++++++++++++-------------------------
cipher/serpent-sse2-amd64.S | 507 ++++++++++++++++++------------------------
cipher/twofish-armv6.S | 365 ++++++++++++++++++++++++++++++
cipher/twofish.c | 88 +++++---
configure.ac | 11 +-
mpi/longlong.h | 27 ++-
7 files changed, 885 insertions(+), 634 deletions(-)
create mode 100644 cipher/twofish-armv6.S
hooks/post-receive
--
The GNU crypto library
http://git.gnupg.org
More information about the Gnupg-commits
mailing list