[PATCH] Add AMD64 assembly implementation for arcfour
Jussi Kivilinna
jussi.kivilinna at iki.fi
Fri Dec 20 15:05:45 CET 2013
* cipher/Makefile.am: Add 'arcfour-amd64.S'.
* cipher/arcfour-amd64.S: New.
* cipher/arcfour.c (USE_AMD64_ASM): New.
[USE_AMD64_ASM] (ARCFOUR_context, _gcry_arcfour_amd64)
(encrypt_stream): New.
* configure.ac [host=x86_64]: Add 'arcfour-amd64.lo'.
--
Patch adds Marc Bevand's public-domain AMD64 assembly implementation of RC4 to
libgcrypt. Original implementation is at:
http://www.zorinaq.com/papers/rc4-amd64.html
Benchmarks on Intel i5-4570 (3200 MHz):
New:
ARCFOUR | nanosecs/byte mebibytes/sec cycles/byte
STREAM enc | 1.29 ns/B 737.7 MiB/s 4.14 c/B
STREAM dec | 1.31 ns/B 730.6 MiB/s 4.18 c/B
Old (C-language):
ARCFOUR | nanosecs/byte mebibytes/sec cycles/byte
STREAM enc | 2.09 ns/B 457.4 MiB/s 6.67 c/B
STREAM dec | 2.09 ns/B 457.2 MiB/s 6.68 c/B
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
cipher/Makefile.am | 2 +-
cipher/arcfour-amd64.S | 97 ++++++++++++++++++++++++++++++++++++++++++++++++
cipher/arcfour.c | 28 ++++++++++++++
configure.ac | 7 +++
4 files changed, 133 insertions(+), 1 deletion(-)
create mode 100644 cipher/arcfour-amd64.S
diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 98c6254..15400e5 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -56,7 +56,7 @@ dsa-common.c rsa-common.c \
rmd.h
EXTRA_libcipher_la_SOURCES = \
-arcfour.c \
+arcfour.c arcfour-amd64.S \
blowfish.c blowfish-amd64.S \
cast5.c cast5-amd64.S cast5-arm.S \
crc.c \
diff --git a/cipher/arcfour-amd64.S b/cipher/arcfour-amd64.S
new file mode 100644
index 0000000..c32cd6f
--- /dev/null
+++ b/cipher/arcfour-amd64.S
@@ -0,0 +1,97 @@
+/*
+** RC4 implementation optimized for AMD64.
+**
+** Author: Marc Bevand <bevand_m (at) epita.fr>
+** Licence: I hereby disclaim the copyright on this code and place it
+** in the public domain.
+**
+** The throughput achieved by this code is about 320 MBytes/sec, on
+** a 1.8 GHz AMD Opteron (rev C0) processor.
+**
+** 2013/12/20 <jussi.kivilinna at iki.fi>:
+** - Integrated to libgcrypt
+** - 4.18 cycles/byte on Intel i5-4570
+*/
+
+#ifdef __x86_64__
+#include <config.h>
+#if defined(USE_ARCFOUR) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)
+
+.text
+.align 16
+.globl _gcry_arcfour_amd64
+.type _gcry_arcfour_amd64,@function
+_gcry_arcfour_amd64:
+ push %rbp # saved here, restored before ret
+ push %rbx # saved here, restored before ret
+ mov %rdi, %rbp # key = ARG(key)
+ mov %rsi, %rbx # rbx = ARG(len)
+ mov %rdx, %rsi # in = ARG(in)
+ mov %rcx, %rdi # out = ARG(out)
+ mov (4*256)(%rbp), %ecx # x = key->x
+ mov (4*256+4)(%rbp),%edx # y = key->y
+ inc %rcx # x++
+ and $255, %rcx # x &= 0xff
+ lea -8(%rbx,%rsi), %rbx # rbx = in+len-8
+ mov %rbx, %r9 # tmp = in+len-8
+ mov (%rbp,%rcx,4), %eax # tx = d[x]
+ cmp %rsi, %rbx # cmp in with in+len-8
+ jl .Lend # jump if (in+len-8 < in)
+
+.Lstart:
+ add $8, %rsi # increment in
+ add $8, %rdi # increment out
+
+ # generate the next 8 bytes of the rc4 stream into %r8
+ mov $8, %r11 # byte counter
+1: add %al, %dl # y += tx
+ mov (%rbp,%rdx,4), %ebx # ty = d[y]
+ mov %ebx, (%rbp,%rcx,4) # d[x] = ty
+ add %al, %bl # val = ty + tx
+ mov %eax, (%rbp,%rdx,4) # d[y] = tx
+ inc %cl # x++ (NEXT ROUND)
+ mov (%rbp,%rcx,4), %eax # tx = d[x] (NEXT ROUND)
+ shl $8, %r8
+ movb (%rbp,%rbx,4), %r8b # val = d[val]
+ dec %r11b
+ jnz 1b
+
+ # xor 8 bytes
+ bswap %r8
+ xor -8(%rsi), %r8
+ cmp %r9, %rsi # cmp in+len-8 with in
+ mov %r8, -8(%rdi)
+ jle .Lstart # jump if (in <= in+len-8)
+
+.Lend:
+ add $8, %r9 # tmp = in+len
+
+ # handle the last bytes, one by one
+1: cmp %rsi, %r9 # cmp in with in+len
+ jle .Lfinished # jump if (in+len <= in)
+ add %al, %dl # y += tx
+ mov (%rbp,%rdx,4), %ebx # ty = d[y]
+ mov %ebx, (%rbp,%rcx,4) # d[x] = ty
+ add %al, %bl # val = ty + tx
+ mov %eax, (%rbp,%rdx,4) # d[y] = tx
+ inc %cl # x++ (NEXT ROUND)
+ mov (%rbp,%rcx,4), %eax # tx = d[x] (NEXT ROUND)
+ movb (%rbp,%rbx,4), %r8b # val = d[val]
+ xor (%rsi), %r8b # xor 1 byte
+ movb %r8b, (%rdi)
+ inc %rsi # in++
+ inc %rdi # out++
+ jmp 1b
+
+.Lfinished:
+ dec %rcx # x-- (undo the look-ahead increment)
+ movb %cl, (4*256)(%rbp) # key->x = x (same slot x was loaded from)
+ movb %dl, (4*256+4)(%rbp) # key->y = y (same slot y was loaded from)
+ pop %rbx
+ pop %rbp
+ ret
+.L__gcry_arcfour_amd64_end:
+.size _gcry_arcfour_amd64,.L__gcry_arcfour_amd64_end-_gcry_arcfour_amd64
+
+#endif
+#endif
diff --git a/cipher/arcfour.c b/cipher/arcfour.c
index d692c84..7488637 100644
--- a/cipher/arcfour.c
+++ b/cipher/arcfour.c
@@ -31,8 +31,34 @@
#include "g10lib.h"
#include "cipher.h"
+/* USE_AMD64_ASM selects the AMD64 assembly code (x86-64 with a compatible assembler). */
+#undef USE_AMD64_ASM
+#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)
+# define USE_AMD64_ASM 1
+#endif
+
static const char *selftest(void);
+#ifdef USE_AMD64_ASM
+
+typedef struct {
+ u32 sbox[256]; /* the assembly indexes this table with a scale of 4 */
+ u32 idx_i, idx_j; /* accessed by the assembly at byte offsets 4*256 and 4*256+4 */
+} ARCFOUR_context;
+
+void _gcry_arcfour_amd64(void *key, size_t len, const byte *indata,
+ byte *outdata); /* defined in arcfour-amd64.S */
+
+static void
+encrypt_stream (void *context,
+ byte *outbuf, const byte *inbuf, size_t length)
+{
+ /* Argument order of the assembly routine: context, length, input, output. */
+ _gcry_arcfour_amd64 (context, length, inbuf, outbuf);
+}
+
+#else /*!USE_AMD64_ASM*/
+
typedef struct {
byte sbox[256];
int idx_i, idx_j;
@@ -96,6 +122,8 @@ encrypt_stream (void *context,
_gcry_burn_stack (64);
}
+#endif /*!USE_AMD64_ASM*/
+
static gcry_err_code_t
do_arcfour_setkey (void *context, const byte *key, unsigned int keylen)
diff --git a/configure.ac b/configure.ac
index 27de850..d97dd33 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1505,6 +1505,13 @@ LIST_MEMBER(arcfour, $enabled_ciphers)
if test "$found" = "1"; then
GCRYPT_CIPHERS="$GCRYPT_CIPHERS arcfour.lo"
AC_DEFINE(USE_ARCFOUR, 1, [Defined if this module should be included])
+
+ case "${host}" in
+ x86_64-*-*)
+ # Build with the AMD64 assembly implementation of arcfour
+ GCRYPT_CIPHERS="$GCRYPT_CIPHERS arcfour-amd64.lo"
+ ;;
+ esac
fi
LIST_MEMBER(blowfish, $enabled_ciphers)
More information about the Gcrypt-devel
mailing list