[PATCH] Add AMD64 assembly implementation for arcfour

Jussi Kivilinna jussi.kivilinna at iki.fi
Fri Dec 20 15:05:45 CET 2013


* cipher/Makefile.am: Add 'arcfour-amd64.S'.
* cipher/arcfour-amd64.S: New.
* cipher/arcfour.c (USE_AMD64_ASM): New.
[USE_AMD64_ASM] (ARCFOUR_context, _gcry_arcfour_amd64)
(encrypt_stream): New.
* configure.ac [host=x86_64]: Add 'arcfour-amd64.lo'.
--

This patch adds Marc Bevand's public-domain AMD64 assembly implementation of RC4 to
libgcrypt. The original implementation is at:
  http://www.zorinaq.com/papers/rc4-amd64.html

Benchmarks on Intel i5-4570 (3200 MHz):

New:
 ARCFOUR        |  nanosecs/byte   mebibytes/sec   cycles/byte
     STREAM enc |      1.29 ns/B     737.7 MiB/s      4.14 c/B
     STREAM dec |      1.31 ns/B     730.6 MiB/s      4.18 c/B

Old (C-language):
 ARCFOUR        |  nanosecs/byte   mebibytes/sec   cycles/byte
     STREAM enc |      2.09 ns/B     457.4 MiB/s      6.67 c/B
     STREAM dec |      2.09 ns/B     457.2 MiB/s      6.68 c/B

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 cipher/Makefile.am     |    2 -
 cipher/arcfour-amd64.S |   97 ++++++++++++++++++++++++++++++++++++++++++++++++
 cipher/arcfour.c       |   28 ++++++++++++++
 configure.ac           |    7 +++
 4 files changed, 133 insertions(+), 1 deletion(-)
 create mode 100644 cipher/arcfour-amd64.S

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 98c6254..15400e5 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -56,7 +56,7 @@ dsa-common.c rsa-common.c \
 rmd.h
 
 EXTRA_libcipher_la_SOURCES = \
-arcfour.c \
+arcfour.c arcfour-amd64.S \
 blowfish.c blowfish-amd64.S \
 cast5.c cast5-amd64.S cast5-arm.S \
 crc.c \
diff --git a/cipher/arcfour-amd64.S b/cipher/arcfour-amd64.S
new file mode 100644
index 0000000..c32cd6f
--- /dev/null
+++ b/cipher/arcfour-amd64.S
@@ -0,0 +1,97 @@
+/*
+** RC4 implementation optimized for AMD64.
+**
+** Author: Marc Bevand <bevand_m (at) epita.fr>
+** Licence: I hereby disclaim the copyright on this code and place it
+** in the public domain.
+**
+** The throughput achieved by this code is about 320 MBytes/sec, on
+** a 1.8 GHz AMD Opteron (rev C0) processor.
+**
+** 2013/12/20 <jussi.kivilinna at iki.fi>:
+**  - Integrated to libgcrypt
+**  - 4.18 cycles/byte on Intel i5-4570
+*/
+
+#ifdef __x86_64__
+#include <config.h>
+#if defined(USE_ARCFOUR) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)
+
+.text
+.align 16
+.globl _gcry_arcfour_amd64
+.type _gcry_arcfour_amd64,@function
+_gcry_arcfour_amd64:	/* args: %rdi=key, %rsi=len, %rdx=in, %rcx=out */
+	push	%rbp
+	push	%rbx
+	mov	%rdi,		%rbp	# key = ARG(key)
+	mov	%rsi,		%rbx	# rbx = ARG(len)
+	mov	%rdx,		%rsi	# in = ARG(in)
+	mov	%rcx,		%rdi	# out = ARG(out)
+	mov	(4*256)(%rbp),	%ecx	# x = key->x
+	mov	(4*256+4)(%rbp),%edx	# y = key->y
+	inc	%rcx			# x++
+	and	$255,		%rcx	# x &= 0xff
+	lea	-8(%rbx,%rsi),	%rbx	# rbx = in+len-8
+	mov	%rbx,		%r9	# tmp = in+len-8
+	mov	(%rbp,%rcx,4),	%eax	# tx = d[x]
+	cmp	%rsi,		%rbx	# cmp in with in+len-8
+	jl	.Lend			# jump if (in+len-8 < in)
+
+.Lstart:
+	add	$8,		%rsi		# increment in
+	add	$8,		%rdi		# increment out
+
+	# generate the next 8 bytes of the rc4 stream into %r8
+	mov	$8,		%r11		# byte counter
+1:	add	%al,		%dl		# y += tx
+	mov	(%rbp,%rdx,4),	%ebx		# ty = d[y]
+	mov	%ebx,		(%rbp,%rcx,4)	# d[x] = ty
+	add	%al,		%bl		# val = ty + tx
+	mov	%eax,		(%rbp,%rdx,4)	# d[y] = tx
+	inc	%cl				# x++		(NEXT ROUND)
+	mov	(%rbp,%rcx,4),	%eax		# tx = d[x]	(NEXT ROUND)
+	shl	$8,		%r8
+	movb	(%rbp,%rbx,4),	%r8b		# val = d[val]
+	dec	%r11b
+	jnz 1b
+
+	# xor 8 bytes
+	bswap	%r8
+	xor	-8(%rsi),	%r8
+	cmp	%r9,		%rsi		# cmp in+len-8 with in
+	mov	%r8,		-8(%rdi)
+	jle	.Lstart				# jump if (in <= in+len-8)
+
+.Lend:
+	add	$8,		%r9		# tmp = in+len
+
+	# handle the last bytes, one by one
+1:	cmp	%rsi,		%r9		# cmp in with in+len
+	jle	.Lfinished			# jump if (in+len <= in)
+	add	%al,		%dl		# y += tx
+	mov	(%rbp,%rdx,4),	%ebx		# ty = d[y]
+	mov	%ebx,		(%rbp,%rcx,4)	# d[x] = ty
+	add	%al,		%bl		# val = ty + tx
+	mov	%eax,		(%rbp,%rdx,4)	# d[y] = tx
+	inc	%cl				# x++		(NEXT ROUND)
+	mov	(%rbp,%rcx,4),	%eax		# tx = d[x]	(NEXT ROUND)
+	movb	(%rbp,%rbx,4),	%r8b		# val = d[val]
+	xor	(%rsi),		%r8b		# xor 1 byte
+	movb	%r8b,		(%rdi)
+	inc	%rsi				# in++
+	inc	%rdi				# out++
+	jmp 1b
+
+.Lfinished:
+	dec	%rcx				# x-- (undo the look-ahead increment)
+	movb	%cl,		(4*256)(%rbp)	# key->x = x (same slot x was loaded from)
+	movb	%dl,		(4*256+4)(%rbp)	# key->y = y (same slot y was loaded from)
+	pop	%rbx
+	pop	%rbp
+	ret
+.L__gcry_arcfour_amd64_end:
+.size _gcry_arcfour_amd64,.L__gcry_arcfour_amd64_end-_gcry_arcfour_amd64
+
+#endif
+#endif
diff --git a/cipher/arcfour.c b/cipher/arcfour.c
index d692c84..7488637 100644
--- a/cipher/arcfour.c
+++ b/cipher/arcfour.c
@@ -31,8 +31,34 @@
 #include "g10lib.h"
 #include "cipher.h"
 
+/* USE_AMD64_ASM indicates whether to use AMD64 assembly code. */
+#undef USE_AMD64_ASM
+#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)
+# define USE_AMD64_ASM 1
+#endif
+
 static const char *selftest(void);
 
+#ifdef USE_AMD64_ASM
+
+typedef struct {
+    u32 sbox[256];      /* 32-bit entries: arcfour-amd64.S indexes this table with scale 4 */
+    u32 idx_i, idx_j;   /* arcfour-amd64.S accesses these at byte offsets 4*256 and 4*256+4 */
+} ARCFOUR_context;
+
+void _gcry_arcfour_amd64(void *key, size_t len, const byte *indata,
+			 byte *outdata);
+
+/* Process LENGTH bytes from INBUF into OUTBUF via the AMD64 assembly; CONTEXT is passed through. */
+static void
+encrypt_stream (void *context,
+                byte *outbuf, const byte *inbuf, size_t length)
+{
+  _gcry_arcfour_amd64 (context, length, inbuf, outbuf );
+}
+
+#else /*!USE_AMD64_ASM*/
+
 typedef struct {
     byte sbox[256];
     int idx_i, idx_j;
@@ -96,6 +122,8 @@ encrypt_stream (void *context,
   _gcry_burn_stack (64);
 }
 
+#endif /*!USE_AMD64_ASM*/
+
 
 static gcry_err_code_t
 do_arcfour_setkey (void *context, const byte *key, unsigned int keylen)
diff --git a/configure.ac b/configure.ac
index 27de850..d97dd33 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1505,6 +1505,13 @@ LIST_MEMBER(arcfour, $enabled_ciphers)
 if test "$found" = "1"; then
    GCRYPT_CIPHERS="$GCRYPT_CIPHERS arcfour.lo"
    AC_DEFINE(USE_ARCFOUR, 1, [Defined if this module should be included])
+
+   case "${host}" in
+      x86_64-*-*)
+         # Build with the assembly implementation
+         GCRYPT_CIPHERS="$GCRYPT_CIPHERS arcfour-amd64.lo"
+      ;;
+   esac
 fi
 
 LIST_MEMBER(blowfish, $enabled_ciphers)




More information about the Gcrypt-devel mailing list