[PATCH 3/3] Add Aarch64 implementation of Twofish

Jussi Kivilinna jussi.kivilinna at iki.fi
Thu Mar 24 17:54:16 CET 2016


* cipher/Makefile.am: Add 'twofish-aarch64.S'.
* cipher/twofish-aarch64.S: New.
* cipher/twofish.c: Enable USE_ARM_ASM if __AARCH64EL__ and
HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS defined.
* configure.ac [host=aarch64]: Add 'twofish-aarch64.lo'.
--

This patch adds an ARMv8/AArch64 assembly implementation of Twofish.

Benchmark on Cortex-A53 (1536 MHz):

 Before:

 TWOFISH128     |  nanosecs/byte   mebibytes/sec   cycles/byte
        ECB enc |     20.62 ns/B     46.25 MiB/s     31.67 c/B
        ECB dec |     19.77 ns/B     48.24 MiB/s     30.37 c/B
        CBC enc |     21.48 ns/B     44.40 MiB/s     32.99 c/B
        CBC dec |     19.65 ns/B     48.53 MiB/s     30.18 c/B
        CFB enc |     21.40 ns/B     44.57 MiB/s     32.87 c/B
        CFB dec |     20.54 ns/B     46.42 MiB/s     31.56 c/B
        OFB enc |     21.29 ns/B     44.80 MiB/s     32.70 c/B
        OFB dec |     21.28 ns/B     44.82 MiB/s     32.69 c/B
        CTR enc |     20.68 ns/B     46.12 MiB/s     31.76 c/B
        CTR dec |     20.68 ns/B     46.12 MiB/s     31.76 c/B
        CCM enc |     41.47 ns/B     22.99 MiB/s     63.71 c/B
        CCM dec |     41.47 ns/B     23.00 MiB/s     63.70 c/B
       CCM auth |     20.84 ns/B     45.77 MiB/s     32.01 c/B
        GCM enc |     32.33 ns/B     29.50 MiB/s     49.66 c/B
        GCM dec |     32.33 ns/B     29.50 MiB/s     49.66 c/B
       GCM auth |     11.69 ns/B     81.55 MiB/s     17.96 c/B
        OCB enc |     22.17 ns/B     43.02 MiB/s     34.05 c/B
        OCB dec |     21.27 ns/B     44.84 MiB/s     32.67 c/B
       OCB auth |     21.73 ns/B     43.88 MiB/s     33.38 c/B
                =

 After (~1.3x faster):

 TWOFISH128     |  nanosecs/byte   mebibytes/sec   cycles/byte
        ECB enc |     14.98 ns/B     63.67 MiB/s     23.01 c/B
        ECB dec |     13.72 ns/B     69.53 MiB/s     21.07 c/B
        CBC enc |     15.71 ns/B     60.69 MiB/s     24.14 c/B
        CBC dec |     13.75 ns/B     69.34 MiB/s     21.13 c/B
        CFB enc |     15.62 ns/B     61.04 MiB/s     24.00 c/B
        CFB dec |     14.98 ns/B     63.67 MiB/s     23.01 c/B
        OFB enc |     15.71 ns/B     60.70 MiB/s     24.13 c/B
        OFB dec |     15.69 ns/B     60.77 MiB/s     24.11 c/B
        CTR enc |     15.14 ns/B     62.97 MiB/s     23.26 c/B
        CTR dec |     15.14 ns/B     62.98 MiB/s     23.26 c/B
        CCM enc |     30.40 ns/B     31.37 MiB/s     46.70 c/B
        CCM dec |     30.39 ns/B     31.38 MiB/s     46.68 c/B
       CCM auth |     15.30 ns/B     62.34 MiB/s     23.50 c/B
        GCM enc |     26.79 ns/B     35.60 MiB/s     41.15 c/B
        GCM dec |     26.80 ns/B     35.59 MiB/s     41.16 c/B
       GCM auth |     11.65 ns/B     81.85 MiB/s     17.90 c/B
        OCB enc |     16.61 ns/B     57.41 MiB/s     25.52 c/B
        OCB dec |     15.28 ns/B     62.43 MiB/s     23.46 c/B
       OCB auth |     16.16 ns/B     59.00 MiB/s     24.83 c/B
                =

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 0 files changed

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 46125db..82c79d8 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -96,7 +96,7 @@ keccak.c keccak_permute_32.h keccak_permute_64.h keccak-armv7-neon.S \
 stribog.c \
 tiger.c \
 whirlpool.c whirlpool-sse2-amd64.S \
-twofish.c twofish-amd64.S twofish-arm.S \
+twofish.c twofish-amd64.S twofish-arm.S twofish-aarch64.S \
 rfc2268.c \
 camellia.c camellia.h camellia-glue.c camellia-aesni-avx-amd64.S \
   camellia-aesni-avx2-amd64.S camellia-arm.S camellia-aarch64.S
diff --git a/cipher/twofish-aarch64.S b/cipher/twofish-aarch64.S
new file mode 100644
index 0000000..13ea15b
--- /dev/null
+++ b/cipher/twofish-aarch64.S
@@ -0,0 +1,317 @@
+/* twofish-aarch64.S  -  ARMv8/Aarch64 assembly implementation of Twofish cipher
+ *
+ * Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#if defined(__AARCH64EL__)
+#ifdef HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS
+
+.text
+
+/* structure of TWOFISH_context: byte offsets of the fields read by this file */
+#define s0 0 /* first lookup table; tables are read as 32-bit words at (byte value * 4) */
+#define s1 ((s0) + 4 * 256) /* second lookup table, 4 * 256 bytes after s0 */
+#define s2 ((s1) + 4 * 256) /* third lookup table */
+#define s3 ((s2) + 4 * 256) /* fourth lookup table */
+#define w  ((s3) + 4 * 256) /* eight 32-bit whitening words (four on input, four on output) */
+#define k  ((w) + 4 * 8) /* round key words; round n reads the pair at k + 8 * n */
+
+/* register macros: only x0-x17 are used, so no callee-saved register is touched */
+#define CTX x0 /* argument: context pointer */
+#define RDST x1 /* argument: destination block */
+#define RSRC x2 /* argument: source block */
+#define CTXs0 CTX /* s0 sits at offset 0, so the context pointer doubles as its base */
+#define CTXs1 x3 /* CTX + s1, set up by each entry point */
+#define CTXs2 x4 /* CTX + s2 */
+#define CTXs3 x5 /* CTX + s3; also the base for round key loads (offset k - s3) */
+#define CTXw x17 /* CTX + w, base for the whitening words */
+
+#define RA w6 /* RA..RD: the four 32-bit words of the block being processed */
+#define RB w7
+#define RC w8
+#define RD w9
+
+#define RX w10 /* RX/RY: accumulators for the two table-lookup sums of a round */
+#define RY w11
+
+#define xRX x10 /* 64-bit views of RX/RY, used as register offsets in ldr */
+#define xRY x11
+
+#define RMASK w12 /* holds 0xff << 2: selects one byte already scaled to a word offset */
+
+#define RT0 w13 /* RT0..RT3: scratch for table offsets, table values and key words */
+#define RT1 w14
+#define RT2 w15
+#define RT3 w16
+
+#define xRT0 x13 /* 64-bit views of RT0..RT3, used as register offsets in ldr */
+#define xRT1 x14
+#define xRT2 x15
+#define xRT3 x16
+
+/* helper macros: convert a 32-bit register between host and little-endian byte order */
+#ifndef __AARCH64EL__
+	/* bswap on big-endian (not reachable here: the file body is guarded by __AARCH64EL__) */
+	#define host_to_le(reg) \
+		rev reg, reg;
+	#define le_to_host(reg) \
+		rev reg, reg;
+#else
+	/* nop on little-endian: both macros expand to nothing */
+	#define host_to_le(reg) /*_*/
+	#define le_to_host(reg) /*_*/
+#endif
+
+#define ldr_input_aligned_le(rin, a, b, c, d) /* a..d = four LE words from rin[0..15] */ \
+	ldr a, [rin, #0]; \
+	ldr b, [rin, #4]; \
+	le_to_host(a); \
+	ldr c, [rin, #8]; \
+	le_to_host(b); \
+	ldr d, [rin, #12]; \
+	le_to_host(c); \
+	le_to_host(d);
+
+#define str_output_aligned_le(rout, a, b, c, d) /* store a..d as four LE words at rout[0..15] */ \
+	host_to_le(a); \
+	host_to_le(b); \
+	str a, [rout, #0]; \
+	host_to_le(c); \
+	str b, [rout, #4]; \
+	host_to_le(d); \
+	str c, [rout, #8]; \
+	str d, [rout, #12];
+
+/* unaligned word reads/writes allowed: forward to the plain variants, rtmp* are unused */
+#define ldr_input_le(rin, ra, rb, rc, rd, rtmp) \
+	ldr_input_aligned_le(rin, ra, rb, rc, rd)
+
+#define str_output_le(rout, ra, rb, rc, rd, rtmp0, rtmp1) \
+	str_output_aligned_le(rout, ra, rb, rc, rd)
+
+/**********************************************************************
+  1-way twofish: each *_round macro processes one round of a single block
+ **********************************************************************/
+#define encrypt_round(a, b, rc, rd, n, ror_a, adj_a) /* a, b: inputs; rc, rd: updated; n: round */ \
+	and RT0, RMASK, b, lsr#(8 - 2); /* RT0 = byte 1 of b, times 4 */ \
+	and RY, RMASK, b, lsr#(16 - 2); /* RY = byte 2 of b, times 4 */ \
+	and RT1, RMASK, b, lsr#(24 - 2); /* RT1 = byte 3 of b, times 4 */ \
+	ldr RY, [CTXs3, xRY]; /* RY = s3[byte 2 of b] */ \
+	and RT2, RMASK, b, lsl#(2); /* RT2 = byte 0 of b, times 4 */ \
+	ldr RT0, [CTXs2, xRT0]; /* RT0 = s2[byte 1 of b] */ \
+	and RT3, RMASK, a, lsr#(16 - 2 + (adj_a)); /* byte 2 of a; adj_a shifts past a pending rotate */ \
+	ldr RT1, [CTXs0, xRT1]; /* RT1 = s0[byte 3 of b] */ \
+	and RX, RMASK, a, lsr#(8 - 2 + (adj_a)); /* byte 1 of a, same adjustment */ \
+	ldr RT2, [CTXs1, xRT2]; /* RT2 = s1[byte 0 of b] */ \
+	ldr RX, [CTXs1, xRX]; /* RX = s1[byte 1 of a] */ \
+	ror_a(a); /* apply the pending rotate (if any); bytes 0 and 3 are taken afterwards */ \
+	/* fold the four b lookups into RY while the remaining a lookups are issued */ \
+	eor RY, RY, RT0; \
+	ldr RT3, [CTXs2, xRT3]; /* RT3 = s2[byte 2 of a] */ \
+	and RT0, RMASK, a, lsl#(2); /* RT0 = byte 0 of a, times 4 */ \
+	eor RY, RY, RT1; \
+	and RT1, RMASK, a, lsr#(24 - 2); /* RT1 = byte 3 of a, times 4 */ \
+	eor RY, RY, RT2; /* RY = s0[b3] ^ s1[b0] ^ s2[b1] ^ s3[b2] */ \
+	ldr RT0, [CTXs0, xRT0]; /* RT0 = s0[byte 0 of a] */ \
+	eor RX, RX, RT3; \
+	ldr RT1, [CTXs3, xRT1]; /* RT1 = s3[byte 3 of a] */ \
+	eor RX, RX, RT0; \
+	/* round key words for round n, addressed relative to CTXs3 */ \
+	ldr RT3, [CTXs3, #(k - s3 + 8 * (n) + 4)]; /* second key word of the pair */ \
+	eor RX, RX, RT1; /* RX = s0[a0] ^ s1[a1] ^ s2[a2] ^ s3[a3] */ \
+	ldr RT2, [CTXs3, #(k - s3 + 8 * (n))]; /* first key word of the pair */ \
+	/* combine: RT0 = RX + 2 * RY + key1, RX = RX + RY + key0 */ \
+	add RT0, RX, RY, lsl #1; \
+	add RX, RX, RY; \
+	add RT0, RT0, RT3; \
+	add RX, RX, RT2; \
+	eor rd, RT0, rd, ror #31; /* rd = RT0 ^ (rd rotated left by 1) */ \
+	eor rc, rc, RX; /* rc is left unrotated here; the cycle macros rotate it later via ror1 */
+
+#define dummy(x) /* expands to nothing; passed as ror_a/ror_b when no rotate is pending */
+
+#define ror1(r) /* rotate r right by one bit, in place */ \
+	ror r, r, #1;
+
+#define decrypt_round(a, b, rc, rd, n, ror_b, adj_b) /* a, b: inputs; rc, rd: updated; n: round */ \
+	and RT3, RMASK, b, lsl#(2 - (adj_b)); /* byte 0 of b; adj_b shifts past a pending rotate */ \
+	and RT1, RMASK, b, lsr#(8 - 2 + (adj_b)); /* byte 1 of b, same adjustment */ \
+	ror_b(b); /* apply the pending rotate (if any); bytes 2 and 3 are taken afterwards */ \
+	and RT2, RMASK, a, lsl#(2); /* RT2 = byte 0 of a, times 4 */ \
+	and RT0, RMASK, a, lsr#(8 - 2); /* RT0 = byte 1 of a, times 4 */ \
+	/* table lookups interleaved with the remaining byte extractions */ \
+	ldr RY, [CTXs1, xRT3]; /* RY = s1[byte 0 of b] */ \
+	ldr RX, [CTXs0, xRT2]; /* RX = s0[byte 0 of a] */ \
+	and RT3, RMASK, b, lsr#(16 - 2); /* RT3 = byte 2 of b, times 4 */ \
+	ldr RT1, [CTXs2, xRT1]; /* RT1 = s2[byte 1 of b] */ \
+	and RT2, RMASK, a, lsr#(16 - 2); /* RT2 = byte 2 of a, times 4 */ \
+	ldr RT0, [CTXs1, xRT0]; /* RT0 = s1[byte 1 of a] */ \
+	\
+	ldr RT3, [CTXs3, xRT3]; /* RT3 = s3[byte 2 of b] */ \
+	eor RY, RY, RT1; \
+	\
+	and RT1, RMASK, b, lsr#(24 - 2); /* RT1 = byte 3 of b, times 4 */ \
+	eor RX, RX, RT0; \
+	ldr RT2, [CTXs2, xRT2]; /* RT2 = s2[byte 2 of a] */ \
+	and RT0, RMASK, a, lsr#(24 - 2); /* RT0 = byte 3 of a, times 4 */ \
+	\
+	ldr RT1, [CTXs0, xRT1]; /* RT1 = s0[byte 3 of b] */ \
+	\
+	eor RY, RY, RT3; \
+	ldr RT0, [CTXs3, xRT0]; /* RT0 = s3[byte 3 of a] */ \
+	eor RX, RX, RT2; \
+	eor RY, RY, RT1; /* RY = s0[b3] ^ s1[b0] ^ s2[b1] ^ s3[b2] */ \
+	/* round key words for round n, addressed relative to CTXs3 */ \
+	ldr RT1, [CTXs3, #(k - s3 + 8 * (n) + 4)]; /* second key word of the pair */ \
+	eor RX, RX, RT0; /* RX = s0[a0] ^ s1[a1] ^ s2[a2] ^ s3[a3] */ \
+	ldr RT2, [CTXs3, #(k - s3 + 8 * (n))]; /* first key word of the pair */ \
+	/* combine: RT0 = RX + 2 * RY + key1, RX = RX + RY + key0 */ \
+	add RT0, RX, RY, lsl #1; \
+	add RX, RX, RY; \
+	add RT0, RT0, RT1; \
+	add RX, RX, RT2; \
+	eor rd, rd, RT0; /* rd is left unrotated here; the cycle macros rotate it later via ror1 */ \
+	eor rc, RX, rc, ror #31; /* rc = RX ^ (rc rotated left by 1) */
+
+#define first_encrypt_cycle(nc) /* rounds 2*nc and 2*nc+1; RA has no pending rotate yet */ \
+	encrypt_round(RA, RB, RC, RD, (nc) * 2, dummy, 0); \
+	encrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, ror1, 1);
+
+#define encrypt_cycle(nc) /* rounds 2*nc and 2*nc+1; each round first rotates its a input */ \
+	encrypt_round(RA, RB, RC, RD, (nc) * 2, ror1, 1); \
+	encrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, ror1, 1);
+
+#define last_encrypt_cycle(nc) /* as encrypt_cycle, then settle the rotate left pending */ \
+	encrypt_round(RA, RB, RC, RD, (nc) * 2, ror1, 1); \
+	encrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, ror1, 1); \
+	ror1(RA); /* RA was the rc of the final round and has not been rotated yet */
+
+#define first_decrypt_cycle(nc) /* rounds 2*nc+1 then 2*nc; RD has no pending rotate yet */ \
+	decrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, dummy, 0); \
+	decrypt_round(RA, RB, RC, RD, (nc) * 2, ror1, 1);
+
+#define decrypt_cycle(nc) /* rounds 2*nc+1 then 2*nc; each round first rotates its b input */ \
+	decrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, ror1, 1); \
+	decrypt_round(RA, RB, RC, RD, (nc) * 2, ror1, 1);
+
+#define last_decrypt_cycle(nc) /* as decrypt_cycle, then settle the rotate left pending */ \
+	decrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, ror1, 1); \
+	decrypt_round(RA, RB, RC, RD, (nc) * 2, ror1, 1); \
+	ror1(RD); /* RD was the rd of the final round and has not been rotated yet */
+
+.globl _gcry_twofish_arm_encrypt_block
+.type   _gcry_twofish_arm_encrypt_block,%function;
+
+_gcry_twofish_arm_encrypt_block:
+	/* input (leaf routine, no stack use; uses x0-x17 only):
+	 *	x0: ctx (base of the s0..s3 tables, w and k)
+	 *	x1: dst (16 bytes written)
+	 *	x2: src (16 bytes read)
+	 */
+
+	add CTXw, CTX, #(w); /* CTXw = address of the whitening words */
+
+	ldr_input_le(RSRC, RA, RB, RC, RD, RT0); /* RA..RD = the four source words */
+
+	/* Input whitening: XOR the block with whitening words 0..3 */
+	ldp RT0, RT1, [CTXw, #(0*8)];
+	ldp RT2, RT3, [CTXw, #(1*8)];
+	add CTXs3, CTX, #(s3); /* table base pointers used by the round macros */
+	add CTXs2, CTX, #(s2);
+	add CTXs1, CTX, #(s1);
+	mov RMASK, #(0xff << 2); /* byte mask pre-scaled to a word offset */
+	eor RA, RA, RT0;
+	eor RB, RB, RT1;
+	eor RC, RC, RT2;
+	eor RD, RD, RT3;
+
+	first_encrypt_cycle(0); /* 8 cycles of 2 rounds: rounds 0..15 */
+	encrypt_cycle(1);
+	encrypt_cycle(2);
+	encrypt_cycle(3);
+	encrypt_cycle(4);
+	encrypt_cycle(5);
+	encrypt_cycle(6);
+	last_encrypt_cycle(7);
+
+	/* Output whitening: whitening words 4..7, applied to RC, RD, RA, RB in that order */
+	ldp RT0, RT1, [CTXw, #(2*8)];
+	ldp RT2, RT3, [CTXw, #(3*8)];
+	eor RC, RC, RT0;
+	eor RD, RD, RT1;
+	eor RA, RA, RT2;
+	eor RB, RB, RT3;
+
+	str_output_le(RDST, RC, RD, RA, RB, RT0, RT1); /* output word order is RC, RD, RA, RB */
+
+	ret;
+.ltorg
+.size _gcry_twofish_arm_encrypt_block,.-_gcry_twofish_arm_encrypt_block;
+
+.globl _gcry_twofish_arm_decrypt_block
+.type   _gcry_twofish_arm_decrypt_block,%function;
+
+_gcry_twofish_arm_decrypt_block:
+	/* input (leaf routine, no stack use; uses x0-x17 only):
+	 *	x0: ctx (base of the s0..s3 tables, w and k)
+	 *	x1: dst (16 bytes written)
+	 *	x2: src (16 bytes read)
+	 */
+
+	add CTXw, CTX, #(w); /* CTXw = address of the whitening words */
+
+	ldr_input_le(RSRC, RC, RD, RA, RB, RT0); /* source words land in RC, RD, RA, RB */
+
+	/* Input whitening: whitening words 4..7, applied to RC, RD, RA, RB in that order */
+	ldp RT0, RT1, [CTXw, #(2*8)];
+	ldp RT2, RT3, [CTXw, #(3*8)];
+	add CTXs3, CTX, #(s3); /* table base pointers used by the round macros */
+	add CTXs2, CTX, #(s2);
+	add CTXs1, CTX, #(s1);
+	mov RMASK, #(0xff << 2); /* byte mask pre-scaled to a word offset */
+	eor RC, RC, RT0;
+	eor RD, RD, RT1;
+	eor RA, RA, RT2;
+	eor RB, RB, RT3;
+
+	first_decrypt_cycle(7); /* 8 cycles of 2 rounds, walked backwards: rounds 15..0 */
+	decrypt_cycle(6);
+	decrypt_cycle(5);
+	decrypt_cycle(4);
+	decrypt_cycle(3);
+	decrypt_cycle(2);
+	decrypt_cycle(1);
+	last_decrypt_cycle(0);
+
+	/* Output whitening: XOR the block with whitening words 0..3 */
+	ldp RT0, RT1, [CTXw, #(0*8)];
+	ldp RT2, RT3, [CTXw, #(1*8)];
+	eor RA, RA, RT0;
+	eor RB, RB, RT1;
+	eor RC, RC, RT2;
+	eor RD, RD, RT3;
+
+	str_output_le(RDST, RA, RB, RC, RD, RT0, RT1); /* output word order is RA, RB, RC, RD */
+
+	ret;
+.size _gcry_twofish_arm_decrypt_block,.-_gcry_twofish_arm_decrypt_block;
+
+#endif /*HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS*/
+#endif /*__AARCH64EL__*/
diff --git a/cipher/twofish.c b/cipher/twofish.c
index f6ecd67..7a4d26a 100644
--- a/cipher/twofish.c
+++ b/cipher/twofish.c
@@ -66,6 +66,11 @@
 #  define USE_ARM_ASM 1
 # endif
 #endif
+#if defined(__AARCH64EL__) /* little-endian AArch64 shares the USE_ARM_ASM switch */
+# ifdef HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS
+#  define USE_ARM_ASM 1
+# endif
+#endif /* __AARCH64EL__ */
 
 
 /* Prototype for the self-test function. */
diff --git a/configure.ac b/configure.ac
index 34791fa..076a92b 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1895,6 +1895,10 @@ if test "$found" = "1" ; then
          # Build with the assembly implementation
          GCRYPT_CIPHERS="$GCRYPT_CIPHERS twofish-arm.lo"
       ;;
+      aarch64-*-*)
+         # Build with the assembly implementation
+         GCRYPT_CIPHERS="$GCRYPT_CIPHERS twofish-aarch64.lo"
+      ;;
    esac
 fi
 




More information about the Gcrypt-devel mailing list