[PATCH] Add ARMv8/AArch64 implementation of chacha20

Jussi Kivilinna <jussi.kivilinna at iki.fi>
Sun Aug 6 14:09:49 CEST 2017


* cipher/Makefile.am: Add 'chacha20-aarch64.S'.
* cipher/chacha20-aarch64.S: New.
* cipher/chacha20.c (USE_AARCH64_SIMD): New.
(_gcry_chacha20_aarch64_blocks): New.
(chacha20_do_setkey): Add HWF selection for AArch64 implementation.
* configure.ac: Add 'chacha20-aarch64.lo'.
--

This patch adds an ARMv8/AArch64 SIMD implementation based on the
public domain ARMv7/NEON implementation by Andrew Moon at:
  https://github.com/floodyberry/chacha-opt
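
For reference, the core operation that both the scalar and the NEON
paths below interleave is the standard ChaCha20 quarter-round.  A
minimal C sketch (not part of the patch; the rotate-left amounts
16/12/8/7 correspond to the "ror #16/#20/#24/#25" immediates used in
the assembly):

  #define ROTL32(x, n)  (((x) << (n)) | ((x) >> (32 - (n))))
  #define QUARTERROUND(a, b, c, d) \
          a += b; d ^= a; d = ROTL32(d, 16); \
          c += d; b ^= c; b = ROTL32(b, 12); \
          a += b; d ^= a; d = ROTL32(d,  8); \
          c += d; b ^= c; b = ROTL32(b,  7);

The main loop appears to process four 64-byte blocks per iteration
(three block states in NEON registers plus one in general-purpose
registers), which is why it is only entered with at least 256 bytes
of input.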

Benchmark on ARM Cortex-A53 (1536 MHz):

Before:
 CHACHA20       |  nanosecs/byte   mebibytes/sec   cycles/byte
     STREAM enc |      5.70 ns/B     167.2 MiB/s      8.76 c/B
     STREAM dec |      5.71 ns/B     166.9 MiB/s      8.78 c/B

After (~1.7x faster):
 CHACHA20       |  nanosecs/byte   mebibytes/sec   cycles/byte
     STREAM enc |      3.32 ns/B     287.7 MiB/s      5.09 c/B
     STREAM dec |      3.31 ns/B     287.9 MiB/s      5.09 c/B
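
(In these tables cycles/byte is nanosecs/byte scaled by the 1.536 GHz
clock, e.g. 3.32 ns/B * 1.536 GHz = ~5.1 c/B.)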

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 4 files changed, 799 insertions(+), 1 deletion(-)

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 95c45108..26d25e1a 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -65,7 +65,7 @@ arcfour.c arcfour-amd64.S \
 blowfish.c blowfish-amd64.S blowfish-arm.S \
 cast5.c cast5-amd64.S cast5-arm.S \
 chacha20.c chacha20-sse2-amd64.S chacha20-ssse3-amd64.S chacha20-avx2-amd64.S \
-  chacha20-armv7-neon.S \
+  chacha20-armv7-neon.S chacha20-aarch64.S \
 crc.c \
   crc-intel-pclmul.c \
 des.c des-amd64.S \
diff --git a/cipher/chacha20-aarch64.S b/cipher/chacha20-aarch64.S
new file mode 100644
index 00000000..d07511ff
--- /dev/null
+++ b/cipher/chacha20-aarch64.S
@@ -0,0 +1,772 @@
+/* chacha20-aarch64.S - ARMv8/AArch64 accelerated chacha20 blocks function
+ *
+ * Copyright (C) 2014,2017 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * Based on public domain ARMv7/NEON implementation by Andrew Moon at
+ *  https://github.com/floodyberry/chacha-opt
+ */
+
+#include <config.h>
+
+#if defined(__AARCH64EL__) && \
+    defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
+    defined(HAVE_GCC_INLINE_ASM_AARCH64_NEON) && \
+    defined(USE_CHACHA20)
+
+.cpu generic+simd
+
+.text
+
+#define STMIA8(ptr, l0, l1, l2, l3, l4, l5, l6, l7) \
+	add x17, ptr, #8; \
+	stp l0, l1, [ptr], #16; \
+	stp l2, l3, [x17], #16; \
+	stp l4, l5, [ptr], #16; \
+	stp l6, l7, [x17];
+
+#define LDMIA16(ptr, l0, l1, l2, l3, l4, l5, l6, l7, \
+		     l8, l9, l10, l11, l12, l13, l14, l15) \
+	add x17, ptr, #8; \
+	ldp l0, l1, [ptr], #16; \
+	ldp l2, l3, [x17], #16; \
+	ldp l4, l5, [ptr], #16; \
+	ldp l6, l7, [x17], #16; \
+	ldp l8, l9, [ptr], #16; \
+	ldp l10, l11, [x17], #16; \
+	ldp l12, l13, [ptr], #16; \
+	ldp l14, l15, [x17]; \
+
+#define LDMIA8(ptr, l0, l1, l2, l3, l4, l5, l6, l7) \
+	add x17, ptr, #8; \
+	ldp l0, l1, [ptr], #16; \
+	ldp l2, l3, [x17], #16; \
+	ldp l4, l5, [ptr], #16; \
+	ldp l6, l7, [x17];
+
+#define LDMIA4(ptr, l0, l1, l2, l3) \
+	ldp l0, l1, [ptr], #8; \
+	ldp l2, l3, [ptr], #8;
+
+#define EXT32(a,b,c,n) \
+	ext a,b,c,#(n*4);
+
+.text
+
+#define STACK_STATE	48
+#define STACK_SRC	56
+#define STACK_SP	192
+#define STACK_DST	200
+#define STACK_BYTES	208
+#define STACK_DST_TMP   216
+
+.globl _gcry_chacha20_aarch64_blocks
+.type  _gcry_chacha20_aarch64_blocks,%function;
+_gcry_chacha20_aarch64_blocks:
+.Lchacha_blocks_neon_local:
+	tst x3, x3
+	beq .Lchacha_blocks_neon_nobytes
+	mov x16, sp
+	mov x8, sp
+	sub x16, x16, #(216+8)
+	mov v16.16b, v8.16b
+	mov v17.16b, v9.16b
+	and x16, x16, #(-32)
+	mov v18.16b, v10.16b
+	mov v19.16b, v11.16b
+	mov v20.16b, v12.16b
+	mov sp, x16
+	add x16, x16, #64
+	mov v21.16b, v13.16b
+	mov v22.16b, v14.16b
+	mov v23.16b, v15.16b
+	mov w4, #20
+	ld1 {v24.4s-v27.4s}, [x0]
+	str x0, [sp, # STACK_STATE]
+	str x1, [sp, # STACK_SRC]
+	str x2, [sp, # STACK_DST]
+	str x3, [sp, # STACK_BYTES]
+	str x8, [sp, # STACK_SP]
+	st1 {v24.4s-v27.4s}, [x16]
+	str w4, [sp, #44]
+	cmp x3, #256
+	blo .Lchacha_blocks_neon_mainloop2
+.Lchacha_blocks_neon_mainloop1:
+	ldr w0, [sp, #44]
+	add x16, sp, #64
+	str w0, [sp, #0]
+	mov x2, #1
+	eor v12.16b, v12.16b, v12.16b
+	mov v0.16b, v24.16b
+	mov v1.16b, v25.16b
+	mov v2.16b, v26.16b
+	mov v3.16b, v27.16b
+	mov v12.2d[0], x2
+	add v3.2d, v3.2d, v12.2d
+	mov v4.16b, v0.16b
+	mov v5.16b, v1.16b
+	mov v6.16b, v2.16b
+	add v7.2d, v3.2d, v12.2d
+	LDMIA16(x16, w0, w1, w2, w3, w4, w5, w6, w7,
+		     w8, w9, w10, w11, w12, w13, w14, w15)
+	mov v8.16b, v0.16b
+	mov v9.16b, v1.16b
+	mov v10.16b, v2.16b
+	add v11.2d, v7.2d, v12.2d
+	str w6, [sp, #8]
+	str w11, [sp, #12]
+	mov w11, w13
+	str w15, [sp, #28]
+.Lchacha_blocks_neon_rounds1:
+	ldr w6, [sp, #0]
+	add v0.4s, v0.4s, v1.4s
+	add w0, w0, w4
+	add v4.4s, v4.4s, v5.4s
+	add w1, w1, w5
+	add v8.4s, v8.4s, v9.4s
+	eor w12, w12, w0
+	eor v12.16b, v3.16b, v0.16b
+	eor w11, w11, w1
+	eor v13.16b, v7.16b, v4.16b
+	ror w12, w12, #16
+	eor v14.16b, v11.16b, v8.16b
+	ror w11, w11, #16
+	rev32 v3.8h, v12.8h
+	subs w6, w6, #2
+	rev32 v7.8h, v13.8h
+	add w8, w8, w12
+	rev32 v11.8h, v14.8h
+	add w9, w9, w11
+	add v2.4s, v2.4s, v3.4s
+	eor w4, w4, w8
+	add v6.4s, v6.4s, v7.4s
+	eor w5, w5, w9
+	add v10.4s, v10.4s, v11.4s
+	str w6, [sp, #0]
+	eor v12.16b, v1.16b, v2.16b
+	ror w4, w4, #20
+	eor v13.16b, v5.16b, v6.16b
+	ror w5, w5, #20
+	eor v14.16b, v9.16b, v10.16b
+	add w0, w0, w4
+	shl v1.4s, v12.4s, #12
+	add w1, w1, w5
+	shl v5.4s, v13.4s, #12
+	ldr w6, [sp, #8]
+	shl v9.4s, v14.4s, #12
+	eor w12, w12, w0
+	sri v1.4s, v12.4s, #20
+	eor w11, w11, w1
+	sri v5.4s, v13.4s, #20
+	ror w12, w12, #24
+	sri v9.4s, v14.4s, #20
+	ror w11, w11, #24
+	add v0.4s, v0.4s, v1.4s
+	add w8, w8, w12
+	add v4.4s, v4.4s, v5.4s
+	add w9, w9, w11
+	add v8.4s, v8.4s, v9.4s
+	eor w4, w4, w8
+	eor v12.16b, v3.16b, v0.16b
+	eor w5, w5, w9
+	eor v13.16b, v7.16b, v4.16b
+	str w11, [sp, #20]
+	eor v14.16b, v11.16b, v8.16b
+	ror w4, w4, #25
+	shl v3.4s, v12.4s, #8
+	ror w5, w5, #25
+	shl v7.4s, v13.4s, #8
+	str w4, [sp, #4]
+	shl v11.4s, v14.4s, #8
+	ldr w4, [sp, #28]
+	sri v3.4s, v12.4s, #24
+	add w2, w2, w6
+	sri v7.4s, v13.4s, #24
+	add w3, w3, w7
+	sri v11.4s, v14.4s, #24
+	ldr w11, [sp, #12]
+	add v2.4s, v2.4s, v3.4s
+	eor w14, w14, w2
+	add v6.4s, v6.4s, v7.4s
+	eor w4, w4, w3
+	add v10.4s, v10.4s, v11.4s
+	ror w14, w14, #16
+	eor v12.16b, v1.16b, v2.16b
+	ror w4, w4, #16
+	eor v13.16b, v5.16b, v6.16b
+	add w10, w10, w14
+	eor v14.16b, v9.16b, v10.16b
+	add w11, w11, w4
+	shl v1.4s, v12.4s, #7
+	eor w6, w6, w10
+	shl v5.4s, v13.4s, #7
+	eor w7, w7, w11
+	shl v9.4s, v14.4s, #7
+	ror w6, w6, #20
+	sri v1.4s, v12.4s, #25
+	ror w7, w7, #20
+	sri v5.4s, v13.4s, #25
+	add w2, w2, w6
+	sri v9.4s, v14.4s, #25
+	add w3, w3, w7
+	EXT32(v3.16b, v3.16b, v3.16b, 3)
+	eor w14, w14, w2
+	EXT32(v7.16b, v7.16b, v7.16b, 3)
+	eor w4, w4, w3
+	EXT32(v11.16b, v11.16b, v11.16b, 3)
+	ror w14, w14, #24
+	EXT32(v1.16b, v1.16b, v1.16b, 1)
+	ror w4, w4, #24
+	EXT32(v5.16b, v5.16b, v5.16b, 1)
+	add w10, w10, w14
+	EXT32(v9.16b, v9.16b, v9.16b, 1)
+	add w11, w11, w4
+	EXT32(v2.16b, v2.16b, v2.16b, 2)
+	eor w6, w6, w10
+	EXT32(v6.16b, v6.16b, v6.16b, 2)
+	eor w7, w7, w11
+	EXT32(v10.16b, v10.16b, v10.16b, 2)
+	ror w6, w6, #25
+	add v0.4s, v0.4s, v1.4s
+	ror w7, w7, #25
+	add v4.4s, v4.4s, v5.4s
+	add w0, w0, w5
+	add v8.4s, v8.4s, v9.4s
+	add w1, w1, w6
+	eor v12.16b, v3.16b, v0.16b
+	eor w4, w4, w0
+	eor v13.16b, v7.16b, v4.16b
+	eor w12, w12, w1
+	eor v14.16b, v11.16b, v8.16b
+	ror w4, w4, #16
+	rev32 v3.8h, v12.8h
+	ror w12, w12, #16
+	rev32 v7.8h, v13.8h
+	add w10, w10, w4
+	rev32 v11.8h, v14.8h
+	add w11, w11, w12
+	add v2.4s, v2.4s, v3.4s
+	eor w5, w5, w10
+	add v6.4s, v6.4s, v7.4s
+	eor w6, w6, w11
+	add v10.4s, v10.4s, v11.4s
+	ror w5, w5, #20
+	eor v12.16b, v1.16b, v2.16b
+	ror w6, w6, #20
+	eor v13.16b, v5.16b, v6.16b
+	add w0, w0, w5
+	eor v14.16b, v9.16b, v10.16b
+	add w1, w1, w6
+	shl v1.4s, v12.4s, #12
+	eor w4, w4, w0
+	shl v5.4s, v13.4s, #12
+	eor w12, w12, w1
+	shl v9.4s, v14.4s, #12
+	ror w4, w4, #24
+	sri v1.4s, v12.4s, #20
+	ror w12, w12, #24
+	sri v5.4s, v13.4s, #20
+	add w10, w10, w4
+	sri v9.4s, v14.4s, #20
+	add w11, w11, w12
+	add v0.4s, v0.4s, v1.4s
+	eor w5, w5, w10
+	add v4.4s, v4.4s, v5.4s
+	eor w6, w6, w11
+	add v8.4s, v8.4s, v9.4s
+	str w11, [sp, #12]
+	eor v12.16b, v3.16b, v0.16b
+	ror w5, w5, #25
+	eor v13.16b, v7.16b, v4.16b
+	ror w6, w6, #25
+	eor v14.16b, v11.16b, v8.16b
+	str w4, [sp, #28]
+	shl v3.4s, v12.4s, #8
+	ldr w4, [sp, #4]
+	shl v7.4s, v13.4s, #8
+	add w2, w2, w7
+	shl v11.4s, v14.4s, #8
+	add w3, w3, w4
+	sri v3.4s, v12.4s, #24
+	ldr w11, [sp, #20]
+	sri v7.4s, v13.4s, #24
+	eor w11, w11, w2
+	sri v11.4s, v14.4s, #24
+	eor w14, w14, w3
+	add v2.4s, v2.4s, v3.4s
+	ror w11, w11, #16
+	add v6.4s, v6.4s, v7.4s
+	ror w14, w14, #16
+	add v10.4s, v10.4s, v11.4s
+	add w8, w8, w11
+	eor v12.16b, v1.16b, v2.16b
+	add w9, w9, w14
+	eor v13.16b, v5.16b, v6.16b
+	eor w7, w7, w8
+	eor v14.16b, v9.16b, v10.16b
+	eor w4, w4, w9
+	shl v1.4s, v12.4s, #7
+	ror w7, w7, #20
+	shl v5.4s, v13.4s, #7
+	ror w4, w4, #20
+	shl v9.4s, v14.4s, #7
+	str w6, [sp, #8]
+	sri v1.4s, v12.4s, #25
+	add w2, w2, w7
+	sri v5.4s, v13.4s, #25
+	add w3, w3, w4
+	sri v9.4s, v14.4s, #25
+	eor w11, w11, w2
+	EXT32(v3.16b, v3.16b, v3.16b, 1)
+	eor w14, w14, w3
+	EXT32(v7.16b, v7.16b, v7.16b, 1)
+	ror w11, w11, #24
+	EXT32(v11.16b, v11.16b, v11.16b, 1)
+	ror w14, w14, #24
+	EXT32(v1.16b, v1.16b, v1.16b, 3)
+	add w8, w8, w11
+	EXT32(v5.16b, v5.16b, v5.16b, 3)
+	add w9, w9, w14
+	EXT32(v9.16b, v9.16b, v9.16b, 3)
+	eor w7, w7, w8
+	EXT32(v2.16b, v2.16b, v2.16b, 2)
+	eor w4, w4, w9
+	EXT32(v6.16b, v6.16b, v6.16b, 2)
+	ror w7, w7, #25
+	EXT32(v10.16b, v10.16b, v10.16b, 2)
+	ror w4, w4, #25
+	bne .Lchacha_blocks_neon_rounds1
+	str w8, [sp, #0]
+	str w9, [sp, #4]
+	mov v12.16b, v24.16b
+	str w10, [sp, #8]
+	str w12, [sp, #16]
+	mov v13.16b, v25.16b
+	str w11, [sp, #20]
+	str w14, [sp, #24]
+	mov v14.16b, v26.16b
+	mov v15.16b, v27.16b
+	ldr x12, [sp, # STACK_SRC]
+	ldr x14, [sp, # STACK_DST]
+	add v0.4s, v0.4s, v12.4s
+	ldr w8, [sp, #(64 +0)]
+	add v4.4s, v4.4s, v12.4s
+	ldr w9, [sp, #(64 +4)]
+	add v8.4s, v8.4s, v12.4s
+	ldr w10, [sp, #(64 +8)]
+	add v1.4s, v1.4s, v13.4s
+	ldr w11, [sp, #(64 +12)]
+	add v5.4s, v5.4s, v13.4s
+	add w0, w0, w8
+	add v9.4s, v9.4s, v13.4s
+	add w1, w1, w9
+	add v2.4s, v2.4s, v14.4s
+	add w2, w2, w10
+	add v6.4s, v6.4s, v14.4s
+	ldr w8, [sp, #(64 +16)]
+	add v10.4s, v10.4s, v14.4s
+	add w3, w3, w11
+	eor v14.16b, v14.16b, v14.16b
+	ldr w9, [sp, #(64 +20)]
+	mov x11, #1
+	add w4, w4, w8
+	mov v14.2d[0], x11
+	ldr w10, [sp, #(64 +24)]
+	add v12.2d, v14.2d, v15.2d
+	add w5, w5, w9
+	add v13.2d, v14.2d, v12.2d
+	ldr w11, [sp, #(64 +28)]
+	add v14.2d, v14.2d, v13.2d
+	add w6, w6, w10
+	add v3.4s, v3.4s, v12.4s
+	tst x12, x12
+	add v7.4s, v7.4s, v13.4s
+	add w7, w7, w11
+	add v11.4s, v11.4s, v14.4s
+	beq .Lchacha_blocks_neon_nomessage11
+	LDMIA4(x12, w8, w9, w10, w11)
+	tst x12, x12
+	eor w0, w0, w8
+	eor w1, w1, w9
+	eor w2, w2, w10
+	ldr w8, [x12, #0]
+	eor w3, w3, w11
+	ldr w9, [x12, #4]
+	eor w4, w4, w8
+	ldr w10, [x12, #8]
+	eor w5, w5, w9
+	ldr w11, [x12, #12]
+	eor w6, w6, w10
+	add x12, x12, #16
+	eor w7, w7, w11
+.Lchacha_blocks_neon_nomessage11:
+	mov x16, sp
+	STMIA8(x14, w0, w1, w2, w3, w4, w5, w6, w7)
+	tst x12, x12
+	LDMIA8(x16, w0, w1, w2, w3, w4, w5, w6, w7)
+	ldr w8, [sp, #(64 +32)]
+	ldr w9, [sp, #(64 +36)]
+	ldr w10, [sp, #(64 +40)]
+	ldr w11, [sp, #(64 +44)]
+	add w0, w0, w8
+	add w1, w1, w9
+	add w2, w2, w10
+	ldr w8, [sp, #(64 +48)]
+	add w3, w3, w11
+	ldr w9, [sp, #(64 +52)]
+	add w4, w4, w8
+	ldr w10, [sp, #(64 +56)]
+	add w5, w5, w9
+	ldr w11, [sp, #(64 +60)]
+	add w6, w6, w10
+	adds w8, w8, #4
+	add w7, w7, w11
+	adc w9, w9, wzr
+	str w8, [sp, #(64 +48)]
+	mov v27.4s[0], w8
+	tst x12, x12
+	str w9, [sp, #(64 +52)]
+	mov v27.4s[1], w9
+	beq .Lchacha_blocks_neon_nomessage12
+	LDMIA4(x12, w8, w9, w10, w11)
+	tst x12, x12
+	eor w0, w0, w8
+	eor w1, w1, w9
+	eor w2, w2, w10
+	ldr w8, [x12, #0]
+	eor w3, w3, w11
+	ldr w9, [x12, #4]
+	eor w4, w4, w8
+	ldr w10, [x12, #8]
+	eor w5, w5, w9
+	ldr w11, [x12, #12]
+	eor w6, w6, w10
+	add x12, x12, #16
+	eor w7, w7, w11
+.Lchacha_blocks_neon_nomessage12:
+	STMIA8(x14, w0, w1, w2, w3, w4, w5, w6, w7)
+	tst x12, x12
+	beq .Lchacha_blocks_neon_nomessage13
+	ld1 {v12.4s-v15.4s}, [x12], #64
+	eor v0.16b, v0.16b, v12.16b
+	eor v1.16b, v1.16b, v13.16b
+	eor v2.16b, v2.16b, v14.16b
+	eor v3.16b, v3.16b, v15.16b
+.Lchacha_blocks_neon_nomessage13:
+	st1 {v0.4s-v3.4s}, [x14], #64
+	beq .Lchacha_blocks_neon_nomessage14
+	ld1 {v12.4s-v15.4s}, [x12], #64
+	eor v4.16b, v4.16b, v12.16b
+	eor v5.16b, v5.16b, v13.16b
+	eor v6.16b, v6.16b, v14.16b
+	eor v7.16b, v7.16b, v15.16b
+.Lchacha_blocks_neon_nomessage14:
+	st1 {v4.4s-v7.4s}, [x14], #64
+	beq .Lchacha_blocks_neon_nomessage15
+	ld1 {v12.4s-v15.4s}, [x12], #64
+	eor v8.16b, v8.16b, v12.16b
+	eor v9.16b, v9.16b, v13.16b
+	eor v10.16b, v10.16b, v14.16b
+	eor v11.16b, v11.16b, v15.16b
+.Lchacha_blocks_neon_nomessage15:
+	st1 {v8.4s-v11.4s}, [x14], #64
+	str x12, [sp, # STACK_SRC]
+	str x14, [sp, # STACK_DST]
+	ldr x3, [sp, # STACK_BYTES]
+	sub x3, x3, #256
+	cmp x3, #256
+	str x3, [sp, # STACK_BYTES]
+	bhs .Lchacha_blocks_neon_mainloop1
+	tst x3, x3
+	beq .Lchacha_blocks_neon_done
+.Lchacha_blocks_neon_mainloop2:
+	ldr x3, [sp, # STACK_BYTES]
+	ldr x1, [sp, # STACK_SRC]
+	cmp x3, #64
+	bhs .Lchacha_blocks_neon_noswap1
+	add x4, sp, #128
+	mov x5, x4
+	tst x1, x1
+	beq .Lchacha_blocks_neon_nocopy1
+.Lchacha_blocks_neon_copyinput1:
+	subs x3, x3, #1
+	ldrb w0, [x1], #1
+	strb w0, [x4], #1
+	bne .Lchacha_blocks_neon_copyinput1
+	str x5, [sp, # STACK_SRC]
+.Lchacha_blocks_neon_nocopy1:
+	ldr x4, [sp, # STACK_DST]
+	str x5, [sp, # STACK_DST]
+	str x4, [sp, # STACK_DST_TMP]
+.Lchacha_blocks_neon_noswap1:
+	add x16, sp, #64
+	ldr w0, [sp, #44]
+	str w0, [sp, #0]
+	LDMIA16(x16, w0, w1, w2, w3, w4, w5, w6, w7,
+		     w8, w9, w10, w11, w12, w13, w14, w15)
+	str w6, [sp, #8]
+	str w11, [sp, #12]
+	mov w11, w13
+	str w15, [sp, #28]
+.Lchacha_blocks_neon_rounds2:
+	ldr w6, [sp, #0]
+	add w0, w0, w4
+	add w1, w1, w5
+	eor w12, w12, w0
+	eor w11, w11, w1
+	ror w12, w12, #16
+	ror w11, w11, #16
+	subs w6, w6, #2
+	add w8, w8, w12
+	add w9, w9, w11
+	eor w4, w4, w8
+	eor w5, w5, w9
+	str w6, [sp, #0]
+	ror w4, w4, #20
+	ror w5, w5, #20
+	add w0, w0, w4
+	add w1, w1, w5
+	ldr w6, [sp, #8]
+	eor w12, w12, w0
+	eor w11, w11, w1
+	ror w12, w12, #24
+	ror w11, w11, #24
+	add w8, w8, w12
+	add w9, w9, w11
+	eor w4, w4, w8
+	eor w5, w5, w9
+	str w11, [sp, #20]
+	ror w4, w4, #25
+	ror w5, w5, #25
+	str w4, [sp, #4]
+	ldr w4, [sp, #28]
+	add w2, w2, w6
+	add w3, w3, w7
+	ldr w11, [sp, #12]
+	eor w14, w14, w2
+	eor w4, w4, w3
+	ror w14, w14, #16
+	ror w4, w4, #16
+	add w10, w10, w14
+	add w11, w11, w4
+	eor w6, w6, w10
+	eor w7, w7, w11
+	ror w6, w6, #20
+	ror w7, w7, #20
+	add w2, w2, w6
+	add w3, w3, w7
+	eor w14, w14, w2
+	eor w4, w4, w3
+	ror w14, w14, #24
+	ror w4, w4, #24
+	add w10, w10, w14
+	add w11, w11, w4
+	eor w6, w6, w10
+	eor w7, w7, w11
+	ror w6, w6, #25
+	ror w7, w7, #25
+	add w0, w0, w5
+	add w1, w1, w6
+	eor w4, w4, w0
+	eor w12, w12, w1
+	ror w4, w4, #16
+	ror w12, w12, #16
+	add w10, w10, w4
+	add w11, w11, w12
+	eor w5, w5, w10
+	eor w6, w6, w11
+	ror w5, w5, #20
+	ror w6, w6, #20
+	add w0, w0, w5
+	add w1, w1, w6
+	eor w4, w4, w0
+	eor w12, w12, w1
+	ror w4, w4, #24
+	ror w12, w12, #24
+	add w10, w10, w4
+	add w11, w11, w12
+	eor w5, w5, w10
+	eor w6, w6, w11
+	str w11, [sp, #12]
+	ror w5, w5, #25
+	ror w6, w6, #25
+	str w4, [sp, #28]
+	ldr w4, [sp, #4]
+	add w2, w2, w7
+	add w3, w3, w4
+	ldr w11, [sp, #20]
+	eor w11, w11, w2
+	eor w14, w14, w3
+	ror w11, w11, #16
+	ror w14, w14, #16
+	add w8, w8, w11
+	add w9, w9, w14
+	eor w7, w7, w8
+	eor w4, w4, w9
+	ror w7, w7, #20
+	ror w4, w4, #20
+	str w6, [sp, #8]
+	add w2, w2, w7
+	add w3, w3, w4
+	eor w11, w11, w2
+	eor w14, w14, w3
+	ror w11, w11, #24
+	ror w14, w14, #24
+	add w8, w8, w11
+	add w9, w9, w14
+	eor w7, w7, w8
+	eor w4, w4, w9
+	ror w7, w7, #25
+	ror w4, w4, #25
+	bne .Lchacha_blocks_neon_rounds2
+	str w8, [sp, #0]
+	str w9, [sp, #4]
+	str w10, [sp, #8]
+	str w12, [sp, #16]
+	str w11, [sp, #20]
+	str w14, [sp, #24]
+	ldr x12, [sp, # STACK_SRC]
+	ldr x14, [sp, # STACK_DST]
+	ldr w8, [sp, #(64 +0)]
+	ldr w9, [sp, #(64 +4)]
+	ldr w10, [sp, #(64 +8)]
+	ldr w11, [sp, #(64 +12)]
+	add w0, w0, w8
+	add w1, w1, w9
+	add w2, w2, w10
+	ldr w8, [sp, #(64 +16)]
+	add w3, w3, w11
+	ldr w9, [sp, #(64 +20)]
+	add w4, w4, w8
+	ldr w10, [sp, #(64 +24)]
+	add w5, w5, w9
+	ldr w11, [sp, #(64 +28)]
+	add w6, w6, w10
+	tst x12, x12
+	add w7, w7, w11
+	beq .Lchacha_blocks_neon_nomessage21
+	LDMIA4(x12, w8, w9, w10, w11)
+	tst x12, x12
+	eor w0, w0, w8
+	eor w1, w1, w9
+	eor w2, w2, w10
+	ldr w8, [x12, #0]
+	eor w3, w3, w11
+	ldr w9, [x12, #4]
+	eor w4, w4, w8
+	ldr w10, [x12, #8]
+	eor w5, w5, w9
+	ldr w11, [x12, #12]
+	eor w6, w6, w10
+	add x12, x12, #16
+	eor w7, w7, w11
+.Lchacha_blocks_neon_nomessage21:
+	mov x16, sp
+	STMIA8(x14, w0, w1, w2, w3, w4, w5, w6, w7)
+	LDMIA8(x16, w0, w1, w2, w3, w4, w5, w6, w7)
+	ldr w8, [sp, #(64 +32)]
+	ldr w9, [sp, #(64 +36)]
+	ldr w10, [sp, #(64 +40)]
+	ldr w11, [sp, #(64 +44)]
+	add w0, w0, w8
+	add w1, w1, w9
+	add w2, w2, w10
+	ldr w8, [sp, #(64 +48)]
+	add w3, w3, w11
+	ldr w9, [sp, #(64 +52)]
+	add w4, w4, w8
+	ldr w10, [sp, #(64 +56)]
+	add w5, w5, w9
+	ldr w11, [sp, #(64 +60)]
+	add w6, w6, w10
+	adds w8, w8, #1
+	add w7, w7, w11
+	adc w9, w9, wzr
+	str w8, [sp, #(64 +48)]
+	tst x12, x12
+	str w9, [sp, #(64 +52)]
+	beq .Lchacha_blocks_neon_nomessage22
+	LDMIA4(x12, w8, w9, w10, w11)
+	tst x12, x12
+	eor w0, w0, w8
+	eor w1, w1, w9
+	eor w2, w2, w10
+	ldr w8, [x12, #0]
+	eor w3, w3, w11
+	ldr w9, [x12, #4]
+	eor w4, w4, w8
+	ldr w10, [x12, #8]
+	eor w5, w5, w9
+	ldr w11, [x12, #12]
+	eor w6, w6, w10
+	add x12, x12, #16
+	eor w7, w7, w11
+.Lchacha_blocks_neon_nomessage22:
+	STMIA8(x14, w0, w1, w2, w3, w4, w5, w6, w7)
+	str x12, [sp, # STACK_SRC]
+	str x14, [sp, # STACK_DST]
+	ldr x3, [sp, # STACK_BYTES]
+	cmp x3, #64
+	sub x4, x3, #64
+	str x4, [sp, # STACK_BYTES]
+	bhi .Lchacha_blocks_neon_mainloop2
+	cmp x3, #64
+	beq .Lchacha_blocks_neon_nocopy2
+	ldr x1, [sp, # STACK_DST_TMP]
+	sub x14, x14, #64
+.Lchacha_blocks_neon_copyinput2:
+	subs x3, x3, #1
+	ldrb w0, [x14], #1
+	strb w0, [x1], #1
+	bne .Lchacha_blocks_neon_copyinput2
+.Lchacha_blocks_neon_nocopy2:
+.Lchacha_blocks_neon_done:
+	ldr x16, [sp, # STACK_SP]
+	ldr x7, [sp, # STACK_STATE]
+	ldr w8, [sp, #(64 +48)]
+	ldr w9, [sp, #(64 +52)]
+	str w8, [x7, #(48 + 0)]
+	str w9, [x7, #(48 + 4)]
+	sub x0, sp, #8
+	mov v8.16b, v16.16b
+	mov v9.16b, v17.16b
+	mov v10.16b, v18.16b
+	mov v11.16b, v19.16b
+	mov sp, x16
+	mov v12.16b, v20.16b
+	mov v13.16b, v21.16b
+	mov v14.16b, v22.16b
+	mov v15.16b, v23.16b
+	sub x0, sp, x0
+	eor v0.16b, v0.16b, v0.16b
+	eor v1.16b, v1.16b, v1.16b
+	eor v2.16b, v2.16b, v2.16b
+	eor v3.16b, v3.16b, v3.16b
+	eor v4.16b, v4.16b, v4.16b
+	eor v5.16b, v5.16b, v5.16b
+	eor v6.16b, v6.16b, v6.16b
+	eor v7.16b, v7.16b, v7.16b
+	ret
+.Lchacha_blocks_neon_nobytes:
+	mov x0, xzr;
+	ret
+.ltorg
+.size _gcry_chacha20_aarch64_blocks,.-_gcry_chacha20_aarch64_blocks;
+
+#endif
diff --git a/cipher/chacha20.c b/cipher/chacha20.c
index 613fa82a..a11986c1 100644
--- a/cipher/chacha20.c
+++ b/cipher/chacha20.c
@@ -81,6 +81,16 @@
 # endif
 #endif /*ENABLE_NEON_SUPPORT*/
 
+/* USE_AARCH64_SIMD indicates whether to enable ARMv8 SIMD assembly
+ * code. */
+#undef USE_AARCH64_SIMD
+#ifdef ENABLE_NEON_SUPPORT
+# if defined(__AARCH64EL__) \
+       && defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) \
+       && defined(HAVE_GCC_INLINE_ASM_AARCH64_NEON)
+#  define USE_AARCH64_SIMD 1
+# endif
+#endif
 
 struct CHACHA20_context_s;
 
@@ -144,6 +154,14 @@ unsigned int _gcry_chacha20_armv7_neon_blocks(u32 *state, const byte *in,
 
 #endif /* USE_NEON */
 
+#ifdef USE_AARCH64_SIMD
+
+unsigned int _gcry_chacha20_aarch64_blocks(u32 *state, const byte *in,
+					   byte *out,
+					   size_t bytes) ASM_FUNC_ABI;
+
+#endif /* USE_AARCH64_SIMD */
+
 
 static void chacha20_setiv (void *context, const byte * iv, size_t ivlen);
 static const char *selftest (void);
@@ -406,6 +424,10 @@ chacha20_do_setkey (CHACHA20_context_t * ctx,
   if (features & HWF_ARM_NEON)
     ctx->blocks = _gcry_chacha20_armv7_neon_blocks;
 #endif
+#ifdef USE_AARCH64_SIMD
+  if (features & HWF_ARM_NEON)
+    ctx->blocks = _gcry_chacha20_aarch64_blocks;
+#endif
 
   (void)features;
 
diff --git a/configure.ac b/configure.ac
index 66e7cd67..1e6ac9d7 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2243,6 +2243,10 @@ if test "$found" = "1" ; then
          GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-ssse3-amd64.lo"
          GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-avx2-amd64.lo"
       ;;
+      aarch64-*-*)
+         # Build with the assembly implementation
+         GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-aarch64.lo"
+      ;;
    esac
 
    if test x"$neonsupport" = xyes ; then



