[PATCH 1/3] chacha20: add ARMv7/NEON implementation
Jussi Kivilinna
jussi.kivilinna at iki.fi
Sun Nov 2 17:52:35 CET 2014
* cipher/Makefile.am: Add 'chacha20-armv7-neon.S'.
* cipher/chacha20-armv7-neon.S: New.
* cipher/chacha20.c (USE_NEON): New.
[USE_NEON] (_gcry_chacha20_armv7_neon_blocks): New.
(chacha20_do_setkey) [USE_NEON]: Use Neon implementation if
HWF_ARM_NEON flag set.
(selftest): Self-test encrypting buffer byte by byte.
* configure.ac [neonsupport=yes]: Add 'chacha20-armv7-neon.lo'.
--
Add Andrew Moon's public domain ARMv7/NEON implementation of ChaCha20. Original
source is available at: https://github.com/floodyberry/chacha-opt
Benchmark on Cortex-A8 (--cpu-mhz 1008):
Old:
CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte
STREAM enc | 13.45 ns/B 70.92 MiB/s 13.56 c/B
STREAM dec | 13.45 ns/B 70.90 MiB/s 13.56 c/B
New:
CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte
STREAM enc | 6.20 ns/B 153.9 MiB/s 6.25 c/B
STREAM dec | 6.20 ns/B 153.9 MiB/s 6.25 c/B
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
cipher/Makefile.am | 1
cipher/chacha20-armv7-neon.S | 710 ++++++++++++++++++++++++++++++++++++++++++
cipher/chacha20.c | 34 ++
configure.ac | 5
4 files changed, 750 insertions(+)
create mode 100644 cipher/chacha20-armv7-neon.S
diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 7f45cbb..09ccaf9 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -61,6 +61,7 @@ arcfour.c arcfour-amd64.S \
blowfish.c blowfish-amd64.S blowfish-arm.S \
cast5.c cast5-amd64.S cast5-arm.S \
chacha20.c chacha20-sse2-amd64.S chacha20-ssse3-amd64.S chacha20-avx2-amd64.S \
+ chacha20-armv7-neon.S \
crc.c \
des.c des-amd64.S \
dsa.c \
diff --git a/cipher/chacha20-armv7-neon.S b/cipher/chacha20-armv7-neon.S
new file mode 100644
index 0000000..1a395ba
--- /dev/null
+++ b/cipher/chacha20-armv7-neon.S
@@ -0,0 +1,710 @@
+/* chacha20-armv7-neon.S - ARM/NEON accelerated chacha20 blocks function
+ *
+ * Copyright (C) 2014 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * Based on public domain implementation by Andrew Moon at
+ * https://github.com/floodyberry/chacha-opt
+ */
+
+#include <config.h>
+
+#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) && \
+ defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \
+ defined(HAVE_GCC_INLINE_ASM_NEON) && defined(USE_CHACHA20)
+
+.syntax unified
+.fpu neon
+.arm
+
+.text
+
+.globl _gcry_chacha20_armv7_neon_blocks
+.type _gcry_chacha20_armv7_neon_blocks,%function;
+_gcry_chacha20_armv7_neon_blocks:
+.Lchacha_blocks_neon_local:
+ tst r3, r3 /* r3 = byte count; return 0 right away when it is zero */
+ beq .Lchacha_blocks_neon_nobytes
+ vstmdb sp!, {q4,q5,q6,q7} /* save q4-q7 (restored at exit) */
+ stmfd sp!, {r4-r12, r14} /* save r4-r12 and lr (restored at exit) */
+ mov r8, sp /* remember sp before the scratch area is carved out */
+ sub sp, sp, #196 /* reserve at least 196 bytes of scratch ... */
+ and sp, sp, #0xffffffe0 /* ... and round sp down to a 32-byte boundary */
+ str r0, [sp, #60] /* [sp, #60] = state pointer (counter is written back through it) */
+ str r1, [sp, #48] /* [sp, #48] = input pointer (tested for NULL later) */
+ str r2, [sp, #40] /* [sp, #40] = output pointer */
+ str r3, [sp, #52] /* [sp, #52] = bytes left */
+ str r8, [sp, #192] /* [sp, #192] = pre-alignment sp */
+ add r1, sp, #64 /* copy the 16 state words to [sp, #64 .. #127] */
+ ldmia r0!, {r4-r11}
+ stmia r1!, {r4-r11}
+ ldmia r0!, {r4-r11}
+ stmia r1!, {r4-r11}
+ mov r4, #20 /* [sp, #44] = number of rounds (20) */
+ str r4, [sp, #44]
+ cmp r3, #256 /* fewer than 256 bytes: use the one-block scalar loop */
+ blo .Lchacha_blocks_neon_mainloop2
+.Lchacha_blocks_neon_mainloop1:
+ ldr r0, [sp, #44] /* [sp, #0] = rounds left, reloaded for every 256-byte pass */
+ str r0, [sp, #0]
+ add r1, sp, #(64)
+ mov r2, #1
+ veor q12, q12 /* q12 = 0 */
+ vld1.32 {q0,q1}, [r1,:128]! /* q0-q3 = state copy, first NEON block */
+ vld1.32 {q2,q3}, [r1,:128]
+ vmov.32 d24[0], r2 /* q12 = 1 in its lowest lane */
+ vadd.u64 q3, q3, q12 /* first NEON block: 64-bit value in words 12-13 + 1 */
+ vmov q4, q0 /* q4-q7 = second NEON block */
+ vmov q5, q1
+ vmov q6, q2
+ vadd.u64 q7, q3, q12 /* second NEON block: words 12-13 + 2 */
+ vmov q8, q0 /* q8-q11 = third NEON block */
+ vmov q9, q1
+ vmov q10, q2
+ vadd.u64 q11, q7, q12 /* third NEON block: words 12-13 + 3 */
+ add r0, sp, #64
+ ldm r0, {r0-r12} /* scalar block: r0-r12 = state words 0-12 (unmodified counter) */
+ ldr r14, [sp, #(64 +60)] /* word 15 */
+ str r6, [sp, #8] /* spill word 6; r6 doubles as the round counter */
+ str r11, [sp, #12] /* spill word 11 */
+ str r14, [sp, #28] /* spill word 15 */
+ ldr r11, [sp, #(64 +52)] /* r11 = word 13 */
+ ldr r14, [sp, #(64 +56)] /* r14 = word 14 */
+.Lchacha_blocks_neon_rounds1:
+ ldr r6, [sp, #0] /* r6 = rounds left (word 6 is spilled at [sp, #8]) */
+ vadd.i32 q0, q0, q1
+ add r0, r0, r4 /* scalar: words (0,4,8,12) and (1,5,9,13), interleaved with NEON */
+ vadd.i32 q4, q4, q5
+ add r1, r1, r5
+ vadd.i32 q8, q8, q9
+ eor r12, r12, r0
+ veor q12, q3, q0
+ eor r11, r11, r1
+ veor q13, q7, q4
+ ror r12, r12, #16
+ veor q14, q11, q8
+ ror r11, r11, #16
+ vrev32.16 q3, q12 /* swapping 16-bit halves = rotate each word by 16 */
+ subs r6, r6, #2 /* two rounds per pass; flags are consumed by the bne at the loop end */
+ vrev32.16 q7, q13
+ add r8, r8, r12
+ vrev32.16 q11, q14
+ add r9, r9, r11
+ vadd.i32 q2, q2, q3
+ eor r4, r4, r8
+ vadd.i32 q6, q6, q7
+ eor r5, r5, r9
+ vadd.i32 q10, q10, q11
+ str r6, [sp, #0] /* save updated round counter */
+ veor q12, q1, q2
+ ror r4, r4, #20 /* rotate right 20 = rotate left 12 */
+ veor q13, q5, q6
+ ror r5, r5, #20
+ veor q14, q9, q10
+ add r0, r0, r4
+ vshl.i32 q1, q12, #12 /* vshl + vsri pair = rotate left 12 */
+ add r1, r1, r5
+ vshl.i32 q5, q13, #12
+ ldr r6, [sp, #8] /* r6 = word 6 again */
+ vshl.i32 q9, q14, #12
+ eor r12, r12, r0
+ vsri.u32 q1, q12, #20
+ eor r11, r11, r1
+ vsri.u32 q5, q13, #20
+ ror r12, r12, #24 /* rotate left 8 */
+ vsri.u32 q9, q14, #20
+ ror r11, r11, #24
+ vadd.i32 q0, q0, q1
+ add r8, r8, r12
+ vadd.i32 q4, q4, q5
+ add r9, r9, r11
+ vadd.i32 q8, q8, q9
+ eor r4, r4, r8
+ veor q12, q3, q0
+ eor r5, r5, r9
+ veor q13, q7, q4
+ str r11, [sp, #20] /* spill word 13 */
+ veor q14, q11, q8
+ ror r4, r4, #25 /* rotate left 7 */
+ vshl.i32 q3, q12, #8
+ ror r5, r5, #25
+ vshl.i32 q7, q13, #8
+ str r4, [sp, #4] /* spill word 4 */
+ vshl.i32 q11, q14, #8
+ ldr r4, [sp, #28] /* r4 = word 15 */
+ vsri.u32 q3, q12, #24
+ add r2, r2, r6 /* scalar: words (2,6,10,14) and (3,7,11,15) */
+ vsri.u32 q7, q13, #24
+ add r3, r3, r7
+ vsri.u32 q11, q14, #24
+ ldr r11, [sp, #12] /* r11 = word 11 */
+ vadd.i32 q2, q2, q3
+ eor r14, r14, r2
+ vadd.i32 q6, q6, q7
+ eor r4, r4, r3
+ vadd.i32 q10, q10, q11
+ ror r14, r14, #16
+ veor q12, q1, q2
+ ror r4, r4, #16
+ veor q13, q5, q6
+ add r10, r10, r14
+ veor q14, q9, q10
+ add r11, r11, r4
+ vshl.i32 q1, q12, #7
+ eor r6, r6, r10
+ vshl.i32 q5, q13, #7
+ eor r7, r7, r11
+ vshl.i32 q9, q14, #7
+ ror r6, r6, #20
+ vsri.u32 q1, q12, #25
+ ror r7, r7, #20
+ vsri.u32 q5, q13, #25
+ add r2, r2, r6
+ vsri.u32 q9, q14, #25
+ add r3, r3, r7
+ vext.32 q3, q3, q3, #3 /* rotate lanes of rows 1-3 so the next pass mixes diagonals */
+ eor r14, r14, r2
+ vext.32 q7, q7, q7, #3
+ eor r4, r4, r3
+ vext.32 q11, q11, q11, #3
+ ror r14, r14, #24
+ vext.32 q1, q1, q1, #1
+ ror r4, r4, #24
+ vext.32 q5, q5, q5, #1
+ add r10, r10, r14
+ vext.32 q9, q9, q9, #1
+ add r11, r11, r4
+ vext.32 q2, q2, q2, #2
+ eor r6, r6, r10
+ vext.32 q6, q6, q6, #2
+ eor r7, r7, r11
+ vext.32 q10, q10, q10, #2
+ ror r6, r6, #25
+ vadd.i32 q0, q0, q1
+ ror r7, r7, #25
+ vadd.i32 q4, q4, q5
+ add r0, r0, r5 /* scalar: words (0,5,10,15) and (1,6,11,12) */
+ vadd.i32 q8, q8, q9
+ add r1, r1, r6
+ veor q12, q3, q0
+ eor r4, r4, r0
+ veor q13, q7, q4
+ eor r12, r12, r1
+ veor q14, q11, q8
+ ror r4, r4, #16
+ vrev32.16 q3, q12
+ ror r12, r12, #16
+ vrev32.16 q7, q13
+ add r10, r10, r4
+ vrev32.16 q11, q14
+ add r11, r11, r12
+ vadd.i32 q2, q2, q3
+ eor r5, r5, r10
+ vadd.i32 q6, q6, q7
+ eor r6, r6, r11
+ vadd.i32 q10, q10, q11
+ ror r5, r5, #20
+ veor q12, q1, q2
+ ror r6, r6, #20
+ veor q13, q5, q6
+ add r0, r0, r5
+ veor q14, q9, q10
+ add r1, r1, r6
+ vshl.i32 q1, q12, #12
+ eor r4, r4, r0
+ vshl.i32 q5, q13, #12
+ eor r12, r12, r1
+ vshl.i32 q9, q14, #12
+ ror r4, r4, #24
+ vsri.u32 q1, q12, #20
+ ror r12, r12, #24
+ vsri.u32 q5, q13, #20
+ add r10, r10, r4
+ vsri.u32 q9, q14, #20
+ add r11, r11, r12
+ vadd.i32 q0, q0, q1
+ eor r5, r5, r10
+ vadd.i32 q4, q4, q5
+ eor r6, r6, r11
+ vadd.i32 q8, q8, q9
+ str r11, [sp, #12] /* spill word 11 */
+ veor q12, q3, q0
+ ror r5, r5, #25
+ veor q13, q7, q4
+ ror r6, r6, #25
+ veor q14, q11, q8
+ str r4, [sp, #28] /* spill word 15 */
+ vshl.i32 q3, q12, #8
+ ldr r4, [sp, #4] /* r4 = word 4 */
+ vshl.i32 q7, q13, #8
+ add r2, r2, r7 /* scalar: words (2,7,8,13) and (3,4,9,14) */
+ vshl.i32 q11, q14, #8
+ add r3, r3, r4
+ vsri.u32 q3, q12, #24
+ ldr r11, [sp, #20] /* r11 = word 13 */
+ vsri.u32 q7, q13, #24
+ eor r11, r11, r2
+ vsri.u32 q11, q14, #24
+ eor r14, r14, r3
+ vadd.i32 q2, q2, q3
+ ror r11, r11, #16
+ vadd.i32 q6, q6, q7
+ ror r14, r14, #16
+ vadd.i32 q10, q10, q11
+ add r8, r8, r11
+ veor q12, q1, q2
+ add r9, r9, r14
+ veor q13, q5, q6
+ eor r7, r7, r8
+ veor q14, q9, q10
+ eor r4, r4, r9
+ vshl.i32 q1, q12, #7
+ ror r7, r7, #20
+ vshl.i32 q5, q13, #7
+ ror r4, r4, #20
+ vshl.i32 q9, q14, #7
+ str r6, [sp, #8] /* spill word 6 before r6 is reused as the round counter */
+ vsri.u32 q1, q12, #25
+ add r2, r2, r7
+ vsri.u32 q5, q13, #25
+ add r3, r3, r4
+ vsri.u32 q9, q14, #25
+ eor r11, r11, r2
+ vext.32 q3, q3, q3, #1 /* undo the lane rotation of rows 1-3 */
+ eor r14, r14, r3
+ vext.32 q7, q7, q7, #1
+ ror r11, r11, #24
+ vext.32 q11, q11, q11, #1
+ ror r14, r14, #24
+ vext.32 q1, q1, q1, #3
+ add r8, r8, r11
+ vext.32 q5, q5, q5, #3
+ add r9, r9, r14
+ vext.32 q9, q9, q9, #3
+ eor r7, r7, r8
+ vext.32 q2, q2, q2, #2
+ eor r4, r4, r9
+ vext.32 q6, q6, q6, #2
+ ror r7, r7, #25
+ vext.32 q10, q10, q10, #2
+ ror r4, r4, #25
+ bne .Lchacha_blocks_neon_rounds1 /* loop while the subs above left a non-zero count */
+ str r8, [sp, #0] /* park scalar words 8-15 at [sp, #0 .. #31] ... */
+ str r9, [sp, #4]
+ str r10, [sp, #8]
+ str r12, [sp, #16]
+ str r11, [sp, #20] /* r11 holds word 13 here */
+ str r14, [sp, #24] /* ... words 11 and 15 were already spilled to #12 and #28 */
+ add r9, sp, #64
+ vld1.32 {q12,q13}, [r9,:128]! /* q12-q15 = state copy for the feed-forward additions */
+ ldr r12, [sp, #48] /* r12 = input pointer */
+ vld1.32 {q14,q15}, [r9,:128]
+ ldr r14, [sp, #40] /* r14 = output pointer */
+ vadd.i32 q0, q0, q12
+ ldr r8, [sp, #(64 +0)]
+ vadd.i32 q4, q4, q12
+ ldr r9, [sp, #(64 +4)]
+ vadd.i32 q8, q8, q12
+ ldr r10, [sp, #(64 +8)]
+ vadd.i32 q1, q1, q13
+ ldr r11, [sp, #(64 +12)]
+ vadd.i32 q5, q5, q13
+ add r0, r0, r8 /* scalar block: add state words 0-7 to r0-r7 */
+ vadd.i32 q9, q9, q13
+ add r1, r1, r9
+ vadd.i32 q2, q2, q14
+ add r2, r2, r10
+ vadd.i32 q6, q6, q14
+ ldr r8, [sp, #(64 +16)]
+ vadd.i32 q10, q10, q14
+ add r3, r3, r11
+ veor q14, q14, q14 /* q14 = 0, then 1 in its lowest lane below */
+ ldr r9, [sp, #(64 +20)]
+ mov r11, #1
+ add r4, r4, r8
+ vmov.32 d28[0], r11
+ ldr r10, [sp, #(64 +24)]
+ vadd.u64 q12, q14, q15 /* q12 = last state row with words 12-13 + 1 */
+ add r5, r5, r9
+ vadd.u64 q13, q14, q12 /* q13 = ... + 2 */
+ ldr r11, [sp, #(64 +28)]
+ vadd.u64 q14, q14, q13 /* q14 = ... + 3 */
+ add r6, r6, r10
+ vadd.i32 q3, q3, q12
+ tst r12, r12 /* NULL input pointer: skip the XOR with input */
+ vadd.i32 q7, q7, q13
+ add r7, r7, r11
+ vadd.i32 q11, q11, q14
+ beq .Lchacha_blocks_neon_nomessage11
+ ldmia r12!, {r8-r11} /* XOR the first 32 input bytes into words 0-7 */
+ eor r0, r0, r8
+ eor r1, r1, r9
+ eor r2, r2, r10
+ ldr r8, [r12, #0]
+ eor r3, r3, r11
+ ldr r9, [r12, #4]
+ eor r4, r4, r8
+ ldr r10, [r12, #8]
+ eor r5, r5, r9
+ ldr r11, [r12, #12]
+ eor r6, r6, r10
+ add r12, r12, #16
+ eor r7, r7, r11
+.Lchacha_blocks_neon_nomessage11:
+ stmia r14!, {r0-r7} /* store the first 32 bytes of the scalar block */
+ ldm sp, {r0-r7} /* r0-r7 = parked words 8-15 */
+ ldr r8, [sp, #(64 +32)]
+ ldr r9, [sp, #(64 +36)]
+ ldr r10, [sp, #(64 +40)]
+ ldr r11, [sp, #(64 +44)]
+ add r0, r0, r8 /* add state words 8-15 */
+ add r1, r1, r9
+ add r2, r2, r10
+ ldr r8, [sp, #(64 +48)]
+ add r3, r3, r11
+ ldr r9, [sp, #(64 +52)]
+ add r4, r4, r8
+ ldr r10, [sp, #(64 +56)]
+ add r5, r5, r9
+ ldr r11, [sp, #(64 +60)]
+ add r6, r6, r10
+ adds r8, r8, #4 /* advance the 64-bit value in words 12-13 by 4 (four blocks done) */
+ add r7, r7, r11
+ adc r9, r9, #0
+ str r8, [sp, #(64 +48)]
+ tst r12, r12 /* flags from this test also drive the three beq further down */
+ str r9, [sp, #(64 +52)]
+ beq .Lchacha_blocks_neon_nomessage12
+ ldmia r12!, {r8-r11} /* XOR the next 32 input bytes into words 8-15 */
+ eor r0, r0, r8
+ eor r1, r1, r9
+ eor r2, r2, r10
+ ldr r8, [r12, #0]
+ eor r3, r3, r11
+ ldr r9, [r12, #4]
+ eor r4, r4, r8
+ ldr r10, [r12, #8]
+ eor r5, r5, r9
+ ldr r11, [r12, #12]
+ eor r6, r6, r10
+ add r12, r12, #16
+ eor r7, r7, r11
+.Lchacha_blocks_neon_nomessage12:
+ stmia r14!, {r0-r7} /* store the second 32 bytes of the scalar block */
+ beq .Lchacha_blocks_neon_nomessage13
+ vld1.32 {q12,q13}, [r12]! /* XOR 64 input bytes into the first NEON block */
+ vld1.32 {q14,q15}, [r12]!
+ veor q0, q0, q12
+ veor q1, q1, q13
+ veor q2, q2, q14
+ veor q3, q3, q15
+.Lchacha_blocks_neon_nomessage13:
+ vst1.32 {q0,q1}, [r14]!
+ vst1.32 {q2,q3}, [r14]!
+ beq .Lchacha_blocks_neon_nomessage14
+ vld1.32 {q12,q13}, [r12]! /* XOR 64 input bytes into the second NEON block */
+ vld1.32 {q14,q15}, [r12]!
+ veor q4, q4, q12
+ veor q5, q5, q13
+ veor q6, q6, q14
+ veor q7, q7, q15
+.Lchacha_blocks_neon_nomessage14:
+ vst1.32 {q4,q5}, [r14]!
+ vst1.32 {q6,q7}, [r14]!
+ beq .Lchacha_blocks_neon_nomessage15
+ vld1.32 {q12,q13}, [r12]! /* XOR 64 input bytes into the third NEON block */
+ vld1.32 {q14,q15}, [r12]!
+ veor q8, q8, q12
+ veor q9, q9, q13
+ veor q10, q10, q14
+ veor q11, q11, q15
+.Lchacha_blocks_neon_nomessage15:
+ vst1.32 {q8,q9}, [r14]!
+ vst1.32 {q10,q11}, [r14]!
+ str r12, [sp, #48] /* save advanced input and output pointers */
+ str r14, [sp, #40]
+ ldr r3, [sp, #52]
+ sub r3, r3, #256 /* 256 bytes consumed */
+ cmp r3, #256
+ str r3, [sp, #52]
+ bhs .Lchacha_blocks_neon_mainloop1 /* at least 256 bytes left: another four-block pass */
+ tst r3, r3
+ beq .Lchacha_blocks_neon_done /* nothing left; otherwise fall into the one-block loop */
+.Lchacha_blocks_neon_mainloop2:
+ ldr r3, [sp, #52] /* r3 = bytes left */
+ ldr r1, [sp, #48] /* r1 = input pointer */
+ cmp r3, #64
+ bhs .Lchacha_blocks_neon_noswap1 /* a full 64-byte block: work on the caller's buffers */
+ add r4, sp, #128 /* partial block: use the scratch buffer at [sp, #128] */
+ mov r5, r4
+ tst r1, r1
+ beq .Lchacha_blocks_neon_nocopy1 /* NULL input: nothing to copy in */
+.Lchacha_blocks_neon_copyinput1:
+ subs r3, r3, #1 /* copy the remaining input bytes into the scratch buffer */
+ ldrb r0, [r1], #1
+ strb r0, [r4], #1
+ bne .Lchacha_blocks_neon_copyinput1
+ str r5, [sp, #48] /* input pointer now refers to the scratch buffer */
+.Lchacha_blocks_neon_nocopy1:
+ ldr r4, [sp, #40]
+ str r5, [sp, #40] /* output goes to the scratch buffer ... */
+ str r4, [sp, #56] /* ... and the real output pointer is kept at [sp, #56] */
+.Lchacha_blocks_neon_noswap1:
+ ldr r0, [sp, #44] /* [sp, #0] = rounds left */
+ str r0, [sp, #0]
+ add r0, sp, #64
+ ldm r0, {r0-r12} /* r0-r12 = state words 0-12 */
+ ldr r14, [sp, #(64 +60)] /* word 15 */
+ str r6, [sp, #8] /* spill word 6; r6 doubles as the round counter */
+ str r11, [sp, #12] /* spill word 11 */
+ str r14, [sp, #28] /* spill word 15 */
+ ldr r11, [sp, #(64 +52)] /* r11 = word 13 */
+ ldr r14, [sp, #(64 +56)] /* r14 = word 14 */
+.Lchacha_blocks_neon_rounds2:
+ ldr r6, [sp, #0] /* r6 = rounds left (word 6 is spilled at [sp, #8]) */
+ add r0, r0, r4 /* words (0,4,8,12) and (1,5,9,13) */
+ add r1, r1, r5
+ eor r12, r12, r0
+ eor r11, r11, r1
+ ror r12, r12, #16
+ ror r11, r11, #16
+ subs r6, r6, #2 /* two rounds per pass; flags are consumed by the bne at the loop end */
+ add r8, r8, r12
+ add r9, r9, r11
+ eor r4, r4, r8
+ eor r5, r5, r9
+ str r6, [sp, #0] /* save updated round counter */
+ ror r4, r4, #20 /* rotate right 20 = rotate left 12 */
+ ror r5, r5, #20
+ add r0, r0, r4
+ add r1, r1, r5
+ ldr r6, [sp, #8] /* r6 = word 6 again */
+ eor r12, r12, r0
+ eor r11, r11, r1
+ ror r12, r12, #24 /* rotate left 8 */
+ ror r11, r11, #24
+ add r8, r8, r12
+ add r9, r9, r11
+ eor r4, r4, r8
+ eor r5, r5, r9
+ str r11, [sp, #20] /* spill word 13 */
+ ror r4, r4, #25 /* rotate left 7 */
+ ror r5, r5, #25
+ str r4, [sp, #4] /* spill word 4 */
+ ldr r4, [sp, #28] /* r4 = word 15 */
+ add r2, r2, r6 /* words (2,6,10,14) and (3,7,11,15) */
+ add r3, r3, r7
+ ldr r11, [sp, #12] /* r11 = word 11 */
+ eor r14, r14, r2
+ eor r4, r4, r3
+ ror r14, r14, #16
+ ror r4, r4, #16
+ add r10, r10, r14
+ add r11, r11, r4
+ eor r6, r6, r10
+ eor r7, r7, r11
+ ror r6, r6, #20
+ ror r7, r7, #20
+ add r2, r2, r6
+ add r3, r3, r7
+ eor r14, r14, r2
+ eor r4, r4, r3
+ ror r14, r14, #24
+ ror r4, r4, #24
+ add r10, r10, r14
+ add r11, r11, r4
+ eor r6, r6, r10
+ eor r7, r7, r11
+ ror r6, r6, #25
+ ror r7, r7, #25
+ add r0, r0, r5 /* words (0,5,10,15) and (1,6,11,12) */
+ add r1, r1, r6
+ eor r4, r4, r0
+ eor r12, r12, r1
+ ror r4, r4, #16
+ ror r12, r12, #16
+ add r10, r10, r4
+ add r11, r11, r12
+ eor r5, r5, r10
+ eor r6, r6, r11
+ ror r5, r5, #20
+ ror r6, r6, #20
+ add r0, r0, r5
+ add r1, r1, r6
+ eor r4, r4, r0
+ eor r12, r12, r1
+ ror r4, r4, #24
+ ror r12, r12, #24
+ add r10, r10, r4
+ add r11, r11, r12
+ eor r5, r5, r10
+ eor r6, r6, r11
+ str r11, [sp, #12] /* spill word 11 */
+ ror r5, r5, #25
+ ror r6, r6, #25
+ str r4, [sp, #28] /* spill word 15 */
+ ldr r4, [sp, #4] /* r4 = word 4 */
+ add r2, r2, r7 /* words (2,7,8,13) and (3,4,9,14) */
+ add r3, r3, r4
+ ldr r11, [sp, #20] /* r11 = word 13 */
+ eor r11, r11, r2
+ eor r14, r14, r3
+ ror r11, r11, #16
+ ror r14, r14, #16
+ add r8, r8, r11
+ add r9, r9, r14
+ eor r7, r7, r8
+ eor r4, r4, r9
+ ror r7, r7, #20
+ ror r4, r4, #20
+ str r6, [sp, #8] /* spill word 6 before r6 is reused as the round counter */
+ add r2, r2, r7
+ add r3, r3, r4
+ eor r11, r11, r2
+ eor r14, r14, r3
+ ror r11, r11, #24
+ ror r14, r14, #24
+ add r8, r8, r11
+ add r9, r9, r14
+ eor r7, r7, r8
+ eor r4, r4, r9
+ ror r7, r7, #25
+ ror r4, r4, #25
+ bne .Lchacha_blocks_neon_rounds2 /* loop while the subs above left a non-zero count */
+ str r8, [sp, #0] /* park words 8-15 at [sp, #0 .. #31] ... */
+ str r9, [sp, #4]
+ str r10, [sp, #8]
+ str r12, [sp, #16]
+ str r11, [sp, #20] /* r11 holds word 13 here */
+ str r14, [sp, #24] /* ... words 11 and 15 were already spilled to #12 and #28 */
+ ldr r12, [sp, #48] /* r12 = input pointer */
+ ldr r14, [sp, #40] /* r14 = output pointer */
+ ldr r8, [sp, #(64 +0)]
+ ldr r9, [sp, #(64 +4)]
+ ldr r10, [sp, #(64 +8)]
+ ldr r11, [sp, #(64 +12)]
+ add r0, r0, r8 /* add state words 0-7 to r0-r7 */
+ add r1, r1, r9
+ add r2, r2, r10
+ ldr r8, [sp, #(64 +16)]
+ add r3, r3, r11
+ ldr r9, [sp, #(64 +20)]
+ add r4, r4, r8
+ ldr r10, [sp, #(64 +24)]
+ add r5, r5, r9
+ ldr r11, [sp, #(64 +28)]
+ add r6, r6, r10
+ tst r12, r12 /* NULL input pointer: skip the XOR with input */
+ add r7, r7, r11
+ beq .Lchacha_blocks_neon_nomessage21
+ ldmia r12!, {r8-r11} /* XOR the first 32 input bytes into words 0-7 */
+ eor r0, r0, r8
+ eor r1, r1, r9
+ eor r2, r2, r10
+ ldr r8, [r12, #0]
+ eor r3, r3, r11
+ ldr r9, [r12, #4]
+ eor r4, r4, r8
+ ldr r10, [r12, #8]
+ eor r5, r5, r9
+ ldr r11, [r12, #12]
+ eor r6, r6, r10
+ add r12, r12, #16
+ eor r7, r7, r11
+.Lchacha_blocks_neon_nomessage21:
+ stmia r14!, {r0-r7} /* store the first 32 bytes of the block */
+ ldm sp, {r0-r7} /* r0-r7 = parked words 8-15 */
+ ldr r8, [sp, #(64 +32)]
+ ldr r9, [sp, #(64 +36)]
+ ldr r10, [sp, #(64 +40)]
+ ldr r11, [sp, #(64 +44)]
+ add r0, r0, r8 /* add state words 8-15 */
+ add r1, r1, r9
+ add r2, r2, r10
+ ldr r8, [sp, #(64 +48)]
+ add r3, r3, r11
+ ldr r9, [sp, #(64 +52)]
+ add r4, r4, r8
+ ldr r10, [sp, #(64 +56)]
+ add r5, r5, r9
+ ldr r11, [sp, #(64 +60)]
+ add r6, r6, r10
+ adds r8, r8, #1 /* advance the 64-bit value in words 12-13 by 1 (one block done) */
+ add r7, r7, r11
+ adc r9, r9, #0
+ str r8, [sp, #(64 +48)]
+ tst r12, r12
+ str r9, [sp, #(64 +52)]
+ beq .Lchacha_blocks_neon_nomessage22
+ ldmia r12!, {r8-r11} /* XOR the next 32 input bytes into words 8-15 */
+ eor r0, r0, r8
+ eor r1, r1, r9
+ eor r2, r2, r10
+ ldr r8, [r12, #0]
+ eor r3, r3, r11
+ ldr r9, [r12, #4]
+ eor r4, r4, r8
+ ldr r10, [r12, #8]
+ eor r5, r5, r9
+ ldr r11, [r12, #12]
+ eor r6, r6, r10
+ add r12, r12, #16
+ eor r7, r7, r11
+.Lchacha_blocks_neon_nomessage22:
+ stmia r14!, {r0-r7} /* store the second 32 bytes of the block */
+ str r12, [sp, #48] /* save advanced input and output pointers */
+ str r14, [sp, #40]
+ ldr r3, [sp, #52]
+ cmp r3, #64 /* flags come from the count before the subtraction */
+ sub r4, r3, #64
+ str r4, [sp, #52]
+ bhi .Lchacha_blocks_neon_mainloop2 /* more than 64 bytes were left: another block */
+ cmp r3, #64
+ beq .Lchacha_blocks_neon_nocopy2 /* exactly 64: output already went to the caller's buffer */
+ ldr r1, [sp, #56] /* partial block: r1 = real output pointer */
+ sub r14, r14, #64 /* r14 = start of the scratch buffer just written */
+.Lchacha_blocks_neon_copyinput2:
+ subs r3, r3, #1 /* copy only the requested bytes from scratch to the output */
+ ldrb r0, [r14], #1
+ strb r0, [r1], #1
+ bne .Lchacha_blocks_neon_copyinput2
+.Lchacha_blocks_neon_nocopy2:
+.Lchacha_blocks_neon_done:
+ ldr r7, [sp, #60] /* r7 = caller's state pointer */
+ ldr r8, [sp, #(64 +48)] /* write the advanced words 12-13 back to the caller's state */
+ ldr r9, [sp, #(64 +52)]
+ str r8, [r7, #(48 + 0)]
+ str r9, [r7, #(48 + 4)]
+ mov r12, sp
+ stmia r12!, {r0-r7} /* overwrite [sp, #0 .. #31] with the current r0-r7 contents */
+ add r12, r12, #48
+ stmia r12!, {r0-r7} /* overwrite [sp, #80 .. #111] (state copy words 4-11) likewise */
+ sub r0, sp, #8
+ ldr sp, [sp, #192] /* back to the pre-alignment sp */
+ ldmfd sp!, {r4-r12, r14}
+ vldm sp!, {q4-q7}
+ sub r0, sp, r0 /* return caller's sp minus (scratch sp - 8); presumably a stack burn depth - confirm in callers */
+ bx lr
+.Lchacha_blocks_neon_nobytes:
+ mov r0, #0; /* zero-length request: return 0 without touching the stack */
+ bx lr
+.ltorg
+.size _gcry_chacha20_armv7_neon_blocks,.-_gcry_chacha20_armv7_neon_blocks;
+
+#endif
diff --git a/cipher/chacha20.c b/cipher/chacha20.c
index ebba2fc..c1847aa 100644
--- a/cipher/chacha20.c
+++ b/cipher/chacha20.c
@@ -67,6 +67,16 @@
# define USE_AVX2 1
#endif
+/* USE_NEON indicates whether to enable ARM NEON assembly code.  It is
+ * defined only when ENABLE_NEON_SUPPORT is set and the HAVE_ARM_ARCH_V6,
+ * __ARMEL__, HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS and HAVE_GCC_INLINE_ASM_NEON
+ * checks that also guard chacha20-armv7-neon.S all hold.  */
+#undef USE_NEON
+#ifdef ENABLE_NEON_SUPPORT
+# if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \
+ && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \
+ && defined(HAVE_GCC_INLINE_ASM_NEON)
+# define USE_NEON 1
+# endif
+#endif /*ENABLE_NEON_SUPPORT*/
+
struct CHACHA20_context_s;
@@ -104,6 +114,13 @@ unsigned int _gcry_chacha20_amd64_avx2_blocks(u32 *state, const byte *in,
#endif /* USE_AVX2 */
+#ifdef USE_NEON
+
+/* Implemented in chacha20-armv7-neon.S.  The assembly skips the XOR with
+ * the input when IN is NULL and returns 0 when BYTES is 0.  Otherwise the
+ * return value is derived from the stack space the routine used (presumably
+ * a stack burn depth - confirm against the users of ctx->blocks).  */
+unsigned int _gcry_chacha20_armv7_neon_blocks(u32 *state, const byte *in,
+ byte *out, size_t bytes);
+
+#endif /* USE_NEON */
+
static void chacha20_setiv (void *context, const byte * iv, size_t ivlen);
static const char *selftest (void);
@@ -353,6 +370,10 @@ chacha20_do_setkey (CHACHA20_context_t * ctx,
if (features & HWF_INTEL_AVX2)
ctx->blocks = _gcry_chacha20_amd64_avx2_blocks;
#endif
+#ifdef USE_NEON
+ if (features & HWF_ARM_NEON)
+ ctx->blocks = _gcry_chacha20_armv7_neon_blocks;
+#endif
(void)features;
@@ -541,6 +562,19 @@ selftest (void)
if (buf[i] != (byte) i)
return "ChaCha20 encryption test 2 failed.";
+ chacha20_setkey (&ctx, key_1, sizeof key_1);
+ chacha20_setiv (&ctx, nonce_1, sizeof nonce_1);
+ /* encrypt */
+ for (i = 0; i < sizeof buf; i++)
+ chacha20_encrypt_stream (&ctx, &buf[i], &buf[i], 1);
+ /* decrypt */
+ chacha20_setkey (&ctx, key_1, sizeof key_1);
+ chacha20_setiv (&ctx, nonce_1, sizeof nonce_1);
+ chacha20_encrypt_stream (&ctx, buf, buf, sizeof buf);
+ for (i = 0; i < sizeof buf; i++)
+ if (buf[i] != (byte) i)
+ return "ChaCha20 encryption test 3 failed.";
+
return NULL;
}
diff --git a/configure.ac b/configure.ac
index d14b7f6..60ed015 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1822,6 +1822,11 @@ if test "$found" = "1" ; then
GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-avx2-amd64.lo"
;;
esac
+
+ if test x"$neonsupport" = xyes ; then
+ # Build with the NEON implementation
+ GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-armv7-neon.lo"
+ fi
fi
case "${host}" in
More information about the Gcrypt-devel
mailing list