[git] GCRYPT - branch, master, updated. libgcrypt-1.6.0-125-g95eef21
by Jussi Kivilinna
cvs at cvs.gnupg.org
Wed Nov 5 17:13:44 CET 2014
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "The GNU crypto library".
The branch, master has been updated
via 95eef21583d8e998efc48f22898c1ae31b77cb48 (commit)
via 0b520128551054d83fb0bb2db8873394f38de498 (commit)
via c584f44543883346d5a565581ff99a0afce9c5e1 (commit)
from 669a83ba86c38b271d85ed4bf1cabc7cc8160583 (commit)
Those revisions listed above that are new to this repository have
not appeared in any other notification email, so we list those
revisions in full below.
- Log -----------------------------------------------------------------
commit 95eef21583d8e998efc48f22898c1ae31b77cb48
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date: Sun Nov 2 17:45:35 2014 +0200
Disable NEON for CPUs that are known to have a broken NEON implementation
* src/hwf-arm.c (detect_arm_proc_cpuinfo): Add parsing for CPU version
information and check whether the CPU is known to have a broken NEON
implementation.
(_gcry_hwf_detect_arm): Filter out broken HW features.
--
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
diff --git a/src/hwf-arm.c b/src/hwf-arm.c
index dbbb607..3dc050e 100644
--- a/src/hwf-arm.c
+++ b/src/hwf-arm.c
@@ -98,17 +98,32 @@ detect_arm_at_hwcap(void)
#define HAS_PROC_CPUINFO 1
static unsigned int
-detect_arm_proc_cpuinfo(void)
+detect_arm_proc_cpuinfo(unsigned int *broken_hwfs)
{
char buf[1024]; /* large enough */
char *str_features, *str_neon;
+ int cpu_implementer, cpu_arch, cpu_variant, cpu_part, cpu_revision;
FILE *f;
int readlen, i;
static int cpuinfo_initialized = 0;
static unsigned int stored_cpuinfo_features;
+ static unsigned int stored_broken_hwfs;
+ struct {
+ const char *name;
+ int *value;
+ } cpu_entries[5] = {
+ { "CPU implementer", &cpu_implementer },
+ { "CPU architecture", &cpu_arch },
+ { "CPU variant", &cpu_variant },
+ { "CPU part", &cpu_part },
+ { "CPU revision", &cpu_revision },
+ };
if (cpuinfo_initialized)
- return stored_cpuinfo_features;
+ {
+ *broken_hwfs |= stored_broken_hwfs;
+ return stored_cpuinfo_features;
+ }
f = fopen("/proc/cpuinfo", "r");
if (!f)
@@ -124,12 +139,32 @@ detect_arm_proc_cpuinfo(void)
cpuinfo_initialized = 1;
stored_cpuinfo_features = 0;
+ stored_broken_hwfs = 0;
/* Find features line. */
str_features = strstr(buf, "Features");
if (!str_features)
return stored_cpuinfo_features;
+ /* Find CPU version information. */
+ for (i = 0; i < DIM(cpu_entries); i++)
+ {
+ char *str;
+
+ *cpu_entries[i].value = -1;
+
+ str = strstr(buf, cpu_entries[i].name);
+ if (!str)
+ continue;
+
+ str = strstr(str, ": ");
+ if (!str)
+ continue;
+
+ str += 2;
+ *cpu_entries[i].value = strtoul(str, NULL, 0);
+ }
+
/* Lines to strings. */
for (i = 0; i < sizeof(buf); i++)
if (buf[i] == '\n')
@@ -140,6 +175,19 @@ detect_arm_proc_cpuinfo(void)
if (str_neon && (str_neon[5] == ' ' || str_neon[5] == '\0'))
stored_cpuinfo_features |= HWF_ARM_NEON;
+ /* Check for CPUs with broken NEON implementation. See
+ * https://code.google.com/p/chromium/issues/detail?id=341598
+ */
+ if (cpu_implementer == 0x51
+ && cpu_arch == 7
+ && cpu_variant == 1
+ && cpu_part == 0x4d
+ && cpu_revision == 0)
+ {
+ stored_broken_hwfs = HWF_ARM_NEON;
+ }
+
+ *broken_hwfs |= stored_broken_hwfs;
return stored_cpuinfo_features;
}
@@ -149,18 +197,21 @@ unsigned int
_gcry_hwf_detect_arm (void)
{
unsigned int ret = 0;
+ unsigned int broken_hwfs = 0;
#if defined (HAS_SYS_AT_HWCAP)
ret |= detect_arm_at_hwcap ();
#endif
#if defined (HAS_PROC_CPUINFO)
- ret |= detect_arm_proc_cpuinfo ();
+ ret |= detect_arm_proc_cpuinfo (&broken_hwfs);
#endif
#if defined(__ARM_NEON__) && defined(ENABLE_NEON_SUPPORT)
ret |= HWF_ARM_NEON;
#endif
+ ret &= ~broken_hwfs;
+
return ret;
}
commit 0b520128551054d83fb0bb2db8873394f38de498
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date: Sun Nov 2 16:01:11 2014 +0200
Add ARM/NEON implementation of Poly1305
* cipher/Makefile.am: Add 'poly1305-armv7-neon.S'.
* cipher/poly1305-armv7-neon.S: New.
* cipher/poly1305-internal.h (POLY1305_USE_NEON)
(POLY1305_NEON_BLOCKSIZE, POLY1305_NEON_STATESIZE)
(POLY1305_NEON_ALIGNMENT): New.
* cipher/poly1305.c [POLY1305_USE_NEON]
(_gcry_poly1305_armv7_neon_init_ext)
(_gcry_poly1305_armv7_neon_finish_ext)
(_gcry_poly1305_armv7_neon_blocks, poly1305_armv7_neon_ops): New.
(_gcry_poly1305_init) [POLY1305_USE_NEON]: Select NEON implementation
if HWF_ARM_NEON set.
* configure.ac [neonsupport=yes]: Add 'poly1305-armv7-neon.lo'.
--
Add Andrew Moon's public-domain NEON implementation of Poly1305. The original
source is available at: https://github.com/floodyberry/poly1305-opt
Benchmark on Cortex-A8 (--cpu-mhz 1008):
Old:
| nanosecs/byte mebibytes/sec cycles/byte
POLY1305 | 12.34 ns/B 77.27 MiB/s 12.44 c/B
New:
| nanosecs/byte mebibytes/sec cycles/byte
POLY1305 | 2.12 ns/B 450.7 MiB/s 2.13 c/B
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 09ccaf9..22018b3 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -73,7 +73,7 @@ gost28147.c gost.h \
gostr3411-94.c \
md4.c \
md5.c \
-poly1305-sse2-amd64.S poly1305-avx2-amd64.S \
+poly1305-sse2-amd64.S poly1305-avx2-amd64.S poly1305-armv7-neon.S \
rijndael.c rijndael-tables.h rijndael-amd64.S rijndael-arm.S \
rmd160.c \
rsa.c \
diff --git a/cipher/poly1305-armv7-neon.S b/cipher/poly1305-armv7-neon.S
new file mode 100644
index 0000000..1134e85
--- /dev/null
+++ b/cipher/poly1305-armv7-neon.S
@@ -0,0 +1,705 @@
+/* poly1305-armv7-neon.S - ARMv7/NEON implementation of Poly1305
+ *
+ * Copyright (C) 2014 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * Based on public domain implementation by Andrew Moon at
+ * https://github.com/floodyberry/poly1305-opt
+ */
+
+#include <config.h>
+
+#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) && \
+ defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \
+ defined(HAVE_GCC_INLINE_ASM_NEON)
+
+.syntax unified
+.fpu neon
+.arm
+
+.text
+
+.p2align 2
+.Lpoly1305_init_constants_neon:
+.long 0x3ffff03
+.long 0x3ffc0ff
+.long 0x3f03fff
+.long 0x00fffff
+
+.globl _gcry_poly1305_armv7_neon_init_ext
+.type _gcry_poly1305_armv7_neon_init_ext,%function;
+_gcry_poly1305_armv7_neon_init_ext:
+.Lpoly1305_init_ext_neon_local:
+ stmfd sp!, {r4-r11, lr}
+ sub sp, sp, #32
+ mov r14, r2
+ and r2, r2, r2
+ moveq r14, #-1
+ ldmia r1!, {r2-r5}
+ ldr r7, =.Lpoly1305_init_constants_neon
+ mov r6, r2
+ mov r8, r2, lsr #26
+ mov r9, r3, lsr #20
+ mov r10, r4, lsr #14
+ mov r11, r5, lsr #8
+ orr r8, r8, r3, lsl #6
+ orr r9, r9, r4, lsl #12
+ orr r10, r10, r5, lsl #18
+ ldmia r7, {r2-r5}
+ and r2, r2, r8
+ and r3, r3, r9
+ and r4, r4, r10
+ and r5, r5, r11
+ and r6, r6, 0x3ffffff
+ stmia r0!, {r2-r6}
+ eor r8, r8, r8
+ str r8, [sp, #24]
+.Lpoly1305_init_ext_neon_squareloop:
+ ldr r8, [sp, #24]
+ mov r12, #16
+ cmp r8, #2
+ beq .Lpoly1305_init_ext_neon_donesquaring
+ cmp r8, #1
+ moveq r12, #64
+ cmp r14, r12
+ bls .Lpoly1305_init_ext_neon_donesquaring
+ add r8, #1
+ str r8, [sp, #24]
+ mov r6, r6, lsl #1
+ mov r2, r2, lsl #1
+ umull r7, r8, r3, r3
+ umull r9, r10, r6, r4
+ umlal r7, r8, r6, r5
+ umlal r9, r10, r2, r3
+ add r11, r5, r5, lsl #2
+ umlal r7, r8, r2, r4
+ umlal r9, r10, r5, r11
+ str r7, [sp, #16]
+ str r8, [sp, #20]
+ mov r2, r2, lsr #1
+ mov r5, r5, lsl #1
+ str r9, [sp, #8]
+ str r10, [sp, #12]
+ umull r7, r8, r2, r2
+ umull r9, r10, r6, r2
+ add r11, r3, r3, lsl #2
+ add r12, r4, r4, lsl #2
+ umlal r7, r8, r6, r3
+ umlal r9, r10, r5, r11
+ umlal r7, r8, r5, r12
+ umlal r9, r10, r4, r12
+ mov r6, r6, lsr #1
+ mov r3, r3, lsl #1
+ add r11, r2, r2, lsl #2
+ str r7, [sp, #0]
+ str r8, [sp, #4]
+ umull r7, r8, r6, r6
+ umlal r7, r8, r3, r12
+ umlal r7, r8, r5, r11
+ and r6, r7, 0x3ffffff
+ mov r11, r7, lsr #26
+ orr r11, r11, r8, lsl #6
+ ldr r7, [sp, #0]
+ ldr r8, [sp, #4]
+ adds r9, r9, r11
+ adc r10, r10, #0
+ and r2, r9, 0x3ffffff
+ mov r11, r9, lsr #26
+ orr r11, r11, r10, lsl #6
+ ldr r9, [sp, #8]
+ ldr r10, [sp, #12]
+ adds r7, r7, r11
+ adc r8, r8, #0
+ and r3, r7, 0x3ffffff
+ mov r11, r7, lsr #26
+ orr r11, r11, r8, lsl #6
+ ldr r7, [sp, #16]
+ ldr r8, [sp, #20]
+ adds r9, r9, r11
+ adc r10, r10, #0
+ and r4, r9, 0x3ffffff
+ mov r11, r9, lsr #26
+ orr r11, r11, r10, lsl #6
+ adds r7, r7, r11
+ adc r8, r8, #0
+ and r5, r7, 0x3ffffff
+ mov r11, r7, lsr #26
+ orr r11, r11, r8, lsl #6
+ add r11, r11, r11, lsl #2
+ add r6, r6, r11
+ mov r11, r6, lsr #26
+ and r6, r6, 0x3ffffff
+ add r2, r2, r11
+ stmia r0!, {r2-r6}
+ b .Lpoly1305_init_ext_neon_squareloop
+.Lpoly1305_init_ext_neon_donesquaring:
+ mov r2, #2
+ ldr r14, [sp, #24]
+ sub r14, r2, r14
+ mov r3, r14, lsl #4
+ add r3, r3, r14, lsl #2
+ add r0, r0, r3
+ eor r2, r2, r2
+ eor r3, r3, r3
+ eor r4, r4, r4
+ eor r5, r5, r5
+ eor r6, r6, r6
+ stmia r0!, {r2-r6}
+ stmia r0!, {r2-r6}
+ ldmia r1!, {r2-r5}
+ stmia r0, {r2-r6}
+ add sp, sp, #32
+ ldmfd sp!, {r4-r11, lr}
+ mov r0, #(9*4+32)
+ bx lr
+.ltorg
+.size _gcry_poly1305_armv7_neon_init_ext,.-_gcry_poly1305_armv7_neon_init_ext;
+
+.globl _gcry_poly1305_armv7_neon_blocks
+.type _gcry_poly1305_armv7_neon_blocks,%function;
+_gcry_poly1305_armv7_neon_blocks:
+.Lpoly1305_blocks_neon_local:
+ vmov.i32 q0, #0xffffffff
+ vmov.i32 d4, #1
+ vsubw.u32 q0, q0, d4
+ vstmdb sp!, {q4,q5,q6,q7}
+ stmfd sp!, {r4-r11, lr}
+ mov r8, sp
+ and sp, sp, #~63
+ sub sp, sp, #192
+ str r0, [sp, #108]
+ str r1, [sp, #112]
+ str r2, [sp, #116]
+ str r8, [sp, #120]
+ mov r3, r0
+ mov r0, r1
+ mov r1, r2
+ mov r2, r3
+ ldr r8, [r2, #116]
+ veor d15, d15, d15
+ vorr.i32 d15, #(1 << 24)
+ tst r8, #2
+ beq .Lpoly1305_blocks_neon_skip_shift8
+ vshr.u64 d15, #32
+.Lpoly1305_blocks_neon_skip_shift8:
+ tst r8, #4
+ beq .Lpoly1305_blocks_neon_skip_shift16
+ veor d15, d15, d15
+.Lpoly1305_blocks_neon_skip_shift16:
+ vst1.64 d15, [sp, :64]
+ tst r8, #1
+ bne .Lpoly1305_blocks_neon_started
+ vld1.64 {q0-q1}, [r0]!
+ vswp d1, d2
+ vmovn.i64 d21, q0
+ vshrn.i64 d22, q0, #26
+ vshrn.u64 d24, q1, #14
+ vext.8 d0, d0, d2, #4
+ vext.8 d1, d1, d3, #4
+ vshr.u64 q1, q1, #32
+ vshrn.i64 d23, q0, #20
+ vshrn.u64 d25, q1, #8
+ vand.i32 d21, #0x03ffffff
+ vand.i32 q11, #0x03ffffff
+ vand.i32 q12, #0x03ffffff
+ orr r8, r8, #1
+ sub r1, r1, #32
+ str r8, [r2, #116]
+ vorr d25, d25, d15
+ b .Lpoly1305_blocks_neon_setupr20
+.Lpoly1305_blocks_neon_started:
+ add r9, r2, #60
+ vldm r9, {d21-d25}
+.Lpoly1305_blocks_neon_setupr20:
+ vmov.i32 d0, #5
+ tst r8, #(8|16)
+ beq .Lpoly1305_blocks_neon_setupr20_simple
+ tst r8, #(8)
+ beq .Lpoly1305_blocks_neon_setupr20_r_1
+ mov r9, r2
+ add r10, r2, #20
+ vld1.64 {q9}, [r9]!
+ vld1.64 {q8}, [r10]!
+ vld1.64 {d2}, [r9]
+ vld1.64 {d20}, [r10]
+ b .Lpoly1305_blocks_neon_setupr20_hard
+.Lpoly1305_blocks_neon_setupr20_r_1:
+ mov r9, r2
+ vmov.i32 d2, #1
+ vld1.64 {q8}, [r9]!
+ veor q9, q9, q9
+ vshr.u64 d2, d2, #32
+ vld1.64 {d20}, [r9]
+.Lpoly1305_blocks_neon_setupr20_hard:
+ vzip.i32 q8, q9
+ vzip.i32 d20, d2
+ b .Lpoly1305_blocks_neon_setups20
+.Lpoly1305_blocks_neon_setupr20_simple:
+ add r9, r2, #20
+ vld1.64 {d2-d4}, [r9]
+ vdup.32 d16, d2[0]
+ vdup.32 d17, d2[1]
+ vdup.32 d18, d3[0]
+ vdup.32 d19, d3[1]
+ vdup.32 d20, d4[0]
+.Lpoly1305_blocks_neon_setups20:
+ vmul.i32 q13, q8, d0[0]
+ vmov.i64 q15, 0x00000000ffffffff
+ vmul.i32 q14, q9, d0[0]
+ vshr.u64 q15, q15, #6
+ cmp r1, #64
+ blo .Lpoly1305_blocks_neon_try32
+ add r9, sp, #16
+ add r10, r2, #40
+ add r11, sp, #64
+ str r1, [sp, #116]
+ vld1.64 {d10-d12}, [r10]
+ vmov d14, d12
+ vmul.i32 q6, q5, d0[0]
+.Lpoly1305_blocks_neon_mainloop:
+ ldmia r0!, {r2-r5}
+ vmull.u32 q0, d25, d12[0]
+ mov r7, r2, lsr #26
+ vmlal.u32 q0, d24, d12[1]
+ mov r8, r3, lsr #20
+ ldr r6, [sp, #0]
+ vmlal.u32 q0, d23, d13[0]
+ mov r9, r4, lsr #14
+ vmlal.u32 q0, d22, d13[1]
+ orr r6, r6, r5, lsr #8
+ vmlal.u32 q0, d21, d14[0]
+ orr r3, r7, r3, lsl #6
+ vmull.u32 q1, d25, d12[1]
+ orr r4, r8, r4, lsl #12
+ orr r5, r9, r5, lsl #18
+ vmlal.u32 q1, d24, d13[0]
+ ldmia r0!, {r7-r10}
+ vmlal.u32 q1, d23, d13[1]
+ mov r1, r7, lsr #26
+ vmlal.u32 q1, d22, d14[0]
+ ldr r11, [sp, #4]
+ mov r12, r8, lsr #20
+ vmlal.u32 q1, d21, d10[0]
+ mov r14, r9, lsr #14
+ vmull.u32 q2, d25, d13[0]
+ orr r11, r11, r10, lsr #8
+ orr r8, r1, r8, lsl #6
+ vmlal.u32 q2, d24, d13[1]
+ orr r9, r12, r9, lsl #12
+ vmlal.u32 q2, d23, d14[0]
+ orr r10, r14, r10, lsl #18
+ vmlal.u32 q2, d22, d10[0]
+ mov r12, r3
+ and r2, r2, #0x3ffffff
+ vmlal.u32 q2, d21, d10[1]
+ mov r14, r5
+ vmull.u32 q3, d25, d13[1]
+ and r3, r7, #0x3ffffff
+ vmlal.u32 q3, d24, d14[0]
+ and r5, r8, #0x3ffffff
+ vmlal.u32 q3, d23, d10[0]
+ and r7, r9, #0x3ffffff
+ vmlal.u32 q3, d22, d10[1]
+ and r8, r14, #0x3ffffff
+ vmlal.u32 q3, d21, d11[0]
+ and r9, r10, #0x3ffffff
+ add r14, sp, #128
+ vmull.u32 q4, d25, d14[0]
+ mov r10, r6
+ vmlal.u32 q4, d24, d10[0]
+ and r6, r4, #0x3ffffff
+ vmlal.u32 q4, d23, d10[1]
+ and r4, r12, #0x3ffffff
+ vmlal.u32 q4, d22, d11[0]
+ stm r14, {r2-r11}
+ vmlal.u32 q4, d21, d11[1]
+ vld1.64 {d21-d24}, [r14, :256]!
+ vld1.64 {d25}, [r14, :64]
+ ldmia r0!, {r2-r5}
+ vmlal.u32 q0, d25, d26
+ mov r7, r2, lsr #26
+ vmlal.u32 q0, d24, d27
+ ldr r6, [sp, #0]
+ mov r8, r3, lsr #20
+ vmlal.u32 q0, d23, d28
+ mov r9, r4, lsr #14
+ vmlal.u32 q0, d22, d29
+ orr r6, r6, r5, lsr #8
+ vmlal.u32 q0, d21, d20
+ orr r3, r7, r3, lsl #6
+ vmlal.u32 q1, d25, d27
+ orr r4, r8, r4, lsl #12
+ orr r5, r9, r5, lsl #18
+ vmlal.u32 q1, d24, d28
+ ldmia r0!, {r7-r10}
+ vmlal.u32 q1, d23, d29
+ mov r1, r7, lsr #26
+ vmlal.u32 q1, d22, d20
+ ldr r11, [sp, #4]
+ mov r12, r8, lsr #20
+ vmlal.u32 q1, d21, d16
+ mov r14, r9, lsr #14
+ vmlal.u32 q2, d25, d28
+ orr r11, r11, r10, lsr #8
+ orr r8, r1, r8, lsl #6
+ orr r9, r12, r9, lsl #12
+ vmlal.u32 q2, d24, d29
+ orr r10, r14, r10, lsl #18
+ and r2, r2, #0x3ffffff
+ mov r12, r3
+ vmlal.u32 q2, d23, d20
+ mov r14, r5
+ vmlal.u32 q2, d22, d16
+ and r3, r7, #0x3ffffff
+ vmlal.u32 q2, d21, d17
+ and r5, r8, #0x3ffffff
+ vmlal.u32 q3, d25, d29
+ and r7, r9, #0x3ffffff
+ vmlal.u32 q3, d24, d20
+ and r8, r14, #0x3ffffff
+ vmlal.u32 q3, d23, d16
+ and r9, r10, #0x3ffffff
+ vmlal.u32 q3, d22, d17
+ add r14, sp, #128
+ vmlal.u32 q3, d21, d18
+ mov r10, r6
+ vmlal.u32 q4, d25, d20
+ vmlal.u32 q4, d24, d16
+ and r6, r4, #0x3ffffff
+ vmlal.u32 q4, d23, d17
+ and r4, r12, #0x3ffffff
+ vmlal.u32 q4, d22, d18
+ stm r14, {r2-r11}
+ vmlal.u32 q4, d21, d19
+ vld1.64 {d21-d24}, [r14, :256]!
+ vld1.64 {d25}, [r14, :64]
+ vaddw.u32 q0, q0, d21
+ vaddw.u32 q1, q1, d22
+ vaddw.u32 q2, q2, d23
+ vaddw.u32 q3, q3, d24
+ vaddw.u32 q4, q4, d25
+ vshr.u64 q11, q0, #26
+ vand q0, q0, q15
+ vadd.i64 q1, q1, q11
+ vshr.u64 q12, q3, #26
+ vand q3, q3, q15
+ vadd.i64 q4, q4, q12
+ vshr.u64 q11, q1, #26
+ vand q1, q1, q15
+ vadd.i64 q2, q2, q11
+ vshr.u64 q12, q4, #26
+ vand q4, q4, q15
+ vadd.i64 q0, q0, q12
+ vshl.i64 q12, q12, #2
+ ldr r1, [sp, #116]
+ vadd.i64 q0, q0, q12
+ vshr.u64 q11, q2, #26
+ vand q2, q2, q15
+ vadd.i64 q3, q3, q11
+ sub r1, #64
+ vshr.u64 q12, q0, #26
+ vand q0, q0, q15
+ vadd.i64 q1, q1, q12
+ cmp r1, #64
+ vshr.u64 q11, q3, #26
+ vand q3, q3, q15
+ vadd.i64 q4, q4, q11
+ vmovn.i64 d21, q0
+ str r1, [sp, #116]
+ vmovn.i64 d22, q1
+ vmovn.i64 d23, q2
+ vmovn.i64 d24, q3
+ vmovn.i64 d25, q4
+ bhs .Lpoly1305_blocks_neon_mainloop
+.Lpoly1305_blocks_neon_try32:
+ cmp r1, #32
+ blo .Lpoly1305_blocks_neon_done
+ tst r0, r0
+ bne .Lpoly1305_blocks_loadm32
+ veor q0, q0, q0
+ veor q1, q1, q1
+ veor q2, q2, q2
+ veor q3, q3, q3
+ veor q4, q4, q4
+ b .Lpoly1305_blocks_continue32
+.Lpoly1305_blocks_loadm32:
+ vld1.64 {q0-q1}, [r0]!
+ veor q4, q4, q4
+ vswp d1, d2
+ veor q3, q3, q3
+ vtrn.32 q0, q4
+ vtrn.32 q1, q3
+ vshl.i64 q2, q1, #12
+ vshl.i64 q3, q3, #18
+ vshl.i64 q1, q4, #6
+ vmovl.u32 q4, d15
+.Lpoly1305_blocks_continue32:
+ vmlal.u32 q0, d25, d26
+ vmlal.u32 q0, d24, d27
+ vmlal.u32 q0, d23, d28
+ vmlal.u32 q0, d22, d29
+ vmlal.u32 q0, d21, d20
+ vmlal.u32 q1, d25, d27
+ vmlal.u32 q1, d24, d28
+ vmlal.u32 q1, d23, d29
+ vmlal.u32 q1, d22, d20
+ vmlal.u32 q1, d21, d16
+ vmlal.u32 q2, d25, d28
+ vmlal.u32 q2, d24, d29
+ vmlal.u32 q2, d23, d20
+ vmlal.u32 q2, d22, d16
+ vmlal.u32 q2, d21, d17
+ vmlal.u32 q3, d25, d29
+ vmlal.u32 q3, d24, d20
+ vmlal.u32 q3, d23, d16
+ vmlal.u32 q3, d22, d17
+ vmlal.u32 q3, d21, d18
+ vmlal.u32 q4, d25, d20
+ vmlal.u32 q4, d24, d16
+ vmlal.u32 q4, d23, d17
+ vmlal.u32 q4, d22, d18
+ vmlal.u32 q4, d21, d19
+ vshr.u64 q11, q0, #26
+ vand q0, q0, q15
+ vadd.i64 q1, q1, q11
+ vshr.u64 q12, q3, #26
+ vand q3, q3, q15
+ vadd.i64 q4, q4, q12
+ vshr.u64 q11, q1, #26
+ vand q1, q1, q15
+ vadd.i64 q2, q2, q11
+ vshr.u64 q12, q4, #26
+ vand q4, q4, q15
+ vadd.i64 q0, q0, q12
+ vshl.i64 q12, q12, #2
+ vadd.i64 q0, q0, q12
+ vshr.u64 q11, q2, #26
+ vand q2, q2, q15
+ vadd.i64 q3, q3, q11
+ vshr.u64 q12, q0, #26
+ vand q0, q0, q15
+ vadd.i64 q1, q1, q12
+ vshr.u64 q11, q3, #26
+ vand q3, q3, q15
+ vadd.i64 q4, q4, q11
+ vmovn.i64 d21, q0
+ vmovn.i64 d22, q1
+ vmovn.i64 d23, q2
+ vmovn.i64 d24, q3
+ vmovn.i64 d25, q4
+.Lpoly1305_blocks_neon_done:
+ tst r0, r0
+ beq .Lpoly1305_blocks_neon_final
+ ldr r2, [sp, #108]
+ add r2, r2, #60
+ vst1.64 {d21}, [r2]!
+ vst1.64 {d22-d25}, [r2]
+ b .Lpoly1305_blocks_neon_leave
+.Lpoly1305_blocks_neon_final:
+ vadd.u32 d10, d0, d1
+ vadd.u32 d13, d2, d3
+ vadd.u32 d11, d4, d5
+ ldr r5, [sp, #108]
+ vadd.u32 d14, d6, d7
+ vadd.u32 d12, d8, d9
+ vtrn.32 d10, d13
+ vtrn.32 d11, d14
+ vst1.64 {d10-d12}, [sp]
+ ldm sp, {r0-r4}
+ mov r12, r0, lsr #26
+ and r0, r0, #0x3ffffff
+ add r1, r1, r12
+ mov r12, r1, lsr #26
+ and r1, r1, #0x3ffffff
+ add r2, r2, r12
+ mov r12, r2, lsr #26
+ and r2, r2, #0x3ffffff
+ add r3, r3, r12
+ mov r12, r3, lsr #26
+ and r3, r3, #0x3ffffff
+ add r4, r4, r12
+ mov r12, r4, lsr #26
+ and r4, r4, #0x3ffffff
+ add r12, r12, r12, lsl #2
+ add r0, r0, r12
+ mov r12, r0, lsr #26
+ and r0, r0, #0x3ffffff
+ add r1, r1, r12
+ mov r12, r1, lsr #26
+ and r1, r1, #0x3ffffff
+ add r2, r2, r12
+ mov r12, r2, lsr #26
+ and r2, r2, #0x3ffffff
+ add r3, r3, r12
+ mov r12, r3, lsr #26
+ and r3, r3, #0x3ffffff
+ add r4, r4, r12
+ mov r12, r4, lsr #26
+ and r4, r4, #0x3ffffff
+ add r12, r12, r12, lsl #2
+ add r0, r0, r12
+ mov r12, r0, lsr #26
+ and r0, r0, #0x3ffffff
+ add r1, r1, r12
+ add r6, r0, #5
+ mov r12, r6, lsr #26
+ and r6, r6, #0x3ffffff
+ add r7, r1, r12
+ mov r12, r7, lsr #26
+ and r7, r7, #0x3ffffff
+ add r10, r2, r12
+ mov r12, r10, lsr #26
+ and r10, r10, #0x3ffffff
+ add r11, r3, r12
+ mov r12, #-(1 << 26)
+ add r12, r12, r11, lsr #26
+ and r11, r11, #0x3ffffff
+ add r14, r4, r12
+ mov r12, r14, lsr #31
+ sub r12, #1
+ and r6, r6, r12
+ and r7, r7, r12
+ and r10, r10, r12
+ and r11, r11, r12
+ and r14, r14, r12
+ mvn r12, r12
+ and r0, r0, r12
+ and r1, r1, r12
+ and r2, r2, r12
+ and r3, r3, r12
+ and r4, r4, r12
+ orr r0, r0, r6
+ orr r1, r1, r7
+ orr r2, r2, r10
+ orr r3, r3, r11
+ orr r4, r4, r14
+ orr r0, r0, r1, lsl #26
+ lsr r1, r1, #6
+ orr r1, r1, r2, lsl #20
+ lsr r2, r2, #12
+ orr r2, r2, r3, lsl #14
+ lsr r3, r3, #18
+ orr r3, r3, r4, lsl #8
+ add r5, r5, #60
+ stm r5, {r0-r3}
+.Lpoly1305_blocks_neon_leave:
+ sub r0, sp, #8
+ ldr sp, [sp, #120]
+ ldmfd sp!, {r4-r11, lr}
+ vldm sp!, {q4-q7}
+ sub r0, sp, r0
+ bx lr
+.size _gcry_poly1305_armv7_neon_blocks,.-_gcry_poly1305_armv7_neon_blocks;
+
+.globl _gcry_poly1305_armv7_neon_finish_ext
+.type _gcry_poly1305_armv7_neon_finish_ext,%function;
+_gcry_poly1305_armv7_neon_finish_ext:
+.Lpoly1305_finish_ext_neon_local:
+ stmfd sp!, {r4-r11, lr}
+ sub sp, sp, #32
+ mov r5, r0
+ mov r6, r1
+ mov r7, r2
+ mov r8, r3
+ ands r7, r7, r7
+ beq .Lpoly1305_finish_ext_neon_noremaining
+ mov r9, sp
+ veor q0, q0, q0
+ veor q1, q1, q1
+ vst1.64 {q0-q1}, [sp]
+ tst r7, #16
+ beq .Lpoly1305_finish_ext_neon_skip16
+ vld1.u64 {q0}, [r1]!
+ vst1.64 {q0}, [r9]!
+.Lpoly1305_finish_ext_neon_skip16:
+ tst r7, #8
+ beq .Lpoly1305_finish_ext_neon_skip8
+ ldmia r1!, {r10-r11}
+ stmia r9!, {r10-r11}
+.Lpoly1305_finish_ext_neon_skip8:
+ tst r7, #4
+ beq .Lpoly1305_finish_ext_neon_skip4
+ ldr r10, [r1], #4
+ str r10, [r9], #4
+.Lpoly1305_finish_ext_neon_skip4:
+ tst r7, #2
+ beq .Lpoly1305_finish_ext_neon_skip2
+ ldrh r10, [r1], #2
+ strh r10, [r9], #2
+.Lpoly1305_finish_ext_neon_skip2:
+ tst r7, #1
+ beq .Lpoly1305_finish_ext_neon_skip1
+ ldrb r10, [r1], #1
+ strb r10, [r9], #1
+.Lpoly1305_finish_ext_neon_skip1:
+ cmp r7, #16
+ beq .Lpoly1305_finish_ext_neon_skipfinalbit
+ mov r10, #1
+ strb r10, [r9]
+.Lpoly1305_finish_ext_neon_skipfinalbit:
+ ldr r10, [r5, #116]
+ orrhs r10, #2
+ orrlo r10, #4
+ str r10, [r5, #116]
+ mov r0, r5
+ mov r1, sp
+ mov r2, #32
+ bl .Lpoly1305_blocks_neon_local
+.Lpoly1305_finish_ext_neon_noremaining:
+ ldr r10, [r5, #116]
+ tst r10, #1
+ beq .Lpoly1305_finish_ext_neon_notstarted
+ cmp r7, #0
+ beq .Lpoly1305_finish_ext_neon_user2r
+ cmp r7, #16
+ bls .Lpoly1305_finish_ext_neon_user1
+.Lpoly1305_finish_ext_neon_user2r:
+ orr r10, r10, #8
+ b .Lpoly1305_finish_ext_neon_finalblock
+.Lpoly1305_finish_ext_neon_user1:
+ orr r10, r10, #16
+.Lpoly1305_finish_ext_neon_finalblock:
+ str r10, [r5, #116]
+ mov r0, r5
+ eor r1, r1, r1
+ mov r2, #32
+ bl .Lpoly1305_blocks_neon_local
+.Lpoly1305_finish_ext_neon_notstarted:
+ add r0, r5, #60
+ add r9, r5, #100
+ ldm r0, {r0-r3}
+ ldm r9, {r9-r12}
+ adds r0, r0, r9
+ adcs r1, r1, r10
+ adcs r2, r2, r11
+ adcs r3, r3, r12
+ stm r8, {r0-r3}
+ veor q0, q0, q0
+ veor q1, q1, q1
+ veor q2, q2, q2
+ veor q3, q3, q3
+ vstmia r5!, {q0-q3}
+ vstm r5, {q0-q3}
+ add sp, sp, #32
+ ldmfd sp!, {r4-r11, lr}
+ mov r0, #(9*4+32)
+ bx lr
+.size _gcry_poly1305_armv7_neon_finish_ext,.-_gcry_poly1305_armv7_neon_finish_ext;
+
+#endif
diff --git a/cipher/poly1305-internal.h b/cipher/poly1305-internal.h
index 0299c43..dfc0c04 100644
--- a/cipher/poly1305-internal.h
+++ b/cipher/poly1305-internal.h
@@ -65,10 +65,24 @@
#endif
+/* POLY1305_USE_NEON indicates whether to enable ARM NEON assembly code. */
+#undef POLY1305_USE_NEON
+#if defined(ENABLE_NEON_SUPPORT) && defined(HAVE_ARM_ARCH_V6) && \
+ defined(__ARMEL__) && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \
+ defined(HAVE_GCC_INLINE_ASM_NEON)
+# define POLY1305_USE_NEON 1
+# define POLY1305_NEON_BLOCKSIZE 32
+# define POLY1305_NEON_STATESIZE 128
+# define POLY1305_NEON_ALIGNMENT 16
+#endif
+
+
/* Largest block-size used in any implementation (optimized implementations
* might use block-size multiple of 16). */
#ifdef POLY1305_USE_AVX2
# define POLY1305_LARGEST_BLOCKSIZE POLY1305_AVX2_BLOCKSIZE
+#elif defined(POLY1305_USE_NEON)
+# define POLY1305_LARGEST_BLOCKSIZE POLY1305_NEON_BLOCKSIZE
#elif defined(POLY1305_USE_SSE2)
# define POLY1305_LARGEST_BLOCKSIZE POLY1305_SSE2_BLOCKSIZE
#else
@@ -78,6 +92,8 @@
/* Largest state-size used in any implementation. */
#ifdef POLY1305_USE_AVX2
# define POLY1305_LARGEST_STATESIZE POLY1305_AVX2_STATESIZE
+#elif defined(POLY1305_USE_NEON)
+# define POLY1305_LARGEST_STATESIZE POLY1305_NEON_STATESIZE
#elif defined(POLY1305_USE_SSE2)
# define POLY1305_LARGEST_STATESIZE POLY1305_SSE2_STATESIZE
#else
@@ -87,6 +103,8 @@
/* Minimum alignment for state pointer passed to implementations. */
#ifdef POLY1305_USE_AVX2
# define POLY1305_STATE_ALIGNMENT POLY1305_AVX2_ALIGNMENT
+#elif defined(POLY1305_USE_NEON)
+# define POLY1305_STATE_ALIGNMENT POLY1305_NEON_ALIGNMENT
#elif defined(POLY1305_USE_SSE2)
# define POLY1305_STATE_ALIGNMENT POLY1305_SSE2_ALIGNMENT
#else
diff --git a/cipher/poly1305.c b/cipher/poly1305.c
index fe241c1..28dbbf8 100644
--- a/cipher/poly1305.c
+++ b/cipher/poly1305.c
@@ -76,6 +76,25 @@ static const poly1305_ops_t poly1305_amd64_avx2_ops = {
#endif
+#ifdef POLY1305_USE_NEON
+
+void _gcry_poly1305_armv7_neon_init_ext(void *state, const poly1305_key_t *key);
+unsigned int _gcry_poly1305_armv7_neon_finish_ext(void *state, const byte *m,
+ size_t remaining,
+ byte mac[16]);
+unsigned int _gcry_poly1305_armv7_neon_blocks(void *ctx, const byte *m,
+ size_t bytes);
+
+static const poly1305_ops_t poly1305_armv7_neon_ops = {
+ POLY1305_NEON_BLOCKSIZE,
+ _gcry_poly1305_armv7_neon_init_ext,
+ _gcry_poly1305_armv7_neon_blocks,
+ _gcry_poly1305_armv7_neon_finish_ext
+};
+
+#endif
+
+
#ifdef HAVE_U64_TYPEDEF
/* Reference unoptimized poly1305 implementation using 32 bit * 32 bit = 64 bit
@@ -661,6 +680,10 @@ _gcry_poly1305_init (poly1305_context_t * ctx, const byte * key,
if (features & HWF_INTEL_AVX2)
ctx->ops = &poly1305_amd64_avx2_ops;
#endif
+#ifdef POLY1305_USE_NEON
+ if (features & HWF_ARM_NEON)
+ ctx->ops = &poly1305_armv7_neon_ops;
+#endif
(void)features;
buf_cpy (keytmp.b, key, POLY1305_KEYLEN);
diff --git a/configure.ac b/configure.ac
index 60ed015..a0d5fc9 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1837,6 +1837,11 @@ case "${host}" in
;;
esac
+if test x"$neonsupport" = xyes ; then
+ # Build with the NEON implementation
+ GCRYPT_CIPHERS="$GCRYPT_CIPHERS poly1305-armv7-neon.lo"
+fi
+
LIST_MEMBER(dsa, $enabled_pubkey_ciphers)
if test "$found" = "1" ; then
GCRYPT_PUBKEY_CIPHERS="$GCRYPT_PUBKEY_CIPHERS dsa.lo"
commit c584f44543883346d5a565581ff99a0afce9c5e1
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date: Wed Aug 6 20:05:16 2014 +0300
chacha20: add ARMv7/NEON implementation
* cipher/Makefile.am: Add 'chacha20-armv7-neon.S'.
* cipher/chacha20-armv7-neon.S: New.
* cipher/chacha20.c (USE_NEON): New.
[USE_NEON] (_gcry_chacha20_armv7_neon_blocks): New.
(chacha20_do_setkey) [USE_NEON]: Use Neon implementation if
HWF_ARM_NEON flag set.
(selftest): Self-test encrypting buffer byte by byte.
* configure.ac [neonsupport=yes]: Add 'chacha20-armv7-neon.lo'.
--
Add Andrew Moon's public-domain ARMv7/NEON implementation of ChaCha20. The original
source is available at: https://github.com/floodyberry/chacha-opt
Benchmark on Cortex-A8 (--cpu-mhz 1008):
Old:
CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte
STREAM enc | 13.45 ns/B 70.92 MiB/s 13.56 c/B
STREAM dec | 13.45 ns/B 70.90 MiB/s 13.56 c/B
New:
CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte
STREAM enc | 6.20 ns/B 153.9 MiB/s 6.25 c/B
STREAM dec | 6.20 ns/B 153.9 MiB/s 6.25 c/B
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 7f45cbb..09ccaf9 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -61,6 +61,7 @@ arcfour.c arcfour-amd64.S \
blowfish.c blowfish-amd64.S blowfish-arm.S \
cast5.c cast5-amd64.S cast5-arm.S \
chacha20.c chacha20-sse2-amd64.S chacha20-ssse3-amd64.S chacha20-avx2-amd64.S \
+ chacha20-armv7-neon.S \
crc.c \
des.c des-amd64.S \
dsa.c \
diff --git a/cipher/chacha20-armv7-neon.S b/cipher/chacha20-armv7-neon.S
new file mode 100644
index 0000000..1a395ba
--- /dev/null
+++ b/cipher/chacha20-armv7-neon.S
@@ -0,0 +1,710 @@
+/* chacha20-armv7-neon.S - ARM/NEON accelerated chacha20 blocks function
+ *
+ * Copyright (C) 2014 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * Based on public domain implementation by Andrew Moon at
+ * https://github.com/floodyberry/chacha-opt
+ */
+
+#include <config.h>
+
+#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) && \
+ defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \
+ defined(HAVE_GCC_INLINE_ASM_NEON) && defined(USE_CHACHA20)
+
+.syntax unified
+.fpu neon
+.arm
+
+.text
+
+.globl _gcry_chacha20_armv7_neon_blocks
+.type _gcry_chacha20_armv7_neon_blocks,%function;
+_gcry_chacha20_armv7_neon_blocks: /* in: r0 = state (16 words), r1 = input or NULL, r2 = output, r3 = byte count; out: r0 = stack bytes used + 8, or 0 when r3 == 0 */
+.Lchacha_blocks_neon_local:
+ tst r3, r3 /* nothing to do for a zero byte count */
+ beq .Lchacha_blocks_neon_nobytes
+ vstmdb sp!, {q4,q5,q6,q7} /* save q4-q7; restored at exit */
+ stmfd sp!, {r4-r12, r14} /* save core registers; restored at exit */
+ mov r8, sp /* remember sp before the frame is carved out */
+ sub sp, sp, #196 /* frame: [0..31] scratch, [40] out, [44] rounds, [48] in, [52] bytes left, [56] real out (tail), [60] state ptr, [64..127] state copy, [128..191] tail buffer, [192] old sp */
+ and sp, sp, #0xffffffe0 /* align the frame down to 32 bytes */
+ str r0, [sp, #60]
+ str r1, [sp, #48]
+ str r2, [sp, #40]
+ str r3, [sp, #52]
+ str r8, [sp, #192]
+ add r1, sp, #64
+ ldmia r0!, {r4-r11} /* copy the 16 state words to the frame at sp+64 */
+ stmia r1!, {r4-r11}
+ ldmia r0!, {r4-r11}
+ stmia r1!, {r4-r11}
+ mov r4, #20 /* round count; the round loops subtract 2 per pass */
+ str r4, [sp, #44]
+ cmp r3, #256
+ blo .Lchacha_blocks_neon_mainloop2 /* fewer than 256 bytes: one-block scalar loop only */
+.Lchacha_blocks_neon_mainloop1: /* 256 bytes per pass: one block in core registers, three in NEON */
+ ldr r0, [sp, #44]
+ str r0, [sp, #0] /* round counter lives at sp+0 during the rounds */
+ add r1, sp, #(64)
+ mov r2, #1
+ veor q12, q12 /* q12 = 0 */
+ vld1.32 {q0,q1}, [r1,:128]! /* q0-q3 = state copy rows 0-3 */
+ vld1.32 {q2,q3}, [r1,:128]
+ vmov.32 d24[0], r2 /* q12 = 1 in its low 64-bit lane */
+ vadd.u64 q3, q3, q12 /* NEON block A: 64-bit counter (words 12-13) + 1 */
+ vmov q4, q0
+ vmov q5, q1
+ vmov q6, q2
+ vadd.u64 q7, q3, q12 /* NEON block B: counter + 2 */
+ vmov q8, q0
+ vmov q9, q1
+ vmov q10, q2
+ vadd.u64 q11, q7, q12 /* NEON block C: counter + 3 */
+ add r0, sp, #64
+ ldm r0, {r0-r12} /* scalar block (counter + 0): words 0..12 into r0-r12 */
+ ldr r14, [sp, #(64 +60)] /* word 15 */
+ str r6, [sp, #8] /* spill word 6: r6 doubles as the round counter */
+ str r11, [sp, #12] /* spill word 11 */
+ str r14, [sp, #28] /* spill word 15 */
+ ldr r11, [sp, #(64 +52)] /* r11 = word 13 */
+ ldr r14, [sp, #(64 +56)] /* r14 = word 14 */
+.Lchacha_blocks_neon_rounds1: /* one double round per pass; scalar and NEON instructions are interleaved */
+ ldr r6, [sp, #0]
+ vadd.i32 q0, q0, q1
+ add r0, r0, r4
+ vadd.i32 q4, q4, q5
+ add r1, r1, r5
+ vadd.i32 q8, q8, q9
+ eor r12, r12, r0
+ veor q12, q3, q0
+ eor r11, r11, r1
+ veor q13, q7, q4
+ ror r12, r12, #16
+ veor q14, q11, q8
+ ror r11, r11, #16
+ vrev32.16 q3, q12 /* swapping 16-bit halves = rotate each word by 16 */
+ subs r6, r6, #2 /* sets the flags tested by the bne at the end of the loop; nothing in between alters them */
+ vrev32.16 q7, q13
+ add r8, r8, r12
+ vrev32.16 q11, q14
+ add r9, r9, r11
+ vadd.i32 q2, q2, q3
+ eor r4, r4, r8
+ vadd.i32 q6, q6, q7
+ eor r5, r5, r9
+ vadd.i32 q10, q10, q11
+ str r6, [sp, #0]
+ veor q12, q1, q2
+ ror r4, r4, #20
+ veor q13, q5, q6
+ ror r5, r5, #20
+ veor q14, q9, q10
+ add r0, r0, r4
+ vshl.i32 q1, q12, #12 /* vshl #12 + vsri #20 = rotate left by 12 */
+ add r1, r1, r5
+ vshl.i32 q5, q13, #12
+ ldr r6, [sp, #8]
+ vshl.i32 q9, q14, #12
+ eor r12, r12, r0
+ vsri.u32 q1, q12, #20
+ eor r11, r11, r1
+ vsri.u32 q5, q13, #20
+ ror r12, r12, #24
+ vsri.u32 q9, q14, #20
+ ror r11, r11, #24
+ vadd.i32 q0, q0, q1
+ add r8, r8, r12
+ vadd.i32 q4, q4, q5
+ add r9, r9, r11
+ vadd.i32 q8, q8, q9
+ eor r4, r4, r8
+ veor q12, q3, q0
+ eor r5, r5, r9
+ veor q13, q7, q4
+ str r11, [sp, #20]
+ veor q14, q11, q8
+ ror r4, r4, #25
+ vshl.i32 q3, q12, #8 /* vshl #8 + vsri #24 = rotate left by 8 */
+ ror r5, r5, #25
+ vshl.i32 q7, q13, #8
+ str r4, [sp, #4]
+ vshl.i32 q11, q14, #8
+ ldr r4, [sp, #28]
+ vsri.u32 q3, q12, #24
+ add r2, r2, r6
+ vsri.u32 q7, q13, #24
+ add r3, r3, r7
+ vsri.u32 q11, q14, #24
+ ldr r11, [sp, #12]
+ vadd.i32 q2, q2, q3
+ eor r14, r14, r2
+ vadd.i32 q6, q6, q7
+ eor r4, r4, r3
+ vadd.i32 q10, q10, q11
+ ror r14, r14, #16
+ veor q12, q1, q2
+ ror r4, r4, #16
+ veor q13, q5, q6
+ add r10, r10, r14
+ veor q14, q9, q10
+ add r11, r11, r4
+ vshl.i32 q1, q12, #7 /* vshl #7 + vsri #25 = rotate left by 7 */
+ eor r6, r6, r10
+ vshl.i32 q5, q13, #7
+ eor r7, r7, r11
+ vshl.i32 q9, q14, #7
+ ror r6, r6, #20
+ vsri.u32 q1, q12, #25
+ ror r7, r7, #20
+ vsri.u32 q5, q13, #25
+ add r2, r2, r6
+ vsri.u32 q9, q14, #25
+ add r3, r3, r7
+ vext.32 q3, q3, q3, #3 /* rotate rows 1-3 by whole words so the next half round works on diagonals */
+ eor r14, r14, r2
+ vext.32 q7, q7, q7, #3
+ eor r4, r4, r3
+ vext.32 q11, q11, q11, #3
+ ror r14, r14, #24
+ vext.32 q1, q1, q1, #1
+ ror r4, r4, #24
+ vext.32 q5, q5, q5, #1
+ add r10, r10, r14
+ vext.32 q9, q9, q9, #1
+ add r11, r11, r4
+ vext.32 q2, q2, q2, #2
+ eor r6, r6, r10
+ vext.32 q6, q6, q6, #2
+ eor r7, r7, r11
+ vext.32 q10, q10, q10, #2
+ ror r6, r6, #25
+ vadd.i32 q0, q0, q1 /* second half of the double round starts here */
+ ror r7, r7, #25
+ vadd.i32 q4, q4, q5
+ add r0, r0, r5
+ vadd.i32 q8, q8, q9
+ add r1, r1, r6
+ veor q12, q3, q0
+ eor r4, r4, r0
+ veor q13, q7, q4
+ eor r12, r12, r1
+ veor q14, q11, q8
+ ror r4, r4, #16
+ vrev32.16 q3, q12
+ ror r12, r12, #16
+ vrev32.16 q7, q13
+ add r10, r10, r4
+ vrev32.16 q11, q14
+ add r11, r11, r12
+ vadd.i32 q2, q2, q3
+ eor r5, r5, r10
+ vadd.i32 q6, q6, q7
+ eor r6, r6, r11
+ vadd.i32 q10, q10, q11
+ ror r5, r5, #20
+ veor q12, q1, q2
+ ror r6, r6, #20
+ veor q13, q5, q6
+ add r0, r0, r5
+ veor q14, q9, q10
+ add r1, r1, r6
+ vshl.i32 q1, q12, #12
+ eor r4, r4, r0
+ vshl.i32 q5, q13, #12
+ eor r12, r12, r1
+ vshl.i32 q9, q14, #12
+ ror r4, r4, #24
+ vsri.u32 q1, q12, #20
+ ror r12, r12, #24
+ vsri.u32 q5, q13, #20
+ add r10, r10, r4
+ vsri.u32 q9, q14, #20
+ add r11, r11, r12
+ vadd.i32 q0, q0, q1
+ eor r5, r5, r10
+ vadd.i32 q4, q4, q5
+ eor r6, r6, r11
+ vadd.i32 q8, q8, q9
+ str r11, [sp, #12]
+ veor q12, q3, q0
+ ror r5, r5, #25
+ veor q13, q7, q4
+ ror r6, r6, #25
+ veor q14, q11, q8
+ str r4, [sp, #28]
+ vshl.i32 q3, q12, #8
+ ldr r4, [sp, #4]
+ vshl.i32 q7, q13, #8
+ add r2, r2, r7
+ vshl.i32 q11, q14, #8
+ add r3, r3, r4
+ vsri.u32 q3, q12, #24
+ ldr r11, [sp, #20]
+ vsri.u32 q7, q13, #24
+ eor r11, r11, r2
+ vsri.u32 q11, q14, #24
+ eor r14, r14, r3
+ vadd.i32 q2, q2, q3
+ ror r11, r11, #16
+ vadd.i32 q6, q6, q7
+ ror r14, r14, #16
+ vadd.i32 q10, q10, q11
+ add r8, r8, r11
+ veor q12, q1, q2
+ add r9, r9, r14
+ veor q13, q5, q6
+ eor r7, r7, r8
+ veor q14, q9, q10
+ eor r4, r4, r9
+ vshl.i32 q1, q12, #7
+ ror r7, r7, #20
+ vshl.i32 q5, q13, #7
+ ror r4, r4, #20
+ vshl.i32 q9, q14, #7
+ str r6, [sp, #8]
+ vsri.u32 q1, q12, #25
+ add r2, r2, r7
+ vsri.u32 q5, q13, #25
+ add r3, r3, r4
+ vsri.u32 q9, q14, #25
+ eor r11, r11, r2
+ vext.32 q3, q3, q3, #1 /* undo the word rotation of rows 1-3 */
+ eor r14, r14, r3
+ vext.32 q7, q7, q7, #1
+ ror r11, r11, #24
+ vext.32 q11, q11, q11, #1
+ ror r14, r14, #24
+ vext.32 q1, q1, q1, #3
+ add r8, r8, r11
+ vext.32 q5, q5, q5, #3
+ add r9, r9, r14
+ vext.32 q9, q9, q9, #3
+ eor r7, r7, r8
+ vext.32 q2, q2, q2, #2
+ eor r4, r4, r9
+ vext.32 q6, q6, q6, #2
+ ror r7, r7, #25
+ vext.32 q10, q10, q10, #2
+ ror r4, r4, #25
+ bne .Lchacha_blocks_neon_rounds1 /* loop until the counter decremented by the subs above reaches 0 */
+ str r8, [sp, #0] /* park scalar working words at sp+0..; reloaded below with ldm sp, {r0-r7} */
+ str r9, [sp, #4]
+ str r10, [sp, #8]
+ str r12, [sp, #16]
+ str r11, [sp, #20]
+ str r14, [sp, #24]
+ add r9, sp, #64
+ vld1.32 {q12,q13}, [r9,:128]! /* q12-q15 = input state rows 0-3 for the final addition */
+ ldr r12, [sp, #48] /* r12 = input pointer */
+ vld1.32 {q14,q15}, [r9,:128]
+ ldr r14, [sp, #40] /* r14 = output pointer */
+ vadd.i32 q0, q0, q12
+ ldr r8, [sp, #(64 +0)]
+ vadd.i32 q4, q4, q12
+ ldr r9, [sp, #(64 +4)]
+ vadd.i32 q8, q8, q12
+ ldr r10, [sp, #(64 +8)]
+ vadd.i32 q1, q1, q13
+ ldr r11, [sp, #(64 +12)]
+ vadd.i32 q5, q5, q13
+ add r0, r0, r8
+ vadd.i32 q9, q9, q13
+ add r1, r1, r9
+ vadd.i32 q2, q2, q14
+ add r2, r2, r10
+ vadd.i32 q6, q6, q14
+ ldr r8, [sp, #(64 +16)]
+ vadd.i32 q10, q10, q14
+ add r3, r3, r11
+ veor q14, q14, q14 /* q14 = 0, then 1 in the low lane below */
+ ldr r9, [sp, #(64 +20)]
+ mov r11, #1
+ add r4, r4, r8
+ vmov.32 d28[0], r11
+ ldr r10, [sp, #(64 +24)]
+ vadd.u64 q12, q14, q15 /* row 3 with counter + 1 */
+ add r5, r5, r9
+ vadd.u64 q13, q14, q12 /* row 3 with counter + 2 */
+ ldr r11, [sp, #(64 +28)]
+ vadd.u64 q14, q14, q13 /* row 3 with counter + 3 */
+ add r6, r6, r10
+ vadd.i32 q3, q3, q12
+ tst r12, r12 /* NULL input: skip the XOR and emit the block words as they are */
+ vadd.i32 q7, q7, q13
+ add r7, r7, r11
+ vadd.i32 q11, q11, q14
+ beq .Lchacha_blocks_neon_nomessage11
+ ldmia r12!, {r8-r11}
+ eor r0, r0, r8
+ eor r1, r1, r9
+ eor r2, r2, r10
+ ldr r8, [r12, #0]
+ eor r3, r3, r11
+ ldr r9, [r12, #4]
+ eor r4, r4, r8
+ ldr r10, [r12, #8]
+ eor r5, r5, r9
+ ldr r11, [r12, #12]
+ eor r6, r6, r10
+ add r12, r12, #16
+ eor r7, r7, r11
+.Lchacha_blocks_neon_nomessage11:
+ stmia r14!, {r0-r7} /* first 32 bytes of the scalar block */
+ ldm sp, {r0-r7} /* reload the eight words parked at sp+0..31 */
+ ldr r8, [sp, #(64 +32)]
+ ldr r9, [sp, #(64 +36)]
+ ldr r10, [sp, #(64 +40)]
+ ldr r11, [sp, #(64 +44)]
+ add r0, r0, r8
+ add r1, r1, r9
+ add r2, r2, r10
+ ldr r8, [sp, #(64 +48)]
+ add r3, r3, r11
+ ldr r9, [sp, #(64 +52)]
+ add r4, r4, r8
+ ldr r10, [sp, #(64 +56)]
+ add r5, r5, r9
+ ldr r11, [sp, #(64 +60)]
+ add r6, r6, r10
+ adds r8, r8, #4 /* advance the 64-bit counter (words 12-13) in the state copy by 4 */
+ add r7, r7, r11
+ adc r9, r9, #0
+ str r8, [sp, #(64 +48)]
+ tst r12, r12 /* these flags also drive the three beq instructions after the next stmia */
+ str r9, [sp, #(64 +52)]
+ beq .Lchacha_blocks_neon_nomessage12
+ ldmia r12!, {r8-r11}
+ eor r0, r0, r8
+ eor r1, r1, r9
+ eor r2, r2, r10
+ ldr r8, [r12, #0]
+ eor r3, r3, r11
+ ldr r9, [r12, #4]
+ eor r4, r4, r8
+ ldr r10, [r12, #8]
+ eor r5, r5, r9
+ ldr r11, [r12, #12]
+ eor r6, r6, r10
+ add r12, r12, #16
+ eor r7, r7, r11
+.Lchacha_blocks_neon_nomessage12:
+ stmia r14!, {r0-r7} /* second 32 bytes of the scalar block */
+ beq .Lchacha_blocks_neon_nomessage13 /* Z still reflects the last tst r12, r12 */
+ vld1.32 {q12,q13}, [r12]!
+ vld1.32 {q14,q15}, [r12]!
+ veor q0, q0, q12
+ veor q1, q1, q13
+ veor q2, q2, q14
+ veor q3, q3, q15
+.Lchacha_blocks_neon_nomessage13:
+ vst1.32 {q0,q1}, [r14]! /* NEON block A */
+ vst1.32 {q2,q3}, [r14]!
+ beq .Lchacha_blocks_neon_nomessage14
+ vld1.32 {q12,q13}, [r12]!
+ vld1.32 {q14,q15}, [r12]!
+ veor q4, q4, q12
+ veor q5, q5, q13
+ veor q6, q6, q14
+ veor q7, q7, q15
+.Lchacha_blocks_neon_nomessage14:
+ vst1.32 {q4,q5}, [r14]! /* NEON block B */
+ vst1.32 {q6,q7}, [r14]!
+ beq .Lchacha_blocks_neon_nomessage15
+ vld1.32 {q12,q13}, [r12]!
+ vld1.32 {q14,q15}, [r12]!
+ veor q8, q8, q12
+ veor q9, q9, q13
+ veor q10, q10, q14
+ veor q11, q11, q15
+.Lchacha_blocks_neon_nomessage15:
+ vst1.32 {q8,q9}, [r14]! /* NEON block C */
+ vst1.32 {q10,q11}, [r14]!
+ str r12, [sp, #48] /* save the advanced input and output pointers */
+ str r14, [sp, #40]
+ ldr r3, [sp, #52]
+ sub r3, r3, #256
+ cmp r3, #256
+ str r3, [sp, #52]
+ bhs .Lchacha_blocks_neon_mainloop1 /* at least 256 bytes left: another four-block pass */
+ tst r3, r3
+ beq .Lchacha_blocks_neon_done
+.Lchacha_blocks_neon_mainloop2: /* scalar loop: one 64-byte block per pass */
+ ldr r3, [sp, #52]
+ ldr r1, [sp, #48]
+ cmp r3, #64
+ bhs .Lchacha_blocks_neon_noswap1
+ add r4, sp, #128 /* partial final block: work through the 64-byte buffer at sp+128 */
+ mov r5, r4
+ tst r1, r1
+ beq .Lchacha_blocks_neon_nocopy1
+.Lchacha_blocks_neon_copyinput1: /* copy the remaining r3 input bytes into the buffer */
+ subs r3, r3, #1
+ ldrb r0, [r1], #1
+ strb r0, [r4], #1
+ bne .Lchacha_blocks_neon_copyinput1
+ str r5, [sp, #48] /* input now comes from the buffer */
+.Lchacha_blocks_neon_nocopy1:
+ ldr r4, [sp, #40]
+ str r5, [sp, #40] /* output goes to the buffer ... */
+ str r4, [sp, #56] /* ... and the caller's output pointer is kept at sp+56 */
+.Lchacha_blocks_neon_noswap1:
+ ldr r0, [sp, #44]
+ str r0, [sp, #0] /* round counter lives at sp+0 during the rounds */
+ add r0, sp, #64
+ ldm r0, {r0-r12} /* words 0..12 of the state copy into r0-r12 */
+ ldr r14, [sp, #(64 +60)]
+ str r6, [sp, #8]
+ str r11, [sp, #12]
+ str r14, [sp, #28]
+ ldr r11, [sp, #(64 +52)]
+ ldr r14, [sp, #(64 +56)]
+.Lchacha_blocks_neon_rounds2: /* same scalar double round as in rounds1, without the NEON instructions */
+ ldr r6, [sp, #0]
+ add r0, r0, r4
+ add r1, r1, r5
+ eor r12, r12, r0
+ eor r11, r11, r1
+ ror r12, r12, #16
+ ror r11, r11, #16
+ subs r6, r6, #2 /* sets the flags tested by the bne at the end of the loop */
+ add r8, r8, r12
+ add r9, r9, r11
+ eor r4, r4, r8
+ eor r5, r5, r9
+ str r6, [sp, #0]
+ ror r4, r4, #20
+ ror r5, r5, #20
+ add r0, r0, r4
+ add r1, r1, r5
+ ldr r6, [sp, #8]
+ eor r12, r12, r0
+ eor r11, r11, r1
+ ror r12, r12, #24
+ ror r11, r11, #24
+ add r8, r8, r12
+ add r9, r9, r11
+ eor r4, r4, r8
+ eor r5, r5, r9
+ str r11, [sp, #20]
+ ror r4, r4, #25
+ ror r5, r5, #25
+ str r4, [sp, #4]
+ ldr r4, [sp, #28]
+ add r2, r2, r6
+ add r3, r3, r7
+ ldr r11, [sp, #12]
+ eor r14, r14, r2
+ eor r4, r4, r3
+ ror r14, r14, #16
+ ror r4, r4, #16
+ add r10, r10, r14
+ add r11, r11, r4
+ eor r6, r6, r10
+ eor r7, r7, r11
+ ror r6, r6, #20
+ ror r7, r7, #20
+ add r2, r2, r6
+ add r3, r3, r7
+ eor r14, r14, r2
+ eor r4, r4, r3
+ ror r14, r14, #24
+ ror r4, r4, #24
+ add r10, r10, r14
+ add r11, r11, r4
+ eor r6, r6, r10
+ eor r7, r7, r11
+ ror r6, r6, #25
+ ror r7, r7, #25
+ add r0, r0, r5 /* second half of the double round starts here */
+ add r1, r1, r6
+ eor r4, r4, r0
+ eor r12, r12, r1
+ ror r4, r4, #16
+ ror r12, r12, #16
+ add r10, r10, r4
+ add r11, r11, r12
+ eor r5, r5, r10
+ eor r6, r6, r11
+ ror r5, r5, #20
+ ror r6, r6, #20
+ add r0, r0, r5
+ add r1, r1, r6
+ eor r4, r4, r0
+ eor r12, r12, r1
+ ror r4, r4, #24
+ ror r12, r12, #24
+ add r10, r10, r4
+ add r11, r11, r12
+ eor r5, r5, r10
+ eor r6, r6, r11
+ str r11, [sp, #12]
+ ror r5, r5, #25
+ ror r6, r6, #25
+ str r4, [sp, #28]
+ ldr r4, [sp, #4]
+ add r2, r2, r7
+ add r3, r3, r4
+ ldr r11, [sp, #20]
+ eor r11, r11, r2
+ eor r14, r14, r3
+ ror r11, r11, #16
+ ror r14, r14, #16
+ add r8, r8, r11
+ add r9, r9, r14
+ eor r7, r7, r8
+ eor r4, r4, r9
+ ror r7, r7, #20
+ ror r4, r4, #20
+ str r6, [sp, #8]
+ add r2, r2, r7
+ add r3, r3, r4
+ eor r11, r11, r2
+ eor r14, r14, r3
+ ror r11, r11, #24
+ ror r14, r14, #24
+ add r8, r8, r11
+ add r9, r9, r14
+ eor r7, r7, r8
+ eor r4, r4, r9
+ ror r7, r7, #25
+ ror r4, r4, #25
+ bne .Lchacha_blocks_neon_rounds2
+ str r8, [sp, #0] /* park working words at sp+0..; reloaded below with ldm sp, {r0-r7} */
+ str r9, [sp, #4]
+ str r10, [sp, #8]
+ str r12, [sp, #16]
+ str r11, [sp, #20]
+ str r14, [sp, #24]
+ ldr r12, [sp, #48] /* r12 = input pointer */
+ ldr r14, [sp, #40] /* r14 = output pointer */
+ ldr r8, [sp, #(64 +0)]
+ ldr r9, [sp, #(64 +4)]
+ ldr r10, [sp, #(64 +8)]
+ ldr r11, [sp, #(64 +12)]
+ add r0, r0, r8
+ add r1, r1, r9
+ add r2, r2, r10
+ ldr r8, [sp, #(64 +16)]
+ add r3, r3, r11
+ ldr r9, [sp, #(64 +20)]
+ add r4, r4, r8
+ ldr r10, [sp, #(64 +24)]
+ add r5, r5, r9
+ ldr r11, [sp, #(64 +28)]
+ add r6, r6, r10
+ tst r12, r12 /* NULL input: skip the XOR */
+ add r7, r7, r11
+ beq .Lchacha_blocks_neon_nomessage21
+ ldmia r12!, {r8-r11}
+ eor r0, r0, r8
+ eor r1, r1, r9
+ eor r2, r2, r10
+ ldr r8, [r12, #0]
+ eor r3, r3, r11
+ ldr r9, [r12, #4]
+ eor r4, r4, r8
+ ldr r10, [r12, #8]
+ eor r5, r5, r9
+ ldr r11, [r12, #12]
+ eor r6, r6, r10
+ add r12, r12, #16
+ eor r7, r7, r11
+.Lchacha_blocks_neon_nomessage21:
+ stmia r14!, {r0-r7} /* first 32 bytes of the block */
+ ldm sp, {r0-r7} /* reload the eight words parked at sp+0..31 */
+ ldr r8, [sp, #(64 +32)]
+ ldr r9, [sp, #(64 +36)]
+ ldr r10, [sp, #(64 +40)]
+ ldr r11, [sp, #(64 +44)]
+ add r0, r0, r8
+ add r1, r1, r9
+ add r2, r2, r10
+ ldr r8, [sp, #(64 +48)]
+ add r3, r3, r11
+ ldr r9, [sp, #(64 +52)]
+ add r4, r4, r8
+ ldr r10, [sp, #(64 +56)]
+ add r5, r5, r9
+ ldr r11, [sp, #(64 +60)]
+ add r6, r6, r10
+ adds r8, r8, #1 /* advance the 64-bit counter (words 12-13) in the state copy by 1 */
+ add r7, r7, r11
+ adc r9, r9, #0
+ str r8, [sp, #(64 +48)]
+ tst r12, r12
+ str r9, [sp, #(64 +52)]
+ beq .Lchacha_blocks_neon_nomessage22
+ ldmia r12!, {r8-r11}
+ eor r0, r0, r8
+ eor r1, r1, r9
+ eor r2, r2, r10
+ ldr r8, [r12, #0]
+ eor r3, r3, r11
+ ldr r9, [r12, #4]
+ eor r4, r4, r8
+ ldr r10, [r12, #8]
+ eor r5, r5, r9
+ ldr r11, [r12, #12]
+ eor r6, r6, r10
+ add r12, r12, #16
+ eor r7, r7, r11
+.Lchacha_blocks_neon_nomessage22:
+ stmia r14!, {r0-r7} /* second 32 bytes of the block */
+ str r12, [sp, #48]
+ str r14, [sp, #40]
+ ldr r3, [sp, #52]
+ cmp r3, #64
+ sub r4, r3, #64 /* plain sub: the flags from the cmp above are kept for the bhi */
+ str r4, [sp, #52]
+ bhi .Lchacha_blocks_neon_mainloop2 /* more than 64 bytes were left: another block */
+ cmp r3, #64
+ beq .Lchacha_blocks_neon_nocopy2 /* exactly 64: output was written in place */
+ ldr r1, [sp, #56] /* partial block: r1 = caller's output pointer saved earlier */
+ sub r14, r14, #64 /* r14 = start of the 64 bytes just written to the buffer */
+.Lchacha_blocks_neon_copyinput2: /* copy the r3 valid bytes from the buffer to the caller's output */
+ subs r3, r3, #1
+ ldrb r0, [r14], #1
+ strb r0, [r1], #1
+ bne .Lchacha_blocks_neon_copyinput2
+.Lchacha_blocks_neon_nocopy2:
+.Lchacha_blocks_neon_done:
+ ldr r7, [sp, #60] /* r7 = caller's state pointer */
+ ldr r8, [sp, #(64 +48)]
+ ldr r9, [sp, #(64 +52)]
+ str r8, [r7, #(48 + 0)] /* write the advanced counter words 12-13 back to the caller's state */
+ str r9, [r7, #(48 + 4)]
+ mov r12, sp
+ stmia r12!, {r0-r7} /* overwrite scratch at sp+0..31 with current register contents */
+ add r12, r12, #48
+ stmia r12!, {r0-r7} /* overwrite sp+80..111, i.e. words 4..11 of the state copy; presumably to scrub the key copy - confirm */
+ sub r0, sp, #8
+ ldr sp, [sp, #192] /* back to the sp saved before the frame was carved out */
+ ldmfd sp!, {r4-r12, r14}
+ vldm sp!, {q4-q7}
+ sub r0, sp, r0 /* r0 = entry sp - (frame sp - 8); presumably the stack depth for the caller to wipe - confirm */
+ bx lr
+.Lchacha_blocks_neon_nobytes:
+ mov r0, #0;
+ bx lr
+.ltorg
+.size _gcry_chacha20_armv7_neon_blocks,.-_gcry_chacha20_armv7_neon_blocks;
+
+#endif
diff --git a/cipher/chacha20.c b/cipher/chacha20.c
index ebba2fc..c1847aa 100644
--- a/cipher/chacha20.c
+++ b/cipher/chacha20.c
@@ -67,6 +67,16 @@
# define USE_AVX2 1
#endif
+/* USE_NEON selects the ARM NEON assembly code: it is defined to 1 only when ENABLE_NEON_SUPPORT and all four macros tested below are defined, and is left undefined otherwise. */
+#undef USE_NEON
+#ifdef ENABLE_NEON_SUPPORT
+# if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \
+ && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \
+ && defined(HAVE_GCC_INLINE_ASM_NEON)
+# define USE_NEON 1
+# endif
+#endif /*ENABLE_NEON_SUPPORT*/
+
struct CHACHA20_context_s;
@@ -104,6 +114,13 @@ unsigned int _gcry_chacha20_amd64_avx2_blocks(u32 *state, const byte *in,
#endif /* USE_AVX2 */
+#ifdef USE_NEON
+/* ARMv7/NEON blocks function, implemented in assembly (chacha20-armv7-neon.S); same signature as the AVX2 variant declared above.  NOTE(review): the return value is presumably the stack depth to burn - confirm against the caller. */
+unsigned int _gcry_chacha20_armv7_neon_blocks(u32 *state, const byte *in,
+ byte *out, size_t bytes);
+
+#endif /* USE_NEON */
+
static void chacha20_setiv (void *context, const byte * iv, size_t ivlen);
static const char *selftest (void);
@@ -353,6 +370,10 @@ chacha20_do_setkey (CHACHA20_context_t * ctx,
if (features & HWF_INTEL_AVX2)
ctx->blocks = _gcry_chacha20_amd64_avx2_blocks;
#endif
+#ifdef USE_NEON
+ if (features & HWF_ARM_NEON)
+ ctx->blocks = _gcry_chacha20_armv7_neon_blocks;
+#endif
(void)features;
@@ -541,6 +562,19 @@ selftest (void)
if (buf[i] != (byte) i)
return "ChaCha20 encryption test 2 failed.";
+ chacha20_setkey (&ctx, key_1, sizeof key_1);
+ chacha20_setiv (&ctx, nonce_1, sizeof nonce_1);
+ /* encrypt */
+ for (i = 0; i < sizeof buf; i++)
+ chacha20_encrypt_stream (&ctx, &buf[i], &buf[i], 1);
+ /* decrypt */
+ chacha20_setkey (&ctx, key_1, sizeof key_1);
+ chacha20_setiv (&ctx, nonce_1, sizeof nonce_1);
+ chacha20_encrypt_stream (&ctx, buf, buf, sizeof buf);
+ for (i = 0; i < sizeof buf; i++)
+ if (buf[i] != (byte) i)
+ return "ChaCha20 encryption test 3 failed.";
+
return NULL;
}
diff --git a/configure.ac b/configure.ac
index d14b7f6..60ed015 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1822,6 +1822,11 @@ if test "$found" = "1" ; then
GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-avx2-amd64.lo"
;;
esac
+
+ if test x"$neonsupport" = xyes ; then
+ # NEON support is enabled: add the ARMv7/NEON ChaCha20 object to the cipher list.
+ GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-armv7-neon.lo"
+ fi
fi
case "${host}" in
-----------------------------------------------------------------------
Summary of changes:
cipher/Makefile.am | 3 +-
cipher/chacha20-armv7-neon.S | 710 ++++++++++++++++++++++++++++++++++++++++++
cipher/chacha20.c | 34 ++
cipher/poly1305-armv7-neon.S | 705 +++++++++++++++++++++++++++++++++++++++++
cipher/poly1305-internal.h | 18 ++
cipher/poly1305.c | 23 ++
configure.ac | 10 +
src/hwf-arm.c | 57 +++-
8 files changed, 1556 insertions(+), 4 deletions(-)
create mode 100644 cipher/chacha20-armv7-neon.S
create mode 100644 cipher/poly1305-armv7-neon.S
hooks/post-receive
--
The GNU crypto library
http://git.gnupg.org
More information about the Gnupg-commits
mailing list