From jussi.kivilinna at iki.fi Sun Nov 2 17:52:35 2014 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sun, 02 Nov 2014 18:52:35 +0200 Subject: [PATCH 1/3] chacha20: add ARMv7/NEON implementation Message-ID: <20141102165235.18119.67152.stgit@localhost6.localdomain6> * cipher/Makefile.am: Add 'chacha20-armv7-neon.S'. * cipher/chacha20-armv7-neon.S: New. * cipher/chacha20.c (USE_NEON): New. [USE_NEON] (_gcry_chacha20_armv7_neon_blocks): New. (chacha20_do_setkey) [USE_NEON]: Use Neon implementation if HWF_ARM_NEON flag set. (selftest): Self-test encrypting buffer byte by byte. * configure.ac [neonsupport=yes]: Add 'chacha20-armv7-neon.lo'. -- Add Andrew Moon's public domain ARMv7/NEON implementation of ChaCha20. Original source is available at: https://github.com/floodyberry/chacha-opt Benchmark on Cortex-A8 (--cpu-mhz 1008): Old: CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte STREAM enc | 13.45 ns/B 70.92 MiB/s 13.56 c/B STREAM dec | 13.45 ns/B 70.90 MiB/s 13.56 c/B New: CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte STREAM enc | 6.20 ns/B 153.9 MiB/s 6.25 c/B STREAM dec | 6.20 ns/B 153.9 MiB/s 6.25 c/B Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi> --- cipher/Makefile.am | 1 cipher/chacha20-armv7-neon.S | 710 ++++++++++++++++++++++++++++++++++++++++++ cipher/chacha20.c | 34 ++ configure.ac | 5 4 files changed, 750 insertions(+) create mode 100644 cipher/chacha20-armv7-neon.S diff --git a/cipher/Makefile.am b/cipher/Makefile.am index 7f45cbb..09ccaf9 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -61,6 +61,7 @@ arcfour.c arcfour-amd64.S \ blowfish.c blowfish-amd64.S blowfish-arm.S \ cast5.c cast5-amd64.S cast5-arm.S \ chacha20.c chacha20-sse2-amd64.S chacha20-ssse3-amd64.S chacha20-avx2-amd64.S \ + chacha20-armv7-neon.S \ crc.c \ des.c des-amd64.S \ dsa.c \ diff --git a/cipher/chacha20-armv7-neon.S b/cipher/chacha20-armv7-neon.S new file mode 100644 index 0000000..1a395ba --- /dev/null +++ b/cipher/chacha20-armv7-neon.S @@ -0,0 +1,710 @@ +/* 
chacha20-armv7-neon.S - ARM/NEON accelerated chacha20 blocks function + * + * Copyright (C) 2014 Jussi Kivilinna <jussi.kivilinna at iki.fi> + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +/* + * Based on public domain implementation by Andrew Moon at + * https://github.com/floodyberry/chacha-opt + */ + +#include <config.h> + +#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) && \ + defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_NEON) && defined(USE_CHACHA20) + +.syntax unified +.fpu neon +.arm + +.text + +.globl _gcry_chacha20_armv7_neon_blocks +.type _gcry_chacha20_armv7_neon_blocks,%function; +_gcry_chacha20_armv7_neon_blocks: +.Lchacha_blocks_neon_local: + tst r3, r3 + beq .Lchacha_blocks_neon_nobytes + vstmdb sp!, {q4,q5,q6,q7} + stmfd sp!, {r4-r12, r14} + mov r8, sp + sub sp, sp, #196 + and sp, sp, #0xffffffe0 + str r0, [sp, #60] + str r1, [sp, #48] + str r2, [sp, #40] + str r3, [sp, #52] + str r8, [sp, #192] + add r1, sp, #64 + ldmia r0!, {r4-r11} + stmia r1!, {r4-r11} + ldmia r0!, {r4-r11} + stmia r1!, {r4-r11} + mov r4, #20 + str r4, [sp, #44] + cmp r3, #256 + blo .Lchacha_blocks_neon_mainloop2 +.Lchacha_blocks_neon_mainloop1: + ldr r0, [sp, #44] + str r0, [sp, #0] + add r1, sp, #(64) + mov r2, #1 + veor q12, q12 + vld1.32 {q0,q1}, [r1,:128]! 
+ vld1.32 {q2,q3}, [r1,:128] + vmov.32 d24[0], r2 + vadd.u64 q3, q3, q12 + vmov q4, q0 + vmov q5, q1 + vmov q6, q2 + vadd.u64 q7, q3, q12 + vmov q8, q0 + vmov q9, q1 + vmov q10, q2 + vadd.u64 q11, q7, q12 + add r0, sp, #64 + ldm r0, {r0-r12} + ldr r14, [sp, #(64 +60)] + str r6, [sp, #8] + str r11, [sp, #12] + str r14, [sp, #28] + ldr r11, [sp, #(64 +52)] + ldr r14, [sp, #(64 +56)] +.Lchacha_blocks_neon_rounds1: + ldr r6, [sp, #0] + vadd.i32 q0, q0, q1 + add r0, r0, r4 + vadd.i32 q4, q4, q5 + add r1, r1, r5 + vadd.i32 q8, q8, q9 + eor r12, r12, r0 + veor q12, q3, q0 + eor r11, r11, r1 + veor q13, q7, q4 + ror r12, r12, #16 + veor q14, q11, q8 + ror r11, r11, #16 + vrev32.16 q3, q12 + subs r6, r6, #2 + vrev32.16 q7, q13 + add r8, r8, r12 + vrev32.16 q11, q14 + add r9, r9, r11 + vadd.i32 q2, q2, q3 + eor r4, r4, r8 + vadd.i32 q6, q6, q7 + eor r5, r5, r9 + vadd.i32 q10, q10, q11 + str r6, [sp, #0] + veor q12, q1, q2 + ror r4, r4, #20 + veor q13, q5, q6 + ror r5, r5, #20 + veor q14, q9, q10 + add r0, r0, r4 + vshl.i32 q1, q12, #12 + add r1, r1, r5 + vshl.i32 q5, q13, #12 + ldr r6, [sp, #8] + vshl.i32 q9, q14, #12 + eor r12, r12, r0 + vsri.u32 q1, q12, #20 + eor r11, r11, r1 + vsri.u32 q5, q13, #20 + ror r12, r12, #24 + vsri.u32 q9, q14, #20 + ror r11, r11, #24 + vadd.i32 q0, q0, q1 + add r8, r8, r12 + vadd.i32 q4, q4, q5 + add r9, r9, r11 + vadd.i32 q8, q8, q9 + eor r4, r4, r8 + veor q12, q3, q0 + eor r5, r5, r9 + veor q13, q7, q4 + str r11, [sp, #20] + veor q14, q11, q8 + ror r4, r4, #25 + vshl.i32 q3, q12, #8 + ror r5, r5, #25 + vshl.i32 q7, q13, #8 + str r4, [sp, #4] + vshl.i32 q11, q14, #8 + ldr r4, [sp, #28] + vsri.u32 q3, q12, #24 + add r2, r2, r6 + vsri.u32 q7, q13, #24 + add r3, r3, r7 + vsri.u32 q11, q14, #24 + ldr r11, [sp, #12] + vadd.i32 q2, q2, q3 + eor r14, r14, r2 + vadd.i32 q6, q6, q7 + eor r4, r4, r3 + vadd.i32 q10, q10, q11 + ror r14, r14, #16 + veor q12, q1, q2 + ror r4, r4, #16 + veor q13, q5, q6 + add r10, r10, r14 + veor q14, q9, q10 + add r11, 
r11, r4 + vshl.i32 q1, q12, #7 + eor r6, r6, r10 + vshl.i32 q5, q13, #7 + eor r7, r7, r11 + vshl.i32 q9, q14, #7 + ror r6, r6, #20 + vsri.u32 q1, q12, #25 + ror r7, r7, #20 + vsri.u32 q5, q13, #25 + add r2, r2, r6 + vsri.u32 q9, q14, #25 + add r3, r3, r7 + vext.32 q3, q3, q3, #3 + eor r14, r14, r2 + vext.32 q7, q7, q7, #3 + eor r4, r4, r3 + vext.32 q11, q11, q11, #3 + ror r14, r14, #24 + vext.32 q1, q1, q1, #1 + ror r4, r4, #24 + vext.32 q5, q5, q5, #1 + add r10, r10, r14 + vext.32 q9, q9, q9, #1 + add r11, r11, r4 + vext.32 q2, q2, q2, #2 + eor r6, r6, r10 + vext.32 q6, q6, q6, #2 + eor r7, r7, r11 + vext.32 q10, q10, q10, #2 + ror r6, r6, #25 + vadd.i32 q0, q0, q1 + ror r7, r7, #25 + vadd.i32 q4, q4, q5 + add r0, r0, r5 + vadd.i32 q8, q8, q9 + add r1, r1, r6 + veor q12, q3, q0 + eor r4, r4, r0 + veor q13, q7, q4 + eor r12, r12, r1 + veor q14, q11, q8 + ror r4, r4, #16 + vrev32.16 q3, q12 + ror r12, r12, #16 + vrev32.16 q7, q13 + add r10, r10, r4 + vrev32.16 q11, q14 + add r11, r11, r12 + vadd.i32 q2, q2, q3 + eor r5, r5, r10 + vadd.i32 q6, q6, q7 + eor r6, r6, r11 + vadd.i32 q10, q10, q11 + ror r5, r5, #20 + veor q12, q1, q2 + ror r6, r6, #20 + veor q13, q5, q6 + add r0, r0, r5 + veor q14, q9, q10 + add r1, r1, r6 + vshl.i32 q1, q12, #12 + eor r4, r4, r0 + vshl.i32 q5, q13, #12 + eor r12, r12, r1 + vshl.i32 q9, q14, #12 + ror r4, r4, #24 + vsri.u32 q1, q12, #20 + ror r12, r12, #24 + vsri.u32 q5, q13, #20 + add r10, r10, r4 + vsri.u32 q9, q14, #20 + add r11, r11, r12 + vadd.i32 q0, q0, q1 + eor r5, r5, r10 + vadd.i32 q4, q4, q5 + eor r6, r6, r11 + vadd.i32 q8, q8, q9 + str r11, [sp, #12] + veor q12, q3, q0 + ror r5, r5, #25 + veor q13, q7, q4 + ror r6, r6, #25 + veor q14, q11, q8 + str r4, [sp, #28] + vshl.i32 q3, q12, #8 + ldr r4, [sp, #4] + vshl.i32 q7, q13, #8 + add r2, r2, r7 + vshl.i32 q11, q14, #8 + add r3, r3, r4 + vsri.u32 q3, q12, #24 + ldr r11, [sp, #20] + vsri.u32 q7, q13, #24 + eor r11, r11, r2 + vsri.u32 q11, q14, #24 + eor r14, r14, r3 + vadd.i32 q2, 
q2, q3 + ror r11, r11, #16 + vadd.i32 q6, q6, q7 + ror r14, r14, #16 + vadd.i32 q10, q10, q11 + add r8, r8, r11 + veor q12, q1, q2 + add r9, r9, r14 + veor q13, q5, q6 + eor r7, r7, r8 + veor q14, q9, q10 + eor r4, r4, r9 + vshl.i32 q1, q12, #7 + ror r7, r7, #20 + vshl.i32 q5, q13, #7 + ror r4, r4, #20 + vshl.i32 q9, q14, #7 + str r6, [sp, #8] + vsri.u32 q1, q12, #25 + add r2, r2, r7 + vsri.u32 q5, q13, #25 + add r3, r3, r4 + vsri.u32 q9, q14, #25 + eor r11, r11, r2 + vext.32 q3, q3, q3, #1 + eor r14, r14, r3 + vext.32 q7, q7, q7, #1 + ror r11, r11, #24 + vext.32 q11, q11, q11, #1 + ror r14, r14, #24 + vext.32 q1, q1, q1, #3 + add r8, r8, r11 + vext.32 q5, q5, q5, #3 + add r9, r9, r14 + vext.32 q9, q9, q9, #3 + eor r7, r7, r8 + vext.32 q2, q2, q2, #2 + eor r4, r4, r9 + vext.32 q6, q6, q6, #2 + ror r7, r7, #25 + vext.32 q10, q10, q10, #2 + ror r4, r4, #25 + bne .Lchacha_blocks_neon_rounds1 + str r8, [sp, #0] + str r9, [sp, #4] + str r10, [sp, #8] + str r12, [sp, #16] + str r11, [sp, #20] + str r14, [sp, #24] + add r9, sp, #64 + vld1.32 {q12,q13}, [r9,:128]! 
+ ldr r12, [sp, #48] + vld1.32 {q14,q15}, [r9,:128] + ldr r14, [sp, #40] + vadd.i32 q0, q0, q12 + ldr r8, [sp, #(64 +0)] + vadd.i32 q4, q4, q12 + ldr r9, [sp, #(64 +4)] + vadd.i32 q8, q8, q12 + ldr r10, [sp, #(64 +8)] + vadd.i32 q1, q1, q13 + ldr r11, [sp, #(64 +12)] + vadd.i32 q5, q5, q13 + add r0, r0, r8 + vadd.i32 q9, q9, q13 + add r1, r1, r9 + vadd.i32 q2, q2, q14 + add r2, r2, r10 + vadd.i32 q6, q6, q14 + ldr r8, [sp, #(64 +16)] + vadd.i32 q10, q10, q14 + add r3, r3, r11 + veor q14, q14, q14 + ldr r9, [sp, #(64 +20)] + mov r11, #1 + add r4, r4, r8 + vmov.32 d28[0], r11 + ldr r10, [sp, #(64 +24)] + vadd.u64 q12, q14, q15 + add r5, r5, r9 + vadd.u64 q13, q14, q12 + ldr r11, [sp, #(64 +28)] + vadd.u64 q14, q14, q13 + add r6, r6, r10 + vadd.i32 q3, q3, q12 + tst r12, r12 + vadd.i32 q7, q7, q13 + add r7, r7, r11 + vadd.i32 q11, q11, q14 + beq .Lchacha_blocks_neon_nomessage11 + ldmia r12!, {r8-r11} + eor r0, r0, r8 + eor r1, r1, r9 + eor r2, r2, r10 + ldr r8, [r12, #0] + eor r3, r3, r11 + ldr r9, [r12, #4] + eor r4, r4, r8 + ldr r10, [r12, #8] + eor r5, r5, r9 + ldr r11, [r12, #12] + eor r6, r6, r10 + add r12, r12, #16 + eor r7, r7, r11 +.Lchacha_blocks_neon_nomessage11: + stmia r14!, {r0-r7} + ldm sp, {r0-r7} + ldr r8, [sp, #(64 +32)] + ldr r9, [sp, #(64 +36)] + ldr r10, [sp, #(64 +40)] + ldr r11, [sp, #(64 +44)] + add r0, r0, r8 + add r1, r1, r9 + add r2, r2, r10 + ldr r8, [sp, #(64 +48)] + add r3, r3, r11 + ldr r9, [sp, #(64 +52)] + add r4, r4, r8 + ldr r10, [sp, #(64 +56)] + add r5, r5, r9 + ldr r11, [sp, #(64 +60)] + add r6, r6, r10 + adds r8, r8, #4 + add r7, r7, r11 + adc r9, r9, #0 + str r8, [sp, #(64 +48)] + tst r12, r12 + str r9, [sp, #(64 +52)] + beq .Lchacha_blocks_neon_nomessage12 + ldmia r12!, {r8-r11} + eor r0, r0, r8 + eor r1, r1, r9 + eor r2, r2, r10 + ldr r8, [r12, #0] + eor r3, r3, r11 + ldr r9, [r12, #4] + eor r4, r4, r8 + ldr r10, [r12, #8] + eor r5, r5, r9 + ldr r11, [r12, #12] + eor r6, r6, r10 + add r12, r12, #16 + eor r7, r7, r11 
+.Lchacha_blocks_neon_nomessage12: + stmia r14!, {r0-r7} + beq .Lchacha_blocks_neon_nomessage13 + vld1.32 {q12,q13}, [r12]! + vld1.32 {q14,q15}, [r12]! + veor q0, q0, q12 + veor q1, q1, q13 + veor q2, q2, q14 + veor q3, q3, q15 +.Lchacha_blocks_neon_nomessage13: + vst1.32 {q0,q1}, [r14]! + vst1.32 {q2,q3}, [r14]! + beq .Lchacha_blocks_neon_nomessage14 + vld1.32 {q12,q13}, [r12]! + vld1.32 {q14,q15}, [r12]! + veor q4, q4, q12 + veor q5, q5, q13 + veor q6, q6, q14 + veor q7, q7, q15 +.Lchacha_blocks_neon_nomessage14: + vst1.32 {q4,q5}, [r14]! + vst1.32 {q6,q7}, [r14]! + beq .Lchacha_blocks_neon_nomessage15 + vld1.32 {q12,q13}, [r12]! + vld1.32 {q14,q15}, [r12]! + veor q8, q8, q12 + veor q9, q9, q13 + veor q10, q10, q14 + veor q11, q11, q15 +.Lchacha_blocks_neon_nomessage15: + vst1.32 {q8,q9}, [r14]! + vst1.32 {q10,q11}, [r14]! + str r12, [sp, #48] + str r14, [sp, #40] + ldr r3, [sp, #52] + sub r3, r3, #256 + cmp r3, #256 + str r3, [sp, #52] + bhs .Lchacha_blocks_neon_mainloop1 + tst r3, r3 + beq .Lchacha_blocks_neon_done +.Lchacha_blocks_neon_mainloop2: + ldr r3, [sp, #52] + ldr r1, [sp, #48] + cmp r3, #64 + bhs .Lchacha_blocks_neon_noswap1 + add r4, sp, #128 + mov r5, r4 + tst r1, r1 + beq .Lchacha_blocks_neon_nocopy1 +.Lchacha_blocks_neon_copyinput1: + subs r3, r3, #1 + ldrb r0, [r1], #1 + strb r0, [r4], #1 + bne .Lchacha_blocks_neon_copyinput1 + str r5, [sp, #48] +.Lchacha_blocks_neon_nocopy1: + ldr r4, [sp, #40] + str r5, [sp, #40] + str r4, [sp, #56] +.Lchacha_blocks_neon_noswap1: + ldr r0, [sp, #44] + str r0, [sp, #0] + add r0, sp, #64 + ldm r0, {r0-r12} + ldr r14, [sp, #(64 +60)] + str r6, [sp, #8] + str r11, [sp, #12] + str r14, [sp, #28] + ldr r11, [sp, #(64 +52)] + ldr r14, [sp, #(64 +56)] +.Lchacha_blocks_neon_rounds2: + ldr r6, [sp, #0] + add r0, r0, r4 + add r1, r1, r5 + eor r12, r12, r0 + eor r11, r11, r1 + ror r12, r12, #16 + ror r11, r11, #16 + subs r6, r6, #2 + add r8, r8, r12 + add r9, r9, r11 + eor r4, r4, r8 + eor r5, r5, r9 + str r6, [sp, #0] + 
ror r4, r4, #20 + ror r5, r5, #20 + add r0, r0, r4 + add r1, r1, r5 + ldr r6, [sp, #8] + eor r12, r12, r0 + eor r11, r11, r1 + ror r12, r12, #24 + ror r11, r11, #24 + add r8, r8, r12 + add r9, r9, r11 + eor r4, r4, r8 + eor r5, r5, r9 + str r11, [sp, #20] + ror r4, r4, #25 + ror r5, r5, #25 + str r4, [sp, #4] + ldr r4, [sp, #28] + add r2, r2, r6 + add r3, r3, r7 + ldr r11, [sp, #12] + eor r14, r14, r2 + eor r4, r4, r3 + ror r14, r14, #16 + ror r4, r4, #16 + add r10, r10, r14 + add r11, r11, r4 + eor r6, r6, r10 + eor r7, r7, r11 + ror r6, r6, #20 + ror r7, r7, #20 + add r2, r2, r6 + add r3, r3, r7 + eor r14, r14, r2 + eor r4, r4, r3 + ror r14, r14, #24 + ror r4, r4, #24 + add r10, r10, r14 + add r11, r11, r4 + eor r6, r6, r10 + eor r7, r7, r11 + ror r6, r6, #25 + ror r7, r7, #25 + add r0, r0, r5 + add r1, r1, r6 + eor r4, r4, r0 + eor r12, r12, r1 + ror r4, r4, #16 + ror r12, r12, #16 + add r10, r10, r4 + add r11, r11, r12 + eor r5, r5, r10 + eor r6, r6, r11 + ror r5, r5, #20 + ror r6, r6, #20 + add r0, r0, r5 + add r1, r1, r6 + eor r4, r4, r0 + eor r12, r12, r1 + ror r4, r4, #24 + ror r12, r12, #24 + add r10, r10, r4 + add r11, r11, r12 + eor r5, r5, r10 + eor r6, r6, r11 + str r11, [sp, #12] + ror r5, r5, #25 + ror r6, r6, #25 + str r4, [sp, #28] + ldr r4, [sp, #4] + add r2, r2, r7 + add r3, r3, r4 + ldr r11, [sp, #20] + eor r11, r11, r2 + eor r14, r14, r3 + ror r11, r11, #16 + ror r14, r14, #16 + add r8, r8, r11 + add r9, r9, r14 + eor r7, r7, r8 + eor r4, r4, r9 + ror r7, r7, #20 + ror r4, r4, #20 + str r6, [sp, #8] + add r2, r2, r7 + add r3, r3, r4 + eor r11, r11, r2 + eor r14, r14, r3 + ror r11, r11, #24 + ror r14, r14, #24 + add r8, r8, r11 + add r9, r9, r14 + eor r7, r7, r8 + eor r4, r4, r9 + ror r7, r7, #25 + ror r4, r4, #25 + bne .Lchacha_blocks_neon_rounds2 + str r8, [sp, #0] + str r9, [sp, #4] + str r10, [sp, #8] + str r12, [sp, #16] + str r11, [sp, #20] + str r14, [sp, #24] + ldr r12, [sp, #48] + ldr r14, [sp, #40] + ldr r8, [sp, #(64 +0)] + ldr r9, 
[sp, #(64 +4)] + ldr r10, [sp, #(64 +8)] + ldr r11, [sp, #(64 +12)] + add r0, r0, r8 + add r1, r1, r9 + add r2, r2, r10 + ldr r8, [sp, #(64 +16)] + add r3, r3, r11 + ldr r9, [sp, #(64 +20)] + add r4, r4, r8 + ldr r10, [sp, #(64 +24)] + add r5, r5, r9 + ldr r11, [sp, #(64 +28)] + add r6, r6, r10 + tst r12, r12 + add r7, r7, r11 + beq .Lchacha_blocks_neon_nomessage21 + ldmia r12!, {r8-r11} + eor r0, r0, r8 + eor r1, r1, r9 + eor r2, r2, r10 + ldr r8, [r12, #0] + eor r3, r3, r11 + ldr r9, [r12, #4] + eor r4, r4, r8 + ldr r10, [r12, #8] + eor r5, r5, r9 + ldr r11, [r12, #12] + eor r6, r6, r10 + add r12, r12, #16 + eor r7, r7, r11 +.Lchacha_blocks_neon_nomessage21: + stmia r14!, {r0-r7} + ldm sp, {r0-r7} + ldr r8, [sp, #(64 +32)] + ldr r9, [sp, #(64 +36)] + ldr r10, [sp, #(64 +40)] + ldr r11, [sp, #(64 +44)] + add r0, r0, r8 + add r1, r1, r9 + add r2, r2, r10 + ldr r8, [sp, #(64 +48)] + add r3, r3, r11 + ldr r9, [sp, #(64 +52)] + add r4, r4, r8 + ldr r10, [sp, #(64 +56)] + add r5, r5, r9 + ldr r11, [sp, #(64 +60)] + add r6, r6, r10 + adds r8, r8, #1 + add r7, r7, r11 + adc r9, r9, #0 + str r8, [sp, #(64 +48)] + tst r12, r12 + str r9, [sp, #(64 +52)] + beq .Lchacha_blocks_neon_nomessage22 + ldmia r12!, {r8-r11} + eor r0, r0, r8 + eor r1, r1, r9 + eor r2, r2, r10 + ldr r8, [r12, #0] + eor r3, r3, r11 + ldr r9, [r12, #4] + eor r4, r4, r8 + ldr r10, [r12, #8] + eor r5, r5, r9 + ldr r11, [r12, #12] + eor r6, r6, r10 + add r12, r12, #16 + eor r7, r7, r11 +.Lchacha_blocks_neon_nomessage22: + stmia r14!, {r0-r7} + str r12, [sp, #48] + str r14, [sp, #40] + ldr r3, [sp, #52] + cmp r3, #64 + sub r4, r3, #64 + str r4, [sp, #52] + bhi .Lchacha_blocks_neon_mainloop2 + cmp r3, #64 + beq .Lchacha_blocks_neon_nocopy2 + ldr r1, [sp, #56] + sub r14, r14, #64 +.Lchacha_blocks_neon_copyinput2: + subs r3, r3, #1 + ldrb r0, [r14], #1 + strb r0, [r1], #1 + bne .Lchacha_blocks_neon_copyinput2 +.Lchacha_blocks_neon_nocopy2: +.Lchacha_blocks_neon_done: + ldr r7, [sp, #60] + ldr r8, [sp, #(64 
+48)] + ldr r9, [sp, #(64 +52)] + str r8, [r7, #(48 + 0)] + str r9, [r7, #(48 + 4)] + mov r12, sp + stmia r12!, {r0-r7} + add r12, r12, #48 + stmia r12!, {r0-r7} + sub r0, sp, #8 + ldr sp, [sp, #192] + ldmfd sp!, {r4-r12, r14} + vldm sp!, {q4-q7} + sub r0, sp, r0 + bx lr +.Lchacha_blocks_neon_nobytes: + mov r0, #0; + bx lr +.ltorg +.size _gcry_chacha20_armv7_neon_blocks,.-_gcry_chacha20_armv7_neon_blocks; + +#endif diff --git a/cipher/chacha20.c b/cipher/chacha20.c index ebba2fc..c1847aa 100644 --- a/cipher/chacha20.c +++ b/cipher/chacha20.c @@ -67,6 +67,16 @@ # define USE_AVX2 1 #endif +/* USE_NEON indicates whether to enable ARM NEON assembly code. */ +#undef USE_NEON +#ifdef ENABLE_NEON_SUPPORT +# if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \ + && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \ + && defined(HAVE_GCC_INLINE_ASM_NEON) +# define USE_NEON 1 +# endif +#endif /*ENABLE_NEON_SUPPORT*/ + struct CHACHA20_context_s; @@ -104,6 +114,13 @@ unsigned int _gcry_chacha20_amd64_avx2_blocks(u32 *state, const byte *in, #endif /* USE_AVX2 */ +#ifdef USE_NEON + +unsigned int _gcry_chacha20_armv7_neon_blocks(u32 *state, const byte *in, + byte *out, size_t bytes); + +#endif /* USE_NEON */ + static void chacha20_setiv (void *context, const byte * iv, size_t ivlen); static const char *selftest (void); @@ -353,6 +370,10 @@ chacha20_do_setkey (CHACHA20_context_t * ctx, if (features & HWF_INTEL_AVX2) ctx->blocks = _gcry_chacha20_amd64_avx2_blocks; #endif +#ifdef USE_NEON + if (features & HWF_ARM_NEON) + ctx->blocks = _gcry_chacha20_armv7_neon_blocks; +#endif (void)features; @@ -541,6 +562,19 @@ selftest (void) if (buf[i] != (byte) i) return "ChaCha20 encryption test 2 failed."; + chacha20_setkey (&ctx, key_1, sizeof key_1); + chacha20_setiv (&ctx, nonce_1, sizeof nonce_1); + /* encrypt */ + for (i = 0; i < sizeof buf; i++) + chacha20_encrypt_stream (&ctx, &buf[i], &buf[i], 1); + /* decrypt */ + chacha20_setkey (&ctx, key_1, sizeof key_1); + chacha20_setiv (&ctx, 
nonce_1, sizeof nonce_1); + chacha20_encrypt_stream (&ctx, buf, buf, sizeof buf); + for (i = 0; i < sizeof buf; i++) + if (buf[i] != (byte) i) + return "ChaCha20 encryption test 3 failed."; + return NULL; } diff --git a/configure.ac b/configure.ac index d14b7f6..60ed015 100644 --- a/configure.ac +++ b/configure.ac @@ -1822,6 +1822,11 @@ if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-avx2-amd64.lo" ;; esac + + if test x"$neonsupport" = xyes ; then + # Build with the NEON implementation + GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-armv7-neon.lo" + fi fi case "${host}" in From jussi.kivilinna at iki.fi Sun Nov 2 17:52:45 2014 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sun, 02 Nov 2014 18:52:45 +0200 Subject: [PATCH 3/3] Disable NEON for CPUs that are known to have broken NEON implementation In-Reply-To: <20141102165235.18119.67152.stgit@localhost6.localdomain6> References: <20141102165235.18119.67152.stgit@localhost6.localdomain6> Message-ID: <20141102165245.18119.39131.stgit@localhost6.localdomain6> * src/hwf-arm.c (detect_arm_proc_cpuinfo): Add parsing for CPU version information and check if CPU is known to have broken NEON implementation. (_gcry_hwf_detect_arm): Filter out broken HW features. 
-- Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi> --- src/hwf-arm.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 54 insertions(+), 3 deletions(-) diff --git a/src/hwf-arm.c b/src/hwf-arm.c index dbbb607..ac86fb9 100644 --- a/src/hwf-arm.c +++ b/src/hwf-arm.c @@ -98,17 +98,32 @@ detect_arm_at_hwcap(void) #define HAS_PROC_CPUINFO 1 static unsigned int -detect_arm_proc_cpuinfo(void) +detect_arm_proc_cpuinfo(unsigned int *broken_hwfs) { char buf[1024]; /* large enough */ char *str_features, *str_neon; + int cpu_implementer, cpu_arch, cpu_variant, cpu_part, cpu_revision; FILE *f; int readlen, i; static int cpuinfo_initialized = 0; static unsigned int stored_cpuinfo_features; + static unsigned int stored_broken_hwfs; + struct { + const char *name; + int *value; + } cpu_entries[5] = { + { "CPU implementer", &cpu_implementer }, + { "CPU architecture", &cpu_arch }, + { "CPU variant", &cpu_variant }, + { "CPU part", &cpu_part }, + { "CPU revision", &cpu_revision }, + }; if (cpuinfo_initialized) - return stored_cpuinfo_features; + { + *broken_hwfs |= stored_broken_hwfs; + return stored_cpuinfo_features; + } f = fopen("/proc/cpuinfo", "r"); if (!f) @@ -124,12 +139,32 @@ detect_arm_proc_cpuinfo(void) cpuinfo_initialized = 1; stored_cpuinfo_features = 0; + stored_broken_hwfs = 0; /* Find features line. */ str_features = strstr(buf, "Features"); if (!str_features) return stored_cpuinfo_features; + /* Find CPU version information. */ + for (i = 0; i < sizeof(cpu_entries) / sizeof(cpu_entries[0]); i++) + { + char *str; + + *cpu_entries[i].value = -1; + + str = strstr(buf, cpu_entries[i].name); + if (!str) + continue; + + str = strstr(str, ": "); + if (!str) + continue; + + str += 2; + *cpu_entries[i].value = strtoul(str, NULL, 0); + } + /* Lines to strings. 
*/ for (i = 0; i < sizeof(buf); i++) if (buf[i] == '\n') @@ -140,6 +175,19 @@ detect_arm_proc_cpuinfo(void) if (str_neon && (str_neon[5] == ' ' || str_neon[5] == '\0')) stored_cpuinfo_features |= HWF_ARM_NEON; + /* Check for CPUs with broken NEON implementation. See + * https://code.google.com/p/chromium/issues/detail?id=341598 + */ + if (cpu_implementer == 0x51 && + cpu_arch == 7 && + cpu_variant == 1 && + cpu_part == 0x4d && + cpu_revision == 0) + { + stored_broken_hwfs = HWF_ARM_NEON; + } + + *broken_hwfs |= stored_broken_hwfs; return stored_cpuinfo_features; } @@ -149,18 +197,21 @@ unsigned int _gcry_hwf_detect_arm (void) { unsigned int ret = 0; + unsigned int broken_hwfs = 0; #if defined (HAS_SYS_AT_HWCAP) ret |= detect_arm_at_hwcap (); #endif #if defined (HAS_PROC_CPUINFO) - ret |= detect_arm_proc_cpuinfo (); + ret |= detect_arm_proc_cpuinfo (&broken_hwfs); #endif #if defined(__ARM_NEON__) && defined(ENABLE_NEON_SUPPORT) ret |= HWF_ARM_NEON; #endif + ret &= ~broken_hwfs; + return ret; } From jussi.kivilinna at iki.fi Sun Nov 2 17:52:40 2014 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sun, 02 Nov 2014 18:52:40 +0200 Subject: [PATCH 2/3] Add ARM/NEON implementation of Poly1305 In-Reply-To: <20141102165235.18119.67152.stgit@localhost6.localdomain6> References: <20141102165235.18119.67152.stgit@localhost6.localdomain6> Message-ID: <20141102165240.18119.57146.stgit@localhost6.localdomain6> * cipher/Makefile.am: Add 'poly1305-armv7-neon.S'. * cipher/poly1305-armv7-neon.S: New. * cipher/poly1305-internal.h (POLY1305_USE_NEON) (POLY1305_NEON_BLOCKSIZE, POLY1305_NEON_STATESIZE) (POLY1305_NEON_ALIGNMENT): New. * cipher/poly1305.c [POLY1305_USE_NEON] (_gcry_poly1305_armv7_neon_init_ext) (_gcry_poly1305_armv7_neon_finish_ext) (_gcry_poly1305_armv7_neon_blocks, poly1305_armv7_neon_ops): New. (_gcry_poly1305_init) [POLY1305_USE_NEON]: Select NEON implementation if HWF_ARM_NEON set. * configure.ac [neonsupport=yes]: Add 'poly1305-armv7-neon.lo'. 
-- Add Andrew Moon's public domain NEON implementation of Poly1305. Original source is available at: https://github.com/floodyberry/poly1305-opt Benchmark on Cortex-A8 (--cpu-mhz 1008): Old: | nanosecs/byte mebibytes/sec cycles/byte POLY1305 | 12.34 ns/B 77.27 MiB/s 12.44 c/B New: | nanosecs/byte mebibytes/sec cycles/byte POLY1305 | 2.12 ns/B 450.7 MiB/s 2.13 c/B Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi> --- cipher/Makefile.am | 2 cipher/poly1305-armv7-neon.S | 705 ++++++++++++++++++++++++++++++++++++++++++ cipher/poly1305-internal.h | 18 + cipher/poly1305.c | 23 + configure.ac | 5 5 files changed, 752 insertions(+), 1 deletion(-) create mode 100644 cipher/poly1305-armv7-neon.S diff --git a/cipher/Makefile.am b/cipher/Makefile.am index 09ccaf9..22018b3 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -73,7 +73,7 @@ gost28147.c gost.h \ gostr3411-94.c \ md4.c \ md5.c \ -poly1305-sse2-amd64.S poly1305-avx2-amd64.S \ +poly1305-sse2-amd64.S poly1305-avx2-amd64.S poly1305-armv7-neon.S \ rijndael.c rijndael-tables.h rijndael-amd64.S rijndael-arm.S \ rmd160.c \ rsa.c \ diff --git a/cipher/poly1305-armv7-neon.S b/cipher/poly1305-armv7-neon.S new file mode 100644 index 0000000..1134e85 --- /dev/null +++ b/cipher/poly1305-armv7-neon.S @@ -0,0 +1,705 @@ +/* poly1305-armv7-neon.S - ARMv7/NEON implementation of Poly1305 + * + * Copyright (C) 2014 Jussi Kivilinna <jussi.kivilinna at iki.fi> + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +/* + * Based on public domain implementation by Andrew Moon at + * https://github.com/floodyberry/poly1305-opt + */ + +#include <config.h> + +#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) && \ + defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_NEON) + +.syntax unified +.fpu neon +.arm + +.text + +.p2align 2 +.Lpoly1305_init_constants_neon: +.long 0x3ffff03 +.long 0x3ffc0ff +.long 0x3f03fff +.long 0x00fffff + +.globl _gcry_poly1305_armv7_neon_init_ext +.type _gcry_poly1305_armv7_neon_init_ext,%function; +_gcry_poly1305_armv7_neon_init_ext: +.Lpoly1305_init_ext_neon_local: + stmfd sp!, {r4-r11, lr} + sub sp, sp, #32 + mov r14, r2 /* r14 = caller's r2, checked against the 16 and 64 thresholds below */ + ands r2, r2, r2 /* set Z from r2; a plain and leaves the flags alone, so moveq used to test stale caller flags */ + moveq r14, #-1 /* r2 == 0: r14 = 0xffffffff, so neither threshold check stops the squaring */ + ldmia r1!, {r2-r5} + ldr r7, =.Lpoly1305_init_constants_neon + mov r6, r2 + mov r8, r2, lsr #26 + mov r9, r3, lsr #20 + mov r10, r4, lsr #14 + mov r11, r5, lsr #8 + orr r8, r8, r3, lsl #6 + orr r9, r9, r4, lsl #12 + orr r10, r10, r5, lsl #18 + ldmia r7, {r2-r5} + and r2, r2, r8 + and r3, r3, r9 + and r4, r4, r10 + and r5, r5, r11 + and r6, r6, 0x3ffffff + stmia r0!, {r2-r6} + eor r8, r8, r8 + str r8, [sp, #24] +.Lpoly1305_init_ext_neon_squareloop: + ldr r8, [sp, #24] /* r8 = squarings completed so far (0, 1 or 2) */ + mov r12, #16 + cmp r8, #2 + beq .Lpoly1305_init_ext_neon_donesquaring + cmp r8, #1 + moveq r12, #64 + cmp r14, r12 + bls .Lpoly1305_init_ext_neon_donesquaring + add r8, #1 + str r8, [sp, #24] + mov r6, r6, lsl #1 + mov r2, r2, lsl #1 + umull r7, r8, r3, r3 + umull r9, r10, r6, r4 + umlal r7, r8, r6, r5 + umlal r9, r10, r2, r3 + add r11, r5, r5, lsl #2 + umlal r7, r8, r2, r4 + umlal r9, r10, r5, r11 + str r7, [sp, #16] + str r8, [sp, #20] + mov r2, r2, lsr #1 + mov r5, r5, lsl #1 + str r9, [sp, #8] + str r10, [sp, #12] + umull r7, r8, r2, r2 + umull r9, r10, r6, r2 + add r11, r3, r3, lsl #2 + add r12, r4, r4, lsl #2 + umlal r7, r8, r6, r3 + umlal r9, r10, r5, r11 + umlal r7, r8, r5, r12 
+ umlal r9, r10, r4, r12 + mov r6, r6, lsr #1 + mov r3, r3, lsl #1 + add r11, r2, r2, lsl #2 + str r7, [sp, #0] + str r8, [sp, #4] + umull r7, r8, r6, r6 + umlal r7, r8, r3, r12 + umlal r7, r8, r5, r11 + and r6, r7, 0x3ffffff + mov r11, r7, lsr #26 + orr r11, r11, r8, lsl #6 + ldr r7, [sp, #0] + ldr r8, [sp, #4] + adds r9, r9, r11 + adc r10, r10, #0 + and r2, r9, 0x3ffffff + mov r11, r9, lsr #26 + orr r11, r11, r10, lsl #6 + ldr r9, [sp, #8] + ldr r10, [sp, #12] + adds r7, r7, r11 + adc r8, r8, #0 + and r3, r7, 0x3ffffff + mov r11, r7, lsr #26 + orr r11, r11, r8, lsl #6 + ldr r7, [sp, #16] + ldr r8, [sp, #20] + adds r9, r9, r11 + adc r10, r10, #0 + and r4, r9, 0x3ffffff + mov r11, r9, lsr #26 + orr r11, r11, r10, lsl #6 + adds r7, r7, r11 + adc r8, r8, #0 + and r5, r7, 0x3ffffff + mov r11, r7, lsr #26 + orr r11, r11, r8, lsl #6 + add r11, r11, r11, lsl #2 + add r6, r6, r11 + mov r11, r6, lsr #26 + and r6, r6, 0x3ffffff + add r2, r2, r11 + stmia r0!, {r2-r6} + b .Lpoly1305_init_ext_neon_squareloop +.Lpoly1305_init_ext_neon_donesquaring: + mov r2, #2 + ldr r14, [sp, #24] + sub r14, r2, r14 + mov r3, r14, lsl #4 + add r3, r3, r14, lsl #2 + add r0, r0, r3 /* step over the 20-byte slot of each squaring that was not computed */ + eor r2, r2, r2 + eor r3, r3, r3 + eor r4, r4, r4 + eor r5, r5, r5 + eor r6, r6, r6 + stmia r0!, {r2-r6} + stmia r0!, {r2-r6} + ldmia r1!, {r2-r5} + stmia r0, {r2-r6} + add sp, sp, #32 + ldmfd sp!, {r4-r11, lr} + mov r0, #(9*4+32) + bx lr +.ltorg +.size _gcry_poly1305_armv7_neon_init_ext,.-_gcry_poly1305_armv7_neon_init_ext; + +.globl _gcry_poly1305_armv7_neon_blocks +.type _gcry_poly1305_armv7_neon_blocks,%function; +_gcry_poly1305_armv7_neon_blocks: +.Lpoly1305_blocks_neon_local: + vmov.i32 q0, #0xffffffff + vmov.i32 d4, #1 + vsubw.u32 q0, q0, d4 + vstmdb sp!, {q4,q5,q6,q7} + stmfd sp!, {r4-r11, lr} + mov r8, sp + and sp, sp, #~63 + sub sp, sp, #192 + str r0, [sp, #108] + str r1, [sp, #112] + str r2, [sp, #116] + str r8, [sp, #120] + mov r3, r0 + mov r0, r1 + mov r1, r2 + mov r2, r3 + ldr r8, [r2, #116] + veor d15, 
d15, d15 + vorr.i32 d15, #(1 << 24) + tst r8, #2 + beq .Lpoly1305_blocks_neon_skip_shift8 + vshr.u64 d15, #32 +.Lpoly1305_blocks_neon_skip_shift8: + tst r8, #4 + beq .Lpoly1305_blocks_neon_skip_shift16 + veor d15, d15, d15 +.Lpoly1305_blocks_neon_skip_shift16: + vst1.64 d15, [sp, :64] + tst r8, #1 + bne .Lpoly1305_blocks_neon_started + vld1.64 {q0-q1}, [r0]! + vswp d1, d2 + vmovn.i64 d21, q0 + vshrn.i64 d22, q0, #26 + vshrn.u64 d24, q1, #14 + vext.8 d0, d0, d2, #4 + vext.8 d1, d1, d3, #4 + vshr.u64 q1, q1, #32 + vshrn.i64 d23, q0, #20 + vshrn.u64 d25, q1, #8 + vand.i32 d21, #0x03ffffff + vand.i32 q11, #0x03ffffff + vand.i32 q12, #0x03ffffff + orr r8, r8, #1 + sub r1, r1, #32 + str r8, [r2, #116] + vorr d25, d25, d15 + b .Lpoly1305_blocks_neon_setupr20 +.Lpoly1305_blocks_neon_started: + add r9, r2, #60 + vldm r9, {d21-d25} +.Lpoly1305_blocks_neon_setupr20: + vmov.i32 d0, #5 + tst r8, #(8|16) + beq .Lpoly1305_blocks_neon_setupr20_simple + tst r8, #(8) + beq .Lpoly1305_blocks_neon_setupr20_r_1 + mov r9, r2 + add r10, r2, #20 + vld1.64 {q9}, [r9]! + vld1.64 {q8}, [r10]! + vld1.64 {d2}, [r9] + vld1.64 {d20}, [r10] + b .Lpoly1305_blocks_neon_setupr20_hard +.Lpoly1305_blocks_neon_setupr20_r_1: + mov r9, r2 + vmov.i32 d2, #1 + vld1.64 {q8}, [r9]! 
+ veor q9, q9, q9 + vshr.u64 d2, d2, #32 + vld1.64 {d20}, [r9] +.Lpoly1305_blocks_neon_setupr20_hard: + vzip.i32 q8, q9 + vzip.i32 d20, d2 + b .Lpoly1305_blocks_neon_setups20 +.Lpoly1305_blocks_neon_setupr20_simple: + add r9, r2, #20 + vld1.64 {d2-d4}, [r9] + vdup.32 d16, d2[0] + vdup.32 d17, d2[1] + vdup.32 d18, d3[0] + vdup.32 d19, d3[1] + vdup.32 d20, d4[0] +.Lpoly1305_blocks_neon_setups20: + vmul.i32 q13, q8, d0[0] + vmov.i64 q15, 0x00000000ffffffff + vmul.i32 q14, q9, d0[0] + vshr.u64 q15, q15, #6 + cmp r1, #64 + blo .Lpoly1305_blocks_neon_try32 + add r9, sp, #16 + add r10, r2, #40 + add r11, sp, #64 + str r1, [sp, #116] + vld1.64 {d10-d12}, [r10] + vmov d14, d12 + vmul.i32 q6, q5, d0[0] +.Lpoly1305_blocks_neon_mainloop: + ldmia r0!, {r2-r5} + vmull.u32 q0, d25, d12[0] + mov r7, r2, lsr #26 + vmlal.u32 q0, d24, d12[1] + mov r8, r3, lsr #20 + ldr r6, [sp, #0] + vmlal.u32 q0, d23, d13[0] + mov r9, r4, lsr #14 + vmlal.u32 q0, d22, d13[1] + orr r6, r6, r5, lsr #8 + vmlal.u32 q0, d21, d14[0] + orr r3, r7, r3, lsl #6 + vmull.u32 q1, d25, d12[1] + orr r4, r8, r4, lsl #12 + orr r5, r9, r5, lsl #18 + vmlal.u32 q1, d24, d13[0] + ldmia r0!, {r7-r10} + vmlal.u32 q1, d23, d13[1] + mov r1, r7, lsr #26 + vmlal.u32 q1, d22, d14[0] + ldr r11, [sp, #4] + mov r12, r8, lsr #20 + vmlal.u32 q1, d21, d10[0] + mov r14, r9, lsr #14 + vmull.u32 q2, d25, d13[0] + orr r11, r11, r10, lsr #8 + orr r8, r1, r8, lsl #6 + vmlal.u32 q2, d24, d13[1] + orr r9, r12, r9, lsl #12 + vmlal.u32 q2, d23, d14[0] + orr r10, r14, r10, lsl #18 + vmlal.u32 q2, d22, d10[0] + mov r12, r3 + and r2, r2, #0x3ffffff + vmlal.u32 q2, d21, d10[1] + mov r14, r5 + vmull.u32 q3, d25, d13[1] + and r3, r7, #0x3ffffff + vmlal.u32 q3, d24, d14[0] + and r5, r8, #0x3ffffff + vmlal.u32 q3, d23, d10[0] + and r7, r9, #0x3ffffff + vmlal.u32 q3, d22, d10[1] + and r8, r14, #0x3ffffff + vmlal.u32 q3, d21, d11[0] + and r9, r10, #0x3ffffff + add r14, sp, #128 + vmull.u32 q4, d25, d14[0] + mov r10, r6 + vmlal.u32 q4, d24, d10[0] + and 
r6, r4, #0x3ffffff + vmlal.u32 q4, d23, d10[1] + and r4, r12, #0x3ffffff + vmlal.u32 q4, d22, d11[0] + stm r14, {r2-r11} + vmlal.u32 q4, d21, d11[1] + vld1.64 {d21-d24}, [r14, :256]! + vld1.64 {d25}, [r14, :64] + ldmia r0!, {r2-r5} + vmlal.u32 q0, d25, d26 + mov r7, r2, lsr #26 + vmlal.u32 q0, d24, d27 + ldr r6, [sp, #0] + mov r8, r3, lsr #20 + vmlal.u32 q0, d23, d28 + mov r9, r4, lsr #14 + vmlal.u32 q0, d22, d29 + orr r6, r6, r5, lsr #8 + vmlal.u32 q0, d21, d20 + orr r3, r7, r3, lsl #6 + vmlal.u32 q1, d25, d27 + orr r4, r8, r4, lsl #12 + orr r5, r9, r5, lsl #18 + vmlal.u32 q1, d24, d28 + ldmia r0!, {r7-r10} + vmlal.u32 q1, d23, d29 + mov r1, r7, lsr #26 + vmlal.u32 q1, d22, d20 + ldr r11, [sp, #4] + mov r12, r8, lsr #20 + vmlal.u32 q1, d21, d16 + mov r14, r9, lsr #14 + vmlal.u32 q2, d25, d28 + orr r11, r11, r10, lsr #8 + orr r8, r1, r8, lsl #6 + orr r9, r12, r9, lsl #12 + vmlal.u32 q2, d24, d29 + orr r10, r14, r10, lsl #18 + and r2, r2, #0x3ffffff + mov r12, r3 + vmlal.u32 q2, d23, d20 + mov r14, r5 + vmlal.u32 q2, d22, d16 + and r3, r7, #0x3ffffff + vmlal.u32 q2, d21, d17 + and r5, r8, #0x3ffffff + vmlal.u32 q3, d25, d29 + and r7, r9, #0x3ffffff + vmlal.u32 q3, d24, d20 + and r8, r14, #0x3ffffff + vmlal.u32 q3, d23, d16 + and r9, r10, #0x3ffffff + vmlal.u32 q3, d22, d17 + add r14, sp, #128 + vmlal.u32 q3, d21, d18 + mov r10, r6 + vmlal.u32 q4, d25, d20 + vmlal.u32 q4, d24, d16 + and r6, r4, #0x3ffffff + vmlal.u32 q4, d23, d17 + and r4, r12, #0x3ffffff + vmlal.u32 q4, d22, d18 + stm r14, {r2-r11} + vmlal.u32 q4, d21, d19 + vld1.64 {d21-d24}, [r14, :256]! 
+ vld1.64 {d25}, [r14, :64] + vaddw.u32 q0, q0, d21 + vaddw.u32 q1, q1, d22 + vaddw.u32 q2, q2, d23 + vaddw.u32 q3, q3, d24 + vaddw.u32 q4, q4, d25 + vshr.u64 q11, q0, #26 + vand q0, q0, q15 + vadd.i64 q1, q1, q11 + vshr.u64 q12, q3, #26 + vand q3, q3, q15 + vadd.i64 q4, q4, q12 + vshr.u64 q11, q1, #26 + vand q1, q1, q15 + vadd.i64 q2, q2, q11 + vshr.u64 q12, q4, #26 + vand q4, q4, q15 + vadd.i64 q0, q0, q12 + vshl.i64 q12, q12, #2 + ldr r1, [sp, #116] + vadd.i64 q0, q0, q12 + vshr.u64 q11, q2, #26 + vand q2, q2, q15 + vadd.i64 q3, q3, q11 + sub r1, #64 + vshr.u64 q12, q0, #26 + vand q0, q0, q15 + vadd.i64 q1, q1, q12 + cmp r1, #64 + vshr.u64 q11, q3, #26 + vand q3, q3, q15 + vadd.i64 q4, q4, q11 + vmovn.i64 d21, q0 + str r1, [sp, #116] + vmovn.i64 d22, q1 + vmovn.i64 d23, q2 + vmovn.i64 d24, q3 + vmovn.i64 d25, q4 + bhs .Lpoly1305_blocks_neon_mainloop +.Lpoly1305_blocks_neon_try32: + cmp r1, #32 + blo .Lpoly1305_blocks_neon_done + tst r0, r0 + bne .Lpoly1305_blocks_loadm32 + veor q0, q0, q0 + veor q1, q1, q1 + veor q2, q2, q2 + veor q3, q3, q3 + veor q4, q4, q4 + b .Lpoly1305_blocks_continue32 +.Lpoly1305_blocks_loadm32: + vld1.64 {q0-q1}, [r0]! 
+ veor q4, q4, q4 + vswp d1, d2 + veor q3, q3, q3 + vtrn.32 q0, q4 + vtrn.32 q1, q3 + vshl.i64 q2, q1, #12 + vshl.i64 q3, q3, #18 + vshl.i64 q1, q4, #6 + vmovl.u32 q4, d15 +.Lpoly1305_blocks_continue32: + vmlal.u32 q0, d25, d26 + vmlal.u32 q0, d24, d27 + vmlal.u32 q0, d23, d28 + vmlal.u32 q0, d22, d29 + vmlal.u32 q0, d21, d20 + vmlal.u32 q1, d25, d27 + vmlal.u32 q1, d24, d28 + vmlal.u32 q1, d23, d29 + vmlal.u32 q1, d22, d20 + vmlal.u32 q1, d21, d16 + vmlal.u32 q2, d25, d28 + vmlal.u32 q2, d24, d29 + vmlal.u32 q2, d23, d20 + vmlal.u32 q2, d22, d16 + vmlal.u32 q2, d21, d17 + vmlal.u32 q3, d25, d29 + vmlal.u32 q3, d24, d20 + vmlal.u32 q3, d23, d16 + vmlal.u32 q3, d22, d17 + vmlal.u32 q3, d21, d18 + vmlal.u32 q4, d25, d20 + vmlal.u32 q4, d24, d16 + vmlal.u32 q4, d23, d17 + vmlal.u32 q4, d22, d18 + vmlal.u32 q4, d21, d19 + vshr.u64 q11, q0, #26 + vand q0, q0, q15 + vadd.i64 q1, q1, q11 + vshr.u64 q12, q3, #26 + vand q3, q3, q15 + vadd.i64 q4, q4, q12 + vshr.u64 q11, q1, #26 + vand q1, q1, q15 + vadd.i64 q2, q2, q11 + vshr.u64 q12, q4, #26 + vand q4, q4, q15 + vadd.i64 q0, q0, q12 + vshl.i64 q12, q12, #2 + vadd.i64 q0, q0, q12 + vshr.u64 q11, q2, #26 + vand q2, q2, q15 + vadd.i64 q3, q3, q11 + vshr.u64 q12, q0, #26 + vand q0, q0, q15 + vadd.i64 q1, q1, q12 + vshr.u64 q11, q3, #26 + vand q3, q3, q15 + vadd.i64 q4, q4, q11 + vmovn.i64 d21, q0 + vmovn.i64 d22, q1 + vmovn.i64 d23, q2 + vmovn.i64 d24, q3 + vmovn.i64 d25, q4 +.Lpoly1305_blocks_neon_done: + tst r0, r0 + beq .Lpoly1305_blocks_neon_final + ldr r2, [sp, #108] + add r2, r2, #60 + vst1.64 {d21}, [r2]! 
+ vst1.64 {d22-d25}, [r2] + b .Lpoly1305_blocks_neon_leave +.Lpoly1305_blocks_neon_final: + vadd.u32 d10, d0, d1 + vadd.u32 d13, d2, d3 + vadd.u32 d11, d4, d5 + ldr r5, [sp, #108] + vadd.u32 d14, d6, d7 + vadd.u32 d12, d8, d9 + vtrn.32 d10, d13 + vtrn.32 d11, d14 + vst1.64 {d10-d12}, [sp] + ldm sp, {r0-r4} + mov r12, r0, lsr #26 + and r0, r0, #0x3ffffff + add r1, r1, r12 + mov r12, r1, lsr #26 + and r1, r1, #0x3ffffff + add r2, r2, r12 + mov r12, r2, lsr #26 + and r2, r2, #0x3ffffff + add r3, r3, r12 + mov r12, r3, lsr #26 + and r3, r3, #0x3ffffff + add r4, r4, r12 + mov r12, r4, lsr #26 + and r4, r4, #0x3ffffff + add r12, r12, r12, lsl #2 + add r0, r0, r12 + mov r12, r0, lsr #26 + and r0, r0, #0x3ffffff + add r1, r1, r12 + mov r12, r1, lsr #26 + and r1, r1, #0x3ffffff + add r2, r2, r12 + mov r12, r2, lsr #26 + and r2, r2, #0x3ffffff + add r3, r3, r12 + mov r12, r3, lsr #26 + and r3, r3, #0x3ffffff + add r4, r4, r12 + mov r12, r4, lsr #26 + and r4, r4, #0x3ffffff + add r12, r12, r12, lsl #2 + add r0, r0, r12 + mov r12, r0, lsr #26 + and r0, r0, #0x3ffffff + add r1, r1, r12 + add r6, r0, #5 + mov r12, r6, lsr #26 + and r6, r6, #0x3ffffff + add r7, r1, r12 + mov r12, r7, lsr #26 + and r7, r7, #0x3ffffff + add r10, r2, r12 + mov r12, r10, lsr #26 + and r10, r10, #0x3ffffff + add r11, r3, r12 + mov r12, #-(1 << 26) + add r12, r12, r11, lsr #26 + and r11, r11, #0x3ffffff + add r14, r4, r12 + mov r12, r14, lsr #31 + sub r12, #1 + and r6, r6, r12 + and r7, r7, r12 + and r10, r10, r12 + and r11, r11, r12 + and r14, r14, r12 + mvn r12, r12 + and r0, r0, r12 + and r1, r1, r12 + and r2, r2, r12 + and r3, r3, r12 + and r4, r4, r12 + orr r0, r0, r6 + orr r1, r1, r7 + orr r2, r2, r10 + orr r3, r3, r11 + orr r4, r4, r14 + orr r0, r0, r1, lsl #26 + lsr r1, r1, #6 + orr r1, r1, r2, lsl #20 + lsr r2, r2, #12 + orr r2, r2, r3, lsl #14 + lsr r3, r3, #18 + orr r3, r3, r4, lsl #8 + add r5, r5, #60 + stm r5, {r0-r3} +.Lpoly1305_blocks_neon_leave: + sub r0, sp, #8 + ldr sp, [sp, #120] + 
ldmfd sp!, {r4-r11, lr} + vldm sp!, {q4-q7} + sub r0, sp, r0 + bx lr +.size _gcry_poly1305_armv7_neon_blocks,.-_gcry_poly1305_armv7_neon_blocks; + +.globl _gcry_poly1305_armv7_neon_finish_ext +.type _gcry_poly1305_armv7_neon_finish_ext,%function; +_gcry_poly1305_armv7_neon_finish_ext: +.Lpoly1305_finish_ext_neon_local: + stmfd sp!, {r4-r11, lr} + sub sp, sp, #32 + mov r5, r0 + mov r6, r1 + mov r7, r2 + mov r8, r3 + ands r7, r7, r7 + beq .Lpoly1305_finish_ext_neon_noremaining + mov r9, sp + veor q0, q0, q0 + veor q1, q1, q1 + vst1.64 {q0-q1}, [sp] + tst r7, #16 + beq .Lpoly1305_finish_ext_neon_skip16 + vld1.u64 {q0}, [r1]! + vst1.64 {q0}, [r9]! +.Lpoly1305_finish_ext_neon_skip16: + tst r7, #8 + beq .Lpoly1305_finish_ext_neon_skip8 + ldmia r1!, {r10-r11} + stmia r9!, {r10-r11} +.Lpoly1305_finish_ext_neon_skip8: + tst r7, #4 + beq .Lpoly1305_finish_ext_neon_skip4 + ldr r10, [r1], #4 + str r10, [r9], #4 +.Lpoly1305_finish_ext_neon_skip4: + tst r7, #2 + beq .Lpoly1305_finish_ext_neon_skip2 + ldrh r10, [r1], #2 + strh r10, [r9], #2 +.Lpoly1305_finish_ext_neon_skip2: + tst r7, #1 + beq .Lpoly1305_finish_ext_neon_skip1 + ldrb r10, [r1], #1 + strb r10, [r9], #1 +.Lpoly1305_finish_ext_neon_skip1: + cmp r7, #16 + beq .Lpoly1305_finish_ext_neon_skipfinalbit + mov r10, #1 + strb r10, [r9] +.Lpoly1305_finish_ext_neon_skipfinalbit: + ldr r10, [r5, #116] + orrhs r10, #2 + orrlo r10, #4 + str r10, [r5, #116] + mov r0, r5 + mov r1, sp + mov r2, #32 + bl .Lpoly1305_blocks_neon_local +.Lpoly1305_finish_ext_neon_noremaining: + ldr r10, [r5, #116] + tst r10, #1 + beq .Lpoly1305_finish_ext_neon_notstarted + cmp r7, #0 + beq .Lpoly1305_finish_ext_neon_user2r + cmp r7, #16 + bls .Lpoly1305_finish_ext_neon_user1 +.Lpoly1305_finish_ext_neon_user2r: + orr r10, r10, #8 + b .Lpoly1305_finish_ext_neon_finalblock +.Lpoly1305_finish_ext_neon_user1: + orr r10, r10, #16 +.Lpoly1305_finish_ext_neon_finalblock: + str r10, [r5, #116] + mov r0, r5 + eor r1, r1, r1 + mov r2, #32 + bl 
.Lpoly1305_blocks_neon_local +.Lpoly1305_finish_ext_neon_notstarted: + add r0, r5, #60 + add r9, r5, #100 + ldm r0, {r0-r3} + ldm r9, {r9-r12} + adds r0, r0, r9 + adcs r1, r1, r10 + adcs r2, r2, r11 + adcs r3, r3, r12 + stm r8, {r0-r3} + veor q0, q0, q0 + veor q1, q1, q1 + veor q2, q2, q2 + veor q3, q3, q3 + vstmia r5!, {q0-q3} + vstm r5, {q0-q3} + add sp, sp, #32 + ldmfd sp!, {r4-r11, lr} + mov r0, #(9*4+32) + bx lr +.size _gcry_poly1305_armv7_neon_finish_ext,.-_gcry_poly1305_armv7_neon_finish_ext; + +#endif diff --git a/cipher/poly1305-internal.h b/cipher/poly1305-internal.h index 0299c43..dfc0c04 100644 --- a/cipher/poly1305-internal.h +++ b/cipher/poly1305-internal.h @@ -65,10 +65,24 @@ #endif +/* POLY1305_USE_NEON indicates whether to enable ARM NEON assembly code. */ +#undef POLY1305_USE_NEON +#if defined(ENABLE_NEON_SUPPORT) && defined(HAVE_ARM_ARCH_V6) && \ + defined(__ARMEL__) && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_NEON) +# define POLY1305_USE_NEON 1 +# define POLY1305_NEON_BLOCKSIZE 32 +# define POLY1305_NEON_STATESIZE 128 +# define POLY1305_NEON_ALIGNMENT 16 +#endif + + /* Largest block-size used in any implementation (optimized implementations * might use block-size multiple of 16). */ #ifdef POLY1305_USE_AVX2 # define POLY1305_LARGEST_BLOCKSIZE POLY1305_AVX2_BLOCKSIZE +#elif defined(POLY1305_USE_NEON) +# define POLY1305_LARGEST_BLOCKSIZE POLY1305_NEON_BLOCKSIZE #elif defined(POLY1305_USE_SSE2) # define POLY1305_LARGEST_BLOCKSIZE POLY1305_SSE2_BLOCKSIZE #else @@ -78,6 +92,8 @@ /* Largest state-size used in any implementation. */ #ifdef POLY1305_USE_AVX2 # define POLY1305_LARGEST_STATESIZE POLY1305_AVX2_STATESIZE +#elif defined(POLY1305_USE_NEON) +# define POLY1305_LARGEST_STATESIZE POLY1305_NEON_STATESIZE #elif defined(POLY1305_USE_SSE2) # define POLY1305_LARGEST_STATESIZE POLY1305_SSE2_STATESIZE #else @@ -87,6 +103,8 @@ /* Minimum alignment for state pointer passed to implementations. 
*/ #ifdef POLY1305_USE_AVX2 # define POLY1305_STATE_ALIGNMENT POLY1305_AVX2_ALIGNMENT +#elif defined(POLY1305_USE_NEON) +# define POLY1305_STATE_ALIGNMENT POLY1305_NEON_ALIGNMENT #elif defined(POLY1305_USE_SSE2) # define POLY1305_STATE_ALIGNMENT POLY1305_SSE2_ALIGNMENT #else diff --git a/cipher/poly1305.c b/cipher/poly1305.c index fe241c1..28dbbf8 100644 --- a/cipher/poly1305.c +++ b/cipher/poly1305.c @@ -76,6 +76,25 @@ static const poly1305_ops_t poly1305_amd64_avx2_ops = { #endif +#ifdef POLY1305_USE_NEON + +void _gcry_poly1305_armv7_neon_init_ext(void *state, const poly1305_key_t *key); +unsigned int _gcry_poly1305_armv7_neon_finish_ext(void *state, const byte *m, + size_t remaining, + byte mac[16]); +unsigned int _gcry_poly1305_armv7_neon_blocks(void *ctx, const byte *m, + size_t bytes); + +static const poly1305_ops_t poly1305_armv7_neon_ops = { + POLY1305_NEON_BLOCKSIZE, + _gcry_poly1305_armv7_neon_init_ext, + _gcry_poly1305_armv7_neon_blocks, + _gcry_poly1305_armv7_neon_finish_ext +}; + +#endif + + #ifdef HAVE_U64_TYPEDEF /* Reference unoptimized poly1305 implementation using 32 bit * 32 bit = 64 bit @@ -661,6 +680,10 @@ _gcry_poly1305_init (poly1305_context_t * ctx, const byte * key, if (features & HWF_INTEL_AVX2) ctx->ops = &poly1305_amd64_avx2_ops; #endif +#ifdef POLY1305_USE_NEON + if (features & HWF_ARM_NEON) + ctx->ops = &poly1305_armv7_neon_ops; +#endif (void)features; buf_cpy (keytmp.b, key, POLY1305_KEYLEN); diff --git a/configure.ac b/configure.ac index 60ed015..a0d5fc9 100644 --- a/configure.ac +++ b/configure.ac @@ -1837,6 +1837,11 @@ case "${host}" in ;; esac +if test x"$neonsupport" = xyes ; then + # Build with the NEON implementation + GCRYPT_CIPHERS="$GCRYPT_CIPHERS poly1305-armv7-neon.lo" +fi + LIST_MEMBER(dsa, $enabled_pubkey_ciphers) if test "$found" = "1" ; then GCRYPT_PUBKEY_CIPHERS="$GCRYPT_PUBKEY_CIPHERS dsa.lo" From wk at gnupg.org Sun Nov 2 22:59:56 2014 From: wk at gnupg.org (Werner Koch) Date: Sun, 02 Nov 2014 22:59:56 +0100 
Subject: [PATCH 3/3] Disable NEON for CPUs that are known to have broken NEON implementation In-Reply-To: <20141102165245.18119.39131.stgit@localhost6.localdomain6> (Jussi Kivilinna's message of "Sun, 02 Nov 2014 18:52:45 +0200") References: <20141102165235.18119.67152.stgit@localhost6.localdomain6> <20141102165245.18119.39131.stgit@localhost6.localdomain6> Message-ID: <87zjc9tg9v.fsf@vigenere.g10code.de> On Sun, 2 Nov 2014 17:52, jussi.kivilinna at iki.fi said: > + /* Find CPU version information. */ > + for (i = 0; i < sizeof(cpu_entries) / sizeof(cpu_entries[0]); i++) Please use for (i = 0; i < DIM(cpu_entries); i++) for easier reading. > + if (cpu_implementer == 0x51 && > + cpu_arch == 7 && > + cpu_variant == 1 && > + cpu_part == 0x4d && > + cpu_revision == 0) The GNU coding standards like to have it this way: if (cpu_implementer == 0x51 && cpu_arch == 7 && cpu_variant == 1 && cpu_part == 0x4d && cpu_revision == 0) (I know that the first line does not align nicely but this rule is used everywhere else and we should stick to it.) Shalom-Salam, Werner -- Die Gedanken sind frei. Ausnahmen regelt ein Bundesgesetz. From jussi.kivilinna at iki.fi Mon Nov 3 16:41:26 2014 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Mon, 03 Nov 2014 17:41:26 +0200 Subject: [PATCH 3/3] Disable NEON for CPUs that are known to have broken NEON implementation In-Reply-To: <87zjc9tg9v.fsf@vigenere.g10code.de> References: <20141102165235.18119.67152.stgit@localhost6.localdomain6> <20141102165245.18119.39131.stgit@localhost6.localdomain6> <87zjc9tg9v.fsf@vigenere.g10code.de> Message-ID: <5457A226.7080804@iki.fi> On 02.11.2014 23:59, Werner Koch wrote: > On Sun, 2 Nov 2014 17:52, jussi.kivilinna at iki.fi said: > >> + /* Find CPU version information. */ >> + for (i = 0; i < sizeof(cpu_entries) / sizeof(cpu_entries[0]); i++) > > Please use > > for (i = 0; i < DIM(cpu_entries); i++) > > for easier reading. 
> >> + if (cpu_implementer == 0x51 && >> + cpu_arch == 7 && >> + cpu_variant == 1 && >> + cpu_part == 0x4d && >> + cpu_revision == 0) > > The GNU coding standards like to have it this way: > > if (cpu_implementer == 0x51 > && cpu_arch == 7 > && cpu_variant == 1 > && cpu_part == 0x4d > && cpu_revision == 0) > > (I know that the first line does not align nicely but this rule is > used everywhere else and we should stick to it.) Ok, I'll make the changes. -Jussi > > > Shalom-Salam, > > Werner > > From cvs at cvs.gnupg.org Wed Nov 5 17:13:44 2014 From: cvs at cvs.gnupg.org (by Jussi Kivilinna) Date: Wed, 05 Nov 2014 17:13:44 +0100 Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.6.0-125-g95eef21 Message-ID: This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "The GNU crypto library". The branch, master has been updated via 95eef21583d8e998efc48f22898c1ae31b77cb48 (commit) via 0b520128551054d83fb0bb2db8873394f38de498 (commit) via c584f44543883346d5a565581ff99a0afce9c5e1 (commit) from 669a83ba86c38b271d85ed4bf1cabc7cc8160583 (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. - Log ----------------------------------------------------------------- commit 95eef21583d8e998efc48f22898c1ae31b77cb48 Author: Jussi Kivilinna Date: Sun Nov 2 17:45:35 2014 +0200 Disable NEON for CPUs that are known to have broken NEON implementation * src/hwf-arm.c (detect_arm_proc_cpuinfo): Add parsing for CPU version information and check if CPU is known to have broken NEON implementation. (_gcry_hwf_detect_arm): Filter out broken HW features. 
-- Signed-off-by: Jussi Kivilinna diff --git a/src/hwf-arm.c b/src/hwf-arm.c index dbbb607..3dc050e 100644 --- a/src/hwf-arm.c +++ b/src/hwf-arm.c @@ -98,17 +98,32 @@ detect_arm_at_hwcap(void) #define HAS_PROC_CPUINFO 1 static unsigned int -detect_arm_proc_cpuinfo(void) +detect_arm_proc_cpuinfo(unsigned int *broken_hwfs) { char buf[1024]; /* large enough */ char *str_features, *str_neon; + int cpu_implementer, cpu_arch, cpu_variant, cpu_part, cpu_revision; FILE *f; int readlen, i; static int cpuinfo_initialized = 0; static unsigned int stored_cpuinfo_features; + static unsigned int stored_broken_hwfs; + struct { + const char *name; + int *value; + } cpu_entries[5] = { + { "CPU implementer", &cpu_implementer }, + { "CPU architecture", &cpu_arch }, + { "CPU variant", &cpu_variant }, + { "CPU part", &cpu_part }, + { "CPU revision", &cpu_revision }, + }; if (cpuinfo_initialized) - return stored_cpuinfo_features; + { + *broken_hwfs |= stored_broken_hwfs; + return stored_cpuinfo_features; + } f = fopen("/proc/cpuinfo", "r"); if (!f) @@ -124,12 +139,32 @@ detect_arm_proc_cpuinfo(void) cpuinfo_initialized = 1; stored_cpuinfo_features = 0; + stored_broken_hwfs = 0; /* Find features line. */ str_features = strstr(buf, "Features"); if (!str_features) return stored_cpuinfo_features; + /* Find CPU version information. */ + for (i = 0; i < DIM(cpu_entries); i++) + { + char *str; + + *cpu_entries[i].value = -1; + + str = strstr(buf, cpu_entries[i].name); + if (!str) + continue; + + str = strstr(str, ": "); + if (!str) + continue; + + str += 2; + *cpu_entries[i].value = strtoul(str, NULL, 0); + } + /* Lines to strings. */ for (i = 0; i < sizeof(buf); i++) if (buf[i] == '\n') @@ -140,6 +175,19 @@ detect_arm_proc_cpuinfo(void) if (str_neon && (str_neon[5] == ' ' || str_neon[5] == '\0')) stored_cpuinfo_features |= HWF_ARM_NEON; + /* Check for CPUs with broken NEON implementation. 
See + * https://code.google.com/p/chromium/issues/detail?id=341598 + */ + if (cpu_implementer == 0x51 + && cpu_arch == 7 + && cpu_variant == 1 + && cpu_part == 0x4d + && cpu_revision == 0) + { + stored_broken_hwfs = HWF_ARM_NEON; + } + + *broken_hwfs |= stored_broken_hwfs; return stored_cpuinfo_features; } @@ -149,18 +197,21 @@ unsigned int _gcry_hwf_detect_arm (void) { unsigned int ret = 0; + unsigned int broken_hwfs = 0; #if defined (HAS_SYS_AT_HWCAP) ret |= detect_arm_at_hwcap (); #endif #if defined (HAS_PROC_CPUINFO) - ret |= detect_arm_proc_cpuinfo (); + ret |= detect_arm_proc_cpuinfo (&broken_hwfs); #endif #if defined(__ARM_NEON__) && defined(ENABLE_NEON_SUPPORT) ret |= HWF_ARM_NEON; #endif + ret &= ~broken_hwfs; + return ret; } commit 0b520128551054d83fb0bb2db8873394f38de498 Author: Jussi Kivilinna Date: Sun Nov 2 16:01:11 2014 +0200 Add ARM/NEON implementation of Poly1305 * cipher/Makefile.am: Add 'poly1305-armv7-neon.S'. * cipher/poly1305-armv7-neon.S: New. * cipher/poly1305-internal.h (POLY1305_USE_NEON) (POLY1305_NEON_BLOCKSIZE, POLY1305_NEON_STATESIZE) (POLY1305_NEON_ALIGNMENT): New. * cipher/poly1305.c [POLY1305_USE_NEON] (_gcry_poly1305_armv7_neon_init_ext) (_gcry_poly1305_armv7_neon_finish_ext) (_gcry_poly1305_armv7_neon_blocks, poly1305_armv7_neon_ops): New. (_gcry_poly1305_init) [POLY1305_USE_NEON]: Select NEON implementation if HWF_ARM_NEON set. * configure.ac [neonsupport=yes]: Add 'poly1305-armv7-neon.lo'. -- Add Andrew Moon's public domain NEON implementation of Poly1305. 
Original source is available at: https://github.com/floodyberry/poly1305-opt Benchmark on Cortex-A8 (--cpu-mhz 1008): Old: | nanosecs/byte mebibytes/sec cycles/byte POLY1305 | 12.34 ns/B 77.27 MiB/s 12.44 c/B New: | nanosecs/byte mebibytes/sec cycles/byte POLY1305 | 2.12 ns/B 450.7 MiB/s 2.13 c/B Signed-off-by: Jussi Kivilinna diff --git a/cipher/Makefile.am b/cipher/Makefile.am index 09ccaf9..22018b3 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -73,7 +73,7 @@ gost28147.c gost.h \ gostr3411-94.c \ md4.c \ md5.c \ -poly1305-sse2-amd64.S poly1305-avx2-amd64.S \ +poly1305-sse2-amd64.S poly1305-avx2-amd64.S poly1305-armv7-neon.S \ rijndael.c rijndael-tables.h rijndael-amd64.S rijndael-arm.S \ rmd160.c \ rsa.c \ diff --git a/cipher/poly1305-armv7-neon.S b/cipher/poly1305-armv7-neon.S new file mode 100644 index 0000000..1134e85 --- /dev/null +++ b/cipher/poly1305-armv7-neon.S @@ -0,0 +1,705 @@ +/* poly1305-armv7-neon.S - ARMv7/NEON implementation of Poly1305 + * + * Copyright (C) 2014 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */ + +/* + * Based on public domain implementation by Andrew Moon at + * https://github.com/floodyberry/poly1305-opt + */ + +#include <config.h> + +#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) && \ + defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_NEON) + +.syntax unified +.fpu neon +.arm + +.text + +.p2align 2 +.Lpoly1305_init_constants_neon: +.long 0x3ffff03 +.long 0x3ffc0ff +.long 0x3f03fff +.long 0x00fffff + +.globl _gcry_poly1305_armv7_neon_init_ext +.type _gcry_poly1305_armv7_neon_init_ext,%function; +_gcry_poly1305_armv7_neon_init_ext: /* In: r0 = state, r1 = 32-byte key. Stores the first 16 key bytes, split into five 26-bit limbs and masked with the constants above, at state+0; the square of that value at state+20 and the square of the square at state+40; zeroes 40 bytes at state+60; copies key bytes 16..31 to state+100 and clears the word at state+116. Returns 9*4+32 in r0. */ +.Lpoly1305_init_ext_neon_local: + stmfd sp!, {r4-r11, lr} + sub sp, sp, #32 + /* r14 is the length hint tested against 16 and 64 below; the C prototype + * passes none (r2 is undefined on entry), so always use the maximum. */ + mov r14, #-1 + ldmia r1!, {r2-r5} + adr r7, .Lpoly1305_init_constants_neon /* PC-relative: no absolute address (text relocation) in .text */ + mov r6, r2 + mov r8, r2, lsr #26 + mov r9, r3, lsr #20 + mov r10, r4, lsr #14 + mov r11, r5, lsr #8 + orr r8, r8, r3, lsl #6 + orr r9, r9, r4, lsl #12 + orr r10, r10, r5, lsl #18 + ldmia r7, {r2-r5} + and r2, r2, r8 + and r3, r3, r9 + and r4, r4, r10 + and r5, r5, r11 + and r6, r6, 0x3ffffff + stmia r0!, {r2-r6} + eor r8, r8, r8 + str r8, [sp, #24] +.Lpoly1305_init_ext_neon_squareloop: + ldr r8, [sp, #24] + mov r12, #16 + cmp r8, #2 + beq .Lpoly1305_init_ext_neon_donesquaring + cmp r8, #1 + moveq r12, #64 + cmp r14, r12 + bls .Lpoly1305_init_ext_neon_donesquaring + add r8, #1 + str r8, [sp, #24] + mov r6, r6, lsl #1 + mov r2, r2, lsl #1 + umull r7, r8, r3, r3 + umull r9, r10, r6, r4 + umlal r7, r8, r6, r5 + umlal r9, r10, r2, r3 + add r11, r5, r5, lsl #2 + umlal r7, r8, r2, r4 + umlal r9, r10, r5, r11 + str r7, [sp, #16] + str r8, [sp, #20] + mov r2, r2, lsr #1 + mov r5, r5, lsl #1 + str r9, [sp, #8] + str r10, [sp, #12] + umull r7, r8, r2, r2 + umull r9, r10, r6, r2 + add r11, r3, r3, lsl #2 + add r12, r4, r4, lsl #2 + umlal r7, r8, r6, r3 + umlal r9, r10, r5, r11 + umlal r7, r8, r5, r12 + umlal r9, r10, r4, r12 + mov r6, r6, lsr #1 + mov r3, r3, lsl #1 + add r11, r2, r2, lsl #2 + str r7, [sp, #0] + str r8,
[sp, #4] + umull r7, r8, r6, r6 + umlal r7, r8, r3, r12 + umlal r7, r8, r5, r11 + and r6, r7, 0x3ffffff + mov r11, r7, lsr #26 + orr r11, r11, r8, lsl #6 + ldr r7, [sp, #0] + ldr r8, [sp, #4] + adds r9, r9, r11 + adc r10, r10, #0 + and r2, r9, 0x3ffffff + mov r11, r9, lsr #26 + orr r11, r11, r10, lsl #6 + ldr r9, [sp, #8] + ldr r10, [sp, #12] + adds r7, r7, r11 + adc r8, r8, #0 + and r3, r7, 0x3ffffff + mov r11, r7, lsr #26 + orr r11, r11, r8, lsl #6 + ldr r7, [sp, #16] + ldr r8, [sp, #20] + adds r9, r9, r11 + adc r10, r10, #0 + and r4, r9, 0x3ffffff + mov r11, r9, lsr #26 + orr r11, r11, r10, lsl #6 + adds r7, r7, r11 + adc r8, r8, #0 + and r5, r7, 0x3ffffff + mov r11, r7, lsr #26 + orr r11, r11, r8, lsl #6 + add r11, r11, r11, lsl #2 + add r6, r6, r11 + mov r11, r6, lsr #26 + and r6, r6, 0x3ffffff + add r2, r2, r11 + stmia r0!, {r2-r6} + b .Lpoly1305_init_ext_neon_squareloop +.Lpoly1305_init_ext_neon_donesquaring: + mov r2, #2 + ldr r14, [sp, #24] + sub r14, r2, r14 + mov r3, r14, lsl #4 + add r3, r3, r14, lsl #2 + add r0, r0, r3 + eor r2, r2, r2 + eor r3, r3, r3 + eor r4, r4, r4 + eor r5, r5, r5 + eor r6, r6, r6 + stmia r0!, {r2-r6} + stmia r0!, {r2-r6} + ldmia r1!, {r2-r5} + stmia r0, {r2-r6} + add sp, sp, #32 + ldmfd sp!, {r4-r11, lr} + mov r0, #(9*4+32) + bx lr +.ltorg +.size _gcry_poly1305_armv7_neon_init_ext,.-_gcry_poly1305_armv7_neon_init_ext; + +.globl _gcry_poly1305_armv7_neon_blocks +.type _gcry_poly1305_armv7_neon_blocks,%function; +_gcry_poly1305_armv7_neon_blocks: +.Lpoly1305_blocks_neon_local: + vmov.i32 q0, #0xffffffff + vmov.i32 d4, #1 + vsubw.u32 q0, q0, d4 + vstmdb sp!, {q4,q5,q6,q7} + stmfd sp!, {r4-r11, lr} + mov r8, sp + and sp, sp, #~63 + sub sp, sp, #192 + str r0, [sp, #108] + str r1, [sp, #112] + str r2, [sp, #116] + str r8, [sp, #120] + mov r3, r0 + mov r0, r1 + mov r1, r2 + mov r2, r3 + ldr r8, [r2, #116] + veor d15, d15, d15 + vorr.i32 d15, #(1 << 24) + tst r8, #2 + beq .Lpoly1305_blocks_neon_skip_shift8 + vshr.u64 d15, #32
+.Lpoly1305_blocks_neon_skip_shift8: + tst r8, #4 + beq .Lpoly1305_blocks_neon_skip_shift16 + veor d15, d15, d15 +.Lpoly1305_blocks_neon_skip_shift16: + vst1.64 d15, [sp, :64] + tst r8, #1 + bne .Lpoly1305_blocks_neon_started + vld1.64 {q0-q1}, [r0]! + vswp d1, d2 + vmovn.i64 d21, q0 + vshrn.i64 d22, q0, #26 + vshrn.u64 d24, q1, #14 + vext.8 d0, d0, d2, #4 + vext.8 d1, d1, d3, #4 + vshr.u64 q1, q1, #32 + vshrn.i64 d23, q0, #20 + vshrn.u64 d25, q1, #8 + vand.i32 d21, #0x03ffffff + vand.i32 q11, #0x03ffffff + vand.i32 q12, #0x03ffffff + orr r8, r8, #1 + sub r1, r1, #32 + str r8, [r2, #116] + vorr d25, d25, d15 + b .Lpoly1305_blocks_neon_setupr20 +.Lpoly1305_blocks_neon_started: + add r9, r2, #60 + vldm r9, {d21-d25} +.Lpoly1305_blocks_neon_setupr20: + vmov.i32 d0, #5 + tst r8, #(8|16) + beq .Lpoly1305_blocks_neon_setupr20_simple + tst r8, #(8) + beq .Lpoly1305_blocks_neon_setupr20_r_1 + mov r9, r2 + add r10, r2, #20 + vld1.64 {q9}, [r9]! + vld1.64 {q8}, [r10]! + vld1.64 {d2}, [r9] + vld1.64 {d20}, [r10] + b .Lpoly1305_blocks_neon_setupr20_hard +.Lpoly1305_blocks_neon_setupr20_r_1: + mov r9, r2 + vmov.i32 d2, #1 + vld1.64 {q8}, [r9]! 
+ veor q9, q9, q9 + vshr.u64 d2, d2, #32 + vld1.64 {d20}, [r9] +.Lpoly1305_blocks_neon_setupr20_hard: + vzip.i32 q8, q9 + vzip.i32 d20, d2 + b .Lpoly1305_blocks_neon_setups20 +.Lpoly1305_blocks_neon_setupr20_simple: + add r9, r2, #20 + vld1.64 {d2-d4}, [r9] + vdup.32 d16, d2[0] + vdup.32 d17, d2[1] + vdup.32 d18, d3[0] + vdup.32 d19, d3[1] + vdup.32 d20, d4[0] +.Lpoly1305_blocks_neon_setups20: + vmul.i32 q13, q8, d0[0] + vmov.i64 q15, 0x00000000ffffffff + vmul.i32 q14, q9, d0[0] + vshr.u64 q15, q15, #6 + cmp r1, #64 + blo .Lpoly1305_blocks_neon_try32 + add r9, sp, #16 + add r10, r2, #40 + add r11, sp, #64 + str r1, [sp, #116] + vld1.64 {d10-d12}, [r10] + vmov d14, d12 + vmul.i32 q6, q5, d0[0] +.Lpoly1305_blocks_neon_mainloop: + ldmia r0!, {r2-r5} + vmull.u32 q0, d25, d12[0] + mov r7, r2, lsr #26 + vmlal.u32 q0, d24, d12[1] + mov r8, r3, lsr #20 + ldr r6, [sp, #0] + vmlal.u32 q0, d23, d13[0] + mov r9, r4, lsr #14 + vmlal.u32 q0, d22, d13[1] + orr r6, r6, r5, lsr #8 + vmlal.u32 q0, d21, d14[0] + orr r3, r7, r3, lsl #6 + vmull.u32 q1, d25, d12[1] + orr r4, r8, r4, lsl #12 + orr r5, r9, r5, lsl #18 + vmlal.u32 q1, d24, d13[0] + ldmia r0!, {r7-r10} + vmlal.u32 q1, d23, d13[1] + mov r1, r7, lsr #26 + vmlal.u32 q1, d22, d14[0] + ldr r11, [sp, #4] + mov r12, r8, lsr #20 + vmlal.u32 q1, d21, d10[0] + mov r14, r9, lsr #14 + vmull.u32 q2, d25, d13[0] + orr r11, r11, r10, lsr #8 + orr r8, r1, r8, lsl #6 + vmlal.u32 q2, d24, d13[1] + orr r9, r12, r9, lsl #12 + vmlal.u32 q2, d23, d14[0] + orr r10, r14, r10, lsl #18 + vmlal.u32 q2, d22, d10[0] + mov r12, r3 + and r2, r2, #0x3ffffff + vmlal.u32 q2, d21, d10[1] + mov r14, r5 + vmull.u32 q3, d25, d13[1] + and r3, r7, #0x3ffffff + vmlal.u32 q3, d24, d14[0] + and r5, r8, #0x3ffffff + vmlal.u32 q3, d23, d10[0] + and r7, r9, #0x3ffffff + vmlal.u32 q3, d22, d10[1] + and r8, r14, #0x3ffffff + vmlal.u32 q3, d21, d11[0] + and r9, r10, #0x3ffffff + add r14, sp, #128 + vmull.u32 q4, d25, d14[0] + mov r10, r6 + vmlal.u32 q4, d24, d10[0] + and 
r6, r4, #0x3ffffff + vmlal.u32 q4, d23, d10[1] + and r4, r12, #0x3ffffff + vmlal.u32 q4, d22, d11[0] + stm r14, {r2-r11} + vmlal.u32 q4, d21, d11[1] + vld1.64 {d21-d24}, [r14, :256]! + vld1.64 {d25}, [r14, :64] + ldmia r0!, {r2-r5} + vmlal.u32 q0, d25, d26 + mov r7, r2, lsr #26 + vmlal.u32 q0, d24, d27 + ldr r6, [sp, #0] + mov r8, r3, lsr #20 + vmlal.u32 q0, d23, d28 + mov r9, r4, lsr #14 + vmlal.u32 q0, d22, d29 + orr r6, r6, r5, lsr #8 + vmlal.u32 q0, d21, d20 + orr r3, r7, r3, lsl #6 + vmlal.u32 q1, d25, d27 + orr r4, r8, r4, lsl #12 + orr r5, r9, r5, lsl #18 + vmlal.u32 q1, d24, d28 + ldmia r0!, {r7-r10} + vmlal.u32 q1, d23, d29 + mov r1, r7, lsr #26 + vmlal.u32 q1, d22, d20 + ldr r11, [sp, #4] + mov r12, r8, lsr #20 + vmlal.u32 q1, d21, d16 + mov r14, r9, lsr #14 + vmlal.u32 q2, d25, d28 + orr r11, r11, r10, lsr #8 + orr r8, r1, r8, lsl #6 + orr r9, r12, r9, lsl #12 + vmlal.u32 q2, d24, d29 + orr r10, r14, r10, lsl #18 + and r2, r2, #0x3ffffff + mov r12, r3 + vmlal.u32 q2, d23, d20 + mov r14, r5 + vmlal.u32 q2, d22, d16 + and r3, r7, #0x3ffffff + vmlal.u32 q2, d21, d17 + and r5, r8, #0x3ffffff + vmlal.u32 q3, d25, d29 + and r7, r9, #0x3ffffff + vmlal.u32 q3, d24, d20 + and r8, r14, #0x3ffffff + vmlal.u32 q3, d23, d16 + and r9, r10, #0x3ffffff + vmlal.u32 q3, d22, d17 + add r14, sp, #128 + vmlal.u32 q3, d21, d18 + mov r10, r6 + vmlal.u32 q4, d25, d20 + vmlal.u32 q4, d24, d16 + and r6, r4, #0x3ffffff + vmlal.u32 q4, d23, d17 + and r4, r12, #0x3ffffff + vmlal.u32 q4, d22, d18 + stm r14, {r2-r11} + vmlal.u32 q4, d21, d19 + vld1.64 {d21-d24}, [r14, :256]! 
+ vld1.64 {d25}, [r14, :64] + vaddw.u32 q0, q0, d21 + vaddw.u32 q1, q1, d22 + vaddw.u32 q2, q2, d23 + vaddw.u32 q3, q3, d24 + vaddw.u32 q4, q4, d25 + vshr.u64 q11, q0, #26 + vand q0, q0, q15 + vadd.i64 q1, q1, q11 + vshr.u64 q12, q3, #26 + vand q3, q3, q15 + vadd.i64 q4, q4, q12 + vshr.u64 q11, q1, #26 + vand q1, q1, q15 + vadd.i64 q2, q2, q11 + vshr.u64 q12, q4, #26 + vand q4, q4, q15 + vadd.i64 q0, q0, q12 + vshl.i64 q12, q12, #2 + ldr r1, [sp, #116] + vadd.i64 q0, q0, q12 + vshr.u64 q11, q2, #26 + vand q2, q2, q15 + vadd.i64 q3, q3, q11 + sub r1, #64 + vshr.u64 q12, q0, #26 + vand q0, q0, q15 + vadd.i64 q1, q1, q12 + cmp r1, #64 + vshr.u64 q11, q3, #26 + vand q3, q3, q15 + vadd.i64 q4, q4, q11 + vmovn.i64 d21, q0 + str r1, [sp, #116] + vmovn.i64 d22, q1 + vmovn.i64 d23, q2 + vmovn.i64 d24, q3 + vmovn.i64 d25, q4 + bhs .Lpoly1305_blocks_neon_mainloop +.Lpoly1305_blocks_neon_try32: + cmp r1, #32 + blo .Lpoly1305_blocks_neon_done + tst r0, r0 + bne .Lpoly1305_blocks_loadm32 + veor q0, q0, q0 + veor q1, q1, q1 + veor q2, q2, q2 + veor q3, q3, q3 + veor q4, q4, q4 + b .Lpoly1305_blocks_continue32 +.Lpoly1305_blocks_loadm32: + vld1.64 {q0-q1}, [r0]! 
+ veor q4, q4, q4 + vswp d1, d2 + veor q3, q3, q3 + vtrn.32 q0, q4 + vtrn.32 q1, q3 + vshl.i64 q2, q1, #12 + vshl.i64 q3, q3, #18 + vshl.i64 q1, q4, #6 + vmovl.u32 q4, d15 +.Lpoly1305_blocks_continue32: + vmlal.u32 q0, d25, d26 + vmlal.u32 q0, d24, d27 + vmlal.u32 q0, d23, d28 + vmlal.u32 q0, d22, d29 + vmlal.u32 q0, d21, d20 + vmlal.u32 q1, d25, d27 + vmlal.u32 q1, d24, d28 + vmlal.u32 q1, d23, d29 + vmlal.u32 q1, d22, d20 + vmlal.u32 q1, d21, d16 + vmlal.u32 q2, d25, d28 + vmlal.u32 q2, d24, d29 + vmlal.u32 q2, d23, d20 + vmlal.u32 q2, d22, d16 + vmlal.u32 q2, d21, d17 + vmlal.u32 q3, d25, d29 + vmlal.u32 q3, d24, d20 + vmlal.u32 q3, d23, d16 + vmlal.u32 q3, d22, d17 + vmlal.u32 q3, d21, d18 + vmlal.u32 q4, d25, d20 + vmlal.u32 q4, d24, d16 + vmlal.u32 q4, d23, d17 + vmlal.u32 q4, d22, d18 + vmlal.u32 q4, d21, d19 + vshr.u64 q11, q0, #26 + vand q0, q0, q15 + vadd.i64 q1, q1, q11 + vshr.u64 q12, q3, #26 + vand q3, q3, q15 + vadd.i64 q4, q4, q12 + vshr.u64 q11, q1, #26 + vand q1, q1, q15 + vadd.i64 q2, q2, q11 + vshr.u64 q12, q4, #26 + vand q4, q4, q15 + vadd.i64 q0, q0, q12 + vshl.i64 q12, q12, #2 + vadd.i64 q0, q0, q12 + vshr.u64 q11, q2, #26 + vand q2, q2, q15 + vadd.i64 q3, q3, q11 + vshr.u64 q12, q0, #26 + vand q0, q0, q15 + vadd.i64 q1, q1, q12 + vshr.u64 q11, q3, #26 + vand q3, q3, q15 + vadd.i64 q4, q4, q11 + vmovn.i64 d21, q0 + vmovn.i64 d22, q1 + vmovn.i64 d23, q2 + vmovn.i64 d24, q3 + vmovn.i64 d25, q4 +.Lpoly1305_blocks_neon_done: + tst r0, r0 + beq .Lpoly1305_blocks_neon_final + ldr r2, [sp, #108] + add r2, r2, #60 + vst1.64 {d21}, [r2]! 
+ vst1.64 {d22-d25}, [r2] + b .Lpoly1305_blocks_neon_leave +.Lpoly1305_blocks_neon_final: + vadd.u32 d10, d0, d1 + vadd.u32 d13, d2, d3 + vadd.u32 d11, d4, d5 + ldr r5, [sp, #108] + vadd.u32 d14, d6, d7 + vadd.u32 d12, d8, d9 + vtrn.32 d10, d13 + vtrn.32 d11, d14 + vst1.64 {d10-d12}, [sp] + ldm sp, {r0-r4} + mov r12, r0, lsr #26 + and r0, r0, #0x3ffffff + add r1, r1, r12 + mov r12, r1, lsr #26 + and r1, r1, #0x3ffffff + add r2, r2, r12 + mov r12, r2, lsr #26 + and r2, r2, #0x3ffffff + add r3, r3, r12 + mov r12, r3, lsr #26 + and r3, r3, #0x3ffffff + add r4, r4, r12 + mov r12, r4, lsr #26 + and r4, r4, #0x3ffffff + add r12, r12, r12, lsl #2 + add r0, r0, r12 + mov r12, r0, lsr #26 + and r0, r0, #0x3ffffff + add r1, r1, r12 + mov r12, r1, lsr #26 + and r1, r1, #0x3ffffff + add r2, r2, r12 + mov r12, r2, lsr #26 + and r2, r2, #0x3ffffff + add r3, r3, r12 + mov r12, r3, lsr #26 + and r3, r3, #0x3ffffff + add r4, r4, r12 + mov r12, r4, lsr #26 + and r4, r4, #0x3ffffff + add r12, r12, r12, lsl #2 + add r0, r0, r12 + mov r12, r0, lsr #26 + and r0, r0, #0x3ffffff + add r1, r1, r12 + add r6, r0, #5 + mov r12, r6, lsr #26 + and r6, r6, #0x3ffffff + add r7, r1, r12 + mov r12, r7, lsr #26 + and r7, r7, #0x3ffffff + add r10, r2, r12 + mov r12, r10, lsr #26 + and r10, r10, #0x3ffffff + add r11, r3, r12 + mov r12, #-(1 << 26) + add r12, r12, r11, lsr #26 + and r11, r11, #0x3ffffff + add r14, r4, r12 + mov r12, r14, lsr #31 + sub r12, #1 + and r6, r6, r12 + and r7, r7, r12 + and r10, r10, r12 + and r11, r11, r12 + and r14, r14, r12 + mvn r12, r12 + and r0, r0, r12 + and r1, r1, r12 + and r2, r2, r12 + and r3, r3, r12 + and r4, r4, r12 + orr r0, r0, r6 + orr r1, r1, r7 + orr r2, r2, r10 + orr r3, r3, r11 + orr r4, r4, r14 + orr r0, r0, r1, lsl #26 + lsr r1, r1, #6 + orr r1, r1, r2, lsl #20 + lsr r2, r2, #12 + orr r2, r2, r3, lsl #14 + lsr r3, r3, #18 + orr r3, r3, r4, lsl #8 + add r5, r5, #60 + stm r5, {r0-r3} +.Lpoly1305_blocks_neon_leave: + sub r0, sp, #8 + ldr sp, [sp, #120] + 
ldmfd sp!, {r4-r11, lr} + vldm sp!, {q4-q7} + sub r0, sp, r0 + bx lr +.size _gcry_poly1305_armv7_neon_blocks,.-_gcry_poly1305_armv7_neon_blocks; + +.globl _gcry_poly1305_armv7_neon_finish_ext +.type _gcry_poly1305_armv7_neon_finish_ext,%function; +_gcry_poly1305_armv7_neon_finish_ext: +.Lpoly1305_finish_ext_neon_local: + stmfd sp!, {r4-r11, lr} + sub sp, sp, #32 + mov r5, r0 + mov r6, r1 + mov r7, r2 + mov r8, r3 + ands r7, r7, r7 + beq .Lpoly1305_finish_ext_neon_noremaining + mov r9, sp + veor q0, q0, q0 + veor q1, q1, q1 + vst1.64 {q0-q1}, [sp] + tst r7, #16 + beq .Lpoly1305_finish_ext_neon_skip16 + vld1.u64 {q0}, [r1]! + vst1.64 {q0}, [r9]! +.Lpoly1305_finish_ext_neon_skip16: + tst r7, #8 + beq .Lpoly1305_finish_ext_neon_skip8 + ldmia r1!, {r10-r11} + stmia r9!, {r10-r11} +.Lpoly1305_finish_ext_neon_skip8: + tst r7, #4 + beq .Lpoly1305_finish_ext_neon_skip4 + ldr r10, [r1], #4 + str r10, [r9], #4 +.Lpoly1305_finish_ext_neon_skip4: + tst r7, #2 + beq .Lpoly1305_finish_ext_neon_skip2 + ldrh r10, [r1], #2 + strh r10, [r9], #2 +.Lpoly1305_finish_ext_neon_skip2: + tst r7, #1 + beq .Lpoly1305_finish_ext_neon_skip1 + ldrb r10, [r1], #1 + strb r10, [r9], #1 +.Lpoly1305_finish_ext_neon_skip1: + cmp r7, #16 + beq .Lpoly1305_finish_ext_neon_skipfinalbit + mov r10, #1 + strb r10, [r9] +.Lpoly1305_finish_ext_neon_skipfinalbit: + ldr r10, [r5, #116] + orrhs r10, #2 + orrlo r10, #4 + str r10, [r5, #116] + mov r0, r5 + mov r1, sp + mov r2, #32 + bl .Lpoly1305_blocks_neon_local +.Lpoly1305_finish_ext_neon_noremaining: + ldr r10, [r5, #116] + tst r10, #1 + beq .Lpoly1305_finish_ext_neon_notstarted + cmp r7, #0 + beq .Lpoly1305_finish_ext_neon_user2r + cmp r7, #16 + bls .Lpoly1305_finish_ext_neon_user1 +.Lpoly1305_finish_ext_neon_user2r: + orr r10, r10, #8 + b .Lpoly1305_finish_ext_neon_finalblock +.Lpoly1305_finish_ext_neon_user1: + orr r10, r10, #16 +.Lpoly1305_finish_ext_neon_finalblock: + str r10, [r5, #116] + mov r0, r5 + eor r1, r1, r1 + mov r2, #32 + bl 
.Lpoly1305_blocks_neon_local +.Lpoly1305_finish_ext_neon_notstarted: + add r0, r5, #60 + add r9, r5, #100 + ldm r0, {r0-r3} + ldm r9, {r9-r12} + adds r0, r0, r9 + adcs r1, r1, r10 + adcs r2, r2, r11 + adcs r3, r3, r12 + stm r8, {r0-r3} + veor q0, q0, q0 + veor q1, q1, q1 + veor q2, q2, q2 + veor q3, q3, q3 + vstmia r5!, {q0-q3} + vstm r5, {q0-q3} + add sp, sp, #32 + ldmfd sp!, {r4-r11, lr} + mov r0, #(9*4+32) + bx lr +.size _gcry_poly1305_armv7_neon_finish_ext,.-_gcry_poly1305_armv7_neon_finish_ext; + +#endif diff --git a/cipher/poly1305-internal.h b/cipher/poly1305-internal.h index 0299c43..dfc0c04 100644 --- a/cipher/poly1305-internal.h +++ b/cipher/poly1305-internal.h @@ -65,10 +65,24 @@ #endif +/* POLY1305_USE_NEON indicates whether to enable ARM NEON assembly code. */ +#undef POLY1305_USE_NEON +#if defined(ENABLE_NEON_SUPPORT) && defined(HAVE_ARM_ARCH_V6) && \ + defined(__ARMEL__) && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_NEON) +# define POLY1305_USE_NEON 1 +# define POLY1305_NEON_BLOCKSIZE 32 +# define POLY1305_NEON_STATESIZE 128 +# define POLY1305_NEON_ALIGNMENT 16 +#endif + + /* Largest block-size used in any implementation (optimized implementations * might use block-size multiple of 16). */ #ifdef POLY1305_USE_AVX2 # define POLY1305_LARGEST_BLOCKSIZE POLY1305_AVX2_BLOCKSIZE +#elif defined(POLY1305_USE_NEON) +# define POLY1305_LARGEST_BLOCKSIZE POLY1305_NEON_BLOCKSIZE #elif defined(POLY1305_USE_SSE2) # define POLY1305_LARGEST_BLOCKSIZE POLY1305_SSE2_BLOCKSIZE #else @@ -78,6 +92,8 @@ /* Largest state-size used in any implementation. */ #ifdef POLY1305_USE_AVX2 # define POLY1305_LARGEST_STATESIZE POLY1305_AVX2_STATESIZE +#elif defined(POLY1305_USE_NEON) +# define POLY1305_LARGEST_STATESIZE POLY1305_NEON_STATESIZE #elif defined(POLY1305_USE_SSE2) # define POLY1305_LARGEST_STATESIZE POLY1305_SSE2_STATESIZE #else @@ -87,6 +103,8 @@ /* Minimum alignment for state pointer passed to implementations. 
*/ #ifdef POLY1305_USE_AVX2 # define POLY1305_STATE_ALIGNMENT POLY1305_AVX2_ALIGNMENT +#elif defined(POLY1305_USE_NEON) +# define POLY1305_STATE_ALIGNMENT POLY1305_NEON_ALIGNMENT #elif defined(POLY1305_USE_SSE2) # define POLY1305_STATE_ALIGNMENT POLY1305_SSE2_ALIGNMENT #else diff --git a/cipher/poly1305.c b/cipher/poly1305.c index fe241c1..28dbbf8 100644 --- a/cipher/poly1305.c +++ b/cipher/poly1305.c @@ -76,6 +76,25 @@ static const poly1305_ops_t poly1305_amd64_avx2_ops = { #endif +#ifdef POLY1305_USE_NEON + +void _gcry_poly1305_armv7_neon_init_ext(void *state, const poly1305_key_t *key); +unsigned int _gcry_poly1305_armv7_neon_finish_ext(void *state, const byte *m, + size_t remaining, + byte mac[16]); +unsigned int _gcry_poly1305_armv7_neon_blocks(void *ctx, const byte *m, + size_t bytes); + +static const poly1305_ops_t poly1305_armv7_neon_ops = { + POLY1305_NEON_BLOCKSIZE, + _gcry_poly1305_armv7_neon_init_ext, + _gcry_poly1305_armv7_neon_blocks, + _gcry_poly1305_armv7_neon_finish_ext +}; + +#endif + + #ifdef HAVE_U64_TYPEDEF /* Reference unoptimized poly1305 implementation using 32 bit * 32 bit = 64 bit @@ -661,6 +680,10 @@ _gcry_poly1305_init (poly1305_context_t * ctx, const byte * key, if (features & HWF_INTEL_AVX2) ctx->ops = &poly1305_amd64_avx2_ops; #endif +#ifdef POLY1305_USE_NEON + if (features & HWF_ARM_NEON) + ctx->ops = &poly1305_armv7_neon_ops; +#endif (void)features; buf_cpy (keytmp.b, key, POLY1305_KEYLEN); diff --git a/configure.ac b/configure.ac index 60ed015..a0d5fc9 100644 --- a/configure.ac +++ b/configure.ac @@ -1837,6 +1837,11 @@ case "${host}" in ;; esac +if test x"$neonsupport" = xyes ; then + # Build with the NEON implementation + GCRYPT_CIPHERS="$GCRYPT_CIPHERS poly1305-armv7-neon.lo" +fi + LIST_MEMBER(dsa, $enabled_pubkey_ciphers) if test "$found" = "1" ; then GCRYPT_PUBKEY_CIPHERS="$GCRYPT_PUBKEY_CIPHERS dsa.lo" commit c584f44543883346d5a565581ff99a0afce9c5e1 Author: Jussi Kivilinna Date: Wed Aug 6 20:05:16 2014 +0300 chacha20: add 
ARMv7/NEON implementation * cipher/Makefile.am: Add 'chacha20-armv7-neon.S'. * cipher/chacha20-armv7-neon.S: New. * cipher/chacha20.c (USE_NEON): New. [USE_NEON] (_gcry_chacha20_armv7_neon_blocks): New. (chacha20_do_setkey) [USE_NEON]: Use Neon implementation if HWF_ARM_NEON flag set. (selftest): Self-test encrypting buffer byte by byte. * configure.ac [neonsupport=yes]: Add 'chacha20-armv7-neon.lo'. -- Add Andrew Moon's public domain ARMv7/NEON implementation of ChaCha20. Original source is available at: https://github.com/floodyberry/chacha-opt Benchmark on Cortex-A8 (--cpu-mhz 1008): Old: CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte STREAM enc | 13.45 ns/B 70.92 MiB/s 13.56 c/B STREAM dec | 13.45 ns/B 70.90 MiB/s 13.56 c/B New: CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte STREAM enc | 6.20 ns/B 153.9 MiB/s 6.25 c/B STREAM dec | 6.20 ns/B 153.9 MiB/s 6.25 c/B Signed-off-by: Jussi Kivilinna diff --git a/cipher/Makefile.am b/cipher/Makefile.am index 7f45cbb..09ccaf9 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -61,6 +61,7 @@ arcfour.c arcfour-amd64.S \ blowfish.c blowfish-amd64.S blowfish-arm.S \ cast5.c cast5-amd64.S cast5-arm.S \ chacha20.c chacha20-sse2-amd64.S chacha20-ssse3-amd64.S chacha20-avx2-amd64.S \ + chacha20-armv7-neon.S \ crc.c \ des.c des-amd64.S \ dsa.c \ diff --git a/cipher/chacha20-armv7-neon.S b/cipher/chacha20-armv7-neon.S new file mode 100644 index 0000000..1a395ba --- /dev/null +++ b/cipher/chacha20-armv7-neon.S @@ -0,0 +1,710 @@ +/* chacha20-armv7-neon.S - ARM/NEON accelerated chacha20 blocks function + * + * Copyright (C) 2014 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. 
+ * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +/* + * Based on public domain implementation by Andrew Moon at + * https://github.com/floodyberry/chacha-opt + */ + +#include <config.h> + +#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) && \ + defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_NEON) && defined(USE_CHACHA20) + +.syntax unified +.fpu neon +.arm + +.text + +.globl _gcry_chacha20_armv7_neon_blocks +.type _gcry_chacha20_armv7_neon_blocks,%function; +_gcry_chacha20_armv7_neon_blocks: +.Lchacha_blocks_neon_local: + tst r3, r3 + beq .Lchacha_blocks_neon_nobytes + vstmdb sp!, {q4,q5,q6,q7} + stmfd sp!, {r4-r12, r14} + mov r8, sp + sub sp, sp, #196 + and sp, sp, #0xffffffe0 + str r0, [sp, #60] + str r1, [sp, #48] + str r2, [sp, #40] + str r3, [sp, #52] + str r8, [sp, #192] + add r1, sp, #64 + ldmia r0!, {r4-r11} + stmia r1!, {r4-r11} + ldmia r0!, {r4-r11} + stmia r1!, {r4-r11} + mov r4, #20 + str r4, [sp, #44] + cmp r3, #256 + blo .Lchacha_blocks_neon_mainloop2 +.Lchacha_blocks_neon_mainloop1: + ldr r0, [sp, #44] + str r0, [sp, #0] + add r1, sp, #(64) + mov r2, #1 + veor q12, q12 + vld1.32 {q0,q1}, [r1,:128]! 
+ vld1.32 {q2,q3}, [r1,:128] + vmov.32 d24[0], r2 + vadd.u64 q3, q3, q12 + vmov q4, q0 + vmov q5, q1 + vmov q6, q2 + vadd.u64 q7, q3, q12 + vmov q8, q0 + vmov q9, q1 + vmov q10, q2 + vadd.u64 q11, q7, q12 + add r0, sp, #64 + ldm r0, {r0-r12} + ldr r14, [sp, #(64 +60)] + str r6, [sp, #8] + str r11, [sp, #12] + str r14, [sp, #28] + ldr r11, [sp, #(64 +52)] + ldr r14, [sp, #(64 +56)] +.Lchacha_blocks_neon_rounds1: + ldr r6, [sp, #0] + vadd.i32 q0, q0, q1 + add r0, r0, r4 + vadd.i32 q4, q4, q5 + add r1, r1, r5 + vadd.i32 q8, q8, q9 + eor r12, r12, r0 + veor q12, q3, q0 + eor r11, r11, r1 + veor q13, q7, q4 + ror r12, r12, #16 + veor q14, q11, q8 + ror r11, r11, #16 + vrev32.16 q3, q12 + subs r6, r6, #2 + vrev32.16 q7, q13 + add r8, r8, r12 + vrev32.16 q11, q14 + add r9, r9, r11 + vadd.i32 q2, q2, q3 + eor r4, r4, r8 + vadd.i32 q6, q6, q7 + eor r5, r5, r9 + vadd.i32 q10, q10, q11 + str r6, [sp, #0] + veor q12, q1, q2 + ror r4, r4, #20 + veor q13, q5, q6 + ror r5, r5, #20 + veor q14, q9, q10 + add r0, r0, r4 + vshl.i32 q1, q12, #12 + add r1, r1, r5 + vshl.i32 q5, q13, #12 + ldr r6, [sp, #8] + vshl.i32 q9, q14, #12 + eor r12, r12, r0 + vsri.u32 q1, q12, #20 + eor r11, r11, r1 + vsri.u32 q5, q13, #20 + ror r12, r12, #24 + vsri.u32 q9, q14, #20 + ror r11, r11, #24 + vadd.i32 q0, q0, q1 + add r8, r8, r12 + vadd.i32 q4, q4, q5 + add r9, r9, r11 + vadd.i32 q8, q8, q9 + eor r4, r4, r8 + veor q12, q3, q0 + eor r5, r5, r9 + veor q13, q7, q4 + str r11, [sp, #20] + veor q14, q11, q8 + ror r4, r4, #25 + vshl.i32 q3, q12, #8 + ror r5, r5, #25 + vshl.i32 q7, q13, #8 + str r4, [sp, #4] + vshl.i32 q11, q14, #8 + ldr r4, [sp, #28] + vsri.u32 q3, q12, #24 + add r2, r2, r6 + vsri.u32 q7, q13, #24 + add r3, r3, r7 + vsri.u32 q11, q14, #24 + ldr r11, [sp, #12] + vadd.i32 q2, q2, q3 + eor r14, r14, r2 + vadd.i32 q6, q6, q7 + eor r4, r4, r3 + vadd.i32 q10, q10, q11 + ror r14, r14, #16 + veor q12, q1, q2 + ror r4, r4, #16 + veor q13, q5, q6 + add r10, r10, r14 + veor q14, q9, q10 + add r11, 
r11, r4 + vshl.i32 q1, q12, #7 + eor r6, r6, r10 + vshl.i32 q5, q13, #7 + eor r7, r7, r11 + vshl.i32 q9, q14, #7 + ror r6, r6, #20 + vsri.u32 q1, q12, #25 + ror r7, r7, #20 + vsri.u32 q5, q13, #25 + add r2, r2, r6 + vsri.u32 q9, q14, #25 + add r3, r3, r7 + vext.32 q3, q3, q3, #3 + eor r14, r14, r2 + vext.32 q7, q7, q7, #3 + eor r4, r4, r3 + vext.32 q11, q11, q11, #3 + ror r14, r14, #24 + vext.32 q1, q1, q1, #1 + ror r4, r4, #24 + vext.32 q5, q5, q5, #1 + add r10, r10, r14 + vext.32 q9, q9, q9, #1 + add r11, r11, r4 + vext.32 q2, q2, q2, #2 + eor r6, r6, r10 + vext.32 q6, q6, q6, #2 + eor r7, r7, r11 + vext.32 q10, q10, q10, #2 + ror r6, r6, #25 + vadd.i32 q0, q0, q1 + ror r7, r7, #25 + vadd.i32 q4, q4, q5 + add r0, r0, r5 + vadd.i32 q8, q8, q9 + add r1, r1, r6 + veor q12, q3, q0 + eor r4, r4, r0 + veor q13, q7, q4 + eor r12, r12, r1 + veor q14, q11, q8 + ror r4, r4, #16 + vrev32.16 q3, q12 + ror r12, r12, #16 + vrev32.16 q7, q13 + add r10, r10, r4 + vrev32.16 q11, q14 + add r11, r11, r12 + vadd.i32 q2, q2, q3 + eor r5, r5, r10 + vadd.i32 q6, q6, q7 + eor r6, r6, r11 + vadd.i32 q10, q10, q11 + ror r5, r5, #20 + veor q12, q1, q2 + ror r6, r6, #20 + veor q13, q5, q6 + add r0, r0, r5 + veor q14, q9, q10 + add r1, r1, r6 + vshl.i32 q1, q12, #12 + eor r4, r4, r0 + vshl.i32 q5, q13, #12 + eor r12, r12, r1 + vshl.i32 q9, q14, #12 + ror r4, r4, #24 + vsri.u32 q1, q12, #20 + ror r12, r12, #24 + vsri.u32 q5, q13, #20 + add r10, r10, r4 + vsri.u32 q9, q14, #20 + add r11, r11, r12 + vadd.i32 q0, q0, q1 + eor r5, r5, r10 + vadd.i32 q4, q4, q5 + eor r6, r6, r11 + vadd.i32 q8, q8, q9 + str r11, [sp, #12] + veor q12, q3, q0 + ror r5, r5, #25 + veor q13, q7, q4 + ror r6, r6, #25 + veor q14, q11, q8 + str r4, [sp, #28] + vshl.i32 q3, q12, #8 + ldr r4, [sp, #4] + vshl.i32 q7, q13, #8 + add r2, r2, r7 + vshl.i32 q11, q14, #8 + add r3, r3, r4 + vsri.u32 q3, q12, #24 + ldr r11, [sp, #20] + vsri.u32 q7, q13, #24 + eor r11, r11, r2 + vsri.u32 q11, q14, #24 + eor r14, r14, r3 + vadd.i32 q2, 
q2, q3 + ror r11, r11, #16 + vadd.i32 q6, q6, q7 + ror r14, r14, #16 + vadd.i32 q10, q10, q11 + add r8, r8, r11 + veor q12, q1, q2 + add r9, r9, r14 + veor q13, q5, q6 + eor r7, r7, r8 + veor q14, q9, q10 + eor r4, r4, r9 + vshl.i32 q1, q12, #7 + ror r7, r7, #20 + vshl.i32 q5, q13, #7 + ror r4, r4, #20 + vshl.i32 q9, q14, #7 + str r6, [sp, #8] + vsri.u32 q1, q12, #25 + add r2, r2, r7 + vsri.u32 q5, q13, #25 + add r3, r3, r4 + vsri.u32 q9, q14, #25 + eor r11, r11, r2 + vext.32 q3, q3, q3, #1 + eor r14, r14, r3 + vext.32 q7, q7, q7, #1 + ror r11, r11, #24 + vext.32 q11, q11, q11, #1 + ror r14, r14, #24 + vext.32 q1, q1, q1, #3 + add r8, r8, r11 + vext.32 q5, q5, q5, #3 + add r9, r9, r14 + vext.32 q9, q9, q9, #3 + eor r7, r7, r8 + vext.32 q2, q2, q2, #2 + eor r4, r4, r9 + vext.32 q6, q6, q6, #2 + ror r7, r7, #25 + vext.32 q10, q10, q10, #2 + ror r4, r4, #25 + bne .Lchacha_blocks_neon_rounds1 + str r8, [sp, #0] + str r9, [sp, #4] + str r10, [sp, #8] + str r12, [sp, #16] + str r11, [sp, #20] + str r14, [sp, #24] + add r9, sp, #64 + vld1.32 {q12,q13}, [r9,:128]! 
+ ldr r12, [sp, #48] + vld1.32 {q14,q15}, [r9,:128] + ldr r14, [sp, #40] + vadd.i32 q0, q0, q12 + ldr r8, [sp, #(64 +0)] + vadd.i32 q4, q4, q12 + ldr r9, [sp, #(64 +4)] + vadd.i32 q8, q8, q12 + ldr r10, [sp, #(64 +8)] + vadd.i32 q1, q1, q13 + ldr r11, [sp, #(64 +12)] + vadd.i32 q5, q5, q13 + add r0, r0, r8 + vadd.i32 q9, q9, q13 + add r1, r1, r9 + vadd.i32 q2, q2, q14 + add r2, r2, r10 + vadd.i32 q6, q6, q14 + ldr r8, [sp, #(64 +16)] + vadd.i32 q10, q10, q14 + add r3, r3, r11 + veor q14, q14, q14 + ldr r9, [sp, #(64 +20)] + mov r11, #1 + add r4, r4, r8 + vmov.32 d28[0], r11 + ldr r10, [sp, #(64 +24)] + vadd.u64 q12, q14, q15 + add r5, r5, r9 + vadd.u64 q13, q14, q12 + ldr r11, [sp, #(64 +28)] + vadd.u64 q14, q14, q13 + add r6, r6, r10 + vadd.i32 q3, q3, q12 + tst r12, r12 + vadd.i32 q7, q7, q13 + add r7, r7, r11 + vadd.i32 q11, q11, q14 + beq .Lchacha_blocks_neon_nomessage11 + ldmia r12!, {r8-r11} + eor r0, r0, r8 + eor r1, r1, r9 + eor r2, r2, r10 + ldr r8, [r12, #0] + eor r3, r3, r11 + ldr r9, [r12, #4] + eor r4, r4, r8 + ldr r10, [r12, #8] + eor r5, r5, r9 + ldr r11, [r12, #12] + eor r6, r6, r10 + add r12, r12, #16 + eor r7, r7, r11 +.Lchacha_blocks_neon_nomessage11: + stmia r14!, {r0-r7} + ldm sp, {r0-r7} + ldr r8, [sp, #(64 +32)] + ldr r9, [sp, #(64 +36)] + ldr r10, [sp, #(64 +40)] + ldr r11, [sp, #(64 +44)] + add r0, r0, r8 + add r1, r1, r9 + add r2, r2, r10 + ldr r8, [sp, #(64 +48)] + add r3, r3, r11 + ldr r9, [sp, #(64 +52)] + add r4, r4, r8 + ldr r10, [sp, #(64 +56)] + add r5, r5, r9 + ldr r11, [sp, #(64 +60)] + add r6, r6, r10 + adds r8, r8, #4 + add r7, r7, r11 + adc r9, r9, #0 + str r8, [sp, #(64 +48)] + tst r12, r12 + str r9, [sp, #(64 +52)] + beq .Lchacha_blocks_neon_nomessage12 + ldmia r12!, {r8-r11} + eor r0, r0, r8 + eor r1, r1, r9 + eor r2, r2, r10 + ldr r8, [r12, #0] + eor r3, r3, r11 + ldr r9, [r12, #4] + eor r4, r4, r8 + ldr r10, [r12, #8] + eor r5, r5, r9 + ldr r11, [r12, #12] + eor r6, r6, r10 + add r12, r12, #16 + eor r7, r7, r11 
+.Lchacha_blocks_neon_nomessage12: + stmia r14!, {r0-r7} + beq .Lchacha_blocks_neon_nomessage13 + vld1.32 {q12,q13}, [r12]! + vld1.32 {q14,q15}, [r12]! + veor q0, q0, q12 + veor q1, q1, q13 + veor q2, q2, q14 + veor q3, q3, q15 +.Lchacha_blocks_neon_nomessage13: + vst1.32 {q0,q1}, [r14]! + vst1.32 {q2,q3}, [r14]! + beq .Lchacha_blocks_neon_nomessage14 + vld1.32 {q12,q13}, [r12]! + vld1.32 {q14,q15}, [r12]! + veor q4, q4, q12 + veor q5, q5, q13 + veor q6, q6, q14 + veor q7, q7, q15 +.Lchacha_blocks_neon_nomessage14: + vst1.32 {q4,q5}, [r14]! + vst1.32 {q6,q7}, [r14]! + beq .Lchacha_blocks_neon_nomessage15 + vld1.32 {q12,q13}, [r12]! + vld1.32 {q14,q15}, [r12]! + veor q8, q8, q12 + veor q9, q9, q13 + veor q10, q10, q14 + veor q11, q11, q15 +.Lchacha_blocks_neon_nomessage15: + vst1.32 {q8,q9}, [r14]! + vst1.32 {q10,q11}, [r14]! + str r12, [sp, #48] + str r14, [sp, #40] + ldr r3, [sp, #52] + sub r3, r3, #256 + cmp r3, #256 + str r3, [sp, #52] + bhs .Lchacha_blocks_neon_mainloop1 + tst r3, r3 + beq .Lchacha_blocks_neon_done +.Lchacha_blocks_neon_mainloop2: + ldr r3, [sp, #52] + ldr r1, [sp, #48] + cmp r3, #64 + bhs .Lchacha_blocks_neon_noswap1 + add r4, sp, #128 + mov r5, r4 + tst r1, r1 + beq .Lchacha_blocks_neon_nocopy1 +.Lchacha_blocks_neon_copyinput1: + subs r3, r3, #1 + ldrb r0, [r1], #1 + strb r0, [r4], #1 + bne .Lchacha_blocks_neon_copyinput1 + str r5, [sp, #48] +.Lchacha_blocks_neon_nocopy1: + ldr r4, [sp, #40] + str r5, [sp, #40] + str r4, [sp, #56] +.Lchacha_blocks_neon_noswap1: + ldr r0, [sp, #44] + str r0, [sp, #0] + add r0, sp, #64 + ldm r0, {r0-r12} + ldr r14, [sp, #(64 +60)] + str r6, [sp, #8] + str r11, [sp, #12] + str r14, [sp, #28] + ldr r11, [sp, #(64 +52)] + ldr r14, [sp, #(64 +56)] +.Lchacha_blocks_neon_rounds2: + ldr r6, [sp, #0] + add r0, r0, r4 + add r1, r1, r5 + eor r12, r12, r0 + eor r11, r11, r1 + ror r12, r12, #16 + ror r11, r11, #16 + subs r6, r6, #2 + add r8, r8, r12 + add r9, r9, r11 + eor r4, r4, r8 + eor r5, r5, r9 + str r6, [sp, #0] + 
ror r4, r4, #20 + ror r5, r5, #20 + add r0, r0, r4 + add r1, r1, r5 + ldr r6, [sp, #8] + eor r12, r12, r0 + eor r11, r11, r1 + ror r12, r12, #24 + ror r11, r11, #24 + add r8, r8, r12 + add r9, r9, r11 + eor r4, r4, r8 + eor r5, r5, r9 + str r11, [sp, #20] + ror r4, r4, #25 + ror r5, r5, #25 + str r4, [sp, #4] + ldr r4, [sp, #28] + add r2, r2, r6 + add r3, r3, r7 + ldr r11, [sp, #12] + eor r14, r14, r2 + eor r4, r4, r3 + ror r14, r14, #16 + ror r4, r4, #16 + add r10, r10, r14 + add r11, r11, r4 + eor r6, r6, r10 + eor r7, r7, r11 + ror r6, r6, #20 + ror r7, r7, #20 + add r2, r2, r6 + add r3, r3, r7 + eor r14, r14, r2 + eor r4, r4, r3 + ror r14, r14, #24 + ror r4, r4, #24 + add r10, r10, r14 + add r11, r11, r4 + eor r6, r6, r10 + eor r7, r7, r11 + ror r6, r6, #25 + ror r7, r7, #25 + add r0, r0, r5 + add r1, r1, r6 + eor r4, r4, r0 + eor r12, r12, r1 + ror r4, r4, #16 + ror r12, r12, #16 + add r10, r10, r4 + add r11, r11, r12 + eor r5, r5, r10 + eor r6, r6, r11 + ror r5, r5, #20 + ror r6, r6, #20 + add r0, r0, r5 + add r1, r1, r6 + eor r4, r4, r0 + eor r12, r12, r1 + ror r4, r4, #24 + ror r12, r12, #24 + add r10, r10, r4 + add r11, r11, r12 + eor r5, r5, r10 + eor r6, r6, r11 + str r11, [sp, #12] + ror r5, r5, #25 + ror r6, r6, #25 + str r4, [sp, #28] + ldr r4, [sp, #4] + add r2, r2, r7 + add r3, r3, r4 + ldr r11, [sp, #20] + eor r11, r11, r2 + eor r14, r14, r3 + ror r11, r11, #16 + ror r14, r14, #16 + add r8, r8, r11 + add r9, r9, r14 + eor r7, r7, r8 + eor r4, r4, r9 + ror r7, r7, #20 + ror r4, r4, #20 + str r6, [sp, #8] + add r2, r2, r7 + add r3, r3, r4 + eor r11, r11, r2 + eor r14, r14, r3 + ror r11, r11, #24 + ror r14, r14, #24 + add r8, r8, r11 + add r9, r9, r14 + eor r7, r7, r8 + eor r4, r4, r9 + ror r7, r7, #25 + ror r4, r4, #25 + bne .Lchacha_blocks_neon_rounds2 + str r8, [sp, #0] + str r9, [sp, #4] + str r10, [sp, #8] + str r12, [sp, #16] + str r11, [sp, #20] + str r14, [sp, #24] + ldr r12, [sp, #48] + ldr r14, [sp, #40] + ldr r8, [sp, #(64 +0)] + ldr r9, 
[sp, #(64 +4)] + ldr r10, [sp, #(64 +8)] + ldr r11, [sp, #(64 +12)] + add r0, r0, r8 + add r1, r1, r9 + add r2, r2, r10 + ldr r8, [sp, #(64 +16)] + add r3, r3, r11 + ldr r9, [sp, #(64 +20)] + add r4, r4, r8 + ldr r10, [sp, #(64 +24)] + add r5, r5, r9 + ldr r11, [sp, #(64 +28)] + add r6, r6, r10 + tst r12, r12 + add r7, r7, r11 + beq .Lchacha_blocks_neon_nomessage21 + ldmia r12!, {r8-r11} + eor r0, r0, r8 + eor r1, r1, r9 + eor r2, r2, r10 + ldr r8, [r12, #0] + eor r3, r3, r11 + ldr r9, [r12, #4] + eor r4, r4, r8 + ldr r10, [r12, #8] + eor r5, r5, r9 + ldr r11, [r12, #12] + eor r6, r6, r10 + add r12, r12, #16 + eor r7, r7, r11 +.Lchacha_blocks_neon_nomessage21: + stmia r14!, {r0-r7} + ldm sp, {r0-r7} + ldr r8, [sp, #(64 +32)] + ldr r9, [sp, #(64 +36)] + ldr r10, [sp, #(64 +40)] + ldr r11, [sp, #(64 +44)] + add r0, r0, r8 + add r1, r1, r9 + add r2, r2, r10 + ldr r8, [sp, #(64 +48)] + add r3, r3, r11 + ldr r9, [sp, #(64 +52)] + add r4, r4, r8 + ldr r10, [sp, #(64 +56)] + add r5, r5, r9 + ldr r11, [sp, #(64 +60)] + add r6, r6, r10 + adds r8, r8, #1 + add r7, r7, r11 + adc r9, r9, #0 + str r8, [sp, #(64 +48)] + tst r12, r12 + str r9, [sp, #(64 +52)] + beq .Lchacha_blocks_neon_nomessage22 + ldmia r12!, {r8-r11} + eor r0, r0, r8 + eor r1, r1, r9 + eor r2, r2, r10 + ldr r8, [r12, #0] + eor r3, r3, r11 + ldr r9, [r12, #4] + eor r4, r4, r8 + ldr r10, [r12, #8] + eor r5, r5, r9 + ldr r11, [r12, #12] + eor r6, r6, r10 + add r12, r12, #16 + eor r7, r7, r11 +.Lchacha_blocks_neon_nomessage22: + stmia r14!, {r0-r7} + str r12, [sp, #48] + str r14, [sp, #40] + ldr r3, [sp, #52] + cmp r3, #64 + sub r4, r3, #64 + str r4, [sp, #52] + bhi .Lchacha_blocks_neon_mainloop2 + cmp r3, #64 + beq .Lchacha_blocks_neon_nocopy2 + ldr r1, [sp, #56] + sub r14, r14, #64 +.Lchacha_blocks_neon_copyinput2: + subs r3, r3, #1 + ldrb r0, [r14], #1 + strb r0, [r1], #1 + bne .Lchacha_blocks_neon_copyinput2 +.Lchacha_blocks_neon_nocopy2: +.Lchacha_blocks_neon_done: + ldr r7, [sp, #60] + ldr r8, [sp, #(64 
+48)] + ldr r9, [sp, #(64 +52)] + str r8, [r7, #(48 + 0)] + str r9, [r7, #(48 + 4)] + mov r12, sp + stmia r12!, {r0-r7} + add r12, r12, #48 + stmia r12!, {r0-r7} + sub r0, sp, #8 + ldr sp, [sp, #192] + ldmfd sp!, {r4-r12, r14} + vldm sp!, {q4-q7} + sub r0, sp, r0 + bx lr +.Lchacha_blocks_neon_nobytes: + mov r0, #0; + bx lr +.ltorg +.size _gcry_chacha20_armv7_neon_blocks,.-_gcry_chacha20_armv7_neon_blocks; + +#endif diff --git a/cipher/chacha20.c b/cipher/chacha20.c index ebba2fc..c1847aa 100644 --- a/cipher/chacha20.c +++ b/cipher/chacha20.c @@ -67,6 +67,16 @@ # define USE_AVX2 1 #endif +/* USE_NEON indicates whether to enable ARM NEON assembly code. */ +#undef USE_NEON +#ifdef ENABLE_NEON_SUPPORT +# if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \ + && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \ + && defined(HAVE_GCC_INLINE_ASM_NEON) +# define USE_NEON 1 +# endif +#endif /*ENABLE_NEON_SUPPORT*/ + struct CHACHA20_context_s; @@ -104,6 +114,13 @@ unsigned int _gcry_chacha20_amd64_avx2_blocks(u32 *state, const byte *in, #endif /* USE_AVX2 */ +#ifdef USE_NEON + +unsigned int _gcry_chacha20_armv7_neon_blocks(u32 *state, const byte *in, + byte *out, size_t bytes); + +#endif /* USE_NEON */ + static void chacha20_setiv (void *context, const byte * iv, size_t ivlen); static const char *selftest (void); @@ -353,6 +370,10 @@ chacha20_do_setkey (CHACHA20_context_t * ctx, if (features & HWF_INTEL_AVX2) ctx->blocks = _gcry_chacha20_amd64_avx2_blocks; #endif +#ifdef USE_NEON + if (features & HWF_ARM_NEON) + ctx->blocks = _gcry_chacha20_armv7_neon_blocks; +#endif (void)features; @@ -541,6 +562,19 @@ selftest (void) if (buf[i] != (byte) i) return "ChaCha20 encryption test 2 failed."; + chacha20_setkey (&ctx, key_1, sizeof key_1); + chacha20_setiv (&ctx, nonce_1, sizeof nonce_1); + /* encrypt */ + for (i = 0; i < sizeof buf; i++) + chacha20_encrypt_stream (&ctx, &buf[i], &buf[i], 1); + /* decrypt */ + chacha20_setkey (&ctx, key_1, sizeof key_1); + chacha20_setiv (&ctx, 
nonce_1, sizeof nonce_1); + chacha20_encrypt_stream (&ctx, buf, buf, sizeof buf); + for (i = 0; i < sizeof buf; i++) + if (buf[i] != (byte) i) + return "ChaCha20 encryption test 3 failed."; + return NULL; } diff --git a/configure.ac b/configure.ac index d14b7f6..60ed015 100644 --- a/configure.ac +++ b/configure.ac @@ -1822,6 +1822,11 @@ if test "$found" = "1" ; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-avx2-amd64.lo" ;; esac + + if test x"$neonsupport" = xyes ; then + # Build with the NEON implementation + GCRYPT_CIPHERS="$GCRYPT_CIPHERS chacha20-armv7-neon.lo" + fi fi case "${host}" in ----------------------------------------------------------------------- Summary of changes: cipher/Makefile.am | 3 +- cipher/chacha20-armv7-neon.S | 710 ++++++++++++++++++++++++++++++++++++++++++ cipher/chacha20.c | 34 ++ cipher/poly1305-armv7-neon.S | 705 +++++++++++++++++++++++++++++++++++++++++ cipher/poly1305-internal.h | 18 ++ cipher/poly1305.c | 23 ++ configure.ac | 10 + src/hwf-arm.c | 57 +++- 8 files changed, 1556 insertions(+), 4 deletions(-) create mode 100644 cipher/chacha20-armv7-neon.S create mode 100644 cipher/poly1305-armv7-neon.S hooks/post-receive -- The GNU crypto library http://git.gnupg.org _______________________________________________ Gnupg-commits mailing list Gnupg-commits at gnupg.org http://lists.gnupg.org/mailman/listinfo/gnupg-commits From gniibe at fsij.org Wed Nov 19 08:39:58 2014 From: gniibe at fsij.org (NIIBE Yutaka) Date: Wed, 19 Nov 2014 16:39:58 +0900 Subject: [PATCH] ecc: Improve Montgomery curve implementation Message-ID: <546C494E.9010503@fsij.org> Here is the change for Montgomery curve implementation. I forgot to submit this change in August. Adding test_ecdh_only_keys is needed when we will support encryption by Curve25519 in future. The changes in _gcry_mpi_ec_mul_point are to make sure resizing the MPI representation of points, and code clean up. OK to commit? ecc: Improve Montgomery curve implementation. 
* cipher/ecc-curves.c (_gcry_ecc_fill_in_curve): Support MPI_EC_MONTGOMERY. * cipher/ecc.c (test_ecdh_only_keys): New. (nist_generate_key): Call test_ecdh_only_keys for MPI_EC_MONTGOMERY. (check_secret_key): Handle Montgomery curve of x-coordinate only. * mpi/ec.c (_gcry_mpi_ec_mul_point): Resize points before the loop. Simplify, using pointers of Q1, Q2, PRD, and SUM. -- diff --git a/cipher/ecc-curves.c b/cipher/ecc-curves.c index fd47c1d..9975bb4 100644 --- a/cipher/ecc-curves.c +++ b/cipher/ecc-curves.c @@ -530,9 +530,8 @@ _gcry_ecc_fill_in_curve (unsigned int nbits, const char *name, { case MPI_EC_WEIERSTRASS: case MPI_EC_EDWARDS: - break; case MPI_EC_MONTGOMERY: - return GPG_ERR_NOT_SUPPORTED; + break; default: return GPG_ERR_BUG; } diff --git a/cipher/ecc.c b/cipher/ecc.c index 8bdbd56..2f5e401 100644 --- a/cipher/ecc.c +++ b/cipher/ecc.c @@ -81,6 +81,7 @@ static void *progress_cb_data; /* Local prototypes. */ static void test_keys (ECC_secret_key * sk, unsigned int nbits); +static void test_ecdh_only_keys (ECC_secret_key * sk, unsigned int nbits); static unsigned int ecc_get_nbits (gcry_sexp_t parms); @@ -209,7 +210,10 @@ nist_generate_key (ECC_secret_key *sk, elliptic_curve_t *E, mpi_ec_t ctx, point_free (&Q); /* Now we can test our keys (this should never fail!). 
*/ - test_keys (sk, nbits - 64); + if (sk->E.model != MPI_EC_MONTGOMERY) + test_keys (sk, nbits - 64); + else + test_ecdh_only_keys (sk, nbits - 64); return 0; } @@ -266,6 +270,80 @@ test_keys (ECC_secret_key *sk, unsigned int nbits) } +static void +test_ecdh_only_keys (ECC_secret_key *sk, unsigned int nbits) +{ + ECC_public_key pk; + gcry_mpi_t test; + mpi_point_struct R_; + gcry_mpi_t x0, x1; + mpi_ec_t ec; + + if (DBG_CIPHER) + log_debug ("Testing key.\n"); + + point_init (&R_); + + pk.E = _gcry_ecc_curve_copy (sk->E); + point_init (&pk.Q); + point_set (&pk.Q, &sk->Q); + + if (sk->E.dialect == ECC_DIALECT_ED25519) + { + char *rndbuf; + + test = mpi_new (256); + rndbuf = _gcry_random_bytes (32, GCRY_WEAK_RANDOM); + rndbuf[0] &= 0x7f; /* Clear bit 255. */ + rndbuf[0] |= 0x40; /* Set bit 254. */ + rndbuf[31] &= 0xf8; /* Clear bits 2..0 so that d mod 8 == 0 */ + _gcry_mpi_set_buffer (test, rndbuf, 32, 0); + xfree (rndbuf); + } + else + { + test = mpi_new (nbits); + _gcry_mpi_randomize (test, nbits, GCRY_WEAK_RANDOM); + } + + ec = _gcry_mpi_ec_p_internal_new (pk.E.model, pk.E.dialect, 0, + pk.E.p, pk.E.a, pk.E.b); + x0 = mpi_new (0); + x1 = mpi_new (0); + + /* R_ = hkQ <=> R_ = hkdG */ + _gcry_mpi_ec_mul_point (&R_, test, &pk.Q, ec); + if (sk->E.dialect != ECC_DIALECT_ED25519) + _gcry_mpi_ec_mul_point (&R_, ec->h, &R_, ec); + if (_gcry_mpi_ec_get_affine (x0, NULL, &R_, ec)) + log_fatal ("ecdh: Failed to get affine coordinates for hkQ\n"); + + _gcry_mpi_ec_mul_point (&R_, test, &pk.E.G, ec); + _gcry_mpi_ec_mul_point (&R_, sk->d, &R_, ec); + /* R_ = hdkG */ + if (sk->E.dialect != ECC_DIALECT_ED25519) + _gcry_mpi_ec_mul_point (&R_, ec->h, &R_, ec); + + if (_gcry_mpi_ec_get_affine (x1, NULL, &R_, ec)) + log_fatal ("ecdh: Failed to get affine coordinates for hdkG\n"); + + if (mpi_cmp (x0, x1)) + { + log_fatal ("ECDH test failed.\n"); + } + + mpi_free (x0); + mpi_free (x1); + _gcry_mpi_ec_free (ec); + + point_free (&pk.Q); + _gcry_ecc_curve_free (&pk.E); + + point_free 
(&R_); + mpi_free (test); +} + + /* * To check the validity of the value, recalculate the correspondence * between the public value and the secret one. @@ -281,7 +359,10 @@ check_secret_key (ECC_secret_key *sk, mpi_ec_t ec, int flags) point_init (&Q); x1 = mpi_new (0); - y1 = mpi_new (0); + if (ec->model == MPI_EC_MONTGOMERY) + y1 = NULL; /* Montgomery: x-coordinate only. */ + else + y1 = mpi_new (0); /* G in E(F_p) */ if (!_gcry_mpi_ec_curve_point (&sk->E.G, ec)) @@ -338,7 +419,7 @@ check_secret_key (ECC_secret_key *sk, mpi_ec_t ec, int flags) else if (!mpi_cmp_ui (sk->Q.z, 1)) { /* Fast path if Q is already in affine coordinates. */ - if (mpi_cmp (x1, sk->Q.x) || mpi_cmp (y1, sk->Q.y)) + if (mpi_cmp (x1, sk->Q.x) || (y1 && mpi_cmp (y1, sk->Q.y))) { if (DBG_CIPHER) log_debug @@ -1581,7 +1662,7 @@ compute_keygrip (gcry_md_hd_t md, gcry_sexp_t keyparms) char buf[30]; if (idx == 5) - continue; /* Skip cofactor. */ + continue; /* Skip cofactor. */ if (mpi_is_opaque (values[idx])) { diff --git a/mpi/ec.c b/mpi/ec.c index 80f3b22..0b7c7a7 100644 --- a/mpi/ec.c +++ b/mpi/ec.c @@ -1251,7 +1251,9 @@ _gcry_mpi_ec_mul_point (mpi_point_t result, unsigned int nbits; int j; mpi_point_struct p1_, p2_; + mpi_point_t q1, q2, prd, sum; unsigned long sw; + size_t nlimbs; /* Compute scalar point multiplication with Montgomery Ladder. Note that we don't use Y-coordinate in the points at all. 
@@ -1267,27 +1269,35 @@ _gcry_mpi_ec_mul_point (mpi_point_t result, p2.x = mpi_copy (point->x); mpi_set_ui (p2.z, 1); + nlimbs = 2*(nbits+BITS_PER_MPI_LIMB-1)/BITS_PER_MPI_LIMB+1; + mpi_resize (p1.x, nlimbs); + mpi_resize (p1.z, nlimbs); + mpi_resize (p2.x, nlimbs); + mpi_resize (p2.z, nlimbs); + mpi_resize (p1_.x, nlimbs); + mpi_resize (p1_.z, nlimbs); + mpi_resize (p2_.x, nlimbs); + mpi_resize (p2_.z, nlimbs); + + q1 = &p1; + q2 = &p2; + prd = &p1_; + sum = &p2_; + for (j=nbits-1; j >= 0; j--) { - sw = mpi_test_bit (scalar, j); - mpi_swap_cond (p1.x, p2.x, sw); - mpi_swap_cond (p1.z, p2.z, sw); - montgomery_ladder (&p1_, &p2_, &p1, &p2, point->x, ctx); - mpi_swap_cond (p1_.x, p2_.x, sw); - mpi_swap_cond (p1_.z, p2_.z, sw); - - if (--j < 0) - break; + mpi_point_t t; sw = mpi_test_bit (scalar, j); - mpi_swap_cond (p1_.x, p2_.x, sw); - mpi_swap_cond (p1_.z, p2_.z, sw); - montgomery_ladder (&p1, &p2, &p1_, &p2_, point->x, ctx); - mpi_swap_cond (p1.x, p2.x, sw); - mpi_swap_cond (p1.z, p2.z, sw); + mpi_swap_cond (q1->x, q2->x, sw); + mpi_swap_cond (q1->z, q2->z, sw); + montgomery_ladder (prd, sum, q1, q2, point->x, ctx); + mpi_swap_cond (prd->x, sum->x, sw); + mpi_swap_cond (prd->z, sum->z, sw); + t = q1; q1 = prd; prd = t; + t = q2; q2 = sum; sum = t; } - z1 = mpi_new (0); mpi_clear (result->y); sw = (nbits & 1); mpi_swap_cond (p1.x, p1_.x, sw); @@ -1300,12 +1310,13 @@ _gcry_mpi_ec_mul_point (mpi_point_t result, } else { + z1 = mpi_new (0); ec_invm (z1, p1.z, ctx); ec_mulm (result->x, p1.x, z1, ctx); mpi_set_ui (result->z, 1); + mpi_free (z1); } - mpi_free (z1); point_free (&p1); point_free (&p2); point_free (&p1_); -- From wk at gnupg.org Wed Nov 19 15:46:27 2014 From: wk at gnupg.org (Werner Koch) Date: Wed, 19 Nov 2014 15:46:27 +0100 Subject: [PATCH] ecc: Improve Montgomery curve implementation In-Reply-To: <546C494E.9010503@fsij.org> (NIIBE Yutaka's message of "Wed, 19 Nov 2014 16:39:58 +0900") References: <546C494E.9010503@fsij.org> Message-ID: 
<87fvdfnt8c.fsf@vigenere.g10code.de> On Wed, 19 Nov 2014 08:39, gniibe at fsij.org said: > Adding test_ecdh_only_keys is needed when we will support encryption > by Curve25519 in future. However, I doubt that the condition if (sk->E.dialect == ECC_DIALECT_ED25519) will ever be asserted. > OK to commit? Ack. Shalom-Salam, Werner -- Die Gedanken sind frei. Ausnahmen regelt ein Bundesgesetz. From cvs at cvs.gnupg.org Thu Nov 20 01:46:02 2014 From: cvs at cvs.gnupg.org (by NIIBE Yutaka) Date: Thu, 20 Nov 2014 01:46:02 +0100 Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.6.0-126-ge613003 Message-ID: This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "The GNU crypto library". The branch, master has been updated via e6130034506013d6153465a2bedb6fb08a43f74d (commit) from 95eef21583d8e998efc48f22898c1ae31b77cb48 (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. - Log ----------------------------------------------------------------- commit e6130034506013d6153465a2bedb6fb08a43f74d Author: NIIBE Yutaka Date: Wed Nov 19 15:48:12 2014 +0900 ecc: Improve Montgomery curve implementation. * cipher/ecc-curves.c (_gcry_ecc_fill_in_curve): Support MPI_EC_MONTGOMERY. * cipher/ecc.c (test_ecdh_only_keys): New. (nist_generate_key): Call test_ecdh_only_keys for MPI_EC_MONTGOMERY. (check_secret_key): Handle Montgomery curve of x-coordinate only. * mpi/ec.c (_gcry_mpi_ec_mul_point): Resize points before the loop. Simplify, using pointers of Q1, Q2, PRD, and SUM. 
-- diff --git a/cipher/ecc-curves.c b/cipher/ecc-curves.c index fd47c1d..9975bb4 100644 --- a/cipher/ecc-curves.c +++ b/cipher/ecc-curves.c @@ -530,9 +530,8 @@ _gcry_ecc_fill_in_curve (unsigned int nbits, const char *name, { case MPI_EC_WEIERSTRASS: case MPI_EC_EDWARDS: - break; case MPI_EC_MONTGOMERY: - return GPG_ERR_NOT_SUPPORTED; + break; default: return GPG_ERR_BUG; } diff --git a/cipher/ecc.c b/cipher/ecc.c index 8bdbd56..2f5e401 100644 --- a/cipher/ecc.c +++ b/cipher/ecc.c @@ -81,6 +81,7 @@ static void *progress_cb_data; /* Local prototypes. */ static void test_keys (ECC_secret_key * sk, unsigned int nbits); +static void test_ecdh_only_keys (ECC_secret_key * sk, unsigned int nbits); static unsigned int ecc_get_nbits (gcry_sexp_t parms); @@ -209,7 +210,10 @@ nist_generate_key (ECC_secret_key *sk, elliptic_curve_t *E, mpi_ec_t ctx, point_free (&Q); /* Now we can test our keys (this should never fail!). */ - test_keys (sk, nbits - 64); + if (sk->E.model != MPI_EC_MONTGOMERY) + test_keys (sk, nbits - 64); + else + test_ecdh_only_keys (sk, nbits - 64); return 0; } @@ -266,6 +270,80 @@ test_keys (ECC_secret_key *sk, unsigned int nbits) } +static void +test_ecdh_only_keys (ECC_secret_key *sk, unsigned int nbits) +{ + ECC_public_key pk; + gcry_mpi_t test; + mpi_point_struct R_; + gcry_mpi_t x0, x1; + mpi_ec_t ec; + + if (DBG_CIPHER) + log_debug ("Testing key.\n"); + + point_init (&R_); + + pk.E = _gcry_ecc_curve_copy (sk->E); + point_init (&pk.Q); + point_set (&pk.Q, &sk->Q); + + if (sk->E.dialect == ECC_DIALECT_ED25519) + { + char *rndbuf; + + test = mpi_new (256); + rndbuf = _gcry_random_bytes (32, GCRY_WEAK_RANDOM); + rndbuf[0] &= 0x7f; /* Clear bit 255. */ + rndbuf[0] |= 0x40; /* Set bit 254. 
*/ + rndbuf[31] &= 0xf8; /* Clear bits 2..0 so that d mod 8 == 0 */ + _gcry_mpi_set_buffer (test, rndbuf, 32, 0); + xfree (rndbuf); + } + else + { + test = mpi_new (nbits); + _gcry_mpi_randomize (test, nbits, GCRY_WEAK_RANDOM); + } + + ec = _gcry_mpi_ec_p_internal_new (pk.E.model, pk.E.dialect, 0, + pk.E.p, pk.E.a, pk.E.b); + x0 = mpi_new (0); + x1 = mpi_new (0); + + /* R_ = hkQ <=> R_ = hkdG */ + _gcry_mpi_ec_mul_point (&R_, test, &pk.Q, ec); + if (sk->E.dialect != ECC_DIALECT_ED25519) + _gcry_mpi_ec_mul_point (&R_, ec->h, &R_, ec); + if (_gcry_mpi_ec_get_affine (x0, NULL, &R_, ec)) + log_fatal ("ecdh: Failed to get affine coordinates for hkQ\n"); + + _gcry_mpi_ec_mul_point (&R_, test, &pk.E.G, ec); + _gcry_mpi_ec_mul_point (&R_, sk->d, &R_, ec); + /* R_ = hdkG */ + if (sk->E.dialect != ECC_DIALECT_ED25519) + _gcry_mpi_ec_mul_point (&R_, ec->h, &R_, ec); + + if (_gcry_mpi_ec_get_affine (x1, NULL, &R_, ec)) + log_fatal ("ecdh: Failed to get affine coordinates for hdkG\n"); + + if (mpi_cmp (x0, x1)) + { + log_fatal ("ECDH test failed.\n"); + } + + mpi_free (x0); + mpi_free (x1); + _gcry_mpi_ec_free (ec); + + point_free (&pk.Q); + _gcry_ecc_curve_free (&pk.E); + + point_free (&R_); + mpi_free (test); +} + + /* * To check the validity of the value, recalculate the correspondence * between the public value and the secret one. @@ -281,7 +359,10 @@ check_secret_key (ECC_secret_key *sk, mpi_ec_t ec, int flags) point_init (&Q); x1 = mpi_new (0); - y1 = mpi_new (0); + if (ec->model == MPI_EC_MONTGOMERY) + y1 = NULL; + else + y1 = mpi_new (0); /* G in E(F_p) */ if (!_gcry_mpi_ec_curve_point (&sk->E.G, ec)) @@ -338,7 +419,7 @@ check_secret_key (ECC_secret_key *sk, mpi_ec_t ec, int flags) else if (!mpi_cmp_ui (sk->Q.z, 1)) { /* Fast path if Q is already in affine coordinates. 
*/ - if (mpi_cmp (x1, sk->Q.x) || mpi_cmp (y1, sk->Q.y)) + if (mpi_cmp (x1, sk->Q.x) || (!y1 && mpi_cmp (y1, sk->Q.y))) { if (DBG_CIPHER) log_debug @@ -1581,7 +1662,7 @@ compute_keygrip (gcry_md_hd_t md, gcry_sexp_t keyparms) char buf[30]; if (idx == 5) - continue; /* Skip cofactor. */ + continue; /* Skip cofactor. */ if (mpi_is_opaque (values[idx])) { diff --git a/mpi/ec.c b/mpi/ec.c index 80f3b22..0b7c7a7 100644 --- a/mpi/ec.c +++ b/mpi/ec.c @@ -1251,7 +1251,9 @@ _gcry_mpi_ec_mul_point (mpi_point_t result, unsigned int nbits; int j; mpi_point_struct p1_, p2_; + mpi_point_t q1, q2, prd, sum; unsigned long sw; + size_t nlimbs; /* Compute scalar point multiplication with Montgomery Ladder. Note that we don't use Y-coordinate in the points at all. @@ -1267,27 +1269,35 @@ _gcry_mpi_ec_mul_point (mpi_point_t result, p2.x = mpi_copy (point->x); mpi_set_ui (p2.z, 1); + nlimbs = 2*(nbits+BITS_PER_MPI_LIMB-1)/BITS_PER_MPI_LIMB+1; + mpi_resize (p1.x, nlimbs); + mpi_resize (p1.z, nlimbs); + mpi_resize (p2.x, nlimbs); + mpi_resize (p2.z, nlimbs); + mpi_resize (p1_.x, nlimbs); + mpi_resize (p1_.z, nlimbs); + mpi_resize (p2_.x, nlimbs); + mpi_resize (p2_.z, nlimbs); + + q1 = &p1; + q2 = &p2; + prd = &p1_; + sum = &p2_; + for (j=nbits-1; j >= 0; j--) { - sw = mpi_test_bit (scalar, j); - mpi_swap_cond (p1.x, p2.x, sw); - mpi_swap_cond (p1.z, p2.z, sw); - montgomery_ladder (&p1_, &p2_, &p1, &p2, point->x, ctx); - mpi_swap_cond (p1_.x, p2_.x, sw); - mpi_swap_cond (p1_.z, p2_.z, sw); - - if (--j < 0) - break; + mpi_point_t t; sw = mpi_test_bit (scalar, j); - mpi_swap_cond (p1_.x, p2_.x, sw); - mpi_swap_cond (p1_.z, p2_.z, sw); - montgomery_ladder (&p1, &p2, &p1_, &p2_, point->x, ctx); - mpi_swap_cond (p1.x, p2.x, sw); - mpi_swap_cond (p1.z, p2.z, sw); + mpi_swap_cond (q1->x, q2->x, sw); + mpi_swap_cond (q1->z, q2->z, sw); + montgomery_ladder (prd, sum, q1, q2, point->x, ctx); + mpi_swap_cond (prd->x, sum->x, sw); + mpi_swap_cond (prd->z, sum->z, sw); + t = q1; q1 = prd; prd = t; + t 
= q2; q2 = sum; sum = t; } - z1 = mpi_new (0); mpi_clear (result->y); sw = (nbits & 1); mpi_swap_cond (p1.x, p1_.x, sw); @@ -1300,12 +1310,13 @@ _gcry_mpi_ec_mul_point (mpi_point_t result, } else { + z1 = mpi_new (0); ec_invm (z1, p1.z, ctx); ec_mulm (result->x, p1.x, z1, ctx); mpi_set_ui (result->z, 1); + mpi_free (z1); } - mpi_free (z1); point_free (&p1); point_free (&p2); point_free (&p1_); ----------------------------------------------------------------------- Summary of changes: cipher/ecc-curves.c | 3 +- cipher/ecc.c | 89 ++++++++++++++++++++++++++++++++++++++++++++++++--- mpi/ec.c | 43 ++++++++++++++++--------- 3 files changed, 113 insertions(+), 22 deletions(-) hooks/post-receive -- The GNU crypto library http://git.gnupg.org _______________________________________________ Gnupg-commits mailing list Gnupg-commits at gnupg.org http://lists.gnupg.org/mailman/listinfo/gnupg-commits From gniibe at fsij.org Thu Nov 20 02:21:40 2014 From: gniibe at fsij.org (NIIBE Yutaka) Date: Thu, 20 Nov 2014 10:21:40 +0900 Subject: [PATCH] ecc: Improve Montgomery curve implementation In-Reply-To: <87fvdfnt8c.fsf@vigenere.g10code.de> References: <546C494E.9010503@fsij.org> <87fvdfnt8c.fsf@vigenere.g10code.de> Message-ID: <546D4224.2020309@fsij.org> Pushed. On 11/19/2014 11:46 PM, Werner Koch wrote: > However, I doubt that the condition > > if (sk->E.dialect == ECC_DIALECT_ED25519) > > will ever be asserted. Err... let me explain. It would be abuse to put another meaning on ECC_DIALECT_ED25519, but I wanted to say something like: "If it's computation for DJB's curve,..." It corresponds to the condition in the function nist_generate_key. I thought that (ab)using ECC_DIALECT_ED25519, the Montgomery curve could be defined as: diff --git a/cipher/ecc-curves.c b/cipher/ecc-curves.c index fd47c1d..5d855bd 100644 --- a/cipher/ecc-curves.c +++ b/cipher/ecc-curves.c @@ -40,7 +40,7 @@ static const struct const char *other; /* Other name. 
*/ } curve_aliases[] = { - /*{ "Curve25519", "1.3.6.1.4.1.3029.1.5.1" },*/ + { "Curve25519", "1.3.6.1.4.1.3029.1.5.1" }, { "Ed25519", "1.3.6.1.4.1.11591.15.1" }, { "NIST P-192", "1.2.840.10045.3.1.1" }, /* X9.62 OID */ @@ -129,6 +129,18 @@ static const ecc_domain_parms_t domain_parms[] = "0x6666666666666666666666666666666666666666666666666666666666666658", "0x08" }, + { + /* (y^2 = x^3 + 486662*x^2 + x) */ + "Curve25519", 256, 0, + MPI_EC_MONTGOMERY, ECC_DIALECT_ED25519, + "0x7FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFED", + "0x01DB41", + "0x01", + "0x1000000000000000000000000000000014DEF9DEA2F79CD65812631A5CF5D3ED", + "0x0000000000000000000000000000000000000000000000000000000000000009", + "0x20AE19A1B8A086B4E01EDD2C7748D14C923D4D7E6D7C61B229E9C5A27ECED3D9", + "0x08" + }, #if 0 /* No real specs yet found. */ { /* x^2 + y^2 = 1 + 3617x^2y^2 mod 2^414 - 17 */ ================================= When the cofactor h != 1, we compute hkdG for the shared point, according to 'cofactor Elliptic Curve Diffie-Hellman' defined in SP800-56A. On the other hand, for the computation of Curve25519, we make sure the value of k (in the variable test in test_ecdh_only_keys) is a multiple of 8 (== cofactor), in advance. Should we have another ECC_DIALECT_??? ? Well, for completeness, we should implement 'cofactor Elliptic Curve Diffie-Hellman' in ecc_encrypt_raw/ecc_decrypt_raw to handle cofactor correctly, but this is another story. Currently, since the only curve with h != 1 is Curve25519, there is no problem. -- From cvs at cvs.gnupg.org Mon Nov 24 12:32:20 2014 From: cvs at cvs.gnupg.org (by Werner Koch) Date: Mon, 24 Nov 2014 12:32:20 +0100 Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.6.0-128-gd53ea84 Message-ID: This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "The GNU crypto library".
The branch, master has been updated via d53ea84bed37b973f7ce59262c50b33700cd8311 (commit) via 1b4210c204a5ef5e631187509e011b8468a134ef (commit) from e6130034506013d6153465a2bedb6fb08a43f74d (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. - Log ----------------------------------------------------------------- commit d53ea84bed37b973f7ce59262c50b33700cd8311 Author: Werner Koch Date: Mon Nov 24 12:28:33 2014 +0100 Remove duplicated prototypes. * src/gcrypt-int.h (_gcry_mpi_ec_new, _gcry_mpi_ec_set_mpi) (gcry_mpi_ec_set_point): Remove. -- Those used gpg_error_t instead of gpg_err_code_t and the picky AIX compiler takes this as a severe error. Signed-off-by: Werner Koch diff --git a/src/gcrypt-int.h b/src/gcrypt-int.h index 918937b..29d4fd3 100644 --- a/src/gcrypt-int.h +++ b/src/gcrypt-int.h @@ -416,15 +416,10 @@ gcry_mpi_point_t _gcry_mpi_point_set (gcry_mpi_point_t point, gcry_mpi_point_t _gcry_mpi_point_snatch_set (gcry_mpi_point_t point, gcry_mpi_t x, gcry_mpi_t y, gcry_mpi_t z); -gpg_error_t _gcry_mpi_ec_new (gcry_ctx_t *r_ctx, - gcry_sexp_t keyparam, const char *curvename); + gcry_mpi_t _gcry_mpi_ec_get_mpi (const char *name, gcry_ctx_t ctx, int copy); gcry_mpi_point_t _gcry_mpi_ec_get_point (const char *name, gcry_ctx_t ctx, int copy); -gpg_error_t _gcry_mpi_ec_set_mpi (const char *name, gcry_mpi_t newvalue, - gcry_ctx_t ctx); -gpg_error_t _gcry_mpi_ec_set_point (const char *name, gcry_mpi_point_t newvalue, - gcry_ctx_t ctx); int _gcry_mpi_ec_get_affine (gcry_mpi_t x, gcry_mpi_t y, gcry_mpi_point_t point, mpi_ec_t ctx); void _gcry_mpi_ec_dup (gcry_mpi_point_t w, gcry_mpi_point_t u, gcry_ctx_t ctx); commit 1b4210c204a5ef5e631187509e011b8468a134ef Author: Werner Koch Date: Tue Oct 14 21:29:33 2014 +0200 tests: Add a prime mode to benchmark. * tests/benchmark.c (progress_cb): Add a single char mode. (prime_bench): New. (main): Add a "prime" mode.
Factor with_progress out to file scope. Signed-off-by: Werner Koch diff --git a/tests/benchmark.c b/tests/benchmark.c index 2621551..5bf92da 100644 --- a/tests/benchmark.c +++ b/tests/benchmark.c @@ -62,6 +62,12 @@ static int in_fips_mode; /* Whether we are running as part of the regression test suite. */ static int in_regression_test; +/* Whether --progress is in use. */ +static int with_progress; + +/* Runtime flag to switch to a different progress output. */ +static int single_char_progress; + static const char sample_private_dsa_key_1024[] = "(private-key\n" @@ -429,9 +435,17 @@ progress_cb (void *cb_data, const char *what, int printchar, { (void)cb_data; - fprintf (stderr, PGM ": progress (%s %c %d %d)\n", - what, printchar, current, total); - fflush (stderr); + if (single_char_progress) + { + fputc (printchar, stdout); + fflush (stderr); + } + else + { + fprintf (stderr, PGM ": progress (%s %c %d %d)\n", + what, printchar, current, total); + fflush (stderr); + } } @@ -1544,6 +1558,51 @@ mpi_bench (void) } +static void +prime_bench (void) +{ + gpg_error_t err; + int i; + gcry_mpi_t prime; + int old_prog = single_char_progress; + + single_char_progress = 1; + if (!with_progress) + printf ("%-10s", "prime"); + fflush (stdout); + start_timer (); + for (i=0; i < 10; i++) + { + if (with_progress) + fputs ("primegen ", stdout); + err = gcry_prime_generate (&prime, + 1024, 0, + NULL, + NULL, NULL, + GCRY_WEAK_RANDOM, + GCRY_PRIME_FLAG_SECRET); + if (with_progress) + { + fputc ('\n', stdout); + fflush (stdout); + } + if (err) + { + fprintf (stderr, PGM ": error creating prime: %s\n", + gpg_strerror (err)); + exit (1); + } + gcry_mpi_release (prime); + } + stop_timer (); + if (with_progress) + printf ("%-10s", "prime"); + printf (" %s\n", elapsed_time ()); fflush (stdout); + + single_char_progress = old_prog; +} + + int main( int argc, char **argv ) { @@ -1551,7 +1610,6 @@ main( int argc, char **argv ) int no_blinding = 0; int use_random_daemon = 0; int use_secmem = 0; 
- int with_progress = 0; int debug = 0; int pk_count = 100; @@ -1582,7 +1640,7 @@ main( int argc, char **argv ) else if (!strcmp (*argv, "--help")) { fputs ("usage: benchmark " - "[md|mac|cipher|random|mpi|rsa|dsa|ecc [algonames]]\n", + "[md|mac|cipher|random|mpi|rsa|dsa|ecc|prime [algonames]]\n", stdout); exit (0); } @@ -1833,6 +1891,11 @@ main( int argc, char **argv ) gcry_control (GCRYCTL_ENABLE_QUICK_RANDOM, 0); ecc_bench (pk_count, 1); } + else if ( !strcmp (*argv, "prime")) + { + gcry_control (GCRYCTL_ENABLE_QUICK_RANDOM, 0); + prime_bench (); + } else { fprintf (stderr, PGM ": bad arguments\n"); ----------------------------------------------------------------------- Summary of changes: src/gcrypt-int.h | 7 +---- tests/benchmark.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 69 insertions(+), 11 deletions(-) hooks/post-receive -- The GNU crypto library http://git.gnupg.org _______________________________________________ Gnupg-commits mailing list Gnupg-commits at gnupg.org http://lists.gnupg.org/mailman/listinfo/gnupg-commits From jan.bilek at eftlab.co.uk Fri Nov 28 02:10:20 2014 From: jan.bilek at eftlab.co.uk (Jan Bilek) Date: Fri, 28 Nov 2014 11:10:20 +1000 Subject: AES192 & AES256 in CBC mode [libgcrypt] Message-ID: <5477CB7C.6020507@eftlab.co.uk> Hello, I've just bounced in a potential problem with libgcrypt while trying to do AES192 & AES256 in CBC mode. All works well with AES128 for all cipher modes, however when moving to AES192 & AES256 and GCRY_CIPHER_MODE_CBC it looks like all buffers are being written just in first 128 bits of output. Please see example code attached. Let me know if you'll be able to confirm that and if confirmed if I may help with fixing it. Thank you & Kind Regards, Jan Jan Bilek CTO, EFTlab Pty Ltd email:jan.bilek at eftlab.co.uk mob: +61 (0) 498 103 179 This message contains confidential information and is intended only for the addressee(s). 
E-mail transmission cannot be guaranteed to be secure or error-free as information could be intercepted, corrupted, lost, destroyed, arrive late or incomplete, or contain viruses. EFTlab Ltd cannot accept liability for any errors or omissions in the contents of this message, which may arise as a result of e-mail transmission. Please note that EFTlab Ltd may monitor, analyse and archive email traffic, data and the content of email for the purposes of security, legal compliance and staff training. If you have received this email in error please notify us at support at eftlab.co.uk. EFTlab is a limited company registered in England & Wales with Reg No. 07528943. The Registered Office is 21-27 Lamb's Conduit Street, London, WC1N 3GS. -------------- next part -------------- An HTML attachment was scrubbed... URL: -------------- next part -------------- A non-text attachment was scrubbed... Name: crypto_aes.cpp Type: text/x-c++src Size: 6065 bytes Desc: not available URL: -------------- next part -------------- A non-text attachment was scrubbed... Name: CryptoAes_unittest.cpp Type: text/x-c++src Size: 10065 bytes Desc: not available URL: From jussi.kivilinna at iki.fi Sun Nov 30 10:07:16 2014 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sun, 30 Nov 2014 11:07:16 +0200 Subject: AES192 & AES256 in CBC mode [libgcrypt] In-Reply-To: <5477CB7C.6020507@eftlab.co.uk> References: <5477CB7C.6020507@eftlab.co.uk> Message-ID: <547ADE44.2020907@iki.fi> Hello, On 28.11.2014 03:10, Jan Bilek wrote:> Hello, > > I've just bounced in a potential problem with libgcrypt while trying to do AES192 & AES256 in CBC mode. > > All works well with AES128 for all cipher modes, however when moving to AES192 & AES256 and GCRY_CIPHER_MODE_CBC it looks like all buffers are being written just in first 128 bits of output. > > Please see example code attached. > > Let me know if you'll be able to confirm that and if confirmed if I may help with fixing it. 
I added new CBC test vectors to tests/basic.c from RFC 3602 and NIST SP800-38A, which did work fine. Then I tried adding the test vectors from example code that you provided and those tests fail (AES128, 192 and 256). check-cbc, encrypt mismatch entry 4:0 expected: 72 de 8d 00 17 e1 4a a3 5c 87 16 b3 33 55 92 5c b8 ef fc b9 d8 aa 32 29 8a 76 6a 62 7c fc 29 52 e0 34 c8 72 32 91 0a 8d 18 93 13 48 fc af 45 24 01 1a 61 9d 6c 61 a2 38 5f c7 9d ce 7f 92 ee b8 computed: 72 de 8d 00 17 e1 4a a3 5c 87 16 b3 33 55 92 5c b8 ef fc b9 d8 aa 32 29 8a 76 6a 62 7c fc 29 52 01 1a 61 9d 6c 61 a2 38 5f c7 9d ce 7f 92 ee b8 e0 34 c8 72 32 91 0a 8d 18 93 13 48 fc af 45 24 xor-diff: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 e1 2e a9 ef 5e f0 a8 b5 47 54 8e 86 83 3d ab 9c e1 2e a9 ef 5e f0 a8 b5 47 54 8e 86 83 3d ab 9c check-cbc, encrypt mismatch entry 5:0 expected: 12 e9 3d 3c b0 6d 13 44 e7 b9 5c eb b6 88 ec ba 39 f3 b0 21 d7 a2 45 f0 9e 24 ed e1 2c 4f f6 a3 80 f9 f7 b7 3c f6 dc 46 3e e5 3a 7b d1 e6 1d 2e bb b1 38 b2 aa 22 7b 81 18 35 e0 a7 c8 20 53 f6 computed: fd 51 b0 e6 d2 98 09 85 4f 7a c7 a7 d6 97 c2 62 72 a2 d8 bb 5f 0f 43 5e 05 33 59 ef c7 4d f9 83 16 05 2c 29 91 55 1c 58 b9 d9 7d ec 5d dd eb 4c 7a 97 e1 a5 b5 04 60 59 6c 25 62 e5 50 1d 72 33 xor-diff: ef b8 8d da 62 f5 1a c1 a8 c3 9b 4c 60 1f 2e d8 4b 51 68 9a 88 ad 06 ae 9b 17 b4 0e eb 02 0f 20 96 fc db 9e ad a3 c0 1e 87 3c 47 97 8c 3b f6 62 c1 26 d9 17 1f 26 1b d8 74 10 82 42 98 3d 21 c5 check-cbc, encrypt mismatch entry 6:0 expected: 4a d8 4c 9f 32 dc 6a 95 63 49 a3 d3 cc 30 88 96 4b 5c aa f0 30 51 f1 9f 1e a9 11 71 52 89 46 0a 4c da 68 d7 17 6b a4 ed e0 4d 1b 69 c8 fb 54 f2 46 6a 14 7d 9a 4a d5 c4 bc 55 15 d4 1b d4 ac 7d computed: b2 1f 9d bf bf fe c3 4f b0 98 fe f2 0b fc 8e 44 d1 ac 48 e0 d9 1c b4 17 e1 32 19 05 80 f8 e9 da 60 fb 53 97 4a 6c 79 46 2a 7d 91 7c 87 71 af a7 87 44 37 05 2b 02 c7 63 52 b3 53 a3 a3 cf 8f 75 xor-diff: f8 c7 d1 20 8d 22 a9 da d3 d1 5d 21 c7 cc 
06 d2 9a f0 e2 10 e9 4d 45 88 ff 9b 08 74 d2 71 af d0 2c 21 3b 40 5d 07 dd ab ca 30 8a 15 4f 8a fb 55 c1 2e 23 78 b1 48 12 a7 ee e6 46 77 b8 1b 23 08 Patch with these tests for libgcrypt/master attached. -Jussi > > Thank you & Kind Regards, > Jan > > Jan Bilek > CTO, EFTlab Pty Ltd > email: jan.bilek at eftlab.co.uk > mob: +61 (0) 498 103 179 > > This message contains confidential information and is intended only for the addressee(s). E-mail transmission cannot be guaranteed to be secure or error-free as information could be intercepted, corrupted, lost, destroyed, arrive late or incomplete, or contain viruses. EFTlab Ltd cannot accept liability for any errors or omissions in the contents of this message, which may arise as a result of e-mail transmission. Please note that EFTlab Ltd may monitor, analyse and archive email traffic, data and the content of email for the purposes of security, legal compliance and staff training. If you have received this email in error please notify us at support at eftlab.co.uk. EFTlab is a limited company registered in England & Wales with Reg No. 07528943. The Registered Office is 21-27 Lamb's Conduit Street, London, WC1N 3GS. > > > _______________________________________________ > Gcrypt-devel mailing list > Gcrypt-devel at gnupg.org > http://lists.gnupg.org/mailman/listinfo/gcrypt-devel > -------------- next part -------------- A non-text attachment was scrubbed... Name: 03-cbc-tests.patch Type: text/x-patch Size: 12728 bytes Desc: not available URL: -------------- next part -------------- A non-text attachment was scrubbed... 
Name: signature.asc Type: application/pgp-signature Size: 648 bytes Desc: OpenPGP digital signature URL: From jan.bilek at eftlab.co.uk Sun Nov 30 14:18:42 2014 From: jan.bilek at eftlab.co.uk (Jan Bilek) Date: Sun, 30 Nov 2014 23:18:42 +1000 Subject: AES192 & AES256 in CBC mode [libgcrypt] In-Reply-To: <547B154D.70802@eftlab.co.uk> References: <5477CB7C.6020507@eftlab.co.uk> <547ADE44.2020907@iki.fi> <547B154D.70802@eftlab.co.uk> Message-ID: <547B1932.30701@eftlab.co.uk> Resending as previous email seemed to be bounced back by the http://www.dnsbl.manitu.net/'s spam filter. Kind Regards, Jan On 30/11/14 23:02, Jan Bilek wrote: > Hello Jussi, > > thanks for this, however I'm not sure how to interpret your email. > Does it mean that my test vectors are wrong? I calculated those "in > hand" so this might be the case, but then my understanding of CBC is > incorrect. > > Thank you, > Jan > > On 30/11/14 19:07, Jussi Kivilinna wrote: >> Hello, >> >> On 28.11.2014 03:10, Jan Bilek wrote:> Hello, >>> I've just bounced in a potential problem with libgcrypt while trying >>> to do AES192 & AES256 in CBC mode. >>> >>> All works well with AES128 for all cipher modes, however when moving >>> to AES192 & AES256 and GCRY_CIPHER_MODE_CBC it looks like all >>> buffers are being written just in first 128 bits of output. >>> >>> Please see example code attached. >>> >>> Let me know if you'll be able to confirm that and if confirmed if I >>> may help with fixing it. >> I added new CBC test vectors to tests/basic.c from RFC 3602 and NIST >> SP800-38A, which did work fine. Then I tried adding the test vectors >> from example code that you provided and those tests fail (AES128, 192 >> and 256). 
>> >> check-cbc, encrypt mismatch entry 4:0 >> expected: 72 de 8d 00 17 e1 4a a3 5c 87 16 b3 33 55 92 5c b8 ef fc b9 >> d8 aa 32 29 8a 76 6a 62 7c fc 29 52 e0 34 c8 72 32 91 0a 8d 18 93 13 >> 48 fc af 45 24 01 1a 61 9d 6c 61 a2 38 5f c7 9d ce 7f 92 ee b8 >> computed: 72 de 8d 00 17 e1 4a a3 5c 87 16 b3 33 55 92 5c b8 ef fc b9 >> d8 aa 32 29 8a 76 6a 62 7c fc 29 52 01 1a 61 9d 6c 61 a2 38 5f c7 9d >> ce 7f 92 ee b8 e0 34 c8 72 32 91 0a 8d 18 93 13 48 fc af 45 24 >> xor-diff: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 >> 00 00 00 00 00 00 00 00 00 00 00 00 e1 2e a9 ef 5e f0 a8 b5 47 54 8e >> 86 83 3d ab 9c e1 2e a9 ef 5e f0 a8 b5 47 54 8e 86 83 3d ab 9c >> check-cbc, encrypt mismatch entry 5:0 >> expected: 12 e9 3d 3c b0 6d 13 44 e7 b9 5c eb b6 88 ec ba 39 f3 b0 21 >> d7 a2 45 f0 9e 24 ed e1 2c 4f f6 a3 80 f9 f7 b7 3c f6 dc 46 3e e5 3a >> 7b d1 e6 1d 2e bb b1 38 b2 aa 22 7b 81 18 35 e0 a7 c8 20 53 f6 >> computed: fd 51 b0 e6 d2 98 09 85 4f 7a c7 a7 d6 97 c2 62 72 a2 d8 bb >> 5f 0f 43 5e 05 33 59 ef c7 4d f9 83 16 05 2c 29 91 55 1c 58 b9 d9 7d >> ec 5d dd eb 4c 7a 97 e1 a5 b5 04 60 59 6c 25 62 e5 50 1d 72 33 >> xor-diff: ef b8 8d da 62 f5 1a c1 a8 c3 9b 4c 60 1f 2e d8 4b 51 68 9a >> 88 ad 06 ae 9b 17 b4 0e eb 02 0f 20 96 fc db 9e ad a3 c0 1e 87 3c 47 >> 97 8c 3b f6 62 c1 26 d9 17 1f 26 1b d8 74 10 82 42 98 3d 21 c5 >> check-cbc, encrypt mismatch entry 6:0 >> expected: 4a d8 4c 9f 32 dc 6a 95 63 49 a3 d3 cc 30 88 96 4b 5c aa f0 >> 30 51 f1 9f 1e a9 11 71 52 89 46 0a 4c da 68 d7 17 6b a4 ed e0 4d 1b >> 69 c8 fb 54 f2 46 6a 14 7d 9a 4a d5 c4 bc 55 15 d4 1b d4 ac 7d >> computed: b2 1f 9d bf bf fe c3 4f b0 98 fe f2 0b fc 8e 44 d1 ac 48 e0 >> d9 1c b4 17 e1 32 19 05 80 f8 e9 da 60 fb 53 97 4a 6c 79 46 2a 7d 91 >> 7c 87 71 af a7 87 44 37 05 2b 02 c7 63 52 b3 53 a3 a3 cf 8f 75 >> xor-diff: f8 c7 d1 20 8d 22 a9 da d3 d1 5d 21 c7 cc 06 d2 9a f0 e2 10 >> e9 4d 45 88 ff 9b 08 74 d2 71 af d0 2c 21 3b 40 5d 07 dd ab ca 30 8a >> 15 4f 8a fb 55 c1 2e 23 78 b1 48 12 
a7 ee e6 46 77 b8 1b 23 08 >> >> Patch with these tests for libgcrypt/master attached. >> >> -Jussi >> >>> Thank you & Kind Regards, >>> Jan >>> >>> Jan Bilek >>> CTO, EFTlab Pty Ltd >>> email: jan.bilek at eftlab.co.uk >>> mob: +61 (0) 498 103 179 >>> >>> This message contains confidential information and is intended only >>> for the addressee(s). E-mail transmission cannot be guaranteed to be >>> secure or error-free as information could be intercepted, corrupted, >>> lost, destroyed, arrive late or incomplete, or contain viruses. >>> EFTlab Ltd cannot accept liability for any errors or omissions in >>> the contents of this message, which may arise as a result of e-mail >>> transmission. Please note that EFTlab Ltd may monitor, analyse and >>> archive email traffic, data and the content of email for the >>> purposes of security, legal compliance and staff training. If you >>> have received this email in error please notify us at >>> support at eftlab.co.uk. EFTlab is a limited company registered in >>> England & Wales with Reg No. 07528943. The Registered Office is >>> 21-27 Lamb's Conduit Street, London, WC1N 3GS. >>> >>> >>> _______________________________________________ >>> Gcrypt-devel mailing list >>> Gcrypt-devel at gnupg.org >>> http://lists.gnupg.org/mailman/listinfo/gcrypt-devel >>> > From jan.bilek at eftlab.co.uk Sun Nov 30 17:26:46 2014 From: jan.bilek at eftlab.co.uk (Jan Bilek) Date: Mon, 01 Dec 2014 02:26:46 +1000 Subject: AES192 & AES256 in CBC mode [libgcrypt] In-Reply-To: <547B1932.30701@eftlab.co.uk> References: <5477CB7C.6020507@eftlab.co.uk> <547ADE44.2020907@iki.fi> <547B154D.70802@eftlab.co.uk> <547B1932.30701@eftlab.co.uk> Message-ID: <547B4546.2010904@eftlab.co.uk> Hi, problem found and fixed between the chair and keyboard on my side. Thank you for your support & Kind Regards, Jan On 30/11/14 23:18, Jan Bilek wrote: > Resending as previous email seemed to be bounced back by the > http://www.dnsbl.manitu.net/'s spam filter. 
> > Kind Regards, > Jan > > On 30/11/14 23:02, Jan Bilek wrote: >> Hello Jussi, >> >> thanks for this, however I'm not sure how to interpret your email. >> Does it mean that my test vectors are wrong? I calculated those "in >> hand" so this might be the case, but then my understanding of CBC is >> incorrect. >> >> Thank you, >> Jan >> >> On 30/11/14 19:07, Jussi Kivilinna wrote: >>> Hello, >>> >>> On 28.11.2014 03:10, Jan Bilek wrote:> Hello, >>>> I've just bounced in a potential problem with libgcrypt while >>>> trying to do AES192 & AES256 in CBC mode. >>>> >>>> All works well with AES128 for all cipher modes, however when >>>> moving to AES192 & AES256 and GCRY_CIPHER_MODE_CBC it looks like >>>> all buffers are being written just in first 128 bits of output. >>>> >>>> Please see example code attached. >>>> >>>> Let me know if you'll be able to confirm that and if confirmed if I >>>> may help with fixing it. >>> I added new CBC test vectors to tests/basic.c from RFC 3602 and NIST >>> SP800-38A, which did work fine. Then I tried adding the test vectors >>> from example code that you provided and those tests fail (AES128, >>> 192 and 256). 
>>> >>> check-cbc, encrypt mismatch entry 4:0 >>> expected: 72 de 8d 00 17 e1 4a a3 5c 87 16 b3 33 55 92 5c b8 ef fc >>> b9 d8 aa 32 29 8a 76 6a 62 7c fc 29 52 e0 34 c8 72 32 91 0a 8d 18 93 >>> 13 48 fc af 45 24 01 1a 61 9d 6c 61 a2 38 5f c7 9d ce 7f 92 ee b8 >>> computed: 72 de 8d 00 17 e1 4a a3 5c 87 16 b3 33 55 92 5c b8 ef fc >>> b9 d8 aa 32 29 8a 76 6a 62 7c fc 29 52 01 1a 61 9d 6c 61 a2 38 5f c7 >>> 9d ce 7f 92 ee b8 e0 34 c8 72 32 91 0a 8d 18 93 13 48 fc af 45 24 >>> xor-diff: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 >>> 00 00 00 00 00 00 00 00 00 00 00 00 00 e1 2e a9 ef 5e f0 a8 b5 47 54 >>> 8e 86 83 3d ab 9c e1 2e a9 ef 5e f0 a8 b5 47 54 8e 86 83 3d ab 9c >>> check-cbc, encrypt mismatch entry 5:0 >>> expected: 12 e9 3d 3c b0 6d 13 44 e7 b9 5c eb b6 88 ec ba 39 f3 b0 >>> 21 d7 a2 45 f0 9e 24 ed e1 2c 4f f6 a3 80 f9 f7 b7 3c f6 dc 46 3e e5 >>> 3a 7b d1 e6 1d 2e bb b1 38 b2 aa 22 7b 81 18 35 e0 a7 c8 20 53 f6 >>> computed: fd 51 b0 e6 d2 98 09 85 4f 7a c7 a7 d6 97 c2 62 72 a2 d8 >>> bb 5f 0f 43 5e 05 33 59 ef c7 4d f9 83 16 05 2c 29 91 55 1c 58 b9 d9 >>> 7d ec 5d dd eb 4c 7a 97 e1 a5 b5 04 60 59 6c 25 62 e5 50 1d 72 33 >>> xor-diff: ef b8 8d da 62 f5 1a c1 a8 c3 9b 4c 60 1f 2e d8 4b 51 68 >>> 9a 88 ad 06 ae 9b 17 b4 0e eb 02 0f 20 96 fc db 9e ad a3 c0 1e 87 3c >>> 47 97 8c 3b f6 62 c1 26 d9 17 1f 26 1b d8 74 10 82 42 98 3d 21 c5 >>> check-cbc, encrypt mismatch entry 6:0 >>> expected: 4a d8 4c 9f 32 dc 6a 95 63 49 a3 d3 cc 30 88 96 4b 5c aa >>> f0 30 51 f1 9f 1e a9 11 71 52 89 46 0a 4c da 68 d7 17 6b a4 ed e0 4d >>> 1b 69 c8 fb 54 f2 46 6a 14 7d 9a 4a d5 c4 bc 55 15 d4 1b d4 ac 7d >>> computed: b2 1f 9d bf bf fe c3 4f b0 98 fe f2 0b fc 8e 44 d1 ac 48 >>> e0 d9 1c b4 17 e1 32 19 05 80 f8 e9 da 60 fb 53 97 4a 6c 79 46 2a 7d >>> 91 7c 87 71 af a7 87 44 37 05 2b 02 c7 63 52 b3 53 a3 a3 cf 8f 75 >>> xor-diff: f8 c7 d1 20 8d 22 a9 da d3 d1 5d 21 c7 cc 06 d2 9a f0 e2 >>> 10 e9 4d 45 88 ff 9b 08 74 d2 71 af d0 2c 21 3b 40 5d 07 dd ab ca 30 >>> 8a 15 4f 
8a fb 55 c1 2e 23 78 b1 48 12 a7 ee e6 46 77 b8 1b 23 08 >>> >>> Patch with these tests for libgcrypt/master attached. >>> >>> -Jussi >>> >>>> Thank you & Kind Regards, >>>> Jan >>>> >>>> Jan Bilek >>>> CTO, EFTlab Pty Ltd >>>> email: jan.bilek at eftlab.co.uk >>>> mob: +61 (0) 498 103 179 >>>> >>>> This message contains confidential information and is intended only >>>> for the addressee(s). E-mail transmission cannot be guaranteed to >>>> be secure or error-free as information could be intercepted, >>>> corrupted, lost, destroyed, arrive late or incomplete, or contain >>>> viruses. EFTlab Ltd cannot accept liability for any errors or >>>> omissions in the contents of this message, which may arise as a >>>> result of e-mail transmission. Please note that EFTlab Ltd may >>>> monitor, analyse and archive email traffic, data and the content of >>>> email for the purposes of security, legal compliance and staff >>>> training. If you have received this email in error please notify us >>>> at support at eftlab.co.uk. EFTlab is a limited company registered in >>>> England & Wales with Reg No. 07528943. The Registered Office is >>>> 21-27 Lamb's Conduit Street, London, WC1N 3GS. >>>> >>>> >>>> _______________________________________________ >>>> Gcrypt-devel mailing list >>>> Gcrypt-devel at gnupg.org >>>> http://lists.gnupg.org/mailman/listinfo/gcrypt-devel >>>> >> >