[PATCH 2/2] poly1305: add AMD64/AVX2 optimized implementation

Jussi Kivilinna <jussi.kivilinna@iki.fi>
Fri May 16 20:08:18 CEST 2014


* cipher/Makefile.am: Add 'poly1305-avx2-amd64.S'.
* cipher/poly1305-avx2-amd64.S: New.
* cipher/poly1305-internal.h (POLY1305_USE_AVX2)
(POLY1305_AVX2_BLOCKSIZE, POLY1305_AVX2_STATESIZE)
(POLY1305_AVX2_ALIGNMENT): New.
(POLY1305_LARGEST_BLOCKSIZE, POLY1305_LARGEST_STATESIZE)
(POLY1305_STATE_ALIGNMENT): Use AVX2 versions when needed.
* cipher/poly1305.c [POLY1305_USE_AVX2]
(_gcry_poly1305_amd64_avx2_init_ext)
(_gcry_poly1305_amd64_avx2_finish_ext)
(_gcry_poly1305_amd64_avx2_blocks, poly1305_amd64_avx2_ops): New.
(_gcry_poly1305_init) [POLY1305_USE_AVX2]: Use the AVX2 implementation
if AVX2 is supported by the CPU.
* configure.ac [host=x86_64]: Add 'poly1305-avx2-amd64.lo'.
--

Add Andrew Moon's public domain AVX2 implementation of Poly1305. The
original source is available at: https://github.com/floodyberry/poly1305-opt
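
Not part of the patch, but for context: callers reach this code through the
generic interfaces (e.g. gcry_mac), and _gcry_poly1305_init selects the AVX2
ops table automatically when the CPU reports AVX2, so no caller changes are
needed. A minimal caller-side sketch, assuming the GCRY_MAC_POLY1305
algorithm id from the Poly1305 MAC support this series builds on:

  #include <stdio.h>
  #include <string.h>
  #include <gcrypt.h>

  int
  main (void)
  {
    gcry_mac_hd_t hd;
    unsigned char key[32] = { 0 };     /* one-time Poly1305 key */
    unsigned char tag[16];
    size_t taglen = sizeof (tag);
    const char *msg = "message to authenticate";

    if (!gcry_check_version (NULL))
      return 1;
    if (gcry_mac_open (&hd, GCRY_MAC_POLY1305, 0, NULL))
      return 1;
    gcry_mac_setkey (hd, key, sizeof (key));
    gcry_mac_write (hd, msg, strlen (msg));
    gcry_mac_read (hd, tag, &taglen);  /* taglen is 16 on return */
    gcry_mac_close (hd);
    printf ("tag starts with %02x%02x\n", tag[0], tag[1]);
    return 0;
  }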

Benchmarks on Intel Core i5-4570 (Haswell):

Old:
                    |  nanosecs/byte   mebibytes/sec   cycles/byte
 POLY1305           |     0.448 ns/B    2129.5 MiB/s      1.43 c/B

New:
                    |  nanosecs/byte   mebibytes/sec   cycles/byte
 POLY1305           |     0.205 ns/B    4647.1 MiB/s     0.739 c/B

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
---
 cipher/Makefile.am           |    2 
 cipher/poly1305-avx2-amd64.S |  954 ++++++++++++++++++++++++++++++++++++++++++
 cipher/poly1305-internal.h   |   23 +
 cipher/poly1305.c            |   26 +
 configure.ac                 |    1 
 5 files changed, 1002 insertions(+), 4 deletions(-)
 create mode 100644 cipher/poly1305-avx2-amd64.S
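
A reading aid for the new poly1305-avx2-amd64.S below (not part of the
patch): the repeated 'andl $67108863' masks in the init routine are
(1 << 26) - 1, i.e. the key r and its powers r^2..r^4 are each stored as
five 26-bit limbs so that vpmuludq can work on four message blocks in
parallel. Roughly, the split looks like the C sketch below; the actual
assembly goes through a radix-2^44 intermediate first, the function name
is made up, and the little-endian loads are an assumption of the sketch:

  #include <stdint.h>
  #include <string.h>

  /* Illustration only: split the clamped Poly1305 key 'r' into five
     26-bit limbs, the representation the AVX2 code keeps per power
     of r.  Assumes a little-endian host for the memcpy loads. */
  static void
  poly1305_r_to_limbs26 (const uint8_t key[16], uint32_t r[5])
  {
    uint64_t t0, t1;

    memcpy (&t0, key + 0, 8);
    memcpy (&t1, key + 8, 8);

    /* Clamp r as required by Poly1305. */
    t0 &= 0x0ffffffc0fffffffULL;
    t1 &= 0x0ffffffc0ffffffcULL;

    r[0] = t0 & 0x3ffffff;                         /* bits   0..25  */
    r[1] = (t0 >> 26) & 0x3ffffff;                 /* bits  26..51  */
    r[2] = ((t0 >> 52) | (t1 << 12)) & 0x3ffffff;  /* bits  52..77  */
    r[3] = (t1 >> 14) & 0x3ffffff;                 /* bits  78..103 */
    r[4] = (uint32_t) (t1 >> 40);                  /* bits 104..129 */
  }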

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index a32ae89..19b0097 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -72,7 +72,7 @@ gost28147.c gost.h \
 gostr3411-94.c \
 md4.c \
 md5.c \
-poly1305-sse2-amd64.S \
+poly1305-sse2-amd64.S poly1305-avx2-amd64.S \
 rijndael.c rijndael-tables.h rijndael-amd64.S rijndael-arm.S \
 rmd160.c \
 rsa.c \
diff --git a/cipher/poly1305-avx2-amd64.S b/cipher/poly1305-avx2-amd64.S
new file mode 100644
index 0000000..0ba7e76
--- /dev/null
+++ b/cipher/poly1305-avx2-amd64.S
@@ -0,0 +1,954 @@
+/* poly1305-avx2-amd64.S  -  AMD64/AVX2 implementation of Poly1305
+ *
+ * Copyright (C) 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * Based on public domain implementation by Andrew Moon at
+ *  https://github.com/floodyberry/poly1305-opt
+ */
+
+#include <config.h>
+
+#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \
+    defined(ENABLE_AVX2_SUPPORT)
+
+.text
+
+
+.align 8
+.globl _gcry_poly1305_amd64_avx2_init_ext
+.type  _gcry_poly1305_amd64_avx2_init_ext,@function;
+_gcry_poly1305_amd64_avx2_init_ext:
+.Lpoly1305_init_ext_avx2_local:
+	xor %edx, %edx
+	vzeroupper
+	pushq %r12
+	pushq %r13
+	pushq %r14
+	pushq %r15
+	pushq %rbx
+	movq %rdx, %rcx
+	vpxor %ymm0, %ymm0, %ymm0
+	movq $-1, %r8
+	testq %rcx, %rcx
+	vmovdqu %ymm0, (%rdi)
+	vmovdqu %ymm0, 32(%rdi)
+	vmovdqu %ymm0, 64(%rdi)
+	vmovdqu %ymm0, 96(%rdi)
+	vmovdqu %ymm0, 128(%rdi)
+	movq 8(%rsi), %r9
+	cmove %r8, %rcx
+	movq $0xffc0fffffff, %r8
+	movq %r9, %r13
+	movq (%rsi), %r10
+	andq %r10, %r8
+	shrq $44, %r10
+	movq %r8, %r14
+	shlq $20, %r13
+	orq %r13, %r10
+	movq $0xfffffc0ffff, %r13
+	shrq $24, %r9
+	andq %r13, %r10
+	movq $0xffffffc0f, %r13
+	andq %r13, %r9
+	movl %r8d, %r13d
+	andl $67108863, %r13d
+	movl %r13d, 164(%rdi)
+	movq %r10, %r13
+	shrq $26, %r14
+	shlq $18, %r13
+	orq %r13, %r14
+	movq %r10, %r13
+	shrq $8, %r13
+	andl $67108863, %r14d
+	andl $67108863, %r13d
+	movl %r14d, 172(%rdi)
+	movq %r10, %r14
+	movl %r13d, 180(%rdi)
+	movq %r9, %r13
+	shrq $34, %r14
+	shlq $10, %r13
+	orq %r13, %r14
+	movq %r9, %r13
+	shrq $16, %r13
+	andl $67108863, %r14d
+	movl %r14d, 188(%rdi)
+	movl %r13d, 196(%rdi)
+	cmpq $16, %rcx
+	jbe .Lpoly1305_init_ext_avx2_continue
+	lea (%r9,%r9,4), %r11
+	shlq $2, %r11
+	lea (%r10,%r10), %rax
+	mulq %r11
+	movq %rax, %r13
+	movq %r8, %rax
+	movq %rdx, %r14
+	mulq %r8
+	addq %rax, %r13
+	lea (%r8,%r8), %rax
+	movq %r13, %r12
+	adcq %rdx, %r14
+	mulq %r10
+	shlq $20, %r14
+	movq %rax, %r15
+	shrq $44, %r12
+	movq %r11, %rax
+	orq %r12, %r14
+	movq %rdx, %r12
+	mulq %r9
+	addq %rax, %r15
+	movq %r8, %rax
+	adcq %rdx, %r12
+	addq %r15, %r14
+	lea (%r9,%r9), %r15
+	movq %r14, %rbx
+	adcq $0, %r12
+	mulq %r15
+	shlq $20, %r12
+	movq %rdx, %r11
+	shrq $44, %rbx
+	orq %rbx, %r12
+	movq %rax, %rbx
+	movq %r10, %rax
+	mulq %r10
+	addq %rax, %rbx
+	adcq %rdx, %r11
+	addq %rbx, %r12
+	movq $0xfffffffffff, %rbx
+	movq %r12, %r15
+	adcq $0, %r11
+	andq %rbx, %r13
+	shlq $22, %r11
+	andq %rbx, %r14
+	shrq $42, %r15
+	orq %r15, %r11
+	lea (%r11,%r11,4), %r11
+	addq %r11, %r13
+	movq %rbx, %r11
+	andq %r13, %r11
+	shrq $44, %r13
+	movq %r11, %r15
+	addq %r13, %r14
+	movq $0x3ffffffffff, %r13
+	andq %r14, %rbx
+	andq %r13, %r12
+	movq %rbx, %r13
+	shrq $26, %r15
+	shlq $18, %r13
+	orq %r13, %r15
+	movq %rbx, %r13
+	shrq $44, %r14
+	shrq $8, %r13
+	addq %r14, %r12
+	movl %r11d, %r14d
+	andl $67108863, %r15d
+	andl $67108863, %r14d
+	andl $67108863, %r13d
+	movl %r14d, 204(%rdi)
+	movq %rbx, %r14
+	movl %r13d, 220(%rdi)
+	movq %r12, %r13
+	shrq $34, %r14
+	shlq $10, %r13
+	orq %r13, %r14
+	movq %r12, %r13
+	shrq $16, %r13
+	andl $67108863, %r14d
+	movl %r15d, 212(%rdi)
+	movl %r14d, 228(%rdi)
+	movl %r13d, 236(%rdi)
+	cmpq $32, %rcx
+	jbe .Lpoly1305_init_ext_avx2_continue
+	movq %r9, %rax
+	lea (%rbx,%rbx,4), %r14
+	shlq $2, %r14
+	mulq %r14
+	movq %rdi, -32(%rsp)
+	lea (%r12,%r12,4), %rdi
+	shlq $2, %rdi
+	movq %rax, %r14
+	movq %r10, %rax
+	movq %rdx, %r15
+	mulq %rdi
+	movq %rax, %r13
+	movq %r11, %rax
+	movq %rcx, -16(%rsp)
+	movq %rdx, %rcx
+	mulq %r8
+	addq %rax, %r13
+	movq %rdi, %rax
+	movq %rsi, -24(%rsp)
+	adcq %rdx, %rcx
+	addq %r13, %r14
+	adcq %rcx, %r15
+	movq %r14, %rcx
+	mulq %r9
+	shlq $20, %r15
+	movq %rax, %r13
+	shrq $44, %rcx
+	movq %r11, %rax
+	orq %rcx, %r15
+	movq %rdx, %rcx
+	mulq %r10
+	movq %rax, %rsi
+	movq %rbx, %rax
+	movq %rdx, %rdi
+	mulq %r8
+	addq %rax, %rsi
+	movq %r11, %rax
+	adcq %rdx, %rdi
+	addq %rsi, %r13
+	adcq %rdi, %rcx
+	addq %r13, %r15
+	movq %r15, %rdi
+	adcq $0, %rcx
+	mulq %r9
+	shlq $20, %rcx
+	movq %rdx, %rsi
+	shrq $44, %rdi
+	orq %rdi, %rcx
+	movq %rax, %rdi
+	movq %rbx, %rax
+	mulq %r10
+	movq %rax, %r9
+	movq %r8, %rax
+	movq %rdx, %r10
+	movq $0xfffffffffff, %r8
+	mulq %r12
+	addq %rax, %r9
+	adcq %rdx, %r10
+	andq %r8, %r14
+	addq %r9, %rdi
+	adcq %r10, %rsi
+	andq %r8, %r15
+	addq %rdi, %rcx
+	movq $0x3ffffffffff, %rdi
+	movq %rcx, %r10
+	adcq $0, %rsi
+	andq %rdi, %rcx
+	shlq $22, %rsi
+	shrq $42, %r10
+	orq %r10, %rsi
+	movq -32(%rsp), %rdi
+	lea (%rsi,%rsi,4), %r9
+	movq %r8, %rsi
+	addq %r9, %r14
+	andq %r14, %rsi
+	shrq $44, %r14
+	addq %r14, %r15
+	andq %r15, %r8
+	shrq $44, %r15
+	movq %r8, %r14
+	addq %r15, %rcx
+	movl %esi, %r15d
+	movq %rcx, %r10
+	movq %r8, %r9
+	shrq $26, %rsi
+	andl $67108863, %r15d
+	shlq $18, %r14
+	shrq $34, %r8
+	orq %r14, %rsi
+	shlq $10, %r10
+	shrq $8, %r9
+	orq %r10, %r8
+	shrq $16, %rcx
+	andl $67108863, %esi
+	movl %esi, 252(%rdi)
+	andl $67108863, %r9d
+	movl %ecx, 276(%rdi)
+	andl $67108863, %r8d
+	movl %r15d, 244(%rdi)
+	movl %r9d, 260(%rdi)
+	movl %r8d, 268(%rdi)
+	movq -16(%rsp), %rcx
+	movq -24(%rsp), %rsi
+.Lpoly1305_init_ext_avx2_continue:
+	movl 16(%rsi), %r8d
+	movl %r8d, 284(%rdi)
+	movl 20(%rsi), %r9d
+	movl %r9d, 292(%rdi)
+	movl 24(%rsi), %r10d
+	movl %r10d, 300(%rdi)
+	movl 28(%rsi), %esi
+	movl %esi, 308(%rdi)
+	cmpq $48, %rcx
+	jbe .Lpoly1305_init_ext_avx2_done
+	lea (%r12,%r12,4), %r9
+	shlq $2, %r9
+	lea (%rbx,%rbx), %rax
+	mulq %r9
+	movq %rax, %rsi
+	movq %r11, %rax
+	movq %rdx, %r8
+	mulq %r11
+	addq %rax, %rsi
+	lea (%r11,%r11), %rax
+	movq %rsi, %r10
+	adcq %rdx, %r8
+	mulq %rbx
+	movq %rax, %r13
+	movq %r12, %rax
+	movq %rdx, %rcx
+	addq %r12, %r12
+	mulq %r9
+	addq %rax, %r13
+	movq %r11, %rax
+	movq $0xfffffffffff, %r9
+	adcq %rdx, %rcx
+	andq %r9, %rsi
+	mulq %r12
+	shlq $20, %r8
+	movq %rax, %r11
+	shrq $44, %r10
+	movq %rbx, %rax
+	orq %r10, %r8
+	movq %rdx, %r12
+	mulq %rbx
+	addq %r13, %r8
+	movq %r8, %r14
+	adcq $0, %rcx
+	andq %r9, %r8
+	addq %rax, %r11
+	adcq %rdx, %r12
+	shlq $20, %rcx
+	shrq $44, %r14
+	orq %r14, %rcx
+	addq %r11, %rcx
+	movq %rcx, %rbx
+	adcq $0, %r12
+	shlq $22, %r12
+	shrq $42, %rbx
+	orq %rbx, %r12
+	movq %r9, %rbx
+	lea (%r12,%r12,4), %r15
+	addq %r15, %rsi
+	andq %rsi, %rbx
+	shrq $44, %rsi
+	movl %ebx, %r11d
+	addq %rsi, %r8
+	movq $0x3ffffffffff, %rsi
+	andq %r8, %r9
+	andq %rsi, %rcx
+	shrq $44, %r8
+	movq %r9, %rax
+	addq %r8, %rcx
+	movq %r9, %r8
+	movq %rcx, %r10
+	andl $67108863, %r11d
+	shrq $26, %rbx
+	shlq $18, %r8
+	shrq $34, %r9
+	orq %r8, %rbx
+	shlq $10, %r10
+	shrq $8, %rax
+	orq %r10, %r9
+	shrq $16, %rcx
+	andl $67108863, %ebx
+	andl $67108863, %eax
+	andl $67108863, %r9d
+	movl %r11d, 184(%rdi)
+	movl %r11d, 176(%rdi)
+	movl %r11d, 168(%rdi)
+	movl %r11d, 160(%rdi)
+	movl %ebx, 216(%rdi)
+	movl %ebx, 208(%rdi)
+	movl %ebx, 200(%rdi)
+	movl %ebx, 192(%rdi)
+	movl %eax, 248(%rdi)
+	movl %eax, 240(%rdi)
+	movl %eax, 232(%rdi)
+	movl %eax, 224(%rdi)
+	movl %r9d, 280(%rdi)
+	movl %r9d, 272(%rdi)
+	movl %r9d, 264(%rdi)
+	movl %r9d, 256(%rdi)
+	movl %ecx, 312(%rdi)
+	movl %ecx, 304(%rdi)
+	movl %ecx, 296(%rdi)
+	movl %ecx, 288(%rdi)
+.Lpoly1305_init_ext_avx2_done:
+	movq $0, 320(%rdi)
+	vzeroall
+	popq %rbx
+	popq %r15
+	popq %r14
+	popq %r13
+	popq %r12
+	ret
+.size _gcry_poly1305_amd64_avx2_init_ext,.-_gcry_poly1305_amd64_avx2_init_ext;
+
+
+.align 8
+.globl _gcry_poly1305_amd64_avx2_blocks
+.type  _gcry_poly1305_amd64_avx2_blocks,@function;
+_gcry_poly1305_amd64_avx2_blocks:
+.Lpoly1305_blocks_avx2_local:
+	vzeroupper
+	pushq %rbp
+	movq %rsp, %rbp
+	pushq %rbx
+	andq $-64, %rsp
+	subq $200, %rsp
+	movl $((1<<26)-1), %r8d
+	movl $(5), %r9d
+	movl $((1<<24)), %r10d
+	vmovd %r8d, %xmm0
+	vmovd %r9d, %xmm8
+	vmovd %r10d, %xmm7
+	vpbroadcastq %xmm0, %ymm0
+	vpbroadcastq %xmm8, %ymm8
+	vpbroadcastq %xmm7, %ymm7
+	vmovdqa %ymm7, 168(%rsp)
+	movq 320(%rdi), %rax
+	testb $60, %al
+	je .Lpoly1305_blocks_avx2_9
+	vmovdqa 168(%rsp), %ymm7
+	vpsrldq $8, %ymm7, %ymm1
+	vmovdqa %ymm1, 168(%rsp)
+	testb $4, %al
+	je .Lpoly1305_blocks_avx2_10
+	vpermq $192, %ymm1, %ymm7
+	vmovdqa %ymm7, 168(%rsp)
+.Lpoly1305_blocks_avx2_10:
+	testb $8, %al
+	je .Lpoly1305_blocks_avx2_11
+	vpermq $240, 168(%rsp), %ymm7
+	vmovdqa %ymm7, 168(%rsp)
+.Lpoly1305_blocks_avx2_11:
+	testb $16, %al
+	je .Lpoly1305_blocks_avx2_12
+	vpermq $252, 168(%rsp), %ymm6
+	vmovdqa %ymm6, 168(%rsp)
+.Lpoly1305_blocks_avx2_12:
+	testb $32, %al
+	je .Lpoly1305_blocks_avx2_9
+	vpxor %xmm6, %xmm6, %xmm6
+	vmovdqa %ymm6, 168(%rsp)
+.Lpoly1305_blocks_avx2_9:
+	testb $1, %al
+	jne .Lpoly1305_blocks_avx2_13
+	vmovdqu (%rsi), %ymm3
+	vmovdqu 32(%rsi), %ymm1
+	vpunpcklqdq %ymm1, %ymm3, %ymm2
+	vpunpckhqdq %ymm1, %ymm3, %ymm1
+	vpermq $216, %ymm2, %ymm2
+	vpermq $216, %ymm1, %ymm1
+	vpand %ymm2, %ymm0, %ymm5
+	vpsrlq $26, %ymm2, %ymm4
+	vpand %ymm4, %ymm0, %ymm4
+	vpsllq $12, %ymm1, %ymm3
+	vpsrlq $52, %ymm2, %ymm2
+	vpor %ymm3, %ymm2, %ymm2
+	vpand %ymm2, %ymm0, %ymm3
+	vpsrlq $26, %ymm2, %ymm2
+	vpand %ymm2, %ymm0, %ymm2
+	vpsrlq $40, %ymm1, %ymm1
+	vpor 168(%rsp), %ymm1, %ymm1
+	addq $64, %rsi
+	subq $64, %rdx
+	orq $1, 320(%rdi)
+	jmp .Lpoly1305_blocks_avx2_14
+.Lpoly1305_blocks_avx2_13:
+	vmovdqa (%rdi), %ymm5
+	vmovdqa 32(%rdi), %ymm4
+	vmovdqa 64(%rdi), %ymm3
+	vmovdqa 96(%rdi), %ymm2
+	vmovdqa 128(%rdi), %ymm1
+.Lpoly1305_blocks_avx2_14:
+	cmpq $63, %rdx
+	jbe .Lpoly1305_blocks_avx2_15
+	vmovdqa 160(%rdi), %ymm6
+	vmovdqa %ymm8, 136(%rsp)
+	vmovdqa 192(%rdi), %ymm7
+	vpmuludq %ymm8, %ymm7, %ymm11
+	vmovdqa %ymm11, 104(%rsp)
+	vmovdqa 224(%rdi), %ymm11
+	vmovdqa %ymm11, 72(%rsp)
+	vpmuludq %ymm11, %ymm8, %ymm11
+	vmovdqa %ymm11, 40(%rsp)
+	vmovdqa 256(%rdi), %ymm11
+	vmovdqa %ymm11, 8(%rsp)
+	vpmuludq %ymm11, %ymm8, %ymm11
+	vmovdqa %ymm11, -24(%rsp)
+	vmovdqa 288(%rdi), %ymm13
+	vmovdqa %ymm13, -56(%rsp)
+	vpmuludq %ymm13, %ymm8, %ymm13
+	vmovdqa %ymm13, -88(%rsp)
+.Lpoly1305_blocks_avx2_16:
+	vpmuludq 104(%rsp), %ymm1, %ymm14
+	vmovdqa 40(%rsp), %ymm13
+	vpmuludq %ymm13, %ymm2, %ymm8
+	vpmuludq %ymm13, %ymm1, %ymm13
+	vmovdqa -24(%rsp), %ymm9
+	vpmuludq %ymm9, %ymm2, %ymm10
+	vpmuludq %ymm9, %ymm1, %ymm11
+	vpaddq %ymm8, %ymm14, %ymm14
+	vpmuludq %ymm9, %ymm3, %ymm8
+	vmovdqa -88(%rsp), %ymm12
+	vpmuludq %ymm12, %ymm1, %ymm9
+	vpaddq %ymm10, %ymm13, %ymm13
+	vpmuludq %ymm12, %ymm4, %ymm15
+	vmovdqa %ymm12, %ymm10
+	vpmuludq %ymm12, %ymm3, %ymm12
+	vpaddq %ymm8, %ymm14, %ymm14
+	vpmuludq %ymm10, %ymm2, %ymm10
+	vpmuludq %ymm6, %ymm2, %ymm8
+	vpaddq %ymm15, %ymm14, %ymm14
+	vpmuludq %ymm6, %ymm1, %ymm1
+	vpaddq %ymm12, %ymm13, %ymm13
+	vpmuludq %ymm6, %ymm5, %ymm15
+	vpaddq %ymm10, %ymm11, %ymm11
+	vpmuludq %ymm6, %ymm4, %ymm12
+	vpaddq %ymm8, %ymm9, %ymm9
+	vpmuludq %ymm6, %ymm3, %ymm10
+	vpmuludq %ymm7, %ymm3, %ymm8
+	vpaddq %ymm15, %ymm14, %ymm14
+	vpmuludq %ymm7, %ymm2, %ymm2
+	vpaddq %ymm12, %ymm13, %ymm12
+	vpmuludq %ymm7, %ymm5, %ymm15
+	vpaddq %ymm10, %ymm11, %ymm10
+	vpmuludq %ymm7, %ymm4, %ymm13
+	vpaddq %ymm8, %ymm9, %ymm8
+	vmovdqa 72(%rsp), %ymm9
+	vpmuludq %ymm9, %ymm4, %ymm11
+	vpaddq %ymm2, %ymm1, %ymm1
+	vpmuludq %ymm9, %ymm3, %ymm3
+	vpaddq %ymm15, %ymm12, %ymm12
+	vpmuludq %ymm9, %ymm5, %ymm15
+	vpaddq %ymm13, %ymm10, %ymm10
+	vmovdqa 8(%rsp), %ymm2
+	vpmuludq %ymm2, %ymm5, %ymm9
+	vpaddq %ymm11, %ymm8, %ymm8
+	vpmuludq %ymm2, %ymm4, %ymm4
+	vpaddq %ymm3, %ymm1, %ymm1
+	vpmuludq -56(%rsp), %ymm5, %ymm5
+	vpaddq %ymm15, %ymm10, %ymm10
+	vpaddq %ymm9, %ymm8, %ymm8
+	vpaddq %ymm4, %ymm1, %ymm1
+	vpaddq %ymm5, %ymm1, %ymm5
+	vmovdqu (%rsi), %ymm3
+	vmovdqu 32(%rsi), %ymm2
+	vperm2i128 $32, %ymm2, %ymm3, %ymm1
+	vperm2i128 $49, %ymm2, %ymm3, %ymm2
+	vpunpckldq %ymm2, %ymm1, %ymm15
+	vpunpckhdq %ymm2, %ymm1, %ymm2
+	vpxor %xmm4, %xmm4, %xmm4
+	vpunpckldq %ymm4, %ymm15, %ymm1
+	vpunpckhdq %ymm4, %ymm15, %ymm15
+	vpunpckldq %ymm4, %ymm2, %ymm3
+	vpunpckhdq %ymm4, %ymm2, %ymm2
+	vpsllq $6, %ymm15, %ymm15
+	vpsllq $12, %ymm3, %ymm3
+	vpsllq $18, %ymm2, %ymm2
+	vpaddq %ymm1, %ymm14, %ymm14
+	vpaddq %ymm15, %ymm12, %ymm12
+	vpaddq %ymm3, %ymm10, %ymm10
+	vpaddq %ymm2, %ymm8, %ymm8
+	vpaddq 168(%rsp), %ymm5, %ymm5
+	addq $64, %rsi
+	vpsrlq $26, %ymm14, %ymm4
+	vpsrlq $26, %ymm8, %ymm2
+	vpand %ymm0, %ymm14, %ymm14
+	vpand %ymm0, %ymm8, %ymm8
+	vpaddq %ymm4, %ymm12, %ymm12
+	vpaddq %ymm2, %ymm5, %ymm5
+	vpsrlq $26, %ymm12, %ymm3
+	vpsrlq $26, %ymm5, %ymm9
+	vpand %ymm0, %ymm12, %ymm12
+	vpand %ymm0, %ymm5, %ymm11
+	vpaddq %ymm3, %ymm10, %ymm3
+	vpmuludq 136(%rsp), %ymm9, %ymm9
+	vpaddq %ymm9, %ymm14, %ymm14
+	vpsrlq $26, %ymm3, %ymm2
+	vpsrlq $26, %ymm14, %ymm4
+	vpand %ymm0, %ymm3, %ymm3
+	vpand %ymm0, %ymm14, %ymm5
+	vpaddq %ymm2, %ymm8, %ymm2
+	vpaddq %ymm4, %ymm12, %ymm4
+	vpsrlq $26, %ymm2, %ymm1
+	vpand %ymm0, %ymm2, %ymm2
+	vpaddq %ymm1, %ymm11, %ymm1
+	subq $64, %rdx
+	cmpq $63, %rdx
+	ja .Lpoly1305_blocks_avx2_16
+.Lpoly1305_blocks_avx2_15:
+	testb $64, 320(%rdi)
+	jne .Lpoly1305_blocks_avx2_17
+	vmovdqa %ymm5, (%rdi)
+	vmovdqa %ymm4, 32(%rdi)
+	vmovdqa %ymm3, 64(%rdi)
+	vmovdqa %ymm2, 96(%rdi)
+	vmovdqa %ymm1, 128(%rdi)
+	jmp .Lpoly1305_blocks_avx2_8
+.Lpoly1305_blocks_avx2_17:
+	vpermq $245, %ymm5, %ymm0
+	vpaddq %ymm0, %ymm5, %ymm5
+	vpermq $245, %ymm4, %ymm0
+	vpaddq %ymm0, %ymm4, %ymm4
+	vpermq $245, %ymm3, %ymm0
+	vpaddq %ymm0, %ymm3, %ymm3
+	vpermq $245, %ymm2, %ymm0
+	vpaddq %ymm0, %ymm2, %ymm2
+	vpermq $245, %ymm1, %ymm0
+	vpaddq %ymm0, %ymm1, %ymm1
+	vpermq $170, %ymm5, %ymm0
+	vpaddq %ymm0, %ymm5, %ymm5
+	vpermq $170, %ymm4, %ymm0
+	vpaddq %ymm0, %ymm4, %ymm4
+	vpermq $170, %ymm3, %ymm0
+	vpaddq %ymm0, %ymm3, %ymm3
+	vpermq $170, %ymm2, %ymm0
+	vpaddq %ymm0, %ymm2, %ymm2
+	vpermq $170, %ymm1, %ymm0
+	vpaddq %ymm0, %ymm1, %ymm1
+	vmovd %xmm5, %eax
+	vmovd %xmm4, %edx
+	movl %eax, %ecx
+	shrl $26, %ecx
+	addl %edx, %ecx
+	movl %ecx, %edx
+	andl $67108863, %edx
+	vmovd %xmm3, %esi
+	shrl $26, %ecx
+	movl %ecx, %r11d
+	addl %esi, %r11d
+	vmovd %xmm2, %ecx
+	movl %r11d, %r10d
+	shrl $26, %r10d
+	addl %ecx, %r10d
+	movl %r10d, %r9d
+	andl $67108863, %r9d
+	vmovd %xmm1, %r8d
+	movl %edx, %esi
+	salq $26, %rsi
+	andl $67108863, %eax
+	orq %rax, %rsi
+	movabsq $17592186044415, %rax
+	andq %rax, %rsi
+	andl $67108863, %r11d
+	salq $8, %r11
+	shrl $18, %edx
+	movl %edx, %edx
+	orq %r11, %rdx
+	movq %r9, %rcx
+	salq $34, %rcx
+	orq %rcx, %rdx
+	andq %rax, %rdx
+	shrl $26, %r10d
+	addl %r10d, %r8d
+	salq $16, %r8
+	shrl $10, %r9d
+	movl %r9d, %r9d
+	orq %r9, %r8
+	movabsq $4398046511103, %r10
+	movq %r8, %r9
+	andq %r10, %r9
+	shrq $42, %r8
+	leaq (%r8,%r8,4), %rcx
+	addq %rcx, %rsi
+	movq %rsi, %r8
+	andq %rax, %r8
+	movq %rsi, %rcx
+	shrq $44, %rcx
+	addq %rdx, %rcx
+	movq %rcx, %rsi
+	andq %rax, %rsi
+	shrq $44, %rcx
+	movq %rcx, %rdx
+	addq %r9, %rdx
+	andq %rdx, %r10
+	shrq $42, %rdx
+	leaq (%r8,%rdx,4), %rcx
+	leaq (%rcx,%rdx), %rdx
+	movq %rdx, %rbx
+	andq %rax, %rbx
+	shrq $44, %rdx
+	movq %rdx, %r11
+	addq %rsi, %r11
+	leaq 5(%rbx), %r9
+	movq %r9, %r8
+	shrq $44, %r8
+	addq %r11, %r8
+	movabsq $-4398046511104, %rsi
+	addq %r10, %rsi
+	movq %r8, %rdx
+	shrq $44, %rdx
+	addq %rdx, %rsi
+	movq %rsi, %rdx
+	shrq $63, %rdx
+	subq $1, %rdx
+	movq %rdx, %rcx
+	notq %rcx
+	andq %rcx, %rbx
+	andq %rcx, %r11
+	andq %r10, %rcx
+	andq %rax, %r9
+	andq %rdx, %r9
+	orq %r9, %rbx
+	movq %rbx, (%rdi)
+	andq %r8, %rax
+	andq %rdx, %rax
+	orq %rax, %r11
+	movq %r11, 8(%rdi)
+	andq %rsi, %rdx
+	orq %rcx, %rdx
+	movq %rdx, 16(%rdi)
+.Lpoly1305_blocks_avx2_8:
+	movq -8(%rbp), %rbx
+	vzeroall
+	movq %rbp, %rax
+	subq %rsp, %rax
+	leave
+	addq $8, %rax
+	ret
+.size _gcry_poly1305_amd64_avx2_blocks,.-_gcry_poly1305_amd64_avx2_blocks;
+
+
+.align 8
+.globl _gcry_poly1305_amd64_avx2_finish_ext
+.type  _gcry_poly1305_amd64_avx2_finish_ext,@function;
+_gcry_poly1305_amd64_avx2_finish_ext:
+.Lpoly1305_finish_ext_avx2_local:
+	vzeroupper
+	pushq %rbp
+	movq %rsp, %rbp
+	pushq %r13
+	pushq %r12
+	pushq %rbx
+	andq $-64, %rsp
+	subq $64, %rsp
+	movq %rdi, %rbx
+	movq %rdx, %r13
+	movq %rcx, %r12
+	testq %rdx, %rdx
+	je .Lpoly1305_finish_ext_avx2_22
+	vpxor %xmm0, %xmm0, %xmm0
+	vmovdqa %ymm0, (%rsp)
+	vmovdqa %ymm0, 32(%rsp)
+	movq %rsp, %rax
+	subq %rsp, %rsi
+	testb $32, %dl
+	je .Lpoly1305_finish_ext_avx2_23
+	vmovdqu (%rsp,%rsi), %ymm0
+	vmovdqa %ymm0, (%rsp)
+	leaq 32(%rsp), %rax
+.Lpoly1305_finish_ext_avx2_23:
+	testb $16, %r13b
+	je .Lpoly1305_finish_ext_avx2_24
+	vmovdqu (%rax,%rsi), %xmm0
+	vmovdqa %xmm0, (%rax)
+	addq $16, %rax
+.Lpoly1305_finish_ext_avx2_24:
+	testb $8, %r13b
+	je .Lpoly1305_finish_ext_avx2_25
+	movq (%rax,%rsi), %rdx
+	movq %rdx, (%rax)
+	addq $8, %rax
+.Lpoly1305_finish_ext_avx2_25:
+	testb $4, %r13b
+	je .Lpoly1305_finish_ext_avx2_26
+	movl (%rax,%rsi), %edx
+	movl %edx, (%rax)
+	addq $4, %rax
+.Lpoly1305_finish_ext_avx2_26:
+	testb $2, %r13b
+	je .Lpoly1305_finish_ext_avx2_27
+	movzwl (%rax,%rsi), %edx
+	movw %dx, (%rax)
+	addq $2, %rax
+.Lpoly1305_finish_ext_avx2_27:
+	testb $1, %r13b
+	je .Lpoly1305_finish_ext_avx2_28
+	movzbl (%rax,%rsi), %edx
+	movb %dl, (%rax)
+.Lpoly1305_finish_ext_avx2_28:
+	testb $15, %r13b
+	je .Lpoly1305_finish_ext_avx2_29
+	movb $1, (%rsp,%r13)
+.Lpoly1305_finish_ext_avx2_29:
+	cmpq $47, %r13
+	jbe .Lpoly1305_finish_ext_avx2_30
+	orq $4, 320(%rbx)
+	jmp .Lpoly1305_finish_ext_avx2_31
+.Lpoly1305_finish_ext_avx2_30:
+	cmpq $31, %r13
+	jbe .Lpoly1305_finish_ext_avx2_32
+	orq $8, 320(%rbx)
+	jmp .Lpoly1305_finish_ext_avx2_31
+.Lpoly1305_finish_ext_avx2_32:
+	cmpq $15, %r13
+	jbe .Lpoly1305_finish_ext_avx2_33
+	orq $16, 320(%rbx)
+	jmp .Lpoly1305_finish_ext_avx2_31
+.Lpoly1305_finish_ext_avx2_33:
+	orq $32, 320(%rbx)
+.Lpoly1305_finish_ext_avx2_31:
+	testb $1, 320(%rbx)
+	je .Lpoly1305_finish_ext_avx2_34
+	cmpq $32, %r13
+	ja .Lpoly1305_finish_ext_avx2_34
+	cmpq $17, %r13
+	sbbq %rsi, %rsi
+	notq %rsi
+	addq $2, %rsi
+	cmpq $17, %r13
+	sbbq %rax, %rax
+	movq %rbx, %rdx
+	addq $23, %rax
+	leaq (%rbx,%rax,8), %rax
+	movl $0, %ecx
+.Lpoly1305_finish_ext_avx2_37:
+	movl 244(%rdx), %edi
+	movl %edi, (%rax)
+	movl 252(%rdx), %edi
+	movl %edi, 32(%rax)
+	movl 260(%rdx), %edi
+	movl %edi, 64(%rax)
+	movl 268(%rdx), %edi
+	movl %edi, 96(%rax)
+	movl 276(%rdx), %edi
+	movl %edi, 128(%rax)
+	addq $1, %rcx
+	subq $40, %rdx
+	addq $8, %rax
+	cmpq %rcx, %rsi
+	ja .Lpoly1305_finish_ext_avx2_37
+.Lpoly1305_finish_ext_avx2_34:
+	movl $64, %edx
+	movq %rsp, %rsi
+	movq %rbx, %rdi
+	call .Lpoly1305_blocks_avx2_local
+.Lpoly1305_finish_ext_avx2_22:
+	movq 320(%rbx), %r8
+	testb $1, %r8b
+	je .Lpoly1305_finish_ext_avx2_38
+	leaq -1(%r13), %rax
+	cmpq $47, %rax
+	ja .Lpoly1305_finish_ext_avx2_46
+	cmpq $32, %r13
+	ja .Lpoly1305_finish_ext_avx2_47
+	cmpq $17, %r13
+	sbbq %r9, %r9
+	addq $2, %r9
+	movl $0, %edi
+	cmpq $17, %r13
+	sbbq %rax, %rax
+	notq %rax
+	andl $5, %eax
+	jmp .Lpoly1305_finish_ext_avx2_39
+.Lpoly1305_finish_ext_avx2_41:
+	movl (%rdx), %esi
+	movl %esi, (%rax)
+	movl 8(%rdx), %esi
+	movl %esi, 32(%rax)
+	movl 16(%rdx), %esi
+	movl %esi, 64(%rax)
+	movl 24(%rdx), %esi
+	movl %esi, 96(%rax)
+	movl 32(%rdx), %esi
+	movl %esi, 128(%rax)
+	addq $1, %rcx
+	subq $40, %rdx
+	addq $8, %rax
+	movq %rcx, %rsi
+	subq %rdi, %rsi
+	cmpq %rsi, %r9
+	ja .Lpoly1305_finish_ext_avx2_41
+	cmpq $3, %rcx
+	ja .Lpoly1305_finish_ext_avx2_42
+	leaq 160(%rbx,%rcx,8), %rax
+.Lpoly1305_finish_ext_avx2_43:
+	movl $1, (%rax)
+	movl $0, 32(%rax)
+	movl $0, 64(%rax)
+	movl $0, 96(%rax)
+	movl $0, 128(%rax)
+	addq $1, %rcx
+	addq $8, %rax
+	cmpq $4, %rcx
+	jne .Lpoly1305_finish_ext_avx2_43
+.Lpoly1305_finish_ext_avx2_42:
+	orq $96, %r8
+	movq %r8, 320(%rbx)
+	vpxor %ymm0, %ymm0, %ymm0
+	vmovdqa %ymm0, (%rsp)
+	vmovdqa %ymm0, 32(%rsp)
+	movl $64, %edx
+	movq %rsp, %rsi
+	movq %rbx, %rdi
+	call .Lpoly1305_blocks_avx2_local
+.Lpoly1305_finish_ext_avx2_38:
+	movq 8(%rbx), %rax
+	movq %rax, %rdx
+	salq $44, %rdx
+	orq (%rbx), %rdx
+	shrq $20, %rax
+	movl $24, %edi
+	shlx %rdi, 16(%rbx), %rcx
+	orq %rcx, %rax
+	movl 292(%rbx), %ecx
+	salq $32, %rcx
+	movl 284(%rbx), %esi
+	orq %rsi, %rcx
+	movl 308(%rbx), %esi
+	salq $32, %rsi
+	movl 300(%rbx), %edi
+	orq %rdi, %rsi
+	addq %rcx, %rdx
+	adcq %rsi, %rax
+	movq %rdx, (%r12)
+	movq %rax, 8(%r12)
+	vpxor %xmm0, %xmm0, %xmm0
+	vmovdqu %ymm0, (%rbx)
+	vmovdqu %ymm0, 32(%rbx)
+	vmovdqu %ymm0, 64(%rbx)
+	vmovdqu %ymm0, 96(%rbx)
+	vmovdqu %ymm0, 128(%rbx)
+	vmovdqu %ymm0, 160(%rbx)
+	vmovdqu %ymm0, 192(%rbx)
+	vmovdqu %ymm0, 224(%rbx)
+	jmp .Lpoly1305_finish_ext_avx2_49
+.Lpoly1305_finish_ext_avx2_46:
+	movl $3, %r9d
+	movl $1, %edi
+	movl $10, %eax
+	jmp .Lpoly1305_finish_ext_avx2_39
+.Lpoly1305_finish_ext_avx2_47:
+	movl $3, %r9d
+	movl $0, %edi
+	movl $10, %eax
+.Lpoly1305_finish_ext_avx2_39:
+	leaq 164(%rbx,%rax,8), %rdx
+	leaq 160(%rbx,%rdi,8), %rax
+	movq %rdi, %rcx
+	jmp .Lpoly1305_finish_ext_avx2_41
+.Lpoly1305_finish_ext_avx2_49:
+	movq %rbp, %rax
+	subq %rsp, %rax
+	leaq -24(%rbp), %rsp
+	vzeroall
+	popq %rbx
+	popq %r12
+	popq %r13
+	popq %rbp
+	addq $(8*5), %rax
+ret
+.size _gcry_poly1305_amd64_avx2_finish_ext,.-_gcry_poly1305_amd64_avx2_finish_ext;
+
+#endif
diff --git a/cipher/poly1305-internal.h b/cipher/poly1305-internal.h
index fa3fe75..0299c43 100644
--- a/cipher/poly1305-internal.h
+++ b/cipher/poly1305-internal.h
@@ -54,23 +54,40 @@
 #endif
 
 
+/* POLY1305_USE_AVX2 indicates whether to compile with AMD64 AVX2 code. */
+#undef POLY1305_USE_AVX2
+#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \
+    defined(ENABLE_AVX2_SUPPORT)
+# define POLY1305_USE_AVX2 1
+# define POLY1305_AVX2_BLOCKSIZE 64
+# define POLY1305_AVX2_STATESIZE 328
+# define POLY1305_AVX2_ALIGNMENT 32
+#endif
+
+
 /* Largest block-size used in any implementation (optimized implementations
  * might use block-size multiple of 16). */
-#ifdef POLY1305_USE_SSE2
+#ifdef POLY1305_USE_AVX2
+# define POLY1305_LARGEST_BLOCKSIZE POLY1305_AVX2_BLOCKSIZE
+#elif defined(POLY1305_USE_SSE2)
 # define POLY1305_LARGEST_BLOCKSIZE POLY1305_SSE2_BLOCKSIZE
 #else
 # define POLY1305_LARGEST_BLOCKSIZE POLY1305_REF_BLOCKSIZE
 #endif
 
 /* Largest state-size used in any implementation. */
-#ifdef POLY1305_USE_SSE2
+#ifdef POLY1305_USE_AVX2
+# define POLY1305_LARGEST_STATESIZE POLY1305_AVX2_STATESIZE
+#elif defined(POLY1305_USE_SSE2)
 # define POLY1305_LARGEST_STATESIZE POLY1305_SSE2_STATESIZE
 #else
 # define POLY1305_LARGEST_STATESIZE POLY1305_REF_STATESIZE
 #endif
 
 /* Minimum alignment for state pointer passed to implementations. */
-#ifdef POLY1305_USE_SSE2
+#ifdef POLY1305_USE_AVX2
+# define POLY1305_STATE_ALIGNMENT POLY1305_AVX2_ALIGNMENT
+#elif defined(POLY1305_USE_SSE2)
 # define POLY1305_STATE_ALIGNMENT POLY1305_SSE2_ALIGNMENT
 #else
 # define POLY1305_STATE_ALIGNMENT POLY1305_REF_ALIGNMENT
diff --git a/cipher/poly1305.c b/cipher/poly1305.c
index cd1902a..fe241c1 100644
--- a/cipher/poly1305.c
+++ b/cipher/poly1305.c
@@ -57,6 +57,25 @@ static const poly1305_ops_t poly1305_amd64_sse2_ops = {
 #endif
 
 
+#ifdef POLY1305_USE_AVX2
+
+void _gcry_poly1305_amd64_avx2_init_ext(void *state, const poly1305_key_t *key);
+unsigned int _gcry_poly1305_amd64_avx2_finish_ext(void *state, const byte *m,
+						  size_t remaining,
+						  byte mac[16]);
+unsigned int _gcry_poly1305_amd64_avx2_blocks(void *ctx, const byte *m,
+					      size_t bytes);
+
+static const poly1305_ops_t poly1305_amd64_avx2_ops = {
+  POLY1305_AVX2_BLOCKSIZE,
+  _gcry_poly1305_amd64_avx2_init_ext,
+  _gcry_poly1305_amd64_avx2_blocks,
+  _gcry_poly1305_amd64_avx2_finish_ext
+};
+
+#endif
+
+
 #ifdef HAVE_U64_TYPEDEF
 
 /* Reference unoptimized poly1305 implementation using 32 bit * 32 bit = 64 bit
@@ -616,6 +635,7 @@ _gcry_poly1305_init (poly1305_context_t * ctx, const byte * key,
   static int initialized;
   static const char *selftest_failed;
   poly1305_key_t keytmp;
+  unsigned int features = _gcry_get_hw_features ();
 
   if (!initialized)
     {
@@ -637,6 +657,12 @@ _gcry_poly1305_init (poly1305_context_t * ctx, const byte * key,
   ctx->ops = &poly1305_default_ops;
 #endif
 
+#ifdef POLY1305_USE_AVX2
+  if (features & HWF_INTEL_AVX2)
+    ctx->ops = &poly1305_amd64_avx2_ops;
+#endif
+  (void)features;
+
   buf_cpy (keytmp.b, key, POLY1305_KEYLEN);
   poly1305_init (ctx, &keytmp);
 
diff --git a/configure.ac b/configure.ac
index 4dc36d5..47a322b 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1825,6 +1825,7 @@ case "${host}" in
    x86_64-*-*)
       # Build with the assembly implementation
       GCRYPT_CIPHERS="$GCRYPT_CIPHERS poly1305-sse2-amd64.lo"
+      GCRYPT_CIPHERS="$GCRYPT_CIPHERS poly1305-avx2-amd64.lo"
    ;;
 esac
 
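
One note on the poly1305-internal.h hunk above (again not part of the
patch): POLY1305_STATE_ALIGNMENT rises to 32 because the AVX2 code
accesses the state with aligned vmovdqa loads and stores. A sketch of
the kind of pointer rounding the generic code has to apply to an
over-allocated state buffer; the helper name is made up for illustration:

  #include <stdint.h>

  /* Mirrors POLY1305_STATE_ALIGNMENT from poly1305-internal.h (32 for
     the AVX2 code).  The buffer must be over-allocated by at least
     POLY1305_STATE_ALIGNMENT - 1 bytes. */
  #define POLY1305_STATE_ALIGNMENT 32

  static void *
  poly1305_aligned_state (void *buffer)
  {
    uintptr_t p = (uintptr_t) buffer;

    p += POLY1305_STATE_ALIGNMENT - 1;
    p &= ~(uintptr_t) (POLY1305_STATE_ALIGNMENT - 1);
    return (void *) p;
  }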



