[PATCH 1/3] New Poly1305 implementation

Jussi Kivilinna jussi.kivilinna at iki.fi
Sat Jan 6 19:02:56 CET 2018


* cipher/Makefile.am: Include '../mpi' for 'longlong.h'; Remove
'poly1305-sse2-amd64.S', 'poly1305-avx2-amd64.S' and
'poly1305-armv7-neon.S'.
* cipher/poly1305-armv7-neon.S: Remove.
* cipher/poly1305-avx2-amd64.S: Remove.
* cipher/poly1305-sse2-amd64.S: Remove.
* cipher/poly1305-internal.h (POLY1305_BLOCKSIZE)
(POLY1305_STATE): New.
(POLY1305_SYSV_FUNC_ABI, POLY1305_REF_BLOCKSIZE)
(POLY1305_REF_STATESIZE, POLY1305_REF_ALIGNMENT)
(POLY1305_USE_SSE2, POLY1305_SSE2_BLOCKSIZE, POLY1305_SSE2_STATESIZE)
(POLY1305_SSE2_ALIGNMENT, POLY1305_USE_AVX2, POLY1305_AVX2_BLOCKSIZE)
(POLY1305_AVX2_STATESIZE, POLY1305_AVX2_ALIGNMENT)
(POLY1305_USE_NEON, POLY1305_NEON_BLOCKSIZE, POLY1305_NEON_STATESIZE)
(POLY1305_NEON_ALIGNMENT, POLY1305_LARGEST_BLOCKSIZE)
(POLY1305_LARGEST_STATESIZE, POLY1305_LARGEST_ALIGNMENT)
(POLY1305_STATE_BLOCKSIZE, POLY1305_STATE_STATESIZE)
(POLY1305_STATE_ALIGNMENT, OPS_FUNC_ABI, poly1305_key_s)
(poly1305_ops_s): Remove.
(poly1305_context_s): Rewrite.
* cipher/poly1305.c (_gcry_poly1305_amd64_sse2_init_ext)
(_gcry_poly1305_amd64_sse2_finish_ext)
(_gcry_poly1305_amd64_sse2_blocks, poly1305_amd64_sse2_ops)
(poly1305_init_ext_ref32, poly1305_blocks_ref32)
(poly1305_finish_ext_ref32, poly1305_default_ops)
(_gcry_poly1305_amd64_avx2_init_ext)
(_gcry_poly1305_amd64_avx2_finish_ext)
(_gcry_poly1305_amd64_avx2_blocks)
(poly1305_amd64_avx2_ops, poly1305_get_state): Remove.
(poly1305_init): Rewrite.
(USE_MPI_64BIT, USE_MPI_32BIT): New.
[USE_MPI_64BIT] (ADD_1305_64, MUL_MOD_1305_64, poly1305_blocks)
(poly1305_final): New implementation using 64-bit limbs.
[USE_MPI_32BIT] (UMUL_ADD_32, ADD_1305_32, MUL_MOD_1305_32)
(poly1305_blocks): New implementation using 32-bit limbs.
(_gcry_poly1305_update, _gcry_poly1305_finish)
(_gcry_poly1305_init): Adapt to new implementation.
* configure.ac: Remove 'poly1305-sse2-amd64.lo',
'poly1305-avx2-amd64.lo' and 'poly1305-armv7-neon.lo'.
--
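
For illustration, the new 64-bit limb path mentioned above processes one
16-byte block roughly as in the sketch below.  This is not the literal code
added to poly1305.c (the patch builds the double-word arithmetic from the
macros in 'mpi/longlong.h', hence the new '../mpi' include path); the sketch
instead uses unsigned __int128, and the names poly1305_block_sketch, u64 and
u128 are made up for the example:

/* Poly1305 with 64-bit limbs: accumulator h = h0 + h1*2^64 + h2*2^128
 * (h2 stays small), clamped key r = r0 + r1*2^64 with r1 divisible by 4.
 * m[0]/m[1] are the two little-endian 64-bit words of the message block. */
typedef unsigned long long u64;
typedef unsigned __int128 u128;

static void
poly1305_block_sketch (u64 h[3], const u64 r[2], const u64 m[2])
{
  u64 r0 = r[0], r1 = r[1];
  u64 s1 = r1 + (r1 >> 2);  /* r1*2^128 == s1 (mod 2^130-5), since 4 | r1 */
  u64 h0 = h[0], h1 = h[1], h2 = h[2], c;
  u128 d0, d1;

  /* h += m, plus the 2^128 padding bit of a full block.  */
  d0 = (u128)h0 + m[0];               h0 = (u64)d0;
  d1 = (u128)h1 + (d0 >> 64) + m[1];  h1 = (u64)d1;
  h2 += (u64)(d1 >> 64) + 1;

  /* h = h * r, partially reduced modulo 2^130-5.  */
  d0 = (u128)h0 * r0 + (u128)h1 * s1;
  d1 = (u128)h0 * r1 + (u128)h1 * r0 + (u128)h2 * s1;
  h2 = h2 * r0;                       /* fits in 64 bits: h2 small, r0 < 2^60 */

  h0 = (u64)d0;
  d1 += d0 >> 64;
  h1 = (u64)d1;
  h2 += (u64)(d1 >> 64);

  /* Fold bits above 2^130 back in, using 2^130 == 5 (mod 2^130-5).  */
  c = (h2 >> 2) + (h2 & ~(u64)3);     /* c = 5 * (h2 >> 2) */
  h2 &= 3;
  d0 = (u128)h0 + c;                  h0 = (u64)d0;
  d1 = (u128)h1 + (d0 >> 64);         h1 = (u64)d1;
  h2 += (u64)(d1 >> 64);

  h[0] = h0; h[1] = h1; h[2] = h2;
}

The USE_MPI_32BIT variant presumably follows the same structure with 32-bit
limbs, built from the UMUL_ADD_32 (32x32->64-bit multiply-add) macro named in
the ChangeLog above.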

Intel Core i7-4790K CPU @ 4.00GHz (x86_64):
                    |  nanosecs/byte   mebibytes/sec   cycles/byte
 POLY1305           |     0.284 ns/B    3358.6 MiB/s      1.14 c/B

Intel Core i7-4790K CPU @ 4.00GHz (i386):
                    |  nanosecs/byte   mebibytes/sec   cycles/byte
 POLY1305           |     0.888 ns/B    1073.9 MiB/s      3.55 c/B

Cortex-A53 @ 1152MHz (armv7):
                    |  nanosecs/byte   mebibytes/sec   cycles/byte
 POLY1305           |      4.40 ns/B     216.7 MiB/s      5.07 c/B

Cortex-A53 @ 1152MHz (aarch64):
                    |  nanosecs/byte   mebibytes/sec   cycles/byte
 POLY1305           |      2.60 ns/B     367.0 MiB/s      2.99 c/B

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 0 files changed

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index bbfab4c82..08baa7c44 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -78,7 +78,6 @@ gost28147.c gost.h \
 gostr3411-94.c \
 md4.c \
 md5.c \
-poly1305-sse2-amd64.S poly1305-avx2-amd64.S poly1305-armv7-neon.S \
 rijndael.c rijndael-internal.h rijndael-tables.h rijndael-aesni.c \
   rijndael-padlock.c rijndael-amd64.S rijndael-arm.S \
   rijndael-ssse3-amd64.c rijndael-ssse3-amd64-asm.S \
diff --git a/cipher/poly1305-armv7-neon.S b/cipher/poly1305-armv7-neon.S
deleted file mode 100644
index 13cb4a5d8..000000000
--- a/cipher/poly1305-armv7-neon.S
+++ /dev/null
@@ -1,744 +0,0 @@
-/* poly1305-armv7-neon.S  -  ARMv7/NEON implementation of Poly1305
- *
- * Copyright (C) 2014 Jussi Kivilinna <jussi.kivilinna at iki.fi>
- *
- * This file is part of Libgcrypt.
- *
- * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as
- * published by the Free Software Foundation; either version 2.1 of
- * the License, or (at your option) any later version.
- *
- * Libgcrypt is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * Based on public domain implementation by Andrew Moon at
- *  https://github.com/floodyberry/poly1305-opt
- */
-
-#include <config.h>
-
-#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) && \
-    defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \
-    defined(HAVE_GCC_INLINE_ASM_NEON)
-
-.syntax unified
-.fpu neon
-.arm
-
-#ifdef __PIC__
-#  define GET_DATA_POINTER(reg, name, rtmp) \
-		ldr reg, 1f; \
-		ldr rtmp, 2f; \
-		b 3f; \
-	1:	.word _GLOBAL_OFFSET_TABLE_-(3f+8); \
-	2:	.word name(GOT); \
-	3:	add reg, pc, reg; \
-		ldr reg, [reg, rtmp];
-#else
-#  define GET_DATA_POINTER(reg, name, rtmp) ldr reg, =name
-#endif
-
-#define UNALIGNED_LDMIA2(ptr, l0, l1) \
-        tst ptr, #3; \
-        beq 1f; \
-        vpush {d0}; \
-        vld1.32 {d0}, [ptr]!; \
-        vmov l0, s0; \
-        vmov l1, s1; \
-        vpop {d0}; \
-        b 2f; \
-     1: ldmia ptr!, {l0-l1}; \
-     2: ;
-
-#define UNALIGNED_LDMIA4(ptr, l0, l1, l2, l3) \
-        tst ptr, #3; \
-        beq 1f; \
-        vpush {d0-d1}; \
-        vld1.32 {d0-d1}, [ptr]!; \
-        vmov l0, s0; \
-        vmov l1, s1; \
-        vmov l2, s2; \
-        vmov l3, s3; \
-        vpop {d0-d1}; \
-        b 2f; \
-     1: ldmia ptr!, {l0-l3}; \
-     2: ;
-
-.text
-
-.p2align 2
-.Lpoly1305_init_constants_neon:
-.long 0x3ffff03
-.long 0x3ffc0ff
-.long 0x3f03fff
-.long 0x00fffff
-
-.globl _gcry_poly1305_armv7_neon_init_ext
-.type  _gcry_poly1305_armv7_neon_init_ext,%function;
-_gcry_poly1305_armv7_neon_init_ext:
-.Lpoly1305_init_ext_neon_local:
-	stmfd sp!, {r4-r11, lr}
-	sub sp, sp, #32
-	mov r14, r2
-	and r2, r2, r2
-	moveq r14, #-1
-	UNALIGNED_LDMIA4(r1, r2, r3, r4, r5)
-	GET_DATA_POINTER(r7,.Lpoly1305_init_constants_neon,r8)
-	mov r6, r2
-	mov r8, r2, lsr #26
-	mov r9, r3, lsr #20
-	mov r10, r4, lsr #14
-	mov r11, r5, lsr #8
-	orr r8, r8, r3, lsl #6
-	orr r9, r9, r4, lsl #12
-	orr r10, r10, r5, lsl #18
-	ldmia r7, {r2-r5}
-	and r2, r2, r8
-	and r3, r3, r9
-	and r4, r4, r10
-	and r5, r5, r11
-	and r6, r6, 0x3ffffff
-	stmia r0!, {r2-r6}
-	eor r8, r8, r8
-	str r8, [sp, #24]
-.Lpoly1305_init_ext_neon_squareloop:
-	ldr r8, [sp, #24]
-	mov r12, #16
-	cmp r8, #2
-	beq .Lpoly1305_init_ext_neon_donesquaring
-	cmp r8, #1
-	moveq r12, #64
-	cmp r14, r12
-	bls .Lpoly1305_init_ext_neon_donesquaring
-	add r8, #1
-	str r8, [sp, #24]
-	mov r6, r6, lsl #1
-	mov r2, r2, lsl #1
-	umull r7, r8, r3, r3
-	umull r9, r10, r6, r4
-	umlal r7, r8, r6, r5
-	umlal r9, r10, r2, r3
-	add r11, r5, r5, lsl #2
-	umlal r7, r8, r2, r4
-	umlal r9, r10, r5, r11
-	str r7, [sp, #16]
-	str r8, [sp, #20]
-	mov r2, r2, lsr #1
-	mov r5, r5, lsl #1
-	str r9, [sp, #8]
-	str r10, [sp, #12]
-	umull r7, r8, r2, r2
-	umull r9, r10, r6, r2
-	add r11, r3, r3, lsl #2
-	add r12, r4, r4, lsl #2
-	umlal r7, r8, r6, r3
-	umlal r9, r10, r5, r11
-	umlal r7, r8, r5, r12
-	umlal r9, r10, r4, r12
-	mov r6, r6, lsr #1
-	mov r3, r3, lsl #1
-	add r11, r2, r2, lsl #2
-	str r7, [sp, #0]
-	str r8, [sp, #4]
-	umull r7, r8, r6, r6
-	umlal r7, r8, r3, r12
-	umlal r7, r8, r5, r11
-	and r6, r7, 0x3ffffff
-	mov r11, r7, lsr #26
-	orr r11, r11, r8, lsl #6
-	ldr r7, [sp, #0]
-	ldr r8, [sp, #4]
-	adds r9, r9, r11
-	adc r10, r10, #0
-	and r2, r9, 0x3ffffff
-	mov r11, r9, lsr #26
-	orr r11, r11, r10, lsl #6
-	ldr r9, [sp, #8]
-	ldr r10, [sp, #12]
-	adds r7, r7, r11
-	adc r8, r8, #0
-	and r3, r7, 0x3ffffff
-	mov r11, r7, lsr #26
-	orr r11, r11, r8, lsl #6
-	ldr r7, [sp, #16]
-	ldr r8, [sp, #20]
-	adds r9, r9, r11
-	adc r10, r10, #0
-	and r4, r9, 0x3ffffff
-	mov r11, r9, lsr #26
-	orr r11, r11, r10, lsl #6
-	adds r7, r7, r11
-	adc r8, r8, #0
-	and r5, r7, 0x3ffffff
-	mov r11, r7, lsr #26
-	orr r11, r11, r8, lsl #6
-	add r11, r11, r11, lsl #2
-	add r6, r6, r11
-	mov r11, r6, lsr #26
-	and r6, r6, 0x3ffffff
-	add r2, r2, r11
-	stmia r0!, {r2-r6}
-	b .Lpoly1305_init_ext_neon_squareloop
-.Lpoly1305_init_ext_neon_donesquaring:
-	mov r2, #2
-	ldr r14, [sp, #24]
-	sub r14, r2, r14
-	mov r3, r14, lsl #4
-	add r3, r3, r14, lsl #2
-	add r0, r0, r3
-	eor r2, r2, r2
-	eor r3, r3, r3
-	eor r4, r4, r4
-	eor r5, r5, r5
-	eor r6, r6, r6
-	stmia r0!, {r2-r6}
-	stmia r0!, {r2-r6}
-	UNALIGNED_LDMIA4(r1, r2, r3, r4, r5)
-	stmia r0, {r2-r6}
-	add sp, sp, #32
-	ldmfd sp!, {r4-r11, lr}
-	mov r0, #(9*4+32)
-	bx lr
-.ltorg
-.size _gcry_poly1305_armv7_neon_init_ext,.-_gcry_poly1305_armv7_neon_init_ext;
-
-.globl _gcry_poly1305_armv7_neon_blocks
-.type  _gcry_poly1305_armv7_neon_blocks,%function;
-_gcry_poly1305_armv7_neon_blocks:
-.Lpoly1305_blocks_neon_local:
-	vmov.i32 q0, #0xffffffff
-	vmov.i32 d4, #1
-	vsubw.u32 q0, q0, d4
-	vstmdb sp!, {q4,q5,q6,q7}
-	stmfd sp!, {r4-r11, lr}
-	mov r8, sp
-	and sp, sp, #~63
-	sub sp, sp, #192
-	str r0, [sp, #108]
-	str r1, [sp, #112]
-	str r2, [sp, #116]
-	str r8, [sp, #120]
-	mov r3, r0
-	mov r0, r1
-	mov r1, r2
-	mov r2, r3
-	ldr r8, [r2, #116]
-	veor d15, d15, d15
-	vorr.i32 d15, #(1 << 24)
-	tst r8, #2
-	beq .Lpoly1305_blocks_neon_skip_shift8
-	vshr.u64 d15, #32
-.Lpoly1305_blocks_neon_skip_shift8:
-	tst r8, #4
-	beq .Lpoly1305_blocks_neon_skip_shift16
-	veor d15, d15, d15
-.Lpoly1305_blocks_neon_skip_shift16:
-	vst1.64 d15, [sp, :64]
-	tst r8, #1
-	bne .Lpoly1305_blocks_neon_started
-	vld1.64 {q0-q1}, [r0]!
-	vswp d1, d2
-	vmovn.i64 d21, q0
-	vshrn.i64 d22, q0, #26
-	vshrn.u64 d24, q1, #14
-	vext.8 d0, d0, d2, #4
-	vext.8 d1, d1, d3, #4
-	vshr.u64 q1, q1, #32
-	vshrn.i64 d23, q0, #20
-	vshrn.u64 d25, q1, #8
-	vand.i32 d21, #0x03ffffff
-	vand.i32 q11, #0x03ffffff
-	vand.i32 q12, #0x03ffffff
-	orr r8, r8, #1
-	sub r1, r1, #32
-	str r8, [r2, #116]
-	vorr d25, d25, d15
-	b .Lpoly1305_blocks_neon_setupr20
-.Lpoly1305_blocks_neon_started:
-	add r9, r2, #60
-	vldm r9, {d21-d25}
-.Lpoly1305_blocks_neon_setupr20:
-	vmov.i32 d0, #5
-	tst r8, #(8|16)
-	beq .Lpoly1305_blocks_neon_setupr20_simple
-	tst r8, #(8)
-	beq .Lpoly1305_blocks_neon_setupr20_r_1
-	mov r9, r2
-	add r10, r2, #20
-	vld1.64 {q9}, [r9]!
-	vld1.64 {q8}, [r10]!
-	vld1.64 {d2}, [r9]
-	vld1.64 {d20}, [r10]
-	b .Lpoly1305_blocks_neon_setupr20_hard
-.Lpoly1305_blocks_neon_setupr20_r_1:
-	mov r9, r2
-	vmov.i32 d2, #1
-	vld1.64 {q8}, [r9]!
-	veor q9, q9, q9
-	vshr.u64 d2, d2, #32
-	vld1.64 {d20}, [r9]
-.Lpoly1305_blocks_neon_setupr20_hard:
-	vzip.i32 q8, q9
-	vzip.i32 d20, d2
-	b .Lpoly1305_blocks_neon_setups20
-.Lpoly1305_blocks_neon_setupr20_simple:
-	add r9, r2, #20
-	vld1.64 {d2-d4}, [r9]
-	vdup.32 d16, d2[0]
-	vdup.32 d17, d2[1]
-	vdup.32 d18, d3[0]
-	vdup.32 d19, d3[1]
-	vdup.32 d20, d4[0]
-.Lpoly1305_blocks_neon_setups20:
-	vmul.i32 q13, q8, d0[0]
-	vmov.i64 q15, 0x00000000ffffffff
-	vmul.i32 q14, q9, d0[0]
-	vshr.u64 q15, q15, #6
-	cmp r1, #64
-	blo .Lpoly1305_blocks_neon_try32
-	add r9, sp, #16
-	add r10, r2, #40
-	add r11, sp, #64
-	str r1, [sp, #116]
-	vld1.64 {d10-d12}, [r10]
-	vmov d14, d12
-	vmul.i32 q6, q5, d0[0]
-.Lpoly1305_blocks_neon_mainloop:
-	UNALIGNED_LDMIA4(r0, r2, r3, r4, r5)
-	vmull.u32 q0, d25, d12[0]
-	mov r7, r2, lsr #26
-	vmlal.u32 q0, d24, d12[1]
-	mov r8, r3, lsr #20
-	ldr r6, [sp, #0]
-	vmlal.u32 q0, d23, d13[0]
-	mov r9, r4, lsr #14
-	vmlal.u32 q0, d22, d13[1]
-	orr r6, r6, r5, lsr #8
-	vmlal.u32 q0, d21, d14[0]
-	orr r3, r7, r3, lsl #6
-	vmull.u32 q1, d25, d12[1]
-	orr r4, r8, r4, lsl #12
-	orr r5, r9, r5, lsl #18
-	vmlal.u32 q1, d24, d13[0]
-	UNALIGNED_LDMIA4(r0, r7, r8, r9, r10)
-	vmlal.u32 q1, d23, d13[1]
-	mov r1, r7, lsr #26
-	vmlal.u32 q1, d22, d14[0]
-	ldr r11, [sp, #4]
-	mov r12, r8, lsr #20
-	vmlal.u32 q1, d21, d10[0]
-	mov r14, r9, lsr #14
-	vmull.u32 q2, d25, d13[0]
-	orr r11, r11, r10, lsr #8
-	orr r8, r1, r8, lsl #6
-	vmlal.u32 q2, d24, d13[1]
-	orr r9, r12, r9, lsl #12
-	vmlal.u32 q2, d23, d14[0]
-	orr r10, r14, r10, lsl #18
-	vmlal.u32 q2, d22, d10[0]
-	mov r12, r3
-	and r2, r2, #0x3ffffff
-	vmlal.u32 q2, d21, d10[1]
-	mov r14, r5
-	vmull.u32 q3, d25, d13[1]
-	and r3, r7, #0x3ffffff
-	vmlal.u32 q3, d24, d14[0]
-	and r5, r8, #0x3ffffff
-	vmlal.u32 q3, d23, d10[0]
-	and r7, r9, #0x3ffffff
-	vmlal.u32 q3, d22, d10[1]
-	and r8, r14, #0x3ffffff
-	vmlal.u32 q3, d21, d11[0]
-	and r9, r10, #0x3ffffff
-	add r14, sp, #128
-	vmull.u32 q4, d25, d14[0]
-	mov r10, r6
-	vmlal.u32 q4, d24, d10[0]
-	and r6, r4, #0x3ffffff
-	vmlal.u32 q4, d23, d10[1]
-	and r4, r12, #0x3ffffff
-	vmlal.u32 q4, d22, d11[0]
-	stm r14, {r2-r11}
-	vmlal.u32 q4, d21, d11[1]
-	vld1.64 {d21-d24}, [r14, :256]!
-	vld1.64 {d25}, [r14, :64]
-	UNALIGNED_LDMIA4(r0, r2, r3, r4, r5)
-	vmlal.u32 q0, d25, d26
-	mov r7, r2, lsr #26
-	vmlal.u32 q0, d24, d27
-	ldr r6, [sp, #0]
-	mov r8, r3, lsr #20
-	vmlal.u32 q0, d23, d28
-	mov r9, r4, lsr #14
-	vmlal.u32 q0, d22, d29
-	orr r6, r6, r5, lsr #8
-	vmlal.u32 q0, d21, d20
-	orr r3, r7, r3, lsl #6
-	vmlal.u32 q1, d25, d27
-	orr r4, r8, r4, lsl #12
-	orr r5, r9, r5, lsl #18
-	vmlal.u32 q1, d24, d28
-	UNALIGNED_LDMIA4(r0, r7, r8, r9, r10)
-	vmlal.u32 q1, d23, d29
-	mov r1, r7, lsr #26
-	vmlal.u32 q1, d22, d20
-	ldr r11, [sp, #4]
-	mov r12, r8, lsr #20
-	vmlal.u32 q1, d21, d16
-	mov r14, r9, lsr #14
-	vmlal.u32 q2, d25, d28
-	orr r11, r11, r10, lsr #8
-	orr r8, r1, r8, lsl #6
-	orr r9, r12, r9, lsl #12
-	vmlal.u32 q2, d24, d29
-	orr r10, r14, r10, lsl #18
-	and r2, r2, #0x3ffffff
-	mov r12, r3
-	vmlal.u32 q2, d23, d20
-	mov r14, r5
-	vmlal.u32 q2, d22, d16
-	and r3, r7, #0x3ffffff
-	vmlal.u32 q2, d21, d17
-	and r5, r8, #0x3ffffff
-	vmlal.u32 q3, d25, d29
-	and r7, r9, #0x3ffffff
-	vmlal.u32 q3, d24, d20
-	and r8, r14, #0x3ffffff
-	vmlal.u32 q3, d23, d16
-	and r9, r10, #0x3ffffff
-	vmlal.u32 q3, d22, d17
-	add r14, sp, #128
-	vmlal.u32 q3, d21, d18
-	mov r10, r6
-	vmlal.u32 q4, d25, d20
-	vmlal.u32 q4, d24, d16
-	and r6, r4, #0x3ffffff
-	vmlal.u32 q4, d23, d17
-	and r4, r12, #0x3ffffff
-	vmlal.u32 q4, d22, d18
-	stm r14, {r2-r11}
-	vmlal.u32 q4, d21, d19
-	vld1.64 {d21-d24}, [r14, :256]!
-	vld1.64 {d25}, [r14, :64]
-	vaddw.u32 q0, q0, d21
-	vaddw.u32 q1, q1, d22
-	vaddw.u32 q2, q2, d23
-	vaddw.u32 q3, q3, d24
-	vaddw.u32 q4, q4, d25
-	vshr.u64 q11, q0, #26
-	vand q0, q0, q15
-	vadd.i64 q1, q1, q11
-	vshr.u64 q12, q3, #26
-	vand q3, q3, q15
-	vadd.i64 q4, q4, q12
-	vshr.u64 q11, q1, #26
-	vand q1, q1, q15
-	vadd.i64 q2, q2, q11
-	vshr.u64 q12, q4, #26
-	vand q4, q4, q15
-	vadd.i64 q0, q0, q12
-	vshl.i64 q12, q12, #2
-	ldr r1, [sp, #116]
-	vadd.i64 q0, q0, q12
-	vshr.u64 q11, q2, #26
-	vand q2, q2, q15
-	vadd.i64 q3, q3, q11
-	sub r1, #64
-	vshr.u64 q12, q0, #26
-	vand q0, q0, q15
-	vadd.i64 q1, q1, q12
-	cmp r1, #64
-	vshr.u64 q11, q3, #26
-	vand q3, q3, q15
-	vadd.i64 q4, q4, q11
-	vmovn.i64 d21, q0
-	str r1, [sp, #116]
-	vmovn.i64 d22, q1
-	vmovn.i64 d23, q2
-	vmovn.i64 d24, q3
-	vmovn.i64 d25, q4
-	bhs .Lpoly1305_blocks_neon_mainloop
-.Lpoly1305_blocks_neon_try32:
-	cmp r1, #32
-	blo .Lpoly1305_blocks_neon_done
-	tst r0, r0
-	bne .Lpoly1305_blocks_loadm32
-	veor q0, q0, q0
-	veor q1, q1, q1
-	veor q2, q2, q2
-	veor q3, q3, q3
-	veor q4, q4, q4
-	b .Lpoly1305_blocks_continue32
-.Lpoly1305_blocks_loadm32:
-	vld1.64 {q0-q1}, [r0]!
-	veor q4, q4, q4
-	vswp d1, d2
-	veor q3, q3, q3
-	vtrn.32 q0, q4
-	vtrn.32 q1, q3
-	vshl.i64 q2, q1, #12
-	vshl.i64 q3, q3, #18
-	vshl.i64 q1, q4, #6
-	vmovl.u32 q4, d15
-.Lpoly1305_blocks_continue32:
-	vmlal.u32 q0, d25, d26
-	vmlal.u32 q0, d24, d27
-	vmlal.u32 q0, d23, d28
-	vmlal.u32 q0, d22, d29
-	vmlal.u32 q0, d21, d20
-	vmlal.u32 q1, d25, d27
-	vmlal.u32 q1, d24, d28
-	vmlal.u32 q1, d23, d29
-	vmlal.u32 q1, d22, d20
-	vmlal.u32 q1, d21, d16
-	vmlal.u32 q2, d25, d28
-	vmlal.u32 q2, d24, d29
-	vmlal.u32 q2, d23, d20
-	vmlal.u32 q2, d22, d16
-	vmlal.u32 q2, d21, d17
-	vmlal.u32 q3, d25, d29
-	vmlal.u32 q3, d24, d20
-	vmlal.u32 q3, d23, d16
-	vmlal.u32 q3, d22, d17
-	vmlal.u32 q3, d21, d18
-	vmlal.u32 q4, d25, d20
-	vmlal.u32 q4, d24, d16
-	vmlal.u32 q4, d23, d17
-	vmlal.u32 q4, d22, d18
-	vmlal.u32 q4, d21, d19
-	vshr.u64 q11, q0, #26
-	vand q0, q0, q15
-	vadd.i64 q1, q1, q11
-	vshr.u64 q12, q3, #26
-	vand q3, q3, q15
-	vadd.i64 q4, q4, q12
-	vshr.u64 q11, q1, #26
-	vand q1, q1, q15
-	vadd.i64 q2, q2, q11
-	vshr.u64 q12, q4, #26
-	vand q4, q4, q15
-	vadd.i64 q0, q0, q12
-	vshl.i64 q12, q12, #2
-	vadd.i64 q0, q0, q12
-	vshr.u64 q11, q2, #26
-	vand q2, q2, q15
-	vadd.i64 q3, q3, q11
-	vshr.u64 q12, q0, #26
-	vand q0, q0, q15
-	vadd.i64 q1, q1, q12
-	vshr.u64 q11, q3, #26
-	vand q3, q3, q15
-	vadd.i64 q4, q4, q11
-	vmovn.i64 d21, q0
-	vmovn.i64 d22, q1
-	vmovn.i64 d23, q2
-	vmovn.i64 d24, q3
-	vmovn.i64 d25, q4
-.Lpoly1305_blocks_neon_done:
-	tst r0, r0
-	beq .Lpoly1305_blocks_neon_final
-	ldr r2, [sp, #108]
-	add r2, r2, #60
-	vst1.64 {d21}, [r2]!
-	vst1.64 {d22-d25}, [r2]
-	b .Lpoly1305_blocks_neon_leave
-.Lpoly1305_blocks_neon_final:
-	vadd.u32 d10, d0, d1
-	vadd.u32 d13, d2, d3
-	vadd.u32 d11, d4, d5
-	ldr r5, [sp, #108]
-	vadd.u32 d14, d6, d7
-	vadd.u32 d12, d8, d9
-	vtrn.32 d10, d13
-	vtrn.32 d11, d14
-	vst1.64 {d10-d12}, [sp]
-	ldm sp, {r0-r4}
-	mov r12, r0, lsr #26
-	and r0, r0, #0x3ffffff
-	add r1, r1, r12
-	mov r12, r1, lsr #26
-	and r1, r1, #0x3ffffff
-	add r2, r2, r12
-	mov r12, r2, lsr #26
-	and r2, r2, #0x3ffffff
-	add r3, r3, r12
-	mov r12, r3, lsr #26
-	and r3, r3, #0x3ffffff
-	add r4, r4, r12
-	mov r12, r4, lsr #26
-	and r4, r4, #0x3ffffff
-	add r12, r12, r12, lsl #2
-	add r0, r0, r12
-	mov r12, r0, lsr #26
-	and r0, r0, #0x3ffffff
-	add r1, r1, r12
-	mov r12, r1, lsr #26
-	and r1, r1, #0x3ffffff
-	add r2, r2, r12
-	mov r12, r2, lsr #26
-	and r2, r2, #0x3ffffff
-	add r3, r3, r12
-	mov r12, r3, lsr #26
-	and r3, r3, #0x3ffffff
-	add r4, r4, r12
-	mov r12, r4, lsr #26
-	and r4, r4, #0x3ffffff
-	add r12, r12, r12, lsl #2
-	add r0, r0, r12
-	mov r12, r0, lsr #26
-	and r0, r0, #0x3ffffff
-	add r1, r1, r12
-	add r6, r0, #5
-	mov r12, r6, lsr #26
-	and r6, r6, #0x3ffffff
-	add r7, r1, r12
-	mov r12, r7, lsr #26
-	and r7, r7, #0x3ffffff
-	add r10, r2, r12
-	mov r12, r10, lsr #26
-	and r10, r10, #0x3ffffff
-	add r11, r3, r12
-	mov r12, #-(1 << 26)
-	add r12, r12, r11, lsr #26
-	and r11, r11, #0x3ffffff
-	add r14, r4, r12
-	mov r12, r14, lsr #31
-	sub r12, #1
-	and r6, r6, r12
-	and r7, r7, r12
-	and r10, r10, r12
-	and r11, r11, r12
-	and r14, r14, r12
-	mvn r12, r12
-	and r0, r0, r12
-	and r1, r1, r12
-	and r2, r2, r12
-	and r3, r3, r12
-	and r4, r4, r12
-	orr r0, r0, r6
-	orr r1, r1, r7
-	orr r2, r2, r10
-	orr r3, r3, r11
-	orr r4, r4, r14
-	orr r0, r0, r1, lsl #26
-	lsr r1, r1, #6
-	orr r1, r1, r2, lsl #20
-	lsr r2, r2, #12
-	orr r2, r2, r3, lsl #14
-	lsr r3, r3, #18
-	orr r3, r3, r4, lsl #8
-	add r5, r5, #60
-	stm r5, {r0-r3}
-.Lpoly1305_blocks_neon_leave:
-	sub r0, sp, #8
-	ldr sp, [sp, #120]
-	ldmfd sp!, {r4-r11, lr}
-	vldm sp!, {q4-q7}
-	sub r0, sp, r0
-	bx lr
-.size _gcry_poly1305_armv7_neon_blocks,.-_gcry_poly1305_armv7_neon_blocks;
-
-.globl _gcry_poly1305_armv7_neon_finish_ext
-.type  _gcry_poly1305_armv7_neon_finish_ext,%function;
-_gcry_poly1305_armv7_neon_finish_ext:
-.Lpoly1305_finish_ext_neon_local:
-	stmfd sp!, {r4-r11, lr}
-	sub sp, sp, #32
-	mov r5, r0
-	mov r6, r1
-	mov r7, r2
-	mov r8, r3
-	ands r7, r7, r7
-	beq .Lpoly1305_finish_ext_neon_noremaining
-	mov r9, sp
-	veor q0, q0, q0
-	veor q1, q1, q1
-	vst1.64 {q0-q1}, [sp]
-	tst r7, #16
-	beq .Lpoly1305_finish_ext_neon_skip16
-	vld1.u64 {q0}, [r1]!
-	vst1.64 {q0}, [r9]!
-.Lpoly1305_finish_ext_neon_skip16:
-	tst r7, #8
-	beq .Lpoly1305_finish_ext_neon_skip8
-	UNALIGNED_LDMIA2(r1, r10, r11)
-	stmia r9!, {r10-r11}
-.Lpoly1305_finish_ext_neon_skip8:
-	tst r7, #4
-	beq .Lpoly1305_finish_ext_neon_skip4
-	ldr r10, [r1], #4
-	str r10, [r9], #4
-.Lpoly1305_finish_ext_neon_skip4:
-	tst r7, #2
-	beq .Lpoly1305_finish_ext_neon_skip2
-	ldrh r10, [r1], #2
-	strh r10, [r9], #2
-.Lpoly1305_finish_ext_neon_skip2:
-	tst r7, #1
-	beq .Lpoly1305_finish_ext_neon_skip1
-	ldrb r10, [r1], #1
-	strb r10, [r9], #1
-.Lpoly1305_finish_ext_neon_skip1:
-	cmp r7, #16
-	beq .Lpoly1305_finish_ext_neon_skipfinalbit
-	mov r10, #1
-	strb r10, [r9]
-.Lpoly1305_finish_ext_neon_skipfinalbit:
-	ldr r10, [r5, #116]
-	orrhs r10, #2
-	orrlo r10, #4
-	str r10, [r5, #116]
-	mov r0, r5
-	mov r1, sp
-	mov r2, #32
-	bl .Lpoly1305_blocks_neon_local
-.Lpoly1305_finish_ext_neon_noremaining:
-	ldr r10, [r5, #116]
-	tst r10, #1
-	beq .Lpoly1305_finish_ext_neon_notstarted
-	cmp r7, #0
-	beq .Lpoly1305_finish_ext_neon_user2r
-	cmp r7, #16
-	bls .Lpoly1305_finish_ext_neon_user1
-.Lpoly1305_finish_ext_neon_user2r:
-	orr r10, r10, #8
-	b .Lpoly1305_finish_ext_neon_finalblock
-.Lpoly1305_finish_ext_neon_user1:
-	orr r10, r10, #16
-.Lpoly1305_finish_ext_neon_finalblock:
-	str r10, [r5, #116]
-	mov r0, r5
-	eor r1, r1, r1
-	mov r2, #32
-	bl .Lpoly1305_blocks_neon_local
-.Lpoly1305_finish_ext_neon_notstarted:
-	add r0, r5, #60
-	add r9, r5, #100
-	ldm r0, {r0-r3}
-	ldm r9, {r9-r12}
-	adds r0, r0, r9
-	adcs r1, r1, r10
-	adcs r2, r2, r11
-	adcs r3, r3, r12
-	stm r8, {r0-r3}
-	veor q0, q0, q0
-	veor q1, q1, q1
-	veor q2, q2, q2
-	veor q3, q3, q3
-	vstmia r5!, {q0-q3}
-	vstm r5, {q0-q3}
-	add sp, sp, #32
-	ldmfd sp!, {r4-r11, lr}
-	mov r0, #(9*4+32)
-	bx lr
-.size _gcry_poly1305_armv7_neon_finish_ext,.-_gcry_poly1305_armv7_neon_finish_ext;
-
-#endif
diff --git a/cipher/poly1305-avx2-amd64.S b/cipher/poly1305-avx2-amd64.S
deleted file mode 100644
index 9362a5aee..000000000
--- a/cipher/poly1305-avx2-amd64.S
+++ /dev/null
@@ -1,962 +0,0 @@
-/* poly1305-avx2-amd64.S  -  AMD64/AVX2 implementation of Poly1305
- *
- * Copyright (C) 2014 Jussi Kivilinna <jussi.kivilinna at iki.fi>
- *
- * This file is part of Libgcrypt.
- *
- * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as
- * published by the Free Software Foundation; either version 2.1 of
- * the License, or (at your option) any later version.
- *
- * Libgcrypt is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * Based on public domain implementation by Andrew Moon at
- *  https://github.com/floodyberry/poly1305-opt
- */
-
-#include <config.h>
-
-#if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
-    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
-    defined(ENABLE_AVX2_SUPPORT)
-
-#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
-# define ELF(...) __VA_ARGS__
-#else
-# define ELF(...) /*_*/
-#endif
-
-
-.text
-
-
-.align 8
-.globl _gcry_poly1305_amd64_avx2_init_ext
-ELF(.type  _gcry_poly1305_amd64_avx2_init_ext,@function;)
-_gcry_poly1305_amd64_avx2_init_ext:
-.Lpoly1305_init_ext_avx2_local:
-	xor %edx, %edx
-	vzeroupper
-	pushq %r12
-	pushq %r13
-	pushq %r14
-	pushq %r15
-	pushq %rbx
-	movq %rdx, %rcx
-	vpxor %ymm0, %ymm0, %ymm0
-	movq $-1, %r8
-	testq %rcx, %rcx
-	vmovdqu %ymm0, (%rdi)
-	vmovdqu %ymm0, 32(%rdi)
-	vmovdqu %ymm0, 64(%rdi)
-	vmovdqu %ymm0, 96(%rdi)
-	vmovdqu %ymm0, 128(%rdi)
-	movq 8(%rsi), %r9
-	cmove %r8, %rcx
-	movq $0xffc0fffffff, %r8
-	movq %r9, %r13
-	movq (%rsi), %r10
-	andq %r10, %r8
-	shrq $44, %r10
-	movq %r8, %r14
-	shlq $20, %r13
-	orq %r13, %r10
-	movq $0xfffffc0ffff, %r13
-	shrq $24, %r9
-	andq %r13, %r10
-	movq $0xffffffc0f, %r13
-	andq %r13, %r9
-	movl %r8d, %r13d
-	andl $67108863, %r13d
-	movl %r13d, 164(%rdi)
-	movq %r10, %r13
-	shrq $26, %r14
-	shlq $18, %r13
-	orq %r13, %r14
-	movq %r10, %r13
-	shrq $8, %r13
-	andl $67108863, %r14d
-	andl $67108863, %r13d
-	movl %r14d, 172(%rdi)
-	movq %r10, %r14
-	movl %r13d, 180(%rdi)
-	movq %r9, %r13
-	shrq $34, %r14
-	shlq $10, %r13
-	orq %r13, %r14
-	movq %r9, %r13
-	shrq $16, %r13
-	andl $67108863, %r14d
-	movl %r14d, 188(%rdi)
-	movl %r13d, 196(%rdi)
-	cmpq $16, %rcx
-	jbe .Lpoly1305_init_ext_avx2_continue
-	lea (%r9,%r9,4), %r11
-	shlq $2, %r11
-	lea (%r10,%r10), %rax
-	mulq %r11
-	movq %rax, %r13
-	movq %r8, %rax
-	movq %rdx, %r14
-	mulq %r8
-	addq %rax, %r13
-	lea (%r8,%r8), %rax
-	movq %r13, %r12
-	adcq %rdx, %r14
-	mulq %r10
-	shlq $20, %r14
-	movq %rax, %r15
-	shrq $44, %r12
-	movq %r11, %rax
-	orq %r12, %r14
-	movq %rdx, %r12
-	mulq %r9
-	addq %rax, %r15
-	movq %r8, %rax
-	adcq %rdx, %r12
-	addq %r15, %r14
-	lea (%r9,%r9), %r15
-	movq %r14, %rbx
-	adcq $0, %r12
-	mulq %r15
-	shlq $20, %r12
-	movq %rdx, %r11
-	shrq $44, %rbx
-	orq %rbx, %r12
-	movq %rax, %rbx
-	movq %r10, %rax
-	mulq %r10
-	addq %rax, %rbx
-	adcq %rdx, %r11
-	addq %rbx, %r12
-	movq $0xfffffffffff, %rbx
-	movq %r12, %r15
-	adcq $0, %r11
-	andq %rbx, %r13
-	shlq $22, %r11
-	andq %rbx, %r14
-	shrq $42, %r15
-	orq %r15, %r11
-	lea (%r11,%r11,4), %r11
-	addq %r11, %r13
-	movq %rbx, %r11
-	andq %r13, %r11
-	shrq $44, %r13
-	movq %r11, %r15
-	addq %r13, %r14
-	movq $0x3ffffffffff, %r13
-	andq %r14, %rbx
-	andq %r13, %r12
-	movq %rbx, %r13
-	shrq $26, %r15
-	shlq $18, %r13
-	orq %r13, %r15
-	movq %rbx, %r13
-	shrq $44, %r14
-	shrq $8, %r13
-	addq %r14, %r12
-	movl %r11d, %r14d
-	andl $67108863, %r15d
-	andl $67108863, %r14d
-	andl $67108863, %r13d
-	movl %r14d, 204(%rdi)
-	movq %rbx, %r14
-	movl %r13d, 220(%rdi)
-	movq %r12, %r13
-	shrq $34, %r14
-	shlq $10, %r13
-	orq %r13, %r14
-	movq %r12, %r13
-	shrq $16, %r13
-	andl $67108863, %r14d
-	movl %r15d, 212(%rdi)
-	movl %r14d, 228(%rdi)
-	movl %r13d, 236(%rdi)
-	cmpq $32, %rcx
-	jbe .Lpoly1305_init_ext_avx2_continue
-	movq %r9, %rax
-	lea (%rbx,%rbx,4), %r14
-	shlq $2, %r14
-	mulq %r14
-	movq %rdi, -32(%rsp)
-	lea (%r12,%r12,4), %rdi
-	shlq $2, %rdi
-	movq %rax, %r14
-	movq %r10, %rax
-	movq %rdx, %r15
-	mulq %rdi
-	movq %rax, %r13
-	movq %r11, %rax
-	movq %rcx, -16(%rsp)
-	movq %rdx, %rcx
-	mulq %r8
-	addq %rax, %r13
-	movq %rdi, %rax
-	movq %rsi, -24(%rsp)
-	adcq %rdx, %rcx
-	addq %r13, %r14
-	adcq %rcx, %r15
-	movq %r14, %rcx
-	mulq %r9
-	shlq $20, %r15
-	movq %rax, %r13
-	shrq $44, %rcx
-	movq %r11, %rax
-	orq %rcx, %r15
-	movq %rdx, %rcx
-	mulq %r10
-	movq %rax, %rsi
-	movq %rbx, %rax
-	movq %rdx, %rdi
-	mulq %r8
-	addq %rax, %rsi
-	movq %r11, %rax
-	adcq %rdx, %rdi
-	addq %rsi, %r13
-	adcq %rdi, %rcx
-	addq %r13, %r15
-	movq %r15, %rdi
-	adcq $0, %rcx
-	mulq %r9
-	shlq $20, %rcx
-	movq %rdx, %rsi
-	shrq $44, %rdi
-	orq %rdi, %rcx
-	movq %rax, %rdi
-	movq %rbx, %rax
-	mulq %r10
-	movq %rax, %r9
-	movq %r8, %rax
-	movq %rdx, %r10
-	movq $0xfffffffffff, %r8
-	mulq %r12
-	addq %rax, %r9
-	adcq %rdx, %r10
-	andq %r8, %r14
-	addq %r9, %rdi
-	adcq %r10, %rsi
-	andq %r8, %r15
-	addq %rdi, %rcx
-	movq $0x3ffffffffff, %rdi
-	movq %rcx, %r10
-	adcq $0, %rsi
-	andq %rdi, %rcx
-	shlq $22, %rsi
-	shrq $42, %r10
-	orq %r10, %rsi
-	movq -32(%rsp), %rdi
-	lea (%rsi,%rsi,4), %r9
-	movq %r8, %rsi
-	addq %r9, %r14
-	andq %r14, %rsi
-	shrq $44, %r14
-	addq %r14, %r15
-	andq %r15, %r8
-	shrq $44, %r15
-	movq %r8, %r14
-	addq %r15, %rcx
-	movl %esi, %r15d
-	movq %rcx, %r10
-	movq %r8, %r9
-	shrq $26, %rsi
-	andl $67108863, %r15d
-	shlq $18, %r14
-	shrq $34, %r8
-	orq %r14, %rsi
-	shlq $10, %r10
-	shrq $8, %r9
-	orq %r10, %r8
-	shrq $16, %rcx
-	andl $67108863, %esi
-	movl %esi, 252(%rdi)
-	andl $67108863, %r9d
-	movl %ecx, 276(%rdi)
-	andl $67108863, %r8d
-	movl %r15d, 244(%rdi)
-	movl %r9d, 260(%rdi)
-	movl %r8d, 268(%rdi)
-	movq -16(%rsp), %rcx
-	movq -24(%rsp), %rsi
-.Lpoly1305_init_ext_avx2_continue:
-	movl 16(%rsi), %r8d
-	movl %r8d, 284(%rdi)
-	movl 20(%rsi), %r9d
-	movl %r9d, 292(%rdi)
-	movl 24(%rsi), %r10d
-	movl %r10d, 300(%rdi)
-	movl 28(%rsi), %esi
-	movl %esi, 308(%rdi)
-	cmpq $48, %rcx
-	jbe .Lpoly1305_init_ext_avx2_done
-	lea (%r12,%r12,4), %r9
-	shlq $2, %r9
-	lea (%rbx,%rbx), %rax
-	mulq %r9
-	movq %rax, %rsi
-	movq %r11, %rax
-	movq %rdx, %r8
-	mulq %r11
-	addq %rax, %rsi
-	lea (%r11,%r11), %rax
-	movq %rsi, %r10
-	adcq %rdx, %r8
-	mulq %rbx
-	movq %rax, %r13
-	movq %r12, %rax
-	movq %rdx, %rcx
-	addq %r12, %r12
-	mulq %r9
-	addq %rax, %r13
-	movq %r11, %rax
-	movq $0xfffffffffff, %r9
-	adcq %rdx, %rcx
-	andq %r9, %rsi
-	mulq %r12
-	shlq $20, %r8
-	movq %rax, %r11
-	shrq $44, %r10
-	movq %rbx, %rax
-	orq %r10, %r8
-	movq %rdx, %r12
-	mulq %rbx
-	addq %r13, %r8
-	movq %r8, %r14
-	adcq $0, %rcx
-	andq %r9, %r8
-	addq %rax, %r11
-	adcq %rdx, %r12
-	shlq $20, %rcx
-	shrq $44, %r14
-	orq %r14, %rcx
-	addq %r11, %rcx
-	movq %rcx, %rbx
-	adcq $0, %r12
-	shlq $22, %r12
-	shrq $42, %rbx
-	orq %rbx, %r12
-	movq %r9, %rbx
-	lea (%r12,%r12,4), %r15
-	addq %r15, %rsi
-	andq %rsi, %rbx
-	shrq $44, %rsi
-	movl %ebx, %r11d
-	addq %rsi, %r8
-	movq $0x3ffffffffff, %rsi
-	andq %r8, %r9
-	andq %rsi, %rcx
-	shrq $44, %r8
-	movq %r9, %rax
-	addq %r8, %rcx
-	movq %r9, %r8
-	movq %rcx, %r10
-	andl $67108863, %r11d
-	shrq $26, %rbx
-	shlq $18, %r8
-	shrq $34, %r9
-	orq %r8, %rbx
-	shlq $10, %r10
-	shrq $8, %rax
-	orq %r10, %r9
-	shrq $16, %rcx
-	andl $67108863, %ebx
-	andl $67108863, %eax
-	andl $67108863, %r9d
-	movl %r11d, 184(%rdi)
-	movl %r11d, 176(%rdi)
-	movl %r11d, 168(%rdi)
-	movl %r11d, 160(%rdi)
-	movl %ebx, 216(%rdi)
-	movl %ebx, 208(%rdi)
-	movl %ebx, 200(%rdi)
-	movl %ebx, 192(%rdi)
-	movl %eax, 248(%rdi)
-	movl %eax, 240(%rdi)
-	movl %eax, 232(%rdi)
-	movl %eax, 224(%rdi)
-	movl %r9d, 280(%rdi)
-	movl %r9d, 272(%rdi)
-	movl %r9d, 264(%rdi)
-	movl %r9d, 256(%rdi)
-	movl %ecx, 312(%rdi)
-	movl %ecx, 304(%rdi)
-	movl %ecx, 296(%rdi)
-	movl %ecx, 288(%rdi)
-.Lpoly1305_init_ext_avx2_done:
-	movq $0, 320(%rdi)
-	vzeroall
-	popq %rbx
-	popq %r15
-	popq %r14
-	popq %r13
-	popq %r12
-	ret
-ELF(.size _gcry_poly1305_amd64_avx2_init_ext,.-_gcry_poly1305_amd64_avx2_init_ext;)
-
-
-.align 8
-.globl _gcry_poly1305_amd64_avx2_blocks
-ELF(.type  _gcry_poly1305_amd64_avx2_blocks,@function;)
-_gcry_poly1305_amd64_avx2_blocks:
-.Lpoly1305_blocks_avx2_local:
-	vzeroupper
-	pushq %rbp
-	movq %rsp, %rbp
-	pushq %rbx
-	andq $-64, %rsp
-	subq $200, %rsp
-	movl $((1<<26)-1), %r8d
-	movl $(5), %r9d
-	movl $((1<<24)), %r10d
-	vmovd %r8d, %xmm0
-	vmovd %r9d, %xmm8
-	vmovd %r10d, %xmm7
-	vpbroadcastq %xmm0, %ymm0
-	vpbroadcastq %xmm8, %ymm8
-	vpbroadcastq %xmm7, %ymm7
-	vmovdqa %ymm7, 168(%rsp)
-	movq 320(%rdi), %rax
-	testb $60, %al
-	je .Lpoly1305_blocks_avx2_9
-	vmovdqa 168(%rsp), %ymm7
-	vpsrldq $8, %ymm7, %ymm1
-	vmovdqa %ymm1, 168(%rsp)
-	testb $4, %al
-	je .Lpoly1305_blocks_avx2_10
-	vpermq $192, %ymm1, %ymm7
-	vmovdqa %ymm7, 168(%rsp)
-.Lpoly1305_blocks_avx2_10:
-	testb $8, %al
-	je .Lpoly1305_blocks_avx2_11
-	vpermq $240, 168(%rsp), %ymm7
-	vmovdqa %ymm7, 168(%rsp)
-.Lpoly1305_blocks_avx2_11:
-	testb $16, %al
-	je .Lpoly1305_blocks_avx2_12
-	vpermq $252, 168(%rsp), %ymm6
-	vmovdqa %ymm6, 168(%rsp)
-.Lpoly1305_blocks_avx2_12:
-	testb $32, %al
-	je .Lpoly1305_blocks_avx2_9
-	vpxor %xmm6, %xmm6, %xmm6
-	vmovdqa %ymm6, 168(%rsp)
-.Lpoly1305_blocks_avx2_9:
-	testb $1, %al
-	jne .Lpoly1305_blocks_avx2_13
-	vmovdqu (%rsi), %ymm3
-	vmovdqu 32(%rsi), %ymm1
-	vpunpcklqdq %ymm1, %ymm3, %ymm2
-	vpunpckhqdq %ymm1, %ymm3, %ymm1
-	vpermq $216, %ymm2, %ymm2
-	vpermq $216, %ymm1, %ymm1
-	vpand %ymm2, %ymm0, %ymm5
-	vpsrlq $26, %ymm2, %ymm4
-	vpand %ymm4, %ymm0, %ymm4
-	vpsllq $12, %ymm1, %ymm3
-	vpsrlq $52, %ymm2, %ymm2
-	vpor %ymm3, %ymm2, %ymm2
-	vpand %ymm2, %ymm0, %ymm3
-	vpsrlq $26, %ymm2, %ymm2
-	vpand %ymm2, %ymm0, %ymm2
-	vpsrlq $40, %ymm1, %ymm1
-	vpor 168(%rsp), %ymm1, %ymm1
-	addq $64, %rsi
-	subq $64, %rdx
-	orq $1, 320(%rdi)
-	jmp .Lpoly1305_blocks_avx2_14
-.Lpoly1305_blocks_avx2_13:
-	vmovdqa (%rdi), %ymm5
-	vmovdqa 32(%rdi), %ymm4
-	vmovdqa 64(%rdi), %ymm3
-	vmovdqa 96(%rdi), %ymm2
-	vmovdqa 128(%rdi), %ymm1
-.Lpoly1305_blocks_avx2_14:
-	cmpq $63, %rdx
-	jbe .Lpoly1305_blocks_avx2_15
-	vmovdqa 160(%rdi), %ymm6
-	vmovdqa %ymm8, 136(%rsp)
-	vmovdqa 192(%rdi), %ymm7
-	vpmuludq %ymm8, %ymm7, %ymm11
-	vmovdqa %ymm11, 104(%rsp)
-	vmovdqa 224(%rdi), %ymm11
-	vmovdqa %ymm11, 72(%rsp)
-	vpmuludq %ymm11, %ymm8, %ymm11
-	vmovdqa %ymm11, 40(%rsp)
-	vmovdqa 256(%rdi), %ymm11
-	vmovdqa %ymm11, 8(%rsp)
-	vpmuludq %ymm11, %ymm8, %ymm11
-	vmovdqa %ymm11, -24(%rsp)
-	vmovdqa 288(%rdi), %ymm13
-	vmovdqa %ymm13, -56(%rsp)
-	vpmuludq %ymm13, %ymm8, %ymm13
-	vmovdqa %ymm13, -88(%rsp)
-.Lpoly1305_blocks_avx2_16:
-	vpmuludq 104(%rsp), %ymm1, %ymm14
-	vmovdqa 40(%rsp), %ymm13
-	vpmuludq %ymm13, %ymm2, %ymm8
-	vpmuludq %ymm13, %ymm1, %ymm13
-	vmovdqa -24(%rsp), %ymm9
-	vpmuludq %ymm9, %ymm2, %ymm10
-	vpmuludq %ymm9, %ymm1, %ymm11
-	vpaddq %ymm8, %ymm14, %ymm14
-	vpmuludq %ymm9, %ymm3, %ymm8
-	vmovdqa -88(%rsp), %ymm12
-	vpmuludq %ymm12, %ymm1, %ymm9
-	vpaddq %ymm10, %ymm13, %ymm13
-	vpmuludq %ymm12, %ymm4, %ymm15
-	vmovdqa %ymm12, %ymm10
-	vpmuludq %ymm12, %ymm3, %ymm12
-	vpaddq %ymm8, %ymm14, %ymm14
-	vpmuludq %ymm10, %ymm2, %ymm10
-	vpmuludq %ymm6, %ymm2, %ymm8
-	vpaddq %ymm15, %ymm14, %ymm14
-	vpmuludq %ymm6, %ymm1, %ymm1
-	vpaddq %ymm12, %ymm13, %ymm13
-	vpmuludq %ymm6, %ymm5, %ymm15
-	vpaddq %ymm10, %ymm11, %ymm11
-	vpmuludq %ymm6, %ymm4, %ymm12
-	vpaddq %ymm8, %ymm9, %ymm9
-	vpmuludq %ymm6, %ymm3, %ymm10
-	vpmuludq %ymm7, %ymm3, %ymm8
-	vpaddq %ymm15, %ymm14, %ymm14
-	vpmuludq %ymm7, %ymm2, %ymm2
-	vpaddq %ymm12, %ymm13, %ymm12
-	vpmuludq %ymm7, %ymm5, %ymm15
-	vpaddq %ymm10, %ymm11, %ymm10
-	vpmuludq %ymm7, %ymm4, %ymm13
-	vpaddq %ymm8, %ymm9, %ymm8
-	vmovdqa 72(%rsp), %ymm9
-	vpmuludq %ymm9, %ymm4, %ymm11
-	vpaddq %ymm2, %ymm1, %ymm1
-	vpmuludq %ymm9, %ymm3, %ymm3
-	vpaddq %ymm15, %ymm12, %ymm12
-	vpmuludq %ymm9, %ymm5, %ymm15
-	vpaddq %ymm13, %ymm10, %ymm10
-	vmovdqa 8(%rsp), %ymm2
-	vpmuludq %ymm2, %ymm5, %ymm9
-	vpaddq %ymm11, %ymm8, %ymm8
-	vpmuludq %ymm2, %ymm4, %ymm4
-	vpaddq %ymm3, %ymm1, %ymm1
-	vpmuludq -56(%rsp), %ymm5, %ymm5
-	vpaddq %ymm15, %ymm10, %ymm10
-	vpaddq %ymm9, %ymm8, %ymm8
-	vpaddq %ymm4, %ymm1, %ymm1
-	vpaddq %ymm5, %ymm1, %ymm5
-	vmovdqu (%rsi), %ymm3
-	vmovdqu 32(%rsi), %ymm2
-	vperm2i128 $32, %ymm2, %ymm3, %ymm1
-	vperm2i128 $49, %ymm2, %ymm3, %ymm2
-	vpunpckldq %ymm2, %ymm1, %ymm15
-	vpunpckhdq %ymm2, %ymm1, %ymm2
-	vpxor %xmm4, %xmm4, %xmm4
-	vpunpckldq %ymm4, %ymm15, %ymm1
-	vpunpckhdq %ymm4, %ymm15, %ymm15
-	vpunpckldq %ymm4, %ymm2, %ymm3
-	vpunpckhdq %ymm4, %ymm2, %ymm2
-	vpsllq $6, %ymm15, %ymm15
-	vpsllq $12, %ymm3, %ymm3
-	vpsllq $18, %ymm2, %ymm2
-	vpaddq %ymm1, %ymm14, %ymm14
-	vpaddq %ymm15, %ymm12, %ymm12
-	vpaddq %ymm3, %ymm10, %ymm10
-	vpaddq %ymm2, %ymm8, %ymm8
-	vpaddq 168(%rsp), %ymm5, %ymm5
-	addq $64, %rsi
-	vpsrlq $26, %ymm14, %ymm4
-	vpsrlq $26, %ymm8, %ymm2
-	vpand %ymm0, %ymm14, %ymm14
-	vpand %ymm0, %ymm8, %ymm8
-	vpaddq %ymm4, %ymm12, %ymm12
-	vpaddq %ymm2, %ymm5, %ymm5
-	vpsrlq $26, %ymm12, %ymm3
-	vpsrlq $26, %ymm5, %ymm9
-	vpand %ymm0, %ymm12, %ymm12
-	vpand %ymm0, %ymm5, %ymm11
-	vpaddq %ymm3, %ymm10, %ymm3
-	vpmuludq 136(%rsp), %ymm9, %ymm9
-	vpaddq %ymm9, %ymm14, %ymm14
-	vpsrlq $26, %ymm3, %ymm2
-	vpsrlq $26, %ymm14, %ymm4
-	vpand %ymm0, %ymm3, %ymm3
-	vpand %ymm0, %ymm14, %ymm5
-	vpaddq %ymm2, %ymm8, %ymm2
-	vpaddq %ymm4, %ymm12, %ymm4
-	vpsrlq $26, %ymm2, %ymm1
-	vpand %ymm0, %ymm2, %ymm2
-	vpaddq %ymm1, %ymm11, %ymm1
-	subq $64, %rdx
-	cmpq $63, %rdx
-	ja .Lpoly1305_blocks_avx2_16
-.Lpoly1305_blocks_avx2_15:
-	testb $64, 320(%rdi)
-	jne .Lpoly1305_blocks_avx2_17
-	vmovdqa %ymm5, (%rdi)
-	vmovdqa %ymm4, 32(%rdi)
-	vmovdqa %ymm3, 64(%rdi)
-	vmovdqa %ymm2, 96(%rdi)
-	vmovdqa %ymm1, 128(%rdi)
-	jmp .Lpoly1305_blocks_avx2_8
-.Lpoly1305_blocks_avx2_17:
-	vpermq $245, %ymm5, %ymm0
-	vpaddq %ymm0, %ymm5, %ymm5
-	vpermq $245, %ymm4, %ymm0
-	vpaddq %ymm0, %ymm4, %ymm4
-	vpermq $245, %ymm3, %ymm0
-	vpaddq %ymm0, %ymm3, %ymm3
-	vpermq $245, %ymm2, %ymm0
-	vpaddq %ymm0, %ymm2, %ymm2
-	vpermq $245, %ymm1, %ymm0
-	vpaddq %ymm0, %ymm1, %ymm1
-	vpermq $170, %ymm5, %ymm0
-	vpaddq %ymm0, %ymm5, %ymm5
-	vpermq $170, %ymm4, %ymm0
-	vpaddq %ymm0, %ymm4, %ymm4
-	vpermq $170, %ymm3, %ymm0
-	vpaddq %ymm0, %ymm3, %ymm3
-	vpermq $170, %ymm2, %ymm0
-	vpaddq %ymm0, %ymm2, %ymm2
-	vpermq $170, %ymm1, %ymm0
-	vpaddq %ymm0, %ymm1, %ymm1
-	vmovd %xmm5, %eax
-	vmovd %xmm4, %edx
-	movl %eax, %ecx
-	shrl $26, %ecx
-	addl %edx, %ecx
-	movl %ecx, %edx
-	andl $67108863, %edx
-	vmovd %xmm3, %esi
-	shrl $26, %ecx
-	movl %ecx, %r11d
-	addl %esi, %r11d
-	vmovd %xmm2, %ecx
-	movl %r11d, %r10d
-	shrl $26, %r10d
-	addl %ecx, %r10d
-	movl %r10d, %r9d
-	andl $67108863, %r9d
-	vmovd %xmm1, %r8d
-	movl %edx, %esi
-	salq $26, %rsi
-	andl $67108863, %eax
-	orq %rax, %rsi
-	movabsq $17592186044415, %rax
-	andq %rax, %rsi
-	andl $67108863, %r11d
-	salq $8, %r11
-	shrl $18, %edx
-	movl %edx, %edx
-	orq %r11, %rdx
-	movq %r9, %rcx
-	salq $34, %rcx
-	orq %rcx, %rdx
-	andq %rax, %rdx
-	shrl $26, %r10d
-	addl %r10d, %r8d
-	salq $16, %r8
-	shrl $10, %r9d
-	movl %r9d, %r9d
-	orq %r9, %r8
-	movabsq $4398046511103, %r10
-	movq %r8, %r9
-	andq %r10, %r9
-	shrq $42, %r8
-	leaq (%r8,%r8,4), %rcx
-	addq %rcx, %rsi
-	movq %rsi, %r8
-	andq %rax, %r8
-	movq %rsi, %rcx
-	shrq $44, %rcx
-	addq %rdx, %rcx
-	movq %rcx, %rsi
-	andq %rax, %rsi
-	shrq $44, %rcx
-	movq %rcx, %rdx
-	addq %r9, %rdx
-	andq %rdx, %r10
-	shrq $42, %rdx
-	leaq (%r8,%rdx,4), %rcx
-	leaq (%rcx,%rdx), %rdx
-	movq %rdx, %rbx
-	andq %rax, %rbx
-	shrq $44, %rdx
-	movq %rdx, %r11
-	addq %rsi, %r11
-	leaq 5(%rbx), %r9
-	movq %r9, %r8
-	shrq $44, %r8
-	addq %r11, %r8
-	movabsq $-4398046511104, %rsi
-	addq %r10, %rsi
-	movq %r8, %rdx
-	shrq $44, %rdx
-	addq %rdx, %rsi
-	movq %rsi, %rdx
-	shrq $63, %rdx
-	subq $1, %rdx
-	movq %rdx, %rcx
-	notq %rcx
-	andq %rcx, %rbx
-	andq %rcx, %r11
-	andq %r10, %rcx
-	andq %rax, %r9
-	andq %rdx, %r9
-	orq %r9, %rbx
-	movq %rbx, (%rdi)
-	andq %r8, %rax
-	andq %rdx, %rax
-	orq %rax, %r11
-	movq %r11, 8(%rdi)
-	andq %rsi, %rdx
-	orq %rcx, %rdx
-	movq %rdx, 16(%rdi)
-.Lpoly1305_blocks_avx2_8:
-	movq -8(%rbp), %rbx
-	vzeroall
-	movq %rbp, %rax
-	subq %rsp, %rax
-	leave
-	addq $8, %rax
-	ret
-ELF(.size _gcry_poly1305_amd64_avx2_blocks,.-_gcry_poly1305_amd64_avx2_blocks;)
-
-
-.align 8
-.globl _gcry_poly1305_amd64_avx2_finish_ext
-ELF(.type  _gcry_poly1305_amd64_avx2_finish_ext,@function;)
-_gcry_poly1305_amd64_avx2_finish_ext:
-.Lpoly1305_finish_ext_avx2_local:
-	vzeroupper
-	pushq %rbp
-	movq %rsp, %rbp
-	pushq %r13
-	pushq %r12
-	pushq %rbx
-	andq $-64, %rsp
-	subq $64, %rsp
-	movq %rdi, %rbx
-	movq %rdx, %r13
-	movq %rcx, %r12
-	testq %rdx, %rdx
-	je .Lpoly1305_finish_ext_avx2_22
-	vpxor %xmm0, %xmm0, %xmm0
-	vmovdqa %ymm0, (%rsp)
-	vmovdqa %ymm0, 32(%rsp)
-	movq %rsp, %rax
-	subq %rsp, %rsi
-	testb $32, %dl
-	je .Lpoly1305_finish_ext_avx2_23
-	vmovdqu (%rsp,%rsi), %ymm0
-	vmovdqa %ymm0, (%rsp)
-	leaq 32(%rsp), %rax
-.Lpoly1305_finish_ext_avx2_23:
-	testb $16, %r13b
-	je .Lpoly1305_finish_ext_avx2_24
-	vmovdqu (%rax,%rsi), %xmm0
-	vmovdqa %xmm0, (%rax)
-	addq $16, %rax
-.Lpoly1305_finish_ext_avx2_24:
-	testb $8, %r13b
-	je .Lpoly1305_finish_ext_avx2_25
-	movq (%rax,%rsi), %rdx
-	movq %rdx, (%rax)
-	addq $8, %rax
-.Lpoly1305_finish_ext_avx2_25:
-	testb $4, %r13b
-	je .Lpoly1305_finish_ext_avx2_26
-	movl (%rax,%rsi), %edx
-	movl %edx, (%rax)
-	addq $4, %rax
-.Lpoly1305_finish_ext_avx2_26:
-	testb $2, %r13b
-	je .Lpoly1305_finish_ext_avx2_27
-	movzwl (%rax,%rsi), %edx
-	movw %dx, (%rax)
-	addq $2, %rax
-.Lpoly1305_finish_ext_avx2_27:
-	testb $1, %r13b
-	je .Lpoly1305_finish_ext_avx2_28
-	movzbl (%rax,%rsi), %edx
-	movb %dl, (%rax)
-.Lpoly1305_finish_ext_avx2_28:
-	testb $15, %r13b
-	je .Lpoly1305_finish_ext_avx2_29
-	movb $1, (%rsp,%r13)
-.Lpoly1305_finish_ext_avx2_29:
-	cmpq $47, %r13
-	jbe .Lpoly1305_finish_ext_avx2_30
-	orq $4, 320(%rbx)
-	jmp .Lpoly1305_finish_ext_avx2_31
-.Lpoly1305_finish_ext_avx2_30:
-	cmpq $31, %r13
-	jbe .Lpoly1305_finish_ext_avx2_32
-	orq $8, 320(%rbx)
-	jmp .Lpoly1305_finish_ext_avx2_31
-.Lpoly1305_finish_ext_avx2_32:
-	cmpq $15, %r13
-	jbe .Lpoly1305_finish_ext_avx2_33
-	orq $16, 320(%rbx)
-	jmp .Lpoly1305_finish_ext_avx2_31
-.Lpoly1305_finish_ext_avx2_33:
-	orq $32, 320(%rbx)
-.Lpoly1305_finish_ext_avx2_31:
-	testb $1, 320(%rbx)
-	je .Lpoly1305_finish_ext_avx2_34
-	cmpq $32, %r13
-	ja .Lpoly1305_finish_ext_avx2_34
-	cmpq $17, %r13
-	sbbq %rsi, %rsi
-	notq %rsi
-	addq $2, %rsi
-	cmpq $17, %r13
-	sbbq %rax, %rax
-	movq %rbx, %rdx
-	addq $23, %rax
-	leaq (%rbx,%rax,8), %rax
-	movl $0, %ecx
-.Lpoly1305_finish_ext_avx2_37:
-	movl 244(%rdx), %edi
-	movl %edi, (%rax)
-	movl 252(%rdx), %edi
-	movl %edi, 32(%rax)
-	movl 260(%rdx), %edi
-	movl %edi, 64(%rax)
-	movl 268(%rdx), %edi
-	movl %edi, 96(%rax)
-	movl 276(%rdx), %edi
-	movl %edi, 128(%rax)
-	addq $1, %rcx
-	subq $40, %rdx
-	addq $8, %rax
-	cmpq %rcx, %rsi
-	ja .Lpoly1305_finish_ext_avx2_37
-.Lpoly1305_finish_ext_avx2_34:
-	movl $64, %edx
-	movq %rsp, %rsi
-	movq %rbx, %rdi
-	call .Lpoly1305_blocks_avx2_local
-.Lpoly1305_finish_ext_avx2_22:
-	movq 320(%rbx), %r8
-	testb $1, %r8b
-	je .Lpoly1305_finish_ext_avx2_38
-	leaq -1(%r13), %rax
-	cmpq $47, %rax
-	ja .Lpoly1305_finish_ext_avx2_46
-	cmpq $32, %r13
-	ja .Lpoly1305_finish_ext_avx2_47
-	cmpq $17, %r13
-	sbbq %r9, %r9
-	addq $2, %r9
-	movl $0, %edi
-	cmpq $17, %r13
-	sbbq %rax, %rax
-	notq %rax
-	andl $5, %eax
-	jmp .Lpoly1305_finish_ext_avx2_39
-.Lpoly1305_finish_ext_avx2_41:
-	movl (%rdx), %esi
-	movl %esi, (%rax)
-	movl 8(%rdx), %esi
-	movl %esi, 32(%rax)
-	movl 16(%rdx), %esi
-	movl %esi, 64(%rax)
-	movl 24(%rdx), %esi
-	movl %esi, 96(%rax)
-	movl 32(%rdx), %esi
-	movl %esi, 128(%rax)
-	addq $1, %rcx
-	subq $40, %rdx
-	addq $8, %rax
-	movq %rcx, %rsi
-	subq %rdi, %rsi
-	cmpq %rsi, %r9
-	ja .Lpoly1305_finish_ext_avx2_41
-	cmpq $3, %rcx
-	ja .Lpoly1305_finish_ext_avx2_42
-	leaq 160(%rbx,%rcx,8), %rax
-.Lpoly1305_finish_ext_avx2_43:
-	movl $1, (%rax)
-	movl $0, 32(%rax)
-	movl $0, 64(%rax)
-	movl $0, 96(%rax)
-	movl $0, 128(%rax)
-	addq $1, %rcx
-	addq $8, %rax
-	cmpq $4, %rcx
-	jne .Lpoly1305_finish_ext_avx2_43
-.Lpoly1305_finish_ext_avx2_42:
-	orq $96, %r8
-	movq %r8, 320(%rbx)
-	vpxor %ymm0, %ymm0, %ymm0
-	vmovdqa %ymm0, (%rsp)
-	vmovdqa %ymm0, 32(%rsp)
-	movl $64, %edx
-	movq %rsp, %rsi
-	movq %rbx, %rdi
-	call .Lpoly1305_blocks_avx2_local
-.Lpoly1305_finish_ext_avx2_38:
-	movq 8(%rbx), %rax
-	movq %rax, %rdx
-	salq $44, %rdx
-	orq (%rbx), %rdx
-	shrq $20, %rax
-	movl $24, %edi
-	shlx %rdi, 16(%rbx), %rcx
-	orq %rcx, %rax
-	movl 292(%rbx), %ecx
-	salq $32, %rcx
-	movl 284(%rbx), %esi
-	orq %rsi, %rcx
-	movl 308(%rbx), %esi
-	salq $32, %rsi
-	movl 300(%rbx), %edi
-	orq %rdi, %rsi
-	addq %rcx, %rdx
-	adcq %rsi, %rax
-	movq %rdx, (%r12)
-	movq %rax, 8(%r12)
-	vpxor %xmm0, %xmm0, %xmm0
-	vmovdqu %ymm0, (%rbx)
-	vmovdqu %ymm0, 32(%rbx)
-	vmovdqu %ymm0, 64(%rbx)
-	vmovdqu %ymm0, 96(%rbx)
-	vmovdqu %ymm0, 128(%rbx)
-	vmovdqu %ymm0, 160(%rbx)
-	vmovdqu %ymm0, 192(%rbx)
-	vmovdqu %ymm0, 224(%rbx)
-	jmp .Lpoly1305_finish_ext_avx2_49
-.Lpoly1305_finish_ext_avx2_46:
-	movl $3, %r9d
-	movl $1, %edi
-	movl $10, %eax
-	jmp .Lpoly1305_finish_ext_avx2_39
-.Lpoly1305_finish_ext_avx2_47:
-	movl $3, %r9d
-	movl $0, %edi
-	movl $10, %eax
-.Lpoly1305_finish_ext_avx2_39:
-	leaq 164(%rbx,%rax,8), %rdx
-	leaq 160(%rbx,%rdi,8), %rax
-	movq %rdi, %rcx
-	jmp .Lpoly1305_finish_ext_avx2_41
-.Lpoly1305_finish_ext_avx2_49:
-	movq %rbp, %rax
-	subq %rsp, %rax
-	leaq -24(%rbp), %rsp
-	vzeroall
-	popq %rbx
-	popq %r12
-	popq %r13
-	popq %rbp
-	addq $(8*5), %rax
-ret
-ELF(.size _gcry_poly1305_amd64_avx2_finish_ext,.-_gcry_poly1305_amd64_avx2_finish_ext;)
-
-#endif
diff --git a/cipher/poly1305-internal.h b/cipher/poly1305-internal.h
index bcbe5df70..2405a090f 100644
--- a/cipher/poly1305-internal.h
+++ b/cipher/poly1305-internal.h
@@ -29,139 +29,34 @@
 #include "cipher.h"
 #include "bufhelp.h"
 
-
 #define POLY1305_TAGLEN 16
 #define POLY1305_KEYLEN 32
+#define POLY1305_BLOCKSIZE 16
 
 
-/* Block-size used in default implementation. */
-#define POLY1305_REF_BLOCKSIZE 16
-
-/* State size of default implementation. */
-#define POLY1305_REF_STATESIZE 64
-
-/* State alignment for default implementation. */
-#define POLY1305_REF_ALIGNMENT sizeof(void *)
-
-
-#undef POLY1305_SYSV_FUNC_ABI
-
-/* POLY1305_USE_SSE2 indicates whether to compile with AMD64 SSE2 code. */
-#undef POLY1305_USE_SSE2
-#if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
-    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
-# define POLY1305_USE_SSE2 1
-# define POLY1305_SSE2_BLOCKSIZE 32
-# define POLY1305_SSE2_STATESIZE 248
-# define POLY1305_SSE2_ALIGNMENT 16
-# define POLY1305_SYSV_FUNC_ABI 1
-#endif
-
-
-/* POLY1305_USE_AVX2 indicates whether to compile with AMD64 AVX2 code. */
-#undef POLY1305_USE_AVX2
-#if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
-    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
-    defined(ENABLE_AVX2_SUPPORT)
-# define POLY1305_USE_AVX2 1
-# define POLY1305_AVX2_BLOCKSIZE 64
-# define POLY1305_AVX2_STATESIZE 328
-# define POLY1305_AVX2_ALIGNMENT 32
-# define POLY1305_SYSV_FUNC_ABI 1
-#endif
-
-
-/* POLY1305_USE_NEON indicates whether to enable ARM NEON assembly code. */
-#undef POLY1305_USE_NEON
-#if defined(ENABLE_NEON_SUPPORT) && defined(HAVE_ARM_ARCH_V6) && \
-    defined(__ARMEL__) && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \
-    defined(HAVE_GCC_INLINE_ASM_NEON)
-# define POLY1305_USE_NEON 1
-# define POLY1305_NEON_BLOCKSIZE 32
-# define POLY1305_NEON_STATESIZE 128
-# define POLY1305_NEON_ALIGNMENT 16
-#endif
-
-
-/* Largest block-size used in any implementation (optimized implementations
- * might use block-size multiple of 16). */
-#ifdef POLY1305_USE_AVX2
-# define POLY1305_LARGEST_BLOCKSIZE POLY1305_AVX2_BLOCKSIZE
-#elif defined(POLY1305_USE_NEON)
-# define POLY1305_LARGEST_BLOCKSIZE POLY1305_NEON_BLOCKSIZE
-#elif defined(POLY1305_USE_SSE2)
-# define POLY1305_LARGEST_BLOCKSIZE POLY1305_SSE2_BLOCKSIZE
-#else
-# define POLY1305_LARGEST_BLOCKSIZE POLY1305_REF_BLOCKSIZE
-#endif
-
-/* Largest state-size used in any implementation. */
-#ifdef POLY1305_USE_AVX2
-# define POLY1305_LARGEST_STATESIZE POLY1305_AVX2_STATESIZE
-#elif defined(POLY1305_USE_NEON)
-# define POLY1305_LARGEST_STATESIZE POLY1305_NEON_STATESIZE
-#elif defined(POLY1305_USE_SSE2)
-# define POLY1305_LARGEST_STATESIZE POLY1305_SSE2_STATESIZE
-#else
-# define POLY1305_LARGEST_STATESIZE POLY1305_REF_STATESIZE
-#endif
-
-/* Minimum alignment for state pointer passed to implementations. */
-#ifdef POLY1305_USE_AVX2
-# define POLY1305_STATE_ALIGNMENT POLY1305_AVX2_ALIGNMENT
-#elif defined(POLY1305_USE_NEON)
-# define POLY1305_STATE_ALIGNMENT POLY1305_NEON_ALIGNMENT
-#elif defined(POLY1305_USE_SSE2)
-# define POLY1305_STATE_ALIGNMENT POLY1305_SSE2_ALIGNMENT
-#else
-# define POLY1305_STATE_ALIGNMENT POLY1305_REF_ALIGNMENT
-#endif
-
-
-/* Assembly implementations use SystemV ABI, ABI conversion and additional
- * stack to store XMM6-XMM15 needed on Win64. */
-#undef OPS_FUNC_ABI
-#if defined(POLY1305_SYSV_FUNC_ABI) && \
-    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)
-# define OPS_FUNC_ABI __attribute__((sysv_abi))
-#else
-# define OPS_FUNC_ABI
-#endif
-
-
-typedef struct poly1305_key_s
+typedef struct
 {
-  byte b[POLY1305_KEYLEN];
-} poly1305_key_t;
-
-
-typedef struct poly1305_ops_s
-{
-  size_t block_size;
-  void (*init_ext) (void *ctx, const poly1305_key_t * key) OPS_FUNC_ABI;
-  unsigned int (*blocks) (void *ctx, const byte * m, size_t bytes) OPS_FUNC_ABI;
-  unsigned int (*finish_ext) (void *ctx, const byte * m, size_t remaining,
-			      byte mac[POLY1305_TAGLEN]) OPS_FUNC_ABI;
-} poly1305_ops_t;
-
+  u32 k[4];
+  u32 r[4];
+  u32 h[5];
+} POLY1305_STATE;
 
 typedef struct poly1305_context_s
 {
-  byte state[POLY1305_LARGEST_STATESIZE + POLY1305_STATE_ALIGNMENT];
-  byte buffer[POLY1305_LARGEST_BLOCKSIZE];
-  const poly1305_ops_t *ops;
+  POLY1305_STATE state;
+  byte buffer[POLY1305_BLOCKSIZE];
   unsigned int leftover;
 } poly1305_context_t;
 
 
-gcry_err_code_t _gcry_poly1305_init (poly1305_context_t * ctx, const byte * key,
+gcry_err_code_t _gcry_poly1305_init (poly1305_context_t *ctx, const byte *key,
 				     size_t keylen);
 
-void _gcry_poly1305_finish (poly1305_context_t * ctx,
-			    byte mac[POLY1305_TAGLEN]);
+void _gcry_poly1305_finish (poly1305_context_t *ctx,
+			     byte mac[POLY1305_TAGLEN]);
 
-void _gcry_poly1305_update (poly1305_context_t * ctx, const byte * buf,
-			    size_t buflen);
+void _gcry_poly1305_update (poly1305_context_t *ctx, const byte *buf,
+			     size_t buflen);
 
 
 #endif /* G10_POLY1305_INTERNAL_H */
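
For context, the simplified header above keeps a single 16-byte block size and
a flat POLY1305_STATE of u32 words (presumably shared by the 32-bit and 64-bit
limb code paths in poly1305.c), so callers inside the library drive it through
the three declared functions only.  A usage sketch against those declarations,
with the function name poly1305_tag_sketch and the error handling invented for
the example (builds only inside the libgcrypt tree, where 'byte' and
'gcry_err_code_t' come in via the included headers):

#include "poly1305-internal.h"

/* Compute a one-shot Poly1305 tag over msg with a 32-byte one-time key,
 * using only the internal API declared above.  */
static gcry_err_code_t
poly1305_tag_sketch (const byte key[POLY1305_KEYLEN],
                     const byte *msg, size_t msglen,
                     byte tag[POLY1305_TAGLEN])
{
  poly1305_context_t ctx;
  gcry_err_code_t err;

  err = _gcry_poly1305_init (&ctx, key, POLY1305_KEYLEN);
  if (err)
    return err;

  /* May be called any number of times; partial blocks are carried over
   * in ctx.buffer/ctx.leftover.  */
  _gcry_poly1305_update (&ctx, msg, msglen);

  /* Writes the 16-byte authenticator.  */
  _gcry_poly1305_finish (&ctx, tag);

  return 0;
}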
diff --git a/cipher/poly1305-sse2-amd64.S b/cipher/poly1305-sse2-amd64.S
deleted file mode 100644
index 219eb077b..000000000
--- a/cipher/poly1305-sse2-amd64.S
+++ /dev/null
@@ -1,1043 +0,0 @@
-/* poly1305-sse2-amd64.S  -  AMD64/SSE2 implementation of Poly1305
- *
- * Copyright (C) 2014 Jussi Kivilinna <jussi.kivilinna at iki.fi>
- *
- * This file is part of Libgcrypt.
- *
- * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as
- * published by the Free Software Foundation; either version 2.1 of
- * the License, or (at your option) any later version.
- *
- * Libgcrypt is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * Based on public domain implementation by Andrew Moon at
- *  https://github.com/floodyberry/poly1305-opt
- */
-
-#include <config.h>
-
-#if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
-    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
-
-#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
-# define ELF(...) __VA_ARGS__
-#else
-# define ELF(...) /*_*/
-#endif
-
-
-.text
-
-
-.align 8
-.globl _gcry_poly1305_amd64_sse2_init_ext
-ELF(.type  _gcry_poly1305_amd64_sse2_init_ext,@function;)
-_gcry_poly1305_amd64_sse2_init_ext:
-.Lpoly1305_init_ext_x86_local:
-	xor %edx, %edx
-	pushq %r12
-	pushq %r13
-	pushq %r14
-	movq %rdx, %r10
-	movq $-1, %rcx
-	testq %r10, %r10
-	pxor %xmm0, %xmm0
-	movq $0xfffffc0ffff, %r9
-	movdqa %xmm0, (%rdi)
-	cmove %rcx, %r10
-	movdqa %xmm0, 16(%rdi)
-	movq $0xffc0fffffff, %rcx
-	movdqa %xmm0, 32(%rdi)
-	movdqa %xmm0, 48(%rdi)
-	movdqa %xmm0, 64(%rdi)
-	movq 8(%rsi), %r11
-	movq %r11, %r8
-	movq (%rsi), %r12
-	andq %r12, %rcx
-	shrq $44, %r12
-	shlq $20, %r8
-	shrq $24, %r11
-	orq %r8, %r12
-	movq $0xffffffc0f, %r8
-	andq %r9, %r12
-	andq %r8, %r11
-	movl %ecx, %r8d
-	andl $67108863, %r8d
-	movq %rcx, %r9
-	movl %r8d, 84(%rdi)
-	movq %r12, %r8
-	shrq $26, %r9
-	shlq $18, %r8
-	orq %r8, %r9
-	movq %r12, %r8
-	shrq $8, %r8
-	andl $67108863, %r9d
-	andl $67108863, %r8d
-	movl %r9d, 92(%rdi)
-	movq %r12, %r9
-	movl %r8d, 100(%rdi)
-	movq %r11, %r8
-	shrq $34, %r9
-	shlq $10, %r8
-	orq %r8, %r9
-	movq %r11, %r8
-	shrq $16, %r8
-	andl $67108863, %r9d
-	movl %r9d, 108(%rdi)
-	cmpq $16, %r10
-	movl %r8d, 116(%rdi)
-	movl 16(%rsi), %r8d
-	movl %r8d, 124(%rdi)
-	movl 20(%rsi), %r8d
-	movl %r8d, 132(%rdi)
-	movl 24(%rsi), %r8d
-	movl %r8d, 140(%rdi)
-	movl 28(%rsi), %esi
-	movl %esi, 148(%rdi)
-	jbe .Lpoly1305_init_ext_sse2_done
-	lea (%r11,%r11,4), %r14
-	shlq $2, %r14
-	lea (%r12,%r12), %rax
-	mulq %r14
-	movq %rax, %r13
-	movq %rcx, %rax
-	movq %rdx, %r8
-	mulq %rcx
-	addq %rax, %r13
-	lea (%rcx,%rcx), %rax
-	movq %r13, %r9
-	adcq %rdx, %r8
-	mulq %r12
-	shlq $20, %r8
-	movq %rax, %rsi
-	shrq $44, %r9
-	movq %r11, %rax
-	orq %r9, %r8
-	movq %rdx, %r9
-	mulq %r14
-	addq %rax, %rsi
-	movq %rcx, %rax
-	adcq %rdx, %r9
-	addq %r11, %r11
-	mulq %r11
-	addq %rsi, %r8
-	movq %rax, %r11
-	movq %r12, %rax
-	movq %rdx, %rcx
-	adcq $0, %r9
-	mulq %r12
-	addq %rax, %r11
-	movq %r8, %rsi
-	adcq %rdx, %rcx
-	shlq $20, %r9
-	shrq $44, %rsi
-	orq %rsi, %r9
-	movq $0xfffffffffff, %rsi
-	addq %r11, %r9
-	movq %r9, %r12
-	adcq $0, %rcx
-	andq %rsi, %r13
-	shlq $22, %rcx
-	andq %rsi, %r8
-	shrq $42, %r12
-	orq %r12, %rcx
-	movq %rsi, %r12
-	lea (%rcx,%rcx,4), %rcx
-	addq %rcx, %r13
-	movq %rsi, %rcx
-	andq %r13, %rcx
-	shrq $44, %r13
-	movq %rcx, %r14
-	addq %r13, %r8
-	movq $0x3ffffffffff, %r13
-	andq %r8, %r12
-	andq %r13, %r9
-	shrq $44, %r8
-	movq %r12, %r11
-	addq %r8, %r9
-	movq %r12, %rax
-	movq %r9, %r13
-	movl %ecx, %r8d
-	shrq $26, %r14
-	andl $67108863, %r8d
-	shlq $18, %r11
-	shrq $34, %rax
-	orq %r11, %r14
-	shlq $10, %r13
-	movq %r12, %r11
-	orq %r13, %rax
-	movq %r9, %r13
-	shrq $8, %r11
-	shrq $16, %r13
-	andl $67108863, %r14d
-	andl $67108863, %r11d
-	andl $67108863, %eax
-	movl %r8d, 88(%rdi)
-	cmpq $64, %r10
-	movl %r8d, 80(%rdi)
-	movl %r14d, 104(%rdi)
-	movl %r14d, 96(%rdi)
-	movl %r11d, 120(%rdi)
-	movl %r11d, 112(%rdi)
-	movl %eax, 136(%rdi)
-	movl %eax, 128(%rdi)
-	movl %r13d, 152(%rdi)
-	movl %r13d, 144(%rdi)
-	jbe .Lpoly1305_init_ext_sse2_done
-	lea (%r9,%r9,4), %r14
-	shlq $2, %r14
-	lea (%r12,%r12), %rax
-	mulq %r14
-	movq %rax, %r8
-	movq %rcx, %rax
-	movq %rdx, %r10
-	mulq %rcx
-	addq %rax, %r8
-	lea (%rcx,%rcx), %rax
-	movq %r8, %r11
-	adcq %rdx, %r10
-	andq %rsi, %r8
-	mulq %r12
-	shlq $20, %r10
-	movq %rax, %r13
-	shrq $44, %r11
-	movq %r9, %rax
-	orq %r11, %r10
-	movq %rdx, %r11
-	mulq %r14
-	addq %rax, %r13
-	movq %rcx, %rax
-	adcq %rdx, %r11
-	addq %r9, %r9
-	mulq %r9
-	addq %r13, %r10
-	movq %rax, %r9
-	movq %r12, %rax
-	movq %rdx, %rcx
-	adcq $0, %r11
-	mulq %r12
-	addq %rax, %r9
-	movq %r10, %r13
-	adcq %rdx, %rcx
-	andq %rsi, %r10
-	shlq $20, %r11
-	shrq $44, %r13
-	orq %r13, %r11
-	addq %r9, %r11
-	movq %rsi, %r9
-	movq %r11, %r12
-	adcq $0, %rcx
-	shlq $22, %rcx
-	shrq $42, %r12
-	orq %r12, %rcx
-	lea (%rcx,%rcx,4), %rcx
-	addq %rcx, %r8
-	andq %r8, %r9
-	shrq $44, %r8
-	movl %r9d, %eax
-	addq %r8, %r10
-	movq $0x3ffffffffff, %r8
-	andq %r10, %rsi
-	andq %r8, %r11
-	shrq $44, %r10
-	movq %rsi, %r8
-	addq %r10, %r11
-	andl $67108863, %eax
-	shrq $26, %r9
-	movq %r11, %r10
-	shlq $18, %r8
-	shlq $10, %r10
-	orq %r8, %r9
-	movq %rsi, %r8
-	shrq $34, %rsi
-	andl $67108863, %r9d
-	shrq $8, %r8
-	orq %r10, %rsi
-	shrq $16, %r11
-	andl $67108863, %r8d
-	andl $67108863, %esi
-	movl %eax, 168(%rdi)
-	movl %eax, 160(%rdi)
-	movl %r9d, 184(%rdi)
-	movl %r9d, 176(%rdi)
-	movl %r8d, 200(%rdi)
-	movl %r8d, 192(%rdi)
-	movl %esi, 216(%rdi)
-	movl %esi, 208(%rdi)
-	movl %r11d, 232(%rdi)
-	movl %r11d, 224(%rdi)
-.Lpoly1305_init_ext_sse2_done:
-	movq $0, 240(%rdi)
-	popq %r14
-	popq %r13
-	popq %r12
-	ret
-ELF(.size _gcry_poly1305_amd64_sse2_init_ext,.-_gcry_poly1305_amd64_sse2_init_ext;)
-
-
-.align 8
-.globl _gcry_poly1305_amd64_sse2_finish_ext
-ELF(.type  _gcry_poly1305_amd64_sse2_finish_ext,@function;)
-_gcry_poly1305_amd64_sse2_finish_ext:
-.Lpoly1305_finish_ext_x86_local:
-	pushq %rbp
-	movq %rsp, %rbp
-	subq $64, %rsp
-	andq $~63, %rsp
-	movq %rdx, 32(%rsp)
-	movq %rcx, 40(%rsp)
-	andq %rdx, %rdx
-	jz .Lpoly1305_finish_x86_no_leftover
-	pxor %xmm0, %xmm0
-	movdqa %xmm0, 0+0(%rsp)
-	movdqa %xmm0, 16+0(%rsp)
-	leaq 0(%rsp), %r8
-	testq $16, %rdx
-	jz .Lpoly1305_finish_x86_skip16
-	movdqu 0(%rsi), %xmm0
-	movdqa %xmm0, 0(%r8)
-	addq $16, %rsi
-	addq $16, %r8
-.Lpoly1305_finish_x86_skip16:
-	testq $8, %rdx
-	jz .Lpoly1305_finish_x86_skip8
-	movq 0(%rsi), %rax
-	movq %rax, 0(%r8)
-	addq $8, %rsi
-	addq $8, %r8
-.Lpoly1305_finish_x86_skip8:
-	testq $4, %rdx
-	jz .Lpoly1305_finish_x86_skip4
-	movl 0(%rsi), %eax
-	movl %eax, 0(%r8)
-	addq $4, %rsi
-	addq $4, %r8
-.Lpoly1305_finish_x86_skip4:
-	testq $2, %rdx
-	jz .Lpoly1305_finish_x86_skip2
-	movw 0(%rsi), %ax
-	movw %ax, 0(%r8)
-	addq $2, %rsi
-	addq $2, %r8
-.Lpoly1305_finish_x86_skip2:
-	testq $1, %rdx
-	jz .Lpoly1305_finish_x86_skip1
-	movb 0(%rsi), %al
-	movb %al, 0(%r8)
-	addq $1, %r8
-.Lpoly1305_finish_x86_skip1:
-	cmpq $16, %rdx
-	je .Lpoly1305_finish_x86_is16
-	movb $1, 0(%r8)
-.Lpoly1305_finish_x86_is16:
-	movq $4, %rax
-	jae .Lpoly1305_finish_x86_16andover
-	movq $8, %rax
-.Lpoly1305_finish_x86_16andover:
-	orq %rax, 240(%rdi)
-	leaq 0(%rsp), %rsi
-	movq $32, %rdx
-	callq .Lpoly1305_blocks_x86_local
-.Lpoly1305_finish_x86_no_leftover:
-	testq $1, 240(%rdi)
-	jz .Lpoly1305_finish_x86_not_started
-	movq 32(%rsp), %rdx
-	andq %rdx, %rdx
-	jz .Lpoly1305_finish_x86_r2r
-	cmpq $16, %rdx
-	jg .Lpoly1305_finish_x86_r2r
-	xorl %r10d, %r10d
-	movl 84(%rdi), %eax
-	movl 92(%rdi), %ecx
-	movl 100(%rdi), %edx
-	movl 108(%rdi), %r8d
-	movl 116(%rdi), %r9d
-	movl %eax, 80(%rdi)
-	movl $1, 8+80(%rdi)
-	movl %ecx, 96(%rdi)
-	movl %r10d, 8+96(%rdi)
-	movl %edx, 112(%rdi)
-	movl %r10d, 8+112(%rdi)
-	movl %r8d, 128(%rdi)
-	movl %r10d, 8+128(%rdi)
-	movl %r9d, 144(%rdi)
-	movl %r10d, 8+144(%rdi)
-	jmp .Lpoly1305_finish_x86_combine
-.Lpoly1305_finish_x86_r2r:
-	movl 84(%rdi), %eax
-	movl 92(%rdi), %ecx
-	movl 100(%rdi), %edx
-	movl 108(%rdi), %r8d
-	movl 116(%rdi), %r9d
-	movl %eax, 8+80(%rdi)
-	movl %ecx, 8+96(%rdi)
-	movl %edx, 8+112(%rdi)
-	movl %r8d, 8+128(%rdi)
-	movl %r9d, 8+144(%rdi)
-.Lpoly1305_finish_x86_combine:
-	xorq %rsi, %rsi
-	movq $32, %rdx
-	callq .Lpoly1305_blocks_x86_local
-.Lpoly1305_finish_x86_not_started:
-	movq 0(%rdi), %r8
-	movq 8(%rdi), %r9
-	movq %r9, %r10
-	movq 16(%rdi), %r11
-	shlq $44, %r9
-	shrq $20, %r10
-	shlq $24, %r11
-	orq %r9, %r8
-	orq %r11, %r10
-	pxor %xmm0, %xmm0
-	movl 124(%rdi), %eax
-	movl 132(%rdi), %ecx
-	movl 140(%rdi), %edx
-	movl 148(%rdi), %esi
-	movq 40(%rsp), %r11
-	shlq $32, %rcx
-	shlq $32, %rsi
-	orq %rcx, %rax
-	orq %rsi, %rdx
-	addq %r8, %rax
-	adcq %r10, %rdx
-	movq %rax, 0(%r11)
-	movq %rdx, 8(%r11)
-	movq %rbp, %rax
-	subq %rsp, %rax
-	movq %rbp, %rsp
-	movdqa %xmm0, 0(%rdi)
-	movdqa %xmm0, 16(%rdi)
-	movdqa %xmm0, 32(%rdi)
-	movdqa %xmm0, 48(%rdi)
-	movdqa %xmm0, 64(%rdi)
-	movdqa %xmm0, 80(%rdi)
-	movdqa %xmm0, 96(%rdi)
-	movdqa %xmm0, 112(%rdi)
-	movdqa %xmm0, 128(%rdi)
-	movdqa %xmm0, 144(%rdi)
-	movdqa %xmm0, 160(%rdi)
-	movdqa %xmm0, 176(%rdi)
-	movdqa %xmm0, 192(%rdi)
-	movdqa %xmm0, 208(%rdi)
-	movdqa %xmm0, 224(%rdi)
-	popq %rbp
-	addq $8, %rax
-	ret
-ELF(.size _gcry_poly1305_amd64_sse2_finish_ext,.-_gcry_poly1305_amd64_sse2_finish_ext;)
-
-
-.align 8
-.globl _gcry_poly1305_amd64_sse2_blocks
-ELF(.type  _gcry_poly1305_amd64_sse2_blocks,@function;)
-_gcry_poly1305_amd64_sse2_blocks:
-.Lpoly1305_blocks_x86_local:
-	pushq %rbp
-	movq %rsp, %rbp
-	pushq %rbx
-	andq $-64, %rsp
-	subq $328, %rsp
-	movq 240(%rdi), %rax
-	movl $(1<<24), %r8d
-	movl $((1<<26)-1), %r9d
-	movd %r8, %xmm0
-	movd %r9, %xmm5
-	pshufd $0x44, %xmm0, %xmm0
-	pshufd $0x44, %xmm5, %xmm5
-	testb $4, %al
-	je .Lpoly1305_blocks_x86_3
-	psrldq $8, %xmm0
-.Lpoly1305_blocks_x86_3:
-	testb $8, %al
-	je .Lpoly1305_blocks_x86_4
-	pxor %xmm0, %xmm0
-.Lpoly1305_blocks_x86_4:
-	movdqa %xmm0, 168(%rsp)
-	testb $1, %al
-	jne .Lpoly1305_blocks_x86_5
-	movq 16(%rsi), %xmm0
-	movdqa %xmm5, %xmm7
-	movdqa %xmm5, %xmm10
-	movq (%rsi), %xmm6
-	orq $1, %rax
-	subq $32, %rdx
-	movq 8(%rsi), %xmm1
-	punpcklqdq %xmm0, %xmm6
-	movq 24(%rsi), %xmm0
-	pand %xmm6, %xmm7
-	movdqa %xmm6, %xmm9
-	psrlq $52, %xmm6
-	addq $32, %rsi
-	punpcklqdq %xmm0, %xmm1
-	movdqa %xmm1, %xmm0
-	psrlq $26, %xmm9
-	psllq $12, %xmm0
-	movq %rax, 240(%rdi)
-	pand %xmm5, %xmm9
-	por %xmm0, %xmm6
-	psrlq $40, %xmm1
-	pand %xmm6, %xmm10
-	por 168(%rsp), %xmm1
-	psrlq $26, %xmm6
-	pand %xmm5, %xmm6
-.Lpoly1305_blocks_x86_6:
-	movdqa 80(%rdi), %xmm13
-	cmpq $63, %rdx
-	movl $(5), %r8d
-	movd %r8, %xmm14
-	pshufd $0x44, %xmm14, %xmm14
-	movdqa 96(%rdi), %xmm15
-	movdqa %xmm13, -8(%rsp)
-	movdqa 112(%rdi), %xmm0
-	movdqa %xmm14, 136(%rsp)
-	movdqa 128(%rdi), %xmm3
-	movdqa %xmm15, 312(%rsp)
-	pmuludq %xmm14, %xmm15
-	movdqa 144(%rdi), %xmm13
-	movdqa %xmm0, 232(%rsp)
-	pmuludq %xmm14, %xmm0
-	movdqa %xmm3, 152(%rsp)
-	pmuludq %xmm14, %xmm3
-	movdqa %xmm13, 56(%rsp)
-	pmuludq %xmm14, %xmm13
-	movdqa %xmm15, 40(%rsp)
-	movdqa %xmm0, -24(%rsp)
-	movdqa %xmm3, -40(%rsp)
-	movdqa %xmm13, -56(%rsp)
-	jbe .Lpoly1305_blocks_x86_7
-	movdqa 192(%rdi), %xmm15
-	leaq 32(%rsi), %rax
-	movq %rdx, %rcx
-	movdqa 176(%rdi), %xmm14
-	movdqa %xmm15, %xmm2
-	movdqa 208(%rdi), %xmm0
-	movdqa %xmm15, 216(%rsp)
-	movdqa %xmm14, 296(%rsp)
-	movdqa 224(%rdi), %xmm3
-	pmuludq 136(%rsp), %xmm14
-	movdqa -24(%rsp), %xmm13
-	movdqa %xmm14, 8(%rsp)
-	pmuludq 136(%rsp), %xmm2
-	movdqa -40(%rsp), %xmm14
-	movdqa %xmm0, 120(%rsp)
-	pmuludq 136(%rsp), %xmm0
-	movdqa %xmm3, 24(%rsp)
-	movdqa 160(%rdi), %xmm12
-	movdqa %xmm0, %xmm8
-	movdqa -56(%rsp), %xmm15
-	movdqa %xmm13, 88(%rsp)
-	pmuludq 136(%rsp), %xmm3
-	movdqa %xmm2, 104(%rsp)
-	movdqa %xmm0, %xmm13
-	movdqa -8(%rsp), %xmm11
-	movdqa %xmm3, 280(%rsp)
-	movdqa %xmm2, %xmm3
-	movdqa %xmm0, 200(%rsp)
-	movdqa %xmm14, 184(%rsp)
-	movdqa %xmm15, 264(%rsp)
-	jmp .Lpoly1305_blocks_x86_8
-.p2align 6,,63
-.Lpoly1305_blocks_x86_13:
-	movdqa 200(%rsp), %xmm13
-	movdqa %xmm3, %xmm6
-	movdqa 200(%rsp), %xmm8
-	movdqa 104(%rsp), %xmm3
-.Lpoly1305_blocks_x86_8:
-	movdqa 8(%rsp), %xmm4
-	pmuludq %xmm6, %xmm3
-	subq $64, %rcx
-	pmuludq %xmm10, %xmm8
-	movdqa 104(%rsp), %xmm2
-	movdqa 200(%rsp), %xmm0
-	pmuludq %xmm1, %xmm4
-	movdqa 280(%rsp), %xmm15
-	pmuludq %xmm6, %xmm13
-	movdqa 280(%rsp), %xmm14
-	pmuludq %xmm1, %xmm0
-	paddq %xmm3, %xmm4
-	pmuludq %xmm1, %xmm2
-	movdqa 280(%rsp), %xmm3
-	paddq %xmm8, %xmm4
-	pmuludq %xmm9, %xmm15
-	movdqa 280(%rsp), %xmm8
-	pmuludq %xmm10, %xmm14
-	pmuludq %xmm6, %xmm8
-	paddq %xmm13, %xmm2
-	movdqa %xmm6, %xmm13
-	pmuludq %xmm1, %xmm3
-	paddq %xmm15, %xmm4
-	movdqa 296(%rsp), %xmm15
-	pmuludq %xmm12, %xmm13
-	paddq %xmm14, %xmm2
-	movdqa %xmm7, %xmm14
-	paddq %xmm8, %xmm0
-	pmuludq %xmm12, %xmm14
-	movdqa %xmm9, %xmm8
-	pmuludq 296(%rsp), %xmm6
-	pmuludq %xmm12, %xmm8
-	movdqa %xmm6, 248(%rsp)
-	pmuludq %xmm10, %xmm15
-	movq -16(%rax), %xmm6
-	paddq %xmm13, %xmm3
-	movdqa %xmm10, %xmm13
-	paddq %xmm14, %xmm4
-	movq -8(%rax), %xmm14
-	paddq %xmm8, %xmm2
-	movq -32(%rax), %xmm8
-	pmuludq %xmm12, %xmm13
-	paddq %xmm15, %xmm3
-	pmuludq %xmm12, %xmm1
-	movdqa 216(%rsp), %xmm15
-	pmuludq 216(%rsp), %xmm10
-	punpcklqdq %xmm6, %xmm8
-	movq -24(%rax), %xmm6
-	pmuludq %xmm9, %xmm15
-	paddq %xmm13, %xmm0
-	movdqa 296(%rsp), %xmm13
-	paddq 248(%rsp), %xmm1
-	punpcklqdq %xmm14, %xmm6
-	movdqa 296(%rsp), %xmm14
-	pmuludq %xmm9, %xmm13
-	pmuludq 120(%rsp), %xmm9
-	movdqa %xmm15, 72(%rsp)
-	paddq %xmm10, %xmm1
-	movdqa 216(%rsp), %xmm15
-	pmuludq %xmm7, %xmm14
-	movdqa %xmm6, %xmm10
-	paddq %xmm9, %xmm1
-	pmuludq %xmm7, %xmm15
-	paddq %xmm13, %xmm0
-	paddq 72(%rsp), %xmm3
-	movdqa 120(%rsp), %xmm13
-	psllq $12, %xmm10
-	paddq %xmm14, %xmm2
-	movdqa %xmm5, %xmm14
-	pand %xmm8, %xmm14
-	pmuludq %xmm7, %xmm13
-	paddq %xmm15, %xmm0
-	movdqa %xmm14, 248(%rsp)
-	movdqa %xmm8, %xmm14
-	psrlq $52, %xmm8
-	movdqu (%rax), %xmm9
-	por %xmm10, %xmm8
-	pmuludq 24(%rsp), %xmm7
-	movdqu 16(%rax), %xmm10
-	paddq %xmm13, %xmm3
-	pxor %xmm13, %xmm13
-	movdqa %xmm9, %xmm15
-	paddq %xmm7, %xmm1
-	movdqa %xmm6, %xmm7
-	movdqa %xmm10, -72(%rsp)
-	punpckldq %xmm10, %xmm15
-	movdqa %xmm15, %xmm10
-	punpckldq %xmm13, %xmm10
-	punpckhdq -72(%rsp), %xmm9
-	psrlq $40, %xmm6
-	movdqa %xmm10, 72(%rsp)
-	movdqa %xmm9, %xmm10
-	punpckhdq %xmm13, %xmm9
-	psllq $18, %xmm9
-	paddq 72(%rsp), %xmm4
-	addq $64, %rax
-	paddq %xmm9, %xmm3
-	movdqa 40(%rsp), %xmm9
-	cmpq $63, %rcx
-	punpckhdq %xmm13, %xmm15
-	psllq $6, %xmm15
-	punpckldq %xmm13, %xmm10
-	paddq %xmm15, %xmm2
-	psllq $12, %xmm10
-	por 168(%rsp), %xmm6
-	pmuludq %xmm6, %xmm9
-	movdqa 88(%rsp), %xmm15
-	paddq %xmm10, %xmm0
-	movdqa 88(%rsp), %xmm13
-	psrlq $14, %xmm7
-	pand %xmm5, %xmm8
-	movdqa 184(%rsp), %xmm10
-	pand %xmm5, %xmm7
-	pmuludq %xmm7, %xmm15
-	paddq %xmm9, %xmm4
-	pmuludq %xmm6, %xmm13
-	movdqa 184(%rsp), %xmm9
-	paddq 168(%rsp), %xmm1
-	pmuludq %xmm7, %xmm10
-	pmuludq %xmm6, %xmm9
-	paddq %xmm15, %xmm4
-	movdqa 184(%rsp), %xmm15
-	paddq %xmm13, %xmm2
-	psrlq $26, %xmm14
-	movdqa 264(%rsp), %xmm13
-	paddq %xmm10, %xmm2
-	pmuludq %xmm8, %xmm15
-	pand %xmm5, %xmm14
-	paddq %xmm9, %xmm0
-	pmuludq %xmm6, %xmm13
-	movdqa 264(%rsp), %xmm9
-	movdqa 264(%rsp), %xmm10
-	pmuludq %xmm11, %xmm6
-	pmuludq %xmm8, %xmm9
-	paddq %xmm15, %xmm4
-	movdqa 264(%rsp), %xmm15
-	pmuludq %xmm14, %xmm10
-	paddq %xmm13, %xmm3
-	movdqa %xmm7, %xmm13
-	pmuludq %xmm7, %xmm15
-	paddq %xmm6, %xmm1
-	movdqa 312(%rsp), %xmm6
-	paddq %xmm9, %xmm2
-	pmuludq %xmm11, %xmm13
-	movdqa 248(%rsp), %xmm9
-	paddq %xmm10, %xmm4
-	pmuludq %xmm8, %xmm6
-	pmuludq 312(%rsp), %xmm7
-	paddq %xmm15, %xmm0
-	movdqa %xmm9, %xmm10
-	movdqa %xmm14, %xmm15
-	pmuludq %xmm11, %xmm10
-	paddq %xmm13, %xmm3
-	movdqa %xmm8, %xmm13
-	pmuludq %xmm11, %xmm13
-	paddq %xmm6, %xmm3
-	paddq %xmm7, %xmm1
-	movdqa 232(%rsp), %xmm6
-	pmuludq %xmm11, %xmm15
-	pmuludq 232(%rsp), %xmm8
-	paddq %xmm10, %xmm4
-	paddq %xmm8, %xmm1
-	movdqa 312(%rsp), %xmm10
-	paddq %xmm13, %xmm0
-	pmuludq %xmm14, %xmm6
-	movdqa 312(%rsp), %xmm13
-	pmuludq %xmm9, %xmm10
-	paddq %xmm15, %xmm2
-	movdqa 232(%rsp), %xmm7
-	pmuludq %xmm14, %xmm13
-	pmuludq 152(%rsp), %xmm14
-	paddq %xmm14, %xmm1
-	pmuludq %xmm9, %xmm7
-	paddq %xmm6, %xmm3
-	paddq %xmm10, %xmm2
-	movdqa 152(%rsp), %xmm10
-	paddq %xmm13, %xmm0
-	pmuludq %xmm9, %xmm10
-	paddq %xmm7, %xmm0
-	movdqa %xmm4, %xmm7
-	psrlq $26, %xmm7
-	pmuludq 56(%rsp), %xmm9
-	pand %xmm5, %xmm4
-	paddq %xmm7, %xmm2
-	paddq %xmm9, %xmm1
-	paddq %xmm10, %xmm3
-	movdqa %xmm2, %xmm7
-	movdqa %xmm2, %xmm9
-	movdqa %xmm3, %xmm6
-	psrlq $26, %xmm7
-	pand %xmm5, %xmm3
-	psrlq $26, %xmm6
-	paddq %xmm7, %xmm0
-	pand %xmm5, %xmm9
-	paddq %xmm6, %xmm1
-	movdqa %xmm0, %xmm10
-	movdqa %xmm1, %xmm6
-	pand %xmm5, %xmm10
-	pand %xmm5, %xmm1
-	psrlq $26, %xmm6
-	pmuludq 136(%rsp), %xmm6
-	paddq %xmm6, %xmm4
-	movdqa %xmm0, %xmm6
-	psrlq $26, %xmm6
-	movdqa %xmm4, %xmm2
-	movdqa %xmm4, %xmm7
-	paddq %xmm6, %xmm3
-	psrlq $26, %xmm2
-	pand %xmm5, %xmm7
-	movdqa %xmm3, %xmm0
-	paddq %xmm2, %xmm9
-	pand %xmm5, %xmm3
-	psrlq $26, %xmm0
-	paddq %xmm0, %xmm1
-	ja .Lpoly1305_blocks_x86_13
-	leaq -64(%rdx), %rax
-	movdqa %xmm3, %xmm6
-	andl $63, %edx
-	andq $-64, %rax
-	leaq 64(%rsi,%rax), %rsi
-.Lpoly1305_blocks_x86_7:
-	cmpq $31, %rdx
-	jbe .Lpoly1305_blocks_x86_9
-	movdqa -24(%rsp), %xmm13
-	movdqa %xmm6, %xmm0
-	movdqa %xmm6, %xmm3
-	movdqa 40(%rsp), %xmm11
-	movdqa %xmm1, %xmm12
-	testq %rsi, %rsi
-	movdqa -40(%rsp), %xmm2
-	pmuludq %xmm13, %xmm0
-	movdqa %xmm1, %xmm8
-	pmuludq %xmm1, %xmm11
-	movdqa %xmm10, %xmm4
-	movdqa %xmm1, %xmm14
-	pmuludq %xmm2, %xmm3
-	movdqa %xmm6, %xmm15
-	pmuludq %xmm1, %xmm13
-	movdqa %xmm7, %xmm1
-	pmuludq %xmm2, %xmm12
-	paddq %xmm0, %xmm11
-	movdqa -56(%rsp), %xmm0
-	pmuludq %xmm10, %xmm2
-	paddq %xmm3, %xmm13
-	pmuludq %xmm0, %xmm4
-	movdqa %xmm9, %xmm3
-	pmuludq %xmm0, %xmm3
-	paddq %xmm2, %xmm11
-	pmuludq %xmm0, %xmm8
-	movdqa %xmm6, %xmm2
-	pmuludq %xmm0, %xmm2
-	movdqa -8(%rsp), %xmm0
-	paddq %xmm4, %xmm13
-	movdqa 312(%rsp), %xmm4
-	paddq %xmm3, %xmm11
-	pmuludq 312(%rsp), %xmm6
-	movdqa 312(%rsp), %xmm3
-	pmuludq %xmm0, %xmm1
-	paddq %xmm2, %xmm12
-	pmuludq %xmm0, %xmm15
-	movdqa %xmm9, %xmm2
-	pmuludq %xmm0, %xmm2
-	pmuludq %xmm7, %xmm3
-	paddq %xmm1, %xmm11
-	movdqa 232(%rsp), %xmm1
-	pmuludq %xmm0, %xmm14
-	paddq %xmm15, %xmm8
-	pmuludq %xmm10, %xmm0
-	paddq %xmm2, %xmm13
-	movdqa 312(%rsp), %xmm2
-	pmuludq %xmm10, %xmm4
-	paddq %xmm3, %xmm13
-	movdqa 152(%rsp), %xmm3
-	pmuludq %xmm9, %xmm2
-	paddq %xmm6, %xmm14
-	pmuludq 232(%rsp), %xmm10
-	paddq %xmm0, %xmm12
-	pmuludq %xmm9, %xmm1
-	paddq %xmm10, %xmm14
-	movdqa 232(%rsp), %xmm0
-	pmuludq %xmm7, %xmm3
-	paddq %xmm4, %xmm8
-	pmuludq 152(%rsp), %xmm9
-	paddq %xmm2, %xmm12
-	paddq %xmm9, %xmm14
-	pmuludq %xmm7, %xmm0
-	paddq %xmm1, %xmm8
-	pmuludq 56(%rsp), %xmm7
-	paddq %xmm3, %xmm8
-	paddq %xmm7, %xmm14
-	paddq %xmm0, %xmm12
-	je .Lpoly1305_blocks_x86_10
-	movdqu (%rsi), %xmm1
-	pxor %xmm0, %xmm0
-	paddq 168(%rsp), %xmm14
-	movdqu 16(%rsi), %xmm2
-	movdqa %xmm1, %xmm3
-	punpckldq %xmm2, %xmm3
-	punpckhdq %xmm2, %xmm1
-	movdqa %xmm3, %xmm4
-	movdqa %xmm1, %xmm2
-	punpckldq %xmm0, %xmm4
-	punpckhdq %xmm0, %xmm3
-	punpckhdq %xmm0, %xmm1
-	punpckldq %xmm0, %xmm2
-	movdqa %xmm2, %xmm0
-	psllq $6, %xmm3
-	paddq %xmm4, %xmm11
-	psllq $12, %xmm0
-	paddq %xmm3, %xmm13
-	psllq $18, %xmm1
-	paddq %xmm0, %xmm12
-	paddq %xmm1, %xmm8
-.Lpoly1305_blocks_x86_10:
-	movdqa %xmm11, %xmm9
-	movdqa %xmm8, %xmm1
-	movdqa %xmm11, %xmm7
-	psrlq $26, %xmm9
-	movdqa %xmm8, %xmm6
-	pand %xmm5, %xmm7
-	paddq %xmm13, %xmm9
-	psrlq $26, %xmm1
-	pand %xmm5, %xmm6
-	movdqa %xmm9, %xmm10
-	paddq %xmm14, %xmm1
-	pand %xmm5, %xmm9
-	psrlq $26, %xmm10
-	movdqa %xmm1, %xmm0
-	pand %xmm5, %xmm1
-	paddq %xmm12, %xmm10
-	psrlq $26, %xmm0
-	pmuludq 136(%rsp), %xmm0
-	movdqa %xmm10, %xmm2
-	paddq %xmm0, %xmm7
-	psrlq $26, %xmm2
-	movdqa %xmm7, %xmm0
-	pand %xmm5, %xmm10
-	paddq %xmm2, %xmm6
-	psrlq $26, %xmm0
-	pand %xmm5, %xmm7
-	movdqa %xmm6, %xmm2
-	paddq %xmm0, %xmm9
-	pand %xmm5, %xmm6
-	psrlq $26, %xmm2
-	paddq %xmm2, %xmm1
-.Lpoly1305_blocks_x86_9:
-	testq %rsi, %rsi
-	je .Lpoly1305_blocks_x86_11
-	movdqa %xmm7, 0(%rdi)
-	movdqa %xmm9, 16(%rdi)
-	movdqa %xmm10, 32(%rdi)
-	movdqa %xmm6, 48(%rdi)
-	movdqa %xmm1, 64(%rdi)
-	movq -8(%rbp), %rbx
-	leave
-	ret
-.Lpoly1305_blocks_x86_5:
-	movdqa 0(%rdi), %xmm7
-	movdqa 16(%rdi), %xmm9
-	movdqa 32(%rdi), %xmm10
-	movdqa 48(%rdi), %xmm6
-	movdqa 64(%rdi), %xmm1
-	jmp .Lpoly1305_blocks_x86_6
-.Lpoly1305_blocks_x86_11:
-	movdqa %xmm7, %xmm0
-	movdqa %xmm9, %xmm2
-	movdqa %xmm6, %xmm3
-	psrldq $8, %xmm0
-	movabsq $4398046511103, %rbx
-	paddq %xmm0, %xmm7
-	psrldq $8, %xmm2
-	movdqa %xmm10, %xmm0
-	movd %xmm7, %edx
-	paddq %xmm2, %xmm9
-	psrldq $8, %xmm0
-	movl %edx, %ecx
-	movd %xmm9, %eax
-	paddq %xmm0, %xmm10
-	shrl $26, %ecx
-	psrldq $8, %xmm3
-	movdqa %xmm1, %xmm0
-	addl %ecx, %eax
-	movd %xmm10, %ecx
-	paddq %xmm3, %xmm6
-	movl %eax, %r9d
-	shrl $26, %eax
-	psrldq $8, %xmm0
-	addl %ecx, %eax
-	movd %xmm6, %ecx
-	paddq %xmm0, %xmm1
-	movl %eax, %esi
-	andl $67108863, %r9d
-	movd %xmm1, %r10d
-	shrl $26, %esi
-	andl $67108863, %eax
-	andl $67108863, %edx
-	addl %ecx, %esi
-	salq $8, %rax
-	movl %r9d, %ecx
-	shrl $18, %r9d
-	movl %esi, %r8d
-	shrl $26, %esi
-	andl $67108863, %r8d
-	addl %r10d, %esi
-	orq %r9, %rax
-	salq $16, %rsi
-	movq %r8, %r9
-	shrl $10, %r8d
-	salq $26, %rcx
-	orq %r8, %rsi
-	salq $34, %r9
-	orq %rdx, %rcx
-	movq %rsi, %r8
-	shrq $42, %rsi
-	movabsq $17592186044415, %rdx
-	orq %r9, %rax
-	andq %rbx, %r8
-	leaq (%rsi,%rsi,4), %rsi
-	andq %rdx, %rcx
-	andq %rdx, %rax
-	movabsq $-4398046511104, %r10
-	addq %rsi, %rcx
-	movq %rcx, %rsi
-	shrq $44, %rcx
-	addq %rcx, %rax
-	andq %rdx, %rsi
-	movq %rax, %rcx
-	shrq $44, %rax
-	addq %r8, %rax
-	andq %rdx, %rcx
-	andq %rax, %rbx
-	shrq $42, %rax
-	leaq (%rsi,%rax,4), %rsi
-	addq %rbx, %r10
-	addq %rax, %rsi
-	movq %rsi, %r8
-	shrq $44, %rsi
-	andq %rdx, %r8
-	addq %rcx, %rsi
-	leaq 5(%r8), %r9
-	movq %r9, %r11
-	andq %rdx, %r9
-	shrq $44, %r11
-	addq %rsi, %r11
-	movq %r11, %rax
-	andq %r11, %rdx
-	shrq $44, %rax
-	addq %rax, %r10
-	movq %r10, %rax
-	shrq $63, %rax
-	subq $1, %rax
-	movq %rax, %rcx
-	andq %rax, %r9
-	andq %rax, %rdx
-	notq %rcx
-	andq %r10, %rax
-	andq %rcx, %r8
-	andq %rcx, %rsi
-	andq %rbx, %rcx
-	orq %r9, %r8
-	orq %rdx, %rsi
-	orq %rax, %rcx
-	movq %r8, 0(%rdi)
-	movq %rsi, 8(%rdi)
-	movq %rcx, 16(%rdi)
-	movq -8(%rbp), %rbx
-	movq %rbp, %rax
-	subq %rsp, %rax
-	pxor %xmm15, %xmm15
-	pxor %xmm7, %xmm7
-	pxor %xmm14, %xmm14
-	pxor %xmm6, %xmm6
-	pxor %xmm13, %xmm13
-	pxor %xmm5, %xmm5
-	pxor %xmm12, %xmm12
-	pxor %xmm4, %xmm4
-	leave
-	addq $8, %rax
-	pxor %xmm11, %xmm11
-	pxor %xmm3, %xmm3
-	pxor %xmm10, %xmm10
-	pxor %xmm2, %xmm2
-	pxor %xmm9, %xmm9
-	pxor %xmm1, %xmm1
-	pxor %xmm8, %xmm8
-	pxor %xmm0, %xmm0
-	ret
-ELF(.size _gcry_poly1305_amd64_sse2_blocks,.-_gcry_poly1305_amd64_sse2_blocks;)
-
-#endif
diff --git a/cipher/poly1305.c b/cipher/poly1305.c
index 22255fb15..68d9b9015 100644
--- a/cipher/poly1305.c
+++ b/cipher/poly1305.c
@@ -1,5 +1,5 @@
 /* poly1305.c  -  Poly1305 internals and generic implementation
- * Copyright (C) 2014 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ * Copyright (C) 2014,2017,2018 Jussi Kivilinna <jussi.kivilinna at iki.fi>
  *
  * This file is part of Libgcrypt.
  *
@@ -17,11 +17,6 @@
  * License along with this program; if not, see <http://www.gnu.org/licenses/>.
  */
 
-/* The code is based on public-domain Poly1305 implementation by
- * Andrew Moon at
- *  https://github.com/floodyberry/poly1305-opt
- */
-
 #include <config.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -33,157 +28,325 @@
 #include "bufhelp.h"
 #include "poly1305-internal.h"
 
+#include "mpi-internal.h"
+#include "longlong.h"
+
 
 static const char *selftest (void);
-

-
-
-#ifdef POLY1305_USE_SSE2
-
-void _gcry_poly1305_amd64_sse2_init_ext(void *state, const poly1305_key_t *key)
-                                       OPS_FUNC_ABI;
-unsigned int _gcry_poly1305_amd64_sse2_finish_ext(void *state, const byte *m,
-						  size_t remaining,
-						  byte mac[16]) OPS_FUNC_ABI;
-unsigned int _gcry_poly1305_amd64_sse2_blocks(void *ctx, const byte *m,
-					      size_t bytes) OPS_FUNC_ABI;
-
-static const poly1305_ops_t poly1305_amd64_sse2_ops = {
-  POLY1305_SSE2_BLOCKSIZE,
-  _gcry_poly1305_amd64_sse2_init_ext,
-  _gcry_poly1305_amd64_sse2_blocks,
-  _gcry_poly1305_amd64_sse2_finish_ext
-};
-
-#else  /* !POLY1305_USE_SSE2 */
-
-static OPS_FUNC_ABI void poly1305_init_ext_ref32
-/**/                (void *state, const poly1305_key_t *key);
-static OPS_FUNC_ABI unsigned int poly1305_blocks_ref32
-/**/                (void *state, const byte *m, size_t bytes);
-static OPS_FUNC_ABI unsigned int poly1305_finish_ext_ref32
-/**/                (void *state, const byte * m,
-                     size_t remaining, byte mac[POLY1305_TAGLEN]);
-
-static const poly1305_ops_t poly1305_default_ops = {
-  POLY1305_REF_BLOCKSIZE,
-  poly1305_init_ext_ref32,
-  poly1305_blocks_ref32,
-  poly1305_finish_ext_ref32
-};
-
-#endif /* !POLY1305_USE_SSE2 */
-
-
-#ifdef POLY1305_USE_AVX2
-
-void _gcry_poly1305_amd64_avx2_init_ext(void *state, const poly1305_key_t *key)
-                                       OPS_FUNC_ABI;
-unsigned int _gcry_poly1305_amd64_avx2_finish_ext(void *state, const byte *m,
-						  size_t remaining,
-						  byte mac[16]) OPS_FUNC_ABI;
-unsigned int _gcry_poly1305_amd64_avx2_blocks(void *ctx, const byte *m,
-					      size_t bytes) OPS_FUNC_ABI;
-
-static const poly1305_ops_t poly1305_amd64_avx2_ops = {
-  POLY1305_AVX2_BLOCKSIZE,
-  _gcry_poly1305_amd64_avx2_init_ext,
-  _gcry_poly1305_amd64_avx2_blocks,
-  _gcry_poly1305_amd64_avx2_finish_ext
-};
 
+
+#undef USE_MPI_64BIT
+#undef USE_MPI_32BIT
+#if BYTES_PER_MPI_LIMB == 8 && defined(HAVE_U64_TYPEDEF)
+# define USE_MPI_64BIT 1
+#elif BYTES_PER_MPI_LIMB == 4
+# define USE_MPI_32BIT 1
+#else
+# error please implement for this limb size.
 #endif
 
 
-#ifdef POLY1305_USE_NEON
+static void poly1305_init (poly1305_context_t *ctx,
+			   const byte key[POLY1305_KEYLEN])
+{
+  POLY1305_STATE *st = &ctx->state;
 
-void _gcry_poly1305_armv7_neon_init_ext(void *state, const poly1305_key_t *key)
-                                       OPS_FUNC_ABI;
-unsigned int _gcry_poly1305_armv7_neon_finish_ext(void *state, const byte *m,
-						  size_t remaining,
-						  byte mac[16]) OPS_FUNC_ABI;
-unsigned int _gcry_poly1305_armv7_neon_blocks(void *ctx, const byte *m,
-					      size_t bytes) OPS_FUNC_ABI;
+  ctx->leftover = 0;
 
-static const poly1305_ops_t poly1305_armv7_neon_ops = {
-  POLY1305_NEON_BLOCKSIZE,
-  _gcry_poly1305_armv7_neon_init_ext,
-  _gcry_poly1305_armv7_neon_blocks,
-  _gcry_poly1305_armv7_neon_finish_ext
-};
+  st->h[0] = 0;
+  st->h[1] = 0;
+  st->h[2] = 0;
+  st->h[3] = 0;
+  st->h[4] = 0;
 
-#endif
+  st->r[0] = buf_get_le32(key + 0)  & 0x0fffffff;
+  st->r[1] = buf_get_le32(key + 4)  & 0x0ffffffc;
+  st->r[2] = buf_get_le32(key + 8)  & 0x0ffffffc;
+  st->r[3] = buf_get_le32(key + 12) & 0x0ffffffc;
 
+  st->k[0] = buf_get_le32(key + 16);
+  st->k[1] = buf_get_le32(key + 20);
+  st->k[2] = buf_get_le32(key + 24);
+  st->k[3] = buf_get_le32(key + 28);
+}
 
-/* Reference unoptimized poly1305 implementation using 32 bit * 32 bit = 64 bit
- * multiplication and 64 bit addition.
- */
 
-typedef struct poly1305_state_ref32_s
+#ifdef USE_MPI_64BIT
+
+#if defined (__aarch64__) && __GNUC__ >= 4
+
+/* A += B (armv8/aarch64) */
+#define ADD_1305_64(A2, A1, A0, B2, B1, B0) \
+      __asm__ ("adds %0, %3, %0\n" \
+	       "adcs %1, %4, %1\n" \
+	       "adc  %2, %5, %2\n" \
+	       : "+r" (A0), "+r" (A1), "+r" (A2) \
+	       : "r" (B0), "r" (B1), "r" (B2) \
+	       : "cc" )
+
+#endif /* __aarch64__ */
+
+#if defined (__x86_64__) && __GNUC__ >= 4
+
+/* A += B (x86-64) */
+#define ADD_1305_64(A2, A1, A0, B2, B1, B0) \
+      __asm__ ("addq %3, %0\n" \
+	       "adcq %4, %1\n" \
+	       "adcq %5, %2\n" \
+	       : "+r" (A0), "+r" (A1), "+r" (A2) \
+	       : "g" (B0), "g" (B1), "g" (B2) \
+	       : "cc" )
+
+#endif /* __x86_64__ */
+
+#ifndef ADD_1305_64
+/* A += B (generic, mpi) */
+#  define ADD_1305_64(A2, A1, A0, B2, B1, B0) do { \
+    u64 carry; \
+    add_ssaaaa(carry, A0, 0, A0, 0, B0); \
+    add_ssaaaa(A2, A1, A2, A1, B2, B1); \
+    add_ssaaaa(A2, A1, A2, A1, 0, carry); \
+  } while (0)
+#endif
+
+/* H = H * R mod 2¹³⁰-5 */
+#define MUL_MOD_1305_64(H2, H1, H0, R1, R0, R1_MULT5) do { \
+    u64 x0_lo, x0_hi, x1_lo, x1_hi; \
+    u64 t0_lo, t0_hi, t1_lo, t1_hi; \
+    \
+    /* x = a * r (partial mod 2^130-5) */ \
+    umul_ppmm(x0_hi, x0_lo, H0, R0);  /* h0 * r0 */ \
+    umul_ppmm(x1_hi, x1_lo, H0, R1);  /* h0 * r1 */ \
+    \
+    umul_ppmm(t0_hi, t0_lo, H1, R1_MULT5); /* h1 * r1 mod 2^130-5 */ \
+    add_ssaaaa(x0_hi, x0_lo, x0_hi, x0_lo, t0_hi, t0_lo); \
+    umul_ppmm(t1_hi, t1_lo, H1, R0);       /* h1 * r0 */ \
+    add_ssaaaa(x1_hi, x1_lo, x1_hi, x1_lo, t1_hi, t1_lo); \
+    \
+    t1_lo = H2 * R1_MULT5; /* h2 * r1 mod 2^130-5 */ \
+    t1_hi = H2 * R0;       /* h2 * r0 */ \
+    add_ssaaaa(H0, H1, x1_hi, x1_lo, t1_hi, t1_lo); \
+    \
+    /* carry propagation */ \
+    H2 = H0 & 3; \
+    H0 = (H0 >> 2) * 5; /* msb mod 2^130-5 */ \
+    ADD_1305_64(H2, H1, H0, 0, x0_hi, x0_lo); \
+  } while (0)
+
+unsigned int
+poly1305_blocks (poly1305_context_t *ctx, const byte *buf, size_t len,
+		 byte high_pad)
 {
-  u32 r[5];
-  u32 h[5];
-  u32 pad[4];
-  byte final;
-} poly1305_state_ref32_t;
+  POLY1305_STATE *st = &ctx->state;
+  u64 r0, r1, r1_mult5;
+  u64 h0, h1, h2;
+  u64 m0, m1, m2;
+
+  m2 = high_pad;
+
+  h0 = st->h[0] + ((u64)st->h[1] << 32);
+  h1 = st->h[2] + ((u64)st->h[3] << 32);
+  h2 = st->h[4];
+
+  r0 = st->r[0] + ((u64)st->r[1] << 32);
+  r1 = st->r[2] + ((u64)st->r[3] << 32);
+
+  r1_mult5 = (r1 >> 2) + r1;
+
+  m0 = buf_get_le64(buf + 0);
+  m1 = buf_get_le64(buf + 8);
+  buf += POLY1305_BLOCKSIZE;
+  len -= POLY1305_BLOCKSIZE;
+
+  while (len >= POLY1305_BLOCKSIZE)
+    {
+      /* a = h + m */
+      ADD_1305_64(h2, h1, h0, m2, m1, m0);
+
+      m0 = buf_get_le64(buf + 0);
+      m1 = buf_get_le64(buf + 8);
+
+      /* h = a * r (partial mod 2^130-5) */
+      MUL_MOD_1305_64(h2, h1, h0, r1, r0, r1_mult5);
+
+      buf += POLY1305_BLOCKSIZE;
+      len -= POLY1305_BLOCKSIZE;
+    }
+
+  /* a = h + m */
+  ADD_1305_64(h2, h1, h0, m2, m1, m0);
+
+  /* h = a * r (partial mod 2^130-5) */
+  MUL_MOD_1305_64(h2, h1, h0, r1, r0, r1_mult5);
 
+  st->h[0] = h0;
+  st->h[1] = h0 >> 32;
+  st->h[2] = h1;
+  st->h[3] = h1 >> 32;
+  st->h[4] = h2;
+
+  return 6 * sizeof (void *) + 18 * sizeof (u64);
+}
 
-#ifndef POLY1305_USE_SSE2
-static OPS_FUNC_ABI void
-poly1305_init_ext_ref32 (void *state, const poly1305_key_t * key)
+static unsigned int poly1305_final (poly1305_context_t *ctx,
+				    byte mac[POLY1305_TAGLEN])
 {
-  poly1305_state_ref32_t *st = (poly1305_state_ref32_t *) state;
+  POLY1305_STATE *st = &ctx->state;
+  unsigned int burn = 0;
+  u64 u, carry;
+  u64 k0, k1;
+  u64 h0, h1;
+  u64 h2;
+
+  /* process the remaining block */
+  if (ctx->leftover)
+    {
+      ctx->buffer[ctx->leftover++] = 1;
+      for (; ctx->leftover < POLY1305_BLOCKSIZE; ctx->leftover++)
+	ctx->buffer[ctx->leftover] = 0;
+      burn = poly1305_blocks (ctx, ctx->buffer, POLY1305_BLOCKSIZE, 0);
+    }
 
-  gcry_assert (sizeof (*st) + POLY1305_STATE_ALIGNMENT <=
-	       sizeof (((poly1305_context_t *) 0)->state));
+  h0 = st->h[0] + ((u64)st->h[1] << 32);
+  h1 = st->h[2] + ((u64)st->h[3] << 32);
+  h2 = st->h[4];
 
-  /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
-  st->r[0] = (buf_get_le32 (&key->b[0])) & 0x3ffffff;
-  st->r[1] = (buf_get_le32 (&key->b[3]) >> 2) & 0x3ffff03;
-  st->r[2] = (buf_get_le32 (&key->b[6]) >> 4) & 0x3ffc0ff;
-  st->r[3] = (buf_get_le32 (&key->b[9]) >> 6) & 0x3f03fff;
-  st->r[4] = (buf_get_le32 (&key->b[12]) >> 8) & 0x00fffff;
+  k0 = st->k[0] + ((u64)st->k[1] << 32);
+  k1 = st->k[2] + ((u64)st->k[3] << 32);
 
-  /* h = 0 */
-  st->h[0] = 0;
-  st->h[1] = 0;
-  st->h[2] = 0;
-  st->h[3] = 0;
-  st->h[4] = 0;
+  /* check if h is more than 2^130-5, by adding 5. */
+  add_ssaaaa(carry, u, 0, h0, 0, 5);
+  add_ssaaaa(carry, u, 0, carry, 0, h1);
+  u = (carry + h2) >> 2; /* u == 0 or 1 */
 
-  /* save pad for later */
-  st->pad[0] = buf_get_le32 (&key->b[16]);
-  st->pad[1] = buf_get_le32 (&key->b[20]);
-  st->pad[2] = buf_get_le32 (&key->b[24]);
-  st->pad[3] = buf_get_le32 (&key->b[28]);
+  /* minus 2^130-5 ... (+5) */
+  u = (-u) & 5;
+  add_ssaaaa(h1, h0, h1, h0, 0, u);
 
-  st->final = 0;
+  /* add high part of key + h */
+  add_ssaaaa(h1, h0, h1, h0, k1, k0);
+  buf_put_le64(mac + 0, h0);
+  buf_put_le64(mac + 8, h1);
+
+  /* burn_stack */
+  return 4 * sizeof (void *) + 7 * sizeof (u64) + burn;
 }
-#endif /* !POLY1305_USE_SSE2 */
 
+#endif /* USE_MPI_64BIT */
+
+#ifdef USE_MPI_32BIT
+
+#ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS
+
+/* HI:LO += A * B (arm) */
+#define UMUL_ADD_32(HI, LO, A, B) \
+      __asm__ ("umlal %1, %0, %4, %5" \
+	       : "=r" (HI), "=r" (LO) \
+	       : "0" (HI), "1" (LO), "r" (A), "r" (B) )
+
+/* A += B (arm) */
+#define ADD_1305_32(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0) \
+      __asm__ ("adds %0, %0, %5\n" \
+	       "adcs %1, %1, %6\n" \
+	       "adcs %2, %2, %7\n" \
+	       "adcs %3, %3, %8\n" \
+	       "adc %4, %4, %9\n" \
+	       : "+r" (A0), "+r" (A1), "+r" (A2), "+r" (A3), "+r" (A4) \
+	       : "r" (B0), "r" (B1), "r" (B2), "r" (B3), "r" (B4) \
+	       : "cc" )
+
+#endif /* HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS */
+
+#if defined (__i386__) && __GNUC__ >= 4
+
+/* A += B (i386) */
+#define ADD_1305_32(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0) \
+      __asm__ ("addl %5, %0\n" \
+	       "adcl %6, %1\n" \
+	       "adcl %7, %2\n" \
+	       "adcl %8, %3\n" \
+	       "adcl %9, %4\n" \
+	       : "+r" (A0), "+r" (A1), "+r" (A2), "+r" (A3), "+r" (A4) \
+	       : "g" (B0), "g" (B1), "g" (B2), "g" (B3), "g" (B4) \
+	       : "cc" )
+
+#endif /* __i386__ */
+
+#ifndef UMUL_ADD_32
+/* HI:LO += A * B (generic, mpi) */
+#  define UMUL_ADD_32(HI, LO, A, B) do { \
+    u32 t_lo, t_hi; \
+    umul_ppmm(t_hi, t_lo, A, B); \
+    add_ssaaaa(HI, LO, HI, LO, t_hi, t_lo); \
+  } while (0)
+#endif
+
+#ifndef ADD_1305_32
+/* A += B (generic, mpi) */
+#  define ADD_1305_32(A4, A3, A2, A1, A0, B4, B3, B2, B1, B0) do { \
+    u32 carry0, carry1, carry2; \
+    add_ssaaaa(carry0, A0, 0, A0, 0, B0); \
+    add_ssaaaa(carry1, A1, 0, A1, 0, B1); \
+    add_ssaaaa(carry1, A1, carry1, A1, 0, carry0); \
+    add_ssaaaa(carry2, A2, 0, A2, 0, B2); \
+    add_ssaaaa(carry2, A2, carry2, A2, 0, carry1); \
+    add_ssaaaa(A4, A3, A4, A3, B4, B3); \
+    add_ssaaaa(A4, A3, A4, A3, 0, carry2); \
+  } while (0)
+#endif
 
-#ifndef POLY1305_USE_SSE2
-static OPS_FUNC_ABI unsigned int
-poly1305_blocks_ref32 (void *state, const byte * m, size_t bytes)
+/* H = H * R mod 2¹³⁰-5 */
+#define MUL_MOD_1305_32(H4, H3, H2, H1, H0, R3, R2, R1, R0, \
+                        R3_MULT5, R2_MULT5, R1_MULT5) do { \
+    u32 x0_lo, x0_hi, x1_lo, x1_hi, x2_lo, x2_hi, x3_lo, x3_hi; \
+    u32 t0_lo, t0_hi; \
+    \
+    /* x = a * r (partial mod 2^130-5) */ \
+    umul_ppmm(x0_hi, x0_lo, H0, R0);  /* h0 * r0 */ \
+    umul_ppmm(x1_hi, x1_lo, H0, R1);  /* h0 * r1 */ \
+    umul_ppmm(x2_hi, x2_lo, H0, R2);  /* h0 * r2 */ \
+    umul_ppmm(x3_hi, x3_lo, H0, R3);  /* h0 * r3 */ \
+    \
+    UMUL_ADD_32(x0_hi, x0_lo, H1, R3_MULT5); /* h1 * r3 mod 2^130-5 */ \
+    UMUL_ADD_32(x1_hi, x1_lo, H1, R0);       /* h1 * r0 */ \
+    UMUL_ADD_32(x2_hi, x2_lo, H1, R1);       /* h1 * r1 */ \
+    UMUL_ADD_32(x3_hi, x3_lo, H1, R2);       /* h1 * r2 */ \
+    \
+    UMUL_ADD_32(x0_hi, x0_lo, H2, R2_MULT5); /* h2 * r2 mod 2^130-5 */ \
+    UMUL_ADD_32(x1_hi, x1_lo, H2, R3_MULT5); /* h2 * r3 mod 2^130-5 */ \
+    UMUL_ADD_32(x2_hi, x2_lo, H2, R0);       /* h2 * r0 */ \
+    UMUL_ADD_32(x3_hi, x3_lo, H2, R1);       /* h2 * r1 */ \
+    \
+    UMUL_ADD_32(x0_hi, x0_lo, H3, R1_MULT5); /* h3 * r1 mod 2^130-5 */ \
+    H1 = x0_hi; \
+    UMUL_ADD_32(x1_hi, x1_lo, H3, R2_MULT5); /* h3 * r2 mod 2^130-5 */ \
+    UMUL_ADD_32(x2_hi, x2_lo, H3, R3_MULT5); /* h3 * r3 mod 2^130-5 */ \
+    UMUL_ADD_32(x3_hi, x3_lo, H3, R0);       /* h3 * r0 */ \
+    \
+    t0_lo = H4 * R1_MULT5; /* h4 * r1 mod 2^130-5 */ \
+    t0_hi = H4 * R2_MULT5; /* h4 * r2 mod 2^130-5 */ \
+    add_ssaaaa(H2, x1_lo, x1_hi, x1_lo, 0, t0_lo); \
+    add_ssaaaa(H3, x2_lo, x2_hi, x2_lo, 0, t0_hi); \
+    t0_lo = H4 * R3_MULT5; /* h4 * r3 mod 2^130-5 */ \
+    t0_hi = H4 * R0;       /* h4 * r0 */ \
+    add_ssaaaa(H4, x3_lo, x3_hi, x3_lo, t0_hi, t0_lo); \
+    \
+    /* carry propagation */ \
+    H0 = (H4 >> 2) * 5; /* msb mod 2^130-5 */ \
+    H4 = H4 & 3; \
+    ADD_1305_32(H4, H3, H2, H1, H0, 0, x3_lo, x2_lo, x1_lo, x0_lo); \
+  } while (0)
+
+unsigned int
+poly1305_blocks (poly1305_context_t *ctx, const byte *buf, size_t len,
+		 byte high_pad)
 {
-  poly1305_state_ref32_t *st = (poly1305_state_ref32_t *) state;
-  const u32 hibit = (st->final) ? 0 : (1 << 24);	/* 1 << 128 */
-  u32 r0, r1, r2, r3, r4;
-  u32 s1, s2, s3, s4;
+  POLY1305_STATE *st = &ctx->state;
+  u32 r1_mult5, r2_mult5, r3_mult5;
   u32 h0, h1, h2, h3, h4;
-  u64 d0, d1, d2, d3, d4;
-  u32 c;
-
-  r0 = st->r[0];
-  r1 = st->r[1];
-  r2 = st->r[2];
-  r3 = st->r[3];
-  r4 = st->r[4];
+  u32 m0, m1, m2, m3, m4;
 
-  s1 = r1 * 5;
-  s2 = r2 * 5;
-  s3 = r3 * 5;
-  s4 = r4 * 5;
+  m4 = high_pad;
 
   h0 = st->h[0];
   h1 = st->h[1];
@@ -191,54 +354,27 @@ poly1305_blocks_ref32 (void *state, const byte * m, size_t bytes)
   h3 = st->h[3];
   h4 = st->h[4];
 
-  while (bytes >= POLY1305_REF_BLOCKSIZE)
+  r1_mult5 = (st->r[1] >> 2) + st->r[1];
+  r2_mult5 = (st->r[2] >> 2) + st->r[2];
+  r3_mult5 = (st->r[3] >> 2) + st->r[3];
+
+  while (len >= POLY1305_BLOCKSIZE)
     {
-      /* h += m[i] */
-      h0 += (buf_get_le32 (m + 0)) & 0x3ffffff;
-      h1 += (buf_get_le32 (m + 3) >> 2) & 0x3ffffff;
-      h2 += (buf_get_le32 (m + 6) >> 4) & 0x3ffffff;
-      h3 += (buf_get_le32 (m + 9) >> 6) & 0x3ffffff;
-      h4 += (buf_get_le32 (m + 12) >> 8) | hibit;
-
-      /* h *= r */
-      d0 =
-	((u64) h0 * r0) + ((u64) h1 * s4) +
-	((u64) h2 * s3) + ((u64) h3 * s2) + ((u64) h4 * s1);
-      d1 =
-	((u64) h0 * r1) + ((u64) h1 * r0) +
-	((u64) h2 * s4) + ((u64) h3 * s3) + ((u64) h4 * s2);
-      d2 =
-	((u64) h0 * r2) + ((u64) h1 * r1) +
-	((u64) h2 * r0) + ((u64) h3 * s4) + ((u64) h4 * s3);
-      d3 =
-	((u64) h0 * r3) + ((u64) h1 * r2) +
-	((u64) h2 * r1) + ((u64) h3 * r0) + ((u64) h4 * s4);
-      d4 =
-	((u64) h0 * r4) + ((u64) h1 * r3) +
-	((u64) h2 * r2) + ((u64) h3 * r1) + ((u64) h4 * r0);
-
-      /* (partial) h %= p */
-      c = (u32) (d0 >> 26);
-      h0 = (u32) d0 & 0x3ffffff;
-      d1 += c;
-      c = (u32) (d1 >> 26);
-      h1 = (u32) d1 & 0x3ffffff;
-      d2 += c;
-      c = (u32) (d2 >> 26);
-      h2 = (u32) d2 & 0x3ffffff;
-      d3 += c;
-      c = (u32) (d3 >> 26);
-      h3 = (u32) d3 & 0x3ffffff;
-      d4 += c;
-      c = (u32) (d4 >> 26);
-      h4 = (u32) d4 & 0x3ffffff;
-      h0 += c * 5;
-      c = (h0 >> 26);
-      h0 = h0 & 0x3ffffff;
-      h1 += c;
-
-      m += POLY1305_REF_BLOCKSIZE;
-      bytes -= POLY1305_REF_BLOCKSIZE;
+      m0 = buf_get_le32(buf + 0);
+      m1 = buf_get_le32(buf + 4);
+      m2 = buf_get_le32(buf + 8);
+      m3 = buf_get_le32(buf + 12);
+
+      /* a = h + m */
+      ADD_1305_32(h4, h3, h2, h1, h0, m4, m3, m2, m1, m0);
+
+      /* h = a * r (partial mod 2^130-5) */
+      MUL_MOD_1305_32(h4, h3, h2, h1, h0,
+		      st->r[3], st->r[2], st->r[1], st->r[0],
+		      r3_mult5, r2_mult5, r1_mult5);
+
+      buf += POLY1305_BLOCKSIZE;
+      len -= POLY1305_BLOCKSIZE;
     }
 
   st->h[0] = h0;
@@ -247,185 +383,95 @@ poly1305_blocks_ref32 (void *state, const byte * m, size_t bytes)
   st->h[3] = h3;
   st->h[4] = h4;
 
-  return (16 * sizeof (u32) + 5 * sizeof (u64) + 5 * sizeof (void *));
+  return 6 * sizeof (void *) + 28 * sizeof (u32);
 }
-#endif /* !POLY1305_USE_SSE2 */
-
 
-#ifndef POLY1305_USE_SSE2
-static OPS_FUNC_ABI unsigned int
-poly1305_finish_ext_ref32 (void *state, const byte * m,
-			   size_t remaining, byte mac[POLY1305_TAGLEN])
+static unsigned int poly1305_final (poly1305_context_t *ctx,
+				    byte mac[POLY1305_TAGLEN])
 {
-  poly1305_state_ref32_t *st = (poly1305_state_ref32_t *) state;
-  u32 h0, h1, h2, h3, h4, c;
-  u32 g0, g1, g2, g3, g4;
-  u64 f;
-  u32 mask;
+  POLY1305_STATE *st = &ctx->state;
   unsigned int burn = 0;
+  u32 carry, tmp0, tmp1, tmp2, u;
+  u32 h4, h3, h2, h1, h0;
 
   /* process the remaining block */
-  if (remaining)
+  if (ctx->leftover)
     {
-      byte final[POLY1305_REF_BLOCKSIZE] = { 0 };
-      size_t i;
-      for (i = 0; i < remaining; i++)
-	final[i] = m[i];
-      final[remaining] = 1;
-      st->final = 1;
-      burn = poly1305_blocks_ref32 (st, final, POLY1305_REF_BLOCKSIZE);
+      ctx->buffer[ctx->leftover++] = 1;
+      for (; ctx->leftover < POLY1305_BLOCKSIZE; ctx->leftover++)
+	ctx->buffer[ctx->leftover] = 0;
+      burn = poly1305_blocks (ctx, ctx->buffer, POLY1305_BLOCKSIZE, 0);
     }
 
-  /* fully carry h */
   h0 = st->h[0];
   h1 = st->h[1];
   h2 = st->h[2];
   h3 = st->h[3];
   h4 = st->h[4];
 
-  c = h1 >> 26;
-  h1 = h1 & 0x3ffffff;
-  h2 += c;
-  c = h2 >> 26;
-  h2 = h2 & 0x3ffffff;
-  h3 += c;
-  c = h3 >> 26;
-  h3 = h3 & 0x3ffffff;
-  h4 += c;
-  c = h4 >> 26;
-  h4 = h4 & 0x3ffffff;
-  h0 += c * 5;
-  c = h0 >> 26;
-  h0 = h0 & 0x3ffffff;
-  h1 += c;
-
-  /* compute h + -p */
-  g0 = h0 + 5;
-  c = g0 >> 26;
-  g0 &= 0x3ffffff;
-  g1 = h1 + c;
-  c = g1 >> 26;
-  g1 &= 0x3ffffff;
-  g2 = h2 + c;
-  c = g2 >> 26;
-  g2 &= 0x3ffffff;
-  g3 = h3 + c;
-  c = g3 >> 26;
-  g3 &= 0x3ffffff;
-  g4 = h4 + c - (1 << 26);
-
-  /* select h if h < p, or h + -p if h >= p */
-  mask = (g4 >> ((sizeof (u32) * 8) - 1)) - 1;
-  g0 &= mask;
-  g1 &= mask;
-  g2 &= mask;
-  g3 &= mask;
-  g4 &= mask;
-  mask = ~mask;
-  h0 = (h0 & mask) | g0;
-  h1 = (h1 & mask) | g1;
-  h2 = (h2 & mask) | g2;
-  h3 = (h3 & mask) | g3;
-  h4 = (h4 & mask) | g4;
-
-  /* h = h % (2^128) */
-  h0 = ((h0) | (h1 << 26)) & 0xffffffff;
-  h1 = ((h1 >> 6) | (h2 << 20)) & 0xffffffff;
-  h2 = ((h2 >> 12) | (h3 << 14)) & 0xffffffff;
-  h3 = ((h3 >> 18) | (h4 << 8)) & 0xffffffff;
-
-  /* mac = (h + pad) % (2^128) */
-  f = (u64) h0 + st->pad[0];
-  h0 = (u32) f;
-  f = (u64) h1 + st->pad[1] + (f >> 32);
-  h1 = (u32) f;
-  f = (u64) h2 + st->pad[2] + (f >> 32);
-  h2 = (u32) f;
-  f = (u64) h3 + st->pad[3] + (f >> 32);
-  h3 = (u32) f;
-
-  buf_put_le32 (mac + 0, h0);
-  buf_put_le32 (mac + 4, h1);
-  buf_put_le32 (mac + 8, h2);
-  buf_put_le32 (mac + 12, h3);
-
-  /* zero out the state */
-  st->h[0] = 0;
-  st->h[1] = 0;
-  st->h[2] = 0;
-  st->h[3] = 0;
-  st->h[4] = 0;
-  st->r[0] = 0;
-  st->r[1] = 0;
-  st->r[2] = 0;
-  st->r[3] = 0;
-  st->r[4] = 0;
-  st->pad[0] = 0;
-  st->pad[1] = 0;
-  st->pad[2] = 0;
-  st->pad[3] = 0;
+  /* check if h is more than 2^130-5, by adding 5. */
+  add_ssaaaa(carry, tmp0, 0, h0, 0, 5);
+  add_ssaaaa(carry, tmp0, 0, carry, 0, h1);
+  add_ssaaaa(carry, tmp0, 0, carry, 0, h2);
+  add_ssaaaa(carry, tmp0, 0, carry, 0, h3);
+  u = (carry + h4) >> 2; /* u == 0 or 1 */
+
+  /* minus 2^130-5 ... (+5) */
+  u = (-u) & 5;
+  add_ssaaaa(carry, h0, 0, h0, 0, u);
+  add_ssaaaa(carry, h1, 0, h1, 0, carry);
+  add_ssaaaa(carry, h2, 0, h2, 0, carry);
+  add_ssaaaa(carry, h3, 0, h3, 0, carry);
+
+  /* add high part of key + h */
+  add_ssaaaa(tmp0, h0, 0, h0, 0, st->k[0]);
+  add_ssaaaa(tmp1, h1, 0, h1, 0, st->k[1]);
+  add_ssaaaa(tmp1, h1, tmp1, h1, 0, tmp0);
+  add_ssaaaa(tmp2, h2, 0, h2, 0, st->k[2]);
+  add_ssaaaa(tmp2, h2, tmp2, h2, 0, tmp1);
+  add_ssaaaa(carry, h3, 0, h3, 0, st->k[3]);
+  h3 += tmp2;
+
+  buf_put_le32(mac + 0, h0);
+  buf_put_le32(mac + 4, h1);
+  buf_put_le32(mac + 8, h2);
+  buf_put_le32(mac + 12, h3);
 
   /* burn_stack */
-  return (13 * sizeof (u32) + sizeof (u64) +
-	  POLY1305_REF_BLOCKSIZE + 6 * sizeof (void *)) + burn;
+  return 4 * sizeof (void *) + 10 * sizeof (u32) + burn;
 }
-#endif /* !POLY1305_USE_SSE2*/
 
-
-

-
-
-static inline void *
-poly1305_get_state (poly1305_context_t * ctx)
-{
-  byte *c = ctx->state;
-  c += POLY1305_STATE_ALIGNMENT - 1;
-  c -= (uintptr_t) c & (POLY1305_STATE_ALIGNMENT - 1);
-  return c;
-}
-
-
-static void
-poly1305_init (poly1305_context_t * ctx, const poly1305_key_t * key)
-{
-  void *state = poly1305_get_state (ctx);
-
-  ctx->leftover = 0;
-
-  ctx->ops->init_ext (state, key);
-}
+#endif /* USE_MPI_32BIT */
 
 
 void
-_gcry_poly1305_update (poly1305_context_t * ctx, const byte * m, size_t bytes)
+_gcry_poly1305_update (poly1305_context_t *ctx, const byte *m, size_t bytes)
 {
-  void *state = poly1305_get_state (ctx);
   unsigned int burn = 0;
-  size_t block_size = ctx->ops->block_size;
 
   /* handle leftover */
   if (ctx->leftover)
     {
-      size_t want = (block_size - ctx->leftover);
+      size_t want = (POLY1305_BLOCKSIZE - ctx->leftover);
       if (want > bytes)
 	want = bytes;
       buf_cpy (ctx->buffer + ctx->leftover, m, want);
       bytes -= want;
       m += want;
       ctx->leftover += want;
-      if (ctx->leftover < block_size)
+      if (ctx->leftover < POLY1305_BLOCKSIZE)
 	return;
-      burn = ctx->ops->blocks (state, ctx->buffer, block_size);
+      burn = poly1305_blocks (ctx, ctx->buffer, POLY1305_BLOCKSIZE, 1);
       ctx->leftover = 0;
     }
 
   /* process full blocks */
-  if (bytes >= block_size)
+  if (bytes >= POLY1305_BLOCKSIZE)
     {
-      size_t want = (bytes & ~(block_size - 1));
-      burn = ctx->ops->blocks (state, m, want);
-      m += want;
-      bytes -= want;
+      size_t nblks = bytes / POLY1305_BLOCKSIZE;
+      burn = poly1305_blocks (ctx, m, nblks * POLY1305_BLOCKSIZE, 1);
+      m += nblks * POLY1305_BLOCKSIZE;
+      bytes -= nblks * POLY1305_BLOCKSIZE;
     }
 
   /* store leftover */
@@ -441,12 +487,11 @@ _gcry_poly1305_update (poly1305_context_t * ctx, const byte * m, size_t bytes)
 
 
 void
-_gcry_poly1305_finish (poly1305_context_t * ctx, byte mac[POLY1305_TAGLEN])
+_gcry_poly1305_finish (poly1305_context_t *ctx, byte mac[POLY1305_TAGLEN])
 {
-  void *state = poly1305_get_state (ctx);
   unsigned int burn;
 
-  burn = ctx->ops->finish_ext (state, ctx->buffer, ctx->leftover, mac);
+  burn = poly1305_final (ctx, mac);
 
   _gcry_burn_stack (burn);
 }
@@ -458,8 +503,6 @@ _gcry_poly1305_init (poly1305_context_t * ctx, const byte * key,
 {
   static int initialized;
   static const char *selftest_failed;
-  poly1305_key_t keytmp;
-  unsigned int features = _gcry_get_hw_features ();
 
   if (!initialized)
     {
@@ -475,26 +518,7 @@ _gcry_poly1305_init (poly1305_context_t * ctx, const byte * key,
   if (selftest_failed)
     return GPG_ERR_SELFTEST_FAILED;
 
-#ifdef POLY1305_USE_SSE2
-  ctx->ops = &poly1305_amd64_sse2_ops;
-#else
-  ctx->ops = &poly1305_default_ops;
-#endif
-
-#ifdef POLY1305_USE_AVX2
-  if (features & HWF_INTEL_AVX2)
-    ctx->ops = &poly1305_amd64_avx2_ops;
-#endif
-#ifdef POLY1305_USE_NEON
-  if (features & HWF_ARM_NEON)
-    ctx->ops = &poly1305_armv7_neon_ops;
-#endif
-  (void)features;
-
-  buf_cpy (keytmp.b, key, POLY1305_KEYLEN);
-  poly1305_init (ctx, &keytmp);
-
-  wipememory (&keytmp, sizeof (keytmp));
+  poly1305_init (ctx, key);
 
   return 0;
 }
diff --git a/configure.ac b/configure.ac
index 57b840e6e..c4b59f4dd 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2239,19 +2239,6 @@ if test "$found" = "1" ; then
    fi
 fi
 
-case "${host}" in
-   x86_64-*-*)
-      # Build with the assembly implementation
-      GCRYPT_CIPHERS="$GCRYPT_CIPHERS poly1305-sse2-amd64.lo"
-      GCRYPT_CIPHERS="$GCRYPT_CIPHERS poly1305-avx2-amd64.lo"
-   ;;
-esac
-
-if test x"$neonsupport" = xyes ; then
-   # Build with the NEON implementation
-   GCRYPT_CIPHERS="$GCRYPT_CIPHERS poly1305-armv7-neon.lo"
-fi
-
 LIST_MEMBER(dsa, $enabled_pubkey_ciphers)
 if test "$found" = "1" ; then
    GCRYPT_PUBKEY_CIPHERS="$GCRYPT_PUBKEY_CIPHERS dsa.lo"



