[PATCH 1/2] poly1305: add AVX512 implementation

From: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date: Sun Apr 3 17:10:42 CEST 2022


* LICENSES: Add 3-clause BSD license for poly1305-amd64-avx512.S.
* cipher/Makefile.am: Add 'poly1305-amd64-avx512.S'.
* cipher/poly1305-amd64-avx512.S: New.
* cipher/poly1305-internal.h (POLY1305_USE_AVX512): New.
(poly1305_context_s): Add 'use_avx512'.
* cipher/poly1305.c (ASM_FUNC_ABI, ASM_FUNC_WRAPPER_ATTR): New.
[POLY1305_USE_AVX512] (_gcry_poly1305_amd64_avx512_blocks)
(poly1305_amd64_avx512_blocks): New.
(poly1305_init): Use AVX512 if HW feature available (set use_avx512).
[USE_MPI_64BIT] (poly1305_blocks): Rename to ...
[USE_MPI_64BIT] (poly1305_blocks_generic): ... this.
[USE_MPI_64BIT] (poly1305_blocks): New.
--

This patch adds an AMD64 AVX512-FMA52 (vpmadd52) implementation of Poly1305.

Benchmark on Intel Core i3-1115G4 (tigerlake):

 Before:
                    |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
 POLY1305           |     0.306 ns/B      3117 MiB/s      1.25 c/B      4090

 After (5.0x faster):
                    |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
 POLY1305           |     0.061 ns/B     15699 MiB/s     0.249 c/B      4095±3

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 LICENSES                       |   30 +
 cipher/Makefile.am             |    2 +-
 cipher/poly1305-amd64-avx512.S | 1625 ++++++++++++++++++++++++++++++++
 cipher/poly1305-internal.h     |   13 +
 cipher/poly1305.c              |   50 +-
 configure.ac                   |    3 +
 6 files changed, 1720 insertions(+), 3 deletions(-)
 create mode 100644 cipher/poly1305-amd64-avx512.S
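
As a reading aid, this is roughly how the ChangeLog entries above fit
together on the C side. It is a sketch only: the names come from the
ChangeLog, while the parameter lists and the dispatch condition are
assumptions for illustration; the real code is in the poly1305.c hunk of
the patch.

  /* Sketch of the poly1305.c dispatch; illustrative only. */
  static unsigned int
  poly1305_blocks (poly1305_context_t *ctx, const byte *buf, size_t len,
                   byte high_pad)
  {
  #ifdef POLY1305_USE_AVX512
    /* use_avx512 is set by poly1305_init() when the HW feature is available.
     * poly1305_amd64_avx512_blocks() is assumed to be the thin
     * ASM_FUNC_WRAPPER_ATTR wrapper around the asm entry point
     * _gcry_poly1305_amd64_avx512_blocks(msg, msg_len, hash, key), which
     * always applies the 2^128 padding bit, hence the high_pad guard.  */
    if (ctx->use_avx512 && high_pad)
      return poly1305_amd64_avx512_blocks (ctx, buf, len);
  #endif

    return poly1305_blocks_generic (ctx, buf, len, high_pad);
  }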

diff --git a/LICENSES b/LICENSES
index 94499501..67b80e64 100644
--- a/LICENSES
+++ b/LICENSES
@@ -56,6 +56,36 @@ with any binary distributions derived from the GNU C Library.
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #+end_quote
 
+  For files:
+  - cipher/poly1305-amd64-avx512.S
+
+#+begin_quote
+   Copyright (c) 2021-2022, Intel Corporation
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are met:
+
+       * Redistributions of source code must retain the above copyright notice,
+         this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above copyright
+         notice, this list of conditions and the following disclaimer in the
+         documentation and/or other materials provided with the distribution.
+       * Neither the name of Intel Corporation nor the names of its contributors
+         may be used to endorse or promote products derived from this software
+         without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+   DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+   FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+   DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+   SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+   CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+   OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#+end_quote
+
   For files:
   - random/jitterentropy-base.c
   - random/jitterentropy-gcd.c
diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 1ac1923b..b6319d35 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -98,7 +98,7 @@ EXTRA_libcipher_la_SOURCES = \
 	gostr3411-94.c \
 	md4.c \
 	md5.c \
-	poly1305-s390x.S \
+	poly1305-s390x.S poly1305-amd64-avx512.S \
 	rijndael.c rijndael-internal.h rijndael-tables.h   \
 	rijndael-aesni.c rijndael-padlock.c                \
 	rijndael-amd64.S rijndael-arm.S                    \
diff --git a/cipher/poly1305-amd64-avx512.S b/cipher/poly1305-amd64-avx512.S
new file mode 100644
index 00000000..48892777
--- /dev/null
+++ b/cipher/poly1305-amd64-avx512.S
@@ -0,0 +1,1625 @@
+/*
+;;
+;; Copyright (c) 2021-2022, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+*/
+/*
+ * From:
+ *  https://github.com/intel/intel-ipsec-mb/blob/f0cad21a644231c0f5d4af51f56061a5796343fb/lib/avx512/poly_fma_avx512.asm
+ *
+ * Conversion to GAS assembly and integration to libgcrypt
+ *  by Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ */
+
+#ifdef __x86_64
+#include <config.h>
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
+    defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \
+    defined(HAVE_GCC_INLINE_ASM_AVX512)
+#include "asm-common-amd64.h"
+
+.intel_syntax noprefix
+
+.text
+
+ELF(.type _gcry_poly1305_avx512_consts,@object)
+_gcry_poly1305_avx512_consts:
+
+.align 64
+.Lmask_44:
+  .quad 0xfffffffffff, 0xfffffffffff, 0xfffffffffff, 0xfffffffffff
+  .quad 0xfffffffffff, 0xfffffffffff, 0xfffffffffff, 0xfffffffffff
+
+.align 64
+.Lmask_42:
+  .quad 0x3ffffffffff, 0x3ffffffffff, 0x3ffffffffff, 0x3ffffffffff
+  .quad 0x3ffffffffff, 0x3ffffffffff, 0x3ffffffffff, 0x3ffffffffff
+
+.align 64
+.Lhigh_bit:
+  .quad 0x10000000000, 0x10000000000, 0x10000000000, 0x10000000000
+  .quad 0x10000000000, 0x10000000000, 0x10000000000, 0x10000000000
+
+.Lbyte_len_to_mask_table:
+  .short 0x0000, 0x0001, 0x0003, 0x0007
+  .short 0x000f, 0x001f, 0x003f, 0x007f
+  .short 0x00ff, 0x01ff, 0x03ff, 0x07ff
+  .short 0x0fff, 0x1fff, 0x3fff, 0x7fff
+  .short 0xffff
+
+.align 64
+.Lbyte64_len_to_mask_table:
+  .quad 0x0000000000000000, 0x0000000000000001
+  .quad 0x0000000000000003, 0x0000000000000007
+  .quad 0x000000000000000f, 0x000000000000001f
+  .quad 0x000000000000003f, 0x000000000000007f
+  .quad 0x00000000000000ff, 0x00000000000001ff
+  .quad 0x00000000000003ff, 0x00000000000007ff
+  .quad 0x0000000000000fff, 0x0000000000001fff
+  .quad 0x0000000000003fff, 0x0000000000007fff
+  .quad 0x000000000000ffff, 0x000000000001ffff
+  .quad 0x000000000003ffff, 0x000000000007ffff
+  .quad 0x00000000000fffff, 0x00000000001fffff
+  .quad 0x00000000003fffff, 0x00000000007fffff
+  .quad 0x0000000000ffffff, 0x0000000001ffffff
+  .quad 0x0000000003ffffff, 0x0000000007ffffff
+  .quad 0x000000000fffffff, 0x000000001fffffff
+  .quad 0x000000003fffffff, 0x000000007fffffff
+  .quad 0x00000000ffffffff, 0x00000001ffffffff
+  .quad 0x00000003ffffffff, 0x00000007ffffffff
+  .quad 0x0000000fffffffff, 0x0000001fffffffff
+  .quad 0x0000003fffffffff, 0x0000007fffffffff
+  .quad 0x000000ffffffffff, 0x000001ffffffffff
+  .quad 0x000003ffffffffff, 0x000007ffffffffff
+  .quad 0x00000fffffffffff, 0x00001fffffffffff
+  .quad 0x00003fffffffffff, 0x00007fffffffffff
+  .quad 0x0000ffffffffffff, 0x0001ffffffffffff
+  .quad 0x0003ffffffffffff, 0x0007ffffffffffff
+  .quad 0x000fffffffffffff, 0x001fffffffffffff
+  .quad 0x003fffffffffffff, 0x007fffffffffffff
+  .quad 0x00ffffffffffffff, 0x01ffffffffffffff
+  .quad 0x03ffffffffffffff, 0x07ffffffffffffff
+  .quad 0x0fffffffffffffff, 0x1fffffffffffffff
+  .quad 0x3fffffffffffffff, 0x7fffffffffffffff
+  .quad 0xffffffffffffffff
+
+.Lqword_high_bit_mask:
+  .short 0, 0x1, 0x5, 0x15, 0x55, 0x57, 0x5f, 0x7f, 0xff
+
+ELF(.size _gcry_poly1305_avx512_consts,.-_gcry_poly1305_avx512_consts)
+
+#define raxd eax
+#define rbxd ebx
+#define rcxd ecx
+#define rdxd edx
+#define rsid esi
+#define rdid edi
+#define rbpd ebp
+#define rspd esp
+#define __DWORD(X) X##d
+#define DWORD(R) __DWORD(R)
+
+#define arg1    rdi
+#define arg2    rsi
+#define arg3    rdx
+#define arg4    rcx
+
+#define job     arg1
+#define gp1     rsi
+#define gp2     rcx
+
+/* ;; don't use rdx and rax - they are needed for multiply operation */
+#define gp3     rbp
+#define gp4     r8
+#define gp5     r9
+#define gp6     r10
+#define gp7     r11
+#define gp8     r12
+#define gp9     r13
+#define gp10    r14
+#define gp11    r15
+
+#define len     gp11
+#define msg     gp10
+
+#define POLY1305_BLOCK_SIZE 16
+
+#define STACK_r_save         0
+#define STACK_r_save_size    (6 * 64)
+#define STACK_gpr_save       (STACK_r_save + STACK_r_save_size)
+#define STACK_gpr_save_size  (8 * 8)
+#define STACK_rsp_save       (STACK_gpr_save + STACK_gpr_save_size)
+#define STACK_rsp_save_size  (1 * 8)
+#define STACK_SIZE           (STACK_rsp_save + STACK_rsp_save_size)
+
+#define A2_ZERO(...) /**/
+#define A2_ZERO_INVERT(...) __VA_ARGS__
+#define A2_NOT_ZERO(...) __VA_ARGS__
+#define A2_NOT_ZERO_INVERT(...) /**/
+
+#define clear_zmm(vec) vpxord vec, vec, vec
+
+/*
+;; =============================================================================
+;; =============================================================================
+;; Computes hash for message length being multiple of block size
+;; =============================================================================
+;; Combining 64-bit x 64-bit multiplication with reduction steps
+;;
+;; NOTES:
+;;   1) A2 here is only two bits so anything above is subject to reduction.
+;;      Constant C1 = R1 + (R1 >> 2) simplifies the multiply with fewer operations
+;;   2) Magic 5x comes from mod 2^130-5 property and incorporating
+;;      reduction into multiply phase.
+;;      See "Cheating at modular arithmetic" and "Poly1305's prime: 2^130 - 5"
+;;      paragraphs at https://loup-vaillant.fr/tutorials/poly1305-design for more details.
+;;
+;; Flow of the code below is as follows:
+;;
+;;          A2        A1        A0
+;;        x           R1        R0
+;;   -----------------------------
+;;       A2×R0     A1×R0     A0×R0
+;;   +             A0×R1
+;;   +           5xA2xR1   5xA1xR1
+;;   -----------------------------
+;;     [0|L2L] [L1H|L1L] [L0H|L0L]
+;;
+;;   Registers:  T3:T2     T1:A0
+;;
+;; Completing the multiply and adding (with carry) 3x128-bit limbs into
+;; 192-bits again (3x64-bits):
+;; A0 = L0L
+;; A1 = L0H + L1L
+;; T3 = L1H + L2L
+; A0     [in/out] GPR with accumulator bits 63:0
+; A1     [in/out] GPR with accumulator bits 127:64
+; A2     [in/out] GPR with accumulator bits 191:128 (only bits 129:128 used)
+; R0     [in] GPR with R constant bits 63:0
+; R1     [in] GPR with R constant bits 127:64
+; C1     [in] C1 = R1 + (R1 >> 2)
+; T1     [clobbered] GPR register
+; T2     [clobbered] GPR register
+; T3     [clobbered] GPR register
+; GP_RAX [clobbered] RAX register
+; GP_RDX [clobbered] RDX register
+; IF_A2  [in] Used if input A2 is not 0
+*/
+#define POLY1305_MUL_REDUCE(A0, A1, A2, R0, R1, C1, T1, T2, T3, GP_RAX, GP_RDX, IF_A2) \
+	/* T3:T2 = (A0 * R1) */ \
+	mov     GP_RAX, R1; \
+	mul     A0; \
+	mov     T2, GP_RAX; \
+	mov     GP_RAX, R0; \
+	mov     T3, GP_RDX; \
+	\
+	/* T1:A0 = (A0 * R0) */ \
+	mul     A0; \
+	mov     A0, GP_RAX; /* A0 not used in other operations */ \
+	mov     GP_RAX, R0; \
+	mov     T1, GP_RDX; \
+	\
+	/* T3:T2 += (A1 * R0) */ \
+	mul     A1; \
+	add     T2, GP_RAX; \
+	mov     GP_RAX, C1; \
+	adc     T3, GP_RDX; \
+	\
+	/* T1:A0 += (A1 * R1x5) */ \
+	mul     A1; \
+	IF_A2(mov A1, A2); /* use A1 for A2 */ \
+	add     A0, GP_RAX; \
+	adc     T1, GP_RDX; \
+	\
+	/* NOTE: A2 is clamped to 2-bits, */ \
+	/*       R1/R0 is clamped to 60-bits, */ \
+	/*       their product is less than 2^64. */ \
+	\
+	IF_A2(/* T3:T2 += (A2 * R1x5) */); \
+	IF_A2(imul    A1, C1); \
+	IF_A2(add     T2, A1); \
+	IF_A2(mov     A1, T1); /* T1:A0 => A1:A0 */ \
+	IF_A2(adc     T3, 0); \
+	\
+	IF_A2(/* T3:A1 += (A2 * R0) */); \
+	IF_A2(imul    A2, R0); \
+	IF_A2(add     A1, T2); \
+	IF_A2(adc     T3, A2); \
+	\
+	IF_A2##_INVERT(/* If A2 == 0, just move and add T1-T2 to A1 */); \
+	IF_A2##_INVERT(mov     A1, T1); \
+	IF_A2##_INVERT(add     A1, T2); \
+	IF_A2##_INVERT(adc     T3, 0); \
+	\
+	/* At this point, 3 64-bit limbs are in T3:A1:A0 */ \
+	/* T3 can span over more than 2 bits so final partial reduction step is needed. */ \
+	\
+	/* Partial reduction (just to fit into 130 bits) */ \
+	/*    A2 = T3 & 3 */ \
+	/*    k = (T3 & ~3) + (T3 >> 2) */ \
+	/*         Y    x4  +  Y    x1 */ \
+	/*    A2:A1:A0 += k */ \
+	\
+	/* Result will be in A2:A1:A0 */ \
+	mov     T1, T3; \
+	mov     DWORD(A2), DWORD(T3); \
+	and     T1, ~3; \
+	shr     T3, 2; \
+	and     DWORD(A2), 3; \
+	add     T1, T3; \
+	\
+	/* A2:A1:A0 += k (kept in T1) */ \
+	add     A0, T1; \
+	adc     A1, 0; \
+	adc     DWORD(A2), 0
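
For reference, the same flow in plain C (a model for reading only, not part
of the patch; it assumes a compiler with unsigned __int128 and covers the
A2 != 0 case, the A2 == 0 variant simply drops the A2 terms):

  #include <stdint.h>

  typedef unsigned __int128 u128;

  /* (a2:a1:a0) = ((a2:a1:a0) * (r1:r0)) mod 2^130-5, partially reduced.
   * Assumes clamped r: r0 < 2^60, r1 < 2^60 and r1 % 4 == 0, so that
   * c1 = r1 + (r1 >> 2) equals (5*r1)/4 exactly and the a1*r1*2^128 term
   * folds down to a1*c1 at weight 2^0.  */
  static void
  poly1305_mul_reduce_model (uint64_t *a0, uint64_t *a1, uint64_t *a2,
                             uint64_t r0, uint64_t r1)
  {
    uint64_t c1 = r1 + (r1 >> 2);
    u128 d0 = (u128)(*a0) * r0 + (u128)(*a1) * c1;                    /* 2^0 column */
    u128 d1 = (u128)(*a0) * r1 + (u128)(*a1) * r0 + (u128)(*a2) * c1; /* 2^64 column */
    uint64_t lo, mid, hi, k;

    lo = (uint64_t)d0;
    d1 += (uint64_t)(d0 >> 64);
    mid = (uint64_t)d1;
    hi = (uint64_t)(d1 >> 64) + *a2 * r0;                             /* 2^128 column */

    /* Partial reduction: keep two bits in a2 and fold the rest back in,
     * using (hi >> 2) * 2^130 == 5 * (hi >> 2) mod 2^130-5.  */
    k = (hi & ~(uint64_t)3) + (hi >> 2);    /* == 5 * (hi >> 2) */
    *a2 = hi & 3;
    d0 = (u128)lo + k;
    *a0 = (uint64_t)d0;
    d0 = (u128)mid + (uint64_t)(d0 >> 64);
    *a1 = (uint64_t)d0;
    *a2 += (uint64_t)(d0 >> 64);
  }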
+
+/*
+;; =============================================================================
+;; =============================================================================
+;; Computes hash for 8 16-byte message blocks,
+;; and adds new message blocks to accumulator.
+;;
+;; It first multiplies all 8 blocks with powers of R:
+;;
+;;      a2      a1      a0
+;; ×    b2      b1      b0
+;; ---------------------------------------
+;;     a2×b0   a1×b0   a0×b0
+;; +   a1×b1   a0×b1 5×a2×b1
+;; +   a0×b2 5×a2×b2 5×a1×b2
+;; ---------------------------------------
+;;        p2      p1      p0
+;;
+;; Then, it propagates the carry (higher bits after bit 43) from lower limbs into higher limbs,
+;; multiplying by 5 in case of the carry of p2.
+;;
+;A0    [in/out] ZMM register containing 1st 44-bit limb of the 8 blocks
+;A1    [in/out] ZMM register containing 2nd 44-bit limb of the 8 blocks
+;A2    [in/out] ZMM register containing 3rd 44-bit limb of the 8 blocks
+;R0    [in] ZMM register (R0) to include the 1st limb of R
+;R1    [in] ZMM register (R1) to include the 2nd limb of R
+;R2    [in] ZMM register (R2) to include the 3rd limb of R
+;R1P   [in] ZMM register (R1') to include the 2nd limb of R (multiplied by 4*5)
+;R2P   [in] ZMM register (R2') to include the 3rd limb of R (multiplied by 4*5)
+;P0_L  [clobbered] ZMM register for the low-half products (vpmadd52luq) of p[0]
+;P0_H  [clobbered] ZMM register for the high-half products (vpmadd52huq) of p[0]
+;P1_L  [clobbered] ZMM register for the low-half products (vpmadd52luq) of p[1]
+;P1_H  [clobbered] ZMM register for the high-half products (vpmadd52huq) of p[1]
+;P2_L  [clobbered] ZMM register for the low-half products (vpmadd52luq) of p[2]
+;P2_H  [clobbered] ZMM register for the high-half products (vpmadd52huq) of p[2]
+;ZTMP1 [clobbered] Temporary ZMM register
+*/
+#define POLY1305_MUL_REDUCE_VEC(A0, A1, A2, R0, R1, R2, R1P, R2P, P0_L, P0_H, \
+				P1_L, P1_H, P2_L, P2_H, ZTMP1) \
+	/* ;; Reset accumulator */ \
+	vpxorq  P0_L, P0_L, P0_L; \
+	vpxorq  P0_H, P0_H, P0_H; \
+	vpxorq  P1_L, P1_L, P1_L; \
+	vpxorq  P1_H, P1_H, P1_H; \
+	vpxorq  P2_L, P2_L, P2_L; \
+	vpxorq  P2_H, P2_H, P2_H; \
+	\
+	/* ; Reset accumulator and calculate products */ \
+	vpmadd52luq P0_L, A2, R1P; \
+	vpmadd52huq P0_H, A2, R1P; \
+	vpmadd52luq P1_L, A2, R2P; \
+	vpmadd52huq P1_H, A2, R2P; \
+	vpmadd52luq P2_L, A2, R0; \
+	vpmadd52huq P2_H, A2, R0; \
+	\
+	vpmadd52luq P1_L, A0, R1; \
+	vpmadd52huq P1_H, A0, R1; \
+	vpmadd52luq P2_L, A0, R2; \
+	vpmadd52huq P2_H, A0, R2; \
+	vpmadd52luq P0_L, A0, R0; \
+	vpmadd52huq P0_H, A0, R0; \
+	\
+	vpmadd52luq P0_L, A1, R2P; \
+	vpmadd52huq P0_H, A1, R2P; \
+	vpmadd52luq P1_L, A1, R0; \
+	vpmadd52huq P1_H, A1, R0; \
+	vpmadd52luq P2_L, A1, R1; \
+	vpmadd52huq P2_H, A1, R1; \
+	\
+	/* ; Carry propagation (first pass) */ \
+	vpsrlq  ZTMP1, P0_L, 44; \
+	vpandq  A0, P0_L, [.Lmask_44 ADD_RIP]; /* ; Clear top 20 bits */ \
+	vpsllq  P0_H, P0_H, 8; \
+	vpaddq  P0_H, P0_H, ZTMP1; \
+	vpaddq  P1_L, P1_L, P0_H; \
+	vpandq  A1, P1_L, [.Lmask_44 ADD_RIP]; /* ; Clear top 20 bits */ \
+	vpsrlq  ZTMP1, P1_L, 44; \
+	vpsllq  P1_H, P1_H, 8; \
+	vpaddq  P1_H, P1_H, ZTMP1; \
+	vpaddq  P2_L, P2_L, P1_H; \
+	vpandq  A2, P2_L, [.Lmask_42 ADD_RIP]; /* ; Clear top 22 bits */ \
+	vpsrlq  ZTMP1, P2_L, 42; \
+	vpsllq  P2_H, P2_H, 10; \
+	vpaddq  P2_H, P2_H, ZTMP1; \
+	\
+	/* ; Carry propagation (second pass) */ \
+	\
+	/* ; Multiply by 5 the highest bits (above 130 bits) */ \
+	vpaddq  A0, A0, P2_H; \
+	vpsllq  P2_H, P2_H, 2; \
+	vpaddq  A0, A0, P2_H; \
+	vpsrlq  ZTMP1, A0, 44; \
+	vpandq  A0, A0, [.Lmask_44 ADD_RIP]; \
+	vpaddq  A1, A1, ZTMP1;
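
Per lane, the vector macro above computes the following; here is a plain-C
model (reading aid only, not part of the patch). Note that the R1P/R2P
inputs are 4*5*R1 and 4*5*R2 (see the "Generate 4*5*R^4" code further down):
the factor 5 comes from 2^130 = 5 mod 2^130-5 and the extra factor 4 from
folding weight 2^132 down to 2^130.

  #include <stdint.h>

  typedef unsigned __int128 u128;

  #define MASK44 ((1ULL << 44) - 1)
  #define MASK42 ((1ULL << 42) - 1)

  /* One lane of POLY1305_MUL_REDUCE_VEC: a[] holds the 44/44/42-bit limbs of
   * one accumulator lane, r[] the 44/44/42-bit limbs of the power of R used
   * for that lane.  */
  static void
  poly1305_mul_reduce_44_model (uint64_t a[3], const uint64_t r[3])
  {
    /* 2^132 == 4*2^130 == 4*5 (mod 2^130-5), hence the 20x coefficients. */
    u128 p0 = (u128)a[0] * r[0] + (u128)a[1] * (20 * r[2]) + (u128)a[2] * (20 * r[1]);
    u128 p1 = (u128)a[0] * r[1] + (u128)a[1] * r[0]        + (u128)a[2] * (20 * r[2]);
    u128 p2 = (u128)a[0] * r[2] + (u128)a[1] * r[1]        + (u128)a[2] * r[0];

    /* Carry propagation (first pass) */
    a[0] = (uint64_t)p0 & MASK44;  p1 += (uint64_t)(p0 >> 44);
    a[1] = (uint64_t)p1 & MASK44;  p2 += (uint64_t)(p1 >> 44);
    a[2] = (uint64_t)p2 & MASK42;

    /* Carry propagation (second pass): bits above 2^130, times 5, back into a[0] */
    a[0] += 5 * (uint64_t)(p2 >> 42);
    a[1] += a[0] >> 44;
    a[0] &= MASK44;
  }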
+
+/*
+;; =============================================================================
+;; =============================================================================
+;; Computes hash for 16 16-byte message blocks,
+;; and adds new message blocks to accumulator,
+;; interleaving this computation with the loading and splatting
+;; of new data.
+;;
+;; It first multiplies all 16 blocks with powers of R (8 blocks from A0-A2
+;; and 8 blocks from B0-B2, multiplied by R0-R2)
+;;
+;;      a2      a1      a0
+;; ×    b2      b1      b0
+;; ---------------------------------------
+;;     a2×b0   a1×b0   a0×b0
+;; +   a1×b1   a0×b1 5×a2×b1
+;; +   a0×b2 5×a2×b2 5×a1×b2
+;; ---------------------------------------
+;;        p2      p1      p0
+;;
+;; Then, it propagates the carry (higher bits after bit 43)
+;; from lower limbs into higher limbs,
+;; multiplying by 5 in case of the carry of p2, and adds
+;; the results to A0-A2 and B0-B2.
+;;
+;; =============================================================================
+;A0    [in/out] ZMM register containing 1st 44-bit limb of blocks 1-8
+;A1    [in/out] ZMM register containing 2nd 44-bit limb of blocks 1-8
+;A2    [in/out] ZMM register containing 3rd 44-bit limb of blocks 1-8
+;B0    [in/out] ZMM register containing 1st 44-bit limb of blocks 9-16
+;B1    [in/out] ZMM register containing 2nd 44-bit limb of blocks 9-16
+;B2    [in/out] ZMM register containing 3rd 44-bit limb of blocks 9-16
+;R0    [in] ZMM register (R0) to include the 1st limb of R
+;R1    [in] ZMM register (R1) to include the 2nd limb of R
+;R2    [in] ZMM register (R2) to include the 3rd limb of R
+;R1P   [in] ZMM register (R1') to include the 2nd limb of R (multiplied by 4*5)
+;R2P   [in] ZMM register (R2') to include the 3rd limb of R (multiplied by 4*5)
+;P0_L  [clobbered] ZMM register for the low-half products of p[0], blocks 1-8
+;P0_H  [clobbered] ZMM register for the high-half products of p[0], blocks 1-8
+;P1_L  [clobbered] ZMM register for the low-half products of p[1], blocks 1-8
+;P1_H  [clobbered] ZMM register for the high-half products of p[1], blocks 1-8
+;P2_L  [clobbered] ZMM register for the low-half products of p[2], blocks 1-8
+;P2_H  [clobbered] ZMM register for the high-half products of p[2], blocks 1-8
+;Q0_L  [clobbered] ZMM register for the low-half products of p[0], blocks 9-16
+;Q0_H  [clobbered] ZMM register for the high-half products of p[0], blocks 9-16
+;Q1_L  [clobbered] ZMM register for the low-half products of p[1], blocks 9-16
+;Q1_H  [clobbered] ZMM register for the high-half products of p[1], blocks 9-16
+;Q2_L  [clobbered] ZMM register for the low-half products of p[2], blocks 9-16
+;Q2_H  [clobbered] ZMM register for the high-half products of p[2], blocks 9-16
+;ZTMP1 [clobbered] Temporary ZMM register
+;ZTMP2 [clobbered] Temporary ZMM register
+;ZTMP3 [clobbered] Temporary ZMM register
+;ZTMP4 [clobbered] Temporary ZMM register
+;ZTMP5 [clobbered] Temporary ZMM register
+;ZTMP6 [clobbered] Temporary ZMM register
+;ZTMP7 [clobbered] Temporary ZMM register
+;ZTMP8 [clobbered] Temporary ZMM register
+;ZTMP9 [clobbered] Temporary ZMM register
+;MSG   [in/out] Pointer to message
+;LEN   [in/out] Length left of message
+*/
+#define POLY1305_MSG_MUL_REDUCE_VEC16(A0, A1, A2, B0, B1, B2, R0, R1, R2, R1P, \
+				      R2P, P0_L, P0_H, P1_L, P1_H, P2_L, P2_H, \
+				      Q0_L, Q0_H, Q1_L, Q1_H, Q2_L, Q2_H, \
+				      ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, \
+				      ZTMP6, ZTMP7, ZTMP8, ZTMP9, MSG, LEN) \
+	/* ;; Reset accumulator */ \
+	vpxorq  P0_L, P0_L, P0_L; \
+	vpxorq  P0_H, P0_H, P0_H; \
+	vpxorq  P1_L, P1_L, P1_L; \
+	vpxorq  P1_H, P1_H, P1_H; \
+	vpxorq  P2_L, P2_L, P2_L; \
+	vpxorq  P2_H, P2_H, P2_H; \
+	vpxorq  Q0_L, Q0_L, Q0_L; \
+	vpxorq  Q0_H, Q0_H, Q0_H; \
+	vpxorq  Q1_L, Q1_L, Q1_L; \
+	vpxorq  Q1_H, Q1_H, Q1_H; \
+	vpxorq  Q2_L, Q2_L, Q2_L; \
+	vpxorq  Q2_H, Q2_H, Q2_H; \
+	\
+	/* ;; This code interleaves hash computation with input loading/splatting */ \
+	\
+		/* ; Calculate products */ \
+		vpmadd52luq P0_L, A2, R1P; \
+		vpmadd52huq P0_H, A2, R1P; \
+	/* ;; input loading of new blocks */ \
+	add     MSG, POLY1305_BLOCK_SIZE*16; \
+	sub     LEN, POLY1305_BLOCK_SIZE*16; \
+	\
+		vpmadd52luq Q0_L, B2, R1P; \
+		vpmadd52huq Q0_H, B2, R1P; \
+		\
+		vpmadd52luq P1_L, A2, R2P; \
+		vpmadd52huq P1_H, A2, R2P; \
+	/* ; Load next block of data (128 bytes) */ \
+	vmovdqu64 ZTMP5, [MSG]; \
+	vmovdqu64 ZTMP2, [MSG + 64]; \
+	\
+		vpmadd52luq Q1_L, B2, R2P; \
+		vpmadd52huq Q1_H, B2, R2P; \
+	\
+	/* ; Interleave new blocks of data */ \
+	vpunpckhqdq ZTMP3, ZTMP5, ZTMP2; \
+	vpunpcklqdq ZTMP5, ZTMP5, ZTMP2; \
+	\
+		vpmadd52luq P0_L, A0, R0; \
+		vpmadd52huq P0_H, A0, R0; \
+	/* ; Highest 42-bit limbs of new blocks */ \
+	vpsrlq  ZTMP6, ZTMP3, 24; \
+	vporq   ZTMP6, ZTMP6, [.Lhigh_bit ADD_RIP]; /* ; Add 2^128 to all 8 final qwords of the message */ \
+	\
+		vpmadd52luq Q0_L, B0, R0; \
+		vpmadd52huq Q0_H, B0, R0; \
+		\
+	/* ; Middle 44-bit limbs of new blocks */ \
+	vpsrlq  ZTMP2, ZTMP5, 44; \
+	vpsllq  ZTMP4, ZTMP3, 20; \
+	\
+		vpmadd52luq P2_L, A2, R0; \
+		vpmadd52huq P2_H, A2, R0; \
+	vpternlogq ZTMP2, ZTMP4, [.Lmask_44 ADD_RIP], 0xA8; /* ; (A OR B AND C) */ \
+	\
+	/* ; Lowest 44-bit limbs of new blocks */ \
+	vpandq  ZTMP5, ZTMP5, [.Lmask_44 ADD_RIP]; \
+	\
+		vpmadd52luq Q2_L, B2, R0; \
+		vpmadd52huq Q2_H, B2, R0; \
+		\
+	/* ; Load next block of data (128 bytes) */ \
+	vmovdqu64 ZTMP8, [MSG + 64*2]; \
+	vmovdqu64 ZTMP9, [MSG + 64*3]; \
+	\
+		vpmadd52luq P1_L, A0, R1; \
+		vpmadd52huq P1_H, A0, R1; \
+	/* ; Interleave new blocks of data */ \
+	vpunpckhqdq ZTMP3, ZTMP8, ZTMP9; \
+	vpunpcklqdq ZTMP8, ZTMP8, ZTMP9; \
+	\
+		vpmadd52luq Q1_L, B0, R1; \
+		vpmadd52huq Q1_H, B0, R1; \
+	\
+	/* ; Highest 42-bit limbs of new blocks */ \
+	vpsrlq  ZTMP7, ZTMP3, 24; \
+	vporq   ZTMP7, ZTMP7, [.Lhigh_bit ADD_RIP]; /* ; Add 2^128 to all 8 final qwords of the message */ \
+	\
+		vpmadd52luq P0_L, A1, R2P; \
+		vpmadd52huq P0_H, A1, R2P; \
+		\
+	/* ; Middle 44-bit limbs of new blocks */ \
+	vpsrlq  ZTMP9, ZTMP8, 44; \
+	vpsllq  ZTMP4, ZTMP3, 20; \
+	\
+		vpmadd52luq Q0_L, B1, R2P; \
+		vpmadd52huq Q0_H, B1, R2P; \
+		\
+	vpternlogq ZTMP9, ZTMP4, [.Lmask_44 ADD_RIP], 0xA8; /* ; (A OR B AND C) */ \
+	\
+	/* ; Lowest 44-bit limbs of new blocks */ \
+	vpandq  ZTMP8, ZTMP8, [.Lmask_44 ADD_RIP]; \
+	\
+		vpmadd52luq P2_L, A0, R2; \
+		vpmadd52huq P2_H, A0, R2; \
+	/* ; Carry propagation (first pass) */ \
+	vpsrlq  ZTMP1, P0_L, 44; \
+	vpsllq  P0_H, P0_H, 8; \
+		vpmadd52luq Q2_L, B0, R2; \
+		vpmadd52huq Q2_H, B0, R2; \
+		\
+	vpsrlq  ZTMP3, Q0_L, 44; \
+	vpsllq  Q0_H, Q0_H, 8; \
+	\
+		vpmadd52luq P1_L, A1, R0; \
+		vpmadd52huq P1_H, A1, R0; \
+	/* ; Carry propagation (first pass) - continue */ \
+	vpandq  A0, P0_L, [.Lmask_44 ADD_RIP]; /* ; Clear top 20 bits */ \
+	vpaddq  P0_H, P0_H, ZTMP1; \
+		vpmadd52luq Q1_L, B1, R0; \
+		vpmadd52huq Q1_H, B1, R0; \
+	\
+	vpandq  B0, Q0_L, [.Lmask_44 ADD_RIP]; /* ; Clear top 20 bits */ \
+	vpaddq  Q0_H, Q0_H, ZTMP3; \
+	\
+		vpmadd52luq P2_L, A1, R1; \
+		vpmadd52huq P2_H, A1, R1; \
+	/* ; Carry propagation (first pass) - continue */ \
+	vpaddq  P1_L, P1_L, P0_H; \
+	vpsllq  P1_H, P1_H, 8; \
+	vpsrlq  ZTMP1, P1_L, 44; \
+		vpmadd52luq Q2_L, B1, R1; \
+		vpmadd52huq Q2_H, B1, R1; \
+	\
+	vpandq  A1, P1_L, [.Lmask_44 ADD_RIP]; /* ; Clear top 20 bits */ \
+	vpaddq  Q1_L, Q1_L, Q0_H; \
+	vpsllq  Q1_H, Q1_H, 8; \
+	vpsrlq  ZTMP3, Q1_L, 44; \
+	vpandq  B1, Q1_L, [.Lmask_44 ADD_RIP]; /* ; Clear top 20 bits */ \
+	\
+	vpaddq  P2_L, P2_L, P1_H; /* ; P2_L += P1_H + P1_L[63:44] */ \
+	vpaddq  P2_L, P2_L, ZTMP1; \
+	vpandq  A2, P2_L, [.Lmask_42 ADD_RIP]; /* ; Clear top 22 bits */ \
+	vpaddq  A2, A2, ZTMP6; /* ; Add highest bits from new blocks to accumulator */ \
+	vpsrlq  ZTMP1, P2_L, 42; \
+	vpsllq  P2_H, P2_H, 10; \
+	vpaddq  P2_H, P2_H, ZTMP1; \
+	\
+	vpaddq  Q2_L, Q2_L, Q1_H; /* ; Q2_L += Q1_H + Q1_L[63:44] */ \
+	vpaddq  Q2_L, Q2_L, ZTMP3; \
+	vpandq  B2, Q2_L, [.Lmask_42 ADD_RIP]; /* ; Clear top 22 bits */ \
+	vpaddq  B2, B2, ZTMP7; /* ; Add highest bits from new blocks to accumulator */ \
+	vpsrlq  ZTMP3, Q2_L, 42; \
+	vpsllq  Q2_H, Q2_H, 10; \
+	vpaddq  Q2_H, Q2_H, ZTMP3; \
+	\
+	/* ; Carry propagation (second pass) */ \
+	/* ; Multiply by 5 the highest bits (above 130 bits) */ \
+	vpaddq  A0, A0, P2_H; \
+	vpsllq  P2_H, P2_H, 2; \
+	vpaddq  A0, A0, P2_H; \
+	vpaddq  B0, B0, Q2_H; \
+	vpsllq  Q2_H, Q2_H, 2; \
+	vpaddq  B0, B0, Q2_H; \
+	\
+	vpsrlq  ZTMP1, A0, 44; \
+	vpandq  A0, A0, [.Lmask_44 ADD_RIP]; \
+	vpaddq  A0, A0, ZTMP5; /* ; Add lowest 44-bit limbs from new blocks to accumulator */ \
+	vpaddq  A1, A1, ZTMP2; /* ; Add middle 44-bit limbs from new blocks to accumulator */ \
+	vpaddq  A1, A1, ZTMP1; \
+	vpsrlq  ZTMP3, B0, 44; \
+	vpandq  B0, B0, [.Lmask_44 ADD_RIP]; \
+	vpaddq  B0, B0, ZTMP8; /* ; Add lowest 44-bit limbs from new blocks to accumulator */ \
+	vpaddq  B1, B1, ZTMP9; /* ; Add middle 44-bit limbs from new blocks to accumulator */ \
+	vpaddq  B1, B1, ZTMP3
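
The load/splat halves of this macro (and of POLY1305_BLOCKS below) turn each
16-byte block into the same three limbs plus the 2^128 padding bit; in plain C
(model only, little-endian as on x86-64):

  #include <stdint.h>
  #include <string.h>

  #define MASK44 ((1ULL << 44) - 1)

  /* Split one 16-byte block into 44/44/42-bit limbs and set the 2^128 padding
   * bit; mirrors the vpunpck/vpsrlq/vpsllq/vpternlogq/vporq sequence above
   * (.Lhigh_bit is 2^40 because 2^128 = 2^40 * 2^88).  */
  static void
  poly1305_block_to_limbs_model (const uint8_t m[16], uint64_t limb[3])
  {
    uint64_t lo, hi;

    memcpy (&lo, m, 8);
    memcpy (&hi, m + 8, 8);
    limb[0] = lo & MASK44;                         /* message bits   0..43 */
    limb[1] = (lo >> 44) | ((hi << 20) & MASK44);  /* message bits  44..87 */
    limb[2] = (hi >> 24) | (1ULL << 40);           /* message bits 88..127 + pad */
  }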
+
+/*
+;; =============================================================================
+;; =============================================================================
+;; Computes hash for 16 16-byte message blocks.
+;;
+;; It first multiplies all 16 blocks with powers of R (8 blocks from A0-A2
+;; and 8 blocks from B0-B2, multiplied by R0-R2 and S0-S2)
+;;
+;;
+;;      a2      a1      a0
+;; ×    b2      b1      b0
+;; ---------------------------------------
+;;     a2×b0   a1×b0   a0×b0
+;; +   a1×b1   a0×b1 5×a2×b1
+;; +   a0×b2 5×a2×b2 5×a1×b2
+;; ---------------------------------------
+;;        p2      p1      p0
+;;
+;; Then, it propagates the carry (higher bits after bit 43) from lower limbs into higher limbs,
+;; multiplying by 5 in case of the carry of p2.
+;;
+;; =============================================================================
+;A0    [in/out] ZMM register containing 1st 44-bit limb of blocks 1-8
+;A1    [in/out] ZMM register containing 2nd 44-bit limb of blocks 1-8
+;A2    [in/out] ZMM register containing 3rd 44-bit limb of blocks 1-8
+;B0    [in/out] ZMM register containing 1st 44-bit limb of blocks 9-16
+;B1    [in/out] ZMM register containing 2nd 44-bit limb of blocks 9-16
+;B2    [in/out] ZMM register containing 3rd 44-bit limb of blocks 9-16
+;R0    [in] ZMM register with the 1st limb of the powers of R applied to A0-A2
+;R1    [in] ZMM register with the 2nd limb of the powers of R applied to A0-A2
+;R2    [in] ZMM register with the 3rd limb of the powers of R applied to A0-A2
+;R1P   [in] ZMM register with the 2nd limb (multiplied by 4*5) applied to A0-A2
+;R2P   [in] ZMM register with the 3rd limb (multiplied by 4*5) applied to A0-A2
+;S0    [in] ZMM register with the 1st limb of the powers of R applied to B0-B2
+;S1    [in] ZMM register with the 2nd limb of the powers of R applied to B0-B2
+;S2    [in] ZMM register with the 3rd limb of the powers of R applied to B0-B2
+;S1P   [in] ZMM register with the 2nd limb (multiplied by 4*5) applied to B0-B2
+;S2P   [in] ZMM register with the 3rd limb (multiplied by 4*5) applied to B0-B2
+;P0_L  [clobbered] ZMM register for the low-half products of p[0], blocks 1-8
+;P0_H  [clobbered] ZMM register for the high-half products of p[0], blocks 1-8
+;P1_L  [clobbered] ZMM register for the low-half products of p[1], blocks 1-8
+;P1_H  [clobbered] ZMM register for the high-half products of p[1], blocks 1-8
+;P2_L  [clobbered] ZMM register for the low-half products of p[2], blocks 1-8
+;P2_H  [clobbered] ZMM register for the high-half products of p[2], blocks 1-8
+;Q0_L  [clobbered] ZMM register for the low-half products of p[0], blocks 9-16
+;Q0_H  [clobbered] ZMM register for the high-half products of p[0], blocks 9-16
+;Q1_L  [clobbered] ZMM register for the low-half products of p[1], blocks 9-16
+;Q1_H  [clobbered] ZMM register for the high-half products of p[1], blocks 9-16
+;Q2_L  [clobbered] ZMM register for the low-half products of p[2], blocks 9-16
+;Q2_H  [clobbered] ZMM register for the high-half products of p[2], blocks 9-16
+;ZTMP1 [clobbered] Temporary ZMM register
+;ZTMP2 [clobbered] Temporary ZMM register
+*/
+#define POLY1305_MUL_REDUCE_VEC16(A0, A1, A2, B0, B1, B2, R0, R1, R2, R1P, R2P,\
+				  S0, S1, S2, S1P, S2P, P0_L, P0_H, P1_L, P1_H,\
+				  P2_L, P2_H, Q0_L, Q0_H, Q1_L, Q1_H, Q2_L,\
+				  Q2_H, ZTMP1, ZTMP2) \
+	/* ;; Reset accumulator */ \
+	vpxorq  P0_L, P0_L, P0_L; \
+	vpxorq  P0_H, P0_H, P0_H; \
+	vpxorq  P1_L, P1_L, P1_L; \
+	vpxorq  P1_H, P1_H, P1_H; \
+	vpxorq  P2_L, P2_L, P2_L; \
+	vpxorq  P2_H, P2_H, P2_H; \
+	vpxorq  Q0_L, Q0_L, Q0_L; \
+	vpxorq  Q0_H, Q0_H, Q0_H; \
+	vpxorq  Q1_L, Q1_L, Q1_L; \
+	vpxorq  Q1_H, Q1_H, Q1_H; \
+	vpxorq  Q2_L, Q2_L, Q2_L; \
+	vpxorq  Q2_H, Q2_H, Q2_H; \
+	\
+	/* ;; This code interleaves hash computation with input loading/splatting */ \
+	\
+	/* ; Calculate products */ \
+	vpmadd52luq P0_L, A2, R1P; \
+	vpmadd52huq P0_H, A2, R1P; \
+	\
+	vpmadd52luq Q0_L, B2, S1P; \
+	vpmadd52huq Q0_H, B2, S1P; \
+	\
+	vpmadd52luq P1_L, A2, R2P; \
+	vpmadd52huq P1_H, A2, R2P; \
+	\
+	vpmadd52luq Q1_L, B2, S2P; \
+	vpmadd52huq Q1_H, B2, S2P; \
+	\
+	vpmadd52luq P0_L, A0, R0; \
+	vpmadd52huq P0_H, A0, R0; \
+	\
+	vpmadd52luq Q0_L, B0, S0; \
+	vpmadd52huq Q0_H, B0, S0; \
+	\
+	vpmadd52luq P2_L, A2, R0; \
+	vpmadd52huq P2_H, A2, R0; \
+	vpmadd52luq Q2_L, B2, S0; \
+	vpmadd52huq Q2_H, B2, S0; \
+	\
+	vpmadd52luq P1_L, A0, R1; \
+	vpmadd52huq P1_H, A0, R1; \
+	vpmadd52luq Q1_L, B0, S1; \
+	vpmadd52huq Q1_H, B0, S1; \
+	\
+	vpmadd52luq P0_L, A1, R2P; \
+	vpmadd52huq P0_H, A1, R2P; \
+	\
+	vpmadd52luq Q0_L, B1, S2P; \
+	vpmadd52huq Q0_H, B1, S2P; \
+	\
+	vpmadd52luq P2_L, A0, R2; \
+	vpmadd52huq P2_H, A0, R2; \
+	\
+	vpmadd52luq Q2_L, B0, S2; \
+	vpmadd52huq Q2_H, B0, S2; \
+	\
+	/* ; Carry propagation (first pass) */ \
+	vpsrlq  ZTMP1, P0_L, 44; \
+	vpsllq  P0_H, P0_H, 8; \
+	vpsrlq  ZTMP2, Q0_L, 44; \
+	vpsllq  Q0_H, Q0_H, 8; \
+	\
+	vpmadd52luq P1_L, A1, R0; \
+	vpmadd52huq P1_H, A1, R0; \
+	vpmadd52luq Q1_L, B1, S0; \
+	vpmadd52huq Q1_H, B1, S0; \
+	\
+	/* ; Carry propagation (first pass) - continue */ \
+	vpandq  A0, P0_L, [.Lmask_44 ADD_RIP]; /* ; Clear top 20 bits */ \
+	vpaddq  P0_H, P0_H, ZTMP1; \
+	vpandq  B0, Q0_L, [.Lmask_44 ADD_RIP]; /* ; Clear top 20 bits */ \
+	vpaddq  Q0_H, Q0_H, ZTMP2; \
+	\
+	vpmadd52luq P2_L, A1, R1; \
+	vpmadd52huq P2_H, A1, R1; \
+	vpmadd52luq Q2_L, B1, S1; \
+	vpmadd52huq Q2_H, B1, S1; \
+	\
+	/* ; Carry propagation (first pass) - continue */ \
+	vpaddq  P1_L, P1_L, P0_H; \
+	vpsllq  P1_H, P1_H, 8; \
+	vpsrlq  ZTMP1, P1_L, 44; \
+	vpandq  A1, P1_L, [.Lmask_44 ADD_RIP]; /* ; Clear top 20 bits */ \
+	vpaddq  Q1_L, Q1_L, Q0_H; \
+	vpsllq  Q1_H, Q1_H, 8; \
+	vpsrlq  ZTMP2, Q1_L, 44; \
+	vpandq  B1, Q1_L, [.Lmask_44 ADD_RIP]; /* ; Clear top 20 bits */ \
+	\
+	vpaddq  P2_L, P2_L, P1_H; /* ; P2_L += P1_H + P1_L[63:44] */ \
+	vpaddq  P2_L, P2_L, ZTMP1; \
+	vpandq  A2, P2_L, [.Lmask_42 ADD_RIP]; /* ; Clear top 22 bits */ \
+	vpsrlq  ZTMP1, P2_L, 42; \
+	vpsllq  P2_H, P2_H, 10; \
+	vpaddq  P2_H, P2_H, ZTMP1; \
+	\
+	vpaddq  Q2_L, Q2_L, Q1_H; /* ; Q2_L += Q1_H + Q1_L[63:44] */ \
+	vpaddq  Q2_L, Q2_L, ZTMP2; \
+	vpandq  B2, Q2_L, [.Lmask_42 ADD_RIP]; /* ; Clear top 22 bits */ \
+	vpsrlq  ZTMP2, Q2_L, 42; \
+	vpsllq  Q2_H, Q2_H, 10; \
+	vpaddq  Q2_H, Q2_H, ZTMP2; \
+	\
+	/* ; Carry propagation (second pass) */ \
+	/* ; Multiply by 5 the highest bits (above 130 bits) */ \
+	vpaddq  A0, A0, P2_H; \
+	vpsllq  P2_H, P2_H, 2; \
+	vpaddq  A0, A0, P2_H; \
+	vpaddq  B0, B0, Q2_H; \
+	vpsllq  Q2_H, Q2_H, 2; \
+	vpaddq  B0, B0, Q2_H; \
+	\
+	vpsrlq  ZTMP1, A0, 44; \
+	vpandq  A0, A0, [.Lmask_44 ADD_RIP]; \
+	vpaddq  A1, A1, ZTMP1; \
+	vpsrlq  ZTMP2, B0, 44; \
+	vpandq  B0, B0, [.Lmask_44 ADD_RIP]; \
+	vpaddq  B1, B1, ZTMP2;
+
+/*
+;; =============================================================================
+;; =============================================================================
+;; Shuffle data blocks, so they match the right power of R.
+;; Powers of R are in this order: R^8 R^4 R^7 R^3 R^6 R^2 R^5 R
+;; Data blocks are coming in this order: A0 A4 A1 A5 A2 A6 A3 A7
+;; Generally the computation is: A0*R^8 + A1*R^7 + A2*R^6 + A3*R^5 +
+;;                               A4*R^4 + A5*R^3 + A6*R^2 + A7*R
+;; When there are fewer data blocks, fewer powers of R are used, so the data
+;; needs to be shuffled. Example: if 4 blocks are left, only A0-A3 are available
+;; and only R-R^4 are used (A0*R^4 + A1*R^3 + A2*R^2 + A3*R), so A0-A3 must be shifted
+;; =============================================================================
+;A_L      [in/out] 0-43 bits of input data
+;A_M      [in/out] 44-87 bits of input data
+;A_H      [in/out] 88-129 bits of input data
+;TMP      [clobbered] Temporary GP register
+;N_BLOCKS [in] Number of remaining input blocks
+*/
+#define SHUFFLE_DATA_SMASK_1 0x39
+#define SHUFFLE_DATA_KMASK_1 0xffff
+#define SHUFFLE_DATA_SMASK_2 0x4E
+#define SHUFFLE_DATA_KMASK_2 0xffff
+#define SHUFFLE_DATA_SMASK_3 0x93
+#define SHUFFLE_DATA_KMASK_3 0xffff
+#define SHUFFLE_DATA_KMASK_4 0xffff
+#define SHUFFLE_DATA_SMASK_5 0x39
+#define SHUFFLE_DATA_KMASK_5 0xfff0
+#define SHUFFLE_DATA_SMASK_6 0x4E
+#define SHUFFLE_DATA_KMASK_6 0xff00
+#define SHUFFLE_DATA_SMASK_7 0x93
+#define SHUFFLE_DATA_KMASK_7 0xf000
+
+#define SHUFFLE_DATA_BLOCKS_GENERIC(A_L, A_M, A_H, TMP, N_BLOCKS) \
+	mov     TMP, SHUFFLE_DATA_KMASK_##N_BLOCKS; \
+	kmovq   k1, TMP; \
+	vpshufd A_L{k1}, A_L, 0x4E; \
+	vpshufd A_M{k1}, A_M, 0x4E; \
+	vpshufd A_H{k1}, A_H, 0x4E; \
+	vshufi64x2 A_L, A_L, A_L, SHUFFLE_DATA_SMASK_##N_BLOCKS; \
+	vshufi64x2 A_M, A_M, A_M, SHUFFLE_DATA_SMASK_##N_BLOCKS; \
+	vshufi64x2 A_H, A_H, A_H, SHUFFLE_DATA_SMASK_##N_BLOCKS
+
+#define SHUFFLE_DATA_BLOCKS_1(A_L, A_M, A_H, TMP) \
+	SHUFFLE_DATA_BLOCKS_GENERIC(A_L, A_M, A_H, TMP, 1)
+
+#define SHUFFLE_DATA_BLOCKS_2(A_L, A_M, A_H, TMP) \
+	SHUFFLE_DATA_BLOCKS_GENERIC(A_L, A_M, A_H, TMP, 2)
+
+#define SHUFFLE_DATA_BLOCKS_3(A_L, A_M, A_H, TMP) \
+	SHUFFLE_DATA_BLOCKS_GENERIC(A_L, A_M, A_H, TMP, 3)
+
+#define SHUFFLE_DATA_BLOCKS_4(A_L, A_M, A_H, TMP) \
+	mov     TMP, SHUFFLE_DATA_KMASK_4; \
+	kmovq   k1, TMP; \
+	vpshufd A_L{k1}, A_L, 0x4E; \
+	vpshufd A_M{k1}, A_M, 0x4E; \
+	vpshufd A_H{k1}, A_H, 0x4E;
+
+#define SHUFFLE_DATA_BLOCKS_5(A_L, A_M, A_H, TMP) \
+	SHUFFLE_DATA_BLOCKS_GENERIC(A_L, A_M, A_H, TMP, 5)
+
+#define SHUFFLE_DATA_BLOCKS_6(A_L, A_M, A_H, TMP) \
+	SHUFFLE_DATA_BLOCKS_GENERIC(A_L, A_M, A_H, TMP, 6)
+
+#define SHUFFLE_DATA_BLOCKS_7(A_L, A_M, A_H, TMP) \
+	SHUFFLE_DATA_BLOCKS_GENERIC(A_L, A_M, A_H, TMP, 7)
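
Why pairing blocks with fixed powers of R works: unrolling the scalar
recurrence H = (H + m_i) * R over eight blocks gives

   H_new = (H_old + m_1)*R^8 + m_2*R^7 + m_3*R^6 + m_4*R^5
                             + m_5*R^4 + m_6*R^3 + m_7*R^2 + m_8*R   (mod 2^130-5)

so each lane multiplies its block by one fixed power and the lanes are summed
afterwards (the horizontal adds in POLY1305_BLOCKS below). With n < 8 blocks
left the same identity holds with R^n..R instead, which is why the data lanes
have to be rotated to meet the matching powers.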
+
+/*
+;; =============================================================================
+;; =============================================================================
+;; Computes hash for message length being multiple of block size
+;; =============================================================================
+;MSG    [in/out] GPR pointer to input message (updated)
+;LEN    [in/out] GPR in: length in bytes / out: length mod 16
+;A0     [in/out] accumulator bits 63..0
+;A1     [in/out] accumulator bits 127..64
+;A2     [in/out] accumulator bits 191..128 (only bits 129..128 used)
+;R0     [in] R constant bits 63..0
+;R1     [in] R constant bits 127..64
+;T0     [clobbered] GPR register
+;T1     [clobbered] GPR register
+;T2     [clobbered] GPR register
+;T3     [clobbered] GPR register
+;GP_RAX [clobbered] RAX register
+;GP_RDX [clobbered] RDX register
+*/
+#define POLY1305_BLOCKS(MSG, LEN, A0, A1, A2, R0, R1, T0, T1, T2, T3, \
+			GP_RAX, GP_RDX) \
+	/* ; Minimum of 256 bytes to run vectorized code */ \
+	cmp     LEN, POLY1305_BLOCK_SIZE*16; \
+	jb      .L_final_loop; \
+	\
+	/* ; Spread accumulator into 44-bit limbs in quadwords */ \
+	mov     T0, A0; \
+	and     T0, [.Lmask_44 ADD_RIP]; /* ;; First limb (A[43:0]) */ \
+	vmovq   xmm5, T0; \
+	\
+	mov     T0, A1; \
+	shrd    A0, T0, 44; \
+	and     A0, [.Lmask_44 ADD_RIP]; /* ;; Second limb (A[87:44]) */ \
+	vmovq   xmm6, A0; \
+	\
+	shrd    A1, A2, 24; \
+	and     A1, [.Lmask_42 ADD_RIP]; /* ;; Third limb (A[129:88]) */ \
+	vmovq   xmm7, A1; \
+	\
+	/* ; Load first block of data (128 bytes) */ \
+	vmovdqu64 zmm0, [MSG]; \
+	vmovdqu64 zmm1, [MSG + 64]; \
+	\
+	/* ; Interleave the data to form 44-bit limbs */ \
+	/* ; */ \
+	/* ; zmm13 to have bits 0-43 of all 8 blocks in 8 qwords */ \
+	/* ; zmm14 to have bits 87-44 of all 8 blocks in 8 qwords */ \
+	/* ; zmm15 to have bits 127-88 of all 8 blocks in 8 qwords */ \
+	vpunpckhqdq zmm15, zmm0, zmm1; \
+	vpunpcklqdq zmm13, zmm0, zmm1; \
+	\
+	vpsrlq  zmm14, zmm13, 44; \
+	vpsllq  zmm18, zmm15, 20; \
+	vpternlogq zmm14, zmm18, [.Lmask_44 ADD_RIP], 0xA8; /* ; (A OR B AND C) */ \
+	\
+	vpandq  zmm13, zmm13, [.Lmask_44 ADD_RIP]; \
+	vpsrlq  zmm15, zmm15, 24; \
+	\
+	/* ; Add 2^128 to all 8 final qwords of the message */ \
+	vporq   zmm15, zmm15, [.Lhigh_bit ADD_RIP]; \
+	\
+	vpaddq  zmm13, zmm13, zmm5; \
+	vpaddq  zmm14, zmm14, zmm6; \
+	vpaddq  zmm15, zmm15, zmm7; \
+	\
+	/* ; Load next blocks of data (128 bytes) */ \
+	vmovdqu64 zmm0, [MSG + 64*2]; \
+	vmovdqu64 zmm1, [MSG + 64*3]; \
+	\
+	/* ; Interleave the data to form 44-bit limbs */ \
+	/* ; */ \
+	/* ; zmm13 to have bits 0-43 of all 8 blocks in 8 qwords */ \
+	/* ; zmm14 to have bits 87-44 of all 8 blocks in 8 qwords */ \
+	/* ; zmm15 to have bits 127-88 of all 8 blocks in 8 qwords */ \
+	vpunpckhqdq zmm18, zmm0, zmm1; \
+	vpunpcklqdq zmm16, zmm0, zmm1; \
+	\
+	vpsrlq  zmm17, zmm16, 44; \
+	vpsllq  zmm19, zmm18, 20; \
+	vpternlogq zmm17, zmm19, [.Lmask_44 ADD_RIP], 0xA8; /* ; (A OR B AND C) */ \
+	\
+	vpandq  zmm16, zmm16, [.Lmask_44 ADD_RIP]; \
+	vpsrlq  zmm18, zmm18, 24; \
+	\
+	/* ; Add 2^128 to all 8 final qwords of the message */ \
+	vporq   zmm18, zmm18, [.Lhigh_bit ADD_RIP]; \
+	\
+	/* ; Use memory in stack to save powers of R, before loading them into ZMM registers */ \
+	/* ; The first 16*8 bytes will contain the 16 bytes of the 8 powers of R */ \
+	/* ; The last 64 bytes will contain the last 2 bits of powers of R, spread in 8 qwords, */ \
+	/* ; to be OR'd with the highest qwords (in zmm26) */ \
+	vmovq   xmm3, R0; \
+	vpinsrq xmm3, xmm3, R1, 1; \
+	vinserti32x4 zmm1, zmm1, xmm3, 3; \
+	\
+	vpxorq  zmm0, zmm0, zmm0; \
+	vpxorq  zmm2, zmm2, zmm2; \
+	\
+	/* ; Calculate R^2 */ \
+	mov     T0, R1; \
+	shr     T0, 2; \
+	add     T0, R1; /* ;; T0 = R1 + (R1 >> 2) */ \
+	\
+	mov     A0, R0; \
+	mov     A1, R1; \
+	\
+	POLY1305_MUL_REDUCE(A0, A1, A2, R0, R1, T0, T1, T2, T3, GP_RAX, GP_RDX, A2_ZERO); \
+	\
+	vmovq   xmm3, A0; \
+	vpinsrq xmm3, xmm3, A1, 1; \
+	vinserti32x4 zmm1, zmm1, xmm3, 2; \
+	\
+	vmovq   xmm4, A2; \
+	vinserti32x4 zmm2, zmm2, xmm4, 2; \
+	\
+	/* ; Calculate R^3 */ \
+	POLY1305_MUL_REDUCE(A0, A1, A2, R0, R1, T0, T1, T2, T3, GP_RAX, GP_RDX, A2_NOT_ZERO); \
+	\
+	vmovq   xmm3, A0; \
+	vpinsrq xmm3, xmm3, A1, 1; \
+	vinserti32x4 zmm1, zmm1, xmm3, 1; \
+	\
+	vmovq   xmm4, A2; \
+	vinserti32x4 zmm2, zmm2, xmm4, 1; \
+	\
+	/* ; Calculate R^4 */ \
+	POLY1305_MUL_REDUCE(A0, A1, A2, R0, R1, T0, T1, T2, T3, GP_RAX, GP_RDX, A2_NOT_ZERO); \
+	\
+	vmovq   xmm3, A0; \
+	vpinsrq xmm3, xmm3, A1, 1; \
+	vinserti32x4 zmm1, zmm1, xmm3, 0; \
+	\
+	vmovq   xmm4, A2; \
+	vinserti32x4 zmm2, zmm2, xmm4, 0; \
+	\
+	/* ; Move 2 MSbits to top 24 bits, to be OR'ed later */ \
+	vpsllq  zmm2, zmm2, 40; \
+	\
+	vpunpckhqdq zmm21, zmm1, zmm0; \
+	vpunpcklqdq zmm19, zmm1, zmm0; \
+	\
+	vpsrlq  zmm20, zmm19, 44; \
+	vpsllq  zmm4, zmm21, 20; \
+	vpternlogq zmm20, zmm4, [.Lmask_44 ADD_RIP], 0xA8; /* ; (A OR B AND C) */ \
+	\
+	vpandq  zmm19, zmm19, [.Lmask_44 ADD_RIP]; \
+	vpsrlq  zmm21, zmm21, 24; \
+	\
+	/* ; zmm2 contains the 2 highest bits of the powers of R */ \
+	vporq   zmm21, zmm21, zmm2; \
+	\
+	/* ; Broadcast 44-bit limbs of R^4 */ \
+	mov     T0, A0; \
+	and     T0, [.Lmask_44 ADD_RIP]; /* ;; First limb (R^4[43:0]) */ \
+	vpbroadcastq zmm22, T0; \
+	\
+	mov     T0, A1; \
+	shrd    A0, T0, 44; \
+	and     A0, [.Lmask_44 ADD_RIP]; /* ;; Second limb (R^4[87:44]) */ \
+	vpbroadcastq zmm23, A0; \
+	\
+	shrd    A1, A2, 24; \
+	and     A1, [.Lmask_42 ADD_RIP]; /* ;; Third limb (R^4[129:88]) */ \
+	vpbroadcastq zmm24, A1; \
+	\
+	/* ; Generate 4*5*R^4 */ \
+	vpsllq  zmm25, zmm23, 2; \
+	vpsllq  zmm26, zmm24, 2; \
+	\
+	/* ; 5*R^4 */ \
+	vpaddq  zmm25, zmm25, zmm23; \
+	vpaddq  zmm26, zmm26, zmm24; \
+	\
+	/* ; 4*5*R^4 */ \
+	vpsllq  zmm25, zmm25, 2; \
+	vpsllq  zmm26, zmm26, 2; \
+	\
+	vpslldq zmm29, zmm19, 8; \
+	vpslldq zmm30, zmm20, 8; \
+	vpslldq zmm31, zmm21, 8; \
+	\
+	/* ; Calculate R^8-R^5 */ \
+	POLY1305_MUL_REDUCE_VEC(zmm19, zmm20, zmm21, \
+				zmm22, zmm23, zmm24, \
+				zmm25, zmm26, \
+				zmm5, zmm6, zmm7, zmm8, zmm9, zmm10, \
+				zmm11); \
+	\
+	/* ; Interleave powers of R: R^8 R^4 R^7 R^3 R^6 R^2 R^5 R */ \
+	vporq   zmm19, zmm19, zmm29; \
+	vporq   zmm20, zmm20, zmm30; \
+	vporq   zmm21, zmm21, zmm31; \
+	\
+	/* ; Broadcast R^8 */ \
+	vpbroadcastq zmm22, xmm19; \
+	vpbroadcastq zmm23, xmm20; \
+	vpbroadcastq zmm24, xmm21; \
+	\
+	/* ; Generate 4*5*R^8 */ \
+	vpsllq  zmm25, zmm23, 2; \
+	vpsllq  zmm26, zmm24, 2; \
+	\
+	/* ; 5*R^8 */ \
+	vpaddq  zmm25, zmm25, zmm23; \
+	vpaddq  zmm26, zmm26, zmm24; \
+	\
+	/* ; 4*5*R^8 */ \
+	vpsllq  zmm25, zmm25, 2; \
+	vpsllq  zmm26, zmm26, 2; \
+	\
+	cmp     LEN, POLY1305_BLOCK_SIZE*32; \
+	jb      .L_len_256_511; \
+	\
+	/* ; Store R^8-R for later use */ \
+	vmovdqa64 [rsp + STACK_r_save], zmm19; \
+	vmovdqa64 [rsp + STACK_r_save + 64], zmm20; \
+	vmovdqa64 [rsp + STACK_r_save + 64*2], zmm21; \
+	\
+	/* ; Calculate R^16-R^9 */ \
+	POLY1305_MUL_REDUCE_VEC(zmm19, zmm20, zmm21, \
+				zmm22, zmm23, zmm24, \
+				zmm25, zmm26, \
+				zmm5, zmm6, zmm7, zmm8, zmm9, zmm10, \
+				zmm11); \
+	\
+	/* ; Store R^16-R^9 for later use */ \
+	vmovdqa64 [rsp + STACK_r_save + 64*3], zmm19; \
+	vmovdqa64 [rsp + STACK_r_save + 64*4], zmm20; \
+	vmovdqa64 [rsp + STACK_r_save + 64*5], zmm21; \
+	\
+	/* ; Broadcast R^16 */ \
+	vpbroadcastq zmm22, xmm19; \
+	vpbroadcastq zmm23, xmm20; \
+	vpbroadcastq zmm24, xmm21; \
+	\
+	/* ; Generate 4*5*R^16 */ \
+	vpsllq  zmm25, zmm23, 2; \
+	vpsllq  zmm26, zmm24, 2; \
+	\
+	/* ; 5*R^16 */ \
+	vpaddq  zmm25, zmm25, zmm23; \
+	vpaddq  zmm26, zmm26, zmm24; \
+	\
+	/* ; 4*5*R^16 */ \
+	vpsllq  zmm25, zmm25, 2; \
+	vpsllq  zmm26, zmm26, 2; \
+	\
+	mov     T0, LEN; \
+	and     T0, 0xffffffffffffff00; /* ; multiple of 256 bytes */ \
+	\
+.L_poly1305_blocks_loop: \
+	cmp     T0, POLY1305_BLOCK_SIZE*16; \
+	jbe     .L_poly1305_blocks_loop_end; \
+	\
+	/* ; zmm13-zmm18 contain the 16 blocks of message plus the previous accumulator */ \
+	/* ; zmm22-24 contain the 5x44-bit limbs of the powers of R */ \
+	/* ; zmm25-26 contain the 5x44-bit limbs of the powers of R' (5*4*R) */ \
+	POLY1305_MSG_MUL_REDUCE_VEC16(zmm13, zmm14, zmm15, zmm16, zmm17, zmm18, \
+				      zmm22, zmm23, zmm24, zmm25, zmm26, \
+				      zmm5, zmm6, zmm7, zmm8, zmm9, zmm10, \
+				      zmm19, zmm20, zmm21, zmm27, zmm28, zmm29, \
+				      zmm30, zmm31, zmm11, zmm0, zmm1, \
+				      zmm2, zmm3, zmm4, zmm12, MSG, T0); \
+	\
+	jmp     .L_poly1305_blocks_loop; \
+	\
+.L_poly1305_blocks_loop_end: \
+	\
+	/* ;; Need to multiply by r^16, r^15, r^14... r */ \
+	\
+	/* ; First multiply by r^16-r^9 */ \
+	\
+	/* ; Read R^16-R^9 */ \
+	vmovdqa64 zmm19, [rsp + STACK_r_save + 64*3]; \
+	vmovdqa64 zmm20, [rsp + STACK_r_save + 64*4]; \
+	vmovdqa64 zmm21, [rsp + STACK_r_save + 64*5]; \
+	/* ; Read R^8-R */ \
+	vmovdqa64 zmm22, [rsp + STACK_r_save]; \
+	vmovdqa64 zmm23, [rsp + STACK_r_save + 64]; \
+	vmovdqa64 zmm24, [rsp + STACK_r_save + 64*2]; \
+	\
+	/* ; zmm27 to have bits 87-44 of all 9-16th powers of R' in 8 qwords */ \
+	/* ; zmm28 to have bits 129-88 of all 9-16th powers of R' in 8 qwords */ \
+	vpsllq  zmm0, zmm20, 2; \
+	vpaddq  zmm27, zmm20, zmm0; /* ; R1' (R1*5) */ \
+	vpsllq  zmm1, zmm21, 2; \
+	vpaddq  zmm28, zmm21, zmm1; /* ; R2' (R2*5) */ \
+	\
+	/* ; 4*5*R */ \
+	vpsllq  zmm27, zmm27, 2; \
+	vpsllq  zmm28, zmm28, 2; \
+	\
+	/* ; Then multiply by r^8-r */ \
+	\
+	/* ; zmm25 to have bits 87-44 of all 1-8th powers of R' in 8 qwords */ \
+	/* ; zmm26 to have bits 129-88 of all 1-8th powers of R' in 8 qwords */ \
+	vpsllq  zmm2, zmm23, 2; \
+	vpaddq  zmm25, zmm23, zmm2; /* ; R1' (R1*5) */ \
+	vpsllq  zmm3, zmm24, 2; \
+	vpaddq  zmm26, zmm24, zmm3; /* ; R2' (R2*5) */ \
+	\
+	/* ; 4*5*R */ \
+	vpsllq  zmm25, zmm25, 2; \
+	vpsllq  zmm26, zmm26, 2; \
+	\
+	POLY1305_MUL_REDUCE_VEC16(zmm13, zmm14, zmm15, zmm16, zmm17, zmm18, \
+				  zmm19, zmm20, zmm21, zmm27, zmm28, \
+				  zmm22, zmm23, zmm24, zmm25, zmm26, \
+				  zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, \
+				  zmm7, zmm8, zmm9, zmm10, zmm11, zmm12, zmm29); \
+	\
+	/* ;; Add all blocks (horizontally) */ \
+	vpaddq  zmm13, zmm13, zmm16; \
+	vpaddq  zmm14, zmm14, zmm17; \
+	vpaddq  zmm15, zmm15, zmm18; \
+	\
+	vextracti64x4   ymm0, zmm13, 1; \
+	vextracti64x4   ymm1, zmm14, 1; \
+	vextracti64x4   ymm2, zmm15, 1; \
+	\
+	vpaddq  ymm13, ymm13, ymm0; \
+	vpaddq  ymm14, ymm14, ymm1; \
+	vpaddq  ymm15, ymm15, ymm2; \
+	\
+	vextracti32x4   xmm10, ymm13, 1; \
+	vextracti32x4   xmm11, ymm14, 1; \
+	vextracti32x4   xmm12, ymm15, 1; \
+	\
+	vpaddq  xmm13, xmm13, xmm10; \
+	vpaddq  xmm14, xmm14, xmm11; \
+	vpaddq  xmm15, xmm15, xmm12; \
+	\
+	vpsrldq xmm10, xmm13, 8; \
+	vpsrldq xmm11, xmm14, 8; \
+	vpsrldq xmm12, xmm15, 8; \
+	\
+	/* ; Finish folding and clear second qword */ \
+	mov     T0, 0xfd; \
+	kmovq   k1, T0; \
+	vpaddq  xmm13{k1}{z}, xmm13, xmm10; \
+	vpaddq  xmm14{k1}{z}, xmm14, xmm11; \
+	vpaddq  xmm15{k1}{z}, xmm15, xmm12; \
+	\
+	add     MSG, POLY1305_BLOCK_SIZE*16; \
+	\
+	and     LEN, (POLY1305_BLOCK_SIZE*16 - 1); /* ; Get remaining lengths (LEN < 256 bytes) */ \
+	\
+.L_less_than_256: \
+	\
+	cmp     LEN, POLY1305_BLOCK_SIZE*8; \
+	jb      .L_less_than_128; \
+	\
+	/* ; Read next 128 bytes */ \
+	/* ; Load first block of data (128 bytes) */ \
+	vmovdqu64 zmm0, [MSG]; \
+	vmovdqu64 zmm1, [MSG + 64]; \
+	\
+	/* ; Interleave the data to form 44-bit limbs */ \
+	/* ; */ \
+	/* ; zmm13 to have bits 0-43 of all 8 blocks in 8 qwords */ \
+	/* ; zmm14 to have bits 87-44 of all 8 blocks in 8 qwords */ \
+	/* ; zmm15 to have bits 127-88 of all 8 blocks in 8 qwords */ \
+	vpunpckhqdq zmm5, zmm0, zmm1; \
+	vpunpcklqdq zmm3, zmm0, zmm1; \
+	\
+	vpsrlq  zmm4, zmm3, 44; \
+	vpsllq  zmm8, zmm5, 20; \
+	vpternlogq zmm4, zmm8, [.Lmask_44 ADD_RIP], 0xA8; /* ; (A OR B AND C) */ \
+	\
+	vpandq  zmm3, zmm3, [.Lmask_44 ADD_RIP]; \
+	vpsrlq  zmm5, zmm5, 24; \
+	\
+	/* ; Add 2^128 to all 8 final qwords of the message */ \
+	vporq   zmm5, zmm5, [.Lhigh_bit ADD_RIP]; \
+	\
+	vpaddq  zmm13, zmm13, zmm3; \
+	vpaddq  zmm14, zmm14, zmm4; \
+	vpaddq  zmm15, zmm15, zmm5; \
+	\
+	add     MSG, POLY1305_BLOCK_SIZE*8; \
+	sub     LEN, POLY1305_BLOCK_SIZE*8; \
+	\
+	POLY1305_MUL_REDUCE_VEC(zmm13, zmm14, zmm15, \
+				zmm22, zmm23, zmm24, \
+				zmm25, zmm26, \
+				zmm5, zmm6, zmm7, zmm8, zmm9, zmm10, \
+				zmm11); \
+	\
+	/* ;; Add all blocks (horizontally) */ \
+	vextracti64x4   ymm0, zmm13, 1; \
+	vextracti64x4   ymm1, zmm14, 1; \
+	vextracti64x4   ymm2, zmm15, 1; \
+	\
+	vpaddq  ymm13, ymm13, ymm0; \
+	vpaddq  ymm14, ymm14, ymm1; \
+	vpaddq  ymm15, ymm15, ymm2; \
+	\
+	vextracti32x4   xmm10, ymm13, 1; \
+	vextracti32x4   xmm11, ymm14, 1; \
+	vextracti32x4   xmm12, ymm15, 1; \
+	\
+	vpaddq  xmm13, xmm13, xmm10; \
+	vpaddq  xmm14, xmm14, xmm11; \
+	vpaddq  xmm15, xmm15, xmm12; \
+	\
+	vpsrldq xmm10, xmm13, 8; \
+	vpsrldq xmm11, xmm14, 8; \
+	vpsrldq xmm12, xmm15, 8; \
+	\
+	/* ; Finish folding and clear second qword */ \
+	mov     T0, 0xfd; \
+	kmovq   k1, T0; \
+	vpaddq  xmm13{k1}{z}, xmm13, xmm10; \
+	vpaddq  xmm14{k1}{z}, xmm14, xmm11; \
+	vpaddq  xmm15{k1}{z}, xmm15, xmm12; \
+	\
+.L_less_than_128: \
+	cmp     LEN, 32; /* ; If remaining bytes is <= 32, perform last blocks in scalar */ \
+	jbe     .L_simd_to_gp; \
+	\
+	mov     T0, LEN; \
+	and     T0, 0x3f; \
+	lea     T1, [.Lbyte64_len_to_mask_table ADD_RIP]; \
+	mov     T1, [T1 + 8*T0]; \
+	\
+	/* ; Load default byte masks */ \
+	mov     T2, 0xffffffffffffffff; \
+	xor     T3, T3; \
+	\
+	cmp     LEN, 64; \
+	cmovb   T2, T1; /* ; Load mask for first 64 bytes */ \
+	cmovg   T3, T1; /* ; Load mask for second 64 bytes */ \
+	\
+	kmovq   k1, T2; \
+	kmovq   k2, T3; \
+	vmovdqu8 zmm0{k1}{z}, [MSG]; \
+	vmovdqu8 zmm1{k2}{z}, [MSG + 64]; \
+	\
+	/* ; Pad last block message, if partial */ \
+	mov     T0, LEN; \
+	and     T0, 0x70; /* ; Multiple of 16 bytes */ \
+	/* ; Load last block of data (up to 112 bytes) */ \
+	shr     T0, 3; /* ; Get number of full qwords */ \
+	\
+	/* ; Interleave the data to form 44-bit limbs */ \
+	/* ; */ \
+	/* ; zmm13 to have bits 0-43 of all 8 blocks in 8 qwords */ \
+	/* ; zmm14 to have bits 87-44 of all 8 blocks in 8 qwords */ \
+	/* ; zmm15 to have bits 127-88 of all 8 blocks in 8 qwords */ \
+	vpunpckhqdq zmm4, zmm0, zmm1; \
+	vpunpcklqdq zmm2, zmm0, zmm1; \
+	\
+	vpsrlq  zmm3, zmm2, 44; \
+	vpsllq  zmm28, zmm4, 20; \
+	vpternlogq zmm3, zmm28, [.Lmask_44 ADD_RIP], 0xA8; /* ; (A OR B AND C) */ \
+	\
+	vpandq  zmm2, zmm2, [.Lmask_44 ADD_RIP]; \
+	vpsrlq  zmm4, zmm4, 24; \
+	\
+	lea     T1, [.Lqword_high_bit_mask ADD_RIP]; \
+	kmovb   k1, [T1 + T0]; \
+	/* ; Add 2^128 to final qwords of the message (all full blocks and partial block, */ \
+	/* ; if "pad_to_16" is selected) */ \
+	vporq   zmm4{k1}, zmm4, [.Lhigh_bit ADD_RIP]; \
+	\
+	vpaddq  zmm13, zmm13, zmm2; \
+	vpaddq  zmm14, zmm14, zmm3; \
+	vpaddq  zmm15, zmm15, zmm4; \
+	\
+	mov     T0, LEN; \
+	add     T0, 15; \
+	shr     T0, 4;      /* ; Get number of 16-byte blocks (including partial blocks) */ \
+	xor     LEN, LEN; /* ; All length will be consumed */ \
+	\
+	/* ; No need to shuffle data blocks (data is in the right order) */ \
+	cmp     T0, 8; \
+	je      .L_end_shuffle; \
+	\
+	cmp     T0, 4; \
+	je      .L_shuffle_blocks_4; \
+	jb      .L_shuffle_blocks_3; \
+	\
+	/* ; Number of 16-byte blocks > 4 */ \
+	cmp     T0, 6; \
+	je      .L_shuffle_blocks_6; \
+	ja      .L_shuffle_blocks_7; \
+	jmp     .L_shuffle_blocks_5; \
+	\
+.L_shuffle_blocks_3: \
+	SHUFFLE_DATA_BLOCKS_3(zmm13, zmm14, zmm15, T1); \
+	jmp     .L_end_shuffle; \
+.L_shuffle_blocks_4: \
+	SHUFFLE_DATA_BLOCKS_4(zmm13, zmm14, zmm15, T1); \
+	jmp     .L_end_shuffle; \
+.L_shuffle_blocks_5: \
+	SHUFFLE_DATA_BLOCKS_5(zmm13, zmm14, zmm15, T1); \
+	jmp     .L_end_shuffle; \
+.L_shuffle_blocks_6: \
+	SHUFFLE_DATA_BLOCKS_6(zmm13, zmm14, zmm15, T1); \
+	jmp     .L_end_shuffle; \
+.L_shuffle_blocks_7: \
+	SHUFFLE_DATA_BLOCKS_7(zmm13, zmm14, zmm15, T1); \
+	\
+.L_end_shuffle: \
+	\
+	/* ; zmm13-zmm15 contain the 8 blocks of message plus the previous accumulator */ \
+	/* ; zmm22-24 contain the 3x44-bit limbs of the powers of R */ \
+	/* ; zmm25-26 contain the 3x44-bit limbs of the powers of R' (5*4*R) */ \
+	POLY1305_MUL_REDUCE_VEC(zmm13, zmm14, zmm15, \
+				zmm22, zmm23, zmm24, \
+				zmm25, zmm26, \
+				zmm5, zmm6, zmm7, zmm8, zmm9, zmm10, \
+				zmm11); \
+	\
+	/* ;; Add all blocks (horizontally) */ \
+	vextracti64x4   ymm0, zmm13, 1; \
+	vextracti64x4   ymm1, zmm14, 1; \
+	vextracti64x4   ymm2, zmm15, 1; \
+	\
+	vpaddq  ymm13, ymm13, ymm0; \
+	vpaddq  ymm14, ymm14, ymm1; \
+	vpaddq  ymm15, ymm15, ymm2; \
+	\
+	vextracti32x4   xmm10, ymm13, 1; \
+	vextracti32x4   xmm11, ymm14, 1; \
+	vextracti32x4   xmm12, ymm15, 1; \
+	\
+	vpaddq  xmm13, xmm13, xmm10; \
+	vpaddq  xmm14, xmm14, xmm11; \
+	vpaddq  xmm15, xmm15, xmm12; \
+	\
+	vpsrldq xmm10, xmm13, 8; \
+	vpsrldq xmm11, xmm14, 8; \
+	vpsrldq xmm12, xmm15, 8; \
+	\
+	vpaddq  xmm13, xmm13, xmm10; \
+	vpaddq  xmm14, xmm14, xmm11; \
+	vpaddq  xmm15, xmm15, xmm12; \
+	\
+.L_simd_to_gp: \
+	/* ; Carry propagation */ \
+	vpsrlq  xmm0, xmm13, 44; \
+	vpandq  xmm13, xmm13, [.Lmask_44 ADD_RIP]; /* ; Clear top 20 bits */ \
+	vpaddq  xmm14, xmm14, xmm0; \
+	vpsrlq  xmm0, xmm14, 44; \
+	vpandq  xmm14, xmm14, [.Lmask_44 ADD_RIP]; /* ; Clear top 20 bits */ \
+	vpaddq  xmm15, xmm15, xmm0; \
+	vpsrlq  xmm0, xmm15, 42; \
+	vpandq  xmm15, xmm15, [.Lmask_42 ADD_RIP]; /* ; Clear top 22 bits */ \
+	vpsllq  xmm1, xmm0, 2; \
+	vpaddq  xmm0, xmm0, xmm1; \
+	vpaddq  xmm13, xmm13, xmm0; \
+	\
+	/* ; Put together A */ \
+	vmovq   A0, xmm13; \
+	\
+	vmovq   T0, xmm14; \
+	mov     T1, T0; \
+	shl     T1, 44; \
+	or      A0, T1; \
+	\
+	shr     T0, 20; \
+	vmovq   A2, xmm15; \
+	mov     A1, A2; \
+	shl     A1, 24; \
+	or      A1, T0; \
+	shr     A2, 40; \
+	\
+	/* ; Clear powers of R */ \
+	vpxorq  zmm0, zmm0, zmm0; \
+	vmovdqa64 [rsp + STACK_r_save], zmm0; \
+	vmovdqa64 [rsp + STACK_r_save + 64], zmm0; \
+	vmovdqa64 [rsp + STACK_r_save + 64*2], zmm0; \
+	vmovdqa64 [rsp + STACK_r_save + 64*3], zmm0; \
+	vmovdqa64 [rsp + STACK_r_save + 64*4], zmm0; \
+	vmovdqa64 [rsp + STACK_r_save + 64*5], zmm0; \
+	\
+	vzeroall; \
+	clear_zmm(xmm16); clear_zmm(xmm20); clear_zmm(xmm24); clear_zmm(xmm28); \
+	clear_zmm(xmm17); clear_zmm(xmm21); clear_zmm(xmm25); clear_zmm(xmm29); \
+	clear_zmm(xmm18); clear_zmm(xmm22); clear_zmm(xmm26); clear_zmm(xmm30); \
+	clear_zmm(xmm19); clear_zmm(xmm23); clear_zmm(xmm27); clear_zmm(xmm31); \
+	\
+.L_final_loop: \
+	cmp     LEN, POLY1305_BLOCK_SIZE; \
+	jb      .L_poly1305_blocks_exit; \
+	\
+	/* ;; A += MSG[i] */ \
+	add     A0, [MSG + 0]; \
+	adc     A1, [MSG + 8]; \
+	adc     A2, 1; /* ;; padding bit (input is always full 16-byte blocks) */ \
+	\
+	mov     T0, R1; \
+	shr     T0, 2; \
+	add     T0, R1; /* ;; T0 = R1 + (R1 >> 2) */ \
+	\
+	POLY1305_MUL_REDUCE(A0, A1, A2, R0, R1, \
+			    T0, T1, T2, T3, GP_RAX, GP_RDX, A2_NOT_ZERO); \
+	\
+	add     MSG, POLY1305_BLOCK_SIZE; \
+	sub     LEN, POLY1305_BLOCK_SIZE; \
+	\
+	jmp     .L_final_loop; \
+	\
+.L_len_256_511: \
+	\
+	/* ; zmm13-zmm15 contain the 8 blocks of message plus the previous accumulator */ \
+	/* ; zmm22-24 contain the 3x44-bit limbs of the powers of R */ \
+	/* ; zmm25-26 contain the 3x44-bit limbs of the powers of R' (5*4*R) */ \
+	POLY1305_MUL_REDUCE_VEC(zmm13, zmm14, zmm15, \
+				zmm22, zmm23, zmm24, \
+				zmm25, zmm26, \
+				zmm5, zmm6, zmm7, zmm8, zmm9, zmm10, \
+				zmm11); \
+	\
+	/* ; Then multiply by r^8-r */ \
+	\
+	/* ; zmm19-zmm21 contains R^8-R, need to move it to zmm22-24, */ \
+	/* ; as it might be used in other part of the code */ \
+	vmovdqa64 zmm22, zmm19; \
+	vmovdqa64 zmm23, zmm20; \
+	vmovdqa64 zmm24, zmm21; \
+	\
+	/* ; zmm25 to have bits 87-44 of all 8 powers of R' in 8 qwords */ \
+	/* ; zmm26 to have bits 129-88 of all 8 powers of R' in 8 qwords */ \
+	vpsllq  zmm0, zmm23, 2; \
+	vpaddq  zmm25, zmm23, zmm0; /* ; R1' (R1*5) */ \
+	vpsllq  zmm1, zmm24, 2; \
+	vpaddq  zmm26, zmm24, zmm1; /* ; R2' (R2*5) */ \
+	\
+	/* ; 4*5*R^8 */ \
+	vpsllq  zmm25, zmm25, 2; \
+	vpsllq  zmm26, zmm26, 2; \
+	\
+	vpaddq  zmm13, zmm13, zmm16; \
+	vpaddq  zmm14, zmm14, zmm17; \
+	vpaddq  zmm15, zmm15, zmm18; \
+	\
+	/* ; zmm13-zmm15 contain the 8 blocks of message plus the previous accumulator */ \
+	/* ; zmm22-24 contain the 3x44-bit limbs of the powers of R */ \
+	/* ; zmm25-26 contain the 3x44-bit limbs of the powers of R' (5*4*R) */ \
+	POLY1305_MUL_REDUCE_VEC(zmm13, zmm14, zmm15, \
+				zmm22, zmm23, zmm24, \
+				zmm25, zmm26, \
+				zmm5, zmm6, zmm7, zmm8, zmm9, zmm10, \
+				zmm11); \
+	\
+	/* ;; Add all blocks (horizontally) */ \
+	vextracti64x4   ymm0, zmm13, 1; \
+	vextracti64x4   ymm1, zmm14, 1; \
+	vextracti64x4   ymm2, zmm15, 1; \
+	\
+	vpaddq  ymm13, ymm13, ymm0; \
+	vpaddq  ymm14, ymm14, ymm1; \
+	vpaddq  ymm15, ymm15, ymm2; \
+	\
+	vextracti32x4   xmm10, ymm13, 1; \
+	vextracti32x4   xmm11, ymm14, 1; \
+	vextracti32x4   xmm12, ymm15, 1; \
+	\
+	vpaddq  xmm13, xmm13, xmm10; \
+	vpaddq  xmm14, xmm14, xmm11; \
+	vpaddq  xmm15, xmm15, xmm12; \
+	\
+	vpsrldq xmm10, xmm13, 8; \
+	vpsrldq xmm11, xmm14, 8; \
+	vpsrldq xmm12, xmm15, 8; \
+	\
+	/* ; Finish folding and clear second qword */ \
+	mov     T0, 0xfd; \
+	kmovq   k1, T0; \
+	vpaddq  xmm13{k1}{z}, xmm13, xmm10; \
+	vpaddq  xmm14{k1}{z}, xmm14, xmm11; \
+	vpaddq  xmm15{k1}{z}, xmm15, xmm12; \
+	\
+	add     MSG, POLY1305_BLOCK_SIZE*16; \
+	sub     LEN, POLY1305_BLOCK_SIZE*16; \
+	\
+	jmp     .L_less_than_256; \
+.L_poly1305_blocks_exit: \
+
+/*
+;; =============================================================================
+;; =============================================================================
+;; Creates stack frame and saves registers
+;; =============================================================================
+*/
+#define FUNC_ENTRY() \
+	mov     rax, rsp; \
+	CFI_DEF_CFA_REGISTER(rax); \
+	sub     rsp, STACK_SIZE; \
+	and     rsp, -64; \
+	\
+	mov     [rsp + STACK_gpr_save + 8*0], rbx; \
+	mov     [rsp + STACK_gpr_save + 8*1], rbp; \
+	mov     [rsp + STACK_gpr_save + 8*2], r12; \
+	mov     [rsp + STACK_gpr_save + 8*3], r13; \
+	mov     [rsp + STACK_gpr_save + 8*4], r14; \
+	mov     [rsp + STACK_gpr_save + 8*5], r15; \
+	mov     [rsp + STACK_rsp_save], rax; \
+	CFI_CFA_ON_STACK(STACK_rsp_save, 0)
+
+/*
+;; =============================================================================
+;; =============================================================================
+;; Restores registers and removes the stack frame
+;; =============================================================================
+*/
+#define FUNC_EXIT() \
+	mov     rbx, [rsp + STACK_gpr_save + 8*0]; \
+	mov     rbp, [rsp + STACK_gpr_save + 8*1]; \
+	mov     r12, [rsp + STACK_gpr_save + 8*2]; \
+	mov     r13, [rsp + STACK_gpr_save + 8*3]; \
+	mov     r14, [rsp + STACK_gpr_save + 8*4]; \
+	mov     r15, [rsp + STACK_gpr_save + 8*5]; \
+	mov     rsp, [rsp + STACK_rsp_save]; \
+	CFI_DEF_CFA_REGISTER(rsp)
+
+/*
+;; =============================================================================
+;; =============================================================================
+;; unsigned int _gcry_poly1305_amd64_avx512_blocks(const void *msg,
+;;                                                 const uint64_t msg_len,
+;;                                                 void *hash, const void *key)
+;; arg1 - Input message
+;; arg2 - Message length
+;; arg3 - Input/output hash
+;; arg4 - Poly1305 key
+*/
+.align 32
+.globl _gcry_poly1305_amd64_avx512_blocks
+ELF(.type _gcry_poly1305_amd64_avx512_blocks, at function;)
+_gcry_poly1305_amd64_avx512_blocks:
+	CFI_STARTPROC()
+	vpxord xmm16, xmm16, xmm16;
+	vpopcntb zmm16, zmm16; /* spec stop for old AVX512 CPUs */
+	FUNC_ENTRY()
+
+#define _a0 gp3
+#define _a1 gp4
+#define _a2 gp5
+#define _r0 gp6
+#define _r1 gp7
+#define _len arg2
+#define _arg3 arg4             /* ; use rcx, arg3 = rdx */
+
+	/* ;; load R */
+	mov     _r0, [arg4 + 0 * 8]
+	mov     _r1, [arg4 + 1 * 8]
+
+	/* ;; load accumulator / current hash value */
+	/* ;; note: arg4 can't be used beyond this point */
+	mov     _arg3, arg3             /* ; note: _arg3 = arg4 (linux) */
+	mov     _a0, [_arg3 + 0 * 8]
+	mov     _a1, [_arg3 + 1 * 8]
+	mov     DWORD(_a2), [_arg3 + 2 * 8]    /* ; note: _a2 = arg4 (win) */
+
+	POLY1305_BLOCKS(arg1, _len, _a0, _a1, _a2, _r0, _r1,
+			gp10, gp11, gp8, gp9, rax, rdx)
+
+	/* ;; save accumulator back */
+	mov     [_arg3 + 0 * 8], _a0
+	mov     [_arg3 + 1 * 8], _a1
+	mov     [_arg3 + 2 * 8], DWORD(_a2)
+
+	FUNC_EXIT()
+	xor eax, eax
+	kmovw k1, eax
+	kmovw k2, eax
+	ret_spec_stop
+	CFI_ENDPROC()
+ELF(.size _gcry_poly1305_amd64_avx512_blocks,
+	  .-_gcry_poly1305_amd64_avx512_blocks;)
+
+#endif
+#endif
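
A side note on the scalar tail above: the vmovq/shl/or/shr sequence at the
start of the last hunk repacks the accumulator from the radix-2^44 format
used by the vector code (three limbs l0, l1, l2 with value
l0 + l1*2^44 + l2*2^88) into the 64/64/high-bits layout used by the scalar
block loop. A minimal C sketch of that conversion, assuming the limbs are
already reduced below 2^44 (variable and function names are mine, not part
of the patch):

    #include <stdint.h>

    /* Repack a 3x44-bit Poly1305 accumulator into two 64-bit words plus the
     * bits at and above 2^128; mirrors the shl/or/shr sequence in the
     * assembly above.  Assumes l0, l1, l2 < 2^44. */
    static void repack_limbs (uint64_t l0, uint64_t l1, uint64_t l2,
                              uint64_t *a0, uint64_t *a1, uint64_t *a2)
    {
      *a0 = l0 | (l1 << 44);          /* bits 0..63      */
      *a1 = (l1 >> 20) | (l2 << 24);  /* bits 64..127    */
      *a2 = l2 >> 40;                 /* bits 128 and up */
    }
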
diff --git a/cipher/poly1305-internal.h b/cipher/poly1305-internal.h
index 19cee5f6..9e01df46 100644
--- a/cipher/poly1305-internal.h
+++ b/cipher/poly1305-internal.h
@@ -34,6 +34,16 @@
 #define POLY1305_BLOCKSIZE 16
 
 
+/* POLY1305_USE_AVX512 indicates whether to compile with Intel AVX512 code. */
+#undef POLY1305_USE_AVX512
+#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX512) && \
+    defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \
+    (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+# define POLY1305_USE_AVX512 1
+#endif
+
+
 typedef struct
 {
   u32 k[4];
@@ -46,6 +56,9 @@ typedef struct poly1305_context_s
   POLY1305_STATE state;
   byte buffer[POLY1305_BLOCKSIZE];
   unsigned int leftover;
+#ifdef POLY1305_USE_AVX512
+  unsigned int use_avx512:1;
+#endif
 } poly1305_context_t;
 
 
diff --git a/cipher/poly1305.c b/cipher/poly1305.c
index e57e64f3..5482fc6a 100644
--- a/cipher/poly1305.c
+++ b/cipher/poly1305.c
@@ -60,6 +60,19 @@ static const char *selftest (void);
 #endif
 
 
+/* The AMD64 assembly implementation uses the SystemV ABI; on Win64, ABI
+ * conversion and additional stack space to store XMM6-XMM15 are needed. */
+#undef ASM_FUNC_ABI
+#undef ASM_FUNC_WRAPPER_ATTR
+#if defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)
+# define ASM_FUNC_ABI __attribute__((sysv_abi))
+# define ASM_FUNC_WRAPPER_ATTR __attribute__((noinline))
+#else
+# define ASM_FUNC_ABI
+# define ASM_FUNC_WRAPPER_ATTR
+#endif
+
+
 #ifdef USE_S390X_ASM
 
 #define HAVE_ASM_POLY1305_BLOCKS 1
@@ -78,11 +91,32 @@ poly1305_blocks (poly1305_context_t *ctx, const byte *buf, size_t len,
 #endif /* USE_S390X_ASM */
 
 
+#ifdef POLY1305_USE_AVX512
+
+extern unsigned int
+_gcry_poly1305_amd64_avx512_blocks(const void *msg, const u64 msg_len,
+				   void *hash, const void *key) ASM_FUNC_ABI;
+
+ASM_FUNC_WRAPPER_ATTR static unsigned int
+poly1305_amd64_avx512_blocks(poly1305_context_t *ctx, const byte *buf,
+			     size_t len)
+{
+  POLY1305_STATE *st = &ctx->state;
+  return _gcry_poly1305_amd64_avx512_blocks(buf, len, st->h, st->r);
+}
+
+#endif /* POLY1305_USE_AVX512 */
+
+
 static void poly1305_init (poly1305_context_t *ctx,
 			   const byte key[POLY1305_KEYLEN])
 {
   POLY1305_STATE *st = &ctx->state;
 
+#ifdef POLY1305_USE_AVX512
+  ctx->use_avx512 = (_gcry_get_hw_features () & HWF_INTEL_AVX512) != 0;
+#endif
+
   ctx->leftover = 0;
 
   st->h[0] = 0;
@@ -181,8 +215,8 @@ static void poly1305_init (poly1305_context_t *ctx,
 #ifndef HAVE_ASM_POLY1305_BLOCKS
 
 static unsigned int
-poly1305_blocks (poly1305_context_t *ctx, const byte *buf, size_t len,
-		 byte high_pad)
+poly1305_blocks_generic (poly1305_context_t *ctx, const byte *buf, size_t len,
+			 byte high_pad)
 {
   POLY1305_STATE *st = &ctx->state;
   u64 r0, r1, r1_mult5;
@@ -235,6 +269,18 @@ poly1305_blocks (poly1305_context_t *ctx, const byte *buf, size_t len,
   return 6 * sizeof (void *) + 18 * sizeof (u64);
 }
 
+static unsigned int
+poly1305_blocks (poly1305_context_t *ctx, const byte *buf, size_t len,
+		 byte high_pad)
+{
+#ifdef POLY1305_USE_AVX512
+  if ((high_pad & ctx->use_avx512) != 0)
+    return poly1305_amd64_avx512_blocks(ctx, buf, len);
+#endif
+
+  return poly1305_blocks_generic(ctx, buf, len, high_pad);
+}
+
 #endif /* !HAVE_ASM_POLY1305_BLOCKS */
 
 static unsigned int poly1305_final (poly1305_context_t *ctx,
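
One note on the poly1305_blocks() dispatch above: high_pad is passed as 0 or
1 by the callers and use_avx512 is a one-bit field, so the bitwise AND simply
requires both to be set; the assembly only handles full blocks with the
padding bit hard-coded (the "adc A2, 1" in the final loop). A more explicit
but equivalent form of the condition, assuming high_pad stays 0/1 as in the
current callers, would be:

    /* Illustrative rewrite of the condition only; not part of the patch. */
    if (high_pad && ctx->use_avx512)
      return poly1305_amd64_avx512_blocks (ctx, buf, len);
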
diff --git a/configure.ac b/configure.ac
index e214082b..778dc633 100644
--- a/configure.ac
+++ b/configure.ac
@@ -3106,6 +3106,9 @@ case "${host}" in
   s390x-*-*)
     GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS poly1305-s390x.lo"
   ;;
+  x86_64-*-*)
+    GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS poly1305-amd64-avx512.lo"
+  ;;
 esac
 
 LIST_MEMBER(scrypt, $enabled_kdfs)
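
For comparing against the generic C path on AVX512-capable hardware, the new
code can presumably be bypassed at runtime by disabling the hardware feature
before the library is initialized; a sketch, assuming the feature-name string
is "intel-avx512":

    /* Disable the AVX512 HW feature so poly1305_blocks_generic() is used;
     * the feature name string is an assumption on my part. */
    gcry_control (GCRYCTL_DISABLE_HWF, "intel-avx512", NULL);
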
-- 
2.32.0