[PATCH 4/4] Add stitched ChaCha20-Poly1305 SSSE3 and AVX2 implementations
Jussi Kivilinna
jussi.kivilinna at iki.fi
Fri Jan 18 23:35:52 CET 2019
* cipher/asm-poly1305-amd64.h: New.
* cipher/Makefile.am: Add 'asm-poly1305-amd64.h'.
* cipher/chacha20-amd64-avx2.S (QUARTERROUND2): Add interleave
operators.
(_gcry_chacha20_poly1305_amd64_avx2_blocks8): New.
* cipher/chacha20-amd64-ssse3.S (QUARTERROUND2): Add interleave
operators.
(_gcry_chacha20_poly1305_amd64_ssse3_blocks4)
(_gcry_chacha20_poly1305_amd64_ssse3_blocks1): New.
* cipher/chacha20.c (_gcry_chacha20_poly1305_amd64_ssse3_blocks4)
(_gcry_chacha20_poly1305_amd64_ssse3_blocks1)
(_gcry_chacha20_poly1305_amd64_avx2_blocks8): New prototypes.
(chacha20_encrypt_stream): Split tail to...
(do_chacha20_encrypt_stream_tail): ... new function.
(_gcry_chacha20_poly1305_encrypt)
(_gcry_chacha20_poly1305_decrypt): New.
* cipher/cipher-internal.h (_gcry_chacha20_poly1305_encrypt)
(_gcry_chacha20_poly1305_decrypt): New prototypes.
* cipher/cipher-poly1305.c (_gcry_cipher_poly1305_encrypt): Call
'_gcry_chacha20_poly1305_encrypt' if cipher is ChaCha20.
(_gcry_cipher_poly1305_decrypt): Call
'_gcry_chacha20_poly1305_decrypt' if cipher is ChaCha20.
* cipher/poly1305-internal.h (_gcry_cipher_poly1305_update_burn): New
prototype.
* cipher/poly1305.c (poly1305_blocks): Make static.
(_gcry_poly1305_update): Split main function body to ...
(_gcry_poly1305_update_burn): ... new function.
--
Benchmark on Intel Skylake (i5-6500, 3200 MHz):
Before, 8-way AVX2:
CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte
STREAM enc | 0.378 ns/B 2526 MiB/s 1.21 c/B
STREAM dec | 0.373 ns/B 2560 MiB/s 1.19 c/B
POLY1305 enc | 0.685 ns/B 1392 MiB/s 2.19 c/B
POLY1305 dec | 0.686 ns/B 1390 MiB/s 2.20 c/B
POLY1305 auth | 0.315 ns/B 3031 MiB/s 1.01 c/B
After, 8-way AVX2 (~36% faster):
CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte
POLY1305 enc | 0.503 ns/B 1896 MiB/s 1.61 c/B
POLY1305 dec | 0.485 ns/B 1965 MiB/s 1.55 c/B
Benchmark on Intel Haswell (i7-4790K, 3998 MHz):
Before, 8-way AVX2:
CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte
STREAM enc | 0.318 ns/B 2999 MiB/s 1.27 c/B
STREAM dec | 0.317 ns/B 3004 MiB/s 1.27 c/B
POLY1305 enc | 0.586 ns/B 1627 MiB/s 2.34 c/B
POLY1305 dec | 0.586 ns/B 1627 MiB/s 2.34 c/B
POLY1305 auth | 0.271 ns/B 3524 MiB/s 1.08 c/B
After, 8-way AVX2 (~30% faster):
CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte
POLY1305 enc | 0.452 ns/B 2108 MiB/s 1.81 c/B
POLY1305 dec | 0.440 ns/B 2167 MiB/s 1.76 c/B
Before, 4-way SSSE3:
CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte
STREAM enc | 0.627 ns/B 1521 MiB/s 2.51 c/B
STREAM dec | 0.626 ns/B 1523 MiB/s 2.50 c/B
POLY1305 enc | 0.895 ns/B 1065 MiB/s 3.58 c/B
POLY1305 dec | 0.896 ns/B 1064 MiB/s 3.58 c/B
POLY1305 auth | 0.271 ns/B 3521 MiB/s 1.08 c/B
After, 4-way SSSE3 (~20% faster):
CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte
POLY1305 enc | 0.733 ns/B 1301 MiB/s 2.93 c/B
POLY1305 dec | 0.726 ns/B 1314 MiB/s 2.90 c/B
Before, 1-way SSSE3:
CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte
POLY1305 enc | 1.56 ns/B 609.6 MiB/s 6.25 c/B
POLY1305 dec | 1.56 ns/B 609.4 MiB/s 6.26 c/B
After, 1-way SSSE3 (~18% faster):
CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte
POLY1305 enc | 1.31 ns/B 725.4 MiB/s 5.26 c/B
POLY1305 dec | 1.31 ns/B 727.3 MiB/s 5.24 c/B
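To cross-check the tables: cycles/byte is nanosecs/byte scaled by the nominal
clock, and the quoted speedups compare old and new cycles/byte of the
ChaCha20-Poly1305 rows. For the 3200 MHz Skylake numbers above, for example:
  0.503 ns/B * 3.2 cycles/ns = 1.61 c/B,  and  2.19 c/B / 1.61 c/B = 1.36,
  i.e. the quoted "~36% faster".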
For comparison to other libraries (on Intel i7-4790K, 3998 MHz):
bench-slope-openssl: OpenSSL 1.1.1 11 Sep 2018
Cipher:
chacha20 | nanosecs/byte mebibytes/sec cycles/byte
STREAM enc | 0.301 ns/B 3166.4 MiB/s 1.20 c/B
STREAM dec | 0.300 ns/B 3174.7 MiB/s 1.20 c/B
POLY1305 enc | 0.463 ns/B 2060.6 MiB/s 1.85 c/B
POLY1305 dec | 0.462 ns/B 2063.8 MiB/s 1.85 c/B
POLY1305 auth | 0.162 ns/B 5899.3 MiB/s 0.646 c/B
bench-slope-nettle: Nettle 3.4
Cipher:
chacha | nanosecs/byte mebibytes/sec cycles/byte
STREAM enc | 1.65 ns/B 578.2 MiB/s 6.59 c/B
STREAM dec | 1.65 ns/B 578.2 MiB/s 6.59 c/B
POLY1305 enc | 2.05 ns/B 464.8 MiB/s 8.20 c/B
POLY1305 dec | 2.05 ns/B 464.7 MiB/s 8.20 c/B
POLY1305 auth | 0.404 ns/B 2359.1 MiB/s 1.62 c/B
bench-slope-botan: Botan 2.6.0
Cipher:
ChaCha | nanosecs/byte mebibytes/sec cycles/byte
STREAM enc/dec | 0.855 ns/B 1116.0 MiB/s 3.42 c/B
POLY1305 enc | 1.60 ns/B 595.4 MiB/s 6.40 c/B
POLY1305 dec | 1.60 ns/B 595.8 MiB/s 6.40 c/B
POLY1305 auth | 0.752 ns/B 1268.3 MiB/s 3.01 c/B
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 98320ca5f..16066bfc6 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -72,6 +72,7 @@ libcipher_la_SOURCES = \
EXTRA_libcipher_la_SOURCES = \
asm-common-amd64.h \
asm-common-aarch64.h \
+ asm-poly1305-amd64.h \
arcfour.c arcfour-amd64.S \
blowfish.c blowfish-amd64.S blowfish-arm.S \
cast5.c cast5-amd64.S cast5-arm.S \
diff --git a/cipher/asm-poly1305-amd64.h b/cipher/asm-poly1305-amd64.h
new file mode 100644
index 000000000..3f99ea3e1
--- /dev/null
+++ b/cipher/asm-poly1305-amd64.h
@@ -0,0 +1,171 @@
+/* asm-poly1305-amd64.h - Poly1305 macros for AMD64 assembly
+ *
+ * Copyright (C) 2019 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef GCRY_ASM_POLY1305_AMD64_H
+#define GCRY_ASM_POLY1305_AMD64_H
+
+#include "asm-common-amd64.h"
+
+/**********************************************************************
+ poly1305 for stitched chacha20-poly1305 AMD64 implementations
+ **********************************************************************/
+
+#define POLY_RSTATE %r8
+#define POLY_RSRC %r9
+
+#define POLY_R_H0 %rbx
+#define POLY_R_H1 %rcx
+#define POLY_R_H2 %r10
+#define POLY_R_H2d %r10d
+#define POLY_R_R0 %r11
+#define POLY_R_R1_MUL5 %r12
+#define POLY_R_X0_HI %r13
+#define POLY_R_X0_LO %r14
+#define POLY_R_X1_HI %r15
+#define POLY_R_X1_LO %rsi
+
+#define POLY_S_R0 (4 * 4 + 0 * 8)(POLY_RSTATE)
+#define POLY_S_R1 (4 * 4 + 1 * 8)(POLY_RSTATE)
+#define POLY_S_H0 (4 * 4 + 2 * 8 + 0 * 8)(POLY_RSTATE)
+#define POLY_S_H1 (4 * 4 + 2 * 8 + 1 * 8)(POLY_RSTATE)
+#define POLY_S_H2d (4 * 4 + 2 * 8 + 2 * 8)(POLY_RSTATE)
+
+#define POLY1305_LOAD_STATE() \
+ movq POLY_S_H0, POLY_R_H0; \
+ movq POLY_S_H1, POLY_R_H1; \
+ movl POLY_S_H2d, POLY_R_H2d; \
+ movq POLY_S_R0, POLY_R_R0; \
+ movq POLY_S_R1, POLY_R_R1_MUL5; \
+ shrq $2, POLY_R_R1_MUL5; \
+ addq POLY_S_R1, POLY_R_R1_MUL5;
+
+#define POLY1305_STORE_STATE() \
+ movq POLY_R_H0, POLY_S_H0; \
+ movq POLY_R_H1, POLY_S_H1; \
+ movl POLY_R_H2d, POLY_S_H2d;
+
+/* a = h + m */
+#define POLY1305_BLOCK_PART1(src_offset) \
+ addq ((src_offset) + 0 * 8)(POLY_RSRC), POLY_R_H0; \
+ adcq ((src_offset) + 1 * 8)(POLY_RSRC), POLY_R_H1; \
+ adcl $1, POLY_R_H2d; \
+ \
+ /* h = a * r (partial mod 2^130-5): */ \
+ \
+ /* h0 * r1 */ \
+ movq POLY_R_H0, %rax; \
+ mulq POLY_S_R1; \
+ movq %rax, POLY_R_X1_LO; \
+ movq %rdx, POLY_R_X1_HI;
+
+#define POLY1305_BLOCK_PART2() \
+ \
+ /* h0 * r0 */ \
+ movq POLY_R_H0, %rax; \
+ mulq POLY_R_R0; \
+ movq %rax, POLY_R_X0_LO; \
+ movq %rdx, POLY_R_X0_HI;
+
+#define POLY1305_BLOCK_PART3() \
+ \
+ /* h1 * r0 */ \
+ movq POLY_R_H1, %rax; \
+ mulq POLY_R_R0; \
+ addq %rax, POLY_R_X1_LO; \
+ adcq %rdx, POLY_R_X1_HI; \
+ \
+ /* h1 * r1 mod 2^130-5 */ \
+ movq POLY_R_R1_MUL5, %rax; \
+ mulq POLY_R_H1;
+
+#define POLY1305_BLOCK_PART4() \
+ movq POLY_R_H2, POLY_R_H1; \
+ imulq POLY_R_R1_MUL5, POLY_R_H1; /* h2 * r1 mod 2^130-5 */ \
+ addq %rax, POLY_R_X0_LO; \
+ adcq %rdx, POLY_R_X0_HI; \
+ imulq POLY_R_R0, POLY_R_H2; /* h2 * r0 */ \
+ addq POLY_R_X1_LO, POLY_R_H1; \
+ adcq POLY_R_X1_HI, POLY_R_H2;
+
+#define POLY1305_BLOCK_PART5() \
+ \
+ /* carry propagation */ \
+ movq POLY_R_H2, POLY_R_H0; \
+ andl $3, POLY_R_H2d; \
+ shrq $2, POLY_R_H0; \
+ leaq (POLY_R_H0, POLY_R_H0, 4), POLY_R_H0; \
+ addq POLY_R_X0_LO, POLY_R_H0; \
+ adcq POLY_R_X0_HI, POLY_R_H1; \
+ adcl $0, POLY_R_H2d;
+
+#ifdef TESTING_POLY1305_ASM
+/* for testing only, mixed C/asm poly1305.c is marginally faster (~2%). */
+.align 8
+.globl _gcry_poly1305_amd64_ssse3_blocks1
+ELF(.type _gcry_poly1305_amd64_ssse3_blocks1,@function;)
+
+_gcry_poly1305_amd64_ssse3_blocks1:
+ /* input:
+ * %rdi: poly1305-state
+ * %rsi: src
+ * %rdx: nblks
+ */
+ pushq %rbp;
+ movq %rsp, %rbp;
+
+ subq $(10 * 8), %rsp;
+ movq %rbx, (1 * 8)(%rsp);
+ movq %r12, (2 * 8)(%rsp);
+ movq %r13, (3 * 8)(%rsp);
+ movq %r14, (4 * 8)(%rsp);
+ movq %r15, (5 * 8)(%rsp);
+
+ movq %rdx, (8 * 8)(%rsp); # NBLKS
+
+ movq %rdi, POLY_RSTATE;
+ movq %rsi, POLY_RSRC;
+
+ POLY1305_LOAD_STATE();
+
+.L_poly1:
+ POLY1305_BLOCK_PART1(0 * 16);
+ POLY1305_BLOCK_PART2();
+ POLY1305_BLOCK_PART3();
+ POLY1305_BLOCK_PART4();
+ POLY1305_BLOCK_PART5();
+
+ subq $1, (8 * 8)(%rsp); # NBLKS
+ leaq (16)(POLY_RSRC), POLY_RSRC;
+ jnz .L_poly1;
+
+ POLY1305_STORE_STATE();
+
+ movq (1 * 8)(%rsp), %rbx;
+ movq (2 * 8)(%rsp), %r12;
+ movq (3 * 8)(%rsp), %r13;
+ movq (4 * 8)(%rsp), %r14;
+ movq (5 * 8)(%rsp), %r15;
+
+ xorl %eax, %eax;
+ leave
+ ret;
+#endif
+
+#endif /* GCRY_ASM_POLY1305_AMD64_H */
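For reference, one POLY1305_BLOCK_PART1..PART5 sequence above performs a
single Poly1305 block update, h = (h + m + 2^128) * r partially reduced
mod 2^130 - 5, in the same 64+64+2-bit limb split used by poly1305.c; the
split into five macros only exists so that the scalar instructions can be
spread between the ChaCha20 vector instructions. A rough standalone C
equivalent of the same computation (not part of the patch; assumes a compiler
with unsigned __int128, names are illustrative):

#include <stdint.h>

typedef unsigned __int128 u128;

static void
poly1305_block_sketch (uint64_t h[3], const uint64_t m[2],
                       uint64_t r0, uint64_t r1)
{
  /* r1 has its low two bits cleared by key clamping, so this equals (5*r1)/4;
   * it folds terms at 2^128 and above back down, since 2^130 == 5 (mod p). */
  uint64_t r1_mul5 = (r1 >> 2) + r1;
  u128 t, x0, x1;
  uint64_t h2a;

  /* PART1: a = h + m, plus the pad bit at 2^128. */
  t = (u128)h[0] + m[0];              h[0] = (uint64_t)t;
  t = (u128)h[1] + m[1] + (t >> 64);  h[1] = (uint64_t)t;
  h[2] += 1 + (uint64_t)(t >> 64);

  /* PART1..PART4: the a * r multiplications, high terms folded via r1_mul5. */
  x1 = (u128)h[0] * r1 + (u128)h[1] * r0 + (u128)h[2] * r1_mul5;
  x0 = (u128)h[0] * r0 + (u128)h[1] * r1_mul5;
  h2a = h[2] * r0 + (uint64_t)(x1 >> 64);

  /* PART5: partial carry propagation; bits at 2^130 and above fold as *5. */
  t = (u128)((h2a >> 2) * 5) + (uint64_t)x0;        h[0] = (uint64_t)t;
  t = (u128)(uint64_t)x1 + (x0 >> 64) + (t >> 64);  h[1] = (uint64_t)t;
  h[2] = (h2a & 3) + (uint64_t)(t >> 64);
}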
diff --git a/cipher/chacha20-amd64-avx2.S b/cipher/chacha20-amd64-avx2.S
index dad9e3e96..ef02c1733 100644
--- a/cipher/chacha20-amd64-avx2.S
+++ b/cipher/chacha20-amd64-avx2.S
@@ -1,7 +1,6 @@
/* chacha20-amd64-avx2.S - AVX2 implementation of ChaCha20 cipher
*
-
- * Copyright (C) 2017,2018 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ * Copyright (C) 2017-2019 Jussi Kivilinna <jussi.kivilinna at iki.fi>
*
* This file is part of Libgcrypt.
*
@@ -36,17 +35,8 @@
.text
-#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
-# define ELF(...) __VA_ARGS__
-#else
-# define ELF(...) /*_*/
-#endif
-
-#ifdef __PIC__
-# define RIP (%rip)
-#else
-# define RIP
-#endif
+#include "asm-common-amd64.h"
+#include "asm-poly1305-amd64.h"
/* register macros */
#define INPUT %rdi
@@ -139,15 +129,21 @@
#define PLUS(ds,s) \
vpaddd s, ds, ds;
-#define QUARTERROUND2(a1,b1,c1,d1,a2,b2,c2,d2,ign,tmp1) \
- vbroadcasti128 .Lshuf_rol16 RIP, tmp1; \
+#define QUARTERROUND2(a1,b1,c1,d1,a2,b2,c2,d2,ign,tmp1,\
+ interleave_op1,interleave_op2,\
+ interleave_op3,interleave_op4) \
+ vbroadcasti128 .Lshuf_rol16 rRIP, tmp1; \
+ interleave_op1; \
PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2); \
ROTATE_SHUF_2(d1, d2, tmp1); \
+ interleave_op2; \
PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \
ROTATE2(b1, b2, 12, tmp1); \
- vbroadcasti128 .Lshuf_rol8 RIP, tmp1; \
+ vbroadcasti128 .Lshuf_rol8 rRIP, tmp1; \
+ interleave_op3; \
PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2); \
ROTATE_SHUF_2(d1, d2, tmp1); \
+ interleave_op4; \
PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \
ROTATE2(b1, b2, 7, tmp1);
@@ -189,12 +185,12 @@ _gcry_chacha20_amd64_avx2_blocks8:
subq $STACK_MAX, %rsp;
andq $~31, %rsp;
-.Loop4:
+.Loop8:
mov $20, ROUND;
/* Construct counter vectors X12 and X13 */
- vpmovzxbd .Linc_counter RIP, X0;
- vpbroadcastd .Lunsigned_cmp RIP, X2;
+ vpmovzxbd .Linc_counter rRIP, X0;
+ vpbroadcastd .Lunsigned_cmp rRIP, X2;
vpbroadcastd (12 * 4)(INPUT), X12;
vpbroadcastd (13 * 4)(INPUT), X13;
vpaddd X0, X12, X12;
@@ -223,14 +219,14 @@ _gcry_chacha20_amd64_avx2_blocks8:
vmovdqa X15, (STACK_TMP)(%rsp);
.Lround2:
- QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15)
+ QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15,,,,)
vmovdqa (STACK_TMP)(%rsp), X15;
vmovdqa X8, (STACK_TMP)(%rsp);
- QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8)
- QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8)
+ QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,,,,)
+ QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,,,,)
vmovdqa (STACK_TMP)(%rsp), X8;
vmovdqa X15, (STACK_TMP)(%rsp);
- QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X15)
+ QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X15,,,,)
sub $2, ROUND;
jnz .Lround2;
@@ -302,7 +298,7 @@ _gcry_chacha20_amd64_avx2_blocks8:
sub $8, NBLKS;
lea (8 * 64)(DST), DST;
lea (8 * 64)(SRC), SRC;
- jnz .Loop4;
+ jnz .Loop8;
/* clear the used vector registers and stack */
vpxor X0, X0, X0;
@@ -319,5 +315,438 @@ _gcry_chacha20_amd64_avx2_blocks8:
ELF(.size _gcry_chacha20_amd64_avx2_blocks8,
.-_gcry_chacha20_amd64_avx2_blocks8;)
+/**********************************************************************
+ 8-way stitched chacha20-poly1305
+ **********************************************************************/
+
+.align 8
+.globl _gcry_chacha20_poly1305_amd64_avx2_blocks8
+ELF(.type _gcry_chacha20_poly1305_amd64_avx2_blocks8,@function;)
+
+_gcry_chacha20_poly1305_amd64_avx2_blocks8:
+ /* input:
+ * %rdi: input
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: nblks (multiple of 8)
+ * %r8: poly1305-state
+ * %r9: poly1305-src
+ */
+
+ pushq %rbp;
+ movq %rsp, %rbp;
+
+ vzeroupper;
+
+ subq $(8 * 8) + STACK_MAX + 32, %rsp;
+ andq $~31, %rsp;
+
+ movq %rbx, (STACK_MAX + 0 * 8)(%rsp);
+ movq %r12, (STACK_MAX + 1 * 8)(%rsp);
+ movq %r13, (STACK_MAX + 2 * 8)(%rsp);
+ movq %r14, (STACK_MAX + 3 * 8)(%rsp);
+ movq %r15, (STACK_MAX + 4 * 8)(%rsp);
+
+ movq %rdx, (STACK_MAX + 5 * 8)(%rsp); # SRC
+ movq %rsi, (STACK_MAX + 6 * 8)(%rsp); # DST
+ movq %rcx, (STACK_MAX + 7 * 8)(%rsp); # NBLKS
+
+ /* Load state */
+ POLY1305_LOAD_STATE();
+
+.Loop_poly8:
+
+ /* Construct counter vectors X12 and X13 */
+ vpmovzxbd .Linc_counter rRIP, X0;
+ vpbroadcastd .Lunsigned_cmp rRIP, X2;
+ vpbroadcastd (12 * 4)(INPUT), X12;
+ vpbroadcastd (13 * 4)(INPUT), X13;
+ vpaddd X0, X12, X12;
+ vpxor X2, X0, X0;
+ vpxor X2, X12, X1;
+ vpcmpgtd X1, X0, X0;
+ vpsubd X0, X13, X13;
+ vmovdqa X12, (STACK_VEC_X12)(%rsp);
+ vmovdqa X13, (STACK_VEC_X13)(%rsp);
+
+ /* Load vectors */
+ vpbroadcastd (0 * 4)(INPUT), X0;
+ vpbroadcastd (1 * 4)(INPUT), X1;
+ vpbroadcastd (2 * 4)(INPUT), X2;
+ vpbroadcastd (3 * 4)(INPUT), X3;
+ vpbroadcastd (4 * 4)(INPUT), X4;
+ vpbroadcastd (5 * 4)(INPUT), X5;
+ vpbroadcastd (6 * 4)(INPUT), X6;
+ vpbroadcastd (7 * 4)(INPUT), X7;
+ vpbroadcastd (8 * 4)(INPUT), X8;
+ vpbroadcastd (9 * 4)(INPUT), X9;
+ vpbroadcastd (10 * 4)(INPUT), X10;
+ vpbroadcastd (11 * 4)(INPUT), X11;
+ vpbroadcastd (14 * 4)(INPUT), X14;
+ vpbroadcastd (15 * 4)(INPUT), X15;
+ vmovdqa X15, (STACK_TMP)(%rsp);
+
+ # rounds 0,1
+ QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15,
+ POLY1305_BLOCK_PART1(0 * 16),
+ POLY1305_BLOCK_PART2(),
+ POLY1305_BLOCK_PART3(),
+ POLY1305_BLOCK_PART4())
+ vmovdqa (STACK_TMP)(%rsp), X15;
+ vmovdqa X8, (STACK_TMP)(%rsp);
+ QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,
+ POLY1305_BLOCK_PART5(),
+ POLY1305_BLOCK_PART1(1 * 16),
+ POLY1305_BLOCK_PART2(),
+ POLY1305_BLOCK_PART3())
+ QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,
+ POLY1305_BLOCK_PART4(),
+ POLY1305_BLOCK_PART5(),
+ POLY1305_BLOCK_PART1(2 * 16),
+ POLY1305_BLOCK_PART2())
+ vmovdqa (STACK_TMP)(%rsp), X8;
+ vmovdqa X15, (STACK_TMP)(%rsp);
+ QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X15,
+ POLY1305_BLOCK_PART3(),
+ POLY1305_BLOCK_PART4(),
+ POLY1305_BLOCK_PART5(),
+ POLY1305_BLOCK_PART1(3 * 16))
+
+ # rounds 2,3
+ QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15,
+ POLY1305_BLOCK_PART2(),
+ POLY1305_BLOCK_PART3(),
+ POLY1305_BLOCK_PART4(),
+ POLY1305_BLOCK_PART5())
+ vmovdqa (STACK_TMP)(%rsp), X15;
+ vmovdqa X8, (STACK_TMP)(%rsp);
+ QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,
+ POLY1305_BLOCK_PART1(4 * 16),
+ POLY1305_BLOCK_PART2(),
+ POLY1305_BLOCK_PART3(),
+ POLY1305_BLOCK_PART4())
+ QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,
+ POLY1305_BLOCK_PART5(),
+ POLY1305_BLOCK_PART1(5 * 16),
+ POLY1305_BLOCK_PART2(),
+ POLY1305_BLOCK_PART3())
+ vmovdqa (STACK_TMP)(%rsp), X8;
+ vmovdqa X15, (STACK_TMP)(%rsp);
+ QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X15,
+ POLY1305_BLOCK_PART4(),
+ POLY1305_BLOCK_PART5(),
+ POLY1305_BLOCK_PART1(6 * 16),
+ POLY1305_BLOCK_PART2())
+
+ # rounds 4,5
+ QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15,
+ POLY1305_BLOCK_PART3(),
+ POLY1305_BLOCK_PART4(),
+ POLY1305_BLOCK_PART5(),
+ POLY1305_BLOCK_PART1(7 * 16))
+ vmovdqa (STACK_TMP)(%rsp), X15;
+ vmovdqa X8, (STACK_TMP)(%rsp);
+ QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,
+ POLY1305_BLOCK_PART2(),
+ POLY1305_BLOCK_PART3(),
+ POLY1305_BLOCK_PART4(),
+ POLY1305_BLOCK_PART5())
+ QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,
+ POLY1305_BLOCK_PART1(8 * 16),
+ POLY1305_BLOCK_PART2(),
+ POLY1305_BLOCK_PART3(),
+ POLY1305_BLOCK_PART4())
+ vmovdqa (STACK_TMP)(%rsp), X8;
+ vmovdqa X15, (STACK_TMP)(%rsp);
+ QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X15,
+ POLY1305_BLOCK_PART5(),
+ POLY1305_BLOCK_PART1(9 * 16),
+ POLY1305_BLOCK_PART2(),
+ POLY1305_BLOCK_PART3())
+
+ # rounds 6,7
+ QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15,
+ POLY1305_BLOCK_PART4(),
+ POLY1305_BLOCK_PART5(),
+ POLY1305_BLOCK_PART1(10 * 16),
+ POLY1305_BLOCK_PART2())
+ vmovdqa (STACK_TMP)(%rsp), X15;
+ vmovdqa X8, (STACK_TMP)(%rsp);
+ QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,
+ POLY1305_BLOCK_PART3(),
+ POLY1305_BLOCK_PART4(),
+ POLY1305_BLOCK_PART5(),
+ POLY1305_BLOCK_PART1(11 * 16))
+ QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,
+ POLY1305_BLOCK_PART2(),
+ POLY1305_BLOCK_PART3(),
+ POLY1305_BLOCK_PART4(),
+ POLY1305_BLOCK_PART5())
+ vmovdqa (STACK_TMP)(%rsp), X8;
+ vmovdqa X15, (STACK_TMP)(%rsp);
+ QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X15,
+ POLY1305_BLOCK_PART1(12 * 16),
+ POLY1305_BLOCK_PART2(),
+ POLY1305_BLOCK_PART3(),
+ POLY1305_BLOCK_PART4())
+
+ # rounds 8,9
+ QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15,
+ POLY1305_BLOCK_PART5(),
+ POLY1305_BLOCK_PART1(13 * 16),
+ POLY1305_BLOCK_PART2(),
+ POLY1305_BLOCK_PART3())
+ vmovdqa (STACK_TMP)(%rsp), X15;
+ vmovdqa X8, (STACK_TMP)(%rsp);
+ QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,
+ POLY1305_BLOCK_PART4(),
+ POLY1305_BLOCK_PART5(),
+ POLY1305_BLOCK_PART1(14 * 16),
+ POLY1305_BLOCK_PART2())
+ QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,
+ POLY1305_BLOCK_PART3(),
+ POLY1305_BLOCK_PART4(),
+ POLY1305_BLOCK_PART5(),
+ POLY1305_BLOCK_PART1(15 * 16))
+ vmovdqa (STACK_TMP)(%rsp), X8;
+ vmovdqa X15, (STACK_TMP)(%rsp);
+ QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X15,
+ POLY1305_BLOCK_PART2(),
+ POLY1305_BLOCK_PART3(),
+ POLY1305_BLOCK_PART4(),
+ POLY1305_BLOCK_PART5())
+
+ # rounds 10,11
+ QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15,
+ POLY1305_BLOCK_PART1(16 * 16),
+ POLY1305_BLOCK_PART2(),
+ POLY1305_BLOCK_PART3(),
+ POLY1305_BLOCK_PART4())
+ vmovdqa (STACK_TMP)(%rsp), X15;
+ vmovdqa X8, (STACK_TMP)(%rsp);
+ QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,
+ POLY1305_BLOCK_PART5(),
+ POLY1305_BLOCK_PART1(17 * 16),
+ POLY1305_BLOCK_PART2(),
+ POLY1305_BLOCK_PART3())
+ QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,
+ POLY1305_BLOCK_PART4(),
+ POLY1305_BLOCK_PART5(),
+ POLY1305_BLOCK_PART1(18 * 16),
+ POLY1305_BLOCK_PART2())
+ vmovdqa (STACK_TMP)(%rsp), X8;
+ vmovdqa X15, (STACK_TMP)(%rsp);
+ QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X15,
+ POLY1305_BLOCK_PART3(),
+ POLY1305_BLOCK_PART4(),
+ POLY1305_BLOCK_PART5(),
+ POLY1305_BLOCK_PART1(19 * 16))
+
+ # rounds 12,13
+ QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15,
+ POLY1305_BLOCK_PART2(),
+ POLY1305_BLOCK_PART3(),
+ POLY1305_BLOCK_PART4(),
+ POLY1305_BLOCK_PART5())
+ vmovdqa (STACK_TMP)(%rsp), X15;
+ vmovdqa X8, (STACK_TMP)(%rsp);
+ QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,
+ POLY1305_BLOCK_PART1(20 * 16),
+ POLY1305_BLOCK_PART2(),
+ POLY1305_BLOCK_PART3(),
+ POLY1305_BLOCK_PART4())
+ QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,
+ POLY1305_BLOCK_PART5(),
+ POLY1305_BLOCK_PART1(21 * 16),
+ POLY1305_BLOCK_PART2(),
+ POLY1305_BLOCK_PART3())
+ vmovdqa (STACK_TMP)(%rsp), X8;
+ vmovdqa X15, (STACK_TMP)(%rsp);
+ QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X15,
+ POLY1305_BLOCK_PART4(),
+ POLY1305_BLOCK_PART5(),
+ POLY1305_BLOCK_PART1(22 * 16),
+ POLY1305_BLOCK_PART2())
+
+ # rounds 14,15
+ QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15,
+ POLY1305_BLOCK_PART3(),
+ POLY1305_BLOCK_PART4(),
+ POLY1305_BLOCK_PART5(),
+ POLY1305_BLOCK_PART1(23 * 16))
+ vmovdqa (STACK_TMP)(%rsp), X15;
+ vmovdqa X8, (STACK_TMP)(%rsp);
+ QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,
+ POLY1305_BLOCK_PART2(),
+ POLY1305_BLOCK_PART3(),
+ POLY1305_BLOCK_PART4(),
+ POLY1305_BLOCK_PART5())
+ QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,
+ POLY1305_BLOCK_PART1(24 * 16),
+ POLY1305_BLOCK_PART2(),
+ POLY1305_BLOCK_PART3(),
+ POLY1305_BLOCK_PART4())
+ vmovdqa (STACK_TMP)(%rsp), X8;
+ vmovdqa X15, (STACK_TMP)(%rsp);
+ QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X15,
+ POLY1305_BLOCK_PART5(),
+ POLY1305_BLOCK_PART1(25 * 16),
+ POLY1305_BLOCK_PART2(),
+ POLY1305_BLOCK_PART3())
+
+ # rounds 16,17
+ QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15,
+ POLY1305_BLOCK_PART4(),
+ POLY1305_BLOCK_PART5(),
+ POLY1305_BLOCK_PART1(26 * 16),
+ POLY1305_BLOCK_PART2())
+ vmovdqa (STACK_TMP)(%rsp), X15;
+ vmovdqa X8, (STACK_TMP)(%rsp);
+ QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,
+ POLY1305_BLOCK_PART3(),
+ POLY1305_BLOCK_PART4(),
+ POLY1305_BLOCK_PART5(),
+ POLY1305_BLOCK_PART1(27 * 16))
+ QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,
+ POLY1305_BLOCK_PART2(),
+ POLY1305_BLOCK_PART3(),
+ POLY1305_BLOCK_PART4(),
+ POLY1305_BLOCK_PART5())
+ vmovdqa (STACK_TMP)(%rsp), X8;
+ vmovdqa X15, (STACK_TMP)(%rsp);
+ QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X15,
+ POLY1305_BLOCK_PART1(28 * 16),
+ POLY1305_BLOCK_PART2(),
+ POLY1305_BLOCK_PART3(),
+ POLY1305_BLOCK_PART4())
+
+ # rounds 18,19
+ QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X15,
+ POLY1305_BLOCK_PART5(),
+ POLY1305_BLOCK_PART1(29 * 16),
+ POLY1305_BLOCK_PART2(),
+ POLY1305_BLOCK_PART3())
+ vmovdqa (STACK_TMP)(%rsp), X15;
+ vmovdqa X8, (STACK_TMP)(%rsp);
+ QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,
+ POLY1305_BLOCK_PART4(),
+ POLY1305_BLOCK_PART5(),
+ POLY1305_BLOCK_PART1(30 * 16),
+ POLY1305_BLOCK_PART2())
+ QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,
+ POLY1305_BLOCK_PART3(),
+ POLY1305_BLOCK_PART4(),
+ POLY1305_BLOCK_PART5(),
+ POLY1305_BLOCK_PART1(31 * 16))
+ vmovdqa (STACK_TMP)(%rsp), X8;
+ vmovdqa X15, (STACK_TMP)(%rsp);
+ QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X15,
+ POLY1305_BLOCK_PART2(),
+ POLY1305_BLOCK_PART3(),
+ POLY1305_BLOCK_PART4(),
+ POLY1305_BLOCK_PART5())
+
+ /* tmp := X15 */
+ vpbroadcastd (0 * 4)(INPUT), X15;
+ PLUS(X0, X15);
+ vpbroadcastd (1 * 4)(INPUT), X15;
+ PLUS(X1, X15);
+ vpbroadcastd (2 * 4)(INPUT), X15;
+ PLUS(X2, X15);
+ vpbroadcastd (3 * 4)(INPUT), X15;
+ PLUS(X3, X15);
+ vpbroadcastd (4 * 4)(INPUT), X15;
+ PLUS(X4, X15);
+ vpbroadcastd (5 * 4)(INPUT), X15;
+ PLUS(X5, X15);
+ vpbroadcastd (6 * 4)(INPUT), X15;
+ PLUS(X6, X15);
+ vpbroadcastd (7 * 4)(INPUT), X15;
+ PLUS(X7, X15);
+ vpbroadcastd (8 * 4)(INPUT), X15;
+ PLUS(X8, X15);
+ vpbroadcastd (9 * 4)(INPUT), X15;
+ PLUS(X9, X15);
+ vpbroadcastd (10 * 4)(INPUT), X15;
+ PLUS(X10, X15);
+ vpbroadcastd (11 * 4)(INPUT), X15;
+ PLUS(X11, X15);
+ vmovdqa (STACK_VEC_X12)(%rsp), X15;
+ PLUS(X12, X15);
+ vmovdqa (STACK_VEC_X13)(%rsp), X15;
+ PLUS(X13, X15);
+ vmovdqa (STACK_TMP)(%rsp), X15;
+ vmovdqa X13, (STACK_TMP)(%rsp);
+ vpbroadcastd (14 * 4)(INPUT), X13;
+ PLUS(X14, X13);
+ vmovdqa X14, (STACK_TMP1)(%rsp);
+ vpbroadcastd (15 * 4)(INPUT), X13;
+ PLUS(X15, X13);
+ vmovdqa X15, (STACK_TMP2)(%rsp);
+
+ /* Update counter */
+ addq $8, (12 * 4)(INPUT);
+
+ movq (STACK_MAX + 5 * 8)(%rsp), SRC;
+ movq (STACK_MAX + 6 * 8)(%rsp), DST;
+
+ transpose_4x4(X0, X1, X2, X3, X13, X14);
+ transpose_4x4(X4, X5, X6, X7, X13, X14);
+ BUF_XOR_256_TO_128(DST, SRC, (64 * 0 + 16 * 0), (64 * 4 + 16 * 0), X0, X15);
+ BUF_XOR_256_TO_128(DST, SRC, (64 * 1 + 16 * 0), (64 * 5 + 16 * 0), X1, X15);
+ BUF_XOR_256_TO_128(DST, SRC, (64 * 2 + 16 * 0), (64 * 6 + 16 * 0), X2, X15);
+ BUF_XOR_256_TO_128(DST, SRC, (64 * 3 + 16 * 0), (64 * 7 + 16 * 0), X3, X15);
+ vmovdqa (STACK_TMP)(%rsp), X13;
+ vmovdqa (STACK_TMP1)(%rsp), X14;
+ vmovdqa (STACK_TMP2)(%rsp), X15;
+ transpose_4x4(X8, X9, X10, X11, X0, X1);
+ transpose_4x4(X12, X13, X14, X15, X0, X1);
+ BUF_XOR_256_TO_128(DST, SRC, (64 * 0 + 16 * 1), (64 * 4 + 16 * 1), X4, X0);
+ BUF_XOR_256_TO_128(DST, SRC, (64 * 1 + 16 * 1), (64 * 5 + 16 * 1), X5, X0);
+ BUF_XOR_256_TO_128(DST, SRC, (64 * 2 + 16 * 1), (64 * 6 + 16 * 1), X6, X0);
+ BUF_XOR_256_TO_128(DST, SRC, (64 * 3 + 16 * 1), (64 * 7 + 16 * 1), X7, X0);
+ BUF_XOR_256_TO_128(DST, SRC, (64 * 0 + 16 * 2), (64 * 4 + 16 * 2), X8, X0);
+ BUF_XOR_256_TO_128(DST, SRC, (64 * 1 + 16 * 2), (64 * 5 + 16 * 2), X9, X0);
+ BUF_XOR_256_TO_128(DST, SRC, (64 * 2 + 16 * 2), (64 * 6 + 16 * 2), X10, X0);
+ BUF_XOR_256_TO_128(DST, SRC, (64 * 3 + 16 * 2), (64 * 7 + 16 * 2), X11, X0);
+ BUF_XOR_256_TO_128(DST, SRC, (64 * 0 + 16 * 3), (64 * 4 + 16 * 3), X12, X0);
+ BUF_XOR_256_TO_128(DST, SRC, (64 * 1 + 16 * 3), (64 * 5 + 16 * 3), X13, X0);
+ BUF_XOR_256_TO_128(DST, SRC, (64 * 2 + 16 * 3), (64 * 6 + 16 * 3), X14, X0);
+ BUF_XOR_256_TO_128(DST, SRC, (64 * 3 + 16 * 3), (64 * 7 + 16 * 3), X15, X0);
+
+ subq $8, (STACK_MAX + 7 * 8)(%rsp); # NBLKS
+
+ lea (32 * 16)(POLY_RSRC), POLY_RSRC;
+ lea (8 * 64)(DST), DST;
+ lea (8 * 64)(SRC), SRC;
+ movq SRC, (STACK_MAX + 5 * 8)(%rsp);
+ movq DST, (STACK_MAX + 6 * 8)(%rsp);
+
+ jnz .Loop_poly8;
+
+ /* Store state */
+ POLY1305_STORE_STATE();
+
+ /* clear the used vector registers and stack */
+ vpxor X0, X0, X0;
+ vmovdqa X0, (STACK_VEC_X12)(%rsp);
+ vmovdqa X0, (STACK_VEC_X13)(%rsp);
+ vmovdqa X0, (STACK_TMP)(%rsp);
+ vmovdqa X0, (STACK_TMP1)(%rsp);
+ vmovdqa X0, (STACK_TMP2)(%rsp);
+ vzeroall;
+
+ movq (STACK_MAX + 0 * 8)(%rsp), %rbx;
+ movq (STACK_MAX + 1 * 8)(%rsp), %r12;
+ movq (STACK_MAX + 2 * 8)(%rsp), %r13;
+ movq (STACK_MAX + 3 * 8)(%rsp), %r14;
+ movq (STACK_MAX + 4 * 8)(%rsp), %r15;
+
+ xorl %eax, %eax;
+ leave;
+ ret;
+ELF(.size _gcry_chacha20_poly1305_amd64_avx2_blocks8,
+ .-_gcry_chacha20_poly1305_amd64_avx2_blocks8;)
+
#endif /*defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)*/
#endif /*__x86_64*/
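The interleaving in the 8-way function above balances exactly: one pass of
.Loop_poly8 produces 8 * 64 = 512 bytes of keystream and absorbs the same
512 bytes (32 blocks) of Poly1305 input:
  32 Poly1305 blocks * 5 POLY1305_BLOCK_PART macros        = 160
  10 double-rounds * 4 QUARTERROUND2 * 4 interleave slots  = 160
The 4-way and 1-way SSSE3 variants below balance the same way (80 macros in
80 slots and 20 macros in 20 slots, respectively).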
diff --git a/cipher/chacha20-amd64-ssse3.S b/cipher/chacha20-amd64-ssse3.S
index 0e59ff981..d7faf6442 100644
--- a/cipher/chacha20-amd64-ssse3.S
+++ b/cipher/chacha20-amd64-ssse3.S
@@ -1,6 +1,6 @@
/* chacha20-amd64-ssse3.S - SSSE3 implementation of ChaCha20 cipher
*
- * Copyright (C) 2017,2018 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ * Copyright (C) 2017-2019 Jussi Kivilinna <jussi.kivilinna at iki.fi>
*
* This file is part of Libgcrypt.
*
@@ -35,17 +35,8 @@
.text
-#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
-# define ELF(...) __VA_ARGS__
-#else
-# define ELF(...) /*_*/
-#endif
-
-#ifdef __PIC__
-# define RIP (%rip)
-#else
-# define RIP
-#endif
+#include "asm-common-amd64.h"
+#include "asm-poly1305-amd64.h"
/* register macros */
#define INPUT %rdi
@@ -145,13 +136,16 @@
#define PLUS(ds,s) \
paddd s, ds;
-#define QUARTERROUND2(a1,b1,c1,d1,a2,b2,c2,d2,ign,tmp1,tmp2) \
- movdqa .Lshuf_rol16 RIP, tmp1; \
+#define QUARTERROUND2(a1,b1,c1,d1,a2,b2,c2,d2,ign,tmp1,tmp2,\
+ interleave_op1,interleave_op2) \
+ movdqa .Lshuf_rol16 rRIP, tmp1; \
+ interleave_op1; \
PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2); \
ROTATE_SHUF_2(d1, d2, tmp1); \
PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \
ROTATE2(b1, b2, 12, tmp1, tmp2); \
- movdqa .Lshuf_rol8 RIP, tmp1; \
+ movdqa .Lshuf_rol8 rRIP, tmp1; \
+ interleave_op2; \
PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2); \
ROTATE_SHUF_2(d1, d2, tmp1); \
PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \
@@ -192,8 +186,8 @@ _gcry_chacha20_amd64_ssse3_blocks4:
mov $20, ROUND;
/* Construct counter vectors X12 and X13 */
- movdqa .Linc_counter RIP, X0;
- movdqa .Lunsigned_cmp RIP, X2;
+ movdqa .Linc_counter rRIP, X0;
+ movdqa .Lunsigned_cmp rRIP, X2;
pbroadcastd((12 * 4)(INPUT), X12);
pbroadcastd((13 * 4)(INPUT), X13);
paddd X0, X12;
@@ -224,18 +218,18 @@ _gcry_chacha20_amd64_ssse3_blocks4:
movdqa X15, (STACK_TMP1)(%rsp);
.Lround2_4:
- QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15)
+ QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15,,)
movdqa (STACK_TMP)(%rsp), X11;
movdqa (STACK_TMP1)(%rsp), X15;
movdqa X8, (STACK_TMP)(%rsp);
movdqa X9, (STACK_TMP1)(%rsp);
- QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9)
- QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9)
+ QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9,,)
+ QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9,,)
movdqa (STACK_TMP)(%rsp), X8;
movdqa (STACK_TMP1)(%rsp), X9;
movdqa X11, (STACK_TMP)(%rsp);
movdqa X15, (STACK_TMP1)(%rsp);
- QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15)
+ QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15,,)
sub $2, ROUND;
jnz .Lround2_4;
@@ -380,9 +374,9 @@ _gcry_chacha20_amd64_ssse3_blocks1:
*/
/* Load constants */
- movdqa .Lcounter1 RIP, X4;
- movdqa .Lshuf_rol8 RIP, X5;
- movdqa .Lshuf_rol16 RIP, X6;
+ movdqa .Lcounter1 rRIP, X4;
+ movdqa .Lshuf_rol8 rRIP, X5;
+ movdqa .Lshuf_rol16 rRIP, X6;
/* Load state */
movdqu (0 * 4)(INPUT), X10;
@@ -445,5 +439,570 @@ _gcry_chacha20_amd64_ssse3_blocks1:
ELF(.size _gcry_chacha20_amd64_ssse3_blocks1,
.-_gcry_chacha20_amd64_ssse3_blocks1;)
+/**********************************************************************
+ 4-way stitched chacha20-poly1305
+ **********************************************************************/
+
+.align 8
+.globl _gcry_chacha20_poly1305_amd64_ssse3_blocks4
+ELF(.type _gcry_chacha20_poly1305_amd64_ssse3_blocks4,@function;)
+
+_gcry_chacha20_poly1305_amd64_ssse3_blocks4:
+ /* input:
+ * %rdi: input
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: nblks (multiple of 4)
+ * %r8: poly1305-state
+ * %r9: poly1305-src
+ */
+
+ pushq %rbp;
+ movq %rsp, %rbp;
+
+ subq $(8 * 8) + STACK_MAX + 16, %rsp;
+ andq $~15, %rsp;
+
+ movq %rbx, (STACK_MAX + 0 * 8)(%rsp);
+ movq %r12, (STACK_MAX + 1 * 8)(%rsp);
+ movq %r13, (STACK_MAX + 2 * 8)(%rsp);
+ movq %r14, (STACK_MAX + 3 * 8)(%rsp);
+ movq %r15, (STACK_MAX + 4 * 8)(%rsp);
+
+ movq %rdx, (STACK_MAX + 5 * 8)(%rsp); # SRC
+ movq %rsi, (STACK_MAX + 6 * 8)(%rsp); # DST
+ movq %rcx, (STACK_MAX + 7 * 8)(%rsp); # NBLKS
+
+ /* Load state */
+ POLY1305_LOAD_STATE();
+
+.Loop_poly4:
+
+ /* Construct counter vectors X12 and X13 */
+ movdqa .Linc_counter rRIP, X0;
+ movdqa .Lunsigned_cmp rRIP, X2;
+ pbroadcastd((12 * 4)(INPUT), X12);
+ pbroadcastd((13 * 4)(INPUT), X13);
+ paddd X0, X12;
+ movdqa X12, X1;
+ pxor X2, X0;
+ pxor X2, X1;
+ pcmpgtd X1, X0;
+ psubd X0, X13;
+ movdqa X12, (STACK_VEC_X12)(%rsp);
+ movdqa X13, (STACK_VEC_X13)(%rsp);
+
+ /* Load vectors */
+ pbroadcastd((0 * 4)(INPUT), X0);
+ pbroadcastd((1 * 4)(INPUT), X1);
+ pbroadcastd((2 * 4)(INPUT), X2);
+ pbroadcastd((3 * 4)(INPUT), X3);
+ pbroadcastd((4 * 4)(INPUT), X4);
+ pbroadcastd((5 * 4)(INPUT), X5);
+ pbroadcastd((6 * 4)(INPUT), X6);
+ pbroadcastd((7 * 4)(INPUT), X7);
+ pbroadcastd((8 * 4)(INPUT), X8);
+ pbroadcastd((9 * 4)(INPUT), X9);
+ pbroadcastd((10 * 4)(INPUT), X10);
+ pbroadcastd((11 * 4)(INPUT), X11);
+ pbroadcastd((14 * 4)(INPUT), X14);
+ pbroadcastd((15 * 4)(INPUT), X15);
+ movdqa X11, (STACK_TMP)(%rsp);
+ movdqa X15, (STACK_TMP1)(%rsp);
+
+ /* rounds 0,1 */
+ QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15,
+ POLY1305_BLOCK_PART1(0 * 16),
+ POLY1305_BLOCK_PART2())
+ movdqa (STACK_TMP)(%rsp), X11;
+ movdqa (STACK_TMP1)(%rsp), X15;
+ movdqa X8, (STACK_TMP)(%rsp);
+ movdqa X9, (STACK_TMP1)(%rsp);
+ QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9,
+ POLY1305_BLOCK_PART3(),
+ POLY1305_BLOCK_PART4())
+ QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9,
+ POLY1305_BLOCK_PART5(),
+ POLY1305_BLOCK_PART1(1 * 16))
+ movdqa (STACK_TMP)(%rsp), X8;
+ movdqa (STACK_TMP1)(%rsp), X9;
+ movdqa X11, (STACK_TMP)(%rsp);
+ movdqa X15, (STACK_TMP1)(%rsp);
+ QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15,
+ POLY1305_BLOCK_PART2(),
+ POLY1305_BLOCK_PART3())
+
+ /* rounds 2,3 */
+ QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15,
+ POLY1305_BLOCK_PART4(),
+ POLY1305_BLOCK_PART5())
+ movdqa (STACK_TMP)(%rsp), X11;
+ movdqa (STACK_TMP1)(%rsp), X15;
+ movdqa X8, (STACK_TMP)(%rsp);
+ movdqa X9, (STACK_TMP1)(%rsp);
+ QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9,
+ POLY1305_BLOCK_PART1(2 * 16),
+ POLY1305_BLOCK_PART2())
+ QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9,
+ POLY1305_BLOCK_PART3(),
+ POLY1305_BLOCK_PART4())
+ movdqa (STACK_TMP)(%rsp), X8;
+ movdqa (STACK_TMP1)(%rsp), X9;
+ movdqa X11, (STACK_TMP)(%rsp);
+ movdqa X15, (STACK_TMP1)(%rsp);
+ QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15,
+ POLY1305_BLOCK_PART5(),
+ POLY1305_BLOCK_PART1(3 * 16))
+
+ /* rounds 4,5 */
+ QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15,
+ POLY1305_BLOCK_PART2(),
+ POLY1305_BLOCK_PART3())
+ movdqa (STACK_TMP)(%rsp), X11;
+ movdqa (STACK_TMP1)(%rsp), X15;
+ movdqa X8, (STACK_TMP)(%rsp);
+ movdqa X9, (STACK_TMP1)(%rsp);
+ QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9,
+ POLY1305_BLOCK_PART4(),
+ POLY1305_BLOCK_PART5())
+ QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9,
+ POLY1305_BLOCK_PART1(4 * 16),
+ POLY1305_BLOCK_PART2())
+ movdqa (STACK_TMP)(%rsp), X8;
+ movdqa (STACK_TMP1)(%rsp), X9;
+ movdqa X11, (STACK_TMP)(%rsp);
+ movdqa X15, (STACK_TMP1)(%rsp);
+ QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15,
+ POLY1305_BLOCK_PART3(),
+ POLY1305_BLOCK_PART4())
+
+ /* rounds 6,7 */
+ QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15,
+ POLY1305_BLOCK_PART5(),
+ POLY1305_BLOCK_PART1(5 * 16))
+ movdqa (STACK_TMP)(%rsp), X11;
+ movdqa (STACK_TMP1)(%rsp), X15;
+ movdqa X8, (STACK_TMP)(%rsp);
+ movdqa X9, (STACK_TMP1)(%rsp);
+ QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9,
+ POLY1305_BLOCK_PART2(),
+ POLY1305_BLOCK_PART3())
+ QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9,
+ POLY1305_BLOCK_PART4(),
+ POLY1305_BLOCK_PART5())
+ movdqa (STACK_TMP)(%rsp), X8;
+ movdqa (STACK_TMP1)(%rsp), X9;
+ movdqa X11, (STACK_TMP)(%rsp);
+ movdqa X15, (STACK_TMP1)(%rsp);
+ QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15,
+ POLY1305_BLOCK_PART1(6 * 16),
+ POLY1305_BLOCK_PART2())
+
+ /* rounds 8,9 */
+ QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15,
+ POLY1305_BLOCK_PART3(),
+ POLY1305_BLOCK_PART4())
+ movdqa (STACK_TMP)(%rsp), X11;
+ movdqa (STACK_TMP1)(%rsp), X15;
+ movdqa X8, (STACK_TMP)(%rsp);
+ movdqa X9, (STACK_TMP1)(%rsp);
+ QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9,
+ POLY1305_BLOCK_PART5(),
+ POLY1305_BLOCK_PART1(7 * 16))
+ QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9,
+ POLY1305_BLOCK_PART2(),
+ POLY1305_BLOCK_PART3())
+ movdqa (STACK_TMP)(%rsp), X8;
+ movdqa (STACK_TMP1)(%rsp), X9;
+ movdqa X11, (STACK_TMP)(%rsp);
+ movdqa X15, (STACK_TMP1)(%rsp);
+ QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15,
+ POLY1305_BLOCK_PART4(),
+ POLY1305_BLOCK_PART5())
+
+ /* rounds 10,11 */
+ QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15,
+ POLY1305_BLOCK_PART1(8 * 16),
+ POLY1305_BLOCK_PART2())
+ movdqa (STACK_TMP)(%rsp), X11;
+ movdqa (STACK_TMP1)(%rsp), X15;
+ movdqa X8, (STACK_TMP)(%rsp);
+ movdqa X9, (STACK_TMP1)(%rsp);
+ QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9,
+ POLY1305_BLOCK_PART3(),
+ POLY1305_BLOCK_PART4())
+ QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9,
+ POLY1305_BLOCK_PART5(),
+ POLY1305_BLOCK_PART1(9 * 16))
+ movdqa (STACK_TMP)(%rsp), X8;
+ movdqa (STACK_TMP1)(%rsp), X9;
+ movdqa X11, (STACK_TMP)(%rsp);
+ movdqa X15, (STACK_TMP1)(%rsp);
+ QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15,
+ POLY1305_BLOCK_PART2(),
+ POLY1305_BLOCK_PART3())
+
+ /* rounds 12,13 */
+ QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15,
+ POLY1305_BLOCK_PART4(),
+ POLY1305_BLOCK_PART5())
+ movdqa (STACK_TMP)(%rsp), X11;
+ movdqa (STACK_TMP1)(%rsp), X15;
+ movdqa X8, (STACK_TMP)(%rsp);
+ movdqa X9, (STACK_TMP1)(%rsp);
+ QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9,
+ POLY1305_BLOCK_PART1(10 * 16),
+ POLY1305_BLOCK_PART2())
+ QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9,
+ POLY1305_BLOCK_PART3(),
+ POLY1305_BLOCK_PART4())
+ movdqa (STACK_TMP)(%rsp), X8;
+ movdqa (STACK_TMP1)(%rsp), X9;
+ movdqa X11, (STACK_TMP)(%rsp);
+ movdqa X15, (STACK_TMP1)(%rsp);
+ QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15,
+ POLY1305_BLOCK_PART5(),
+ POLY1305_BLOCK_PART1(11 * 16))
+
+ /* rounds 14,15 */
+ QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15,
+ POLY1305_BLOCK_PART2(),
+ POLY1305_BLOCK_PART3())
+ movdqa (STACK_TMP)(%rsp), X11;
+ movdqa (STACK_TMP1)(%rsp), X15;
+ movdqa X8, (STACK_TMP)(%rsp);
+ movdqa X9, (STACK_TMP1)(%rsp);
+ QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9,
+ POLY1305_BLOCK_PART4(),
+ POLY1305_BLOCK_PART5())
+ QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9,
+ POLY1305_BLOCK_PART1(12 * 16),
+ POLY1305_BLOCK_PART2())
+ movdqa (STACK_TMP)(%rsp), X8;
+ movdqa (STACK_TMP1)(%rsp), X9;
+ movdqa X11, (STACK_TMP)(%rsp);
+ movdqa X15, (STACK_TMP1)(%rsp);
+ QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15,
+ POLY1305_BLOCK_PART3(),
+ POLY1305_BLOCK_PART4())
+
+ /* rounds 16,17 */
+ QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15,
+ POLY1305_BLOCK_PART5(),
+ POLY1305_BLOCK_PART1(13 * 16))
+ movdqa (STACK_TMP)(%rsp), X11;
+ movdqa (STACK_TMP1)(%rsp), X15;
+ movdqa X8, (STACK_TMP)(%rsp);
+ movdqa X9, (STACK_TMP1)(%rsp);
+ QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9,
+ POLY1305_BLOCK_PART2(),
+ POLY1305_BLOCK_PART3())
+ QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9,
+ POLY1305_BLOCK_PART4(),
+ POLY1305_BLOCK_PART5())
+ movdqa (STACK_TMP)(%rsp), X8;
+ movdqa (STACK_TMP1)(%rsp), X9;
+ movdqa X11, (STACK_TMP)(%rsp);
+ movdqa X15, (STACK_TMP1)(%rsp);
+ QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15,
+ POLY1305_BLOCK_PART1(14 * 16),
+ POLY1305_BLOCK_PART2())
+
+ /* rounds 18,19 */
+ QUARTERROUND2(X0, X4, X8, X12, X1, X5, X9, X13, tmp:=,X11,X15,
+ POLY1305_BLOCK_PART3(),
+ POLY1305_BLOCK_PART4())
+ movdqa (STACK_TMP)(%rsp), X11;
+ movdqa (STACK_TMP1)(%rsp), X15;
+ movdqa X8, (STACK_TMP)(%rsp);
+ movdqa X9, (STACK_TMP1)(%rsp);
+ QUARTERROUND2(X2, X6, X10, X14, X3, X7, X11, X15, tmp:=,X8,X9,
+ POLY1305_BLOCK_PART5(),
+ POLY1305_BLOCK_PART1(15 * 16))
+ QUARTERROUND2(X0, X5, X10, X15, X1, X6, X11, X12, tmp:=,X8,X9,
+ POLY1305_BLOCK_PART2(),
+ POLY1305_BLOCK_PART3())
+ movdqa (STACK_TMP)(%rsp), X8;
+ movdqa (STACK_TMP1)(%rsp), X9;
+ movdqa X11, (STACK_TMP)(%rsp);
+ movdqa X15, (STACK_TMP1)(%rsp);
+ QUARTERROUND2(X2, X7, X8, X13, X3, X4, X9, X14, tmp:=,X11,X15,
+ POLY1305_BLOCK_PART4(),
+ POLY1305_BLOCK_PART5())
+
+ /* tmp := X15 */
+ movdqa (STACK_TMP)(%rsp), X11;
+ pbroadcastd((0 * 4)(INPUT), X15);
+ PLUS(X0, X15);
+ pbroadcastd((1 * 4)(INPUT), X15);
+ PLUS(X1, X15);
+ pbroadcastd((2 * 4)(INPUT), X15);
+ PLUS(X2, X15);
+ pbroadcastd((3 * 4)(INPUT), X15);
+ PLUS(X3, X15);
+ pbroadcastd((4 * 4)(INPUT), X15);
+ PLUS(X4, X15);
+ pbroadcastd((5 * 4)(INPUT), X15);
+ PLUS(X5, X15);
+ pbroadcastd((6 * 4)(INPUT), X15);
+ PLUS(X6, X15);
+ pbroadcastd((7 * 4)(INPUT), X15);
+ PLUS(X7, X15);
+ pbroadcastd((8 * 4)(INPUT), X15);
+ PLUS(X8, X15);
+ pbroadcastd((9 * 4)(INPUT), X15);
+ PLUS(X9, X15);
+ pbroadcastd((10 * 4)(INPUT), X15);
+ PLUS(X10, X15);
+ pbroadcastd((11 * 4)(INPUT), X15);
+ PLUS(X11, X15);
+ movdqa (STACK_VEC_X12)(%rsp), X15;
+ PLUS(X12, X15);
+ movdqa (STACK_VEC_X13)(%rsp), X15;
+ PLUS(X13, X15);
+ movdqa X13, (STACK_TMP)(%rsp);
+ pbroadcastd((14 * 4)(INPUT), X15);
+ PLUS(X14, X15);
+ movdqa (STACK_TMP1)(%rsp), X15;
+ movdqa X14, (STACK_TMP1)(%rsp);
+ pbroadcastd((15 * 4)(INPUT), X13);
+ PLUS(X15, X13);
+ movdqa X15, (STACK_TMP2)(%rsp);
+
+ /* Update counter */
+ addq $4, (12 * 4)(INPUT);
+
+ movq (STACK_MAX + 5 * 8)(%rsp), SRC;
+ movq (STACK_MAX + 6 * 8)(%rsp), DST;
+
+ transpose_4x4(X0, X1, X2, X3, X13, X14, X15);
+ xor_src_dst(DST, SRC, (64 * 0 + 16 * 0), X0, X15);
+ xor_src_dst(DST, SRC, (64 * 1 + 16 * 0), X1, X15);
+ xor_src_dst(DST, SRC, (64 * 2 + 16 * 0), X2, X15);
+ xor_src_dst(DST, SRC, (64 * 3 + 16 * 0), X3, X15);
+ transpose_4x4(X4, X5, X6, X7, X0, X1, X2);
+ movdqa (STACK_TMP)(%rsp), X13;
+ movdqa (STACK_TMP1)(%rsp), X14;
+ movdqa (STACK_TMP2)(%rsp), X15;
+ xor_src_dst(DST, SRC, (64 * 0 + 16 * 1), X4, X0);
+ xor_src_dst(DST, SRC, (64 * 1 + 16 * 1), X5, X0);
+ xor_src_dst(DST, SRC, (64 * 2 + 16 * 1), X6, X0);
+ xor_src_dst(DST, SRC, (64 * 3 + 16 * 1), X7, X0);
+ transpose_4x4(X8, X9, X10, X11, X0, X1, X2);
+ xor_src_dst(DST, SRC, (64 * 0 + 16 * 2), X8, X0);
+ xor_src_dst(DST, SRC, (64 * 1 + 16 * 2), X9, X0);
+ xor_src_dst(DST, SRC, (64 * 2 + 16 * 2), X10, X0);
+ xor_src_dst(DST, SRC, (64 * 3 + 16 * 2), X11, X0);
+ transpose_4x4(X12, X13, X14, X15, X0, X1, X2);
+ xor_src_dst(DST, SRC, (64 * 0 + 16 * 3), X12, X0);
+ xor_src_dst(DST, SRC, (64 * 1 + 16 * 3), X13, X0);
+ xor_src_dst(DST, SRC, (64 * 2 + 16 * 3), X14, X0);
+ xor_src_dst(DST, SRC, (64 * 3 + 16 * 3), X15, X0);
+
+ subq $4, (STACK_MAX + 7 * 8)(%rsp); # NBLKS
+
+ lea (16 * 16)(POLY_RSRC), POLY_RSRC;
+ lea (4 * 64)(DST), DST;
+ lea (4 * 64)(SRC), SRC;
+ movq SRC, (STACK_MAX + 5 * 8)(%rsp);
+ movq DST, (STACK_MAX + 6 * 8)(%rsp);
+
+ jnz .Loop_poly4;
+
+ /* Store state */
+ POLY1305_STORE_STATE();
+
+ /* clear the used vector registers and stack */
+ clear(X0);
+ movdqa X0, (STACK_VEC_X12)(%rsp);
+ movdqa X0, (STACK_VEC_X13)(%rsp);
+ movdqa X0, (STACK_TMP)(%rsp);
+ movdqa X0, (STACK_TMP1)(%rsp);
+ movdqa X0, (STACK_TMP2)(%rsp);
+ clear(X1);
+ clear(X2);
+ clear(X3);
+ clear(X4);
+ clear(X5);
+ clear(X6);
+ clear(X7);
+ clear(X8);
+ clear(X9);
+ clear(X10);
+ clear(X11);
+ clear(X12);
+ clear(X13);
+ clear(X14);
+ clear(X15);
+
+ movq (STACK_MAX + 0 * 8)(%rsp), %rbx;
+ movq (STACK_MAX + 1 * 8)(%rsp), %r12;
+ movq (STACK_MAX + 2 * 8)(%rsp), %r13;
+ movq (STACK_MAX + 3 * 8)(%rsp), %r14;
+ movq (STACK_MAX + 4 * 8)(%rsp), %r15;
+
+ xorl %eax, %eax;
+ leave;
+ ret;
+ELF(.size _gcry_chacha20_poly1305_amd64_ssse3_blocks4,
+ .-_gcry_chacha20_poly1305_amd64_ssse3_blocks4;)
+
+/**********************************************************************
+ 1-way stitched chacha20-poly1305
+ **********************************************************************/
+
+.align 8
+.globl _gcry_chacha20_poly1305_amd64_ssse3_blocks1
+ELF(.type _gcry_chacha20_poly1305_amd64_ssse3_blocks1,@function;)
+
+_gcry_chacha20_poly1305_amd64_ssse3_blocks1:
+ /* input:
+ * %rdi: chacha20-state
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: nblks
+ * %r8: poly1305-state
+ * %r9: poly1305-src
+ */
+ pushq %rbp;
+ movq %rsp, %rbp;
+
+ subq $(8 * 8), %rsp;
+ movq %rbx, (0 * 8)(%rsp);
+ movq %r12, (1 * 8)(%rsp);
+ movq %r13, (2 * 8)(%rsp);
+ movq %r14, (3 * 8)(%rsp);
+ movq %r15, (4 * 8)(%rsp);
+
+ movq %rdx, (5 * 8)(%rsp); # SRC
+ movq %rsi, (6 * 8)(%rsp); # DST
+ movq %rcx, (7 * 8)(%rsp); # NBLKS
+
+ /* Load constants */
+ movdqa .Lcounter1 rRIP, X4;
+ movdqa .Lshuf_rol8 rRIP, X5;
+ movdqa .Lshuf_rol16 rRIP, X6;
+
+ /* Load state */
+ movdqu (0 * 4)(INPUT), X10;
+ movdqu (4 * 4)(INPUT), X11;
+ movdqu (8 * 4)(INPUT), X12;
+ movdqu (12 * 4)(INPUT), X13;
+
+ POLY1305_LOAD_STATE();
+
+.Loop_poly1:
+ movdqa X10, X0;
+ movdqa X11, X1;
+ movdqa X12, X2;
+ movdqa X13, X3;
+
+ /* Process one ChaCha20 block and four Poly1305 blocks. */
+ POLY1305_BLOCK_PART1(0 * 16);
+ QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93);
+ POLY1305_BLOCK_PART2();
+ QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39);
+
+ POLY1305_BLOCK_PART3();
+ QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93);
+ POLY1305_BLOCK_PART4();
+ QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39);
+
+ POLY1305_BLOCK_PART5();
+ QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93);
+ POLY1305_BLOCK_PART1(1 * 16);
+ QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39);
+
+ POLY1305_BLOCK_PART2();
+ QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93);
+ POLY1305_BLOCK_PART3();
+ QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39);
+
+ POLY1305_BLOCK_PART4();
+ QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93);
+ POLY1305_BLOCK_PART5();
+ QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39);
+
+ POLY1305_BLOCK_PART1(2 * 16);
+ QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93);
+ POLY1305_BLOCK_PART2();
+ QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39);
+
+ POLY1305_BLOCK_PART3();
+ QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93);
+ POLY1305_BLOCK_PART4();
+ QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39);
+
+ POLY1305_BLOCK_PART5();
+ QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93);
+ POLY1305_BLOCK_PART1(3 * 16);
+ QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39);
+
+ POLY1305_BLOCK_PART2();
+ QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93);
+ POLY1305_BLOCK_PART3();
+ QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39);
+
+ POLY1305_BLOCK_PART4();
+ QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x39, 0x4e, 0x93);
+ POLY1305_BLOCK_PART5();
+ QUARTERROUND4(X0, X1, X2, X3, X5, X6, X7, 0x93, 0x4e, 0x39);
+
+ movq (5 * 8)(%rsp), SRC;
+ movq (6 * 8)(%rsp), DST;
+
+ PLUS(X0, X10);
+ PLUS(X1, X11);
+ PLUS(X2, X12);
+ PLUS(X3, X13);
+
+ /* Update counter */
+ paddq X4, X13;
+
+ xor_src_dst(DST, SRC, 0 * 4, X0, X7);
+ xor_src_dst(DST, SRC, 4 * 4, X1, X7);
+ xor_src_dst(DST, SRC, 8 * 4, X2, X7);
+ xor_src_dst(DST, SRC, 12 * 4, X3, X7);
+
+ subq $1, (7 * 8)(%rsp); # NBLKS
+ lea (64)(POLY_RSRC), POLY_RSRC;
+ lea (64)(SRC), SRC;
+ lea (64)(DST), DST;
+ movq SRC, (5 * 8)(%rsp);
+ movq DST, (6 * 8)(%rsp);
+
+ jnz .Loop_poly1;
+
+ /* Store state */
+ POLY1305_STORE_STATE();
+
+ movdqu X13, (12 * 4)(INPUT);
+
+ /* clear the used vector registers */
+ clear(X0);
+ clear(X1);
+ clear(X2);
+ clear(X3);
+ clear(X4);
+ clear(X5);
+ clear(X6);
+ clear(X7);
+ clear(X10);
+ clear(X11);
+ clear(X12);
+ clear(X13);
+
+ movq (0 * 8)(%rsp), %rbx;
+ movq (1 * 8)(%rsp), %r12;
+ movq (2 * 8)(%rsp), %r13;
+ movq (3 * 8)(%rsp), %r14;
+ movq (4 * 8)(%rsp), %r15;
+
+ xorl %eax, %eax;
+ leave;
+ ret;
+ELF(.size _gcry_chacha20_poly1305_amd64_ssse3_blocks1,
+ .-_gcry_chacha20_poly1305_amd64_ssse3_blocks1;)
+
#endif /*defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)*/
#endif /*__x86_64*/
diff --git a/cipher/chacha20.c b/cipher/chacha20.c
index f1afd18e0..0847c20ea 100644
--- a/cipher/chacha20.c
+++ b/cipher/chacha20.c
@@ -1,5 +1,5 @@
/* chacha20.c - Bernstein's ChaCha20 cipher
- * Copyright (C) 2014,2017,2018 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ * Copyright (C) 2014,2017-2019 Jussi Kivilinna <jussi.kivilinna at iki.fi>
*
* This file is part of Libgcrypt.
*
@@ -36,6 +36,7 @@
#include "types.h"
#include "g10lib.h"
#include "cipher.h"
+#include "cipher-internal.h"
#include "bufhelp.h"
@@ -116,6 +117,14 @@ unsigned int _gcry_chacha20_amd64_ssse3_blocks1(u32 *state, byte *dst,
const byte *src,
size_t nblks) ASM_FUNC_ABI;
+unsigned int _gcry_chacha20_poly1305_amd64_ssse3_blocks4(
+ u32 *state, byte *dst, const byte *src, size_t nblks,
+ void *poly1305_state, const byte *poly1305_src) ASM_FUNC_ABI;
+
+unsigned int _gcry_chacha20_poly1305_amd64_ssse3_blocks1(
+ u32 *state, byte *dst, const byte *src, size_t nblks,
+ void *poly1305_state, const byte *poly1305_src) ASM_FUNC_ABI;
+
#endif /* USE_SSSE3 */
#ifdef USE_AVX2
@@ -124,6 +133,10 @@ unsigned int _gcry_chacha20_amd64_avx2_blocks8(u32 *state, byte *dst,
const byte *src,
size_t nblks) ASM_FUNC_ABI;
+unsigned int _gcry_chacha20_poly1305_amd64_avx2_blocks8(
+ u32 *state, byte *dst, const byte *src, size_t nblks,
+ void *poly1305_state, const byte *poly1305_src) ASM_FUNC_ABI;
+
#endif /* USE_AVX2 */
#ifdef USE_ARMV7_NEON
@@ -402,39 +415,13 @@ chacha20_setkey (void *context, const byte *key, unsigned int keylen,
}
-static void
-chacha20_encrypt_stream (void *context, byte *outbuf, const byte *inbuf,
- size_t length)
+static unsigned int
+do_chacha20_encrypt_stream_tail (CHACHA20_context_t *ctx, byte *outbuf,
+ const byte *inbuf, size_t length)
{
static const unsigned char zero_pad[CHACHA20_BLOCK_SIZE] = { 0, };
- CHACHA20_context_t *ctx = (CHACHA20_context_t *) context;
unsigned int nburn, burn = 0;
- if (!length)
- return;
-
- if (ctx->unused)
- {
- unsigned char *p = ctx->pad;
- size_t n;
-
- gcry_assert (ctx->unused < CHACHA20_BLOCK_SIZE);
-
- n = ctx->unused;
- if (n > length)
- n = length;
-
- buf_xor (outbuf, inbuf, p + CHACHA20_BLOCK_SIZE - ctx->unused, n);
- length -= n;
- outbuf += n;
- inbuf += n;
- ctx->unused -= n;
-
- if (!length)
- return;
- gcry_assert (!ctx->unused);
- }
-
#ifdef USE_AVX2
if (ctx->use_avx2 && length >= CHACHA20_BLOCK_SIZE * 8)
{
@@ -510,7 +497,349 @@ chacha20_encrypt_stream (void *context, byte *outbuf, const byte *inbuf,
ctx->unused = CHACHA20_BLOCK_SIZE - length;
}
- _gcry_burn_stack (burn);
+ if (burn)
+ burn += 5 * sizeof(void *);
+
+ return burn;
+}
+
+
+static void
+chacha20_encrypt_stream (void *context, byte *outbuf, const byte *inbuf,
+ size_t length)
+{
+ CHACHA20_context_t *ctx = (CHACHA20_context_t *) context;
+ unsigned int nburn, burn = 0;
+
+ if (!length)
+ return;
+
+ if (ctx->unused)
+ {
+ unsigned char *p = ctx->pad;
+ size_t n;
+
+ gcry_assert (ctx->unused < CHACHA20_BLOCK_SIZE);
+
+ n = ctx->unused;
+ if (n > length)
+ n = length;
+
+ buf_xor (outbuf, inbuf, p + CHACHA20_BLOCK_SIZE - ctx->unused, n);
+ length -= n;
+ outbuf += n;
+ inbuf += n;
+ ctx->unused -= n;
+
+ if (!length)
+ return;
+ gcry_assert (!ctx->unused);
+ }
+
+ nburn = do_chacha20_encrypt_stream_tail (ctx, outbuf, inbuf, length);
+ burn = nburn > burn ? nburn : burn;
+
+ if (burn)
+ _gcry_burn_stack (burn);
+}
+
+
+gcry_err_code_t
+_gcry_chacha20_poly1305_encrypt(gcry_cipher_hd_t c, byte *outbuf,
+ const byte *inbuf, size_t length)
+{
+ CHACHA20_context_t *ctx = (CHACHA20_context_t *) &c->context.c;
+ unsigned int nburn, burn = 0;
+ byte *authptr = NULL;
+
+ if (!length)
+ return 0;
+
+ if (ctx->unused)
+ {
+ unsigned char *p = ctx->pad;
+ size_t n;
+
+ gcry_assert (ctx->unused < CHACHA20_BLOCK_SIZE);
+
+ n = ctx->unused;
+ if (n > length)
+ n = length;
+
+ buf_xor (outbuf, inbuf, p + CHACHA20_BLOCK_SIZE - ctx->unused, n);
+ nburn = _gcry_poly1305_update_burn (&c->u_mode.poly1305.ctx, outbuf, n);
+ burn = nburn > burn ? nburn : burn;
+ length -= n;
+ outbuf += n;
+ inbuf += n;
+ ctx->unused -= n;
+
+ if (!length)
+ {
+ if (burn)
+ _gcry_burn_stack (burn);
+
+ return 0;
+ }
+ gcry_assert (!ctx->unused);
+ }
+
+ gcry_assert (c->u_mode.poly1305.ctx.leftover == 0);
+
+ if (0)
+ { }
+#ifdef USE_AVX2
+ else if (ctx->use_avx2 && length >= CHACHA20_BLOCK_SIZE * 8)
+ {
+ nburn = _gcry_chacha20_amd64_avx2_blocks8(ctx->input, outbuf, inbuf, 8);
+ burn = nburn > burn ? nburn : burn;
+
+ authptr = outbuf;
+ length -= 8 * CHACHA20_BLOCK_SIZE;
+ outbuf += 8 * CHACHA20_BLOCK_SIZE;
+ inbuf += 8 * CHACHA20_BLOCK_SIZE;
+ }
+#endif
+#ifdef USE_SSSE3
+ else if (ctx->use_ssse3 && length >= CHACHA20_BLOCK_SIZE * 4)
+ {
+ nburn = _gcry_chacha20_amd64_ssse3_blocks4(ctx->input, outbuf, inbuf, 4);
+ burn = nburn > burn ? nburn : burn;
+
+ authptr = outbuf;
+ length -= 4 * CHACHA20_BLOCK_SIZE;
+ outbuf += 4 * CHACHA20_BLOCK_SIZE;
+ inbuf += 4 * CHACHA20_BLOCK_SIZE;
+ }
+ else if (ctx->use_ssse3 && length >= CHACHA20_BLOCK_SIZE)
+ {
+ nburn = _gcry_chacha20_amd64_ssse3_blocks1(ctx->input, outbuf, inbuf, 1);
+ burn = nburn > burn ? nburn : burn;
+
+ authptr = outbuf;
+ length -= 1 * CHACHA20_BLOCK_SIZE;
+ outbuf += 1 * CHACHA20_BLOCK_SIZE;
+ inbuf += 1 * CHACHA20_BLOCK_SIZE;
+ }
+#endif
+
+ if (authptr)
+ {
+ size_t authoffset = outbuf - authptr;
+
+#ifdef USE_AVX2
+ if (ctx->use_avx2 &&
+ length >= 8 * CHACHA20_BLOCK_SIZE &&
+ authoffset >= 8 * CHACHA20_BLOCK_SIZE)
+ {
+ size_t nblocks = length / CHACHA20_BLOCK_SIZE;
+ nblocks -= nblocks % 8;
+
+ nburn = _gcry_chacha20_poly1305_amd64_avx2_blocks8(
+ ctx->input, outbuf, inbuf, nblocks,
+ &c->u_mode.poly1305.ctx.state, authptr);
+ burn = nburn > burn ? nburn : burn;
+
+ length -= nblocks * CHACHA20_BLOCK_SIZE;
+ outbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ inbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ authptr += nblocks * CHACHA20_BLOCK_SIZE;
+ }
+#endif
+
+#ifdef USE_SSSE3
+ if (ctx->use_ssse3)
+ {
+ if (length >= 4 * CHACHA20_BLOCK_SIZE &&
+ authoffset >= 4 * CHACHA20_BLOCK_SIZE)
+ {
+ size_t nblocks = length / CHACHA20_BLOCK_SIZE;
+ nblocks -= nblocks % 4;
+
+ nburn = _gcry_chacha20_poly1305_amd64_ssse3_blocks4(
+ ctx->input, outbuf, inbuf, nblocks,
+ &c->u_mode.poly1305.ctx.state, authptr);
+ burn = nburn > burn ? nburn : burn;
+
+ length -= nblocks * CHACHA20_BLOCK_SIZE;
+ outbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ inbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ authptr += nblocks * CHACHA20_BLOCK_SIZE;
+ }
+
+ if (length >= CHACHA20_BLOCK_SIZE &&
+ authoffset >= CHACHA20_BLOCK_SIZE)
+ {
+ size_t nblocks = length / CHACHA20_BLOCK_SIZE;
+
+ nburn = _gcry_chacha20_poly1305_amd64_ssse3_blocks1(
+ ctx->input, outbuf, inbuf, nblocks,
+ &c->u_mode.poly1305.ctx.state, authptr);
+ burn = nburn > burn ? nburn : burn;
+
+ length -= nblocks * CHACHA20_BLOCK_SIZE;
+ outbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ inbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ authptr += nblocks * CHACHA20_BLOCK_SIZE;
+ }
+ }
+#endif
+
+ if (authoffset > 0)
+ {
+ _gcry_poly1305_update (&c->u_mode.poly1305.ctx, authptr, authoffset);
+ authptr += authoffset;
+ authoffset = 0;
+ }
+
+ gcry_assert(authptr == outbuf);
+ }
+
+ while (length)
+ {
+ size_t currlen = length;
+
+ /* Since checksumming is done after encryption, process input in 24KiB
+ * chunks to keep data loaded in L1 cache for checksumming. */
+ if (currlen > 24 * 1024)
+ currlen = 24 * 1024;
+
+ nburn = do_chacha20_encrypt_stream_tail (ctx, outbuf, inbuf, currlen);
+ burn = nburn > burn ? nburn : burn;
+
+ nburn = _gcry_poly1305_update_burn (&c->u_mode.poly1305.ctx, outbuf,
+ currlen);
+ burn = nburn > burn ? nburn : burn;
+
+ outbuf += currlen;
+ inbuf += currlen;
+ length -= currlen;
+ }
+
+ if (burn)
+ _gcry_burn_stack (burn);
+
+ return 0;
+}
+
+
+gcry_err_code_t
+_gcry_chacha20_poly1305_decrypt(gcry_cipher_hd_t c, byte *outbuf,
+ const byte *inbuf, size_t length)
+{
+ CHACHA20_context_t *ctx = (CHACHA20_context_t *) &c->context.c;
+ unsigned int nburn, burn = 0;
+
+ if (!length)
+ return 0;
+
+ if (ctx->unused)
+ {
+ unsigned char *p = ctx->pad;
+ size_t n;
+
+ gcry_assert (ctx->unused < CHACHA20_BLOCK_SIZE);
+
+ n = ctx->unused;
+ if (n > length)
+ n = length;
+
+ nburn = _gcry_poly1305_update_burn (&c->u_mode.poly1305.ctx, inbuf, n);
+ burn = nburn > burn ? nburn : burn;
+ buf_xor (outbuf, inbuf, p + CHACHA20_BLOCK_SIZE - ctx->unused, n);
+ length -= n;
+ outbuf += n;
+ inbuf += n;
+ ctx->unused -= n;
+
+ if (!length)
+ {
+ if (burn)
+ _gcry_burn_stack (burn);
+
+ return 0;
+ }
+ gcry_assert (!ctx->unused);
+ }
+
+ gcry_assert (c->u_mode.poly1305.ctx.leftover == 0);
+
+#ifdef USE_AVX2
+ if (ctx->use_avx2 && length >= 8 * CHACHA20_BLOCK_SIZE)
+ {
+ size_t nblocks = length / CHACHA20_BLOCK_SIZE;
+ nblocks -= nblocks % 8;
+
+ nburn = _gcry_chacha20_poly1305_amd64_avx2_blocks8(
+ ctx->input, outbuf, inbuf, nblocks,
+ &c->u_mode.poly1305.ctx.state, inbuf);
+ burn = nburn > burn ? nburn : burn;
+
+ length -= nblocks * CHACHA20_BLOCK_SIZE;
+ outbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ inbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ }
+#endif
+
+#ifdef USE_SSSE3
+ if (ctx->use_ssse3)
+ {
+ if (length >= 4 * CHACHA20_BLOCK_SIZE)
+ {
+ size_t nblocks = length / CHACHA20_BLOCK_SIZE;
+ nblocks -= nblocks % 4;
+
+ nburn = _gcry_chacha20_poly1305_amd64_ssse3_blocks4(
+ ctx->input, outbuf, inbuf, nblocks,
+ &c->u_mode.poly1305.ctx.state, inbuf);
+ burn = nburn > burn ? nburn : burn;
+
+ length -= nblocks * CHACHA20_BLOCK_SIZE;
+ outbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ inbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ }
+
+ if (length >= CHACHA20_BLOCK_SIZE)
+ {
+ size_t nblocks = length / CHACHA20_BLOCK_SIZE;
+
+ nburn = _gcry_chacha20_poly1305_amd64_ssse3_blocks1(
+ ctx->input, outbuf, inbuf, nblocks,
+ &c->u_mode.poly1305.ctx.state, inbuf);
+ burn = nburn > burn ? nburn : burn;
+
+ length -= nblocks * CHACHA20_BLOCK_SIZE;
+ outbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ inbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ }
+ }
+#endif
+
+ while (length)
+ {
+ size_t currlen = length;
+
+ /* Since checksumming is done before decryption, process input in 24KiB
+ * chunks to keep data loaded in L1 cache for decryption. */
+ if (currlen > 24 * 1024)
+ currlen = 24 * 1024;
+
+ nburn = _gcry_poly1305_update_burn (&c->u_mode.poly1305.ctx, inbuf,
+ currlen);
+ burn = nburn > burn ? nburn : burn;
+
+ nburn = do_chacha20_encrypt_stream_tail (ctx, outbuf, inbuf, currlen);
+ burn = nburn > burn ? nburn : burn;
+
+ outbuf += currlen;
+ inbuf += currlen;
+ length -= currlen;
+ }
+
+ if (burn)
+ _gcry_burn_stack (burn);
+
+ return 0;
}
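The C glue above exploits that Poly1305 authenticates ciphertext in the
encrypt direction: the first 1/4/8 blocks are produced with the plain ChaCha20
functions, the stitched functions then hash 'authptr', which trails the output
pointer by that fixed offset, and the remaining gap is closed with an ordinary
_gcry_poly1305_update() call before the tail loop. The tail loops work on at
most 24 KiB per iteration so the data is still in L1 cache when it is read
back; as a rough sizing note (assuming the usual 32 KiB L1D on these CPUs):
  24 KiB / 64 B per ChaCha20 block = 384 blocks per chunk,
  leaving about 8 KiB of L1D for state, stack and incoming source lines.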
diff --git a/cipher/cipher-internal.h b/cipher/cipher-internal.h
index 898869623..78f05dbb5 100644
--- a/cipher/cipher-internal.h
+++ b/cipher/cipher-internal.h
@@ -542,6 +542,15 @@ void _gcry_cipher_poly1305_setkey
/* */ (gcry_cipher_hd_t c);
+/*-- chacha20.c --*/
+gcry_err_code_t _gcry_chacha20_poly1305_encrypt
+/* */ (gcry_cipher_hd_t c, byte *outbuf, const byte *inbuf,
+ size_t length);
+gcry_err_code_t _gcry_chacha20_poly1305_decrypt
+/* */ (gcry_cipher_hd_t c, byte *outbuf, const byte *inbuf,
+ size_t length);
+
+
/*-- cipher-ocb.c --*/
gcry_err_code_t _gcry_cipher_ocb_encrypt
/* */ (gcry_cipher_hd_t c,
diff --git a/cipher/cipher-poly1305.c b/cipher/cipher-poly1305.c
index 607586b55..bb475236b 100644
--- a/cipher/cipher-poly1305.c
+++ b/cipher/cipher-poly1305.c
@@ -164,6 +164,11 @@ _gcry_cipher_poly1305_encrypt (gcry_cipher_hd_t c,
return GPG_ERR_INV_LENGTH;
}
+ if (LIKELY(inbuflen > 0) && LIKELY(c->spec->algo == GCRY_CIPHER_CHACHA20))
+ {
+ return _gcry_chacha20_poly1305_encrypt (c, outbuf, inbuf, inbuflen);
+ }
+
while (inbuflen)
{
size_t currlen = inbuflen;
@@ -217,6 +222,11 @@ _gcry_cipher_poly1305_decrypt (gcry_cipher_hd_t c,
return GPG_ERR_INV_LENGTH;
}
+ if (LIKELY(inbuflen > 0) && LIKELY(c->spec->algo == GCRY_CIPHER_CHACHA20))
+ {
+ return _gcry_chacha20_poly1305_decrypt (c, outbuf, inbuf, inbuflen);
+ }
+
while (inbuflen)
{
size_t currlen = inbuflen;
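For context, the new fast path is reached through the regular AEAD interface,
i.e. a ChaCha20 handle opened in Poly1305 mode. A minimal caller could look
like the sketch below (error checking omitted, buffer names illustrative);
with this patch the gcry_cipher_encrypt() call ends up in
_gcry_chacha20_poly1305_encrypt():

#include <gcrypt.h>

static void
aead_seal (const unsigned char key[32], const unsigned char nonce[12],
           const unsigned char *aad, size_t aadlen,
           const unsigned char *in, unsigned char *out, size_t len,
           unsigned char tag[16])
{
  gcry_cipher_hd_t hd;

  gcry_cipher_open (&hd, GCRY_CIPHER_CHACHA20, GCRY_CIPHER_MODE_POLY1305, 0);
  gcry_cipher_setkey (hd, key, 32);
  gcry_cipher_setiv (hd, nonce, 12);            /* 96-bit nonce (RFC 8439) */
  gcry_cipher_authenticate (hd, aad, aadlen);   /* optional AAD            */
  gcry_cipher_encrypt (hd, out, len, in, len);  /* stitched path for bulk  */
  gcry_cipher_gettag (hd, tag, 16);
  gcry_cipher_close (hd);
}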
diff --git a/cipher/poly1305-internal.h b/cipher/poly1305-internal.h
index 2405a090f..19cee5f6f 100644
--- a/cipher/poly1305-internal.h
+++ b/cipher/poly1305-internal.h
@@ -58,5 +58,7 @@ void _gcry_poly1305_finish (poly1305_context_t *ctx,
void _gcry_poly1305_update (poly1305_context_t *ctx, const byte *buf,
size_t buflen);
+unsigned int _gcry_poly1305_update_burn (poly1305_context_t *ctx,
+ const byte *m, size_t bytes);
#endif /* G10_POLY1305_INTERNAL_H */
diff --git a/cipher/poly1305.c b/cipher/poly1305.c
index 571f82862..8de6cd5e6 100644
--- a/cipher/poly1305.c
+++ b/cipher/poly1305.c
@@ -133,7 +133,7 @@ static void poly1305_init (poly1305_context_t *ctx,
ADD_1305_64(H2, H1, H0, (u64)0, x0_hi, x0_lo); \
} while (0)
-unsigned int
+static unsigned int
poly1305_blocks (poly1305_context_t *ctx, const byte *buf, size_t len,
byte high_pad)
{
@@ -337,7 +337,7 @@ static unsigned int poly1305_final (poly1305_context_t *ctx,
ADD_1305_32(H4, H3, H2, H1, H0, 0, x3_lo, x2_lo, x1_lo, x0_lo); \
} while (0)
-unsigned int
+static unsigned int
poly1305_blocks (poly1305_context_t *ctx, const byte *buf, size_t len,
byte high_pad)
{
@@ -444,8 +444,9 @@ static unsigned int poly1305_final (poly1305_context_t *ctx,
#endif /* USE_MPI_32BIT */
-void
-_gcry_poly1305_update (poly1305_context_t *ctx, const byte *m, size_t bytes)
+unsigned int
+_gcry_poly1305_update_burn (poly1305_context_t *ctx, const byte *m,
+ size_t bytes)
{
unsigned int burn = 0;
@@ -460,7 +461,7 @@ _gcry_poly1305_update (poly1305_context_t *ctx, const byte *m, size_t bytes)
m += want;
ctx->leftover += want;
if (ctx->leftover < POLY1305_BLOCKSIZE)
- return;
+ return 0;
burn = poly1305_blocks (ctx, ctx->buffer, POLY1305_BLOCKSIZE, 1);
ctx->leftover = 0;
}
@@ -481,6 +482,17 @@ _gcry_poly1305_update (poly1305_context_t *ctx, const byte *m, size_t bytes)
ctx->leftover += bytes;
}
+ return burn;
+}
+
+
+void
+_gcry_poly1305_update (poly1305_context_t *ctx, const byte *m, size_t bytes)
+{
+ unsigned int burn;
+
+ burn = _gcry_poly1305_update_burn (ctx, m, bytes);
+
if (burn)
_gcry_burn_stack (burn);
}