[PATCH] Add AVX2/vpgather bulk implementation of Twofish
Jussi Kivilinna
jussi.kivilinna@iki.fi
Wed Jan 4 16:15:17 CET 2017
* cipher/Makefile.am: Add 'twofish-avx2-amd64.S'.
* cipher/twofish-avx2-amd64.S: New.
* cipher/twofish.c (USE_AVX2): New.
(TWOFISH_context) [USE_AVX2]: Add 'use_avx2' member.
(ASM_FUNC_ABI): New.
(twofish_setkey): Add check for AVX2 and fast VPGATHER HW features.
(_gcry_twofish_avx2_ctr_enc, _gcry_twofish_avx2_cbc_dec)
(_gcry_twofish_avx2_cfb_dec, _gcry_twofish_avx2_ocb_enc)
(_gcry_twofish_avx2_ocb_dec, _gcry_twofish_avx2_ocb_auth): New.
(_gcry_twofish_ctr_enc, _gcry_twofish_cbc_dec, _gcry_twofish_cfb_dec)
(_gcry_twofish_ocb_crypt, _gcry_twofish_ocb_auth): Add AVX2 bulk
handling.
(selftest_ctr, selftest_cbc, selftest_cfb): Increase nblocks from
3+X to 16+X.
* configure.ac: Add 'twofish-avx2-amd64.lo'.
* src/g10lib.h (HWF_INTEL_FAST_VPGATHER): New.
* src/hwf-x86.c (detect_x86_gnuc): Add detection for
HWF_INTEL_FAST_VPGATHER.
* src/hwfeatures.c (hwflist): Add "intel-fast-vpgather" entry for
HWF_INTEL_FAST_VPGATHER.
--
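The bulk functions process 16 blocks in parallel. The key-dependent s-box
lookups (tables s0..s3 in TWOFISH_context) are done with vpgatherdd, eight
32-bit lookups per instruction. As a rough illustration only, one g()
evaluation for 8 parallel words corresponds to the C intrinsics sketch
below; the helper name is made up, and the real assembly keeps everything in
ymm registers, interleaves two 8-block halves and re-creates the gather mask
after every vpgatherdd (g2 merely rotates the table order):

  #include <immintrin.h>
  #include <stdint.h>

  /* Illustrative sketch of the gather-based Twofish g() for 8 parallel
   * 32-bit inputs; 's' stands in for the four 256-entry u32 s-box tables
   * of TWOFISH_context.  Not part of the patch. */
  static inline __m256i g_avx2_sketch (const uint32_t s[4][256], __m256i x)
  {
    const __m256i bytemask = _mm256_set1_epi32 (0xff);

    __m256i b0 = _mm256_and_si256 (x, bytemask);
    __m256i b1 = _mm256_and_si256 (_mm256_srli_epi32 (x, 8), bytemask);
    __m256i b2 = _mm256_and_si256 (_mm256_srli_epi32 (x, 16), bytemask);
    __m256i b3 = _mm256_srli_epi32 (x, 24);

    __m256i r = _mm256_i32gather_epi32 ((const int *)s[0], b0, 4);
    r = _mm256_xor_si256 (r, _mm256_i32gather_epi32 ((const int *)s[1], b1, 4));
    r = _mm256_xor_si256 (r, _mm256_i32gather_epi32 ((const int *)s[2], b2, 4));
    r = _mm256_xor_si256 (r, _mm256_i32gather_epi32 ((const int *)s[3], b3, 4));
    return r;
  }

This path is only taken when both the HWF_INTEL_AVX2 and the new
HWF_INTEL_FAST_VPGATHER feature flags are set, since VPGATHER is slow on
Haswell and its speed on non-Intel CPUs has not been tested.
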
Benchmark on Intel Core i3-6100 (3.7 GHz):
Before:
TWOFISH | nanosecs/byte mebibytes/sec cycles/byte
ECB enc | 4.25 ns/B 224.5 MiB/s 15.71 c/B
ECB dec | 4.16 ns/B 229.5 MiB/s 15.38 c/B
CBC enc | 4.53 ns/B 210.4 MiB/s 16.77 c/B
CBC dec | 2.71 ns/B 351.6 MiB/s 10.04 c/B
CFB enc | 4.60 ns/B 207.3 MiB/s 17.02 c/B
CFB dec | 2.70 ns/B 353.5 MiB/s 9.98 c/B
OFB enc | 4.25 ns/B 224.2 MiB/s 15.74 c/B
OFB dec | 4.24 ns/B 225.0 MiB/s 15.68 c/B
CTR enc | 2.72 ns/B 350.6 MiB/s 10.06 c/B
CTR dec | 2.72 ns/B 350.7 MiB/s 10.06 c/B
CCM enc | 7.25 ns/B 131.5 MiB/s 26.83 c/B
CCM dec | 7.25 ns/B 131.5 MiB/s 26.83 c/B
CCM auth | 4.57 ns/B 208.9 MiB/s 16.89 c/B
GCM enc | 3.02 ns/B 315.3 MiB/s 11.19 c/B
GCM dec | 3.02 ns/B 315.6 MiB/s 11.18 c/B
GCM auth | 0.297 ns/B 3208.4 MiB/s 1.10 c/B
OCB enc | 2.73 ns/B 349.7 MiB/s 10.09 c/B
OCB dec | 2.82 ns/B 338.3 MiB/s 10.43 c/B
OCB auth | 2.77 ns/B 343.7 MiB/s 10.27 c/B
After (CBC-dec & CFB-dec & CTR & OCB, ~1.5x faster):
TWOFISH | nanosecs/byte mebibytes/sec cycles/byte
ECB enc | 4.25 ns/B 224.2 MiB/s 15.74 c/B
ECB dec | 4.15 ns/B 229.5 MiB/s 15.37 c/B
CBC enc | 4.61 ns/B 206.8 MiB/s 17.06 c/B
CBC dec | 1.75 ns/B 544.0 MiB/s 6.49 c/B
CFB enc | 4.52 ns/B 211.0 MiB/s 16.72 c/B
CFB dec | 1.72 ns/B 554.1 MiB/s 6.37 c/B
OFB enc | 4.27 ns/B 223.3 MiB/s 15.80 c/B
OFB dec | 4.28 ns/B 222.7 MiB/s 15.84 c/B
CTR enc | 1.73 ns/B 549.9 MiB/s 6.42 c/B
CTR dec | 1.75 ns/B 545.1 MiB/s 6.47 c/B
CCM enc | 6.31 ns/B 151.2 MiB/s 23.34 c/B
CCM dec | 6.42 ns/B 148.5 MiB/s 23.76 c/B
CCM auth | 4.56 ns/B 208.9 MiB/s 16.89 c/B
GCM enc | 1.90 ns/B 502.8 MiB/s 7.02 c/B
GCM dec | 2.00 ns/B 477.8 MiB/s 7.38 c/B
GCM auth | 0.300 ns/B 3178.6 MiB/s 1.11 c/B
OCB enc | 1.76 ns/B 542.2 MiB/s 6.51 c/B
OCB dec | 1.76 ns/B 540.7 MiB/s 6.53 c/B
OCB auth | 1.76 ns/B 542.8 MiB/s 6.50 c/B
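
Most of the improvement shows up in the CBC-dec, CFB-dec, CTR and OCB paths
because those can feed 16 independent blocks to the cipher per call. For CTR
the 16 counter values are built with cheap 64-bit vector arithmetic unless
the low 64-bit counter word could wrap within the batch; the check done in
_gcry_twofish_avx2_ctr_enc is equivalent to this small C sketch (hypothetical
helper, not part of the patch):

  #include <stdint.h>
  #include <string.h>

  /* Non-zero when adding 16 to the big-endian 128-bit counter could carry
   * out of the low 64 bits; the assembly then takes the slower full
   * 128-bit increment path (.Lhandle_ctr_carry). */
  static int ctr_low64_may_wrap (const unsigned char ctr[16])
  {
    uint64_t lo;
    memcpy (&lo, ctr + 8, 8);      /* low 64 bits, stored big endian */
    lo = __builtin_bswap64 (lo);   /* as 'movq 8(%rcx), %rax; bswapq %rax' */
    return lo > UINT64_MAX - 16;   /* mirrors 'cmpq $(0xff..ff - 16); ja' */
  }
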
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
---
diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 71a25ed..8c9fc0e 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -100,6 +100,7 @@ stribog.c \
tiger.c \
whirlpool.c whirlpool-sse2-amd64.S \
twofish.c twofish-amd64.S twofish-arm.S twofish-aarch64.S \
+ twofish-avx2-amd64.S \
rfc2268.c \
camellia.c camellia.h camellia-glue.c camellia-aesni-avx-amd64.S \
camellia-aesni-avx2-amd64.S camellia-arm.S camellia-aarch64.S
diff --git a/cipher/twofish-avx2-amd64.S b/cipher/twofish-avx2-amd64.S
new file mode 100644
index 0000000..db6e218
--- /dev/null
+++ b/cipher/twofish-avx2-amd64.S
@@ -0,0 +1,1012 @@
+/* twofish-avx2-amd64.S - AMD64/AVX2 assembly implementation of Twofish cipher
+ *
+ * Copyright (C) 2013-2017 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifdef __x86_64
+#include <config.h>
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && defined(USE_TWOFISH) && \
+ defined(ENABLE_AVX2_SUPPORT)
+
+#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
+# define ELF(...) __VA_ARGS__
+#else
+# define ELF(...) /*_*/
+#endif
+
+#ifdef __PIC__
+# define RIP (%rip)
+#else
+# define RIP
+#endif
+
+.text
+
+/* structure of TWOFISH_context: */
+#define s0 0
+#define s1 ((s0) + 4 * 256)
+#define s2 ((s1) + 4 * 256)
+#define s3 ((s2) + 4 * 256)
+#define w ((s3) + 4 * 256)
+#define k ((w) + 4 * 8)
+
+/* register macros */
+#define CTX %rdi
+
+#define RROUND %rbp
+#define RROUNDd %ebp
+#define RS0 CTX
+#define RS1 %r8
+#define RS2 %r9
+#define RS3 %r10
+#define RK %r11
+#define RW %rax
+
+#define RA0 %ymm8
+#define RB0 %ymm9
+#define RC0 %ymm10
+#define RD0 %ymm11
+#define RA1 %ymm12
+#define RB1 %ymm13
+#define RC1 %ymm14
+#define RD1 %ymm15
+
+/* temp regs */
+#define RX0 %ymm0
+#define RY0 %ymm1
+#define RX1 %ymm2
+#define RY1 %ymm3
+#define RT0 %ymm4
+#define RIDX %ymm5
+
+#define RX0x %xmm0
+#define RY0x %xmm1
+#define RX1x %xmm2
+#define RY1x %xmm3
+#define RT0x %xmm4
+#define RIDXx %xmm5
+
+#define RTMP0 RX0
+#define RTMP0x RX0x
+#define RTMP1 RX1
+#define RTMP1x RX1x
+#define RTMP2 RY0
+#define RTMP2x RY0x
+#define RTMP3 RY1
+#define RTMP3x RY1x
+#define RTMP4 RIDX
+#define RTMP4x RIDXx
+
+/* vpgatherdd mask and '-1' */
+#define RNOT %ymm6
+#define RNOTx %xmm6
+
+/* byte mask, (-1 >> 24) */
+#define RBYTE %ymm7
+
+/**********************************************************************
+ 16-way AVX2 twofish
+ **********************************************************************/
+#define init_round_constants() \
+ vpcmpeqd RNOT, RNOT, RNOT; \
+ leaq k(CTX), RK; \
+ leaq w(CTX), RW; \
+ vpsrld $24, RNOT, RBYTE; \
+ leaq s1(CTX), RS1; \
+ leaq s2(CTX), RS2; \
+ leaq s3(CTX), RS3; \
+
+#define g16(ab, rs0, rs1, rs2, rs3, xy) \
+ vpand RBYTE, ab ## 0, RIDX; \
+ vpgatherdd RNOT, (rs0, RIDX, 4), xy ## 0; \
+ vpcmpeqd RNOT, RNOT, RNOT; \
+ \
+ vpand RBYTE, ab ## 1, RIDX; \
+ vpgatherdd RNOT, (rs0, RIDX, 4), xy ## 1; \
+ vpcmpeqd RNOT, RNOT, RNOT; \
+ \
+ vpsrld $8, ab ## 0, RIDX; \
+ vpand RBYTE, RIDX, RIDX; \
+ vpgatherdd RNOT, (rs1, RIDX, 4), RT0; \
+ vpcmpeqd RNOT, RNOT, RNOT; \
+ vpxor RT0, xy ## 0, xy ## 0; \
+ \
+ vpsrld $8, ab ## 1, RIDX; \
+ vpand RBYTE, RIDX, RIDX; \
+ vpgatherdd RNOT, (rs1, RIDX, 4), RT0; \
+ vpcmpeqd RNOT, RNOT, RNOT; \
+ vpxor RT0, xy ## 1, xy ## 1; \
+ \
+ vpsrld $16, ab ## 0, RIDX; \
+ vpand RBYTE, RIDX, RIDX; \
+ vpgatherdd RNOT, (rs2, RIDX, 4), RT0; \
+ vpcmpeqd RNOT, RNOT, RNOT; \
+ vpxor RT0, xy ## 0, xy ## 0; \
+ \
+ vpsrld $16, ab ## 1, RIDX; \
+ vpand RBYTE, RIDX, RIDX; \
+ vpgatherdd RNOT, (rs2, RIDX, 4), RT0; \
+ vpcmpeqd RNOT, RNOT, RNOT; \
+ vpxor RT0, xy ## 1, xy ## 1; \
+ \
+ vpsrld $24, ab ## 0, RIDX; \
+ vpgatherdd RNOT, (rs3, RIDX, 4), RT0; \
+ vpcmpeqd RNOT, RNOT, RNOT; \
+ vpxor RT0, xy ## 0, xy ## 0; \
+ \
+ vpsrld $24, ab ## 1, RIDX; \
+ vpgatherdd RNOT, (rs3, RIDX, 4), RT0; \
+ vpcmpeqd RNOT, RNOT, RNOT; \
+ vpxor RT0, xy ## 1, xy ## 1;
+
+#define g1_16(a, x) \
+ g16(a, RS0, RS1, RS2, RS3, x);
+
+#define g2_16(b, y) \
+ g16(b, RS1, RS2, RS3, RS0, y);
+
+#define encrypt_round_end16(a, b, c, d, nk, r) \
+ vpaddd RY0, RX0, RX0; \
+ vpaddd RX0, RY0, RY0; \
+ vpbroadcastd ((nk)+((r)*8))(RK), RT0; \
+ vpaddd RT0, RX0, RX0; \
+ vpbroadcastd 4+((nk)+((r)*8))(RK), RT0; \
+ vpaddd RT0, RY0, RY0; \
+ \
+ vpxor RY0, d ## 0, d ## 0; \
+ \
+ vpxor RX0, c ## 0, c ## 0; \
+ vpsrld $1, c ## 0, RT0; \
+ vpslld $31, c ## 0, c ## 0; \
+ vpor RT0, c ## 0, c ## 0; \
+ \
+ vpaddd RY1, RX1, RX1; \
+ vpaddd RX1, RY1, RY1; \
+ vpbroadcastd ((nk)+((r)*8))(RK), RT0; \
+ vpaddd RT0, RX1, RX1; \
+ vpbroadcastd 4+((nk)+((r)*8))(RK), RT0; \
+ vpaddd RT0, RY1, RY1; \
+ \
+ vpxor RY1, d ## 1, d ## 1; \
+ \
+ vpxor RX1, c ## 1, c ## 1; \
+ vpsrld $1, c ## 1, RT0; \
+ vpslld $31, c ## 1, c ## 1; \
+ vpor RT0, c ## 1, c ## 1; \
+
+#define encrypt_round16(a, b, c, d, nk, r) \
+ g2_16(b, RY); \
+ \
+ vpslld $1, b ## 0, RT0; \
+ vpsrld $31, b ## 0, b ## 0; \
+ vpor RT0, b ## 0, b ## 0; \
+ \
+ vpslld $1, b ## 1, RT0; \
+ vpsrld $31, b ## 1, b ## 1; \
+ vpor RT0, b ## 1, b ## 1; \
+ \
+ g1_16(a, RX); \
+ \
+ encrypt_round_end16(a, b, c, d, nk, r);
+
+#define encrypt_round_first16(a, b, c, d, nk, r) \
+ vpslld $1, d ## 0, RT0; \
+ vpsrld $31, d ## 0, d ## 0; \
+ vpor RT0, d ## 0, d ## 0; \
+ \
+ vpslld $1, d ## 1, RT0; \
+ vpsrld $31, d ## 1, d ## 1; \
+ vpor RT0, d ## 1, d ## 1; \
+ \
+ encrypt_round16(a, b, c, d, nk, r);
+
+#define encrypt_round_last16(a, b, c, d, nk, r) \
+ g2_16(b, RY); \
+ \
+ g1_16(a, RX); \
+ \
+ encrypt_round_end16(a, b, c, d, nk, r);
+
+#define decrypt_round_end16(a, b, c, d, nk, r) \
+ vpaddd RY0, RX0, RX0; \
+ vpaddd RX0, RY0, RY0; \
+ vpbroadcastd ((nk)+((r)*8))(RK), RT0; \
+ vpaddd RT0, RX0, RX0; \
+ vpbroadcastd 4+((nk)+((r)*8))(RK), RT0; \
+ vpaddd RT0, RY0, RY0; \
+ \
+ vpxor RX0, c ## 0, c ## 0; \
+ \
+ vpxor RY0, d ## 0, d ## 0; \
+ vpsrld $1, d ## 0, RT0; \
+ vpslld $31, d ## 0, d ## 0; \
+ vpor RT0, d ## 0, d ## 0; \
+ \
+ vpaddd RY1, RX1, RX1; \
+ vpaddd RX1, RY1, RY1; \
+ vpbroadcastd ((nk)+((r)*8))(RK), RT0; \
+ vpaddd RT0, RX1, RX1; \
+ vpbroadcastd 4+((nk)+((r)*8))(RK), RT0; \
+ vpaddd RT0, RY1, RY1; \
+ \
+ vpxor RX1, c ## 1, c ## 1; \
+ \
+ vpxor RY1, d ## 1, d ## 1; \
+ vpsrld $1, d ## 1, RT0; \
+ vpslld $31, d ## 1, d ## 1; \
+ vpor RT0, d ## 1, d ## 1;
+
+#define decrypt_round16(a, b, c, d, nk, r) \
+ g1_16(a, RX); \
+ \
+ vpslld $1, a ## 0, RT0; \
+ vpsrld $31, a ## 0, a ## 0; \
+ vpor RT0, a ## 0, a ## 0; \
+ \
+ vpslld $1, a ## 1, RT0; \
+ vpsrld $31, a ## 1, a ## 1; \
+ vpor RT0, a ## 1, a ## 1; \
+ \
+ g2_16(b, RY); \
+ \
+ decrypt_round_end16(a, b, c, d, nk, r);
+
+#define decrypt_round_first16(a, b, c, d, nk, r) \
+ vpslld $1, c ## 0, RT0; \
+ vpsrld $31, c ## 0, c ## 0; \
+ vpor RT0, c ## 0, c ## 0; \
+ \
+ vpslld $1, c ## 1, RT0; \
+ vpsrld $31, c ## 1, c ## 1; \
+ vpor RT0, c ## 1, c ## 1; \
+ \
+ decrypt_round16(a, b, c, d, nk, r)
+
+#define decrypt_round_last16(a, b, c, d, nk, r) \
+ g1_16(a, RX); \
+ \
+ g2_16(b, RY); \
+ \
+ decrypt_round_end16(a, b, c, d, nk, r);
+
+#define encrypt_cycle16(r) \
+ encrypt_round16(RA, RB, RC, RD, 0, r); \
+ encrypt_round16(RC, RD, RA, RB, 8, r);
+
+#define encrypt_cycle_first16(r) \
+ encrypt_round_first16(RA, RB, RC, RD, 0, r); \
+ encrypt_round16(RC, RD, RA, RB, 8, r);
+
+#define encrypt_cycle_last16(r) \
+ encrypt_round16(RA, RB, RC, RD, 0, r); \
+ encrypt_round_last16(RC, RD, RA, RB, 8, r);
+
+#define decrypt_cycle16(r) \
+ decrypt_round16(RC, RD, RA, RB, 8, r); \
+ decrypt_round16(RA, RB, RC, RD, 0, r);
+
+#define decrypt_cycle_first16(r) \
+ decrypt_round_first16(RC, RD, RA, RB, 8, r); \
+ decrypt_round16(RA, RB, RC, RD, 0, r);
+
+#define decrypt_cycle_last16(r) \
+ decrypt_round16(RC, RD, RA, RB, 8, r); \
+ decrypt_round_last16(RA, RB, RC, RD, 0, r);
+
+#define transpose_4x4(x0,x1,x2,x3,t1,t2) \
+ vpunpckhdq x1, x0, t2; \
+ vpunpckldq x1, x0, x0; \
+ \
+ vpunpckldq x3, x2, t1; \
+ vpunpckhdq x3, x2, x2; \
+ \
+ vpunpckhqdq t1, x0, x1; \
+ vpunpcklqdq t1, x0, x0; \
+ \
+ vpunpckhqdq x2, t2, x3; \
+ vpunpcklqdq x2, t2, x2;
+
+#define read_blocks8(offs,a,b,c,d) \
+ vmovdqu 16*offs(RIO), a; \
+ vmovdqu 16*offs+32(RIO), b; \
+ vmovdqu 16*offs+64(RIO), c; \
+ vmovdqu 16*offs+96(RIO), d; \
+ \
+ transpose_4x4(a, b, c, d, RX0, RY0);
+
+#define write_blocks8(offs,a,b,c,d) \
+ transpose_4x4(a, b, c, d, RX0, RY0); \
+ \
+ vmovdqu a, 16*offs(RIO); \
+ vmovdqu b, 16*offs+32(RIO); \
+ vmovdqu c, 16*offs+64(RIO); \
+ vmovdqu d, 16*offs+96(RIO);
+
+#define inpack_enc8(a,b,c,d) \
+ vpbroadcastd 4*0(RW), RT0; \
+ vpxor RT0, a, a; \
+ \
+ vpbroadcastd 4*1(RW), RT0; \
+ vpxor RT0, b, b; \
+ \
+ vpbroadcastd 4*2(RW), RT0; \
+ vpxor RT0, c, c; \
+ \
+ vpbroadcastd 4*3(RW), RT0; \
+ vpxor RT0, d, d;
+
+#define outunpack_enc8(a,b,c,d) \
+ vpbroadcastd 4*4(RW), RX0; \
+ vpbroadcastd 4*5(RW), RY0; \
+ vpxor RX0, c, RX0; \
+ vpxor RY0, d, RY0; \
+ \
+ vpbroadcastd 4*6(RW), RT0; \
+ vpxor RT0, a, c; \
+ vpbroadcastd 4*7(RW), RT0; \
+ vpxor RT0, b, d; \
+ \
+ vmovdqa RX0, a; \
+ vmovdqa RY0, b;
+
+#define inpack_dec8(a,b,c,d) \
+ vpbroadcastd 4*4(RW), RX0; \
+ vpbroadcastd 4*5(RW), RY0; \
+ vpxor RX0, a, RX0; \
+ vpxor RY0, b, RY0; \
+ \
+ vpbroadcastd 4*6(RW), RT0; \
+ vpxor RT0, c, a; \
+ vpbroadcastd 4*7(RW), RT0; \
+ vpxor RT0, d, b; \
+ \
+ vmovdqa RX0, c; \
+ vmovdqa RY0, d;
+
+#define outunpack_dec8(a,b,c,d) \
+ vpbroadcastd 4*0(RW), RT0; \
+ vpxor RT0, a, a; \
+ \
+ vpbroadcastd 4*1(RW), RT0; \
+ vpxor RT0, b, b; \
+ \
+ vpbroadcastd 4*2(RW), RT0; \
+ vpxor RT0, c, c; \
+ \
+ vpbroadcastd 4*3(RW), RT0; \
+ vpxor RT0, d, d;
+
+#define transpose4x4_16(a,b,c,d) \
+ transpose_4x4(a ## 0, b ## 0, c ## 0, d ## 0, RX0, RY0); \
+ transpose_4x4(a ## 1, b ## 1, c ## 1, d ## 1, RX0, RY0);
+
+#define inpack_enc16(a,b,c,d) \
+ inpack_enc8(a ## 0, b ## 0, c ## 0, d ## 0); \
+ inpack_enc8(a ## 1, b ## 1, c ## 1, d ## 1);
+
+#define outunpack_enc16(a,b,c,d) \
+ outunpack_enc8(a ## 0, b ## 0, c ## 0, d ## 0); \
+ outunpack_enc8(a ## 1, b ## 1, c ## 1, d ## 1);
+
+#define inpack_dec16(a,b,c,d) \
+ inpack_dec8(a ## 0, b ## 0, c ## 0, d ## 0); \
+ inpack_dec8(a ## 1, b ## 1, c ## 1, d ## 1);
+
+#define outunpack_dec16(a,b,c,d) \
+ outunpack_dec8(a ## 0, b ## 0, c ## 0, d ## 0); \
+ outunpack_dec8(a ## 1, b ## 1, c ## 1, d ## 1);
+
+.align 8
+ELF(.type __twofish_enc_blk16,@function;)
+__twofish_enc_blk16:
+ /* input:
+ * %rdi: ctx, CTX
+ * RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: sixteen parallel
+ * plaintext blocks
+ * output:
+ * RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: sixteen parallel
+ * ciphertext blocks
+ */
+ init_round_constants();
+
+ transpose4x4_16(RA, RB, RC, RD);
+ inpack_enc16(RA, RB, RC, RD);
+
+ encrypt_cycle_first16(0);
+ encrypt_cycle16(2);
+ encrypt_cycle16(4);
+ encrypt_cycle16(6);
+ encrypt_cycle16(8);
+ encrypt_cycle16(10);
+ encrypt_cycle16(12);
+ encrypt_cycle_last16(14);
+
+ outunpack_enc16(RA, RB, RC, RD);
+ transpose4x4_16(RA, RB, RC, RD);
+
+ ret;
+ELF(.size __twofish_enc_blk16,.-__twofish_enc_blk16;)
+
+.align 8
+ELF(.type __twofish_dec_blk16,@function;)
+__twofish_dec_blk16:
+ /* input:
+ * %rdi: ctx, CTX
+ * RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: sixteen parallel
+ * ciphertext blocks
+ * output:
+ * RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: sixteen parallel
+ * plaintext blocks
+ */
+ init_round_constants();
+
+ transpose4x4_16(RA, RB, RC, RD);
+ inpack_dec16(RA, RB, RC, RD);
+
+ decrypt_cycle_first16(14);
+ decrypt_cycle16(12);
+ decrypt_cycle16(10);
+ decrypt_cycle16(8);
+ decrypt_cycle16(6);
+ decrypt_cycle16(4);
+ decrypt_cycle16(2);
+ decrypt_cycle_last16(0);
+
+ outunpack_dec16(RA, RB, RC, RD);
+ transpose4x4_16(RA, RB, RC, RD);
+
+ ret;
+ELF(.size __twofish_dec_blk16,.-__twofish_dec_blk16;)
+
+#define inc_le128(x, minus_one, tmp) \
+ vpcmpeqq minus_one, x, tmp; \
+ vpsubq minus_one, x, x; \
+ vpslldq $8, tmp, tmp; \
+ vpsubq tmp, x, x;
+
+.align 8
+.globl _gcry_twofish_avx2_ctr_enc
+ELF(.type _gcry_twofish_avx2_ctr_enc,@function;)
+_gcry_twofish_avx2_ctr_enc:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ * %rcx: iv (big endian, 128bit)
+ */
+
+ movq 8(%rcx), %rax;
+ bswapq %rax;
+
+ vzeroupper;
+
+ vbroadcasti128 .Lbswap128_mask RIP, RTMP3;
+ vpcmpeqd RNOT, RNOT, RNOT;
+ vpsrldq $8, RNOT, RNOT; /* ab: -1:0 ; cd: -1:0 */
+ vpaddq RNOT, RNOT, RTMP2; /* ab: -2:0 ; cd: -2:0 */
+
+ /* load IV and byteswap */
+ vmovdqu (%rcx), RTMP4x;
+ vpshufb RTMP3x, RTMP4x, RTMP4x;
+ vmovdqa RTMP4x, RTMP0x;
+ inc_le128(RTMP4x, RNOTx, RTMP1x);
+ vinserti128 $1, RTMP4x, RTMP0, RTMP0;
+ vpshufb RTMP3, RTMP0, RA0; /* +1 ; +0 */
+
+ /* check need for handling 64-bit overflow and carry */
+ cmpq $(0xffffffffffffffff - 16), %rax;
+ ja .Lhandle_ctr_carry;
+
+ /* construct IVs */
+ vpsubq RTMP2, RTMP0, RTMP0; /* +3 ; +2 */
+ vpshufb RTMP3, RTMP0, RB0;
+ vpsubq RTMP2, RTMP0, RTMP0; /* +5 ; +4 */
+ vpshufb RTMP3, RTMP0, RC0;
+ vpsubq RTMP2, RTMP0, RTMP0; /* +7 ; +6 */
+ vpshufb RTMP3, RTMP0, RD0;
+ vpsubq RTMP2, RTMP0, RTMP0; /* +9 ; +8 */
+ vpshufb RTMP3, RTMP0, RA1;
+ vpsubq RTMP2, RTMP0, RTMP0; /* +11 ; +10 */
+ vpshufb RTMP3, RTMP0, RB1;
+ vpsubq RTMP2, RTMP0, RTMP0; /* +13 ; +12 */
+ vpshufb RTMP3, RTMP0, RC1;
+ vpsubq RTMP2, RTMP0, RTMP0; /* +15 ; +14 */
+ vpshufb RTMP3, RTMP0, RD1;
+ vpsubq RTMP2, RTMP0, RTMP0; /* +16 */
+ vpshufb RTMP3x, RTMP0x, RTMP0x;
+
+ jmp .Lctr_carry_done;
+
+.Lhandle_ctr_carry:
+ /* construct IVs */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vpshufb RTMP3, RTMP0, RB0; /* +3 ; +2 */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vpshufb RTMP3, RTMP0, RC0; /* +5 ; +4 */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vpshufb RTMP3, RTMP0, RD0; /* +7 ; +6 */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vpshufb RTMP3, RTMP0, RA1; /* +9 ; +8 */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vpshufb RTMP3, RTMP0, RB1; /* +11 ; +10 */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vpshufb RTMP3, RTMP0, RC1; /* +13 ; +12 */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vpshufb RTMP3, RTMP0, RD1; /* +15 ; +14 */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vextracti128 $1, RTMP0, RTMP0x;
+ vpshufb RTMP3x, RTMP0x, RTMP0x; /* +16 */
+
+.align 4
+.Lctr_carry_done:
+ /* store new IV */
+ vmovdqu RTMP0x, (%rcx);
+
+ call __twofish_enc_blk16;
+
+ vpxor (0 * 32)(%rdx), RA0, RA0;
+ vpxor (1 * 32)(%rdx), RB0, RB0;
+ vpxor (2 * 32)(%rdx), RC0, RC0;
+ vpxor (3 * 32)(%rdx), RD0, RD0;
+ vpxor (4 * 32)(%rdx), RA1, RA1;
+ vpxor (5 * 32)(%rdx), RB1, RB1;
+ vpxor (6 * 32)(%rdx), RC1, RC1;
+ vpxor (7 * 32)(%rdx), RD1, RD1;
+
+ vmovdqu RA0, (0 * 32)(%rsi);
+ vmovdqu RB0, (1 * 32)(%rsi);
+ vmovdqu RC0, (2 * 32)(%rsi);
+ vmovdqu RD0, (3 * 32)(%rsi);
+ vmovdqu RA1, (4 * 32)(%rsi);
+ vmovdqu RB1, (5 * 32)(%rsi);
+ vmovdqu RC1, (6 * 32)(%rsi);
+ vmovdqu RD1, (7 * 32)(%rsi);
+
+ vzeroall;
+
+ ret
+ELF(.size _gcry_twofish_avx2_ctr_enc,.-_gcry_twofish_avx2_ctr_enc;)
+
+.align 8
+.globl _gcry_twofish_avx2_cbc_dec
+ELF(.type _gcry_twofish_avx2_cbc_dec,@function;)
+_gcry_twofish_avx2_cbc_dec:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ * %rcx: iv
+ */
+
+ vzeroupper;
+
+ vmovdqu (0 * 32)(%rdx), RA0;
+ vmovdqu (1 * 32)(%rdx), RB0;
+ vmovdqu (2 * 32)(%rdx), RC0;
+ vmovdqu (3 * 32)(%rdx), RD0;
+ vmovdqu (4 * 32)(%rdx), RA1;
+ vmovdqu (5 * 32)(%rdx), RB1;
+ vmovdqu (6 * 32)(%rdx), RC1;
+ vmovdqu (7 * 32)(%rdx), RD1;
+
+ call __twofish_dec_blk16;
+
+ vmovdqu (%rcx), RNOTx;
+ vinserti128 $1, (%rdx), RNOT, RNOT;
+ vpxor RNOT, RA0, RA0;
+ vpxor (0 * 32 + 16)(%rdx), RB0, RB0;
+ vpxor (1 * 32 + 16)(%rdx), RC0, RC0;
+ vpxor (2 * 32 + 16)(%rdx), RD0, RD0;
+ vpxor (3 * 32 + 16)(%rdx), RA1, RA1;
+ vpxor (4 * 32 + 16)(%rdx), RB1, RB1;
+ vpxor (5 * 32 + 16)(%rdx), RC1, RC1;
+ vpxor (6 * 32 + 16)(%rdx), RD1, RD1;
+ vmovdqu (7 * 32 + 16)(%rdx), RNOTx;
+ vmovdqu RNOTx, (%rcx); /* store new IV */
+
+ vmovdqu RA0, (0 * 32)(%rsi);
+ vmovdqu RB0, (1 * 32)(%rsi);
+ vmovdqu RC0, (2 * 32)(%rsi);
+ vmovdqu RD0, (3 * 32)(%rsi);
+ vmovdqu RA1, (4 * 32)(%rsi);
+ vmovdqu RB1, (5 * 32)(%rsi);
+ vmovdqu RC1, (6 * 32)(%rsi);
+ vmovdqu RD1, (7 * 32)(%rsi);
+
+ vzeroall;
+
+ ret
+ELF(.size _gcry_twofish_avx2_cbc_dec,.-_gcry_twofish_avx2_cbc_dec;)
+
+.align 8
+.globl _gcry_twofish_avx2_cfb_dec
+ELF(.type _gcry_twofish_avx2_cfb_dec,@function;)
+_gcry_twofish_avx2_cfb_dec:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ * %rcx: iv
+ */
+
+ vzeroupper;
+
+ /* Load input */
+ vmovdqu (%rcx), RNOTx;
+ vinserti128 $1, (%rdx), RNOT, RA0;
+ vmovdqu (0 * 32 + 16)(%rdx), RB0;
+ vmovdqu (1 * 32 + 16)(%rdx), RC0;
+ vmovdqu (2 * 32 + 16)(%rdx), RD0;
+ vmovdqu (3 * 32 + 16)(%rdx), RA1;
+ vmovdqu (4 * 32 + 16)(%rdx), RB1;
+ vmovdqu (5 * 32 + 16)(%rdx), RC1;
+ vmovdqu (6 * 32 + 16)(%rdx), RD1;
+
+ /* Update IV */
+ vmovdqu (7 * 32 + 16)(%rdx), RNOTx;
+ vmovdqu RNOTx, (%rcx);
+
+ call __twofish_enc_blk16;
+
+ vpxor (0 * 32)(%rdx), RA0, RA0;
+ vpxor (1 * 32)(%rdx), RB0, RB0;
+ vpxor (2 * 32)(%rdx), RC0, RC0;
+ vpxor (3 * 32)(%rdx), RD0, RD0;
+ vpxor (4 * 32)(%rdx), RA1, RA1;
+ vpxor (5 * 32)(%rdx), RB1, RB1;
+ vpxor (6 * 32)(%rdx), RC1, RC1;
+ vpxor (7 * 32)(%rdx), RD1, RD1;
+
+ vmovdqu RA0, (0 * 32)(%rsi);
+ vmovdqu RB0, (1 * 32)(%rsi);
+ vmovdqu RC0, (2 * 32)(%rsi);
+ vmovdqu RD0, (3 * 32)(%rsi);
+ vmovdqu RA1, (4 * 32)(%rsi);
+ vmovdqu RB1, (5 * 32)(%rsi);
+ vmovdqu RC1, (6 * 32)(%rsi);
+ vmovdqu RD1, (7 * 32)(%rsi);
+
+ vzeroall;
+
+ ret
+ELF(.size _gcry_twofish_avx2_cfb_dec,.-_gcry_twofish_avx2_cfb_dec;)
+
+.align 8
+.globl _gcry_twofish_avx2_ocb_enc
+ELF(.type _gcry_twofish_avx2_ocb_enc,@function;)
+
+_gcry_twofish_avx2_ocb_enc:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ * %rcx: offset
+ * %r8 : checksum
+ * %r9 : L pointers (void *L[16])
+ */
+
+ vzeroupper;
+
+ subq $(4 * 8), %rsp;
+
+ movq %r10, (0 * 8)(%rsp);
+ movq %r11, (1 * 8)(%rsp);
+ movq %r12, (2 * 8)(%rsp);
+ movq %r13, (3 * 8)(%rsp);
+
+ vmovdqu (%rcx), RTMP0x;
+ vmovdqu (%r8), RTMP1x;
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
+
+#define OCB_INPUT(n, l0reg, l1reg, yreg) \
+ vmovdqu (n * 32)(%rdx), yreg; \
+ vpxor (l0reg), RTMP0x, RNOTx; \
+ vpxor (l1reg), RNOTx, RTMP0x; \
+ vinserti128 $1, RTMP0x, RNOT, RNOT; \
+ vpxor yreg, RTMP1, RTMP1; \
+ vpxor yreg, RNOT, yreg; \
+ vmovdqu RNOT, (n * 32)(%rsi);
+
+ movq (0 * 8)(%r9), %r10;
+ movq (1 * 8)(%r9), %r11;
+ movq (2 * 8)(%r9), %r12;
+ movq (3 * 8)(%r9), %r13;
+ OCB_INPUT(0, %r10, %r11, RA0);
+ OCB_INPUT(1, %r12, %r13, RB0);
+ movq (4 * 8)(%r9), %r10;
+ movq (5 * 8)(%r9), %r11;
+ movq (6 * 8)(%r9), %r12;
+ movq (7 * 8)(%r9), %r13;
+ OCB_INPUT(2, %r10, %r11, RC0);
+ OCB_INPUT(3, %r12, %r13, RD0);
+ movq (8 * 8)(%r9), %r10;
+ movq (9 * 8)(%r9), %r11;
+ movq (10 * 8)(%r9), %r12;
+ movq (11 * 8)(%r9), %r13;
+ OCB_INPUT(4, %r10, %r11, RA1);
+ OCB_INPUT(5, %r12, %r13, RB1);
+ movq (12 * 8)(%r9), %r10;
+ movq (13 * 8)(%r9), %r11;
+ movq (14 * 8)(%r9), %r12;
+ movq (15 * 8)(%r9), %r13;
+ OCB_INPUT(6, %r10, %r11, RC1);
+ OCB_INPUT(7, %r12, %r13, RD1);
+#undef OCB_INPUT
+
+ vextracti128 $1, RTMP1, RNOTx;
+ vmovdqu RTMP0x, (%rcx);
+ vpxor RNOTx, RTMP1x, RTMP1x;
+ vmovdqu RTMP1x, (%r8);
+
+ movq (0 * 8)(%rsp), %r10;
+ movq (1 * 8)(%rsp), %r11;
+ movq (2 * 8)(%rsp), %r12;
+ movq (3 * 8)(%rsp), %r13;
+
+ call __twofish_enc_blk16;
+
+ addq $(4 * 8), %rsp;
+
+ vpxor (0 * 32)(%rsi), RA0, RA0;
+ vpxor (1 * 32)(%rsi), RB0, RB0;
+ vpxor (2 * 32)(%rsi), RC0, RC0;
+ vpxor (3 * 32)(%rsi), RD0, RD0;
+ vpxor (4 * 32)(%rsi), RA1, RA1;
+ vpxor (5 * 32)(%rsi), RB1, RB1;
+ vpxor (6 * 32)(%rsi), RC1, RC1;
+ vpxor (7 * 32)(%rsi), RD1, RD1;
+
+ vmovdqu RA0, (0 * 32)(%rsi);
+ vmovdqu RB0, (1 * 32)(%rsi);
+ vmovdqu RC0, (2 * 32)(%rsi);
+ vmovdqu RD0, (3 * 32)(%rsi);
+ vmovdqu RA1, (4 * 32)(%rsi);
+ vmovdqu RB1, (5 * 32)(%rsi);
+ vmovdqu RC1, (6 * 32)(%rsi);
+ vmovdqu RD1, (7 * 32)(%rsi);
+
+ vzeroall;
+
+ ret;
+ELF(.size _gcry_twofish_avx2_ocb_enc,.-_gcry_twofish_avx2_ocb_enc;)
+
+.align 8
+.globl _gcry_twofish_avx2_ocb_dec
+ELF(.type _gcry_twofish_avx2_ocb_dec,@function;)
+
+_gcry_twofish_avx2_ocb_dec:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ * %rcx: offset
+ * %r8 : checksum
+ * %r9 : L pointers (void *L[16])
+ */
+
+ vzeroupper;
+
+ subq $(4 * 8), %rsp;
+
+ movq %r10, (0 * 8)(%rsp);
+ movq %r11, (1 * 8)(%rsp);
+ movq %r12, (2 * 8)(%rsp);
+ movq %r13, (3 * 8)(%rsp);
+
+ vmovdqu (%rcx), RTMP0x;
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
+
+#define OCB_INPUT(n, l0reg, l1reg, yreg) \
+ vmovdqu (n * 32)(%rdx), yreg; \
+ vpxor (l0reg), RTMP0x, RNOTx; \
+ vpxor (l1reg), RNOTx, RTMP0x; \
+ vinserti128 $1, RTMP0x, RNOT, RNOT; \
+ vpxor yreg, RNOT, yreg; \
+ vmovdqu RNOT, (n * 32)(%rsi);
+
+ movq (0 * 8)(%r9), %r10;
+ movq (1 * 8)(%r9), %r11;
+ movq (2 * 8)(%r9), %r12;
+ movq (3 * 8)(%r9), %r13;
+ OCB_INPUT(0, %r10, %r11, RA0);
+ OCB_INPUT(1, %r12, %r13, RB0);
+ movq (4 * 8)(%r9), %r10;
+ movq (5 * 8)(%r9), %r11;
+ movq (6 * 8)(%r9), %r12;
+ movq (7 * 8)(%r9), %r13;
+ OCB_INPUT(2, %r10, %r11, RC0);
+ OCB_INPUT(3, %r12, %r13, RD0);
+ movq (8 * 8)(%r9), %r10;
+ movq (9 * 8)(%r9), %r11;
+ movq (10 * 8)(%r9), %r12;
+ movq (11 * 8)(%r9), %r13;
+ OCB_INPUT(4, %r10, %r11, RA1);
+ OCB_INPUT(5, %r12, %r13, RB1);
+ movq (12 * 8)(%r9), %r10;
+ movq (13 * 8)(%r9), %r11;
+ movq (14 * 8)(%r9), %r12;
+ movq (15 * 8)(%r9), %r13;
+ OCB_INPUT(6, %r10, %r11, RC1);
+ OCB_INPUT(7, %r12, %r13, RD1);
+#undef OCB_INPUT
+
+ vmovdqu RTMP0x, (%rcx);
+ mov %r8, %rcx
+
+ movq (0 * 8)(%rsp), %r10;
+ movq (1 * 8)(%rsp), %r11;
+ movq (2 * 8)(%rsp), %r12;
+ movq (3 * 8)(%rsp), %r13;
+
+ call __twofish_dec_blk16;
+
+ vmovdqu (%rcx), RTMP1x;
+
+ vpxor (0 * 32)(%rsi), RA0, RA0;
+ vpxor (1 * 32)(%rsi), RB0, RB0;
+ vpxor (2 * 32)(%rsi), RC0, RC0;
+ vpxor (3 * 32)(%rsi), RD0, RD0;
+ vpxor (4 * 32)(%rsi), RA1, RA1;
+ vpxor (5 * 32)(%rsi), RB1, RB1;
+ vpxor (6 * 32)(%rsi), RC1, RC1;
+ vpxor (7 * 32)(%rsi), RD1, RD1;
+
+ addq $(4 * 8), %rsp;
+
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+
+ vmovdqu RA0, (0 * 32)(%rsi);
+ vpxor RA0, RTMP1, RTMP1;
+ vmovdqu RB0, (1 * 32)(%rsi);
+ vpxor RB0, RTMP1, RTMP1;
+ vmovdqu RC0, (2 * 32)(%rsi);
+ vpxor RC0, RTMP1, RTMP1;
+ vmovdqu RD0, (3 * 32)(%rsi);
+ vpxor RD0, RTMP1, RTMP1;
+ vmovdqu RA1, (4 * 32)(%rsi);
+ vpxor RA1, RTMP1, RTMP1;
+ vmovdqu RB1, (5 * 32)(%rsi);
+ vpxor RB1, RTMP1, RTMP1;
+ vmovdqu RC1, (6 * 32)(%rsi);
+ vpxor RC1, RTMP1, RTMP1;
+ vmovdqu RD1, (7 * 32)(%rsi);
+ vpxor RD1, RTMP1, RTMP1;
+
+ vextracti128 $1, RTMP1, RNOTx;
+ vpxor RNOTx, RTMP1x, RTMP1x;
+ vmovdqu RTMP1x, (%rcx);
+
+ vzeroall;
+
+ ret;
+ELF(.size _gcry_twofish_avx2_ocb_dec,.-_gcry_twofish_avx2_ocb_dec;)
+
+.align 8
+.globl _gcry_twofish_avx2_ocb_auth
+ELF(.type _gcry_twofish_avx2_ocb_auth,@function;)
+
+_gcry_twofish_avx2_ocb_auth:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: abuf (16 blocks)
+ * %rdx: offset
+ * %rcx: checksum
+ * %r8 : L pointers (void *L[16])
+ */
+
+ vzeroupper;
+
+ subq $(4 * 8), %rsp;
+
+ movq %r10, (0 * 8)(%rsp);
+ movq %r11, (1 * 8)(%rsp);
+ movq %r12, (2 * 8)(%rsp);
+ movq %r13, (3 * 8)(%rsp);
+
+ vmovdqu (%rdx), RTMP0x;
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
+
+#define OCB_INPUT(n, l0reg, l1reg, yreg) \
+ vmovdqu (n * 32)(%rsi), yreg; \
+ vpxor (l0reg), RTMP0x, RNOTx; \
+ vpxor (l1reg), RNOTx, RTMP0x; \
+ vinserti128 $1, RTMP0x, RNOT, RNOT; \
+ vpxor yreg, RNOT, yreg;
+
+ movq (0 * 8)(%r8), %r10;
+ movq (1 * 8)(%r8), %r11;
+ movq (2 * 8)(%r8), %r12;
+ movq (3 * 8)(%r8), %r13;
+ OCB_INPUT(0, %r10, %r11, RA0);
+ OCB_INPUT(1, %r12, %r13, RB0);
+ movq (4 * 8)(%r8), %r10;
+ movq (5 * 8)(%r8), %r11;
+ movq (6 * 8)(%r8), %r12;
+ movq (7 * 8)(%r8), %r13;
+ OCB_INPUT(2, %r10, %r11, RC0);
+ OCB_INPUT(3, %r12, %r13, RD0);
+ movq (8 * 8)(%r8), %r10;
+ movq (9 * 8)(%r8), %r11;
+ movq (10 * 8)(%r8), %r12;
+ movq (11 * 8)(%r8), %r13;
+ OCB_INPUT(4, %r10, %r11, RA1);
+ OCB_INPUT(5, %r12, %r13, RB1);
+ movq (12 * 8)(%r8), %r10;
+ movq (13 * 8)(%r8), %r11;
+ movq (14 * 8)(%r8), %r12;
+ movq (15 * 8)(%r8), %r13;
+ OCB_INPUT(6, %r10, %r11, RC1);
+ OCB_INPUT(7, %r12, %r13, RD1);
+#undef OCB_INPUT
+
+ vmovdqu RTMP0x, (%rdx);
+
+ movq (0 * 8)(%rsp), %r10;
+ movq (1 * 8)(%rsp), %r11;
+ movq (2 * 8)(%rsp), %r12;
+ movq (3 * 8)(%rsp), %r13;
+
+ call __twofish_enc_blk16;
+
+ vpxor RA0, RB0, RA0;
+ vpxor RC0, RD0, RC0;
+ vpxor RA1, RB1, RA1;
+ vpxor RC1, RD1, RC1;
+
+ vpxor RA0, RC0, RA0;
+ vpxor RA1, RC1, RA1;
+
+ addq $(4 * 8), %rsp;
+
+ vpxor RA1, RA0, RTMP1;
+
+ vextracti128 $1, RTMP1, RNOTx;
+ vpxor (%rcx), RTMP1x, RTMP1x;
+ vpxor RNOTx, RTMP1x, RTMP1x;
+ vmovdqu RTMP1x, (%rcx);
+
+ vzeroall;
+
+ ret;
+ELF(.size _gcry_twofish_avx2_ocb_auth,.-_gcry_twofish_avx2_ocb_auth;)
+
+.align 16
+
+/* For CTR-mode IV byteswap */
+ _gcry_twofish_bswap128_mask:
+.Lbswap128_mask:
+ .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+ELF(.size _gcry_twofish_bswap128_mask,.-_gcry_twofish_bswap128_mask;)
+
+#endif /*defined(USE_TWOFISH) && defined(ENABLE_AVX2_SUPPORT)*/
+#endif /*__x86_64*/
diff --git a/cipher/twofish.c b/cipher/twofish.c
index 55f6fb9..942e8d4 100644
--- a/cipher/twofish.c
+++ b/cipher/twofish.c
@@ -72,6 +72,15 @@
# endif
# endif
+/* USE_AVX2 indicates whether to compile with AMD64 AVX2 code. */
+#undef USE_AVX2
+#if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+# if defined(ENABLE_AVX2_SUPPORT)
+# define USE_AVX2 1
+# endif
+#endif
+
/* Prototype for the self-test function. */
static const char *selftest(void);
@@ -82,8 +91,25 @@ static const char *selftest(void);
* that k[i] corresponds to what the Twofish paper calls K[i+8]. */
typedef struct {
u32 s[4][256], w[8], k[32];
+
+#ifdef USE_AVX2
+ int use_avx2;
+#endif
} TWOFISH_context;
+
+/* Assembly implementations use the SystemV ABI; on Win64, an ABI conversion
+ * and additional stack space to store XMM6-XMM15 are needed. */
+#undef ASM_FUNC_ABI
+#if defined(USE_AVX2)
+# ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+# define ASM_FUNC_ABI __attribute__((sysv_abi))
+# else
+# define ASM_FUNC_ABI
+# endif
+#endif
+
+
/* These two tables are the q0 and q1 permutations, exactly as described in
* the Twofish paper. */
@@ -711,12 +737,66 @@ static gcry_err_code_t
twofish_setkey (void *context, const byte *key, unsigned int keylen)
{
TWOFISH_context *ctx = context;
- int rc = do_twofish_setkey (ctx, key, keylen);
+ unsigned int hwfeatures = _gcry_get_hw_features ();
+ int rc;
+
+ rc = do_twofish_setkey (ctx, key, keylen);
+
+#ifdef USE_AVX2
+ ctx->use_avx2 = 0;
+ if ((hwfeatures & HWF_INTEL_AVX2) && (hwfeatures & HWF_INTEL_FAST_VPGATHER))
+ {
+ ctx->use_avx2 = 1;
+ }
+#endif
+
+ (void)hwfeatures;
+
_gcry_burn_stack (23+6*sizeof(void*));
return rc;
}
+#ifdef USE_AVX2
+/* Assembler implementations of Twofish using AVX2.  Process 16 blocks in
+ parallel.
+ */
+extern void _gcry_twofish_avx2_ctr_enc(const TWOFISH_context *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *ctr) ASM_FUNC_ABI;
+
+extern void _gcry_twofish_avx2_cbc_dec(const TWOFISH_context *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *iv) ASM_FUNC_ABI;
+
+extern void _gcry_twofish_avx2_cfb_dec(const TWOFISH_context *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *iv) ASM_FUNC_ABI;
+
+extern void _gcry_twofish_avx2_ocb_enc(const TWOFISH_context *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *offset,
+ unsigned char *checksum,
+ const u64 Ls[16]) ASM_FUNC_ABI;
+
+extern void _gcry_twofish_avx2_ocb_dec(const TWOFISH_context *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *offset,
+ unsigned char *checksum,
+ const u64 Ls[16]) ASM_FUNC_ABI;
+
+extern void _gcry_twofish_avx2_ocb_auth(const TWOFISH_context *ctx,
+ const unsigned char *abuf,
+ unsigned char *offset,
+ unsigned char *checksum,
+ const u64 Ls[16]) ASM_FUNC_ABI;
+#endif
+
#ifdef USE_AMD64_ASM
@@ -1111,6 +1191,31 @@ _gcry_twofish_ctr_enc(void *context, unsigned char *ctr, void *outbuf_arg,
unsigned int burn, burn_stack_depth = 0;
int i;
+#ifdef USE_AVX2
+ if (ctx->use_avx2)
+ {
+ int did_use_avx2 = 0;
+
+ /* Process data in 16 block chunks. */
+ while (nblocks >= 16)
+ {
+ _gcry_twofish_avx2_ctr_enc(ctx, outbuf, inbuf, ctr);
+
+ nblocks -= 16;
+ outbuf += 16 * TWOFISH_BLOCKSIZE;
+ inbuf += 16 * TWOFISH_BLOCKSIZE;
+ did_use_avx2 = 1;
+ }
+
+ if (did_use_avx2)
+ {
+ /* twofish-avx2 assembly code does not use stack */
+ if (nblocks == 0)
+ burn_stack_depth = 0;
+ }
+ }
+#endif
+
#ifdef USE_AMD64_ASM
{
/* Process data in 3 block chunks. */
@@ -1169,6 +1274,31 @@ _gcry_twofish_cbc_dec(void *context, unsigned char *iv, void *outbuf_arg,
unsigned char savebuf[TWOFISH_BLOCKSIZE];
unsigned int burn, burn_stack_depth = 0;
+#ifdef USE_AVX2
+ if (ctx->use_avx2)
+ {
+ int did_use_avx2 = 0;
+
+ /* Process data in 16 block chunks. */
+ while (nblocks >= 16)
+ {
+ _gcry_twofish_avx2_cbc_dec(ctx, outbuf, inbuf, iv);
+
+ nblocks -= 16;
+ outbuf += 16 * TWOFISH_BLOCKSIZE;
+ inbuf += 16 * TWOFISH_BLOCKSIZE;
+ did_use_avx2 = 1;
+ }
+
+ if (did_use_avx2)
+ {
+ /* twofish-avx2 assembly code does not use stack */
+ if (nblocks == 0)
+ burn_stack_depth = 0;
+ }
+ }
+#endif
+
#ifdef USE_AMD64_ASM
{
/* Process data in 3 block chunks. */
@@ -1218,6 +1348,31 @@ _gcry_twofish_cfb_dec(void *context, unsigned char *iv, void *outbuf_arg,
const unsigned char *inbuf = inbuf_arg;
unsigned int burn, burn_stack_depth = 0;
+#ifdef USE_AVX2
+ if (ctx->use_avx2)
+ {
+ int did_use_avx2 = 0;
+
+ /* Process data in 16 block chunks. */
+ while (nblocks >= 16)
+ {
+ _gcry_twofish_avx2_cfb_dec(ctx, outbuf, inbuf, iv);
+
+ nblocks -= 16;
+ outbuf += 16 * TWOFISH_BLOCKSIZE;
+ inbuf += 16 * TWOFISH_BLOCKSIZE;
+ did_use_avx2 = 1;
+ }
+
+ if (did_use_avx2)
+ {
+ /* twofish-avx2 assembly code does not use stack */
+ if (nblocks == 0)
+ burn_stack_depth = 0;
+ }
+ }
+#endif
+
#ifdef USE_AMD64_ASM
{
/* Process data in 3 block chunks. */
@@ -1264,6 +1419,62 @@ _gcry_twofish_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
unsigned int burn, burn_stack_depth = 0;
u64 blkn = c->u_mode.ocb.data_nblocks;
+#ifdef USE_AVX2
+ if (ctx->use_avx2)
+ {
+ int did_use_avx2 = 0;
+ u64 Ls[16];
+ unsigned int n = 16 - (blkn % 16);
+ u64 *l;
+ int i;
+
+ if (nblocks >= 16)
+ {
+ for (i = 0; i < 16; i += 8)
+ {
+ /* Use u64 to store pointers for x32 support (assembly function
+ * assumes 64-bit pointers). */
+ Ls[(i + 0 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(i + 1 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+ Ls[(i + 2 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(i + 3 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
+ Ls[(i + 4 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(i + 5 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+ Ls[(i + 6 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ }
+
+ Ls[(7 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
+ l = &Ls[(15 + n) % 16];
+
+ /* Process data in 16 block chunks. */
+ while (nblocks >= 16)
+ {
+ blkn += 16;
+ *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 16);
+
+ if (encrypt)
+ _gcry_twofish_avx2_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
+ c->u_ctr.ctr, Ls);
+ else
+ _gcry_twofish_avx2_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
+ c->u_ctr.ctr, Ls);
+
+ nblocks -= 16;
+ outbuf += 16 * TWOFISH_BLOCKSIZE;
+ inbuf += 16 * TWOFISH_BLOCKSIZE;
+ did_use_avx2 = 1;
+ }
+ }
+
+ if (did_use_avx2)
+ {
+ /* twofish-avx2 assembly code does not use stack */
+ if (nblocks == 0)
+ burn_stack_depth = 0;
+ }
+ }
+#endif
+
{
/* Use u64 to store pointers for x32 support (assembly function
* assumes 64-bit pointers). */
@@ -1321,6 +1532,59 @@ _gcry_twofish_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
unsigned int burn, burn_stack_depth = 0;
u64 blkn = c->u_mode.ocb.aad_nblocks;
+#ifdef USE_AVX2
+ if (ctx->use_avx2)
+ {
+ int did_use_avx2 = 0;
+ u64 Ls[16];
+ unsigned int n = 16 - (blkn % 16);
+ u64 *l;
+ int i;
+
+ if (nblocks >= 16)
+ {
+ for (i = 0; i < 16; i += 8)
+ {
+ /* Use u64 to store pointers for x32 support (assembly function
+ * assumes 64-bit pointers). */
+ Ls[(i + 0 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(i + 1 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+ Ls[(i + 2 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(i + 3 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
+ Ls[(i + 4 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(i + 5 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+ Ls[(i + 6 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ }
+
+ Ls[(7 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
+ l = &Ls[(15 + n) % 16];
+
+ /* Process data in 16 block chunks. */
+ while (nblocks >= 16)
+ {
+ blkn += 16;
+ *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 16);
+
+ _gcry_twofish_avx2_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
+ c->u_mode.ocb.aad_sum, Ls);
+
+ nblocks -= 16;
+ abuf += 16 * TWOFISH_BLOCKSIZE;
+ did_use_avx2 = 1;
+ }
+ }
+
+ if (did_use_avx2)
+ {
+ /* twofish-avx2 assembly code does not use stack */
+ if (nblocks == 0)
+ burn_stack_depth = 0;
+ }
+
+ /* Use generic code to handle smaller chunks... */
+ }
+#endif
+
{
/* Use u64 to store pointers for x32 support (assembly function
* assumes 64-bit pointers). */
@@ -1367,7 +1631,7 @@ _gcry_twofish_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
static const char *
selftest_ctr (void)
{
- const int nblocks = 3+1;
+ const int nblocks = 16+1;
const int blocksize = TWOFISH_BLOCKSIZE;
const int context_size = sizeof(TWOFISH_context);
@@ -1381,7 +1645,7 @@ selftest_ctr (void)
static const char *
selftest_cbc (void)
{
- const int nblocks = 3+2;
+ const int nblocks = 16+2;
const int blocksize = TWOFISH_BLOCKSIZE;
const int context_size = sizeof(TWOFISH_context);
@@ -1395,7 +1659,7 @@ selftest_cbc (void)
static const char *
selftest_cfb (void)
{
- const int nblocks = 3+2;
+ const int nblocks = 16+2;
const int blocksize = TWOFISH_BLOCKSIZE;
const int context_size = sizeof(TWOFISH_context);
diff --git a/configure.ac b/configure.ac
index 91562a9..4932786 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2070,6 +2070,11 @@ if test "$found" = "1" ; then
x86_64-*-*)
# Build with the assembly implementation
GCRYPT_CIPHERS="$GCRYPT_CIPHERS twofish-amd64.lo"
+
+ if test x"$avx2support" = xyes ; then
+ # Build with the AVX2 implementation
+ GCRYPT_CIPHERS="$GCRYPT_CIPHERS twofish-avx2-amd64.lo"
+ fi
;;
arm*-*-*)
# Build with the assembly implementation
diff --git a/src/g10lib.h b/src/g10lib.h
index f0a4628..1308cff 100644
--- a/src/g10lib.h
+++ b/src/g10lib.h
@@ -196,27 +196,28 @@ char **_gcry_strtokenize (const char *string, const char *delim);
/*-- src/hwfeatures.c --*/
-#define HWF_PADLOCK_RNG (1 << 0)
-#define HWF_PADLOCK_AES (1 << 1)
-#define HWF_PADLOCK_SHA (1 << 2)
-#define HWF_PADLOCK_MMUL (1 << 3)
-
-#define HWF_INTEL_CPU (1 << 4)
-#define HWF_INTEL_FAST_SHLD (1 << 5)
-#define HWF_INTEL_BMI2 (1 << 6)
-#define HWF_INTEL_SSSE3 (1 << 7)
-#define HWF_INTEL_SSE4_1 (1 << 8)
-#define HWF_INTEL_PCLMUL (1 << 9)
-#define HWF_INTEL_AESNI (1 << 10)
-#define HWF_INTEL_RDRAND (1 << 11)
-#define HWF_INTEL_AVX (1 << 12)
-#define HWF_INTEL_AVX2 (1 << 13)
-
-#define HWF_ARM_NEON (1 << 14)
-#define HWF_ARM_AES (1 << 15)
-#define HWF_ARM_SHA1 (1 << 16)
-#define HWF_ARM_SHA2 (1 << 17)
-#define HWF_ARM_PMULL (1 << 18)
+#define HWF_PADLOCK_RNG (1 << 0)
+#define HWF_PADLOCK_AES (1 << 1)
+#define HWF_PADLOCK_SHA (1 << 2)
+#define HWF_PADLOCK_MMUL (1 << 3)
+
+#define HWF_INTEL_CPU (1 << 4)
+#define HWF_INTEL_FAST_SHLD (1 << 5)
+#define HWF_INTEL_BMI2 (1 << 6)
+#define HWF_INTEL_SSSE3 (1 << 7)
+#define HWF_INTEL_SSE4_1 (1 << 8)
+#define HWF_INTEL_PCLMUL (1 << 9)
+#define HWF_INTEL_AESNI (1 << 10)
+#define HWF_INTEL_RDRAND (1 << 11)
+#define HWF_INTEL_AVX (1 << 12)
+#define HWF_INTEL_AVX2 (1 << 13)
+#define HWF_INTEL_FAST_VPGATHER (1 << 14)
+
+#define HWF_ARM_NEON (1 << 15)
+#define HWF_ARM_AES (1 << 16)
+#define HWF_ARM_SHA1 (1 << 17)
+#define HWF_ARM_SHA2 (1 << 18)
+#define HWF_ARM_PMULL (1 << 19)
gpg_err_code_t _gcry_disable_hw_feature (const char *name);
diff --git a/src/hwf-x86.c b/src/hwf-x86.c
index eeacccb..a746ab2 100644
--- a/src/hwf-x86.c
+++ b/src/hwf-x86.c
@@ -176,6 +176,7 @@ detect_x86_gnuc (void)
unsigned int max_cpuid_level;
unsigned int fms, family, model;
unsigned int result = 0;
+ unsigned int avoid_vpgather = 0;
(void)os_supports_avx_avx2_registers;
@@ -262,11 +263,33 @@ detect_x86_gnuc (void)
case 0x47:
case 0x4E:
case 0x5E:
+ case 0x8E:
+ case 0x9E:
case 0x55:
case 0x66:
result |= HWF_INTEL_FAST_SHLD;
break;
}
+
+ /* These Intel Core processor models have AVX2 but slow VPGATHER, so
+ * vpgather-based table lookups should be avoided on them. */
+ switch (model)
+ {
+ case 0x3C:
+ case 0x3F:
+ case 0x45:
+ case 0x46:
+ /* Haswell */
+ avoid_vpgather |= 1;
+ break;
+ }
+ }
+ else
+ {
+ /* Avoid VPGATHER for non-Intel CPUs as testing is needed to
+ * make sure it is fast enough. */
+
+ avoid_vpgather |= 1;
}
#ifdef ENABLE_PCLMUL_SUPPORT
@@ -324,6 +347,9 @@ detect_x86_gnuc (void)
if (features & 0x00000020)
if (os_supports_avx_avx2_registers)
result |= HWF_INTEL_AVX2;
+
+ if ((result & HWF_INTEL_AVX2) && !avoid_vpgather)
+ result |= HWF_INTEL_FAST_VPGATHER;
#endif /*ENABLE_AVX_SUPPORT*/
}
diff --git a/src/hwfeatures.c b/src/hwfeatures.c
index 82f8bf2..b2ae7c3 100644
--- a/src/hwfeatures.c
+++ b/src/hwfeatures.c
@@ -42,25 +42,26 @@ static struct
const char *desc;
} hwflist[] =
{
- { HWF_PADLOCK_RNG, "padlock-rng" },
- { HWF_PADLOCK_AES, "padlock-aes" },
- { HWF_PADLOCK_SHA, "padlock-sha" },
- { HWF_PADLOCK_MMUL, "padlock-mmul"},
- { HWF_INTEL_CPU, "intel-cpu" },
- { HWF_INTEL_FAST_SHLD, "intel-fast-shld" },
- { HWF_INTEL_BMI2, "intel-bmi2" },
- { HWF_INTEL_SSSE3, "intel-ssse3" },
- { HWF_INTEL_SSE4_1, "intel-sse4.1" },
- { HWF_INTEL_PCLMUL, "intel-pclmul" },
- { HWF_INTEL_AESNI, "intel-aesni" },
- { HWF_INTEL_RDRAND, "intel-rdrand" },
- { HWF_INTEL_AVX, "intel-avx" },
- { HWF_INTEL_AVX2, "intel-avx2" },
- { HWF_ARM_NEON, "arm-neon" },
- { HWF_ARM_AES, "arm-aes" },
- { HWF_ARM_SHA1, "arm-sha1" },
- { HWF_ARM_SHA2, "arm-sha2" },
- { HWF_ARM_PMULL, "arm-pmull" }
+ { HWF_PADLOCK_RNG, "padlock-rng" },
+ { HWF_PADLOCK_AES, "padlock-aes" },
+ { HWF_PADLOCK_SHA, "padlock-sha" },
+ { HWF_PADLOCK_MMUL, "padlock-mmul"},
+ { HWF_INTEL_CPU, "intel-cpu" },
+ { HWF_INTEL_FAST_SHLD, "intel-fast-shld" },
+ { HWF_INTEL_BMI2, "intel-bmi2" },
+ { HWF_INTEL_SSSE3, "intel-ssse3" },
+ { HWF_INTEL_SSE4_1, "intel-sse4.1" },
+ { HWF_INTEL_PCLMUL, "intel-pclmul" },
+ { HWF_INTEL_AESNI, "intel-aesni" },
+ { HWF_INTEL_RDRAND, "intel-rdrand" },
+ { HWF_INTEL_AVX, "intel-avx" },
+ { HWF_INTEL_AVX2, "intel-avx2" },
+ { HWF_INTEL_FAST_VPGATHER, "intel-fast-vpgather" },
+ { HWF_ARM_NEON, "arm-neon" },
+ { HWF_ARM_AES, "arm-aes" },
+ { HWF_ARM_SHA1, "arm-sha1" },
+ { HWF_ARM_SHA2, "arm-sha2" },
+ { HWF_ARM_PMULL, "arm-pmull" }
};
/* A bit vector with the hardware features which shall not be used.