[PATCH] Add AVX2/vpgather bulk implementation of Twofish

Jussi Kivilinna <jussi.kivilinna@iki.fi>
Wed Jan 4 16:15:17 CET 2017


* cipher/Makefile.am: Add 'twofish-avx2-amd64.S'.
* cipher/twofish-avx2-amd64.S: New.
* cipher/twofish.c (USE_AVX2): New.
(TWOFISH_context) [USE_AVX2]: Add 'use_avx2' member.
(ASM_FUNC_ABI): New.
(twofish_setkey): Add check for AVX2 and fast VPGATHER HW features.
(_gcry_twofish_avx2_ctr_enc, _gcry_twofish_avx2_cbc_dec)
(_gcry_twofish_avx2_cfb_dec, _gcry_twofish_avx2_ocb_enc)
(_gcry_twofish_avx2_ocb_dec, _gcry_twofish_avx2_ocb_auth): New.
(_gcry_twofish_ctr_enc, _gcry_twofish_cbc_dec, _gcry_twofish_cfb_dec)
(_gcry_twofish_ocb_crypt, _gcry_twofish_ocb_auth): Add AVX2 bulk
handling.
(selftest_ctr, selftest_cbc, selftest_cfb): Increase nblocks from
3+X to 16+X.
* configure.ac: Add 'twofish-avx2-amd64.lo'.
* src/g10lib.h (HWF_INTEL_FAST_VPGATHER): New.
* src/hwf-x86.c (detect_x86_gnuc): Add detection for
HWF_INTEL_FAST_VPGATHER.
* src/hwfeatures.c (HWF_INTEL_FAST_VPGATHER): Add
"intel-fast-vpgather" for HWF_INTEL_FAST_VPGATHER.
--
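
The speedup comes from doing Twofish's g() table lookups with AVX2
vpgatherdd.  Each 32-bit input word is split into four bytes, each byte
indexes one of the four key-dependent 256-entry tables (s0..s3 in
TWOFISH_context), and the four loaded words are XORed together; a single
vpgatherdd issues the loads for eight dword lanes, so the g16() macro
below covers sixteen blocks in parallel.  A rough scalar sketch of what
one lane computes (the helper name g_lookup is illustrative and not part
of the patch; u32 is libgcrypt's 32-bit type):

  static u32
  g_lookup (const u32 s[4][256], u32 x)
  {
    /* Four byte-indexed lookups into the key-dependent tables, XORed
     * together.  The AVX2 code performs these loads with vpgatherdd,
     * eight lanes per instruction, sixteen blocks in total.  */
    return s[0][(x >>  0) & 0xff]
         ^ s[1][(x >>  8) & 0xff]
         ^ s[2][(x >> 16) & 0xff]
         ^ s[3][(x >> 24) & 0xff];
  }

This is also why the bulk path is gated on both HWF_INTEL_AVX2 and the
new HWF_INTEL_FAST_VPGATHER flag: on CPUs where VPGATHER is slow (e.g.
the Haswell models listed in hwf-x86.c), gather-based table lookups would
likely not beat the existing 3-way AMD64 assembly.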

Benchmark on Intel Core i3-6100 (3.7 GHz):

Before:
 TWOFISH        |  nanosecs/byte   mebibytes/sec   cycles/byte
        ECB enc |      4.25 ns/B     224.5 MiB/s     15.71 c/B
        ECB dec |      4.16 ns/B     229.5 MiB/s     15.38 c/B
        CBC enc |      4.53 ns/B     210.4 MiB/s     16.77 c/B
        CBC dec |      2.71 ns/B     351.6 MiB/s     10.04 c/B
        CFB enc |      4.60 ns/B     207.3 MiB/s     17.02 c/B
        CFB dec |      2.70 ns/B     353.5 MiB/s      9.98 c/B
        OFB enc |      4.25 ns/B     224.2 MiB/s     15.74 c/B
        OFB dec |      4.24 ns/B     225.0 MiB/s     15.68 c/B
        CTR enc |      2.72 ns/B     350.6 MiB/s     10.06 c/B
        CTR dec |      2.72 ns/B     350.7 MiB/s     10.06 c/B
        CCM enc |      7.25 ns/B     131.5 MiB/s     26.83 c/B
        CCM dec |      7.25 ns/B     131.5 MiB/s     26.83 c/B
       CCM auth |      4.57 ns/B     208.9 MiB/s     16.89 c/B
        GCM enc |      3.02 ns/B     315.3 MiB/s     11.19 c/B
        GCM dec |      3.02 ns/B     315.6 MiB/s     11.18 c/B
       GCM auth |     0.297 ns/B    3208.4 MiB/s      1.10 c/B
        OCB enc |      2.73 ns/B     349.7 MiB/s     10.09 c/B
        OCB dec |      2.82 ns/B     338.3 MiB/s     10.43 c/B
       OCB auth |      2.77 ns/B     343.7 MiB/s     10.27 c/B

After (CBC-dec & CFB-dec & CTR & OCB, ~1.5x faster):
 TWOFISH        |  nanosecs/byte   mebibytes/sec   cycles/byte
        ECB enc |      4.25 ns/B     224.2 MiB/s     15.74 c/B
        ECB dec |      4.15 ns/B     229.5 MiB/s     15.37 c/B
        CBC enc |      4.61 ns/B     206.8 MiB/s     17.06 c/B
        CBC dec |      1.75 ns/B     544.0 MiB/s      6.49 c/B
        CFB enc |      4.52 ns/B     211.0 MiB/s     16.72 c/B
        CFB dec |      1.72 ns/B     554.1 MiB/s      6.37 c/B
        OFB enc |      4.27 ns/B     223.3 MiB/s     15.80 c/B
        OFB dec |      4.28 ns/B     222.7 MiB/s     15.84 c/B
        CTR enc |      1.73 ns/B     549.9 MiB/s      6.42 c/B
        CTR dec |      1.75 ns/B     545.1 MiB/s      6.47 c/B
        CCM enc |      6.31 ns/B     151.2 MiB/s     23.34 c/B
        CCM dec |      6.42 ns/B     148.5 MiB/s     23.76 c/B
       CCM auth |      4.56 ns/B     208.9 MiB/s     16.89 c/B
        GCM enc |      1.90 ns/B     502.8 MiB/s      7.02 c/B
        GCM dec |      2.00 ns/B     477.8 MiB/s      7.38 c/B
       GCM auth |     0.300 ns/B    3178.6 MiB/s      1.11 c/B
        OCB enc |      1.76 ns/B     542.2 MiB/s      6.51 c/B
        OCB dec |      1.76 ns/B     540.7 MiB/s      6.53 c/B
       OCB auth |      1.76 ns/B     542.8 MiB/s      6.50 c/B
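
The "~1.5x" figure follows from the cycles/byte columns: CBC decryption
drops from 10.04 c/B to 6.49 c/B (10.04 / 6.49 = ~1.55x), CFB decryption
from 9.98 to 6.37 c/B and CTR from ~10.06 to ~6.4 c/B.  GCM and CCM
encryption improve as well because their bulk keystream generation goes
through the CTR path, while modes without a 16-block path (ECB, OFB,
CBC/CFB encryption, CCM/GCM authentication) stay at their previous
speeds.  The tables are in the output format of libgcrypt's
tests/bench-slope tool.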

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
---
 0 files changed

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 71a25ed..8c9fc0e 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -100,6 +100,7 @@ stribog.c \
 tiger.c \
 whirlpool.c whirlpool-sse2-amd64.S \
 twofish.c twofish-amd64.S twofish-arm.S twofish-aarch64.S \
+  twofish-avx2-amd64.S \
 rfc2268.c \
 camellia.c camellia.h camellia-glue.c camellia-aesni-avx-amd64.S \
   camellia-aesni-avx2-amd64.S camellia-arm.S camellia-aarch64.S
diff --git a/cipher/twofish-avx2-amd64.S b/cipher/twofish-avx2-amd64.S
new file mode 100644
index 0000000..db6e218
--- /dev/null
+++ b/cipher/twofish-avx2-amd64.S
@@ -0,0 +1,1012 @@
+/* twofish-avx2-amd64.S  -  AMD64/AVX2 assembly implementation of Twofish cipher
+ *
+ * Copyright (C) 2013-2017 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifdef __x86_64
+#include <config.h>
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && defined(USE_TWOFISH) && \
+    defined(ENABLE_AVX2_SUPPORT)
+
+#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
+# define ELF(...) __VA_ARGS__
+#else
+# define ELF(...) /*_*/
+#endif
+
+#ifdef __PIC__
+#  define RIP (%rip)
+#else
+#  define RIP
+#endif
+
+.text
+
+/* structure of TWOFISH_context: */
+#define s0	0
+#define s1	((s0) + 4 * 256)
+#define s2	((s1) + 4 * 256)
+#define s3	((s2) + 4 * 256)
+#define w	((s3) + 4 * 256)
+#define k	((w) + 4 * 8)
+
+/* register macros */
+#define CTX	%rdi
+
+#define RROUND  %rbp
+#define RROUNDd %ebp
+#define RS0	CTX
+#define RS1	%r8
+#define RS2	%r9
+#define RS3	%r10
+#define RK	%r11
+#define RW	%rax
+
+#define RA0	%ymm8
+#define RB0	%ymm9
+#define RC0	%ymm10
+#define RD0	%ymm11
+#define RA1	%ymm12
+#define RB1	%ymm13
+#define RC1	%ymm14
+#define RD1	%ymm15
+
+/* temp regs */
+#define RX0	%ymm0
+#define RY0	%ymm1
+#define RX1	%ymm2
+#define RY1	%ymm3
+#define RT0	%ymm4
+#define RIDX	%ymm5
+
+#define RX0x	%xmm0
+#define RY0x	%xmm1
+#define RX1x	%xmm2
+#define RY1x	%xmm3
+#define RT0x	%xmm4
+#define RIDXx	%xmm5
+
+#define RTMP0   RX0
+#define RTMP0x  RX0x
+#define RTMP1   RX1
+#define RTMP1x  RX1x
+#define RTMP2   RY0
+#define RTMP2x  RY0x
+#define RTMP3   RY1
+#define RTMP3x  RY1x
+#define RTMP4   RIDX
+#define RTMP4x  RIDXx
+
+/* vpgatherdd mask and '-1' */
+#define RNOT	%ymm6
+#define RNOTx	%xmm6
+
+/* byte mask, (-1 >> 24) */
+#define RBYTE	%ymm7
+
+/**********************************************************************
+  16-way AVX2 twofish
+ **********************************************************************/
+#define init_round_constants() \
+	vpcmpeqd RNOT, RNOT, RNOT; \
+	leaq k(CTX), RK; \
+	leaq w(CTX), RW; \
+	vpsrld $24, RNOT, RBYTE; \
+	leaq s1(CTX), RS1; \
+	leaq s2(CTX), RS2; \
+	leaq s3(CTX), RS3; \
+
+#define g16(ab, rs0, rs1, rs2, rs3, xy) \
+	vpand RBYTE, ab ## 0, RIDX; \
+	vpgatherdd RNOT, (rs0, RIDX, 4), xy ## 0; \
+	vpcmpeqd RNOT, RNOT, RNOT; \
+		\
+		vpand RBYTE, ab ## 1, RIDX; \
+		vpgatherdd RNOT, (rs0, RIDX, 4), xy ## 1; \
+		vpcmpeqd RNOT, RNOT, RNOT; \
+	\
+	vpsrld $8, ab ## 0, RIDX; \
+	vpand RBYTE, RIDX, RIDX; \
+	vpgatherdd RNOT, (rs1, RIDX, 4), RT0; \
+	vpcmpeqd RNOT, RNOT, RNOT; \
+	vpxor RT0, xy ## 0, xy ## 0; \
+		\
+		vpsrld $8, ab ## 1, RIDX; \
+		vpand RBYTE, RIDX, RIDX; \
+		vpgatherdd RNOT, (rs1, RIDX, 4), RT0; \
+		vpcmpeqd RNOT, RNOT, RNOT; \
+		vpxor RT0, xy ## 1, xy ## 1; \
+	\
+	vpsrld $16, ab ## 0, RIDX; \
+	vpand RBYTE, RIDX, RIDX; \
+	vpgatherdd RNOT, (rs2, RIDX, 4), RT0; \
+	vpcmpeqd RNOT, RNOT, RNOT; \
+	vpxor RT0, xy ## 0, xy ## 0; \
+		\
+		vpsrld $16, ab ## 1, RIDX; \
+		vpand RBYTE, RIDX, RIDX; \
+		vpgatherdd RNOT, (rs2, RIDX, 4), RT0; \
+		vpcmpeqd RNOT, RNOT, RNOT; \
+		vpxor RT0, xy ## 1, xy ## 1; \
+	\
+	vpsrld $24, ab ## 0, RIDX; \
+	vpgatherdd RNOT, (rs3, RIDX, 4), RT0; \
+	vpcmpeqd RNOT, RNOT, RNOT; \
+	vpxor RT0, xy ## 0, xy ## 0; \
+		\
+		vpsrld $24, ab ## 1, RIDX; \
+		vpgatherdd RNOT, (rs3, RIDX, 4), RT0; \
+		vpcmpeqd RNOT, RNOT, RNOT; \
+		vpxor RT0, xy ## 1, xy ## 1;
+
+#define g1_16(a, x) \
+	g16(a, RS0, RS1, RS2, RS3, x);
+
+#define g2_16(b, y) \
+	g16(b, RS1, RS2, RS3, RS0, y);
+
+#define encrypt_round_end16(a, b, c, d, nk, r) \
+	vpaddd RY0, RX0, RX0; \
+	vpaddd RX0, RY0, RY0; \
+	vpbroadcastd ((nk)+((r)*8))(RK), RT0; \
+	vpaddd RT0, RX0, RX0; \
+	vpbroadcastd 4+((nk)+((r)*8))(RK), RT0; \
+	vpaddd RT0, RY0, RY0; \
+	\
+	vpxor RY0, d ## 0, d ## 0; \
+	\
+	vpxor RX0, c ## 0, c ## 0; \
+	vpsrld $1, c ## 0, RT0; \
+	vpslld $31, c ## 0, c ## 0; \
+	vpor RT0, c ## 0, c ## 0; \
+	\
+		vpaddd RY1, RX1, RX1; \
+		vpaddd RX1, RY1, RY1; \
+		vpbroadcastd ((nk)+((r)*8))(RK), RT0; \
+		vpaddd RT0, RX1, RX1; \
+		vpbroadcastd 4+((nk)+((r)*8))(RK), RT0; \
+		vpaddd RT0, RY1, RY1; \
+		\
+		vpxor RY1, d ## 1, d ## 1; \
+		\
+		vpxor RX1, c ## 1, c ## 1; \
+		vpsrld $1, c ## 1, RT0; \
+		vpslld $31, c ## 1, c ## 1; \
+		vpor RT0, c ## 1, c ## 1; \
+
+#define encrypt_round16(a, b, c, d, nk, r) \
+	g2_16(b, RY); \
+	\
+	vpslld $1, b ## 0, RT0; \
+	vpsrld $31, b ## 0, b ## 0; \
+	vpor RT0, b ## 0, b ## 0; \
+	\
+		vpslld $1, b ## 1, RT0; \
+		vpsrld $31, b ## 1, b ## 1; \
+		vpor RT0, b ## 1, b ## 1; \
+	\
+	g1_16(a, RX); \
+	\
+	encrypt_round_end16(a, b, c, d, nk, r);
+
+#define encrypt_round_first16(a, b, c, d, nk, r) \
+	vpslld $1, d ## 0, RT0; \
+	vpsrld $31, d ## 0, d ## 0; \
+	vpor RT0, d ## 0, d ## 0; \
+	\
+		vpslld $1, d ## 1, RT0; \
+		vpsrld $31, d ## 1, d ## 1; \
+		vpor RT0, d ## 1, d ## 1; \
+	\
+	encrypt_round16(a, b, c, d, nk, r);
+
+#define encrypt_round_last16(a, b, c, d, nk, r) \
+	g2_16(b, RY); \
+	\
+	g1_16(a, RX); \
+	\
+	encrypt_round_end16(a, b, c, d, nk, r);
+
+#define decrypt_round_end16(a, b, c, d, nk, r) \
+	vpaddd RY0, RX0, RX0; \
+	vpaddd RX0, RY0, RY0; \
+	vpbroadcastd ((nk)+((r)*8))(RK), RT0; \
+	vpaddd RT0, RX0, RX0; \
+	vpbroadcastd 4+((nk)+((r)*8))(RK), RT0; \
+	vpaddd RT0, RY0, RY0; \
+	\
+	vpxor RX0, c ## 0, c ## 0; \
+	\
+	vpxor RY0, d ## 0, d ## 0; \
+	vpsrld $1, d ## 0, RT0; \
+	vpslld $31, d ## 0, d ## 0; \
+	vpor RT0, d ## 0, d ## 0; \
+	\
+		vpaddd RY1, RX1, RX1; \
+		vpaddd RX1, RY1, RY1; \
+		vpbroadcastd ((nk)+((r)*8))(RK), RT0; \
+		vpaddd RT0, RX1, RX1; \
+		vpbroadcastd 4+((nk)+((r)*8))(RK), RT0; \
+		vpaddd RT0, RY1, RY1; \
+		\
+		vpxor RX1, c ## 1, c ## 1; \
+		\
+		vpxor RY1, d ## 1, d ## 1; \
+		vpsrld $1, d ## 1, RT0; \
+		vpslld $31, d ## 1, d ## 1; \
+		vpor RT0, d ## 1, d ## 1;
+
+#define decrypt_round16(a, b, c, d, nk, r) \
+	g1_16(a, RX); \
+	\
+	vpslld $1, a ## 0, RT0; \
+	vpsrld $31, a ## 0, a ## 0; \
+	vpor RT0, a ## 0, a ## 0; \
+	\
+		vpslld $1, a ## 1, RT0; \
+		vpsrld $31, a ## 1, a ## 1; \
+		vpor RT0, a ## 1, a ## 1; \
+	\
+	g2_16(b, RY); \
+	\
+	decrypt_round_end16(a, b, c, d, nk, r);
+
+#define decrypt_round_first16(a, b, c, d, nk, r) \
+	vpslld $1, c ## 0, RT0; \
+	vpsrld $31, c ## 0, c ## 0; \
+	vpor RT0, c ## 0, c ## 0; \
+	\
+		vpslld $1, c ## 1, RT0; \
+		vpsrld $31, c ## 1, c ## 1; \
+		vpor RT0, c ## 1, c ## 1; \
+	\
+	decrypt_round16(a, b, c, d, nk, r)
+
+#define decrypt_round_last16(a, b, c, d, nk, r) \
+	g1_16(a, RX); \
+	\
+	g2_16(b, RY); \
+	\
+	decrypt_round_end16(a, b, c, d, nk, r);
+
+#define encrypt_cycle16(r) \
+	encrypt_round16(RA, RB, RC, RD, 0, r); \
+	encrypt_round16(RC, RD, RA, RB, 8, r);
+
+#define encrypt_cycle_first16(r) \
+	encrypt_round_first16(RA, RB, RC, RD, 0, r); \
+	encrypt_round16(RC, RD, RA, RB, 8, r);
+
+#define encrypt_cycle_last16(r) \
+	encrypt_round16(RA, RB, RC, RD, 0, r); \
+	encrypt_round_last16(RC, RD, RA, RB, 8, r);
+
+#define decrypt_cycle16(r) \
+	decrypt_round16(RC, RD, RA, RB, 8, r); \
+	decrypt_round16(RA, RB, RC, RD, 0, r);
+
+#define decrypt_cycle_first16(r) \
+	decrypt_round_first16(RC, RD, RA, RB, 8, r); \
+	decrypt_round16(RA, RB, RC, RD, 0, r);
+
+#define decrypt_cycle_last16(r) \
+	decrypt_round16(RC, RD, RA, RB, 8, r); \
+	decrypt_round_last16(RA, RB, RC, RD, 0, r);
+
+#define transpose_4x4(x0,x1,x2,x3,t1,t2) \
+	vpunpckhdq x1, x0, t2; \
+	vpunpckldq x1, x0, x0; \
+	\
+	vpunpckldq x3, x2, t1; \
+	vpunpckhdq x3, x2, x2; \
+	\
+	vpunpckhqdq t1,	x0, x1; \
+	vpunpcklqdq t1,	x0, x0; \
+	\
+	vpunpckhqdq x2, t2, x3; \
+	vpunpcklqdq x2,	t2, x2;
+
+#define read_blocks8(offs,a,b,c,d) \
+	vmovdqu 16*offs(RIO), a; \
+	vmovdqu 16*offs+32(RIO), b; \
+	vmovdqu 16*offs+64(RIO), c; \
+	vmovdqu 16*offs+96(RIO), d; \
+	\
+	transpose_4x4(a, b, c, d, RX0, RY0);
+
+#define write_blocks8(offs,a,b,c,d) \
+	transpose_4x4(a, b, c, d, RX0, RY0); \
+	\
+	vmovdqu a, 16*offs(RIO); \
+	vmovdqu b, 16*offs+32(RIO); \
+	vmovdqu c, 16*offs+64(RIO); \
+	vmovdqu d, 16*offs+96(RIO);
+
+#define inpack_enc8(a,b,c,d) \
+	vpbroadcastd 4*0(RW), RT0; \
+	vpxor RT0, a, a; \
+	\
+	vpbroadcastd 4*1(RW), RT0; \
+	vpxor RT0, b, b; \
+	\
+	vpbroadcastd 4*2(RW), RT0; \
+	vpxor RT0, c, c; \
+	\
+	vpbroadcastd 4*3(RW), RT0; \
+	vpxor RT0, d, d;
+
+#define outunpack_enc8(a,b,c,d) \
+	vpbroadcastd 4*4(RW), RX0; \
+	vpbroadcastd 4*5(RW), RY0; \
+	vpxor RX0, c, RX0; \
+	vpxor RY0, d, RY0; \
+	\
+	vpbroadcastd 4*6(RW), RT0; \
+	vpxor RT0, a, c; \
+	vpbroadcastd 4*7(RW), RT0; \
+	vpxor RT0, b, d; \
+	\
+	vmovdqa RX0, a; \
+	vmovdqa RY0, b;
+
+#define inpack_dec8(a,b,c,d) \
+	vpbroadcastd 4*4(RW), RX0; \
+	vpbroadcastd 4*5(RW), RY0; \
+	vpxor RX0, a, RX0; \
+	vpxor RY0, b, RY0; \
+	\
+	vpbroadcastd 4*6(RW), RT0; \
+	vpxor RT0, c, a; \
+	vpbroadcastd 4*7(RW), RT0; \
+	vpxor RT0, d, b; \
+	\
+	vmovdqa RX0, c; \
+	vmovdqa RY0, d;
+
+#define outunpack_dec8(a,b,c,d) \
+	vpbroadcastd 4*0(RW), RT0; \
+	vpxor RT0, a, a; \
+	\
+	vpbroadcastd 4*1(RW), RT0; \
+	vpxor RT0, b, b; \
+	\
+	vpbroadcastd 4*2(RW), RT0; \
+	vpxor RT0, c, c; \
+	\
+	vpbroadcastd 4*3(RW), RT0; \
+	vpxor RT0, d, d;
+
+#define transpose4x4_16(a,b,c,d) \
+	transpose_4x4(a ## 0, b ## 0, c ## 0, d ## 0, RX0, RY0); \
+	transpose_4x4(a ## 1, b ## 1, c ## 1, d ## 1, RX0, RY0);
+
+#define inpack_enc16(a,b,c,d) \
+	inpack_enc8(a ## 0, b ## 0, c ## 0, d ## 0); \
+	inpack_enc8(a ## 1, b ## 1, c ## 1, d ## 1);
+
+#define outunpack_enc16(a,b,c,d) \
+	outunpack_enc8(a ## 0, b ## 0, c ## 0, d ## 0); \
+	outunpack_enc8(a ## 1, b ## 1, c ## 1, d ## 1);
+
+#define inpack_dec16(a,b,c,d) \
+	inpack_dec8(a ## 0, b ## 0, c ## 0, d ## 0); \
+	inpack_dec8(a ## 1, b ## 1, c ## 1, d ## 1);
+
+#define outunpack_dec16(a,b,c,d) \
+	outunpack_dec8(a ## 0, b ## 0, c ## 0, d ## 0); \
+	outunpack_dec8(a ## 1, b ## 1, c ## 1, d ## 1);
+
+.align 8
+ELF(.type __twofish_enc_blk16,@function;)
+__twofish_enc_blk16:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: sixteen parallel
+	 *						plaintext blocks
+	 * output:
+	 *	RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: sixteen parallel
+	 *						ciphertext blocks
+	 */
+	init_round_constants();
+
+	transpose4x4_16(RA, RB, RC, RD);
+	inpack_enc16(RA, RB, RC, RD);
+
+	encrypt_cycle_first16(0);
+	encrypt_cycle16(2);
+	encrypt_cycle16(4);
+	encrypt_cycle16(6);
+	encrypt_cycle16(8);
+	encrypt_cycle16(10);
+	encrypt_cycle16(12);
+	encrypt_cycle_last16(14);
+
+	outunpack_enc16(RA, RB, RC, RD);
+	transpose4x4_16(RA, RB, RC, RD);
+
+	ret;
+ELF(.size __twofish_enc_blk16,.-__twofish_enc_blk16;)
+
+.align 8
+ELF(.type __twofish_dec_blk16,@function;)
+__twofish_dec_blk16:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: sixteen parallel
+	 *						plaintext blocks
+	 * output:
+	 *	RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: sixteen parallel
+	 *						ciphertext blocks
+	 */
+	init_round_constants();
+
+	transpose4x4_16(RA, RB, RC, RD);
+	inpack_dec16(RA, RB, RC, RD);
+
+	decrypt_cycle_first16(14);
+	decrypt_cycle16(12);
+	decrypt_cycle16(10);
+	decrypt_cycle16(8);
+	decrypt_cycle16(6);
+	decrypt_cycle16(4);
+	decrypt_cycle16(2);
+	decrypt_cycle_last16(0);
+
+	outunpack_dec16(RA, RB, RC, RD);
+	transpose4x4_16(RA, RB, RC, RD);
+
+	ret;
+ELF(.size __twofish_dec_blk16,.-__twofish_dec_blk16;)
+
+#define inc_le128(x, minus_one, tmp) \
+	vpcmpeqq minus_one, x, tmp; \
+	vpsubq minus_one, x, x; \
+	vpslldq $8, tmp, tmp; \
+	vpsubq tmp, x, x;
+
+.align 8
+.globl _gcry_twofish_avx2_ctr_enc
+ELF(.type   _gcry_twofish_avx2_ctr_enc,@function;)
+_gcry_twofish_avx2_ctr_enc:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (16 blocks)
+	 *	%rdx: src (16 blocks)
+	 *	%rcx: iv (big endian, 128bit)
+	 */
+
+	movq 8(%rcx), %rax;
+	bswapq %rax;
+
+	vzeroupper;
+
+	vbroadcasti128 .Lbswap128_mask RIP, RTMP3;
+	vpcmpeqd RNOT, RNOT, RNOT;
+	vpsrldq $8, RNOT, RNOT;   /* ab: -1:0 ; cd: -1:0 */
+	vpaddq RNOT, RNOT, RTMP2; /* ab: -2:0 ; cd: -2:0 */
+
+	/* load IV and byteswap */
+	vmovdqu (%rcx), RTMP4x;
+	vpshufb RTMP3x, RTMP4x, RTMP4x;
+	vmovdqa RTMP4x, RTMP0x;
+	inc_le128(RTMP4x, RNOTx, RTMP1x);
+	vinserti128 $1, RTMP4x, RTMP0, RTMP0;
+	vpshufb RTMP3, RTMP0, RA0; /* +1 ; +0 */
+
+	/* check need for handling 64-bit overflow and carry */
+	cmpq $(0xffffffffffffffff - 16), %rax;
+	ja .Lhandle_ctr_carry;
+
+	/* construct IVs */
+	vpsubq RTMP2, RTMP0, RTMP0; /* +3 ; +2 */
+	vpshufb RTMP3, RTMP0, RB0;
+	vpsubq RTMP2, RTMP0, RTMP0; /* +5 ; +4 */
+	vpshufb RTMP3, RTMP0, RC0;
+	vpsubq RTMP2, RTMP0, RTMP0; /* +7 ; +6 */
+	vpshufb RTMP3, RTMP0, RD0;
+	vpsubq RTMP2, RTMP0, RTMP0; /* +9 ; +8 */
+	vpshufb RTMP3, RTMP0, RA1;
+	vpsubq RTMP2, RTMP0, RTMP0; /* +11 ; +10 */
+	vpshufb RTMP3, RTMP0, RB1;
+	vpsubq RTMP2, RTMP0, RTMP0; /* +13 ; +12 */
+	vpshufb RTMP3, RTMP0, RC1;
+	vpsubq RTMP2, RTMP0, RTMP0; /* +15 ; +14 */
+	vpshufb RTMP3, RTMP0, RD1;
+	vpsubq RTMP2, RTMP0, RTMP0; /* +16 */
+	vpshufb RTMP3x, RTMP0x, RTMP0x;
+
+	jmp .Lctr_carry_done;
+
+.Lhandle_ctr_carry:
+	/* construct IVs */
+	inc_le128(RTMP0, RNOT, RTMP1);
+	inc_le128(RTMP0, RNOT, RTMP1);
+	vpshufb RTMP3, RTMP0, RB0; /* +3 ; +2 */
+	inc_le128(RTMP0, RNOT, RTMP1);
+	inc_le128(RTMP0, RNOT, RTMP1);
+	vpshufb RTMP3, RTMP0, RC0; /* +5 ; +4 */
+	inc_le128(RTMP0, RNOT, RTMP1);
+	inc_le128(RTMP0, RNOT, RTMP1);
+	vpshufb RTMP3, RTMP0, RD0; /* +7 ; +6 */
+	inc_le128(RTMP0, RNOT, RTMP1);
+	inc_le128(RTMP0, RNOT, RTMP1);
+	vpshufb RTMP3, RTMP0, RA1; /* +9 ; +8 */
+	inc_le128(RTMP0, RNOT, RTMP1);
+	inc_le128(RTMP0, RNOT, RTMP1);
+	vpshufb RTMP3, RTMP0, RB1; /* +11 ; +10 */
+	inc_le128(RTMP0, RNOT, RTMP1);
+	inc_le128(RTMP0, RNOT, RTMP1);
+	vpshufb RTMP3, RTMP0, RC1; /* +13 ; +12 */
+	inc_le128(RTMP0, RNOT, RTMP1);
+	inc_le128(RTMP0, RNOT, RTMP1);
+	vpshufb RTMP3, RTMP0, RD1; /* +15 ; +14 */
+	inc_le128(RTMP0, RNOT, RTMP1);
+	vextracti128 $1, RTMP0, RTMP0x;
+	vpshufb RTMP3x, RTMP0x, RTMP0x; /* +16 */
+
+.align 4
+.Lctr_carry_done:
+	/* store new IV */
+	vmovdqu RTMP0x, (%rcx);
+
+	call __twofish_enc_blk16;
+
+	vpxor (0 * 32)(%rdx), RA0, RA0;
+	vpxor (1 * 32)(%rdx), RB0, RB0;
+	vpxor (2 * 32)(%rdx), RC0, RC0;
+	vpxor (3 * 32)(%rdx), RD0, RD0;
+	vpxor (4 * 32)(%rdx), RA1, RA1;
+	vpxor (5 * 32)(%rdx), RB1, RB1;
+	vpxor (6 * 32)(%rdx), RC1, RC1;
+	vpxor (7 * 32)(%rdx), RD1, RD1;
+
+	vmovdqu RA0, (0 * 32)(%rsi);
+	vmovdqu RB0, (1 * 32)(%rsi);
+	vmovdqu RC0, (2 * 32)(%rsi);
+	vmovdqu RD0, (3 * 32)(%rsi);
+	vmovdqu RA1, (4 * 32)(%rsi);
+	vmovdqu RB1, (5 * 32)(%rsi);
+	vmovdqu RC1, (6 * 32)(%rsi);
+	vmovdqu RD1, (7 * 32)(%rsi);
+
+	vzeroall;
+
+	ret
+ELF(.size _gcry_twofish_avx2_ctr_enc,.-_gcry_twofish_avx2_ctr_enc;)
+
+.align 8
+.globl _gcry_twofish_avx2_cbc_dec
+ELF(.type   _gcry_twofish_avx2_cbc_dec,@function;)
+_gcry_twofish_avx2_cbc_dec:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (16 blocks)
+	 *	%rdx: src (16 blocks)
+	 *	%rcx: iv
+	 */
+
+	vzeroupper;
+
+	vmovdqu (0 * 32)(%rdx), RA0;
+	vmovdqu (1 * 32)(%rdx), RB0;
+	vmovdqu (2 * 32)(%rdx), RC0;
+	vmovdqu (3 * 32)(%rdx), RD0;
+	vmovdqu (4 * 32)(%rdx), RA1;
+	vmovdqu (5 * 32)(%rdx), RB1;
+	vmovdqu (6 * 32)(%rdx), RC1;
+	vmovdqu (7 * 32)(%rdx), RD1;
+
+	call __twofish_dec_blk16;
+
+	vmovdqu (%rcx), RNOTx;
+	vinserti128 $1, (%rdx), RNOT, RNOT;
+	vpxor RNOT, RA0, RA0;
+	vpxor (0 * 32 + 16)(%rdx), RB0, RB0;
+	vpxor (1 * 32 + 16)(%rdx), RC0, RC0;
+	vpxor (2 * 32 + 16)(%rdx), RD0, RD0;
+	vpxor (3 * 32 + 16)(%rdx), RA1, RA1;
+	vpxor (4 * 32 + 16)(%rdx), RB1, RB1;
+	vpxor (5 * 32 + 16)(%rdx), RC1, RC1;
+	vpxor (6 * 32 + 16)(%rdx), RD1, RD1;
+	vmovdqu (7 * 32 + 16)(%rdx), RNOTx;
+	vmovdqu RNOTx, (%rcx); /* store new IV */
+
+	vmovdqu RA0, (0 * 32)(%rsi);
+	vmovdqu RB0, (1 * 32)(%rsi);
+	vmovdqu RC0, (2 * 32)(%rsi);
+	vmovdqu RD0, (3 * 32)(%rsi);
+	vmovdqu RA1, (4 * 32)(%rsi);
+	vmovdqu RB1, (5 * 32)(%rsi);
+	vmovdqu RC1, (6 * 32)(%rsi);
+	vmovdqu RD1, (7 * 32)(%rsi);
+
+	vzeroall;
+
+	ret
+ELF(.size _gcry_twofish_avx2_cbc_dec,.-_gcry_twofish_avx2_cbc_dec;)
+
+.align 8
+.globl _gcry_twofish_avx2_cfb_dec
+ELF(.type   _gcry_twofish_avx2_cfb_dec,@function;)
+_gcry_twofish_avx2_cfb_dec:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (16 blocks)
+	 *	%rdx: src (16 blocks)
+	 *	%rcx: iv
+	 */
+
+	vzeroupper;
+
+	/* Load input */
+	vmovdqu (%rcx), RNOTx;
+	vinserti128 $1, (%rdx), RNOT, RA0;
+	vmovdqu (0 * 32 + 16)(%rdx), RB0;
+	vmovdqu (1 * 32 + 16)(%rdx), RC0;
+	vmovdqu (2 * 32 + 16)(%rdx), RD0;
+	vmovdqu (3 * 32 + 16)(%rdx), RA1;
+	vmovdqu (4 * 32 + 16)(%rdx), RB1;
+	vmovdqu (5 * 32 + 16)(%rdx), RC1;
+	vmovdqu (6 * 32 + 16)(%rdx), RD1;
+
+	/* Update IV */
+	vmovdqu (7 * 32 + 16)(%rdx), RNOTx;
+	vmovdqu RNOTx, (%rcx);
+
+	call __twofish_enc_blk16;
+
+	vpxor (0 * 32)(%rdx), RA0, RA0;
+	vpxor (1 * 32)(%rdx), RB0, RB0;
+	vpxor (2 * 32)(%rdx), RC0, RC0;
+	vpxor (3 * 32)(%rdx), RD0, RD0;
+	vpxor (4 * 32)(%rdx), RA1, RA1;
+	vpxor (5 * 32)(%rdx), RB1, RB1;
+	vpxor (6 * 32)(%rdx), RC1, RC1;
+	vpxor (7 * 32)(%rdx), RD1, RD1;
+
+	vmovdqu RA0, (0 * 32)(%rsi);
+	vmovdqu RB0, (1 * 32)(%rsi);
+	vmovdqu RC0, (2 * 32)(%rsi);
+	vmovdqu RD0, (3 * 32)(%rsi);
+	vmovdqu RA1, (4 * 32)(%rsi);
+	vmovdqu RB1, (5 * 32)(%rsi);
+	vmovdqu RC1, (6 * 32)(%rsi);
+	vmovdqu RD1, (7 * 32)(%rsi);
+
+	vzeroall;
+
+	ret
+ELF(.size _gcry_twofish_avx2_cfb_dec,.-_gcry_twofish_avx2_cfb_dec;)
+
+.align 8
+.globl _gcry_twofish_avx2_ocb_enc
+ELF(.type _gcry_twofish_avx2_ocb_enc,@function;)
+
+_gcry_twofish_avx2_ocb_enc:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (16 blocks)
+	 *	%rdx: src (16 blocks)
+	 *	%rcx: offset
+	 *	%r8 : checksum
+	 *	%r9 : L pointers (void *L[16])
+	 */
+
+	vzeroupper;
+
+	subq $(4 * 8), %rsp;
+
+	movq %r10, (0 * 8)(%rsp);
+	movq %r11, (1 * 8)(%rsp);
+	movq %r12, (2 * 8)(%rsp);
+	movq %r13, (3 * 8)(%rsp);
+
+	vmovdqu (%rcx), RTMP0x;
+	vmovdqu (%r8), RTMP1x;
+
+	/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	/* Checksum_i = Checksum_{i-1} xor P_i  */
+	/* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
+
+#define OCB_INPUT(n, l0reg, l1reg, yreg) \
+	  vmovdqu (n * 32)(%rdx), yreg; \
+	  vpxor (l0reg), RTMP0x, RNOTx; \
+	  vpxor (l1reg), RNOTx, RTMP0x; \
+	  vinserti128 $1, RTMP0x, RNOT, RNOT; \
+	  vpxor yreg, RTMP1, RTMP1; \
+	  vpxor yreg, RNOT, yreg; \
+	  vmovdqu RNOT, (n * 32)(%rsi);
+
+	movq (0 * 8)(%r9), %r10;
+	movq (1 * 8)(%r9), %r11;
+	movq (2 * 8)(%r9), %r12;
+	movq (3 * 8)(%r9), %r13;
+	OCB_INPUT(0, %r10, %r11, RA0);
+	OCB_INPUT(1, %r12, %r13, RB0);
+	movq (4 * 8)(%r9), %r10;
+	movq (5 * 8)(%r9), %r11;
+	movq (6 * 8)(%r9), %r12;
+	movq (7 * 8)(%r9), %r13;
+	OCB_INPUT(2, %r10, %r11, RC0);
+	OCB_INPUT(3, %r12, %r13, RD0);
+	movq (8 * 8)(%r9), %r10;
+	movq (9 * 8)(%r9), %r11;
+	movq (10 * 8)(%r9), %r12;
+	movq (11 * 8)(%r9), %r13;
+	OCB_INPUT(4, %r10, %r11, RA1);
+	OCB_INPUT(5, %r12, %r13, RB1);
+	movq (12 * 8)(%r9), %r10;
+	movq (13 * 8)(%r9), %r11;
+	movq (14 * 8)(%r9), %r12;
+	movq (15 * 8)(%r9), %r13;
+	OCB_INPUT(6, %r10, %r11, RC1);
+	OCB_INPUT(7, %r12, %r13, RD1);
+#undef OCB_INPUT
+
+	vextracti128 $1, RTMP1, RNOTx;
+	vmovdqu RTMP0x, (%rcx);
+	vpxor RNOTx, RTMP1x, RTMP1x;
+	vmovdqu RTMP1x, (%r8);
+
+	movq (0 * 8)(%rsp), %r10;
+	movq (1 * 8)(%rsp), %r11;
+	movq (2 * 8)(%rsp), %r12;
+	movq (3 * 8)(%rsp), %r13;
+
+	call __twofish_enc_blk16;
+
+	addq $(4 * 8), %rsp;
+
+	vpxor (0 * 32)(%rsi), RA0, RA0;
+	vpxor (1 * 32)(%rsi), RB0, RB0;
+	vpxor (2 * 32)(%rsi), RC0, RC0;
+	vpxor (3 * 32)(%rsi), RD0, RD0;
+	vpxor (4 * 32)(%rsi), RA1, RA1;
+	vpxor (5 * 32)(%rsi), RB1, RB1;
+	vpxor (6 * 32)(%rsi), RC1, RC1;
+	vpxor (7 * 32)(%rsi), RD1, RD1;
+
+	vmovdqu RA0, (0 * 32)(%rsi);
+	vmovdqu RB0, (1 * 32)(%rsi);
+	vmovdqu RC0, (2 * 32)(%rsi);
+	vmovdqu RD0, (3 * 32)(%rsi);
+	vmovdqu RA1, (4 * 32)(%rsi);
+	vmovdqu RB1, (5 * 32)(%rsi);
+	vmovdqu RC1, (6 * 32)(%rsi);
+	vmovdqu RD1, (7 * 32)(%rsi);
+
+	vzeroall;
+
+	ret;
+ELF(.size _gcry_twofish_avx2_ocb_enc,.-_gcry_twofish_avx2_ocb_enc;)
+
+.align 8
+.globl _gcry_twofish_avx2_ocb_dec
+ELF(.type _gcry_twofish_avx2_ocb_dec,@function;)
+
+_gcry_twofish_avx2_ocb_dec:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (16 blocks)
+	 *	%rdx: src (16 blocks)
+	 *	%rcx: offset
+	 *	%r8 : checksum
+	 *	%r9 : L pointers (void *L[16])
+	 */
+
+	vzeroupper;
+
+	subq $(4 * 8), %rsp;
+
+	movq %r10, (0 * 8)(%rsp);
+	movq %r11, (1 * 8)(%rsp);
+	movq %r12, (2 * 8)(%rsp);
+	movq %r13, (3 * 8)(%rsp);
+
+	vmovdqu (%rcx), RTMP0x;
+
+	/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	/* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
+
+#define OCB_INPUT(n, l0reg, l1reg, yreg) \
+	  vmovdqu (n * 32)(%rdx), yreg; \
+	  vpxor (l0reg), RTMP0x, RNOTx; \
+	  vpxor (l1reg), RNOTx, RTMP0x; \
+	  vinserti128 $1, RTMP0x, RNOT, RNOT; \
+	  vpxor yreg, RNOT, yreg; \
+	  vmovdqu RNOT, (n * 32)(%rsi);
+
+	movq (0 * 8)(%r9), %r10;
+	movq (1 * 8)(%r9), %r11;
+	movq (2 * 8)(%r9), %r12;
+	movq (3 * 8)(%r9), %r13;
+	OCB_INPUT(0, %r10, %r11, RA0);
+	OCB_INPUT(1, %r12, %r13, RB0);
+	movq (4 * 8)(%r9), %r10;
+	movq (5 * 8)(%r9), %r11;
+	movq (6 * 8)(%r9), %r12;
+	movq (7 * 8)(%r9), %r13;
+	OCB_INPUT(2, %r10, %r11, RC0);
+	OCB_INPUT(3, %r12, %r13, RD0);
+	movq (8 * 8)(%r9), %r10;
+	movq (9 * 8)(%r9), %r11;
+	movq (10 * 8)(%r9), %r12;
+	movq (11 * 8)(%r9), %r13;
+	OCB_INPUT(4, %r10, %r11, RA1);
+	OCB_INPUT(5, %r12, %r13, RB1);
+	movq (12 * 8)(%r9), %r10;
+	movq (13 * 8)(%r9), %r11;
+	movq (14 * 8)(%r9), %r12;
+	movq (15 * 8)(%r9), %r13;
+	OCB_INPUT(6, %r10, %r11, RC1);
+	OCB_INPUT(7, %r12, %r13, RD1);
+#undef OCB_INPUT
+
+	vmovdqu RTMP0x, (%rcx);
+	mov %r8, %rcx
+
+	movq (0 * 8)(%rsp), %r10;
+	movq (1 * 8)(%rsp), %r11;
+	movq (2 * 8)(%rsp), %r12;
+	movq (3 * 8)(%rsp), %r13;
+
+	call __twofish_dec_blk16;
+
+	vmovdqu (%rcx), RTMP1x;
+
+	vpxor (0 * 32)(%rsi), RA0, RA0;
+	vpxor (1 * 32)(%rsi), RB0, RB0;
+	vpxor (2 * 32)(%rsi), RC0, RC0;
+	vpxor (3 * 32)(%rsi), RD0, RD0;
+	vpxor (4 * 32)(%rsi), RA1, RA1;
+	vpxor (5 * 32)(%rsi), RB1, RB1;
+	vpxor (6 * 32)(%rsi), RC1, RC1;
+	vpxor (7 * 32)(%rsi), RD1, RD1;
+
+	addq $(4 * 8), %rsp;
+
+	/* Checksum_i = Checksum_{i-1} xor P_i  */
+
+	vmovdqu RA0, (0 * 32)(%rsi);
+	vpxor RA0, RTMP1, RTMP1;
+	vmovdqu RB0, (1 * 32)(%rsi);
+	vpxor RB0, RTMP1, RTMP1;
+	vmovdqu RC0, (2 * 32)(%rsi);
+	vpxor RC0, RTMP1, RTMP1;
+	vmovdqu RD0, (3 * 32)(%rsi);
+	vpxor RD0, RTMP1, RTMP1;
+	vmovdqu RA1, (4 * 32)(%rsi);
+	vpxor RA1, RTMP1, RTMP1;
+	vmovdqu RB1, (5 * 32)(%rsi);
+	vpxor RB1, RTMP1, RTMP1;
+	vmovdqu RC1, (6 * 32)(%rsi);
+	vpxor RC1, RTMP1, RTMP1;
+	vmovdqu RD1, (7 * 32)(%rsi);
+	vpxor RD1, RTMP1, RTMP1;
+
+	vextracti128 $1, RTMP1, RNOTx;
+	vpxor RNOTx, RTMP1x, RTMP1x;
+	vmovdqu RTMP1x, (%rcx);
+
+	vzeroall;
+
+	ret;
+ELF(.size _gcry_twofish_avx2_ocb_dec,.-_gcry_twofish_avx2_ocb_dec;)
+
+.align 8
+.globl _gcry_twofish_avx2_ocb_auth
+ELF(.type _gcry_twofish_avx2_ocb_auth,@function;)
+
+_gcry_twofish_avx2_ocb_auth:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: abuf (16 blocks)
+	 *	%rdx: offset
+	 *	%rcx: checksum
+	 *	%r8 : L pointers (void *L[16])
+	 */
+
+	vzeroupper;
+
+	subq $(4 * 8), %rsp;
+
+	movq %r10, (0 * 8)(%rsp);
+	movq %r11, (1 * 8)(%rsp);
+	movq %r12, (2 * 8)(%rsp);
+	movq %r13, (3 * 8)(%rsp);
+
+	vmovdqu (%rdx), RTMP0x;
+
+	/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	/* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i)  */
+
+#define OCB_INPUT(n, l0reg, l1reg, yreg) \
+	  vmovdqu (n * 32)(%rsi), yreg; \
+	  vpxor (l0reg), RTMP0x, RNOTx; \
+	  vpxor (l1reg), RNOTx, RTMP0x; \
+	  vinserti128 $1, RTMP0x, RNOT, RNOT; \
+	  vpxor yreg, RNOT, yreg;
+
+	movq (0 * 8)(%r8), %r10;
+	movq (1 * 8)(%r8), %r11;
+	movq (2 * 8)(%r8), %r12;
+	movq (3 * 8)(%r8), %r13;
+	OCB_INPUT(0, %r10, %r11, RA0);
+	OCB_INPUT(1, %r12, %r13, RB0);
+	movq (4 * 8)(%r8), %r10;
+	movq (5 * 8)(%r8), %r11;
+	movq (6 * 8)(%r8), %r12;
+	movq (7 * 8)(%r8), %r13;
+	OCB_INPUT(2, %r10, %r11, RC0);
+	OCB_INPUT(3, %r12, %r13, RD0);
+	movq (8 * 8)(%r8), %r10;
+	movq (9 * 8)(%r8), %r11;
+	movq (10 * 8)(%r8), %r12;
+	movq (11 * 8)(%r8), %r13;
+	OCB_INPUT(4, %r10, %r11, RA1);
+	OCB_INPUT(5, %r12, %r13, RB1);
+	movq (12 * 8)(%r8), %r10;
+	movq (13 * 8)(%r8), %r11;
+	movq (14 * 8)(%r8), %r12;
+	movq (15 * 8)(%r8), %r13;
+	OCB_INPUT(6, %r10, %r11, RC1);
+	OCB_INPUT(7, %r12, %r13, RD1);
+#undef OCB_INPUT
+
+	vmovdqu RTMP0x, (%rdx);
+
+	movq (0 * 8)(%rsp), %r10;
+	movq (1 * 8)(%rsp), %r11;
+	movq (2 * 8)(%rsp), %r12;
+	movq (3 * 8)(%rsp), %r13;
+
+	call __twofish_enc_blk16;
+
+	vpxor RA0, RB0, RA0;
+	vpxor RC0, RD0, RC0;
+	vpxor RA1, RB1, RA1;
+	vpxor RC1, RD1, RC1;
+
+	vpxor RA0, RC0, RA0;
+	vpxor RA1, RC1, RA1;
+
+	addq $(4 * 8), %rsp;
+
+	vpxor RA1, RA0, RTMP1;
+
+	vextracti128 $1, RTMP1, RNOTx;
+	vpxor (%rcx), RTMP1x, RTMP1x;
+	vpxor RNOTx, RTMP1x, RTMP1x;
+	vmovdqu RTMP1x, (%rcx);
+
+	vzeroall;
+
+	ret;
+ELF(.size _gcry_twofish_avx2_ocb_auth,.-_gcry_twofish_avx2_ocb_auth;)
+
+.align 16
+
+/* For CTR-mode IV byteswap */
+ _gcry_twofish_bswap128_mask:
+.Lbswap128_mask:
+	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+ELF(.size _gcry_twofish_bswap128_mask,.-_gcry_twofish_bswap128_mask;)
+
+#endif /*defined(USE_TWOFISH) && defined(ENABLE_AVX2_SUPPORT)*/
+#endif /*__x86_64*/
diff --git a/cipher/twofish.c b/cipher/twofish.c
index 55f6fb9..942e8d4 100644
--- a/cipher/twofish.c
+++ b/cipher/twofish.c
@@ -72,6 +72,15 @@
 #  endif
 # endif
 
+/* USE_AVX2 indicates whether to compile with AMD64 AVX2 code. */
+#undef USE_AVX2
+#if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+# if defined(ENABLE_AVX2_SUPPORT)
+#  define USE_AVX2 1
+# endif
+#endif
+
 
 /* Prototype for the self-test function. */
 static const char *selftest(void);
@@ -82,8 +91,25 @@ static const char *selftest(void);
  * that k[i] corresponds to what the Twofish paper calls K[i+8]. */
 typedef struct {
    u32 s[4][256], w[8], k[32];
+
+#ifdef USE_AVX2
+  int use_avx2;
+#endif
 } TWOFISH_context;
 

+
+/* Assembly implementations use SystemV ABI, ABI conversion and additional
+ * stack to store XMM6-XMM15 needed on Win64. */
+#undef ASM_FUNC_ABI
+#if defined(USE_AVX2)
+# ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+#  define ASM_FUNC_ABI __attribute__((sysv_abi))
+# else
+#  define ASM_FUNC_ABI
+# endif
+#endif
+
+
 /* These two tables are the q0 and q1 permutations, exactly as described in
  * the Twofish paper. */
 
@@ -711,12 +737,66 @@ static gcry_err_code_t
 twofish_setkey (void *context, const byte *key, unsigned int keylen)
 {
   TWOFISH_context *ctx = context;
-  int rc = do_twofish_setkey (ctx, key, keylen);
+  unsigned int hwfeatures = _gcry_get_hw_features ();
+  int rc;
+
+  rc = do_twofish_setkey (ctx, key, keylen);
+
+#ifdef USE_AVX2
+  ctx->use_avx2 = 0;
+  if ((hwfeatures & HWF_INTEL_AVX2) && (hwfeatures & HWF_INTEL_FAST_VPGATHER))
+    {
+      ctx->use_avx2 = 1;
+    }
+#endif
+
+  (void)hwfeatures;
+
   _gcry_burn_stack (23+6*sizeof(void*));
   return rc;
 }
 
 
+#ifdef USE_AVX2
+/* Assembler implementations of Twofish using AVX2.  Process 16 block in
+   parallel.
+ */
+extern void _gcry_twofish_avx2_ctr_enc(const TWOFISH_context *ctx,
+				       unsigned char *out,
+				       const unsigned char *in,
+				       unsigned char *ctr) ASM_FUNC_ABI;
+
+extern void _gcry_twofish_avx2_cbc_dec(const TWOFISH_context *ctx,
+				       unsigned char *out,
+				       const unsigned char *in,
+				       unsigned char *iv) ASM_FUNC_ABI;
+
+extern void _gcry_twofish_avx2_cfb_dec(const TWOFISH_context *ctx,
+				       unsigned char *out,
+				       const unsigned char *in,
+				       unsigned char *iv) ASM_FUNC_ABI;
+
+extern void _gcry_twofish_avx2_ocb_enc(const TWOFISH_context *ctx,
+				       unsigned char *out,
+				       const unsigned char *in,
+				       unsigned char *offset,
+				       unsigned char *checksum,
+				       const u64 Ls[16]) ASM_FUNC_ABI;
+
+extern void _gcry_twofish_avx2_ocb_dec(const TWOFISH_context *ctx,
+				       unsigned char *out,
+				       const unsigned char *in,
+				       unsigned char *offset,
+				       unsigned char *checksum,
+				       const u64 Ls[16]) ASM_FUNC_ABI;
+
+extern void _gcry_twofish_avx2_ocb_auth(const TWOFISH_context *ctx,
+					const unsigned char *abuf,
+					unsigned char *offset,
+					unsigned char *checksum,
+					const u64 Ls[16]) ASM_FUNC_ABI;
+#endif
+
 

 #ifdef USE_AMD64_ASM
 
@@ -1111,6 +1191,31 @@ _gcry_twofish_ctr_enc(void *context, unsigned char *ctr, void *outbuf_arg,
   unsigned int burn, burn_stack_depth = 0;
   int i;
 
+#ifdef USE_AVX2
+  if (ctx->use_avx2)
+    {
+      int did_use_avx2 = 0;
+
+      /* Process data in 16 block chunks. */
+      while (nblocks >= 16)
+        {
+          _gcry_twofish_avx2_ctr_enc(ctx, outbuf, inbuf, ctr);
+
+          nblocks -= 16;
+          outbuf += 16 * TWOFISH_BLOCKSIZE;
+          inbuf  += 16 * TWOFISH_BLOCKSIZE;
+          did_use_avx2 = 1;
+        }
+
+      if (did_use_avx2)
+        {
+          /* twofish-avx2 assembly code does not use stack */
+          if (nblocks == 0)
+            burn_stack_depth = 0;
+        }
+    }
+#endif
+
 #ifdef USE_AMD64_ASM
   {
     /* Process data in 3 block chunks. */
@@ -1169,6 +1274,31 @@ _gcry_twofish_cbc_dec(void *context, unsigned char *iv, void *outbuf_arg,
   unsigned char savebuf[TWOFISH_BLOCKSIZE];
   unsigned int burn, burn_stack_depth = 0;
 
+#ifdef USE_AVX2
+  if (ctx->use_avx2)
+    {
+      int did_use_avx2 = 0;
+
+      /* Process data in 16 block chunks. */
+      while (nblocks >= 16)
+        {
+          _gcry_twofish_avx2_cbc_dec(ctx, outbuf, inbuf, iv);
+
+          nblocks -= 16;
+          outbuf += 16 * TWOFISH_BLOCKSIZE;
+          inbuf  += 16 * TWOFISH_BLOCKSIZE;
+          did_use_avx2 = 1;
+        }
+
+      if (did_use_avx2)
+        {
+          /* twofish-avx2 assembly code does not use stack */
+          if (nblocks == 0)
+            burn_stack_depth = 0;
+        }
+    }
+#endif
+
 #ifdef USE_AMD64_ASM
   {
     /* Process data in 3 block chunks. */
@@ -1218,6 +1348,31 @@ _gcry_twofish_cfb_dec(void *context, unsigned char *iv, void *outbuf_arg,
   const unsigned char *inbuf = inbuf_arg;
   unsigned int burn, burn_stack_depth = 0;
 
+#ifdef USE_AVX2
+  if (ctx->use_avx2)
+    {
+      int did_use_avx2 = 0;
+
+      /* Process data in 16 block chunks. */
+      while (nblocks >= 16)
+        {
+          _gcry_twofish_avx2_cfb_dec(ctx, outbuf, inbuf, iv);
+
+          nblocks -= 16;
+          outbuf += 16 * TWOFISH_BLOCKSIZE;
+          inbuf  += 16 * TWOFISH_BLOCKSIZE;
+          did_use_avx2 = 1;
+        }
+
+      if (did_use_avx2)
+        {
+          /* twofish-avx2 assembly code does not use stack */
+          if (nblocks == 0)
+            burn_stack_depth = 0;
+        }
+    }
+#endif
+
 #ifdef USE_AMD64_ASM
   {
     /* Process data in 3 block chunks. */
@@ -1264,6 +1419,62 @@ _gcry_twofish_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
   unsigned int burn, burn_stack_depth = 0;
   u64 blkn = c->u_mode.ocb.data_nblocks;
 
+#ifdef USE_AVX2
+  if (ctx->use_avx2)
+    {
+      int did_use_avx2 = 0;
+      u64 Ls[16];
+      unsigned int n = 16 - (blkn % 16);
+      u64 *l;
+      int i;
+
+      if (nblocks >= 16)
+	{
+	  for (i = 0; i < 16; i += 8)
+	    {
+	      /* Use u64 to store pointers for x32 support (assembly function
+	       * assumes 64-bit pointers). */
+	      Ls[(i + 0 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+	      Ls[(i + 1 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+	      Ls[(i + 2 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+	      Ls[(i + 3 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
+	      Ls[(i + 4 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+	      Ls[(i + 5 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+	      Ls[(i + 6 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+	    }
+
+	  Ls[(7 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
+	  l = &Ls[(15 + n) % 16];
+
+	  /* Process data in 16 block chunks. */
+	  while (nblocks >= 16)
+	    {
+	      blkn += 16;
+	      *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 16);
+
+	      if (encrypt)
+		_gcry_twofish_avx2_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
+					  c->u_ctr.ctr, Ls);
+	      else
+		_gcry_twofish_avx2_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
+					  c->u_ctr.ctr, Ls);
+
+	      nblocks -= 16;
+	      outbuf += 16 * TWOFISH_BLOCKSIZE;
+	      inbuf  += 16 * TWOFISH_BLOCKSIZE;
+	      did_use_avx2 = 1;
+	    }
+	}
+
+      if (did_use_avx2)
+	{
+	  /* twofish-avx2 assembly code does not use stack */
+	  if (nblocks == 0)
+	    burn_stack_depth = 0;
+	}
+    }
+#endif
+
   {
     /* Use u64 to store pointers for x32 support (assembly function
       * assumes 64-bit pointers). */
@@ -1321,6 +1532,59 @@ _gcry_twofish_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
   unsigned int burn, burn_stack_depth = 0;
   u64 blkn = c->u_mode.ocb.aad_nblocks;
 
+#ifdef USE_AVX2
+  if (ctx->use_avx2)
+    {
+      int did_use_avx2 = 0;
+      u64 Ls[16];
+      unsigned int n = 16 - (blkn % 16);
+      u64 *l;
+      int i;
+
+      if (nblocks >= 16)
+	{
+	  for (i = 0; i < 16; i += 8)
+	    {
+	      /* Use u64 to store pointers for x32 support (assembly function
+	       * assumes 64-bit pointers). */
+	      Ls[(i + 0 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+	      Ls[(i + 1 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+	      Ls[(i + 2 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+	      Ls[(i + 3 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
+	      Ls[(i + 4 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+	      Ls[(i + 5 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+	      Ls[(i + 6 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+	    }
+
+	  Ls[(7 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
+	  l = &Ls[(15 + n) % 16];
+
+	  /* Process data in 16 block chunks. */
+	  while (nblocks >= 16)
+	    {
+	      blkn += 16;
+	      *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 16);
+
+	      _gcry_twofish_avx2_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
+					  c->u_mode.ocb.aad_sum, Ls);
+
+	      nblocks -= 16;
+	      abuf += 16 * TWOFISH_BLOCKSIZE;
+	      did_use_avx2 = 1;
+	    }
+	}
+
+      if (did_use_avx2)
+	{
+	  /* twofish-avx2 assembly code does not use stack */
+	  if (nblocks == 0)
+	    burn_stack_depth = 0;
+	}
+
+      /* Use generic code to handle smaller chunks... */
+    }
+#endif
+
   {
     /* Use u64 to store pointers for x32 support (assembly function
       * assumes 64-bit pointers). */
@@ -1367,7 +1631,7 @@ _gcry_twofish_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
 static const char *
 selftest_ctr (void)
 {
-  const int nblocks = 3+1;
+  const int nblocks = 16+1;
   const int blocksize = TWOFISH_BLOCKSIZE;
   const int context_size = sizeof(TWOFISH_context);
 
@@ -1381,7 +1645,7 @@ selftest_ctr (void)
 static const char *
 selftest_cbc (void)
 {
-  const int nblocks = 3+2;
+  const int nblocks = 16+2;
   const int blocksize = TWOFISH_BLOCKSIZE;
   const int context_size = sizeof(TWOFISH_context);
 
@@ -1395,7 +1659,7 @@ selftest_cbc (void)
 static const char *
 selftest_cfb (void)
 {
-  const int nblocks = 3+2;
+  const int nblocks = 16+2;
   const int blocksize = TWOFISH_BLOCKSIZE;
   const int context_size = sizeof(TWOFISH_context);
 
diff --git a/configure.ac b/configure.ac
index 91562a9..4932786 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2070,6 +2070,11 @@ if test "$found" = "1" ; then
       x86_64-*-*)
          # Build with the assembly implementation
          GCRYPT_CIPHERS="$GCRYPT_CIPHERS twofish-amd64.lo"
+
+         if test x"$avx2support" = xyes ; then
+            # Build with the AVX2 implementation
+            GCRYPT_CIPHERS="$GCRYPT_CIPHERS twofish-avx2-amd64.lo"
+         fi
       ;;
       arm*-*-*)
          # Build with the assembly implementation
diff --git a/src/g10lib.h b/src/g10lib.h
index f0a4628..1308cff 100644
--- a/src/g10lib.h
+++ b/src/g10lib.h
@@ -196,27 +196,28 @@ char **_gcry_strtokenize (const char *string, const char *delim);
 
 
 /*-- src/hwfeatures.c --*/
-#define HWF_PADLOCK_RNG     (1 << 0)
-#define HWF_PADLOCK_AES     (1 << 1)
-#define HWF_PADLOCK_SHA     (1 << 2)
-#define HWF_PADLOCK_MMUL    (1 << 3)
-
-#define HWF_INTEL_CPU       (1 << 4)
-#define HWF_INTEL_FAST_SHLD (1 << 5)
-#define HWF_INTEL_BMI2      (1 << 6)
-#define HWF_INTEL_SSSE3     (1 << 7)
-#define HWF_INTEL_SSE4_1    (1 << 8)
-#define HWF_INTEL_PCLMUL    (1 << 9)
-#define HWF_INTEL_AESNI     (1 << 10)
-#define HWF_INTEL_RDRAND    (1 << 11)
-#define HWF_INTEL_AVX       (1 << 12)
-#define HWF_INTEL_AVX2      (1 << 13)
-
-#define HWF_ARM_NEON        (1 << 14)
-#define HWF_ARM_AES         (1 << 15)
-#define HWF_ARM_SHA1        (1 << 16)
-#define HWF_ARM_SHA2        (1 << 17)
-#define HWF_ARM_PMULL       (1 << 18)
+#define HWF_PADLOCK_RNG         (1 << 0)
+#define HWF_PADLOCK_AES         (1 << 1)
+#define HWF_PADLOCK_SHA         (1 << 2)
+#define HWF_PADLOCK_MMUL        (1 << 3)
+
+#define HWF_INTEL_CPU           (1 << 4)
+#define HWF_INTEL_FAST_SHLD     (1 << 5)
+#define HWF_INTEL_BMI2          (1 << 6)
+#define HWF_INTEL_SSSE3         (1 << 7)
+#define HWF_INTEL_SSE4_1        (1 << 8)
+#define HWF_INTEL_PCLMUL        (1 << 9)
+#define HWF_INTEL_AESNI         (1 << 10)
+#define HWF_INTEL_RDRAND        (1 << 11)
+#define HWF_INTEL_AVX           (1 << 12)
+#define HWF_INTEL_AVX2          (1 << 13)
+#define HWF_INTEL_FAST_VPGATHER (1 << 14)
+
+#define HWF_ARM_NEON            (1 << 15)
+#define HWF_ARM_AES             (1 << 16)
+#define HWF_ARM_SHA1            (1 << 17)
+#define HWF_ARM_SHA2            (1 << 18)
+#define HWF_ARM_PMULL           (1 << 19)
 
 
 gpg_err_code_t _gcry_disable_hw_feature (const char *name);
diff --git a/src/hwf-x86.c b/src/hwf-x86.c
index eeacccb..a746ab2 100644
--- a/src/hwf-x86.c
+++ b/src/hwf-x86.c
@@ -176,6 +176,7 @@ detect_x86_gnuc (void)
   unsigned int max_cpuid_level;
   unsigned int fms, family, model;
   unsigned int result = 0;
+  unsigned int avoid_vpgather = 0;
 
   (void)os_supports_avx_avx2_registers;
 
@@ -262,11 +263,33 @@ detect_x86_gnuc (void)
 	case 0x47:
 	case 0x4E:
 	case 0x5E:
+	case 0x8E:
+	case 0x9E:
 	case 0x55:
 	case 0x66:
 	  result |= HWF_INTEL_FAST_SHLD;
 	  break;
 	}
+
+      /* These Intel Core processors that have AVX2 have slow VPGATHER and
+       * should be avoided for table-lookup use. */
+      switch (model)
+	{
+	case 0x3C:
+	case 0x3F:
+	case 0x45:
+	case 0x46:
+	  /* Haswell */
+	  avoid_vpgather |= 1;
+	  break;
+	}
+    }
+  else
+    {
+      /* Avoid VPGATHER for non-Intel CPUs as testing is needed to
+       * make sure it is fast enough. */
+
+      avoid_vpgather |= 1;
     }
 
 #ifdef ENABLE_PCLMUL_SUPPORT
@@ -324,6 +347,9 @@ detect_x86_gnuc (void)
       if (features & 0x00000020)
         if (os_supports_avx_avx2_registers)
           result |= HWF_INTEL_AVX2;
+
+      if ((result & HWF_INTEL_AVX2) && !avoid_vpgather)
+        result |= HWF_INTEL_FAST_VPGATHER;
 #endif /*ENABLE_AVX_SUPPORT*/
     }
 
diff --git a/src/hwfeatures.c b/src/hwfeatures.c
index 82f8bf2..b2ae7c3 100644
--- a/src/hwfeatures.c
+++ b/src/hwfeatures.c
@@ -42,25 +42,26 @@ static struct
   const char *desc;
 } hwflist[] =
   {
-    { HWF_PADLOCK_RNG,     "padlock-rng" },
-    { HWF_PADLOCK_AES,     "padlock-aes" },
-    { HWF_PADLOCK_SHA,     "padlock-sha" },
-    { HWF_PADLOCK_MMUL,    "padlock-mmul"},
-    { HWF_INTEL_CPU,       "intel-cpu" },
-    { HWF_INTEL_FAST_SHLD, "intel-fast-shld" },
-    { HWF_INTEL_BMI2,      "intel-bmi2" },
-    { HWF_INTEL_SSSE3,     "intel-ssse3" },
-    { HWF_INTEL_SSE4_1,    "intel-sse4.1" },
-    { HWF_INTEL_PCLMUL,    "intel-pclmul" },
-    { HWF_INTEL_AESNI,     "intel-aesni" },
-    { HWF_INTEL_RDRAND,    "intel-rdrand" },
-    { HWF_INTEL_AVX,       "intel-avx" },
-    { HWF_INTEL_AVX2,      "intel-avx2" },
-    { HWF_ARM_NEON,        "arm-neon" },
-    { HWF_ARM_AES,         "arm-aes" },
-    { HWF_ARM_SHA1,        "arm-sha1" },
-    { HWF_ARM_SHA2,        "arm-sha2" },
-    { HWF_ARM_PMULL,       "arm-pmull" }
+    { HWF_PADLOCK_RNG,         "padlock-rng" },
+    { HWF_PADLOCK_AES,         "padlock-aes" },
+    { HWF_PADLOCK_SHA,         "padlock-sha" },
+    { HWF_PADLOCK_MMUL,        "padlock-mmul"},
+    { HWF_INTEL_CPU,           "intel-cpu" },
+    { HWF_INTEL_FAST_SHLD,     "intel-fast-shld" },
+    { HWF_INTEL_BMI2,          "intel-bmi2" },
+    { HWF_INTEL_SSSE3,         "intel-ssse3" },
+    { HWF_INTEL_SSE4_1,        "intel-sse4.1" },
+    { HWF_INTEL_PCLMUL,        "intel-pclmul" },
+    { HWF_INTEL_AESNI,         "intel-aesni" },
+    { HWF_INTEL_RDRAND,        "intel-rdrand" },
+    { HWF_INTEL_AVX,           "intel-avx" },
+    { HWF_INTEL_AVX2,          "intel-avx2" },
+    { HWF_INTEL_FAST_VPGATHER, "intel-fast-vpgather" },
+    { HWF_ARM_NEON,            "arm-neon" },
+    { HWF_ARM_AES,             "arm-aes" },
+    { HWF_ARM_SHA1,            "arm-sha1" },
+    { HWF_ARM_SHA2,            "arm-sha2" },
+    { HWF_ARM_PMULL,           "arm-pmull" }
   };
 
 /* A bit vector with the hardware features which shall not be used.



