[PATCH 2/7] Add GFNI/AVX2 implementation of Camellia

Jussi Kivilinna jussi.kivilinna at iki.fi
Sun Apr 24 20:40:20 CEST 2022


* cipher/Makefile.am: Add "camellia-gfni-avx2-amd64.S".
* cipher/camellia-aesni-avx2-amd64.h [CAMELLIA_GFNI_BUILD]: Add GFNI
support.
* cipher/camellia-gfni-avx2-amd64.S: New.
* cipher/camellia-glue.c (USE_GFNI_AVX2): New.
(CAMELLIA_context) [USE_AESNI_AVX2]: New member "use_gfni_avx2".
[USE_GFNI_AVX2] (_gcry_camellia_gfni_avx2_ctr_enc)
(_gcry_camellia_gfni_avx2_cbc_dec, _gcry_camellia_gfni_avx2_cfb_dec)
(_gcry_camellia_gfni_avx2_ocb_enc, _gcry_camellia_gfni_avx2_ocb_dec)
(_gcry_camellia_gfni_avx2_ocb_auth): New.
(camellia_setkey) [USE_GFNI_AVX2]: Enable GFNI if supported by HW.
(_gcry_camellia_ctr_enc) [USE_GFNI_AVX2]: Add GFNI support.
(_gcry_camellia_cbc_dec) [USE_GFNI_AVX2]: Add GFNI support.
(_gcry_camellia_cfb_dec) [USE_GFNI_AVX2]: Add GFNI support.
(_gcry_camellia_ocb_crypt) [USE_GFNI_AVX2]: Add GFNI support.
(_gcry_camellia_ocb_auth) [USE_GFNI_AVX2]: Add GFNI support.
* configure.ac: Add "camellia-gfni-avx2-amd64.lo".
--

Benchmark on Intel Core i3-1115G4 (tigerlake):

Before (VAES/AVX2 implementation):
 CAMELLIA128    |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        CBC dec |     0.579 ns/B      1646 MiB/s      2.37 c/B      4090
        CFB dec |     0.579 ns/B      1648 MiB/s      2.37 c/B      4089
        CTR enc |     0.586 ns/B      1628 MiB/s      2.40 c/B      4090
        CTR dec |     0.587 ns/B      1626 MiB/s      2.40 c/B      4090
        OCB enc |     0.607 ns/B      1570 MiB/s      2.48 c/B      4089
        OCB dec |     0.611 ns/B      1561 MiB/s      2.50 c/B      4089
       OCB auth |     0.602 ns/B      1585 MiB/s      2.46 c/B      4089

After (~80% faster):
 CAMELLIA128    |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        CBC dec |     0.299 ns/B      3186 MiB/s      1.22 c/B      4090
        CFB dec |     0.314 ns/B      3039 MiB/s      1.28 c/B      4089
        CTR enc |     0.322 ns/B      2962 MiB/s      1.32 c/B      4090
        CTR dec |     0.321 ns/B      2970 MiB/s      1.31 c/B      4090
        OCB enc |     0.339 ns/B      2817 MiB/s      1.38 c/B      4089
        OCB dec |     0.346 ns/B      2756 MiB/s      1.41 c/B      4089
       OCB auth |     0.337 ns/B      2831 MiB/s      1.38 c/B      4089

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
---
 cipher/Makefile.am                 |   5 +-
 cipher/camellia-aesni-avx2-amd64.h | 249 ++++++++++++++++++++++++++++-
 cipher/camellia-gfni-avx2-amd64.S  |  34 ++++
 cipher/camellia-glue.c             | 170 +++++++++++++-------
 configure.ac                       |   3 +
 5 files changed, 398 insertions(+), 63 deletions(-)
 create mode 100644 cipher/camellia-gfni-avx2-amd64.S

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 07e5ba26..7a429e8b 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -139,8 +139,9 @@ EXTRA_libcipher_la_SOURCES = \
 	twofish-avx2-amd64.S \
 	rfc2268.c \
 	camellia.c camellia.h camellia-glue.c camellia-aesni-avx-amd64.S \
-	camellia-aesni-avx2-amd64.h camellia-vaes-avx2-amd64.S \
-	camellia-aesni-avx2-amd64.S camellia-arm.S camellia-aarch64.S \
+	camellia-aesni-avx2-amd64.h camellia-gfni-avx2-amd64.S \
+	camellia-vaes-avx2-amd64.S camellia-aesni-avx2-amd64.S \
+	camellia-arm.S camellia-aarch64.S \
 	blake2.c \
 	blake2b-amd64-avx2.S blake2s-amd64-avx.S
 
diff --git a/cipher/camellia-aesni-avx2-amd64.h b/cipher/camellia-aesni-avx2-amd64.h
index e93c40b8..8cd4b1cd 100644
--- a/cipher/camellia-aesni-avx2-amd64.h
+++ b/cipher/camellia-aesni-avx2-amd64.h
@@ -1,6 +1,6 @@
-/* camellia-aesni-avx2-amd64.h - AES-NI/VAES/AVX2 implementation of Camellia
+/* camellia-aesni-avx2-amd64.h - AES-NI/VAES/GFNI/AVX2 implementation of Camellia
  *
- * Copyright (C) 2013-2015,2020-2021 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ * Copyright (C) 2013-2015,2020-2022 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  *
  * This file is part of Libgcrypt.
  *
@@ -36,6 +36,8 @@
 /**********************************************************************
   helper macros
  **********************************************************************/
+
+#ifndef CAMELLIA_GFNI_BUILD
 #define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
 	vpand x, mask4bit, tmp0; \
 	vpandn x, mask4bit, x; \
@@ -44,6 +46,7 @@
 	vpshufb tmp0, lo_t, tmp0; \
 	vpshufb x, hi_t, x; \
 	vpxor tmp0, x, x;
+#endif
 
 #define ymm0_x xmm0
 #define ymm1_x xmm1
@@ -70,11 +73,61 @@
 # define IF_VAES(...)
 #endif
 
+/**********************************************************************
+  GFNI helper macros and constants
+ **********************************************************************/
+
+#ifdef CAMELLIA_GFNI_BUILD
+
+#define BV8(a0,a1,a2,a3,a4,a5,a6,a7) \
+	( (((a0) & 1) << 0) | \
+	  (((a1) & 1) << 1) | \
+	  (((a2) & 1) << 2) | \
+	  (((a3) & 1) << 3) | \
+	  (((a4) & 1) << 4) | \
+	  (((a5) & 1) << 5) | \
+	  (((a6) & 1) << 6) | \
+	  (((a7) & 1) << 7) )
+
+#define BM8X8(l0,l1,l2,l3,l4,l5,l6,l7) \
+	( ((l7) << (0 * 8)) | \
+	  ((l6) << (1 * 8)) | \
+	  ((l5) << (2 * 8)) | \
+	  ((l4) << (3 * 8)) | \
+	  ((l3) << (4 * 8)) | \
+	  ((l2) << (5 * 8)) | \
+	  ((l1) << (6 * 8)) | \
+	  ((l0) << (7 * 8)) )
+
+/* Pre-filters and post-filters constants for Camellia sboxes s1, s2, s3 and s4.
+ *   See http://urn.fi/URN:NBN:fi:oulu-201305311409, pages 43-48.
+ *
+ * Pre-filters are directly from above source, "θ₁"/"θ₄". Post-filters are
+ * combination of function "A" (AES SubBytes affine transformation) and
+ * "ψ₁"/"ψ₂"/"ψ₃".
+ */
+
+/* Constant from "θ₁(x)" and "θ₄(x)" functions. */
+#define pre_filter_constant_s1234 BV8(1, 0, 1, 0, 0, 0, 1, 0)
+
+/* Constant from "ψ₁(A(x))" function: */
+#define post_filter_constant_s14  BV8(0, 1, 1, 1, 0, 1, 1, 0)
+
+/* Constant from "ψ₂(A(x))" function: */
+#define post_filter_constant_s2   BV8(0, 0, 1, 1, 1, 0, 1, 1)
+
+/* Constant from "ψ₃(A(x))" function: */
+#define post_filter_constant_s3   BV8(1, 1, 1, 0, 1, 1, 0, 0)
+
+#endif /* CAMELLIA_GFNI_BUILD */
+
 /**********************************************************************
   32-way camellia
  **********************************************************************/
 
-/*
+#ifdef CAMELLIA_GFNI_BUILD
+
+/* roundsm32 (GFNI version)
  * IN:
  *   x0..x7: byte-sliced AB state
  *   mem_cd: register pointer storing CD state
@@ -82,7 +135,119 @@
  * OUT:
  *   x0..x7: new byte-sliced CD state
  */
+#define roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, \
+		  t6, t7, mem_cd, key) \
+	/* \
+	 * S-function with AES subbytes \
+	 */ \
+	vpbroadcastq .Lpre_filter_bitmatrix_s123 rRIP, t5; \
+	vpbroadcastq .Lpre_filter_bitmatrix_s4 rRIP, t2; \
+	vpbroadcastq .Lpost_filter_bitmatrix_s14 rRIP, t4; \
+	vpbroadcastq .Lpost_filter_bitmatrix_s2 rRIP, t3; \
+	vpbroadcastq .Lpost_filter_bitmatrix_s3 rRIP, t6; \
+	vpxor t7##_x, t7##_x, t7##_x; \
+	vpbroadcastq key, t0; /* higher 64-bit duplicate ignored */ \
+	\
+	/* prefilter sboxes */ \
+	vgf2p8affineqb $(pre_filter_constant_s1234), t5, x0, x0; \
+	vgf2p8affineqb $(pre_filter_constant_s1234), t5, x7, x7; \
+	vgf2p8affineqb $(pre_filter_constant_s1234), t2, x3, x3; \
+	vgf2p8affineqb $(pre_filter_constant_s1234), t2, x6, x6; \
+	vgf2p8affineqb $(pre_filter_constant_s1234), t5, x2, x2; \
+	vgf2p8affineqb $(pre_filter_constant_s1234), t5, x5, x5; \
+	vgf2p8affineqb $(pre_filter_constant_s1234), t5, x1, x1; \
+	vgf2p8affineqb $(pre_filter_constant_s1234), t5, x4, x4; \
+	\
+	/* sbox GF8 inverse + postfilter sboxes 1 and 4 */ \
+	vgf2p8affineinvqb $(post_filter_constant_s14), t4, x0, x0; \
+	vgf2p8affineinvqb $(post_filter_constant_s14), t4, x7, x7; \
+	vgf2p8affineinvqb $(post_filter_constant_s14), t4, x3, x3; \
+	vgf2p8affineinvqb $(post_filter_constant_s14), t4, x6, x6; \
+	\
+	/* sbox GF8 inverse + postfilter sbox 3 */ \
+	vgf2p8affineinvqb $(post_filter_constant_s3), t6, x2, x2; \
+	vgf2p8affineinvqb $(post_filter_constant_s3), t6, x5, x5; \
+	\
+	/* sbox GF8 inverse + postfilter sbox 2 */ \
+	vgf2p8affineinvqb $(post_filter_constant_s2), t3, x1, x1; \
+	vgf2p8affineinvqb $(post_filter_constant_s2), t3, x4, x4; \
+	\
+	vpsrldq $1, t0, t1; \
+	vpsrldq $2, t0, t2; \
+	vpshufb t7, t1, t1; \
+	vpsrldq $3, t0, t3; \
+	\
+	/* P-function */ \
+	vpxor x5, x0, x0; \
+	vpxor x6, x1, x1; \
+	vpxor x7, x2, x2; \
+	vpxor x4, x3, x3; \
+	\
+	vpshufb t7, t2, t2; \
+	vpsrldq $4, t0, t4; \
+	vpshufb t7, t3, t3; \
+	vpsrldq $5, t0, t5; \
+	vpshufb t7, t4, t4; \
+	\
+	vpxor x2, x4, x4; \
+	vpxor x3, x5, x5; \
+	vpxor x0, x6, x6; \
+	vpxor x1, x7, x7; \
+	\
+	vpsrldq $6, t0, t6; \
+	vpshufb t7, t5, t5; \
+	vpshufb t7, t6, t6; \
+	\
+	vpxor x7, x0, x0; \
+	vpxor x4, x1, x1; \
+	vpxor x5, x2, x2; \
+	vpxor x6, x3, x3; \
+	\
+	vpxor x3, x4, x4; \
+	vpxor x0, x5, x5; \
+	vpxor x1, x6, x6; \
+	vpxor x2, x7, x7; /* note: high and low parts swapped */ \
+	\
+	/* Add key material and result to CD (x becomes new CD) */ \
+	\
+	vpxor t6, x1, x1; \
+	vpxor 5 * 32(mem_cd), x1, x1; \
+	\
+	vpsrldq $7, t0, t6; \
+	vpshufb t7, t0, t0; \
+	vpshufb t7, t6, t7; \
+	\
+	vpxor t7, x0, x0; \
+	vpxor 4 * 32(mem_cd), x0, x0; \
+	\
+	vpxor t5, x2, x2; \
+	vpxor 6 * 32(mem_cd), x2, x2; \
+	\
+	vpxor t4, x3, x3; \
+	vpxor 7 * 32(mem_cd), x3, x3; \
+	\
+	vpxor t3, x4, x4; \
+	vpxor 0 * 32(mem_cd), x4, x4; \
+	\
+	vpxor t2, x5, x5; \
+	vpxor 1 * 32(mem_cd), x5, x5; \
+	\
+	vpxor t1, x6, x6; \
+	vpxor 2 * 32(mem_cd), x6, x6; \
+	\
+	vpxor t0, x7, x7; \
+	vpxor 3 * 32(mem_cd), x7, x7;
 
+#else /* CAMELLIA_GFNI_BUILD */
+
+/* roundsm32 (AES-NI / VAES version)
+ * IN:
+ *   x0..x7: byte-sliced AB state
+ *   mem_cd: register pointer storing CD state
+ *   key: index for key material
+ * OUT:
+ *   x0..x7: new byte-sliced CD state
+ */
 #define roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, \
 		  t6, t7, mem_cd, key) \
 	/* \
@@ -181,7 +346,7 @@
 	/* postfilter sbox 2 */ \
 	filter_8bit(x1, t4, t5, t7, t2); \
 	filter_8bit(x4, t4, t5, t7, t2); \
-	vpxor t7, t7, t7; \
+	vpxor t7##_x, t7##_x, t7##_x; \
 	\
 	vpsrldq $1, t0, t1; \
 	vpsrldq $2, t0, t2; \
@@ -249,6 +414,8 @@
 	vpxor t0, x7, x7; \
 	vpxor 3 * 32(mem_cd), x7, x7;
 
+#endif /* CAMELLIA_GFNI_BUILD */
+
 /*
  * IN/OUT:
  *  x0..x7: byte-sliced AB state preloaded
@@ -623,6 +790,9 @@
 #define SHUFB_BYTES(idx) \
 	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
 
+FUNC_NAME(_constants):
+ELF(.type   FUNC_NAME(_constants),@object;)
+
 .Lshufb_16x16b:
 	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
 	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
@@ -635,6 +805,74 @@
 .Lbswap128_mask:
 	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
 
+#ifdef CAMELLIA_GFNI_BUILD
+
+/* Pre-filters and post-filters bit-matrixes for Camellia sboxes s1, s2, s3
+ * and s4.
+ *   See http://urn.fi/URN:NBN:fi:oulu-201305311409, pages 43-48.
+ *
+ * Pre-filters are directly from above source, "θ₁"/"θ₄". Post-filters are
+ * combination of function "A" (AES SubBytes affine transformation) and
+ * "ψ₁"/"ψ₂"/"ψ₃".
+ */
+
+/* Bit-matrix from "θ₁(x)" function: */
+.Lpre_filter_bitmatrix_s123:
+	.quad BM8X8(BV8(1, 1, 1, 0, 1, 1, 0, 1),
+		    BV8(0, 0, 1, 1, 0, 0, 1, 0),
+		    BV8(1, 1, 0, 1, 0, 0, 0, 0),
+		    BV8(1, 0, 1, 1, 0, 0, 1, 1),
+		    BV8(0, 0, 0, 0, 1, 1, 0, 0),
+		    BV8(1, 0, 1, 0, 0, 1, 0, 0),
+		    BV8(0, 0, 1, 0, 1, 1, 0, 0),
+		    BV8(1, 0, 0, 0, 0, 1, 1, 0))
+
+/* Bit-matrix from "θ₄(x)" function: */
+.Lpre_filter_bitmatrix_s4:
+	.quad BM8X8(BV8(1, 1, 0, 1, 1, 0, 1, 1),
+		    BV8(0, 1, 1, 0, 0, 1, 0, 0),
+		    BV8(1, 0, 1, 0, 0, 0, 0, 1),
+		    BV8(0, 1, 1, 0, 0, 1, 1, 1),
+		    BV8(0, 0, 0, 1, 1, 0, 0, 0),
+		    BV8(0, 1, 0, 0, 1, 0, 0, 1),
+		    BV8(0, 1, 0, 1, 1, 0, 0, 0),
+		    BV8(0, 0, 0, 0, 1, 1, 0, 1))
+
+/* Bit-matrix from "ψ₁(A(x))" function: */
+.Lpost_filter_bitmatrix_s14:
+	.quad BM8X8(BV8(0, 0, 0, 0, 0, 0, 0, 1),
+		    BV8(0, 1, 1, 0, 0, 1, 1, 0),
+		    BV8(1, 0, 1, 1, 1, 1, 1, 0),
+		    BV8(0, 0, 0, 1, 1, 0, 1, 1),
+		    BV8(1, 0, 0, 0, 1, 1, 1, 0),
+		    BV8(0, 1, 0, 1, 1, 1, 1, 0),
+		    BV8(0, 1, 1, 1, 1, 1, 1, 1),
+		    BV8(0, 0, 0, 1, 1, 1, 0, 0))
+
+/* Bit-matrix from "ψ₂(A(x))" function: */
+.Lpost_filter_bitmatrix_s2:
+	.quad BM8X8(BV8(0, 0, 0, 1, 1, 1, 0, 0),
+		    BV8(0, 0, 0, 0, 0, 0, 0, 1),
+		    BV8(0, 1, 1, 0, 0, 1, 1, 0),
+		    BV8(1, 0, 1, 1, 1, 1, 1, 0),
+		    BV8(0, 0, 0, 1, 1, 0, 1, 1),
+		    BV8(1, 0, 0, 0, 1, 1, 1, 0),
+		    BV8(0, 1, 0, 1, 1, 1, 1, 0),
+		    BV8(0, 1, 1, 1, 1, 1, 1, 1))
+
+/* Bit-matrix from "ψ₃(A(x))" function: */
+.Lpost_filter_bitmatrix_s3:
+	.quad BM8X8(BV8(0, 1, 1, 0, 0, 1, 1, 0),
+		    BV8(1, 0, 1, 1, 1, 1, 1, 0),
+		    BV8(0, 0, 0, 1, 1, 0, 1, 1),
+		    BV8(1, 0, 0, 0, 1, 1, 1, 0),
+		    BV8(0, 1, 0, 1, 1, 1, 1, 0),
+		    BV8(0, 1, 1, 1, 1, 1, 1, 1),
+		    BV8(0, 0, 0, 1, 1, 1, 0, 0),
+		    BV8(0, 0, 0, 0, 0, 0, 0, 1))
+
+#else /* CAMELLIA_GFNI_BUILD */
+
 /*
  * pre-SubByte transform
  *
@@ -756,6 +994,9 @@
 .L0f0f0f0f:
 	.long 0x0f0f0f0f
 
+#endif /* CAMELLIA_GFNI_BUILD */
+
+ELF(.size FUNC_NAME(_constants),.-FUNC_NAME(_constants);)
 
 .align 8
 ELF(.type   __camellia_enc_blk32,@function;)
diff --git a/cipher/camellia-gfni-avx2-amd64.S b/cipher/camellia-gfni-avx2-amd64.S
new file mode 100644
index 00000000..20c9a432
--- /dev/null
+++ b/cipher/camellia-gfni-avx2-amd64.S
@@ -0,0 +1,34 @@
+/* camellia-gfni-avx2-amd64.S  -  GFNI/AVX2 implementation of Camellia cipher
+ *
+ * Copyright (C) 2022 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#ifdef __x86_64
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
+    defined(ENABLE_GFNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT)
+
+#define CAMELLIA_GFNI_BUILD 1
+#define FUNC_NAME(func) _gcry_camellia_gfni_avx2_ ## func
+
+#include "camellia-aesni-avx2-amd64.h"
+
+#endif /* defined(ENABLE_GFNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT) */
+#endif /* __x86_64 */
diff --git a/cipher/camellia-glue.c b/cipher/camellia-glue.c
index 72c02d77..7f009db4 100644
--- a/cipher/camellia-glue.c
+++ b/cipher/camellia-glue.c
@@ -97,6 +97,12 @@
 # define USE_VAES_AVX2 1
 #endif
 
+/* USE_GFNI_AVX2 indicates whether to compile with Intel GFNI/AVX2 code. */
+#undef USE_GFNI_AVX2
+#if defined(USE_AESNI_AVX2) && defined(ENABLE_GFNI_SUPPORT)
+# define USE_GFNI_AVX2 1
+#endif
+
 typedef struct
 {
   KEY_TABLE_TYPE keytable;
@@ -107,6 +113,7 @@ typedef struct
 #ifdef USE_AESNI_AVX2
   unsigned int use_aesni_avx2:1;/* AES-NI/AVX2 implementation shall be used.  */
   unsigned int use_vaes_avx2:1; /* VAES/AVX2 implementation shall be used.  */
+  unsigned int use_gfni_avx2:1; /* GFNI/AVX2 implementation shall be used.  */
 #endif /*USE_AESNI_AVX2*/
 } CAMELLIA_context;
 
@@ -248,6 +255,46 @@ extern void _gcry_camellia_vaes_avx2_ocb_auth(CAMELLIA_context *ctx,
 					      const u64 Ls[32]) ASM_FUNC_ABI;
 #endif
 
+#ifdef USE_GFNI_AVX2
+/* Assembler implementations of Camellia using GFNI and AVX2.  Process data
+   in 32 blocks at the same time.
+ */
+extern void _gcry_camellia_gfni_avx2_ctr_enc(CAMELLIA_context *ctx,
+					     unsigned char *out,
+					     const unsigned char *in,
+					     unsigned char *ctr) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_gfni_avx2_cbc_dec(CAMELLIA_context *ctx,
+					     unsigned char *out,
+					     const unsigned char *in,
+					     unsigned char *iv) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_gfni_avx2_cfb_dec(CAMELLIA_context *ctx,
+					     unsigned char *out,
+					     const unsigned char *in,
+					     unsigned char *iv) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_gfni_avx2_ocb_enc(CAMELLIA_context *ctx,
+					     unsigned char *out,
+					     const unsigned char *in,
+					     unsigned char *offset,
+					     unsigned char *checksum,
+					     const u64 Ls[32]) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_gfni_avx2_ocb_dec(CAMELLIA_context *ctx,
+					     unsigned char *out,
+					     const unsigned char *in,
+					     unsigned char *offset,
+					     unsigned char *checksum,
+					     const u64 Ls[32]) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_gfni_avx2_ocb_auth(CAMELLIA_context *ctx,
+					      const unsigned char *abuf,
+					      unsigned char *offset,
+					      unsigned char *checksum,
+					      const u64 Ls[32]) ASM_FUNC_ABI;
+#endif
+
 static const char *selftest(void);
 
 static void _gcry_camellia_ctr_enc (void *context, unsigned char *ctr,
@@ -272,7 +319,8 @@ camellia_setkey(void *c, const byte *key, unsigned keylen,
   CAMELLIA_context *ctx=c;
   static int initialized=0;
   static const char *selftest_failed=NULL;
-#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2) || defined(USE_VAES_AVX2)
+#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2) \
+    || defined(USE_VAES_AVX2) || defined(USE_GFNI_AVX2)
   unsigned int hwf = _gcry_get_hw_features ();
 #endif
 
@@ -296,10 +344,14 @@ camellia_setkey(void *c, const byte *key, unsigned keylen,
 #ifdef USE_AESNI_AVX2
   ctx->use_aesni_avx2 = (hwf & HWF_INTEL_AESNI) && (hwf & HWF_INTEL_AVX2);
   ctx->use_vaes_avx2 = 0;
+  ctx->use_gfni_avx2 = 0;
 #endif
 #ifdef USE_VAES_AVX2
   ctx->use_vaes_avx2 = (hwf & HWF_INTEL_VAES_VPCLMUL) && (hwf & HWF_INTEL_AVX2);
 #endif
+#ifdef USE_GFNI_AVX2
+  ctx->use_gfni_avx2 = (hwf & HWF_INTEL_GFNI) && (hwf & HWF_INTEL_AVX2);
+#endif
 
   ctx->keybitlength=keylen*8;
 
@@ -440,20 +492,22 @@ _gcry_camellia_ctr_enc(void *context, unsigned char *ctr,
   if (ctx->use_aesni_avx2)
     {
       int did_use_aesni_avx2 = 0;
+      typeof (&_gcry_camellia_aesni_avx2_ctr_enc) bulk_ctr_fn =
+	  _gcry_camellia_aesni_avx2_ctr_enc;
+
 #ifdef USE_VAES_AVX2
-      int use_vaes = ctx->use_vaes_avx2;
+      if (ctx->use_vaes_avx2)
+	bulk_ctr_fn = _gcry_camellia_vaes_avx2_ctr_enc;
+#endif
+#ifdef USE_GFNI_AVX2
+      if (ctx->use_gfni_avx2)
+	bulk_ctr_fn = _gcry_camellia_gfni_avx2_ctr_enc;
 #endif
 
       /* Process data in 32 block chunks. */
       while (nblocks >= 32)
         {
-#ifdef USE_VAES_AVX2
-          if (use_vaes)
-            _gcry_camellia_vaes_avx2_ctr_enc(ctx, outbuf, inbuf, ctr);
-          else
-#endif
-            _gcry_camellia_aesni_avx2_ctr_enc(ctx, outbuf, inbuf, ctr);
-
+	  bulk_ctr_fn (ctx, outbuf, inbuf, ctr);
           nblocks -= 32;
           outbuf += 32 * CAMELLIA_BLOCK_SIZE;
           inbuf  += 32 * CAMELLIA_BLOCK_SIZE;
@@ -537,20 +591,22 @@ _gcry_camellia_cbc_dec(void *context, unsigned char *iv,
   if (ctx->use_aesni_avx2)
     {
       int did_use_aesni_avx2 = 0;
+      typeof (&_gcry_camellia_aesni_avx2_cbc_dec) bulk_cbc_fn =
+	  _gcry_camellia_aesni_avx2_cbc_dec;
+
 #ifdef USE_VAES_AVX2
-      int use_vaes = ctx->use_vaes_avx2;
+      if (ctx->use_vaes_avx2)
+	bulk_cbc_fn = _gcry_camellia_vaes_avx2_cbc_dec;
+#endif
+#ifdef USE_GFNI_AVX2
+      if (ctx->use_gfni_avx2)
+	bulk_cbc_fn = _gcry_camellia_gfni_avx2_cbc_dec;
 #endif
 
       /* Process data in 32 block chunks. */
       while (nblocks >= 32)
         {
-#ifdef USE_VAES_AVX2
-          if (use_vaes)
-            _gcry_camellia_vaes_avx2_cbc_dec(ctx, outbuf, inbuf, iv);
-          else
-#endif
-            _gcry_camellia_aesni_avx2_cbc_dec(ctx, outbuf, inbuf, iv);
-
+	  bulk_cbc_fn (ctx, outbuf, inbuf, iv);
           nblocks -= 32;
           outbuf += 32 * CAMELLIA_BLOCK_SIZE;
           inbuf  += 32 * CAMELLIA_BLOCK_SIZE;
@@ -631,20 +687,22 @@ _gcry_camellia_cfb_dec(void *context, unsigned char *iv,
   if (ctx->use_aesni_avx2)
     {
       int did_use_aesni_avx2 = 0;
+      typeof (&_gcry_camellia_aesni_avx2_cfb_dec) bulk_cfb_fn =
+	  _gcry_camellia_aesni_avx2_cfb_dec;
+
 #ifdef USE_VAES_AVX2
-      int use_vaes = ctx->use_vaes_avx2;
+      if (ctx->use_vaes_avx2)
+	bulk_cfb_fn = _gcry_camellia_vaes_avx2_cfb_dec;
+#endif
+#ifdef USE_GFNI_AVX2
+      if (ctx->use_gfni_avx2)
+	bulk_cfb_fn = _gcry_camellia_gfni_avx2_cfb_dec;
 #endif
 
       /* Process data in 32 block chunks. */
       while (nblocks >= 32)
         {
-#ifdef USE_VAES_AVX2
-          if (use_vaes)
-            _gcry_camellia_vaes_avx2_cfb_dec(ctx, outbuf, inbuf, iv);
-          else
-#endif
-            _gcry_camellia_aesni_avx2_cfb_dec(ctx, outbuf, inbuf, iv);
-
+	  bulk_cfb_fn (ctx, outbuf, inbuf, iv);
           nblocks -= 32;
           outbuf += 32 * CAMELLIA_BLOCK_SIZE;
           inbuf  += 32 * CAMELLIA_BLOCK_SIZE;
@@ -729,10 +787,6 @@ _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
   if (ctx->use_aesni_avx2)
     {
       int did_use_aesni_avx2 = 0;
-#ifdef USE_VAES_AVX2
-      int encrypt_use_vaes = encrypt && ctx->use_vaes_avx2;
-      int decrypt_use_vaes = !encrypt && ctx->use_vaes_avx2;
-#endif
       u64 Ls[32];
       unsigned int n = 32 - (blkn % 32);
       u64 *l;
@@ -740,6 +794,21 @@ _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
 
       if (nblocks >= 32)
 	{
+	  typeof (&_gcry_camellia_aesni_avx2_ocb_dec) bulk_ocb_fn =
+	      encrypt ? _gcry_camellia_aesni_avx2_ocb_enc
+		      : _gcry_camellia_aesni_avx2_ocb_dec;
+
+#ifdef USE_VAES_AVX2
+	  if (ctx->use_vaes_avx2)
+	    bulk_ocb_fn = encrypt ? _gcry_camellia_vaes_avx2_ocb_enc
+				  : _gcry_camellia_vaes_avx2_ocb_dec;
+#endif
+#ifdef USE_GFNI_AVX2
+	  if (ctx->use_gfni_avx2)
+	    bulk_ocb_fn = encrypt ? _gcry_camellia_gfni_avx2_ocb_enc
+				  : _gcry_camellia_gfni_avx2_ocb_dec;
+#endif
+
 	  for (i = 0; i < 32; i += 8)
 	    {
 	      /* Use u64 to store pointers for x32 support (assembly function
@@ -764,21 +833,7 @@ _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
 	      blkn += 32;
 	      *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 32);
 
-	      if (0) {}
-#ifdef USE_VAES_AVX2
-	      else if (encrypt_use_vaes)
-		_gcry_camellia_vaes_avx2_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
-                                                 c->u_ctr.ctr, Ls);
-	      else if (decrypt_use_vaes)
-		_gcry_camellia_vaes_avx2_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
-                                                 c->u_ctr.ctr, Ls);
-#endif
-	      else if (encrypt)
-		_gcry_camellia_aesni_avx2_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
-						  c->u_ctr.ctr, Ls);
-	      else
-		_gcry_camellia_aesni_avx2_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
-						  c->u_ctr.ctr, Ls);
+	      bulk_ocb_fn (ctx, outbuf, inbuf, c->u_iv.iv, c->u_ctr.ctr, Ls);
 
 	      nblocks -= 32;
 	      outbuf += 32 * CAMELLIA_BLOCK_SIZE;
@@ -891,9 +946,6 @@ _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
   if (ctx->use_aesni_avx2)
     {
       int did_use_aesni_avx2 = 0;
-#ifdef USE_VAES_AVX2
-      int use_vaes = ctx->use_vaes_avx2;
-#endif
       u64 Ls[32];
       unsigned int n = 32 - (blkn % 32);
       u64 *l;
@@ -901,6 +953,18 @@ _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
 
       if (nblocks >= 32)
 	{
+	  typeof (&_gcry_camellia_aesni_avx2_ocb_auth) bulk_auth_fn =
+	      _gcry_camellia_aesni_avx2_ocb_auth;
+
+#ifdef USE_VAES_AVX2
+	  if (ctx->use_vaes_avx2)
+	    bulk_auth_fn = _gcry_camellia_vaes_avx2_ocb_auth;
+#endif
+#ifdef USE_GFNI_AVX2
+	  if (ctx->use_gfni_avx2)
+	    bulk_auth_fn = _gcry_camellia_gfni_avx2_ocb_auth;
+#endif
+
 	  for (i = 0; i < 32; i += 8)
 	    {
 	      /* Use u64 to store pointers for x32 support (assembly function
@@ -925,16 +989,8 @@ _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
 	      blkn += 32;
 	      *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 32);
 
-#ifdef USE_VAES_AVX2
-              if (use_vaes)
-                _gcry_camellia_vaes_avx2_ocb_auth(ctx, abuf,
-                                                  c->u_mode.ocb.aad_offset,
-                                                  c->u_mode.ocb.aad_sum, Ls);
-              else
-#endif
-                _gcry_camellia_aesni_avx2_ocb_auth(ctx, abuf,
-                                                   c->u_mode.ocb.aad_offset,
-                                                   c->u_mode.ocb.aad_sum, Ls);
+	      bulk_auth_fn (ctx, abuf, c->u_mode.ocb.aad_offset,
+			    c->u_mode.ocb.aad_sum, Ls);
 
 	      nblocks -= 32;
 	      abuf += 32 * CAMELLIA_BLOCK_SIZE;
diff --git a/configure.ac b/configure.ac
index 15c92018..c5d61657 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2755,6 +2755,9 @@ if test "$found" = "1" ; then
 
         # Build with the VAES/AVX2 implementation
         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS camellia-vaes-avx2-amd64.lo"
+
+        # Build with the GFNI/AVX2 implementation
+        GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS camellia-gfni-avx2-amd64.lo"
       fi
    fi
 fi
-- 
2.34.1




More information about the Gcrypt-devel mailing list