[PATCH 2/2] camellia: add AArch64 crypto-extension implementation

Jussi Kivilinna jussi.kivilinna at iki.fi
Tue Feb 28 15:15:22 CET 2023


* cipher/Makefile.am: Add 'camellia-aarch64-ce.(c|o|lo)'.
(aarch64_neon_cflags): New.
* cipher/camellia-aarch64-ce.c: New.
* cipher/camellia-glue.c (USE_AARCH64_CE): New.
(CAMELLIA_context): Add 'use_aarch64ce'.
(_gcry_camellia_aarch64ce_encrypt_blk16)
(_gcry_camellia_aarch64ce_decrypt_blk16)
(_gcry_camellia_aarch64ce_keygen, camellia_aarch64ce_enc_blk16)
(camellia_aarch64ce_dec_blk16, aarch64ce_burn_stack_depth): New.
(camellia_setkey) [USE_AARCH64_CE]: Set use_aarch64ce if HW has
HWF_ARM_AES; Use AArch64/CE key generation if supported by HW.
(camellia_encrypt_blk1_32, camellia_decrypt_blk1_32)
[USE_AARCH64_CE]: Add AArch64/CE code path.
--

Patch enables 128-bit vector intrinsics implementation of Camellia
cipher for AArch64.

Benchmark on AWS Graviton2:

 Before:
 CAMELLIA128    |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        ECB enc |      5.99 ns/B     159.2 MiB/s     14.97 c/B      2500
        ECB dec |      5.99 ns/B     159.1 MiB/s     14.98 c/B      2500
        CBC enc |      6.16 ns/B     154.7 MiB/s     15.41 c/B      2500
        CBC dec |      6.12 ns/B     155.8 MiB/s     15.29 c/B      2499
        CFB enc |      6.49 ns/B     147.0 MiB/s     16.21 c/B      2500
        CFB dec |      6.05 ns/B     157.6 MiB/s     15.13 c/B      2500
        CTR enc |      6.09 ns/B     156.7 MiB/s     15.22 c/B      2500
        CTR dec |      6.09 ns/B     156.6 MiB/s     15.22 c/B      2500
        XTS enc |      6.16 ns/B     154.9 MiB/s     15.39 c/B      2500
        XTS dec |      6.16 ns/B     154.8 MiB/s     15.40 c/B      2499
        GCM enc |      6.31 ns/B     151.1 MiB/s     15.78 c/B      2500
        GCM dec |      6.31 ns/B     151.1 MiB/s     15.78 c/B      2500
       GCM auth |     0.206 ns/B      4635 MiB/s     0.514 c/B      2500
        OCB enc |      6.63 ns/B     143.9 MiB/s     16.57 c/B      2499
        OCB dec |      6.63 ns/B     143.9 MiB/s     16.56 c/B      2499
       OCB auth |      6.55 ns/B     145.7 MiB/s     16.37 c/B      2499

 After (ecb ~2.1x faster):
 CAMELLIA128    |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        ECB enc |      2.77 ns/B     344.2 MiB/s      6.93 c/B      2499
        ECB dec |      2.76 ns/B     345.3 MiB/s      6.90 c/B      2499
        CBC enc |      6.17 ns/B     154.7 MiB/s     15.41 c/B      2499
        CBC dec |      2.89 ns/B     330.3 MiB/s      7.22 c/B      2500
        CFB enc |      6.48 ns/B     147.1 MiB/s     16.21 c/B      2499
        CFB dec |      2.84 ns/B     336.1 MiB/s      7.09 c/B      2499
        CTR enc |      2.90 ns/B     328.8 MiB/s      7.25 c/B      2499
        CTR dec |      2.90 ns/B     328.9 MiB/s      7.25 c/B      2500
        XTS enc |      2.93 ns/B     325.3 MiB/s      7.33 c/B      2500
        XTS dec |      2.92 ns/B     326.2 MiB/s      7.31 c/B      2500
        GCM enc |      3.10 ns/B     307.2 MiB/s      7.76 c/B      2500
        GCM dec |      3.10 ns/B     307.2 MiB/s      7.76 c/B      2499
       GCM auth |     0.206 ns/B      4635 MiB/s     0.514 c/B      2500

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 cipher/Makefile.am           |  14 ++++-
 cipher/camellia-aarch64-ce.c |  42 ++++++++++++++
 cipher/camellia-glue.c       |  70 +++++++++++++++++++++++
 configure.ac                 | 106 +++++++++++++++++++++++++++++++++--
 4 files changed, 227 insertions(+), 5 deletions(-)
 create mode 100644 cipher/camellia-aarch64-ce.c

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 52435ed5..dcaa68bb 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -148,7 +148,7 @@ EXTRA_libcipher_la_SOURCES = \
 	camellia-aesni-avx2-amd64.h \
 	camellia-gfni-avx2-amd64.S camellia-gfni-avx512-amd64.S \
 	camellia-vaes-avx2-amd64.S camellia-aesni-avx2-amd64.S \
-	camellia-arm.S camellia-aarch64.S \
+	camellia-arm.S camellia-aarch64.S camellia-aarch64-ce.c \
 	camellia-simd128.h camellia-ppc8le.c camellia-ppc9le.c \
 	blake2.c \
 	blake2b-amd64-avx2.S blake2b-amd64-avx512.S \
@@ -238,6 +238,12 @@ else
 ppc_vcrypto_cflags =
 endif
 
+if ENABLE_AARCH64_NEON_INTRINSICS_EXTRA_CFLAGS
+aarch64_neon_cflags = -O2 -march=armv8-a+crypto
+else
+aarch64_neon_cflags =
+endif
+
 rijndael-ppc.o: $(srcdir)/rijndael-ppc.c Makefile
 	`echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
 
@@ -297,3 +303,9 @@ camellia-ppc9le.o: $(srcdir)/camellia-ppc9le.c Makefile
 
 camellia-ppc9le.lo: $(srcdir)/camellia-ppc9le.c Makefile
 	`echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
+
+camellia-aarch64-ce.o: $(srcdir)/camellia-aarch64-ce.c Makefile
+	`echo $(COMPILE) $(aarch64_neon_cflags) -c $< | $(instrumentation_munging) `
+
+camellia-aarch64-ce.lo: $(srcdir)/camellia-aarch64-ce.c Makefile
+	`echo $(LTCOMPILE) $(aarch64_neon_cflags) -c $< | $(instrumentation_munging) `
diff --git a/cipher/camellia-aarch64-ce.c b/cipher/camellia-aarch64-ce.c
new file mode 100644
index 00000000..76813e94
--- /dev/null
+++ b/cipher/camellia-aarch64-ce.c
@@ -0,0 +1,42 @@
+/* camellia-aarch64-ce.c - ARMv8/CE Camellia implementation
+ * Copyright (C) 2023 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#if defined(__AARCH64EL__) && \
+    defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
+    defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO) && \
+    defined(HAVE_COMPATIBLE_CC_AARCH64_NEON_INTRINSICS) && \
+    (__GNUC__ >= 4)
+
+#ifdef HAVE_GCC_ATTRIBUTE_OPTIMIZE
+# define FUNC_ATTR_OPT __attribute__((optimize("-O2")))
+#else
+# define FUNC_ATTR_OPT
+#endif
+
+#define SIMD128_OPT_ATTR FUNC_ATTR_OPT
+
+#define FUNC_ENC_BLK16 _gcry_camellia_aarch64ce_encrypt_blk16
+#define FUNC_DEC_BLK16 _gcry_camellia_aarch64ce_decrypt_blk16
+#define FUNC_KEY_SETUP _gcry_camellia_aarch64ce_keygen
+
+#include "camellia-simd128.h"
+
+#endif /* __AARCH64EL__ */
diff --git a/cipher/camellia-glue.c b/cipher/camellia-glue.c
index 46bbe182..0b07f2d1 100644
--- a/cipher/camellia-glue.c
+++ b/cipher/camellia-glue.c
@@ -119,6 +119,16 @@
 # define USE_PPC_CRYPTO 1
 #endif
 
+/* USE_AARCH64_CE indicates whether to enable ARMv8/CE accelerated code. */
+#undef USE_AARCH64_CE
+#if defined(__AARCH64EL__) && \
+    defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
+    defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO) && \
+    defined(HAVE_COMPATIBLE_CC_AARCH64_NEON_INTRINSICS) && \
+    (__GNUC__ >= 4)
+# define USE_AARCH64_CE 1
+#endif
+
 typedef struct
 {
   KEY_TABLE_TYPE keytable;
@@ -138,6 +148,9 @@ typedef struct
   unsigned int use_ppc8:1;
   unsigned int use_ppc9:1;
 #endif /*USE_PPC_CRYPTO*/
+#ifdef USE_AARCH64_CE
+  unsigned int use_aarch64ce:1;
+#endif /*USE_AARCH64_CE*/
 } CAMELLIA_context;
 
 /* Assembly implementations use SystemV ABI, ABI conversion and additional
@@ -472,6 +485,36 @@ static const int ppc_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE + 16 +
                                         2 * sizeof(void *);
 #endif /*USE_PPC_CRYPTO*/
 
+#ifdef USE_AARCH64_CE
+extern void _gcry_camellia_aarch64ce_encrypt_blk16(const void *key_table,
+						   void *out, const void *in,
+						   int key_length);
+
+extern void _gcry_camellia_aarch64ce_decrypt_blk16(const void *key_table,
+						   void *out, const void *in,
+						   int key_length);
+
+extern void _gcry_camellia_aarch64ce_keygen(void *key_table, const void *vkey,
+					    unsigned int keylen);
+
+void camellia_aarch64ce_enc_blk16(const CAMELLIA_context *ctx,
+				  unsigned char *out, const unsigned char *in)
+{
+  _gcry_camellia_aarch64ce_encrypt_blk16 (ctx->keytable, out, in,
+					  ctx->keybitlength / 8);
+}
+
+void camellia_aarch64ce_dec_blk16(const CAMELLIA_context *ctx,
+				  unsigned char *out, const unsigned char *in)
+{
+  _gcry_camellia_aarch64ce_decrypt_blk16 (ctx->keytable, out, in,
+					  ctx->keybitlength / 8);
+}
+
+static const int aarch64ce_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE + 16 +
+					      2 * sizeof(void *);
+#endif /*USE_AARCH64_CE*/
+
 static const char *selftest(void);
 
 static void _gcry_camellia_ctr_enc (void *context, unsigned char *ctr,
@@ -549,6 +592,9 @@ camellia_setkey(void *c, const byte *key, unsigned keylen,
   ctx->use_ppc9 = (hwf & HWF_PPC_VCRYPTO) && (hwf & HWF_PPC_ARCH_3_00);
   ctx->use_ppc = ctx->use_ppc8 || ctx->use_ppc9;
 #endif
+#ifdef USE_AARCH64_CE
+  ctx->use_aarch64ce = (hwf & HWF_ARM_AES) != 0;
+#endif
 
   ctx->keybitlength=keylen*8;
 
@@ -574,6 +620,10 @@ camellia_setkey(void *c, const byte *key, unsigned keylen,
     _gcry_camellia_ppc9_keygen(ctx->keytable, key, keylen);
   else if (ctx->use_ppc8)
     _gcry_camellia_ppc8_keygen(ctx->keytable, key, keylen);
+#endif
+#ifdef USE_AARCH64_CE
+  else if (ctx->use_aarch64ce)
+    _gcry_camellia_aarch64ce_keygen(ctx->keytable, key, keylen);
 #endif
   else
     {
@@ -754,6 +804,16 @@ camellia_encrypt_blk1_32 (void *priv, byte *outbuf, const byte *inbuf,
       num_blks -= 16;
     }
 #endif
+#ifdef USE_AARCH64_CE
+  while (ctx->use_aarch64ce && num_blks >= 16)
+    {
+      camellia_aarch64ce_enc_blk16 (ctx, outbuf, inbuf);
+      stack_burn_size = aarch64ce_burn_stack_depth;
+      outbuf += CAMELLIA_BLOCK_SIZE * 16;
+      inbuf += CAMELLIA_BLOCK_SIZE * 16;
+      num_blks -= 16;
+    }
+#endif
 
   while (num_blks)
     {
@@ -855,6 +915,16 @@ camellia_decrypt_blk1_32 (void *priv, byte *outbuf, const byte *inbuf,
       num_blks -= 16;
     }
 #endif
+#ifdef USE_AARCH64_CE
+  while (ctx->use_aarch64ce && num_blks >= 16)
+    {
+      camellia_aarch64ce_dec_blk16 (ctx, outbuf, inbuf);
+      stack_burn_size = aarch64ce_burn_stack_depth;
+      outbuf += CAMELLIA_BLOCK_SIZE * 16;
+      inbuf += CAMELLIA_BLOCK_SIZE * 16;
+      num_blks -= 16;
+    }
+#endif
 
   while (num_blks)
     {
diff --git a/configure.ac b/configure.ac
index a40a8135..0d5c9160 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2136,7 +2136,103 @@ fi
 
 
 #
-# Check whether PowerPC AltiVec/VSX intrinsics
+# Check whether compiler supports AArch64/NEON/crypto intrinsics
+#
+AC_CACHE_CHECK([whether compiler supports AArch64/NEON/crypto intrinsics],
+      [gcry_cv_cc_aarch64_neon_intrinsics],
+      [if test "$mpi_cpu_arch" != "aarch64" ||
+	  test "$try_asm_modules" != "yes" ; then
+	gcry_cv_cc_aarch64_neon_intrinsics="n/a"
+      else
+	gcry_cv_cc_aarch64_neon_intrinsics=no
+	AC_COMPILE_IFELSE([AC_LANG_SOURCE(
+	[[#include <arm_neon.h>
+	  #define __m128i uint64x2_t
+	  #define vpsrldq128(s, a, o) \
+	    ({ uint64x2_t __tmp = { 0, 0 }; \
+		o = (__m128i)vextq_u8((uint8x16_t)a, \
+				      (uint8x16_t)__tmp, (s) & 15); })
+	  #define vaesenclast128(a, b, o) \
+	    (o = (__m128i)vaeseq_u8((uint8x16_t)b, (uint8x16_t)a))
+	  #define memory_barrier_with_vec(a) __asm__("" : "+w"(a) :: "memory")
+	  static inline __attribute__((always_inline)) __m128i
+	  fn2(__m128i a)
+	  {
+	    vpsrldq128(2, a, a);
+	    return a;
+	  }
+	  __m128i fn(__m128i in)
+	  {
+	    __m128i x;
+	    memory_barrier_with_vec(in);
+	    x = fn2(in);
+	    memory_barrier_with_vec(x);
+	    vaesenclast128(in, x, in);
+	    memory_barrier_with_vec(in);
+	    return in;
+	  }
+	  ]])],
+	[gcry_cv_cc_aarch64_neon_intrinsics=yes])
+      fi])
+if test "$gcry_cv_cc_aarch64_neon_intrinsics" = "yes" ; then
+    AC_DEFINE(HAVE_COMPATIBLE_CC_AARCH64_NEON_INTRINSICS,1,
+	    [Defined if underlying compiler supports AArch64/NEON/crypto intrinsics])
+fi
+
+_gcc_cflags_save=$CFLAGS
+CFLAGS="$CFLAGS -O2 -march=armv8-a+crypto"
+
+if test "$gcry_cv_cc_aarch64_neon_intrinsics" = "no" &&
+   test "$mpi_cpu_arch" = "aarch64" &&
+   test "$try_asm_modules" = "yes" ; then
+  AC_CACHE_CHECK([whether compiler supports AArch64/NEON/crypto intrinsics with extra GCC flags],
+    [gcry_cv_cc_aarch64_neon_intrinsics_cflags],
+    [gcry_cv_cc_aarch64_neon_intrinsics_cflags=no
+    AC_COMPILE_IFELSE([AC_LANG_SOURCE(
+      [[#include <arm_neon.h>
+	#define __m128i uint64x2_t
+	#define vpsrldq128(s, a, o) \
+	  ({ uint64x2_t __tmp = { 0, 0 }; \
+	      o = (__m128i)vextq_u8((uint8x16_t)a, \
+				    (uint8x16_t)__tmp, (s) & 15); })
+	#define vaesenclast128(a, b, o) \
+	  (o = (__m128i)vaeseq_u8((uint8x16_t)b, (uint8x16_t)a))
+	#define memory_barrier_with_vec(a) __asm__("" : "+w"(a) :: "memory")
+	static inline __attribute__((always_inline)) __m128i
+	fn2(__m128i a)
+	{
+	  vpsrldq128(2, a, a);
+	  return a;
+	}
+	__m128i fn(__m128i in)
+	{
+	  __m128i x;
+	  memory_barrier_with_vec(in);
+	  x = fn2(in);
+	  memory_barrier_with_vec(x);
+	  vaesenclast128(in, x, in);
+	  memory_barrier_with_vec(in);
+	  return in;
+	}
+	]])],
+      [gcry_cv_cc_aarch64_neon_intrinsics_cflags=yes])])
+  if test "$gcry_cv_cc_aarch64_neon_intrinsics_cflags" = "yes" ; then
+    AC_DEFINE(HAVE_COMPATIBLE_CC_AARCH64_NEON_INTRINSICS,1,
+	      [Defined if underlying compiler supports AArch64/NEON/crypto intrinsics])
+    AC_DEFINE(HAVE_COMPATIBLE_CC_AARCH64_NEON_INTRINSICS_WITH_CFLAGS,1,
+	      [Defined if underlying compiler supports AArch64/NEON/crypto intrinsics with extra GCC flags])
+  fi
+fi
+
+AM_CONDITIONAL(ENABLE_AARCH64_NEON_INTRINSICS_EXTRA_CFLAGS,
+	       test "$gcry_cv_cc_aarch64_neon_intrinsics_cflags" = "yes")
+
+# Restore flags.
+CFLAGS=$_gcc_cflags_save;
+
+
+#
+# Check whether compiler supports PowerPC AltiVec/VSX intrinsics
 #
 AC_CACHE_CHECK([whether compiler supports PowerPC AltiVec/VSX/crypto intrinsics],
       [gcry_cv_cc_ppc_altivec],
@@ -2173,8 +2269,8 @@ _gcc_cflags_save=$CFLAGS
 CFLAGS="$CFLAGS -O2 -maltivec -mvsx -mcrypto"
 
 if test "$gcry_cv_cc_ppc_altivec" = "no" &&
-    test "$mpi_cpu_arch" = "ppc" &&
-    test "$try_asm_modules" == "yes" ; then
+   test "$mpi_cpu_arch" = "ppc" &&
+   test "$try_asm_modules" = "yes" ; then
   AC_CACHE_CHECK([whether compiler supports PowerPC AltiVec/VSX/crypto intrinsics with extra GCC flags],
     [gcry_cv_cc_ppc_altivec_cflags],
     [gcry_cv_cc_ppc_altivec_cflags=no
@@ -2193,7 +2289,8 @@ if test "$gcry_cv_cc_ppc_altivec" = "no" &&
 	  vecu32 y = vec_vsx_ld (0, (unsigned int*)0);
 	  y = vec_sld_u32 (y, y, 3);
 	  return vec_cipher_be (t, in) ^ (block)y;
-	}]])],
+	}
+	]])],
       [gcry_cv_cc_ppc_altivec_cflags=yes])])
   if test "$gcry_cv_cc_ppc_altivec_cflags" = "yes" ; then
     AC_DEFINE(HAVE_COMPATIBLE_CC_PPC_ALTIVEC,1,
@@ -2966,6 +3063,7 @@ if test "$found" = "1" ; then
       aarch64-*-*)
          # Build with the assembly implementation
          GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS camellia-aarch64.lo"
+         GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS camellia-aarch64-ce.lo"
       ;;
       powerpc64le-*-*)
          # Build with the POWER vector implementations
-- 
2.37.2




More information about the Gcrypt-devel mailing list