[PATCH 3/6] rijndael-ppc: add key setup and enable single block PowerPC AES

Jussi Kivilinna jussi.kivilinna at iki.fi
Fri Aug 23 18:52:10 CEST 2019


* cipher/Makefile.am: Add 'rijndael-ppc.c'.
* cipher/rijndael-internal.h (USE_PPC_CRYPTO): New.
(RIJNDAEL_context): Add 'use_ppc_crypto'.
* cipher/rijndael-ppc.c (backwards, swap_if_le): Remove.
(u128_t, ALWAYS_INLINE, NO_INLINE, NO_INSTRUMENT_FUNCTION)
(ASM_FUNC_ATTR, ASM_FUNC_ATTR_INLINE, ASM_FUNC_ATTR_NOINLINE)
(ALIGNED_LOAD, ALIGNED_STORE, VEC_LOAD_BE, VEC_STORE_BE)
(vec_bswap32_const, vec_aligned_ld, vec_load_be_const)
(vec_load_be, vec_aligned_st, vec_store_be, _gcry_aes_sbox4_ppc8)
(_gcry_aes_ppc8_setkey, _gcry_aes_ppc8_prepare_decryption)
(aes_ppc8_encrypt_altivec, aes_ppc8_decrypt_altivec): New.
(_gcry_aes_ppc8_encrypt, _gcry_aes_ppc8_decrypt): Rewrite.
(_gcry_aes_ppc8_ocb_crypt): Comment out.
* cipher/rijndael.c [USE_PPC_CRYPTO] (_gcry_aes_ppc8_setkey)
(_gcry_aes_ppc8_prepare_decryption, _gcry_aes_ppc8_encrypt)
(_gcry_aes_ppc8_decrypt): New prototypes.
(do_setkey) [USE_PPC_CRYPTO]: Add setup for PowerPC AES.
(prepare_decryption) [USE_PPC_CRYPTO]: Ditto.
* configure.ac: Add 'rijndael-ppc.lo'.
(gcry_cv_cc_ppc_altivec, gcry_cv_cc_ppc_altivec_cflags)
(gcry_cv_gcc_inline_asm_ppc_altivec)
(gcry_cv_gcc_inline_asm_ppc_arch_3_00): New checks.
--

Benchmark on POWER8 ~3.8 GHz:
Before:
 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte
        ECB enc |      7.27 ns/B     131.2 MiB/s     27.61 c/B
        ECB dec |      7.70 ns/B     123.8 MiB/s     29.28 c/B
        CBC enc |      6.38 ns/B     149.5 MiB/s     24.24 c/B
        CBC dec |      6.17 ns/B     154.5 MiB/s     23.45 c/B
        CFB enc |      6.45 ns/B     147.9 MiB/s     24.51 c/B
        CFB dec |      6.20 ns/B     153.8 MiB/s     23.57 c/B
        OFB enc |      7.36 ns/B     129.6 MiB/s     27.96 c/B
        OFB dec |      7.36 ns/B     129.6 MiB/s     27.96 c/B
        CTR enc |      6.22 ns/B     153.2 MiB/s     23.65 c/B
        CTR dec |      6.22 ns/B     153.3 MiB/s     23.65 c/B
        XTS enc |      6.67 ns/B     142.9 MiB/s     25.36 c/B
        XTS dec |      6.70 ns/B     142.3 MiB/s     25.46 c/B
        CCM enc |     12.61 ns/B     75.60 MiB/s     47.93 c/B
        CCM dec |     12.62 ns/B     75.56 MiB/s     47.96 c/B
       CCM auth |      6.41 ns/B     148.8 MiB/s     24.36 c/B
        EAX enc |     12.62 ns/B     75.55 MiB/s     47.96 c/B
        EAX dec |     12.62 ns/B     75.55 MiB/s     47.97 c/B
       EAX auth |      6.39 ns/B     149.2 MiB/s     24.30 c/B
        GCM enc |      9.81 ns/B     97.24 MiB/s     37.27 c/B
        GCM dec |      9.81 ns/B     97.20 MiB/s     37.28 c/B
       GCM auth |      3.59 ns/B     265.8 MiB/s     13.63 c/B
        OCB enc |      6.39 ns/B     149.3 MiB/s     24.27 c/B
        OCB dec |      6.38 ns/B     149.5 MiB/s     24.25 c/B
       OCB auth |      6.35 ns/B     150.2 MiB/s     24.13 c/B

After:
        ECB enc |      1.29 ns/B     737.7 MiB/s      4.91 c/B
        ECB dec |      1.34 ns/B     711.1 MiB/s      5.10 c/B
        CBC enc |      2.13 ns/B     448.5 MiB/s      8.08 c/B
        CBC dec |      1.05 ns/B     908.0 MiB/s      3.99 c/B
        CFB enc |      2.17 ns/B     439.9 MiB/s      8.24 c/B
        CFB dec |      2.22 ns/B     429.8 MiB/s      8.43 c/B
        OFB enc |      1.49 ns/B     640.1 MiB/s      5.66 c/B
        OFB dec |      1.49 ns/B     640.1 MiB/s      5.66 c/B
        CTR enc |      2.21 ns/B     432.5 MiB/s      8.38 c/B
        CTR dec |      2.20 ns/B     432.5 MiB/s      8.38 c/B
        XTS enc |      2.32 ns/B     410.6 MiB/s      8.83 c/B
        XTS dec |      2.33 ns/B     409.7 MiB/s      8.85 c/B
        CCM enc |      4.36 ns/B     218.7 MiB/s     16.57 c/B
        CCM dec |      4.36 ns/B     218.8 MiB/s     16.56 c/B
       CCM auth |      2.17 ns/B     440.4 MiB/s      8.23 c/B
        EAX enc |      4.37 ns/B     218.3 MiB/s     16.60 c/B
        EAX dec |      4.36 ns/B     218.7 MiB/s     16.57 c/B
       EAX auth |      2.16 ns/B     440.7 MiB/s      8.22 c/B
        GCM enc |      5.78 ns/B     165.0 MiB/s     21.96 c/B
        GCM dec |      5.78 ns/B     165.0 MiB/s     21.96 c/B
       GCM auth |      3.59 ns/B     265.9 MiB/s     13.63 c/B
        OCB enc |      2.33 ns/B     410.1 MiB/s      8.84 c/B
        OCB dec |      2.34 ns/B     407.2 MiB/s      8.90 c/B
       OCB auth |      2.32 ns/B     411.1 MiB/s      8.82 c/B
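
These tables are bench-slope output; assuming a built source tree, the
AES figures should be reproducible with:

  tests/bench-slope cipher aes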

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 0 files changed
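
A note on the byte-order helpers added to cipher/rijndael-ppc.c below:
on little-endian, VEC_LOAD_BE pairs an lxvw4x load with a vperm so that
both endiannesses end up with the same big-endian lane layout (the
permute constant is used in complemented form, the usual adjustment for
the raw vperm instruction, which numbers bytes big-endian).  As a
reference for what each 32-bit lane holds (a sketch only, not code from
this patch; u32 as in libgcrypt's types):

  static void
  vec_load_be_ref (const unsigned char p[16], u32 out[4])
  {
    int i;
    /* Lane i is the big-endian u32 built from bytes 4*i .. 4*i+3.  */
    for (i = 0; i < 4; i++)
      out[i] = ((u32)p[4 * i + 0] << 24) | ((u32)p[4 * i + 1] << 16)
             | ((u32)p[4 * i + 2] <<  8) | ((u32)p[4 * i + 3] <<  0);
  }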

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 2aae82e27..1f2d8ec97 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -96,6 +96,7 @@ EXTRA_libcipher_la_SOURCES = \
 	rijndael-ssse3-amd64.c rijndael-ssse3-amd64-asm.S  \
 	rijndael-armv8-ce.c rijndael-armv8-aarch32-ce.S    \
 	rijndael-armv8-aarch64-ce.S rijndael-aarch64.S     \
+	rijndael-ppc.c                                     \
 	rmd160.c \
 	rsa.c \
 	salsa20.c salsa20-amd64.S salsa20-armv7-neon.S \
@@ -197,3 +198,15 @@ crc-intel-pclmul.o: $(srcdir)/crc-intel-pclmul.c Makefile
 
 crc-intel-pclmul.lo: $(srcdir)/crc-intel-pclmul.c Makefile
 	`echo $(LTCOMPILE) -c $< | $(instrumentation_munging) `
+
+if ENABLE_PPC_VCRYPTO_EXTRA_CFLAGS
+ppc_vcrypto_cflags = -maltivec -mvsx -mcrypto
+else
+ppc_vcrypto_cflags =
+endif
+
+rijndael-ppc.o: $(srcdir)/rijndael-ppc.c Makefile
+	`echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
+
+rijndael-ppc.lo: $(srcdir)/rijndael-ppc.c Makefile
+	`echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
diff --git a/cipher/rijndael-internal.h b/cipher/rijndael-internal.h
index 78b08e8f8..5150a69d7 100644
--- a/cipher/rijndael-internal.h
+++ b/cipher/rijndael-internal.h
@@ -75,7 +75,7 @@
 #   define USE_PADLOCK 1
 #  endif
 # endif
-#endif /*ENABLE_PADLOCK_SUPPORT*/
+#endif /* ENABLE_PADLOCK_SUPPORT */
 
 /* USE_AESNI inidicates whether to compile with Intel AES-NI code.  We
    need the vector-size attribute which seems to be available since
@@ -104,6 +104,18 @@
 # endif
 #endif /* ENABLE_ARM_CRYPTO_SUPPORT */
 
+/* USE_PPC_CRYPTO indicates whether to enable PowerPC vector crypto
+ * accelerated code. */
+#undef USE_PPC_CRYPTO
+#ifdef ENABLE_PPC_CRYPTO_SUPPORT
+# if defined(HAVE_COMPATIBLE_CC_PPC_ALTIVEC) && \
+     defined(HAVE_GCC_INLINE_ASM_PPC_ALTIVEC)
+#  if __GNUC__ >= 4
+#   define USE_PPC_CRYPTO 1
+#  endif
+# endif
+#endif /* ENABLE_PPC_CRYPTO_SUPPORT */
+
 struct RIJNDAEL_context_s;
 
 typedef unsigned int (*rijndael_cryptfn_t)(const struct RIJNDAEL_context_s *ctx,
@@ -154,6 +166,9 @@ typedef struct RIJNDAEL_context_s
 #ifdef USE_ARM_CE
   unsigned int use_arm_ce:1;          /* ARMv8 CE shall be used.  */
 #endif /*USE_ARM_CE*/
+#ifdef USE_PPC_CRYPTO
+  unsigned int use_ppc_crypto:1;      /* PowerPC crypto shall be used.  */
+#endif /*USE_PPC_CRYPTO*/
   rijndael_cryptfn_t encrypt_fn;
   rijndael_cryptfn_t decrypt_fn;
   rijndael_prefetchfn_t prefetch_enc_fn;
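
The macro and context flag added above form a two-level gate:
USE_PPC_CRYPTO is decided at build time from the configure results,
while use_ppc_crypto is set per context at run time.  Conceptually (a
sketch only; the real dispatch in rijndael.c goes through the
encrypt_fn/decrypt_fn pointers set in do_setkey):

  static unsigned int
  encrypt_dispatch_sketch (const RIJNDAEL_context *ctx,
                           unsigned char *out, const unsigned char *in)
  {
  #ifdef USE_PPC_CRYPTO            /* build time: compiler/asm support */
    if (ctx->use_ppc_crypto)       /* run time: HWF_PPC_VCRYPTO found  */
      return _gcry_aes_ppc8_encrypt (ctx, out, in);
  #endif
    return do_encrypt (ctx, out, in);  /* generic path in rijndael.c */
  }
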
diff --git a/cipher/rijndael-ppc.c b/cipher/rijndael-ppc.c
index 2e5dd2f89..a7c47a876 100644
--- a/cipher/rijndael-ppc.c
+++ b/cipher/rijndael-ppc.c
@@ -1,5 +1,6 @@
-/* Rijndael (AES) for GnuPG - PowerPC Vector Crypto AES
+/* Rijndael (AES) for GnuPG - PowerPC Vector Crypto AES implementation
  * Copyright (C) 2019 Shawn Landden <shawn at git.icu>
+ * Copyright (C) 2019 Jussi Kivilinna <jussi.kivilinna at iki.fi>
  *
  * This file is part of Libgcrypt.
  *
@@ -24,138 +25,397 @@
 
 #include <config.h>
 
-/* PPC AES extensions */
-#include <altivec.h>
 #include "rijndael-internal.h"
 #include "cipher-internal.h"
+#include "bufhelp.h"
+
+#ifdef USE_PPC_CRYPTO
+
+#include <altivec.h>
+
 
 typedef vector unsigned char block;
-static const vector unsigned char backwards =
-  { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
-
-#ifdef __LITTLE_ENDIAN__
-#define swap_if_le(a) \
-  vec_perm(a, a, backwards)
-#elif __BIG_ENDIAN__
-#define swap_if_le(a) (a)
+
+typedef union
+{
+  u32 data32[4];
+} __attribute__((packed, aligned(1), may_alias)) u128_t;
+
+
+#define ALWAYS_INLINE inline __attribute__((always_inline))
+#define NO_INLINE __attribute__((noinline))
+#define NO_INSTRUMENT_FUNCTION __attribute__((no_instrument_function))
+
+#define ASM_FUNC_ATTR          NO_INSTRUMENT_FUNCTION
+#define ASM_FUNC_ATTR_INLINE   ASM_FUNC_ATTR ALWAYS_INLINE
+#define ASM_FUNC_ATTR_NOINLINE ASM_FUNC_ATTR NO_INLINE
+
+
+#define ALIGNED_LOAD(in_ptr) \
+  (vec_aligned_ld (0, (const unsigned char *)(in_ptr)))
+
+#define ALIGNED_STORE(out_ptr, vec) \
+  (vec_aligned_st ((vec), 0, (unsigned char *)(out_ptr)))
+
+#define VEC_LOAD_BE(in_ptr, bige_const) \
+  (vec_load_be (0, (const unsigned char *)(in_ptr), bige_const))
+
+#define VEC_STORE_BE(out_ptr, vec, bige_const) \
+  (vec_store_be ((vec), 0, (unsigned char *)(out_ptr), bige_const))
+
+
+static const block vec_bswap32_const =
+  { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };
+
+
+static ASM_FUNC_ATTR_INLINE block
+vec_aligned_ld(unsigned long offset, const unsigned char *ptr)
+{
+#ifndef WORDS_BIGENDIAN
+  block vec;
+  __asm__ ("lvx %0,%1,%2\n\t"
+	   : "=v" (vec)
+	   : "r" (offset), "r" ((uintptr_t)ptr)
+	   : "memory");
+  return vec;
 #else
-#error "What endianness?"
+  return vec_vsx_ld (offset, ptr);
 #endif
+}
 
-/* Passes in AltiVec registers (big-endian)
- * sadly compilers don't know how to unroll outer loops into
- * inner loops with more registers on static functions,
- * so that this can be properly optimized for OOO multi-issue
- * without having to hand-unroll.
- */
-static block _gcry_aes_ppc8_encrypt_altivec (const RIJNDAEL_context *ctx,
-                                             block a)
+
+static ASM_FUNC_ATTR_INLINE block
+vec_load_be_const(void)
+{
+#ifndef WORDS_BIGENDIAN
+  return ~ALIGNED_LOAD(&vec_bswap32_const);
+#else
+  static const block vec_dummy = { 0 };
+  return vec_dummy;
+#endif
+}
+
+
+static ASM_FUNC_ATTR_INLINE block
+vec_load_be(unsigned long offset, const unsigned char *ptr,
+	    block be_bswap_const)
+{
+#ifndef WORDS_BIGENDIAN
+  block vec;
+  /* GCC's vec_vsx_ld generates two instructions on little-endian, so use
+   * lxvw4x directly instead. */
+  __asm__ ("lxvw4x %x0,%1,%2\n\t"
+	   : "=wa" (vec)
+	   : "r" (offset), "r" ((uintptr_t)ptr)
+	   : "memory");
+  __asm__ ("vperm %0,%1,%1,%2\n\t"
+	   : "=v" (vec)
+	   : "v" (vec), "v" (be_bswap_const));
+  return vec;
+#else
+  (void)be_bswap_const;
+  return vec_vsx_ld (offset, ptr);
+#endif
+}
+
+
+static ASM_FUNC_ATTR_INLINE void
+vec_aligned_st(block vec, unsigned long offset, unsigned char *ptr)
+{
+#ifndef WORDS_BIGENDIAN
+  __asm__ ("stvx %0,%1,%2\n\t"
+	   :
+	   : "v" (vec), "r" (offset), "r" ((uintptr_t)ptr)
+	   : "memory");
+#else
+  vec_vsx_st (vec, offset, ptr);
+#endif
+}
+
+
+static ASM_FUNC_ATTR_INLINE void
+vec_store_be(block vec, unsigned long offset, unsigned char *ptr,
+	     block be_bswap_const)
+{
+#ifndef WORDS_BIGENDIAN
+  /* GCC's vec_vsx_st generates two instructions on little-endian, so use
+   * stxvw4x directly instead. */
+  __asm__ ("vperm %0,%1,%1,%2\n\t"
+	   : "=v" (vec)
+	   : "v" (vec), "v" (be_bswap_const));
+  __asm__ ("stxvw4x %x0,%1,%2\n\t"
+	   :
+	   : "wa" (vec), "r" (offset), "r" ((uintptr_t)ptr)
+	   : "memory");
+#else
+  (void)be_bswap_const;
+  vec_vsx_st (vec, offset, ptr);
+#endif
+}
+
+
+static ASM_FUNC_ATTR_INLINE u32
+_gcry_aes_sbox4_ppc8(u32 fourbytes)
+{
+  union
+    {
+      PROPERLY_ALIGNED_TYPE dummy;
+      block data_vec;
+      u32 data32[4];
+    } u;
+
+  u.data32[0] = fourbytes;
+  u.data_vec = vec_sbox_be(u.data_vec);
+  return u.data32[0];
+}
+
+void
+_gcry_aes_ppc8_setkey (RIJNDAEL_context *ctx, const byte *key)
+{
+  const block bige_const = vec_load_be_const();
+  union
+    {
+      PROPERLY_ALIGNED_TYPE dummy;
+      byte data[MAXKC][4];
+      u32 data32[MAXKC];
+    } tkk[2];
+  unsigned int rounds = ctx->rounds;
+  int KC = rounds - 6;
+  unsigned int keylen = KC * 4;
+  u128_t *ekey = (u128_t *)(void *)ctx->keyschenc;
+  unsigned int i, r, t;
+  byte rcon = 1;
+  int j;
+#define k      tkk[0].data
+#define k_u32  tkk[0].data32
+#define tk     tkk[1].data
+#define tk_u32 tkk[1].data32
+#define W      (ctx->keyschenc)
+#define W_u32  (ctx->keyschenc32)
+
+  for (i = 0; i < keylen; i++)
+    {
+      k[i >> 2][i & 3] = key[i];
+    }
+
+  for (j = KC-1; j >= 0; j--)
+    {
+      tk_u32[j] = k_u32[j];
+    }
+  r = 0;
+  t = 0;
+  /* Copy values into round key array.  */
+  for (j = 0; (j < KC) && (r < rounds + 1); )
+    {
+      for (; (j < KC) && (t < 4); j++, t++)
+        {
+          W_u32[r][t] = le_bswap32(tk_u32[j]);
+        }
+      if (t == 4)
+        {
+          r++;
+          t = 0;
+        }
+    }
+  while (r < rounds + 1)
+    {
+      tk_u32[0] ^=
+	le_bswap32(
+	  _gcry_aes_sbox4_ppc8(rol(le_bswap32(tk_u32[KC - 1]), 24)) ^ rcon);
+
+      if (KC != 8)
+        {
+          for (j = 1; j < KC; j++)
+            {
+              tk_u32[j] ^= tk_u32[j-1];
+            }
+        }
+      else
+        {
+          for (j = 1; j < KC/2; j++)
+            {
+              tk_u32[j] ^= tk_u32[j-1];
+            }
+
+          tk_u32[KC/2] ^=
+	    le_bswap32(_gcry_aes_sbox4_ppc8(le_bswap32(tk_u32[KC/2 - 1])));
+
+          for (j = KC/2 + 1; j < KC; j++)
+            {
+              tk_u32[j] ^= tk_u32[j-1];
+            }
+        }
+
+      /* Copy values into round key array.  */
+      for (j = 0; (j < KC) && (r < rounds + 1); )
+        {
+          for (; (j < KC) && (t < 4); j++, t++)
+            {
+              W_u32[r][t] = le_bswap32(tk_u32[j]);
+            }
+          if (t == 4)
+            {
+              r++;
+              t = 0;
+            }
+        }
+
+      rcon = (rcon << 1) ^ ((rcon >> 7) * 0x1b);
+    }
+
+  /* Store in big-endian order. */
+  for (r = 0; r <= rounds; r++)
+    {
+#ifndef WORDS_BIGENDIAN
+      VEC_STORE_BE(&ekey[r], ALIGNED_LOAD(&ekey[r]), bige_const);
+#else
+      block rvec = ALIGNED_LOAD(&ekey[r]);
+      ALIGNED_STORE(&ekey[r],
+		    vec_perm(rvec, rvec, vec_bswap32_const));
+      (void)bige_const;
+#endif
+    }
+
+#undef W
+#undef tk
+#undef k
+#undef W_u32
+#undef tk_u32
+#undef k_u32
+  wipememory(&tkk, sizeof(tkk));
+}
+
+
+/* Make a decryption key from an encryption key. */
+void
+_gcry_aes_ppc8_prepare_decryption (RIJNDAEL_context *ctx)
 {
+  u128_t *ekey = (u128_t *)(void *)ctx->keyschenc;
+  u128_t *dkey = (u128_t *)(void *)ctx->keyschdec;
+  int rounds = ctx->rounds;
+  int rr;
   int r;
+
+  r = 0;
+  rr = rounds;
+  for (r = 0, rr = rounds; r <= rounds; r++, rr--)
+    {
+      ALIGNED_STORE(&dkey[r], ALIGNED_LOAD(&ekey[rr]));
+    }
+}
+
+
+static ASM_FUNC_ATTR_INLINE block
+aes_ppc8_encrypt_altivec (const RIJNDAEL_context *ctx, block a)
+{
+  u128_t *rk = (u128_t *)ctx->keyschenc;
   int rounds = ctx->rounds;
-  block *rk = (block*)ctx->keyschenc;
+  int r;
 
-  a = rk[0] ^ a;
-  for (r = 1;r < rounds;r++)
+#define DO_ROUND(r) (a = vec_cipher_be (a, ALIGNED_LOAD (&rk[r])))
+
+  a = ALIGNED_LOAD(&rk[0]) ^ a;
+  DO_ROUND(1);
+  DO_ROUND(2);
+  DO_ROUND(3);
+  DO_ROUND(4);
+  DO_ROUND(5);
+  DO_ROUND(6);
+  DO_ROUND(7);
+  DO_ROUND(8);
+  DO_ROUND(9);
+  r = 10;
+  if (rounds >= 12)
     {
-      __asm__ volatile ("vcipher %0, %0, %1\n\t"
-        :"+v" (a)
-        :"v" (rk[r])
-      );
+      DO_ROUND(10);
+      DO_ROUND(11);
+      r = 12;
+      if (rounds > 12)
+	{
+	  DO_ROUND(12);
+	  DO_ROUND(13);
+	  r = 14;
+	}
     }
-  __asm__ volatile ("vcipherlast %0, %0, %1\n\t"
-    :"+v" (a)
-    :"v" (rk[r])
-  );
+  a = vec_cipherlast_be(a, ALIGNED_LOAD(&rk[r]));
+
+#undef DO_ROUND
+
   return a;
 }
 
 
-static block _gcry_aes_ppc8_decrypt_altivec (const RIJNDAEL_context *ctx,
-                                             block a)
+static ASM_FUNC_ATTR_INLINE block
+aes_ppc8_decrypt_altivec (const RIJNDAEL_context *ctx, block a)
 {
-  int r;
+  u128_t *rk = (u128_t *)ctx->keyschdec;
   int rounds = ctx->rounds;
-  block *rk = (block*)ctx->keyschdec;
+  int r;
 
-  a = rk[0] ^ a;
-  for (r = 1;r < rounds;r++)
+#define DO_ROUND(r) (a = vec_ncipher_be (a, ALIGNED_LOAD (&rk[r])))
+
+  a = ALIGNED_LOAD(&rk[0]) ^ a;
+  DO_ROUND(1);
+  DO_ROUND(2);
+  DO_ROUND(3);
+  DO_ROUND(4);
+  DO_ROUND(5);
+  DO_ROUND(6);
+  DO_ROUND(7);
+  DO_ROUND(8);
+  DO_ROUND(9);
+  r = 10;
+  if (rounds >= 12)
     {
-      __asm__ volatile ("vncipher %0, %0, %1\n\t"
-        :"+v" (a)
-        :"v" (rk[r])
-      );
+      DO_ROUND(10);
+      DO_ROUND(11);
+      r = 12;
+      if (rounds > 12)
+	{
+	  DO_ROUND(12);
+	  DO_ROUND(13);
+	  r = 14;
+	}
     }
-  __asm__ volatile ("vncipherlast %0, %0, %1\n\t"
-    :"+v" (a)
-    :"v" (rk[r])
-  );
+  a = vec_ncipherlast_be(a, ALIGNED_LOAD(&rk[r]));
+
+#undef DO_ROUND
+
   return a;
 }
 
+
 unsigned int _gcry_aes_ppc8_encrypt (const RIJNDAEL_context *ctx,
 				     unsigned char *b,
 				     const unsigned char *a)
 {
-  uintptr_t zero = 0;
+  const block bige_const = vec_load_be_const();
   block sa;
 
-  if ((uintptr_t)a % 16 == 0)
-    {
-      sa = vec_ld (0, a);
-    }
-  else
-    {
-      block unalignedprev, unalignedcur;
-      unalignedprev = vec_ld (0, a);
-      unalignedcur = vec_ld (16, a);
-      sa = vec_perm (unalignedprev, unalignedcur, vec_lvsl(0, a));
-    }
-
-  sa = swap_if_le(sa);
-  sa = _gcry_aes_ppc8_encrypt_altivec(ctx, sa);
-
-  __asm__ volatile ("stxvb16x %x0, %1, %2\n\t"
-    :
-    : "wa" (sa), "r" (zero), "r" ((uintptr_t)b));
+  sa = VEC_LOAD_BE (a, bige_const);
+  sa = aes_ppc8_encrypt_altivec (ctx, sa);
+  VEC_STORE_BE (b, sa, bige_const);
 
   return 0; /* does not use stack */
 }
 
+
 unsigned int _gcry_aes_ppc8_decrypt (const RIJNDAEL_context *ctx,
 				     unsigned char *b,
 				     const unsigned char *a)
 {
-  uintptr_t zero = 0;
-  block sa, unalignedprev, unalignedcur;
-
-  if ((uintptr_t)a % 16 == 0)
-    {
-      sa = vec_ld(0, a);
-    }
-  else
-    {
-      unalignedprev = vec_ld (0, a);
-      unalignedcur = vec_ld (16, a);
-      sa = vec_perm (unalignedprev, unalignedcur, vec_lvsl(0, a));
-    }
+  const block bige_const = vec_load_be_const();
+  block sa;
 
-  sa = swap_if_le (sa);
-  sa = _gcry_aes_ppc8_decrypt_altivec  (ctx, sa);
+  sa = VEC_LOAD_BE (a, bige_const);
+  sa = aes_ppc8_decrypt_altivec (ctx, sa);
+  VEC_STORE_BE (b, sa, bige_const);
 
-  if ((uintptr_t)b % 16 == 0)
-    {
-      vec_vsx_st(swap_if_le(sa), 0, b);
-    }
-  else
-    {
-      __asm__ volatile ("stxvb16x %x0, %1, %2\n\t"
-	:
-	: "wa" (sa), "r" (zero), "r" ((uintptr_t)b));
-    }
   return 0; /* does not use stack */
 }
 
+
+#if 0
 size_t _gcry_aes_ppc8_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
                                             const void *inbuf_arg, size_t nblocks,
                                             int encrypt)
@@ -673,4 +933,6 @@ size_t _gcry_aes_ppc8_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
     }
   return 0;
 }
+#endif
 
+#endif /* USE_PPC_CRYPTO */
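
For reference, the loop structure in _gcry_aes_ppc8_setkey above is the
standard FIPS-197 key expansion, with SubWord done four bytes at a time
through vec_sbox_be (via _gcry_aes_sbox4_ppc8) and the round constant
advanced by GF(2^8) doubling.  The same recurrence in plain C (a sketch
only; sub_word/rot_word are hypothetical stand-ins for what
_gcry_aes_sbox4_ppc8 and rol do above):

  /* Hypothetical helpers: SubWord (S-box on each byte) and RotWord.  */
  static u32 sub_word (u32 w);
  static u32 rot_word (u32 w);

  static void
  aes_key_expansion_sketch (u32 *W, const u32 *key, int KC, int rounds)
  {
    unsigned char rcon = 1;      /* 0x01, 0x02, ... 0x80, 0x1b, 0x36 */
    int i, nwords = 4 * (rounds + 1);

    for (i = 0; i < KC; i++)
      W[i] = key[i];
    for (; i < nwords; i++)
      {
        u32 t = W[i - 1];
        if (i % KC == 0)
          {
            t = sub_word (rot_word (t)) ^ rcon;
            rcon = (rcon << 1) ^ ((rcon >> 7) * 0x1b); /* xtime doubling */
          }
        else if (KC == 8 && i % KC == 4)
          t = sub_word (t);      /* extra SubWord step, AES-256 only */
        W[i] = W[i - KC] ^ t;
      }
  }

_gcry_aes_ppc8_prepare_decryption above only reverses the round-key
order; the vncipher/vncipherlast instructions take the unmodified
encryption round keys, so no InvMixColumns pass over the schedule is
needed.
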
diff --git a/cipher/rijndael.c b/cipher/rijndael.c
index 2c9aa6733..8a27dfe0b 100644
--- a/cipher/rijndael.c
+++ b/cipher/rijndael.c
@@ -199,6 +199,19 @@ extern void _gcry_aes_armv8_ce_xts_crypt (void *context, unsigned char *tweak,
                                           size_t nblocks, int encrypt);
 #endif /*USE_ARM_ASM*/
 
+#ifdef USE_PPC_CRYPTO
+/* PowerPC Crypto implementations of AES */
+extern void _gcry_aes_ppc8_setkey(RIJNDAEL_context *ctx, const byte *key);
+extern void _gcry_aes_ppc8_prepare_decryption(RIJNDAEL_context *ctx);
+
+extern unsigned int _gcry_aes_ppc8_encrypt(const RIJNDAEL_context *ctx,
+					   unsigned char *dst,
+					   const unsigned char *src);
+extern unsigned int _gcry_aes_ppc8_decrypt(const RIJNDAEL_context *ctx,
+					   unsigned char *dst,
+					   const unsigned char *src);
+#endif /*USE_PPC_CRYPTO*/
+
 static unsigned int do_encrypt (const RIJNDAEL_context *ctx, unsigned char *bx,
                                 const unsigned char *ax);
 static unsigned int do_decrypt (const RIJNDAEL_context *ctx, unsigned char *bx,
@@ -280,7 +293,7 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen,
   int i,j, r, t, rconpointer = 0;
   int KC;
 #if defined(USE_AESNI) || defined(USE_PADLOCK) || defined(USE_SSSE3) \
-    || defined(USE_ARM_CE)
+    || defined(USE_ARM_CE) || defined(USE_PPC_CRYPTO)
   unsigned int hwfeatures;
 #endif
 
@@ -324,7 +337,7 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen,
   ctx->rounds = rounds;
 
 #if defined(USE_AESNI) || defined(USE_PADLOCK) || defined(USE_SSSE3) \
-    || defined(USE_ARM_CE)
+    || defined(USE_ARM_CE) || defined(USE_PPC_CRYPTO)
   hwfeatures = _gcry_get_hw_features ();
 #endif
 
@@ -341,6 +354,9 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen,
 #ifdef USE_ARM_CE
   ctx->use_arm_ce = 0;
 #endif
+#ifdef USE_PPC_CRYPTO
+  ctx->use_ppc_crypto = 0;
+#endif
 
   if (0)
     {
@@ -420,6 +436,19 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen,
           hd->bulk.xts_crypt = _gcry_aes_armv8_ce_xts_crypt;
         }
     }
+#endif
+#ifdef USE_PPC_CRYPTO
+  else if (hwfeatures & HWF_PPC_VCRYPTO)
+    {
+      ctx->encrypt_fn = _gcry_aes_ppc8_encrypt;
+      ctx->decrypt_fn = _gcry_aes_ppc8_decrypt;
+      ctx->prefetch_enc_fn = NULL;
+      ctx->prefetch_dec_fn = NULL;
+      ctx->use_ppc_crypto = 1;
+      if (hd)
+        {
+        }
+    }
 #endif
   else
     {
@@ -446,6 +475,10 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen,
 #ifdef USE_ARM_CE
   else if (ctx->use_arm_ce)
     _gcry_aes_armv8_ce_setkey (ctx, key);
+#endif
+#ifdef USE_PPC_CRYPTO
+  else if (ctx->use_ppc_crypto)
+    _gcry_aes_ppc8_setkey (ctx, key);
 #endif
   else
     {
@@ -584,7 +617,13 @@ prepare_decryption( RIJNDAEL_context *ctx )
     {
       _gcry_aes_armv8_ce_prepare_decryption (ctx);
     }
-#endif /*USE_SSSE3*/
+#endif /*USE_ARM_CE*/
+#ifdef USE_PPC_CRYPTO
+  else if (ctx->use_ppc_crypto)
+    {
+      _gcry_aes_ppc8_prepare_decryption (ctx);
+    }
+#endif
 #ifdef USE_PADLOCK
   else if (ctx->use_padlock)
     {
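
None of the above changes the public API; when HWF_PPC_VCRYPTO is
reported, do_setkey simply wires up the ppc8 functions.  A minimal
caller sketch using the standard libgcrypt interface (library init via
gcry_check_version and error checking elided):

  #include <gcrypt.h>

  void
  encrypt_one_block (const unsigned char key[16], unsigned char block[16])
  {
    gcry_cipher_hd_t hd;

    gcry_cipher_open (&hd, GCRY_CIPHER_AES128, GCRY_CIPHER_MODE_ECB, 0);
    gcry_cipher_setkey (hd, key, 16);             /* runs do_setkey dispatch */
    gcry_cipher_encrypt (hd, block, 16, NULL, 0); /* in-place encrypt */
    gcry_cipher_close (hd);
  }
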
diff --git a/configure.ac b/configure.ac
index 6980f381a..586145aa4 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1655,6 +1655,7 @@ if test "$gcry_cv_gcc_platform_as_ok_for_intel_syntax" = "yes" ; then
             [Defined if underlying assembler is compatible with Intel syntax assembly implementations])
 fi
 
+
 #
 # Check whether compiler is configured for ARMv6 or newer architecture
 #
@@ -1831,6 +1832,112 @@ if test "$gcry_cv_gcc_inline_asm_aarch64_crypto" = "yes" ; then
 fi
 
 
+#
+# Check whether compiler supports PowerPC AltiVec/VSX intrinsics
+#
+AC_CACHE_CHECK([whether compiler supports PowerPC AltiVec/VSX intrinsics],
+      [gcry_cv_cc_ppc_altivec],
+      [if test "$mpi_cpu_arch" != "ppc" ; then
+	gcry_cv_cc_ppc_altivec="n/a"
+      else
+	gcry_cv_cc_ppc_altivec=no
+	AC_COMPILE_IFELSE([AC_LANG_SOURCE(
+	[[#include <altivec.h>
+	  typedef vector unsigned char block;
+	  block fn(block in)
+	  {
+	    block t = vec_perm (in, in, vec_vsx_ld (0, (unsigned char*)0));
+	    return vec_cipher_be (t, in);
+	  }
+	  ]])],
+	[gcry_cv_cc_ppc_altivec=yes])
+      fi])
+if test "$gcry_cv_cc_ppc_altivec" = "yes" ; then
+    AC_DEFINE(HAVE_COMPATIBLE_CC_PPC_ALTIVEC,1,
+	    [Defined if underlying compiler supports PowerPC AltiVec/VSX/crypto intrinsics])
+fi
+
+_gcc_cflags_save=$CFLAGS
+CFLAGS="$CFLAGS -maltivec -mvsx -mcrypto"
+
+if test "$gcry_cv_cc_ppc_altivec" = "no" &&
+    test "$mpi_cpu_arch" = "ppc" ; then
+  AC_CACHE_CHECK([whether compiler supports PowerPC AltiVec/VSX/crypto intrinsics with extra GCC flags],
+    [gcry_cv_cc_ppc_altivec_cflags],
+    [gcry_cv_cc_ppc_altivec_cflags=no
+    AC_COMPILE_IFELSE([AC_LANG_SOURCE(
+      [[#include <altivec.h>
+	typedef vector unsigned char block;
+	block fn(block in)
+	{
+	  block t = vec_perm (in, in, vec_vsx_ld (0, (unsigned char*)0));
+	  return vec_cipher_be (t, in);
+	}]])],
+      [gcry_cv_cc_ppc_altivec_cflags=yes])])
+  if test "$gcry_cv_cc_ppc_altivec_cflags" = "yes" ; then
+    AC_DEFINE(HAVE_COMPATIBLE_CC_PPC_ALTIVEC,1,
+	      [Defined if underlying compiler supports PowerPC AltiVec/VSX/crypto intrinsics])
+    AC_DEFINE(HAVE_COMPATIBLE_CC_PPC_ALTIVEC_WITH_CFLAGS,1,
+	      [Defined if underlying compiler supports PowerPC AltiVec/VSX/crypto intrinsics with extra GCC flags])
+  fi
+fi
+
+AM_CONDITIONAL(ENABLE_PPC_VCRYPTO_EXTRA_CFLAGS,
+	       test "$gcry_cv_cc_ppc_altivec_cflags" = "yes")
+
+# Restore flags.
+CFLAGS=$_gcc_cflags_save;
+
+
+#
+# Check whether GCC inline assembler supports PowerPC AltiVec/VSX/crypto instructions
+#
+AC_CACHE_CHECK([whether GCC inline assembler supports PowerPC AltiVec/VSX/crypto instructions],
+       [gcry_cv_gcc_inline_asm_ppc_altivec],
+       [if test "$mpi_cpu_arch" != "ppc" ; then
+          gcry_cv_gcc_inline_asm_ppc_altivec="n/a"
+        else
+          gcry_cv_gcc_inline_asm_ppc_altivec=no
+          AC_COMPILE_IFELSE([AC_LANG_SOURCE(
+          [[__asm__(".globl testfn;\n"
+		    "testfn:\n"
+		    "stvx %v31,%r12,%r0;\n"
+		    "lvx  %v20,%r12,%r0;\n"
+		    "vcipher %v0, %v1, %v22;\n"
+		    "lxvw4x %vs32, %r0, %r1;\n"
+		  );
+            ]])],
+          [gcry_cv_gcc_inline_asm_ppc_altivec=yes])
+        fi])
+if test "$gcry_cv_gcc_inline_asm_ppc_altivec" = "yes" ; then
+   AC_DEFINE(HAVE_GCC_INLINE_ASM_PPC_ALTIVEC,1,
+     [Defined if inline assembler supports PowerPC AltiVec/VSX/crypto instructions])
+fi
+
+
+#
+# Check whether GCC inline assembler supports PowerISA 3.00 instructions
+#
+AC_CACHE_CHECK([whether GCC inline assembler supports PowerISA 3.00 instructions],
+       [gcry_cv_gcc_inline_asm_ppc_arch_3_00],
+       [if test "$mpi_cpu_arch" != "ppc" ; then
+          gcry_cv_gcc_inline_asm_ppc_arch_3_00="n/a"
+        else
+          gcry_cv_gcc_inline_asm_ppc_arch_3_00=no
+          AC_COMPILE_IFELSE([AC_LANG_SOURCE(
+          [[__asm__(".globl testfn;\n"
+		    "testfn:\n"
+		    "stxvb16x %r1,%v12,%v30;\n"
+		  );
+            ]])],
+          [gcry_cv_gcc_inline_asm_ppc_arch_3_00=yes])
+        fi])
+if test "$gcry_cv_gcc_inline_asm_ppc_arch_3_00" = "yes" ; then
+   AC_DEFINE(HAVE_GCC_INLINE_ASM_PPC_ARCH_3_00,1,
+     [Defined if inline assembler supports PowerISA 3.00 instructions])
+fi
+
+
 #######################################
 #### Checks for library functions. ####
 #######################################
@@ -2229,6 +2336,20 @@ if test "$found" = "1" ; then
          GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-armv8-ce.lo"
          GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-armv8-aarch64-ce.lo"
       ;;
+      powerpc64le-*-*)
+         # Build with the crypto extension implementation
+         GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-ppc.lo"
+      ;;
+      powerpc64-*-*)
+         # Big-Endian.
+         # Build with the crypto extension implementation
+         GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-ppc.lo"
+      ;;
+      powerpc-*-*)
+         # Big-Endian.
+         # Build with the crypto extension implementation
+         GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-ppc.lo"
+      ;;
    esac
 
    case "$mpi_cpu_arch" in



