[PATCH 1/3] Add POWER9 little-endian variant of PPC AES implementation

Jussi Kivilinna jussi.kivilinna at iki.fi
Sat Feb 1 23:58:01 CET 2020


* configure.ac: Add 'rijndael-ppc9le.lo'.
* cipher/Makefile.am: Add 'rijndael-ppc9le.c', 'rijndael-ppc-common.h'
and 'rijndael-ppc-functions.h'.
* cipher/rijndael-internal.h (USE_PPC_CRYPTO_WITH_PPC9LE): New.
(RIJNDAEL_context_s): Add 'use_ppc9le_crypto'.
* cipher/rijndael.c (_gcry_aes_ppc9le_encrypt)
(_gcry_aes_ppc9le_decrypt, _gcry_aes_ppc9le_cfb_enc)
(_gcry_aes_ppc9le_cfb_dec, _gcry_aes_ppc9le_ctr_enc)
(_gcry_aes_ppc9le_cbc_enc, _gcry_aes_ppc9le_cbc_dec)
(_gcry_aes_ppc9le_ocb_crypt, _gcry_aes_ppc9le_ocb_auth)
(_gcry_aes_ppc9le_xts_crypt): New.
(do_setkey, _gcry_aes_cfb_enc, _gcry_aes_cbc_enc)
(_gcry_aes_ctr_enc, _gcry_aes_cfb_dec, _gcry_aes_cbc_dec)
(_gcry_aes_ocb_crypt, _gcry_aes_ocb_auth, _gcry_aes_xts_crypt)
[USE_PPC_CRYPTO_WITH_PPC9LE]: New.
* cipher/rijndael-ppc.c: Split common code to headers
'rijndael-ppc-common.h' and 'rijndael-ppc-functions.h'.
* cipher/rijndael-ppc-common.h: Split from 'rijndael-ppc.c'.
(asm_add_uint64, asm_sra_int64, asm_swap_uint64_halfs): New.
* cipher/rijndael-ppc-functions.h: Split from 'rijndael-ppc.c'.
(CFB_ENC_FUNC, CBC_ENC_FUNC): Unroll loop by 2.
(XTS_CRYPT_FUNC, GEN_TWEAK): Tweak generation without vperm
instruction.
* cipher/rijndael-ppc9le.c: New.
--

Provide a POWER9 little-endian optimized variant of the PPC vcrypto AES
implementation. This implementation uses the 'lxvb16x' and 'stxvb16x'
instructions to load/store vectors directly in big-endian byte order.
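
For illustration, a minimal sketch of what such big-endian load/store
helpers look like on little-endian POWER9 with GCC inline assembly (the
actual helpers live in the new 'rijndael-ppc9le.c' added by this patch;
the names 'sketch_load_be'/'sketch_store_be' are illustrative only):

  #include <altivec.h>
  #include <stdint.h>

  typedef vector unsigned char block;

  /* Load 16 bytes in big-endian byte order regardless of CPU endianness
   * (ISA 3.0 'lxvb16x'), avoiding the separate vperm byte-swap that the
   * generic POWER8 path needs after its vector load. */
  static inline block
  sketch_load_be (unsigned long offset, const void *ptr)
  {
    block vec;
    __asm__ volatile ("lxvb16x %x0,%1,%2\n\t"
                      : "=wa" (vec)
                      : "r" (offset), "r" ((uintptr_t)ptr)
                      : "memory");
    return vec;
  }

  /* Store 16 bytes in big-endian byte order (ISA 3.0 'stxvb16x'). */
  static inline void
  sketch_store_be (block vec, unsigned long offset, void *ptr)
  {
    __asm__ volatile ("stxvb16x %x0,%1,%2\n\t"
                      :
                      : "wa" (vec), "r" (offset), "r" ((uintptr_t)ptr)
                      : "memory");
  }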

Benchmark on POWER9 (~3.8 GHz):

Before:
 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte
        CBC enc |      1.04 ns/B     918.7 MiB/s      3.94 c/B
        CBC dec |     0.222 ns/B      4292 MiB/s     0.844 c/B
        CFB enc |      1.04 ns/B     916.9 MiB/s      3.95 c/B
        CFB dec |     0.224 ns/B      4252 MiB/s     0.852 c/B
        CTR enc |     0.226 ns/B      4218 MiB/s     0.859 c/B
        CTR dec |     0.225 ns/B      4233 MiB/s     0.856 c/B
        XTS enc |     0.500 ns/B      1907 MiB/s      1.90 c/B
        XTS dec |     0.494 ns/B      1932 MiB/s      1.88 c/B
        OCB enc |     0.288 ns/B      3312 MiB/s      1.09 c/B
        OCB dec |     0.292 ns/B      3266 MiB/s      1.11 c/B
       OCB auth |     0.267 ns/B      3567 MiB/s      1.02 c/B

After (CTR, OCB, CBC-dec and CFB-dec ~15% faster; XTS ~8% faster):
 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte
        CBC enc |      1.04 ns/B     914.2 MiB/s      3.96 c/B
        CBC dec |     0.191 ns/B      4984 MiB/s     0.727 c/B
        CFB enc |      1.03 ns/B     930.0 MiB/s      3.90 c/B
        CFB dec |     0.194 ns/B      4906 MiB/s     0.739 c/B
        CTR enc |     0.196 ns/B      4868 MiB/s     0.744 c/B
        CTR dec |     0.197 ns/B      4834 MiB/s     0.750 c/B
        XTS enc |     0.460 ns/B      2075 MiB/s      1.75 c/B
        XTS dec |     0.455 ns/B      2097 MiB/s      1.73 c/B
        OCB enc |     0.250 ns/B      3812 MiB/s     0.951 c/B
        OCB dec |     0.253 ns/B      3764 MiB/s     0.963 c/B
       OCB auth |     0.232 ns/B      4106 MiB/s     0.883 c/B
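
(As a sanity check against the ~3.8 GHz clock estimate, cycles/byte is
ns/byte times the clock rate, e.g. CTR enc: 0.196 ns/B * 3.8 GHz ~= 0.74 c/B.)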

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 0 files changed

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 10a5ab62f..ef83cc741 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -99,7 +99,8 @@ EXTRA_libcipher_la_SOURCES = \
 	rijndael-ssse3-amd64.c rijndael-ssse3-amd64-asm.S  \
 	rijndael-armv8-ce.c rijndael-armv8-aarch32-ce.S    \
 	rijndael-armv8-aarch64-ce.S rijndael-aarch64.S     \
-	rijndael-ppc.c                                     \
+	rijndael-ppc.c rijndael-ppc9le.c                   \
+	rijndael-ppc-common.h rijndael-ppc-functions.h     \
 	rmd160.c \
 	rsa.c \
 	salsa20.c salsa20-amd64.S salsa20-armv7-neon.S \
@@ -221,6 +222,12 @@ rijndael-ppc.o: $(srcdir)/rijndael-ppc.c Makefile
 rijndael-ppc.lo: $(srcdir)/rijndael-ppc.c Makefile
 	`echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
 
+rijndael-ppc9le.o: $(srcdir)/rijndael-ppc9le.c Makefile
+	`echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
+
+rijndael-ppc9le.lo: $(srcdir)/rijndael-ppc9le.c Makefile
+	`echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
+
 sha256-ppc.o: $(srcdir)/sha256-ppc.c Makefile
 	`echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< | $(instrumentation_munging) `
 
diff --git a/cipher/rijndael-internal.h b/cipher/rijndael-internal.h
index 5150a69d7..bdd3bee14 100644
--- a/cipher/rijndael-internal.h
+++ b/cipher/rijndael-internal.h
@@ -105,13 +105,18 @@
 #endif /* ENABLE_ARM_CRYPTO_SUPPORT */
 
 /* USE_PPC_CRYPTO indicates whether to enable PowerPC vector crypto
- * accelerated code. */
+ * accelerated code.  USE_PPC_CRYPTO_WITH_PPC9LE indicates whether to
+ * enable POWER9 optimized variant.  */
 #undef USE_PPC_CRYPTO
+#undef USE_PPC_CRYPTO_WITH_PPC9LE
 #ifdef ENABLE_PPC_CRYPTO_SUPPORT
 # if defined(HAVE_COMPATIBLE_CC_PPC_ALTIVEC) && \
      defined(HAVE_GCC_INLINE_ASM_PPC_ALTIVEC)
 #  if __GNUC__ >= 4
 #   define USE_PPC_CRYPTO 1
+#   if !defined(WORDS_BIGENDIAN) && defined(HAVE_GCC_INLINE_ASM_PPC_ARCH_3_00)
+#    define USE_PPC_CRYPTO_WITH_PPC9LE 1
+#   endif
 #  endif
 # endif
 #endif /* ENABLE_PPC_CRYPTO_SUPPORT */
@@ -169,6 +174,9 @@ typedef struct RIJNDAEL_context_s
 #ifdef USE_PPC_CRYPTO
   unsigned int use_ppc_crypto:1;      /* PowerPC crypto shall be used.  */
 #endif /*USE_PPC_CRYPTO*/
+#ifdef USE_PPC_CRYPTO_WITH_PPC9LE
+  unsigned int use_ppc9le_crypto:1;   /* POWER9 LE crypto shall be used.  */
+#endif
   rijndael_cryptfn_t encrypt_fn;
   rijndael_cryptfn_t decrypt_fn;
   rijndael_prefetchfn_t prefetch_enc_fn;
diff --git a/cipher/rijndael-ppc-common.h b/cipher/rijndael-ppc-common.h
new file mode 100644
index 000000000..165dd9f71
--- /dev/null
+++ b/cipher/rijndael-ppc-common.h
@@ -0,0 +1,326 @@
+/* Rijndael (AES) for GnuPG - PowerPC Vector Crypto AES implementation
+ * Copyright (C) 2019 Shawn Landden <shawn at git.icu>
+ * Copyright (C) 2019-2020 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * Alternatively, this code may be used in OpenSSL from The OpenSSL Project,
+ * and Cryptogams by Andy Polyakov, and if made part of a release of either
+ * or both projects, is thereafter dual-licensed under the license said project
+ * is released under.
+ */
+
+#ifndef G10_RIJNDAEL_PPC_COMMON_H
+#define G10_RIJNDAEL_PPC_COMMON_H
+
+#include <altivec.h>
+
+
+typedef vector unsigned char block;
+
+typedef union
+{
+  u32 data32[4];
+} __attribute__((packed, aligned(1), may_alias)) u128_t;
+
+
+#define ALWAYS_INLINE inline __attribute__((always_inline))
+#define NO_INLINE __attribute__((noinline))
+#define NO_INSTRUMENT_FUNCTION __attribute__((no_instrument_function))
+
+#define ASM_FUNC_ATTR          NO_INSTRUMENT_FUNCTION
+#define ASM_FUNC_ATTR_INLINE   ASM_FUNC_ATTR ALWAYS_INLINE
+#define ASM_FUNC_ATTR_NOINLINE ASM_FUNC_ATTR NO_INLINE
+
+
+#define ALIGNED_LOAD(in_ptr, offs) \
+  (asm_aligned_ld ((offs) * 16, (const void *)(in_ptr)))
+
+#define ALIGNED_STORE(out_ptr, offs, vec) \
+  (asm_aligned_st ((vec), (offs) * 16, (void *)(out_ptr)))
+
+#define VEC_BE_SWAP(vec, bige_const) (asm_be_swap ((vec), (bige_const)))
+
+#define VEC_LOAD_BE(in_ptr, offs, bige_const) \
+  (asm_be_swap (asm_load_be_noswap ((offs) * 16, (const void *)(in_ptr)), \
+		bige_const))
+
+#define VEC_LOAD_BE_NOSWAP(in_ptr, offs) \
+  (asm_load_be_noswap ((offs) * 16, (const unsigned char *)(in_ptr)))
+
+#define VEC_STORE_BE(out_ptr, offs, vec, bige_const) \
+  (asm_store_be_noswap (asm_be_swap ((vec), (bige_const)), (offs) * 16, \
+		        (void *)(out_ptr)))
+
+#define VEC_STORE_BE_NOSWAP(out_ptr, offs, vec) \
+  (asm_store_be_noswap ((vec), (offs) * 16, (void *)(out_ptr)))
+
+
+#define ROUND_KEY_VARIABLES \
+  block rkey0, rkeylast
+
+#define PRELOAD_ROUND_KEYS(nrounds) \
+  do { \
+    rkey0 = ALIGNED_LOAD (rk, 0); \
+    rkeylast = ALIGNED_LOAD (rk, nrounds); \
+  } while (0)
+
+#define AES_ENCRYPT(blk, nrounds) \
+  do { \
+    blk ^= rkey0; \
+    blk = asm_cipher_be (blk, ALIGNED_LOAD (rk, 1)); \
+    blk = asm_cipher_be (blk, ALIGNED_LOAD (rk, 2)); \
+    blk = asm_cipher_be (blk, ALIGNED_LOAD (rk, 3)); \
+    blk = asm_cipher_be (blk, ALIGNED_LOAD (rk, 4)); \
+    blk = asm_cipher_be (blk, ALIGNED_LOAD (rk, 5)); \
+    blk = asm_cipher_be (blk, ALIGNED_LOAD (rk, 6)); \
+    blk = asm_cipher_be (blk, ALIGNED_LOAD (rk, 7)); \
+    blk = asm_cipher_be (blk, ALIGNED_LOAD (rk, 8)); \
+    blk = asm_cipher_be (blk, ALIGNED_LOAD (rk, 9)); \
+    if (nrounds >= 12) \
+      { \
+	blk = asm_cipher_be (blk, ALIGNED_LOAD (rk, 10)); \
+	blk = asm_cipher_be (blk, ALIGNED_LOAD (rk, 11)); \
+	if (rounds > 12) \
+	  { \
+	    blk = asm_cipher_be (blk, ALIGNED_LOAD (rk, 12)); \
+	    blk = asm_cipher_be (blk, ALIGNED_LOAD (rk, 13)); \
+	  } \
+      } \
+    blk = asm_cipherlast_be (blk, rkeylast); \
+  } while (0)
+
+#define AES_DECRYPT(blk, nrounds) \
+  do { \
+    blk ^= rkey0; \
+    blk = asm_ncipher_be (blk, ALIGNED_LOAD (rk, 1)); \
+    blk = asm_ncipher_be (blk, ALIGNED_LOAD (rk, 2)); \
+    blk = asm_ncipher_be (blk, ALIGNED_LOAD (rk, 3)); \
+    blk = asm_ncipher_be (blk, ALIGNED_LOAD (rk, 4)); \
+    blk = asm_ncipher_be (blk, ALIGNED_LOAD (rk, 5)); \
+    blk = asm_ncipher_be (blk, ALIGNED_LOAD (rk, 6)); \
+    blk = asm_ncipher_be (blk, ALIGNED_LOAD (rk, 7)); \
+    blk = asm_ncipher_be (blk, ALIGNED_LOAD (rk, 8)); \
+    blk = asm_ncipher_be (blk, ALIGNED_LOAD (rk, 9)); \
+    if (nrounds >= 12) \
+      { \
+	blk = asm_ncipher_be (blk, ALIGNED_LOAD (rk, 10)); \
+	blk = asm_ncipher_be (blk, ALIGNED_LOAD (rk, 11)); \
+	if (rounds > 12) \
+	  { \
+	    blk = asm_ncipher_be (blk, ALIGNED_LOAD (rk, 12)); \
+	    blk = asm_ncipher_be (blk, ALIGNED_LOAD (rk, 13)); \
+	  } \
+      } \
+    blk = asm_ncipherlast_be (blk, rkeylast); \
+  } while (0)
+
+
+#define ROUND_KEY_VARIABLES_ALL \
+  block rkey0, rkey1, rkey2, rkey3, rkey4, rkey5, rkey6, rkey7, rkey8, \
+        rkey9, rkey10, rkey11, rkey12, rkey13, rkeylast
+
+#define PRELOAD_ROUND_KEYS_ALL(nrounds) \
+  do { \
+    rkey0 = ALIGNED_LOAD (rk, 0); \
+    rkey1 = ALIGNED_LOAD (rk, 1); \
+    rkey2 = ALIGNED_LOAD (rk, 2); \
+    rkey3 = ALIGNED_LOAD (rk, 3); \
+    rkey4 = ALIGNED_LOAD (rk, 4); \
+    rkey5 = ALIGNED_LOAD (rk, 5); \
+    rkey6 = ALIGNED_LOAD (rk, 6); \
+    rkey7 = ALIGNED_LOAD (rk, 7); \
+    rkey8 = ALIGNED_LOAD (rk, 8); \
+    rkey9 = ALIGNED_LOAD (rk, 9); \
+    if (nrounds >= 12) \
+      { \
+	rkey10 = ALIGNED_LOAD (rk, 10); \
+	rkey11 = ALIGNED_LOAD (rk, 11); \
+	if (rounds > 12) \
+	  { \
+	    rkey12 = ALIGNED_LOAD (rk, 12); \
+	    rkey13 = ALIGNED_LOAD (rk, 13); \
+	  } \
+      } \
+    rkeylast = ALIGNED_LOAD (rk, nrounds); \
+  } while (0)
+
+#define AES_ENCRYPT_ALL(blk, nrounds) \
+  do { \
+    blk ^= rkey0; \
+    blk = asm_cipher_be (blk, rkey1); \
+    blk = asm_cipher_be (blk, rkey2); \
+    blk = asm_cipher_be (blk, rkey3); \
+    blk = asm_cipher_be (blk, rkey4); \
+    blk = asm_cipher_be (blk, rkey5); \
+    blk = asm_cipher_be (blk, rkey6); \
+    blk = asm_cipher_be (blk, rkey7); \
+    blk = asm_cipher_be (blk, rkey8); \
+    blk = asm_cipher_be (blk, rkey9); \
+    if (nrounds >= 12) \
+      { \
+	blk = asm_cipher_be (blk, rkey10); \
+	blk = asm_cipher_be (blk, rkey11); \
+	if (rounds > 12) \
+	  { \
+	    blk = asm_cipher_be (blk, rkey12); \
+	    blk = asm_cipher_be (blk, rkey13); \
+	  } \
+      } \
+    blk = asm_cipherlast_be (blk, rkeylast); \
+  } while (0)
+
+
+static ASM_FUNC_ATTR_INLINE block
+asm_aligned_ld(unsigned long offset, const void *ptr)
+{
+  block vec;
+  __asm__ volatile ("lvx %0,%1,%2\n\t"
+		    : "=v" (vec)
+		    : "r" (offset), "r" ((uintptr_t)ptr)
+		    : "memory", "r0");
+  return vec;
+}
+
+static ASM_FUNC_ATTR_INLINE void
+asm_aligned_st(block vec, unsigned long offset, void *ptr)
+{
+  __asm__ volatile ("stvx %0,%1,%2\n\t"
+		    :
+		    : "v" (vec), "r" (offset), "r" ((uintptr_t)ptr)
+		    : "memory", "r0");
+}
+
+static ASM_FUNC_ATTR_INLINE block
+asm_vperm1(block vec, block mask)
+{
+  block o;
+  __asm__ volatile ("vperm %0,%1,%1,%2\n\t"
+		    : "=v" (o)
+		    : "v" (vec), "v" (mask));
+  return o;
+}
+
+static ASM_FUNC_ATTR_INLINE block
+asm_add_uint128(block a, block b)
+{
+  block res;
+  __asm__ volatile ("vadduqm %0,%1,%2\n\t"
+		    : "=v" (res)
+		    : "v" (a), "v" (b));
+  return res;
+}
+
+static ASM_FUNC_ATTR_INLINE block
+asm_add_uint64(block a, block b)
+{
+  block res;
+  __asm__ volatile ("vaddudm %0,%1,%2\n\t"
+		    : "=v" (res)
+		    : "v" (a), "v" (b));
+  return res;
+}
+
+static ASM_FUNC_ATTR_INLINE block
+asm_sra_int64(block a, block b)
+{
+  block res;
+  __asm__ volatile ("vsrad %0,%1,%2\n\t"
+		    : "=v" (res)
+		    : "v" (a), "v" (b));
+  return res;
+}
+
+static ASM_FUNC_ATTR_INLINE block
+asm_swap_uint64_halfs(block a)
+{
+  block res;
+  __asm__ volatile ("xxswapd %x0, %x1"
+		    : "=wa" (res)
+		    : "wa" (a));
+  return res;
+}
+
+static ASM_FUNC_ATTR_INLINE block
+asm_xor(block a, block b)
+{
+  block res;
+  __asm__ volatile ("vxor %0,%1,%2\n\t"
+		    : "=v" (res)
+		    : "v" (a), "v" (b));
+  return res;
+}
+
+static ASM_FUNC_ATTR_INLINE block
+asm_cipher_be(block b, block rk)
+{
+  block o;
+  __asm__ volatile ("vcipher %0, %1, %2\n\t"
+		    : "=v" (o)
+		    : "v" (b), "v" (rk));
+  return o;
+}
+
+static ASM_FUNC_ATTR_INLINE block
+asm_cipherlast_be(block b, block rk)
+{
+  block o;
+  __asm__ volatile ("vcipherlast %0, %1, %2\n\t"
+		    : "=v" (o)
+		    : "v" (b), "v" (rk));
+  return o;
+}
+
+static ASM_FUNC_ATTR_INLINE block
+asm_ncipher_be(block b, block rk)
+{
+  block o;
+  __asm__ volatile ("vncipher %0, %1, %2\n\t"
+		    : "=v" (o)
+		    : "v" (b), "v" (rk));
+  return o;
+}
+
+static ASM_FUNC_ATTR_INLINE block
+asm_ncipherlast_be(block b, block rk)
+{
+  block o;
+  __asm__ volatile ("vncipherlast %0, %1, %2\n\t"
+		    : "=v" (o)
+		    : "v" (b), "v" (rk));
+  return o;
+}
+
+
+/* Make a decryption key from an encryption key. */
+static ASM_FUNC_ATTR_INLINE void
+internal_aes_ppc_prepare_decryption (RIJNDAEL_context *ctx)
+{
+  u128_t *ekey = (u128_t *)(void *)ctx->keyschenc;
+  u128_t *dkey = (u128_t *)(void *)ctx->keyschdec;
+  int rounds = ctx->rounds;
+  int rr;
+  int r;
+
+  r = 0;
+  rr = rounds;
+  for (r = 0, rr = rounds; r <= rounds; r++, rr--)
+    {
+      ALIGNED_STORE (dkey, r, ALIGNED_LOAD (ekey, rr));
+    }
+}
+
+#endif /* G10_RIJNDAEL_PPC_COMMON_H */
diff --git a/cipher/rijndael-ppc-functions.h b/cipher/rijndael-ppc-functions.h
new file mode 100644
index 000000000..72f31852b
--- /dev/null
+++ b/cipher/rijndael-ppc-functions.h
@@ -0,0 +1,2020 @@
+/* Rijndael (AES) for GnuPG - PowerPC Vector Crypto AES implementation
+ * Copyright (C) 2019 Shawn Landden <shawn at git.icu>
+ * Copyright (C) 2019-2020 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * Alternatively, this code may be used in OpenSSL from The OpenSSL Project,
+ * and Cryptogams by Andy Polyakov, and if made part of a release of either
+ * or both projects, is thereafter dual-licensed under the license said project
+ * is released under.
+ */
+
+unsigned int ENCRYPT_BLOCK_FUNC (const RIJNDAEL_context *ctx,
+				 unsigned char *out,
+				 const unsigned char *in)
+{
+  const block bige_const = asm_load_be_const();
+  const u128_t *rk = (u128_t *)&ctx->keyschenc;
+  int rounds = ctx->rounds;
+  ROUND_KEY_VARIABLES;
+  block b;
+
+  b = VEC_LOAD_BE (in, 0, bige_const);
+
+  PRELOAD_ROUND_KEYS (rounds);
+
+  AES_ENCRYPT (b, rounds);
+  VEC_STORE_BE (out, 0, b, bige_const);
+
+  return 0; /* does not use stack */
+}
+
+
+unsigned int DECRYPT_BLOCK_FUNC (const RIJNDAEL_context *ctx,
+				 unsigned char *out,
+				 const unsigned char *in)
+{
+  const block bige_const = asm_load_be_const();
+  const u128_t *rk = (u128_t *)&ctx->keyschdec;
+  int rounds = ctx->rounds;
+  ROUND_KEY_VARIABLES;
+  block b;
+
+  b = VEC_LOAD_BE (in, 0, bige_const);
+
+  PRELOAD_ROUND_KEYS (rounds);
+
+  AES_DECRYPT (b, rounds);
+  VEC_STORE_BE (out, 0, b, bige_const);
+
+  return 0; /* does not use stack */
+}
+
+
+void CFB_ENC_FUNC (void *context, unsigned char *iv_arg,
+		   void *outbuf_arg, const void *inbuf_arg,
+		   size_t nblocks)
+{
+  const block bige_const = asm_load_be_const();
+  RIJNDAEL_context *ctx = context;
+  const u128_t *rk = (u128_t *)&ctx->keyschenc;
+  const u128_t *in = (const u128_t *)inbuf_arg;
+  u128_t *out = (u128_t *)outbuf_arg;
+  int rounds = ctx->rounds;
+  ROUND_KEY_VARIABLES_ALL;
+  block rkeylast_orig;
+  block iv;
+
+  iv = VEC_LOAD_BE (iv_arg, 0, bige_const);
+
+  PRELOAD_ROUND_KEYS_ALL (rounds);
+  rkeylast_orig = rkeylast;
+
+  for (; nblocks >= 2; nblocks -= 2)
+    {
+      block in2, iv1;
+
+      rkeylast = rkeylast_orig ^ VEC_LOAD_BE (in, 0, bige_const);
+      in2 = VEC_LOAD_BE (in + 1, 0, bige_const);
+      in += 2;
+
+      AES_ENCRYPT_ALL (iv, rounds);
+
+      iv1 = iv;
+      rkeylast = rkeylast_orig ^ in2;
+
+      AES_ENCRYPT_ALL (iv, rounds);
+
+      VEC_STORE_BE (out++, 0, iv1, bige_const);
+      VEC_STORE_BE (out++, 0, iv, bige_const);
+    }
+
+  for (; nblocks; nblocks--)
+    {
+      rkeylast = rkeylast_orig ^ VEC_LOAD_BE (in++, 0, bige_const);
+
+      AES_ENCRYPT_ALL (iv, rounds);
+
+      VEC_STORE_BE (out++, 0, iv, bige_const);
+    }
+
+  VEC_STORE_BE (iv_arg, 0, iv, bige_const);
+}
+
+void CFB_DEC_FUNC (void *context, unsigned char *iv_arg,
+		   void *outbuf_arg, const void *inbuf_arg,
+		   size_t nblocks)
+{
+  const block bige_const = asm_load_be_const();
+  RIJNDAEL_context *ctx = context;
+  const u128_t *rk = (u128_t *)&ctx->keyschenc;
+  const u128_t *in = (const u128_t *)inbuf_arg;
+  u128_t *out = (u128_t *)outbuf_arg;
+  int rounds = ctx->rounds;
+  ROUND_KEY_VARIABLES;
+  block rkeylast_orig;
+  block iv, b, bin;
+  block in0, in1, in2, in3, in4, in5, in6, in7;
+  block b0, b1, b2, b3, b4, b5, b6, b7;
+  block rkey;
+
+  iv = VEC_LOAD_BE (iv_arg, 0, bige_const);
+
+  PRELOAD_ROUND_KEYS (rounds);
+  rkeylast_orig = rkeylast;
+
+  for (; nblocks >= 8; nblocks -= 8)
+    {
+      in0 = iv;
+      in1 = VEC_LOAD_BE_NOSWAP (in, 0);
+      in2 = VEC_LOAD_BE_NOSWAP (in, 1);
+      in3 = VEC_LOAD_BE_NOSWAP (in, 2);
+      in4 = VEC_LOAD_BE_NOSWAP (in, 3);
+      in1 = VEC_BE_SWAP (in1, bige_const);
+      in2 = VEC_BE_SWAP (in2, bige_const);
+      in5 = VEC_LOAD_BE_NOSWAP (in, 4);
+      in6 = VEC_LOAD_BE_NOSWAP (in, 5);
+      in3 = VEC_BE_SWAP (in3, bige_const);
+      in4 = VEC_BE_SWAP (in4, bige_const);
+      in7 = VEC_LOAD_BE_NOSWAP (in, 6);
+      iv = VEC_LOAD_BE_NOSWAP (in, 7);
+      in += 8;
+      in5 = VEC_BE_SWAP (in5, bige_const);
+      in6 = VEC_BE_SWAP (in6, bige_const);
+      b0 = asm_xor (rkey0, in0);
+      b1 = asm_xor (rkey0, in1);
+      in7 = VEC_BE_SWAP (in7, bige_const);
+      iv = VEC_BE_SWAP (iv, bige_const);
+      b2 = asm_xor (rkey0, in2);
+      b3 = asm_xor (rkey0, in3);
+      b4 = asm_xor (rkey0, in4);
+      b5 = asm_xor (rkey0, in5);
+      b6 = asm_xor (rkey0, in6);
+      b7 = asm_xor (rkey0, in7);
+
+#define DO_ROUND(r) \
+	      rkey = ALIGNED_LOAD (rk, r); \
+	      b0 = asm_cipher_be (b0, rkey); \
+	      b1 = asm_cipher_be (b1, rkey); \
+	      b2 = asm_cipher_be (b2, rkey); \
+	      b3 = asm_cipher_be (b3, rkey); \
+	      b4 = asm_cipher_be (b4, rkey); \
+	      b5 = asm_cipher_be (b5, rkey); \
+	      b6 = asm_cipher_be (b6, rkey); \
+	      b7 = asm_cipher_be (b7, rkey);
+
+      DO_ROUND(1);
+      DO_ROUND(2);
+      DO_ROUND(3);
+      DO_ROUND(4);
+      DO_ROUND(5);
+      DO_ROUND(6);
+      DO_ROUND(7);
+      DO_ROUND(8);
+      DO_ROUND(9);
+      if (rounds >= 12)
+	{
+	  DO_ROUND(10);
+	  DO_ROUND(11);
+	  if (rounds > 12)
+	    {
+	      DO_ROUND(12);
+	      DO_ROUND(13);
+	    }
+	}
+
+#undef DO_ROUND
+
+      in1 = asm_xor (rkeylast, in1);
+      in2 = asm_xor (rkeylast, in2);
+      in3 = asm_xor (rkeylast, in3);
+      in4 = asm_xor (rkeylast, in4);
+      b0 = asm_cipherlast_be (b0, in1);
+      b1 = asm_cipherlast_be (b1, in2);
+      in5 = asm_xor (rkeylast, in5);
+      in6 = asm_xor (rkeylast, in6);
+      b2 = asm_cipherlast_be (b2, in3);
+      b3 = asm_cipherlast_be (b3, in4);
+      in7 = asm_xor (rkeylast, in7);
+      in0 = asm_xor (rkeylast, iv);
+      b0 = VEC_BE_SWAP (b0, bige_const);
+      b1 = VEC_BE_SWAP (b1, bige_const);
+      b4 = asm_cipherlast_be (b4, in5);
+      b5 = asm_cipherlast_be (b5, in6);
+      b2 = VEC_BE_SWAP (b2, bige_const);
+      b3 = VEC_BE_SWAP (b3, bige_const);
+      b6 = asm_cipherlast_be (b6, in7);
+      b7 = asm_cipherlast_be (b7, in0);
+      b4 = VEC_BE_SWAP (b4, bige_const);
+      b5 = VEC_BE_SWAP (b5, bige_const);
+      b6 = VEC_BE_SWAP (b6, bige_const);
+      b7 = VEC_BE_SWAP (b7, bige_const);
+      VEC_STORE_BE_NOSWAP (out, 0, b0);
+      VEC_STORE_BE_NOSWAP (out, 1, b1);
+      VEC_STORE_BE_NOSWAP (out, 2, b2);
+      VEC_STORE_BE_NOSWAP (out, 3, b3);
+      VEC_STORE_BE_NOSWAP (out, 4, b4);
+      VEC_STORE_BE_NOSWAP (out, 5, b5);
+      VEC_STORE_BE_NOSWAP (out, 6, b6);
+      VEC_STORE_BE_NOSWAP (out, 7, b7);
+      out += 8;
+    }
+
+  if (nblocks >= 4)
+    {
+      in0 = iv;
+      in1 = VEC_LOAD_BE (in, 0, bige_const);
+      in2 = VEC_LOAD_BE (in, 1, bige_const);
+      in3 = VEC_LOAD_BE (in, 2, bige_const);
+      iv = VEC_LOAD_BE (in, 3, bige_const);
+
+      b0 = asm_xor (rkey0, in0);
+      b1 = asm_xor (rkey0, in1);
+      b2 = asm_xor (rkey0, in2);
+      b3 = asm_xor (rkey0, in3);
+
+#define DO_ROUND(r) \
+	      rkey = ALIGNED_LOAD (rk, r); \
+	      b0 = asm_cipher_be (b0, rkey); \
+	      b1 = asm_cipher_be (b1, rkey); \
+	      b2 = asm_cipher_be (b2, rkey); \
+	      b3 = asm_cipher_be (b3, rkey);
+
+      DO_ROUND(1);
+      DO_ROUND(2);
+      DO_ROUND(3);
+      DO_ROUND(4);
+      DO_ROUND(5);
+      DO_ROUND(6);
+      DO_ROUND(7);
+      DO_ROUND(8);
+      DO_ROUND(9);
+      if (rounds >= 12)
+	{
+	  DO_ROUND(10);
+	  DO_ROUND(11);
+	  if (rounds > 12)
+	    {
+	      DO_ROUND(12);
+	      DO_ROUND(13);
+	    }
+	}
+
+#undef DO_ROUND
+
+      in1 = asm_xor (rkeylast, in1);
+      in2 = asm_xor (rkeylast, in2);
+      in3 = asm_xor (rkeylast, in3);
+      in0 = asm_xor (rkeylast, iv);
+      b0 = asm_cipherlast_be (b0, in1);
+      b1 = asm_cipherlast_be (b1, in2);
+      b2 = asm_cipherlast_be (b2, in3);
+      b3 = asm_cipherlast_be (b3, in0);
+      VEC_STORE_BE (out, 0, b0, bige_const);
+      VEC_STORE_BE (out, 1, b1, bige_const);
+      VEC_STORE_BE (out, 2, b2, bige_const);
+      VEC_STORE_BE (out, 3, b3, bige_const);
+
+      in += 4;
+      out += 4;
+      nblocks -= 4;
+    }
+
+  for (; nblocks; nblocks--)
+    {
+      bin = VEC_LOAD_BE (in, 0, bige_const);
+      rkeylast = rkeylast_orig ^ bin;
+      b = iv;
+      iv = bin;
+
+      AES_ENCRYPT (b, rounds);
+
+      VEC_STORE_BE (out, 0, b, bige_const);
+
+      out++;
+      in++;
+    }
+
+  VEC_STORE_BE (iv_arg, 0, iv, bige_const);
+}
+
+
+void CBC_ENC_FUNC (void *context, unsigned char *iv_arg,
+		   void *outbuf_arg, const void *inbuf_arg,
+		   size_t nblocks, int cbc_mac)
+{
+  const block bige_const = asm_load_be_const();
+  RIJNDAEL_context *ctx = context;
+  const u128_t *rk = (u128_t *)&ctx->keyschenc;
+  const u128_t *in = (const u128_t *)inbuf_arg;
+  byte *out = (byte *)outbuf_arg;
+  int rounds = ctx->rounds;
+  ROUND_KEY_VARIABLES_ALL;
+  block lastiv, b;
+  unsigned int outadd = -(!cbc_mac) & 16;
+
+  lastiv = VEC_LOAD_BE (iv_arg, 0, bige_const);
+
+  PRELOAD_ROUND_KEYS_ALL (rounds);
+
+  for (; nblocks >= 2; nblocks -= 2)
+    {
+      block in2, lastiv1;
+
+      b = lastiv ^ VEC_LOAD_BE (in, 0, bige_const);
+      in2 = VEC_LOAD_BE (in + 1, 0, bige_const);
+      in += 2;
+
+      AES_ENCRYPT_ALL (b, rounds);
+
+      lastiv1 = b;
+      b = lastiv1 ^ in2;
+
+      AES_ENCRYPT_ALL (b, rounds);
+
+      lastiv = b;
+      VEC_STORE_BE ((u128_t *)out, 0, lastiv1, bige_const);
+      out += outadd;
+      VEC_STORE_BE ((u128_t *)out, 0, lastiv, bige_const);
+      out += outadd;
+    }
+
+  for (; nblocks; nblocks--)
+    {
+      b = lastiv ^ VEC_LOAD_BE (in++, 0, bige_const);
+
+      AES_ENCRYPT_ALL (b, rounds);
+
+      lastiv = b;
+      VEC_STORE_BE ((u128_t *)out, 0, b, bige_const);
+      out += outadd;
+    }
+
+  VEC_STORE_BE (iv_arg, 0, lastiv, bige_const);
+}
+
+void CBC_DEC_FUNC (void *context, unsigned char *iv_arg,
+		   void *outbuf_arg, const void *inbuf_arg,
+		   size_t nblocks)
+{
+  const block bige_const = asm_load_be_const();
+  RIJNDAEL_context *ctx = context;
+  const u128_t *rk = (u128_t *)&ctx->keyschdec;
+  const u128_t *in = (const u128_t *)inbuf_arg;
+  u128_t *out = (u128_t *)outbuf_arg;
+  int rounds = ctx->rounds;
+  ROUND_KEY_VARIABLES;
+  block rkeylast_orig;
+  block in0, in1, in2, in3, in4, in5, in6, in7;
+  block b0, b1, b2, b3, b4, b5, b6, b7;
+  block rkey;
+  block iv, b;
+
+  if (!ctx->decryption_prepared)
+    {
+      internal_aes_ppc_prepare_decryption (ctx);
+      ctx->decryption_prepared = 1;
+    }
+
+  iv = VEC_LOAD_BE (iv_arg, 0, bige_const);
+
+  PRELOAD_ROUND_KEYS (rounds);
+  rkeylast_orig = rkeylast;
+
+  for (; nblocks >= 8; nblocks -= 8)
+    {
+      in0 = VEC_LOAD_BE_NOSWAP (in, 0);
+      in1 = VEC_LOAD_BE_NOSWAP (in, 1);
+      in2 = VEC_LOAD_BE_NOSWAP (in, 2);
+      in3 = VEC_LOAD_BE_NOSWAP (in, 3);
+      in0 = VEC_BE_SWAP (in0, bige_const);
+      in1 = VEC_BE_SWAP (in1, bige_const);
+      in4 = VEC_LOAD_BE_NOSWAP (in, 4);
+      in5 = VEC_LOAD_BE_NOSWAP (in, 5);
+      in2 = VEC_BE_SWAP (in2, bige_const);
+      in3 = VEC_BE_SWAP (in3, bige_const);
+      in6 = VEC_LOAD_BE_NOSWAP (in, 6);
+      in7 = VEC_LOAD_BE_NOSWAP (in, 7);
+      in += 8;
+      b0 = asm_xor (rkey0, in0);
+      b1 = asm_xor (rkey0, in1);
+      in4 = VEC_BE_SWAP (in4, bige_const);
+      in5 = VEC_BE_SWAP (in5, bige_const);
+      b2 = asm_xor (rkey0, in2);
+      b3 = asm_xor (rkey0, in3);
+      in6 = VEC_BE_SWAP (in6, bige_const);
+      in7 = VEC_BE_SWAP (in7, bige_const);
+      b4 = asm_xor (rkey0, in4);
+      b5 = asm_xor (rkey0, in5);
+      b6 = asm_xor (rkey0, in6);
+      b7 = asm_xor (rkey0, in7);
+
+#define DO_ROUND(r) \
+	      rkey = ALIGNED_LOAD (rk, r); \
+	      b0 = asm_ncipher_be (b0, rkey); \
+	      b1 = asm_ncipher_be (b1, rkey); \
+	      b2 = asm_ncipher_be (b2, rkey); \
+	      b3 = asm_ncipher_be (b3, rkey); \
+	      b4 = asm_ncipher_be (b4, rkey); \
+	      b5 = asm_ncipher_be (b5, rkey); \
+	      b6 = asm_ncipher_be (b6, rkey); \
+	      b7 = asm_ncipher_be (b7, rkey);
+
+      DO_ROUND(1);
+      DO_ROUND(2);
+      DO_ROUND(3);
+      DO_ROUND(4);
+      DO_ROUND(5);
+      DO_ROUND(6);
+      DO_ROUND(7);
+      DO_ROUND(8);
+      DO_ROUND(9);
+      if (rounds >= 12)
+	{
+	  DO_ROUND(10);
+	  DO_ROUND(11);
+	  if (rounds > 12)
+	    {
+	      DO_ROUND(12);
+	      DO_ROUND(13);
+	    }
+	}
+
+#undef DO_ROUND
+
+      iv = asm_xor (rkeylast, iv);
+      in0 = asm_xor (rkeylast, in0);
+      in1 = asm_xor (rkeylast, in1);
+      in2 = asm_xor (rkeylast, in2);
+      b0 = asm_ncipherlast_be (b0, iv);
+      iv = in7;
+      b1 = asm_ncipherlast_be (b1, in0);
+      in3 = asm_xor (rkeylast, in3);
+      in4 = asm_xor (rkeylast, in4);
+      b2 = asm_ncipherlast_be (b2, in1);
+      b3 = asm_ncipherlast_be (b3, in2);
+      in5 = asm_xor (rkeylast, in5);
+      in6 = asm_xor (rkeylast, in6);
+      b0 = VEC_BE_SWAP (b0, bige_const);
+      b1 = VEC_BE_SWAP (b1, bige_const);
+      b4 = asm_ncipherlast_be (b4, in3);
+      b5 = asm_ncipherlast_be (b5, in4);
+      b2 = VEC_BE_SWAP (b2, bige_const);
+      b3 = VEC_BE_SWAP (b3, bige_const);
+      b6 = asm_ncipherlast_be (b6, in5);
+      b7 = asm_ncipherlast_be (b7, in6);
+      b4 = VEC_BE_SWAP (b4, bige_const);
+      b5 = VEC_BE_SWAP (b5, bige_const);
+      b6 = VEC_BE_SWAP (b6, bige_const);
+      b7 = VEC_BE_SWAP (b7, bige_const);
+      VEC_STORE_BE_NOSWAP (out, 0, b0);
+      VEC_STORE_BE_NOSWAP (out, 1, b1);
+      VEC_STORE_BE_NOSWAP (out, 2, b2);
+      VEC_STORE_BE_NOSWAP (out, 3, b3);
+      VEC_STORE_BE_NOSWAP (out, 4, b4);
+      VEC_STORE_BE_NOSWAP (out, 5, b5);
+      VEC_STORE_BE_NOSWAP (out, 6, b6);
+      VEC_STORE_BE_NOSWAP (out, 7, b7);
+      out += 8;
+    }
+
+  if (nblocks >= 4)
+    {
+      in0 = VEC_LOAD_BE (in, 0, bige_const);
+      in1 = VEC_LOAD_BE (in, 1, bige_const);
+      in2 = VEC_LOAD_BE (in, 2, bige_const);
+      in3 = VEC_LOAD_BE (in, 3, bige_const);
+
+      b0 = asm_xor (rkey0, in0);
+      b1 = asm_xor (rkey0, in1);
+      b2 = asm_xor (rkey0, in2);
+      b3 = asm_xor (rkey0, in3);
+
+#define DO_ROUND(r) \
+	      rkey = ALIGNED_LOAD (rk, r); \
+	      b0 = asm_ncipher_be (b0, rkey); \
+	      b1 = asm_ncipher_be (b1, rkey); \
+	      b2 = asm_ncipher_be (b2, rkey); \
+	      b3 = asm_ncipher_be (b3, rkey);
+
+      DO_ROUND(1);
+      DO_ROUND(2);
+      DO_ROUND(3);
+      DO_ROUND(4);
+      DO_ROUND(5);
+      DO_ROUND(6);
+      DO_ROUND(7);
+      DO_ROUND(8);
+      DO_ROUND(9);
+      if (rounds >= 12)
+	{
+	  DO_ROUND(10);
+	  DO_ROUND(11);
+	  if (rounds > 12)
+	    {
+	      DO_ROUND(12);
+	      DO_ROUND(13);
+	    }
+	}
+
+#undef DO_ROUND
+
+      iv = asm_xor (rkeylast, iv);
+      in0 = asm_xor (rkeylast, in0);
+      in1 = asm_xor (rkeylast, in1);
+      in2 = asm_xor (rkeylast, in2);
+
+      b0 = asm_ncipherlast_be (b0, iv);
+      iv = in3;
+      b1 = asm_ncipherlast_be (b1, in0);
+      b2 = asm_ncipherlast_be (b2, in1);
+      b3 = asm_ncipherlast_be (b3, in2);
+
+      VEC_STORE_BE (out, 0, b0, bige_const);
+      VEC_STORE_BE (out, 1, b1, bige_const);
+      VEC_STORE_BE (out, 2, b2, bige_const);
+      VEC_STORE_BE (out, 3, b3, bige_const);
+
+      in += 4;
+      out += 4;
+      nblocks -= 4;
+    }
+
+  for (; nblocks; nblocks--)
+    {
+      rkeylast = rkeylast_orig ^ iv;
+
+      iv = VEC_LOAD_BE (in, 0, bige_const);
+      b = iv;
+      AES_DECRYPT (b, rounds);
+
+      VEC_STORE_BE (out, 0, b, bige_const);
+
+      in++;
+      out++;
+    }
+
+  VEC_STORE_BE (iv_arg, 0, iv, bige_const);
+}
+
+
+void CTR_ENC_FUNC (void *context, unsigned char *ctr_arg,
+		   void *outbuf_arg, const void *inbuf_arg,
+		   size_t nblocks)
+{
+  static const unsigned char vec_one_const[16] =
+    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 };
+  const block bige_const = asm_load_be_const();
+  RIJNDAEL_context *ctx = context;
+  const u128_t *rk = (u128_t *)&ctx->keyschenc;
+  const u128_t *in = (const u128_t *)inbuf_arg;
+  u128_t *out = (u128_t *)outbuf_arg;
+  int rounds = ctx->rounds;
+  ROUND_KEY_VARIABLES;
+  block rkeylast_orig;
+  block ctr, b, one;
+
+  ctr = VEC_LOAD_BE (ctr_arg, 0, bige_const);
+  one = VEC_LOAD_BE (&vec_one_const, 0, bige_const);
+
+  PRELOAD_ROUND_KEYS (rounds);
+  rkeylast_orig = rkeylast;
+
+  if (nblocks >= 4)
+    {
+      block in0, in1, in2, in3, in4, in5, in6, in7;
+      block b0, b1, b2, b3, b4, b5, b6, b7;
+      block two, three, four;
+      block rkey;
+
+      two   = asm_add_uint128 (one, one);
+      three = asm_add_uint128 (two, one);
+      four  = asm_add_uint128 (two, two);
+
+      for (; nblocks >= 8; nblocks -= 8)
+	{
+	  b1 = asm_add_uint128 (ctr, one);
+	  b2 = asm_add_uint128 (ctr, two);
+	  b3 = asm_add_uint128 (ctr, three);
+	  b4 = asm_add_uint128 (ctr, four);
+	  b5 = asm_add_uint128 (b1, four);
+	  b6 = asm_add_uint128 (b2, four);
+	  b7 = asm_add_uint128 (b3, four);
+	  b0 = asm_xor (rkey0, ctr);
+	  rkey = ALIGNED_LOAD (rk, 1);
+	  ctr = asm_add_uint128 (b4, four);
+	  b1 = asm_xor (rkey0, b1);
+	  b2 = asm_xor (rkey0, b2);
+	  b3 = asm_xor (rkey0, b3);
+	  b0 = asm_cipher_be (b0, rkey);
+	  b1 = asm_cipher_be (b1, rkey);
+	  b2 = asm_cipher_be (b2, rkey);
+	  b3 = asm_cipher_be (b3, rkey);
+	  b4 = asm_xor (rkey0, b4);
+	  b5 = asm_xor (rkey0, b5);
+	  b6 = asm_xor (rkey0, b6);
+	  b7 = asm_xor (rkey0, b7);
+	  b4 = asm_cipher_be (b4, rkey);
+	  b5 = asm_cipher_be (b5, rkey);
+	  b6 = asm_cipher_be (b6, rkey);
+	  b7 = asm_cipher_be (b7, rkey);
+
+#define DO_ROUND(r) \
+	      rkey = ALIGNED_LOAD (rk, r); \
+	      b0 = asm_cipher_be (b0, rkey); \
+	      b1 = asm_cipher_be (b1, rkey); \
+	      b2 = asm_cipher_be (b2, rkey); \
+	      b3 = asm_cipher_be (b3, rkey); \
+	      b4 = asm_cipher_be (b4, rkey); \
+	      b5 = asm_cipher_be (b5, rkey); \
+	      b6 = asm_cipher_be (b6, rkey); \
+	      b7 = asm_cipher_be (b7, rkey);
+
+	  in0 = VEC_LOAD_BE_NOSWAP (in, 0);
+	  DO_ROUND(2);
+	  in1 = VEC_LOAD_BE_NOSWAP (in, 1);
+	  DO_ROUND(3);
+	  in2 = VEC_LOAD_BE_NOSWAP (in, 2);
+	  DO_ROUND(4);
+	  in3 = VEC_LOAD_BE_NOSWAP (in, 3);
+	  DO_ROUND(5);
+	  in4 = VEC_LOAD_BE_NOSWAP (in, 4);
+	  DO_ROUND(6);
+	  in5 = VEC_LOAD_BE_NOSWAP (in, 5);
+	  DO_ROUND(7);
+	  in6 = VEC_LOAD_BE_NOSWAP (in, 6);
+	  DO_ROUND(8);
+	  in7 = VEC_LOAD_BE_NOSWAP (in, 7);
+	  in += 8;
+	  DO_ROUND(9);
+
+	  if (rounds >= 12)
+	    {
+	      DO_ROUND(10);
+	      DO_ROUND(11);
+	      if (rounds > 12)
+		{
+		  DO_ROUND(12);
+		  DO_ROUND(13);
+		}
+	    }
+
+#undef DO_ROUND
+
+	  in0 = VEC_BE_SWAP (in0, bige_const);
+	  in1 = VEC_BE_SWAP (in1, bige_const);
+	  in2 = VEC_BE_SWAP (in2, bige_const);
+	  in3 = VEC_BE_SWAP (in3, bige_const);
+	  in4 = VEC_BE_SWAP (in4, bige_const);
+	  in5 = VEC_BE_SWAP (in5, bige_const);
+	  in6 = VEC_BE_SWAP (in6, bige_const);
+	  in7 = VEC_BE_SWAP (in7, bige_const);
+
+	  in0 = asm_xor (rkeylast, in0);
+	  in1 = asm_xor (rkeylast, in1);
+	  in2 = asm_xor (rkeylast, in2);
+	  in3 = asm_xor (rkeylast, in3);
+	  b0 = asm_cipherlast_be (b0, in0);
+	  b1 = asm_cipherlast_be (b1, in1);
+	  in4 = asm_xor (rkeylast, in4);
+	  in5 = asm_xor (rkeylast, in5);
+	  b2 = asm_cipherlast_be (b2, in2);
+	  b3 = asm_cipherlast_be (b3, in3);
+	  in6 = asm_xor (rkeylast, in6);
+	  in7 = asm_xor (rkeylast, in7);
+	  b4 = asm_cipherlast_be (b4, in4);
+	  b5 = asm_cipherlast_be (b5, in5);
+	  b6 = asm_cipherlast_be (b6, in6);
+	  b7 = asm_cipherlast_be (b7, in7);
+
+	  b0 = VEC_BE_SWAP (b0, bige_const);
+	  b1 = VEC_BE_SWAP (b1, bige_const);
+	  b2 = VEC_BE_SWAP (b2, bige_const);
+	  b3 = VEC_BE_SWAP (b3, bige_const);
+	  b4 = VEC_BE_SWAP (b4, bige_const);
+	  b5 = VEC_BE_SWAP (b5, bige_const);
+	  b6 = VEC_BE_SWAP (b6, bige_const);
+	  b7 = VEC_BE_SWAP (b7, bige_const);
+	  VEC_STORE_BE_NOSWAP (out, 0, b0);
+	  VEC_STORE_BE_NOSWAP (out, 1, b1);
+	  VEC_STORE_BE_NOSWAP (out, 2, b2);
+	  VEC_STORE_BE_NOSWAP (out, 3, b3);
+	  VEC_STORE_BE_NOSWAP (out, 4, b4);
+	  VEC_STORE_BE_NOSWAP (out, 5, b5);
+	  VEC_STORE_BE_NOSWAP (out, 6, b6);
+	  VEC_STORE_BE_NOSWAP (out, 7, b7);
+	  out += 8;
+	}
+
+      if (nblocks >= 4)
+	{
+	  b1 = asm_add_uint128 (ctr, one);
+	  b2 = asm_add_uint128 (ctr, two);
+	  b3 = asm_add_uint128 (ctr, three);
+	  b0 = asm_xor (rkey0, ctr);
+	  ctr = asm_add_uint128 (ctr, four);
+	  b1 = asm_xor (rkey0, b1);
+	  b2 = asm_xor (rkey0, b2);
+	  b3 = asm_xor (rkey0, b3);
+
+#define DO_ROUND(r) \
+	      rkey = ALIGNED_LOAD (rk, r); \
+	      b0 = asm_cipher_be (b0, rkey); \
+	      b1 = asm_cipher_be (b1, rkey); \
+	      b2 = asm_cipher_be (b2, rkey); \
+	      b3 = asm_cipher_be (b3, rkey);
+
+	  DO_ROUND(1);
+	  DO_ROUND(2);
+	  DO_ROUND(3);
+	  DO_ROUND(4);
+	  DO_ROUND(5);
+	  DO_ROUND(6);
+	  DO_ROUND(7);
+	  DO_ROUND(8);
+
+	  in0 = VEC_LOAD_BE (in, 0, bige_const);
+	  in1 = VEC_LOAD_BE (in, 1, bige_const);
+	  in2 = VEC_LOAD_BE (in, 2, bige_const);
+	  in3 = VEC_LOAD_BE (in, 3, bige_const);
+
+	  DO_ROUND(9);
+	  if (rounds >= 12)
+	    {
+	      DO_ROUND(10);
+	      DO_ROUND(11);
+	      if (rounds > 12)
+		{
+		  DO_ROUND(12);
+		  DO_ROUND(13);
+		}
+	    }
+
+#undef DO_ROUND
+
+	  in0 = asm_xor (rkeylast, in0);
+	  in1 = asm_xor (rkeylast, in1);
+	  in2 = asm_xor (rkeylast, in2);
+	  in3 = asm_xor (rkeylast, in3);
+
+	  b0 = asm_cipherlast_be (b0, in0);
+	  b1 = asm_cipherlast_be (b1, in1);
+	  b2 = asm_cipherlast_be (b2, in2);
+	  b3 = asm_cipherlast_be (b3, in3);
+
+	  VEC_STORE_BE (out, 0, b0, bige_const);
+	  VEC_STORE_BE (out, 1, b1, bige_const);
+	  VEC_STORE_BE (out, 2, b2, bige_const);
+	  VEC_STORE_BE (out, 3, b3, bige_const);
+
+	  in += 4;
+	  out += 4;
+	  nblocks -= 4;
+	}
+    }
+
+  for (; nblocks; nblocks--)
+    {
+      b = ctr;
+      ctr = asm_add_uint128 (ctr, one);
+      rkeylast = rkeylast_orig ^ VEC_LOAD_BE (in, 0, bige_const);
+
+      AES_ENCRYPT (b, rounds);
+
+      VEC_STORE_BE (out, 0, b, bige_const);
+
+      out++;
+      in++;
+    }
+
+  VEC_STORE_BE (ctr_arg, 0, ctr, bige_const);
+}
+
+
+size_t OCB_CRYPT_FUNC (gcry_cipher_hd_t c, void *outbuf_arg,
+		       const void *inbuf_arg, size_t nblocks,
+		       int encrypt)
+{
+  const block bige_const = asm_load_be_const();
+  RIJNDAEL_context *ctx = (void *)&c->context.c;
+  const u128_t *in = (const u128_t *)inbuf_arg;
+  u128_t *out = (u128_t *)outbuf_arg;
+  int rounds = ctx->rounds;
+  u64 data_nblocks = c->u_mode.ocb.data_nblocks;
+  block l0, l1, l2, l;
+  block b0, b1, b2, b3, b4, b5, b6, b7, b;
+  block iv0, iv1, iv2, iv3, iv4, iv5, iv6, iv7;
+  block rkey, rkeylf;
+  block ctr, iv;
+  ROUND_KEY_VARIABLES;
+
+  iv = VEC_LOAD_BE (c->u_iv.iv, 0, bige_const);
+  ctr = VEC_LOAD_BE (c->u_ctr.ctr, 0, bige_const);
+
+  l0 = VEC_LOAD_BE (c->u_mode.ocb.L[0], 0, bige_const);
+  l1 = VEC_LOAD_BE (c->u_mode.ocb.L[1], 0, bige_const);
+  l2 = VEC_LOAD_BE (c->u_mode.ocb.L[2], 0, bige_const);
+
+  if (encrypt)
+    {
+      const u128_t *rk = (u128_t *)&ctx->keyschenc;
+
+      PRELOAD_ROUND_KEYS (rounds);
+
+      for (; nblocks >= 8 && data_nblocks % 8; nblocks--)
+	{
+	  l = VEC_LOAD_BE (ocb_get_l (c, ++data_nblocks), 0, bige_const);
+	  b = VEC_LOAD_BE (in, 0, bige_const);
+
+	  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	  iv ^= l;
+	  /* Checksum_i = Checksum_{i-1} xor P_i  */
+	  ctr ^= b;
+	  /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
+	  b ^= iv;
+	  AES_ENCRYPT (b, rounds);
+	  b ^= iv;
+
+	  VEC_STORE_BE (out, 0, b, bige_const);
+
+	  in += 1;
+	  out += 1;
+	}
+
+      for (; nblocks >= 8; nblocks -= 8)
+	{
+	  b0 = VEC_LOAD_BE_NOSWAP (in, 0);
+	  b1 = VEC_LOAD_BE_NOSWAP (in, 1);
+	  b2 = VEC_LOAD_BE_NOSWAP (in, 2);
+	  b3 = VEC_LOAD_BE_NOSWAP (in, 3);
+	  b4 = VEC_LOAD_BE_NOSWAP (in, 4);
+	  b5 = VEC_LOAD_BE_NOSWAP (in, 5);
+	  b6 = VEC_LOAD_BE_NOSWAP (in, 6);
+	  b7 = VEC_LOAD_BE_NOSWAP (in, 7);
+	  in += 8;
+	  l = VEC_LOAD_BE_NOSWAP (ocb_get_l (c, data_nblocks += 8), 0);
+	  b0 = VEC_BE_SWAP(b0, bige_const);
+	  b1 = VEC_BE_SWAP(b1, bige_const);
+	  b2 = VEC_BE_SWAP(b2, bige_const);
+	  b3 = VEC_BE_SWAP(b3, bige_const);
+	  b4 = VEC_BE_SWAP(b4, bige_const);
+	  b5 = VEC_BE_SWAP(b5, bige_const);
+	  b6 = VEC_BE_SWAP(b6, bige_const);
+	  b7 = VEC_BE_SWAP(b7, bige_const);
+	  l = VEC_BE_SWAP(l, bige_const);
+
+	  ctr ^= b0 ^ b1 ^ b2 ^ b3 ^ b4 ^ b5 ^ b6 ^ b7;
+
+	  iv ^= rkey0;
+
+	  iv0 = iv ^ l0;
+	  iv1 = iv ^ l0 ^ l1;
+	  iv2 = iv ^ l1;
+	  iv3 = iv ^ l1 ^ l2;
+	  iv4 = iv ^ l1 ^ l2 ^ l0;
+	  iv5 = iv ^ l2 ^ l0;
+	  iv6 = iv ^ l2;
+	  iv7 = iv ^ l2 ^ l;
+
+	  b0 ^= iv0;
+	  b1 ^= iv1;
+	  b2 ^= iv2;
+	  b3 ^= iv3;
+	  b4 ^= iv4;
+	  b5 ^= iv5;
+	  b6 ^= iv6;
+	  b7 ^= iv7;
+	  iv = iv7 ^ rkey0;
+
+#define DO_ROUND(r) \
+	      rkey = ALIGNED_LOAD (rk, r); \
+	      b0 = asm_cipher_be (b0, rkey); \
+	      b1 = asm_cipher_be (b1, rkey); \
+	      b2 = asm_cipher_be (b2, rkey); \
+	      b3 = asm_cipher_be (b3, rkey); \
+	      b4 = asm_cipher_be (b4, rkey); \
+	      b5 = asm_cipher_be (b5, rkey); \
+	      b6 = asm_cipher_be (b6, rkey); \
+	      b7 = asm_cipher_be (b7, rkey);
+
+	  DO_ROUND(1);
+	  DO_ROUND(2);
+	  DO_ROUND(3);
+	  DO_ROUND(4);
+	  DO_ROUND(5);
+	  DO_ROUND(6);
+	  DO_ROUND(7);
+
+	  rkeylf = asm_xor (rkeylast, rkey0);
+
+	  DO_ROUND(8);
+
+	  iv0 = asm_xor (rkeylf, iv0);
+	  iv1 = asm_xor (rkeylf, iv1);
+	  iv2 = asm_xor (rkeylf, iv2);
+	  iv3 = asm_xor (rkeylf, iv3);
+	  iv4 = asm_xor (rkeylf, iv4);
+	  iv5 = asm_xor (rkeylf, iv5);
+	  iv6 = asm_xor (rkeylf, iv6);
+	  iv7 = asm_xor (rkeylf, iv7);
+
+	  DO_ROUND(9);
+	  if (rounds >= 12)
+	    {
+	      DO_ROUND(10);
+	      DO_ROUND(11);
+	      if (rounds > 12)
+		{
+		  DO_ROUND(12);
+		  DO_ROUND(13);
+		}
+	    }
+
+#undef DO_ROUND
+
+	  b0 = asm_cipherlast_be (b0, iv0);
+	  b1 = asm_cipherlast_be (b1, iv1);
+	  b2 = asm_cipherlast_be (b2, iv2);
+	  b3 = asm_cipherlast_be (b3, iv3);
+	  b4 = asm_cipherlast_be (b4, iv4);
+	  b5 = asm_cipherlast_be (b5, iv5);
+	  b6 = asm_cipherlast_be (b6, iv6);
+	  b7 = asm_cipherlast_be (b7, iv7);
+
+	  b0 = VEC_BE_SWAP (b0, bige_const);
+	  b1 = VEC_BE_SWAP (b1, bige_const);
+	  b2 = VEC_BE_SWAP (b2, bige_const);
+	  b3 = VEC_BE_SWAP (b3, bige_const);
+	  b4 = VEC_BE_SWAP (b4, bige_const);
+	  b5 = VEC_BE_SWAP (b5, bige_const);
+	  b6 = VEC_BE_SWAP (b6, bige_const);
+	  b7 = VEC_BE_SWAP (b7, bige_const);
+	  VEC_STORE_BE_NOSWAP (out, 0, b0);
+	  VEC_STORE_BE_NOSWAP (out, 1, b1);
+	  VEC_STORE_BE_NOSWAP (out, 2, b2);
+	  VEC_STORE_BE_NOSWAP (out, 3, b3);
+	  VEC_STORE_BE_NOSWAP (out, 4, b4);
+	  VEC_STORE_BE_NOSWAP (out, 5, b5);
+	  VEC_STORE_BE_NOSWAP (out, 6, b6);
+	  VEC_STORE_BE_NOSWAP (out, 7, b7);
+	  out += 8;
+	}
+
+      if (nblocks >= 4 && (data_nblocks % 4) == 0)
+	{
+	  b0 = VEC_LOAD_BE (in, 0, bige_const);
+	  b1 = VEC_LOAD_BE (in, 1, bige_const);
+	  b2 = VEC_LOAD_BE (in, 2, bige_const);
+	  b3 = VEC_LOAD_BE (in, 3, bige_const);
+
+	  l = VEC_LOAD_BE (ocb_get_l (c, data_nblocks += 4), 0, bige_const);
+
+	  ctr ^= b0 ^ b1 ^ b2 ^ b3;
+
+	  iv ^= rkey0;
+
+	  iv0 = iv ^ l0;
+	  iv1 = iv ^ l0 ^ l1;
+	  iv2 = iv ^ l1;
+	  iv3 = iv ^ l1 ^ l;
+
+	  b0 ^= iv0;
+	  b1 ^= iv1;
+	  b2 ^= iv2;
+	  b3 ^= iv3;
+	  iv = iv3 ^ rkey0;
+
+#define DO_ROUND(r) \
+	      rkey = ALIGNED_LOAD (rk, r); \
+	      b0 = asm_cipher_be (b0, rkey); \
+	      b1 = asm_cipher_be (b1, rkey); \
+	      b2 = asm_cipher_be (b2, rkey); \
+	      b3 = asm_cipher_be (b3, rkey);
+
+	  DO_ROUND(1);
+	  DO_ROUND(2);
+	  DO_ROUND(3);
+	  DO_ROUND(4);
+	  DO_ROUND(5);
+	  DO_ROUND(6);
+	  DO_ROUND(7);
+	  DO_ROUND(8);
+	  DO_ROUND(9);
+	  if (rounds >= 12)
+	    {
+	      DO_ROUND(10);
+	      DO_ROUND(11);
+	      if (rounds > 12)
+		{
+		  DO_ROUND(12);
+		  DO_ROUND(13);
+		}
+	    }
+
+#undef DO_ROUND
+
+	  rkey = rkeylast ^ rkey0;
+	  b0 = asm_cipherlast_be (b0, rkey ^ iv0);
+	  b1 = asm_cipherlast_be (b1, rkey ^ iv1);
+	  b2 = asm_cipherlast_be (b2, rkey ^ iv2);
+	  b3 = asm_cipherlast_be (b3, rkey ^ iv3);
+
+	  VEC_STORE_BE (out, 0, b0, bige_const);
+	  VEC_STORE_BE (out, 1, b1, bige_const);
+	  VEC_STORE_BE (out, 2, b2, bige_const);
+	  VEC_STORE_BE (out, 3, b3, bige_const);
+
+	  in += 4;
+	  out += 4;
+	  nblocks -= 4;
+	}
+
+      for (; nblocks; nblocks--)
+	{
+	  l = VEC_LOAD_BE (ocb_get_l (c, ++data_nblocks), 0, bige_const);
+	  b = VEC_LOAD_BE (in, 0, bige_const);
+
+	  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	  iv ^= l;
+	  /* Checksum_i = Checksum_{i-1} xor P_i  */
+	  ctr ^= b;
+	  /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
+	  b ^= iv;
+	  AES_ENCRYPT (b, rounds);
+	  b ^= iv;
+
+	  VEC_STORE_BE (out, 0, b, bige_const);
+
+	  in += 1;
+	  out += 1;
+	}
+    }
+  else
+    {
+      const u128_t *rk = (u128_t *)&ctx->keyschdec;
+
+      if (!ctx->decryption_prepared)
+	{
+	  internal_aes_ppc_prepare_decryption (ctx);
+	  ctx->decryption_prepared = 1;
+	}
+
+      PRELOAD_ROUND_KEYS (rounds);
+
+      for (; nblocks >= 8 && data_nblocks % 8; nblocks--)
+	{
+	  l = VEC_LOAD_BE (ocb_get_l (c, ++data_nblocks), 0, bige_const);
+	  b = VEC_LOAD_BE (in, 0, bige_const);
+
+	  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	  iv ^= l;
+	  /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i)  */
+	  b ^= iv;
+	  AES_DECRYPT (b, rounds);
+	  b ^= iv;
+	  /* Checksum_i = Checksum_{i-1} xor P_i  */
+	  ctr ^= b;
+
+	  VEC_STORE_BE (out, 0, b, bige_const);
+
+	  in += 1;
+	  out += 1;
+	}
+
+      for (; nblocks >= 8; nblocks -= 8)
+	{
+	  b0 = VEC_LOAD_BE_NOSWAP (in, 0);
+	  b1 = VEC_LOAD_BE_NOSWAP (in, 1);
+	  b2 = VEC_LOAD_BE_NOSWAP (in, 2);
+	  b3 = VEC_LOAD_BE_NOSWAP (in, 3);
+	  b4 = VEC_LOAD_BE_NOSWAP (in, 4);
+	  b5 = VEC_LOAD_BE_NOSWAP (in, 5);
+	  b6 = VEC_LOAD_BE_NOSWAP (in, 6);
+	  b7 = VEC_LOAD_BE_NOSWAP (in, 7);
+	  in += 8;
+	  l = VEC_LOAD_BE_NOSWAP (ocb_get_l (c, data_nblocks += 8), 0);
+	  b0 = VEC_BE_SWAP(b0, bige_const);
+	  b1 = VEC_BE_SWAP(b1, bige_const);
+	  b2 = VEC_BE_SWAP(b2, bige_const);
+	  b3 = VEC_BE_SWAP(b3, bige_const);
+	  b4 = VEC_BE_SWAP(b4, bige_const);
+	  b5 = VEC_BE_SWAP(b5, bige_const);
+	  b6 = VEC_BE_SWAP(b6, bige_const);
+	  b7 = VEC_BE_SWAP(b7, bige_const);
+	  l = VEC_BE_SWAP(l, bige_const);
+
+	  iv ^= rkey0;
+
+	  iv0 = iv ^ l0;
+	  iv1 = iv ^ l0 ^ l1;
+	  iv2 = iv ^ l1;
+	  iv3 = iv ^ l1 ^ l2;
+	  iv4 = iv ^ l1 ^ l2 ^ l0;
+	  iv5 = iv ^ l2 ^ l0;
+	  iv6 = iv ^ l2;
+	  iv7 = iv ^ l2 ^ l;
+
+	  b0 ^= iv0;
+	  b1 ^= iv1;
+	  b2 ^= iv2;
+	  b3 ^= iv3;
+	  b4 ^= iv4;
+	  b5 ^= iv5;
+	  b6 ^= iv6;
+	  b7 ^= iv7;
+	  iv = iv7 ^ rkey0;
+
+#define DO_ROUND(r) \
+	      rkey = ALIGNED_LOAD (rk, r); \
+	      b0 = asm_ncipher_be (b0, rkey); \
+	      b1 = asm_ncipher_be (b1, rkey); \
+	      b2 = asm_ncipher_be (b2, rkey); \
+	      b3 = asm_ncipher_be (b3, rkey); \
+	      b4 = asm_ncipher_be (b4, rkey); \
+	      b5 = asm_ncipher_be (b5, rkey); \
+	      b6 = asm_ncipher_be (b6, rkey); \
+	      b7 = asm_ncipher_be (b7, rkey);
+
+	  DO_ROUND(1);
+	  DO_ROUND(2);
+	  DO_ROUND(3);
+	  DO_ROUND(4);
+	  DO_ROUND(5);
+	  DO_ROUND(6);
+	  DO_ROUND(7);
+
+	  rkeylf = asm_xor (rkeylast, rkey0);
+
+	  DO_ROUND(8);
+
+	  iv0 = asm_xor (rkeylf, iv0);
+	  iv1 = asm_xor (rkeylf, iv1);
+	  iv2 = asm_xor (rkeylf, iv2);
+	  iv3 = asm_xor (rkeylf, iv3);
+	  iv4 = asm_xor (rkeylf, iv4);
+	  iv5 = asm_xor (rkeylf, iv5);
+	  iv6 = asm_xor (rkeylf, iv6);
+	  iv7 = asm_xor (rkeylf, iv7);
+
+	  DO_ROUND(9);
+	  if (rounds >= 12)
+	    {
+	      DO_ROUND(10);
+	      DO_ROUND(11);
+	      if (rounds > 12)
+		{
+		  DO_ROUND(12);
+		  DO_ROUND(13);
+		}
+	    }
+
+#undef DO_ROUND
+
+	  b0 = asm_ncipherlast_be (b0, iv0);
+	  b1 = asm_ncipherlast_be (b1, iv1);
+	  b2 = asm_ncipherlast_be (b2, iv2);
+	  b3 = asm_ncipherlast_be (b3, iv3);
+	  b4 = asm_ncipherlast_be (b4, iv4);
+	  b5 = asm_ncipherlast_be (b5, iv5);
+	  b6 = asm_ncipherlast_be (b6, iv6);
+	  b7 = asm_ncipherlast_be (b7, iv7);
+
+	  ctr ^= b0 ^ b1 ^ b2 ^ b3 ^ b4 ^ b5 ^ b6 ^ b7;
+
+	  b0 = VEC_BE_SWAP (b0, bige_const);
+	  b1 = VEC_BE_SWAP (b1, bige_const);
+	  b2 = VEC_BE_SWAP (b2, bige_const);
+	  b3 = VEC_BE_SWAP (b3, bige_const);
+	  b4 = VEC_BE_SWAP (b4, bige_const);
+	  b5 = VEC_BE_SWAP (b5, bige_const);
+	  b6 = VEC_BE_SWAP (b6, bige_const);
+	  b7 = VEC_BE_SWAP (b7, bige_const);
+	  VEC_STORE_BE_NOSWAP (out, 0, b0);
+	  VEC_STORE_BE_NOSWAP (out, 1, b1);
+	  VEC_STORE_BE_NOSWAP (out, 2, b2);
+	  VEC_STORE_BE_NOSWAP (out, 3, b3);
+	  VEC_STORE_BE_NOSWAP (out, 4, b4);
+	  VEC_STORE_BE_NOSWAP (out, 5, b5);
+	  VEC_STORE_BE_NOSWAP (out, 6, b6);
+	  VEC_STORE_BE_NOSWAP (out, 7, b7);
+	  out += 8;
+	}
+
+      if (nblocks >= 4 && (data_nblocks % 4) == 0)
+	{
+	  b0 = VEC_LOAD_BE (in, 0, bige_const);
+	  b1 = VEC_LOAD_BE (in, 1, bige_const);
+	  b2 = VEC_LOAD_BE (in, 2, bige_const);
+	  b3 = VEC_LOAD_BE (in, 3, bige_const);
+
+	  l = VEC_LOAD_BE (ocb_get_l (c, data_nblocks += 4), 0, bige_const);
+
+	  iv ^= rkey0;
+
+	  iv0 = iv ^ l0;
+	  iv1 = iv ^ l0 ^ l1;
+	  iv2 = iv ^ l1;
+	  iv3 = iv ^ l1 ^ l;
+
+	  b0 ^= iv0;
+	  b1 ^= iv1;
+	  b2 ^= iv2;
+	  b3 ^= iv3;
+	  iv = iv3 ^ rkey0;
+
+#define DO_ROUND(r) \
+	      rkey = ALIGNED_LOAD (rk, r); \
+	      b0 = asm_ncipher_be (b0, rkey); \
+	      b1 = asm_ncipher_be (b1, rkey); \
+	      b2 = asm_ncipher_be (b2, rkey); \
+	      b3 = asm_ncipher_be (b3, rkey);
+
+	  DO_ROUND(1);
+	  DO_ROUND(2);
+	  DO_ROUND(3);
+	  DO_ROUND(4);
+	  DO_ROUND(5);
+	  DO_ROUND(6);
+	  DO_ROUND(7);
+	  DO_ROUND(8);
+	  DO_ROUND(9);
+	  if (rounds >= 12)
+	    {
+	      DO_ROUND(10);
+	      DO_ROUND(11);
+	      if (rounds > 12)
+		{
+		  DO_ROUND(12);
+		  DO_ROUND(13);
+		}
+	    }
+
+#undef DO_ROUND
+
+	  rkey = rkeylast ^ rkey0;
+	  b0 = asm_ncipherlast_be (b0, rkey ^ iv0);
+	  b1 = asm_ncipherlast_be (b1, rkey ^ iv1);
+	  b2 = asm_ncipherlast_be (b2, rkey ^ iv2);
+	  b3 = asm_ncipherlast_be (b3, rkey ^ iv3);
+
+	  VEC_STORE_BE (out, 0, b0, bige_const);
+	  VEC_STORE_BE (out, 1, b1, bige_const);
+	  VEC_STORE_BE (out, 2, b2, bige_const);
+	  VEC_STORE_BE (out, 3, b3, bige_const);
+
+	  ctr ^= b0 ^ b1 ^ b2 ^ b3;
+
+	  in += 4;
+	  out += 4;
+	  nblocks -= 4;
+	}
+
+      for (; nblocks; nblocks--)
+	{
+	  l = VEC_LOAD_BE (ocb_get_l (c, ++data_nblocks), 0, bige_const);
+	  b = VEC_LOAD_BE (in, 0, bige_const);
+
+	  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	  iv ^= l;
+	  /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i)  */
+	  b ^= iv;
+	  AES_DECRYPT (b, rounds);
+	  b ^= iv;
+	  /* Checksum_i = Checksum_{i-1} xor P_i  */
+	  ctr ^= b;
+
+	  VEC_STORE_BE (out, 0, b, bige_const);
+
+	  in += 1;
+	  out += 1;
+	}
+    }
+
+  VEC_STORE_BE (c->u_iv.iv, 0, iv, bige_const);
+  VEC_STORE_BE (c->u_ctr.ctr, 0, ctr, bige_const);
+  c->u_mode.ocb.data_nblocks = data_nblocks;
+
+  return 0;
+}
+
+size_t OCB_AUTH_FUNC (gcry_cipher_hd_t c, void *abuf_arg, size_t nblocks)
+{
+  const block bige_const = asm_load_be_const();
+  RIJNDAEL_context *ctx = (void *)&c->context.c;
+  const u128_t *rk = (u128_t *)&ctx->keyschenc;
+  const u128_t *abuf = (const u128_t *)abuf_arg;
+  int rounds = ctx->rounds;
+  u64 data_nblocks = c->u_mode.ocb.aad_nblocks;
+  block l0, l1, l2, l;
+  block b0, b1, b2, b3, b4, b5, b6, b7, b;
+  block iv0, iv1, iv2, iv3, iv4, iv5, iv6, iv7;
+  block rkey, frkey;
+  block ctr, iv;
+  ROUND_KEY_VARIABLES;
+
+  iv = VEC_LOAD_BE (c->u_mode.ocb.aad_offset, 0, bige_const);
+  ctr = VEC_LOAD_BE (c->u_mode.ocb.aad_sum, 0, bige_const);
+
+  l0 = VEC_LOAD_BE (c->u_mode.ocb.L[0], 0, bige_const);
+  l1 = VEC_LOAD_BE (c->u_mode.ocb.L[1], 0, bige_const);
+  l2 = VEC_LOAD_BE (c->u_mode.ocb.L[2], 0, bige_const);
+
+  PRELOAD_ROUND_KEYS (rounds);
+
+  for (; nblocks >= 8 && data_nblocks % 8; nblocks--)
+    {
+      l = VEC_LOAD_BE (ocb_get_l (c, ++data_nblocks), 0, bige_const);
+      b = VEC_LOAD_BE (abuf, 0, bige_const);
+
+      /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+      iv ^= l;
+      /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i)  */
+      b ^= iv;
+      AES_ENCRYPT (b, rounds);
+      ctr ^= b;
+
+      abuf += 1;
+    }
+
+  for (; nblocks >= 8; nblocks -= 8)
+    {
+      b0 = VEC_LOAD_BE (abuf, 0, bige_const);
+      b1 = VEC_LOAD_BE (abuf, 1, bige_const);
+      b2 = VEC_LOAD_BE (abuf, 2, bige_const);
+      b3 = VEC_LOAD_BE (abuf, 3, bige_const);
+      b4 = VEC_LOAD_BE (abuf, 4, bige_const);
+      b5 = VEC_LOAD_BE (abuf, 5, bige_const);
+      b6 = VEC_LOAD_BE (abuf, 6, bige_const);
+      b7 = VEC_LOAD_BE (abuf, 7, bige_const);
+
+      l = VEC_LOAD_BE (ocb_get_l (c, data_nblocks += 8), 0, bige_const);
+
+      frkey = rkey0;
+      iv ^= frkey;
+
+      iv0 = iv ^ l0;
+      iv1 = iv ^ l0 ^ l1;
+      iv2 = iv ^ l1;
+      iv3 = iv ^ l1 ^ l2;
+      iv4 = iv ^ l1 ^ l2 ^ l0;
+      iv5 = iv ^ l2 ^ l0;
+      iv6 = iv ^ l2;
+      iv7 = iv ^ l2 ^ l;
+
+      b0 ^= iv0;
+      b1 ^= iv1;
+      b2 ^= iv2;
+      b3 ^= iv3;
+      b4 ^= iv4;
+      b5 ^= iv5;
+      b6 ^= iv6;
+      b7 ^= iv7;
+      iv = iv7 ^ frkey;
+
+#define DO_ROUND(r) \
+	      rkey = ALIGNED_LOAD (rk, r); \
+	      b0 = asm_cipher_be (b0, rkey); \
+	      b1 = asm_cipher_be (b1, rkey); \
+	      b2 = asm_cipher_be (b2, rkey); \
+	      b3 = asm_cipher_be (b3, rkey); \
+	      b4 = asm_cipher_be (b4, rkey); \
+	      b5 = asm_cipher_be (b5, rkey); \
+	      b6 = asm_cipher_be (b6, rkey); \
+	      b7 = asm_cipher_be (b7, rkey);
+
+      DO_ROUND(1);
+      DO_ROUND(2);
+      DO_ROUND(3);
+      DO_ROUND(4);
+      DO_ROUND(5);
+      DO_ROUND(6);
+      DO_ROUND(7);
+      DO_ROUND(8);
+      DO_ROUND(9);
+      if (rounds >= 12)
+	{
+	  DO_ROUND(10);
+	  DO_ROUND(11);
+	  if (rounds > 12)
+	    {
+	      DO_ROUND(12);
+	      DO_ROUND(13);
+	    }
+	}
+
+#undef DO_ROUND
+
+      rkey = rkeylast;
+      b0 = asm_cipherlast_be (b0, rkey);
+      b1 = asm_cipherlast_be (b1, rkey);
+      b2 = asm_cipherlast_be (b2, rkey);
+      b3 = asm_cipherlast_be (b3, rkey);
+      b4 = asm_cipherlast_be (b4, rkey);
+      b5 = asm_cipherlast_be (b5, rkey);
+      b6 = asm_cipherlast_be (b6, rkey);
+      b7 = asm_cipherlast_be (b7, rkey);
+
+      ctr ^= b0 ^ b1 ^ b2 ^ b3 ^ b4 ^ b5 ^ b6 ^ b7;
+
+      abuf += 8;
+    }
+
+  if (nblocks >= 4 && (data_nblocks % 4) == 0)
+    {
+      b0 = VEC_LOAD_BE (abuf, 0, bige_const);
+      b1 = VEC_LOAD_BE (abuf, 1, bige_const);
+      b2 = VEC_LOAD_BE (abuf, 2, bige_const);
+      b3 = VEC_LOAD_BE (abuf, 3, bige_const);
+
+      l = VEC_LOAD_BE (ocb_get_l (c, data_nblocks += 4), 0, bige_const);
+
+      frkey = rkey0;
+      iv ^= frkey;
+
+      iv0 = iv ^ l0;
+      iv1 = iv ^ l0 ^ l1;
+      iv2 = iv ^ l1;
+      iv3 = iv ^ l1 ^ l;
+
+      b0 ^= iv0;
+      b1 ^= iv1;
+      b2 ^= iv2;
+      b3 ^= iv3;
+      iv = iv3 ^ frkey;
+
+#define DO_ROUND(r) \
+	      rkey = ALIGNED_LOAD (rk, r); \
+	      b0 = asm_cipher_be (b0, rkey); \
+	      b1 = asm_cipher_be (b1, rkey); \
+	      b2 = asm_cipher_be (b2, rkey); \
+	      b3 = asm_cipher_be (b3, rkey);
+
+      DO_ROUND(1);
+      DO_ROUND(2);
+      DO_ROUND(3);
+      DO_ROUND(4);
+      DO_ROUND(5);
+      DO_ROUND(6);
+      DO_ROUND(7);
+      DO_ROUND(8);
+      DO_ROUND(9);
+      if (rounds >= 12)
+	{
+	  DO_ROUND(10);
+	  DO_ROUND(11);
+	  if (rounds > 12)
+	    {
+	      DO_ROUND(12);
+	      DO_ROUND(13);
+	    }
+	}
+
+#undef DO_ROUND
+
+      rkey = rkeylast;
+      b0 = asm_cipherlast_be (b0, rkey);
+      b1 = asm_cipherlast_be (b1, rkey);
+      b2 = asm_cipherlast_be (b2, rkey);
+      b3 = asm_cipherlast_be (b3, rkey);
+
+      ctr ^= b0 ^ b1 ^ b2 ^ b3;
+
+      abuf += 4;
+      nblocks -= 4;
+    }
+
+  for (; nblocks; nblocks--)
+    {
+      l = VEC_LOAD_BE (ocb_get_l (c, ++data_nblocks), 0, bige_const);
+      b = VEC_LOAD_BE (abuf, 0, bige_const);
+
+      /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+      iv ^= l;
+      /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i)  */
+      b ^= iv;
+      AES_ENCRYPT (b, rounds);
+      ctr ^= b;
+
+      abuf += 1;
+    }
+
+  VEC_STORE_BE (c->u_mode.ocb.aad_offset, 0, iv, bige_const);
+  VEC_STORE_BE (c->u_mode.ocb.aad_sum, 0, ctr, bige_const);
+  c->u_mode.ocb.aad_nblocks = data_nblocks;
+
+  return 0;
+}
+
+
+void XTS_CRYPT_FUNC (void *context, unsigned char *tweak_arg,
+		     void *outbuf_arg, const void *inbuf_arg,
+		     size_t nblocks, int encrypt)
+{
+#ifdef WORDS_BIGENDIAN
+  static const block vec_bswap128_const =
+    { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+#else
+  static const block vec_bswap128_const =
+    { ~15, ~14, ~13, ~12, ~11, ~10, ~9, ~8, ~7, ~6, ~5, ~4, ~3, ~2, ~1, ~0 };
+#endif
+  static const unsigned char vec_tweak_const[16] =
+    { 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0x87 };
+  static const vector unsigned long long vec_shift63_const =
+    { 63, 63 };
+  const block bige_const = asm_load_be_const();
+  RIJNDAEL_context *ctx = context;
+  const u128_t *in = (const u128_t *)inbuf_arg;
+  u128_t *out = (u128_t *)outbuf_arg;
+  int rounds = ctx->rounds;
+  block tweak;
+  block b0, b1, b2, b3, b4, b5, b6, b7, b, rkey, rkeylf;
+  block tweak0, tweak1, tweak2, tweak3, tweak4, tweak5, tweak6, tweak7;
+  block tweak_const, bswap128_const, shift63_const;
+  ROUND_KEY_VARIABLES;
+
+  tweak_const = VEC_LOAD_BE (&vec_tweak_const, 0, bige_const);
+  bswap128_const = ALIGNED_LOAD (&vec_bswap128_const, 0);
+  shift63_const = ALIGNED_LOAD (&vec_shift63_const, 0);
+
+  tweak = VEC_LOAD_BE (tweak_arg, 0, bige_const);
+  tweak = asm_vperm1 (tweak, bswap128_const);
+
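+  /* GEN_TWEAK computes the next XTS tweak (multiplication by 'x' in
+   * GF(2^128)) without a vperm: asm_add_uint64 doubles both 64-bit halves,
+   * while the sign-propagated top bits of the swapped halves, masked with
+   * vec_tweak_const, xor in the inter-half carry and the 0x87 reduction.
+   * The tweak is kept in this byte-reversed form between blocks and
+   * permuted back with bswap128_const before use.  */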
+#define GEN_TWEAK(tout, tin) /* Generate next tweak. */ \
+    do { \
+      block tmp1, tmp2; \
+      tmp1 = asm_swap_uint64_halfs(tin); \
+      tmp2 = asm_add_uint64(tin, tin); \
+      tmp1 = asm_sra_int64(tmp1, shift63_const) & tweak_const; \
+      tout = asm_xor(tmp1, tmp2); \
+    } while (0)
+
+  if (encrypt)
+    {
+      const u128_t *rk = (u128_t *)&ctx->keyschenc;
+
+      PRELOAD_ROUND_KEYS (rounds);
+
+      for (; nblocks >= 8; nblocks -= 8)
+	{
+	  b0 = VEC_LOAD_BE_NOSWAP (in, 0);
+	  b1 = VEC_LOAD_BE_NOSWAP (in, 1);
+	  b2 = VEC_LOAD_BE_NOSWAP (in, 2);
+	  b3 = VEC_LOAD_BE_NOSWAP (in, 3);
+	  tweak0 = tweak;
+	  GEN_TWEAK (tweak1, tweak0);
+	  tweak0 = asm_vperm1 (tweak0, bswap128_const);
+	  b4 = VEC_LOAD_BE_NOSWAP (in, 4);
+	  b5 = VEC_LOAD_BE_NOSWAP (in, 5);
+	  GEN_TWEAK (tweak2, tweak1);
+	  tweak1 = asm_vperm1 (tweak1, bswap128_const);
+	  b6 = VEC_LOAD_BE_NOSWAP (in, 6);
+	  b7 = VEC_LOAD_BE_NOSWAP (in, 7);
+	  in += 8;
+
+	  b0 = VEC_BE_SWAP(b0, bige_const);
+	  b1 = VEC_BE_SWAP(b1, bige_const);
+	  GEN_TWEAK (tweak3, tweak2);
+	  tweak2 = asm_vperm1 (tweak2, bswap128_const);
+	  GEN_TWEAK (tweak4, tweak3);
+	  tweak3 = asm_vperm1 (tweak3, bswap128_const);
+	  b2 = VEC_BE_SWAP(b2, bige_const);
+	  b3 = VEC_BE_SWAP(b3, bige_const);
+	  GEN_TWEAK (tweak5, tweak4);
+	  tweak4 = asm_vperm1 (tweak4, bswap128_const);
+	  GEN_TWEAK (tweak6, tweak5);
+	  tweak5 = asm_vperm1 (tweak5, bswap128_const);
+	  b4 = VEC_BE_SWAP(b4, bige_const);
+	  b5 = VEC_BE_SWAP(b5, bige_const);
+	  GEN_TWEAK (tweak7, tweak6);
+	  tweak6 = asm_vperm1 (tweak6, bswap128_const);
+	  GEN_TWEAK (tweak, tweak7);
+	  tweak7 = asm_vperm1 (tweak7, bswap128_const);
+	  b6 = VEC_BE_SWAP(b6, bige_const);
+	  b7 = VEC_BE_SWAP(b7, bige_const);
+
+	  tweak0 = asm_xor (tweak0, rkey0);
+	  tweak1 = asm_xor (tweak1, rkey0);
+	  tweak2 = asm_xor (tweak2, rkey0);
+	  tweak3 = asm_xor (tweak3, rkey0);
+	  tweak4 = asm_xor (tweak4, rkey0);
+	  tweak5 = asm_xor (tweak5, rkey0);
+	  tweak6 = asm_xor (tweak6, rkey0);
+	  tweak7 = asm_xor (tweak7, rkey0);
+
+	  b0 = asm_xor (b0, tweak0);
+	  b1 = asm_xor (b1, tweak1);
+	  b2 = asm_xor (b2, tweak2);
+	  b3 = asm_xor (b3, tweak3);
+	  b4 = asm_xor (b4, tweak4);
+	  b5 = asm_xor (b5, tweak5);
+	  b6 = asm_xor (b6, tweak6);
+	  b7 = asm_xor (b7, tweak7);
+
+#define DO_ROUND(r) \
+	      rkey = ALIGNED_LOAD (rk, r); \
+	      b0 = asm_cipher_be (b0, rkey); \
+	      b1 = asm_cipher_be (b1, rkey); \
+	      b2 = asm_cipher_be (b2, rkey); \
+	      b3 = asm_cipher_be (b3, rkey); \
+	      b4 = asm_cipher_be (b4, rkey); \
+	      b5 = asm_cipher_be (b5, rkey); \
+	      b6 = asm_cipher_be (b6, rkey); \
+	      b7 = asm_cipher_be (b7, rkey);
+
+	  DO_ROUND(1);
+	  DO_ROUND(2);
+	  DO_ROUND(3);
+	  DO_ROUND(4);
+	  DO_ROUND(5);
+	  DO_ROUND(6);
+	  DO_ROUND(7);
+
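+	  /* rkeylast ^ rkey0 cancels the rkey0 already folded into the tweaks
+	   * above, so the value passed to asm_cipherlast_be becomes
+	   * tweak ^ last round key, merging the final AddRoundKey with the
+	   * XTS output xor.  */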
+	  rkeylf = asm_xor (rkeylast, rkey0);
+
+	  DO_ROUND(8);
+
+	  tweak0 = asm_xor (tweak0, rkeylf);
+	  tweak1 = asm_xor (tweak1, rkeylf);
+	  tweak2 = asm_xor (tweak2, rkeylf);
+	  tweak3 = asm_xor (tweak3, rkeylf);
+	  tweak4 = asm_xor (tweak4, rkeylf);
+	  tweak5 = asm_xor (tweak5, rkeylf);
+	  tweak6 = asm_xor (tweak6, rkeylf);
+	  tweak7 = asm_xor (tweak7, rkeylf);
+
+	  DO_ROUND(9);
+	  if (rounds >= 12)
+	    {
+	      DO_ROUND(10);
+	      DO_ROUND(11);
+	      if (rounds > 12)
+		{
+		  DO_ROUND(12);
+		  DO_ROUND(13);
+		}
+	    }
+
+#undef DO_ROUND
+
+	  b0 = asm_cipherlast_be (b0, tweak0);
+	  b1 = asm_cipherlast_be (b1, tweak1);
+	  b2 = asm_cipherlast_be (b2, tweak2);
+	  b3 = asm_cipherlast_be (b3, tweak3);
+	  b0 = VEC_BE_SWAP (b0, bige_const);
+	  b1 = VEC_BE_SWAP (b1, bige_const);
+	  b4 = asm_cipherlast_be (b4, tweak4);
+	  b5 = asm_cipherlast_be (b5, tweak5);
+	  b2 = VEC_BE_SWAP (b2, bige_const);
+	  b3 = VEC_BE_SWAP (b3, bige_const);
+	  b6 = asm_cipherlast_be (b6, tweak6);
+	  b7 = asm_cipherlast_be (b7, tweak7);
+	  VEC_STORE_BE_NOSWAP (out, 0, b0);
+	  VEC_STORE_BE_NOSWAP (out, 1, b1);
+	  b4 = VEC_BE_SWAP (b4, bige_const);
+	  b5 = VEC_BE_SWAP (b5, bige_const);
+	  VEC_STORE_BE_NOSWAP (out, 2, b2);
+	  VEC_STORE_BE_NOSWAP (out, 3, b3);
+	  b6 = VEC_BE_SWAP (b6, bige_const);
+	  b7 = VEC_BE_SWAP (b7, bige_const);
+	  VEC_STORE_BE_NOSWAP (out, 4, b4);
+	  VEC_STORE_BE_NOSWAP (out, 5, b5);
+	  VEC_STORE_BE_NOSWAP (out, 6, b6);
+	  VEC_STORE_BE_NOSWAP (out, 7, b7);
+	  out += 8;
+	}
+
+      if (nblocks >= 4)
+	{
+	  tweak0 = tweak;
+	  GEN_TWEAK (tweak1, tweak0);
+	  GEN_TWEAK (tweak2, tweak1);
+	  GEN_TWEAK (tweak3, tweak2);
+	  GEN_TWEAK (tweak, tweak3);
+
+	  b0 = VEC_LOAD_BE (in, 0, bige_const);
+	  b1 = VEC_LOAD_BE (in, 1, bige_const);
+	  b2 = VEC_LOAD_BE (in, 2, bige_const);
+	  b3 = VEC_LOAD_BE (in, 3, bige_const);
+
+	  tweak0 = asm_vperm1 (tweak0, bswap128_const);
+	  tweak1 = asm_vperm1 (tweak1, bswap128_const);
+	  tweak2 = asm_vperm1 (tweak2, bswap128_const);
+	  tweak3 = asm_vperm1 (tweak3, bswap128_const);
+
+	  b0 ^= tweak0 ^ rkey0;
+	  b1 ^= tweak1 ^ rkey0;
+	  b2 ^= tweak2 ^ rkey0;
+	  b3 ^= tweak3 ^ rkey0;
+
+#define DO_ROUND(r) \
+	      rkey = ALIGNED_LOAD (rk, r); \
+	      b0 = asm_cipher_be (b0, rkey); \
+	      b1 = asm_cipher_be (b1, rkey); \
+	      b2 = asm_cipher_be (b2, rkey); \
+	      b3 = asm_cipher_be (b3, rkey);
+
+	  DO_ROUND(1);
+	  DO_ROUND(2);
+	  DO_ROUND(3);
+	  DO_ROUND(4);
+	  DO_ROUND(5);
+	  DO_ROUND(6);
+	  DO_ROUND(7);
+	  DO_ROUND(8);
+	  DO_ROUND(9);
+	  if (rounds >= 12)
+	    {
+	      DO_ROUND(10);
+	      DO_ROUND(11);
+	      if (rounds > 12)
+		{
+		  DO_ROUND(12);
+		  DO_ROUND(13);
+		}
+	    }
+
+#undef DO_ROUND
+
+	  rkey = rkeylast;
+	  b0 = asm_cipherlast_be (b0, rkey ^ tweak0);
+	  b1 = asm_cipherlast_be (b1, rkey ^ tweak1);
+	  b2 = asm_cipherlast_be (b2, rkey ^ tweak2);
+	  b3 = asm_cipherlast_be (b3, rkey ^ tweak3);
+
+	  VEC_STORE_BE (out, 0, b0, bige_const);
+	  VEC_STORE_BE (out, 1, b1, bige_const);
+	  VEC_STORE_BE (out, 2, b2, bige_const);
+	  VEC_STORE_BE (out, 3, b3, bige_const);
+
+	  in += 4;
+	  out += 4;
+	  nblocks -= 4;
+	}
+
+      for (; nblocks; nblocks--)
+	{
+	  tweak0 = asm_vperm1 (tweak, bswap128_const);
+
+	  /* Xor-Encrypt/Decrypt-Xor block. */
+	  b = VEC_LOAD_BE (in, 0, bige_const) ^ tweak0;
+
+	  /* Generate next tweak. */
+	  GEN_TWEAK (tweak, tweak);
+
+	  AES_ENCRYPT (b, rounds);
+
+	  b ^= tweak0;
+	  VEC_STORE_BE (out, 0, b, bige_const);
+
+	  in++;
+	  out++;
+	}
+    }
+  else
+    {
+      const u128_t *rk = (u128_t *)&ctx->keyschdec;
+
+      if (!ctx->decryption_prepared)
+	{
+	  internal_aes_ppc_prepare_decryption (ctx);
+	  ctx->decryption_prepared = 1;
+	}
+
+      PRELOAD_ROUND_KEYS (rounds);
+
+      for (; nblocks >= 8; nblocks -= 8)
+	{
+	  b0 = VEC_LOAD_BE_NOSWAP (in, 0);
+	  b1 = VEC_LOAD_BE_NOSWAP (in, 1);
+	  b2 = VEC_LOAD_BE_NOSWAP (in, 2);
+	  b3 = VEC_LOAD_BE_NOSWAP (in, 3);
+	  tweak0 = tweak;
+	  GEN_TWEAK (tweak1, tweak0);
+	  tweak0 = asm_vperm1 (tweak0, bswap128_const);
+	  b4 = VEC_LOAD_BE_NOSWAP (in, 4);
+	  b5 = VEC_LOAD_BE_NOSWAP (in, 5);
+	  GEN_TWEAK (tweak2, tweak1);
+	  tweak1 = asm_vperm1 (tweak1, bswap128_const);
+	  b6 = VEC_LOAD_BE_NOSWAP (in, 6);
+	  b7 = VEC_LOAD_BE_NOSWAP (in, 7);
+	  in += 8;
+
+	  b0 = VEC_BE_SWAP(b0, bige_const);
+	  b1 = VEC_BE_SWAP(b1, bige_const);
+	  GEN_TWEAK (tweak3, tweak2);
+	  tweak2 = asm_vperm1 (tweak2, bswap128_const);
+	  GEN_TWEAK (tweak4, tweak3);
+	  tweak3 = asm_vperm1 (tweak3, bswap128_const);
+	  b2 = VEC_BE_SWAP(b2, bige_const);
+	  b3 = VEC_BE_SWAP(b3, bige_const);
+	  GEN_TWEAK (tweak5, tweak4);
+	  tweak4 = asm_vperm1 (tweak4, bswap128_const);
+	  GEN_TWEAK (tweak6, tweak5);
+	  tweak5 = asm_vperm1 (tweak5, bswap128_const);
+	  b4 = VEC_BE_SWAP(b4, bige_const);
+	  b5 = VEC_BE_SWAP(b5, bige_const);
+	  GEN_TWEAK (tweak7, tweak6);
+	  tweak6 = asm_vperm1 (tweak6, bswap128_const);
+	  GEN_TWEAK (tweak, tweak7);
+	  tweak7 = asm_vperm1 (tweak7, bswap128_const);
+	  b6 = VEC_BE_SWAP(b6, bige_const);
+	  b7 = VEC_BE_SWAP(b7, bige_const);
+
+	  tweak0 = asm_xor (tweak0, rkey0);
+	  tweak1 = asm_xor (tweak1, rkey0);
+	  tweak2 = asm_xor (tweak2, rkey0);
+	  tweak3 = asm_xor (tweak3, rkey0);
+	  tweak4 = asm_xor (tweak4, rkey0);
+	  tweak5 = asm_xor (tweak5, rkey0);
+	  tweak6 = asm_xor (tweak6, rkey0);
+	  tweak7 = asm_xor (tweak7, rkey0);
+
+	  b0 = asm_xor (b0, tweak0);
+	  b1 = asm_xor (b1, tweak1);
+	  b2 = asm_xor (b2, tweak2);
+	  b3 = asm_xor (b3, tweak3);
+	  b4 = asm_xor (b4, tweak4);
+	  b5 = asm_xor (b5, tweak5);
+	  b6 = asm_xor (b6, tweak6);
+	  b7 = asm_xor (b7, tweak7);
+
+#define DO_ROUND(r) \
+	      rkey = ALIGNED_LOAD (rk, r); \
+	      b0 = asm_ncipher_be (b0, rkey); \
+	      b1 = asm_ncipher_be (b1, rkey); \
+	      b2 = asm_ncipher_be (b2, rkey); \
+	      b3 = asm_ncipher_be (b3, rkey); \
+	      b4 = asm_ncipher_be (b4, rkey); \
+	      b5 = asm_ncipher_be (b5, rkey); \
+	      b6 = asm_ncipher_be (b6, rkey); \
+	      b7 = asm_ncipher_be (b7, rkey);
+
+	  DO_ROUND(1);
+	  DO_ROUND(2);
+	  DO_ROUND(3);
+	  DO_ROUND(4);
+	  DO_ROUND(5);
+	  DO_ROUND(6);
+	  DO_ROUND(7);
+
+	  rkeylf = asm_xor (rkeylast, rkey0);
+
+	  DO_ROUND(8);
+
+	  tweak0 = asm_xor (tweak0, rkeylf);
+	  tweak1 = asm_xor (tweak1, rkeylf);
+	  tweak2 = asm_xor (tweak2, rkeylf);
+	  tweak3 = asm_xor (tweak3, rkeylf);
+	  tweak4 = asm_xor (tweak4, rkeylf);
+	  tweak5 = asm_xor (tweak5, rkeylf);
+	  tweak6 = asm_xor (tweak6, rkeylf);
+	  tweak7 = asm_xor (tweak7, rkeylf);
+
+	  DO_ROUND(9);
+	  if (rounds >= 12)
+	    {
+	      DO_ROUND(10);
+	      DO_ROUND(11);
+	      if (rounds > 12)
+		{
+		  DO_ROUND(12);
+		  DO_ROUND(13);
+		}
+	    }
+
+#undef DO_ROUND
+
+	  b0 = asm_ncipherlast_be (b0, tweak0);
+	  b1 = asm_ncipherlast_be (b1, tweak1);
+	  b2 = asm_ncipherlast_be (b2, tweak2);
+	  b3 = asm_ncipherlast_be (b3, tweak3);
+	  b0 = VEC_BE_SWAP (b0, bige_const);
+	  b1 = VEC_BE_SWAP (b1, bige_const);
+	  b4 = asm_ncipherlast_be (b4, tweak4);
+	  b5 = asm_ncipherlast_be (b5, tweak5);
+	  b2 = VEC_BE_SWAP (b2, bige_const);
+	  b3 = VEC_BE_SWAP (b3, bige_const);
+	  b6 = asm_ncipherlast_be (b6, tweak6);
+	  b7 = asm_ncipherlast_be (b7, tweak7);
+	  VEC_STORE_BE_NOSWAP (out, 0, b0);
+	  VEC_STORE_BE_NOSWAP (out, 1, b1);
+	  b4 = VEC_BE_SWAP (b4, bige_const);
+	  b5 = VEC_BE_SWAP (b5, bige_const);
+	  VEC_STORE_BE_NOSWAP (out, 2, b2);
+	  VEC_STORE_BE_NOSWAP (out, 3, b3);
+	  b6 = VEC_BE_SWAP (b6, bige_const);
+	  b7 = VEC_BE_SWAP (b7, bige_const);
+	  VEC_STORE_BE_NOSWAP (out, 4, b4);
+	  VEC_STORE_BE_NOSWAP (out, 5, b5);
+	  VEC_STORE_BE_NOSWAP (out, 6, b6);
+	  VEC_STORE_BE_NOSWAP (out, 7, b7);
+	  out += 8;
+	}
+
+      if (nblocks >= 4)
+	{
+	  tweak0 = tweak;
+	  GEN_TWEAK (tweak1, tweak0);
+	  GEN_TWEAK (tweak2, tweak1);
+	  GEN_TWEAK (tweak3, tweak2);
+	  GEN_TWEAK (tweak, tweak3);
+
+	  b0 = VEC_LOAD_BE (in, 0, bige_const);
+	  b1 = VEC_LOAD_BE (in, 1, bige_const);
+	  b2 = VEC_LOAD_BE (in, 2, bige_const);
+	  b3 = VEC_LOAD_BE (in, 3, bige_const);
+
+	  tweak0 = asm_vperm1 (tweak0, bswap128_const);
+	  tweak1 = asm_vperm1 (tweak1, bswap128_const);
+	  tweak2 = asm_vperm1 (tweak2, bswap128_const);
+	  tweak3 = asm_vperm1 (tweak3, bswap128_const);
+
+	  b0 ^= tweak0 ^ rkey0;
+	  b1 ^= tweak1 ^ rkey0;
+	  b2 ^= tweak2 ^ rkey0;
+	  b3 ^= tweak3 ^ rkey0;
+
+#define DO_ROUND(r) \
+	      rkey = ALIGNED_LOAD (rk, r); \
+	      b0 = asm_ncipher_be (b0, rkey); \
+	      b1 = asm_ncipher_be (b1, rkey); \
+	      b2 = asm_ncipher_be (b2, rkey); \
+	      b3 = asm_ncipher_be (b3, rkey);
+
+	  DO_ROUND(1);
+	  DO_ROUND(2);
+	  DO_ROUND(3);
+	  DO_ROUND(4);
+	  DO_ROUND(5);
+	  DO_ROUND(6);
+	  DO_ROUND(7);
+	  DO_ROUND(8);
+	  DO_ROUND(9);
+	  if (rounds >= 12)
+	    {
+	      DO_ROUND(10);
+	      DO_ROUND(11);
+	      if (rounds > 12)
+		{
+		  DO_ROUND(12);
+		  DO_ROUND(13);
+		}
+	    }
+
+#undef DO_ROUND
+
+	  rkey = rkeylast;
+	  b0 = asm_ncipherlast_be (b0, rkey ^ tweak0);
+	  b1 = asm_ncipherlast_be (b1, rkey ^ tweak1);
+	  b2 = asm_ncipherlast_be (b2, rkey ^ tweak2);
+	  b3 = asm_ncipherlast_be (b3, rkey ^ tweak3);
+
+	  VEC_STORE_BE (out, 0, b0, bige_const);
+	  VEC_STORE_BE (out, 1, b1, bige_const);
+	  VEC_STORE_BE (out, 2, b2, bige_const);
+	  VEC_STORE_BE (out, 3, b3, bige_const);
+
+	  in += 4;
+	  out += 4;
+	  nblocks -= 4;
+	}
+
+      for (; nblocks; nblocks--)
+	{
+	  tweak0 = asm_vperm1 (tweak, bswap128_const);
+
+	  /* Xor-Encrypt/Decrypt-Xor block. */
+	  b = VEC_LOAD_BE (in, 0, bige_const) ^ tweak0;
+
+	  /* Generate next tweak. */
+	  GEN_TWEAK (tweak, tweak);
+
+	  AES_DECRYPT (b, rounds);
+
+	  b ^= tweak0;
+	  VEC_STORE_BE (out, 0, b, bige_const);
+
+	  in++;
+	  out++;
+	}
+    }
+
+  tweak = asm_vperm1 (tweak, bswap128_const);
+  VEC_STORE_BE (tweak_arg, 0, tweak, bige_const);
+
+#undef GEN_TWEAK
+}
diff --git a/cipher/rijndael-ppc.c b/cipher/rijndael-ppc.c
index a8bcae468..3e727628b 100644
--- a/cipher/rijndael-ppc.c
+++ b/cipher/rijndael-ppc.c
@@ -1,6 +1,6 @@
 /* Rijndael (AES) for GnuPG - PowerPC Vector Crypto AES implementation
  * Copyright (C) 2019 Shawn Landden <shawn at git.icu>
- * Copyright (C) 2019 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ * Copyright (C) 2019-2020 Jussi Kivilinna <jussi.kivilinna at iki.fi>
  *
  * This file is part of Libgcrypt.
  *
@@ -31,162 +31,7 @@
 
 #ifdef USE_PPC_CRYPTO
 
-#include <altivec.h>
-
-
-typedef vector unsigned char block;
-
-typedef union
-{
-  u32 data32[4];
-} __attribute__((packed, aligned(1), may_alias)) u128_t;
-
-
-#define ALWAYS_INLINE inline __attribute__((always_inline))
-#define NO_INLINE __attribute__((noinline))
-#define NO_INSTRUMENT_FUNCTION __attribute__((no_instrument_function))
-
-#define ASM_FUNC_ATTR          NO_INSTRUMENT_FUNCTION
-#define ASM_FUNC_ATTR_INLINE   ASM_FUNC_ATTR ALWAYS_INLINE
-#define ASM_FUNC_ATTR_NOINLINE ASM_FUNC_ATTR NO_INLINE
-
-
-#define ALIGNED_LOAD(in_ptr, offs) \
-  (asm_aligned_ld ((offs) * 16, (const void *)(in_ptr)))
-
-#define ALIGNED_STORE(out_ptr, offs, vec) \
-  (asm_aligned_st ((vec), (offs) * 16, (void *)(out_ptr)))
-
-#define VEC_BE_SWAP(vec, bige_const) (asm_be_swap ((vec), (bige_const)))
-
-#define VEC_LOAD_BE(in_ptr, offs, bige_const) \
-  (asm_be_swap (asm_load_be_noswap ((offs) * 16, (const void *)(in_ptr)), \
-		bige_const))
-
-#define VEC_LOAD_BE_NOSWAP(in_ptr, offs) \
-  (asm_load_be_noswap ((offs) * 16, (const unsigned char *)(in_ptr)))
-
-#define VEC_STORE_BE(out_ptr, offs, vec, bige_const) \
-  (asm_store_be_noswap (asm_be_swap ((vec), (bige_const)), (offs) * 16, \
-		        (void *)(out_ptr)))
-
-#define VEC_STORE_BE_NOSWAP(out_ptr, offs, vec) \
-  (asm_store_be_noswap ((vec), (offs) * 16, (void *)(out_ptr)))
-
-
-#define ROUND_KEY_VARIABLES \
-  block rkey0, rkeylast
-
-#define PRELOAD_ROUND_KEYS(nrounds) \
-  do { \
-    rkey0 = ALIGNED_LOAD (rk, 0); \
-    rkeylast = ALIGNED_LOAD (rk, nrounds); \
-  } while (0)
-
-#define AES_ENCRYPT(blk, nrounds) \
-  do { \
-    blk ^= rkey0; \
-    blk = asm_cipher_be (blk, ALIGNED_LOAD (rk, 1)); \
-    blk = asm_cipher_be (blk, ALIGNED_LOAD (rk, 2)); \
-    blk = asm_cipher_be (blk, ALIGNED_LOAD (rk, 3)); \
-    blk = asm_cipher_be (blk, ALIGNED_LOAD (rk, 4)); \
-    blk = asm_cipher_be (blk, ALIGNED_LOAD (rk, 5)); \
-    blk = asm_cipher_be (blk, ALIGNED_LOAD (rk, 6)); \
-    blk = asm_cipher_be (blk, ALIGNED_LOAD (rk, 7)); \
-    blk = asm_cipher_be (blk, ALIGNED_LOAD (rk, 8)); \
-    blk = asm_cipher_be (blk, ALIGNED_LOAD (rk, 9)); \
-    if (nrounds >= 12) \
-      { \
-	blk = asm_cipher_be (blk, ALIGNED_LOAD (rk, 10)); \
-	blk = asm_cipher_be (blk, ALIGNED_LOAD (rk, 11)); \
-	if (rounds > 12) \
-	  { \
-	    blk = asm_cipher_be (blk, ALIGNED_LOAD (rk, 12)); \
-	    blk = asm_cipher_be (blk, ALIGNED_LOAD (rk, 13)); \
-	  } \
-      } \
-    blk = asm_cipherlast_be (blk, rkeylast); \
-  } while (0)
-
-#define AES_DECRYPT(blk, nrounds) \
-  do { \
-    blk ^= rkey0; \
-    blk = asm_ncipher_be (blk, ALIGNED_LOAD (rk, 1)); \
-    blk = asm_ncipher_be (blk, ALIGNED_LOAD (rk, 2)); \
-    blk = asm_ncipher_be (blk, ALIGNED_LOAD (rk, 3)); \
-    blk = asm_ncipher_be (blk, ALIGNED_LOAD (rk, 4)); \
-    blk = asm_ncipher_be (blk, ALIGNED_LOAD (rk, 5)); \
-    blk = asm_ncipher_be (blk, ALIGNED_LOAD (rk, 6)); \
-    blk = asm_ncipher_be (blk, ALIGNED_LOAD (rk, 7)); \
-    blk = asm_ncipher_be (blk, ALIGNED_LOAD (rk, 8)); \
-    blk = asm_ncipher_be (blk, ALIGNED_LOAD (rk, 9)); \
-    if (nrounds >= 12) \
-      { \
-	blk = asm_ncipher_be (blk, ALIGNED_LOAD (rk, 10)); \
-	blk = asm_ncipher_be (blk, ALIGNED_LOAD (rk, 11)); \
-	if (rounds > 12) \
-	  { \
-	    blk = asm_ncipher_be (blk, ALIGNED_LOAD (rk, 12)); \
-	    blk = asm_ncipher_be (blk, ALIGNED_LOAD (rk, 13)); \
-	  } \
-      } \
-    blk = asm_ncipherlast_be (blk, rkeylast); \
-  } while (0)
-
-
-#define ROUND_KEY_VARIABLES_ALL \
-  block rkey0, rkey1, rkey2, rkey3, rkey4, rkey5, rkey6, rkey7, rkey8, \
-        rkey9, rkey10, rkey11, rkey12, rkey13, rkeylast
-
-#define PRELOAD_ROUND_KEYS_ALL(nrounds) \
-  do { \
-    rkey0 = ALIGNED_LOAD (rk, 0); \
-    rkey1 = ALIGNED_LOAD (rk, 1); \
-    rkey2 = ALIGNED_LOAD (rk, 2); \
-    rkey3 = ALIGNED_LOAD (rk, 3); \
-    rkey4 = ALIGNED_LOAD (rk, 4); \
-    rkey5 = ALIGNED_LOAD (rk, 5); \
-    rkey6 = ALIGNED_LOAD (rk, 6); \
-    rkey7 = ALIGNED_LOAD (rk, 7); \
-    rkey8 = ALIGNED_LOAD (rk, 8); \
-    rkey9 = ALIGNED_LOAD (rk, 9); \
-    if (nrounds >= 12) \
-      { \
-	rkey10 = ALIGNED_LOAD (rk, 10); \
-	rkey11 = ALIGNED_LOAD (rk, 11); \
-	if (rounds > 12) \
-	  { \
-	    rkey12 = ALIGNED_LOAD (rk, 12); \
-	    rkey13 = ALIGNED_LOAD (rk, 13); \
-	  } \
-      } \
-    rkeylast = ALIGNED_LOAD (rk, nrounds); \
-  } while (0)
-
-#define AES_ENCRYPT_ALL(blk, nrounds) \
-  do { \
-    blk ^= rkey0; \
-    blk = asm_cipher_be (blk, rkey1); \
-    blk = asm_cipher_be (blk, rkey2); \
-    blk = asm_cipher_be (blk, rkey3); \
-    blk = asm_cipher_be (blk, rkey4); \
-    blk = asm_cipher_be (blk, rkey5); \
-    blk = asm_cipher_be (blk, rkey6); \
-    blk = asm_cipher_be (blk, rkey7); \
-    blk = asm_cipher_be (blk, rkey8); \
-    blk = asm_cipher_be (blk, rkey9); \
-    if (nrounds >= 12) \
-      { \
-	blk = asm_cipher_be (blk, rkey10); \
-	blk = asm_cipher_be (blk, rkey11); \
-	if (rounds > 12) \
-	  { \
-	    blk = asm_cipher_be (blk, rkey12); \
-	    blk = asm_cipher_be (blk, rkey13); \
-	  } \
-      } \
-    blk = asm_cipherlast_be (blk, rkeylast); \
-  } while (0)
+#include "rijndael-ppc-common.h"
 
 
 #ifdef WORDS_BIGENDIAN
@@ -198,26 +43,6 @@ static const block vec_bswap32_const_neg =
 #endif
 
 
-static ASM_FUNC_ATTR_INLINE block
-asm_aligned_ld(unsigned long offset, const void *ptr)
-{
-  block vec;
-  __asm__ volatile ("lvx %0,%1,%2\n\t"
-		    : "=v" (vec)
-		    : "r" (offset), "r" ((uintptr_t)ptr)
-		    : "memory", "r0");
-  return vec;
-}
-
-static ASM_FUNC_ATTR_INLINE void
-asm_aligned_st(block vec, unsigned long offset, void *ptr)
-{
-  __asm__ volatile ("stvx %0,%1,%2\n\t"
-		    :
-		    : "v" (vec), "r" (offset), "r" ((uintptr_t)ptr)
-		    : "memory", "r0");
-}
-
 static ASM_FUNC_ATTR_INLINE block
 asm_load_be_const(void)
 {
@@ -229,16 +54,6 @@ asm_load_be_const(void)
 #endif
 }
 
-static ASM_FUNC_ATTR_INLINE block
-asm_vperm1(block vec, block mask)
-{
-  block o;
-  __asm__ volatile ("vperm %0,%1,%1,%2\n\t"
-		    : "=v" (o)
-		    : "v" (vec), "v" (mask));
-  return o;
-}
-
 static ASM_FUNC_ATTR_INLINE block
 asm_be_swap(block vec, block be_bswap_const)
 {
@@ -272,66 +87,6 @@ asm_store_be_noswap(block vec, unsigned long offset, void *ptr)
 		    : "memory", "r0");
 }
 
-static ASM_FUNC_ATTR_INLINE block
-asm_add_uint128(block a, block b)
-{
-  block res;
-  __asm__ volatile ("vadduqm %0,%1,%2\n\t"
-		    : "=v" (res)
-		    : "v" (a), "v" (b));
-  return res;
-}
-
-static ASM_FUNC_ATTR_INLINE block
-asm_xor(block a, block b)
-{
-  block res;
-  __asm__ volatile ("vxor %0,%1,%2\n\t"
-		    : "=v" (res)
-		    : "v" (a), "v" (b));
-  return res;
-}
-
-static ASM_FUNC_ATTR_INLINE block
-asm_cipher_be(block b, block rk)
-{
-  block o;
-  __asm__ volatile ("vcipher %0, %1, %2\n\t"
-		    : "=v" (o)
-		    : "v" (b), "v" (rk));
-  return o;
-}
-
-static ASM_FUNC_ATTR_INLINE block
-asm_cipherlast_be(block b, block rk)
-{
-  block o;
-  __asm__ volatile ("vcipherlast %0, %1, %2\n\t"
-		    : "=v" (o)
-		    : "v" (b), "v" (rk));
-  return o;
-}
-
-static ASM_FUNC_ATTR_INLINE block
-asm_ncipher_be(block b, block rk)
-{
-  block o;
-  __asm__ volatile ("vncipher %0, %1, %2\n\t"
-		    : "=v" (o)
-		    : "v" (b), "v" (rk));
-  return o;
-}
-
-static ASM_FUNC_ATTR_INLINE block
-asm_ncipherlast_be(block b, block rk)
-{
-  block o;
-  __asm__ volatile ("vncipherlast %0, %1, %2\n\t"
-		    : "=v" (o)
-		    : "v" (b), "v" (rk));
-  return o;
-}
-
 
 static ASM_FUNC_ATTR_INLINE u32
 _gcry_aes_sbox4_ppc8(u32 fourbytes)
@@ -439,7 +194,7 @@ _gcry_aes_ppc8_setkey (RIJNDAEL_context *ctx, const byte *key)
             }
         }
 
-      rcon = (rcon << 1) ^ ((rcon >> 7) * 0x1b);
+      rcon = (rcon << 1) ^ (-(rcon >> 7) & 0x1b);
     }
 
   /* Store in big-endian order. */
@@ -450,7 +205,7 @@ _gcry_aes_ppc8_setkey (RIJNDAEL_context *ctx, const byte *key)
 #else
       block rvec = ALIGNED_LOAD (ekey, r);
       ALIGNED_STORE (ekey, r,
-		     vec_perm(rvec, rvec, vec_bswap32_const));
+                     vec_perm(rvec, rvec, vec_bswap32_const));
       (void)bige_const;
 #endif
     }
@@ -464,2012 +219,25 @@ _gcry_aes_ppc8_setkey (RIJNDAEL_context *ctx, const byte *key)
   wipememory(&tkk, sizeof(tkk));
 }
 
-
-/* Make a decryption key from an encryption key. */
-static ASM_FUNC_ATTR_INLINE void
-aes_ppc8_prepare_decryption (RIJNDAEL_context *ctx)
-{
-  u128_t *ekey = (u128_t *)(void *)ctx->keyschenc;
-  u128_t *dkey = (u128_t *)(void *)ctx->keyschdec;
-  int rounds = ctx->rounds;
-  int rr;
-  int r;
-
-  r = 0;
-  rr = rounds;
-  for (r = 0, rr = rounds; r <= rounds; r++, rr--)
-    {
-      ALIGNED_STORE (dkey, r, ALIGNED_LOAD (ekey, rr));
-    }
-}
-
-
 void
 _gcry_aes_ppc8_prepare_decryption (RIJNDAEL_context *ctx)
 {
-  aes_ppc8_prepare_decryption (ctx);
-}
-
-
-unsigned int _gcry_aes_ppc8_encrypt (const RIJNDAEL_context *ctx,
-				     unsigned char *out,
-				     const unsigned char *in)
-{
-  const block bige_const = asm_load_be_const();
-  const u128_t *rk = (u128_t *)&ctx->keyschenc;
-  int rounds = ctx->rounds;
-  ROUND_KEY_VARIABLES;
-  block b;
-
-  b = VEC_LOAD_BE (in, 0, bige_const);
-
-  PRELOAD_ROUND_KEYS (rounds);
-
-  AES_ENCRYPT (b, rounds);
-  VEC_STORE_BE (out, 0, b, bige_const);
-
-  return 0; /* does not use stack */
-}
-
-
-unsigned int _gcry_aes_ppc8_decrypt (const RIJNDAEL_context *ctx,
-				     unsigned char *out,
-				     const unsigned char *in)
-{
-  const block bige_const = asm_load_be_const();
-  const u128_t *rk = (u128_t *)&ctx->keyschdec;
-  int rounds = ctx->rounds;
-  ROUND_KEY_VARIABLES;
-  block b;
-
-  b = VEC_LOAD_BE (in, 0, bige_const);
-
-  PRELOAD_ROUND_KEYS (rounds);
-
-  AES_DECRYPT (b, rounds);
-  VEC_STORE_BE (out, 0, b, bige_const);
-
-  return 0; /* does not use stack */
-}
-
-
-void _gcry_aes_ppc8_cfb_enc (void *context, unsigned char *iv_arg,
-			     void *outbuf_arg, const void *inbuf_arg,
-			     size_t nblocks)
-{
-  const block bige_const = asm_load_be_const();
-  RIJNDAEL_context *ctx = context;
-  const u128_t *rk = (u128_t *)&ctx->keyschenc;
-  const u128_t *in = (const u128_t *)inbuf_arg;
-  u128_t *out = (u128_t *)outbuf_arg;
-  int rounds = ctx->rounds;
-  ROUND_KEY_VARIABLES_ALL;
-  block rkeylast_orig;
-  block iv;
-
-  iv = VEC_LOAD_BE (iv_arg, 0, bige_const);
-
-  PRELOAD_ROUND_KEYS_ALL (rounds);
-  rkeylast_orig = rkeylast;
-
-  for (; nblocks; nblocks--)
-    {
-      rkeylast = rkeylast_orig ^ VEC_LOAD_BE (in, 0, bige_const);
-
-      AES_ENCRYPT_ALL (iv, rounds);
-
-      VEC_STORE_BE (out, 0, iv, bige_const);
-
-      out++;
-      in++;
-    }
-
-  VEC_STORE_BE (iv_arg, 0, iv, bige_const);
-}
-
-void _gcry_aes_ppc8_cfb_dec (void *context, unsigned char *iv_arg,
-			     void *outbuf_arg, const void *inbuf_arg,
-			     size_t nblocks)
-{
-  const block bige_const = asm_load_be_const();
-  RIJNDAEL_context *ctx = context;
-  const u128_t *rk = (u128_t *)&ctx->keyschenc;
-  const u128_t *in = (const u128_t *)inbuf_arg;
-  u128_t *out = (u128_t *)outbuf_arg;
-  int rounds = ctx->rounds;
-  ROUND_KEY_VARIABLES;
-  block rkeylast_orig;
-  block iv, b, bin;
-  block in0, in1, in2, in3, in4, in5, in6, in7;
-  block b0, b1, b2, b3, b4, b5, b6, b7;
-  block rkey;
-
-  iv = VEC_LOAD_BE (iv_arg, 0, bige_const);
-
-  PRELOAD_ROUND_KEYS (rounds);
-  rkeylast_orig = rkeylast;
-
-  for (; nblocks >= 8; nblocks -= 8)
-    {
-      in0 = iv;
-      in1 = VEC_LOAD_BE_NOSWAP (in, 0);
-      in2 = VEC_LOAD_BE_NOSWAP (in, 1);
-      in3 = VEC_LOAD_BE_NOSWAP (in, 2);
-      in4 = VEC_LOAD_BE_NOSWAP (in, 3);
-      in1 = VEC_BE_SWAP (in1, bige_const);
-      in2 = VEC_BE_SWAP (in2, bige_const);
-      in5 = VEC_LOAD_BE_NOSWAP (in, 4);
-      in6 = VEC_LOAD_BE_NOSWAP (in, 5);
-      in3 = VEC_BE_SWAP (in3, bige_const);
-      in4 = VEC_BE_SWAP (in4, bige_const);
-      in7 = VEC_LOAD_BE_NOSWAP (in, 6);
-      iv = VEC_LOAD_BE_NOSWAP (in, 7);
-      in += 8;
-      in5 = VEC_BE_SWAP (in5, bige_const);
-      in6 = VEC_BE_SWAP (in6, bige_const);
-      b0 = asm_xor (rkey0, in0);
-      b1 = asm_xor (rkey0, in1);
-      in7 = VEC_BE_SWAP (in7, bige_const);
-      iv = VEC_BE_SWAP (iv, bige_const);
-      b2 = asm_xor (rkey0, in2);
-      b3 = asm_xor (rkey0, in3);
-      b4 = asm_xor (rkey0, in4);
-      b5 = asm_xor (rkey0, in5);
-      b6 = asm_xor (rkey0, in6);
-      b7 = asm_xor (rkey0, in7);
-
-#define DO_ROUND(r) \
-	      rkey = ALIGNED_LOAD (rk, r); \
-	      b0 = asm_cipher_be (b0, rkey); \
-	      b1 = asm_cipher_be (b1, rkey); \
-	      b2 = asm_cipher_be (b2, rkey); \
-	      b3 = asm_cipher_be (b3, rkey); \
-	      b4 = asm_cipher_be (b4, rkey); \
-	      b5 = asm_cipher_be (b5, rkey); \
-	      b6 = asm_cipher_be (b6, rkey); \
-	      b7 = asm_cipher_be (b7, rkey);
-
-      DO_ROUND(1);
-      DO_ROUND(2);
-      DO_ROUND(3);
-      DO_ROUND(4);
-      DO_ROUND(5);
-      DO_ROUND(6);
-      DO_ROUND(7);
-      DO_ROUND(8);
-      DO_ROUND(9);
-      if (rounds >= 12)
-	{
-	  DO_ROUND(10);
-	  DO_ROUND(11);
-	  if (rounds > 12)
-	    {
-	      DO_ROUND(12);
-	      DO_ROUND(13);
-	    }
-	}
-
-#undef DO_ROUND
-
-      in1 = asm_xor (rkeylast, in1);
-      in2 = asm_xor (rkeylast, in2);
-      in3 = asm_xor (rkeylast, in3);
-      in4 = asm_xor (rkeylast, in4);
-      b0 = asm_cipherlast_be (b0, in1);
-      b1 = asm_cipherlast_be (b1, in2);
-      in5 = asm_xor (rkeylast, in5);
-      in6 = asm_xor (rkeylast, in6);
-      b2 = asm_cipherlast_be (b2, in3);
-      b3 = asm_cipherlast_be (b3, in4);
-      in7 = asm_xor (rkeylast, in7);
-      in0 = asm_xor (rkeylast, iv);
-      b0 = VEC_BE_SWAP (b0, bige_const);
-      b1 = VEC_BE_SWAP (b1, bige_const);
-      b4 = asm_cipherlast_be (b4, in5);
-      b5 = asm_cipherlast_be (b5, in6);
-      b2 = VEC_BE_SWAP (b2, bige_const);
-      b3 = VEC_BE_SWAP (b3, bige_const);
-      b6 = asm_cipherlast_be (b6, in7);
-      b7 = asm_cipherlast_be (b7, in0);
-      b4 = VEC_BE_SWAP (b4, bige_const);
-      b5 = VEC_BE_SWAP (b5, bige_const);
-      b6 = VEC_BE_SWAP (b6, bige_const);
-      b7 = VEC_BE_SWAP (b7, bige_const);
-      VEC_STORE_BE_NOSWAP (out, 0, b0);
-      VEC_STORE_BE_NOSWAP (out, 1, b1);
-      VEC_STORE_BE_NOSWAP (out, 2, b2);
-      VEC_STORE_BE_NOSWAP (out, 3, b3);
-      VEC_STORE_BE_NOSWAP (out, 4, b4);
-      VEC_STORE_BE_NOSWAP (out, 5, b5);
-      VEC_STORE_BE_NOSWAP (out, 6, b6);
-      VEC_STORE_BE_NOSWAP (out, 7, b7);
-      out += 8;
-    }
-
-  if (nblocks >= 4)
-    {
-      in0 = iv;
-      in1 = VEC_LOAD_BE (in, 0, bige_const);
-      in2 = VEC_LOAD_BE (in, 1, bige_const);
-      in3 = VEC_LOAD_BE (in, 2, bige_const);
-      iv = VEC_LOAD_BE (in, 3, bige_const);
-
-      b0 = asm_xor (rkey0, in0);
-      b1 = asm_xor (rkey0, in1);
-      b2 = asm_xor (rkey0, in2);
-      b3 = asm_xor (rkey0, in3);
-
-#define DO_ROUND(r) \
-	      rkey = ALIGNED_LOAD (rk, r); \
-	      b0 = asm_cipher_be (b0, rkey); \
-	      b1 = asm_cipher_be (b1, rkey); \
-	      b2 = asm_cipher_be (b2, rkey); \
-	      b3 = asm_cipher_be (b3, rkey);
-
-      DO_ROUND(1);
-      DO_ROUND(2);
-      DO_ROUND(3);
-      DO_ROUND(4);
-      DO_ROUND(5);
-      DO_ROUND(6);
-      DO_ROUND(7);
-      DO_ROUND(8);
-      DO_ROUND(9);
-      if (rounds >= 12)
-	{
-	  DO_ROUND(10);
-	  DO_ROUND(11);
-	  if (rounds > 12)
-	    {
-	      DO_ROUND(12);
-	      DO_ROUND(13);
-	    }
-	}
-
-#undef DO_ROUND
-
-      in1 = asm_xor (rkeylast, in1);
-      in2 = asm_xor (rkeylast, in2);
-      in3 = asm_xor (rkeylast, in3);
-      in0 = asm_xor (rkeylast, iv);
-      b0 = asm_cipherlast_be (b0, in1);
-      b1 = asm_cipherlast_be (b1, in2);
-      b2 = asm_cipherlast_be (b2, in3);
-      b3 = asm_cipherlast_be (b3, in0);
-      VEC_STORE_BE (out, 0, b0, bige_const);
-      VEC_STORE_BE (out, 1, b1, bige_const);
-      VEC_STORE_BE (out, 2, b2, bige_const);
-      VEC_STORE_BE (out, 3, b3, bige_const);
-
-      in += 4;
-      out += 4;
-      nblocks -= 4;
-    }
-
-  for (; nblocks; nblocks--)
-    {
-      bin = VEC_LOAD_BE (in, 0, bige_const);
-      rkeylast = rkeylast_orig ^ bin;
-      b = iv;
-      iv = bin;
-
-      AES_ENCRYPT (b, rounds);
-
-      VEC_STORE_BE (out, 0, b, bige_const);
-
-      out++;
-      in++;
-    }
-
-  VEC_STORE_BE (iv_arg, 0, iv, bige_const);
-}
-
-
-void _gcry_aes_ppc8_cbc_enc (void *context, unsigned char *iv_arg,
-			     void *outbuf_arg, const void *inbuf_arg,
-			     size_t nblocks, int cbc_mac)
-{
-  const block bige_const = asm_load_be_const();
-  RIJNDAEL_context *ctx = context;
-  const u128_t *rk = (u128_t *)&ctx->keyschenc;
-  const u128_t *in = (const u128_t *)inbuf_arg;
-  u128_t *out = (u128_t *)outbuf_arg;
-  int rounds = ctx->rounds;
-  ROUND_KEY_VARIABLES_ALL;
-  block lastiv, b;
-  unsigned int outadd = !cbc_mac;
-
-  lastiv = VEC_LOAD_BE (iv_arg, 0, bige_const);
-
-  PRELOAD_ROUND_KEYS_ALL (rounds);
-
-  for (; nblocks; nblocks--)
-    {
-      b = lastiv ^ VEC_LOAD_BE (in, 0, bige_const);
-
-      AES_ENCRYPT_ALL (b, rounds);
-
-      lastiv = b;
-      VEC_STORE_BE (out, 0, b, bige_const);
-
-      in++;
-      out += outadd;
-    }
-
-  VEC_STORE_BE (iv_arg, 0, lastiv, bige_const);
-}
-
-void _gcry_aes_ppc8_cbc_dec (void *context, unsigned char *iv_arg,
-			     void *outbuf_arg, const void *inbuf_arg,
-			     size_t nblocks)
-{
-  const block bige_const = asm_load_be_const();
-  RIJNDAEL_context *ctx = context;
-  const u128_t *rk = (u128_t *)&ctx->keyschdec;
-  const u128_t *in = (const u128_t *)inbuf_arg;
-  u128_t *out = (u128_t *)outbuf_arg;
-  int rounds = ctx->rounds;
-  ROUND_KEY_VARIABLES;
-  block rkeylast_orig;
-  block in0, in1, in2, in3, in4, in5, in6, in7;
-  block b0, b1, b2, b3, b4, b5, b6, b7;
-  block rkey;
-  block iv, b;
-
-  if (!ctx->decryption_prepared)
-    {
-      aes_ppc8_prepare_decryption (ctx);
-      ctx->decryption_prepared = 1;
-    }
-
-  iv = VEC_LOAD_BE (iv_arg, 0, bige_const);
-
-  PRELOAD_ROUND_KEYS (rounds);
-  rkeylast_orig = rkeylast;
-
-  for (; nblocks >= 8; nblocks -= 8)
-    {
-      in0 = VEC_LOAD_BE_NOSWAP (in, 0);
-      in1 = VEC_LOAD_BE_NOSWAP (in, 1);
-      in2 = VEC_LOAD_BE_NOSWAP (in, 2);
-      in3 = VEC_LOAD_BE_NOSWAP (in, 3);
-      in0 = VEC_BE_SWAP (in0, bige_const);
-      in1 = VEC_BE_SWAP (in1, bige_const);
-      in4 = VEC_LOAD_BE_NOSWAP (in, 4);
-      in5 = VEC_LOAD_BE_NOSWAP (in, 5);
-      in2 = VEC_BE_SWAP (in2, bige_const);
-      in3 = VEC_BE_SWAP (in3, bige_const);
-      in6 = VEC_LOAD_BE_NOSWAP (in, 6);
-      in7 = VEC_LOAD_BE_NOSWAP (in, 7);
-      in += 8;
-      b0 = asm_xor (rkey0, in0);
-      b1 = asm_xor (rkey0, in1);
-      in4 = VEC_BE_SWAP (in4, bige_const);
-      in5 = VEC_BE_SWAP (in5, bige_const);
-      b2 = asm_xor (rkey0, in2);
-      b3 = asm_xor (rkey0, in3);
-      in6 = VEC_BE_SWAP (in6, bige_const);
-      in7 = VEC_BE_SWAP (in7, bige_const);
-      b4 = asm_xor (rkey0, in4);
-      b5 = asm_xor (rkey0, in5);
-      b6 = asm_xor (rkey0, in6);
-      b7 = asm_xor (rkey0, in7);
-
-#define DO_ROUND(r) \
-	      rkey = ALIGNED_LOAD (rk, r); \
-	      b0 = asm_ncipher_be (b0, rkey); \
-	      b1 = asm_ncipher_be (b1, rkey); \
-	      b2 = asm_ncipher_be (b2, rkey); \
-	      b3 = asm_ncipher_be (b3, rkey); \
-	      b4 = asm_ncipher_be (b4, rkey); \
-	      b5 = asm_ncipher_be (b5, rkey); \
-	      b6 = asm_ncipher_be (b6, rkey); \
-	      b7 = asm_ncipher_be (b7, rkey);
-
-      DO_ROUND(1);
-      DO_ROUND(2);
-      DO_ROUND(3);
-      DO_ROUND(4);
-      DO_ROUND(5);
-      DO_ROUND(6);
-      DO_ROUND(7);
-      DO_ROUND(8);
-      DO_ROUND(9);
-      if (rounds >= 12)
-	{
-	  DO_ROUND(10);
-	  DO_ROUND(11);
-	  if (rounds > 12)
-	    {
-	      DO_ROUND(12);
-	      DO_ROUND(13);
-	    }
-	}
-
-#undef DO_ROUND
-
-      iv = asm_xor (rkeylast, iv);
-      in0 = asm_xor (rkeylast, in0);
-      in1 = asm_xor (rkeylast, in1);
-      in2 = asm_xor (rkeylast, in2);
-      b0 = asm_ncipherlast_be (b0, iv);
-      iv = in7;
-      b1 = asm_ncipherlast_be (b1, in0);
-      in3 = asm_xor (rkeylast, in3);
-      in4 = asm_xor (rkeylast, in4);
-      b2 = asm_ncipherlast_be (b2, in1);
-      b3 = asm_ncipherlast_be (b3, in2);
-      in5 = asm_xor (rkeylast, in5);
-      in6 = asm_xor (rkeylast, in6);
-      b0 = VEC_BE_SWAP (b0, bige_const);
-      b1 = VEC_BE_SWAP (b1, bige_const);
-      b4 = asm_ncipherlast_be (b4, in3);
-      b5 = asm_ncipherlast_be (b5, in4);
-      b2 = VEC_BE_SWAP (b2, bige_const);
-      b3 = VEC_BE_SWAP (b3, bige_const);
-      b6 = asm_ncipherlast_be (b6, in5);
-      b7 = asm_ncipherlast_be (b7, in6);
-      b4 = VEC_BE_SWAP (b4, bige_const);
-      b5 = VEC_BE_SWAP (b5, bige_const);
-      b6 = VEC_BE_SWAP (b6, bige_const);
-      b7 = VEC_BE_SWAP (b7, bige_const);
-      VEC_STORE_BE_NOSWAP (out, 0, b0);
-      VEC_STORE_BE_NOSWAP (out, 1, b1);
-      VEC_STORE_BE_NOSWAP (out, 2, b2);
-      VEC_STORE_BE_NOSWAP (out, 3, b3);
-      VEC_STORE_BE_NOSWAP (out, 4, b4);
-      VEC_STORE_BE_NOSWAP (out, 5, b5);
-      VEC_STORE_BE_NOSWAP (out, 6, b6);
-      VEC_STORE_BE_NOSWAP (out, 7, b7);
-      out += 8;
-    }
-
-  if (nblocks >= 4)
-    {
-      in0 = VEC_LOAD_BE (in, 0, bige_const);
-      in1 = VEC_LOAD_BE (in, 1, bige_const);
-      in2 = VEC_LOAD_BE (in, 2, bige_const);
-      in3 = VEC_LOAD_BE (in, 3, bige_const);
-
-      b0 = asm_xor (rkey0, in0);
-      b1 = asm_xor (rkey0, in1);
-      b2 = asm_xor (rkey0, in2);
-      b3 = asm_xor (rkey0, in3);
-
-#define DO_ROUND(r) \
-	      rkey = ALIGNED_LOAD (rk, r); \
-	      b0 = asm_ncipher_be (b0, rkey); \
-	      b1 = asm_ncipher_be (b1, rkey); \
-	      b2 = asm_ncipher_be (b2, rkey); \
-	      b3 = asm_ncipher_be (b3, rkey);
-
-      DO_ROUND(1);
-      DO_ROUND(2);
-      DO_ROUND(3);
-      DO_ROUND(4);
-      DO_ROUND(5);
-      DO_ROUND(6);
-      DO_ROUND(7);
-      DO_ROUND(8);
-      DO_ROUND(9);
-      if (rounds >= 12)
-	{
-	  DO_ROUND(10);
-	  DO_ROUND(11);
-	  if (rounds > 12)
-	    {
-	      DO_ROUND(12);
-	      DO_ROUND(13);
-	    }
-	}
-
-#undef DO_ROUND
-
-      iv = asm_xor (rkeylast, iv);
-      in0 = asm_xor (rkeylast, in0);
-      in1 = asm_xor (rkeylast, in1);
-      in2 = asm_xor (rkeylast, in2);
-
-      b0 = asm_ncipherlast_be (b0, iv);
-      iv = in3;
-      b1 = asm_ncipherlast_be (b1, in0);
-      b2 = asm_ncipherlast_be (b2, in1);
-      b3 = asm_ncipherlast_be (b3, in2);
-
-      VEC_STORE_BE (out, 0, b0, bige_const);
-      VEC_STORE_BE (out, 1, b1, bige_const);
-      VEC_STORE_BE (out, 2, b2, bige_const);
-      VEC_STORE_BE (out, 3, b3, bige_const);
-
-      in += 4;
-      out += 4;
-      nblocks -= 4;
-    }
-
-  for (; nblocks; nblocks--)
-    {
-      rkeylast = rkeylast_orig ^ iv;
-
-      iv = VEC_LOAD_BE (in, 0, bige_const);
-      b = iv;
-      AES_DECRYPT (b, rounds);
-
-      VEC_STORE_BE (out, 0, b, bige_const);
-
-      in++;
-      out++;
-    }
-
-  VEC_STORE_BE (iv_arg, 0, iv, bige_const);
-}
-
-
-void _gcry_aes_ppc8_ctr_enc (void *context, unsigned char *ctr_arg,
-			     void *outbuf_arg, const void *inbuf_arg,
-			     size_t nblocks)
-{
-  static const unsigned char vec_one_const[16] =
-    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 };
-  const block bige_const = asm_load_be_const();
-  RIJNDAEL_context *ctx = context;
-  const u128_t *rk = (u128_t *)&ctx->keyschenc;
-  const u128_t *in = (const u128_t *)inbuf_arg;
-  u128_t *out = (u128_t *)outbuf_arg;
-  int rounds = ctx->rounds;
-  ROUND_KEY_VARIABLES;
-  block rkeylast_orig;
-  block ctr, b, one;
-
-  ctr = VEC_LOAD_BE (ctr_arg, 0, bige_const);
-  one = VEC_LOAD_BE (&vec_one_const, 0, bige_const);
-
-  PRELOAD_ROUND_KEYS (rounds);
-  rkeylast_orig = rkeylast;
-
-  if (nblocks >= 4)
-    {
-      block in0, in1, in2, in3, in4, in5, in6, in7;
-      block b0, b1, b2, b3, b4, b5, b6, b7;
-      block two, three, four;
-      block rkey;
-
-      two   = asm_add_uint128 (one, one);
-      three = asm_add_uint128 (two, one);
-      four  = asm_add_uint128 (two, two);
-
-      for (; nblocks >= 8; nblocks -= 8)
-	{
-	  b1 = asm_add_uint128 (ctr, one);
-	  b2 = asm_add_uint128 (ctr, two);
-	  b3 = asm_add_uint128 (ctr, three);
-	  b4 = asm_add_uint128 (ctr, four);
-	  b5 = asm_add_uint128 (b1, four);
-	  b6 = asm_add_uint128 (b2, four);
-	  b7 = asm_add_uint128 (b3, four);
-	  b0 = asm_xor (rkey0, ctr);
-	  rkey = ALIGNED_LOAD (rk, 1);
-	  ctr = asm_add_uint128 (b4, four);
-	  b1 = asm_xor (rkey0, b1);
-	  b2 = asm_xor (rkey0, b2);
-	  b3 = asm_xor (rkey0, b3);
-	  b0 = asm_cipher_be (b0, rkey);
-	  b1 = asm_cipher_be (b1, rkey);
-	  b2 = asm_cipher_be (b2, rkey);
-	  b3 = asm_cipher_be (b3, rkey);
-	  b4 = asm_xor (rkey0, b4);
-	  b5 = asm_xor (rkey0, b5);
-	  b6 = asm_xor (rkey0, b6);
-	  b7 = asm_xor (rkey0, b7);
-	  b4 = asm_cipher_be (b4, rkey);
-	  b5 = asm_cipher_be (b5, rkey);
-	  b6 = asm_cipher_be (b6, rkey);
-	  b7 = asm_cipher_be (b7, rkey);
-
-#define DO_ROUND(r) \
-	      rkey = ALIGNED_LOAD (rk, r); \
-	      b0 = asm_cipher_be (b0, rkey); \
-	      b1 = asm_cipher_be (b1, rkey); \
-	      b2 = asm_cipher_be (b2, rkey); \
-	      b3 = asm_cipher_be (b3, rkey); \
-	      b4 = asm_cipher_be (b4, rkey); \
-	      b5 = asm_cipher_be (b5, rkey); \
-	      b6 = asm_cipher_be (b6, rkey); \
-	      b7 = asm_cipher_be (b7, rkey);
-
-	  in0 = VEC_LOAD_BE_NOSWAP (in, 0);
-	  DO_ROUND(2);
-	  in1 = VEC_LOAD_BE_NOSWAP (in, 1);
-	  DO_ROUND(3);
-	  in2 = VEC_LOAD_BE_NOSWAP (in, 2);
-	  DO_ROUND(4);
-	  in3 = VEC_LOAD_BE_NOSWAP (in, 3);
-	  DO_ROUND(5);
-	  in4 = VEC_LOAD_BE_NOSWAP (in, 4);
-	  DO_ROUND(6);
-	  in5 = VEC_LOAD_BE_NOSWAP (in, 5);
-	  DO_ROUND(7);
-	  in6 = VEC_LOAD_BE_NOSWAP (in, 6);
-	  DO_ROUND(8);
-	  in7 = VEC_LOAD_BE_NOSWAP (in, 7);
-	  in += 8;
-	  DO_ROUND(9);
-
-	  if (rounds >= 12)
-	    {
-	      DO_ROUND(10);
-	      DO_ROUND(11);
-	      if (rounds > 12)
-		{
-		  DO_ROUND(12);
-		  DO_ROUND(13);
-		}
-	    }
-
-#undef DO_ROUND
-
-	  in0 = VEC_BE_SWAP (in0, bige_const);
-	  in1 = VEC_BE_SWAP (in1, bige_const);
-	  in2 = VEC_BE_SWAP (in2, bige_const);
-	  in3 = VEC_BE_SWAP (in3, bige_const);
-	  in4 = VEC_BE_SWAP (in4, bige_const);
-	  in5 = VEC_BE_SWAP (in5, bige_const);
-	  in6 = VEC_BE_SWAP (in6, bige_const);
-	  in7 = VEC_BE_SWAP (in7, bige_const);
-
-	  in0 = asm_xor (rkeylast, in0);
-	  in1 = asm_xor (rkeylast, in1);
-	  in2 = asm_xor (rkeylast, in2);
-	  in3 = asm_xor (rkeylast, in3);
-	  b0 = asm_cipherlast_be (b0, in0);
-	  b1 = asm_cipherlast_be (b1, in1);
-	  in4 = asm_xor (rkeylast, in4);
-	  in5 = asm_xor (rkeylast, in5);
-	  b2 = asm_cipherlast_be (b2, in2);
-	  b3 = asm_cipherlast_be (b3, in3);
-	  in6 = asm_xor (rkeylast, in6);
-	  in7 = asm_xor (rkeylast, in7);
-	  b4 = asm_cipherlast_be (b4, in4);
-	  b5 = asm_cipherlast_be (b5, in5);
-	  b6 = asm_cipherlast_be (b6, in6);
-	  b7 = asm_cipherlast_be (b7, in7);
-
-	  b0 = VEC_BE_SWAP (b0, bige_const);
-	  b1 = VEC_BE_SWAP (b1, bige_const);
-	  b2 = VEC_BE_SWAP (b2, bige_const);
-	  b3 = VEC_BE_SWAP (b3, bige_const);
-	  b4 = VEC_BE_SWAP (b4, bige_const);
-	  b5 = VEC_BE_SWAP (b5, bige_const);
-	  b6 = VEC_BE_SWAP (b6, bige_const);
-	  b7 = VEC_BE_SWAP (b7, bige_const);
-	  VEC_STORE_BE_NOSWAP (out, 0, b0);
-	  VEC_STORE_BE_NOSWAP (out, 1, b1);
-	  VEC_STORE_BE_NOSWAP (out, 2, b2);
-	  VEC_STORE_BE_NOSWAP (out, 3, b3);
-	  VEC_STORE_BE_NOSWAP (out, 4, b4);
-	  VEC_STORE_BE_NOSWAP (out, 5, b5);
-	  VEC_STORE_BE_NOSWAP (out, 6, b6);
-	  VEC_STORE_BE_NOSWAP (out, 7, b7);
-	  out += 8;
-	}
-
-      if (nblocks >= 4)
-	{
-	  b1 = asm_add_uint128 (ctr, one);
-	  b2 = asm_add_uint128 (ctr, two);
-	  b3 = asm_add_uint128 (ctr, three);
-	  b0 = asm_xor (rkey0, ctr);
-	  ctr = asm_add_uint128 (ctr, four);
-	  b1 = asm_xor (rkey0, b1);
-	  b2 = asm_xor (rkey0, b2);
-	  b3 = asm_xor (rkey0, b3);
-
-#define DO_ROUND(r) \
-	      rkey = ALIGNED_LOAD (rk, r); \
-	      b0 = asm_cipher_be (b0, rkey); \
-	      b1 = asm_cipher_be (b1, rkey); \
-	      b2 = asm_cipher_be (b2, rkey); \
-	      b3 = asm_cipher_be (b3, rkey);
-
-	  DO_ROUND(1);
-	  DO_ROUND(2);
-	  DO_ROUND(3);
-	  DO_ROUND(4);
-	  DO_ROUND(5);
-	  DO_ROUND(6);
-	  DO_ROUND(7);
-	  DO_ROUND(8);
-
-	  in0 = VEC_LOAD_BE (in, 0, bige_const);
-	  in1 = VEC_LOAD_BE (in, 1, bige_const);
-	  in2 = VEC_LOAD_BE (in, 2, bige_const);
-	  in3 = VEC_LOAD_BE (in, 3, bige_const);
-
-	  DO_ROUND(9);
-	  if (rounds >= 12)
-	    {
-	      DO_ROUND(10);
-	      DO_ROUND(11);
-	      if (rounds > 12)
-		{
-		  DO_ROUND(12);
-		  DO_ROUND(13);
-		}
-	    }
-
-#undef DO_ROUND
-
-	  in0 = asm_xor (rkeylast, in0);
-	  in1 = asm_xor (rkeylast, in1);
-	  in2 = asm_xor (rkeylast, in2);
-	  in3 = asm_xor (rkeylast, in3);
-
-	  b0 = asm_cipherlast_be (b0, in0);
-	  b1 = asm_cipherlast_be (b1, in1);
-	  b2 = asm_cipherlast_be (b2, in2);
-	  b3 = asm_cipherlast_be (b3, in3);
-
-	  VEC_STORE_BE (out, 0, b0, bige_const);
-	  VEC_STORE_BE (out, 1, b1, bige_const);
-	  VEC_STORE_BE (out, 2, b2, bige_const);
-	  VEC_STORE_BE (out, 3, b3, bige_const);
-
-	  in += 4;
-	  out += 4;
-	  nblocks -= 4;
-	}
-    }
-
-  for (; nblocks; nblocks--)
-    {
-      b = ctr;
-      ctr = asm_add_uint128 (ctr, one);
-      rkeylast = rkeylast_orig ^ VEC_LOAD_BE (in, 0, bige_const);
-
-      AES_ENCRYPT (b, rounds);
-
-      VEC_STORE_BE (out, 0, b, bige_const);
-
-      out++;
-      in++;
-    }
-
-  VEC_STORE_BE (ctr_arg, 0, ctr, bige_const);
-}
-
-
-size_t _gcry_aes_ppc8_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
-				 const void *inbuf_arg, size_t nblocks,
-				 int encrypt)
-{
-  const block bige_const = asm_load_be_const();
-  RIJNDAEL_context *ctx = (void *)&c->context.c;
-  const u128_t *in = (const u128_t *)inbuf_arg;
-  u128_t *out = (u128_t *)outbuf_arg;
-  int rounds = ctx->rounds;
-  u64 data_nblocks = c->u_mode.ocb.data_nblocks;
-  block l0, l1, l2, l;
-  block b0, b1, b2, b3, b4, b5, b6, b7, b;
-  block iv0, iv1, iv2, iv3, iv4, iv5, iv6, iv7;
-  block rkey, rkeylf;
-  block ctr, iv;
-  ROUND_KEY_VARIABLES;
-
-  iv = VEC_LOAD_BE (c->u_iv.iv, 0, bige_const);
-  ctr = VEC_LOAD_BE (c->u_ctr.ctr, 0, bige_const);
-
-  l0 = VEC_LOAD_BE (c->u_mode.ocb.L[0], 0, bige_const);
-  l1 = VEC_LOAD_BE (c->u_mode.ocb.L[1], 0, bige_const);
-  l2 = VEC_LOAD_BE (c->u_mode.ocb.L[2], 0, bige_const);
-
-  if (encrypt)
-    {
-      const u128_t *rk = (u128_t *)&ctx->keyschenc;
-
-      PRELOAD_ROUND_KEYS (rounds);
-
-      for (; nblocks >= 8 && data_nblocks % 8; nblocks--)
-	{
-	  l = VEC_LOAD_BE (ocb_get_l (c, ++data_nblocks), 0, bige_const);
-	  b = VEC_LOAD_BE (in, 0, bige_const);
-
-	  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
-	  iv ^= l;
-	  /* Checksum_i = Checksum_{i-1} xor P_i  */
-	  ctr ^= b;
-	  /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
-	  b ^= iv;
-	  AES_ENCRYPT (b, rounds);
-	  b ^= iv;
-
-	  VEC_STORE_BE (out, 0, b, bige_const);
-
-	  in += 1;
-	  out += 1;
-	}
-
-      for (; nblocks >= 8; nblocks -= 8)
-	{
-	  b0 = VEC_LOAD_BE_NOSWAP (in, 0);
-	  b1 = VEC_LOAD_BE_NOSWAP (in, 1);
-	  b2 = VEC_LOAD_BE_NOSWAP (in, 2);
-	  b3 = VEC_LOAD_BE_NOSWAP (in, 3);
-	  b4 = VEC_LOAD_BE_NOSWAP (in, 4);
-	  b5 = VEC_LOAD_BE_NOSWAP (in, 5);
-	  b6 = VEC_LOAD_BE_NOSWAP (in, 6);
-	  b7 = VEC_LOAD_BE_NOSWAP (in, 7);
-	  in += 8;
-	  l = VEC_LOAD_BE_NOSWAP (ocb_get_l (c, data_nblocks += 8), 0);
-	  b0 = VEC_BE_SWAP(b0, bige_const);
-	  b1 = VEC_BE_SWAP(b1, bige_const);
-	  b2 = VEC_BE_SWAP(b2, bige_const);
-	  b3 = VEC_BE_SWAP(b3, bige_const);
-	  b4 = VEC_BE_SWAP(b4, bige_const);
-	  b5 = VEC_BE_SWAP(b5, bige_const);
-	  b6 = VEC_BE_SWAP(b6, bige_const);
-	  b7 = VEC_BE_SWAP(b7, bige_const);
-	  l = VEC_BE_SWAP(l, bige_const);
-
-	  ctr ^= b0 ^ b1 ^ b2 ^ b3 ^ b4 ^ b5 ^ b6 ^ b7;
-
-	  iv ^= rkey0;
-
-	  iv0 = iv ^ l0;
-	  iv1 = iv ^ l0 ^ l1;
-	  iv2 = iv ^ l1;
-	  iv3 = iv ^ l1 ^ l2;
-	  iv4 = iv ^ l1 ^ l2 ^ l0;
-	  iv5 = iv ^ l2 ^ l0;
-	  iv6 = iv ^ l2;
-	  iv7 = iv ^ l2 ^ l;
-
-	  b0 ^= iv0;
-	  b1 ^= iv1;
-	  b2 ^= iv2;
-	  b3 ^= iv3;
-	  b4 ^= iv4;
-	  b5 ^= iv5;
-	  b6 ^= iv6;
-	  b7 ^= iv7;
-	  iv = iv7 ^ rkey0;
-
-#define DO_ROUND(r) \
-	      rkey = ALIGNED_LOAD (rk, r); \
-	      b0 = asm_cipher_be (b0, rkey); \
-	      b1 = asm_cipher_be (b1, rkey); \
-	      b2 = asm_cipher_be (b2, rkey); \
-	      b3 = asm_cipher_be (b3, rkey); \
-	      b4 = asm_cipher_be (b4, rkey); \
-	      b5 = asm_cipher_be (b5, rkey); \
-	      b6 = asm_cipher_be (b6, rkey); \
-	      b7 = asm_cipher_be (b7, rkey);
-
-	  DO_ROUND(1);
-	  DO_ROUND(2);
-	  DO_ROUND(3);
-	  DO_ROUND(4);
-	  DO_ROUND(5);
-	  DO_ROUND(6);
-	  DO_ROUND(7);
-
-	  rkeylf = asm_xor (rkeylast, rkey0);
-
-	  DO_ROUND(8);
-
-	  iv0 = asm_xor (rkeylf, iv0);
-	  iv1 = asm_xor (rkeylf, iv1);
-	  iv2 = asm_xor (rkeylf, iv2);
-	  iv3 = asm_xor (rkeylf, iv3);
-	  iv4 = asm_xor (rkeylf, iv4);
-	  iv5 = asm_xor (rkeylf, iv5);
-	  iv6 = asm_xor (rkeylf, iv6);
-	  iv7 = asm_xor (rkeylf, iv7);
-
-	  DO_ROUND(9);
-	  if (rounds >= 12)
-	    {
-	      DO_ROUND(10);
-	      DO_ROUND(11);
-	      if (rounds > 12)
-		{
-		  DO_ROUND(12);
-		  DO_ROUND(13);
-		}
-	    }
-
-#undef DO_ROUND
-
-	  b0 = asm_cipherlast_be (b0, iv0);
-	  b1 = asm_cipherlast_be (b1, iv1);
-	  b2 = asm_cipherlast_be (b2, iv2);
-	  b3 = asm_cipherlast_be (b3, iv3);
-	  b4 = asm_cipherlast_be (b4, iv4);
-	  b5 = asm_cipherlast_be (b5, iv5);
-	  b6 = asm_cipherlast_be (b6, iv6);
-	  b7 = asm_cipherlast_be (b7, iv7);
-
-	  b0 = VEC_BE_SWAP (b0, bige_const);
-	  b1 = VEC_BE_SWAP (b1, bige_const);
-	  b2 = VEC_BE_SWAP (b2, bige_const);
-	  b3 = VEC_BE_SWAP (b3, bige_const);
-	  b4 = VEC_BE_SWAP (b4, bige_const);
-	  b5 = VEC_BE_SWAP (b5, bige_const);
-	  b6 = VEC_BE_SWAP (b6, bige_const);
-	  b7 = VEC_BE_SWAP (b7, bige_const);
-	  VEC_STORE_BE_NOSWAP (out, 0, b0);
-	  VEC_STORE_BE_NOSWAP (out, 1, b1);
-	  VEC_STORE_BE_NOSWAP (out, 2, b2);
-	  VEC_STORE_BE_NOSWAP (out, 3, b3);
-	  VEC_STORE_BE_NOSWAP (out, 4, b4);
-	  VEC_STORE_BE_NOSWAP (out, 5, b5);
-	  VEC_STORE_BE_NOSWAP (out, 6, b6);
-	  VEC_STORE_BE_NOSWAP (out, 7, b7);
-	  out += 8;
-	}
-
-      if (nblocks >= 4 && (data_nblocks % 4) == 0)
-	{
-	  b0 = VEC_LOAD_BE (in, 0, bige_const);
-	  b1 = VEC_LOAD_BE (in, 1, bige_const);
-	  b2 = VEC_LOAD_BE (in, 2, bige_const);
-	  b3 = VEC_LOAD_BE (in, 3, bige_const);
-
-	  l = VEC_LOAD_BE (ocb_get_l (c, data_nblocks += 4), 0, bige_const);
-
-	  ctr ^= b0 ^ b1 ^ b2 ^ b3;
-
-	  iv ^= rkey0;
-
-	  iv0 = iv ^ l0;
-	  iv1 = iv ^ l0 ^ l1;
-	  iv2 = iv ^ l1;
-	  iv3 = iv ^ l1 ^ l;
-
-	  b0 ^= iv0;
-	  b1 ^= iv1;
-	  b2 ^= iv2;
-	  b3 ^= iv3;
-	  iv = iv3 ^ rkey0;
-
-#define DO_ROUND(r) \
-	      rkey = ALIGNED_LOAD (rk, r); \
-	      b0 = asm_cipher_be (b0, rkey); \
-	      b1 = asm_cipher_be (b1, rkey); \
-	      b2 = asm_cipher_be (b2, rkey); \
-	      b3 = asm_cipher_be (b3, rkey);
-
-	  DO_ROUND(1);
-	  DO_ROUND(2);
-	  DO_ROUND(3);
-	  DO_ROUND(4);
-	  DO_ROUND(5);
-	  DO_ROUND(6);
-	  DO_ROUND(7);
-	  DO_ROUND(8);
-	  DO_ROUND(9);
-	  if (rounds >= 12)
-	    {
-	      DO_ROUND(10);
-	      DO_ROUND(11);
-	      if (rounds > 12)
-		{
-		  DO_ROUND(12);
-		  DO_ROUND(13);
-		}
-	    }
-
-#undef DO_ROUND
-
-	  rkey = rkeylast ^ rkey0;
-	  b0 = asm_cipherlast_be (b0, rkey ^ iv0);
-	  b1 = asm_cipherlast_be (b1, rkey ^ iv1);
-	  b2 = asm_cipherlast_be (b2, rkey ^ iv2);
-	  b3 = asm_cipherlast_be (b3, rkey ^ iv3);
-
-	  VEC_STORE_BE (out, 0, b0, bige_const);
-	  VEC_STORE_BE (out, 1, b1, bige_const);
-	  VEC_STORE_BE (out, 2, b2, bige_const);
-	  VEC_STORE_BE (out, 3, b3, bige_const);
-
-	  in += 4;
-	  out += 4;
-	  nblocks -= 4;
-	}
-
-      for (; nblocks; nblocks--)
-	{
-	  l = VEC_LOAD_BE (ocb_get_l (c, ++data_nblocks), 0, bige_const);
-	  b = VEC_LOAD_BE (in, 0, bige_const);
-
-	  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
-	  iv ^= l;
-	  /* Checksum_i = Checksum_{i-1} xor P_i  */
-	  ctr ^= b;
-	  /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
-	  b ^= iv;
-	  AES_ENCRYPT (b, rounds);
-	  b ^= iv;
-
-	  VEC_STORE_BE (out, 0, b, bige_const);
-
-	  in += 1;
-	  out += 1;
-	}
-    }
-  else
-    {
-      const u128_t *rk = (u128_t *)&ctx->keyschdec;
-
-      if (!ctx->decryption_prepared)
-	{
-	  aes_ppc8_prepare_decryption (ctx);
-	  ctx->decryption_prepared = 1;
-	}
-
-      PRELOAD_ROUND_KEYS (rounds);
-
-      for (; nblocks >= 8 && data_nblocks % 8; nblocks--)
-	{
-	  l = VEC_LOAD_BE (ocb_get_l (c, ++data_nblocks), 0, bige_const);
-	  b = VEC_LOAD_BE (in, 0, bige_const);
-
-	  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
-	  iv ^= l;
-	  /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i)  */
-	  b ^= iv;
-	  AES_DECRYPT (b, rounds);
-	  b ^= iv;
-	  /* Checksum_i = Checksum_{i-1} xor P_i  */
-	  ctr ^= b;
-
-	  VEC_STORE_BE (out, 0, b, bige_const);
-
-	  in += 1;
-	  out += 1;
-	}
-
-      for (; nblocks >= 8; nblocks -= 8)
-	{
-	  b0 = VEC_LOAD_BE_NOSWAP (in, 0);
-	  b1 = VEC_LOAD_BE_NOSWAP (in, 1);
-	  b2 = VEC_LOAD_BE_NOSWAP (in, 2);
-	  b3 = VEC_LOAD_BE_NOSWAP (in, 3);
-	  b4 = VEC_LOAD_BE_NOSWAP (in, 4);
-	  b5 = VEC_LOAD_BE_NOSWAP (in, 5);
-	  b6 = VEC_LOAD_BE_NOSWAP (in, 6);
-	  b7 = VEC_LOAD_BE_NOSWAP (in, 7);
-	  in += 8;
-	  l = VEC_LOAD_BE_NOSWAP (ocb_get_l (c, data_nblocks += 8), 0);
-	  b0 = VEC_BE_SWAP(b0, bige_const);
-	  b1 = VEC_BE_SWAP(b1, bige_const);
-	  b2 = VEC_BE_SWAP(b2, bige_const);
-	  b3 = VEC_BE_SWAP(b3, bige_const);
-	  b4 = VEC_BE_SWAP(b4, bige_const);
-	  b5 = VEC_BE_SWAP(b5, bige_const);
-	  b6 = VEC_BE_SWAP(b6, bige_const);
-	  b7 = VEC_BE_SWAP(b7, bige_const);
-	  l = VEC_BE_SWAP(l, bige_const);
-
-	  iv ^= rkey0;
-
-	  iv0 = iv ^ l0;
-	  iv1 = iv ^ l0 ^ l1;
-	  iv2 = iv ^ l1;
-	  iv3 = iv ^ l1 ^ l2;
-	  iv4 = iv ^ l1 ^ l2 ^ l0;
-	  iv5 = iv ^ l2 ^ l0;
-	  iv6 = iv ^ l2;
-	  iv7 = iv ^ l2 ^ l;
-
-	  b0 ^= iv0;
-	  b1 ^= iv1;
-	  b2 ^= iv2;
-	  b3 ^= iv3;
-	  b4 ^= iv4;
-	  b5 ^= iv5;
-	  b6 ^= iv6;
-	  b7 ^= iv7;
-	  iv = iv7 ^ rkey0;
-
-#define DO_ROUND(r) \
-	      rkey = ALIGNED_LOAD (rk, r); \
-	      b0 = asm_ncipher_be (b0, rkey); \
-	      b1 = asm_ncipher_be (b1, rkey); \
-	      b2 = asm_ncipher_be (b2, rkey); \
-	      b3 = asm_ncipher_be (b3, rkey); \
-	      b4 = asm_ncipher_be (b4, rkey); \
-	      b5 = asm_ncipher_be (b5, rkey); \
-	      b6 = asm_ncipher_be (b6, rkey); \
-	      b7 = asm_ncipher_be (b7, rkey);
-
-	  DO_ROUND(1);
-	  DO_ROUND(2);
-	  DO_ROUND(3);
-	  DO_ROUND(4);
-	  DO_ROUND(5);
-	  DO_ROUND(6);
-	  DO_ROUND(7);
-
-	  rkeylf = asm_xor (rkeylast, rkey0);
-
-	  DO_ROUND(8);
-
-	  iv0 = asm_xor (rkeylf, iv0);
-	  iv1 = asm_xor (rkeylf, iv1);
-	  iv2 = asm_xor (rkeylf, iv2);
-	  iv3 = asm_xor (rkeylf, iv3);
-	  iv4 = asm_xor (rkeylf, iv4);
-	  iv5 = asm_xor (rkeylf, iv5);
-	  iv6 = asm_xor (rkeylf, iv6);
-	  iv7 = asm_xor (rkeylf, iv7);
-
-	  DO_ROUND(9);
-	  if (rounds >= 12)
-	    {
-	      DO_ROUND(10);
-	      DO_ROUND(11);
-	      if (rounds > 12)
-		{
-		  DO_ROUND(12);
-		  DO_ROUND(13);
-		}
-	    }
-
-#undef DO_ROUND
-
-	  b0 = asm_ncipherlast_be (b0, iv0);
-	  b1 = asm_ncipherlast_be (b1, iv1);
-	  b2 = asm_ncipherlast_be (b2, iv2);
-	  b3 = asm_ncipherlast_be (b3, iv3);
-	  b4 = asm_ncipherlast_be (b4, iv4);
-	  b5 = asm_ncipherlast_be (b5, iv5);
-	  b6 = asm_ncipherlast_be (b6, iv6);
-	  b7 = asm_ncipherlast_be (b7, iv7);
-
-	  ctr ^= b0 ^ b1 ^ b2 ^ b3 ^ b4 ^ b5 ^ b6 ^ b7;
-
-	  b0 = VEC_BE_SWAP (b0, bige_const);
-	  b1 = VEC_BE_SWAP (b1, bige_const);
-	  b2 = VEC_BE_SWAP (b2, bige_const);
-	  b3 = VEC_BE_SWAP (b3, bige_const);
-	  b4 = VEC_BE_SWAP (b4, bige_const);
-	  b5 = VEC_BE_SWAP (b5, bige_const);
-	  b6 = VEC_BE_SWAP (b6, bige_const);
-	  b7 = VEC_BE_SWAP (b7, bige_const);
-	  VEC_STORE_BE_NOSWAP (out, 0, b0);
-	  VEC_STORE_BE_NOSWAP (out, 1, b1);
-	  VEC_STORE_BE_NOSWAP (out, 2, b2);
-	  VEC_STORE_BE_NOSWAP (out, 3, b3);
-	  VEC_STORE_BE_NOSWAP (out, 4, b4);
-	  VEC_STORE_BE_NOSWAP (out, 5, b5);
-	  VEC_STORE_BE_NOSWAP (out, 6, b6);
-	  VEC_STORE_BE_NOSWAP (out, 7, b7);
-	  out += 8;
-	}
-
-      if (nblocks >= 4 && (data_nblocks % 4) == 0)
-	{
-	  b0 = VEC_LOAD_BE (in, 0, bige_const);
-	  b1 = VEC_LOAD_BE (in, 1, bige_const);
-	  b2 = VEC_LOAD_BE (in, 2, bige_const);
-	  b3 = VEC_LOAD_BE (in, 3, bige_const);
-
-	  l = VEC_LOAD_BE (ocb_get_l (c, data_nblocks += 4), 0, bige_const);
-
-	  iv ^= rkey0;
-
-	  iv0 = iv ^ l0;
-	  iv1 = iv ^ l0 ^ l1;
-	  iv2 = iv ^ l1;
-	  iv3 = iv ^ l1 ^ l;
-
-	  b0 ^= iv0;
-	  b1 ^= iv1;
-	  b2 ^= iv2;
-	  b3 ^= iv3;
-	  iv = iv3 ^ rkey0;
-
-#define DO_ROUND(r) \
-	      rkey = ALIGNED_LOAD (rk, r); \
-	      b0 = asm_ncipher_be (b0, rkey); \
-	      b1 = asm_ncipher_be (b1, rkey); \
-	      b2 = asm_ncipher_be (b2, rkey); \
-	      b3 = asm_ncipher_be (b3, rkey);
-
-	  DO_ROUND(1);
-	  DO_ROUND(2);
-	  DO_ROUND(3);
-	  DO_ROUND(4);
-	  DO_ROUND(5);
-	  DO_ROUND(6);
-	  DO_ROUND(7);
-	  DO_ROUND(8);
-	  DO_ROUND(9);
-	  if (rounds >= 12)
-	    {
-	      DO_ROUND(10);
-	      DO_ROUND(11);
-	      if (rounds > 12)
-		{
-		  DO_ROUND(12);
-		  DO_ROUND(13);
-		}
-	    }
-
-#undef DO_ROUND
-
-	  rkey = rkeylast ^ rkey0;
-	  b0 = asm_ncipherlast_be (b0, rkey ^ iv0);
-	  b1 = asm_ncipherlast_be (b1, rkey ^ iv1);
-	  b2 = asm_ncipherlast_be (b2, rkey ^ iv2);
-	  b3 = asm_ncipherlast_be (b3, rkey ^ iv3);
-
-	  VEC_STORE_BE (out, 0, b0, bige_const);
-	  VEC_STORE_BE (out, 1, b1, bige_const);
-	  VEC_STORE_BE (out, 2, b2, bige_const);
-	  VEC_STORE_BE (out, 3, b3, bige_const);
-
-	  ctr ^= b0 ^ b1 ^ b2 ^ b3;
-
-	  in += 4;
-	  out += 4;
-	  nblocks -= 4;
-	}
-
-      for (; nblocks; nblocks--)
-	{
-	  l = VEC_LOAD_BE (ocb_get_l (c, ++data_nblocks), 0, bige_const);
-	  b = VEC_LOAD_BE (in, 0, bige_const);
-
-	  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
-	  iv ^= l;
-	  /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i)  */
-	  b ^= iv;
-	  AES_DECRYPT (b, rounds);
-	  b ^= iv;
-	  /* Checksum_i = Checksum_{i-1} xor P_i  */
-	  ctr ^= b;
-
-	  VEC_STORE_BE (out, 0, b, bige_const);
-
-	  in += 1;
-	  out += 1;
-	}
-    }
-
-  VEC_STORE_BE (c->u_iv.iv, 0, iv, bige_const);
-  VEC_STORE_BE (c->u_ctr.ctr, 0, ctr, bige_const);
-  c->u_mode.ocb.data_nblocks = data_nblocks;
-
-  return 0;
-}
-
-size_t _gcry_aes_ppc8_ocb_auth (gcry_cipher_hd_t c, void *abuf_arg,
-				size_t nblocks)
-{
-  const block bige_const = asm_load_be_const();
-  RIJNDAEL_context *ctx = (void *)&c->context.c;
-  const u128_t *rk = (u128_t *)&ctx->keyschenc;
-  const u128_t *abuf = (const u128_t *)abuf_arg;
-  int rounds = ctx->rounds;
-  u64 data_nblocks = c->u_mode.ocb.aad_nblocks;
-  block l0, l1, l2, l;
-  block b0, b1, b2, b3, b4, b5, b6, b7, b;
-  block iv0, iv1, iv2, iv3, iv4, iv5, iv6, iv7;
-  block rkey, frkey;
-  block ctr, iv;
-  ROUND_KEY_VARIABLES;
-
-  iv = VEC_LOAD_BE (c->u_mode.ocb.aad_offset, 0, bige_const);
-  ctr = VEC_LOAD_BE (c->u_mode.ocb.aad_sum, 0, bige_const);
-
-  l0 = VEC_LOAD_BE (c->u_mode.ocb.L[0], 0, bige_const);
-  l1 = VEC_LOAD_BE (c->u_mode.ocb.L[1], 0, bige_const);
-  l2 = VEC_LOAD_BE (c->u_mode.ocb.L[2], 0, bige_const);
-
-  PRELOAD_ROUND_KEYS (rounds);
-
-  for (; nblocks >= 8 && data_nblocks % 8; nblocks--)
-    {
-      l = VEC_LOAD_BE (ocb_get_l (c, ++data_nblocks), 0, bige_const);
-      b = VEC_LOAD_BE (abuf, 0, bige_const);
-
-      /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
-      iv ^= l;
-      /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i)  */
-      b ^= iv;
-      AES_ENCRYPT (b, rounds);
-      ctr ^= b;
-
-      abuf += 1;
-    }
-
-  for (; nblocks >= 8; nblocks -= 8)
-    {
-      b0 = VEC_LOAD_BE (abuf, 0, bige_const);
-      b1 = VEC_LOAD_BE (abuf, 1, bige_const);
-      b2 = VEC_LOAD_BE (abuf, 2, bige_const);
-      b3 = VEC_LOAD_BE (abuf, 3, bige_const);
-      b4 = VEC_LOAD_BE (abuf, 4, bige_const);
-      b5 = VEC_LOAD_BE (abuf, 5, bige_const);
-      b6 = VEC_LOAD_BE (abuf, 6, bige_const);
-      b7 = VEC_LOAD_BE (abuf, 7, bige_const);
-
-      l = VEC_LOAD_BE (ocb_get_l (c, data_nblocks += 8), 0, bige_const);
-
-      frkey = rkey0;
-      iv ^= frkey;
-
-      iv0 = iv ^ l0;
-      iv1 = iv ^ l0 ^ l1;
-      iv2 = iv ^ l1;
-      iv3 = iv ^ l1 ^ l2;
-      iv4 = iv ^ l1 ^ l2 ^ l0;
-      iv5 = iv ^ l2 ^ l0;
-      iv6 = iv ^ l2;
-      iv7 = iv ^ l2 ^ l;
-
-      b0 ^= iv0;
-      b1 ^= iv1;
-      b2 ^= iv2;
-      b3 ^= iv3;
-      b4 ^= iv4;
-      b5 ^= iv5;
-      b6 ^= iv6;
-      b7 ^= iv7;
-      iv = iv7 ^ frkey;
-
-#define DO_ROUND(r) \
-	      rkey = ALIGNED_LOAD (rk, r); \
-	      b0 = asm_cipher_be (b0, rkey); \
-	      b1 = asm_cipher_be (b1, rkey); \
-	      b2 = asm_cipher_be (b2, rkey); \
-	      b3 = asm_cipher_be (b3, rkey); \
-	      b4 = asm_cipher_be (b4, rkey); \
-	      b5 = asm_cipher_be (b5, rkey); \
-	      b6 = asm_cipher_be (b6, rkey); \
-	      b7 = asm_cipher_be (b7, rkey);
-
-      DO_ROUND(1);
-      DO_ROUND(2);
-      DO_ROUND(3);
-      DO_ROUND(4);
-      DO_ROUND(5);
-      DO_ROUND(6);
-      DO_ROUND(7);
-      DO_ROUND(8);
-      DO_ROUND(9);
-      if (rounds >= 12)
-	{
-	  DO_ROUND(10);
-	  DO_ROUND(11);
-	  if (rounds > 12)
-	    {
-	      DO_ROUND(12);
-	      DO_ROUND(13);
-	    }
-	}
-
-#undef DO_ROUND
-
-      rkey = rkeylast;
-      b0 = asm_cipherlast_be (b0, rkey);
-      b1 = asm_cipherlast_be (b1, rkey);
-      b2 = asm_cipherlast_be (b2, rkey);
-      b3 = asm_cipherlast_be (b3, rkey);
-      b4 = asm_cipherlast_be (b4, rkey);
-      b5 = asm_cipherlast_be (b5, rkey);
-      b6 = asm_cipherlast_be (b6, rkey);
-      b7 = asm_cipherlast_be (b7, rkey);
-
-      ctr ^= b0 ^ b1 ^ b2 ^ b3 ^ b4 ^ b5 ^ b6 ^ b7;
-
-      abuf += 8;
-    }
-
-  if (nblocks >= 4 && (data_nblocks % 4) == 0)
-    {
-      b0 = VEC_LOAD_BE (abuf, 0, bige_const);
-      b1 = VEC_LOAD_BE (abuf, 1, bige_const);
-      b2 = VEC_LOAD_BE (abuf, 2, bige_const);
-      b3 = VEC_LOAD_BE (abuf, 3, bige_const);
-
-      l = VEC_LOAD_BE (ocb_get_l (c, data_nblocks += 4), 0, bige_const);
-
-      frkey = rkey0;
-      iv ^= frkey;
-
-      iv0 = iv ^ l0;
-      iv1 = iv ^ l0 ^ l1;
-      iv2 = iv ^ l1;
-      iv3 = iv ^ l1 ^ l;
-
-      b0 ^= iv0;
-      b1 ^= iv1;
-      b2 ^= iv2;
-      b3 ^= iv3;
-      iv = iv3 ^ frkey;
-
-#define DO_ROUND(r) \
-	      rkey = ALIGNED_LOAD (rk, r); \
-	      b0 = asm_cipher_be (b0, rkey); \
-	      b1 = asm_cipher_be (b1, rkey); \
-	      b2 = asm_cipher_be (b2, rkey); \
-	      b3 = asm_cipher_be (b3, rkey);
-
-      DO_ROUND(1);
-      DO_ROUND(2);
-      DO_ROUND(3);
-      DO_ROUND(4);
-      DO_ROUND(5);
-      DO_ROUND(6);
-      DO_ROUND(7);
-      DO_ROUND(8);
-      DO_ROUND(9);
-      if (rounds >= 12)
-	{
-	  DO_ROUND(10);
-	  DO_ROUND(11);
-	  if (rounds > 12)
-	    {
-	      DO_ROUND(12);
-	      DO_ROUND(13);
-	    }
-	}
-
-#undef DO_ROUND
-
-      rkey = rkeylast;
-      b0 = asm_cipherlast_be (b0, rkey);
-      b1 = asm_cipherlast_be (b1, rkey);
-      b2 = asm_cipherlast_be (b2, rkey);
-      b3 = asm_cipherlast_be (b3, rkey);
-
-      ctr ^= b0 ^ b1 ^ b2 ^ b3;
-
-      abuf += 4;
-      nblocks -= 4;
-    }
-
-  for (; nblocks; nblocks--)
-    {
-      l = VEC_LOAD_BE (ocb_get_l (c, ++data_nblocks), 0, bige_const);
-      b = VEC_LOAD_BE (abuf, 0, bige_const);
-
-      /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
-      iv ^= l;
-      /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i)  */
-      b ^= iv;
-      AES_ENCRYPT (b, rounds);
-      ctr ^= b;
-
-      abuf += 1;
-    }
-
-  VEC_STORE_BE (c->u_mode.ocb.aad_offset, 0, iv, bige_const);
-  VEC_STORE_BE (c->u_mode.ocb.aad_sum, 0, ctr, bige_const);
-  c->u_mode.ocb.aad_nblocks = data_nblocks;
-
-  return 0;
+  internal_aes_ppc_prepare_decryption (ctx);
 }
 
 
-void _gcry_aes_ppc8_xts_crypt (void *context, unsigned char *tweak_arg,
-			       void *outbuf_arg, const void *inbuf_arg,
-			       size_t nblocks, int encrypt)
-{
-#ifdef WORDS_BIGENDIAN
-  static const block vec_bswap64_const =
-    { 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 };
-  static const block vec_bswap128_const =
-    { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
-#else
-  static const block vec_bswap64_const =
-    { ~8, ~9, ~10, ~11, ~12, ~13, ~14, ~15, ~0, ~1, ~2, ~3, ~4, ~5, ~6, ~7 };
-  static const block vec_bswap128_const =
-    { ~15, ~14, ~13, ~12, ~11, ~10, ~9, ~8, ~7, ~6, ~5, ~4, ~3, ~2, ~1, ~0 };
-  static const block vec_tweakin_swap_const =
-    { ~12, ~13, ~14, ~15, ~8, ~9, ~10, ~11, ~4, ~5, ~6, ~7, ~0, ~1, ~2, ~3 };
-#endif
-  static const unsigned char vec_tweak_const[16] =
-    { 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0x87 };
-  static const vector unsigned long long vec_shift63_const =
-    { 63, 63 };
-  static const vector unsigned long long vec_shift1_const =
-    { 1, 1 };
-  const block bige_const = asm_load_be_const();
-  RIJNDAEL_context *ctx = context;
-  const u128_t *in = (const u128_t *)inbuf_arg;
-  u128_t *out = (u128_t *)outbuf_arg;
-  int rounds = ctx->rounds;
-  block tweak;
-  block b0, b1, b2, b3, b4, b5, b6, b7, b, rkey, rkeylf;
-  block tweak0, tweak1, tweak2, tweak3, tweak4, tweak5, tweak6, tweak7;
-  block tweak_const, bswap64_const, bswap128_const;
-  vector unsigned long long shift63_const, shift1_const;
-  ROUND_KEY_VARIABLES;
-
-  tweak_const = VEC_LOAD_BE (&vec_tweak_const, 0, bige_const);
-  bswap64_const = ALIGNED_LOAD (&vec_bswap64_const, 0);
-  bswap128_const = ALIGNED_LOAD (&vec_bswap128_const, 0);
-  shift63_const = (vector unsigned long long)ALIGNED_LOAD (&vec_shift63_const, 0);
-  shift1_const = (vector unsigned long long)ALIGNED_LOAD (&vec_shift1_const, 0);
-
-#ifdef WORDS_BIGENDIAN
-  tweak = VEC_LOAD_BE (tweak_arg, 0, bige_const);
-  tweak = asm_vperm1 (tweak, bswap128_const);
-#else
-  tweak = VEC_LOAD_BE (tweak_arg, 0, vec_tweakin_swap_const);
-#endif
-
-#define GEN_TWEAK(tout, tin) /* Generate next tweak. */ \
-    do { \
-      block tmp1, tmp2; \
-      tmp1 = asm_vperm1((tin), bswap64_const); \
-      tmp2 = (block)vec_sl((vector unsigned long long)(tin), shift1_const); \
-      tmp1 = (block)(vec_sra((vector unsigned long long)tmp1, shift63_const)) & \
-	     tweak_const; \
-      tout = asm_xor(tmp1, tmp2); \
-    } while (0)
-
-  if (encrypt)
-    {
-      const u128_t *rk = (u128_t *)&ctx->keyschenc;
-
-      PRELOAD_ROUND_KEYS (rounds);
-
-      for (; nblocks >= 8; nblocks -= 8)
-	{
-	  b0 = VEC_LOAD_BE_NOSWAP (in, 0);
-	  b1 = VEC_LOAD_BE_NOSWAP (in, 1);
-	  b2 = VEC_LOAD_BE_NOSWAP (in, 2);
-	  b3 = VEC_LOAD_BE_NOSWAP (in, 3);
-	  tweak0 = tweak;
-	  GEN_TWEAK (tweak1, tweak0);
-	  tweak0 = asm_vperm1 (tweak0, bswap128_const);
-	  b4 = VEC_LOAD_BE_NOSWAP (in, 4);
-	  b5 = VEC_LOAD_BE_NOSWAP (in, 5);
-	  GEN_TWEAK (tweak2, tweak1);
-	  tweak1 = asm_vperm1 (tweak1, bswap128_const);
-	  b6 = VEC_LOAD_BE_NOSWAP (in, 6);
-	  b7 = VEC_LOAD_BE_NOSWAP (in, 7);
-	  in += 8;
-
-	  b0 = VEC_BE_SWAP(b0, bige_const);
-	  b1 = VEC_BE_SWAP(b1, bige_const);
-	  GEN_TWEAK (tweak3, tweak2);
-	  tweak2 = asm_vperm1 (tweak2, bswap128_const);
-	  GEN_TWEAK (tweak4, tweak3);
-	  tweak3 = asm_vperm1 (tweak3, bswap128_const);
-	  b2 = VEC_BE_SWAP(b2, bige_const);
-	  b3 = VEC_BE_SWAP(b3, bige_const);
-	  GEN_TWEAK (tweak5, tweak4);
-	  tweak4 = asm_vperm1 (tweak4, bswap128_const);
-	  GEN_TWEAK (tweak6, tweak5);
-	  tweak5 = asm_vperm1 (tweak5, bswap128_const);
-	  b4 = VEC_BE_SWAP(b4, bige_const);
-	  b5 = VEC_BE_SWAP(b5, bige_const);
-	  GEN_TWEAK (tweak7, tweak6);
-	  tweak6 = asm_vperm1 (tweak6, bswap128_const);
-	  GEN_TWEAK (tweak, tweak7);
-	  tweak7 = asm_vperm1 (tweak7, bswap128_const);
-	  b6 = VEC_BE_SWAP(b6, bige_const);
-	  b7 = VEC_BE_SWAP(b7, bige_const);
-
-	  tweak0 = asm_xor (tweak0, rkey0);
-	  tweak1 = asm_xor (tweak1, rkey0);
-	  tweak2 = asm_xor (tweak2, rkey0);
-	  tweak3 = asm_xor (tweak3, rkey0);
-	  tweak4 = asm_xor (tweak4, rkey0);
-	  tweak5 = asm_xor (tweak5, rkey0);
-	  tweak6 = asm_xor (tweak6, rkey0);
-	  tweak7 = asm_xor (tweak7, rkey0);
-
-	  b0 = asm_xor (b0, tweak0);
-	  b1 = asm_xor (b1, tweak1);
-	  b2 = asm_xor (b2, tweak2);
-	  b3 = asm_xor (b3, tweak3);
-	  b4 = asm_xor (b4, tweak4);
-	  b5 = asm_xor (b5, tweak5);
-	  b6 = asm_xor (b6, tweak6);
-	  b7 = asm_xor (b7, tweak7);
-
-#define DO_ROUND(r) \
-	      rkey = ALIGNED_LOAD (rk, r); \
-	      b0 = asm_cipher_be (b0, rkey); \
-	      b1 = asm_cipher_be (b1, rkey); \
-	      b2 = asm_cipher_be (b2, rkey); \
-	      b3 = asm_cipher_be (b3, rkey); \
-	      b4 = asm_cipher_be (b4, rkey); \
-	      b5 = asm_cipher_be (b5, rkey); \
-	      b6 = asm_cipher_be (b6, rkey); \
-	      b7 = asm_cipher_be (b7, rkey);
-
-	  DO_ROUND(1);
-	  DO_ROUND(2);
-	  DO_ROUND(3);
-	  DO_ROUND(4);
-	  DO_ROUND(5);
-	  DO_ROUND(6);
-	  DO_ROUND(7);
-
-	  rkeylf = asm_xor (rkeylast, rkey0);
-
-	  DO_ROUND(8);
-
-	  tweak0 = asm_xor (tweak0, rkeylf);
-	  tweak1 = asm_xor (tweak1, rkeylf);
-	  tweak2 = asm_xor (tweak2, rkeylf);
-	  tweak3 = asm_xor (tweak3, rkeylf);
-	  tweak4 = asm_xor (tweak4, rkeylf);
-	  tweak5 = asm_xor (tweak5, rkeylf);
-	  tweak6 = asm_xor (tweak6, rkeylf);
-	  tweak7 = asm_xor (tweak7, rkeylf);
-
-	  DO_ROUND(9);
-	  if (rounds >= 12)
-	    {
-	      DO_ROUND(10);
-	      DO_ROUND(11);
-	      if (rounds > 12)
-		{
-		  DO_ROUND(12);
-		  DO_ROUND(13);
-		}
-	    }
-
-#undef DO_ROUND
-
-	  b0 = asm_cipherlast_be (b0, tweak0);
-	  b1 = asm_cipherlast_be (b1, tweak1);
-	  b2 = asm_cipherlast_be (b2, tweak2);
-	  b3 = asm_cipherlast_be (b3, tweak3);
-	  b0 = VEC_BE_SWAP (b0, bige_const);
-	  b1 = VEC_BE_SWAP (b1, bige_const);
-	  b4 = asm_cipherlast_be (b4, tweak4);
-	  b5 = asm_cipherlast_be (b5, tweak5);
-	  b2 = VEC_BE_SWAP (b2, bige_const);
-	  b3 = VEC_BE_SWAP (b3, bige_const);
-	  b6 = asm_cipherlast_be (b6, tweak6);
-	  b7 = asm_cipherlast_be (b7, tweak7);
-	  VEC_STORE_BE_NOSWAP (out, 0, b0);
-	  VEC_STORE_BE_NOSWAP (out, 1, b1);
-	  b4 = VEC_BE_SWAP (b4, bige_const);
-	  b5 = VEC_BE_SWAP (b5, bige_const);
-	  VEC_STORE_BE_NOSWAP (out, 2, b2);
-	  VEC_STORE_BE_NOSWAP (out, 3, b3);
-	  b6 = VEC_BE_SWAP (b6, bige_const);
-	  b7 = VEC_BE_SWAP (b7, bige_const);
-	  VEC_STORE_BE_NOSWAP (out, 4, b4);
-	  VEC_STORE_BE_NOSWAP (out, 5, b5);
-	  VEC_STORE_BE_NOSWAP (out, 6, b6);
-	  VEC_STORE_BE_NOSWAP (out, 7, b7);
-	  out += 8;
-	}
+#define GCRY_AES_PPC8 1
+#define ENCRYPT_BLOCK_FUNC	_gcry_aes_ppc8_encrypt
+#define DECRYPT_BLOCK_FUNC	_gcry_aes_ppc8_decrypt
+#define CFB_ENC_FUNC		_gcry_aes_ppc8_cfb_enc
+#define CFB_DEC_FUNC		_gcry_aes_ppc8_cfb_dec
+#define CBC_ENC_FUNC		_gcry_aes_ppc8_cbc_enc
+#define CBC_DEC_FUNC		_gcry_aes_ppc8_cbc_dec
+#define CTR_ENC_FUNC		_gcry_aes_ppc8_ctr_enc
+#define OCB_CRYPT_FUNC		_gcry_aes_ppc8_ocb_crypt
+#define OCB_AUTH_FUNC		_gcry_aes_ppc8_ocb_auth
+#define XTS_CRYPT_FUNC		_gcry_aes_ppc8_xts_crypt
 
-      if (nblocks >= 4)
-	{
-	  tweak0 = tweak;
-	  GEN_TWEAK (tweak1, tweak0);
-	  GEN_TWEAK (tweak2, tweak1);
-	  GEN_TWEAK (tweak3, tweak2);
-	  GEN_TWEAK (tweak, tweak3);
-
-	  b0 = VEC_LOAD_BE (in, 0, bige_const);
-	  b1 = VEC_LOAD_BE (in, 1, bige_const);
-	  b2 = VEC_LOAD_BE (in, 2, bige_const);
-	  b3 = VEC_LOAD_BE (in, 3, bige_const);
-
-	  tweak0 = asm_vperm1 (tweak0, bswap128_const);
-	  tweak1 = asm_vperm1 (tweak1, bswap128_const);
-	  tweak2 = asm_vperm1 (tweak2, bswap128_const);
-	  tweak3 = asm_vperm1 (tweak3, bswap128_const);
-
-	  b0 ^= tweak0 ^ rkey0;
-	  b1 ^= tweak1 ^ rkey0;
-	  b2 ^= tweak2 ^ rkey0;
-	  b3 ^= tweak3 ^ rkey0;
-
-#define DO_ROUND(r) \
-	      rkey = ALIGNED_LOAD (rk, r); \
-	      b0 = asm_cipher_be (b0, rkey); \
-	      b1 = asm_cipher_be (b1, rkey); \
-	      b2 = asm_cipher_be (b2, rkey); \
-	      b3 = asm_cipher_be (b3, rkey);
-
-	  DO_ROUND(1);
-	  DO_ROUND(2);
-	  DO_ROUND(3);
-	  DO_ROUND(4);
-	  DO_ROUND(5);
-	  DO_ROUND(6);
-	  DO_ROUND(7);
-	  DO_ROUND(8);
-	  DO_ROUND(9);
-	  if (rounds >= 12)
-	    {
-	      DO_ROUND(10);
-	      DO_ROUND(11);
-	      if (rounds > 12)
-		{
-		  DO_ROUND(12);
-		  DO_ROUND(13);
-		}
-	    }
-
-#undef DO_ROUND
-
-	  rkey = rkeylast;
-	  b0 = asm_cipherlast_be (b0, rkey ^ tweak0);
-	  b1 = asm_cipherlast_be (b1, rkey ^ tweak1);
-	  b2 = asm_cipherlast_be (b2, rkey ^ tweak2);
-	  b3 = asm_cipherlast_be (b3, rkey ^ tweak3);
-
-	  VEC_STORE_BE (out, 0, b0, bige_const);
-	  VEC_STORE_BE (out, 1, b1, bige_const);
-	  VEC_STORE_BE (out, 2, b2, bige_const);
-	  VEC_STORE_BE (out, 3, b3, bige_const);
-
-	  in += 4;
-	  out += 4;
-	  nblocks -= 4;
-	}
-
-      for (; nblocks; nblocks--)
-	{
-	  tweak0 = asm_vperm1 (tweak, bswap128_const);
-
-	  /* Xor-Encrypt/Decrypt-Xor block. */
-	  b = VEC_LOAD_BE (in, 0, bige_const) ^ tweak0;
-
-	  /* Generate next tweak. */
-	  GEN_TWEAK (tweak, tweak);
-
-	  AES_ENCRYPT (b, rounds);
-
-	  b ^= tweak0;
-	  VEC_STORE_BE (out, 0, b, bige_const);
-
-	  in++;
-	  out++;
-	}
-    }
-  else
-    {
-      const u128_t *rk = (u128_t *)&ctx->keyschdec;
-
-      if (!ctx->decryption_prepared)
-	{
-	  aes_ppc8_prepare_decryption (ctx);
-	  ctx->decryption_prepared = 1;
-	}
-
-      PRELOAD_ROUND_KEYS (rounds);
-
-      for (; nblocks >= 8; nblocks -= 8)
-	{
-	  b0 = VEC_LOAD_BE_NOSWAP (in, 0);
-	  b1 = VEC_LOAD_BE_NOSWAP (in, 1);
-	  b2 = VEC_LOAD_BE_NOSWAP (in, 2);
-	  b3 = VEC_LOAD_BE_NOSWAP (in, 3);
-	  tweak0 = tweak;
-	  GEN_TWEAK (tweak1, tweak0);
-	  tweak0 = asm_vperm1 (tweak0, bswap128_const);
-	  b4 = VEC_LOAD_BE_NOSWAP (in, 4);
-	  b5 = VEC_LOAD_BE_NOSWAP (in, 5);
-	  GEN_TWEAK (tweak2, tweak1);
-	  tweak1 = asm_vperm1 (tweak1, bswap128_const);
-	  b6 = VEC_LOAD_BE_NOSWAP (in, 6);
-	  b7 = VEC_LOAD_BE_NOSWAP (in, 7);
-	  in += 8;
-
-	  b0 = VEC_BE_SWAP(b0, bige_const);
-	  b1 = VEC_BE_SWAP(b1, bige_const);
-	  GEN_TWEAK (tweak3, tweak2);
-	  tweak2 = asm_vperm1 (tweak2, bswap128_const);
-	  GEN_TWEAK (tweak4, tweak3);
-	  tweak3 = asm_vperm1 (tweak3, bswap128_const);
-	  b2 = VEC_BE_SWAP(b2, bige_const);
-	  b3 = VEC_BE_SWAP(b3, bige_const);
-	  GEN_TWEAK (tweak5, tweak4);
-	  tweak4 = asm_vperm1 (tweak4, bswap128_const);
-	  GEN_TWEAK (tweak6, tweak5);
-	  tweak5 = asm_vperm1 (tweak5, bswap128_const);
-	  b4 = VEC_BE_SWAP(b4, bige_const);
-	  b5 = VEC_BE_SWAP(b5, bige_const);
-	  GEN_TWEAK (tweak7, tweak6);
-	  tweak6 = asm_vperm1 (tweak6, bswap128_const);
-	  GEN_TWEAK (tweak, tweak7);
-	  tweak7 = asm_vperm1 (tweak7, bswap128_const);
-	  b6 = VEC_BE_SWAP(b6, bige_const);
-	  b7 = VEC_BE_SWAP(b7, bige_const);
-
-	  tweak0 = asm_xor (tweak0, rkey0);
-	  tweak1 = asm_xor (tweak1, rkey0);
-	  tweak2 = asm_xor (tweak2, rkey0);
-	  tweak3 = asm_xor (tweak3, rkey0);
-	  tweak4 = asm_xor (tweak4, rkey0);
-	  tweak5 = asm_xor (tweak5, rkey0);
-	  tweak6 = asm_xor (tweak6, rkey0);
-	  tweak7 = asm_xor (tweak7, rkey0);
-
-	  b0 = asm_xor (b0, tweak0);
-	  b1 = asm_xor (b1, tweak1);
-	  b2 = asm_xor (b2, tweak2);
-	  b3 = asm_xor (b3, tweak3);
-	  b4 = asm_xor (b4, tweak4);
-	  b5 = asm_xor (b5, tweak5);
-	  b6 = asm_xor (b6, tweak6);
-	  b7 = asm_xor (b7, tweak7);
-
-#define DO_ROUND(r) \
-	      rkey = ALIGNED_LOAD (rk, r); \
-	      b0 = asm_ncipher_be (b0, rkey); \
-	      b1 = asm_ncipher_be (b1, rkey); \
-	      b2 = asm_ncipher_be (b2, rkey); \
-	      b3 = asm_ncipher_be (b3, rkey); \
-	      b4 = asm_ncipher_be (b4, rkey); \
-	      b5 = asm_ncipher_be (b5, rkey); \
-	      b6 = asm_ncipher_be (b6, rkey); \
-	      b7 = asm_ncipher_be (b7, rkey);
-
-	  DO_ROUND(1);
-	  DO_ROUND(2);
-	  DO_ROUND(3);
-	  DO_ROUND(4);
-	  DO_ROUND(5);
-	  DO_ROUND(6);
-	  DO_ROUND(7);
-
-	  rkeylf = asm_xor (rkeylast, rkey0);
-
-	  DO_ROUND(8);
-
-	  tweak0 = asm_xor (tweak0, rkeylf);
-	  tweak1 = asm_xor (tweak1, rkeylf);
-	  tweak2 = asm_xor (tweak2, rkeylf);
-	  tweak3 = asm_xor (tweak3, rkeylf);
-	  tweak4 = asm_xor (tweak4, rkeylf);
-	  tweak5 = asm_xor (tweak5, rkeylf);
-	  tweak6 = asm_xor (tweak6, rkeylf);
-	  tweak7 = asm_xor (tweak7, rkeylf);
-
-	  DO_ROUND(9);
-	  if (rounds >= 12)
-	    {
-	      DO_ROUND(10);
-	      DO_ROUND(11);
-	      if (rounds > 12)
-		{
-		  DO_ROUND(12);
-		  DO_ROUND(13);
-		}
-	    }
-
-#undef DO_ROUND
-
-	  b0 = asm_ncipherlast_be (b0, tweak0);
-	  b1 = asm_ncipherlast_be (b1, tweak1);
-	  b2 = asm_ncipherlast_be (b2, tweak2);
-	  b3 = asm_ncipherlast_be (b3, tweak3);
-	  b0 = VEC_BE_SWAP (b0, bige_const);
-	  b1 = VEC_BE_SWAP (b1, bige_const);
-	  b4 = asm_ncipherlast_be (b4, tweak4);
-	  b5 = asm_ncipherlast_be (b5, tweak5);
-	  b2 = VEC_BE_SWAP (b2, bige_const);
-	  b3 = VEC_BE_SWAP (b3, bige_const);
-	  b6 = asm_ncipherlast_be (b6, tweak6);
-	  b7 = asm_ncipherlast_be (b7, tweak7);
-	  VEC_STORE_BE_NOSWAP (out, 0, b0);
-	  VEC_STORE_BE_NOSWAP (out, 1, b1);
-	  b4 = VEC_BE_SWAP (b4, bige_const);
-	  b5 = VEC_BE_SWAP (b5, bige_const);
-	  VEC_STORE_BE_NOSWAP (out, 2, b2);
-	  VEC_STORE_BE_NOSWAP (out, 3, b3);
-	  b6 = VEC_BE_SWAP (b6, bige_const);
-	  b7 = VEC_BE_SWAP (b7, bige_const);
-	  VEC_STORE_BE_NOSWAP (out, 4, b4);
-	  VEC_STORE_BE_NOSWAP (out, 5, b5);
-	  VEC_STORE_BE_NOSWAP (out, 6, b6);
-	  VEC_STORE_BE_NOSWAP (out, 7, b7);
-	  out += 8;
-	}
-
-      if (nblocks >= 4)
-	{
-	  tweak0 = tweak;
-	  GEN_TWEAK (tweak1, tweak0);
-	  GEN_TWEAK (tweak2, tweak1);
-	  GEN_TWEAK (tweak3, tweak2);
-	  GEN_TWEAK (tweak, tweak3);
-
-	  b0 = VEC_LOAD_BE (in, 0, bige_const);
-	  b1 = VEC_LOAD_BE (in, 1, bige_const);
-	  b2 = VEC_LOAD_BE (in, 2, bige_const);
-	  b3 = VEC_LOAD_BE (in, 3, bige_const);
-
-	  tweak0 = asm_vperm1 (tweak0, bswap128_const);
-	  tweak1 = asm_vperm1 (tweak1, bswap128_const);
-	  tweak2 = asm_vperm1 (tweak2, bswap128_const);
-	  tweak3 = asm_vperm1 (tweak3, bswap128_const);
-
-	  b0 ^= tweak0 ^ rkey0;
-	  b1 ^= tweak1 ^ rkey0;
-	  b2 ^= tweak2 ^ rkey0;
-	  b3 ^= tweak3 ^ rkey0;
-
-#define DO_ROUND(r) \
-	      rkey = ALIGNED_LOAD (rk, r); \
-	      b0 = asm_ncipher_be (b0, rkey); \
-	      b1 = asm_ncipher_be (b1, rkey); \
-	      b2 = asm_ncipher_be (b2, rkey); \
-	      b3 = asm_ncipher_be (b3, rkey);
-
-	  DO_ROUND(1);
-	  DO_ROUND(2);
-	  DO_ROUND(3);
-	  DO_ROUND(4);
-	  DO_ROUND(5);
-	  DO_ROUND(6);
-	  DO_ROUND(7);
-	  DO_ROUND(8);
-	  DO_ROUND(9);
-	  if (rounds >= 12)
-	    {
-	      DO_ROUND(10);
-	      DO_ROUND(11);
-	      if (rounds > 12)
-		{
-		  DO_ROUND(12);
-		  DO_ROUND(13);
-		}
-	    }
-
-#undef DO_ROUND
-
-	  rkey = rkeylast;
-	  b0 = asm_ncipherlast_be (b0, rkey ^ tweak0);
-	  b1 = asm_ncipherlast_be (b1, rkey ^ tweak1);
-	  b2 = asm_ncipherlast_be (b2, rkey ^ tweak2);
-	  b3 = asm_ncipherlast_be (b3, rkey ^ tweak3);
-
-	  VEC_STORE_BE (out, 0, b0, bige_const);
-	  VEC_STORE_BE (out, 1, b1, bige_const);
-	  VEC_STORE_BE (out, 2, b2, bige_const);
-	  VEC_STORE_BE (out, 3, b3, bige_const);
-
-	  in += 4;
-	  out += 4;
-	  nblocks -= 4;
-	}
-
-      for (; nblocks; nblocks--)
-	{
-	  tweak0 = asm_vperm1 (tweak, bswap128_const);
-
-	  /* Xor-Encrypt/Decrypt-Xor block. */
-	  b = VEC_LOAD_BE (in, 0, bige_const) ^ tweak0;
-
-	  /* Generate next tweak. */
-	  GEN_TWEAK (tweak, tweak);
-
-	  AES_DECRYPT (b, rounds);
-
-	  b ^= tweak0;
-	  VEC_STORE_BE (out, 0, b, bige_const);
-
-	  in++;
-	  out++;
-	}
-    }
-
-#ifdef WORDS_BIGENDIAN
-  tweak = asm_vperm1 (tweak, bswap128_const);
-  VEC_STORE_BE (tweak_arg, 0, tweak, bige_const);
-#else
-  VEC_STORE_BE (tweak_arg, 0, tweak, vec_tweakin_swap_const);
-#endif
-
-#undef GEN_TWEAK
-}
+#include <rijndael-ppc-functions.h>
 
 #endif /* USE_PPC_CRYPTO */
diff --git a/cipher/rijndael-ppc9le.c b/cipher/rijndael-ppc9le.c
new file mode 100644
index 000000000..facdedd4f
--- /dev/null
+++ b/cipher/rijndael-ppc9le.c
@@ -0,0 +1,102 @@
+/* Rijndael (AES) for GnuPG - PowerPC Vector Crypto AES implementation
+ * Copyright (C) 2019 Shawn Landden <shawn at git.icu>
+ * Copyright (C) 2019-2020 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * Alternatively, this code may be used in OpenSSL from The OpenSSL Project,
+ * and Cryptogams by Andy Polyakov, and if made part of a release of either
+ * or both projects, is thereafter dual-licensed under the license said project
+ * is released under.
+ */
+
+#include <config.h>
+
+#include "rijndael-internal.h"
+#include "cipher-internal.h"
+#include "bufhelp.h"
+
+#ifdef USE_PPC_CRYPTO_WITH_PPC9LE
+
+#include "rijndael-ppc-common.h"
+
+
+static ASM_FUNC_ATTR_INLINE block
+asm_load_be_const(void)
+{
+  static const block vec_dummy = { 0 };
+  return vec_dummy;
+}
+
+static ASM_FUNC_ATTR_INLINE block
+asm_be_swap(block vec, block be_bswap_const)
+{
+  (void)be_bswap_const;
+  return vec;
+}
+
+static ASM_FUNC_ATTR_INLINE block
+asm_load_be_noswap(unsigned long offset, const void *ptr)
+{
+  block vec;
+#if __GNUC__ >= 4
+  if (__builtin_constant_p (offset) && offset == 0)
+    __asm__ volatile ("lxvb16x %x0,0,%1\n\t"
+		      : "=wa" (vec)
+		      : "r" ((uintptr_t)ptr)
+		      : "memory");
+  else
+#endif
+    __asm__ volatile ("lxvb16x %x0,%1,%2\n\t"
+		      : "=wa" (vec)
+		      : "r" (offset), "r" ((uintptr_t)ptr)
+		      : "memory", "r0");
+  return vec;
+}
+
+static ASM_FUNC_ATTR_INLINE void
+asm_store_be_noswap(block vec, unsigned long offset, void *ptr)
+{
+#if __GNUC__ >= 4
+  if (__builtin_constant_p (offset) && offset == 0)
+    __asm__ volatile ("stxvb16x %x0,0,%1\n\t"
+		      :
+		      : "wa" (vec), "r" ((uintptr_t)ptr)
+		      : "memory");
+  else
+#endif
+    __asm__ volatile ("stxvb16x %x0,%1,%2\n\t"
+		      :
+		      : "wa" (vec), "r" (offset), "r" ((uintptr_t)ptr)
+		      : "memory", "r0");
+}
+
+
+#define GCRY_AES_PPC9LE 1
+#define ENCRYPT_BLOCK_FUNC	_gcry_aes_ppc9le_encrypt
+#define DECRYPT_BLOCK_FUNC	_gcry_aes_ppc9le_decrypt
+#define CFB_ENC_FUNC		_gcry_aes_ppc9le_cfb_enc
+#define CFB_DEC_FUNC		_gcry_aes_ppc9le_cfb_dec
+#define CBC_ENC_FUNC		_gcry_aes_ppc9le_cbc_enc
+#define CBC_DEC_FUNC		_gcry_aes_ppc9le_cbc_dec
+#define CTR_ENC_FUNC		_gcry_aes_ppc9le_ctr_enc
+#define OCB_CRYPT_FUNC		_gcry_aes_ppc9le_ocb_crypt
+#define OCB_AUTH_FUNC		_gcry_aes_ppc9le_ocb_auth
+#define XTS_CRYPT_FUNC		_gcry_aes_ppc9le_xts_crypt
+
+#include <rijndael-ppc-functions.h>
+
+#endif /* USE_PPC_CRYPTO_WITH_PPC9LE */
diff --git a/cipher/rijndael.c b/cipher/rijndael.c
index ebd1a11a5..a1c4cfc1a 100644
--- a/cipher/rijndael.c
+++ b/cipher/rijndael.c
@@ -239,6 +239,43 @@ extern void _gcry_aes_ppc8_xts_crypt (void *context, unsigned char *tweak,
 				      size_t nblocks, int encrypt);
 #endif /*USE_PPC_CRYPTO*/
 
+#ifdef USE_PPC_CRYPTO_WITH_PPC9LE
+/* Power9 little-endian crypto implementations of AES */
+extern unsigned int _gcry_aes_ppc9le_encrypt(const RIJNDAEL_context *ctx,
+					    unsigned char *dst,
+					    const unsigned char *src);
+extern unsigned int _gcry_aes_ppc9le_decrypt(const RIJNDAEL_context *ctx,
+					    unsigned char *dst,
+					    const unsigned char *src);
+
+extern void _gcry_aes_ppc9le_cfb_enc (void *context, unsigned char *iv,
+				      void *outbuf_arg, const void *inbuf_arg,
+				      size_t nblocks);
+extern void _gcry_aes_ppc9le_cbc_enc (void *context, unsigned char *iv,
+				      void *outbuf_arg, const void *inbuf_arg,
+				      size_t nblocks, int cbc_mac);
+extern void _gcry_aes_ppc9le_ctr_enc (void *context, unsigned char *ctr,
+				      void *outbuf_arg, const void *inbuf_arg,
+				      size_t nblocks);
+extern void _gcry_aes_ppc9le_cfb_dec (void *context, unsigned char *iv,
+				      void *outbuf_arg, const void *inbuf_arg,
+				      size_t nblocks);
+extern void _gcry_aes_ppc9le_cbc_dec (void *context, unsigned char *iv,
+				      void *outbuf_arg, const void *inbuf_arg,
+				      size_t nblocks);
+
+extern size_t _gcry_aes_ppc9le_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
+					  const void *inbuf_arg, size_t nblocks,
+					  int encrypt);
+extern size_t _gcry_aes_ppc9le_ocb_auth (gcry_cipher_hd_t c,
+					const void *abuf_arg, size_t nblocks);
+
+extern void _gcry_aes_ppc9le_xts_crypt (void *context, unsigned char *tweak,
+					void *outbuf_arg,
+					const void *inbuf_arg,
+					size_t nblocks, int encrypt);
+#endif /*USE_PPC_CRYPTO_WITH_PPC9LE*/
+
 static unsigned int do_encrypt (const RIJNDAEL_context *ctx, unsigned char *bx,
                                 const unsigned char *ax);
 static unsigned int do_decrypt (const RIJNDAEL_context *ctx, unsigned char *bx,
@@ -384,6 +421,9 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen,
 #ifdef USE_PPC_CRYPTO
   ctx->use_ppc_crypto = 0;
 #endif
+#ifdef USE_PPC_CRYPTO_WITH_PPC9LE
+  ctx->use_ppc9le_crypto = 0;
+#endif
 
   if (0)
     {
@@ -464,6 +504,28 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen,
         }
     }
 #endif
+#ifdef USE_PPC_CRYPTO_WITH_PPC9LE
+  else if ((hwfeatures & HWF_PPC_VCRYPTO) && (hwfeatures & HWF_PPC_ARCH_3_00))
+    {
+      ctx->encrypt_fn = _gcry_aes_ppc9le_encrypt;
+      ctx->decrypt_fn = _gcry_aes_ppc9le_decrypt;
+      ctx->prefetch_enc_fn = NULL;
+      ctx->prefetch_dec_fn = NULL;
+      ctx->use_ppc_crypto = 1; /* same key-setup as USE_PPC_CRYPTO */
+      ctx->use_ppc9le_crypto = 1;
+      if (hd)
+        {
+          hd->bulk.cfb_enc = _gcry_aes_ppc9le_cfb_enc;
+          hd->bulk.cfb_dec = _gcry_aes_ppc9le_cfb_dec;
+          hd->bulk.cbc_enc = _gcry_aes_ppc9le_cbc_enc;
+          hd->bulk.cbc_dec = _gcry_aes_ppc9le_cbc_dec;
+          hd->bulk.ctr_enc = _gcry_aes_ppc9le_ctr_enc;
+          hd->bulk.ocb_crypt = _gcry_aes_ppc9le_ocb_crypt;
+          hd->bulk.ocb_auth = _gcry_aes_ppc9le_ocb_auth;
+          hd->bulk.xts_crypt = _gcry_aes_ppc9le_xts_crypt;
+        }
+    }
+#endif
 #ifdef USE_PPC_CRYPTO
   else if (hwfeatures & HWF_PPC_VCRYPTO)
     {
@@ -924,6 +986,13 @@ _gcry_aes_cfb_enc (void *context, unsigned char *iv,
       return;
     }
 #endif /*USE_ARM_CE*/
+#ifdef USE_PPC_CRYPTO_WITH_PPC9LE
+  else if (ctx->use_ppc9le_crypto)
+    {
+      _gcry_aes_ppc9le_cfb_enc (ctx, iv, outbuf, inbuf, nblocks);
+      return;
+    }
+#endif /*USE_PPC_CRYPTO_WITH_PPC9LE*/
 #ifdef USE_PPC_CRYPTO
   else if (ctx->use_ppc_crypto)
     {
@@ -992,6 +1061,13 @@ _gcry_aes_cbc_enc (void *context, unsigned char *iv,
       return;
     }
 #endif /*USE_ARM_CE*/
+#ifdef USE_PPC_CRYPTO_WITH_PPC9LE
+  else if (ctx->use_ppc9le_crypto)
+    {
+      _gcry_aes_ppc9le_cbc_enc (ctx, iv, outbuf, inbuf, nblocks, cbc_mac);
+      return;
+    }
+#endif /*USE_PPC_CRYPTO_WITH_PPC9LE*/
 #ifdef USE_PPC_CRYPTO
   else if (ctx->use_ppc_crypto)
     {
@@ -1067,6 +1143,13 @@ _gcry_aes_ctr_enc (void *context, unsigned char *ctr,
       return;
     }
 #endif /*USE_ARM_CE*/
+#ifdef USE_PPC_CRYPTO_WITH_PPC9LE
+  else if (ctx->use_ppc9le_crypto)
+    {
+      _gcry_aes_ppc9le_ctr_enc (ctx, ctr, outbuf, inbuf, nblocks);
+      return;
+    }
+#endif /*USE_PPC_CRYPTO_WITH_PPC9LE*/
 #ifdef USE_PPC_CRYPTO
   else if (ctx->use_ppc_crypto)
     {
@@ -1317,6 +1400,13 @@ _gcry_aes_cfb_dec (void *context, unsigned char *iv,
       return;
     }
 #endif /*USE_ARM_CE*/
+#ifdef USE_PPC_CRYPTO_WITH_PPC9LE
+  else if (ctx->use_ppc9le_crypto)
+    {
+      _gcry_aes_ppc9le_cfb_dec (ctx, iv, outbuf, inbuf, nblocks);
+      return;
+    }
+#endif /*USE_PPC_CRYPTO_WITH_PPC9LE*/
 #ifdef USE_PPC_CRYPTO
   else if (ctx->use_ppc_crypto)
     {
@@ -1382,6 +1472,13 @@ _gcry_aes_cbc_dec (void *context, unsigned char *iv,
       return;
     }
 #endif /*USE_ARM_CE*/
+#ifdef USE_PPC_CRYPTO_WITH_PPC9LE
+  else if (ctx->use_ppc9le_crypto)
+    {
+      _gcry_aes_ppc9le_cbc_dec (ctx, iv, outbuf, inbuf, nblocks);
+      return;
+    }
+#endif /*USE_PPC_CRYPTO_WITH_PPC9LE*/
 #ifdef USE_PPC_CRYPTO
   else if (ctx->use_ppc_crypto)
     {
@@ -1450,6 +1547,12 @@ _gcry_aes_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
       return _gcry_aes_armv8_ce_ocb_crypt (c, outbuf, inbuf, nblocks, encrypt);
     }
 #endif /*USE_ARM_CE*/
+#ifdef USE_PPC_CRYPTO_WITH_PPC9LE
+  else if (ctx->use_ppc9le_crypto)
+    {
+      return _gcry_aes_ppc9le_ocb_crypt (c, outbuf, inbuf, nblocks, encrypt);
+    }
+#endif /*USE_PPC_CRYPTO_WITH_PPC9LE*/
 #ifdef USE_PPC_CRYPTO
   else if (ctx->use_ppc_crypto)
     {
@@ -1550,6 +1653,12 @@ _gcry_aes_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks)
       return _gcry_aes_armv8_ce_ocb_auth (c, abuf, nblocks);
     }
 #endif /*USE_ARM_CE*/
+#ifdef USE_PPC_CRYPTO_WITH_PPC9LE
+  else if (ctx->use_ppc9le_crypto)
+    {
+      return _gcry_aes_ppc9le_ocb_auth (c, abuf, nblocks);
+    }
+#endif /*USE_PPC_CRYPTO_WITH_PPC9LE*/
 #ifdef USE_PPC_CRYPTO
   else if (ctx->use_ppc_crypto)
     {
@@ -1619,6 +1728,13 @@ _gcry_aes_xts_crypt (void *context, unsigned char *tweak,
       return;
     }
 #endif /*USE_ARM_CE*/
+#ifdef USE_PPC_CRYPTO_WITH_PPC9LE
+  else if (ctx->use_ppc9le_crypto)
+    {
+      _gcry_aes_ppc9le_xts_crypt (ctx, tweak, outbuf, inbuf, nblocks, encrypt);
+      return;
+    }
+#endif /*USE_PPC_CRYPTO_WITH_PPC9LE*/
 #ifdef USE_PPC_CRYPTO
   else if (ctx->use_ppc_crypto)
     {
diff --git a/configure.ac b/configure.ac
index f31b75586..f9d3dd718 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2348,6 +2348,7 @@ if test "$found" = "1" ; then
       powerpc64le-*-*)
          # Build with the crypto extension implementation
          GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-ppc.lo"
+         GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-ppc9le.lo"
       ;;
       powerpc64-*-*)
          # Big-Endian.