[PATCH 4/6] rijndael-ppc: enable PowerPC AES-OCB implementation

Jussi Kivilinna <jussi.kivilinna at iki.fi>
Fri Aug 23 18:52:15 CEST 2019


* cipher/rijndael-ppc.c (ROUND_KEY_VARIABLES, PRELOAD_ROUND_KEYS)
(AES_ENCRYPT, AES_DECRYPT): New.
(_gcry_aes_ppc8_prepare_decryption): Rename to...
(aes_ppc8_prepare_decryption): ... this.
(_gcry_aes_ppc8_prepare_decryption): New.
(aes_ppc8_encrypt_altivec, aes_ppc8_decrypt_altivec): Remove.
(_gcry_aes_ppc8_encrypt): Use AES_ENCRYPT macro.
(_gcry_aes_ppc8_decrypt): Use AES_DECRYPT macro.
(_gcry_aes_ppc8_ocb_crypt): Uncomment; optimize OCB offset
calculations; use new load/store and encryption/decryption macros.
* cipher/rijndael.c [USE_PPC_CRYPTO] (_gcry_aes_ppc8_ocb_crypt): New
prototype.
(do_setkey, _gcry_aes_ocb_crypt) [USE_PPC_CRYPTO]: Add PowerPC AES OCB
encryption/decryption.
--

Benchmark on POWER8 (~3.8 GHz):
Before:
 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte
        OCB enc |      2.33 ns/B     410.1 MiB/s      8.84 c/B
        OCB dec |      2.34 ns/B     407.2 MiB/s      8.90 c/B
       OCB auth |      2.32 ns/B     411.1 MiB/s      8.82 c/B

After:
        OCB enc |     0.250 ns/B      3818 MiB/s     0.949 c/B
        OCB dec |     0.250 ns/B      3820 MiB/s     0.949 c/B
       OCB auth |      2.31 ns/B     412.5 MiB/s      8.79 c/B
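
The cycles/byte column follows from the measured throughput and the ~3.8 GHz
clock: 0.250 ns/B * 3.8 GHz ~= 0.95 c/B for the new OCB enc/dec paths versus
2.33 ns/B * 3.8 GHz ~= 8.85 c/B before, i.e. roughly a 9x speedup.  OCB
authentication is not touched by this patch (only the ocb_crypt bulk path is
hooked up), so the auth figure stays at ~8.8 c/B.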

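Most of the speedup comes from the 8-way unrolled OCB loop and from computing
the eight per-block offsets independently instead of chaining them serially.
When the running block index i is a multiple of 8, ntz(i+1)..ntz(i+7) follow
the fixed pattern 0,1,0,2,0,1,0, so only L[ntz(i+8)] needs an ocb_get_l()
lookup and every offset is a short XOR combination of L[0], L[1], L[2] and the
incoming offset.  A rough scalar sketch of that precomputation (illustrative
only, not part of the patch; "block" stands in for the 16-byte vector type):

  static void
  ocb_offsets_8way (block iv, block l0, block l1, block l2, block l,
                    block offs[8])
  {
    /* iv is Offset_i on entry; l0/l1/l2 are L[0]/L[1]/L[2] and
       l is L[ntz(i+8)] fetched with ocb_get_l().  */
    offs[0] = iv ^ l0;            /* ntz(i+1) = 0 */
    offs[1] = iv ^ l0 ^ l1;       /* ntz(i+2) = 1 */
    offs[2] = iv ^ l1;            /* ntz(i+3) = 0 */
    offs[3] = iv ^ l1 ^ l2;       /* ntz(i+4) = 2 */
    offs[4] = iv ^ l1 ^ l2 ^ l0;  /* ntz(i+5) = 0 */
    offs[5] = iv ^ l2 ^ l0;       /* ntz(i+6) = 1 */
    offs[6] = iv ^ l2;            /* ntz(i+7) = 0 */
    offs[7] = iv ^ l2 ^ l;        /* Offset_{i+8}, carried to next iteration */
  }

On top of that, the first and last round keys are folded into the offsets: the
offsets are pre-XORed with rkey0 so the OCB pre-whitening and the initial
AddRoundKey cost a single XOR per block, and the final XOR with the offset is
folded into the vec_cipherlast_be() key operand as (rkeylast ^ rkey0 ^ iv_i).
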
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 2 files changed

diff --git a/cipher/rijndael-ppc.c b/cipher/rijndael-ppc.c
index a7c47a876..01ff6f503 100644
--- a/cipher/rijndael-ppc.c
+++ b/cipher/rijndael-ppc.c
@@ -64,6 +64,82 @@ typedef union
   (vec_store_be ((vec), 0, (unsigned char *)(out_ptr), bige_const))
 
 
+#define ROUND_KEY_VARIABLES \
+  block rkey0, rkeylast
+
+#define PRELOAD_ROUND_KEYS(rk_ptr, nrounds) \
+  do { \
+    rkey0 = ALIGNED_LOAD(&rk_ptr[0]); \
+    if (nrounds >= 12) \
+      { \
+	if (nrounds > 12) \
+	  { \
+	    rkeylast = ALIGNED_LOAD(&rk_ptr[14]); \
+	  } \
+	else \
+	  { \
+	    rkeylast = ALIGNED_LOAD(&rk_ptr[12]); \
+	  } \
+      } \
+    else \
+      { \
+	rkeylast = ALIGNED_LOAD(&rk_ptr[10]); \
+      } \
+  } while (0)
+
+
+#define AES_ENCRYPT(blk, nrounds) \
+  do { \
+    blk ^= rkey0; \
+    blk = vec_cipher_be (blk, ALIGNED_LOAD(&rk[1])); \
+    blk = vec_cipher_be (blk, ALIGNED_LOAD(&rk[2])); \
+    blk = vec_cipher_be (blk, ALIGNED_LOAD(&rk[3])); \
+    blk = vec_cipher_be (blk, ALIGNED_LOAD(&rk[4])); \
+    blk = vec_cipher_be (blk, ALIGNED_LOAD(&rk[5])); \
+    blk = vec_cipher_be (blk, ALIGNED_LOAD(&rk[6])); \
+    blk = vec_cipher_be (blk, ALIGNED_LOAD(&rk[7])); \
+    blk = vec_cipher_be (blk, ALIGNED_LOAD(&rk[8])); \
+    blk = vec_cipher_be (blk, ALIGNED_LOAD(&rk[9])); \
+    if (nrounds >= 12) \
+      { \
+	blk = vec_cipher_be (blk, ALIGNED_LOAD(&rk[10])); \
+	blk = vec_cipher_be (blk, ALIGNED_LOAD(&rk[11])); \
+	if (nrounds > 12) \
+	  { \
+	    blk = vec_cipher_be (blk, ALIGNED_LOAD(&rk[12])); \
+	    blk = vec_cipher_be (blk, ALIGNED_LOAD(&rk[13])); \
+	  } \
+      } \
+    blk = vec_cipherlast_be (blk, rkeylast); \
+  } while (0)
+
+
+#define AES_DECRYPT(blk, nrounds) \
+  do { \
+    blk ^= rkey0; \
+    blk = vec_ncipher_be (blk, ALIGNED_LOAD(&rk[1])); \
+    blk = vec_ncipher_be (blk, ALIGNED_LOAD(&rk[2])); \
+    blk = vec_ncipher_be (blk, ALIGNED_LOAD(&rk[3])); \
+    blk = vec_ncipher_be (blk, ALIGNED_LOAD(&rk[4])); \
+    blk = vec_ncipher_be (blk, ALIGNED_LOAD(&rk[5])); \
+    blk = vec_ncipher_be (blk, ALIGNED_LOAD(&rk[6])); \
+    blk = vec_ncipher_be (blk, ALIGNED_LOAD(&rk[7])); \
+    blk = vec_ncipher_be (blk, ALIGNED_LOAD(&rk[8])); \
+    blk = vec_ncipher_be (blk, ALIGNED_LOAD(&rk[9])); \
+    if (nrounds >= 12) \
+      { \
+	blk = vec_ncipher_be (blk, ALIGNED_LOAD(&rk[10])); \
+	blk = vec_ncipher_be (blk, ALIGNED_LOAD(&rk[11])); \
+	if (nrounds > 12) \
+	  { \
+	    blk = vec_ncipher_be (blk, ALIGNED_LOAD(&rk[12])); \
+	    blk = vec_ncipher_be (blk, ALIGNED_LOAD(&rk[13])); \
+	  } \
+      } \
+    blk = vec_ncipherlast_be (blk, rkeylast); \
+  } while (0)
+
+
 static const block vec_bswap32_const =
   { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };
 
@@ -287,8 +363,8 @@ _gcry_aes_ppc8_setkey (RIJNDAEL_context *ctx, const byte *key)
 
 
 /* Make a decryption key from an encryption key. */
-void
-_gcry_aes_ppc8_prepare_decryption (RIJNDAEL_context *ctx)
+static ASM_FUNC_ATTR_INLINE void
+aes_ppc8_prepare_decryption (RIJNDAEL_context *ctx)
 {
   u128_t *ekey = (u128_t *)(void *)ctx->keyschenc;
   u128_t *dkey = (u128_t *)(void *)ctx->keyschdec;
@@ -305,634 +381,505 @@ _gcry_aes_ppc8_prepare_decryption (RIJNDAEL_context *ctx)
 }
 
 
-static ASM_FUNC_ATTR_INLINE block
-aes_ppc8_encrypt_altivec (const RIJNDAEL_context *ctx, block a)
+void
+_gcry_aes_ppc8_prepare_decryption (RIJNDAEL_context *ctx)
 {
-  u128_t *rk = (u128_t *)ctx->keyschenc;
-  int rounds = ctx->rounds;
-  int r;
-
-#define DO_ROUND(r) (a = vec_cipher_be (a, ALIGNED_LOAD (&rk[r])))
-
-  a = ALIGNED_LOAD(&rk[0]) ^ a;
-  DO_ROUND(1);
-  DO_ROUND(2);
-  DO_ROUND(3);
-  DO_ROUND(4);
-  DO_ROUND(5);
-  DO_ROUND(6);
-  DO_ROUND(7);
-  DO_ROUND(8);
-  DO_ROUND(9);
-  r = 10;
-  if (rounds >= 12)
-    {
-      DO_ROUND(10);
-      DO_ROUND(11);
-      r = 12;
-      if (rounds > 12)
-	{
-	  DO_ROUND(12);
-	  DO_ROUND(13);
-	  r = 14;
-	}
-    }
-  a = vec_cipherlast_be(a, ALIGNED_LOAD(&rk[r]));
-
-#undef DO_ROUND
-
-  return a;
+  aes_ppc8_prepare_decryption (ctx);
 }
 
 
-static ASM_FUNC_ATTR_INLINE block
-aes_ppc8_decrypt_altivec (const RIJNDAEL_context *ctx, block a)
+unsigned int _gcry_aes_ppc8_encrypt (const RIJNDAEL_context *ctx,
+				     unsigned char *out,
+				     const unsigned char *in)
 {
-  u128_t *rk = (u128_t *)ctx->keyschdec;
+  const block bige_const = vec_load_be_const();
+  const u128_t *rk = (u128_t *)&ctx->keyschenc;
   int rounds = ctx->rounds;
-  int r;
-
-#define DO_ROUND(r) (a = vec_ncipher_be (a, ALIGNED_LOAD (&rk[r])))
-
-  a = ALIGNED_LOAD(&rk[0]) ^ a;
-  DO_ROUND(1);
-  DO_ROUND(2);
-  DO_ROUND(3);
-  DO_ROUND(4);
-  DO_ROUND(5);
-  DO_ROUND(6);
-  DO_ROUND(7);
-  DO_ROUND(8);
-  DO_ROUND(9);
-  r = 10;
-  if (rounds >= 12)
-    {
-      DO_ROUND(10);
-      DO_ROUND(11);
-      r = 12;
-      if (rounds > 12)
-	{
-	  DO_ROUND(12);
-	  DO_ROUND(13);
-	  r = 14;
-	}
-    }
-  a = vec_ncipherlast_be(a, ALIGNED_LOAD(&rk[r]));
+  ROUND_KEY_VARIABLES;
+  block b;
 
-#undef DO_ROUND
+  b = VEC_LOAD_BE (in, bige_const);
 
-  return a;
-}
+  PRELOAD_ROUND_KEYS (rk, rounds);
 
-
-unsigned int _gcry_aes_ppc8_encrypt (const RIJNDAEL_context *ctx,
-				     unsigned char *b,
-				     const unsigned char *a)
-{
-  const block bige_const = vec_load_be_const();
-  block sa;
-
-  sa = VEC_LOAD_BE (a, bige_const);
-  sa = aes_ppc8_encrypt_altivec (ctx, sa);
-  VEC_STORE_BE (b, sa, bige_const);
+  AES_ENCRYPT (b, rounds);
+  VEC_STORE_BE (out, b, bige_const);
 
   return 0; /* does not use stack */
 }
 
 
 unsigned int _gcry_aes_ppc8_decrypt (const RIJNDAEL_context *ctx,
-				     unsigned char *b,
-				     const unsigned char *a)
+				     unsigned char *out,
+				     const unsigned char *in)
 {
   const block bige_const = vec_load_be_const();
-  block sa;
+  const u128_t *rk = (u128_t *)&ctx->keyschdec;
+  int rounds = ctx->rounds;
+  ROUND_KEY_VARIABLES;
+  block b;
+
+  b = VEC_LOAD_BE (in, bige_const);
 
-  sa = VEC_LOAD_BE (a, bige_const);
-  sa = aes_ppc8_decrypt_altivec (ctx, sa);
-  VEC_STORE_BE (b, sa, bige_const);
+  PRELOAD_ROUND_KEYS (rk, rounds);
+
+  AES_DECRYPT (b, rounds);
+  VEC_STORE_BE (out, b, bige_const);
 
   return 0; /* does not use stack */
 }
 
 
-#if 0
 size_t _gcry_aes_ppc8_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
-                                            const void *inbuf_arg, size_t nblocks,
-                                            int encrypt)
+				 const void *inbuf_arg, size_t nblocks,
+				 int encrypt)
 {
+  const block bige_const = vec_load_be_const();
   RIJNDAEL_context *ctx = (void *)&c->context.c;
-  unsigned char *outbuf = outbuf_arg;
-  const unsigned char *inbuf = inbuf_arg;
-  block *in = (block*)inbuf;
-  block *out = (block*)outbuf;
-  uintptr_t zero = 0;
-  int r;
+  const u128_t *in = (const u128_t *)inbuf_arg;
+  u128_t *out = (u128_t *)outbuf_arg;
   int rounds = ctx->rounds;
+  u64 data_nblocks = c->u_mode.ocb.data_nblocks;
+  block l0, l1, l2, l;
+  block b0, b1, b2, b3, b4, b5, b6, b7, b;
+  block iv0, iv1, iv2, iv3, iv4, iv5, iv6, iv7;
+  block rkey;
+  block ctr, iv;
+  ROUND_KEY_VARIABLES;
+
+  iv = VEC_LOAD_BE (c->u_iv.iv, bige_const);
+  ctr = VEC_LOAD_BE (c->u_ctr.ctr, bige_const);
+
+  l0 = VEC_LOAD_BE (c->u_mode.ocb.L[0], bige_const);
+  l1 = VEC_LOAD_BE (c->u_mode.ocb.L[1], bige_const);
+  l2 = VEC_LOAD_BE (c->u_mode.ocb.L[2], bige_const);
 
   if (encrypt)
     {
-      const int unroll = 8;
-      block unalignedprev, ctr, iv;
+      const u128_t *rk = (u128_t *)&ctx->keyschenc;
 
-      if (((uintptr_t)inbuf % 16) != 0)
+      PRELOAD_ROUND_KEYS (rk, rounds);
+
+      for (; nblocks >= 8 && data_nblocks % 8; nblocks--)
 	{
-	  unalignedprev = vec_ld(0, in++);
-	}
+	  l = VEC_LOAD_BE (ocb_get_l (c, ++data_nblocks), bige_const);
+	  b = VEC_LOAD_BE (in, bige_const);
 
-      iv = vec_ld (0, (block*)&c->u_iv.iv);
-      ctr = vec_ld (0, (block*)&c->u_ctr.ctr);
+	  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	  iv ^= l;
+	  /* Checksum_i = Checksum_{i-1} xor P_i  */
+	  ctr ^= b;
+	  /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
+	  b ^= iv;
+	  AES_ENCRYPT (b, rounds);
+	  b ^= iv;
 
-      for ( ;nblocks >= unroll; nblocks -= unroll)
-	{
-	  u64 i = c->u_mode.ocb.data_nblocks + 1;
-	  block l0, l1, l2, l3, l4, l5, l6, l7;
-	  block b0, b1, b2, b3, b4, b5, b6, b7;
-	  block iv0, iv1, iv2, iv3, iv4, iv5, iv6, iv7;
-	  const block *rk = (block*)&ctx->keyschenc;
+	  VEC_STORE_BE (out, b, bige_const);
 
-	  c->u_mode.ocb.data_nblocks += unroll;
+	  in += 1;
+	  out += 1;
+	}
 
-	  iv0 = iv;
-	  if ((uintptr_t)inbuf % 16 == 0)
-	    {
-	      b0 = vec_ld (0, in++);
-	      b1 = vec_ld (0, in++);
-	      b2 = vec_ld (0, in++);
-	      b3 = vec_ld (0, in++);
-	      b4 = vec_ld (0, in++);
-	      b5 = vec_ld (0, in++);
-	      b6 = vec_ld (0, in++);
-	      b7 = vec_ld (0, in++);
-	    }
-	  else
-	    {
-	      block unaligned0, unaligned1, unaligned2,
-		unaligned3, unaligned4, unaligned5, unaligned6;
-	      unaligned0 = vec_ld (0, in++);
-	      unaligned1 = vec_ld (0, in++);
-	      unaligned2 = vec_ld (0, in++);
-	      unaligned3 = vec_ld (0, in++);
-	      unaligned4 = vec_ld (0, in++);
-	      unaligned5 = vec_ld (0, in++);
-	      unaligned6 = vec_ld (0, in++);
-	      b0 = vec_perm (unalignedprev, unaligned0, vec_lvsl (0, inbuf));
-	      unalignedprev = vec_ld (0, in++);
-	      b1 = vec_perm(unaligned0, unaligned1, vec_lvsl (0, inbuf));
-	      b2 = vec_perm(unaligned1, unaligned2, vec_lvsl (0, inbuf));
-	      b3 = vec_perm(unaligned2, unaligned3, vec_lvsl (0, inbuf));
-	      b4 = vec_perm(unaligned3, unaligned4, vec_lvsl (0, inbuf));
-	      b5 = vec_perm(unaligned4, unaligned5, vec_lvsl (0, inbuf));
-	      b6 = vec_perm(unaligned5, unaligned6, vec_lvsl (0, inbuf));
-	      b7 = vec_perm(unaligned6, unalignedprev, vec_lvsl (0, inbuf));
-	    }
+      for (; nblocks >= 8; nblocks -= 8)
+	{
+	  b0 = VEC_LOAD_BE (in + 0, bige_const);
+	  b1 = VEC_LOAD_BE (in + 1, bige_const);
+	  b2 = VEC_LOAD_BE (in + 2, bige_const);
+	  b3 = VEC_LOAD_BE (in + 3, bige_const);
+	  b4 = VEC_LOAD_BE (in + 4, bige_const);
+	  b5 = VEC_LOAD_BE (in + 5, bige_const);
+	  b6 = VEC_LOAD_BE (in + 6, bige_const);
+	  b7 = VEC_LOAD_BE (in + 7, bige_const);
 
-	  l0 = *(block*)ocb_get_l (c, i++);
-	  l1 = *(block*)ocb_get_l (c, i++);
-	  l2 = *(block*)ocb_get_l (c, i++);
-	  l3 = *(block*)ocb_get_l (c, i++);
-	  l4 = *(block*)ocb_get_l (c, i++);
-	  l5 = *(block*)ocb_get_l (c, i++);
-	  l6 = *(block*)ocb_get_l (c, i++);
-	  l7 = *(block*)ocb_get_l (c, i++);
+	  l = VEC_LOAD_BE (ocb_get_l (c, data_nblocks += 8), bige_const);
 
 	  ctr ^= b0 ^ b1 ^ b2 ^ b3 ^ b4 ^ b5 ^ b6 ^ b7;
 
-	  iv0 ^= l0;
+	  iv ^= rkey0;
+
+	  iv0 = iv ^ l0;
+	  iv1 = iv ^ l0 ^ l1;
+	  iv2 = iv ^ l1;
+	  iv3 = iv ^ l1 ^ l2;
+	  iv4 = iv ^ l1 ^ l2 ^ l0;
+	  iv5 = iv ^ l2 ^ l0;
+	  iv6 = iv ^ l2;
+	  iv7 = iv ^ l2 ^ l;
+
 	  b0 ^= iv0;
-	  iv1 = iv0 ^ l1;
 	  b1 ^= iv1;
-	  iv2 = iv1 ^ l2;
 	  b2 ^= iv2;
-	  iv3 = iv2 ^ l3;
 	  b3 ^= iv3;
-	  iv4 = iv3 ^ l4;
 	  b4 ^= iv4;
-	  iv5 = iv4 ^ l5;
 	  b5 ^= iv5;
-	  iv6 = iv5 ^ l6;
 	  b6 ^= iv6;
-	  iv7 = iv6 ^ l7;
 	  b7 ^= iv7;
-
-	  b0 = swap_if_le (b0);
-	  b1 = swap_if_le (b1);
-	  b2 = swap_if_le (b2);
-	  b3 = swap_if_le (b3);
-	  b4 = swap_if_le (b4);
-	  b5 = swap_if_le (b5);
-	  b6 = swap_if_le (b6);
-	  b7 = swap_if_le (b7);
-
-	  b0 ^= rk[0];
-	  b1 ^= rk[0];
-	  b2 ^= rk[0];
-	  b3 ^= rk[0];
-	  b4 ^= rk[0];
-	  b5 ^= rk[0];
-	  b6 ^= rk[0];
-	  b7 ^= rk[0];
-
-	  for (r = 1;r < rounds;r++)
-	    {
-	      __asm__ volatile ("vcipher %0, %0, %1\n\t"
-		:"+v" (b0)
-		:"v" (rk[r]));
-	      __asm__ volatile ("vcipher %0, %0, %1\n\t"
-		:"+v" (b1)
-		:"v" (rk[r]));
-	      __asm__ volatile ("vcipher %0, %0, %1\n\t"
-		:"+v" (b2)
-		:"v" (rk[r]));
-	      __asm__ volatile ("vcipher %0, %0, %1\n\t"
-		:"+v" (b3)
-		:"v" (rk[r]));
-	      __asm__ volatile ("vcipher %0, %0, %1\n\t"
-		:"+v" (b4)
-		:"v" (rk[r]));
-	      __asm__ volatile ("vcipher %0, %0, %1\n\t"
-		:"+v" (b5)
-		:"v" (rk[r]));
-	      __asm__ volatile ("vcipher %0, %0, %1\n\t"
-		:"+v" (b6)
-		:"v" (rk[r]));
-	      __asm__ volatile ("vcipher %0, %0, %1\n\t"
-		:"+v" (b7)
-		:"v" (rk[r]));
-	    }
-	  __asm__ volatile ("vcipherlast %0, %0, %1\n\t"
-	    :"+v" (b0)
-	    :"v" (rk[r]));
-	  __asm__ volatile ("vcipherlast %0, %0, %1\n\t"
-	    :"+v" (b1)
-	    :"v" (rk[r]));
-	  __asm__ volatile ("vcipherlast %0, %0, %1\n\t"
-	    :"+v" (b2)
-	    :"v" (rk[r]));
-	  __asm__ volatile ("vcipherlast %0, %0, %1\n\t"
-	    :"+v" (b3)
-	    :"v" (rk[r]));
-	  __asm__ volatile ("vcipherlast %0, %0, %1\n\t"
-	    :"+v" (b4)
-	    :"v" (rk[r]));
-	  __asm__ volatile ("vcipherlast %0, %0, %1\n\t"
-	    :"+v" (b5)
-	    :"v" (rk[r]));
-	  __asm__ volatile ("vcipherlast %0, %0, %1\n\t"
-	    :"+v" (b6)
-	    :"v" (rk[r]));
-	  __asm__ volatile ("vcipherlast %0, %0, %1\n\t"
-	    :"+v" (b7)
-	    :"v" (rk[r]));
-
-	  iv = iv7;
-
-	  /* The unaligned store stxvb16x writes big-endian,
-	     so in the unaligned case we swap the iv instead of the bytes */
-	  if ((uintptr_t)outbuf % 16 == 0)
+	  iv = iv7 ^ rkey0;
+
+#define DO_ROUND(r) \
+	      rkey = ALIGNED_LOAD (&rk[r]); \
+	      b0 = vec_cipher_be (b0, rkey); \
+	      b1 = vec_cipher_be (b1, rkey); \
+	      b2 = vec_cipher_be (b2, rkey); \
+	      b3 = vec_cipher_be (b3, rkey); \
+	      b4 = vec_cipher_be (b4, rkey); \
+	      b5 = vec_cipher_be (b5, rkey); \
+	      b6 = vec_cipher_be (b6, rkey); \
+	      b7 = vec_cipher_be (b7, rkey);
+
+	  DO_ROUND(1);
+	  DO_ROUND(2);
+	  DO_ROUND(3);
+	  DO_ROUND(4);
+	  DO_ROUND(5);
+	  DO_ROUND(6);
+	  DO_ROUND(7);
+	  DO_ROUND(8);
+	  DO_ROUND(9);
+	  if (rounds >= 12)
 	    {
-	      vec_vsx_st (swap_if_le (b0) ^ iv0, 0, out++);
-	      vec_vsx_st (swap_if_le (b1) ^ iv1, 0, out++);
-	      vec_vsx_st (swap_if_le (b2) ^ iv2, 0, out++);
-	      vec_vsx_st (swap_if_le (b3) ^ iv3, 0, out++);
-	      vec_vsx_st (swap_if_le (b4) ^ iv4, 0, out++);
-	      vec_vsx_st (swap_if_le (b5) ^ iv5, 0, out++);
-	      vec_vsx_st (swap_if_le (b6) ^ iv6, 0, out++);
-	      vec_vsx_st (swap_if_le (b7) ^ iv7, 0, out++);
+	      DO_ROUND(10);
+	      DO_ROUND(11);
+	      if (rounds > 12)
+		{
+		  DO_ROUND(12);
+		  DO_ROUND(13);
+		}
 	    }
-	  else
+
+#undef DO_ROUND
+
+	  rkey = rkeylast ^ rkey0;
+	  b0 = vec_cipherlast_be (b0, rkey ^ iv0);
+	  b1 = vec_cipherlast_be (b1, rkey ^ iv1);
+	  b2 = vec_cipherlast_be (b2, rkey ^ iv2);
+	  b3 = vec_cipherlast_be (b3, rkey ^ iv3);
+	  b4 = vec_cipherlast_be (b4, rkey ^ iv4);
+	  b5 = vec_cipherlast_be (b5, rkey ^ iv5);
+	  b6 = vec_cipherlast_be (b6, rkey ^ iv6);
+	  b7 = vec_cipherlast_be (b7, rkey ^ iv7);
+
+	  VEC_STORE_BE (out + 0, b0, bige_const);
+	  VEC_STORE_BE (out + 1, b1, bige_const);
+	  VEC_STORE_BE (out + 2, b2, bige_const);
+	  VEC_STORE_BE (out + 3, b3, bige_const);
+	  VEC_STORE_BE (out + 4, b4, bige_const);
+	  VEC_STORE_BE (out + 5, b5, bige_const);
+	  VEC_STORE_BE (out + 6, b6, bige_const);
+	  VEC_STORE_BE (out + 7, b7, bige_const);
+
+	  in += 8;
+	  out += 8;
+	}
+
+      if (nblocks >= 4 && (data_nblocks % 4) == 0)
+	{
+	  b0 = VEC_LOAD_BE (in + 0, bige_const);
+	  b1 = VEC_LOAD_BE (in + 1, bige_const);
+	  b2 = VEC_LOAD_BE (in + 2, bige_const);
+	  b3 = VEC_LOAD_BE (in + 3, bige_const);
+
+	  l = VEC_LOAD_BE (ocb_get_l (c, data_nblocks += 4), bige_const);
+
+	  ctr ^= b0 ^ b1 ^ b2 ^ b3;
+
+	  iv ^= rkey0;
+
+	  iv0 = iv ^ l0;
+	  iv1 = iv ^ l0 ^ l1;
+	  iv2 = iv ^ l1;
+	  iv3 = iv ^ l1 ^ l;
+
+	  b0 ^= iv0;
+	  b1 ^= iv1;
+	  b2 ^= iv2;
+	  b3 ^= iv3;
+	  iv = iv3 ^ rkey0;
+
+#define DO_ROUND(r) \
+	      rkey = ALIGNED_LOAD (&rk[r]); \
+	      b0 = vec_cipher_be (b0, rkey); \
+	      b1 = vec_cipher_be (b1, rkey); \
+	      b2 = vec_cipher_be (b2, rkey); \
+	      b3 = vec_cipher_be (b3, rkey);
+
+	  DO_ROUND(1);
+	  DO_ROUND(2);
+	  DO_ROUND(3);
+	  DO_ROUND(4);
+	  DO_ROUND(5);
+	  DO_ROUND(6);
+	  DO_ROUND(7);
+	  DO_ROUND(8);
+	  DO_ROUND(9);
+	  if (rounds >= 12)
 	    {
-	      b0 ^= swap_if_le (iv0);
-	      b1 ^= swap_if_le (iv1);
-	      b2 ^= swap_if_le (iv2);
-	      b3 ^= swap_if_le (iv3);
-	      b4 ^= swap_if_le (iv4);
-	      b5 ^= swap_if_le (iv5);
-	      b6 ^= swap_if_le (iv6);
-	      b7 ^= swap_if_le (iv7);
-	      __asm__ volatile ("stxvb16x %x0, %1, %2\n\t"
-		:: "wa" (b0), "r" (zero), "r" ((uintptr_t)(out++)));
-	      __asm__ volatile ("stxvb16x %x0, %1, %2\n\t"
-		:: "wa" (b1), "r" (zero), "r" ((uintptr_t)(out++)));
-	      __asm__ volatile ("stxvb16x %x0, %1, %2\n\t"
-		:: "wa" (b2), "r" (zero), "r" ((uintptr_t)(out++)));
-	      __asm__ volatile ("stxvb16x %x0, %1, %2\n\t"
-		:: "wa" (b3), "r" (zero), "r" ((uintptr_t)(out++)));
-	      __asm__ volatile ("stxvb16x %x0, %1, %2\n\t"
-		:: "wa" (b4), "r" (zero), "r" ((uintptr_t)(out++)));
-	      __asm__ volatile ("stxvb16x %x0, %1, %2\n\t"
-		:: "wa" (b5), "r" (zero), "r" ((uintptr_t)(out++)));
-	      __asm__ volatile ("stxvb16x %x0, %1, %2\n\t"
-		:: "wa" (b6), "r" (zero), "r" ((uintptr_t)(out++)));
-	      __asm__ volatile ("stxvb16x %x0, %1, %2\n\t"
-		:: "wa" (b7), "r" (zero), "r" ((uintptr_t)(out++)));
+	      DO_ROUND(10);
+	      DO_ROUND(11);
+	      if (rounds > 12)
+		{
+		  DO_ROUND(12);
+		  DO_ROUND(13);
+		}
 	    }
+
+#undef DO_ROUND
+
+	  rkey = rkeylast ^ rkey0;
+	  b0 = vec_cipherlast_be (b0, rkey ^ iv0);
+	  b1 = vec_cipherlast_be (b1, rkey ^ iv1);
+	  b2 = vec_cipherlast_be (b2, rkey ^ iv2);
+	  b3 = vec_cipherlast_be (b3, rkey ^ iv3);
+
+	  VEC_STORE_BE (out + 0, b0, bige_const);
+	  VEC_STORE_BE (out + 1, b1, bige_const);
+	  VEC_STORE_BE (out + 2, b2, bige_const);
+	  VEC_STORE_BE (out + 3, b3, bige_const);
+
+	  in += 4;
+	  out += 4;
+	  nblocks -= 4;
 	}
 
-      for ( ;nblocks; nblocks-- )
+      for (; nblocks; nblocks--)
 	{
-	  block b;
-	  u64 i = ++c->u_mode.ocb.data_nblocks;
-	  const block l = *(block*)ocb_get_l (c, i);
+	  l = VEC_LOAD_BE (ocb_get_l (c, ++data_nblocks), bige_const);
+	  b = VEC_LOAD_BE (in, bige_const);
 
 	  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
 	  iv ^= l;
-	  if ((uintptr_t)in % 16 == 0)
-	    {
-	      b = vec_ld (0, in++);
-	    }
-	  else
-	    {
-	      block unalignedprevprev;
-	      unalignedprevprev = unalignedprev;
-	      unalignedprev = vec_ld (0, in++);
-	      b = vec_perm (unalignedprevprev, unalignedprev, vec_lvsl (0, inbuf));
-	    }
-
 	  /* Checksum_i = Checksum_{i-1} xor P_i  */
 	  ctr ^= b;
 	  /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
 	  b ^= iv;
-	  b = swap_if_le (b);
-	  b = _gcry_aes_ppc8_encrypt_altivec (ctx, b);
-	  if ((uintptr_t)out % 16 == 0)
-	    {
-	      vec_vsx_st (swap_if_le (b) ^ iv, 0, out++);
-	    }
-	  else
-	    {
-	      b ^= swap_if_le (iv);
-	      __asm__ volatile ("stxvb16x %x0, %1, %2\n\t"
-	        :
-	        : "wa" (b), "r" (zero), "r" ((uintptr_t)out++));
-	    }
-	}
+	  AES_ENCRYPT (b, rounds);
+	  b ^= iv;
 
-      /* We want to store iv and ctr big-endian and the unaligned
-         store stxvb16x stores them little endian, so we have to swap them. */
-      iv = swap_if_le (iv);
-      __asm__ volatile ("stxvb16x %x0, %1, %2\n\t"
-	:: "wa" (iv), "r" (zero), "r" ((uintptr_t)&c->u_iv.iv));
-      ctr = swap_if_le (ctr);
-      __asm__ volatile ("stxvb16x %x0, %1, %2\n\t"
-	:: "wa" (ctr), "r" (zero), "r" ((uintptr_t)&c->u_ctr.ctr));
+	  VEC_STORE_BE (out, b, bige_const);
+
+	  in += 1;
+	  out += 1;
+	}
     }
   else
     {
-      const int unroll = 8;
-      block unalignedprev, ctr, iv;
-      if (((uintptr_t)inbuf % 16) != 0)
+      const u128_t *rk = (u128_t *)&ctx->keyschdec;
+
+      if (!ctx->decryption_prepared)
 	{
-	  unalignedprev = vec_ld (0, in++);
+	  aes_ppc8_prepare_decryption (ctx);
+	  ctx->decryption_prepared = 1;
 	}
 
-      iv = vec_ld (0, (block*)&c->u_iv.iv);
-      ctr = vec_ld (0, (block*)&c->u_ctr.ctr);
+      PRELOAD_ROUND_KEYS (rk, rounds);
 
-      for ( ;nblocks >= unroll; nblocks -= unroll)
+      for (; nblocks >= 8 && data_nblocks % 8; nblocks--)
 	{
-	  u64 i = c->u_mode.ocb.data_nblocks + 1;
-	  block l0, l1, l2, l3, l4, l5, l6, l7;
-	  block b0, b1, b2, b3, b4, b5, b6, b7;
-	  block iv0, iv1, iv2, iv3, iv4, iv5, iv6, iv7;
-	  const block *rk = (block*)&ctx->keyschdec;
+	  l = VEC_LOAD_BE (ocb_get_l (c, ++data_nblocks), bige_const);
+	  b = VEC_LOAD_BE (in, bige_const);
 
-	  c->u_mode.ocb.data_nblocks += unroll;
+	  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	  iv ^= l;
+	  /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i)  */
+	  b ^= iv;
+	  AES_DECRYPT (b, rounds);
+	  b ^= iv;
+	  /* Checksum_i = Checksum_{i-1} xor P_i  */
+	  ctr ^= b;
 
-	  iv0 = iv;
-	  if ((uintptr_t)inbuf % 16 == 0)
-	    {
-	      b0 = vec_ld (0, in++);
-	      b1 = vec_ld (0, in++);
-	      b2 = vec_ld (0, in++);
-	      b3 = vec_ld (0, in++);
-	      b4 = vec_ld (0, in++);
-	      b5 = vec_ld (0, in++);
-	      b6 = vec_ld (0, in++);
-	      b7 = vec_ld (0, in++);
-	    }
-	  else
-	    {
-	      block unaligned0, unaligned1, unaligned2,
-		unaligned3, unaligned4, unaligned5, unaligned6;
-	      unaligned0 = vec_ld (0, in++);
-	      unaligned1 = vec_ld (0, in++);
-	      unaligned2 = vec_ld (0, in++);
-	      unaligned3 = vec_ld (0, in++);
-	      unaligned4 = vec_ld (0, in++);
-	      unaligned5 = vec_ld (0, in++);
-	      unaligned6 = vec_ld (0, in++);
-	      b0 = vec_perm (unalignedprev, unaligned0, vec_lvsl (0, inbuf));
-	      unalignedprev = vec_ld (0, in++);
-	      b1 = vec_perm (unaligned0, unaligned1, vec_lvsl (0, inbuf));
-	      b2 = vec_perm (unaligned1, unaligned2, vec_lvsl (0, inbuf));
-	      b3 = vec_perm (unaligned2, unaligned3, vec_lvsl (0, inbuf));
-	      b4 = vec_perm (unaligned3, unaligned4, vec_lvsl (0, inbuf));
-	      b5 = vec_perm (unaligned4, unaligned5, vec_lvsl (0, inbuf));
-	      b6 = vec_perm (unaligned5, unaligned6, vec_lvsl (0, inbuf));
-	      b7 = vec_perm (unaligned6, unalignedprev, vec_lvsl (0, inbuf));
-	    }
+	  VEC_STORE_BE (out, b, bige_const);
 
-	  l0 = *(block*)ocb_get_l (c, i++);
-	  l1 = *(block*)ocb_get_l (c, i++);
-	  l2 = *(block*)ocb_get_l (c, i++);
-	  l3 = *(block*)ocb_get_l (c, i++);
-	  l4 = *(block*)ocb_get_l (c, i++);
-	  l5 = *(block*)ocb_get_l (c, i++);
-	  l6 = *(block*)ocb_get_l (c, i++);
-	  l7 = *(block*)ocb_get_l (c, i++);
+	  in += 1;
+	  out += 1;
+	}
+
+      for (; nblocks >= 8; nblocks -= 8)
+	{
+	  b0 = VEC_LOAD_BE (in + 0, bige_const);
+	  b1 = VEC_LOAD_BE (in + 1, bige_const);
+	  b2 = VEC_LOAD_BE (in + 2, bige_const);
+	  b3 = VEC_LOAD_BE (in + 3, bige_const);
+	  b4 = VEC_LOAD_BE (in + 4, bige_const);
+	  b5 = VEC_LOAD_BE (in + 5, bige_const);
+	  b6 = VEC_LOAD_BE (in + 6, bige_const);
+	  b7 = VEC_LOAD_BE (in + 7, bige_const);
+
+	  l = VEC_LOAD_BE (ocb_get_l (c, data_nblocks += 8), bige_const);
+
+	  iv ^= rkey0;
+
+	  iv0 = iv ^ l0;
+	  iv1 = iv ^ l0 ^ l1;
+	  iv2 = iv ^ l1;
+	  iv3 = iv ^ l1 ^ l2;
+	  iv4 = iv ^ l1 ^ l2 ^ l0;
+	  iv5 = iv ^ l2 ^ l0;
+	  iv6 = iv ^ l2;
+	  iv7 = iv ^ l2 ^ l;
 
-	  iv0 ^= l0;
 	  b0 ^= iv0;
-	  iv1 = iv0 ^ l1;
 	  b1 ^= iv1;
-	  iv2 = iv1 ^ l2;
 	  b2 ^= iv2;
-	  iv3 = iv2 ^ l3;
 	  b3 ^= iv3;
-	  iv4 = iv3 ^ l4;
 	  b4 ^= iv4;
-	  iv5 = iv4 ^ l5;
 	  b5 ^= iv5;
-	  iv6 = iv5 ^ l6;
 	  b6 ^= iv6;
-	  iv7 = iv6 ^ l7;
 	  b7 ^= iv7;
-
-	  b0 = swap_if_le (b0);
-	  b1 = swap_if_le (b1);
-	  b2 = swap_if_le (b2);
-	  b3 = swap_if_le (b3);
-	  b4 = swap_if_le (b4);
-	  b5 = swap_if_le (b5);
-	  b6 = swap_if_le (b6);
-	  b7 = swap_if_le (b7);
-
-	  b0 ^= rk[0];
-	  b1 ^= rk[0];
-	  b2 ^= rk[0];
-	  b3 ^= rk[0];
-	  b4 ^= rk[0];
-	  b5 ^= rk[0];
-	  b6 ^= rk[0];
-	  b7 ^= rk[0];
-
-	  for (r = 1;r < rounds;r++)
+	  iv = iv7 ^ rkey0;
+
+#define DO_ROUND(r) \
+	      rkey = ALIGNED_LOAD (&rk[r]); \
+	      b0 = vec_ncipher_be (b0, rkey); \
+	      b1 = vec_ncipher_be (b1, rkey); \
+	      b2 = vec_ncipher_be (b2, rkey); \
+	      b3 = vec_ncipher_be (b3, rkey); \
+	      b4 = vec_ncipher_be (b4, rkey); \
+	      b5 = vec_ncipher_be (b5, rkey); \
+	      b6 = vec_ncipher_be (b6, rkey); \
+	      b7 = vec_ncipher_be (b7, rkey);
+
+	  DO_ROUND(1);
+	  DO_ROUND(2);
+	  DO_ROUND(3);
+	  DO_ROUND(4);
+	  DO_ROUND(5);
+	  DO_ROUND(6);
+	  DO_ROUND(7);
+	  DO_ROUND(8);
+	  DO_ROUND(9);
+	  if (rounds >= 12)
 	    {
-	      __asm__ volatile ("vncipher %0, %0, %1\n\t"
-		:"+v" (b0)
-		:"v" (rk[r]));
-	      __asm__ volatile ("vncipher %0, %0, %1\n\t"
-		:"+v" (b1)
-		:"v" (rk[r]));
-	      __asm__ volatile ("vncipher %0, %0, %1\n\t"
-		:"+v" (b2)
-		:"v" (rk[r]));
-	      __asm__ volatile ("vncipher %0, %0, %1\n\t"
-		:"+v" (b3)
-		:"v" (rk[r]));
-	      __asm__ volatile ("vncipher %0, %0, %1\n\t"
-		:"+v" (b4)
-		:"v" (rk[r]));
-	      __asm__ volatile ("vncipher %0, %0, %1\n\t"
-		:"+v" (b5)
-		:"v" (rk[r]));
-	      __asm__ volatile ("vncipher %0, %0, %1\n\t"
-		:"+v" (b6)
-		:"v" (rk[r]));
-	      __asm__ volatile ("vncipher %0, %0, %1\n\t"
-		:"+v" (b7)
-		:"v" (rk[r]));
+	      DO_ROUND(10);
+	      DO_ROUND(11);
+	      if (rounds > 12)
+		{
+		  DO_ROUND(12);
+		  DO_ROUND(13);
+		}
 	    }
-	  __asm__ volatile ("vncipherlast %0, %0, %1\n\t"
-	    :"+v" (b0)
-	    :"v" (rk[r]));
-	  __asm__ volatile ("vncipherlast %0, %0, %1\n\t"
-	    :"+v" (b1)
-	    :"v" (rk[r]));
-	  __asm__ volatile ("vncipherlast %0, %0, %1\n\t"
-	    :"+v" (b2)
-	    :"v" (rk[r]));
-	  __asm__ volatile ("vncipherlast %0, %0, %1\n\t"
-	    :"+v" (b3)
-	    :"v" (rk[r]));
-	  __asm__ volatile ("vncipherlast %0, %0, %1\n\t"
-	    :"+v" (b4)
-	    :"v" (rk[r]));
-	  __asm__ volatile ("vncipherlast %0, %0, %1\n\t"
-	    :"+v" (b5)
-	    :"v" (rk[r]));
-	  __asm__ volatile ("vncipherlast %0, %0, %1\n\t"
-	    :"+v" (b6)
-	    :"v" (rk[r]));
-	  __asm__ volatile ("vncipherlast %0, %0, %1\n\t"
-	    :"+v" (b7)
-	    :"v" (rk[r]));
-
-	  iv = iv7;
-
-	  b0 = swap_if_le (b0) ^ iv0;
-	  b1 = swap_if_le (b1) ^ iv1;
-	  b2 = swap_if_le (b2) ^ iv2;
-	  b3 = swap_if_le (b3) ^ iv3;
-	  b4 = swap_if_le (b4) ^ iv4;
-	  b5 = swap_if_le (b5) ^ iv5;
-	  b6 = swap_if_le (b6) ^ iv6;
-	  b7 = swap_if_le (b7) ^ iv7;
+
+#undef DO_ROUND
+
+	  rkey = rkeylast ^ rkey0;
+	  b0 = vec_ncipherlast_be (b0, rkey ^ iv0);
+	  b1 = vec_ncipherlast_be (b1, rkey ^ iv1);
+	  b2 = vec_ncipherlast_be (b2, rkey ^ iv2);
+	  b3 = vec_ncipherlast_be (b3, rkey ^ iv3);
+	  b4 = vec_ncipherlast_be (b4, rkey ^ iv4);
+	  b5 = vec_ncipherlast_be (b5, rkey ^ iv5);
+	  b6 = vec_ncipherlast_be (b6, rkey ^ iv6);
+	  b7 = vec_ncipherlast_be (b7, rkey ^ iv7);
+
+	  VEC_STORE_BE (out + 0, b0, bige_const);
+	  VEC_STORE_BE (out + 1, b1, bige_const);
+	  VEC_STORE_BE (out + 2, b2, bige_const);
+	  VEC_STORE_BE (out + 3, b3, bige_const);
+	  VEC_STORE_BE (out + 4, b4, bige_const);
+	  VEC_STORE_BE (out + 5, b5, bige_const);
+	  VEC_STORE_BE (out + 6, b6, bige_const);
+	  VEC_STORE_BE (out + 7, b7, bige_const);
 
 	  ctr ^= b0 ^ b1 ^ b2 ^ b3 ^ b4 ^ b5 ^ b6 ^ b7;
 
-	  /* The unaligned store stxvb16x writes big-endian */
-	  if ((uintptr_t)outbuf % 16 == 0)
-	    {
-	      vec_vsx_st (b0, 0, out++);
-	      vec_vsx_st (b1, 0, out++);
-	      vec_vsx_st (b2, 0, out++);
-	      vec_vsx_st (b3, 0, out++);
-	      vec_vsx_st (b4, 0, out++);
-	      vec_vsx_st (b5, 0, out++);
-	      vec_vsx_st (b6, 0, out++);
-	      vec_vsx_st (b7, 0, out++);
-	    }
-	  else
+	  in += 8;
+	  out += 8;
+	}
+
+      if (nblocks >= 4 && (data_nblocks % 4) == 0)
+	{
+	  b0 = VEC_LOAD_BE (in + 0, bige_const);
+	  b1 = VEC_LOAD_BE (in + 1, bige_const);
+	  b2 = VEC_LOAD_BE (in + 2, bige_const);
+	  b3 = VEC_LOAD_BE (in + 3, bige_const);
+
+	  l = VEC_LOAD_BE (ocb_get_l (c, data_nblocks += 4), bige_const);
+
+	  iv ^= rkey0;
+
+	  iv0 = iv ^ l0;
+	  iv1 = iv ^ l0 ^ l1;
+	  iv2 = iv ^ l1;
+	  iv3 = iv ^ l1 ^ l;
+
+	  b0 ^= iv0;
+	  b1 ^= iv1;
+	  b2 ^= iv2;
+	  b3 ^= iv3;
+	  iv = iv3 ^ rkey0;
+
+#define DO_ROUND(r) \
+	      rkey = ALIGNED_LOAD (&rk[r]); \
+	      b0 = vec_ncipher_be (b0, rkey); \
+	      b1 = vec_ncipher_be (b1, rkey); \
+	      b2 = vec_ncipher_be (b2, rkey); \
+	      b3 = vec_ncipher_be (b3, rkey);
+
+	  DO_ROUND(1);
+	  DO_ROUND(2);
+	  DO_ROUND(3);
+	  DO_ROUND(4);
+	  DO_ROUND(5);
+	  DO_ROUND(6);
+	  DO_ROUND(7);
+	  DO_ROUND(8);
+	  DO_ROUND(9);
+	  if (rounds >= 12)
 	    {
-	      b0 = swap_if_le (b0);
-	      b1 = swap_if_le (b1);
-	      b2 = swap_if_le (b2);
-	      b3 = swap_if_le (b3);
-	      b4 = swap_if_le (b4);
-	      b5 = swap_if_le (b5);
-	      b6 = swap_if_le (b6);
-	      b7 = swap_if_le (b7);
-	      __asm__ ("stxvb16x %x0, %1, %2\n\t"
-		:: "wa" (b0), "r" (zero), "r" ((uintptr_t)(out++)));
-	      __asm__ ("stxvb16x %x0, %1, %2\n\t"
-		:: "wa" (b1), "r" (zero), "r" ((uintptr_t)(out++)));
-	      __asm__ ("stxvb16x %x0, %1, %2\n\t"
-		:: "wa" (b2), "r" (zero), "r" ((uintptr_t)(out++)));
-	      __asm__ ("stxvb16x %x0, %1, %2\n\t"
-		:: "wa" (b3), "r" (zero), "r" ((uintptr_t)(out++)));
-	      __asm__ ("stxvb16x %x0, %1, %2\n\t"
-		:: "wa" (b4), "r" (zero), "r" ((uintptr_t)(out++)));
-	      __asm__ ("stxvb16x %x0, %1, %2\n\t"
-		:: "wa" (b5), "r" (zero), "r" ((uintptr_t)(out++)));
-	      __asm__ ("stxvb16x %x0, %1, %2\n\t"
-		:: "wa" (b6), "r" (zero), "r" ((uintptr_t)(out++)));
-	      __asm__ ("stxvb16x %x0, %1, %2\n\t"
-		:: "wa" (b7), "r" (zero), "r" ((uintptr_t)(out++)));
+	      DO_ROUND(10);
+	      DO_ROUND(11);
+	      if (rounds > 12)
+		{
+		  DO_ROUND(12);
+		  DO_ROUND(13);
+		}
 	    }
+
+#undef DO_ROUND
+
+	  rkey = rkeylast ^ rkey0;
+	  b0 = vec_ncipherlast_be (b0, rkey ^ iv0);
+	  b1 = vec_ncipherlast_be (b1, rkey ^ iv1);
+	  b2 = vec_ncipherlast_be (b2, rkey ^ iv2);
+	  b3 = vec_ncipherlast_be (b3, rkey ^ iv3);
+
+	  VEC_STORE_BE (out + 0, b0, bige_const);
+	  VEC_STORE_BE (out + 1, b1, bige_const);
+	  VEC_STORE_BE (out + 2, b2, bige_const);
+	  VEC_STORE_BE (out + 3, b3, bige_const);
+
+	  ctr ^= b0 ^ b1 ^ b2 ^ b3;
+
+	  in += 4;
+	  out += 4;
+	  nblocks -= 4;
 	}
 
-      for ( ;nblocks; nblocks-- )
+      for (; nblocks; nblocks--)
 	{
-	  block b;
-	  u64 i = ++c->u_mode.ocb.data_nblocks;
-	  const block l = *(block*)ocb_get_l (c, i);
+	  l = VEC_LOAD_BE (ocb_get_l (c, ++data_nblocks), bige_const);
+	  b = VEC_LOAD_BE (in, bige_const);
 
 	  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
 	  iv ^= l;
-	  if ((uintptr_t)in % 16 == 0)
-	    {
-	      b = vec_ld (0, in++);
-	    }
-	  else
-	    {
-	      block unalignedprevprev;
-	      unalignedprevprev = unalignedprev;
-	      unalignedprev = vec_ld (0, in++);
-	      b = vec_perm (unalignedprevprev, unalignedprev, vec_lvsl (0, inbuf));
-	    }
-
-	  /* Checksum_i = Checksum_{i-1} xor P_i  */
-	  /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
+	  /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i)  */
+	  b ^= iv;
+	  AES_DECRYPT (b, rounds);
 	  b ^= iv;
-	  b = swap_if_le (b);
-	  b = _gcry_aes_ppc8_decrypt_altivec (ctx, b);
-	  b = swap_if_le (b) ^ iv;
+	  /* Checksum_i = Checksum_{i-1} xor P_i  */
 	  ctr ^= b;
-	  if ((uintptr_t)out % 16 == 0)
-	    {
-	      vec_vsx_st (b, 0, out++);
-	    }
-	  else
-	    {
-	      b = swap_if_le (b);
-	      __asm__ volatile ("stxvb16x %x0, %1, %2\n\t"
-		:
-		: "wa" (b), "r" (zero), "r" ((uintptr_t)out++));
-	    }
-	}
 
-      /* We want to store iv and ctr big-endian and the unaligned
-         store stxvb16x stores them little endian, so we have to swap them. */
-      iv = swap_if_le (iv);
-      __asm__ volatile ("stxvb16x %x0, %1, %2\n\t"
-	:: "wa" (iv), "r" (zero), "r" ((uintptr_t)&c->u_iv.iv));
-      ctr = swap_if_le(ctr);
-      __asm__ volatile ("stxvb16x %x0, %1, %2\n\t"
-	:: "wa" (ctr), "r" (zero), "r" ((uintptr_t)&c->u_ctr.ctr));
+	  VEC_STORE_BE (out, b, bige_const);
+
+	  in += 1;
+	  out += 1;
+	}
     }
+
+  VEC_STORE_BE (c->u_iv.iv, iv, bige_const);
+  VEC_STORE_BE (c->u_ctr.ctr, ctr, bige_const);
+  c->u_mode.ocb.data_nblocks = data_nblocks;
+
   return 0;
 }
-#endif
 
 #endif /* USE_PPC_CRYPTO */
diff --git a/cipher/rijndael.c b/cipher/rijndael.c
index 8a27dfe0b..c7bc467cf 100644
--- a/cipher/rijndael.c
+++ b/cipher/rijndael.c
@@ -210,6 +210,9 @@ extern unsigned int _gcry_aes_ppc8_encrypt(const RIJNDAEL_context *ctx,
 extern unsigned int _gcry_aes_ppc8_decrypt(const RIJNDAEL_context *ctx,
 					   unsigned char *dst,
 					   const unsigned char *src);
+extern size_t _gcry_aes_ppc8_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
+					const void *inbuf_arg, size_t nblocks,
+					int encrypt);
 #endif /*USE_PPC_CRYPTO*/
 
 static unsigned int do_encrypt (const RIJNDAEL_context *ctx, unsigned char *bx,
@@ -447,6 +450,7 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen,
       ctx->use_ppc_crypto = 1;
       if (hd)
         {
+          hd->bulk.ocb_crypt = _gcry_aes_ppc8_ocb_crypt;
         }
     }
 #endif
@@ -1380,6 +1384,12 @@ _gcry_aes_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
       return _gcry_aes_armv8_ce_ocb_crypt (c, outbuf, inbuf, nblocks, encrypt);
     }
 #endif /*USE_ARM_CE*/
+#ifdef USE_PPC_CRYPTO
+  else if (ctx->use_ppc_crypto)
+    {
+      return _gcry_aes_ppc8_ocb_crypt (c, outbuf, inbuf, nblocks, encrypt);
+    }
+#endif /*USE_PPC_CRYPTO*/
   else if (encrypt)
     {
       union { unsigned char x1[16] ATTR_ALIGNED_16; u32 x32[4]; } l_tmp;
