[PATCH 1/2] rijndael-ppc: small speed-up for CBC and CFB encryption
Jussi Kivilinna
jussi.kivilinna at iki.fi
Wed Jul 27 22:16:31 CEST 2022
* cipher/rijndael-ppc-common.h (AES_ENCRYPT_ALL): Remove.
* cipher/rijndael-ppc-functions.h (CFB_ENC_FUNC)
(CBC_ENC_FUNC): Remove two-block unrolled loop; optimize single
block loop for shorter critical path.
--
Patch gives a small ~3% performance increase for CBC and CFB
encryption, tested on POWER8.
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
cipher/rijndael-ppc-common.h | 25 ------
cipher/rijndael-ppc-functions.h | 131 ++++++++++++++++++--------------
2 files changed, 74 insertions(+), 82 deletions(-)
diff --git a/cipher/rijndael-ppc-common.h b/cipher/rijndael-ppc-common.h
index bbbeaac0..3fa9a0b9 100644
--- a/cipher/rijndael-ppc-common.h
+++ b/cipher/rijndael-ppc-common.h
@@ -158,31 +158,6 @@ typedef union
rkeylast = ALIGNED_LOAD (rk, nrounds); \
} while (0)
-#define AES_ENCRYPT_ALL(blk, nrounds) \
- do { \
- blk ^= rkey0; \
- blk = asm_cipher_be (blk, rkey1); \
- blk = asm_cipher_be (blk, rkey2); \
- blk = asm_cipher_be (blk, rkey3); \
- blk = asm_cipher_be (blk, rkey4); \
- blk = asm_cipher_be (blk, rkey5); \
- blk = asm_cipher_be (blk, rkey6); \
- blk = asm_cipher_be (blk, rkey7); \
- blk = asm_cipher_be (blk, rkey8); \
- blk = asm_cipher_be (blk, rkey9); \
- if (nrounds >= 12) \
- { \
- blk = asm_cipher_be (blk, rkey10); \
- blk = asm_cipher_be (blk, rkey11); \
- if (rounds > 12) \
- { \
- blk = asm_cipher_be (blk, rkey12); \
- blk = asm_cipher_be (blk, rkey13); \
- } \
- } \
- blk = asm_cipherlast_be (blk, rkeylast); \
- } while (0)
-
static ASM_FUNC_ATTR_INLINE block
asm_aligned_ld(unsigned long offset, const void *ptr)
diff --git a/cipher/rijndael-ppc-functions.h b/cipher/rijndael-ppc-functions.h
index 72f31852..f95489d9 100644
--- a/cipher/rijndael-ppc-functions.h
+++ b/cipher/rijndael-ppc-functions.h
@@ -76,43 +76,46 @@ void CFB_ENC_FUNC (void *context, unsigned char *iv_arg,
u128_t *out = (u128_t *)outbuf_arg;
int rounds = ctx->rounds;
ROUND_KEY_VARIABLES_ALL;
- block rkeylast_orig;
- block iv;
+ block key0_xor_keylast;
+ block iv, outiv;
iv = VEC_LOAD_BE (iv_arg, 0, bige_const);
+ outiv = iv;
PRELOAD_ROUND_KEYS_ALL (rounds);
- rkeylast_orig = rkeylast;
-
- for (; nblocks >= 2; nblocks -= 2)
- {
- block in2, iv1;
-
- rkeylast = rkeylast_orig ^ VEC_LOAD_BE (in, 0, bige_const);
- in2 = VEC_LOAD_BE (in + 1, 0, bige_const);
- in += 2;
-
- AES_ENCRYPT_ALL (iv, rounds);
-
- iv1 = iv;
- rkeylast = rkeylast_orig ^ in2;
-
- AES_ENCRYPT_ALL (iv, rounds);
-
- VEC_STORE_BE (out++, 0, iv1, bige_const);
- VEC_STORE_BE (out++, 0, iv, bige_const);
- }
+ key0_xor_keylast = rkey0 ^ rkeylast;
+ iv ^= rkey0;
for (; nblocks; nblocks--)
{
- rkeylast = rkeylast_orig ^ VEC_LOAD_BE (in++, 0, bige_const);
-
- AES_ENCRYPT_ALL (iv, rounds);
+ rkeylast = key0_xor_keylast ^ VEC_LOAD_BE (in++, 0, bige_const);
+
+ iv = asm_cipher_be (iv, rkey1);
+ iv = asm_cipher_be (iv, rkey2);
+ iv = asm_cipher_be (iv, rkey3);
+ iv = asm_cipher_be (iv, rkey4);
+ iv = asm_cipher_be (iv, rkey5);
+ iv = asm_cipher_be (iv, rkey6);
+ iv = asm_cipher_be (iv, rkey7);
+ iv = asm_cipher_be (iv, rkey8);
+ iv = asm_cipher_be (iv, rkey9);
+ if (rounds >= 12)
+ {
+ iv = asm_cipher_be (iv, rkey10);
+ iv = asm_cipher_be (iv, rkey11);
+ if (rounds > 12)
+ {
+ iv = asm_cipher_be (iv, rkey12);
+ iv = asm_cipher_be (iv, rkey13);
+ }
+ }
+ iv = asm_cipherlast_be (iv, rkeylast);
- VEC_STORE_BE (out++, 0, iv, bige_const);
+ outiv = rkey0 ^ iv;
+ VEC_STORE_BE (out++, 0, outiv, bige_const);
}
- VEC_STORE_BE (iv_arg, 0, iv, bige_const);
+ VEC_STORE_BE (iv_arg, 0, outiv, bige_const);
}
void CFB_DEC_FUNC (void *context, unsigned char *iv_arg,
@@ -324,47 +327,61 @@ void CBC_ENC_FUNC (void *context, unsigned char *iv_arg,
byte *out = (byte *)outbuf_arg;
int rounds = ctx->rounds;
ROUND_KEY_VARIABLES_ALL;
- block lastiv, b;
+ block iv, key0_xor_keylast, nextiv, outiv;
unsigned int outadd = -(!cbc_mac) & 16;
- lastiv = VEC_LOAD_BE (iv_arg, 0, bige_const);
-
- PRELOAD_ROUND_KEYS_ALL (rounds);
-
- for (; nblocks >= 2; nblocks -= 2)
- {
- block in2, lastiv1;
-
- b = lastiv ^ VEC_LOAD_BE (in, 0, bige_const);
- in2 = VEC_LOAD_BE (in + 1, 0, bige_const);
- in += 2;
-
- AES_ENCRYPT_ALL (b, rounds);
+ if (nblocks == 0) /* CMAC may call with nblocks 0. */
+ return;
- lastiv1 = b;
- b = lastiv1 ^ in2;
+ iv = VEC_LOAD_BE (iv_arg, 0, bige_const);
- AES_ENCRYPT_ALL (b, rounds);
+ PRELOAD_ROUND_KEYS_ALL (rounds);
+ key0_xor_keylast = rkey0 ^ rkeylast;
- lastiv = b;
- VEC_STORE_BE ((u128_t *)out, 0, lastiv1, bige_const);
- out += outadd;
- VEC_STORE_BE ((u128_t *)out, 0, lastiv, bige_const);
- out += outadd;
- }
+ nextiv = VEC_LOAD_BE (in++, 0, bige_const);
+ iv ^= rkey0 ^ nextiv;
- for (; nblocks; nblocks--)
+ do
{
- b = lastiv ^ VEC_LOAD_BE (in++, 0, bige_const);
-
- AES_ENCRYPT_ALL (b, rounds);
+ if (--nblocks)
+ {
+ nextiv = key0_xor_keylast ^ VEC_LOAD_BE (in++, 0, bige_const);
+ }
- lastiv = b;
- VEC_STORE_BE ((u128_t *)out, 0, b, bige_const);
+ iv = asm_cipher_be (iv, rkey1);
+ iv = asm_cipher_be (iv, rkey2);
+ iv = asm_cipher_be (iv, rkey3);
+ iv = asm_cipher_be (iv, rkey4);
+ iv = asm_cipher_be (iv, rkey5);
+ iv = asm_cipher_be (iv, rkey6);
+ iv = asm_cipher_be (iv, rkey7);
+ iv = asm_cipher_be (iv, rkey8);
+ iv = asm_cipher_be (iv, rkey9);
+ if (rounds >= 12)
+ {
+ iv = asm_cipher_be (iv, rkey10);
+ iv = asm_cipher_be (iv, rkey11);
+ if (rounds > 12)
+ {
+ iv = asm_cipher_be (iv, rkey12);
+ iv = asm_cipher_be (iv, rkey13);
+ }
+ }
+ outiv = iv;
+ /* Proper order for following instructions is important for best
+ * performance on POWER8: the output path vcipherlast needs to be
+ * last one. */
+ __asm__ volatile ("vcipherlast %0, %0, %2\n\t"
+ "vcipherlast %1, %1, %3\n\t"
+ : "+v" (iv), "+outiv" (outiv)
+ : "v" (nextiv), "v" (rkeylast));
+
+ VEC_STORE_BE ((u128_t *)out, 0, outiv, bige_const);
out += outadd;
}
+ while (nblocks);
- VEC_STORE_BE (iv_arg, 0, lastiv, bige_const);
+ VEC_STORE_BE (iv_arg, 0, outiv, bige_const);
}
void CBC_DEC_FUNC (void *context, unsigned char *iv_arg,
--
2.34.1
More information about the Gcrypt-devel
mailing list