[PATCH 2/5] aes-ppc: add ECB bulk acceleration for benchmarking purposes
Jussi Kivilinna
jussi.kivilinna at iki.fi
Sun Feb 26 14:00:34 CET 2023
* cipher/rijndael-ppc-functions.h (ECB_CRYPT_FUNC): New.
* cipher/rijndael-ppc.c (_gcry_aes_ppc8_ecb_crypt): New.
* cipher/rijndael-ppc9le.c (_gcry_aes_ppc9le_ecb_crypt): New.
* cipher/rijndael.c (_gcry_aes_ppc8_ecb_crypt)
(_gcry_aes_ppc9le_ecb_crypt): New.
(do_setkey): Set up _gcry_aes_ppc8_ecb_crypt for POWER8 and
_gcry_aes_ppc9le_ecb_crypt for POWER9.
--
Benchmark on POWER9:
Before:
AES | nanosecs/byte mebibytes/sec cycles/byte
ECB enc | 0.875 ns/B 1090 MiB/s 2.01 c/B
ECB dec | 1.06 ns/B 899.8 MiB/s 2.44 c/B
After:
AES | nanosecs/byte mebibytes/sec cycles/byte
ECB enc | 0.305 ns/B 3126 MiB/s 0.702 c/B
ECB dec | 0.305 ns/B 3126 MiB/s 0.702 c/B
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
cipher/rijndael-ppc-functions.h | 257 ++++++++++++++++++++++++++++++++
cipher/rijndael-ppc.c | 1 +
cipher/rijndael-ppc9le.c | 1 +
cipher/rijndael.c | 10 ++
4 files changed, 269 insertions(+)
diff --git a/cipher/rijndael-ppc-functions.h b/cipher/rijndael-ppc-functions.h
index 063c5358..8a05d3c9 100644
--- a/cipher/rijndael-ppc-functions.h
+++ b/cipher/rijndael-ppc-functions.h
@@ -118,6 +118,263 @@ void CFB_ENC_FUNC (void *context, unsigned char *iv_arg,
VEC_STORE_BE (iv_arg, 0, outiv, bige_const);
}
+
+void ECB_CRYPT_FUNC (void *context, void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks, int encrypt)
+{
+ const block bige_const = asm_load_be_const();
+ RIJNDAEL_context *ctx = context;
+ const u128_t *rk = encrypt ? (u128_t *)&ctx->keyschenc
+ : (u128_t *)&ctx->keyschdec;
+ const u128_t *in = (const u128_t *)inbuf_arg;
+ u128_t *out = (u128_t *)outbuf_arg;
+ int rounds = ctx->rounds;
+ ROUND_KEY_VARIABLES;
+ block b0, b1, b2, b3, b4, b5, b6, b7;
+ block rkey;
+
+ if (!encrypt && !ctx->decryption_prepared)
+ {
+ internal_aes_ppc_prepare_decryption (ctx);
+ ctx->decryption_prepared = 1;
+ }
+
+ PRELOAD_ROUND_KEYS (rounds);
+
+ for (; nblocks >= 8; nblocks -= 8)
+ {
+ b0 = VEC_LOAD_BE (in, 0, bige_const);
+ b1 = VEC_LOAD_BE (in, 1, bige_const);
+ b2 = VEC_LOAD_BE (in, 2, bige_const);
+ b3 = VEC_LOAD_BE (in, 3, bige_const);
+ b0 = asm_xor (rkey0, b0);
+ b1 = asm_xor (rkey0, b1);
+ b4 = VEC_LOAD_BE (in, 4, bige_const);
+ b5 = VEC_LOAD_BE (in, 5, bige_const);
+ b2 = asm_xor (rkey0, b2);
+ b3 = asm_xor (rkey0, b3);
+ b6 = VEC_LOAD_BE (in, 6, bige_const);
+ b7 = VEC_LOAD_BE (in, 7, bige_const);
+ in += 8;
+ b4 = asm_xor (rkey0, b4);
+ b5 = asm_xor (rkey0, b5);
+ b6 = asm_xor (rkey0, b6);
+ b7 = asm_xor (rkey0, b7);
+
+ if (encrypt)
+ {
+#define DO_ROUND(r) \
+ rkey = ALIGNED_LOAD (rk, r); \
+ b0 = asm_cipher_be (b0, rkey); \
+ b1 = asm_cipher_be (b1, rkey); \
+ b2 = asm_cipher_be (b2, rkey); \
+ b3 = asm_cipher_be (b3, rkey); \
+ b4 = asm_cipher_be (b4, rkey); \
+ b5 = asm_cipher_be (b5, rkey); \
+ b6 = asm_cipher_be (b6, rkey); \
+ b7 = asm_cipher_be (b7, rkey);
+
+ DO_ROUND(1);
+ DO_ROUND(2);
+ DO_ROUND(3);
+ DO_ROUND(4);
+ DO_ROUND(5);
+ DO_ROUND(6);
+ DO_ROUND(7);
+ DO_ROUND(8);
+ DO_ROUND(9);
+ if (rounds >= 12)
+ {
+ DO_ROUND(10);
+ DO_ROUND(11);
+ if (rounds > 12)
+ {
+ DO_ROUND(12);
+ DO_ROUND(13);
+ }
+ }
+
+#undef DO_ROUND
+
+ b0 = asm_cipherlast_be (b0, rkeylast);
+ b1 = asm_cipherlast_be (b1, rkeylast);
+ b2 = asm_cipherlast_be (b2, rkeylast);
+ b3 = asm_cipherlast_be (b3, rkeylast);
+ b4 = asm_cipherlast_be (b4, rkeylast);
+ b5 = asm_cipherlast_be (b5, rkeylast);
+ b6 = asm_cipherlast_be (b6, rkeylast);
+ b7 = asm_cipherlast_be (b7, rkeylast);
+ }
+ else
+ {
+#define DO_ROUND(r) \
+ rkey = ALIGNED_LOAD (rk, r); \
+ b0 = asm_ncipher_be (b0, rkey); \
+ b1 = asm_ncipher_be (b1, rkey); \
+ b2 = asm_ncipher_be (b2, rkey); \
+ b3 = asm_ncipher_be (b3, rkey); \
+ b4 = asm_ncipher_be (b4, rkey); \
+ b5 = asm_ncipher_be (b5, rkey); \
+ b6 = asm_ncipher_be (b6, rkey); \
+ b7 = asm_ncipher_be (b7, rkey);
+
+ DO_ROUND(1);
+ DO_ROUND(2);
+ DO_ROUND(3);
+ DO_ROUND(4);
+ DO_ROUND(5);
+ DO_ROUND(6);
+ DO_ROUND(7);
+ DO_ROUND(8);
+ DO_ROUND(9);
+ if (rounds >= 12)
+ {
+ DO_ROUND(10);
+ DO_ROUND(11);
+ if (rounds > 12)
+ {
+ DO_ROUND(12);
+ DO_ROUND(13);
+ }
+ }
+
+#undef DO_ROUND
+
+ b0 = asm_ncipherlast_be (b0, rkeylast);
+ b1 = asm_ncipherlast_be (b1, rkeylast);
+ b2 = asm_ncipherlast_be (b2, rkeylast);
+ b3 = asm_ncipherlast_be (b3, rkeylast);
+ b4 = asm_ncipherlast_be (b4, rkeylast);
+ b5 = asm_ncipherlast_be (b5, rkeylast);
+ b6 = asm_ncipherlast_be (b6, rkeylast);
+ b7 = asm_ncipherlast_be (b7, rkeylast);
+ }
+
+ VEC_STORE_BE (out, 0, b0, bige_const);
+ VEC_STORE_BE (out, 1, b1, bige_const);
+ VEC_STORE_BE (out, 2, b2, bige_const);
+ VEC_STORE_BE (out, 3, b3, bige_const);
+ VEC_STORE_BE (out, 4, b4, bige_const);
+ VEC_STORE_BE (out, 5, b5, bige_const);
+ VEC_STORE_BE (out, 6, b6, bige_const);
+ VEC_STORE_BE (out, 7, b7, bige_const);
+ out += 8;
+ }
+
+ if (nblocks >= 4)
+ {
+ b0 = VEC_LOAD_BE (in, 0, bige_const);
+ b1 = VEC_LOAD_BE (in, 1, bige_const);
+ b2 = VEC_LOAD_BE (in, 2, bige_const);
+ b3 = VEC_LOAD_BE (in, 3, bige_const);
+
+ b0 = asm_xor (rkey0, b0);
+ b1 = asm_xor (rkey0, b1);
+ b2 = asm_xor (rkey0, b2);
+ b3 = asm_xor (rkey0, b3);
+
+ if (encrypt)
+ {
+#define DO_ROUND(r) \
+ rkey = ALIGNED_LOAD (rk, r); \
+ b0 = asm_cipher_be (b0, rkey); \
+ b1 = asm_cipher_be (b1, rkey); \
+ b2 = asm_cipher_be (b2, rkey); \
+ b3 = asm_cipher_be (b3, rkey);
+
+ DO_ROUND(1);
+ DO_ROUND(2);
+ DO_ROUND(3);
+ DO_ROUND(4);
+ DO_ROUND(5);
+ DO_ROUND(6);
+ DO_ROUND(7);
+ DO_ROUND(8);
+ DO_ROUND(9);
+ if (rounds >= 12)
+ {
+ DO_ROUND(10);
+ DO_ROUND(11);
+ if (rounds > 12)
+ {
+ DO_ROUND(12);
+ DO_ROUND(13);
+ }
+ }
+#undef DO_ROUND
+
+ b0 = asm_cipherlast_be (b0, rkeylast);
+ b1 = asm_cipherlast_be (b1, rkeylast);
+ b2 = asm_cipherlast_be (b2, rkeylast);
+ b3 = asm_cipherlast_be (b3, rkeylast);
+ }
+ else
+ {
+#define DO_ROUND(r) \
+ rkey = ALIGNED_LOAD (rk, r); \
+ b0 = asm_ncipher_be (b0, rkey); \
+ b1 = asm_ncipher_be (b1, rkey); \
+ b2 = asm_ncipher_be (b2, rkey); \
+ b3 = asm_ncipher_be (b3, rkey);
+
+ DO_ROUND(1);
+ DO_ROUND(2);
+ DO_ROUND(3);
+ DO_ROUND(4);
+ DO_ROUND(5);
+ DO_ROUND(6);
+ DO_ROUND(7);
+ DO_ROUND(8);
+ DO_ROUND(9);
+ if (rounds >= 12)
+ {
+ DO_ROUND(10);
+ DO_ROUND(11);
+ if (rounds > 12)
+ {
+ DO_ROUND(12);
+ DO_ROUND(13);
+ }
+ }
+#undef DO_ROUND
+
+ b0 = asm_ncipherlast_be (b0, rkeylast);
+ b1 = asm_ncipherlast_be (b1, rkeylast);
+ b2 = asm_ncipherlast_be (b2, rkeylast);
+ b3 = asm_ncipherlast_be (b3, rkeylast);
+ }
+
+ VEC_STORE_BE (out, 0, b0, bige_const);
+ VEC_STORE_BE (out, 1, b1, bige_const);
+ VEC_STORE_BE (out, 2, b2, bige_const);
+ VEC_STORE_BE (out, 3, b3, bige_const);
+
+ in += 4;
+ out += 4;
+ nblocks -= 4;
+ }
+
+ for (; nblocks; nblocks--)
+ {
+ b0 = VEC_LOAD_BE (in, 0, bige_const);
+
+ if (encrypt)
+ {
+ AES_ENCRYPT (b0, rounds);
+ }
+ else
+ {
+ AES_DECRYPT (b0, rounds);
+ }
+
+ VEC_STORE_BE (out, 0, b0, bige_const);
+
+ out++;
+ in++;
+ }
+}
+
+
void CFB_DEC_FUNC (void *context, unsigned char *iv_arg,
void *outbuf_arg, const void *inbuf_arg,
size_t nblocks)
diff --git a/cipher/rijndael-ppc.c b/cipher/rijndael-ppc.c
index 19f6a7e1..53c4f126 100644
--- a/cipher/rijndael-ppc.c
+++ b/cipher/rijndael-ppc.c
@@ -189,6 +189,7 @@ _gcry_aes_ppc8_prepare_decryption (RIJNDAEL_context *ctx)
#define GCRY_AES_PPC8 1
#define ENCRYPT_BLOCK_FUNC _gcry_aes_ppc8_encrypt
#define DECRYPT_BLOCK_FUNC _gcry_aes_ppc8_decrypt
+#define ECB_CRYPT_FUNC _gcry_aes_ppc8_ecb_crypt
#define CFB_ENC_FUNC _gcry_aes_ppc8_cfb_enc
#define CFB_DEC_FUNC _gcry_aes_ppc8_cfb_dec
#define CBC_ENC_FUNC _gcry_aes_ppc8_cbc_enc
diff --git a/cipher/rijndael-ppc9le.c b/cipher/rijndael-ppc9le.c
index facdedd4..9ce9c224 100644
--- a/cipher/rijndael-ppc9le.c
+++ b/cipher/rijndael-ppc9le.c
@@ -88,6 +88,7 @@ asm_store_be_noswap(block vec, unsigned long offset, void *ptr)
#define GCRY_AES_PPC9LE 1
#define ENCRYPT_BLOCK_FUNC _gcry_aes_ppc9le_encrypt
#define DECRYPT_BLOCK_FUNC _gcry_aes_ppc9le_decrypt
+#define ECB_CRYPT_FUNC _gcry_aes_ppc9le_ecb_crypt
#define CFB_ENC_FUNC _gcry_aes_ppc9le_cfb_enc
#define CFB_DEC_FUNC _gcry_aes_ppc9le_cfb_dec
#define CBC_ENC_FUNC _gcry_aes_ppc9le_cbc_enc
diff --git a/cipher/rijndael.c b/cipher/rijndael.c
index 84cb7109..071d4a16 100644
--- a/cipher/rijndael.c
+++ b/cipher/rijndael.c
@@ -250,6 +250,10 @@ extern unsigned int _gcry_aes_ppc8_decrypt(const RIJNDAEL_context *ctx,
unsigned char *dst,
const unsigned char *src);
+extern void _gcry_aes_ppc8_ecb_crypt (void *context, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks,
+ int encrypt);
+
extern void _gcry_aes_ppc8_cfb_enc (void *context, unsigned char *iv,
void *outbuf_arg, const void *inbuf_arg,
size_t nblocks);
@@ -287,6 +291,10 @@ extern unsigned int _gcry_aes_ppc9le_decrypt(const RIJNDAEL_context *ctx,
unsigned char *dst,
const unsigned char *src);
+extern void _gcry_aes_ppc9le_ecb_crypt (void *context, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks,
+ int encrypt);
+
extern void _gcry_aes_ppc9le_cfb_enc (void *context, unsigned char *iv,
void *outbuf_arg, const void *inbuf_arg,
size_t nblocks);
@@ -616,6 +624,7 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen,
ctx->prepare_decryption = _gcry_aes_ppc8_prepare_decryption;
/* Setup PPC9LE bulk encryption routines. */
+ bulk_ops->ecb_crypt = _gcry_aes_ppc9le_ecb_crypt;
bulk_ops->cfb_enc = _gcry_aes_ppc9le_cfb_enc;
bulk_ops->cfb_dec = _gcry_aes_ppc9le_cfb_dec;
bulk_ops->cbc_enc = _gcry_aes_ppc9le_cbc_enc;
@@ -645,6 +654,7 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen,
ctx->prepare_decryption = _gcry_aes_ppc8_prepare_decryption;
/* Setup PPC8 bulk encryption routines. */
+ bulk_ops->ecb_crypt = _gcry_aes_ppc8_ecb_crypt;
bulk_ops->cfb_enc = _gcry_aes_ppc8_cfb_enc;
bulk_ops->cfb_dec = _gcry_aes_ppc8_cfb_dec;
bulk_ops->cbc_enc = _gcry_aes_ppc8_cbc_enc;
--
2.37.2
More information about the Gcrypt-devel
mailing list