[PATCH 3/5] aes-ppc: add CTR32LE bulk acceleration
Jussi Kivilinna
jussi.kivilinna at iki.fi
Sun Feb 26 14:00:35 CET 2023
* cipher/rijndael-ppc-functions.h (CTR32LE_ENC_FUNC): New.
* cipher/rijndael-ppc.c (_gcry_aes_ppc8_ctr32le_enc): New.
* cipher/rijndael-ppc9le.c (_gcry_aes_ppc9le_ctr32le_enc): New.
* cipher/rijndael.c (_gcry_aes_ppc8_ctr32le_enc)
(_gcry_aes_ppc9le_ctr32le_enc): New.
(do_setkey): Set up _gcry_aes_ppc8_ctr32le_enc for POWER8 and
_gcry_aes_ppc9le_ctr32le_enc for POWER9.
--
Benchmark on POWER9:

 Before:
  AES            |  nanosecs/byte   mebibytes/sec   cycles/byte
   GCM-SIV enc   |      1.42 ns/B     672.2 MiB/s      3.26 c/B

 After:
  AES            |  nanosecs/byte   mebibytes/sec   cycles/byte
   GCM-SIV enc   |     0.725 ns/B      1316 MiB/s      1.67 c/B
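
For reference, CTR32LE is the counter mode used by GCM-SIV (RFC 8452):
only the first 32-bit word of the counter block is incremented, in
little-endian byte order, wrapping modulo 2^32. A minimal scalar sketch
of the mode follows (illustrative only; aes_encrypt_block is a
hypothetical stand-in for the block cipher, not a libgcrypt API):

  #include <stddef.h>
  #include <stdint.h>

  /* Hypothetical one-block AES primitive (stand-in, not a libgcrypt API). */
  void aes_encrypt_block (const void *key, uint8_t dst[16],
                          const uint8_t src[16]);

  static void
  ctr32le_enc_ref (const void *key, uint8_t ctr[16],
                   uint8_t *out, const uint8_t *in, size_t nblocks)
  {
    uint8_t keystream[16];
    size_t i;

    while (nblocks--)
      {
        /* keystream = E_K(ctr); XOR it into the plaintext block. */
        aes_encrypt_block (key, keystream, ctr);
        for (i = 0; i < 16; i++)
          *out++ = *in++ ^ keystream[i];

        /* Increment only the low 32-bit word, little-endian; the carry
           stays within the first four bytes (wraps mod 2^32). */
        for (i = 0; i < 4; i++)
          if (++ctr[i])
            break;
      }
  }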
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
cipher/rijndael-ppc-functions.h | 255 ++++++++++++++++++++++++++++++++
cipher/rijndael-ppc.c | 1 +
cipher/rijndael-ppc9le.c | 1 +
cipher/rijndael.c | 11 ++
4 files changed, 268 insertions(+)
diff --git a/cipher/rijndael-ppc-functions.h b/cipher/rijndael-ppc-functions.h
index 8a05d3c9..79eca7a2 100644
--- a/cipher/rijndael-ppc-functions.h
+++ b/cipher/rijndael-ppc-functions.h
@@ -2292,3 +2292,258 @@ void XTS_CRYPT_FUNC (void *context, unsigned char *tweak_arg,
#undef GEN_TWEAK
}
+
+
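+/* Bulk encryption in CTR32LE mode (32-bit little-endian counter in the
+ low bytes of the counter block, as used by GCM-SIV). Processes eight
+ and four blocks in parallel before falling back to single blocks. */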
+void CTR32LE_ENC_FUNC(void *context, unsigned char *ctr_arg, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks)
+{
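+ /* Vector adding 1 to the low 32-bit lane of the counter; element
+ order depends on host endianness. */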
+#ifndef WORDS_BIGENDIAN
+ static const vec_u32 vec_u32_one = { 1, 0, 0, 0 };
+#else
+ static const vec_u32 vec_u32_one = { 0, 0, 0, 1 };
+#endif
+ const block bige_const = asm_load_be_const();
+ RIJNDAEL_context *ctx = context;
+ const u128_t *rk = (u128_t *)&ctx->keyschenc;
+ const u128_t *in = (const u128_t *)inbuf_arg;
+ u128_t *out = (u128_t *)outbuf_arg;
+ int rounds = ctx->rounds;
+ ROUND_KEY_VARIABLES;
+ block rkeylast_orig;
+ block b;
+ vec_u32 ctr, one;
+
+ ctr = (vec_u32)vec_reve (VEC_LOAD_BE (ctr_arg, 0, bige_const));
+ one = vec_u32_one;
+
+ PRELOAD_ROUND_KEYS (rounds);
+ rkeylast_orig = rkeylast;
+
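+/* Add to the 32-bit LE counter lanes, then byte-reverse into block order. */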
+#define VEC_ADD_CTRLE32(ctrv_u32, addv_u32) \
+ vec_reve((block)((ctrv_u32) + (addv_u32)))
+
+ if (nblocks >= 4)
+ {
+ block in0, in1, in2, in3, in4, in5, in6, in7;
+ block b0, b1, b2, b3, b4, b5, b6, b7;
+ vec_u32 two, three, four, five, six, seven, eight;
+ block rkey;
+
+ two = one + one;
+ three = two + one;
+ four = two + two;
+ five = three + two;
+ six = three + three;
+ seven = four + three;
+ eight = four + four;
+
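+ /* Process eight blocks in parallel. */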
+ for (; nblocks >= 8; nblocks -= 8)
+ {
+ b1 = VEC_ADD_CTRLE32 (ctr, one);
+ b2 = VEC_ADD_CTRLE32 (ctr, two);
+ b3 = VEC_ADD_CTRLE32 (ctr, three);
+ b4 = VEC_ADD_CTRLE32 (ctr, four);
+ b5 = VEC_ADD_CTRLE32 (ctr, five);
+ b6 = VEC_ADD_CTRLE32 (ctr, six);
+ b7 = VEC_ADD_CTRLE32 (ctr, seven);
+ b0 = asm_xor (rkey0, vec_reve((block)ctr));
+ rkey = ALIGNED_LOAD (rk, 1);
+ ctr = ctr + eight;
+ b1 = asm_xor (rkey0, b1);
+ b2 = asm_xor (rkey0, b2);
+ b3 = asm_xor (rkey0, b3);
+ b0 = asm_cipher_be (b0, rkey);
+ b1 = asm_cipher_be (b1, rkey);
+ b2 = asm_cipher_be (b2, rkey);
+ b3 = asm_cipher_be (b3, rkey);
+ b4 = asm_xor (rkey0, b4);
+ b5 = asm_xor (rkey0, b5);
+ b6 = asm_xor (rkey0, b6);
+ b7 = asm_xor (rkey0, b7);
+ b4 = asm_cipher_be (b4, rkey);
+ b5 = asm_cipher_be (b5, rkey);
+ b6 = asm_cipher_be (b6, rkey);
+ b7 = asm_cipher_be (b7, rkey);
+
+#define DO_ROUND(r) \
+ rkey = ALIGNED_LOAD (rk, r); \
+ b0 = asm_cipher_be (b0, rkey); \
+ b1 = asm_cipher_be (b1, rkey); \
+ b2 = asm_cipher_be (b2, rkey); \
+ b3 = asm_cipher_be (b3, rkey); \
+ b4 = asm_cipher_be (b4, rkey); \
+ b5 = asm_cipher_be (b5, rkey); \
+ b6 = asm_cipher_be (b6, rkey); \
+ b7 = asm_cipher_be (b7, rkey);
+
+ in0 = VEC_LOAD_BE_NOSWAP (in, 0);
+ DO_ROUND(2);
+ in1 = VEC_LOAD_BE_NOSWAP (in, 1);
+ DO_ROUND(3);
+ in2 = VEC_LOAD_BE_NOSWAP (in, 2);
+ DO_ROUND(4);
+ in3 = VEC_LOAD_BE_NOSWAP (in, 3);
+ DO_ROUND(5);
+ in4 = VEC_LOAD_BE_NOSWAP (in, 4);
+ DO_ROUND(6);
+ in5 = VEC_LOAD_BE_NOSWAP (in, 5);
+ DO_ROUND(7);
+ in6 = VEC_LOAD_BE_NOSWAP (in, 6);
+ DO_ROUND(8);
+ in7 = VEC_LOAD_BE_NOSWAP (in, 7);
+ in += 8;
+ DO_ROUND(9);
+
+ if (rounds >= 12)
+ {
+ DO_ROUND(10);
+ DO_ROUND(11);
+ if (rounds > 12)
+ {
+ DO_ROUND(12);
+ DO_ROUND(13);
+ }
+ }
+
+#undef DO_ROUND
+
+ in0 = VEC_BE_SWAP (in0, bige_const);
+ in1 = VEC_BE_SWAP (in1, bige_const);
+ in2 = VEC_BE_SWAP (in2, bige_const);
+ in3 = VEC_BE_SWAP (in3, bige_const);
+ in4 = VEC_BE_SWAP (in4, bige_const);
+ in5 = VEC_BE_SWAP (in5, bige_const);
+ in6 = VEC_BE_SWAP (in6, bige_const);
+ in7 = VEC_BE_SWAP (in7, bige_const);
+
+ in0 = asm_xor (rkeylast, in0);
+ in1 = asm_xor (rkeylast, in1);
+ in2 = asm_xor (rkeylast, in2);
+ in3 = asm_xor (rkeylast, in3);
+ b0 = asm_cipherlast_be (b0, in0);
+ b1 = asm_cipherlast_be (b1, in1);
+ in4 = asm_xor (rkeylast, in4);
+ in5 = asm_xor (rkeylast, in5);
+ b2 = asm_cipherlast_be (b2, in2);
+ b3 = asm_cipherlast_be (b3, in3);
+ in6 = asm_xor (rkeylast, in6);
+ in7 = asm_xor (rkeylast, in7);
+ b4 = asm_cipherlast_be (b4, in4);
+ b5 = asm_cipherlast_be (b5, in5);
+ b6 = asm_cipherlast_be (b6, in6);
+ b7 = asm_cipherlast_be (b7, in7);
+
+ b0 = VEC_BE_SWAP (b0, bige_const);
+ b1 = VEC_BE_SWAP (b1, bige_const);
+ b2 = VEC_BE_SWAP (b2, bige_const);
+ b3 = VEC_BE_SWAP (b3, bige_const);
+ b4 = VEC_BE_SWAP (b4, bige_const);
+ b5 = VEC_BE_SWAP (b5, bige_const);
+ b6 = VEC_BE_SWAP (b6, bige_const);
+ b7 = VEC_BE_SWAP (b7, bige_const);
+ VEC_STORE_BE_NOSWAP (out, 0, b0);
+ VEC_STORE_BE_NOSWAP (out, 1, b1);
+ VEC_STORE_BE_NOSWAP (out, 2, b2);
+ VEC_STORE_BE_NOSWAP (out, 3, b3);
+ VEC_STORE_BE_NOSWAP (out, 4, b4);
+ VEC_STORE_BE_NOSWAP (out, 5, b5);
+ VEC_STORE_BE_NOSWAP (out, 6, b6);
+ VEC_STORE_BE_NOSWAP (out, 7, b7);
+ out += 8;
+ }
+
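+ /* Process four remaining blocks in parallel. */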
+ if (nblocks >= 4)
+ {
+ b1 = VEC_ADD_CTRLE32 (ctr, one);
+ b2 = VEC_ADD_CTRLE32 (ctr, two);
+ b3 = VEC_ADD_CTRLE32 (ctr, three);
+ b0 = asm_xor (rkey0, vec_reve((block)ctr));
+ ctr = ctr + four;
+ b1 = asm_xor (rkey0, b1);
+ b2 = asm_xor (rkey0, b2);
+ b3 = asm_xor (rkey0, b3);
+
+#define DO_ROUND(r) \
+ rkey = ALIGNED_LOAD (rk, r); \
+ b0 = asm_cipher_be (b0, rkey); \
+ b1 = asm_cipher_be (b1, rkey); \
+ b2 = asm_cipher_be (b2, rkey); \
+ b3 = asm_cipher_be (b3, rkey);
+
+ DO_ROUND(1);
+ DO_ROUND(2);
+ DO_ROUND(3);
+ DO_ROUND(4);
+ DO_ROUND(5);
+ DO_ROUND(6);
+ DO_ROUND(7);
+ DO_ROUND(8);
+
+ in0 = VEC_LOAD_BE (in, 0, bige_const);
+ in1 = VEC_LOAD_BE (in, 1, bige_const);
+ in2 = VEC_LOAD_BE (in, 2, bige_const);
+ in3 = VEC_LOAD_BE (in, 3, bige_const);
+
+ DO_ROUND(9);
+ if (rounds >= 12)
+ {
+ DO_ROUND(10);
+ DO_ROUND(11);
+ if (rounds > 12)
+ {
+ DO_ROUND(12);
+ DO_ROUND(13);
+ }
+ }
+
+#undef DO_ROUND
+
+ in0 = asm_xor (rkeylast, in0);
+ in1 = asm_xor (rkeylast, in1);
+ in2 = asm_xor (rkeylast, in2);
+ in3 = asm_xor (rkeylast, in3);
+
+ b0 = asm_cipherlast_be (b0, in0);
+ b1 = asm_cipherlast_be (b1, in1);
+ b2 = asm_cipherlast_be (b2, in2);
+ b3 = asm_cipherlast_be (b3, in3);
+
+ VEC_STORE_BE (out, 0, b0, bige_const);
+ VEC_STORE_BE (out, 1, b1, bige_const);
+ VEC_STORE_BE (out, 2, b2, bige_const);
+ VEC_STORE_BE (out, 3, b3, bige_const);
+
+ in += 4;
+ out += 4;
+ nblocks -= 4;
+ }
+ }
+
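+ /* Process any remaining blocks one at a time. */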
+ for (; nblocks; nblocks--)
+ {
+ b = vec_reve((block)ctr);
+ ctr = ctr + one;
+ rkeylast = rkeylast_orig ^ VEC_LOAD_BE (in, 0, bige_const);
+
+ AES_ENCRYPT (b, rounds);
+
+ VEC_STORE_BE (out, 0, b, bige_const);
+
+ out++;
+ in++;
+ }
+
+#undef VEC_ADD_CTRLE32
+
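+ /* Store the updated counter back for the caller. */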
+ VEC_STORE_BE (ctr_arg, 0, vec_reve((block)ctr), bige_const);
+}
diff --git a/cipher/rijndael-ppc.c b/cipher/rijndael-ppc.c
index 53c4f126..d16fbb40 100644
--- a/cipher/rijndael-ppc.c
+++ b/cipher/rijndael-ppc.c
@@ -198,6 +198,7 @@ _gcry_aes_ppc8_prepare_decryption (RIJNDAEL_context *ctx)
#define OCB_CRYPT_FUNC _gcry_aes_ppc8_ocb_crypt
#define OCB_AUTH_FUNC _gcry_aes_ppc8_ocb_auth
#define XTS_CRYPT_FUNC _gcry_aes_ppc8_xts_crypt
+#define CTR32LE_ENC_FUNC _gcry_aes_ppc8_ctr32le_enc
#include <rijndael-ppc-functions.h>
diff --git a/cipher/rijndael-ppc9le.c b/cipher/rijndael-ppc9le.c
index 9ce9c224..f7055290 100644
--- a/cipher/rijndael-ppc9le.c
+++ b/cipher/rijndael-ppc9le.c
@@ -97,6 +97,7 @@ asm_store_be_noswap(block vec, unsigned long offset, void *ptr)
#define OCB_CRYPT_FUNC _gcry_aes_ppc9le_ocb_crypt
#define OCB_AUTH_FUNC _gcry_aes_ppc9le_ocb_auth
#define XTS_CRYPT_FUNC _gcry_aes_ppc9le_xts_crypt
+#define CTR32LE_ENC_FUNC _gcry_aes_ppc9le_ctr32le_enc
#include <rijndael-ppc-functions.h>
diff --git a/cipher/rijndael.c b/cipher/rijndael.c
index 071d4a16..b49a0642 100644
--- a/cipher/rijndael.c
+++ b/cipher/rijndael.c
@@ -280,6 +280,10 @@ extern void _gcry_aes_ppc8_xts_crypt (void *context, unsigned char *tweak,
void *outbuf_arg,
const void *inbuf_arg,
size_t nblocks, int encrypt);
+
+extern void _gcry_aes_ppc8_ctr32le_enc (void *context, unsigned char *ctr,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks);
#endif /*USE_PPC_CRYPTO*/
#ifdef USE_PPC_CRYPTO_WITH_PPC9LE
@@ -322,6 +326,11 @@ extern void _gcry_aes_ppc9le_xts_crypt (void *context, unsigned char *tweak,
const void *inbuf_arg,
size_t nblocks, int encrypt);
+extern void _gcry_aes_ppc9le_ctr32le_enc (void *context, unsigned char *ctr,
+ void *outbuf_arg,
+ const void *inbuf_arg,
+ size_t nblocks);
+
extern size_t _gcry_aes_p10le_gcm_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
const void *inbuf_arg,
size_t nblocks, int encrypt);
@@ -633,6 +642,7 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen,
bulk_ops->ocb_crypt = _gcry_aes_ppc9le_ocb_crypt;
bulk_ops->ocb_auth = _gcry_aes_ppc9le_ocb_auth;
bulk_ops->xts_crypt = _gcry_aes_ppc9le_xts_crypt;
+ bulk_ops->ctr32le_enc = _gcry_aes_ppc9le_ctr32le_enc;
if (hwfeatures & HWF_PPC_ARCH_3_10) /* for P10 */
bulk_ops->gcm_crypt = _gcry_aes_p10le_gcm_crypt;
# ifdef ENABLE_FORCE_SOFT_HWFEATURES
@@ -663,6 +673,7 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen,
bulk_ops->ocb_crypt = _gcry_aes_ppc8_ocb_crypt;
bulk_ops->ocb_auth = _gcry_aes_ppc8_ocb_auth;
bulk_ops->xts_crypt = _gcry_aes_ppc8_xts_crypt;
+ bulk_ops->ctr32le_enc = _gcry_aes_ppc8_ctr32le_enc;
}
#endif
#ifdef USE_S390X_CRYPTO
--
2.37.2