[PATCH 3/5] Add PowerPC crypto acceleration support for SHA2.
Shawn Landden
shawn at git.icu
Tue Jul 9 16:58:10 CEST 2019
There is also a POWER assembly version available for POWER7 and older
CPUs, but it was not ported.
I am getting a ~10% improvement on SHA-256 and a ~40% improvement on
SHA-512.
Note that the cycles/byte figures are unreliable: turbo mode raises the
actual clock above the auto-detected 1895 MHz used to compute them.
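(A re-run with the clock pinned, e.g. by passing the real frequency to
tests/bench-slope via its --cpu-mhz option, should give trustworthy c/B
figures; the ns/B and MiB/s columns are not affected.)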
Before:
                |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
 SHA224         |      3.24 ns/B     293.9 MiB/s      6.15 c/B      1895
 SHA256         |      3.24 ns/B     294.7 MiB/s      6.11 c/B      1887
 SHA384         |      2.69 ns/B     354.8 MiB/s      5.10 c/B      1895
 SHA512         |      2.69 ns/B     354.7 MiB/s      5.10 c/B      1895
After:
                |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
 SHA224         |      2.91 ns/B     327.4 MiB/s      5.52 c/B      1895
 SHA256         |      2.92 ns/B     326.1 MiB/s      5.54 c/B      1895
 SHA384         |      1.82 ns/B     523.4 MiB/s      3.45 c/B      1895
 SHA512         |      1.82 ns/B     525.0 MiB/s      3.44 c/B      1895
Fixes T4530
2019-07-09  Shawn Landden  <shawn at git.icu>

	* cipher/Makefile.am: Build the new SHA assembly files.
	* configure.ac: Link the SHA assembly files matching the target.
	* cipher/rijndael-ppc8.pl: Do not load the round count from the
	context; pass it as a register argument instead.
	* cipher/rijndael-ppc8.S: Regenerate.
	* cipher/rijndael-ppc832.S: Regenerate.
	* cipher/rijndael-ppc8be.S: Regenerate.
	* cipher/rijndael.c: Add glue for all the new code.
	* cipher/sha256.c: Add glue; move shared definitions to
	sha2-common.h.
	* cipher/sha512.c: Likewise.
	* cipher/sha2-common.h: New file for the definitions sha256.c and
	sha512.c have in common.
	* cipher/sha512p8-ppc.pl: Rename to...
	* cipher/sha512-ppc8.pl: ...this; adjust the ppc-xlate.pl lookup.
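
A note on the rijndael-ppc8.pl change: the OpenSSL-derived assembly used
to load the AES round count itself, via

    lwz $rounds,480($key)

i.e. from a fixed offset inside the key structure it was originally
written for, which presumably does not match libgcrypt's
RIJNDAEL_context. The round count is therefore now an explicit argument,
and the glue code passes ctx->rounds, as in this call from the patch
below:

    aes_p8_cbc_encrypt (inbuf_arg, outbuf_arg, nblocks * 16,
                        &ctx->u1, iv, 1, ctx->rounds);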
---
cipher/Makefile.am | 3 +
cipher/rijndael-ppc8.S | 2 -
cipher/rijndael-ppc8.pl | 2 -
cipher/rijndael-ppc832.S | 1 -
cipher/rijndael-ppc8be.S | 1 -
cipher/rijndael.c | 248 +++++++++++++++++----
cipher/sha2-common.h | 94 ++++++++
cipher/sha256.c | 164 +++++---------
cipher/{sha512p8-ppc.pl => sha512-ppc8.pl} | 3 +
cipher/sha512.c | 84 ++-----
configure.ac | 3 +
11 files changed, 374 insertions(+), 231 deletions(-)
create mode 100644 cipher/sha2-common.h
rename cipher/{sha512p8-ppc.pl => sha512-ppc8.pl} (99%)
mode change 100755 => 100644
diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 495b2f6d..fe98fa3d 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -115,10 +115,13 @@ EXTRA_libcipher_la_SOURCES = \
sha256-armv8-aarch32-ce.S sha256-armv8-aarch64-ce.S \
sha256-intel-shaext.c \
sha512.c sha512-ssse3-amd64.S sha512-avx-amd64.S \
sha512-avx2-bmi2-amd64.S \
sha512-armv7-neon.S sha512-arm.S \
+ sha512-ppc8.pl \
+ sha512-ppc8.S sha512-ppc8be.S sha512-ppc832.S \
+ sha256-ppc8.S sha256-ppc8be.S sha256-ppc832.S \
sm3.c \
keccak.c keccak_permute_32.h keccak_permute_64.h keccak-armv7-neon.S \
stribog.c \
tiger.c \
whirlpool.c whirlpool-sse2-amd64.S \
diff --git a/cipher/rijndael-ppc8.S b/cipher/rijndael-ppc8.S
index 49db6f95..66c70bf7 100644
--- a/cipher/rijndael-ppc8.S
+++ b/cipher/rijndael-ppc8.S
@@ -501,11 +501,10 @@ aes_p8_cbc_encrypt:
vxor 6,6,3
vperm 4,4,5,6
neg 11,3
lvsr 10,0,6
- lwz 9,480(6)
lvsr 6,0,11
lvx 5,0,3
addi 3,3,15
vxor 6,6,3
@@ -1240,11 +1239,10 @@ aes_p8_ctr32_encrypt_blocks:
vperm 4,4,5,6
vsldoi 11,0,11,1
neg 11,3
lvsr 10,0,6
- lwz 9,480(6)
lvsr 6,0,11
lvx 5,0,3
addi 3,3,15
vxor 6,6,3
diff --git a/cipher/rijndael-ppc8.pl b/cipher/rijndael-ppc8.pl
index 00bf30d9..244780a6 100755
--- a/cipher/rijndael-ppc8.pl
+++ b/cipher/rijndael-ppc8.pl
@@ -524,11 +524,10 @@ $code.=<<___;
le?vxor $inpperm,$inpperm,$tmp
vperm $ivec,$ivec,$inptail,$inpperm
neg r11,$inp
?lvsl $keyperm,0,$key # prepare for unaligned key
- lwz $rounds,480($key)
lvsr $inpperm,0,r11 # prepare for unaligned load
lvx $inptail,0,$inp
addi $inp,$inp,15 # 15 is not typo
le?vxor $inpperm,$inpperm,$tmp
@@ -1285,11 +1284,10 @@ $code.=<<___;
vperm $ivec,$ivec,$inptail,$inpperm
vsldoi $one,$rndkey0,$one,1
neg r11,$inp
?lvsl $keyperm,0,$key # prepare for unaligned key
- lwz $rounds,480($key)
lvsr $inpperm,0,r11 # prepare for unaligned load
lvx $inptail,0,$inp
addi $inp,$inp,15 # 15 is not typo
le?vxor $inpperm,$inpperm,$tmp
diff --git a/cipher/rijndael-ppc832.S b/cipher/rijndael-ppc832.S
index 838083e7..bdf1f6ac 100644
--- a/cipher/rijndael-ppc832.S
+++ b/cipher/rijndael-ppc832.S
@@ -490,11 +490,10 @@ aes_p8_cbc_encrypt:
vperm 4,4,5,6
neg 11,3
lvsl 10,0,6
- lwz 9,480(6)
lvsr 6,0,11
lvx 5,0,3
addi 3,3,15
diff --git a/cipher/rijndael-ppc8be.S b/cipher/rijndael-ppc8be.S
index 4fd21131..abc246eb 100644
--- a/cipher/rijndael-ppc8be.S
+++ b/cipher/rijndael-ppc8be.S
@@ -519,11 +519,10 @@ aes_p8_cbc_encrypt:
vperm 4,4,5,6
neg 11,3
lvsl 10,0,6
- lwz 9,480(6)
lvsr 6,0,11
lvx 5,0,3
addi 3,3,15
diff --git a/cipher/rijndael.c b/cipher/rijndael.c
index e6e53b4f..2d1c38bf 100644
--- a/cipher/rijndael.c
+++ b/cipher/rijndael.c
@@ -226,10 +226,108 @@ static unsigned int _gcry_aes_ppc8_decrypt (const RIJNDAEL_context *ctx,
extern int aes_p8_set_encrypt_key (const unsigned char *userKey, const int bits,
RIJNDAEL_context *key);
extern int aes_p8_set_decrypt_key (const unsigned char *userKey, const int bits,
/* this is the decryption key part of context */
const unsigned (*)[15][4]);
+extern void aes_p8_cbc_encrypt (const unsigned char *in, unsigned char *out,
+ size_t length, const void *key,
+ unsigned char *ivec, const int enc, int rounds);
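+/* ENC selects encryption (1) or decryption (0); KEY must point at the
+ * matching half of the key schedule (ctx->u1 for encryption, ctx->u2
+ * for decryption). */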
+static void _gcry_aes_ppc8_cbc_dec (void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks) {
+ const RIJNDAEL_context *ctx = context;
+ aes_p8_cbc_encrypt (inbuf_arg, outbuf_arg, nblocks * 16, &ctx->u2, iv, 0, ctx->rounds);
+ return;
+}
+/* forward declaration */
+static int _gcry_aes_generic_cbc_enc (const void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks,
+ int cbc_mac);
+static void _gcry_aes_ppc8_cbc_enc (void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks, int cbc_mac) {
+ const RIJNDAEL_context *ctx = context;
+ /* CBC-MAC is the uncommon case; use the generic implementation for it. */
+ if (cbc_mac) {
+ _gcry_aes_generic_cbc_enc (context, iv, outbuf_arg, inbuf_arg, nblocks, cbc_mac);
+ return;
+ }
+ aes_p8_cbc_encrypt (inbuf_arg, outbuf_arg, nblocks * 16, &ctx->u1, iv, 1, ctx->rounds);
+ _gcry_burn_stack (16 * 8);
+ return;
+}
+extern void aes_p8_xts_encrypt(const unsigned char *inp, unsigned char *out,
+ size_t len, const void *key1,
+ const void *key2, const void *iv);
+extern void aes_p8_xts_decrypt(const unsigned char *inp, unsigned char *out,
+ size_t len, const void *key1,
+ const void *key2, const void *iv);
+void _gcry_aes_ppc8_xts_crypt (void *context, unsigned char *tweak,
+ void *outbuf_arg,
+ const void *inbuf_arg,
+ size_t nblocks, int encrypt) {
+ const RIJNDAEL_context *ctx = context;
+ if (encrypt)
+ aes_p8_xts_encrypt (inbuf_arg, outbuf_arg, nblocks * 16, &ctx->u1, NULL, tweak);
+ else
+ aes_p8_xts_decrypt (inbuf_arg, outbuf_arg, nblocks * 16, &ctx->u2, NULL, tweak);
+ _gcry_burn_stack (16 * 6);
+}
+extern void aes_p8_ctr32_encrypt_blocks (const unsigned char *in, unsigned char *out,
+ size_t len, const void *key,
+ const void *ivec, int unused, int rounds);
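+/* The ctr32 assembly increments only the low 32 bits of the big-endian
+ * counter block; the glue below carries into the upper 96 bits itself. */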
+static inline void add_be128(void *ctr, uint64_t add) {
+ uint64_t s[2];
+ s[0] = buf_get_be64((char*)ctr + 8);
+ s[1] = buf_get_be64((char*)ctr + 0);
+ s[0] += add;
+ if (s[0] < add) {
+ s[1]++;
+ buf_put_be64((char*)ctr + 0, s[1]);
+ }
+ buf_put_be64((char*)ctr + 8, s[0]);
+}
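+/* Bulk CTR strategy: encrypt up to the next 32-bit counter boundary,
+ * then in chunks of 2^32 blocks, then the remainder, updating the full
+ * 128-bit counter with add_be128 at each boundary. */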
+static void _gcry_aes_ppc8_ctr_enc (void *context, unsigned char *ctr,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks) {
+ const unsigned char *inbuf = inbuf_arg;
+ unsigned char *outbuf = outbuf_arg;
+ const RIJNDAEL_context *ctx = context;
+ const uint64_t two32 = 1ULL << 32;
+ int overflow;
+ u64 s[2];
+ s[0] = buf_get_be64(ctr + 8);
+ overflow = two32 - (s[0] % two32) < nblocks;
+ /* Overflow of the low 32 counter bits within one call is uncommon. */
+ if (overflow) {
+ u64 first_set = (two32 - (s[0] % two32)) % two32; /* u64 so that first_set * BLOCKSIZE cannot wrap */
+ aes_p8_ctr32_encrypt_blocks (inbuf, outbuf, first_set, &ctx->u1, ctr, /*unused*/0, ctx->rounds);
+ inbuf += first_set * BLOCKSIZE;
+ outbuf += first_set * BLOCKSIZE;
+ nblocks -= first_set;
+ add_be128(ctr, first_set);
+ while (nblocks > UINT32_MAX) {
+ aes_p8_ctr32_encrypt_blocks (inbuf, outbuf, two32, &ctx->u1, ctr, /*unused*/0, ctx->rounds);
+ inbuf += two32 * BLOCKSIZE;
+ outbuf += two32 * BLOCKSIZE;
+ nblocks -= two32;
+ add_be128(ctr, two32);
+ }
+ aes_p8_ctr32_encrypt_blocks (inbuf, outbuf, nblocks, &ctx->u1, ctr, /*unused*/0, ctx->rounds);
+
+ } else {
+ aes_p8_ctr32_encrypt_blocks (inbuf, outbuf, nblocks, &ctx->u1, ctr, /*unused*/0, ctx->rounds);
+ }
+ add_be128(ctr, nblocks);
+ _gcry_burn_stack (16 * 8);
+ return;
+}
#endif /*USE_PPC_ASM*/
static unsigned int do_encrypt (const RIJNDAEL_context *ctx, unsigned char *bx,
const unsigned char *ax);
static unsigned int do_decrypt (const RIJNDAEL_context *ctx, unsigned char *bx,
@@ -461,10 +559,16 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen,
ctx->encrypt_fn = _gcry_aes_ppc8_encrypt;
ctx->decrypt_fn = _gcry_aes_ppc8_decrypt;
ctx->prefetch_enc_fn = NULL;
ctx->prefetch_dec_fn = NULL;
ctx->use_ppc_asm = 1;
+ if (hd) {
+ hd->bulk.cbc_dec = _gcry_aes_ppc8_cbc_dec;
+ hd->bulk.cbc_enc = _gcry_aes_ppc8_cbc_enc;
+ hd->bulk.xts_crypt = _gcry_aes_ppc8_xts_crypt;
+ hd->bulk.ctr_enc = _gcry_aes_ppc8_ctr_enc;
+ }
}
#endif
else
{
ctx->encrypt_fn = do_encrypt;
@@ -924,10 +1028,48 @@ _gcry_aes_cfb_enc (void *context, unsigned char *iv,
if (burn_depth)
_gcry_burn_stack (burn_depth + 4 * sizeof(void *));
}
+
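+/* Generic CBC encryption, split out of _gcry_aes_cbc_enc below so the
+ * PowerPC bulk path can fall back to it for the CBC-MAC case. */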
+static int _gcry_aes_generic_cbc_enc (const void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks,
+ int cbc_mac)
+{
+ const RIJNDAEL_context *ctx = context;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ rijndael_cryptfn_t encrypt_fn = ctx->encrypt_fn;
+ int burn_depth = 0;
+
+ unsigned char *last_iv = iv;
+
+ if (ctx->prefetch_enc_fn)
+ ctx->prefetch_enc_fn();
+
+ for ( ;nblocks; nblocks-- )
+ {
+ cipher_block_xor(outbuf, inbuf, last_iv, BLOCKSIZE);
+
+ burn_depth = encrypt_fn (ctx, outbuf, outbuf);
+
+ last_iv = outbuf;
+ inbuf += BLOCKSIZE;
+ if (!cbc_mac)
+ outbuf += BLOCKSIZE;
+ }
+
+ if (last_iv != iv)
+ cipher_block_cpy (iv, last_iv, BLOCKSIZE);
+
+ if (burn_depth)
+ _gcry_burn_stack (burn_depth + 4 * sizeof(void *));
+
+ return 0;
+}
+
/* Bulk encryption of complete blocks in CBC mode. Caller needs to
make sure that IV is aligned on an unsigned long boundary. This
function is only intended for the bulk encryption feature of
cipher.c. */
void
@@ -936,11 +1078,10 @@ _gcry_aes_cbc_enc (void *context, unsigned char *iv,
size_t nblocks, int cbc_mac)
{
RIJNDAEL_context *ctx = context;
unsigned char *outbuf = outbuf_arg;
const unsigned char *inbuf = inbuf_arg;
- unsigned char *last_iv;
unsigned int burn_depth = 0;
if (0)
;
#ifdef USE_AESNI
@@ -961,34 +1102,22 @@ _gcry_aes_cbc_enc (void *context, unsigned char *iv,
else if (ctx->use_arm_ce)
{
_gcry_aes_armv8_ce_cbc_enc (ctx, iv, outbuf, inbuf, nblocks, cbc_mac);
return;
}
+#endif /*USE_ARM_CE*/
+#ifdef USE_PPC_ASM
+ else if (ctx->use_ppc_asm)
+ {
+ _gcry_aes_ppc8_cbc_enc (ctx, iv, outbuf, inbuf, nblocks, cbc_mac);
+ return;
+ }
-#endif /*USE_ARM_CE*/
+#endif /*USE_PPC_ASM*/
else
{
- rijndael_cryptfn_t encrypt_fn = ctx->encrypt_fn;
-
- if (ctx->prefetch_enc_fn)
- ctx->prefetch_enc_fn();
-
- last_iv = iv;
-
- for ( ;nblocks; nblocks-- )
- {
- cipher_block_xor(outbuf, inbuf, last_iv, BLOCKSIZE);
-
- burn_depth = encrypt_fn (ctx, outbuf, outbuf);
-
- last_iv = outbuf;
- inbuf += BLOCKSIZE;
- if (!cbc_mac)
- outbuf += BLOCKSIZE;
- }
-
- if (last_iv != iv)
- cipher_block_cpy (iv, last_iv, BLOCKSIZE);
+ _gcry_aes_generic_cbc_enc (ctx, iv, outbuf, inbuf, nblocks, cbc_mac);
+ return;
}
if (burn_depth)
_gcry_burn_stack (burn_depth + 4 * sizeof(void *));
}
@@ -1030,10 +1159,17 @@ _gcry_aes_ctr_enc (void *context, unsigned char *ctr,
{
_gcry_aes_armv8_ce_ctr_enc (ctx, ctr, outbuf, inbuf, nblocks);
return;
}
#endif /*USE_ARM_CE*/
+#ifdef USE_PPC_ASM
+ else if (ctx->use_ppc_asm)
+ {
+ _gcry_aes_ppc8_ctr_enc (ctx, ctr, outbuf, inbuf, nblocks);
+ return;
+ }
+#endif /*USE_PPC_ASM*/
else
{
union { unsigned char x1[16] ATTR_ALIGNED_16; u32 x32[4]; } tmp;
rijndael_cryptfn_t encrypt_fn = ctx->encrypt_fn;
@@ -1293,10 +1429,44 @@ _gcry_aes_cfb_dec (void *context, unsigned char *iv,
if (burn_depth)
_gcry_burn_stack (burn_depth + 4 * sizeof(void *));
}
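+/* Generic CBC decryption, moved out of _gcry_aes_cbc_dec with its
+ * logic unchanged so that hardware-specific paths can share it. */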
+static void
+_gcry_aes_generic_cbc_dec (void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks)
+{
+ RIJNDAEL_context *ctx = context;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ unsigned char savebuf[BLOCKSIZE] ATTR_ALIGNED_16;
+ unsigned burn_depth = 0;
+ rijndael_cryptfn_t decrypt_fn = ctx->decrypt_fn;
+
+ check_decryption_preparation (ctx);
+
+ if (ctx->prefetch_dec_fn)
+ ctx->prefetch_dec_fn();
+
+ for ( ;nblocks; nblocks-- )
+ {
+ /* INBUF is needed later and it may be identical to OUTBUF, so store
+ the intermediate result to SAVEBUF. */
+
+ burn_depth = decrypt_fn (ctx, savebuf, inbuf);
+
+ cipher_block_xor_n_copy_2(outbuf, savebuf, iv, inbuf, BLOCKSIZE);
+ inbuf += BLOCKSIZE;
+ outbuf += BLOCKSIZE;
+ }
+
+ wipememory(savebuf, sizeof(savebuf));
+
+ if (burn_depth)
+ _gcry_burn_stack (burn_depth + 4 * sizeof(void *));
+}
/* Bulk decryption of complete blocks in CBC mode. Caller needs to
make sure that IV is aligned on an unsigned long boundary. This
function is only intended for the bulk encryption feature of
cipher.c. */
@@ -1306,11 +1476,10 @@ _gcry_aes_cbc_dec (void *context, unsigned char *iv,
size_t nblocks)
{
RIJNDAEL_context *ctx = context;
unsigned char *outbuf = outbuf_arg;
const unsigned char *inbuf = inbuf_arg;
- unsigned int burn_depth = 0;
if (0)
;
#ifdef USE_AESNI
else if (ctx->use_aesni)
@@ -1333,35 +1502,13 @@ _gcry_aes_cbc_dec (void *context, unsigned char *iv,
return;
}
#endif /*USE_ARM_CE*/
else
{
- unsigned char savebuf[BLOCKSIZE] ATTR_ALIGNED_16;
- rijndael_cryptfn_t decrypt_fn = ctx->decrypt_fn;
-
- check_decryption_preparation (ctx);
-
- if (ctx->prefetch_dec_fn)
- ctx->prefetch_dec_fn();
-
- for ( ;nblocks; nblocks-- )
- {
- /* INBUF is needed later and it may be identical to OUTBUF, so store
- the intermediate result to SAVEBUF. */
-
- burn_depth = decrypt_fn (ctx, savebuf, inbuf);
-
- cipher_block_xor_n_copy_2(outbuf, savebuf, iv, inbuf, BLOCKSIZE);
- inbuf += BLOCKSIZE;
- outbuf += BLOCKSIZE;
- }
-
- wipememory(savebuf, sizeof(savebuf));
+ _gcry_aes_generic_cbc_dec (ctx, iv, outbuf, inbuf, nblocks);
+ return;
}
-
- if (burn_depth)
- _gcry_burn_stack (burn_depth + 4 * sizeof(void *));
}
/* Bulk encryption/decryption of complete blocks in OCB mode. */
@@ -1549,10 +1696,17 @@ _gcry_aes_xts_crypt (void *context, unsigned char *tweak,
{
_gcry_aes_armv8_ce_xts_crypt (ctx, tweak, outbuf, inbuf, nblocks, encrypt);
return;
}
#endif /*USE_ARM_CE*/
+#ifdef USE_PPC_ASM
+ else if (ctx->use_ppc_asm)
+ {
+ _gcry_aes_ppc8_xts_crypt (ctx, tweak, outbuf, inbuf, nblocks, encrypt);
+ return;
+ }
+#endif /*USE_PPC_ASM*/
else
{
if (encrypt)
{
if (ctx->prefetch_enc_fn)
diff --git a/cipher/sha2-common.h b/cipher/sha2-common.h
new file mode 100644
index 00000000..9ad67c95
--- /dev/null
+++ b/cipher/sha2-common.h
@@ -0,0 +1,94 @@
+/* SHA2 for GnuPG
+ * Copyright (C) 2000, 2001, 2002, 2003, 2007,
+ * 2008, 2011, 2012 Free Software Foundation, Inc.
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef G10_SHA2_COMMON_H
+#define G10_SHA2_COMMON_H
+
+/* USE_ARM_NEON_ASM indicates whether to enable ARM NEON assembly code. */
+#undef USE_ARM_NEON_ASM
+#ifdef ENABLE_NEON_SUPPORT
+# if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \
+ && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \
+ && defined(HAVE_GCC_INLINE_ASM_NEON)
+# define USE_ARM_NEON_ASM 1
+# endif
+#endif /*ENABLE_NEON_SUPPORT*/
+
+
+/* USE_ARM_ASM indicates whether to enable ARM assembly code. */
+#undef USE_ARM_ASM
+#if defined(__ARMEL__) && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS)
+# define USE_ARM_ASM 1
+#endif
+
+
+/* USE_SSSE3 indicates whether to compile with Intel SSSE3 code. */
+#undef USE_SSSE3
+#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_SSSE3) && \
+ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \
+ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+# define USE_SSSE3 1
+#endif
+
+
+/* USE_AVX indicates whether to compile with Intel AVX code. */
+#undef USE_AVX
+#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX) && \
+ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \
+ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+# define USE_AVX 1
+#endif
+
+
+/* USE_AVX2 indicates whether to compile with Intel AVX2/rorx code. */
+#undef USE_AVX2
+#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX2) && \
+ defined(HAVE_GCC_INLINE_ASM_BMI2) && \
+ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \
+ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+# define USE_AVX2 1
+#endif
+
+/* USE_SHAEXT indicates whether to compile with Intel SHA Extension code. */
+#undef USE_SHAEXT
+#if defined(HAVE_GCC_INLINE_ASM_SHAEXT) && \
+    defined(HAVE_GCC_INLINE_ASM_SSE41) && \
+    defined(ENABLE_SHAEXT_SUPPORT)
+# define USE_SHAEXT 1
+#endif
+
+/* USE_ARM_CE indicates whether to enable ARMv8 Crypto Extension assembly
+ * code. */
+#undef USE_ARM_CE
+#ifdef ENABLE_ARM_CRYPTO_SUPPORT
+# if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \
+     && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \
+     && defined(HAVE_GCC_INLINE_ASM_AARCH32_CRYPTO)
+#  define USE_ARM_CE 1
+# elif defined(__AARCH64EL__) \
+       && defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) \
+       && defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO)
+#  define USE_ARM_CE 1
+# endif
+#endif
+
+/* USE_PPC_ASM indicates whether to compile with PowerISA 2.07 crypto support */
+#undef USE_PPC_ASM
+#ifdef ENABLE_PPC_CRYPTO_SUPPORT
+# if defined(__powerpc64__) || defined(__powerpc__)
+# define USE_PPC_ASM 1
+# endif
+#endif
+
+/* The AMD64 assembly implementations use the SystemV ABI; ABI conversion
+ * and additional stack to store XMM6-XMM15 are needed on Win64. */
+#undef ASM_FUNC_ABI
+#undef ASM_EXTRA_STACK
+#if defined(USE_SSSE3) || defined(USE_AVX) || defined(USE_AVX2) || \
+ defined(USE_SHAEXT)
+# ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+# define ASM_FUNC_ABI __attribute__((sysv_abi))
+# define ASM_EXTRA_STACK (10 * 16 + sizeof(void *) * 4)
+# else
+# define ASM_FUNC_ABI
+# define ASM_EXTRA_STACK 0
+# endif
+#endif
+#endif
\ No newline at end of file
diff --git a/cipher/sha256.c b/cipher/sha256.c
index 6c683348..957b4d78 100644
--- a/cipher/sha256.c
+++ b/cipher/sha256.c
@@ -43,86 +43,25 @@
#include "g10lib.h"
#include "bithelp.h"
#include "bufhelp.h"
#include "cipher.h"
#include "hash-common.h"
+#include "sha2-common.h"
-
-/* USE_SSSE3 indicates whether to compile with Intel SSSE3 code. */
-#undef USE_SSSE3
-#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_SSSE3) && \
- defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \
- (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
- defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
-# define USE_SSSE3 1
-#endif
-
-/* USE_AVX indicates whether to compile with Intel AVX code. */
-#undef USE_AVX
-#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX) && \
- defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \
- (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
- defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
-# define USE_AVX 1
+/* Helper macro to force alignment to 16 bytes. */
+#ifdef HAVE_GCC_ATTRIBUTE_ALIGNED
+# define ATTR_ALIGNED_16 __attribute__ ((aligned (16)))
+#else
+# define ATTR_ALIGNED_16
#endif
-/* USE_AVX2 indicates whether to compile with Intel AVX2/BMI2 code. */
-#undef USE_AVX2
-#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX2) && \
- defined(HAVE_GCC_INLINE_ASM_BMI2) && \
- defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \
- (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
- defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
-# define USE_AVX2 1
-#endif
-
-/* USE_SHAEXT indicates whether to compile with Intel SHA Extension code. */
-#undef USE_SHAEXT
-#if defined(HAVE_GCC_INLINE_ASM_SHAEXT) && \
- defined(HAVE_GCC_INLINE_ASM_SSE41) && \
- defined(ENABLE_SHAEXT_SUPPORT)
-# define USE_SHAEXT 1
-#endif
-
-/* USE_ARM_CE indicates whether to enable ARMv8 Crypto Extension assembly
- * code. */
-#undef USE_ARM_CE
-#ifdef ENABLE_ARM_CRYPTO_SUPPORT
-# if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \
- && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \
- && defined(HAVE_GCC_INLINE_ASM_AARCH32_CRYPTO)
-# define USE_ARM_CE 1
-# elif defined(__AARCH64EL__) \
- && defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) \
- && defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO)
-# define USE_ARM_CE 1
-# endif
-#endif
-
-
typedef struct {
gcry_md_block_ctx_t bctx;
- u32 h0,h1,h2,h3,h4,h5,h6,h7;
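+ /* Keep the state 16-byte aligned: &h0 is handed to the assembly
+ * transforms, which presumably expect an aligned state pointer. */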
+ u32 h0 ATTR_ALIGNED_16;
+ u32 h1,h2,h3,h4,h5,h6,h7;
} SHA256_CONTEXT;
-
-/* Assembly implementations use SystemV ABI, ABI conversion and additional
- * stack to store XMM6-XMM15 needed on Win64. */
-#undef ASM_FUNC_ABI
-#undef ASM_EXTRA_STACK
-#if defined(USE_SSSE3) || defined(USE_AVX) || defined(USE_AVX2) || \
- defined(USE_SHAEXT)
-# ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
-# define ASM_FUNC_ABI __attribute__((sysv_abi))
-# define ASM_EXTRA_STACK (10 * 16 + sizeof(void *) * 4)
-# else
-# define ASM_FUNC_ABI
-# define ASM_EXTRA_STACK 0
-# endif
-#endif
-
-
#ifdef USE_SSSE3
unsigned int _gcry_sha256_transform_amd64_ssse3(const void *input_data,
u32 state[8],
size_t num_blks) ASM_FUNC_ABI;
@@ -194,37 +133,35 @@ do_sha256_transform_armv8_ce(void *ctx, const unsigned char *data,
SHA256_CONTEXT *hd = ctx;
return _gcry_sha256_transform_armv8_ce (&hd->h0, data, nblks);
}
#endif
+#ifdef USE_PPC_ASM
+void sha256_block_p8 (u32 state[8],
+ const unsigned char *data,
+ size_t nblks);
+static unsigned int
+do_sha256_transform_ppc8 (void *ctx, const unsigned char *data,
+ size_t nblks)
+{
+ SHA256_CONTEXT *hd = ctx;
+ sha256_block_p8 (&hd->h0, data, nblks);
+ return 128; /* stack burn depth: the asm uses up to 128 bytes of stack */
+}
+#endif
static unsigned int
do_transform_generic (void *ctx, const unsigned char *data, size_t nblks);
-
static void
-sha256_init (void *context, unsigned int flags)
+sha256_init_common (void *context, unsigned int flags)
{
SHA256_CONTEXT *hd = context;
unsigned int features = _gcry_get_hw_features ();
(void)flags;
- hd->h0 = 0x6a09e667;
- hd->h1 = 0xbb67ae85;
- hd->h2 = 0x3c6ef372;
- hd->h3 = 0xa54ff53a;
- hd->h4 = 0x510e527f;
- hd->h5 = 0x9b05688c;
- hd->h6 = 0x1f83d9ab;
- hd->h7 = 0x5be0cd19;
-
- hd->bctx.nblocks = 0;
- hd->bctx.nblocks_high = 0;
- hd->bctx.count = 0;
- hd->bctx.blocksize = 64;
-
/* Order of feature checks is important here; last match will be
* selected. Keep slower implementations at the top and faster at
* the bottom. */
hd->bctx.bwrite = do_transform_generic;
#ifdef USE_SSSE3
@@ -246,20 +183,47 @@ sha256_init (void *context, unsigned int flags)
hd->bctx.bwrite = do_sha256_transform_intel_shaext;
#endif
#ifdef USE_ARM_CE
if ((features & HWF_ARM_SHA2) != 0)
hd->bctx.bwrite = do_sha256_transform_armv8_ce;
+#endif
+#ifdef USE_PPC_ASM
+ if ((features & HWF_PPC_VCRYPTO) != 0)
+ hd->bctx.bwrite = do_sha256_transform_ppc8;
#endif
(void)features;
}
+static void
+sha256_init (void *context, unsigned int flags)
+{
+ SHA256_CONTEXT *hd = context;
+
+ (void)flags;
+
+ hd->h0 = 0x6a09e667;
+ hd->h1 = 0xbb67ae85;
+ hd->h2 = 0x3c6ef372;
+ hd->h3 = 0xa54ff53a;
+ hd->h4 = 0x510e527f;
+ hd->h5 = 0x9b05688c;
+ hd->h6 = 0x1f83d9ab;
+ hd->h7 = 0x5be0cd19;
+
+ hd->bctx.nblocks = 0;
+ hd->bctx.nblocks_high = 0;
+ hd->bctx.count = 0;
+ hd->bctx.blocksize = 64;
+
+ sha256_init_common (context, flags);
+}
+
static void
sha224_init (void *context, unsigned int flags)
{
SHA256_CONTEXT *hd = context;
- unsigned int features = _gcry_get_hw_features ();
(void)flags;
hd->h0 = 0xc1059ed8;
hd->h1 = 0x367cd507;
@@ -273,37 +237,11 @@ sha224_init (void *context, unsigned int flags)
hd->bctx.nblocks = 0;
hd->bctx.nblocks_high = 0;
hd->bctx.count = 0;
hd->bctx.blocksize = 64;
- /* Order of feature checks is important here; last match will be
- * selected. Keep slower implementations at the top and faster at
- * the bottom. */
- hd->bctx.bwrite = do_transform_generic;
-#ifdef USE_SSSE3
- if ((features & HWF_INTEL_SSSE3) != 0)
- hd->bctx.bwrite = do_sha256_transform_amd64_ssse3;
-#endif
-#ifdef USE_AVX
- /* AVX implementation uses SHLD which is known to be slow on non-Intel CPUs.
- * Therefore use this implementation on Intel CPUs only. */
- if ((features & HWF_INTEL_AVX) && (features & HWF_INTEL_FAST_SHLD))
- hd->bctx.bwrite = do_sha256_transform_amd64_avx;
-#endif
-#ifdef USE_AVX2
- if ((features & HWF_INTEL_AVX2) && (features & HWF_INTEL_BMI2))
- hd->bctx.bwrite = do_sha256_transform_amd64_avx2;
-#endif
-#ifdef USE_SHAEXT
- if ((features & HWF_INTEL_SHAEXT) && (features & HWF_INTEL_SSE4_1))
- hd->bctx.bwrite = do_sha256_transform_intel_shaext;
-#endif
-#ifdef USE_ARM_CE
- if ((features & HWF_ARM_SHA2) != 0)
- hd->bctx.bwrite = do_sha256_transform_armv8_ce;
-#endif
- (void)features;
+ sha256_init_common (context, flags);
}
/*
Transform the message X which consists of 16 32-bit-words. See FIPS
diff --git a/cipher/sha512p8-ppc.pl b/cipher/sha512-ppc8.pl
old mode 100755
new mode 100644
similarity index 99%
rename from cipher/sha512p8-ppc.pl
rename to cipher/sha512-ppc8.pl
index 811d85e2..21fe1e61
--- a/cipher/sha512p8-ppc.pl
+++ b/cipher/sha512-ppc8.pl
@@ -1,7 +1,9 @@
#! /usr/bin/env perl
# SPDX-License-Identifier: BSD-3-Clause
+#
+# Changes (from the OpenSSL original): also look for ppc-xlate.pl under
+# its libgcrypt name, asm-common-ppc.pl.
# ====================================================================
# Written by Andy Polyakov <appro at openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
@@ -52,10 +54,11 @@ if ($flavour =~ /64/) {
} else { die "nonsense $flavour"; }
$LENDIAN=($flavour=~/le/);
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}asm-common-ppc.pl" and -f $xlate ) or
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";
open STDOUT,"| $^X $xlate $flavour $output" || die "can't call $xlate: $!";
diff --git a/cipher/sha512.c b/cipher/sha512.c
index 1a808f88..f5c75eb6 100644
--- a/cipher/sha512.c
+++ b/cipher/sha512.c
@@ -51,59 +51,11 @@
#include "g10lib.h"
#include "bithelp.h"
#include "bufhelp.h"
#include "cipher.h"
#include "hash-common.h"
-
-
-/* USE_ARM_NEON_ASM indicates whether to enable ARM NEON assembly code. */
-#undef USE_ARM_NEON_ASM
-#ifdef ENABLE_NEON_SUPPORT
-# if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) \
- && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) \
- && defined(HAVE_GCC_INLINE_ASM_NEON)
-# define USE_ARM_NEON_ASM 1
-# endif
-#endif /*ENABLE_NEON_SUPPORT*/
-
-
-/* USE_ARM_ASM indicates whether to enable ARM assembly code. */
-#undef USE_ARM_ASM
-#if defined(__ARMEL__) && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS)
-# define USE_ARM_ASM 1
-#endif
-
-
-/* USE_SSSE3 indicates whether to compile with Intel SSSE3 code. */
-#undef USE_SSSE3
-#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_SSSE3) && \
- defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \
- (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
- defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
-# define USE_SSSE3 1
-#endif
-
-
-/* USE_AVX indicates whether to compile with Intel AVX code. */
-#undef USE_AVX
-#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX) && \
- defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \
- (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
- defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
-# define USE_AVX 1
-#endif
-
-
-/* USE_AVX2 indicates whether to compile with Intel AVX2/rorx code. */
-#undef USE_AVX2
-#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX2) && \
- defined(HAVE_GCC_INLINE_ASM_BMI2) && \
- defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \
- (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
- defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
-# define USE_AVX2 1
-#endif
+#include "sha2-common.h"
typedef struct
{
u64 h0, h1, h2, h3, h4, h5, h6, h7;
@@ -158,26 +110,10 @@ static const u64 k[] =
U64_C(0x3c9ebe0a15c9bebc), U64_C(0x431d67c49c100d4c),
U64_C(0x4cc5d4becb3e42b6), U64_C(0x597f299cfc657e2a),
U64_C(0x5fcb6fab3ad6faec), U64_C(0x6c44198c4a475817)
};
-
-/* AMD64 assembly implementations use SystemV ABI, ABI conversion and additional
- * stack to store XMM6-XMM15 needed on Win64. */
-#undef ASM_FUNC_ABI
-#undef ASM_EXTRA_STACK
-#if defined(USE_SSSE3) || defined(USE_AVX) || defined(USE_AVX2)
-# ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
-# define ASM_FUNC_ABI __attribute__((sysv_abi))
-# define ASM_EXTRA_STACK (10 * 16 + 4 * sizeof(void *))
-# else
-# define ASM_FUNC_ABI
-# define ASM_EXTRA_STACK 0
-# endif
-#endif
-
-
#ifdef USE_ARM_NEON_ASM
unsigned int _gcry_sha512_transform_armv7_neon (SHA512_STATE *hd,
const unsigned char *data,
const u64 k[], size_t num_blks);
@@ -188,10 +124,24 @@ do_sha512_transform_armv7_neon(void *ctx, const unsigned char *data,
SHA512_CONTEXT *hd = ctx;
return _gcry_sha512_transform_armv7_neon (&hd->state, data, k, nblks);
}
#endif
+#ifdef USE_PPC_ASM
+void sha512_block_p8 (SHA512_STATE *hd,
+ const unsigned char *data,
+ size_t nblks);
+static unsigned int
+do_sha512_transform_ppc8 (void *ctx, const unsigned char *data,
+ size_t nblks)
+{
+ SHA512_CONTEXT *hd = ctx;
+ sha512_block_p8 (&hd->state, data, nblks);
+ return 128; /* stack burn depth: the asm uses up to 128 bytes of stack */
+}
+#endif
+
#ifdef USE_SSSE3
unsigned int _gcry_sha512_transform_amd64_ssse3(const void *input_data,
void *state,
size_t num_blks) ASM_FUNC_ABI;
@@ -272,10 +222,14 @@ sha512_init_common (SHA512_CONTEXT *ctx, unsigned int flags)
ctx->bctx.bwrite = do_transform_generic;
#ifdef USE_ARM_NEON_ASM
if ((features & HWF_ARM_NEON) != 0)
ctx->bctx.bwrite = do_sha512_transform_armv7_neon;
#endif
+#ifdef USE_PPC_ASM
+ if ((features & HWF_PPC_VCRYPTO) != 0)
+ ctx->bctx.bwrite = do_sha512_transform_ppc8;
+#endif
#ifdef USE_SSSE3
if ((features & HWF_INTEL_SSSE3) != 0)
ctx->bctx.bwrite = do_sha512_transform_amd64_ssse3;
#endif
#ifdef USE_AVX
diff --git a/configure.ac b/configure.ac
index e65ce280..2d8503ac 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2239,20 +2239,23 @@ if test "$found" = "1" ; then
GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-armv8-aarch64-ce.lo"
;;
powerpc64le-*-*)
# Build with the crypto extension implementation
GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-ppc8.lo"
+ GCRYPT_CIPHERS="$GCRYPT_CIPHERS sha512-ppc8.lo sha256-ppc8.lo"
;;
powerpc64-*-*)
# Big-Endian.
# Build with the crypto extension implementation
GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-ppc8be.lo"
+ GCRYPT_CIPHERS="$GCRYPT_CIPHERS sha512-ppc8be.lo sha256-ppc8be.lo"
;;
powerpc-*-*)
# Big-Endian.
# Build with the crypto extension implementation
GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-ppc832.lo"
+ GCRYPT_CIPHERS="$GCRYPT_CIPHERS sha512-ppc832.lo sha256-ppc832.lo"
;;
esac
case "$mpi_cpu_arch" in
x86)
--
2.20.1