From jussi.kivilinna at iki.fi Sat Oct 1 08:15:12 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 1 Oct 2022 09:15:12 +0300 Subject: Unintended Variable Length Array in ec-nist.c In-Reply-To: <20220930113606.GE7424@yoink.cs.uwaterloo.ca> References: <87a66lvdql.fsf@akagi.fsij.org> <3a5400f0-39c7-d7af-e5c7-63259b8d8ec6@iki.fi> <87k05ls2hz.fsf@akagi.fsij.org> <20220930113606.GE7424@yoink.cs.uwaterloo.ca> Message-ID: On 30.9.2022 14.36, Ian Goldberg via Gcrypt-devel wrote: > On Fri, Sep 30, 2022 at 05:14:16PM +0900, NIIBE Yutaka wrote: >> Thank you for your quick response. >> >> Jussi Kivilinna wrote: >>> How about instead define arrays with wanted size and define 'wsize' with >>> sizeof the array. This would avoid having macros. For example like this: >>> >>> index 69b05a6d..0de41e48 100644 >>> --- a/mpi/ec-nist.c >>> +++ b/mpi/ec-nist.c >>> @@ -94,9 +94,9 @@ _gcry_mpi_ec_nist192_mod (gcry_mpi_t w, mpi_ec_t ctx) >>> }; >>> const mpi_limb64_t zero = LIMB_TO64(0); >>> mpi_ptr_t wp; >>> - mpi_size_t wsize = 192 / BITS_PER_MPI_LIMB64; >>> - mpi_limb64_t s[wsize + 1]; >>> - mpi_limb64_t o[wsize + 1]; >>> + mpi_limb64_t s[192 / BITS_PER_MPI_LIMB64 + 1]; >>> + mpi_limb64_t o[sizeof(s)]; > > Note that sizeof(s) is the number of *bytes* of s, not the number of > *elements* of s, so the above new code will declare o to be much larger > than the old code did. Thanks, I somehow missed that. The next line in my example used the DIM macro, which gives the number of elements in an array. + const mpi_size_t wsize = DIM(s) - 1; Just need to change to use DIM for array definitions too. -Jussi From jussi.kivilinna at iki.fi Sat Oct 1 09:48:21 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 1 Oct 2022 10:48:21 +0300 Subject: [PATCH 3/5] tests/bench-slope: remove VLA usage In-Reply-To: <20221001074823.201798-1-jussi.kivilinna@iki.fi> References: <20221001074823.201798-1-jussi.kivilinna@iki.fi> Message-ID: <20221001074823.201798-3-jussi.kivilinna@iki.fi> * tests/bench-slope.c (bench_set_cipher_key): New. (bench_encrypt_init, bench_xts_encrypt_init): Use 'bench_set_cipher_key' to remove VLA usage. 
-- Signed-off-by: Jussi Kivilinna --- tests/bench-slope.c | 59 ++++++++++++++++++++++++--------------------- 1 file changed, 31 insertions(+), 28 deletions(-) diff --git a/tests/bench-slope.c b/tests/bench-slope.c index aaddaa85..1cad6813 100644 --- a/tests/bench-slope.c +++ b/tests/bench-slope.c @@ -982,6 +982,35 @@ struct bench_cipher_mode }; +static void +bench_set_cipher_key (gcry_cipher_hd_t hd, int keylen) +{ + char *key; + int err, i; + + key = malloc (keylen); + if (!key) + { + fprintf (stderr, PGM ": couldn't allocate %d bytes\n", keylen); + gcry_cipher_close (hd); + exit (1); + } + + for (i = 0; i < keylen; i++) + key[i] = 0x33 ^ (11 - i); + + err = gcry_cipher_setkey (hd, key, keylen); + free (key); + if (err) + { + fprintf (stderr, PGM ": gcry_cipher_setkey failed: %s\n", + gpg_strerror (err)); + gcry_cipher_close (hd); + exit (1); + } +} + + static int bench_encrypt_init (struct bench_obj *obj) { @@ -1010,20 +1039,7 @@ bench_encrypt_init (struct bench_obj *obj) if (keylen) { - char key[keylen]; - int i; - - for (i = 0; i < keylen; i++) - key[i] = 0x33 ^ (11 - i); - - err = gcry_cipher_setkey (hd, key, keylen); - if (err) - { - fprintf (stderr, PGM ": gcry_cipher_setkey failed: %s\n", - gpg_strerror (err)); - gcry_cipher_close (hd); - exit (1); - } + bench_set_cipher_key (hd, keylen); } else { @@ -1119,20 +1135,7 @@ bench_xts_encrypt_init (struct bench_obj *obj) keylen = gcry_cipher_get_algo_keylen (mode->algo) * 2; if (keylen) { - char key[keylen]; - int i; - - for (i = 0; i < keylen; i++) - key[i] = 0x33 ^ (11 - i); - - err = gcry_cipher_setkey (hd, key, keylen); - if (err) - { - fprintf (stderr, PGM ": gcry_cipher_setkey failed: %s\n", - gpg_strerror (err)); - gcry_cipher_close (hd); - exit (1); - } + bench_set_cipher_key (hd, keylen); } else { -- 2.34.1 From jussi.kivilinna at iki.fi Sat Oct 1 09:48:23 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 1 Oct 2022 10:48:23 +0300 Subject: [PATCH 5/5] =?UTF-8?q?t-rsa-testparm:=20fix=20'function=20declara?= =?UTF-8?q?tion=20isn=E2=80=99t=20a=20prototype'=20warning?= In-Reply-To: <20221001074823.201798-1-jussi.kivilinna@iki.fi> References: <20221001074823.201798-1-jussi.kivilinna@iki.fi> Message-ID: <20221001074823.201798-5-jussi.kivilinna@iki.fi> * cipher/t-rsa-testparm.c (check_rsa_testparm): Define parameters as void. -- Signed-off-by: Jussi Kivilinna --- tests/t-rsa-testparm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/t-rsa-testparm.c b/tests/t-rsa-testparm.c index 65617855..d62d9abf 100644 --- a/tests/t-rsa-testparm.c +++ b/tests/t-rsa-testparm.c @@ -32,7 +32,7 @@ static void -check_rsa_testparm () +check_rsa_testparm (void) { gpg_error_t err; gcry_sexp_t keyspec = NULL; -- 2.34.1 From jussi.kivilinna at iki.fi Sat Oct 1 09:48:19 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 1 Oct 2022 10:48:19 +0300 Subject: [PATCH 1/5] mpi/ec: remove VLA usage Message-ID: <20221001074823.201798-1-jussi.kivilinna@iki.fi> * mpi/ec-nist.c (_gcry_mpi_ec_nist192_mod, _gcry_mpi_ec_nist224_mod) (_gcry_mpi_ec_nist256_mod, _gcry_mpi_ec_nist384_mod) (_gcry_mpi_ec_nist521_mod): Avoid VLA for arrays on stack. * mpi/ec.c (ec_secp256k1_mod): Avoid VLA for arrays on stack. 
-- Signed-off-by: Jussi Kivilinna --- mpi/ec-nist.c | 40 ++++++++++++++++++++-------------------- mpi/ec.c | 6 +++--- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/mpi/ec-nist.c b/mpi/ec-nist.c index 69b05a6d..14e3c3ab 100644 --- a/mpi/ec-nist.c +++ b/mpi/ec-nist.c @@ -94,9 +94,9 @@ _gcry_mpi_ec_nist192_mod (gcry_mpi_t w, mpi_ec_t ctx) }; const mpi_limb64_t zero = LIMB_TO64(0); mpi_ptr_t wp; - mpi_size_t wsize = 192 / BITS_PER_MPI_LIMB64; - mpi_limb64_t s[wsize + 1]; - mpi_limb64_t o[wsize + 1]; + mpi_limb64_t s[192 / BITS_PER_MPI_LIMB64 + 1]; + mpi_limb64_t o[DIM(s)]; + const mpi_size_t wsize = DIM(s) - 1; mpi_limb_t mask1; mpi_limb_t mask2; mpi_limb_t s_is_negative; @@ -186,10 +186,10 @@ _gcry_mpi_ec_nist224_mod (gcry_mpi_t w, mpi_ec_t ctx) }; const mpi_limb64_t zero = LIMB_TO64(0); mpi_ptr_t wp; - mpi_size_t wsize = (224 + BITS_PER_MPI_LIMB64 - 1) / BITS_PER_MPI_LIMB64; + mpi_limb64_t s[(224 + BITS_PER_MPI_LIMB64 - 1) / BITS_PER_MPI_LIMB64]; + mpi_limb64_t d[DIM(s)]; + const mpi_size_t wsize = DIM(s); mpi_size_t psize = ctx->p->nlimbs; - mpi_limb64_t s[wsize]; - mpi_limb64_t d[wsize]; mpi_limb_t mask1; mpi_limb_t mask2; mpi_limb_t s_is_negative; @@ -345,12 +345,12 @@ _gcry_mpi_ec_nist256_mod (gcry_mpi_t w, mpi_ec_t ctx) }; const mpi_limb64_t zero = LIMB_TO64(0); mpi_ptr_t wp; - mpi_size_t wsize = (256 + BITS_PER_MPI_LIMB64 - 1) / BITS_PER_MPI_LIMB64; + mpi_limb64_t s[(256 + BITS_PER_MPI_LIMB64 - 1) / BITS_PER_MPI_LIMB64 + 1]; + mpi_limb64_t t[DIM(s)]; + mpi_limb64_t d[DIM(s)]; + mpi_limb64_t e[DIM(s)]; + const mpi_size_t wsize = DIM(s) - 1; mpi_size_t psize = ctx->p->nlimbs; - mpi_limb64_t s[wsize + 1]; - mpi_limb64_t t[wsize + 1]; - mpi_limb64_t d[wsize + 1]; - mpi_limb64_t e[wsize + 1]; mpi_limb_t mask1; mpi_limb_t mask2; mpi_limb_t mask3; @@ -595,15 +595,15 @@ _gcry_mpi_ec_nist384_mod (gcry_mpi_t w, mpi_ec_t ctx) }; const mpi_limb64_t zero = LIMB_TO64(0); mpi_ptr_t wp; - mpi_size_t wsize = (384 + BITS_PER_MPI_LIMB64 - 1) / BITS_PER_MPI_LIMB64; - mpi_size_t psize = ctx->p->nlimbs; + mpi_limb64_t s[(384 + BITS_PER_MPI_LIMB64 - 1) / BITS_PER_MPI_LIMB64 + 1]; + mpi_limb64_t t[DIM(s)]; + mpi_limb64_t d[DIM(s)]; + mpi_limb64_t x[DIM(s)]; #if (BITS_PER_MPI_LIMB64 == BITS_PER_MPI_LIMB) && defined(WORDS_BIGENDIAN) - mpi_limb_t wp_shr32[wsize * LIMBS_PER_LIMB64]; + mpi_limb_t wp_shr32[(DIM(s) - 1) * LIMBS_PER_LIMB64]; #endif - mpi_limb64_t s[wsize + 1]; - mpi_limb64_t t[wsize + 1]; - mpi_limb64_t d[wsize + 1]; - mpi_limb64_t x[wsize + 1]; + const mpi_size_t wsize = DIM(s) - 1; + mpi_size_t psize = ctx->p->nlimbs; mpi_limb_t mask1; mpi_limb_t mask2; mpi_limb_t s_is_negative; @@ -791,8 +791,8 @@ _gcry_mpi_ec_nist384_mod (gcry_mpi_t w, mpi_ec_t ctx) void _gcry_mpi_ec_nist521_mod (gcry_mpi_t w, mpi_ec_t ctx) { - mpi_size_t wsize = (521 + BITS_PER_MPI_LIMB - 1) / BITS_PER_MPI_LIMB; - mpi_limb_t s[wsize]; + mpi_limb_t s[(521 + BITS_PER_MPI_LIMB - 1) / BITS_PER_MPI_LIMB]; + const mpi_size_t wsize = DIM(s); mpi_limb_t cy; mpi_ptr_t wp; diff --git a/mpi/ec.c b/mpi/ec.c index c24921ee..0ad6769c 100644 --- a/mpi/ec.c +++ b/mpi/ec.c @@ -581,9 +581,9 @@ ec_pow2_448 (gcry_mpi_t w, const gcry_mpi_t b, mpi_ec_t ctx) static void ec_secp256k1_mod (gcry_mpi_t w, mpi_ec_t ctx) { - mpi_size_t wsize = (256 + BITS_PER_MPI_LIMB - 1) / BITS_PER_MPI_LIMB; - mpi_limb_t n[wsize + 1]; - mpi_limb_t s[wsize + 1]; + mpi_limb_t s[(256 + BITS_PER_MPI_LIMB - 1) / BITS_PER_MPI_LIMB + 1]; + mpi_limb_t n[sizeof(s)]; + const mpi_size_t wsize = DIM(s) - 1; mpi_limb_t cy, borrow; mpi_ptr_t wp; -- 2.34.1 From 
jussi.kivilinna at iki.fi Sat Oct 1 09:48:20 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 1 Oct 2022 10:48:20 +0300 Subject: [PATCH 2/5] cipher-ccm: remove VLA usage In-Reply-To: <20221001074823.201798-1-jussi.kivilinna@iki.fi> References: <20221001074823.201798-1-jussi.kivilinna@iki.fi> Message-ID: <20221001074823.201798-2-jussi.kivilinna@iki.fi> * cipher/cipher-ccm.c (do_cbc_mac): Avoid VLA for stack array. -- Signed-off-by: Jussi Kivilinna --- cipher/cipher-ccm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cipher/cipher-ccm.c b/cipher/cipher-ccm.c index f8b6865c..b325c966 100644 --- a/cipher/cipher-ccm.c +++ b/cipher/cipher-ccm.c @@ -38,9 +38,9 @@ static unsigned int do_cbc_mac (gcry_cipher_hd_t c, const unsigned char *inbuf, size_t inlen, int do_padding) { - const unsigned int blocksize = 16; gcry_cipher_encrypt_t enc_fn = c->spec->encrypt; - unsigned char tmp[blocksize]; + unsigned char tmp[16]; + const unsigned int blocksize = DIM(tmp); unsigned int burn = 0; unsigned int unused = c->u_mode.ccm.mac_unused; size_t nblocks; -- 2.34.1 From jussi.kivilinna at iki.fi Sat Oct 1 09:48:22 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 1 Oct 2022 10:48:22 +0300 Subject: [PATCH 4/5] tests/benchmark: remove VLA usage In-Reply-To: <20221001074823.201798-1-jussi.kivilinna@iki.fi> References: <20221001074823.201798-1-jussi.kivilinna@iki.fi> Message-ID: <20221001074823.201798-4-jussi.kivilinna@iki.fi> * cipher/benchmark.c (ccm_aead_init): Avoid VLA in stack array. -- Signed-off-by: Jussi Kivilinna --- tests/benchmark.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tests/benchmark.c b/tests/benchmark.c index a23cf74b..e9223f5a 100644 --- a/tests/benchmark.c +++ b/tests/benchmark.c @@ -719,15 +719,16 @@ mac_bench ( const char *algoname ) static void ccm_aead_init(gcry_cipher_hd_t hd, size_t buflen, int authlen) { - const int _L = 4; - const int noncelen = 15 - _L; - char nonce[noncelen]; + const char _L[4]; + char nonce[15 - sizeof(_L)]; u64 params[3]; gcry_error_t err = GPG_ERR_NO_ERROR; - memset (nonce, 0x33, noncelen); + (void)_L; - err = gcry_cipher_setiv (hd, nonce, noncelen); + memset (nonce, 0x33, sizeof(nonce)); + + err = gcry_cipher_setiv (hd, nonce, sizeof(nonce)); if (err) { fprintf (stderr, "gcry_cipher_setiv failed: %s\n", -- 2.34.1 From jcb62281 at gmail.com Mon Oct 3 02:19:30 2022 From: jcb62281 at gmail.com (Jacob Bachmeyer) Date: Sun, 02 Oct 2022 19:19:30 -0500 Subject: [PATCH 1/5] mpi/ec: remove VLA usage In-Reply-To: <20221001074823.201798-1-jussi.kivilinna@iki.fi> References: <20221001074823.201798-1-jussi.kivilinna@iki.fi> Message-ID: <633A2A92.3050900@gmail.com> Jussi Kivilinna wrote: > * mpi/ec-nist.c (_gcry_mpi_ec_nist192_mod, _gcry_mpi_ec_nist224_mod) > (_gcry_mpi_ec_nist256_mod, _gcry_mpi_ec_nist384_mod) > (_gcry_mpi_ec_nist521_mod): Avoid VLA for arrays on stack. > * mpi/ec.c (ec_secp256k1_mod): Avoid VLA for arrays on stack. > -- > > Signed-off-by: Jussi Kivilinna > --- > mpi/ec-nist.c | 40 ++++++++++++++++++++-------------------- > mpi/ec.c | 6 +++--- > 2 files changed, 23 insertions(+), 23 deletions(-) > > diff --git a/mpi/ec-nist.c b/mpi/ec-nist.c > index 69b05a6d..14e3c3ab 100644 > [...] > + mpi_limb64_t o[DIM(s)]; > [...] > + mpi_limb64_t d[DIM(s)]; > [...] > + mpi_limb64_t t[DIM(s)]; > + mpi_limb64_t d[DIM(s)]; > + mpi_limb64_t e[DIM(s)]; > + const mpi_size_t wsize = DIM(s) - 1; > [...] 
> + mpi_limb64_t t[DIM(s)]; > + mpi_limb64_t d[DIM(s)]; > + mpi_limb64_t x[DIM(s)]; > [...] > + mpi_limb_t s[(256 + BITS_PER_MPI_LIMB - 1) / BITS_PER_MPI_LIMB + 1]; > + mpi_limb_t n[sizeof(s)]; > + const mpi_size_t wsize = DIM(s) - 1; > mpi_limb_t cy, borrow; > mpi_ptr_t wp; > Am I misreading the patch (e.g. sizeof(mpi_limb_t) == 1?) or did you miss a spot at the end? -- Jacob From jussi.kivilinna at iki.fi Mon Oct 3 17:30:59 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Mon, 3 Oct 2022 18:30:59 +0300 Subject: [PATCH 1/5] mpi/ec: remove VLA usage In-Reply-To: <633A2A92.3050900@gmail.com> References: <20221001074823.201798-1-jussi.kivilinna@iki.fi> <633A2A92.3050900@gmail.com> Message-ID: <5090d5a3-7777-1bae-9f80-eaf256db4f3e@iki.fi> On 3.10.2022 3.19, Jacob Bachmeyer wrote: > Jussi Kivilinna wrote: >> * mpi/ec-nist.c (_gcry_mpi_ec_nist192_mod, _gcry_mpi_ec_nist224_mod) >> (_gcry_mpi_ec_nist256_mod, _gcry_mpi_ec_nist384_mod) >> (_gcry_mpi_ec_nist521_mod): Avoid VLA for arrays on stack. >> * mpi/ec.c (ec_secp256k1_mod): Avoid VLA for arrays on stack. >> -- >> >> Signed-off-by: Jussi Kivilinna >> --- >> mpi/ec-nist.c | 40 ++++++++++++++++++++-------------------- >> mpi/ec.c | 6 +++--- >> 2 files changed, 23 insertions(+), 23 deletions(-) >> >> diff --git a/mpi/ec-nist.c b/mpi/ec-nist.c >> index 69b05a6d..14e3c3ab 100644 >> [...] >> + mpi_limb64_t o[DIM(s)]; >> [...] >> + mpi_limb64_t d[DIM(s)]; >> [...] >> + mpi_limb64_t t[DIM(s)]; >> + mpi_limb64_t d[DIM(s)]; >> + mpi_limb64_t e[DIM(s)]; >> + const mpi_size_t wsize = DIM(s) - 1; >> [...] >> + mpi_limb64_t t[DIM(s)]; >> + mpi_limb64_t d[DIM(s)]; >> + mpi_limb64_t x[DIM(s)]; >> [...] >> + mpi_limb_t s[(256 + BITS_PER_MPI_LIMB - 1) / BITS_PER_MPI_LIMB + 1]; >> + mpi_limb_t n[sizeof(s)]; >> + const mpi_size_t wsize = DIM(s) - 1; >> mpi_limb_t cy, borrow; >> mpi_ptr_t wp; > > Am I misreading the patch (e.g. sizeof(mpi_limb_t) == 1?) or did you miss a spot at the end? > I missed it at first, but fixed before pushing to master. https://git.gnupg.org/cgi-bin/gitweb.cgi?p=libgcrypt.git;a=commitdiff;h=9978fc22045ca7623a6e0cbf704fb48ab1550419;hp=0cb29a5736cfcd6bce4ce2495cd0481f0bdb34a4 -Jussi From jussi.kivilinna at iki.fi Tue Oct 4 20:48:16 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Tue, 4 Oct 2022 21:48:16 +0300 Subject: [PATCH 2/3] mpi/longlong: fix generic smul_ppmm ifdef In-Reply-To: <20221004184817.140959-1-jussi.kivilinna@iki.fi> References: <20221004184817.140959-1-jussi.kivilinna@iki.fi> Message-ID: <20221004184817.140959-2-jussi.kivilinna@iki.fi> * mpi/longlong.h [!umul_ppmm] (smul_ppmm): Change ifdef from !defined(umul_ppmm) to !defined(smul_ppmm). -- Signed-off-by: Jussi Kivilinna --- mpi/longlong.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mpi/longlong.h b/mpi/longlong.h index 6a829f49..2921e9bd 100644 --- a/mpi/longlong.h +++ b/mpi/longlong.h @@ -1710,7 +1710,7 @@ typedef unsigned int UTItype __attribute__ ((mode (TI))); } while (0) #endif -#if !defined (umul_ppmm) +#if !defined (smul_ppmm) # define smul_ppmm(w1, w0, u, v) \ do { \ UWtype __w1; \ -- 2.34.1 From jussi.kivilinna at iki.fi Tue Oct 4 20:48:15 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Tue, 4 Oct 2022 21:48:15 +0300 Subject: [PATCH 1/3] mpi/longlong: provide generic implementation using double word type Message-ID: <20221004184817.140959-1-jussi.kivilinna@iki.fi> * configure.ac: Add check for 'unsigned __int128'. 
* mpi/longlong.h (UDWtype): Define for 32-bit or 64-bit when 'unsigned long long' or 'unsigned __int128' is available. (add_ssaaaa, sub_ddmmss, umul_ppmm, udiv_qrnnd) [UDWtype]: New. -- New generic longlong.h implementation by using 'unsigned long long' on 32-bit and 'unsigned __int128' on 64-bit (for new architectures like RISC-V). Signed-off-by: Jussi Kivilinna --- configure.ac | 1 + mpi/longlong.h | 75 ++++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 68 insertions(+), 8 deletions(-) diff --git a/configure.ac b/configure.ac index c39257b5..6f68a945 100644 --- a/configure.ac +++ b/configure.ac @@ -360,6 +360,7 @@ AC_CHECK_SIZEOF(unsigned short, 2) AC_CHECK_SIZEOF(unsigned int, 4) AC_CHECK_SIZEOF(unsigned long, 4) AC_CHECK_SIZEOF(unsigned long long, 0) +AC_CHECK_SIZEOF(unsigned __int128, 0) AC_CHECK_SIZEOF(void *, 0) AC_TYPE_UINTPTR_T diff --git a/mpi/longlong.h b/mpi/longlong.h index c299534c..6a829f49 100644 --- a/mpi/longlong.h +++ b/mpi/longlong.h @@ -20,18 +20,28 @@ along with this file; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ +/* On 32-bit, use 64-bit 'unsigned long long' for UDWtype, if available. */ +#if !defined (UDWtype) && SIZEOF_UNSIGNED_LONG_LONG * 8 == W_TYPE_SIZE * 2 +# define UDWtype unsigned long long +#endif + +/* On 64-bit, use 128-bit 'unsigned __int128' for UDWtype, if available. */ +#if !defined (UDWtype) && SIZEOF_UNSIGNED___INT128 * 8 == W_TYPE_SIZE * 2 +# define UDWtype unsigned __int128 +#endif + /* You have to define the following before including this file: - UWtype -- An unsigned type, default type for operations (typically a "word") + UWtype -- An unsigned type, default type for operations (typically a "word"). UHWtype -- An unsigned type, at least half the size of UWtype. - UDWtype -- An unsigned type, at least twice as large a UWtype - W_TYPE_SIZE -- size in bits of UWtype + UDWtype -- An unsigned type, at least twice as large a UWtype. + W_TYPE_SIZE -- size in bits of UWtype. SItype, USItype -- Signed and unsigned 32 bit types. DItype, UDItype -- Signed and unsigned 64 bit types. - On a 32 bit machine UWtype should typically be USItype; - on a 64 bit machine, UWtype should typically be UDItype. + On a 32 bit machine UWtype should typically be USItype. + On a 64 bit machine, UWtype should typically be UDItype. */ #define __BITS4 (W_TYPE_SIZE / 4) @@ -1617,7 +1627,21 @@ typedef unsigned int UTItype __attribute__ ((mode (TI))); /* If this machine has no inline assembler, use C macros. */ -#if !defined (add_ssaaaa) +#if !defined (add_ssaaaa) && defined (UDWtype) +/* Use double word type when available. */ +# define add_ssaaaa(sh, sl, ah, al, bh, bl) \ + do { \ + UDWtype __audw = (ah); \ + UDWtype __budw = (bh); \ + __audw <<= W_TYPE_SIZE; \ + __audw |= (al); \ + __budw <<= W_TYPE_SIZE; \ + __budw |= (bl); \ + __audw += __budw; \ + (sh) = (UWtype)(__audw >> W_TYPE_SIZE); \ + (sl) = (UWtype)(__audw); \ + } while (0) +#elif !defined (add_ssaaaa) # define add_ssaaaa(sh, sl, ah, al, bh, bl) \ do { \ UWtype __x; \ @@ -1627,7 +1651,21 @@ typedef unsigned int UTItype __attribute__ ((mode (TI))); } while (0) #endif -#if !defined (sub_ddmmss) +#if !defined (sub_ddmmss) && defined (UDWtype) +/* Use double word type when available. 
*/ +# define sub_ddmmss(sh, sl, ah, al, bh, bl) \ + do { \ + UDWtype __audw = (ah); \ + UDWtype __budw = (bh); \ + __audw <<= W_TYPE_SIZE; \ + __audw |= (al); \ + __budw <<= W_TYPE_SIZE; \ + __budw |= (bl); \ + __audw -= __budw; \ + (sh) = (UWtype)(__audw >> W_TYPE_SIZE); \ + (sl) = (UWtype)(__audw); \ + } while (0) +#elif !defined (sub_ddmmss) # define sub_ddmmss(sh, sl, ah, al, bh, bl) \ do { \ UWtype __x; \ @@ -1637,7 +1675,15 @@ typedef unsigned int UTItype __attribute__ ((mode (TI))); } while (0) #endif -#if !defined (umul_ppmm) +#if !defined (umul_ppmm) && defined (UDWtype) +# define umul_ppmm(w1, w0, u, v) \ + do { \ + UDWtype __x = (u); \ + __x *= (v); \ + (w1) = (UWtype)(__x >> W_TYPE_SIZE); \ + (w0) = (UWtype)(__x); \ + } while (0) +#elif !defined (umul_ppmm) # define umul_ppmm(w1, w0, u, v) \ do { \ UWtype __x0, __x1, __x2, __x3; \ @@ -1712,6 +1758,19 @@ typedef unsigned int UTItype __attribute__ ((mode (TI))); (r) = __r0; \ } while (0) +/* Use double word type if available. */ +#if !defined (udiv_qrnnd) && defined (UDWtype) +# define udiv_qrnnd(q, r, nh, nl, d) \ + do { \ + UWtype __d = (d); \ + UDWtype __nudw = (nh); \ + __nudw <<= W_TYPE_SIZE; \ + __nudw |= (nl); \ + (q) = (UWtype)(__nudw / __d); \ + (r) = (UWtype)(__nudw % __d); \ + } while (0) +#endif + /* If the processor has no udiv_qrnnd but sdiv_qrnnd, go through __udiv_w_sdiv (defined in libgcc or elsewhere). */ #if !defined (udiv_qrnnd) && defined (sdiv_qrnnd) -- 2.34.1 From jussi.kivilinna at iki.fi Tue Oct 4 20:48:17 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Tue, 4 Oct 2022 21:48:17 +0300 Subject: [PATCH 3/3] mpi/longlong.h: x86-64: use tzcnt instruction for trailing zeros In-Reply-To: <20221004184817.140959-1-jussi.kivilinna@iki.fi> References: <20221004184817.140959-1-jussi.kivilinna@iki.fi> Message-ID: <20221004184817.140959-3-jussi.kivilinna@iki.fi> * mpi/longlong.h [__x86_64__] (count_trailing_zeros): Add 'rep' prefix for 'bsfq'. -- "rep;bsf" aka "tzcnt" is new instruction with well defined operation on zero input and as result is faster on new CPUs. On old CPUs, "tzcnt" functions as old "bsf" with undefined behaviour on zero input. Signed-off-by: Jussi Kivilinna --- mpi/longlong.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mpi/longlong.h b/mpi/longlong.h index 2921e9bd..706ac723 100644 --- a/mpi/longlong.h +++ b/mpi/longlong.h @@ -624,7 +624,7 @@ extern USItype __udiv_qrnnd (); # define count_trailing_zeros(count, x) \ do { \ UDItype __cbtmp; \ - __asm__ ("bsfq %1,%0" \ + __asm__ ("rep;bsfq %1,%0" \ : "=r" (__cbtmp) : "rm" ((UDItype)(x)) \ __CLOBBER_CC); \ (count) = __cbtmp; \ -- 2.34.1 From gniibe at fsij.org Thu Oct 6 09:09:07 2022 From: gniibe at fsij.org (NIIBE Yutaka) Date: Thu, 06 Oct 2022 16:09:07 +0900 Subject: [PATCH 3/3] mpi/longlong.h: x86-64: use tzcnt instruction for trailing zeros In-Reply-To: <20221004184817.140959-3-jussi.kivilinna@iki.fi> References: <20221004184817.140959-1-jussi.kivilinna@iki.fi> <20221004184817.140959-3-jussi.kivilinna@iki.fi> Message-ID: <87a6695sz0.fsf@jumper.gniibe.org> Hello, Jussi Kivilinna wrote: > * mpi/longlong.h [__x86_64__] (count_trailing_zeros): Add 'rep' prefix > for 'bsfq'. Is it also applicable to 80x86 (IA-32) (adding 'rep')? Besides, I have another issue/concern here. IIUC, longlong.h upstream is GCC. It would be good to import some other changes from the upstream. For example, in our version for PPC/POWER, we still have old two-syntax asm code, that's quite outdated. 
( https://dev.gnupg.org/T5980 ) -- From jussi.kivilinna at iki.fi Sat Oct 8 14:01:36 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 8 Oct 2022 15:01:36 +0300 Subject: [PATCH 3/3] mpi/longlong.h: x86-64: use tzcnt instruction for trailing zeros In-Reply-To: <87a6695sz0.fsf@jumper.gniibe.org> References: <20221004184817.140959-1-jussi.kivilinna@iki.fi> <20221004184817.140959-3-jussi.kivilinna@iki.fi> <87a6695sz0.fsf@jumper.gniibe.org> Message-ID: <86aa299c-7e6c-235d-09d4-31393394df14@iki.fi> On 6.10.2022 10.09, NIIBE Yutaka wrote: > Hello, > > Jussi Kivilinna wrote: >> * mpi/longlong.h [__x86_64__] (count_trailing_zeros): Add 'rep' prefix >> for 'bsfq'. > > Is it also applicable to 80x86 (IA-32) (adding 'rep')? > Yes it is, I'll add 'rep' for i386 too. > > Besides, I have another issue/concern here. IIUC, longlong.h upstream > is GCC. It would be good to import some other changes from the > upstream. For example, in our version for PPC/POWER, we still have old > two-syntax asm code, that's quite outdated. ( https://dev.gnupg.org/T5980 ) I can take look into it. -Jussi From jussi.kivilinna at iki.fi Sat Oct 22 16:14:26 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 22 Oct 2022 17:14:26 +0300 Subject: [PATCH 2/2] hwf-x86: enable VPGATHER usage for AMD CPUs with AVX512 In-Reply-To: <20221022141426.293283-1-jussi.kivilinna@iki.fi> References: <20221022141426.293283-1-jussi.kivilinna@iki.fi> Message-ID: <20221022141426.293283-2-jussi.kivilinna@iki.fi> * src/hwf-x86.c (detect_x86_gnuc): Move model based checks and forced soft hwfeatures enablement at end; Enable VPGATHER for AMD CPUs with AVX512. -- AMD Zen4 is able to benefit from VPGATHER based table-lookup for Twofish. Benchmark on Ryzen 9 7900X: Before: TWOFISH | nanosecs/byte mebibytes/sec cycles/byte auto Mhz CTR enc | 1.79 ns/B 532.8 MiB/s 10.07 c/B 5625 CTR dec | 1.79 ns/B 532.6 MiB/s 10.07 c/B 5625 After (~10% faster): TWOFISH | nanosecs/byte mebibytes/sec cycles/byte auto Mhz CTR enc | 1.61 ns/B 593.5 MiB/s 9.05 c/B 5631?2 CTR dec | 1.61 ns/B 590.8 MiB/s 9.08 c/B 5625 Signed-off-by: Jussi Kivilinna --- src/hwf-x86.c | 157 ++++++++++++++++++++++++++------------------------ 1 file changed, 83 insertions(+), 74 deletions(-) diff --git a/src/hwf-x86.c b/src/hwf-x86.c index b440827e..c6f493eb 100644 --- a/src/hwf-x86.c +++ b/src/hwf-x86.c @@ -187,6 +187,7 @@ detect_x86_gnuc (void) unsigned int fms, family, model; unsigned int result = 0; unsigned int avoid_vpgather = 0; + unsigned int is_amd_cpu = 0; (void)os_supports_avx_avx2_registers; (void)os_supports_avx512_registers; @@ -242,6 +243,7 @@ detect_x86_gnuc (void) else if (!strcmp (vendor_id.c, "AuthenticAMD")) { /* This is an AMD CPU. */ + is_amd_cpu = 1; } /* Detect Intel features, that might also be supported by other @@ -253,77 +255,6 @@ detect_x86_gnuc (void) family = ((fms & 0xf00) >> 8) + ((fms & 0xff00000) >> 20); model = ((fms & 0xf0) >> 4) + ((fms & 0xf0000) >> 12); - if ((result & HWF_INTEL_CPU) && family == 6) - { - /* These Intel Core processor models have SHLD/SHRD instruction that - * can do integer rotation faster actual ROL/ROR instructions. 
*/ - switch (model) - { - case 0x2A: - case 0x2D: - case 0x3A: - case 0x3C: - case 0x3F: - case 0x45: - case 0x46: - case 0x3D: - case 0x4F: - case 0x56: - case 0x47: - case 0x4E: - case 0x5E: - case 0x8E: - case 0x9E: - case 0x55: - case 0x66: - result |= HWF_INTEL_FAST_SHLD; - break; - } - - /* These Intel Core processors that have AVX2 have slow VPGATHER and - * should be avoided for table-lookup use. */ - switch (model) - { - case 0x3C: - case 0x3F: - case 0x45: - case 0x46: - /* Haswell */ - avoid_vpgather |= 1; - break; - } - } - else - { - /* Avoid VPGATHER for non-Intel CPUs as testing is needed to - * make sure it is fast enough. */ - - avoid_vpgather |= 1; - } - -#ifdef ENABLE_FORCE_SOFT_HWFEATURES - /* Soft HW features mark functionality that is available on all systems - * but not feasible to use because of slow HW implementation. */ - - /* SHLD is faster at rotating register than actual ROR/ROL instructions - * on older Intel systems (~sandy-bridge era). However, SHLD is very - * slow on almost anything else and later Intel processors have faster - * ROR/ROL. Therefore in regular build HWF_INTEL_FAST_SHLD is enabled - * only for those Intel processors that benefit from the SHLD - * instruction. Enabled here unconditionally as requested. */ - result |= HWF_INTEL_FAST_SHLD; - - /* VPGATHER instructions are used for look-up table based - * implementations which require VPGATHER to be fast enough to beat - * regular parallelized look-up table implementations (see Twofish). - * So far, only Intel processors beginning with skylake have had - * VPGATHER fast enough to be enabled. AMD Zen3 comes close to - * being feasible, but not quite (where twofish-avx2 is few percent - * slower than twofish-3way). Enable VPGATHER here unconditionally - * as requested. */ - avoid_vpgather = 0; -#endif - #ifdef ENABLE_PCLMUL_SUPPORT /* Test bit 1 for PCLMUL. */ if (features & 0x00000002) @@ -392,9 +323,6 @@ detect_x86_gnuc (void) if (features & 0x00000020) if (os_supports_avx_avx2_registers) result |= HWF_INTEL_AVX2; - - if ((result & HWF_INTEL_AVX2) && !avoid_vpgather) - result |= HWF_INTEL_FAST_VPGATHER; #endif /*ENABLE_AVX_SUPPORT*/ /* Test bit 29 for SHA Extensions. */ @@ -446,6 +374,87 @@ detect_x86_gnuc (void) result |= HWF_INTEL_GFNI; } + if ((result & HWF_INTEL_CPU) && family == 6) + { + /* These Intel Core processor models have SHLD/SHRD instruction that + * can do integer rotation faster actual ROL/ROR instructions. */ + switch (model) + { + case 0x2A: + case 0x2D: + case 0x3A: + case 0x3C: + case 0x3F: + case 0x45: + case 0x46: + case 0x3D: + case 0x4F: + case 0x56: + case 0x47: + case 0x4E: + case 0x5E: + case 0x8E: + case 0x9E: + case 0x55: + case 0x66: + result |= HWF_INTEL_FAST_SHLD; + break; + } + + /* These Intel Core processors that have AVX2 have slow VPGATHER and + * should be avoided for table-lookup use. */ + switch (model) + { + case 0x3C: + case 0x3F: + case 0x45: + case 0x46: + /* Haswell */ + avoid_vpgather |= 1; + break; + } + } + else if (is_amd_cpu) + { + /* Non-AVX512 AMD CPUs (pre-Zen4) have slow VPGATHER and should be + * avoided for table-lookup use. */ + avoid_vpgather |= !(result & HWF_INTEL_AVX512); + } + else + { + /* Avoid VPGATHER for non-Intel/non-AMD CPUs as testing is needed to + * make sure it is fast enough. */ + avoid_vpgather |= 1; + } + +#ifdef ENABLE_FORCE_SOFT_HWFEATURES + /* Soft HW features mark functionality that is available on all systems + * but not feasible to use because of slow HW implementation. 
*/ + + /* Some implementations are disabled for non-Intel CPUs. Mark + * current CPU as Intel one to enable those implementations. */ + result |= HWF_INTEL_CPU; + + /* SHLD is faster at rotating register than actual ROR/ROL instructions + * on older Intel systems (~sandy-bridge era). However, SHLD is very + * slow on almost anything else and later Intel processors have faster + * ROR/ROL. Therefore in regular build HWF_INTEL_FAST_SHLD is enabled + * only for those Intel processors that benefit from the SHLD + * instruction. Enabled here unconditionally as requested. */ + result |= HWF_INTEL_FAST_SHLD; + + /* VPGATHER instructions are used for look-up table based + * implementations which require VPGATHER to be fast enough to beat + * regular parallelized look-up table implementations (see Twofish). + * So far, only Intel processors beginning with Skylake and AMD + * processors starting with Zen4 have had VPGATHER fast enough to be + * enabled. Enable VPGATHER here unconditionally as requested. */ + avoid_vpgather = 0; +#endif + + if ((result & HWF_INTEL_AVX2) && !avoid_vpgather) + result |= HWF_INTEL_FAST_VPGATHER; + return result; } #endif /* HAS_X86_CPUID */ -- 2.37.2 From jussi.kivilinna at iki.fi Sat Oct 22 16:14:25 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 22 Oct 2022 17:14:25 +0300 Subject: [PATCH 1/2] sha512-avx512: enable only on Intel CPUs for now Message-ID: <20221022141426.293283-1-jussi.kivilinna@iki.fi> * cipher/sha512.c (sha512_init_common): Enable AVX512 implementation only for Intel CPUs. -- SHA512-AVX512 implementation is slightly slower than AVX2 variant on AMD Zen4 (AVX512 4.88 cpb, AVX2 4.35 cpb). This is likely because AVX512 implementation uses vector registers for round function unlike AVX2 where general purpose registers are used for round function. On Zen4, message expansion and round function then end up competing for narrower vector execution bandwidth and gives slower performance. Signed-off-by: Jussi Kivilinna --- cipher/sha512.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cipher/sha512.c b/cipher/sha512.c index 9ac412b3..492d021a 100644 --- a/cipher/sha512.c +++ b/cipher/sha512.c @@ -466,7 +466,7 @@ sha512_init_common (SHA512_CONTEXT *ctx, unsigned int flags) ctx->bctx.bwrite = do_sha512_transform_amd64_avx2; #endif #ifdef USE_AVX512 - if ((features & HWF_INTEL_AVX512) != 0) + if ((features & HWF_INTEL_AVX512) && (features & HWF_INTEL_CPU)) ctx->bctx.bwrite = do_sha512_transform_amd64_avx512; #endif #ifdef USE_PPC_CRYPTO -- 2.37.2 From jussi.kivilinna at iki.fi Sun Oct 23 18:07:14 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sun, 23 Oct 2022 19:07:14 +0300 Subject: [PATCH] mpi/longlong: update powerpc macros from GCC Message-ID: <20221023160714.1144288-1-jussi.kivilinna@iki.fi> * mpi/longlong.h [__powerpc__, __powerpc64__]: Update macros. -- Update longlong.h powerpc macros with more up to date versions from GCC's longlong.h. Note, GCC's version is licensed under LGPLv2.1+. 
Signed-off-by: Jussi Kivilinna --- mpi/longlong.h | 212 +++++++++++++++++++------------------------------ 1 file changed, 81 insertions(+), 131 deletions(-) diff --git a/mpi/longlong.h b/mpi/longlong.h index 9e94ef30..fb860cb6 100644 --- a/mpi/longlong.h +++ b/mpi/longlong.h @@ -979,180 +979,130 @@ typedef unsigned int UTItype __attribute__ ((mode (TI))); /*************************************** ************** PPC ****************** ***************************************/ -#if (defined (_ARCH_PPC) || defined (_IBMR2)) && W_TYPE_SIZE == 32 +/* Powerpc 32 bit support taken from GCC longlong.h. */ +#if (defined (_ARCH_PPC) || defined (__powerpc__)) && W_TYPE_SIZE == 32 # define add_ssaaaa(sh, sl, ah, al, bh, bl) \ do { \ - if (__builtin_constant_p (bh) && (bh) == 0) \ - __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{aze|addze} %0,%2" \ - : "=r" ((sh)), \ - "=&r" ((sl)) \ - : "%r" ((USItype)(ah)), \ - "%r" ((USItype)(al)), \ - "rI" ((USItype)(bl))); \ - else if (__builtin_constant_p (bh) && (bh) ==~(USItype) 0) \ - __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{ame|addme} %0,%2" \ - : "=r" ((sh)), \ - "=&r" ((sl)) \ - : "%r" ((USItype)(ah)), \ - "%r" ((USItype)(al)), \ - "rI" ((USItype)(bl))); \ + if (__builtin_constant_p (bh) && (bh) == 0) \ + __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2" \ + : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl) \ + __CLOBBER_CC); \ + else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0) \ + __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2" \ + : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl) \ + __CLOBBER_CC); \ else \ - __asm__ ("{a%I5|add%I5c} %1,%4,%5\n\t{ae|adde} %0,%2,%3" \ - : "=r" ((sh)), \ - "=&r" ((sl)) \ - : "%r" ((USItype)(ah)), \ - "r" ((USItype)(bh)), \ - "%r" ((USItype)(al)), \ - "rI" ((USItype)(bl))); \ + __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3" \ + : "=r" (sh), "=&r" (sl) \ + : "%r" (ah), "r" (bh), "%r" (al), "rI" (bl) \ + __CLOBBER_CC); \ } while (0) # define sub_ddmmss(sh, sl, ah, al, bh, bl) \ do { \ - if (__builtin_constant_p (ah) && (ah) == 0) \ - __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfze|subfze} %0,%2" \ - : "=r" ((sh)), \ - "=&r" ((sl)) \ - : "r" ((USItype)(bh)), \ - "rI" ((USItype)(al)), \ - "r" ((USItype)(bl))); \ - else if (__builtin_constant_p (ah) && (ah) ==~(USItype) 0) \ - __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfme|subfme} %0,%2" \ - : "=r" ((sh)), \ - "=&r" ((sl)) \ - : "r" ((USItype)(bh)), \ - "rI" ((USItype)(al)), \ - "r" ((USItype)(bl))); \ + if (__builtin_constant_p (ah) && (ah) == 0) \ + __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2" \ + : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl) \ + __CLOBBER_CC); \ + else if (__builtin_constant_p (ah) && (ah) == ~(USItype) 0) \ + __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2" \ + : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl) \ + __CLOBBER_CC); \ else if (__builtin_constant_p (bh) && (bh) == 0) \ - __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{ame|addme} %0,%2" \ - : "=r" ((sh)), \ - "=&r" ((sl)) \ - : "r" ((USItype)(ah)), \ - "rI" ((USItype)(al)), \ - "r" ((USItype)(bl))); \ - else if (__builtin_constant_p (bh) && (bh) ==~(USItype) 0) \ - __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{aze|addze} %0,%2" \ - : "=r" ((sh)), \ - "=&r" ((sl)) \ - : "r" ((USItype)(ah)), \ - "rI" ((USItype)(al)), \ - "r" ((USItype)(bl))); \ + __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2" \ + : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl) \ + __CLOBBER_CC); \ + else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0) \ + __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2" \ + : "=r" (sh), "=&r" (sl) : "r" 
(ah), "rI" (al), "r" (bl) \ + __CLOBBER_CC); \ else \ - __asm__ ("{sf%I4|subf%I4c} %1,%5,%4\n\t{sfe|subfe} %0,%3,%2" \ - : "=r" ((sh)), \ - "=&r" ((sl)) \ - : "r" ((USItype)(ah)), \ - "r" ((USItype)(bh)), \ - "rI" ((USItype)(al)), \ - "r" ((USItype)(bl))); \ + __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2" \ + : "=r" (sh), "=&r" (sl) \ + : "r" (ah), "r" (bh), "rI" (al), "r" (bl) \ + __CLOBBER_CC); \ } while (0) # define count_leading_zeros(count, x) \ - __asm__ ("{cntlz|cntlzw} %0,%1" \ - : "=r" ((count)) \ - : "r" ((USItype)(x))) + __asm__ ("cntlzw %0,%1" : "=r" (count) : "r" (x)) # define COUNT_LEADING_ZEROS_0 32 -# if defined (_ARCH_PPC) -# define umul_ppmm(ph, pl, m0, m1) \ +# define umul_ppmm(ph, pl, m0, m1) \ do { \ USItype __m0 = (m0), __m1 = (m1); \ - __asm__ ("mulhwu %0,%1,%2" \ - : "=r" (ph) \ - : "%r" (__m0), \ - "r" (__m1)); \ - (pl) = __m0 * __m1; \ + __asm__ ("mulhwu %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1)); \ + (pl) = __m0 * __m1; \ } while (0) -# define UMUL_TIME 15 -# define smul_ppmm(ph, pl, m0, m1) \ +# define UMUL_TIME 15 +# define smul_ppmm(ph, pl, m0, m1) \ do { \ SItype __m0 = (m0), __m1 = (m1); \ - __asm__ ("mulhw %0,%1,%2" \ - : "=r" ((SItype) ph) \ - : "%r" (__m0), \ - "r" (__m1)); \ - (pl) = __m0 * __m1; \ - } while (0) -# define SMUL_TIME 14 -# define UDIV_TIME 120 -# else -# define umul_ppmm(xh, xl, m0, m1) \ - do { \ - USItype __m0 = (m0), __m1 = (m1); \ - __asm__ ("mul %0,%2,%3" \ - : "=r" ((xh)), \ - "=q" ((xl)) \ - : "r" (__m0), \ - "r" (__m1)); \ - (xh) += ((((SItype) __m0 >> 31) & __m1) \ - + (((SItype) __m1 >> 31) & __m0)); \ + __asm__ ("mulhw %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1)); \ + (pl) = __m0 * __m1; \ } while (0) -# define UMUL_TIME 8 -# define smul_ppmm(xh, xl, m0, m1) \ - __asm__ ("mul %0,%2,%3" \ - : "=r" ((SItype)(xh)), \ - "=q" ((SItype)(xl)) \ - : "r" (m0), \ - "r" (m1)) -# define SMUL_TIME 4 -# define sdiv_qrnnd(q, r, nh, nl, d) \ - __asm__ ("div %0,%2,%4" \ - : "=r" ((SItype)(q)), "=q" ((SItype)(r)) \ - : "r" ((SItype)(nh)), "1" ((SItype)(nl)), "r" ((SItype)(d))) -# define UDIV_TIME 100 -# endif -#endif /* Power architecture variants. */ +# define SMUL_TIME 14 +# define UDIV_TIME 120 +#endif /* 32-bit POWER architecture variants. */ -/* Powerpc 64 bit support taken from gmp-4.1.2. */ +/* Powerpc 64 bit support taken from GCC longlong.h. */ /* We should test _IBMR2 here when we add assembly support for the system vendor compilers. 
*/ -#if (defined (_ARCH_PPC) || defined (__powerpc__)) && W_TYPE_SIZE == 64 -#define add_ssaaaa(sh, sl, ah, al, bh, bl) \ +#if (defined (_ARCH_PPC64) || defined (__powerpc64__)) && W_TYPE_SIZE == 64 +# define add_ssaaaa(sh, sl, ah, al, bh, bl) \ do { \ if (__builtin_constant_p (bh) && (bh) == 0) \ - __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{aze|addze} %0,%2" \ - : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\ + __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2" \ + : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl) \ + __CLOBBER_CC); \ else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \ - __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{ame|addme} %0,%2" \ - : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl));\ + __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2" \ + : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl) \ + __CLOBBER_CC); \ else \ - __asm__ ("{a%I5|add%I5c} %1,%4,%5\n\t{ae|adde} %0,%2,%3" \ - : "=r" (sh), "=&r" (sl) \ - : "%r" (ah), "r" (bh), "%r" (al), "rI" (bl)); \ + __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3" \ + : "=r" (sh), "=&r" (sl) \ + : "%r" (ah), "r" (bh), "%r" (al), "rI" (bl) \ + __CLOBBER_CC); \ } while (0) -#define sub_ddmmss(sh, sl, ah, al, bh, bl) \ +# define sub_ddmmss(sh, sl, ah, al, bh, bl) \ do { \ if (__builtin_constant_p (ah) && (ah) == 0) \ - __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfze|subfze} %0,%2" \ - : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\ + __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2" \ + : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl) \ + __CLOBBER_CC); \ else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0) \ - __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfme|subfme} %0,%2" \ - : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\ + __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2" \ + : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl) \ + __CLOBBER_CC); \ else if (__builtin_constant_p (bh) && (bh) == 0) \ - __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{ame|addme} %0,%2" \ - : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\ + __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2" \ + : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl) \ + __CLOBBER_CC); \ else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \ - __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{aze|addze} %0,%2" \ - : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\ + __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2" \ + : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl) \ + __CLOBBER_CC); \ else \ - __asm__ ("{sf%I4|subf%I4c} %1,%5,%4\n\t{sfe|subfe} %0,%3,%2" \ - : "=r" (sh), "=&r" (sl) \ - : "r" (ah), "r" (bh), "rI" (al), "r" (bl)); \ + __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2" \ + : "=r" (sh), "=&r" (sl) \ + : "r" (ah), "r" (bh), "rI" (al), "r" (bl) \ + __CLOBBER_CC); \ } while (0) -#define count_leading_zeros(count, x) \ +# define count_leading_zeros(count, x) \ __asm__ ("cntlzd %0,%1" : "=r" (count) : "r" (x)) -#define COUNT_LEADING_ZEROS_0 64 -#define umul_ppmm(ph, pl, m0, m1) \ +# define COUNT_LEADING_ZEROS_0 64 +# define umul_ppmm(ph, pl, m0, m1) \ do { \ UDItype __m0 = (m0), __m1 = (m1); \ __asm__ ("mulhdu %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1)); \ (pl) = __m0 * __m1; \ } while (0) -#define UMUL_TIME 15 -#define smul_ppmm(ph, pl, m0, m1) \ +# define UMUL_TIME 15 +# define smul_ppmm(ph, pl, m0, m1) \ do { \ DItype __m0 = (m0), __m1 = (m1); \ __asm__ ("mulhd %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1)); \ (pl) = __m0 * __m1; \ } while (0) -#define SMUL_TIME 14 /* ??? */ -#define UDIV_TIME 120 /* ??? 
*/ +# define SMUL_TIME 14 /* ??? */ +# define UDIV_TIME 120 /* ??? */ #endif /* 64-bit PowerPC. */ /*************************************** -- 2.37.2 From jussi.kivilinna at iki.fi Sun Oct 23 18:16:04 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sun, 23 Oct 2022 19:16:04 +0300 Subject: [PATCH 4/8] sm4: fix lookup-table prefetching In-Reply-To: <20221023161608.1145423-1-jussi.kivilinna@iki.fi> References: <20221023161608.1145423-1-jussi.kivilinna@iki.fi> Message-ID: <20221023161608.1145423-4-jussi.kivilinna@iki.fi> * cipher/sm4.c (sm4_expand_key): Prefetch sbox table. (sm4_get_crypt_blk1_16_fn): Do not prefetch sbox table. (sm4_expand_key, _gcry_sm4_ctr_enc, _gcry_sm4_cbc_dec) (_gcry_sm4_cfb_dec): Prefetch sbox table if table look-up implementation is used. -- Signed-off-by: Jussi Kivilinna --- cipher/sm4.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/cipher/sm4.c b/cipher/sm4.c index 99a1e840..32a21dd9 100644 --- a/cipher/sm4.c +++ b/cipher/sm4.c @@ -701,6 +701,8 @@ sm4_expand_key (SM4_context *ctx, const byte *key) } #endif + prefetch_sbox_table (); + rk[0] = buf_get_be32(key + 4 * 0) ^ fk[0]; rk[1] = buf_get_be32(key + 4 * 1) ^ fk[1]; rk[2] = buf_get_be32(key + 4 * 2) ^ fk[2]; @@ -1008,7 +1010,6 @@ sm4_get_crypt_blk1_16_fn(SM4_context *ctx) else { (void)ctx; - prefetch_sbox_table (); return &sm4_crypt_blocks; } } @@ -1149,6 +1150,9 @@ _gcry_sm4_ctr_enc(void *context, unsigned char *ctr, unsigned int tmp_used = 16; size_t nburn; + if (crypt_blk1_16 == &sm4_crypt_blocks) + prefetch_sbox_table (); + nburn = bulk_ctr_enc_128(ctx->rkey_enc, crypt_blk1_16, outbuf, inbuf, nblocks, ctr, tmpbuf, sizeof(tmpbuf) / 16, &tmp_used); @@ -1295,6 +1299,9 @@ _gcry_sm4_cbc_dec(void *context, unsigned char *iv, unsigned int tmp_used = 16; size_t nburn; + if (crypt_blk1_16 == &sm4_crypt_blocks) + prefetch_sbox_table (); + nburn = bulk_cbc_dec_128(ctx->rkey_dec, crypt_blk1_16, outbuf, inbuf, nblocks, iv, tmpbuf, sizeof(tmpbuf) / 16, &tmp_used); @@ -1441,6 +1448,9 @@ _gcry_sm4_cfb_dec(void *context, unsigned char *iv, unsigned int tmp_used = 16; size_t nburn; + if (crypt_blk1_16 == &sm4_crypt_blocks) + prefetch_sbox_table (); + nburn = bulk_cfb_dec_128(ctx->rkey_enc, crypt_blk1_16, outbuf, inbuf, nblocks, iv, tmpbuf, sizeof(tmpbuf) / 16, &tmp_used); @@ -1457,6 +1467,7 @@ static unsigned int sm4_crypt_blk1_32 (const SM4_context *ctx, byte *outbuf, const byte *inbuf, unsigned int num_blks, const u32 *rk) { + crypt_blk1_16_fn_t crypt_blk1_16 = ctx->crypt_blk1_16; unsigned int stack_burn_size = 0; unsigned int nburn; @@ -1479,7 +1490,7 @@ sm4_crypt_blk1_32 (const SM4_context *ctx, byte *outbuf, const byte *inbuf, do { unsigned int curr_blks = num_blks > 16 ? 16 : num_blks; - nburn = ctx->crypt_blk1_16 (rk, outbuf, inbuf, curr_blks); + nburn = crypt_blk1_16 (rk, outbuf, inbuf, curr_blks); stack_burn_size = nburn > stack_burn_size ? nburn : stack_burn_size; outbuf += curr_blks * 16; inbuf += curr_blks * 16; @@ -1534,6 +1545,9 @@ _gcry_sm4_xts_crypt (void *context, unsigned char *tweak, void *outbuf_arg, unsigned int tmp_used = 16; size_t nburn; + if (ctx->crypt_blk1_16 == &sm4_crypt_blocks) + prefetch_sbox_table (); + nburn = bulk_xts_crypt_128(ctx, encrypt ? 
sm4_encrypt_blk1_32 : sm4_decrypt_blk1_32, outbuf, inbuf, nblocks, -- 2.37.2 From jussi.kivilinna at iki.fi Sun Oct 23 18:16:06 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sun, 23 Oct 2022 19:16:06 +0300 Subject: [PATCH 6/8] twofish: accelerate XTS and ECB modes In-Reply-To: <20221023161608.1145423-1-jussi.kivilinna@iki.fi> References: <20221023161608.1145423-1-jussi.kivilinna@iki.fi> Message-ID: <20221023161608.1145423-6-jussi.kivilinna@iki.fi> * cipher/twofish-amd64.S (_gcry_twofish_amd64_blk3): New. * cipher/twofish-avx2-amd64.S (_gcry_twofish_avx2_blk16): New. (_gcry_twofish_xts_crypt, _gcry_twofish_ecb_crypt) (_gcry_twofish_avx2_blk16, _gcry_twofish_amd64_blk3) (twofish_crypt_blk1_16, twofish_encrypt_blk1_16) (twofish_decrypt_blk1_16): New. (twofish_setkey): Setup XTS and ECB bulk functions. -- Benchmark on AMD Ryzen 9 7900X: Before: TWOFISH | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 2.52 ns/B 378.2 MiB/s 14.18 c/B 5625 ECB dec | 2.51 ns/B 380.2 MiB/s 14.11 c/B 5625 XTS enc | 2.65 ns/B 359.9 MiB/s 14.91 c/B 5625 XTS dec | 2.63 ns/B 362.0 MiB/s 14.60 c/B 5541 After: TWOFISH | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 1.60 ns/B 594.8 MiB/s 9.02 c/B 5625 ECB dec | 1.60 ns/B 594.8 MiB/s 9.02 c/B 5625 XTS enc | 1.66 ns/B 573.9 MiB/s 9.35 c/B 5625 XTS dec | 1.67 ns/B 569.6 MiB/s 9.41 c/B 5619?2 GnuPG-bug-id: T6242 Signed-off-by: Jussi Kivilinna --- cipher/twofish-amd64.S | 74 ++++++++++++++++++ cipher/twofish-avx2-amd64.S | 46 +++++++++++ cipher/twofish.c | 147 +++++++++++++++++++++++++++++++++++- 3 files changed, 264 insertions(+), 3 deletions(-) diff --git a/cipher/twofish-amd64.S b/cipher/twofish-amd64.S index a7a60553..8998d296 100644 --- a/cipher/twofish-amd64.S +++ b/cipher/twofish-amd64.S @@ -544,6 +544,80 @@ __twofish_dec_blk3: CFI_ENDPROC(); ELF(.size __twofish_dec_blk3,.-__twofish_dec_blk3;) +.align 8 +.globl _gcry_twofish_amd64_blk3 +ELF(.type _gcry_twofish_amd64_blk3, at function;) +_gcry_twofish_amd64_blk3: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (3 blocks) + * %rdx: src (3 blocks) + * %ecx: encrypt (0 or 1) + */ + CFI_STARTPROC(); + ENTER_SYSV_FUNC_PARAMS_0_4 + + subq $(8 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(8 * 8); + movq %rbp, (0 * 8)(%rsp); + movq %rbx, (1 * 8)(%rsp); + movq %r12, (2 * 8)(%rsp); + movq %r13, (3 * 8)(%rsp); + movq %r14, (4 * 8)(%rsp); + movq %r15, (5 * 8)(%rsp); + CFI_REL_OFFSET(%rbp, 0 * 8); + CFI_REL_OFFSET(%rbx, 1 * 8); + CFI_REL_OFFSET(%r12, 2 * 8); + CFI_REL_OFFSET(%r13, 3 * 8); + CFI_REL_OFFSET(%r14, 4 * 8); + CFI_REL_OFFSET(%r15, 5 * 8); + + testl %ecx, %ecx; + movq %rdx, RX0; + movq %rsi, (6 * 8)(%rsp); + + movq (0 * 8)(RX0), RAB0; + movq (1 * 8)(RX0), RCD0; + movq (2 * 8)(RX0), RAB1; + movq (3 * 8)(RX0), RCD1; + movq (4 * 8)(RX0), RAB2; + movq (5 * 8)(RX0), RCD2; + + jz .Lblk1_3_dec; + call __twofish_enc_blk3; + jmp .Lblk1_3_end; + .Lblk1_3_dec: + call __twofish_dec_blk3; + +.Lblk1_3_end: + movq (6 * 8)(%rsp), RX0; + movq RCD0, (0 * 8)(RX0); + movq RAB0, (1 * 8)(RX0); + movq RCD1, (2 * 8)(RX0); + movq RAB1, (3 * 8)(RX0); + movq RCD2, (4 * 8)(RX0); + movq RAB2, (5 * 8)(RX0); + + movq (0 * 8)(%rsp), %rbp; + movq (1 * 8)(%rsp), %rbx; + movq (2 * 8)(%rsp), %r12; + movq (3 * 8)(%rsp), %r13; + movq (4 * 8)(%rsp), %r14; + movq (5 * 8)(%rsp), %r15; + CFI_RESTORE(%rbp); + CFI_RESTORE(%rbx); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); + CFI_RESTORE(%r14); + CFI_RESTORE(%r15); + addq $(8 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(-8 * 8); + + EXIT_SYSV_FUNC + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size 
_gcry_twofish_amd64_blk3,.-_gcry_twofish_amd64_blk3;) + .align 8 .globl _gcry_twofish_amd64_ctr_enc ELF(.type _gcry_twofish_amd64_ctr_enc, at function;) diff --git a/cipher/twofish-avx2-amd64.S b/cipher/twofish-avx2-amd64.S index 930ac792..0cb9a64c 100644 --- a/cipher/twofish-avx2-amd64.S +++ b/cipher/twofish-avx2-amd64.S @@ -468,6 +468,52 @@ __twofish_dec_blk16: CFI_ENDPROC(); ELF(.size __twofish_dec_blk16,.-__twofish_dec_blk16;) +.align 8 +.globl _gcry_twofish_avx2_blk16 +ELF(.type _gcry_twofish_avx2_blk16, at function;) +_gcry_twofish_avx2_blk16: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (16 blocks) + * %rdx: src (16 blocks) + * %ecx: encrypt + */ + CFI_STARTPROC(); + + vzeroupper; + + vmovdqu (0 * 32)(%rdx), RA0; + vmovdqu (1 * 32)(%rdx), RB0; + vmovdqu (2 * 32)(%rdx), RC0; + vmovdqu (3 * 32)(%rdx), RD0; + vmovdqu (4 * 32)(%rdx), RA1; + vmovdqu (5 * 32)(%rdx), RB1; + vmovdqu (6 * 32)(%rdx), RC1; + vmovdqu (7 * 32)(%rdx), RD1; + + testl %ecx, %ecx; + jz .Lblk16_dec; + call __twofish_enc_blk16; + jmp .Lblk16_end; + .Lblk16_dec: + call __twofish_dec_blk16; + +.Lblk16_end: + vmovdqu RA0, (0 * 32)(%rsi); + vmovdqu RB0, (1 * 32)(%rsi); + vmovdqu RC0, (2 * 32)(%rsi); + vmovdqu RD0, (3 * 32)(%rsi); + vmovdqu RA1, (4 * 32)(%rsi); + vmovdqu RB1, (5 * 32)(%rsi); + vmovdqu RC1, (6 * 32)(%rsi); + vmovdqu RD1, (7 * 32)(%rsi); + + vzeroall; + + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size _gcry_twofish_avx2_blk16,.-_gcry_twofish_avx2_blk16;) + #define inc_le128(x, minus_one, tmp) \ vpcmpeqq minus_one, x, tmp; \ vpsubq minus_one, x, x; \ diff --git a/cipher/twofish.c b/cipher/twofish.c index b300715b..92c463fc 100644 --- a/cipher/twofish.c +++ b/cipher/twofish.c @@ -101,7 +101,12 @@ static size_t _gcry_twofish_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, int encrypt); static size_t _gcry_twofish_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks); - +static void _gcry_twofish_xts_crypt (void *context, unsigned char *tweak, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks, int encrypt); +static void _gcry_twofish_ecb_crypt (void *context, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks, + int encrypt); /* Structure for an expanded Twofish key. s contains the key-dependent * S-boxes composed with the MDS matrix; w contains the eight "whitening" @@ -775,7 +780,9 @@ twofish_setkey (void *context, const byte *key, unsigned int keylen, bulk_ops->cfb_dec = _gcry_twofish_cfb_dec; bulk_ops->ctr_enc = _gcry_twofish_ctr_enc; bulk_ops->ocb_crypt = _gcry_twofish_ocb_crypt; - bulk_ops->ocb_auth = _gcry_twofish_ocb_auth; + bulk_ops->ocb_auth = _gcry_twofish_ocb_auth; + bulk_ops->xts_crypt = _gcry_twofish_xts_crypt; + bulk_ops->ecb_crypt = _gcry_twofish_ecb_crypt; (void)hwfeatures; @@ -788,6 +795,9 @@ twofish_setkey (void *context, const byte *key, unsigned int keylen, /* Assembler implementations of Twofish using AVX2. Process 16 block in parallel. */ +extern void _gcry_twofish_avx2_blk16 (const TWOFISH_context *c, byte *out, + const byte *in, int encrypt) ASM_FUNC_ABI; + extern void _gcry_twofish_avx2_ctr_enc(const TWOFISH_context *ctx, unsigned char *out, const unsigned char *in, @@ -835,6 +845,9 @@ extern void _gcry_twofish_amd64_decrypt_block(const TWOFISH_context *c, byte *out, const byte *in); /* These assembly implementations process three blocks in parallel. 
*/ +extern void _gcry_twofish_amd64_blk3(const TWOFISH_context *c, byte *out, + const byte *in, int encrypt); + extern void _gcry_twofish_amd64_ctr_enc(const TWOFISH_context *c, byte *out, const byte *in, byte *ctr); @@ -1501,7 +1514,7 @@ _gcry_twofish_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, blkn += 3; twofish_amd64_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset, - c->u_mode.ocb.aad_sum, Ls); + c->u_mode.ocb.aad_sum, Ls); nblocks -= 3; abuf += 3 * TWOFISH_BLOCKSIZE; @@ -1527,6 +1540,134 @@ _gcry_twofish_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, } +static unsigned int +twofish_crypt_blk1_16(const void *context, byte *out, const byte *in, + unsigned int num_blks, int encrypt) +{ + const TWOFISH_context *ctx = context; + unsigned int burn, burn_stack_depth = 0; + +#ifdef USE_AVX2 + if (num_blks == 16 && ctx->use_avx2) + { + _gcry_twofish_avx2_blk16 (ctx, out, in, encrypt); + return 0; + } +#endif + +#ifdef USE_AMD64_ASM + while (num_blks >= 3) + { + _gcry_twofish_amd64_blk3 (ctx, out, in, encrypt); + burn = 8 * sizeof(void *); + burn_stack_depth = (burn > burn_stack_depth) ? burn : burn_stack_depth; + out += 3 * TWOFISH_BLOCKSIZE; + in += 3 * TWOFISH_BLOCKSIZE; + num_blks -= 3; + } +#endif + + while (num_blks >= 1) + { + if (encrypt) + burn = twofish_encrypt((void *)ctx, out, in); + else + burn = twofish_decrypt((void *)ctx, out, in); + + burn_stack_depth = (burn > burn_stack_depth) ? burn : burn_stack_depth; + out += TWOFISH_BLOCKSIZE; + in += TWOFISH_BLOCKSIZE; + num_blks--; + } + + return burn_stack_depth; +} + +static unsigned int +twofish_encrypt_blk1_16(const void *ctx, byte *out, const byte *in, + unsigned int num_blks) +{ + return twofish_crypt_blk1_16 (ctx, out, in, num_blks, 1); +} + +static unsigned int +twofish_decrypt_blk1_16(const void *ctx, byte *out, const byte *in, + unsigned int num_blks) +{ + return twofish_crypt_blk1_16 (ctx, out, in, num_blks, 0); +} + + +/* Bulk encryption/decryption of complete blocks in XTS mode. */ +static void +_gcry_twofish_xts_crypt (void *context, unsigned char *tweak, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks, int encrypt) +{ + TWOFISH_context *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + int burn_stack_depth = 0; + + /* Process remaining blocks. */ + if (nblocks) + { + unsigned char tmpbuf[16 * 16]; + unsigned int tmp_used = 16; + size_t tmpbufsize = 15 * 16; + size_t nburn; + +#ifdef USE_AVX2 + if (ctx->use_avx2) + tmpbufsize = 16 * 16; +#endif + + nburn = bulk_xts_crypt_128(ctx, encrypt ? twofish_encrypt_blk1_16 + : twofish_decrypt_blk1_16, + outbuf, inbuf, nblocks, + tweak, tmpbuf, tmpbufsize / 16, + &tmp_used); + burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth; + + wipememory(tmpbuf, tmp_used); + } + + if (burn_stack_depth) + _gcry_burn_stack(burn_stack_depth); +} + + +/* Bulk encryption/decryption in ECB mode. */ +static void +_gcry_twofish_ecb_crypt (void *context, void *outbuf_arg, const void *inbuf_arg, + size_t nblocks, int encrypt) +{ + TWOFISH_context *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + int burn_stack_depth = 0; + + /* Process remaining blocks. */ + if (nblocks) + { + size_t fn_maxblocks = 15; + size_t nburn; + +#ifdef USE_AVX2 + if (ctx->use_avx2) + fn_maxblocks = 16; +#endif + + nburn = bulk_ecb_crypt_128(ctx, encrypt ? twofish_encrypt_blk1_16 + : twofish_decrypt_blk1_16, + outbuf, inbuf, nblocks, fn_maxblocks); + burn_stack_depth = nburn > burn_stack_depth ? 
nburn : burn_stack_depth; + } + + if (burn_stack_depth) + _gcry_burn_stack(burn_stack_depth); +} + + /* Test a single encryption and decryption with each key size. */ -- 2.37.2 From jussi.kivilinna at iki.fi Sun Oct 23 18:16:03 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sun, 23 Oct 2022 19:16:03 +0300 Subject: [PATCH 3/8] camellia: accelerate ECB (for benchmarking) In-Reply-To: <20221023161608.1145423-1-jussi.kivilinna@iki.fi> References: <20221023161608.1145423-1-jussi.kivilinna@iki.fi> Message-ID: <20221023161608.1145423-3-jussi.kivilinna@iki.fi> * cipher/bulkhelp.h (bulk_ecb_crypt_128): New. * cipher/camellia-glue.c (_gcry_camellia_ecb_crypt): New. (camellia_setkey): Select ECB bulk function with AESNI/AVX2, VAES/AVX2 and GFNI/AVX2. -- Benchmark on AMD Ryzen 9 7900X: Before: CAMELLIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 3.27 ns/B 291.8 MiB/s 18.38 c/B 5625 ECB dec | 3.25 ns/B 293.3 MiB/s 18.29 c/B 5625 After (OCB for reference): CAMELLIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 0.146 ns/B 6533 MiB/s 0.803 c/B 5500 ECB dec | 0.149 ns/B 6384 MiB/s 0.822 c/B 5500 OCB enc | 0.170 ns/B 5608 MiB/s 0.957 c/B 5625 OCB dec | 0.175 ns/B 5452 MiB/s 0.984 c/B 5625 GnuPG-bug-id: T6242 Signed-off-by: Jussi Kivilinna --- cipher/bulkhelp.h | 19 +++++++++++++++++++ cipher/camellia-glue.c | 38 ++++++++++++++++++++++++++++++++++---- 2 files changed, 53 insertions(+), 4 deletions(-) diff --git a/cipher/bulkhelp.h b/cipher/bulkhelp.h index 444973ab..b86abc27 100644 --- a/cipher/bulkhelp.h +++ b/cipher/bulkhelp.h @@ -470,5 +470,24 @@ bulk_xts_crypt_128 (void *priv, bulk_crypt_fn_t crypt_fn, byte *outbuf, return burn_depth; } +static inline unsigned int +bulk_ecb_crypt_128 (void *priv, bulk_crypt_fn_t crypt_fn, byte *outbuf, + const byte *inbuf, size_t nblocks, size_t fn_max_nblocks) +{ + unsigned int burn_depth = 0; + unsigned int nburn; + + while (nblocks >= 1) + { + size_t curr_blks = nblocks > fn_max_nblocks ? fn_max_nblocks : nblocks; + nburn = crypt_fn (priv, outbuf, inbuf, curr_blks); + burn_depth = nburn > burn_depth ? 
nburn : burn_depth; + inbuf += curr_blks * 16; + outbuf += curr_blks * 16; + nblocks -= curr_blks; + } + + return burn_depth; +} #endif /*GCRYPT_BULKHELP_H*/ diff --git a/cipher/camellia-glue.c b/cipher/camellia-glue.c index b2a50233..a81d586a 100644 --- a/cipher/camellia-glue.c +++ b/cipher/camellia-glue.c @@ -405,11 +405,14 @@ static void _gcry_camellia_cfb_dec (void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks); static void _gcry_camellia_xts_crypt (void *context, unsigned char *tweak, - void *outbuf_arg, const void *inbuf_arg, - size_t nblocks, int encrypt); + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks, int encrypt); +static void _gcry_camellia_ecb_crypt (void *context, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks, + int encrypt); static void _gcry_camellia_ctr32le_enc (void *context, unsigned char *ctr, - void *outbuf_arg, const void *inbuf_arg, - size_t nblocks); + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks); static size_t _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int encrypt); @@ -474,10 +477,12 @@ camellia_setkey(void *c, const byte *key, unsigned keylen, if (ctx->use_aesni_avx2 || ctx->use_vaes_avx2 || ctx->use_gfni_avx2) { bulk_ops->xts_crypt = _gcry_camellia_xts_crypt; + bulk_ops->ecb_crypt = _gcry_camellia_ecb_crypt; bulk_ops->ctr32le_enc = _gcry_camellia_ctr32le_enc; } #else (void)_gcry_camellia_xts_crypt; + (void)_gcry_camellia_ecb_crypt; (void)_gcry_camellia_ctr32le_enc; #endif @@ -1126,6 +1131,31 @@ _gcry_camellia_cfb_dec(void *context, unsigned char *iv, _gcry_burn_stack(burn_stack_depth); } +/* Bulk encryption/decryption in ECB mode. */ +static void +_gcry_camellia_ecb_crypt (void *context, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks, int encrypt) +{ + CAMELLIA_context *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + int burn_stack_depth = 0; + + /* Process remaining blocks. */ + if (nblocks) + { + size_t nburn; + + nburn = bulk_ecb_crypt_128(ctx, encrypt ? camellia_encrypt_blk1_64 + : camellia_decrypt_blk1_64, + outbuf, inbuf, nblocks, 64); + burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth; + } + + if (burn_stack_depth) + _gcry_burn_stack(burn_stack_depth); +} + /* Bulk encryption/decryption of complete blocks in XTS mode. */ static void _gcry_camellia_xts_crypt (void *context, unsigned char *tweak, -- 2.37.2 From jussi.kivilinna at iki.fi Sun Oct 23 18:16:05 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sun, 23 Oct 2022 19:16:05 +0300 Subject: [PATCH 5/8] sm4: accelerate ECB (for benchmarking) In-Reply-To: <20221023161608.1145423-1-jussi.kivilinna@iki.fi> References: <20221023161608.1145423-1-jussi.kivilinna@iki.fi> Message-ID: <20221023161608.1145423-5-jussi.kivilinna@iki.fi> * cipher/sm4.c (_gcry_sm4_ecb_crypt): New. (sm4_setkey): Setup ECB bulk function. 
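The new ECB path simply hands the whole request to the generic bulk_ecb_crypt_128 helper with a 32-block limit, so sm4_crypt_blk1_32 gets to pick the widest available implementation for each chunk. A minimal standalone sketch of that chunking pattern, with toy names rather than the library code:

#include <stddef.h>

/* Toy stand-in for a blk1_N function: handles 1..fn_max blocks per call
   and returns the stack burn depth it needs. */
typedef unsigned int (*crypt_fn_t) (void *priv, unsigned char *out,
                                    const unsigned char *in,
                                    unsigned int nblks);

/* Same shape as bulk_ecb_crypt_128: walk the input in chunks of at most
   fn_max blocks and keep the largest burn depth any chunk reported. */
static unsigned int
ecb_walk (void *priv, crypt_fn_t fn, unsigned char *out,
          const unsigned char *in, size_t nblocks, size_t fn_max)
{
  unsigned int burn = 0;

  while (nblocks)
    {
      size_t cur = nblocks > fn_max ? fn_max : nblocks;
      unsigned int nburn = fn (priv, out, in, (unsigned int) cur);

      burn = nburn > burn ? nburn : burn;
      in += cur * 16;       /* 16-byte block size */
      out += cur * 16;
      nblocks -= cur;
    }

  return burn;  /* e.g. 70 blocks with fn_max = 32 -> calls of 32, 32, 6 */
}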
-- Benchmark on AMD Ryzen 9 7900X: Before: SM4 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 4.75 ns/B 200.6 MiB/s 26.74 c/B 5625 ECB dec | 4.79 ns/B 199.3 MiB/s 26.92 c/B 5625 After (OCB for reference): SM4 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 0.252 ns/B 3782 MiB/s 1.42 c/B 5624 ECB dec | 0.253 ns/B 3770 MiB/s 1.42 c/B 5625 OCB enc | 0.277 ns/B 3446 MiB/s 1.56 c/B 5625 OCB dec | 0.281 ns/B 3399 MiB/s 1.54 c/B 5500 GnuPG-bug-id: T6242 Signed-off-by: Jussi Kivilinna --- cipher/sm4.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/cipher/sm4.c b/cipher/sm4.c index 32a21dd9..20852cfb 100644 --- a/cipher/sm4.c +++ b/cipher/sm4.c @@ -129,6 +129,9 @@ static void _gcry_sm4_cfb_dec (void *context, unsigned char *iv, static void _gcry_sm4_xts_crypt (void *context, unsigned char *tweak, void *outbuf_arg, const void *inbuf_arg, size_t nblocks, int encrypt); +static void _gcry_sm4_ecb_crypt (void *context, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks, + int encrypt); static void _gcry_sm4_ctr32le_enc(void *context, unsigned char *ctr, void *outbuf_arg, const void *inbuf_arg, size_t nblocks); @@ -796,6 +799,7 @@ sm4_setkey (void *context, const byte *key, const unsigned keylen, bulk_ops->cfb_dec = _gcry_sm4_cfb_dec; bulk_ops->ctr_enc = _gcry_sm4_ctr_enc; bulk_ops->xts_crypt = _gcry_sm4_xts_crypt; + bulk_ops->ecb_crypt = _gcry_sm4_ecb_crypt; bulk_ops->ctr32le_enc = _gcry_sm4_ctr32le_enc; bulk_ops->ocb_crypt = _gcry_sm4_ocb_crypt; bulk_ops->ocb_auth = _gcry_sm4_ocb_auth; @@ -1517,6 +1521,34 @@ sm4_decrypt_blk1_32 (const void *context, byte *out, const byte *in, return sm4_crypt_blk1_32 (ctx, out, in, num_blks, ctx->rkey_dec); } +/* Bulk encryption/decryption in ECB mode. */ +static void +_gcry_sm4_ecb_crypt (void *context, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks, int encrypt) +{ + SM4_context *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + int burn_stack_depth = 0; + + /* Process remaining blocks. */ + if (nblocks) + { + size_t nburn; + + if (ctx->crypt_blk1_16 == &sm4_crypt_blocks) + prefetch_sbox_table (); + + nburn = bulk_ecb_crypt_128(ctx, encrypt ? sm4_encrypt_blk1_32 + : sm4_decrypt_blk1_32, + outbuf, inbuf, nblocks, 32); + burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth; + } + + if (burn_stack_depth) + _gcry_burn_stack(burn_stack_depth); +} + /* Bulk encryption/decryption of complete blocks in XTS mode. */ static void _gcry_sm4_xts_crypt (void *context, unsigned char *tweak, void *outbuf_arg, -- 2.37.2 From jussi.kivilinna at iki.fi Sun Oct 23 18:16:07 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sun, 23 Oct 2022 19:16:07 +0300 Subject: [PATCH 7/8] serpent: fix compiler warning on 32-bit ARM In-Reply-To: <20221023161608.1145423-1-jussi.kivilinna@iki.fi> References: <20221023161608.1145423-1-jussi.kivilinna@iki.fi> Message-ID: <20221023161608.1145423-7-jussi.kivilinna@iki.fi> * cipher/serpent.c (_gcry_serpent_ocb_crypt) (_gcry_serpent_ocb_auth) [USE_NEON]: Cast "Ls" to 'const void **'. 
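The cast matters because of a C rule that is easy to trip over: void ** and const void ** are distinct pointer types with no implicit conversion between them, so passing the local Ls array to prototypes that take const void *Ls[8] needs an explicit conversion or the compiler emits an incompatible-pointer-type warning. A minimal, self-contained illustration with toy names (not the serpent.c code itself):

#include <stddef.h>

/* Callee takes a table of pointers it promises not to write through,
   in the style of the 'const void *Ls[8]' NEON prototypes. */
static size_t
count_nonnull (const void **tbl, size_t n)
{
  size_t r = 0, i;

  for (i = 0; i < n; i++)
    r += (tbl[i] != NULL);
  return r;
}

int
main (void)
{
  static int a, b;
  void *ls[2] = { &a, &b };

  /* count_nonnull (ls, 2);  <- 'incompatible pointer type' warning:
     void ** does not implicitly convert to const void ** in C. */
  return count_nonnull ((const void **) ls, 2) == 2 ? 0 : 1;
}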
-- Signed-off-by: Jussi Kivilinna --- cipher/serpent.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cipher/serpent.c b/cipher/serpent.c index 11eeb079..93c561c5 100644 --- a/cipher/serpent.c +++ b/cipher/serpent.c @@ -1369,10 +1369,10 @@ _gcry_serpent_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, if (encrypt) _gcry_serpent_neon_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv, - c->u_ctr.ctr, (void **)Ls); + c->u_ctr.ctr, (const void **)Ls); else _gcry_serpent_neon_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv, - c->u_ctr.ctr, (void **)Ls); + c->u_ctr.ctr, (const void **)Ls); nblocks -= 8; outbuf += 8 * sizeof(serpent_block_t); @@ -1508,7 +1508,8 @@ _gcry_serpent_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 8); _gcry_serpent_neon_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset, - c->u_mode.ocb.aad_sum, (void **)Ls); + c->u_mode.ocb.aad_sum, + (const void **)Ls); nblocks -= 8; abuf += 8 * sizeof(serpent_block_t); -- 2.37.2 From jussi.kivilinna at iki.fi Sun Oct 23 18:16:02 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sun, 23 Oct 2022 19:16:02 +0300 Subject: [PATCH 2/8] rijndael-vaes: align asm functions In-Reply-To: <20221023161608.1145423-1-jussi.kivilinna@iki.fi> References: <20221023161608.1145423-1-jussi.kivilinna@iki.fi> Message-ID: <20221023161608.1145423-2-jussi.kivilinna@iki.fi> * cipher/rijndael-vaes-avx2-amd64.S: Align functions to 16 bytes. -- Signed-off-by: Jussi Kivilinna --- cipher/rijndael-vaes-avx2-amd64.S | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/cipher/rijndael-vaes-avx2-amd64.S b/cipher/rijndael-vaes-avx2-amd64.S index 655fdf55..13fe7ab0 100644 --- a/cipher/rijndael-vaes-avx2-amd64.S +++ b/cipher/rijndael-vaes-avx2-amd64.S @@ -86,6 +86,7 @@ **********************************************************************/ ELF(.type _gcry_vaes_avx2_cbc_dec_amd64, at function) .globl _gcry_vaes_avx2_cbc_dec_amd64 +.align 16 _gcry_vaes_avx2_cbc_dec_amd64: /* input: * %rdi: round keys @@ -392,6 +393,7 @@ ELF(.size _gcry_vaes_avx2_cbc_dec_amd64,.-_gcry_vaes_avx2_cbc_dec_amd64) **********************************************************************/ ELF(.type _gcry_vaes_avx2_cfb_dec_amd64, at function) .globl _gcry_vaes_avx2_cfb_dec_amd64 +.align 16 _gcry_vaes_avx2_cfb_dec_amd64: /* input: * %rdi: round keys @@ -700,6 +702,7 @@ ELF(.size _gcry_vaes_avx2_cfb_dec_amd64,.-_gcry_vaes_avx2_cfb_dec_amd64) **********************************************************************/ ELF(.type _gcry_vaes_avx2_ctr_enc_amd64, at function) .globl _gcry_vaes_avx2_ctr_enc_amd64 +.align 16 _gcry_vaes_avx2_ctr_enc_amd64: /* input: * %rdi: round keys @@ -1112,6 +1115,7 @@ ELF(.size _gcry_vaes_avx2_ctr_enc_amd64,.-_gcry_vaes_avx2_ctr_enc_amd64) **********************************************************************/ ELF(.type _gcry_vaes_avx2_ctr32le_enc_amd64, at function) .globl _gcry_vaes_avx2_ctr32le_enc_amd64 +.align 16 _gcry_vaes_avx2_ctr32le_enc_amd64: /* input: * %rdi: round keys @@ -1396,6 +1400,7 @@ ELF(.size _gcry_vaes_avx2_ctr32le_enc_amd64,.-_gcry_vaes_avx2_ctr32le_enc_amd64) **********************************************************************/ ELF(.type _gcry_vaes_avx2_ocb_crypt_amd64, at function) .globl _gcry_vaes_avx2_ocb_crypt_amd64 +.align 16 _gcry_vaes_avx2_ocb_crypt_amd64: /* input: * %rdi: round keys @@ -2361,6 +2366,7 @@ ELF(.size _gcry_vaes_avx2_ocb_crypt_amd64,.-_gcry_vaes_avx2_ocb_crypt_amd64) **********************************************************************/ ELF(.type 
_gcry_vaes_avx2_xts_crypt_amd64, at function) .globl _gcry_vaes_avx2_xts_crypt_amd64 +.align 16 _gcry_vaes_avx2_xts_crypt_amd64: /* input: * %rdi: round keys @@ -2878,6 +2884,7 @@ ELF(.size _gcry_vaes_avx2_xts_crypt_amd64,.-_gcry_vaes_avx2_xts_crypt_amd64) **********************************************************************/ ELF(.type _gcry_vaes_avx2_ecb_crypt_amd64, at function) .globl _gcry_vaes_avx2_ecb_crypt_amd64 +.align 16 _gcry_vaes_avx2_ecb_crypt_amd64: /* input: * %rdi: round keys -- 2.37.2 From jussi.kivilinna at iki.fi Sun Oct 23 18:16:08 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sun, 23 Oct 2022 19:16:08 +0300 Subject: [PATCH 8/8] serpent: accelerate XTS and ECB modes In-Reply-To: <20221023161608.1145423-1-jussi.kivilinna@iki.fi> References: <20221023161608.1145423-1-jussi.kivilinna@iki.fi> Message-ID: <20221023161608.1145423-8-jussi.kivilinna@iki.fi> * cipher/serpent-armv7-neon.S (_gcry_serpent_neon_blk8): New. * cipher/serpent-avx2-amd64.S (_gcry_serpent_avx2_blk16): New. * cipher/serpent-sse2-amd64.S (_gcry_serpent_sse2_blk8): New. * cipher/serpent.c (_gcry_serpent_sse2_blk8) (_gcry_serpent_avx2_blk16, _gcry_serpent_neon_blk8) (_gcry_serpent_xts_crypt, _gcry_serpent_ecb_crypt) (serpent_crypt_blk1_16, serpent_encrypt_blk1_16) (serpent_decrypt_blk1_16): New. (serpent_setkey): Setup XTS and ECB bulk functions. -- Benchmark on AMD Ryzen 9 7900X: Before: SERPENT128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 5.42 ns/B 176.0 MiB/s 30.47 c/B 5625 ECB dec | 4.82 ns/B 197.9 MiB/s 27.11 c/B 5625 XTS enc | 5.57 ns/B 171.3 MiB/s 31.31 c/B 5625 XTS dec | 4.99 ns/B 191.1 MiB/s 28.07 c/B 5625 After: SERPENT128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 0.708 ns/B 1347 MiB/s 3.98 c/B 5625 ECB dec | 0.694 ns/B 1373 MiB/s 3.91 c/B 5625 XTS enc | 0.766 ns/B 1246 MiB/s 4.31 c/B 5625 XTS dec | 0.754 ns/B 1264 MiB/s 4.24 c/B 5625 GnuPG-bug-id: T6242 Signed-off-by: Jussi Kivilinna --- cipher/serpent-armv7-neon.S | 56 ++++++++++++++ cipher/serpent-avx2-amd64.S | 50 ++++++++++++ cipher/serpent-sse2-amd64.S | 65 ++++++++++++++++ cipher/serpent.c | 147 +++++++++++++++++++++++++++++++++++- 4 files changed, 317 insertions(+), 1 deletion(-) diff --git a/cipher/serpent-armv7-neon.S b/cipher/serpent-armv7-neon.S index adff6394..4179ba2c 100644 --- a/cipher/serpent-armv7-neon.S +++ b/cipher/serpent-armv7-neon.S @@ -600,6 +600,62 @@ __serpent_dec_blk8: bx lr; .size __serpent_dec_blk8,.-__serpent_dec_blk8; +.align 3 +.globl _gcry_serpent_neon_blk8 +.type _gcry_serpent_neon_blk8,%function; +_gcry_serpent_neon_blk8: + /* input: + * r0: ctx, CTX + * r1: dst (8 blocks) + * r2: src (8 blocks) + * r3: encrypt + */ + + push {lr}; + vpush {RA4-RB2}; + + cmp r3, #0 + + vld1.8 {RA0, RA1}, [r2]!; + vld1.8 {RA2, RA3}, [r2]!; + vld1.8 {RB0, RB1}, [r2]!; + vld1.8 {RB2, RB3}, [r2]!; + + beq .Lblk8_dec; + bl __serpent_enc_blk8; + vst1.8 {RA4}, [r1]!; + vst1.8 {RA1, RA2}, [r1]!; + vst1.8 {RA0}, [r1]!; + vst1.8 {RB4}, [r1]!; + vst1.8 {RB1, RB2}, [r1]!; + vst1.8 {RB0}, [r1]!; + b .Lblk8_end; + .Lblk8_dec: + bl __serpent_dec_blk8; + vst1.8 {RA0, RA1}, [r1]!; + vst1.8 {RA2, RA3}, [r1]!; + vst1.8 {RB0, RB1}, [r1]!; + vst1.8 {RB2, RB3}, [r1]!; + +.Lblk8_end: + /* clear the used registers */ + veor RA0, RA0; + veor RA1, RA1; + veor RA2, RA2; + veor RA3, RA3; + + vpop {RA4-RB2}; + + veor RB3, RB3; + veor RB4, RB4; + veor RT0, RT0; + veor RT1, RT1; + veor RT2, RT2; + veor RT3, RT3; + + pop {pc}; +.size _gcry_serpent_neon_cbc_dec,.-_gcry_serpent_neon_cbc_dec; + .align 3 .globl 
_gcry_serpent_neon_ctr_enc .type _gcry_serpent_neon_ctr_enc,%function; diff --git a/cipher/serpent-avx2-amd64.S b/cipher/serpent-avx2-amd64.S index d3515a21..54ff61e4 100644 --- a/cipher/serpent-avx2-amd64.S +++ b/cipher/serpent-avx2-amd64.S @@ -583,6 +583,56 @@ __serpent_dec_blk16: CFI_ENDPROC(); ELF(.size __serpent_dec_blk16,.-__serpent_dec_blk16;) +.align 8 +.globl _gcry_serpent_avx2_blk16 +ELF(.type _gcry_serpent_avx2_blk16, at function;) +_gcry_serpent_avx2_blk16: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (8 blocks) + * %rdx: src (8 blocks) + * %ecx: encrypt + */ + CFI_STARTPROC(); + + vmovdqu (0 * 32)(%rdx), RA0; + vmovdqu (1 * 32)(%rdx), RA1; + vmovdqu (2 * 32)(%rdx), RA2; + vmovdqu (3 * 32)(%rdx), RA3; + vmovdqu (4 * 32)(%rdx), RB0; + vmovdqu (5 * 32)(%rdx), RB1; + vmovdqu (6 * 32)(%rdx), RB2; + vmovdqu (7 * 32)(%rdx), RB3; + + testl %ecx, %ecx; + jz .Lblk16_dec; + call __serpent_enc_blk16; + vmovdqu RA4, (0 * 32)(%rsi); + vmovdqu RA1, (1 * 32)(%rsi); + vmovdqu RA2, (2 * 32)(%rsi); + vmovdqu RA0, (3 * 32)(%rsi); + vmovdqu RB4, (4 * 32)(%rsi); + vmovdqu RB1, (5 * 32)(%rsi); + vmovdqu RB2, (6 * 32)(%rsi); + vmovdqu RB0, (7 * 32)(%rsi); + jmp .Lblk16_end; + .Lblk16_dec: + call __serpent_dec_blk16; + vmovdqu RA0, (0 * 32)(%rsi); + vmovdqu RA1, (1 * 32)(%rsi); + vmovdqu RA2, (2 * 32)(%rsi); + vmovdqu RA3, (3 * 32)(%rsi); + vmovdqu RB0, (4 * 32)(%rsi); + vmovdqu RB1, (5 * 32)(%rsi); + vmovdqu RB2, (6 * 32)(%rsi); + vmovdqu RB3, (7 * 32)(%rsi); + +.Lblk16_end: + vzeroall; + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size _gcry_serpent_avx2_blk16,.-_gcry_serpent_avx2_blk16;) + #define inc_le128(x, minus_one, tmp) \ vpcmpeqq minus_one, x, tmp; \ vpsubq minus_one, x, x; \ diff --git a/cipher/serpent-sse2-amd64.S b/cipher/serpent-sse2-amd64.S index b5935095..01723a2a 100644 --- a/cipher/serpent-sse2-amd64.S +++ b/cipher/serpent-sse2-amd64.S @@ -605,6 +605,71 @@ __serpent_dec_blk8: CFI_ENDPROC(); ELF(.size __serpent_dec_blk8,.-__serpent_dec_blk8;) +.align 8 +.globl _gcry_serpent_sse2_blk8 +ELF(.type _gcry_serpent_sse2_blk8, at function;) +_gcry_serpent_sse2_blk8: + /* input: + * %rdi: ctx, CTX + * %rsi: dst (8 blocks) + * %rdx: src (8 blocks) + * %ecx: encrypt + */ + CFI_STARTPROC(); + + movdqu (0 * 16)(%rdx), RA0; + movdqu (1 * 16)(%rdx), RA1; + movdqu (2 * 16)(%rdx), RA2; + movdqu (3 * 16)(%rdx), RA3; + movdqu (4 * 16)(%rdx), RB0; + movdqu (5 * 16)(%rdx), RB1; + movdqu (6 * 16)(%rdx), RB2; + movdqu (7 * 16)(%rdx), RB3; + + testl %ecx, %ecx; + jz .Lblk8_dec; + call __serpent_enc_blk8; + movdqu RA4, (0 * 16)(%rsi); + movdqu RA1, (1 * 16)(%rsi); + movdqu RA2, (2 * 16)(%rsi); + movdqu RA0, (3 * 16)(%rsi); + movdqu RB4, (4 * 16)(%rsi); + movdqu RB1, (5 * 16)(%rsi); + movdqu RB2, (6 * 16)(%rsi); + movdqu RB0, (7 * 16)(%rsi); + jmp .Lblk8_end; + .Lblk8_dec: + call __serpent_dec_blk8; + movdqu RA0, (0 * 16)(%rsi); + movdqu RA1, (1 * 16)(%rsi); + movdqu RA2, (2 * 16)(%rsi); + movdqu RA3, (3 * 16)(%rsi); + movdqu RB0, (4 * 16)(%rsi); + movdqu RB1, (5 * 16)(%rsi); + movdqu RB2, (6 * 16)(%rsi); + movdqu RB3, (7 * 16)(%rsi); + +.Lblk8_end: + /* clear the used registers */ + pxor RA0, RA0; + pxor RA1, RA1; + pxor RA2, RA2; + pxor RA3, RA3; + pxor RA4, RA4; + pxor RB0, RB0; + pxor RB1, RB1; + pxor RB2, RB2; + pxor RB3, RB3; + pxor RB4, RB4; + pxor RTMP0, RTMP0; + pxor RTMP1, RTMP1; + pxor RTMP2, RTMP2; + pxor RNOT, RNOT; + + ret_spec_stop; + CFI_ENDPROC(); +ELF(.size _gcry_serpent_sse2_blk8,.-_gcry_serpent_sse2_blk8;) + .align 8 .globl _gcry_serpent_sse2_ctr_enc ELF(.type _gcry_serpent_sse2_ctr_enc, at 
function;) diff --git a/cipher/serpent.c b/cipher/serpent.c index 93c561c5..0a9ed27c 100644 --- a/cipher/serpent.c +++ b/cipher/serpent.c @@ -139,6 +139,9 @@ extern void _gcry_serpent_sse2_ocb_auth(serpent_context_t *ctx, unsigned char *offset, unsigned char *checksum, const u64 Ls[8]) ASM_FUNC_ABI; + +extern void _gcry_serpent_sse2_blk8(const serpent_context_t *c, byte *out, + const byte *in, int encrypt) ASM_FUNC_ABI; #endif #ifdef USE_AVX2 @@ -179,6 +182,9 @@ extern void _gcry_serpent_avx2_ocb_auth(serpent_context_t *ctx, unsigned char *offset, unsigned char *checksum, const u64 Ls[16]) ASM_FUNC_ABI; + +extern void _gcry_serpent_avx2_blk16(const serpent_context_t *c, byte *out, + const byte *in, int encrypt) ASM_FUNC_ABI; #endif #ifdef USE_NEON @@ -219,6 +225,9 @@ extern void _gcry_serpent_neon_ocb_auth(serpent_context_t *ctx, unsigned char *offset, unsigned char *checksum, const void *Ls[8]); + +extern void _gcry_serpent_neon_blk8(const serpent_context_t *c, byte *out, + const byte *in, int encrypt); #endif @@ -239,6 +248,12 @@ static size_t _gcry_serpent_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, int encrypt); static size_t _gcry_serpent_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks); +static void _gcry_serpent_xts_crypt (void *context, unsigned char *tweak, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks, int encrypt); +static void _gcry_serpent_ecb_crypt (void *context, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks, + int encrypt); /* @@ -790,7 +805,9 @@ serpent_setkey (void *ctx, bulk_ops->cfb_dec = _gcry_serpent_cfb_dec; bulk_ops->ctr_enc = _gcry_serpent_ctr_enc; bulk_ops->ocb_crypt = _gcry_serpent_ocb_crypt; - bulk_ops->ocb_auth = _gcry_serpent_ocb_auth; + bulk_ops->ocb_auth = _gcry_serpent_ocb_auth; + bulk_ops->xts_crypt = _gcry_serpent_xts_crypt; + bulk_ops->ecb_crypt = _gcry_serpent_ecb_crypt; if (serpent_test_ret) ret = GPG_ERR_SELFTEST_FAILED; @@ -1538,6 +1555,134 @@ _gcry_serpent_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, return nblocks; } + +static unsigned int +serpent_crypt_blk1_16(const void *context, byte *out, const byte *in, + unsigned int num_blks, int encrypt) +{ + const serpent_context_t *ctx = context; + unsigned int burn, burn_stack_depth = 0; + +#ifdef USE_AVX2 + if (num_blks == 16 && ctx->use_avx2) + { + _gcry_serpent_avx2_blk16 (ctx, out, in, encrypt); + return 0; + } +#endif + +#ifdef USE_SSE2 + while (num_blks >= 8) + { + _gcry_serpent_sse2_blk8 (ctx, out, in, encrypt); + out += 8 * sizeof(serpent_block_t); + in += 8 * sizeof(serpent_block_t); + num_blks -= 8; + } +#endif + +#ifdef USE_NEON + if (ctx->use_neon) + { + while (num_blks >= 8) + { + _gcry_serpent_neon_blk8 (ctx, out, in, encrypt); + out += 8 * sizeof(serpent_block_t); + in += 8 * sizeof(serpent_block_t); + num_blks -= 8; + } + } +#endif + + while (num_blks >= 1) + { + if (encrypt) + serpent_encrypt_internal((void *)ctx, in, out); + else + serpent_decrypt_internal((void *)ctx, in, out); + + burn = 2 * sizeof(serpent_block_t); + burn_stack_depth = (burn > burn_stack_depth) ? 
burn : burn_stack_depth; + out += sizeof(serpent_block_t); + in += sizeof(serpent_block_t); + num_blks--; + } + + return burn_stack_depth; +} + +static unsigned int +serpent_encrypt_blk1_16(const void *ctx, byte *out, const byte *in, + unsigned int num_blks) +{ + return serpent_crypt_blk1_16 (ctx, out, in, num_blks, 1); +} + +static unsigned int +serpent_decrypt_blk1_16(const void *ctx, byte *out, const byte *in, + unsigned int num_blks) +{ + return serpent_crypt_blk1_16 (ctx, out, in, num_blks, 0); +} + + +/* Bulk encryption/decryption of complete blocks in XTS mode. */ +static void +_gcry_serpent_xts_crypt (void *context, unsigned char *tweak, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks, int encrypt) +{ + serpent_context_t *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + int burn_stack_depth = 0; + + /* Process remaining blocks. */ + if (nblocks) + { + unsigned char tmpbuf[16 * 16]; + unsigned int tmp_used = 16; + size_t nburn; + + nburn = bulk_xts_crypt_128(ctx, encrypt ? serpent_encrypt_blk1_16 + : serpent_decrypt_blk1_16, + outbuf, inbuf, nblocks, + tweak, tmpbuf, sizeof(tmpbuf) / 16, + &tmp_used); + burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth; + + wipememory(tmpbuf, tmp_used); + } + + if (burn_stack_depth) + _gcry_burn_stack(burn_stack_depth); +} + + +/* Bulk encryption/decryption in ECB mode. */ +static void +_gcry_serpent_ecb_crypt (void *context, void *outbuf_arg, const void *inbuf_arg, + size_t nblocks, int encrypt) +{ + serpent_context_t *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + int burn_stack_depth = 0; + + /* Process remaining blocks. */ + if (nblocks) + { + size_t nburn; + + nburn = bulk_ecb_crypt_128(ctx, encrypt ? serpent_encrypt_blk1_16 + : serpent_decrypt_blk1_16, + outbuf, inbuf, nblocks, 16); + burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth; + } + + if (burn_stack_depth) + _gcry_burn_stack(burn_stack_depth); +} + /* Serpent test. */ -- 2.37.2 From jussi.kivilinna at iki.fi Sun Oct 23 18:16:01 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sun, 23 Oct 2022 19:16:01 +0300 Subject: [PATCH 1/8] rijndael: add ECB acceleration (for benchmarking purposes) Message-ID: <20221023161608.1145423-1-jussi.kivilinna@iki.fi> * cipher/cipher-internal.h (cipher_bulk_ops): Add 'ecb_crypt'. * cipher/cipher.c (do_ecb_crypt): Use bulk function if available. * cipher/rijndael-aesni.c (do_aesni_enc_vec8): Change asm label '.Ldeclast' to '.Lenclast'. (_gcry_aes_aesni_ecb_crypt): New. * cipher/rijndael-armv8-aarch32-ce.S (_gcry_aes_ecb_enc_armv8_ce) (_gcry_aes_ecb_dec_armv8_ce): New. * cipher/rijndael-armv8-aarch64-ce.S (_gcry_aes_ecb_enc_armv8_ce) (_gcry_aes_ecb_dec_armv8_ce): New. * cipher/rijndael-armv8-ce.c (_gcry_aes_ocb_enc_armv8_ce) (_gcry_aes_ocb_dec_armv8_ce, _gcry_aes_ocb_auth_armv8_ce): Change return value from void to size_t. (ocb_crypt_fn_t, xts_crypt_fn_t): Remove. (_gcry_aes_armv8_ce_ocb_crypt, _gcry_aes_armv8_ce_xts_crypt): Remove indirect function call; Return value from called function (allows tail call optimization). (_gcry_aes_armv8_ce_ocb_auth): Return value from called function (allows tail call optimization). (_gcry_aes_ecb_enc_armv8_ce, _gcry_aes_ecb_dec_armv8_ce) (_gcry_aes_armv8_ce_ecb_crypt): New. * cipher/rijndael-vaes-avx2-amd64.S (_gcry_vaes_avx2_ecb_crypt_amd64): New. * cipher/rijndael-vaes.c (_gcry_vaes_avx2_ecb_crypt_amd64) (_gcry_aes_vaes_ecb_crypt): New. 
* cipher/rijndael.c (_gcry_aes_aesni_ecb_crypt) (_gcry_aes_vaes_ecb_crypt, _gcry_aes_armv8_ce_ecb_crypt): New. (do_setkey): Setup ECB bulk function for x86 AESNI/VAES and ARM CE. -- Benchmark on AMD Ryzen 9 7900X: Before (OCB for reference): AES | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 0.128 ns/B 7460 MiB/s 0.720 c/B 5634?1 ECB dec | 0.134 ns/B 7103 MiB/s 0.753 c/B 5608 OCB enc | 0.029 ns/B 32930 MiB/s 0.163 c/B 5625 OCB dec | 0.029 ns/B 32738 MiB/s 0.164 c/B 5625 After: AES | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 0.028 ns/B 33761 MiB/s 0.159 c/B 5625 ECB dec | 0.028 ns/B 33917 MiB/s 0.158 c/B 5625 GnuPG-bug-id: T6242 Signed-off-by: Jussi Kivilinna --- cipher/cipher-internal.h | 2 + cipher/cipher.c | 41 ++- cipher/rijndael-aesni.c | 160 ++++++++++- cipher/rijndael-armv8-aarch32-ce.S | 152 +++++++++- cipher/rijndael-armv8-aarch64-ce.S | 125 ++++++++- cipher/rijndael-armv8-ce.c | 124 +++++---- cipher/rijndael-vaes-avx2-amd64.S | 432 ++++++++++++++++++++++++++++- cipher/rijndael-vaes.c | 26 ++ cipher/rijndael.c | 12 + 9 files changed, 997 insertions(+), 77 deletions(-) diff --git a/cipher/cipher-internal.h b/cipher/cipher-internal.h index 66b75955..4e022f38 100644 --- a/cipher/cipher-internal.h +++ b/cipher/cipher-internal.h @@ -161,6 +161,8 @@ typedef struct cipher_mode_ops not NULL. */ typedef struct cipher_bulk_ops { + void (*ecb_crypt)(void *context, void *outbuf_arg, const void *inbuf_arg, + size_t nblocks, int encrypt); void (*cfb_enc)(void *context, unsigned char *iv, void *outbuf_arg, const void *inbuf_arg, size_t nblocks); void (*cfb_dec)(void *context, unsigned char *iv, void *outbuf_arg, diff --git a/cipher/cipher.c b/cipher/cipher.c index 6c335aec..026c1511 100644 --- a/cipher/cipher.c +++ b/cipher/cipher.c @@ -983,14 +983,11 @@ cipher_reset (gcry_cipher_hd_t c) static gcry_err_code_t -do_ecb_crypt (gcry_cipher_hd_t c, - unsigned char *outbuf, size_t outbuflen, - const unsigned char *inbuf, size_t inbuflen, - gcry_cipher_encrypt_t crypt_fn) +do_ecb_crypt (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, + const unsigned char *inbuf, size_t inbuflen, int encrypt) { unsigned int blocksize = c->spec->blocksize; size_t n, nblocks; - unsigned int burn, nburn; if (outbuflen < inbuflen) return GPG_ERR_BUFFER_TOO_SHORT; @@ -998,18 +995,32 @@ do_ecb_crypt (gcry_cipher_hd_t c, return GPG_ERR_INV_LENGTH; nblocks = inbuflen / blocksize; - burn = 0; - for (n=0; n < nblocks; n++ ) + if (nblocks == 0) + return 0; + + if (c->bulk.ecb_crypt) { - nburn = crypt_fn (&c->context.c, outbuf, inbuf); - burn = nburn > burn ? nburn : burn; - inbuf += blocksize; - outbuf += blocksize; + c->bulk.ecb_crypt (&c->context.c, outbuf, inbuf, nblocks, encrypt); } + else + { + gcry_cipher_encrypt_t crypt_fn = + encrypt ? c->spec->encrypt : c->spec->decrypt; + unsigned int burn = 0; + unsigned int nburn; - if (burn > 0) - _gcry_burn_stack (burn + 4 * sizeof(void *)); + for (n = 0; n < nblocks; n++) + { + nburn = crypt_fn (&c->context.c, outbuf, inbuf); + burn = nburn > burn ? 
nburn : burn; + inbuf += blocksize; + outbuf += blocksize; + } + + if (burn > 0) + _gcry_burn_stack (burn + 4 * sizeof(void *)); + } return 0; } @@ -1019,7 +1030,7 @@ do_ecb_encrypt (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen) { - return do_ecb_crypt (c, outbuf, outbuflen, inbuf, inbuflen, c->spec->encrypt); + return do_ecb_crypt (c, outbuf, outbuflen, inbuf, inbuflen, 1); } static gcry_err_code_t @@ -1027,7 +1038,7 @@ do_ecb_decrypt (gcry_cipher_hd_t c, unsigned char *outbuf, size_t outbuflen, const unsigned char *inbuf, size_t inbuflen) { - return do_ecb_crypt (c, outbuf, outbuflen, inbuf, inbuflen, c->spec->decrypt); + return do_ecb_crypt (c, outbuf, outbuflen, inbuf, inbuflen, 0); } diff --git a/cipher/rijndael-aesni.c b/cipher/rijndael-aesni.c index 156af015..906737a6 100644 --- a/cipher/rijndael-aesni.c +++ b/cipher/rijndael-aesni.c @@ -870,7 +870,7 @@ do_aesni_enc_vec8 (const RIJNDAEL_context *ctx) "aesenc %%xmm0, %%xmm10\n\t" "aesenc %%xmm0, %%xmm11\n\t" "movdqa 0xa0(%[key]), %%xmm0\n\t" - "jb .Ldeclast%=\n\t" + "jb .Lenclast%=\n\t" "aesenc %%xmm0, %%xmm1\n\t" "aesenc %%xmm0, %%xmm2\n\t" "aesenc %%xmm0, %%xmm3\n\t" @@ -889,7 +889,7 @@ do_aesni_enc_vec8 (const RIJNDAEL_context *ctx) "aesenc %%xmm0, %%xmm10\n\t" "aesenc %%xmm0, %%xmm11\n\t" "movdqa 0xc0(%[key]), %%xmm0\n\t" - "je .Ldeclast%=\n\t" + "je .Lenclast%=\n\t" "aesenc %%xmm0, %%xmm1\n\t" "aesenc %%xmm0, %%xmm2\n\t" "aesenc %%xmm0, %%xmm3\n\t" @@ -909,7 +909,7 @@ do_aesni_enc_vec8 (const RIJNDAEL_context *ctx) "aesenc %%xmm0, %%xmm11\n\t" "movdqa 0xe0(%[key]), %%xmm0\n" - ".Ldeclast%=:\n\t" + ".Lenclast%=:\n\t" : /* no output */ : [key] "r" (ctx->keyschenc), [rounds] "r" (ctx->rounds) @@ -1717,6 +1717,160 @@ _gcry_aes_aesni_encrypt (const RIJNDAEL_context *ctx, unsigned char *dst, } +void ASM_FUNC_ATTR +_gcry_aes_aesni_ecb_crypt (RIJNDAEL_context *ctx, unsigned char *dst, + const unsigned char *src, size_t nblocks, + int encrypt) +{ + aesni_prepare_2_7_variable; + + aesni_prepare (); + aesni_prepare_2_7(); + + if (!encrypt && !ctx->decryption_prepared) + { + do_aesni_prepare_decryption ( ctx ); + ctx->decryption_prepared = 1; + } + +#ifdef __x86_64__ + if (nblocks >= 8) + { + const void *key = encrypt ? 
ctx->keyschenc : ctx->keyschdec; + aesni_prepare_8_15_variable; + + aesni_prepare_8_15(); + + for (; nblocks >= 8; nblocks -= 8) + { + asm volatile + ("movdqa (%[key]), %%xmm0\n\t" + "movdqu 0*16(%[src]), %%xmm1\n\t" + "movdqu 1*16(%[src]), %%xmm2\n\t" + "movdqu 2*16(%[src]), %%xmm3\n\t" + "movdqu 3*16(%[src]), %%xmm4\n\t" + "movdqu 4*16(%[src]), %%xmm8\n\t" + "movdqu 5*16(%[src]), %%xmm9\n\t" + "movdqu 6*16(%[src]), %%xmm10\n\t" + "movdqu 7*16(%[src]), %%xmm11\n\t" + "pxor %%xmm0, %%xmm1\n\t" + "pxor %%xmm0, %%xmm2\n\t" + "pxor %%xmm0, %%xmm3\n\t" + "pxor %%xmm0, %%xmm4\n\t" + "pxor %%xmm0, %%xmm8\n\t" + "pxor %%xmm0, %%xmm9\n\t" + "pxor %%xmm0, %%xmm10\n\t" + "pxor %%xmm0, %%xmm11\n\t" + : /* No output */ + : [src] "r" (src), + [key] "r" (key) + : "memory"); + + if (encrypt) + { + do_aesni_enc_vec8 (ctx); + asm volatile + ("aesenclast %%xmm0, %%xmm1\n\t" + "aesenclast %%xmm0, %%xmm2\n\t" + "aesenclast %%xmm0, %%xmm3\n\t" + "aesenclast %%xmm0, %%xmm4\n\t" + "aesenclast %%xmm0, %%xmm8\n\t" + "aesenclast %%xmm0, %%xmm9\n\t" + "aesenclast %%xmm0, %%xmm10\n\t" + "aesenclast %%xmm0, %%xmm11\n\t" + ::: "memory" ); + } + else + { + do_aesni_dec_vec8 (ctx); + asm volatile + ("aesdeclast %%xmm0, %%xmm1\n\t" + "aesdeclast %%xmm0, %%xmm2\n\t" + "aesdeclast %%xmm0, %%xmm3\n\t" + "aesdeclast %%xmm0, %%xmm4\n\t" + "aesdeclast %%xmm0, %%xmm8\n\t" + "aesdeclast %%xmm0, %%xmm9\n\t" + "aesdeclast %%xmm0, %%xmm10\n\t" + "aesdeclast %%xmm0, %%xmm11\n\t" + ::: "memory" ); + } + + asm volatile + ("movdqu %%xmm1, 0*16(%[dst])\n\t" + "movdqu %%xmm2, 1*16(%[dst])\n\t" + "movdqu %%xmm3, 2*16(%[dst])\n\t" + "movdqu %%xmm4, 3*16(%[dst])\n\t" + "movdqu %%xmm8, 4*16(%[dst])\n\t" + "movdqu %%xmm9, 5*16(%[dst])\n\t" + "movdqu %%xmm10, 6*16(%[dst])\n\t" + "movdqu %%xmm11, 7*16(%[dst])\n\t" + : /* No output */ + : [dst] "r" (dst) + : "memory"); + + dst += 8*BLOCKSIZE; + src += 8*BLOCKSIZE; + } + + aesni_cleanup_8_15(); + } +#endif + + for (; nblocks >= 4; nblocks -= 4) + { + asm volatile + ("movdqu 0*16(%[src]), %%xmm1\n\t" + "movdqu 1*16(%[src]), %%xmm2\n\t" + "movdqu 2*16(%[src]), %%xmm3\n\t" + "movdqu 3*16(%[src]), %%xmm4\n\t" + : /* No output */ + : [src] "r" (src) + : "memory"); + + if (encrypt) + do_aesni_enc_vec4 (ctx); + else + do_aesni_dec_vec4 (ctx); + + asm volatile + ("movdqu %%xmm1, 0*16(%[dst])\n\t" + "movdqu %%xmm2, 1*16(%[dst])\n\t" + "movdqu %%xmm3, 2*16(%[dst])\n\t" + "movdqu %%xmm4, 3*16(%[dst])\n\t" + : /* No output */ + : [dst] "r" (dst) + : "memory"); + + dst += 4*BLOCKSIZE; + src += 4*BLOCKSIZE; + } + + for (; nblocks; nblocks--) + { + asm volatile ("movdqu %[src], %%xmm0\n\t" + : + : [src] "m" (*src) + : "memory" ); + + if (encrypt) + do_aesni_enc (ctx); + else + do_aesni_dec (ctx); + + asm volatile ("movdqu %%xmm0, %[dst]\n\t" + : [dst] "=m" (*dst) + : + : "memory" ); + + dst += BLOCKSIZE; + src += BLOCKSIZE; + } + + aesni_cleanup (); + aesni_cleanup_2_7 (); +} + + void ASM_FUNC_ATTR _gcry_aes_aesni_cfb_enc (RIJNDAEL_context *ctx, unsigned char *iv, unsigned char *outbuf, const unsigned char *inbuf, diff --git a/cipher/rijndael-armv8-aarch32-ce.S b/cipher/rijndael-armv8-aarch32-ce.S index 1eafa93e..6208652b 100644 --- a/cipher/rijndael-armv8-aarch32-ce.S +++ b/cipher/rijndael-armv8-aarch32-ce.S @@ -653,6 +653,149 @@ _gcry_aes_cbc_dec_armv8_ce: .size _gcry_aes_cbc_dec_armv8_ce,.-_gcry_aes_cbc_dec_armv8_ce; +/* + * void _gcry_aes_ecb_enc_armv8_ce (const void *keysched, + * unsigned char *outbuf, + * const unsigned char *inbuf, + * size_t nblocks, + * unsigned int nrounds); + */ + +.align 3 +.globl 
_gcry_aes_ecb_enc_armv8_ce +.type _gcry_aes_ecb_enc_armv8_ce,%function; +_gcry_aes_ecb_enc_armv8_ce: + /* input: + * r0: keysched + * r1: outbuf + * r2: inbuf + * r3: nblocks + * %st+0: nrounds => r4 + */ + + push {r4-r6,lr} /* 4*4 = 16b */ + cmp r3, #0 + beq .Lecb_enc_skip + ldr r4, [sp, #(16+0)] + vpush {q4-q7} + + cmp r4, #12 + aes_preload_keys(r0, lr); + + beq .Lecb_entry_192e + bhi .Lecb_entry_256e + +#define ECB_CRYPT(bits, e_d, mc_imc, ...) \ + .Lecb_entry_##bits##e_d: \ + cmp r3, #4; \ + blo .Lecb_loop_##bits##e_d; \ + \ + .Lecb_loop4_##bits##e_d: \ + vld1.8 {q1-q2}, [r2]!; /* load plaintext */ \ + sub r3, r3, #4; \ + vld1.8 {q3-q4}, [r2]!; /* load plaintext */ \ + cmp r3, #4; \ + \ + do_aes_4_##bits(e_d, mc_imc, q1, q2, q3, q4, ##__VA_ARGS__); \ + \ + vst1.8 {q1-q2}, [r1]!; /* store ciphertext */ \ + vst1.8 {q3-q4}, [r1]!; /* store ciphertext */ \ + \ + bhs .Lecb_loop4_##bits##e_d; \ + cmp r3, #0; \ + beq .Lecb_done_##e_d; \ + \ + .Lecb_loop_##bits##e_d: \ + vld1.8 {q1}, [r2]!; /* load ciphertext */ \ + subs r3, r3, #1; \ + \ + do_aes_one##bits(e_d, mc_imc, q1, q1, ##__VA_ARGS__); \ + \ + vst1.8 {q1}, [r1]!; /* store plaintext */ \ + bne .Lecb_loop_##bits##e_d; \ + b .Lecb_done_##e_d; + + ECB_CRYPT(128, e, mc) + ECB_CRYPT(192, e, mc, r0, lr) + ECB_CRYPT(256, e, mc, r0, lr) + +.Lecb_done_e: + CLEAR_REG(q0) + CLEAR_REG(q1) + CLEAR_REG(q2) + CLEAR_REG(q3) + CLEAR_REG(q8) + CLEAR_REG(q9) + vpop {q4-q7} + CLEAR_REG(q10) + CLEAR_REG(q11) + CLEAR_REG(q12) + CLEAR_REG(q13) + CLEAR_REG(q14) + +.Lecb_enc_skip: + pop {r4-r6,pc} +.size _gcry_aes_ecb_enc_armv8_ce,.-_gcry_aes_ecb_enc_armv8_ce; + + +/* + * void _gcry_aes_ecb_dec_armv8_ce (const void *keysched, + * unsigned char *outbuf, + * const unsigned char *inbuf, + * size_t nblocks, + * unsigned int nrounds); + */ + +.align 3 +.globl _gcry_aes_ecb_dec_armv8_ce +.type _gcry_aes_ecb_dec_armv8_ce,%function; +_gcry_aes_ecb_dec_armv8_ce: + /* input: + * r0: keysched + * r1: outbuf + * r2: inbuf + * r3: nblocks + * %st+0: nrounds => r4 + */ + + push {r4-r6,lr} /* 4*4 = 16b */ + cmp r3, #0 + beq .Lecb_enc_skip + ldr r4, [sp, #(16+0)] + vpush {q4-q7} + + cmp r4, #12 + + aes_preload_keys(r0, lr); + + beq .Lecb_entry_192d + bhi .Lecb_entry_256d + + ECB_CRYPT(128, d, imc) + ECB_CRYPT(192, d, imc, r0, lr) + ECB_CRYPT(256, d, imc, r0, lr) + +#undef ECB_CRYPT + +.Lecb_done_d: + CLEAR_REG(q0) + CLEAR_REG(q1) + CLEAR_REG(q2) + CLEAR_REG(q3) + CLEAR_REG(q8) + CLEAR_REG(q9) + vpop {q4-q7} + CLEAR_REG(q10) + CLEAR_REG(q11) + CLEAR_REG(q12) + CLEAR_REG(q13) + CLEAR_REG(q14) + +.Lecb_dec_skip: + pop {r4-r6,pc} +.size _gcry_aes_ecb_dec_armv8_ce,.-_gcry_aes_ecb_dec_armv8_ce; + + /* * void _gcry_aes_cfb_enc_armv8_ce (const void *keysched, * unsigned char *outbuf, @@ -1138,7 +1281,7 @@ _gcry_aes_ctr32le_enc_armv8_ce: /* - * void _gcry_aes_ocb_enc_armv8_ce (const void *keysched, + * long _gcry_aes_ocb_enc_armv8_ce (const void *keysched, * unsigned char *outbuf, * const unsigned char *inbuf, * unsigned char *offset, @@ -1305,6 +1448,7 @@ _gcry_aes_ocb_enc_armv8_ce: CLEAR_REG(q13) CLEAR_REG(q14) + mov r0, #0 pop {r4-r12,lr} vpop {q4-q7} bx lr @@ -1312,7 +1456,7 @@ _gcry_aes_ocb_enc_armv8_ce: /* - * void _gcry_aes_ocb_dec_armv8_ce (const void *keysched, + * long _gcry_aes_ocb_dec_armv8_ce (const void *keysched, * unsigned char *outbuf, * const unsigned char *inbuf, * unsigned char *offset, @@ -1479,6 +1623,7 @@ _gcry_aes_ocb_dec_armv8_ce: CLEAR_REG(q13) CLEAR_REG(q14) + mov r0, #0 pop {r4-r12,lr} vpop {q4-q7} bx lr @@ -1486,7 +1631,7 @@ _gcry_aes_ocb_dec_armv8_ce: /* - 
* void _gcry_aes_ocb_auth_armv8_ce (const void *keysched, + * long _gcry_aes_ocb_auth_armv8_ce (const void *keysched, * const unsigned char *abuf, * unsigned char *offset, * unsigned char *checksum, @@ -1632,6 +1777,7 @@ _gcry_aes_ocb_auth_armv8_ce: CLEAR_REG(q13) CLEAR_REG(q14) + mov r0, #0 pop {r4-r12,lr} vpop {q4-q7} bx lr diff --git a/cipher/rijndael-armv8-aarch64-ce.S b/cipher/rijndael-armv8-aarch64-ce.S index 4fef0345..97d3d7eb 100644 --- a/cipher/rijndael-armv8-aarch64-ce.S +++ b/cipher/rijndael-armv8-aarch64-ce.S @@ -385,6 +385,119 @@ _gcry_aes_dec_armv8_ce: ELF(.size _gcry_aes_dec_armv8_ce,.-_gcry_aes_dec_armv8_ce;) +/* + * void _gcry_aes_ecb_enc_armv8_ce (const void *keysched, + * unsigned char *outbuf, + * const unsigned char *inbuf, + * size_t nblocks, unsigned int nrounds); + */ + +.align 3 +.globl _gcry_aes_ecb_enc_armv8_ce +ELF(.type _gcry_aes_ecb_enc_armv8_ce,%function;) +_gcry_aes_ecb_enc_armv8_ce: + /* input: + * x0: keysched + * x1: outbuf + * x2: inbuf + * x3: nblocks + * w4: nrounds + */ + CFI_STARTPROC(); + + cbz x3, .Lecb_enc_skip + + aes_preload_keys(x0, w4); + + b.eq .Lecb_entry_192e + b.hi .Lecb_entry_256e + +#define ECB_CRYPT(bits, e_d, mc_imc) \ + .Lecb_entry_##bits##e_d: \ + cmp x3, #4; \ + b.lo .Lecb_loop_##bits##e_d; \ + \ + .Lecb_loop4_##bits##e_d: \ + sub x3, x3, #4; \ + ld1 {v0.16b-v3.16b}, [x2], #64; /* load ciphertext */ \ + cmp x3, #4; \ + do_aes_4_##bits(e_d, mc_imc, v0, v1, v2, v3); \ + st1 {v0.16b-v3.16b}, [x1], #64; /* store plaintext */ \ + \ + b.hs .Lecb_loop4_##bits##e_d; \ + CLEAR_REG(v1); \ + CLEAR_REG(v2); \ + CLEAR_REG(v3); \ + cbz x3, .Lecb_done_##e_d; \ + \ + .Lecb_loop_##bits##e_d: \ + ld1 {v0.16b}, [x2], #16; /* load ciphertext */ \ + sub x3, x3, #1; \ + do_aes_one##bits(e_d, mc_imc, v0, v0, vk0); \ + st1 {v0.16b}, [x1], #16; /* store plaintext */ \ + \ + cbnz x3, .Lecb_loop_##bits##e_d; \ + b .Lecb_done_##e_d; + + ECB_CRYPT(128, e, mc) + ECB_CRYPT(192, e, mc) + ECB_CRYPT(256, e, mc) + +.Lecb_done_e: + aes_clear_keys(w4) + + CLEAR_REG(v0) + +.Lecb_enc_skip: + ret_spec_stop + CFI_ENDPROC(); +ELF(.size _gcry_aes_ecb_enc_armv8_ce,.-_gcry_aes_ecb_enc_armv8_ce;) + + +/* + * void _gcry_aes_ecb_dec_armv8_ce (const void *keysched, + * unsigned char *outbuf, + * const unsigned char *inbuf, + * size_t nblocks, unsigned int nrounds); + */ + +.align 3 +.globl _gcry_aes_ecb_dec_armv8_ce +ELF(.type _gcry_aes_ecb_dec_armv8_ce,%function;) +_gcry_aes_ecb_dec_armv8_ce: + /* input: + * x0: keysched + * x1: outbuf + * x2: inbuf + * x3: nblocks + * w4: nrounds + */ + CFI_STARTPROC(); + + cbz x3, .Lecb_enc_skip + + aes_preload_keys(x0, w4); + + b.eq .Lecb_entry_192d + b.hi .Lecb_entry_256d + + ECB_CRYPT(128, d, imc) + ECB_CRYPT(192, d, imc) + ECB_CRYPT(256, d, imc) + +#undef ECB_CRYPT + +.Lecb_done_d: + aes_clear_keys(w4) + + CLEAR_REG(v0) + +.Lecb_dec_skip: + ret_spec_stop + CFI_ENDPROC(); +ELF(.size _gcry_aes_ecb_dec_armv8_ce,.-_gcry_aes_ecb_dec_armv8_ce;) + + /* * void _gcry_aes_cbc_enc_armv8_ce (const void *keysched, * unsigned char *outbuf, @@ -471,7 +584,8 @@ ELF(.size _gcry_aes_cbc_enc_armv8_ce,.-_gcry_aes_cbc_enc_armv8_ce;) * void _gcry_aes_cbc_dec_armv8_ce (const void *keysched, * unsigned char *outbuf, * const unsigned char *inbuf, - * unsigned char *iv, unsigned int nrounds); + * unsigned char *iv, + * size_t nblocks, unsigned int nrounds); */ .align 3 @@ -1136,7 +1250,7 @@ ELF(.size _gcry_aes_cfb_dec_armv8_ce,.-_gcry_aes_cfb_dec_armv8_ce;) /* - * void _gcry_aes_ocb_enc_armv8_ce (const void *keysched, + * long _gcry_aes_ocb_enc_armv8_ce (const void 
*keysched, * unsigned char *outbuf, * const unsigned char *inbuf, * unsigned char *offset, @@ -1379,13 +1493,14 @@ _gcry_aes_ocb_enc_armv8_ce: add sp, sp, #128; CFI_ADJUST_CFA_OFFSET(-128); + mov x0, #0 ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_aes_ocb_enc_armv8_ce,.-_gcry_aes_ocb_enc_armv8_ce;) /* - * void _gcry_aes_ocb_dec_armv8_ce (const void *keysched, + * long _gcry_aes_ocb_dec_armv8_ce (const void *keysched, * unsigned char *outbuf, * const unsigned char *inbuf, * unsigned char *offset, @@ -1458,13 +1573,14 @@ _gcry_aes_ocb_dec_armv8_ce: add sp, sp, #128; CFI_ADJUST_CFA_OFFSET(-128); + mov x0, #0 ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_aes_ocb_dec_armv8_ce,.-_gcry_aes_ocb_dec_armv8_ce;) /* - * void _gcry_aes_ocb_auth_armv8_ce (const void *keysched, + * long _gcry_aes_ocb_auth_armv8_ce (const void *keysched, * const unsigned char *abuf, * unsigned char *offset, * unsigned char *checksum, @@ -1605,6 +1721,7 @@ _gcry_aes_ocb_auth_armv8_ce: CLEAR_REG(v2) CLEAR_REG(v16) + mov x0, #0 ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_aes_ocb_auth_armv8_ce,.-_gcry_aes_ocb_auth_armv8_ce;) diff --git a/cipher/rijndael-armv8-ce.c b/cipher/rijndael-armv8-ce.c index c9c37654..042b7d42 100644 --- a/cipher/rijndael-armv8-ce.c +++ b/cipher/rijndael-armv8-ce.c @@ -80,32 +80,32 @@ extern void _gcry_aes_ctr32le_enc_armv8_ce (const void *keysched, unsigned char *iv, size_t nblocks, unsigned int nrounds); -extern void _gcry_aes_ocb_enc_armv8_ce (const void *keysched, - unsigned char *outbuf, - const unsigned char *inbuf, - unsigned char *offset, - unsigned char *checksum, - unsigned char *L_table, - size_t nblocks, - unsigned int nrounds, - unsigned int blkn); -extern void _gcry_aes_ocb_dec_armv8_ce (const void *keysched, - unsigned char *outbuf, - const unsigned char *inbuf, - unsigned char *offset, - unsigned char *checksum, - unsigned char *L_table, - size_t nblocks, - unsigned int nrounds, - unsigned int blkn); -extern void _gcry_aes_ocb_auth_armv8_ce (const void *keysched, - const unsigned char *abuf, - unsigned char *offset, - unsigned char *checksum, - unsigned char *L_table, - size_t nblocks, - unsigned int nrounds, - unsigned int blkn); +extern size_t _gcry_aes_ocb_enc_armv8_ce (const void *keysched, + unsigned char *outbuf, + const unsigned char *inbuf, + unsigned char *offset, + unsigned char *checksum, + unsigned char *L_table, + size_t nblocks, + unsigned int nrounds, + unsigned int blkn); +extern size_t _gcry_aes_ocb_dec_armv8_ce (const void *keysched, + unsigned char *outbuf, + const unsigned char *inbuf, + unsigned char *offset, + unsigned char *checksum, + unsigned char *L_table, + size_t nblocks, + unsigned int nrounds, + unsigned int blkn); +extern size_t _gcry_aes_ocb_auth_armv8_ce (const void *keysched, + const unsigned char *abuf, + unsigned char *offset, + unsigned char *checksum, + unsigned char *L_table, + size_t nblocks, + unsigned int nrounds, + unsigned int blkn); extern void _gcry_aes_xts_enc_armv8_ce (const void *keysched, unsigned char *outbuf, const unsigned char *inbuf, @@ -116,17 +116,14 @@ extern void _gcry_aes_xts_dec_armv8_ce (const void *keysched, const unsigned char *inbuf, unsigned char *tweak, size_t nblocks, unsigned int nrounds); - -typedef void (*ocb_crypt_fn_t) (const void *keysched, unsigned char *outbuf, - const unsigned char *inbuf, - unsigned char *offset, unsigned char *checksum, - unsigned char *L_table, size_t nblocks, - unsigned int nrounds, unsigned int blkn); - -typedef void (*xts_crypt_fn_t) (const void *keysched, unsigned char *outbuf, - const 
unsigned char *inbuf, - unsigned char *tweak, size_t nblocks, - unsigned int nrounds); +extern void _gcry_aes_ecb_enc_armv8_ce (const void *keysched, + unsigned char *outbuf, + const unsigned char *inbuf, + size_t nblocks, unsigned int nrounds); +extern void _gcry_aes_ecb_dec_armv8_ce (const void *keysched, + unsigned char *outbuf, + const unsigned char *inbuf, + size_t nblocks, unsigned int nrounds); void @@ -312,8 +309,6 @@ _gcry_aes_armv8_ce_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, { RIJNDAEL_context *ctx = (void *)&c->context.c; const void *keysched = encrypt ? ctx->keyschenc32 : ctx->keyschdec32; - ocb_crypt_fn_t crypt_fn = encrypt ? _gcry_aes_ocb_enc_armv8_ce - : _gcry_aes_ocb_dec_armv8_ce; unsigned char *outbuf = outbuf_arg; const unsigned char *inbuf = inbuf_arg; unsigned int nrounds = ctx->rounds; @@ -327,10 +322,16 @@ _gcry_aes_armv8_ce_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, c->u_mode.ocb.data_nblocks = blkn + nblocks; - crypt_fn(keysched, outbuf, inbuf, c->u_iv.iv, c->u_ctr.ctr, - c->u_mode.ocb.L[0], nblocks, nrounds, (unsigned int)blkn); - - return 0; + if (encrypt) + return _gcry_aes_ocb_enc_armv8_ce (keysched, outbuf, inbuf, + c->u_iv.iv, c->u_ctr.ctr, + c->u_mode.ocb.L[0], nblocks, nrounds, + (unsigned int)blkn); + else + return _gcry_aes_ocb_dec_armv8_ce (keysched, outbuf, inbuf, + c->u_iv.iv, c->u_ctr.ctr, + c->u_mode.ocb.L[0], nblocks, nrounds, + (unsigned int)blkn); } size_t @@ -345,11 +346,9 @@ _gcry_aes_armv8_ce_ocb_auth (gcry_cipher_hd_t c, void *abuf_arg, c->u_mode.ocb.aad_nblocks = blkn + nblocks; - _gcry_aes_ocb_auth_armv8_ce(keysched, abuf, c->u_mode.ocb.aad_offset, - c->u_mode.ocb.aad_sum, c->u_mode.ocb.L[0], - nblocks, nrounds, (unsigned int)blkn); - - return 0; + return _gcry_aes_ocb_auth_armv8_ce (keysched, abuf, c->u_mode.ocb.aad_offset, + c->u_mode.ocb.aad_sum, c->u_mode.ocb.L[0], + nblocks, nrounds, (unsigned int)blkn); } void @@ -358,8 +357,6 @@ _gcry_aes_armv8_ce_xts_crypt (RIJNDAEL_context *ctx, unsigned char *tweak, size_t nblocks, int encrypt) { const void *keysched = encrypt ? ctx->keyschenc32 : ctx->keyschdec32; - xts_crypt_fn_t crypt_fn = encrypt ? _gcry_aes_xts_enc_armv8_ce - : _gcry_aes_xts_dec_armv8_ce; unsigned int nrounds = ctx->rounds; if ( !encrypt && !ctx->decryption_prepared ) @@ -368,7 +365,32 @@ _gcry_aes_armv8_ce_xts_crypt (RIJNDAEL_context *ctx, unsigned char *tweak, ctx->decryption_prepared = 1; } - crypt_fn(keysched, outbuf, inbuf, tweak, nblocks, nrounds); + if (encrypt) + _gcry_aes_xts_enc_armv8_ce (keysched, outbuf, inbuf, tweak, + nblocks, nrounds); + else + _gcry_aes_xts_dec_armv8_ce (keysched, outbuf, inbuf, tweak, + nblocks, nrounds); } +void +_gcry_aes_armv8_ce_ecb_crypt (void *context, void *outbuf, + const void *inbuf, size_t nblocks, + int encrypt) +{ + RIJNDAEL_context *ctx = context; + const void *keysched = encrypt ? 
ctx->keyschenc32 : ctx->keyschdec32; + unsigned int nrounds = ctx->rounds; + + if ( !encrypt && !ctx->decryption_prepared ) + { + _gcry_aes_armv8_ce_prepare_decryption ( ctx ); + ctx->decryption_prepared = 1; + } + + if (encrypt) + _gcry_aes_ecb_enc_armv8_ce (keysched, outbuf, inbuf, nblocks, nrounds); + else + _gcry_aes_ecb_dec_armv8_ce (keysched, outbuf, inbuf, nblocks, nrounds); +} #endif /* USE_ARM_CE */ diff --git a/cipher/rijndael-vaes-avx2-amd64.S b/cipher/rijndael-vaes-avx2-amd64.S index e36e82a0..655fdf55 100644 --- a/cipher/rijndael-vaes-avx2-amd64.S +++ b/cipher/rijndael-vaes-avx2-amd64.S @@ -2357,7 +2357,7 @@ _gcry_vaes_avx2_ocb_crypt_amd64: ELF(.size _gcry_vaes_avx2_ocb_crypt_amd64,.-_gcry_vaes_avx2_ocb_crypt_amd64) /********************************************************************** - CTR-mode encryption + XTS-mode encryption **********************************************************************/ ELF(.type _gcry_vaes_avx2_xts_crypt_amd64, at function) .globl _gcry_vaes_avx2_xts_crypt_amd64 @@ -2873,6 +2873,436 @@ _gcry_vaes_avx2_xts_crypt_amd64: CFI_ENDPROC(); ELF(.size _gcry_vaes_avx2_xts_crypt_amd64,.-_gcry_vaes_avx2_xts_crypt_amd64) +/********************************************************************** + ECB-mode encryption + **********************************************************************/ +ELF(.type _gcry_vaes_avx2_ecb_crypt_amd64, at function) +.globl _gcry_vaes_avx2_ecb_crypt_amd64 +_gcry_vaes_avx2_ecb_crypt_amd64: + /* input: + * %rdi: round keys + * %esi: encrypt + * %rdx: dst + * %rcx: src + * %r8: nblocks + * %r9: nrounds + */ + CFI_STARTPROC(); + + /* Process 16 blocks per loop. */ +.align 8 +.Lecb_blk16: + cmpq $16, %r8; + jb .Lecb_blk8; + + leaq -16(%r8), %r8; + + /* Load input and xor first key. */ + vbroadcasti128 (0 * 16)(%rdi), %ymm8; + vmovdqu (0 * 16)(%rcx), %ymm0; + vmovdqu (2 * 16)(%rcx), %ymm1; + vmovdqu (4 * 16)(%rcx), %ymm2; + vmovdqu (6 * 16)(%rcx), %ymm3; + vmovdqu (8 * 16)(%rcx), %ymm4; + vmovdqu (10 * 16)(%rcx), %ymm5; + vmovdqu (12 * 16)(%rcx), %ymm6; + vmovdqu (14 * 16)(%rcx), %ymm7; + vpxor %ymm8, %ymm0, %ymm0; + vpxor %ymm8, %ymm1, %ymm1; + vpxor %ymm8, %ymm2, %ymm2; + vpxor %ymm8, %ymm3, %ymm3; + vpxor %ymm8, %ymm4, %ymm4; + vpxor %ymm8, %ymm5, %ymm5; + vpxor %ymm8, %ymm6, %ymm6; + vpxor %ymm8, %ymm7, %ymm7; + vbroadcasti128 (1 * 16)(%rdi), %ymm8; + leaq (16 * 16)(%rcx), %rcx; + + testl %esi, %esi; + jz .Lecb_dec_blk16; + /* AES rounds */ + VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (2 * 16)(%rdi), %ymm8; + VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (3 * 16)(%rdi), %ymm8; + VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (4 * 16)(%rdi), %ymm8; + VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (5 * 16)(%rdi), %ymm8; + VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (6 * 16)(%rdi), %ymm8; + VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (7 * 16)(%rdi), %ymm8; + VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (8 * 16)(%rdi), %ymm8; + VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (9 * 16)(%rdi), %ymm8; + VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (10 * 16)(%rdi), %ymm8; + cmpl $12, %r9d; + jb .Lecb_enc_blk16_last; + VAESENC8(%ymm8, %ymm0, %ymm1, 
%ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (11 * 16)(%rdi), %ymm8; + VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (12 * 16)(%rdi), %ymm8; + jz .Lecb_enc_blk16_last; + VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (13 * 16)(%rdi), %ymm8; + VAESENC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (14 * 16)(%rdi), %ymm8; + .Lecb_enc_blk16_last: + vaesenclast %ymm8, %ymm0, %ymm0; + vaesenclast %ymm8, %ymm1, %ymm1; + vaesenclast %ymm8, %ymm2, %ymm2; + vaesenclast %ymm8, %ymm3, %ymm3; + vaesenclast %ymm8, %ymm4, %ymm4; + vaesenclast %ymm8, %ymm5, %ymm5; + vaesenclast %ymm8, %ymm6, %ymm6; + vaesenclast %ymm8, %ymm7, %ymm7; + jmp .Lecb_blk16_end; + + .align 8 + .Lecb_dec_blk16: + /* AES rounds */ + VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (2 * 16)(%rdi), %ymm8; + VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (3 * 16)(%rdi), %ymm8; + VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (4 * 16)(%rdi), %ymm8; + VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (5 * 16)(%rdi), %ymm8; + VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (6 * 16)(%rdi), %ymm8; + VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (7 * 16)(%rdi), %ymm8; + VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (8 * 16)(%rdi), %ymm8; + VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (9 * 16)(%rdi), %ymm8; + VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (10 * 16)(%rdi), %ymm8; + cmpl $12, %r9d; + jb .Lecb_dec_blk16_last; + VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (11 * 16)(%rdi), %ymm8; + VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (12 * 16)(%rdi), %ymm8; + jz .Lecb_dec_blk16_last; + VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (13 * 16)(%rdi), %ymm8; + VAESDEC8(%ymm8, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7); + vbroadcasti128 (14 * 16)(%rdi), %ymm8; + .Lecb_dec_blk16_last: + vaesdeclast %ymm8, %ymm0, %ymm0; + vaesdeclast %ymm8, %ymm1, %ymm1; + vaesdeclast %ymm8, %ymm2, %ymm2; + vaesdeclast %ymm8, %ymm3, %ymm3; + vaesdeclast %ymm8, %ymm4, %ymm4; + vaesdeclast %ymm8, %ymm5, %ymm5; + vaesdeclast %ymm8, %ymm6, %ymm6; + vaesdeclast %ymm8, %ymm7, %ymm7; + jmp .Lecb_blk16_end; + + .align 8 + .Lecb_blk16_end: + vmovdqu %ymm0, (0 * 16)(%rdx); + vmovdqu %ymm1, (2 * 16)(%rdx); + vmovdqu %ymm2, (4 * 16)(%rdx); + vmovdqu %ymm3, (6 * 16)(%rdx); + vmovdqu %ymm4, (8 * 16)(%rdx); + vmovdqu %ymm5, (10 * 16)(%rdx); + vmovdqu %ymm6, (12 * 16)(%rdx); + vmovdqu %ymm7, (14 * 16)(%rdx); + leaq (16 * 16)(%rdx), %rdx; + + jmp .Lecb_blk16; + + /* Handle trailing eight blocks. */ +.align 8 +.Lecb_blk8: + cmpq $8, %r8; + jmp .Lecb_blk4; + + leaq -8(%r8), %r8; + + /* Load input and xor first key. 
 */
+	vbroadcasti128 (0 * 16)(%rdi), %ymm4;
+	vmovdqu (0 * 16)(%rcx), %ymm0;
+	vmovdqu (2 * 16)(%rcx), %ymm1;
+	vmovdqu (4 * 16)(%rcx), %ymm2;
+	vmovdqu (6 * 16)(%rcx), %ymm3;
+	vpxor %ymm4, %ymm0, %ymm0;
+	vpxor %ymm4, %ymm1, %ymm1;
+	vpxor %ymm4, %ymm2, %ymm2;
+	vpxor %ymm4, %ymm3, %ymm3;
+	vbroadcasti128 (1 * 16)(%rdi), %ymm4;
+	leaq (8 * 16)(%rcx), %rcx;
+
+	testl %esi, %esi;
+	jz .Lecb_dec_blk8;
+	/* AES rounds */
+	VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+	vbroadcasti128 (2 * 16)(%rdi), %ymm4;
+	VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+	vbroadcasti128 (3 * 16)(%rdi), %ymm4;
+	VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+	vbroadcasti128 (4 * 16)(%rdi), %ymm4;
+	VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+	vbroadcasti128 (5 * 16)(%rdi), %ymm4;
+	VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+	vbroadcasti128 (6 * 16)(%rdi), %ymm4;
+	VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+	vbroadcasti128 (7 * 16)(%rdi), %ymm4;
+	VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+	vbroadcasti128 (8 * 16)(%rdi), %ymm4;
+	VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+	vbroadcasti128 (9 * 16)(%rdi), %ymm4;
+	VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+	vbroadcasti128 (10 * 16)(%rdi), %ymm4;
+	cmpl $12, %r9d;
+	jb .Lecb_enc_blk8_last;
+	VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+	vbroadcasti128 (11 * 16)(%rdi), %ymm4;
+	VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+	vbroadcasti128 (12 * 16)(%rdi), %ymm4;
+	jz .Lecb_enc_blk8_last;
+	VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+	vbroadcasti128 (13 * 16)(%rdi), %ymm4;
+	VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+	vbroadcasti128 (14 * 16)(%rdi), %ymm4;
+  .Lecb_enc_blk8_last:
+	vaesenclast %ymm4, %ymm0, %ymm0;
+	vaesenclast %ymm4, %ymm1, %ymm1;
+	vaesenclast %ymm4, %ymm2, %ymm2;
+	vaesenclast %ymm4, %ymm3, %ymm3;
+	vmovdqu %ymm0, (0 * 16)(%rdx);
+	vmovdqu %ymm1, (2 * 16)(%rdx);
+	vmovdqu %ymm2, (4 * 16)(%rdx);
+	vmovdqu %ymm3, (6 * 16)(%rdx);
+	leaq (8 * 16)(%rdx), %rdx;
+	jmp .Lecb_blk4;
+
+  .align 8
+  .Lecb_dec_blk8:
+	/* AES rounds */
+	VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+	vbroadcasti128 (2 * 16)(%rdi), %ymm4;
+	VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+	vbroadcasti128 (3 * 16)(%rdi), %ymm4;
+	VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+	vbroadcasti128 (4 * 16)(%rdi), %ymm4;
+	VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+	vbroadcasti128 (5 * 16)(%rdi), %ymm4;
+	VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+	vbroadcasti128 (6 * 16)(%rdi), %ymm4;
+	VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+	vbroadcasti128 (7 * 16)(%rdi), %ymm4;
+	VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+	vbroadcasti128 (8 * 16)(%rdi), %ymm4;
+	VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+	vbroadcasti128 (9 * 16)(%rdi), %ymm4;
+	VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+	vbroadcasti128 (10 * 16)(%rdi), %ymm4;
+	cmpl $12, %r9d;
+	jb .Lecb_dec_blk8_last;
+	VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+	vbroadcasti128 (11 * 16)(%rdi), %ymm4;
+	VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+	vbroadcasti128 (12 * 16)(%rdi), %ymm4;
+	jz .Lecb_dec_blk8_last;
+	VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+	vbroadcasti128 (13 * 16)(%rdi), %ymm4;
+	VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
+	vbroadcasti128 (14 * 16)(%rdi), %ymm4;
+  .Lecb_dec_blk8_last:
+	vaesdeclast %ymm4, %ymm0, %ymm0;
+	vaesdeclast %ymm4, %ymm1, %ymm1;
+	vaesdeclast %ymm4, %ymm2, %ymm2;
+	vaesdeclast %ymm4, %ymm3, %ymm3;
+	vmovdqu %ymm0, (0 * 16)(%rdx);
+	vmovdqu %ymm1, (2 * 16)(%rdx);
+	vmovdqu %ymm2, (4 * 16)(%rdx);
+	vmovdqu %ymm3, (6 * 16)(%rdx);
+	leaq (8 * 16)(%rdx), %rdx;
+
+	/* Handle trailing four blocks. */
+.align 8
+.Lecb_blk4:
+	cmpq $4, %r8;
+	jb .Lecb_blk1;
+
+	leaq -4(%r8), %r8;
+
+	/* Load input and xor first key. */
+	vbroadcasti128 (0 * 16)(%rdi), %ymm4;
+	vmovdqu (0 * 16)(%rcx), %ymm0;
+	vmovdqu (2 * 16)(%rcx), %ymm1;
+	vpxor %ymm4, %ymm0, %ymm0;
+	vpxor %ymm4, %ymm1, %ymm1;
+	vbroadcasti128 (1 * 16)(%rdi), %ymm4;
+	leaq (4 * 16)(%rcx), %rcx;
+
+	testl %esi, %esi;
+	jz .Lecb_dec_blk4;
+	/* AES rounds */
+	VAESENC2(%ymm4, %ymm0, %ymm1);
+	vbroadcasti128 (2 * 16)(%rdi), %ymm4;
+	VAESENC2(%ymm4, %ymm0, %ymm1);
+	vbroadcasti128 (3 * 16)(%rdi), %ymm4;
+	VAESENC2(%ymm4, %ymm0, %ymm1);
+	vbroadcasti128 (4 * 16)(%rdi), %ymm4;
+	VAESENC2(%ymm4, %ymm0, %ymm1);
+	vbroadcasti128 (5 * 16)(%rdi), %ymm4;
+	VAESENC2(%ymm4, %ymm0, %ymm1);
+	vbroadcasti128 (6 * 16)(%rdi), %ymm4;
+	VAESENC2(%ymm4, %ymm0, %ymm1);
+	vbroadcasti128 (7 * 16)(%rdi), %ymm4;
+	VAESENC2(%ymm4, %ymm0, %ymm1);
+	vbroadcasti128 (8 * 16)(%rdi), %ymm4;
+	VAESENC2(%ymm4, %ymm0, %ymm1);
+	vbroadcasti128 (9 * 16)(%rdi), %ymm4;
+	VAESENC2(%ymm4, %ymm0, %ymm1);
+	vbroadcasti128 (10 * 16)(%rdi), %ymm4;
+	cmpl $12, %r9d;
+	jb .Lecb_enc_blk4_last;
+	VAESENC2(%ymm4, %ymm0, %ymm1);
+	vbroadcasti128 (11 * 16)(%rdi), %ymm4;
+	VAESENC2(%ymm4, %ymm0, %ymm1);
+	vbroadcasti128 (12 * 16)(%rdi), %ymm4;
+	jz .Lecb_enc_blk4_last;
+	VAESENC2(%ymm4, %ymm0, %ymm1);
+	vbroadcasti128 (13 * 16)(%rdi), %ymm4;
+	VAESENC2(%ymm4, %ymm0, %ymm1);
+	vbroadcasti128 (14 * 16)(%rdi), %ymm4;
+  .Lecb_enc_blk4_last:
+	vaesenclast %ymm4, %ymm0, %ymm0;
+	vaesenclast %ymm4, %ymm1, %ymm1;
+	vmovdqu %ymm0, (0 * 16)(%rdx);
+	vmovdqu %ymm1, (2 * 16)(%rdx);
+	leaq (4 * 16)(%rdx), %rdx;
+	jmp .Lecb_blk1;
+
+  .align 8
+  .Lecb_dec_blk4:
+	/* AES rounds */
+	VAESDEC2(%ymm4, %ymm0, %ymm1);
+	vbroadcasti128 (2 * 16)(%rdi), %ymm4;
+	VAESDEC2(%ymm4, %ymm0, %ymm1);
+	vbroadcasti128 (3 * 16)(%rdi), %ymm4;
+	VAESDEC2(%ymm4, %ymm0, %ymm1);
+	vbroadcasti128 (4 * 16)(%rdi), %ymm4;
+	VAESDEC2(%ymm4, %ymm0, %ymm1);
+	vbroadcasti128 (5 * 16)(%rdi), %ymm4;
+	VAESDEC2(%ymm4, %ymm0, %ymm1);
+	vbroadcasti128 (6 * 16)(%rdi), %ymm4;
+	VAESDEC2(%ymm4, %ymm0, %ymm1);
+	vbroadcasti128 (7 * 16)(%rdi), %ymm4;
+	VAESDEC2(%ymm4, %ymm0, %ymm1);
+	vbroadcasti128 (8 * 16)(%rdi), %ymm4;
+	VAESDEC2(%ymm4, %ymm0, %ymm1);
+	vbroadcasti128 (9 * 16)(%rdi), %ymm4;
+	VAESDEC2(%ymm4, %ymm0, %ymm1);
+	vbroadcasti128 (10 * 16)(%rdi), %ymm4;
+	cmpl $12, %r9d;
+	jb .Lecb_dec_blk4_last;
+	VAESDEC2(%ymm4, %ymm0, %ymm1);
+	vbroadcasti128 (11 * 16)(%rdi), %ymm4;
+	VAESDEC2(%ymm4, %ymm0, %ymm1);
+	vbroadcasti128 (12 * 16)(%rdi), %ymm4;
+	jz .Lecb_dec_blk4_last;
+	VAESDEC2(%ymm4, %ymm0, %ymm1);
+	vbroadcasti128 (13 * 16)(%rdi), %ymm4;
+	VAESDEC2(%ymm4, %ymm0, %ymm1);
+	vbroadcasti128 (14 * 16)(%rdi), %ymm4;
+  .Lecb_dec_blk4_last:
+	vaesdeclast %ymm4, %ymm0, %ymm0;
+	vaesdeclast %ymm4, %ymm1, %ymm1;
+	vmovdqu %ymm0, (0 * 16)(%rdx);
+	vmovdqu %ymm1, (2 * 16)(%rdx);
+	leaq (4 * 16)(%rdx), %rdx;
+
+	/* Process trailing one to three blocks, one per loop. */
+.align 8
+.Lecb_blk1:
+	cmpq $1, %r8;
+	jb .Ldone_ecb;
+
+	leaq -1(%r8), %r8;
+
+	/* Load input. */
+	vmovdqu (%rcx), %xmm2;
+	leaq 16(%rcx), %rcx;
+
+	/* Xor first key. */
+	vpxor (0 * 16)(%rdi), %xmm2, %xmm0;
+
+	testl %esi, %esi;
+	jz .Lecb_dec_blk1;
+	/* AES rounds. */
+	vaesenc (1 * 16)(%rdi), %xmm0, %xmm0;
+	vaesenc (2 * 16)(%rdi), %xmm0, %xmm0;
+	vaesenc (3 * 16)(%rdi), %xmm0, %xmm0;
+	vaesenc (4 * 16)(%rdi), %xmm0, %xmm0;
+	vaesenc (5 * 16)(%rdi), %xmm0, %xmm0;
+	vaesenc (6 * 16)(%rdi), %xmm0, %xmm0;
+	vaesenc (7 * 16)(%rdi), %xmm0, %xmm0;
+	vaesenc (8 * 16)(%rdi), %xmm0, %xmm0;
+	vaesenc (9 * 16)(%rdi), %xmm0, %xmm0;
+	vmovdqa (10 * 16)(%rdi), %xmm1;
+	cmpl $12, %r9d;
+	jb .Lecb_enc_blk1_last;
+	vaesenc %xmm1, %xmm0, %xmm0;
+	vaesenc (11 * 16)(%rdi), %xmm0, %xmm0;
+	vmovdqa (12 * 16)(%rdi), %xmm1;
+	jz .Lecb_enc_blk1_last;
+	vaesenc %xmm1, %xmm0, %xmm0;
+	vaesenc (13 * 16)(%rdi), %xmm0, %xmm0;
+	vmovdqa (14 * 16)(%rdi), %xmm1;
+  .Lecb_enc_blk1_last:
+	vaesenclast %xmm1, %xmm0, %xmm0;
+	jmp .Lecb_blk1_end;
+
+  .align 8
+  .Lecb_dec_blk1:
+	/* AES rounds. */
+	vaesdec (1 * 16)(%rdi), %xmm0, %xmm0;
+	vaesdec (2 * 16)(%rdi), %xmm0, %xmm0;
+	vaesdec (3 * 16)(%rdi), %xmm0, %xmm0;
+	vaesdec (4 * 16)(%rdi), %xmm0, %xmm0;
+	vaesdec (5 * 16)(%rdi), %xmm0, %xmm0;
+	vaesdec (6 * 16)(%rdi), %xmm0, %xmm0;
+	vaesdec (7 * 16)(%rdi), %xmm0, %xmm0;
+	vaesdec (8 * 16)(%rdi), %xmm0, %xmm0;
+	vaesdec (9 * 16)(%rdi), %xmm0, %xmm0;
+	vmovdqa (10 * 16)(%rdi), %xmm1;
+	cmpl $12, %r9d;
+	jb .Lecb_dec_blk1_last;
+	vaesdec %xmm1, %xmm0, %xmm0;
+	vaesdec (11 * 16)(%rdi), %xmm0, %xmm0;
+	vmovdqa (12 * 16)(%rdi), %xmm1;
+	jz .Lecb_dec_blk1_last;
+	vaesdec %xmm1, %xmm0, %xmm0;
+	vaesdec (13 * 16)(%rdi), %xmm0, %xmm0;
+	vmovdqa (14 * 16)(%rdi), %xmm1;
+  .Lecb_dec_blk1_last:
+	vaesdeclast %xmm1, %xmm0, %xmm0;
+	jmp .Lecb_blk1_end;
+
+  .align 8
+  .Lecb_blk1_end:
+	vmovdqu %xmm0, (%rdx);
+	leaq 16(%rdx), %rdx;
+
+	jmp .Lecb_blk1;
+
+.align 8
+.Ldone_ecb:
+	vzeroall;
+	ret_spec_stop
+	CFI_ENDPROC();
+ELF(.size _gcry_vaes_avx2_ecb_crypt_amd64,.-_gcry_vaes_avx2_ecb_crypt_amd64)
+
 /**********************************************************************
   constants
  **********************************************************************/
diff --git a/cipher/rijndael-vaes.c b/cipher/rijndael-vaes.c
index dbcf9afa..978c86da 100644
--- a/cipher/rijndael-vaes.c
+++ b/cipher/rijndael-vaes.c
@@ -91,6 +91,32 @@ extern void _gcry_vaes_avx2_xts_crypt_amd64 (const void *keysched,
					      unsigned int nrounds,
					      int encrypt) ASM_FUNC_ABI;
 
+extern void _gcry_vaes_avx2_ecb_crypt_amd64 (const void *keysched,
+					     int encrypt,
+					     void *outbuf_arg,
+					     const void *inbuf_arg,
+					     size_t nblocks,
+					     unsigned int nrounds) ASM_FUNC_ABI;
+
+
+void
+_gcry_aes_vaes_ecb_crypt (void *context, void *outbuf,
+			  const void *inbuf, size_t nblocks,
+			  int encrypt)
+{
+  RIJNDAEL_context *ctx = context;
+  const void *keysched = encrypt ? ctx->keyschenc32 : ctx->keyschdec32;
+  unsigned int nrounds = ctx->rounds;
+
+  if (!encrypt && !ctx->decryption_prepared)
+    {
+      _gcry_aes_aesni_prepare_decryption (ctx);
+      ctx->decryption_prepared = 1;
+    }
+
+  _gcry_vaes_avx2_ecb_crypt_amd64 (keysched, encrypt, outbuf, inbuf,
+				   nblocks, nrounds);
+}
 
 void
 _gcry_aes_vaes_cbc_dec (void *context, unsigned char *iv,
diff --git a/cipher/rijndael.c b/cipher/rijndael.c
index f3060ea5..84cb7109 100644
--- a/cipher/rijndael.c
+++ b/cipher/rijndael.c
@@ -102,6 +102,9 @@ extern size_t _gcry_aes_aesni_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg
 extern void _gcry_aes_aesni_xts_crypt (void *context, unsigned char *tweak,
                                        void *outbuf_arg, const void *inbuf_arg,
                                        size_t nblocks, int encrypt);
+extern void _gcry_aes_aesni_ecb_crypt (void *context, void *outbuf_arg,
+                                       const void *inbuf_arg, size_t nblocks,
+                                       int encrypt);
 #endif
 
 #ifdef USE_VAES
@@ -125,6 +128,9 @@ extern size_t _gcry_aes_vaes_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
 extern void _gcry_aes_vaes_xts_crypt (void *context, unsigned char *tweak,
                                       void *outbuf_arg, const void *inbuf_arg,
                                       size_t nblocks, int encrypt);
+extern void _gcry_aes_vaes_ecb_crypt (void *context, void *outbuf_arg,
+                                      const void *inbuf_arg, size_t nblocks,
+                                      int encrypt);
 #endif
 
 #ifdef USE_SSSE3
@@ -227,6 +233,9 @@ extern void _gcry_aes_armv8_ce_xts_crypt (void *context, unsigned char *tweak,
                                           void *outbuf_arg, const void *inbuf_arg,
                                           size_t nblocks, int encrypt);
+extern void _gcry_aes_armv8_ce_ecb_crypt (void *context, void *outbuf_arg,
+                                          const void *inbuf_arg, size_t nblocks,
+                                          int encrypt);
 #endif /*USE_ARM_ASM*/
 
 #ifdef USE_PPC_CRYPTO
@@ -524,6 +533,7 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen,
       bulk_ops->ocb_crypt = _gcry_aes_aesni_ocb_crypt;
       bulk_ops->ocb_auth = _gcry_aes_aesni_ocb_auth;
       bulk_ops->xts_crypt = _gcry_aes_aesni_xts_crypt;
+      bulk_ops->ecb_crypt = _gcry_aes_aesni_ecb_crypt;
 
 #ifdef USE_VAES
       if ((hwfeatures & HWF_INTEL_VAES_VPCLMUL) &&
@@ -536,6 +546,7 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen,
           bulk_ops->ctr32le_enc = _gcry_aes_vaes_ctr32le_enc;
           bulk_ops->ocb_crypt = _gcry_aes_vaes_ocb_crypt;
           bulk_ops->xts_crypt = _gcry_aes_vaes_xts_crypt;
+          bulk_ops->ecb_crypt = _gcry_aes_vaes_ecb_crypt;
         }
 #endif
     }
@@ -591,6 +602,7 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen,
      bulk_ops->ocb_crypt = _gcry_aes_armv8_ce_ocb_crypt;
      bulk_ops->ocb_auth = _gcry_aes_armv8_ce_ocb_auth;
      bulk_ops->xts_crypt = _gcry_aes_armv8_ce_xts_crypt;
+     bulk_ops->ecb_crypt = _gcry_aes_armv8_ce_ecb_crypt;
     }
 #endif
 #ifdef USE_PPC_CRYPTO_WITH_PPC9LE
-- 
2.37.2
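A note for readers following the series: with the SysV calling convention used
by these amd64 implementations, the arguments of
_gcry_vaes_avx2_ecb_crypt_amd64 (keysched, encrypt, outbuf, inbuf, nblocks,
nrounds) arrive in %rdi, %esi, %rdx, %rcx, %r8 and %r9d, which is why the
assembly above tests %esi to choose the encrypt or decrypt path and walks
%rcx/%rdx in 8-, 4- and 1-block steps while checking %r9d for the key-size
dependent extra rounds. Applications never call this bulk entry point
directly; it is reached through the ordinary cipher API once
bulk_ops->ecb_crypt has been installed in do_setkey. The standalone sketch
below is only an illustration, not part of the patch: it assumes the generic
ECB handler dispatches to bulk_ops->ecb_crypt when it is set (as this series
arranges), and it uses an arbitrary 13-block buffer so that the 8-, 4- and
1-block code paths are all exercised in one call.

/* Hypothetical example: in-place AES-128-ECB over 13 blocks via the
 * public libgcrypt API; key and data contents are arbitrary. */
#include <stdio.h>
#include <string.h>
#include <gcrypt.h>

int
main (void)
{
  gcry_cipher_hd_t hd;
  gcry_error_t err;
  unsigned char key[16];
  unsigned char buf[16 * 13];   /* 13 blocks: one 8-block, one 4-block, one single */

  if (!gcry_check_version (GCRYPT_VERSION))
    return 1;
  gcry_control (GCRYCTL_INITIALIZATION_FINISHED, 0);

  memset (key, 0x2b, sizeof key);
  memset (buf, 0xa5, sizeof buf);

  err = gcry_cipher_open (&hd, GCRY_CIPHER_AES128, GCRY_CIPHER_MODE_ECB, 0);
  if (!err)
    err = gcry_cipher_setkey (hd, key, sizeof key);
  if (!err)
    err = gcry_cipher_encrypt (hd, buf, sizeof buf, NULL, 0); /* in-place */
  if (err)
    {
      fprintf (stderr, "AES-ECB failed: %s\n", gpg_strerror (err));
      return 1;
    }
  gcry_cipher_close (hd);
  return 0;
}

The same flow works for the other key sizes; with GCRY_CIPHER_AES256 the
"cmpl $12, %r9d" checks in the assembly take the extra-round branches, since
nrounds is then 14 instead of 10.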