From jussi.kivilinna at iki.fi Wed Jan 4 11:13:31 2023 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Wed, 4 Jan 2023 12:13:31 +0200 Subject: [PATCH] Fix compiler warnings seen with clang-powerpc64le target Message-ID: <20230104101331.218990-1-jussi.kivilinna@iki.fi> * cipher/rijndael-ppc-common.h (asm_sbox_be): New. * cipher/rijndael-ppc.c (_gcry_aes_sbox4_ppc8): Use 'asm_sbox_be' instead of 'vec_sbox_be' since this instrinsics has different prototype definition on GCC and Clang ('vector uchar' vs 'vector ulong long'). * cipher/sha256-ppc.c (vec_ror_u32): Remove unused function. -- Signed-off-by: Jussi Kivilinna --- cipher/rijndael-ppc-common.h | 10 ++++++++++ cipher/rijndael-ppc.c | 4 ++-- cipher/sha256-ppc.c | 7 ------- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/cipher/rijndael-ppc-common.h b/cipher/rijndael-ppc-common.h index e4a90934..fc8ee526 100644 --- a/cipher/rijndael-ppc-common.h +++ b/cipher/rijndael-ppc-common.h @@ -256,6 +256,16 @@ asm_xor(block a, block b) return res; } +static ASM_FUNC_ATTR_INLINE block +asm_sbox_be(block b) +{ + block o; + __asm__ volatile ("vsbox %0, %1\n\t" + : "=v" (o) + : "v" (b)); + return o; +} + static ASM_FUNC_ATTR_INLINE block asm_cipher_be(block b, block rk) { diff --git a/cipher/rijndael-ppc.c b/cipher/rijndael-ppc.c index 9db1c811..19f6a7e1 100644 --- a/cipher/rijndael-ppc.c +++ b/cipher/rijndael-ppc.c @@ -106,9 +106,9 @@ _gcry_aes_sbox4_ppc8(u32 fourbytes) { vec_u32 vec_fourbyte = { fourbytes, fourbytes, fourbytes, fourbytes }; #ifdef WORDS_BIGENDIAN - return ((vec_u32)vec_sbox_be((block)vec_fourbyte))[1]; + return ((vec_u32)asm_sbox_be((block)vec_fourbyte))[1]; #else - return ((vec_u32)vec_sbox_be((block)vec_fourbyte))[2]; + return ((vec_u32)asm_sbox_be((block)vec_fourbyte))[2]; #endif } diff --git a/cipher/sha256-ppc.c b/cipher/sha256-ppc.c index a9b59714..c49d9ff2 100644 --- a/cipher/sha256-ppc.c +++ b/cipher/sha256-ppc.c @@ -86,13 +86,6 @@ vec_merge_idx0_elems(vector4x_u32 v0, vector4x_u32 v1, } -static ASM_FUNC_ATTR_INLINE vector4x_u32 -vec_ror_u32(vector4x_u32 v, unsigned int shift) -{ - return (v >> (shift & 31)) ^ (v << ((32 - shift) & 31)); -} - - static ASM_FUNC_ATTR_INLINE vector4x_u32 vec_vshasigma_u32(vector4x_u32 v, unsigned int a, unsigned int b) { -- 2.37.2 From jussi.kivilinna at iki.fi Wed Jan 4 18:53:03 2023 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Wed, 4 Jan 2023 19:53:03 +0200 Subject: [PATCH 1/3] Add GMAC-SM4 and Poly1305-SM4 Message-ID: <20230104175305.626195-1-jussi.kivilinna@iki.fi> * cipher/cipher.c (cipher_list_algo301): Remove comma at the end of last entry. * cipher/mac-gmac.c (map_mac_algo_to_cipher): Add SM4. (_gcry_mac_type_spec_gmac_sm4): New. * cipher/max-internal.h (_gcry_mac_type_spec_gmac_sm4) (_gcry_mac_type_spec_poly1305mac_sm4): New. * cipher/mac-poly1305.c (poly1305mac_open): Add SM4. (_gcry_mac_type_spec_poly1305mac_sm4): New. * cipher/mac.c (mac_list, mac_list_algo401, mac_list_algo501): Add GMAC-SM4 and Poly1304-SM4. (mac_list_algo101): Remove comma at the end of last entry. * cipher/md.c (digest_list_algo301): Remove comma at the end of last entry. * doc/gcrypt.texi: Add GCRY_MAC_GMAC_SM4 and GCRY_MAC_POLY1305_SM4. * src/gcrypt.h.in (GCRY_MAC_GMAC_SM4, GCRY_MAC_POLY1305_SM4): New. * tests/bench-slope.c (bench_mac_init): Setup IV for GCRY_MAC_POLY1305_SM4. * tests/benchmark.c (mac_bench): Likewise. 
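The new MAC identifiers slot into the generic gcry_mac_* API like the existing
GMAC and Poly1305 variants; both need a nonce via gcry_mac_setiv() before use
(Poly1305-SM4 takes a 16-byte nonce, which is why bench-slope.c and benchmark.c
gain the setiv handling above).  A minimal GMAC-SM4 sketch, with illustrative
key/nonce/message values and error checking omitted:

  #include <gcrypt.h>

  static void
  gmac_sm4_example (void)
  {
    static const char key[16] = "0123456789abcdef";  /* illustrative key */
    static const char nonce[12] = "illustrative";    /* illustrative nonce */
    const char msg[] = "sample message";
    unsigned char tag[16];
    size_t taglen = sizeof tag;
    gcry_mac_hd_t hd;

    gcry_mac_open (&hd, GCRY_MAC_GMAC_SM4, 0, NULL);
    gcry_mac_setkey (hd, key, sizeof key);
    gcry_mac_setiv (hd, nonce, sizeof nonce);
    gcry_mac_write (hd, msg, sizeof msg - 1);
    gcry_mac_read (hd, tag, &taglen);   /* taglen comes back as 16 */
    gcry_mac_close (hd);
  }
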
-- Signed-off-by: Jussi Kivilinna --- cipher/cipher.c | 6 +++--- cipher/mac-gmac.c | 8 ++++++++ cipher/mac-internal.h | 6 ++++++ cipher/mac-poly1305.c | 9 +++++++++ cipher/mac.c | 22 +++++++++++++++++----- cipher/md.c | 4 ++-- doc/gcrypt.texi | 8 ++++++++ src/gcrypt.h.in | 4 +++- tests/bench-slope.c | 1 + tests/benchmark.c | 2 +- 10 files changed, 58 insertions(+), 12 deletions(-) diff --git a/cipher/cipher.c b/cipher/cipher.c index 026c1511..6f92b75a 100644 --- a/cipher/cipher.c +++ b/cipher/cipher.c @@ -91,7 +91,7 @@ static gcry_cipher_spec_t * const cipher_list[] = #if USE_SM4 &_gcry_cipher_spec_sm4, #endif - NULL + NULL }; /* Cipher implementations starting with index 0 (enum gcry_cipher_algos) */ @@ -207,9 +207,9 @@ static gcry_cipher_spec_t * const cipher_list_algo301[] = NULL, #endif #if USE_SM4 - &_gcry_cipher_spec_sm4, + &_gcry_cipher_spec_sm4 #else - NULL, + NULL #endif }; diff --git a/cipher/mac-gmac.c b/cipher/mac-gmac.c index 12f515eb..5e350010 100644 --- a/cipher/mac-gmac.c +++ b/cipher/mac-gmac.c @@ -45,6 +45,8 @@ map_mac_algo_to_cipher (int mac_algo) return GCRY_CIPHER_SERPENT128; case GCRY_MAC_GMAC_SEED: return GCRY_CIPHER_SEED; + case GCRY_MAC_GMAC_SM4: + return GCRY_CIPHER_SM4; } } @@ -185,3 +187,9 @@ const gcry_mac_spec_t _gcry_mac_type_spec_gmac_camellia = { &gmac_ops }; #endif +#if USE_SM4 +const gcry_mac_spec_t _gcry_mac_type_spec_gmac_sm4 = { + GCRY_MAC_GMAC_SM4, {0, 0}, "GMAC_SM4", + &gmac_ops +}; +#endif diff --git a/cipher/mac-internal.h b/cipher/mac-internal.h index 01998152..39876f55 100644 --- a/cipher/mac-internal.h +++ b/cipher/mac-internal.h @@ -253,6 +253,9 @@ extern const gcry_mac_spec_t _gcry_mac_type_spec_gmac_seed; #if USE_CAMELLIA extern const gcry_mac_spec_t _gcry_mac_type_spec_gmac_camellia; #endif +#if USE_SM4 +extern const gcry_mac_spec_t _gcry_mac_type_spec_gmac_sm4; +#endif /* * The Poly1305 MAC algorithm specifications (mac-poly1305.c). 
@@ -273,3 +276,6 @@ extern const gcry_mac_spec_t _gcry_mac_type_spec_poly1305mac_serpent; #if USE_SEED extern const gcry_mac_spec_t _gcry_mac_type_spec_poly1305mac_seed; #endif +#if USE_SM4 +extern const gcry_mac_spec_t _gcry_mac_type_spec_poly1305mac_sm4; +#endif diff --git a/cipher/mac-poly1305.c b/cipher/mac-poly1305.c index 3abc7774..5b6c489e 100644 --- a/cipher/mac-poly1305.c +++ b/cipher/mac-poly1305.c @@ -83,6 +83,9 @@ poly1305mac_open (gcry_mac_hd_t h) case GCRY_MAC_POLY1305_SEED: cipher_algo = GCRY_CIPHER_SEED; break; + case GCRY_MAC_POLY1305_SM4: + cipher_algo = GCRY_CIPHER_SM4; + break; } err = _gcry_cipher_open_internal (&mac_ctx->hd, cipher_algo, @@ -362,3 +365,9 @@ const gcry_mac_spec_t _gcry_mac_type_spec_poly1305mac_seed = { &poly1305mac_ops }; #endif +#if USE_SM4 +const gcry_mac_spec_t _gcry_mac_type_spec_poly1305mac_sm4 = { + GCRY_MAC_POLY1305_SM4, {0, 0}, "POLY1305_SM4", + &poly1305mac_ops +}; +#endif diff --git a/cipher/mac.c b/cipher/mac.c index ba1eb300..05d2c64c 100644 --- a/cipher/mac.c +++ b/cipher/mac.c @@ -132,8 +132,10 @@ static const gcry_mac_spec_t * const mac_list[] = { &_gcry_mac_type_spec_poly1305mac, #if USE_SM4 &_gcry_mac_type_spec_cmac_sm4, + &_gcry_mac_type_spec_gmac_sm4, + &_gcry_mac_type_spec_poly1305mac_sm4, #endif - NULL, + NULL }; /* HMAC implementations start with index 101 (enum gcry_mac_algos) */ @@ -242,10 +244,10 @@ static const gcry_mac_spec_t * const mac_list_algo101[] = #endif #if USE_SHA512 &_gcry_mac_type_spec_hmac_sha512_256, - &_gcry_mac_type_spec_hmac_sha512_224, + &_gcry_mac_type_spec_hmac_sha512_224 #else NULL, - NULL, + NULL #endif }; @@ -338,7 +340,12 @@ static const gcry_mac_spec_t * const mac_list_algo401[] = NULL, #endif #if USE_SEED - &_gcry_mac_type_spec_gmac_seed + &_gcry_mac_type_spec_gmac_seed, +#else + NULL, +#endif +#if USE_SM4 + &_gcry_mac_type_spec_gmac_sm4 #else NULL #endif @@ -369,7 +376,12 @@ static const gcry_mac_spec_t * const mac_list_algo501[] = NULL, #endif #if USE_SEED - &_gcry_mac_type_spec_poly1305mac_seed + &_gcry_mac_type_spec_poly1305mac_seed, +#else + NULL, +#endif +#if USE_SM4 + &_gcry_mac_type_spec_poly1305mac_sm4 #else NULL #endif diff --git a/cipher/md.c b/cipher/md.c index 34336b5c..40a862f6 100644 --- a/cipher/md.c +++ b/cipher/md.c @@ -240,10 +240,10 @@ static const gcry_md_spec_t * const digest_list_algo301[] = #endif #if USE_SHA512 &_gcry_digest_spec_sha512_256, - &_gcry_digest_spec_sha512_224, + &_gcry_digest_spec_sha512_224 #else NULL, - NULL, + NULL #endif }; diff --git a/doc/gcrypt.texi b/doc/gcrypt.texi index 74615757..db4ad1e6 100644 --- a/doc/gcrypt.texi +++ b/doc/gcrypt.texi @@ -4261,6 +4261,10 @@ block cipher algorithm. This is GMAC message authentication algorithm based on the SEED block cipher algorithm. + at item GCRY_MAC_GMAC_SM4 +This is GMAC message authentication algorithm based on the SM4 +block cipher algorithm. + @item GCRY_MAC_POLY1305 This is plain Poly1305 message authentication algorithm, used with one-time key. @@ -4285,6 +4289,10 @@ key and one-time nonce. This is Poly1305-SEED message authentication algorithm, used with key and one-time nonce. + at item GCRY_MAC_POLY1305_SM4 +This is Poly1305-SM4 message authentication algorithm, used with +key and one-time nonce. + @item GCRY_MAC_GOST28147_IMIT This is MAC construction defined in GOST 28147-89 (see RFC 5830 Section 8). 
diff --git a/src/gcrypt.h.in b/src/gcrypt.h.in index 8451a4ce..47d73339 100644 --- a/src/gcrypt.h.in +++ b/src/gcrypt.h.in @@ -1516,13 +1516,15 @@ enum gcry_mac_algos GCRY_MAC_GMAC_TWOFISH = 403, GCRY_MAC_GMAC_SERPENT = 404, GCRY_MAC_GMAC_SEED = 405, + GCRY_MAC_GMAC_SM4 = 406, GCRY_MAC_POLY1305 = 501, GCRY_MAC_POLY1305_AES = 502, GCRY_MAC_POLY1305_CAMELLIA = 503, GCRY_MAC_POLY1305_TWOFISH = 504, GCRY_MAC_POLY1305_SERPENT = 505, - GCRY_MAC_POLY1305_SEED = 506 + GCRY_MAC_POLY1305_SEED = 506, + GCRY_MAC_POLY1305_SM4 = 507 }; /* Flags used with the open function. */ diff --git a/tests/bench-slope.c b/tests/bench-slope.c index 1cad6813..eb301569 100644 --- a/tests/bench-slope.c +++ b/tests/bench-slope.c @@ -2063,6 +2063,7 @@ bench_mac_init (struct bench_obj *obj) case GCRY_MAC_POLY1305_TWOFISH: case GCRY_MAC_POLY1305_SERPENT: case GCRY_MAC_POLY1305_SEED: + case GCRY_MAC_POLY1305_SM4: gcry_mac_setiv (hd, key, 16); break; } diff --git a/tests/benchmark.c b/tests/benchmark.c index e9223f5a..60abd2cb 100644 --- a/tests/benchmark.c +++ b/tests/benchmark.c @@ -652,7 +652,7 @@ mac_bench ( const char *algoname ) for (i=0; i < bufsize; i++) buf[i] = i; - if (algo >= GCRY_MAC_POLY1305_AES && algo <= GCRY_MAC_POLY1305_SEED) + if (algo >= GCRY_MAC_POLY1305_AES && algo <= GCRY_MAC_POLY1305_SM4) { static const char iv[16] = { 1, 2, 3, 4, }; err = gcry_mac_setiv(hd, iv, sizeof(iv)); -- 2.37.2 From jussi.kivilinna at iki.fi Wed Jan 4 18:53:05 2023 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Wed, 4 Jan 2023 19:53:05 +0200 Subject: [PATCH 3/3] sm4: add missing OCB 16-way GFNI-AVX512 path In-Reply-To: <20230104175305.626195-1-jussi.kivilinna@iki.fi> References: <20230104175305.626195-1-jussi.kivilinna@iki.fi> Message-ID: <20230104175305.626195-3-jussi.kivilinna@iki.fi> * cipher/sm4.c (_gcry_sm4_ocb_crypt) [USE_GFNI_AVX512]: Add 16-way GFNI-AVX512 handling. -- Signed-off-by: Jussi Kivilinna --- cipher/sm4.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/cipher/sm4.c b/cipher/sm4.c index 0e89be78..b0402b64 100644 --- a/cipher/sm4.c +++ b/cipher/sm4.c @@ -1663,6 +1663,26 @@ _gcry_sm4_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, inbuf += 32 * 16; } } + + if (nblocks >= 16) + { + l = bulk_ocb_prepare_L_pointers_array_blk16 (c, Ls, blkn); + + /* Process data in 16 block chunks. */ + blkn += 16; + *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 16); + + if (encrypt) + _gcry_sm4_gfni_avx512_ocb_enc(ctx->rkey_enc, outbuf, inbuf, + c->u_iv.iv, c->u_ctr.ctr, Ls); + else + _gcry_sm4_gfni_avx512_ocb_dec(ctx->rkey_dec, outbuf, inbuf, + c->u_iv.iv, c->u_ctr.ctr, Ls); + + nblocks -= 16; + outbuf += 16 * 16; + inbuf += 16 * 16; + } } #endif -- 2.37.2 From jussi.kivilinna at iki.fi Wed Jan 4 18:53:04 2023 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Wed, 4 Jan 2023 19:53:04 +0200 Subject: [PATCH 2/3] bulkhelp: change bulk function definition to allow modifying context In-Reply-To: <20230104175305.626195-1-jussi.kivilinna@iki.fi> References: <20230104175305.626195-1-jussi.kivilinna@iki.fi> Message-ID: <20230104175305.626195-2-jussi.kivilinna@iki.fi> * cipher/bulkhelp.h (bulk_crypt_fn_t): Make 'ctx' non-constant and change 'num_blks' from 'unsigned int' to 'size_t'. * cipher/camellia-glue.c (camellia_encrypt_blk1_32) (camellia_encrypt_blk1_64, camellia_decrypt_blk1_32) (camellia_decrypt_blk1_64): Adjust to match 'bulk_crypt_fn_t'. * cipher/serpent.c (serpent_crypt_blk1_16, serpent_encrypt_blk1_16) (serpent_decrypt_blk1_16): Likewise. 
* cipher/sm4.c (crypt_blk1_16_fn_t, _gcry_sm4_aesni_avx_crypt_blk1_8) (sm4_aesni_avx_crypt_blk1_16, _gcry_sm4_aesni_avx2_crypt_blk1_16) (sm4_aesni_avx2_crypt_blk1_16, _gcry_sm4_gfni_avx2_crypt_blk1_16) (sm4_gfni_avx2_crypt_blk1_16, _gcry_sm4_gfni_avx512_crypt_blk1_16) (_gcry_sm4_gfni_avx512_crypt_blk32, sm4_gfni_avx512_crypt_blk1_16) (_gcry_sm4_aarch64_crypt_blk1_8, sm4_aarch64_crypt_blk1_16) (_gcry_sm4_armv8_ce_crypt_blk1_8, sm4_armv8_ce_crypt_blk1_16) (_gcry_sm4_armv9_sve_ce_crypt, sm4_armv9_sve_ce_crypt_blk1_16) (sm4_crypt_blocks, sm4_crypt_blk1_32, sm4_encrypt_blk1_32) (sm4_decrypt_blk1_32): Likewise. * cipher/twofish.c (twofish_crypt_blk1_16, twofish_encrypt_blk1_16) (twofish_decrypt_blk1_16): Likewise. -- Signed-off-by: Jussi Kivilinna --- cipher/bulkhelp.h | 4 +-- cipher/camellia-glue.c | 20 ++++++------- cipher/serpent.c | 14 ++++----- cipher/sm4.c | 68 ++++++++++++++++++++---------------------- cipher/twofish.c | 14 ++++----- 5 files changed, 59 insertions(+), 61 deletions(-) diff --git a/cipher/bulkhelp.h b/cipher/bulkhelp.h index b86abc27..833262e2 100644 --- a/cipher/bulkhelp.h +++ b/cipher/bulkhelp.h @@ -32,9 +32,9 @@ typedef u64 ocb_L_uintptr_t; typedef uintptr_t ocb_L_uintptr_t; #endif -typedef unsigned int (*bulk_crypt_fn_t) (const void *ctx, byte *out, +typedef unsigned int (*bulk_crypt_fn_t) (void *ctx, byte *out, const byte *in, - unsigned int num_blks); + size_t num_blks); static inline ocb_L_uintptr_t * diff --git a/cipher/camellia-glue.c b/cipher/camellia-glue.c index a81d586a..2e00f563 100644 --- a/cipher/camellia-glue.c +++ b/cipher/camellia-glue.c @@ -616,8 +616,8 @@ camellia_decrypt(void *c, byte *outbuf, const byte *inbuf) static unsigned int -camellia_encrypt_blk1_32 (const void *priv, byte *outbuf, const byte *inbuf, - unsigned int num_blks) +camellia_encrypt_blk1_32 (void *priv, byte *outbuf, const byte *inbuf, + size_t num_blks) { const CAMELLIA_context *ctx = priv; unsigned int stack_burn_size = 0; @@ -664,10 +664,10 @@ camellia_encrypt_blk1_32 (const void *priv, byte *outbuf, const byte *inbuf, } static unsigned int -camellia_encrypt_blk1_64 (const void *priv, byte *outbuf, const byte *inbuf, - unsigned int num_blks) +camellia_encrypt_blk1_64 (void *priv, byte *outbuf, const byte *inbuf, + size_t num_blks) { - const CAMELLIA_context *ctx = priv; + CAMELLIA_context *ctx = priv; unsigned int stack_burn_size = 0; unsigned int nburn; @@ -696,8 +696,8 @@ camellia_encrypt_blk1_64 (const void *priv, byte *outbuf, const byte *inbuf, } static unsigned int -camellia_decrypt_blk1_32 (const void *priv, byte *outbuf, const byte *inbuf, - unsigned int num_blks) +camellia_decrypt_blk1_32 (void *priv, byte *outbuf, const byte *inbuf, + size_t num_blks) { const CAMELLIA_context *ctx = priv; unsigned int stack_burn_size = 0; @@ -744,10 +744,10 @@ camellia_decrypt_blk1_32 (const void *priv, byte *outbuf, const byte *inbuf, } static unsigned int -camellia_decrypt_blk1_64 (const void *priv, byte *outbuf, const byte *inbuf, - unsigned int num_blks) +camellia_decrypt_blk1_64 (void *priv, byte *outbuf, const byte *inbuf, + size_t num_blks) { - const CAMELLIA_context *ctx = priv; + CAMELLIA_context *ctx = priv; unsigned int stack_burn_size = 0; unsigned int nburn; diff --git a/cipher/serpent.c b/cipher/serpent.c index 0a9ed27c..8fa47c7c 100644 --- a/cipher/serpent.c +++ b/cipher/serpent.c @@ -1557,10 +1557,10 @@ _gcry_serpent_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, static unsigned int -serpent_crypt_blk1_16(const void *context, byte *out, const byte *in, - unsigned int 
num_blks, int encrypt) +serpent_crypt_blk1_16(void *context, byte *out, const byte *in, + size_t num_blks, int encrypt) { - const serpent_context_t *ctx = context; + serpent_context_t *ctx = context; unsigned int burn, burn_stack_depth = 0; #ifdef USE_AVX2 @@ -1612,15 +1612,15 @@ serpent_crypt_blk1_16(const void *context, byte *out, const byte *in, } static unsigned int -serpent_encrypt_blk1_16(const void *ctx, byte *out, const byte *in, - unsigned int num_blks) +serpent_encrypt_blk1_16(void *ctx, byte *out, const byte *in, + size_t num_blks) { return serpent_crypt_blk1_16 (ctx, out, in, num_blks, 1); } static unsigned int -serpent_decrypt_blk1_16(const void *ctx, byte *out, const byte *in, - unsigned int num_blks) +serpent_decrypt_blk1_16(void *ctx, byte *out, const byte *in, + size_t num_blks) { return serpent_crypt_blk1_16 (ctx, out, in, num_blks, 0); } diff --git a/cipher/sm4.c b/cipher/sm4.c index 20852cfb..0e89be78 100644 --- a/cipher/sm4.c +++ b/cipher/sm4.c @@ -141,9 +141,7 @@ static size_t _gcry_sm4_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, static size_t _gcry_sm4_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks); -typedef unsigned int (*crypt_blk1_16_fn_t) (const void *ctx, byte *out, - const byte *in, - unsigned int num_blks); +typedef bulk_crypt_fn_t crypt_blk1_16_fn_t; typedef struct { @@ -274,12 +272,12 @@ extern void _gcry_sm4_aesni_avx_ocb_auth(const u32 *rk_enc, const u64 Ls[8]) ASM_FUNC_ABI; extern unsigned int -_gcry_sm4_aesni_avx_crypt_blk1_8(const u32 *rk, byte *out, const byte *in, +_gcry_sm4_aesni_avx_crypt_blk1_8(u32 *rk, byte *out, const byte *in, unsigned int num_blks) ASM_FUNC_ABI; static inline unsigned int -sm4_aesni_avx_crypt_blk1_16(const void *rk, byte *out, const byte *in, - unsigned int num_blks) +sm4_aesni_avx_crypt_blk1_16(void *rk, byte *out, const byte *in, + size_t num_blks) { if (num_blks > 8) { @@ -328,12 +326,12 @@ extern void _gcry_sm4_aesni_avx2_ocb_auth(const u32 *rk_enc, const u64 Ls[16]) ASM_FUNC_ABI; extern unsigned int -_gcry_sm4_aesni_avx2_crypt_blk1_16(const u32 *rk, byte *out, const byte *in, +_gcry_sm4_aesni_avx2_crypt_blk1_16(u32 *rk, byte *out, const byte *in, unsigned int num_blks) ASM_FUNC_ABI; static inline unsigned int -sm4_aesni_avx2_crypt_blk1_16(const void *rk, byte *out, const byte *in, - unsigned int num_blks) +sm4_aesni_avx2_crypt_blk1_16(void *rk, byte *out, const byte *in, + size_t num_blks) { #ifdef USE_AESNI_AVX /* Use 128-bit register implementation for short input. 
*/ @@ -384,12 +382,12 @@ extern void _gcry_sm4_gfni_avx2_ocb_auth(const u32 *rk_enc, const u64 Ls[16]) ASM_FUNC_ABI; extern unsigned int -_gcry_sm4_gfni_avx2_crypt_blk1_16(const u32 *rk, byte *out, const byte *in, +_gcry_sm4_gfni_avx2_crypt_blk1_16(u32 *rk, byte *out, const byte *in, unsigned int num_blks) ASM_FUNC_ABI; static inline unsigned int -sm4_gfni_avx2_crypt_blk1_16(const void *rk, byte *out, const byte *in, - unsigned int num_blks) +sm4_gfni_avx2_crypt_blk1_16(void *rk, byte *out, const byte *in, + size_t num_blks) { return _gcry_sm4_gfni_avx2_crypt_blk1_16(rk, out, in, num_blks); } @@ -460,16 +458,16 @@ extern void _gcry_sm4_gfni_avx512_ocb_dec_blk32(const u32 *rk_dec, const u64 Ls[32]) ASM_FUNC_ABI; extern unsigned int -_gcry_sm4_gfni_avx512_crypt_blk1_16(const u32 *rk, byte *out, const byte *in, +_gcry_sm4_gfni_avx512_crypt_blk1_16(u32 *rk, byte *out, const byte *in, unsigned int num_blks) ASM_FUNC_ABI; extern unsigned int -_gcry_sm4_gfni_avx512_crypt_blk32(const u32 *rk, byte *out, +_gcry_sm4_gfni_avx512_crypt_blk32(u32 *rk, byte *out, const byte *in) ASM_FUNC_ABI; static inline unsigned int -sm4_gfni_avx512_crypt_blk1_16(const void *rk, byte *out, const byte *in, - unsigned int num_blks) +sm4_gfni_avx512_crypt_blk1_16(void *rk, byte *out, const byte *in, + size_t num_blks) { return _gcry_sm4_gfni_avx512_crypt_blk1_16(rk, out, in, num_blks); } @@ -496,13 +494,13 @@ extern void _gcry_sm4_aarch64_cfb_dec(const u32 *rk_enc, byte *out, byte *iv, size_t nblocks); -extern void _gcry_sm4_aarch64_crypt_blk1_8(const u32 *rk, byte *out, +extern void _gcry_sm4_aarch64_crypt_blk1_8(u32 *rk, byte *out, const byte *in, size_t num_blocks); static inline unsigned int -sm4_aarch64_crypt_blk1_16(const void *rk, byte *out, const byte *in, - unsigned int num_blks) +sm4_aarch64_crypt_blk1_16(void *rk, byte *out, const byte *in, + size_t num_blks) { if (num_blks > 8) { @@ -547,13 +545,13 @@ extern void _gcry_sm4_armv8_ce_xts_crypt(const u32 *rk, byte *out, byte *tweak, size_t nblocks); -extern void _gcry_sm4_armv8_ce_crypt_blk1_8(const u32 *rk, byte *out, +extern void _gcry_sm4_armv8_ce_crypt_blk1_8(u32 *rk, byte *out, const byte *in, size_t num_blocks); static inline unsigned int -sm4_armv8_ce_crypt_blk1_16(const void *rk, byte *out, const byte *in, - unsigned int num_blks) +sm4_armv8_ce_crypt_blk1_16(void *rk, byte *out, const byte *in, + size_t num_blks) { if (num_blks > 8) { @@ -570,7 +568,7 @@ sm4_armv8_ce_crypt_blk1_16(const void *rk, byte *out, const byte *in, #endif /* USE_ARM_CE */ #ifdef USE_ARM_SVE_CE -extern void _gcry_sm4_armv9_sve_ce_crypt(const u32 *rk, byte *out, +extern void _gcry_sm4_armv9_sve_ce_crypt(u32 *rk, byte *out, const byte *in, size_t nblocks); @@ -590,8 +588,8 @@ extern void _gcry_sm4_armv9_sve_ce_cfb_dec(const u32 *rk_enc, byte *out, size_t nblocks); static inline unsigned int -sm4_armv9_sve_ce_crypt_blk1_16(const void *rk, byte *out, const byte *in, - unsigned int num_blks) +sm4_armv9_sve_ce_crypt_blk1_16(void *rk, byte *out, const byte *in, + size_t num_blks) { _gcry_sm4_armv9_sve_ce_crypt(rk, out, in, num_blks); return 0; @@ -934,8 +932,8 @@ sm4_do_crypt_blks2 (const u32 *rk, byte *out, const byte *in) } static unsigned int -sm4_crypt_blocks (const void *ctx, byte *out, const byte *in, - unsigned int num_blks) +sm4_crypt_blocks (void *ctx, byte *out, const byte *in, + size_t num_blks) { const u32 *rk = ctx; unsigned int burn_depth = 0; @@ -1468,8 +1466,8 @@ _gcry_sm4_cfb_dec(void *context, unsigned char *iv, } static unsigned int -sm4_crypt_blk1_32 (const 
SM4_context *ctx, byte *outbuf, const byte *inbuf, - unsigned int num_blks, const u32 *rk) +sm4_crypt_blk1_32 (SM4_context *ctx, byte *outbuf, const byte *inbuf, + size_t num_blks, u32 *rk) { crypt_blk1_16_fn_t crypt_blk1_16 = ctx->crypt_blk1_16; unsigned int stack_burn_size = 0; @@ -1506,18 +1504,18 @@ sm4_crypt_blk1_32 (const SM4_context *ctx, byte *outbuf, const byte *inbuf, } static unsigned int -sm4_encrypt_blk1_32 (const void *context, byte *out, const byte *in, - unsigned int num_blks) +sm4_encrypt_blk1_32 (void *context, byte *out, const byte *in, + size_t num_blks) { - const SM4_context *ctx = context; + SM4_context *ctx = context; return sm4_crypt_blk1_32 (ctx, out, in, num_blks, ctx->rkey_enc); } static unsigned int -sm4_decrypt_blk1_32 (const void *context, byte *out, const byte *in, - unsigned int num_blks) +sm4_decrypt_blk1_32 (void *context, byte *out, const byte *in, + size_t num_blks) { - const SM4_context *ctx = context; + SM4_context *ctx = context; return sm4_crypt_blk1_32 (ctx, out, in, num_blks, ctx->rkey_dec); } diff --git a/cipher/twofish.c b/cipher/twofish.c index 92c463fc..e5eae770 100644 --- a/cipher/twofish.c +++ b/cipher/twofish.c @@ -1541,10 +1541,10 @@ _gcry_twofish_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, static unsigned int -twofish_crypt_blk1_16(const void *context, byte *out, const byte *in, - unsigned int num_blks, int encrypt) +twofish_crypt_blk1_16(void *context, byte *out, const byte *in, + size_t num_blks, int encrypt) { - const TWOFISH_context *ctx = context; + TWOFISH_context *ctx = context; unsigned int burn, burn_stack_depth = 0; #ifdef USE_AVX2 @@ -1584,15 +1584,15 @@ twofish_crypt_blk1_16(const void *context, byte *out, const byte *in, } static unsigned int -twofish_encrypt_blk1_16(const void *ctx, byte *out, const byte *in, - unsigned int num_blks) +twofish_encrypt_blk1_16(void *ctx, byte *out, const byte *in, + size_t num_blks) { return twofish_crypt_blk1_16 (ctx, out, in, num_blks, 1); } static unsigned int -twofish_decrypt_blk1_16(const void *ctx, byte *out, const byte *in, - unsigned int num_blks) +twofish_decrypt_blk1_16(void *ctx, byte *out, const byte *in, + size_t num_blks) { return twofish_crypt_blk1_16 (ctx, out, in, num_blks, 0); } -- 2.37.2 From jussi.kivilinna at iki.fi Fri Jan 6 09:57:18 2023 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Fri, 6 Jan 2023 10:57:18 +0200 Subject: [PATCH 2/2] aria: add generic 2-way bulk processing In-Reply-To: <20230106085718.236228-1-jussi.kivilinna@iki.fi> References: <20230106085718.236228-1-jussi.kivilinna@iki.fi> Message-ID: <20230106085718.236228-2-jussi.kivilinna@iki.fi> * cipher/aria.c (ARIA_context): Add 'bulk_prefetch_ready'. (aria_crypt_2blks, aria_crypt_blocks, aria_enc_blocks, aria_dec_blocks) (_gcry_aria_ctr_enc, _gcry_aria_cbc_enc, _gcry_aria_cbc_dec) (_gcry_aria_cfb_enc, _gcry_aria_cfb_dec, _gcry_aria_ecb_crypt) (_gcry_aria_xts_crypt, _gcry_aria_ctr32le_enc, _gcry_aria_ocb_crypt) (_gcry_aria_ocb_auth): New. (aria_setkey): Setup 'bulk_ops' function pointers. -- Patch adds 2-way parallel generic ARIA implementation for modest performance increase. 
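The gain comes from instruction-level parallelism: two independent block states
are carried through the round sequence side by side, so the table look-ups and
XORs of one block can overlap the other block's dependency chain.  A
self-contained toy sketch of that pattern (a made-up round function, not the
actual ARIA rounds):

  #include <stdint.h>

  /* Stand-in round: any cheap mixing step will do for the illustration. */
  static inline uint32_t
  toy_round (uint32_t x, uint32_t rk)
  {
    x ^= rk;
    x = (x << 7) | (x >> 25);
    return x * 0x9e3779b9u;
  }

  /* Two blocks per iteration: the 'rb' chain never depends on 'ra', so an
   * out-of-order core can execute both chains largely in parallel. */
  static void
  toy_crypt_2blks (uint32_t *a, uint32_t *b, const uint32_t *rk, int rounds)
  {
    uint32_t ra = *a, rb = *b;
    int i;

    for (i = 0; i < rounds; i++)
      {
        ra = toy_round (ra, rk[i]);
        rb = toy_round (rb, rk[i]);
      }

    *a = ra;
    *b = rb;
  }

In the patch below, aria_crypt_2blks applies the same structure to the real
round primitives and aria_crypt_blocks consumes input two blocks at a time,
falling back to single-block processing for an odd trailing block.
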
Benchmark on AMD Ryzen 9 7900X (x86-64) shows ~40% performance improvement for parallelizable modes: ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 2.62 ns/B 364.0 MiB/s 14.74 c/B 5625 ECB dec | 2.61 ns/B 365.2 MiB/s 14.69 c/B 5625 CBC enc | 3.62 ns/B 263.7 MiB/s 20.34 c/B 5625 CBC dec | 2.63 ns/B 363.0 MiB/s 14.78 c/B 5625 CFB enc | 3.59 ns/B 265.3 MiB/s 20.22 c/B 5625 CFB dec | 2.63 ns/B 362.0 MiB/s 14.82 c/B 5625 OFB enc | 3.98 ns/B 239.7 MiB/s 22.38 c/B 5625 OFB dec | 4.00 ns/B 238.2 MiB/s 22.52 c/B 5625 CTR enc | 2.64 ns/B 360.6 MiB/s 14.87 c/B 5624 CTR dec | 2.65 ns/B 360.0 MiB/s 14.90 c/B 5625 XTS enc | 2.68 ns/B 355.8 MiB/s 15.08 c/B 5625 XTS dec | 2.67 ns/B 356.9 MiB/s 15.03 c/B 5625 CCM enc | 6.24 ns/B 152.7 MiB/s 35.12 c/B 5625 CCM dec | 6.25 ns/B 152.5 MiB/s 35.18 c/B 5625 CCM auth | 3.59 ns/B 265.4 MiB/s 20.21 c/B 5625 EAX enc | 6.23 ns/B 153.0 MiB/s 35.06 c/B 5625 EAX dec | 6.23 ns/B 153.1 MiB/s 35.05 c/B 5625 EAX auth | 3.59 ns/B 265.4 MiB/s 20.22 c/B 5625 GCM enc | 2.68 ns/B 355.8 MiB/s 15.08 c/B 5625 GCM dec | 2.69 ns/B 354.7 MiB/s 15.12 c/B 5625 GCM auth | 0.031 ns/B 30832 MiB/s 0.174 c/B 5625 OCB enc | 2.71 ns/B 351.4 MiB/s 15.27 c/B 5625 OCB dec | 2.74 ns/B 347.6 MiB/s 15.43 c/B 5625 OCB auth | 2.64 ns/B 360.8 MiB/s 14.87 c/B 5625 SIV enc | 6.24 ns/B 152.9 MiB/s 35.08 c/B 5625 SIV dec | 6.24 ns/B 152.8 MiB/s 35.10 c/B 5625 SIV auth | 3.59 ns/B 266.0 MiB/s 20.17 c/B 5625 GCM-SIV enc | 2.67 ns/B 356.7 MiB/s 15.04 c/B 5625 GCM-SIV dec | 2.68 ns/B 355.7 MiB/s 15.08 c/B 5625 GCM-SIV auth | 0.034 ns/B 28303 MiB/s 0.190 c/B 5625 Cc: Taehee Yoo Signed-off-by: Jussi Kivilinna --- cipher/aria.c | 479 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 477 insertions(+), 2 deletions(-) diff --git a/cipher/aria.c b/cipher/aria.c index 893763a9..700ea409 100644 --- a/cipher/aria.c +++ b/cipher/aria.c @@ -66,8 +66,9 @@ typedef struct u32 dec_key[ARIA_MAX_RD_KEYS][ARIA_RD_KEY_WORDS]; int rounds; - /* The decryption key schedule is available */ - unsigned int decryption_prepared:1; + unsigned int decryption_prepared:1; /* The decryption key is set up. */ + unsigned int bulk_prefetch_ready:1; /* Look-up table prefetch ready for + * current bulk operation. 
*/ } ARIA_context; @@ -506,6 +507,7 @@ aria_add_round_key(u32 *rk, u32 *t0, u32 *t1, u32 *t2, u32 *t3) *t2 ^= rk[2]; *t3 ^= rk[3]; } + /* Odd round Substitution & Diffusion */ static ALWAYS_INLINE void aria_subst_diff_odd(u32 *t0, u32 *t1, u32 *t2, u32 *t3) @@ -803,6 +805,469 @@ aria_decrypt(void *c, byte *outbuf, const byte *inbuf) } +static unsigned int +aria_crypt_2blks(ARIA_context *ctx, byte *out, const byte *in, + u32 key[][ARIA_RD_KEY_WORDS]) +{ + u32 ra0, ra1, ra2, ra3; + u32 rb0, rb1, rb2, rb3; + int rounds = ctx->rounds; + int rkidx = 0; + + ra0 = buf_get_be32(in + 0); + ra1 = buf_get_be32(in + 4); + ra2 = buf_get_be32(in + 8); + ra3 = buf_get_be32(in + 12); + rb0 = buf_get_be32(in + 16); + rb1 = buf_get_be32(in + 20); + rb2 = buf_get_be32(in + 24); + rb3 = buf_get_be32(in + 28); + + while (1) + { + aria_add_round_key(key[rkidx], &ra0, &ra1, &ra2, &ra3); + aria_add_round_key(key[rkidx], &rb0, &rb1, &rb2, &rb3); + rkidx++; + + aria_subst_diff_odd(&ra0, &ra1, &ra2, &ra3); + aria_subst_diff_odd(&rb0, &rb1, &rb2, &rb3); + aria_add_round_key(key[rkidx], &ra0, &ra1, &ra2, &ra3); + aria_add_round_key(key[rkidx], &rb0, &rb1, &rb2, &rb3); + rkidx++; + + if (rkidx >= rounds) + break; + + aria_subst_diff_even(&ra0, &ra1, &ra2, &ra3); + aria_subst_diff_even(&rb0, &rb1, &rb2, &rb3); + } + + aria_last_round(&ra0, &ra1, &ra2, &ra3); + aria_last_round(&rb0, &rb1, &rb2, &rb3); + aria_add_round_key(key[rkidx], &ra0, &ra1, &ra2, &ra3); + aria_add_round_key(key[rkidx], &rb0, &rb1, &rb2, &rb3); + + buf_put_be32(out + 0, ra0); + buf_put_be32(out + 4, ra1); + buf_put_be32(out + 8, ra2); + buf_put_be32(out + 12, ra3); + buf_put_be32(out + 16, rb0); + buf_put_be32(out + 20, rb1); + buf_put_be32(out + 24, rb2); + buf_put_be32(out + 28, rb3); + + return 4 * sizeof(void *) + 8 * sizeof(u32); /* stack burn depth */ +} + +static unsigned int +aria_crypt_blocks (ARIA_context *ctx, byte *out, const byte *in, + size_t num_blks, u32 key[][ARIA_RD_KEY_WORDS]) +{ + unsigned int burn_depth = 0; + unsigned int nburn; + + if (!ctx->bulk_prefetch_ready) + { + prefetch_sboxes(); + ctx->bulk_prefetch_ready = 1; + } + + while (num_blks >= 2) + { + nburn = aria_crypt_2blks (ctx, out, in, key); + burn_depth = nburn > burn_depth ? nburn : burn_depth; + out += 2 * 16; + in += 2 * 16; + num_blks -= 2; + } + + while (num_blks) + { + nburn = aria_crypt (ctx, out, in, key); + burn_depth = nburn > burn_depth ? nburn : burn_depth; + out += 16; + in += 16; + num_blks--; + } + + if (burn_depth) + burn_depth += sizeof(void *) * 5; + return burn_depth; +} + +static unsigned int +aria_enc_blocks (void *c, byte *out, const byte *in, size_t num_blks) +{ + ARIA_context *ctx = (ARIA_context *)c; + + return aria_crypt_blocks (ctx, out, in, num_blks, ctx->enc_key); +} + +static unsigned int +aria_dec_blocks (void *c, byte *out, const byte *in, size_t num_blks) +{ + ARIA_context *ctx = (ARIA_context *)c; + + return aria_crypt_blocks (ctx, out, in, num_blks, ctx->dec_key); +} + + +/* Bulk encryption of complete blocks in CTR mode. This function is only + intended for the bulk encryption feature of cipher.c. CTR is expected to be + of size 16. */ +static void +_gcry_aria_ctr_enc(void *context, unsigned char *ctr, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks) +{ + ARIA_context *ctx = context; + byte *outbuf = outbuf_arg; + const byte *inbuf = inbuf_arg; + int burn_stack_depth = 0; + + /* Process remaining blocks. 
*/ + if (nblocks) + { + byte tmpbuf[16 * ARIA_BLOCK_SIZE]; + unsigned int tmp_used = ARIA_BLOCK_SIZE; + size_t nburn; + + ctx->bulk_prefetch_ready = 0; + + nburn = bulk_ctr_enc_128(ctx, aria_enc_blocks, outbuf, inbuf, + nblocks, ctr, tmpbuf, + sizeof(tmpbuf) / ARIA_BLOCK_SIZE, &tmp_used); + burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth; + + wipememory (tmpbuf, tmp_used); + } + + if (burn_stack_depth) + _gcry_burn_stack (burn_stack_depth); +} + +/* Bulk encryption of complete blocks in CBC mode. */ +static void +_gcry_aria_cbc_enc (void *context, unsigned char *iv, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks, int cbc_mac) +{ + ARIA_context *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + unsigned char *last_iv; + unsigned int burn_depth = 0; + + prefetch_sboxes(); + + last_iv = iv; + + for (; nblocks; nblocks--) + { + cipher_block_xor (outbuf, inbuf, last_iv, ARIA_BLOCK_SIZE); + + burn_depth = aria_crypt (ctx, outbuf, outbuf, ctx->enc_key); + + last_iv = outbuf; + inbuf += ARIA_BLOCK_SIZE; + if (!cbc_mac) + outbuf += ARIA_BLOCK_SIZE; + } + + if (last_iv != iv) + cipher_block_cpy (iv, last_iv, ARIA_BLOCK_SIZE); + + if (burn_depth) + _gcry_burn_stack (burn_depth + 4 * sizeof(void *)); +} + +/* Bulk decryption of complete blocks in CBC mode. This function is only + intended for the bulk encryption feature of cipher.c. */ +static void +_gcry_aria_cbc_dec(void *context, unsigned char *iv, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks) +{ + ARIA_context *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + int burn_stack_depth = 0; + + if (!ctx->decryption_prepared) + { + aria_set_decrypt_key (ctx); + ctx->decryption_prepared = 1; + } + + /* Process remaining blocks. */ + if (nblocks) + { + unsigned char tmpbuf[16 * ARIA_BLOCK_SIZE]; + unsigned int tmp_used = ARIA_BLOCK_SIZE; + size_t nburn; + + ctx->bulk_prefetch_ready = 0; + + nburn = bulk_cbc_dec_128(ctx, aria_dec_blocks, outbuf, inbuf, + nblocks, iv, tmpbuf, + sizeof(tmpbuf) / ARIA_BLOCK_SIZE, &tmp_used); + burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth; + + wipememory (tmpbuf, tmp_used); + } + + if (burn_stack_depth) + _gcry_burn_stack (burn_stack_depth); +} + +/* Bulk encryption of complete blocks in CFB mode. */ +static void +_gcry_aria_cfb_enc (void *context, unsigned char *iv, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks) +{ + ARIA_context *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + unsigned int burn_depth = 0; + + prefetch_sboxes(); + + for (; nblocks; nblocks--) + { + /* Encrypt the IV. */ + burn_depth = aria_crypt (ctx, iv, iv, ctx->enc_key); + /* XOR the input with the IV and store input into IV. */ + cipher_block_xor_2dst(outbuf, iv, inbuf, ARIA_BLOCK_SIZE); + outbuf += ARIA_BLOCK_SIZE; + inbuf += ARIA_BLOCK_SIZE; + } + + if (burn_depth) + _gcry_burn_stack (burn_depth + 4 * sizeof(void *)); +} + +/* Bulk decryption of complete blocks in CFB mode. This function is only + intended for the bulk encryption feature of cipher.c. */ +static void +_gcry_aria_cfb_dec(void *context, unsigned char *iv, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks) +{ + ARIA_context *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + int burn_stack_depth = 0; + + /* Process remaining blocks. 
*/ + if (nblocks) + { + unsigned char tmpbuf[16 * ARIA_BLOCK_SIZE]; + unsigned int tmp_used = ARIA_BLOCK_SIZE; + size_t nburn; + + ctx->bulk_prefetch_ready = 0; + + nburn = bulk_cfb_dec_128(ctx, aria_enc_blocks, outbuf, inbuf, + nblocks, iv, tmpbuf, + sizeof(tmpbuf) / ARIA_BLOCK_SIZE, &tmp_used); + burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth; + + wipememory (tmpbuf, tmp_used); + } + + if (burn_stack_depth) + _gcry_burn_stack (burn_stack_depth); +} + +/* Bulk encryption/decryption in ECB mode. */ +static void +_gcry_aria_ecb_crypt (void *context, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks, int encrypt) +{ + ARIA_context *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + int burn_stack_depth = 0; + + if (!encrypt && !ctx->decryption_prepared) + { + aria_set_decrypt_key (ctx); + ctx->decryption_prepared = 1; + } + + /* Process remaining blocks. */ + if (nblocks) + { + bulk_crypt_fn_t crypt_blk1_16; + size_t nburn; + + ctx->bulk_prefetch_ready = 0; + crypt_blk1_16 = encrypt ? aria_enc_blocks : aria_dec_blocks; + + nburn = bulk_ecb_crypt_128(ctx, crypt_blk1_16, + outbuf, inbuf, nblocks, 16); + burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth; + } + + if (burn_stack_depth) + _gcry_burn_stack (burn_stack_depth); +} + +/* Bulk encryption/decryption of complete blocks in XTS mode. */ +static void +_gcry_aria_xts_crypt (void *context, unsigned char *tweak, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks, int encrypt) +{ + ARIA_context *ctx = context; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + int burn_stack_depth = 0; + + if (!encrypt && !ctx->decryption_prepared) + { + aria_set_decrypt_key (ctx); + ctx->decryption_prepared = 1; + } + + /* Process remaining blocks. */ + if (nblocks) + { + unsigned char tmpbuf[16 * ARIA_BLOCK_SIZE]; + unsigned int tmp_used = ARIA_BLOCK_SIZE; + bulk_crypt_fn_t crypt_blk1_16; + size_t nburn; + + ctx->bulk_prefetch_ready = 0; + crypt_blk1_16 = encrypt ? aria_enc_blocks : aria_dec_blocks; + + nburn = bulk_xts_crypt_128(ctx, crypt_blk1_16, + outbuf, inbuf, nblocks, + tweak, tmpbuf, + sizeof(tmpbuf) / ARIA_BLOCK_SIZE, + &tmp_used); + burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth; + + wipememory (tmpbuf, tmp_used); + } + + if (burn_stack_depth) + _gcry_burn_stack (burn_stack_depth); +} + +/* Bulk encryption of complete blocks in CTR32LE mode (for GCM-SIV). */ +static void +_gcry_aria_ctr32le_enc(void *context, unsigned char *ctr, + void *outbuf_arg, const void *inbuf_arg, + size_t nblocks) +{ + ARIA_context *ctx = context; + byte *outbuf = outbuf_arg; + const byte *inbuf = inbuf_arg; + int burn_stack_depth = 0; + + /* Process remaining blocks. */ + if (nblocks) + { + unsigned char tmpbuf[16 * ARIA_BLOCK_SIZE]; + unsigned int tmp_used = ARIA_BLOCK_SIZE; + size_t nburn; + + ctx->bulk_prefetch_ready = 0; + + nburn = bulk_ctr32le_enc_128 (ctx, aria_enc_blocks, outbuf, inbuf, + nblocks, ctr, tmpbuf, + sizeof(tmpbuf) / ARIA_BLOCK_SIZE, + &tmp_used); + burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth; + + wipememory (tmpbuf, tmp_used); + } + + if (burn_stack_depth) + _gcry_burn_stack (burn_stack_depth); +} + +/* Bulk encryption/decryption of complete blocks in OCB mode. 
*/ +static size_t +_gcry_aria_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, + const void *inbuf_arg, size_t nblocks, int encrypt) +{ + ARIA_context *ctx = (void *)&c->context.c; + unsigned char *outbuf = outbuf_arg; + const unsigned char *inbuf = inbuf_arg; + u64 blkn = c->u_mode.ocb.data_nblocks; + int burn_stack_depth = 0; + + if (!encrypt && !ctx->decryption_prepared) + { + aria_set_decrypt_key (ctx); + ctx->decryption_prepared = 1; + } + + /* Process remaining blocks. */ + if (nblocks) + { + unsigned char tmpbuf[16 * ARIA_BLOCK_SIZE]; + unsigned int tmp_used = ARIA_BLOCK_SIZE; + bulk_crypt_fn_t crypt_blk1_16; + size_t nburn; + + ctx->bulk_prefetch_ready = 0; + crypt_blk1_16 = encrypt ? aria_enc_blocks : aria_dec_blocks; + + nburn = bulk_ocb_crypt_128 (c, ctx, crypt_blk1_16, outbuf, inbuf, nblocks, + &blkn, encrypt, tmpbuf, + sizeof(tmpbuf) / ARIA_BLOCK_SIZE, + &tmp_used); + burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth; + + wipememory (tmpbuf, tmp_used); + } + + c->u_mode.ocb.data_nblocks = blkn; + + if (burn_stack_depth) + _gcry_burn_stack (burn_stack_depth); + + return 0; +} + +/* Bulk authentication of complete blocks in OCB mode. */ +static size_t +_gcry_aria_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks) +{ + ARIA_context *ctx = (void *)&c->context.c; + const unsigned char *abuf = abuf_arg; + u64 blkn = c->u_mode.ocb.aad_nblocks; + int burn_stack_depth = 0; + + /* Process remaining blocks. */ + if (nblocks) + { + unsigned char tmpbuf[16 * ARIA_BLOCK_SIZE]; + unsigned int tmp_used = ARIA_BLOCK_SIZE; + size_t nburn; + + ctx->bulk_prefetch_ready = 0; + + nburn = bulk_ocb_auth_128 (c, ctx, aria_enc_blocks, abuf, nblocks, + &blkn, tmpbuf, + sizeof(tmpbuf) / ARIA_BLOCK_SIZE, &tmp_used); + burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth; + + wipememory (tmpbuf, tmp_used); + } + + c->u_mode.ocb.aad_nblocks = blkn; + + if (burn_stack_depth) + _gcry_burn_stack (burn_stack_depth); + + return 0; +} + + static gcry_err_code_t aria_setkey(void *c, const byte *key, unsigned keylen, cipher_bulk_ops_t *bulk_ops) @@ -827,6 +1292,16 @@ aria_setkey(void *c, const byte *key, unsigned keylen, /* Setup bulk encryption routines. */ memset (bulk_ops, 0, sizeof(*bulk_ops)); + bulk_ops->cbc_enc = _gcry_aria_cbc_enc; + bulk_ops->cbc_dec = _gcry_aria_cbc_dec; + bulk_ops->cfb_enc = _gcry_aria_cfb_enc; + bulk_ops->cfb_dec = _gcry_aria_cfb_dec; + bulk_ops->ctr_enc = _gcry_aria_ctr_enc; + bulk_ops->ctr32le_enc = _gcry_aria_ctr32le_enc; + bulk_ops->ecb_crypt = _gcry_aria_ecb_crypt; + bulk_ops->xts_crypt = _gcry_aria_xts_crypt; + bulk_ops->ocb_crypt = _gcry_aria_ocb_crypt; + bulk_ops->ocb_auth = _gcry_aria_ocb_auth; /* Setup context and encryption key. */ ctx->decryption_prepared = 0; -- 2.37.2 From jussi.kivilinna at iki.fi Fri Jan 6 09:57:17 2023 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Fri, 6 Jan 2023 10:57:17 +0200 Subject: [PATCH 1/2] Add ARIA block cipher Message-ID: <20230106085718.236228-1-jussi.kivilinna@iki.fi> * cipher/Makefile.am: Add 'aria.c'. * cipher/aria.c: New. * cipher/cipher.c (cipher_list, cipher_list_algo301): Add ARIA cipher specs. * cipher/mac-cmac.c (map_mac_algo_to_cipher): Add GCRY_MAC_CMAC_ARIA. (_gcry_mac_type_spec_cmac_aria): New. * cipher/mac-gmac.c (map_mac_algo_to_cipher): Add GCRY_MAC_GMAC_ARIA. (_gcry_mac_type_spec_gmac_aria): New. * cipher/mac-internal.h (_gcry_mac_type_spec_cmac_aria) (_gcry_mac_type_spec_gmac_aria) (_gcry_mac_type_spec_poly1305mac_aria): New. 
* cipher/mac-poly1305.c (poly1305mac_open): Add GCRY_MAC_GMAC_ARIA. (_gcry_mac_type_spec_poly1305mac_aria): New. * cipher/mac.c (mac_list, mac_list_algo201, mac_list_algo401) (mac_list_algo501): Add ARIA MAC specs. * configure.ac (available_ciphers): Add 'aria'. (GCRYPT_CIPHERS): Add 'aria.lo'. (USE_ARIA): New. * doc/gcrypt.texi: Add GCRY_CIPHER_ARIA128, GCRY_CIPHER_ARIA192, GCRY_CIPHER_ARIA256, GCRY_MAC_CMAC_ARIA, GCRY_MAC_GMAC_ARIA and GCRY_MAC_POLY1305_ARIA. * src/cipher.h (_gcry_cipher_spec_aria128, _gcry_cipher_spec_aria192) (_gcry_cipher_spec_aria256): New. * src/gcrypt.h.in (gcry_cipher_algos): Add GCRY_CIPHER_ARIA128, GCRY_CIPHER_ARIA192 and GCRY_CIPHER_ARIA256. (gcry_mac_algos): GCRY_MAC_CMAC_ARIA, GCRY_MAC_GMAC_ARIA and GCRY_MAC_POLY1305_ARIA. * tests/basic.c (check_ecb_cipher, check_ctr_cipher) (check_cfb_cipher, check_ocb_cipher) [USE_ARIA]: Add ARIA test-vectors. (check_ciphers) [USE_ARIA]: Add GCRY_CIPHER_ARIA128, GCRY_CIPHER_ARIA192 and GCRY_CIPHER_ARIA256. (main): Also run 'check_bulk_cipher_modes' for 'cipher_modes_only'-mode. * tests/bench-slope.c (bench_mac_init): Add GCRY_MAC_POLY1305_ARIA setiv-handling. * tests/benchmark.c (mac_bench): Likewise. -- This patch adds ARIA block cipher for libgcrypt. This implementation is based on work by Taehee Yoo, with following notable changes: - Integration to libgcrypt, use of bithelp.h and bufhelp.h helper functions where possible. - Added lookup table prefetching as is done in AES, GCM and SM4 implementations. - Changed `get_u8` to return `u32` as returning `byte` caused sub-optimal code generation with gcc-12/x86-64 (zero extending from 8-bit to 32-bit register, followed by extraneous sign extending from 32-bit to 64-bit register). - Changed 'aria_crypt' loop structure a bit for tiny performance increase (~1% seen with gcc-12/x86-64/zen4). 
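On the API side the new identifiers behave like any other block cipher; a
minimal ARIA-128 ECB sketch (illustrative key, error handling omitted,
libgcrypt assumed to be initialized):

  #include <gcrypt.h>

  static void
  aria128_ecb_example (void)
  {
    static const char key[16] = "aria-example-key";  /* illustrative key */
    unsigned char block[16] = { 0 };                 /* one 16-byte block */
    unsigned char out[16];
    gcry_cipher_hd_t hd;

    gcry_cipher_open (&hd, GCRY_CIPHER_ARIA128, GCRY_CIPHER_MODE_ECB, 0);
    gcry_cipher_setkey (hd, key, sizeof key);
    gcry_cipher_encrypt (hd, out, sizeof out, block, sizeof block);
    gcry_cipher_close (hd);
  }
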
Benchmark on AMD Ryzen 9 7900X (x86-64): ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 3.99 ns/B 239.1 MiB/s 22.43 c/B 5625 ECB dec | 4.00 ns/B 238.4 MiB/s 22.50 c/B 5625 Benchmark on AMD Ryzen 9 7900X (win32): ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 4.57 ns/B 208.7 MiB/s 25.31 c/B 5538 ECB dec | 4.66 ns/B 204.8 MiB/s 25.39 c/B 5453 Benchmark on ARM Cortex-A53 (aarch64): ARIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz ECB enc | 74.69 ns/B 12.77 MiB/s 48.40 c/B 647.9 ECB dec | 74.99 ns/B 12.72 MiB/s 48.58 c/B 647.9 Cc: Taehee Yoo Signed-off-by: Jussi Kivilinna --- cipher/Makefile.am | 1 + cipher/aria.c | 928 ++++++++++++++++++++++++++++++++++++++++++ cipher/cipher.c | 16 +- cipher/mac-cmac.c | 8 + cipher/mac-gmac.c | 8 + cipher/mac-internal.h | 9 + cipher/mac-poly1305.c | 9 + cipher/mac.c | 26 +- configure.ac | 8 +- doc/gcrypt.texi | 21 + src/cipher.h | 3 + src/gcrypt.h.in | 10 +- tests/basic.c | 453 +++++++++++++++++++++ tests/bench-slope.c | 1 + tests/benchmark.c | 2 +- 15 files changed, 1495 insertions(+), 8 deletions(-) create mode 100644 cipher/aria.c diff --git a/cipher/Makefile.am b/cipher/Makefile.am index 8e47e5be..7ebcd179 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -77,6 +77,7 @@ EXTRA_libcipher_la_SOURCES = \ asm-poly1305-aarch64.h \ asm-poly1305-amd64.h \ asm-poly1305-s390x.h \ + aria.c \ arcfour.c arcfour-amd64.S \ blowfish.c blowfish-amd64.S blowfish-arm.S \ cast5.c cast5-amd64.S cast5-arm.S \ diff --git a/cipher/aria.c b/cipher/aria.c new file mode 100644 index 00000000..893763a9 --- /dev/null +++ b/cipher/aria.c @@ -0,0 +1,928 @@ +/* aria.c - ARIA Cipher Algorithm + * + * Copyright (C) 2022-2023 Taehee Yoo + * Copyright (C) 2023 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . + */ + +#include + +#include "types.h" +#include "g10lib.h" +#include "cipher.h" +#include "bufhelp.h" +#include "cipher-internal.h" +#include "bulkhelp.h" + +/* Attribute macro to force alignment to 64 bytes. */ +#ifdef HAVE_GCC_ATTRIBUTE_ALIGNED +# define ATTR_ALIGNED_64 __attribute__ ((aligned (64))) +#else +# define ATTR_ALIGNED_64 +#endif + +/* Attribute macro to force inlining of function. */ +#if __GNUC__ >= 4 +# define ALWAYS_INLINE inline __attribute__ ((always_inline)) +#else +# define ALWAYS_INLINE inline +#endif + +/* Attribute macro to prevent inlining of function. 
*/ +#if __GNUC__ >= 4 +# define NO_INLINE __attribute__ ((noinline)) +#else +# define NO_INLINE +#endif + + +static const char *aria_selftest (void); + + +#define ARIA_MIN_KEY_SIZE 16 +#define ARIA_MAX_KEY_SIZE 32 +#define ARIA_BLOCK_SIZE 16 +#define ARIA_MAX_RD_KEYS 17 +#define ARIA_RD_KEY_WORDS (ARIA_BLOCK_SIZE / sizeof(u32)) + + +typedef struct +{ + u32 enc_key[ARIA_MAX_RD_KEYS][ARIA_RD_KEY_WORDS]; + u32 dec_key[ARIA_MAX_RD_KEYS][ARIA_RD_KEY_WORDS]; + int rounds; + + /* The decryption key schedule is available */ + unsigned int decryption_prepared:1; +} ARIA_context; + + +static const u32 key_rc[20] = + { + 0x517cc1b7, 0x27220a94, 0xfe13abe8, 0xfa9a6ee0, + 0x6db14acc, 0x9e21c820, 0xff28b1d5, 0xef5de2b0, + 0xdb92371d, 0x2126e970, 0x03249775, 0x04e8c90e, + 0x517cc1b7, 0x27220a94, 0xfe13abe8, 0xfa9a6ee0, + 0x6db14acc, 0x9e21c820, 0xff28b1d5, 0xef5de2b0 + }; + + +static struct +{ + volatile u32 counter_head; + u32 cacheline_align[64 / 4 - 1]; + u32 s1[256]; + u32 s2[256]; + u32 x1[256]; + u32 x2[256]; + volatile u32 counter_tail; +} sboxes ATTR_ALIGNED_64 = + { + 0, + { 0, }, + + { /* s1 */ + 0x00636363, 0x007c7c7c, 0x00777777, 0x007b7b7b, + 0x00f2f2f2, 0x006b6b6b, 0x006f6f6f, 0x00c5c5c5, + 0x00303030, 0x00010101, 0x00676767, 0x002b2b2b, + 0x00fefefe, 0x00d7d7d7, 0x00ababab, 0x00767676, + 0x00cacaca, 0x00828282, 0x00c9c9c9, 0x007d7d7d, + 0x00fafafa, 0x00595959, 0x00474747, 0x00f0f0f0, + 0x00adadad, 0x00d4d4d4, 0x00a2a2a2, 0x00afafaf, + 0x009c9c9c, 0x00a4a4a4, 0x00727272, 0x00c0c0c0, + 0x00b7b7b7, 0x00fdfdfd, 0x00939393, 0x00262626, + 0x00363636, 0x003f3f3f, 0x00f7f7f7, 0x00cccccc, + 0x00343434, 0x00a5a5a5, 0x00e5e5e5, 0x00f1f1f1, + 0x00717171, 0x00d8d8d8, 0x00313131, 0x00151515, + 0x00040404, 0x00c7c7c7, 0x00232323, 0x00c3c3c3, + 0x00181818, 0x00969696, 0x00050505, 0x009a9a9a, + 0x00070707, 0x00121212, 0x00808080, 0x00e2e2e2, + 0x00ebebeb, 0x00272727, 0x00b2b2b2, 0x00757575, + 0x00090909, 0x00838383, 0x002c2c2c, 0x001a1a1a, + 0x001b1b1b, 0x006e6e6e, 0x005a5a5a, 0x00a0a0a0, + 0x00525252, 0x003b3b3b, 0x00d6d6d6, 0x00b3b3b3, + 0x00292929, 0x00e3e3e3, 0x002f2f2f, 0x00848484, + 0x00535353, 0x00d1d1d1, 0x00000000, 0x00ededed, + 0x00202020, 0x00fcfcfc, 0x00b1b1b1, 0x005b5b5b, + 0x006a6a6a, 0x00cbcbcb, 0x00bebebe, 0x00393939, + 0x004a4a4a, 0x004c4c4c, 0x00585858, 0x00cfcfcf, + 0x00d0d0d0, 0x00efefef, 0x00aaaaaa, 0x00fbfbfb, + 0x00434343, 0x004d4d4d, 0x00333333, 0x00858585, + 0x00454545, 0x00f9f9f9, 0x00020202, 0x007f7f7f, + 0x00505050, 0x003c3c3c, 0x009f9f9f, 0x00a8a8a8, + 0x00515151, 0x00a3a3a3, 0x00404040, 0x008f8f8f, + 0x00929292, 0x009d9d9d, 0x00383838, 0x00f5f5f5, + 0x00bcbcbc, 0x00b6b6b6, 0x00dadada, 0x00212121, + 0x00101010, 0x00ffffff, 0x00f3f3f3, 0x00d2d2d2, + 0x00cdcdcd, 0x000c0c0c, 0x00131313, 0x00ececec, + 0x005f5f5f, 0x00979797, 0x00444444, 0x00171717, + 0x00c4c4c4, 0x00a7a7a7, 0x007e7e7e, 0x003d3d3d, + 0x00646464, 0x005d5d5d, 0x00191919, 0x00737373, + 0x00606060, 0x00818181, 0x004f4f4f, 0x00dcdcdc, + 0x00222222, 0x002a2a2a, 0x00909090, 0x00888888, + 0x00464646, 0x00eeeeee, 0x00b8b8b8, 0x00141414, + 0x00dedede, 0x005e5e5e, 0x000b0b0b, 0x00dbdbdb, + 0x00e0e0e0, 0x00323232, 0x003a3a3a, 0x000a0a0a, + 0x00494949, 0x00060606, 0x00242424, 0x005c5c5c, + 0x00c2c2c2, 0x00d3d3d3, 0x00acacac, 0x00626262, + 0x00919191, 0x00959595, 0x00e4e4e4, 0x00797979, + 0x00e7e7e7, 0x00c8c8c8, 0x00373737, 0x006d6d6d, + 0x008d8d8d, 0x00d5d5d5, 0x004e4e4e, 0x00a9a9a9, + 0x006c6c6c, 0x00565656, 0x00f4f4f4, 0x00eaeaea, + 0x00656565, 0x007a7a7a, 0x00aeaeae, 0x00080808, + 0x00bababa, 0x00787878, 0x00252525, 
0x002e2e2e, + 0x001c1c1c, 0x00a6a6a6, 0x00b4b4b4, 0x00c6c6c6, + 0x00e8e8e8, 0x00dddddd, 0x00747474, 0x001f1f1f, + 0x004b4b4b, 0x00bdbdbd, 0x008b8b8b, 0x008a8a8a, + 0x00707070, 0x003e3e3e, 0x00b5b5b5, 0x00666666, + 0x00484848, 0x00030303, 0x00f6f6f6, 0x000e0e0e, + 0x00616161, 0x00353535, 0x00575757, 0x00b9b9b9, + 0x00868686, 0x00c1c1c1, 0x001d1d1d, 0x009e9e9e, + 0x00e1e1e1, 0x00f8f8f8, 0x00989898, 0x00111111, + 0x00696969, 0x00d9d9d9, 0x008e8e8e, 0x00949494, + 0x009b9b9b, 0x001e1e1e, 0x00878787, 0x00e9e9e9, + 0x00cecece, 0x00555555, 0x00282828, 0x00dfdfdf, + 0x008c8c8c, 0x00a1a1a1, 0x00898989, 0x000d0d0d, + 0x00bfbfbf, 0x00e6e6e6, 0x00424242, 0x00686868, + 0x00414141, 0x00999999, 0x002d2d2d, 0x000f0f0f, + 0x00b0b0b0, 0x00545454, 0x00bbbbbb, 0x00161616 + }, + { /* s2 */ + 0xe200e2e2, 0x4e004e4e, 0x54005454, 0xfc00fcfc, + 0x94009494, 0xc200c2c2, 0x4a004a4a, 0xcc00cccc, + 0x62006262, 0x0d000d0d, 0x6a006a6a, 0x46004646, + 0x3c003c3c, 0x4d004d4d, 0x8b008b8b, 0xd100d1d1, + 0x5e005e5e, 0xfa00fafa, 0x64006464, 0xcb00cbcb, + 0xb400b4b4, 0x97009797, 0xbe00bebe, 0x2b002b2b, + 0xbc00bcbc, 0x77007777, 0x2e002e2e, 0x03000303, + 0xd300d3d3, 0x19001919, 0x59005959, 0xc100c1c1, + 0x1d001d1d, 0x06000606, 0x41004141, 0x6b006b6b, + 0x55005555, 0xf000f0f0, 0x99009999, 0x69006969, + 0xea00eaea, 0x9c009c9c, 0x18001818, 0xae00aeae, + 0x63006363, 0xdf00dfdf, 0xe700e7e7, 0xbb00bbbb, + 0x00000000, 0x73007373, 0x66006666, 0xfb00fbfb, + 0x96009696, 0x4c004c4c, 0x85008585, 0xe400e4e4, + 0x3a003a3a, 0x09000909, 0x45004545, 0xaa00aaaa, + 0x0f000f0f, 0xee00eeee, 0x10001010, 0xeb00ebeb, + 0x2d002d2d, 0x7f007f7f, 0xf400f4f4, 0x29002929, + 0xac00acac, 0xcf00cfcf, 0xad00adad, 0x91009191, + 0x8d008d8d, 0x78007878, 0xc800c8c8, 0x95009595, + 0xf900f9f9, 0x2f002f2f, 0xce00cece, 0xcd00cdcd, + 0x08000808, 0x7a007a7a, 0x88008888, 0x38003838, + 0x5c005c5c, 0x83008383, 0x2a002a2a, 0x28002828, + 0x47004747, 0xdb00dbdb, 0xb800b8b8, 0xc700c7c7, + 0x93009393, 0xa400a4a4, 0x12001212, 0x53005353, + 0xff00ffff, 0x87008787, 0x0e000e0e, 0x31003131, + 0x36003636, 0x21002121, 0x58005858, 0x48004848, + 0x01000101, 0x8e008e8e, 0x37003737, 0x74007474, + 0x32003232, 0xca00caca, 0xe900e9e9, 0xb100b1b1, + 0xb700b7b7, 0xab00abab, 0x0c000c0c, 0xd700d7d7, + 0xc400c4c4, 0x56005656, 0x42004242, 0x26002626, + 0x07000707, 0x98009898, 0x60006060, 0xd900d9d9, + 0xb600b6b6, 0xb900b9b9, 0x11001111, 0x40004040, + 0xec00ecec, 0x20002020, 0x8c008c8c, 0xbd00bdbd, + 0xa000a0a0, 0xc900c9c9, 0x84008484, 0x04000404, + 0x49004949, 0x23002323, 0xf100f1f1, 0x4f004f4f, + 0x50005050, 0x1f001f1f, 0x13001313, 0xdc00dcdc, + 0xd800d8d8, 0xc000c0c0, 0x9e009e9e, 0x57005757, + 0xe300e3e3, 0xc300c3c3, 0x7b007b7b, 0x65006565, + 0x3b003b3b, 0x02000202, 0x8f008f8f, 0x3e003e3e, + 0xe800e8e8, 0x25002525, 0x92009292, 0xe500e5e5, + 0x15001515, 0xdd00dddd, 0xfd00fdfd, 0x17001717, + 0xa900a9a9, 0xbf00bfbf, 0xd400d4d4, 0x9a009a9a, + 0x7e007e7e, 0xc500c5c5, 0x39003939, 0x67006767, + 0xfe00fefe, 0x76007676, 0x9d009d9d, 0x43004343, + 0xa700a7a7, 0xe100e1e1, 0xd000d0d0, 0xf500f5f5, + 0x68006868, 0xf200f2f2, 0x1b001b1b, 0x34003434, + 0x70007070, 0x05000505, 0xa300a3a3, 0x8a008a8a, + 0xd500d5d5, 0x79007979, 0x86008686, 0xa800a8a8, + 0x30003030, 0xc600c6c6, 0x51005151, 0x4b004b4b, + 0x1e001e1e, 0xa600a6a6, 0x27002727, 0xf600f6f6, + 0x35003535, 0xd200d2d2, 0x6e006e6e, 0x24002424, + 0x16001616, 0x82008282, 0x5f005f5f, 0xda00dada, + 0xe600e6e6, 0x75007575, 0xa200a2a2, 0xef00efef, + 0x2c002c2c, 0xb200b2b2, 0x1c001c1c, 0x9f009f9f, + 0x5d005d5d, 0x6f006f6f, 0x80008080, 0x0a000a0a, + 0x72007272, 0x44004444, 
0x9b009b9b, 0x6c006c6c, + 0x90009090, 0x0b000b0b, 0x5b005b5b, 0x33003333, + 0x7d007d7d, 0x5a005a5a, 0x52005252, 0xf300f3f3, + 0x61006161, 0xa100a1a1, 0xf700f7f7, 0xb000b0b0, + 0xd600d6d6, 0x3f003f3f, 0x7c007c7c, 0x6d006d6d, + 0xed00eded, 0x14001414, 0xe000e0e0, 0xa500a5a5, + 0x3d003d3d, 0x22002222, 0xb300b3b3, 0xf800f8f8, + 0x89008989, 0xde00dede, 0x71007171, 0x1a001a1a, + 0xaf00afaf, 0xba00baba, 0xb500b5b5, 0x81008181 + }, + { /* x1 */ + 0x52520052, 0x09090009, 0x6a6a006a, 0xd5d500d5, + 0x30300030, 0x36360036, 0xa5a500a5, 0x38380038, + 0xbfbf00bf, 0x40400040, 0xa3a300a3, 0x9e9e009e, + 0x81810081, 0xf3f300f3, 0xd7d700d7, 0xfbfb00fb, + 0x7c7c007c, 0xe3e300e3, 0x39390039, 0x82820082, + 0x9b9b009b, 0x2f2f002f, 0xffff00ff, 0x87870087, + 0x34340034, 0x8e8e008e, 0x43430043, 0x44440044, + 0xc4c400c4, 0xdede00de, 0xe9e900e9, 0xcbcb00cb, + 0x54540054, 0x7b7b007b, 0x94940094, 0x32320032, + 0xa6a600a6, 0xc2c200c2, 0x23230023, 0x3d3d003d, + 0xeeee00ee, 0x4c4c004c, 0x95950095, 0x0b0b000b, + 0x42420042, 0xfafa00fa, 0xc3c300c3, 0x4e4e004e, + 0x08080008, 0x2e2e002e, 0xa1a100a1, 0x66660066, + 0x28280028, 0xd9d900d9, 0x24240024, 0xb2b200b2, + 0x76760076, 0x5b5b005b, 0xa2a200a2, 0x49490049, + 0x6d6d006d, 0x8b8b008b, 0xd1d100d1, 0x25250025, + 0x72720072, 0xf8f800f8, 0xf6f600f6, 0x64640064, + 0x86860086, 0x68680068, 0x98980098, 0x16160016, + 0xd4d400d4, 0xa4a400a4, 0x5c5c005c, 0xcccc00cc, + 0x5d5d005d, 0x65650065, 0xb6b600b6, 0x92920092, + 0x6c6c006c, 0x70700070, 0x48480048, 0x50500050, + 0xfdfd00fd, 0xeded00ed, 0xb9b900b9, 0xdada00da, + 0x5e5e005e, 0x15150015, 0x46460046, 0x57570057, + 0xa7a700a7, 0x8d8d008d, 0x9d9d009d, 0x84840084, + 0x90900090, 0xd8d800d8, 0xabab00ab, 0x00000000, + 0x8c8c008c, 0xbcbc00bc, 0xd3d300d3, 0x0a0a000a, + 0xf7f700f7, 0xe4e400e4, 0x58580058, 0x05050005, + 0xb8b800b8, 0xb3b300b3, 0x45450045, 0x06060006, + 0xd0d000d0, 0x2c2c002c, 0x1e1e001e, 0x8f8f008f, + 0xcaca00ca, 0x3f3f003f, 0x0f0f000f, 0x02020002, + 0xc1c100c1, 0xafaf00af, 0xbdbd00bd, 0x03030003, + 0x01010001, 0x13130013, 0x8a8a008a, 0x6b6b006b, + 0x3a3a003a, 0x91910091, 0x11110011, 0x41410041, + 0x4f4f004f, 0x67670067, 0xdcdc00dc, 0xeaea00ea, + 0x97970097, 0xf2f200f2, 0xcfcf00cf, 0xcece00ce, + 0xf0f000f0, 0xb4b400b4, 0xe6e600e6, 0x73730073, + 0x96960096, 0xacac00ac, 0x74740074, 0x22220022, + 0xe7e700e7, 0xadad00ad, 0x35350035, 0x85850085, + 0xe2e200e2, 0xf9f900f9, 0x37370037, 0xe8e800e8, + 0x1c1c001c, 0x75750075, 0xdfdf00df, 0x6e6e006e, + 0x47470047, 0xf1f100f1, 0x1a1a001a, 0x71710071, + 0x1d1d001d, 0x29290029, 0xc5c500c5, 0x89890089, + 0x6f6f006f, 0xb7b700b7, 0x62620062, 0x0e0e000e, + 0xaaaa00aa, 0x18180018, 0xbebe00be, 0x1b1b001b, + 0xfcfc00fc, 0x56560056, 0x3e3e003e, 0x4b4b004b, + 0xc6c600c6, 0xd2d200d2, 0x79790079, 0x20200020, + 0x9a9a009a, 0xdbdb00db, 0xc0c000c0, 0xfefe00fe, + 0x78780078, 0xcdcd00cd, 0x5a5a005a, 0xf4f400f4, + 0x1f1f001f, 0xdddd00dd, 0xa8a800a8, 0x33330033, + 0x88880088, 0x07070007, 0xc7c700c7, 0x31310031, + 0xb1b100b1, 0x12120012, 0x10100010, 0x59590059, + 0x27270027, 0x80800080, 0xecec00ec, 0x5f5f005f, + 0x60600060, 0x51510051, 0x7f7f007f, 0xa9a900a9, + 0x19190019, 0xb5b500b5, 0x4a4a004a, 0x0d0d000d, + 0x2d2d002d, 0xe5e500e5, 0x7a7a007a, 0x9f9f009f, + 0x93930093, 0xc9c900c9, 0x9c9c009c, 0xefef00ef, + 0xa0a000a0, 0xe0e000e0, 0x3b3b003b, 0x4d4d004d, + 0xaeae00ae, 0x2a2a002a, 0xf5f500f5, 0xb0b000b0, + 0xc8c800c8, 0xebeb00eb, 0xbbbb00bb, 0x3c3c003c, + 0x83830083, 0x53530053, 0x99990099, 0x61610061, + 0x17170017, 0x2b2b002b, 0x04040004, 0x7e7e007e, + 0xbaba00ba, 0x77770077, 0xd6d600d6, 0x26260026, + 0xe1e100e1, 
0x69690069, 0x14140014, 0x63630063, + 0x55550055, 0x21210021, 0x0c0c000c, 0x7d7d007d + }, + { /* x2 */ + 0x30303000, 0x68686800, 0x99999900, 0x1b1b1b00, + 0x87878700, 0xb9b9b900, 0x21212100, 0x78787800, + 0x50505000, 0x39393900, 0xdbdbdb00, 0xe1e1e100, + 0x72727200, 0x09090900, 0x62626200, 0x3c3c3c00, + 0x3e3e3e00, 0x7e7e7e00, 0x5e5e5e00, 0x8e8e8e00, + 0xf1f1f100, 0xa0a0a000, 0xcccccc00, 0xa3a3a300, + 0x2a2a2a00, 0x1d1d1d00, 0xfbfbfb00, 0xb6b6b600, + 0xd6d6d600, 0x20202000, 0xc4c4c400, 0x8d8d8d00, + 0x81818100, 0x65656500, 0xf5f5f500, 0x89898900, + 0xcbcbcb00, 0x9d9d9d00, 0x77777700, 0xc6c6c600, + 0x57575700, 0x43434300, 0x56565600, 0x17171700, + 0xd4d4d400, 0x40404000, 0x1a1a1a00, 0x4d4d4d00, + 0xc0c0c000, 0x63636300, 0x6c6c6c00, 0xe3e3e300, + 0xb7b7b700, 0xc8c8c800, 0x64646400, 0x6a6a6a00, + 0x53535300, 0xaaaaaa00, 0x38383800, 0x98989800, + 0x0c0c0c00, 0xf4f4f400, 0x9b9b9b00, 0xededed00, + 0x7f7f7f00, 0x22222200, 0x76767600, 0xafafaf00, + 0xdddddd00, 0x3a3a3a00, 0x0b0b0b00, 0x58585800, + 0x67676700, 0x88888800, 0x06060600, 0xc3c3c300, + 0x35353500, 0x0d0d0d00, 0x01010100, 0x8b8b8b00, + 0x8c8c8c00, 0xc2c2c200, 0xe6e6e600, 0x5f5f5f00, + 0x02020200, 0x24242400, 0x75757500, 0x93939300, + 0x66666600, 0x1e1e1e00, 0xe5e5e500, 0xe2e2e200, + 0x54545400, 0xd8d8d800, 0x10101000, 0xcecece00, + 0x7a7a7a00, 0xe8e8e800, 0x08080800, 0x2c2c2c00, + 0x12121200, 0x97979700, 0x32323200, 0xababab00, + 0xb4b4b400, 0x27272700, 0x0a0a0a00, 0x23232300, + 0xdfdfdf00, 0xefefef00, 0xcacaca00, 0xd9d9d900, + 0xb8b8b800, 0xfafafa00, 0xdcdcdc00, 0x31313100, + 0x6b6b6b00, 0xd1d1d100, 0xadadad00, 0x19191900, + 0x49494900, 0xbdbdbd00, 0x51515100, 0x96969600, + 0xeeeeee00, 0xe4e4e400, 0xa8a8a800, 0x41414100, + 0xdadada00, 0xffffff00, 0xcdcdcd00, 0x55555500, + 0x86868600, 0x36363600, 0xbebebe00, 0x61616100, + 0x52525200, 0xf8f8f800, 0xbbbbbb00, 0x0e0e0e00, + 0x82828200, 0x48484800, 0x69696900, 0x9a9a9a00, + 0xe0e0e000, 0x47474700, 0x9e9e9e00, 0x5c5c5c00, + 0x04040400, 0x4b4b4b00, 0x34343400, 0x15151500, + 0x79797900, 0x26262600, 0xa7a7a700, 0xdedede00, + 0x29292900, 0xaeaeae00, 0x92929200, 0xd7d7d700, + 0x84848400, 0xe9e9e900, 0xd2d2d200, 0xbababa00, + 0x5d5d5d00, 0xf3f3f300, 0xc5c5c500, 0xb0b0b000, + 0xbfbfbf00, 0xa4a4a400, 0x3b3b3b00, 0x71717100, + 0x44444400, 0x46464600, 0x2b2b2b00, 0xfcfcfc00, + 0xebebeb00, 0x6f6f6f00, 0xd5d5d500, 0xf6f6f600, + 0x14141400, 0xfefefe00, 0x7c7c7c00, 0x70707000, + 0x5a5a5a00, 0x7d7d7d00, 0xfdfdfd00, 0x2f2f2f00, + 0x18181800, 0x83838300, 0x16161600, 0xa5a5a500, + 0x91919100, 0x1f1f1f00, 0x05050500, 0x95959500, + 0x74747400, 0xa9a9a900, 0xc1c1c100, 0x5b5b5b00, + 0x4a4a4a00, 0x85858500, 0x6d6d6d00, 0x13131300, + 0x07070700, 0x4f4f4f00, 0x4e4e4e00, 0x45454500, + 0xb2b2b200, 0x0f0f0f00, 0xc9c9c900, 0x1c1c1c00, + 0xa6a6a600, 0xbcbcbc00, 0xececec00, 0x73737300, + 0x90909000, 0x7b7b7b00, 0xcfcfcf00, 0x59595900, + 0x8f8f8f00, 0xa1a1a100, 0xf9f9f900, 0x2d2d2d00, + 0xf2f2f200, 0xb1b1b100, 0x00000000, 0x94949400, + 0x37373700, 0x9f9f9f00, 0xd0d0d000, 0x2e2e2e00, + 0x9c9c9c00, 0x6e6e6e00, 0x28282800, 0x3f3f3f00, + 0x80808000, 0xf0f0f000, 0x3d3d3d00, 0xd3d3d300, + 0x25252500, 0x8a8a8a00, 0xb5b5b500, 0xe7e7e700, + 0x42424200, 0xb3b3b300, 0xc7c7c700, 0xeaeaea00, + 0xf7f7f700, 0x4c4c4c00, 0x11111100, 0x33333300, + 0x03030300, 0xa2a2a200, 0xacacac00, 0x60606000 + }, + 0 + }; + +/* Prefetching for sbox tables. 
*/ +static inline void +prefetch_table(const volatile byte *tab, size_t len) +{ + size_t i; + + for (i = 0; len - i >= 8 * 32; i += 8 * 32) + { + (void)tab[i + 0 * 32]; + (void)tab[i + 1 * 32]; + (void)tab[i + 2 * 32]; + (void)tab[i + 3 * 32]; + (void)tab[i + 4 * 32]; + (void)tab[i + 5 * 32]; + (void)tab[i + 6 * 32]; + (void)tab[i + 7 * 32]; + } + for (; i < len; i += 32) + { + (void)tab[i]; + } + + (void)tab[len - 1]; +} + +static inline void +prefetch_sboxes(void) +{ + /* Modify counters to trigger copy-on-write and unsharing if physical pages + * of look-up table are shared between processes. Modifying counters also + * causes checksums for pages to change and hint same-page merging algorithm + * that these pages are frequently changing. */ + sboxes.counter_head++; + sboxes.counter_tail++; + + /* Prefetch look-up tables to cache. */ + prefetch_table((const void *)&sboxes, sizeof(sboxes)); +} + + +static ALWAYS_INLINE +u32 rotr32(u32 v, u32 r) +{ + return ror(v, r); +} + +static ALWAYS_INLINE +u32 bswap32(u32 v) +{ + return _gcry_bswap32(v); +} + +static ALWAYS_INLINE u32 +get_u8(u32 x, u32 y) +{ + return (x >> ((3 - y) * 8)) & 0xFF; +} + +static ALWAYS_INLINE u32 +make_u32(byte v0, byte v1, byte v2, byte v3) +{ + return ((u32)v0 << 24) | ((u32)v1 << 16) | ((u32)v2 << 8) | ((u32)v3); +} + +static ALWAYS_INLINE u32 +aria_m(u32 t0) +{ + return rotr32(t0, 8) ^ rotr32(t0 ^ rotr32(t0, 8), 16); +} + +/* S-Box Layer 1 + M */ +static ALWAYS_INLINE void +aria_sbox_layer1_with_pre_diff(u32 *t0, u32 *t1, u32 *t2, u32 *t3) +{ + *t0 = sboxes.s1[get_u8(*t0, 0)] ^ + sboxes.s2[get_u8(*t0, 1)] ^ + sboxes.x1[get_u8(*t0, 2)] ^ + sboxes.x2[get_u8(*t0, 3)]; + *t1 = sboxes.s1[get_u8(*t1, 0)] ^ + sboxes.s2[get_u8(*t1, 1)] ^ + sboxes.x1[get_u8(*t1, 2)] ^ + sboxes.x2[get_u8(*t1, 3)]; + *t2 = sboxes.s1[get_u8(*t2, 0)] ^ + sboxes.s2[get_u8(*t2, 1)] ^ + sboxes.x1[get_u8(*t2, 2)] ^ + sboxes.x2[get_u8(*t2, 3)]; + *t3 = sboxes.s1[get_u8(*t3, 0)] ^ + sboxes.s2[get_u8(*t3, 1)] ^ + sboxes.x1[get_u8(*t3, 2)] ^ + sboxes.x2[get_u8(*t3, 3)]; +} + +/* S-Box Layer 2 + M */ +static ALWAYS_INLINE void +aria_sbox_layer2_with_pre_diff(u32 *t0, u32 *t1, u32 *t2, u32 *t3) +{ + *t0 = sboxes.x1[get_u8(*t0, 0)] ^ + sboxes.x2[get_u8(*t0, 1)] ^ + sboxes.s1[get_u8(*t0, 2)] ^ + sboxes.s2[get_u8(*t0, 3)]; + *t1 = sboxes.x1[get_u8(*t1, 0)] ^ + sboxes.x2[get_u8(*t1, 1)] ^ + sboxes.s1[get_u8(*t1, 2)] ^ + sboxes.s2[get_u8(*t1, 3)]; + *t2 = sboxes.x1[get_u8(*t2, 0)] ^ + sboxes.x2[get_u8(*t2, 1)] ^ + sboxes.s1[get_u8(*t2, 2)] ^ + sboxes.s2[get_u8(*t2, 3)]; + *t3 = sboxes.x1[get_u8(*t3, 0)] ^ + sboxes.x2[get_u8(*t3, 1)] ^ + sboxes.s1[get_u8(*t3, 2)] ^ + sboxes.s2[get_u8(*t3, 3)]; +} + +/* Word-level diffusion */ +static ALWAYS_INLINE void +aria_diff_word(u32 *t0, u32 *t1, u32 *t2, u32 *t3) +{ + *t1 ^= *t2; + *t2 ^= *t3; + *t0 ^= *t1; + + *t3 ^= *t1; + *t2 ^= *t0; + *t1 ^= *t2; +} + +/* Byte-level diffusion */ +static inline void aria_diff_byte(u32 *t1, u32 *t2, u32 *t3) +{ + *t1 = ((*t1 << 8) & 0xff00ff00) ^ ((*t1 >> 8) & 0x00ff00ff); + *t2 = rotr32(*t2, 16); + *t3 = bswap32(*t3); +} + +/* Key XOR Layer */ +static ALWAYS_INLINE void +aria_add_round_key(u32 *rk, u32 *t0, u32 *t1, u32 *t2, u32 *t3) +{ + *t0 ^= rk[0]; + *t1 ^= rk[1]; + *t2 ^= rk[2]; + *t3 ^= rk[3]; +} +/* Odd round Substitution & Diffusion */ +static ALWAYS_INLINE void +aria_subst_diff_odd(u32 *t0, u32 *t1, u32 *t2, u32 *t3) +{ + aria_sbox_layer1_with_pre_diff(t0, t1, t2, t3); + aria_diff_word(t0, t1, t2, t3); + aria_diff_byte(t1, t2, t3); + aria_diff_word(t0, t1, t2, t3); +} + +/* 
Even round Substitution & Diffusion */ +static ALWAYS_INLINE void +aria_subst_diff_even(u32 *t0, u32 *t1, u32 *t2, u32 *t3) +{ + aria_sbox_layer2_with_pre_diff(t0, t1, t2, t3); + aria_diff_word(t0, t1, t2, t3); + aria_diff_byte(t3, t0, t1); + aria_diff_word(t0, t1, t2, t3); +} + +/* Last round */ +static ALWAYS_INLINE void +aria_last_round(u32 *t0, u32 *t1, u32 *t2, u32 *t3) +{ + *t0 = make_u32((byte)(sboxes.x1[get_u8(*t0, 0)]), + (byte)(sboxes.x2[get_u8(*t0, 1)] >> 24), + (byte)(sboxes.s1[get_u8(*t0, 2)]), + (byte)(sboxes.s2[get_u8(*t0, 3)])); + *t1 = make_u32((byte)(sboxes.x1[get_u8(*t1, 0)]), + (byte)(sboxes.x2[get_u8(*t1, 1)] >> 24), + (byte)(sboxes.s1[get_u8(*t1, 2)]), + (byte)(sboxes.s2[get_u8(*t1, 3)])); + *t2 = make_u32((byte)(sboxes.x1[get_u8(*t2, 0)]), + (byte)(sboxes.x2[get_u8(*t2, 1)] >> 24), + (byte)(sboxes.s1[get_u8(*t2, 2)]), + (byte)(sboxes.s2[get_u8(*t2, 3)])); + *t3 = make_u32((byte)(sboxes.x1[get_u8(*t3, 0)]), + (byte)(sboxes.x2[get_u8(*t3, 1)] >> 24), + (byte)(sboxes.s1[get_u8(*t3, 2)]), + (byte)(sboxes.s2[get_u8(*t3, 3)])); +} + +/* Q, R Macro expanded ARIA GSRK */ +static ALWAYS_INLINE void +aria_gsrk(u32 *rk, u32 *x, u32 *y, u32 n) +{ + int q = 4 - (n / 32); + int r = n % 32; + + rk[0] = (x[0]) ^ + ((y[q % 4]) >> r) ^ + ((y[(q + 3) % 4]) << (32 - r)); + rk[1] = (x[1]) ^ + ((y[(q + 1) % 4]) >> r) ^ + ((y[q % 4]) << (32 - r)); + rk[2] = (x[2]) ^ + ((y[(q + 2) % 4]) >> r) ^ + ((y[(q + 1) % 4]) << (32 - r)); + rk[3] = (x[3]) ^ + ((y[(q + 3) % 4]) >> r) ^ + ((y[(q + 2) % 4]) << (32 - r)); +} + + +static NO_INLINE void +aria_set_encrypt_key(ARIA_context *ctx, const byte *in_key, u32 key_len) +{ + u32 w0[4], w1[4], w2[4], w3[4]; + u32 reg0, reg1, reg2, reg3; + const u32 *ck; + int rkidx = 0; + + ctx->rounds = (key_len + 32) / 4; + prefetch_sboxes(); + + ck = &key_rc[(key_len - 16) / 2]; + + w0[0] = buf_get_be32(in_key + 0); + w0[1] = buf_get_be32(in_key + 4); + w0[2] = buf_get_be32(in_key + 8); + w0[3] = buf_get_be32(in_key + 12); + + reg0 = w0[0] ^ ck[0]; + reg1 = w0[1] ^ ck[1]; + reg2 = w0[2] ^ ck[2]; + reg3 = w0[3] ^ ck[3]; + + aria_subst_diff_odd(®0, ®1, ®2, ®3); + + if (key_len > 16) + { + w1[0] = buf_get_be32(in_key + 16); + w1[1] = buf_get_be32(in_key + 20); + if (key_len > 24) + { + w1[2] = buf_get_be32(in_key + 24); + w1[3] = buf_get_be32(in_key + 28); + } + else + { + w1[2] = 0; + w1[3] = 0; + } + } + else + { + w1[0] = 0; + w1[1] = 0; + w1[2] = 0; + w1[3] = 0; + } + + w1[0] ^= reg0; + w1[1] ^= reg1; + w1[2] ^= reg2; + w1[3] ^= reg3; + + reg0 = w1[0]; + reg1 = w1[1]; + reg2 = w1[2]; + reg3 = w1[3]; + + reg0 ^= ck[4]; + reg1 ^= ck[5]; + reg2 ^= ck[6]; + reg3 ^= ck[7]; + + aria_subst_diff_even(®0, ®1, ®2, ®3); + + reg0 ^= w0[0]; + reg1 ^= w0[1]; + reg2 ^= w0[2]; + reg3 ^= w0[3]; + + w2[0] = reg0; + w2[1] = reg1; + w2[2] = reg2; + w2[3] = reg3; + + reg0 ^= ck[8]; + reg1 ^= ck[9]; + reg2 ^= ck[10]; + reg3 ^= ck[11]; + + aria_subst_diff_odd(®0, ®1, ®2, ®3); + + w3[0] = reg0 ^ w1[0]; + w3[1] = reg1 ^ w1[1]; + w3[2] = reg2 ^ w1[2]; + w3[3] = reg3 ^ w1[3]; + + aria_gsrk(ctx->enc_key[rkidx], w0, w1, 19); + rkidx++; + aria_gsrk(ctx->enc_key[rkidx], w1, w2, 19); + rkidx++; + aria_gsrk(ctx->enc_key[rkidx], w2, w3, 19); + rkidx++; + aria_gsrk(ctx->enc_key[rkidx], w3, w0, 19); + + rkidx++; + aria_gsrk(ctx->enc_key[rkidx], w0, w1, 31); + rkidx++; + aria_gsrk(ctx->enc_key[rkidx], w1, w2, 31); + rkidx++; + aria_gsrk(ctx->enc_key[rkidx], w2, w3, 31); + rkidx++; + aria_gsrk(ctx->enc_key[rkidx], w3, w0, 31); + + rkidx++; + aria_gsrk(ctx->enc_key[rkidx], w0, w1, 67); + rkidx++; + 
aria_gsrk(ctx->enc_key[rkidx], w1, w2, 67); + rkidx++; + aria_gsrk(ctx->enc_key[rkidx], w2, w3, 67); + rkidx++; + aria_gsrk(ctx->enc_key[rkidx], w3, w0, 67); + + rkidx++; + aria_gsrk(ctx->enc_key[rkidx], w0, w1, 97); + if (key_len > 16) + { + rkidx++; + aria_gsrk(ctx->enc_key[rkidx], w1, w2, 97); + rkidx++; + aria_gsrk(ctx->enc_key[rkidx], w2, w3, 97); + + if (key_len > 24) + { + rkidx++; + aria_gsrk(ctx->enc_key[rkidx], w3, w0, 97); + + rkidx++; + aria_gsrk(ctx->enc_key[rkidx], w0, w1, 109); + } + } + + wipememory(w0, sizeof(w0)); + wipememory(w1, sizeof(w1)); + wipememory(w2, sizeof(w2)); + wipememory(w3, sizeof(w3)); +} + +static void +aria_set_decrypt_key(ARIA_context *ctx) +{ + int i; + + for (i = 0; i < 4; i++) + { + ctx->dec_key[0][i] = ctx->enc_key[ctx->rounds][i]; + ctx->dec_key[ctx->rounds][i] = ctx->enc_key[0][i]; + } + + for (i = 1; i < ctx->rounds; i++) + { + ctx->dec_key[i][0] = aria_m(ctx->enc_key[ctx->rounds - i][0]); + ctx->dec_key[i][1] = aria_m(ctx->enc_key[ctx->rounds - i][1]); + ctx->dec_key[i][2] = aria_m(ctx->enc_key[ctx->rounds - i][2]); + ctx->dec_key[i][3] = aria_m(ctx->enc_key[ctx->rounds - i][3]); + + aria_diff_word(&ctx->dec_key[i][0], &ctx->dec_key[i][1], + &ctx->dec_key[i][2], &ctx->dec_key[i][3]); + aria_diff_byte(&ctx->dec_key[i][1], + &ctx->dec_key[i][2], &ctx->dec_key[i][3]); + aria_diff_word(&ctx->dec_key[i][0], &ctx->dec_key[i][1], + &ctx->dec_key[i][2], &ctx->dec_key[i][3]); + } +} + +static NO_INLINE unsigned int +aria_crypt(ARIA_context *ctx, byte *out, const byte *in, + u32 key[][ARIA_RD_KEY_WORDS]) +{ + u32 reg0, reg1, reg2, reg3; + int rounds = ctx->rounds; + int rkidx = 0; + + reg0 = buf_get_be32(in + 0); + reg1 = buf_get_be32(in + 4); + reg2 = buf_get_be32(in + 8); + reg3 = buf_get_be32(in + 12); + + aria_add_round_key(key[rkidx], ®0, ®1, ®2, ®3); + rkidx++; + + while (1) + { + aria_subst_diff_odd(®0, ®1, ®2, ®3); + aria_add_round_key(key[rkidx], ®0, ®1, ®2, ®3); + rkidx++; + + if (rkidx >= rounds) + break; + + aria_subst_diff_even(®0, ®1, ®2, ®3); + aria_add_round_key(key[rkidx], ®0, ®1, ®2, ®3); + rkidx++; + } + + aria_last_round(®0, ®1, ®2, ®3); + aria_add_round_key(key[rkidx], ®0, ®1, ®2, ®3); + + buf_put_be32(out + 0, reg0); + buf_put_be32(out + 4, reg1); + buf_put_be32(out + 8, reg2); + buf_put_be32(out + 12, reg3); + + return 4 * sizeof(void *) + 4 * sizeof(u32); /* stack burn depth */ +} + +unsigned int +aria_encrypt(void *c, byte *outbuf, const byte *inbuf) +{ + ARIA_context *ctx = (ARIA_context *)c; + + prefetch_sboxes (); + + return aria_crypt (ctx, outbuf, inbuf, ctx->enc_key); +} + +unsigned int +aria_decrypt(void *c, byte *outbuf, const byte *inbuf) +{ + ARIA_context *ctx = (ARIA_context *)c; + + if (!ctx->decryption_prepared) + { + aria_set_decrypt_key (ctx); + ctx->decryption_prepared = 1; + } + + prefetch_sboxes (); + + return aria_crypt (ctx, outbuf, inbuf, ctx->dec_key); +} + + +static gcry_err_code_t +aria_setkey(void *c, const byte *key, unsigned keylen, + cipher_bulk_ops_t *bulk_ops) +{ + ARIA_context *ctx = c; + static int initialized = 0; + static const char *selftest_failed = NULL; + + if (keylen != 16 && keylen != 24 && keylen != 32) + return GPG_ERR_INV_KEYLEN; + + if (!initialized) + { + initialized = 1; + selftest_failed = aria_selftest (); + if (selftest_failed) + log_error("%s\n", selftest_failed); + } + + if (selftest_failed) + return GPG_ERR_SELFTEST_FAILED; + + /* Setup bulk encryption routines. */ + memset (bulk_ops, 0, sizeof(*bulk_ops)); + + /* Setup context and encryption key. 
*/ + ctx->decryption_prepared = 0; + aria_set_encrypt_key (ctx, key, keylen); + + _gcry_burn_stack (3 * sizeof(void *) + 5 * 4 * sizeof(u32)); + return 0; +} + + +static const char * +aria_selftest (void) +{ + ARIA_context ctx; + byte scratch[16]; + + static const byte key[16] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f + }; + static const byte plaintext[16] = { + 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, + 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff, + }; + static const byte ciphertext[16] = { + 0xd7, 0x18, 0xfb, 0xd6, 0xab, 0x64, 0x4c, 0x73, + 0x9d, 0xa9, 0x5f, 0x3b, 0xe6, 0x45, 0x17, 0x78 + }; + + memset (&ctx, 0, sizeof(ctx)); + + aria_set_encrypt_key (&ctx, key, 16); + aria_encrypt (&ctx, scratch, plaintext); + if (memcmp (scratch, ciphertext, sizeof (ciphertext))) + return "ARIA test encryption failed."; + aria_decrypt (&ctx, scratch, scratch); + if (memcmp (scratch, plaintext, sizeof (plaintext))) + return "ARIA test decryption failed."; + + return NULL; +} + + +static const gcry_cipher_oid_spec_t aria128_oids[] = + { + { "1.2.410.200046.1.1.1", GCRY_CIPHER_MODE_ECB }, + { "1.2.410.200046.1.1.2", GCRY_CIPHER_MODE_CBC }, + { "1.2.410.200046.1.1.3", GCRY_CIPHER_MODE_CFB }, + { "1.2.410.200046.1.1.4", GCRY_CIPHER_MODE_OFB }, + { "1.2.410.200046.1.1.5", GCRY_CIPHER_MODE_CTR }, + { "1.2.410.200046.1.1.34", GCRY_CIPHER_MODE_GCM }, + { "1.2.410.200046.1.1.37", GCRY_CIPHER_MODE_CCM }, + { NULL } + }; + +static const gcry_cipher_oid_spec_t aria192_oids[] = + { + { "1.2.410.200046.1.1.6", GCRY_CIPHER_MODE_ECB }, + { "1.2.410.200046.1.1.7", GCRY_CIPHER_MODE_CBC }, + { "1.2.410.200046.1.1.8", GCRY_CIPHER_MODE_CFB }, + { "1.2.410.200046.1.1.9", GCRY_CIPHER_MODE_OFB }, + { "1.2.410.200046.1.1.10", GCRY_CIPHER_MODE_CTR }, + { "1.2.410.200046.1.1.35", GCRY_CIPHER_MODE_GCM }, + { "1.2.410.200046.1.1.38", GCRY_CIPHER_MODE_CCM }, + { NULL } + }; + +static const gcry_cipher_oid_spec_t aria256_oids[] = + { + { "1.2.410.200046.1.1.11", GCRY_CIPHER_MODE_ECB }, + { "1.2.410.200046.1.1.12", GCRY_CIPHER_MODE_CBC }, + { "1.2.410.200046.1.1.13", GCRY_CIPHER_MODE_CFB }, + { "1.2.410.200046.1.1.14", GCRY_CIPHER_MODE_OFB }, + { "1.2.410.200046.1.1.15", GCRY_CIPHER_MODE_CTR }, + { "1.2.410.200046.1.1.36", GCRY_CIPHER_MODE_GCM }, + { "1.2.410.200046.1.1.39", GCRY_CIPHER_MODE_CCM }, + { NULL } + }; + +gcry_cipher_spec_t _gcry_cipher_spec_aria128 = + { + GCRY_CIPHER_ARIA128, { 0, 0 }, + "ARIA128", NULL, aria128_oids, ARIA_BLOCK_SIZE, 128, + sizeof(ARIA_context), aria_setkey, aria_encrypt, aria_decrypt + }; + +gcry_cipher_spec_t _gcry_cipher_spec_aria192 = + { + GCRY_CIPHER_ARIA192, { 0, 0 }, + "ARIA192",NULL,aria192_oids, ARIA_BLOCK_SIZE, 192, + sizeof(ARIA_context), aria_setkey, aria_encrypt, aria_decrypt + }; + +gcry_cipher_spec_t _gcry_cipher_spec_aria256 = + { + GCRY_CIPHER_ARIA256, { 0, 0 }, + "ARIA256", NULL, aria256_oids, ARIA_BLOCK_SIZE, 256, + sizeof(ARIA_context), aria_setkey, aria_encrypt, aria_decrypt + }; diff --git a/cipher/cipher.c b/cipher/cipher.c index 6f92b75a..f2558371 100644 --- a/cipher/cipher.c +++ b/cipher/cipher.c @@ -90,6 +90,11 @@ static gcry_cipher_spec_t * const cipher_list[] = #endif #if USE_SM4 &_gcry_cipher_spec_sm4, +#endif +#if USE_ARIA + &_gcry_cipher_spec_aria128, + &_gcry_cipher_spec_aria192, + &_gcry_cipher_spec_aria256, #endif NULL }; @@ -207,8 +212,17 @@ static gcry_cipher_spec_t * const cipher_list_algo301[] = NULL, #endif #if USE_SM4 - &_gcry_cipher_spec_sm4 + &_gcry_cipher_spec_sm4, +#else + NULL, +#endif 
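For illustration, the ARIA-128 vector exercised by aria_selftest() above (RFC 5794
Appendix A) can also be checked through the public cipher interface once the ARIA
specs are registered; a minimal sketch, assuming the standard gcry_cipher_* calls
and omitting error handling:

  #include <gcrypt.h>

  static void
  aria128_ecb_example (void)
  {
    /* Same key/plaintext as aria_selftest(); the expected ciphertext is
       d7 18 fb d6 ab 64 4c 73 9d a9 5f 3b e6 45 17 78. */
    static const unsigned char key[16] = {
      0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
      0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
    };
    static const unsigned char plaintext[16] = {
      0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77,
      0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff
    };
    unsigned char ciphertext[16];
    gcry_cipher_hd_t hd;

    gcry_cipher_open (&hd, GCRY_CIPHER_ARIA128, GCRY_CIPHER_MODE_ECB, 0);
    gcry_cipher_setkey (hd, key, sizeof key);
    gcry_cipher_encrypt (hd, ciphertext, sizeof ciphertext,
                         plaintext, sizeof plaintext);
    gcry_cipher_close (hd);
  }
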
+#if USE_ARIA + &_gcry_cipher_spec_aria128, + &_gcry_cipher_spec_aria192, + &_gcry_cipher_spec_aria256 #else + NULL, + NULL, NULL #endif }; diff --git a/cipher/mac-cmac.c b/cipher/mac-cmac.c index b80c3406..5b1974ed 100644 --- a/cipher/mac-cmac.c +++ b/cipher/mac-cmac.c @@ -60,6 +60,8 @@ map_mac_algo_to_cipher (int mac_algo) return GCRY_CIPHER_GOST28147; case GCRY_MAC_CMAC_SM4: return GCRY_CIPHER_SM4; + case GCRY_MAC_CMAC_ARIA: + return GCRY_CIPHER_ARIA128; } } @@ -522,3 +524,9 @@ const gcry_mac_spec_t _gcry_mac_type_spec_cmac_sm4 = { &cmac_ops }; #endif +#if USE_ARIA +const gcry_mac_spec_t _gcry_mac_type_spec_cmac_aria = { + GCRY_MAC_CMAC_ARIA, {0, 0}, "CMAC_ARIA", + &cmac_ops +}; +#endif diff --git a/cipher/mac-gmac.c b/cipher/mac-gmac.c index 5e350010..20edaf91 100644 --- a/cipher/mac-gmac.c +++ b/cipher/mac-gmac.c @@ -47,6 +47,8 @@ map_mac_algo_to_cipher (int mac_algo) return GCRY_CIPHER_SEED; case GCRY_MAC_GMAC_SM4: return GCRY_CIPHER_SM4; + case GCRY_MAC_GMAC_ARIA: + return GCRY_CIPHER_ARIA128; } } @@ -193,3 +195,9 @@ const gcry_mac_spec_t _gcry_mac_type_spec_gmac_sm4 = { &gmac_ops }; #endif +#if USE_ARIA +const gcry_mac_spec_t _gcry_mac_type_spec_gmac_aria = { + GCRY_MAC_GMAC_ARIA, {0, 0}, "GMAC_ARIA", + &gmac_ops +}; +#endif diff --git a/cipher/mac-internal.h b/cipher/mac-internal.h index 39876f55..142ef69e 100644 --- a/cipher/mac-internal.h +++ b/cipher/mac-internal.h @@ -234,6 +234,9 @@ extern const gcry_mac_spec_t _gcry_mac_type_spec_gost28147_imit; #if USE_SM4 extern const gcry_mac_spec_t _gcry_mac_type_spec_cmac_sm4; #endif +#if USE_ARIA +extern const gcry_mac_spec_t _gcry_mac_type_spec_cmac_aria; +#endif /* * The GMAC algorithm specifications (mac-gmac.c). @@ -256,6 +259,9 @@ extern const gcry_mac_spec_t _gcry_mac_type_spec_gmac_camellia; #if USE_SM4 extern const gcry_mac_spec_t _gcry_mac_type_spec_gmac_sm4; #endif +#if USE_ARIA +extern const gcry_mac_spec_t _gcry_mac_type_spec_gmac_aria; +#endif /* * The Poly1305 MAC algorithm specifications (mac-poly1305.c). 
@@ -279,3 +285,6 @@ extern const gcry_mac_spec_t _gcry_mac_type_spec_poly1305mac_seed; #if USE_SM4 extern const gcry_mac_spec_t _gcry_mac_type_spec_poly1305mac_sm4; #endif +#if USE_ARIA +extern const gcry_mac_spec_t _gcry_mac_type_spec_poly1305mac_aria; +#endif diff --git a/cipher/mac-poly1305.c b/cipher/mac-poly1305.c index 5b6c489e..197468f2 100644 --- a/cipher/mac-poly1305.c +++ b/cipher/mac-poly1305.c @@ -86,6 +86,9 @@ poly1305mac_open (gcry_mac_hd_t h) case GCRY_MAC_POLY1305_SM4: cipher_algo = GCRY_CIPHER_SM4; break; + case GCRY_MAC_POLY1305_ARIA: + cipher_algo = GCRY_CIPHER_ARIA128; + break; } err = _gcry_cipher_open_internal (&mac_ctx->hd, cipher_algo, @@ -371,3 +374,9 @@ const gcry_mac_spec_t _gcry_mac_type_spec_poly1305mac_sm4 = { &poly1305mac_ops }; #endif +#if USE_ARIA +const gcry_mac_spec_t _gcry_mac_type_spec_poly1305mac_aria = { + GCRY_MAC_POLY1305_ARIA, {0, 0}, "POLY1305_ARIA", + &poly1305mac_ops +}; +#endif diff --git a/cipher/mac.c b/cipher/mac.c index 05d2c64c..6305f51e 100644 --- a/cipher/mac.c +++ b/cipher/mac.c @@ -134,6 +134,11 @@ static const gcry_mac_spec_t * const mac_list[] = { &_gcry_mac_type_spec_cmac_sm4, &_gcry_mac_type_spec_gmac_sm4, &_gcry_mac_type_spec_poly1305mac_sm4, +#endif +#if USE_ARIA + &_gcry_mac_type_spec_cmac_aria, + &_gcry_mac_type_spec_gmac_aria, + &_gcry_mac_type_spec_poly1305mac_aria, #endif NULL }; @@ -310,7 +315,12 @@ static const gcry_mac_spec_t * const mac_list_algo201[] = NULL, #endif #if USE_SM4 - &_gcry_mac_type_spec_cmac_sm4 + &_gcry_mac_type_spec_cmac_sm4, +#else + NULL, +#endif +#if USE_ARIA + &_gcry_mac_type_spec_cmac_aria #else NULL #endif @@ -345,7 +355,12 @@ static const gcry_mac_spec_t * const mac_list_algo401[] = NULL, #endif #if USE_SM4 - &_gcry_mac_type_spec_gmac_sm4 + &_gcry_mac_type_spec_gmac_sm4, +#else + NULL, +#endif +#if USE_ARIA + &_gcry_mac_type_spec_gmac_aria #else NULL #endif @@ -381,7 +396,12 @@ static const gcry_mac_spec_t * const mac_list_algo501[] = NULL, #endif #if USE_SM4 - &_gcry_mac_type_spec_poly1305mac_sm4 + &_gcry_mac_type_spec_poly1305mac_sm4, +#else + NULL, +#endif +#if USE_ARIA + &_gcry_mac_type_spec_poly1305mac_aria #else NULL #endif diff --git a/configure.ac b/configure.ac index cc1104ca..9163b2ed 100644 --- a/configure.ac +++ b/configure.ac @@ -212,7 +212,7 @@ LIBGCRYPT_CONFIG_HOST="$host" # Definitions for symmetric ciphers. available_ciphers="arcfour blowfish cast5 des aes twofish serpent rfc2268 seed" available_ciphers="$available_ciphers camellia idea salsa20 gost28147 chacha20" -available_ciphers="$available_ciphers sm4" +available_ciphers="$available_ciphers sm4 aria" enabled_ciphers="" # Definitions for public-key ciphers. @@ -3030,6 +3030,12 @@ if test "$found" = "1" ; then esac fi +LIST_MEMBER(aria, $enabled_ciphers) +if test "$found" = "1" ; then + GCRYPT_CIPHERS="$GCRYPT_CIPHERS aria.lo" + AC_DEFINE(USE_ARIA, 1, [Defined if this module should be included]) +fi + LIST_MEMBER(dsa, $enabled_pubkey_ciphers) AM_CONDITIONAL(USE_DSA, [test "$found" = "1"]) if test "$found" = "1" ; then diff --git a/doc/gcrypt.texi b/doc/gcrypt.texi index db4ad1e6..e44c2f2e 100644 --- a/doc/gcrypt.texi +++ b/doc/gcrypt.texi @@ -1685,6 +1685,15 @@ A 128 bit cipher by the State Cryptography Administration of China (SCA). See @uref{https://tools.ietf.org/html/draft-ribose-cfrg-sm4-10}. + at item GCRY_CIPHER_ARIA128 + at itemx GCRY_CIPHER_ARIA192 + at itemx GCRY_CIPHER_ARIA256 + at cindex ARIA (cipher) +ARIA is a general-purpose block cipher algorithm developed by +Korean cryptographers in 2003. 
It was established as a Korean +standard block cipher algorithm in 2004. See + at uref{https://www.rfc-editor.org/rfc/rfc5794.html}. + @end table @node Available cipher modes @@ -4241,6 +4250,10 @@ block cipher algorithm. This is CMAC message authentication algorithm based on the SM4 block cipher algorithm. + at item GCRY_MAC_CMAC_ARIA +This is CMAC message authentication algorithm based on the ARIA +block cipher algorithm. + @item GCRY_MAC_GMAC_AES This is GMAC (GCM mode based MAC) message authentication algorithm based on the AES block cipher algorithm. @@ -4265,6 +4278,10 @@ block cipher algorithm. This is GMAC message authentication algorithm based on the SM4 block cipher algorithm. + at item GCRY_MAC_GMAC_ARIA +This is GMAC message authentication algorithm based on the ARIA +block cipher algorithm. + @item GCRY_MAC_POLY1305 This is plain Poly1305 message authentication algorithm, used with one-time key. @@ -4293,6 +4310,10 @@ key and one-time nonce. This is Poly1305-SM4 message authentication algorithm, used with key and one-time nonce. + at item GCRY_MAC_POLY1305_ARIA +This is Poly1305-ARIA message authentication algorithm, used with +key and one-time nonce. + @item GCRY_MAC_GOST28147_IMIT This is MAC construction defined in GOST 28147-89 (see RFC 5830 Section 8). diff --git a/src/cipher.h b/src/cipher.h index 9b890aeb..95ed43d7 100644 --- a/src/cipher.h +++ b/src/cipher.h @@ -171,6 +171,9 @@ extern gcry_cipher_spec_t _gcry_cipher_spec_gost28147; extern gcry_cipher_spec_t _gcry_cipher_spec_gost28147_mesh; extern gcry_cipher_spec_t _gcry_cipher_spec_chacha20; extern gcry_cipher_spec_t _gcry_cipher_spec_sm4; +extern gcry_cipher_spec_t _gcry_cipher_spec_aria128; +extern gcry_cipher_spec_t _gcry_cipher_spec_aria192; +extern gcry_cipher_spec_t _gcry_cipher_spec_aria256; /* Declarations for the digest specifications. */ extern const gcry_md_spec_t _gcry_digest_spec_crc32; diff --git a/src/gcrypt.h.in b/src/gcrypt.h.in index 47d73339..aba22bfc 100644 --- a/src/gcrypt.h.in +++ b/src/gcrypt.h.in @@ -943,7 +943,10 @@ enum gcry_cipher_algos GCRY_CIPHER_GOST28147 = 315, GCRY_CIPHER_CHACHA20 = 316, GCRY_CIPHER_GOST28147_MESH = 317, /* With CryptoPro key meshing. */ - GCRY_CIPHER_SM4 = 318 + GCRY_CIPHER_SM4 = 318, + GCRY_CIPHER_ARIA128 = 319, + GCRY_CIPHER_ARIA192 = 320, + GCRY_CIPHER_ARIA256 = 321 }; /* The Rijndael algorithm is basically AES, so provide some macros. */ @@ -1510,6 +1513,7 @@ enum gcry_mac_algos GCRY_MAC_CMAC_IDEA = 210, GCRY_MAC_CMAC_GOST28147 = 211, GCRY_MAC_CMAC_SM4 = 212, + GCRY_MAC_CMAC_ARIA = 213, GCRY_MAC_GMAC_AES = 401, GCRY_MAC_GMAC_CAMELLIA = 402, @@ -1517,6 +1521,7 @@ enum gcry_mac_algos GCRY_MAC_GMAC_SERPENT = 404, GCRY_MAC_GMAC_SEED = 405, GCRY_MAC_GMAC_SM4 = 406, + GCRY_MAC_GMAC_ARIA = 407, GCRY_MAC_POLY1305 = 501, GCRY_MAC_POLY1305_AES = 502, @@ -1524,7 +1529,8 @@ enum gcry_mac_algos GCRY_MAC_POLY1305_TWOFISH = 504, GCRY_MAC_POLY1305_SERPENT = 505, GCRY_MAC_POLY1305_SEED = 506, - GCRY_MAC_POLY1305_SM4 = 507 + GCRY_MAC_POLY1305_SM4 = 507, + GCRY_MAC_POLY1305_ARIA = 508 }; /* Flags used with the open function. 
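A rough usage sketch for the new MAC identifiers declared above (key, nonce and
message values are placeholders): CMAC-ARIA maps to GCRY_CIPHER_ARIA128 and so
takes a 16-byte key, while Poly1305-ARIA, as with the other Poly1305 MACs, takes
a 32-byte key and a 16-byte one-time nonce set with gcry_mac_setiv:

  #include <gcrypt.h>

  static void
  aria_mac_example (void)
  {
    static const unsigned char key16[16] = { 0 };  /* placeholder key */
    static const unsigned char key32[32] = { 0 };  /* placeholder key */
    static const unsigned char nonce[16] = { 0 };  /* placeholder nonce */
    unsigned char mac[16];
    size_t maclen = sizeof mac;
    gcry_mac_hd_t hd;

    /* CMAC based on ARIA-128. */
    gcry_mac_open (&hd, GCRY_MAC_CMAC_ARIA, 0, NULL);
    gcry_mac_setkey (hd, key16, sizeof key16);
    gcry_mac_write (hd, "abc", 3);
    gcry_mac_read (hd, mac, &maclen);
    gcry_mac_close (hd);

    /* Poly1305 keyed via ARIA-128; requires a one-time nonce. */
    maclen = sizeof mac;
    gcry_mac_open (&hd, GCRY_MAC_POLY1305_ARIA, 0, NULL);
    gcry_mac_setkey (hd, key32, sizeof key32);
    gcry_mac_setiv (hd, nonce, sizeof nonce);
    gcry_mac_write (hd, "abc", 3);
    gcry_mac_read (hd, mac, &maclen);
    gcry_mac_close (hd);
  }
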
*/ diff --git a/tests/basic.c b/tests/basic.c index 429bd237..68f4557b 100644 --- a/tests/basic.c +++ b/tests/basic.c @@ -919,6 +919,306 @@ check_ecb_cipher (void) } }, #endif /* USE_SM4 */ +#if USE_ARIA + { GCRY_CIPHER_ARIA128, + "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f", + 0, FLAG_NOFIPS, + { { "\x00\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa\xbb\xcc\xdd\xee\xff", + 16, + 16, + "\xd7\x18\xfb\xd6\xab\x64\x4c\x73\x9d\xa9\x5f\x3b\xe6\x45\x17\x78" }, + { } + } + }, + { GCRY_CIPHER_ARIA128, + "\x00\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa\xbb\xcc\xdd\xee\xff", + 0, FLAG_NOFIPS, + { { "\x11\x11\x11\x11\xaa\xaa\xaa\xaa\x11\x11\x11\x11\xbb\xbb\xbb\xbb" + "\x11\x11\x11\x11\xcc\xcc\xcc\xcc\x11\x11\x11\x11\xdd\xdd\xdd\xdd" + "\x22\x22\x22\x22\xaa\xaa\xaa\xaa\x22\x22\x22\x22\xbb\xbb\xbb\xbb" + "\x22\x22\x22\x22\xcc\xcc\xcc\xcc\x22\x22\x22\x22\xdd\xdd\xdd\xdd" + "\x33\x33\x33\x33\xaa\xaa\xaa\xaa\x33\x33\x33\x33\xbb\xbb\xbb\xbb" + "\x33\x33\x33\x33\xcc\xcc\xcc\xcc\x33\x33\x33\x33\xdd\xdd\xdd\xdd" + "\x44\x44\x44\x44\xaa\xaa\xaa\xaa\x44\x44\x44\x44\xbb\xbb\xbb\xbb" + "\x44\x44\x44\x44\xcc\xcc\xcc\xcc\x44\x44\x44\x44\xdd\xdd\xdd\xdd" + "\x55\x55\x55\x55\xaa\xaa\xaa\xaa\x55\x55\x55\x55\xbb\xbb\xbb\xbb" + "\x55\x55\x55\x55\xcc\xcc\xcc\xcc\x55\x55\x55\x55\xdd\xdd\xdd\xdd" + "\x11\x11\x11\x11\xaa\xaa\xaa\xaa\x11\x11\x11\x11\xbb\xbb\xbb\xbb" + "\x11\x11\x11\x11\xcc\xcc\xcc\xcc\x11\x11\x11\x11\xdd\xdd\xdd\xdd" + "\x22\x22\x22\x22\xaa\xaa\xaa\xaa\x22\x22\x22\x22\xbb\xbb\xbb\xbb" + "\x22\x22\x22\x22\xcc\xcc\xcc\xcc\x22\x22\x22\x22\xdd\xdd\xdd\xdd" + "\x33\x33\x33\x33\xaa\xaa\xaa\xaa\x33\x33\x33\x33\xbb\xbb\xbb\xbb" + "\x33\x33\x33\x33\xcc\xcc\xcc\xcc\x33\x33\x33\x33\xdd\xdd\xdd\xdd" + "\x44\x44\x44\x44\xaa\xaa\xaa\xaa\x44\x44\x44\x44\xbb\xbb\xbb\xbb" + "\x44\x44\x44\x44\xcc\xcc\xcc\xcc\x44\x44\x44\x44\xdd\xdd\xdd\xdd" + "\x55\x55\x55\x55\xaa\xaa\xaa\xaa\x55\x55\x55\x55\xbb\xbb\xbb\xbb" + "\x55\x55\x55\x55\xcc\xcc\xcc\xcc\x55\x55\x55\x55\xdd\xdd\xdd\xdd" + "\x11\x11\x11\x11\xaa\xaa\xaa\xaa\x11\x11\x11\x11\xbb\xbb\xbb\xbb" + "\x11\x11\x11\x11\xcc\xcc\xcc\xcc\x11\x11\x11\x11\xdd\xdd\xdd\xdd" + "\x22\x22\x22\x22\xaa\xaa\xaa\xaa\x22\x22\x22\x22\xbb\xbb\xbb\xbb" + "\x22\x22\x22\x22\xcc\xcc\xcc\xcc\x22\x22\x22\x22\xdd\xdd\xdd\xdd" + "\x33\x33\x33\x33\xaa\xaa\xaa\xaa\x33\x33\x33\x33\xbb\xbb\xbb\xbb" + "\x33\x33\x33\x33\xcc\xcc\xcc\xcc\x33\x33\x33\x33\xdd\xdd\xdd\xdd" + "\x44\x44\x44\x44\xaa\xaa\xaa\xaa\x44\x44\x44\x44\xbb\xbb\xbb\xbb" + "\x44\x44\x44\x44\xcc\xcc\xcc\xcc\x44\x44\x44\x44\xdd\xdd\xdd\xdd" + "\x55\x55\x55\x55\xaa\xaa\xaa\xaa\x55\x55\x55\x55\xbb\xbb\xbb\xbb" + "\x55\x55\x55\x55\xcc\xcc\xcc\xcc\x55\x55\x55\x55\xdd\xdd\xdd\xdd" + "\x11\x11\x11\x11\xaa\xaa\xaa\xaa\x11\x11\x11\x11\xbb\xbb\xbb\xbb" + "\x11\x11\x11\x11\xcc\xcc\xcc\xcc\x11\x11\x11\x11\xdd\xdd\xdd\xdd" + "\x22\x22\x22\x22\xaa\xaa\xaa\xaa\x22\x22\x22\x22\xbb\xbb\xbb\xbb" + "\x22\x22\x22\x22\xcc\xcc\xcc\xcc\x22\x22\x22\x22\xdd\xdd\xdd\xdd" + "\x33\x33\x33\x33\xaa\xaa\xaa\xaa\x33\x33\x33\x33\xbb\xbb\xbb\xbb" + "\x33\x33\x33\x33\xcc\xcc\xcc\xcc\x33\x33\x33\x33\xdd\xdd\xdd\xdd" + "\x44\x44\x44\x44\xaa\xaa\xaa\xaa\x44\x44\x44\x44\xbb\xbb\xbb\xbb" + "\x44\x44\x44\x44\xcc\xcc\xcc\xcc\x44\x44\x44\x44\xdd\xdd\xdd\xdd" + "\x55\x55\x55\x55\xaa\xaa\xaa\xaa\x55\x55\x55\x55\xbb\xbb\xbb\xbb" + "\x55\x55\x55\x55\xcc\xcc\xcc\xcc\x55\x55\x55\x55\xdd\xdd\xdd\xdd", + 16, + 4 * 10 * 16, + "\xc6\xec\xd0\x8e\x22\xc3\x0a\xbd\xb2\x15\xcf\x74\xe2\x07\x5e\x6e" + "\x29\xcc\xaa\xc6\x34\x48\x70\x8d\x33\x1b\x2f\x81\x6c\x51\xb1\x7d" + 
"\x9e\x13\x3d\x15\x28\xdb\xf0\xaf\x57\x87\xc7\xf3\xa3\xf5\xc2\xbf" + "\x6b\x6f\x34\x59\x07\xa3\x05\x56\x12\xce\x07\x2f\xf5\x4d\xe7\xd7" + "\x88\x42\x4d\xa6\xe8\xcc\xfe\x81\x72\xb3\x91\xbe\x49\x93\x54\x16" + "\x56\x65\xba\x78\x64\x91\x70\x00\xa6\xee\xb2\xec\xb4\xa6\x98\xed" + "\xfc\x78\x87\xe7\xf5\x56\x37\x76\x14\xab\x0a\x28\x22\x93\xe6\xd8" + "\x84\xdb\xb8\x42\x06\xcd\xb1\x6e\xd1\x75\x4e\x77\xa1\xf2\x43\xfd" + "\x08\x69\x53\xf7\x52\xcc\x1e\x46\xc7\xc7\x94\xae\x85\x53\x7d\xca" + "\xec\x8d\xd7\x21\xf5\x5c\x93\xb6\xed\xfe\x2a\xde\xa4\x38\x73\xe8" + "\xc6\xec\xd0\x8e\x22\xc3\x0a\xbd\xb2\x15\xcf\x74\xe2\x07\x5e\x6e" + "\x29\xcc\xaa\xc6\x34\x48\x70\x8d\x33\x1b\x2f\x81\x6c\x51\xb1\x7d" + "\x9e\x13\x3d\x15\x28\xdb\xf0\xaf\x57\x87\xc7\xf3\xa3\xf5\xc2\xbf" + "\x6b\x6f\x34\x59\x07\xa3\x05\x56\x12\xce\x07\x2f\xf5\x4d\xe7\xd7" + "\x88\x42\x4d\xa6\xe8\xcc\xfe\x81\x72\xb3\x91\xbe\x49\x93\x54\x16" + "\x56\x65\xba\x78\x64\x91\x70\x00\xa6\xee\xb2\xec\xb4\xa6\x98\xed" + "\xfc\x78\x87\xe7\xf5\x56\x37\x76\x14\xab\x0a\x28\x22\x93\xe6\xd8" + "\x84\xdb\xb8\x42\x06\xcd\xb1\x6e\xd1\x75\x4e\x77\xa1\xf2\x43\xfd" + "\x08\x69\x53\xf7\x52\xcc\x1e\x46\xc7\xc7\x94\xae\x85\x53\x7d\xca" + "\xec\x8d\xd7\x21\xf5\x5c\x93\xb6\xed\xfe\x2a\xde\xa4\x38\x73\xe8" + "\xc6\xec\xd0\x8e\x22\xc3\x0a\xbd\xb2\x15\xcf\x74\xe2\x07\x5e\x6e" + "\x29\xcc\xaa\xc6\x34\x48\x70\x8d\x33\x1b\x2f\x81\x6c\x51\xb1\x7d" + "\x9e\x13\x3d\x15\x28\xdb\xf0\xaf\x57\x87\xc7\xf3\xa3\xf5\xc2\xbf" + "\x6b\x6f\x34\x59\x07\xa3\x05\x56\x12\xce\x07\x2f\xf5\x4d\xe7\xd7" + "\x88\x42\x4d\xa6\xe8\xcc\xfe\x81\x72\xb3\x91\xbe\x49\x93\x54\x16" + "\x56\x65\xba\x78\x64\x91\x70\x00\xa6\xee\xb2\xec\xb4\xa6\x98\xed" + "\xfc\x78\x87\xe7\xf5\x56\x37\x76\x14\xab\x0a\x28\x22\x93\xe6\xd8" + "\x84\xdb\xb8\x42\x06\xcd\xb1\x6e\xd1\x75\x4e\x77\xa1\xf2\x43\xfd" + "\x08\x69\x53\xf7\x52\xcc\x1e\x46\xc7\xc7\x94\xae\x85\x53\x7d\xca" + "\xec\x8d\xd7\x21\xf5\x5c\x93\xb6\xed\xfe\x2a\xde\xa4\x38\x73\xe8" + "\xc6\xec\xd0\x8e\x22\xc3\x0a\xbd\xb2\x15\xcf\x74\xe2\x07\x5e\x6e" + "\x29\xcc\xaa\xc6\x34\x48\x70\x8d\x33\x1b\x2f\x81\x6c\x51\xb1\x7d" + "\x9e\x13\x3d\x15\x28\xdb\xf0\xaf\x57\x87\xc7\xf3\xa3\xf5\xc2\xbf" + "\x6b\x6f\x34\x59\x07\xa3\x05\x56\x12\xce\x07\x2f\xf5\x4d\xe7\xd7" + "\x88\x42\x4d\xa6\xe8\xcc\xfe\x81\x72\xb3\x91\xbe\x49\x93\x54\x16" + "\x56\x65\xba\x78\x64\x91\x70\x00\xa6\xee\xb2\xec\xb4\xa6\x98\xed" + "\xfc\x78\x87\xe7\xf5\x56\x37\x76\x14\xab\x0a\x28\x22\x93\xe6\xd8" + "\x84\xdb\xb8\x42\x06\xcd\xb1\x6e\xd1\x75\x4e\x77\xa1\xf2\x43\xfd" + "\x08\x69\x53\xf7\x52\xcc\x1e\x46\xc7\xc7\x94\xae\x85\x53\x7d\xca" + "\xec\x8d\xd7\x21\xf5\x5c\x93\xb6\xed\xfe\x2a\xde\xa4\x38\x73\xe8" }, + { } + } + }, + { GCRY_CIPHER_ARIA192, + "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" + "\x10\x11\x12\x13\x14\x15\x16\x17", + 0, FLAG_NOFIPS, + { { "\x00\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa\xbb\xcc\xdd\xee\xff", + 24, + 16, + "\x26\x44\x9c\x18\x05\xdb\xe7\xaa\x25\xa4\x68\xce\x26\x3a\x9e\x79" }, + { } + } + }, + { GCRY_CIPHER_ARIA192, + "\x00\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa\xbb\xcc\xdd\xee\xff" + "\x00\x11\x22\x33\x44\x55\x66\x77", + 0, FLAG_NOFIPS, + { { "\x11\x11\x11\x11\xaa\xaa\xaa\xaa\x11\x11\x11\x11\xbb\xbb\xbb\xbb" + "\x11\x11\x11\x11\xcc\xcc\xcc\xcc\x11\x11\x11\x11\xdd\xdd\xdd\xdd" + "\x22\x22\x22\x22\xaa\xaa\xaa\xaa\x22\x22\x22\x22\xbb\xbb\xbb\xbb" + "\x22\x22\x22\x22\xcc\xcc\xcc\xcc\x22\x22\x22\x22\xdd\xdd\xdd\xdd" + "\x33\x33\x33\x33\xaa\xaa\xaa\xaa\x33\x33\x33\x33\xbb\xbb\xbb\xbb" + "\x33\x33\x33\x33\xcc\xcc\xcc\xcc\x33\x33\x33\x33\xdd\xdd\xdd\xdd" + 
"\x44\x44\x44\x44\xaa\xaa\xaa\xaa\x44\x44\x44\x44\xbb\xbb\xbb\xbb" + "\x44\x44\x44\x44\xcc\xcc\xcc\xcc\x44\x44\x44\x44\xdd\xdd\xdd\xdd" + "\x55\x55\x55\x55\xaa\xaa\xaa\xaa\x55\x55\x55\x55\xbb\xbb\xbb\xbb" + "\x55\x55\x55\x55\xcc\xcc\xcc\xcc\x55\x55\x55\x55\xdd\xdd\xdd\xdd" + "\x11\x11\x11\x11\xaa\xaa\xaa\xaa\x11\x11\x11\x11\xbb\xbb\xbb\xbb" + "\x11\x11\x11\x11\xcc\xcc\xcc\xcc\x11\x11\x11\x11\xdd\xdd\xdd\xdd" + "\x22\x22\x22\x22\xaa\xaa\xaa\xaa\x22\x22\x22\x22\xbb\xbb\xbb\xbb" + "\x22\x22\x22\x22\xcc\xcc\xcc\xcc\x22\x22\x22\x22\xdd\xdd\xdd\xdd" + "\x33\x33\x33\x33\xaa\xaa\xaa\xaa\x33\x33\x33\x33\xbb\xbb\xbb\xbb" + "\x33\x33\x33\x33\xcc\xcc\xcc\xcc\x33\x33\x33\x33\xdd\xdd\xdd\xdd" + "\x44\x44\x44\x44\xaa\xaa\xaa\xaa\x44\x44\x44\x44\xbb\xbb\xbb\xbb" + "\x44\x44\x44\x44\xcc\xcc\xcc\xcc\x44\x44\x44\x44\xdd\xdd\xdd\xdd" + "\x55\x55\x55\x55\xaa\xaa\xaa\xaa\x55\x55\x55\x55\xbb\xbb\xbb\xbb" + "\x55\x55\x55\x55\xcc\xcc\xcc\xcc\x55\x55\x55\x55\xdd\xdd\xdd\xdd" + "\x11\x11\x11\x11\xaa\xaa\xaa\xaa\x11\x11\x11\x11\xbb\xbb\xbb\xbb" + "\x11\x11\x11\x11\xcc\xcc\xcc\xcc\x11\x11\x11\x11\xdd\xdd\xdd\xdd" + "\x22\x22\x22\x22\xaa\xaa\xaa\xaa\x22\x22\x22\x22\xbb\xbb\xbb\xbb" + "\x22\x22\x22\x22\xcc\xcc\xcc\xcc\x22\x22\x22\x22\xdd\xdd\xdd\xdd" + "\x33\x33\x33\x33\xaa\xaa\xaa\xaa\x33\x33\x33\x33\xbb\xbb\xbb\xbb" + "\x33\x33\x33\x33\xcc\xcc\xcc\xcc\x33\x33\x33\x33\xdd\xdd\xdd\xdd" + "\x44\x44\x44\x44\xaa\xaa\xaa\xaa\x44\x44\x44\x44\xbb\xbb\xbb\xbb" + "\x44\x44\x44\x44\xcc\xcc\xcc\xcc\x44\x44\x44\x44\xdd\xdd\xdd\xdd" + "\x55\x55\x55\x55\xaa\xaa\xaa\xaa\x55\x55\x55\x55\xbb\xbb\xbb\xbb" + "\x55\x55\x55\x55\xcc\xcc\xcc\xcc\x55\x55\x55\x55\xdd\xdd\xdd\xdd" + "\x11\x11\x11\x11\xaa\xaa\xaa\xaa\x11\x11\x11\x11\xbb\xbb\xbb\xbb" + "\x11\x11\x11\x11\xcc\xcc\xcc\xcc\x11\x11\x11\x11\xdd\xdd\xdd\xdd" + "\x22\x22\x22\x22\xaa\xaa\xaa\xaa\x22\x22\x22\x22\xbb\xbb\xbb\xbb" + "\x22\x22\x22\x22\xcc\xcc\xcc\xcc\x22\x22\x22\x22\xdd\xdd\xdd\xdd" + "\x33\x33\x33\x33\xaa\xaa\xaa\xaa\x33\x33\x33\x33\xbb\xbb\xbb\xbb" + "\x33\x33\x33\x33\xcc\xcc\xcc\xcc\x33\x33\x33\x33\xdd\xdd\xdd\xdd" + "\x44\x44\x44\x44\xaa\xaa\xaa\xaa\x44\x44\x44\x44\xbb\xbb\xbb\xbb" + "\x44\x44\x44\x44\xcc\xcc\xcc\xcc\x44\x44\x44\x44\xdd\xdd\xdd\xdd" + "\x55\x55\x55\x55\xaa\xaa\xaa\xaa\x55\x55\x55\x55\xbb\xbb\xbb\xbb" + "\x55\x55\x55\x55\xcc\xcc\xcc\xcc\x55\x55\x55\x55\xdd\xdd\xdd\xdd", + 24, + 4 * 10 * 16, + "\x8d\x14\x70\x62\x5f\x59\xeb\xac\xb0\xe5\x5b\x53\x4b\x3e\x46\x2b" + "\x5f\x23\xd3\x3b\xff\x78\xf4\x6c\x3c\x15\x91\x1f\x4a\x21\x80\x9a" + "\xac\xca\xd8\x0b\x4b\xda\x91\x5a\xa9\xda\xe6\xbc\xeb\xe0\x6a\x6c" + "\x83\xf7\x7f\xd5\x39\x1a\xcf\xe6\x1d\xe2\xf6\x46\xb5\xd4\x47\xed" + "\xbf\xd5\xbb\x49\xb1\x2f\xbb\x91\x45\xb2\x27\x89\x5a\x75\x7b\x2a" + "\xf1\xf7\x18\x87\x34\x86\x3d\x7b\x8b\x6e\xde\x5a\x5b\x2f\x06\xa0" + "\xa2\x33\xc8\x52\x3d\x2d\xb7\x78\xfb\x31\xb0\xe3\x11\xf3\x27\x00" + "\x15\x2f\x33\x86\x1e\x9d\x04\x0c\x83\xb5\xeb\x40\xcd\x88\xea\x49" + "\x97\x57\x09\xdc\x62\x93\x65\xa1\x89\xf7\x8a\x3e\xc4\x03\x45\xfc" + "\x6a\x5a\x30\x7a\x8f\x9a\x44\x13\x09\x1e\x00\x7e\xca\x56\x45\xa0" + "\x8d\x14\x70\x62\x5f\x59\xeb\xac\xb0\xe5\x5b\x53\x4b\x3e\x46\x2b" + "\x5f\x23\xd3\x3b\xff\x78\xf4\x6c\x3c\x15\x91\x1f\x4a\x21\x80\x9a" + "\xac\xca\xd8\x0b\x4b\xda\x91\x5a\xa9\xda\xe6\xbc\xeb\xe0\x6a\x6c" + "\x83\xf7\x7f\xd5\x39\x1a\xcf\xe6\x1d\xe2\xf6\x46\xb5\xd4\x47\xed" + "\xbf\xd5\xbb\x49\xb1\x2f\xbb\x91\x45\xb2\x27\x89\x5a\x75\x7b\x2a" + "\xf1\xf7\x18\x87\x34\x86\x3d\x7b\x8b\x6e\xde\x5a\x5b\x2f\x06\xa0" + "\xa2\x33\xc8\x52\x3d\x2d\xb7\x78\xfb\x31\xb0\xe3\x11\xf3\x27\x00" + 
"\x15\x2f\x33\x86\x1e\x9d\x04\x0c\x83\xb5\xeb\x40\xcd\x88\xea\x49" + "\x97\x57\x09\xdc\x62\x93\x65\xa1\x89\xf7\x8a\x3e\xc4\x03\x45\xfc" + "\x6a\x5a\x30\x7a\x8f\x9a\x44\x13\x09\x1e\x00\x7e\xca\x56\x45\xa0" + "\x8d\x14\x70\x62\x5f\x59\xeb\xac\xb0\xe5\x5b\x53\x4b\x3e\x46\x2b" + "\x5f\x23\xd3\x3b\xff\x78\xf4\x6c\x3c\x15\x91\x1f\x4a\x21\x80\x9a" + "\xac\xca\xd8\x0b\x4b\xda\x91\x5a\xa9\xda\xe6\xbc\xeb\xe0\x6a\x6c" + "\x83\xf7\x7f\xd5\x39\x1a\xcf\xe6\x1d\xe2\xf6\x46\xb5\xd4\x47\xed" + "\xbf\xd5\xbb\x49\xb1\x2f\xbb\x91\x45\xb2\x27\x89\x5a\x75\x7b\x2a" + "\xf1\xf7\x18\x87\x34\x86\x3d\x7b\x8b\x6e\xde\x5a\x5b\x2f\x06\xa0" + "\xa2\x33\xc8\x52\x3d\x2d\xb7\x78\xfb\x31\xb0\xe3\x11\xf3\x27\x00" + "\x15\x2f\x33\x86\x1e\x9d\x04\x0c\x83\xb5\xeb\x40\xcd\x88\xea\x49" + "\x97\x57\x09\xdc\x62\x93\x65\xa1\x89\xf7\x8a\x3e\xc4\x03\x45\xfc" + "\x6a\x5a\x30\x7a\x8f\x9a\x44\x13\x09\x1e\x00\x7e\xca\x56\x45\xa0" + "\x8d\x14\x70\x62\x5f\x59\xeb\xac\xb0\xe5\x5b\x53\x4b\x3e\x46\x2b" + "\x5f\x23\xd3\x3b\xff\x78\xf4\x6c\x3c\x15\x91\x1f\x4a\x21\x80\x9a" + "\xac\xca\xd8\x0b\x4b\xda\x91\x5a\xa9\xda\xe6\xbc\xeb\xe0\x6a\x6c" + "\x83\xf7\x7f\xd5\x39\x1a\xcf\xe6\x1d\xe2\xf6\x46\xb5\xd4\x47\xed" + "\xbf\xd5\xbb\x49\xb1\x2f\xbb\x91\x45\xb2\x27\x89\x5a\x75\x7b\x2a" + "\xf1\xf7\x18\x87\x34\x86\x3d\x7b\x8b\x6e\xde\x5a\x5b\x2f\x06\xa0" + "\xa2\x33\xc8\x52\x3d\x2d\xb7\x78\xfb\x31\xb0\xe3\x11\xf3\x27\x00" + "\x15\x2f\x33\x86\x1e\x9d\x04\x0c\x83\xb5\xeb\x40\xcd\x88\xea\x49" + "\x97\x57\x09\xdc\x62\x93\x65\xa1\x89\xf7\x8a\x3e\xc4\x03\x45\xfc" + "\x6a\x5a\x30\x7a\x8f\x9a\x44\x13\x09\x1e\x00\x7e\xca\x56\x45\xa0" }, + { } + } + }, + { GCRY_CIPHER_ARIA256, + "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" + "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f", + 0, FLAG_NOFIPS, + { { "\x00\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa\xbb\xcc\xdd\xee\xff", + 32, + 16, + "\xf9\x2b\xd7\xc7\x9f\xb7\x2e\x2f\x2b\x8f\x80\xc1\x97\x2d\x24\xfc" }, + { } + } + }, + { GCRY_CIPHER_ARIA256, + "\x00\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa\xbb\xcc\xdd\xee\xff" + "\x00\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa\xbb\xcc\xdd\xee\xff", + 0, FLAG_NOFIPS, + { { "\x11\x11\x11\x11\xaa\xaa\xaa\xaa\x11\x11\x11\x11\xbb\xbb\xbb\xbb" + "\x11\x11\x11\x11\xcc\xcc\xcc\xcc\x11\x11\x11\x11\xdd\xdd\xdd\xdd" + "\x22\x22\x22\x22\xaa\xaa\xaa\xaa\x22\x22\x22\x22\xbb\xbb\xbb\xbb" + "\x22\x22\x22\x22\xcc\xcc\xcc\xcc\x22\x22\x22\x22\xdd\xdd\xdd\xdd" + "\x33\x33\x33\x33\xaa\xaa\xaa\xaa\x33\x33\x33\x33\xbb\xbb\xbb\xbb" + "\x33\x33\x33\x33\xcc\xcc\xcc\xcc\x33\x33\x33\x33\xdd\xdd\xdd\xdd" + "\x44\x44\x44\x44\xaa\xaa\xaa\xaa\x44\x44\x44\x44\xbb\xbb\xbb\xbb" + "\x44\x44\x44\x44\xcc\xcc\xcc\xcc\x44\x44\x44\x44\xdd\xdd\xdd\xdd" + "\x55\x55\x55\x55\xaa\xaa\xaa\xaa\x55\x55\x55\x55\xbb\xbb\xbb\xbb" + "\x55\x55\x55\x55\xcc\xcc\xcc\xcc\x55\x55\x55\x55\xdd\xdd\xdd\xdd" + "\x11\x11\x11\x11\xaa\xaa\xaa\xaa\x11\x11\x11\x11\xbb\xbb\xbb\xbb" + "\x11\x11\x11\x11\xcc\xcc\xcc\xcc\x11\x11\x11\x11\xdd\xdd\xdd\xdd" + "\x22\x22\x22\x22\xaa\xaa\xaa\xaa\x22\x22\x22\x22\xbb\xbb\xbb\xbb" + "\x22\x22\x22\x22\xcc\xcc\xcc\xcc\x22\x22\x22\x22\xdd\xdd\xdd\xdd" + "\x33\x33\x33\x33\xaa\xaa\xaa\xaa\x33\x33\x33\x33\xbb\xbb\xbb\xbb" + "\x33\x33\x33\x33\xcc\xcc\xcc\xcc\x33\x33\x33\x33\xdd\xdd\xdd\xdd" + "\x44\x44\x44\x44\xaa\xaa\xaa\xaa\x44\x44\x44\x44\xbb\xbb\xbb\xbb" + "\x44\x44\x44\x44\xcc\xcc\xcc\xcc\x44\x44\x44\x44\xdd\xdd\xdd\xdd" + "\x55\x55\x55\x55\xaa\xaa\xaa\xaa\x55\x55\x55\x55\xbb\xbb\xbb\xbb" + "\x55\x55\x55\x55\xcc\xcc\xcc\xcc\x55\x55\x55\x55\xdd\xdd\xdd\xdd" + 
"\x11\x11\x11\x11\xaa\xaa\xaa\xaa\x11\x11\x11\x11\xbb\xbb\xbb\xbb" + "\x11\x11\x11\x11\xcc\xcc\xcc\xcc\x11\x11\x11\x11\xdd\xdd\xdd\xdd" + "\x22\x22\x22\x22\xaa\xaa\xaa\xaa\x22\x22\x22\x22\xbb\xbb\xbb\xbb" + "\x22\x22\x22\x22\xcc\xcc\xcc\xcc\x22\x22\x22\x22\xdd\xdd\xdd\xdd" + "\x33\x33\x33\x33\xaa\xaa\xaa\xaa\x33\x33\x33\x33\xbb\xbb\xbb\xbb" + "\x33\x33\x33\x33\xcc\xcc\xcc\xcc\x33\x33\x33\x33\xdd\xdd\xdd\xdd" + "\x44\x44\x44\x44\xaa\xaa\xaa\xaa\x44\x44\x44\x44\xbb\xbb\xbb\xbb" + "\x44\x44\x44\x44\xcc\xcc\xcc\xcc\x44\x44\x44\x44\xdd\xdd\xdd\xdd" + "\x55\x55\x55\x55\xaa\xaa\xaa\xaa\x55\x55\x55\x55\xbb\xbb\xbb\xbb" + "\x55\x55\x55\x55\xcc\xcc\xcc\xcc\x55\x55\x55\x55\xdd\xdd\xdd\xdd" + "\x11\x11\x11\x11\xaa\xaa\xaa\xaa\x11\x11\x11\x11\xbb\xbb\xbb\xbb" + "\x11\x11\x11\x11\xcc\xcc\xcc\xcc\x11\x11\x11\x11\xdd\xdd\xdd\xdd" + "\x22\x22\x22\x22\xaa\xaa\xaa\xaa\x22\x22\x22\x22\xbb\xbb\xbb\xbb" + "\x22\x22\x22\x22\xcc\xcc\xcc\xcc\x22\x22\x22\x22\xdd\xdd\xdd\xdd" + "\x33\x33\x33\x33\xaa\xaa\xaa\xaa\x33\x33\x33\x33\xbb\xbb\xbb\xbb" + "\x33\x33\x33\x33\xcc\xcc\xcc\xcc\x33\x33\x33\x33\xdd\xdd\xdd\xdd" + "\x44\x44\x44\x44\xaa\xaa\xaa\xaa\x44\x44\x44\x44\xbb\xbb\xbb\xbb" + "\x44\x44\x44\x44\xcc\xcc\xcc\xcc\x44\x44\x44\x44\xdd\xdd\xdd\xdd" + "\x55\x55\x55\x55\xaa\xaa\xaa\xaa\x55\x55\x55\x55\xbb\xbb\xbb\xbb" + "\x55\x55\x55\x55\xcc\xcc\xcc\xcc\x55\x55\x55\x55\xdd\xdd\xdd\xdd", + 32, + 4 * 10 * 16, + "\x58\xa8\x75\xe6\x04\x4a\xd7\xff\xfa\x4f\x58\x42\x0f\x7f\x44\x2d" + "\x8e\x19\x10\x16\xf2\x8e\x79\xae\xfc\x01\xe2\x04\x77\x32\x80\xd7" + "\x01\x8e\x5f\x7a\x93\x8e\xc3\x07\x11\x71\x99\x53\xba\xe8\x65\x42" + "\xcd\x7e\xbc\x75\x24\x74\xc1\xa5\xf6\xea\xaa\xce\x2a\x7e\x29\x46" + "\x2e\xe7\xdf\xa5\xaf\xdb\x84\x17\x7e\xad\x95\xcc\xd4\xb4\xbb\x6e" + "\x1e\xd1\x7b\x95\x34\xcf\xf0\xa5\xfc\x29\x41\x42\x9c\xfe\xe2\xee" + "\x49\xc7\xad\xbe\xb7\xe9\xd1\xb0\xd2\xa8\x53\x1d\x94\x20\x79\x59" + "\x6a\x27\xed\x79\xf5\xb1\xdd\x13\xec\xd6\x04\xb0\x7a\x48\x88\x5a" + "\x3a\xfa\x06\x27\xa0\xe4\xe6\x0a\x3c\x70\x3a\xf2\x92\xf1\xba\xa7" + "\x7b\x70\x2f\x16\xc5\x4a\xa7\x4b\xc7\x27\xea\x95\xc7\x46\x8b\x00" + "\x58\xa8\x75\xe6\x04\x4a\xd7\xff\xfa\x4f\x58\x42\x0f\x7f\x44\x2d" + "\x8e\x19\x10\x16\xf2\x8e\x79\xae\xfc\x01\xe2\x04\x77\x32\x80\xd7" + "\x01\x8e\x5f\x7a\x93\x8e\xc3\x07\x11\x71\x99\x53\xba\xe8\x65\x42" + "\xcd\x7e\xbc\x75\x24\x74\xc1\xa5\xf6\xea\xaa\xce\x2a\x7e\x29\x46" + "\x2e\xe7\xdf\xa5\xaf\xdb\x84\x17\x7e\xad\x95\xcc\xd4\xb4\xbb\x6e" + "\x1e\xd1\x7b\x95\x34\xcf\xf0\xa5\xfc\x29\x41\x42\x9c\xfe\xe2\xee" + "\x49\xc7\xad\xbe\xb7\xe9\xd1\xb0\xd2\xa8\x53\x1d\x94\x20\x79\x59" + "\x6a\x27\xed\x79\xf5\xb1\xdd\x13\xec\xd6\x04\xb0\x7a\x48\x88\x5a" + "\x3a\xfa\x06\x27\xa0\xe4\xe6\x0a\x3c\x70\x3a\xf2\x92\xf1\xba\xa7" + "\x7b\x70\x2f\x16\xc5\x4a\xa7\x4b\xc7\x27\xea\x95\xc7\x46\x8b\x00" + "\x58\xa8\x75\xe6\x04\x4a\xd7\xff\xfa\x4f\x58\x42\x0f\x7f\x44\x2d" + "\x8e\x19\x10\x16\xf2\x8e\x79\xae\xfc\x01\xe2\x04\x77\x32\x80\xd7" + "\x01\x8e\x5f\x7a\x93\x8e\xc3\x07\x11\x71\x99\x53\xba\xe8\x65\x42" + "\xcd\x7e\xbc\x75\x24\x74\xc1\xa5\xf6\xea\xaa\xce\x2a\x7e\x29\x46" + "\x2e\xe7\xdf\xa5\xaf\xdb\x84\x17\x7e\xad\x95\xcc\xd4\xb4\xbb\x6e" + "\x1e\xd1\x7b\x95\x34\xcf\xf0\xa5\xfc\x29\x41\x42\x9c\xfe\xe2\xee" + "\x49\xc7\xad\xbe\xb7\xe9\xd1\xb0\xd2\xa8\x53\x1d\x94\x20\x79\x59" + "\x6a\x27\xed\x79\xf5\xb1\xdd\x13\xec\xd6\x04\xb0\x7a\x48\x88\x5a" + "\x3a\xfa\x06\x27\xa0\xe4\xe6\x0a\x3c\x70\x3a\xf2\x92\xf1\xba\xa7" + "\x7b\x70\x2f\x16\xc5\x4a\xa7\x4b\xc7\x27\xea\x95\xc7\x46\x8b\x00" + "\x58\xa8\x75\xe6\x04\x4a\xd7\xff\xfa\x4f\x58\x42\x0f\x7f\x44\x2d" + 
"\x8e\x19\x10\x16\xf2\x8e\x79\xae\xfc\x01\xe2\x04\x77\x32\x80\xd7" + "\x01\x8e\x5f\x7a\x93\x8e\xc3\x07\x11\x71\x99\x53\xba\xe8\x65\x42" + "\xcd\x7e\xbc\x75\x24\x74\xc1\xa5\xf6\xea\xaa\xce\x2a\x7e\x29\x46" + "\x2e\xe7\xdf\xa5\xaf\xdb\x84\x17\x7e\xad\x95\xcc\xd4\xb4\xbb\x6e" + "\x1e\xd1\x7b\x95\x34\xcf\xf0\xa5\xfc\x29\x41\x42\x9c\xfe\xe2\xee" + "\x49\xc7\xad\xbe\xb7\xe9\xd1\xb0\xd2\xa8\x53\x1d\x94\x20\x79\x59" + "\x6a\x27\xed\x79\xf5\xb1\xdd\x13\xec\xd6\x04\xb0\x7a\x48\x88\x5a" + "\x3a\xfa\x06\x27\xa0\xe4\xe6\x0a\x3c\x70\x3a\xf2\x92\xf1\xba\xa7" + "\x7b\x70\x2f\x16\xc5\x4a\xa7\x4b\xc7\x27\xea\x95\xc7\x46\x8b\x00" }, + { } + } + }, +#endif /* USE_ARIA */ }; gcry_cipher_hd_t hde, hdd; unsigned char out[MAX_DATA_LEN]; @@ -2180,6 +2480,91 @@ check_ctr_cipher (void) } }, #endif /* USE_SM4 */ +#if USE_ARIA + { GCRY_CIPHER_ARIA128, FLAG_NOFIPS, + "\x00\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa\xbb\xcc\xdd\xee\xff", + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + { { "\x11\x11\x11\x11\xaa\xaa\xaa\xaa\x11\x11\x11\x11\xbb\xbb\xbb\xbb" + "\x11\x11\x11\x11\xcc\xcc\xcc\xcc\x11\x11\x11\x11\xdd\xdd\xdd\xdd" + "\x22\x22\x22\x22\xaa\xaa\xaa\xaa\x22\x22\x22\x22\xbb\xbb\xbb\xbb" + "\x22\x22\x22\x22\xcc\xcc\xcc\xcc\x22\x22\x22\x22\xdd\xdd\xdd\xdd" + "\x33\x33\x33\x33\xaa\xaa\xaa\xaa\x33\x33\x33\x33\xbb\xbb\xbb\xbb" + "\x33\x33\x33\x33\xcc\xcc\xcc\xcc\x33\x33\x33\x33\xdd\xdd\xdd\xdd" + "\x44\x44\x44\x44\xaa\xaa\xaa\xaa\x44\x44\x44\x44\xbb\xbb\xbb\xbb" + "\x44\x44\x44\x44\xcc\xcc\xcc\xcc\x44\x44\x44\x44\xdd\xdd\xdd\xdd" + "\x55\x55\x55\x55\xaa\xaa\xaa\xaa\x55\x55\x55\x55\xbb\xbb\xbb\xbb" + "\x55\x55\x55\x55\xcc\xcc\xcc\xcc\x55\x55\x55\x55\xdd\xdd\xdd\xdd", + 10 * 16, + "\xac\x5d\x7d\xe8\x05\xa0\xbf\x1c\x57\xc8\x54\x50\x1a\xf6\x0f\xa1" + "\x14\x97\xe2\xa3\x45\x19\xde\xa1\x56\x9e\x91\xe5\xb5\xcc\xae\x2f" + "\xf3\xbf\xa1\xbf\x97\x5f\x45\x71\xf4\x8b\xe1\x91\x61\x35\x46\xc3" + "\x91\x11\x63\xc0\x85\xf8\x71\xf0\xe7\xae\x5f\x2a\x08\x5b\x81\x85" + "\x1c\x2a\x3d\xdf\x20\xec\xb8\xfa\x51\x90\x1a\xec\x8e\xe4\xba\x32" + "\xa3\x5d\xab\x67\xbb\x72\xcd\x91\x40\xad\x18\x8a\x96\x7a\xc0\xfb" + "\xbd\xfa\x94\xea\x6c\xce\x47\xdc\xf8\x52\x5a\xb5\xa8\x14\xcf\xeb" + "\x2b\xb6\x0e\xe2\xb1\x26\xe2\xd9\xd8\x47\xc1\xa9\xe9\x6f\x90\x19" + "\xe3\xe6\xa7\xfe\x40\xd3\x82\x9a\xfb\x73\xdb\x1c\xc2\x45\x64\x6a" + "\xdd\xb6\x2d\x9b\x90\x7b\xaa\xaf\xbe\x46\xa7\x3d\xbc\x13\x1d\x3d" }, + { "", 0, "" } + } + }, + { GCRY_CIPHER_ARIA192, FLAG_NOFIPS, + "\x00\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa\xbb\xcc\xdd\xee\xff" + "\x00\x11\x22\x33\x44\x55\x66\x77", + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + { { "\x11\x11\x11\x11\xaa\xaa\xaa\xaa\x11\x11\x11\x11\xbb\xbb\xbb\xbb" + "\x11\x11\x11\x11\xcc\xcc\xcc\xcc\x11\x11\x11\x11\xdd\xdd\xdd\xdd" + "\x22\x22\x22\x22\xaa\xaa\xaa\xaa\x22\x22\x22\x22\xbb\xbb\xbb\xbb" + "\x22\x22\x22\x22\xcc\xcc\xcc\xcc\x22\x22\x22\x22\xdd\xdd\xdd\xdd" + "\x33\x33\x33\x33\xaa\xaa\xaa\xaa\x33\x33\x33\x33\xbb\xbb\xbb\xbb" + "\x33\x33\x33\x33\xcc\xcc\xcc\xcc\x33\x33\x33\x33\xdd\xdd\xdd\xdd" + "\x44\x44\x44\x44\xaa\xaa\xaa\xaa\x44\x44\x44\x44\xbb\xbb\xbb\xbb" + "\x44\x44\x44\x44\xcc\xcc\xcc\xcc\x44\x44\x44\x44\xdd\xdd\xdd\xdd" + "\x55\x55\x55\x55\xaa\xaa\xaa\xaa\x55\x55\x55\x55\xbb\xbb\xbb\xbb" + "\x55\x55\x55\x55\xcc\xcc\xcc\xcc\x55\x55\x55\x55\xdd\xdd\xdd\xdd", + 10 * 16, + "\x08\x62\x5c\xa8\xfe\x56\x9c\x19\xba\x7a\xf3\x76\x0a\x6e\xd1\xce" + "\xf4\xd1\x99\x26\x3e\x99\x9d\xde\x14\x08\x2d\xbb\xa7\x56\x0b\x79" + "\xa4\xc6\xb4\x56\xb8\x70\x7d\xce\x75\x1f\x98\x54\xf1\x88\x93\xdf" + 
"\xdb\x3f\x4e\x5a\xfa\x53\x97\x33\xe6\xf1\xe7\x0b\x98\xba\x37\x89" + "\x1f\x8f\x81\xe9\x5d\xf8\xef\xc2\x6c\x7c\xe0\x43\x50\x4c\xb1\x89" + "\x58\xb8\x65\xe4\xe3\x16\xcd\x2a\xa1\xc9\x7f\x31\xbf\x23\xdc\x04" + "\x6e\xf3\x26\xb9\x5a\x69\x2a\x19\x1b\xa0\xf2\xa4\x1c\x5f\xe9\xae" + "\x07\x0f\x23\x6f\xf7\x07\x8e\x70\x3b\x42\x66\x6c\xaa\xfb\xdd\x20" + "\xba\xd7\x4a\xc4\xc2\x0c\x0f\x46\xc7\xca\x24\xc1\x51\x71\x65\x75" + "\xc9\x47\xda\x16\xc9\x0c\xfe\x1b\xf2\x17\xa4\x1c\xfe\xbe\x75\x31" }, + { "", 0, "" } + } + }, + { GCRY_CIPHER_ARIA256, FLAG_NOFIPS, + "\x00\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa\xbb\xcc\xdd\xee\xff" + "\x00\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa\xbb\xcc\xdd\xee\xff", + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + { { "\x11\x11\x11\x11\xaa\xaa\xaa\xaa\x11\x11\x11\x11\xbb\xbb\xbb\xbb" + "\x11\x11\x11\x11\xcc\xcc\xcc\xcc\x11\x11\x11\x11\xdd\xdd\xdd\xdd" + "\x22\x22\x22\x22\xaa\xaa\xaa\xaa\x22\x22\x22\x22\xbb\xbb\xbb\xbb" + "\x22\x22\x22\x22\xcc\xcc\xcc\xcc\x22\x22\x22\x22\xdd\xdd\xdd\xdd" + "\x33\x33\x33\x33\xaa\xaa\xaa\xaa\x33\x33\x33\x33\xbb\xbb\xbb\xbb" + "\x33\x33\x33\x33\xcc\xcc\xcc\xcc\x33\x33\x33\x33\xdd\xdd\xdd\xdd" + "\x44\x44\x44\x44\xaa\xaa\xaa\xaa\x44\x44\x44\x44\xbb\xbb\xbb\xbb" + "\x44\x44\x44\x44\xcc\xcc\xcc\xcc\x44\x44\x44\x44\xdd\xdd\xdd\xdd" + "\x55\x55\x55\x55\xaa\xaa\xaa\xaa\x55\x55\x55\x55\xbb\xbb\xbb\xbb" + "\x55\x55\x55\x55\xcc\xcc\xcc\xcc\x55\x55\x55\x55\xdd\xdd\xdd\xdd", + 10 * 16, + "\x30\x02\x6c\x32\x96\x66\x14\x17\x21\x17\x8b\x99\xc0\xa1\xf1\xb2" + "\xf0\x69\x40\x25\x3f\x7b\x30\x89\xe2\xa3\x0e\xa8\x6a\xa3\xc8\x8f" + "\x59\x40\xf0\x5a\xd7\xee\x41\xd7\x13\x47\xbb\x72\x61\xe3\x48\xf1" + "\x83\x60\x47\x3f\xdf\x7d\x4e\x77\x23\xbf\xfb\x44\x11\xcc\x13\xf6" + "\xcd\xd8\x9f\x3b\xc7\xb9\xc7\x68\x14\x50\x22\xc7\xa7\x4f\x14\xd7" + "\xc3\x05\xcd\x01\x2a\x10\xf1\x60\x50\xc2\x3f\x1a\xe5\xc2\x3f\x45" + "\x99\x8d\x13\xfb\xaa\x04\x1e\x51\x61\x95\x77\xe0\x77\x27\x64\x89" + "\x6a\x5d\x45\x16\xd8\xff\xce\xb3\xbf\x7e\x05\xf6\x13\xed\xd9\xa6" + "\x0c\xdc\xed\xaf\xf9\xcf\xca\xf4\xe0\x0d\x44\x5a\x54\x33\x4f\x73" + "\xab\x2c\xad\x94\x4e\x51\xd2\x66\x54\x8e\x61\xc6\xeb\x0a\xa1\xcd" }, + { "", 0, "" } + } + }, +#endif /* USE_ARIA */ { 0, 0, "", "", @@ -2716,6 +3101,60 @@ check_cfb_cipher (void) } }, #endif /* USE_SM4 */ +#if USE_ARIA + { GCRY_CIPHER_ARIA128, FLAG_NOFIPS, + "\x00\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa\xbb\xcc\xdd\xee\xff", + "\x0f\x1e\x2d\x3c\x4b\x5a\x69\x78\x87\x96\xa5\xb4\xc3\xd2\xe1\xf0", + { { "\x11\x11\x11\x11\xaa\xaa\xaa\xaa\x11\x11\x11\x11\xbb\xbb\xbb\xbb" + "\x11\x11\x11\x11\xcc\xcc\xcc\xcc\x11\x11\x11\x11\xdd\xdd\xdd\xdd" + "\x22\x22\x22\x22\xaa\xaa\xaa\xaa\x22\x22\x22\x22\xbb\xbb\xbb\xbb" + "\x22\x22\x22\x22\xcc\xcc\xcc\xcc\x22\x22\x22\x22\xdd\xdd\xdd\xdd" + "\x33\x33\x33\x33\xaa\xaa\xaa\xaa\x33\x33\x33\x33\xbb\xbb\xbb\xbb" + "\x33\x33\x33\x33\xcc\xcc\xcc\xcc\x33\x33\x33\x33\xdd\xdd\xdd\xdd" + "\x44\x44\x44\x44\xaa\xaa\xaa\xaa\x44\x44\x44\x44\xbb\xbb\xbb\xbb" + "\x44\x44\x44\x44\xcc\xcc\xcc\xcc\x44\x44\x44\x44\xdd\xdd\xdd\xdd" + "\x55\x55\x55\x55\xaa\xaa\xaa\xaa\x55\x55\x55\x55\xbb\xbb\xbb\xbb" + "\x55\x55\x55\x55\xcc\xcc\xcc\xcc\x55\x55\x55\x55\xdd\xdd\xdd\xdd", + 10 * 16, + "\x37\x20\xe5\x3b\xa7\xd6\x15\x38\x34\x06\xb0\x9f\x0a\x05\xa2\x00" + "\xc0\x7c\x21\xe6\x37\x0f\x41\x3a\x5d\x13\x25\x00\xa6\x82\x85\x01" + "\x7c\x61\xb4\x34\xc7\xb7\xca\x96\x85\xa5\x10\x71\x86\x1e\x4d\x4b" + "\xb8\x73\xb5\x99\xb4\x79\xe2\xd5\x73\xdd\xde\xaf\xba\x89\xf8\x12" + "\xac\x6a\x9e\x44\xd5\x54\x07\x8e\xb3\xbe\x94\x83\x9d\xb4\xb3\x3d" + 
"\xa3\xf5\x9c\x06\x31\x23\xa7\xef\x6f\x20\xe1\x05\x79\xfa\x4f\xd2" + "\x39\x10\x0c\xa7\x3b\x52\xd4\xfc\xaf\xea\xde\xe7\x3f\x13\x9f\x78" + "\xf9\xb7\x61\x4c\x2b\x3b\x9d\xbe\x01\x0f\x87\xdb\x06\xa8\x9a\x94" + "\x35\xf7\x9c\xe8\x12\x14\x31\x37\x1f\x4e\x87\xb9\x84\xe0\x23\x0c" + "\x22\xa6\xda\xcb\x32\xfc\x42\xdc\xc6\xac\xce\xf3\x32\x85\xbf\x11" }, + } + }, + { GCRY_CIPHER_ARIA128, FLAG_NOFIPS | FLAG_CFB8, + "\x00\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa\xbb\xcc\xdd\xee\xff", + "\x0f\x1e\x2d\x3c\x4b\x5a\x69\x78\x87\x96\xa5\xb4\xc3\xd2\xe1\xf0", + { { "\x11\x11\x11\x11\xaa\xaa\xaa\xaa\x11\x11\x11\x11\xbb\xbb\xbb\xbb" + "\x11\x11\x11\x11\xcc\xcc\xcc\xcc\x11\x11\x11\x11\xdd\xdd\xdd\xdd" + "\x22\x22\x22\x22\xaa\xaa\xaa\xaa\x22\x22\x22\x22\xbb\xbb\xbb\xbb" + "\x22\x22\x22\x22\xcc\xcc\xcc\xcc\x22\x22\x22\x22\xdd\xdd\xdd\xdd" + "\x33\x33\x33\x33\xaa\xaa\xaa\xaa\x33\x33\x33\x33\xbb\xbb\xbb\xbb" + "\x33\x33\x33\x33\xcc\xcc\xcc\xcc\x33\x33\x33\x33\xdd\xdd\xdd\xdd" + "\x44\x44\x44\x44\xaa\xaa\xaa\xaa\x44\x44\x44\x44\xbb\xbb\xbb\xbb" + "\x44\x44\x44\x44\xcc\xcc\xcc\xcc\x44\x44\x44\x44\xdd\xdd\xdd\xdd" + "\x55\x55\x55\x55\xaa\xaa\xaa\xaa\x55\x55\x55\x55\xbb\xbb\xbb\xbb" + "\x55\x55\x55\x55\xcc\xcc\xcc\xcc\x55\x55\x55\x55\xdd\xdd\xdd\xdd", + 10 * 16, + "\x37\x3c\x8f\x6a\x96\x55\x99\xec\x78\x5c\xc8\xf8\x14\x9f\x6c\x81" + "\xb6\x32\xcc\xb8\xe0\xc6\xeb\x6a\x97\x07\xae\x52\xc5\x92\x57\xa4" + "\x1f\x94\x70\x1c\x10\x96\x93\x31\x27\xa9\x01\x95\xed\x0c\x8e\x98" + "\x69\x05\x47\x57\x24\x23\xbb\x45\xc3\xd7\x0e\x4a\x18\xee\x56\xb9" + "\x67\xc1\x0e\x00\x0b\xa4\xdf\x5f\xba\x7c\x40\x41\x34\xa3\x43\xd8" + "\x37\x5d\x04\xb1\x51\xd1\x61\xef\x83\x41\x7f\xe1\x74\x84\x47\xd3" + "\x0a\x67\x23\xc4\x06\x73\x3d\xf7\xd1\x8a\xa3\x9a\x20\x75\x2d\x23" + "\x81\x94\x2e\x24\x48\x11\xbb\x97\xf7\x2e\xae\x44\x6b\x18\x15\xaa" + "\x69\x0c\xd1\xb1\xad\xcb\xd0\x07\xc0\x08\x8e\xcd\xc9\x1c\xb2\xe2" + "\xca\xf0\xe1\x1e\x72\x45\x98\x78\x13\x7e\xea\x64\xac\x62\xa9\xa1" }, + } + }, +#endif /* USE_ARIA */ }; gcry_cipher_hd_t hde, hdd; unsigned char out[MAX_DATA_LEN]; @@ -9342,6 +9781,14 @@ check_ocb_cipher (void) check_ocb_cipher_largebuf(GCRY_CIPHER_SM4, 16, "\x3c\x32\x54\x5d\xc5\x17\xa1\x16\x3f\x8e\xc7\x1d\x8d\x8b\x2d\xb0"); #endif /* USE_SM4 */ +#if USE_ARIA + check_ocb_cipher_largebuf(GCRY_CIPHER_ARIA128, 16, + "\x6c\xcc\x69\x34\x3b\xa3\x55\xe5\xdc\xf6\x13\xe0\x5b\x08\x6a\xd9"); + check_ocb_cipher_largebuf(GCRY_CIPHER_ARIA192, 24, + "\x78\xcb\x2e\xa4\x76\xca\x4b\x01\xe8\x34\x44\x00\x9a\x99\x99\x01"); + check_ocb_cipher_largebuf(GCRY_CIPHER_ARIA256, 32, + "\x70\x92\x29\xf5\xbc\x73\xa8\x02\xcc\x80\xac\x0b\xd4\x86\x7f\x43"); +#endif /* USE_ARIA */ /* Check that the AAD data is correctly buffered. 
*/ check_ocb_cipher_splitaad (); @@ -13008,6 +13455,11 @@ check_ciphers (void) #endif #if USE_SM4 GCRY_CIPHER_SM4, +#endif +#if USE_ARIA + GCRY_CIPHER_ARIA128, + GCRY_CIPHER_ARIA192, + GCRY_CIPHER_ARIA256, #endif 0 }; @@ -17751,6 +18203,7 @@ main (int argc, char **argv) { check_ciphers (); check_cipher_modes (); + check_bulk_cipher_modes (); } else if (hash_only) { diff --git a/tests/bench-slope.c b/tests/bench-slope.c index eb301569..99b444e0 100644 --- a/tests/bench-slope.c +++ b/tests/bench-slope.c @@ -2064,6 +2064,7 @@ bench_mac_init (struct bench_obj *obj) case GCRY_MAC_POLY1305_SERPENT: case GCRY_MAC_POLY1305_SEED: case GCRY_MAC_POLY1305_SM4: + case GCRY_MAC_POLY1305_ARIA: gcry_mac_setiv (hd, key, 16); break; } diff --git a/tests/benchmark.c b/tests/benchmark.c index 60abd2cb..cf8182a6 100644 --- a/tests/benchmark.c +++ b/tests/benchmark.c @@ -652,7 +652,7 @@ mac_bench ( const char *algoname ) for (i=0; i < bufsize; i++) buf[i] = i; - if (algo >= GCRY_MAC_POLY1305_AES && algo <= GCRY_MAC_POLY1305_SM4) + if (algo >= GCRY_MAC_POLY1305_AES && algo <= GCRY_MAC_POLY1305_ARIA) { static const char iv[16] = { 1, 2, 3, 4, }; err = gcry_mac_setiv(hd, iv, sizeof(iv)); -- 2.37.2 From jussi.kivilinna at iki.fi Sun Jan 15 19:57:57 2023 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sun, 15 Jan 2023 20:57:57 +0200 Subject: [PATCH] Revert "aarch64-asm: use ADR for getting pointers for local labels" Message-ID: <20230115185757.224088-1-jussi.kivilinna@iki.fi> * cipher/asm-common-aarch64.h (GET_LOCAL_POINTER): Remove. (GET_DATA_POINTER): New. * cipher/camellia-aarch64.S: Use GET_DATA_POINTER instead of GET_LOCAL_POINTER. * cipher/chacha20-aarch64.S: Likewise. * cipher/cipher-gcm-armv8-aarch64-ce.S: Likewise. * cipher/crc-armv8-aarch64-ce.S: Likewise. * cipher/sha1-armv8-aarch64-ce.S: Likewise. * cipher/sha256-armv8-aarch64-ce.S: Likewise. * cipher/sm3-aarch64.S: Likewise. * cipher/sm3-armv8-aarch64-ce.S: Likewise. * cipher/sm4-aarch64.S: Likewise. * cipher/sm4-armv9-aarch64-sve-ce.S: Likewise. -- This reverts commit fd02e8e78470deb661269c429f3348f811c054c6 with following modifications: - Only use adrp/add type address generation for GET_DATA_POINTER as adrp/ldr can cause problems with only locally visible data labels. - Change 'sm4-armv9-aarch64-sve-ce.S' to use GET_DATA_POINTER also. - Don't revert 'camellia-aarch64.S' to use ADR instruction directly but instead use GET_DATA_POINTER. Apparently taking local addresses with single instruction will not work when OS targets start to move to execute-only memory mappings. Therefore revert "aarch64-asm: use ADR for getting pointers for local labels" to switch back to using GET_DATA_POINTER. Reported-by: Theo de Raadt Signed-off-by: Jussi Kivilinna --- cipher/asm-common-aarch64.h | 11 +++++++++-- cipher/camellia-aarch64.S | 4 ++-- cipher/chacha20-aarch64.S | 8 ++++---- cipher/cipher-gcm-armv8-aarch64-ce.S | 6 +++--- cipher/crc-armv8-aarch64-ce.S | 4 ++-- cipher/sha1-armv8-aarch64-ce.S | 2 +- cipher/sha256-armv8-aarch64-ce.S | 2 +- cipher/sm3-aarch64.S | 2 +- cipher/sm3-armv8-aarch64-ce.S | 2 +- cipher/sm4-aarch64.S | 2 +- cipher/sm4-armv9-aarch64-sve-ce.S | 4 ++-- 11 files changed, 27 insertions(+), 20 deletions(-) diff --git a/cipher/asm-common-aarch64.h b/cipher/asm-common-aarch64.h index b38b17a6..8f7951a3 100644 --- a/cipher/asm-common-aarch64.h +++ b/cipher/asm-common-aarch64.h @@ -29,8 +29,15 @@ # define ELF(...) 
/*_*/ #endif -#define GET_LOCAL_POINTER(reg, label) \ - adr reg, label; +#ifdef __APPLE__ +#define GET_DATA_POINTER(reg, name) \ + adrp reg, name at GOTPAGE ; \ + add reg, reg, name at GOTPAGEOFF ; +#else +#define GET_DATA_POINTER(reg, name) \ + adrp reg, name ; \ + add reg, reg, #:lo12:name ; +#endif #ifdef HAVE_GCC_ASM_CFI_DIRECTIVES /* CFI directives to emit DWARF stack unwinding information. */ diff --git a/cipher/camellia-aarch64.S b/cipher/camellia-aarch64.S index c019c168..d7c0cf31 100644 --- a/cipher/camellia-aarch64.S +++ b/cipher/camellia-aarch64.S @@ -214,7 +214,7 @@ _gcry_camellia_arm_encrypt_block: * w3: keybitlen */ - GET_LOCAL_POINTER(RTAB1, _gcry_camellia_arm_tables); + GET_DATA_POINTER(RTAB1, _gcry_camellia_arm_tables); mov RMASK, #(0xff<<4); /* byte mask */ add RTAB2, RTAB1, #(1 * 4); add RTAB3, RTAB1, #(2 * 4); @@ -274,7 +274,7 @@ _gcry_camellia_arm_decrypt_block: * w3: keybitlen */ - GET_LOCAL_POINTER(RTAB1, _gcry_camellia_arm_tables); + adr RTAB1, _gcry_camellia_arm_tables; mov RMASK, #(0xff<<4); /* byte mask */ add RTAB2, RTAB1, #(1 * 4); add RTAB3, RTAB1, #(2 * 4); diff --git a/cipher/chacha20-aarch64.S b/cipher/chacha20-aarch64.S index 540f892b..2a980b95 100644 --- a/cipher/chacha20-aarch64.S +++ b/cipher/chacha20-aarch64.S @@ -206,10 +206,10 @@ _gcry_chacha20_aarch64_blocks4: */ CFI_STARTPROC() - GET_LOCAL_POINTER(CTR, _gcry_chacha20_aarch64_blocks4_data_rot8); + GET_DATA_POINTER(CTR, _gcry_chacha20_aarch64_blocks4_data_rot8); add INPUT_CTR, INPUT, #(12*4); ld1 {ROT8.16b}, [CTR]; - GET_LOCAL_POINTER(CTR, _gcry_chacha20_aarch64_blocks4_data_inc_counter); + GET_DATA_POINTER(CTR, _gcry_chacha20_aarch64_blocks4_data_inc_counter); mov INPUT_POS, INPUT; ld1 {VCTR.16b}, [CTR]; @@ -383,10 +383,10 @@ _gcry_chacha20_poly1305_aarch64_blocks4: mov POLY_RSTATE, x4; mov POLY_RSRC, x5; - GET_LOCAL_POINTER(CTR, _gcry_chacha20_aarch64_blocks4_data_rot8); + GET_DATA_POINTER(CTR, _gcry_chacha20_aarch64_blocks4_data_rot8); add INPUT_CTR, INPUT, #(12*4); ld1 {ROT8.16b}, [CTR]; - GET_LOCAL_POINTER(CTR, _gcry_chacha20_aarch64_blocks4_data_inc_counter); + GET_DATA_POINTER(CTR, _gcry_chacha20_aarch64_blocks4_data_inc_counter); mov INPUT_POS, INPUT; ld1 {VCTR.16b}, [CTR]; diff --git a/cipher/cipher-gcm-armv8-aarch64-ce.S b/cipher/cipher-gcm-armv8-aarch64-ce.S index 78f3ad2d..687fabe3 100644 --- a/cipher/cipher-gcm-armv8-aarch64-ce.S +++ b/cipher/cipher-gcm-armv8-aarch64-ce.S @@ -169,7 +169,7 @@ _gcry_ghash_armv8_ce_pmull: cbz x3, .Ldo_nothing; - GET_LOCAL_POINTER(x5, .Lrconst) + GET_DATA_POINTER(x5, .Lrconst) eor vZZ.16b, vZZ.16b, vZZ.16b ld1 {rhash.16b}, [x1] @@ -368,7 +368,7 @@ _gcry_polyval_armv8_ce_pmull: cbz x3, .Lpolyval_do_nothing; - GET_LOCAL_POINTER(x5, .Lrconst) + GET_DATA_POINTER(x5, .Lrconst) eor vZZ.16b, vZZ.16b, vZZ.16b ld1 {rhash.16b}, [x1] @@ -589,7 +589,7 @@ _gcry_ghash_setup_armv8_ce_pmull: */ CFI_STARTPROC() - GET_LOCAL_POINTER(x2, .Lrconst) + GET_DATA_POINTER(x2, .Lrconst) eor vZZ.16b, vZZ.16b, vZZ.16b diff --git a/cipher/crc-armv8-aarch64-ce.S b/cipher/crc-armv8-aarch64-ce.S index b6cdbb3d..7ac884af 100644 --- a/cipher/crc-armv8-aarch64-ce.S +++ b/cipher/crc-armv8-aarch64-ce.S @@ -71,7 +71,7 @@ _gcry_crc32r_armv8_ce_bulk: */ CFI_STARTPROC() - GET_LOCAL_POINTER(x7, .Lcrc32_constants) + GET_DATA_POINTER(x7, .Lcrc32_constants) add x9, x3, #consts_k(5 - 1) cmp x2, #128 @@ -280,7 +280,7 @@ _gcry_crc32_armv8_ce_bulk: */ CFI_STARTPROC() - GET_LOCAL_POINTER(x7, .Lcrc32_constants) + GET_DATA_POINTER(x7, .Lcrc32_constants) add x4, x7, #.Lcrc32_bswap_shuf - .Lcrc32_constants cmp 
x2, #128 ld1 {v7.16b}, [x4] diff --git a/cipher/sha1-armv8-aarch64-ce.S b/cipher/sha1-armv8-aarch64-ce.S index f95717ee..ea26564b 100644 --- a/cipher/sha1-armv8-aarch64-ce.S +++ b/cipher/sha1-armv8-aarch64-ce.S @@ -109,7 +109,7 @@ _gcry_sha1_transform_armv8_ce: cbz x2, .Ldo_nothing; - GET_LOCAL_POINTER(x4, .LK_VEC); + GET_DATA_POINTER(x4, .LK_VEC); ld1 {vH0123.4s}, [x0] /* load h0,h1,h2,h3 */ ld1 {vK1.4s-vK4.4s}, [x4] /* load K1,K2,K3,K4 */ diff --git a/cipher/sha256-armv8-aarch64-ce.S b/cipher/sha256-armv8-aarch64-ce.S index 5616eada..d0fa6285 100644 --- a/cipher/sha256-armv8-aarch64-ce.S +++ b/cipher/sha256-armv8-aarch64-ce.S @@ -119,7 +119,7 @@ _gcry_sha256_transform_armv8_ce: cbz x2, .Ldo_nothing; - GET_LOCAL_POINTER(x3, .LK); + GET_DATA_POINTER(x3, .LK); mov x4, x3 ld1 {vH0123.4s-vH4567.4s}, [x0] /* load state */ diff --git a/cipher/sm3-aarch64.S b/cipher/sm3-aarch64.S index 0e58254b..3fb89006 100644 --- a/cipher/sm3-aarch64.S +++ b/cipher/sm3-aarch64.S @@ -425,7 +425,7 @@ _gcry_sm3_transform_aarch64: CFI_DEF_CFA_REGISTER(RFRAME); sub addr0, sp, #STACK_SIZE; - GET_LOCAL_POINTER(RKPTR, .LKtable); + GET_DATA_POINTER(RKPTR, .LKtable); and sp, addr0, #(~63); /* Preload first block. */ diff --git a/cipher/sm3-armv8-aarch64-ce.S b/cipher/sm3-armv8-aarch64-ce.S index d592d08a..0900b84f 100644 --- a/cipher/sm3-armv8-aarch64-ce.S +++ b/cipher/sm3-armv8-aarch64-ce.S @@ -170,7 +170,7 @@ _gcry_sm3_transform_armv8_ce: ext CTX2.16b, CTX2.16b, CTX2.16b, #8; .Lloop: - GET_LOCAL_POINTER(x3, .Lsm3_Ktable); + GET_DATA_POINTER(x3, .Lsm3_Ktable); ld1 {v0.16b-v3.16b}, [x1], #64; sub x2, x2, #1; diff --git a/cipher/sm4-aarch64.S b/cipher/sm4-aarch64.S index 8d06991b..306b425e 100644 --- a/cipher/sm4-aarch64.S +++ b/cipher/sm4-aarch64.S @@ -84,7 +84,7 @@ ELF(.size _gcry_sm4_aarch64_consts,.-_gcry_sm4_aarch64_consts) /* Helper macros. */ #define preload_sbox(ptr) \ - GET_LOCAL_POINTER(ptr, .Lsm4_sbox); \ + GET_DATA_POINTER(ptr, .Lsm4_sbox); \ ld1 {v16.16b-v19.16b}, [ptr], #64; \ ld1 {v20.16b-v23.16b}, [ptr], #64; \ ld1 {v24.16b-v27.16b}, [ptr], #64; \ diff --git a/cipher/sm4-armv9-aarch64-sve-ce.S b/cipher/sm4-armv9-aarch64-sve-ce.S index 21e34e6f..a1b61416 100644 --- a/cipher/sm4-armv9-aarch64-sve-ce.S +++ b/cipher/sm4-armv9-aarch64-sve-ce.S @@ -167,7 +167,7 @@ ELF(.size _gcry_sm4_armv9_svesm4_consts,.-_gcry_sm4_armv9_svesm4_consts) /* Helper macros. */ #define PREPARE() \ - GET_LOCAL_POINTER(x7, .Lbswap128_mask); \ + GET_DATA_POINTER(x7, .Lbswap128_mask); \ ptrue p0.b, ALL; \ rdvl x5, #1; \ ld1b {RSWAP128.b}, p0/z, [x7]; \ @@ -811,7 +811,7 @@ _gcry_sm4_armv9_sve_ce_ctr_enc: PREPARE(); dup RZERO.d, #0; - GET_LOCAL_POINTER(x6, .Lle128_inc); + GET_DATA_POINTER(x6, .Lle128_inc); ld1b {RLE128_INC.b}, p0/z, [x6]; ldp x7, x8, [x3]; -- 2.37.2 From jussi.kivilinna at iki.fi Mon Jan 16 18:41:20 2023 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Mon, 16 Jan 2023 19:41:20 +0200 Subject: [PATCH] avx512: tweak zmm16-zmm31 register clearing Message-ID: <20230116174120.332155-1-jussi.kivilinna@iki.fi> * cipher/asm-common-amd64.h (spec_stop_avx512): Clear ymm16 before and after vpopcntb. * cipher/camellia-gfni-avx512-amd64.S (clear_zmm16_zmm31): Clear YMM16-YMM31 registers instead of XMM16-XMM31. * cipher/chacha20-amd64-avx512.S (clear_zmm16_zmm31): Likewise. * cipher/keccak-amd64-avx512.S (clear_regs): Likewise. (clear_avx512_4regs): Clear all 4 registers with XOR. * cipher/cipher-gcm-intel-pclmul.c (_gcry_ghash_intel_pclmul) (_gcry_polyval_intel_pclmul): Clear YMM16-YMM19 registers instead of ZMM16-ZMM19. 
* cipher/poly1305-amd64-avx512.S (POLY1305_BLOCKS): Clear YMM16-YMM31 registers after vector processing instead of XMM16-XMM31. * cipher/sha512-avx512-amd64.S (_gcry_sha512_transform_amd64_avx512): Likewise. -- Clear zmm16-zmm31 registers with 256bit XOR instead of 128bit as this is better for AMD Zen4. Also clear xmm16 register after vpopcnt in avx512 spec-stop so we do not leave any zmm register state which might end up unnecessarily using CPU resources. Signed-off-by: Jussi Kivilinna --- cipher/asm-common-amd64.h | 10 ++++++---- cipher/camellia-gfni-avx512-amd64.S | 8 ++++---- cipher/chacha20-amd64-avx512.S | 8 ++++---- cipher/cipher-gcm-intel-pclmul.c | 18 +++++++++--------- cipher/keccak-amd64-avx512.S | 10 +++++----- cipher/poly1305-amd64-avx512.S | 8 ++++---- cipher/sha512-avx512-amd64.S | 14 +++++++------- 7 files changed, 39 insertions(+), 37 deletions(-) diff --git a/cipher/asm-common-amd64.h b/cipher/asm-common-amd64.h index cd93abc3..d9bbc01b 100644 --- a/cipher/asm-common-amd64.h +++ b/cipher/asm-common-amd64.h @@ -195,11 +195,13 @@ * available on newer CPUs that do not suffer from significant frequency * drop when 512-bit vectors are utilized. */ #define spec_stop_avx512 \ - vpxord %xmm16, %xmm16, %xmm16; \ - vpopcntb %xmm16, %xmm16; /* Supported only by newer AVX512 CPUs. */ + vpxord %ymm16, %ymm16, %ymm16; \ + vpopcntb %xmm16, %xmm16; /* Supported only by newer AVX512 CPUs. */ \ + vpxord %ymm16, %ymm16, %ymm16; #define spec_stop_avx512_intel_syntax \ - vpxord xmm16, xmm16, xmm16; \ - vpopcntb xmm16, xmm16; /* Supported only by newer AVX512 CPUs. */ + vpxord ymm16, ymm16, ymm16; \ + vpopcntb xmm16, xmm16; /* Supported only by newer AVX512 CPUs. */ \ + vpxord ymm16, ymm16, ymm16; #endif /* GCRY_ASM_COMMON_AMD64_H */ diff --git a/cipher/camellia-gfni-avx512-amd64.S b/cipher/camellia-gfni-avx512-amd64.S index bddad804..14725b4a 100644 --- a/cipher/camellia-gfni-avx512-amd64.S +++ b/cipher/camellia-gfni-avx512-amd64.S @@ -99,10 +99,10 @@ vpxord v3, v3, v3 #define clear_zmm16_zmm31() \ - clear_vec4(%xmm16, %xmm20, %xmm24, %xmm28); \ - clear_vec4(%xmm17, %xmm21, %xmm25, %xmm29); \ - clear_vec4(%xmm18, %xmm22, %xmm26, %xmm30); \ - clear_vec4(%xmm19, %xmm23, %xmm27, %xmm31) + clear_vec4(%ymm16, %ymm20, %ymm24, %ymm28); \ + clear_vec4(%ymm17, %ymm21, %ymm25, %ymm29); \ + clear_vec4(%ymm18, %ymm22, %ymm26, %ymm30); \ + clear_vec4(%ymm19, %ymm23, %ymm27, %ymm31) #define clear_regs() \ kxorq %k1, %k1, %k1; \ diff --git a/cipher/chacha20-amd64-avx512.S b/cipher/chacha20-amd64-avx512.S index 544e7cdc..4b183528 100644 --- a/cipher/chacha20-amd64-avx512.S +++ b/cipher/chacha20-amd64-avx512.S @@ -205,10 +205,10 @@ vpxord v3, v3, v3; #define clear_zmm16_zmm31() \ - clear_vec4(%xmm16, %xmm20, %xmm24, %xmm28); \ - clear_vec4(%xmm17, %xmm21, %xmm25, %xmm29); \ - clear_vec4(%xmm18, %xmm22, %xmm26, %xmm30); \ - clear_vec4(%xmm19, %xmm23, %xmm27, %xmm31); + clear_vec4(%ymm16, %ymm20, %ymm24, %ymm28); \ + clear_vec4(%ymm17, %ymm21, %ymm25, %ymm29); \ + clear_vec4(%ymm18, %ymm22, %ymm26, %ymm30); \ + clear_vec4(%ymm19, %ymm23, %ymm27, %ymm31); /********************************************************************** 16-way (zmm), 8-way (ymm), 4-way (xmm) chacha20 diff --git a/cipher/cipher-gcm-intel-pclmul.c b/cipher/cipher-gcm-intel-pclmul.c index ec00df09..391cbe6f 100644 --- a/cipher/cipher-gcm-intel-pclmul.c +++ b/cipher/cipher-gcm-intel-pclmul.c @@ -1560,10 +1560,10 @@ _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf, } asm volatile ("vmovdqa %%xmm15, %%xmm7\n\t" - 
"vpxorq %%zmm16, %%zmm16, %%zmm16\n\t" - "vpxorq %%zmm17, %%zmm17, %%zmm17\n\t" - "vpxorq %%zmm18, %%zmm18, %%zmm18\n\t" - "vpxorq %%zmm19, %%zmm19, %%zmm19\n\t" + "vpxorq %%ymm16, %%ymm16, %%ymm16\n\t" + "vpxorq %%ymm17, %%ymm17, %%ymm17\n\t" + "vpxorq %%ymm18, %%ymm18, %%ymm18\n\t" + "vpxorq %%ymm19, %%ymm19, %%ymm19\n\t" : : : "memory" ); @@ -1838,15 +1838,15 @@ _gcry_polyval_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf, } asm volatile ("vpxor %%xmm7, %%xmm7, %%xmm7\n\t" - "vpxorq %%zmm16, %%zmm16, %%zmm16\n\t" - "vpxorq %%zmm17, %%zmm17, %%zmm17\n\t" - "vpxorq %%zmm18, %%zmm18, %%zmm18\n\t" - "vpxorq %%zmm19, %%zmm19, %%zmm19\n\t" + "vpxorq %%ymm16, %%ymm16, %%ymm16\n\t" + "vpxorq %%ymm17, %%ymm17, %%ymm17\n\t" + "vpxorq %%ymm18, %%ymm18, %%ymm18\n\t" + "vpxorq %%ymm19, %%ymm19, %%ymm19\n\t" : : : "memory" ); } -#endif +#endif /* GCM_USE_INTEL_VPCLMUL_AVX512 */ if (nblocks >= 16) { diff --git a/cipher/keccak-amd64-avx512.S b/cipher/keccak-amd64-avx512.S index 58b4150f..b1fc7b64 100644 --- a/cipher/keccak-amd64-avx512.S +++ b/cipher/keccak-amd64-avx512.S @@ -160,14 +160,14 @@ /* Misc helper macros. */ #define clear_avx512_4regs(a, b, c, d) \ - eor(a, a, a); vmovdqa64 a, b; vmovdqa64 a, c; vmovdqa64 a, d; + eor(a, a, a); eor(b, b, b); eor(c, c, c); eor(d, d, d); #define clear_regs() \ vzeroall; /* xmm0-xmm15 */ \ - clear_avx512_4regs(%xmm16, %xmm17, %xmm18, %xmm19); \ - clear_avx512_4regs(%xmm20, %xmm21, %xmm22, %xmm23); \ - clear_avx512_4regs(%xmm24, %xmm25, %xmm26, %xmm27); \ - clear_avx512_4regs(%xmm28, %xmm29, %xmm30, %xmm31); + clear_avx512_4regs(%ymm16, %ymm17, %ymm18, %ymm19); \ + clear_avx512_4regs(%ymm20, %ymm21, %ymm22, %ymm23); \ + clear_avx512_4regs(%ymm24, %ymm25, %ymm26, %ymm27); \ + clear_avx512_4regs(%ymm28, %ymm29, %ymm30, %ymm31); ELF(.type KeccakF1600_ce, at function) .align 64, 0xcc diff --git a/cipher/poly1305-amd64-avx512.S b/cipher/poly1305-amd64-avx512.S index 6622861f..9beed8ad 100644 --- a/cipher/poly1305-amd64-avx512.S +++ b/cipher/poly1305-amd64-avx512.S @@ -1425,10 +1425,10 @@ ELF(.size _gcry_poly1305_avx512_consts,.-_gcry_poly1305_avx512_consts) vmovdqa64 [rsp + STACK_r_save + 64*5], zmm0; \ \ vzeroall; \ - clear_zmm(xmm16); clear_zmm(xmm20); clear_zmm(xmm24); clear_zmm(xmm28); \ - clear_zmm(xmm17); clear_zmm(xmm21); clear_zmm(xmm25); clear_zmm(xmm29); \ - clear_zmm(xmm18); clear_zmm(xmm22); clear_zmm(xmm26); clear_zmm(xmm30); \ - clear_zmm(xmm19); clear_zmm(xmm23); clear_zmm(xmm27); clear_zmm(xmm31); \ + clear_zmm(ymm16); clear_zmm(ymm20); clear_zmm(ymm24); clear_zmm(ymm28); \ + clear_zmm(ymm17); clear_zmm(ymm21); clear_zmm(ymm25); clear_zmm(ymm29); \ + clear_zmm(ymm18); clear_zmm(ymm22); clear_zmm(ymm26); clear_zmm(ymm30); \ + clear_zmm(ymm19); clear_zmm(ymm23); clear_zmm(ymm27); clear_zmm(ymm31); \ \ .L_final_loop: \ cmp LEN, POLY1305_BLOCK_SIZE; \ diff --git a/cipher/sha512-avx512-amd64.S b/cipher/sha512-avx512-amd64.S index 65475422..431fb3e9 100644 --- a/cipher/sha512-avx512-amd64.S +++ b/cipher/sha512-avx512-amd64.S @@ -384,13 +384,13 @@ _gcry_sha512_transform_amd64_avx512: vmovdqa [rsp + frame_XFER + 1*32], ymm0 /* burn stack */ vmovdqa [rsp + frame_XFER + 2*32], ymm0 /* burn stack */ vmovdqa [rsp + frame_XFER + 3*32], ymm0 /* burn stack */ - clear_reg(xmm16); - clear_reg(xmm17); - clear_reg(xmm18); - clear_reg(xmm19); - clear_reg(xmm20); - clear_reg(xmm21); - clear_reg(xmm22); + clear_reg(ymm16); + clear_reg(ymm17); + clear_reg(ymm18); + clear_reg(ymm19); + clear_reg(ymm20); + clear_reg(ymm21); + clear_reg(ymm22); /* Restore Stack 
Pointer */ mov rsp, RSP_SAVE -- 2.37.2 From jussi.kivilinna at iki.fi Tue Jan 17 17:46:44 2023 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Tue, 17 Jan 2023 18:46:44 +0200 Subject: [PATCH 1/2] tests/basic: fix clutter vector register asm for amd64 and i386 Message-ID: <20230117164645.445455-1-jussi.kivilinna@iki.fi> * tests/basic.c (clutter_vector_registers): Pass data pointers through single register for CLUTTER_VECTOR_REGISTER_AMD64 and CLUTTER_VECTOR_REGISTER_I386 as compiler might attempt to allocate separate pointer register for each "m" operator. -- Reported-by: Julian Kirsch Signed-off-by: Jussi Kivilinna --- tests/basic.c | 74 ++++++++++++++++++--------------------------------- 1 file changed, 26 insertions(+), 48 deletions(-) diff --git a/tests/basic.c b/tests/basic.c index 68f4557b..671182b1 100644 --- a/tests/basic.c +++ b/tests/basic.c @@ -339,39 +339,24 @@ clutter_vector_registers(void) prepare_vector_data(data); #if defined(CLUTTER_VECTOR_REGISTER_AMD64) - asm volatile("movdqu %[data0], %%xmm0\n" - "movdqu %[data1], %%xmm1\n" - "movdqu %[data2], %%xmm2\n" - "movdqu %[data3], %%xmm3\n" - "movdqu %[data4], %%xmm4\n" - "movdqu %[data5], %%xmm5\n" - "movdqu %[data6], %%xmm6\n" - "movdqu %[data7], %%xmm7\n" - "movdqu %[data8], %%xmm8\n" - "movdqu %[data9], %%xmm9\n" - "movdqu %[data10], %%xmm10\n" - "movdqu %[data11], %%xmm11\n" - "movdqu %[data12], %%xmm12\n" - "movdqu %[data13], %%xmm13\n" - "movdqu %[data14], %%xmm14\n" - "movdqu %[data15], %%xmm15\n" + asm volatile("movdqu (0 * 16)(%[data]), %%xmm0\n" + "movdqu (1 * 16)(%[data]), %%xmm1\n" + "movdqu (2 * 16)(%[data]), %%xmm2\n" + "movdqu (3 * 16)(%[data]), %%xmm3\n" + "movdqu (4 * 16)(%[data]), %%xmm4\n" + "movdqu (5 * 16)(%[data]), %%xmm5\n" + "movdqu (6 * 16)(%[data]), %%xmm6\n" + "movdqu (7 * 16)(%[data]), %%xmm7\n" + "movdqu (8 * 16)(%[data]), %%xmm8\n" + "movdqu (9 * 16)(%[data]), %%xmm9\n" + "movdqu (10 * 16)(%[data]), %%xmm10\n" + "movdqu (11 * 16)(%[data]), %%xmm11\n" + "movdqu (12 * 16)(%[data]), %%xmm12\n" + "movdqu (13 * 16)(%[data]), %%xmm13\n" + "movdqu (14 * 16)(%[data]), %%xmm14\n" + "movdqu (15 * 16)(%[data]), %%xmm15\n" : - : [data0] "m" (*data[0]), - [data1] "m" (*data[1]), - [data2] "m" (*data[2]), - [data3] "m" (*data[3]), - [data4] "m" (*data[4]), - [data5] "m" (*data[5]), - [data6] "m" (*data[6]), - [data7] "m" (*data[7]), - [data8] "m" (*data[8]), - [data9] "m" (*data[9]), - [data10] "m" (*data[10]), - [data11] "m" (*data[11]), - [data12] "m" (*data[12]), - [data13] "m" (*data[13]), - [data14] "m" (*data[14]), - [data15] "m" (*data[15]) + : [data] "r" (&data[0]) : "memory" #ifdef __SSE2__ ,"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", @@ -380,23 +365,16 @@ clutter_vector_registers(void) #endif ); #elif defined(CLUTTER_VECTOR_REGISTER_I386) - asm volatile("movdqu %[data0], %%xmm0\n" - "movdqu %[data1], %%xmm1\n" - "movdqu %[data2], %%xmm2\n" - "movdqu %[data3], %%xmm3\n" - "movdqu %[data4], %%xmm4\n" - "movdqu %[data5], %%xmm5\n" - "movdqu %[data6], %%xmm6\n" - "movdqu %[data7], %%xmm7\n" + asm volatile("movdqu (0 * 16)(%[data]), %%xmm0\n" + "movdqu (1 * 16)(%[data]), %%xmm1\n" + "movdqu (2 * 16)(%[data]), %%xmm2\n" + "movdqu (3 * 16)(%[data]), %%xmm3\n" + "movdqu (4 * 16)(%[data]), %%xmm4\n" + "movdqu (5 * 16)(%[data]), %%xmm5\n" + "movdqu (6 * 16)(%[data]), %%xmm6\n" + "movdqu (7 * 16)(%[data]), %%xmm7\n" : - : [data0] "m" (*data[0]), - [data1] "m" (*data[1]), - [data2] "m" (*data[2]), - [data3] "m" (*data[3]), - [data4] "m" (*data[4]), - [data5] "m" (*data[5]), - [data6] 
"m" (*data[6]), - [data7] "m" (*data[7]) + : [data] "r" (&data[0]) : "memory" #ifdef __SSE2__ ,"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" -- 2.37.2 From jussi.kivilinna at iki.fi Tue Jan 17 17:46:45 2023 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Tue, 17 Jan 2023 18:46:45 +0200 Subject: [PATCH 2/2] tests/basic: perform x86 vector cluttering only when __SSE2__ is set In-Reply-To: <20230117164645.445455-1-jussi.kivilinna@iki.fi> References: <20230117164645.445455-1-jussi.kivilinna@iki.fi> Message-ID: <20230117164645.445455-2-jussi.kivilinna@iki.fi> * tests/basic.c (CLUTTER_VECTOR_REGISTER_AMD64) (CLUTTER_VECTOR_REGISTER_I386): Set only if __SSE2__ defined. (clutter_vector_registers) [CLUTTER_VECTOR_REGISTER_AMD64]: Remove __SSE2__ check for "xmm" clobbers. (clutter_vector_registers) [CLUTTER_VECTOR_REGISTER_I386]: Likewise. -- Force __SSE2__ check as buggy compiler might not define __SSE2__ but still attempt to use XMM registers. Signed-off-by: Jussi Kivilinna --- tests/basic.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/tests/basic.c b/tests/basic.c index 671182b1..095bdc97 100644 --- a/tests/basic.c +++ b/tests/basic.c @@ -243,11 +243,12 @@ progress_handler (void *cb_data, const char *what, int printchar, #if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_SSSE3) && \ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ - defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) + defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ + defined(__SSE2__) # define CLUTTER_VECTOR_REGISTER_AMD64 1 # define CLUTTER_VECTOR_REGISTER_COUNT 16 #elif defined(__i386__) && SIZEOF_UNSIGNED_LONG == 4 && __GNUC__ >= 4 && \ - defined(HAVE_GCC_INLINE_ASM_SSSE3) + defined(HAVE_GCC_INLINE_ASM_SSSE3) && defined(__SSE2__) # define CLUTTER_VECTOR_REGISTER_I386 1 # define CLUTTER_VECTOR_REGISTER_COUNT 8 #elif defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \ @@ -357,12 +358,9 @@ clutter_vector_registers(void) "movdqu (15 * 16)(%[data]), %%xmm15\n" : : [data] "r" (&data[0]) - : "memory" -#ifdef __SSE2__ - ,"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", - "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", - "xmm15" -#endif + : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", + "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", + "xmm13", "xmm14", "xmm15" ); #elif defined(CLUTTER_VECTOR_REGISTER_I386) asm volatile("movdqu (0 * 16)(%[data]), %%xmm0\n" @@ -375,10 +373,8 @@ clutter_vector_registers(void) "movdqu (7 * 16)(%[data]), %%xmm7\n" : : [data] "r" (&data[0]) - : "memory" -#ifdef __SSE2__ - ,"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" -#endif + : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", + "xmm6", "xmm7" ); #elif defined(CLUTTER_VECTOR_REGISTER_AARCH64) asm volatile("mov x0, %[ptr]\n" -- 2.37.2 From jussi.kivilinna at iki.fi Tue Jan 17 20:17:35 2023 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Tue, 17 Jan 2023 21:17:35 +0200 Subject: [PATCH 1/7] tests/bench-slope: skip CPU warm-up in regression tests Message-ID: <20230117191741.718995-1-jussi.kivilinna@iki.fi> * tests/bench-slope.c (warm_up_cpu): Skip in regression tests. 
-- Signed-off-by: Jussi Kivilinna --- tests/bench-slope.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/bench-slope.c b/tests/bench-slope.c index 99b444e0..f8031e5e 100644 --- a/tests/bench-slope.c +++ b/tests/bench-slope.c @@ -2966,6 +2966,9 @@ warm_up_cpu (void) { struct nsec_time start, end; + if (in_regression_test) + return; + get_nsec_time (&start); do { -- 2.37.2 From jussi.kivilinna at iki.fi Tue Jan 17 20:17:38 2023 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Tue, 17 Jan 2023 21:17:38 +0200 Subject: [PATCH 4/7] mpi/amd64: align functions to 32 bytes In-Reply-To: <20230117191741.718995-1-jussi.kivilinna@iki.fi> References: <20230117191741.718995-1-jussi.kivilinna@iki.fi> Message-ID: <20230117191741.718995-4-jussi.kivilinna@iki.fi> * mpi/amd64/mpih-add1.S: Align function to 32 bytes. * mpi/amd64/mpih-lshift.S: Likewise. * mpi/amd64/mpih-mul2.S: Likewise. * mpi/amd64/mpih-mul3.S: Likewise. * mpi/amd64/mpih-rshift.S: Likewise. * mpi/amd64/mpih-sub1.S: Likewise. -- Signed-off-by: Jussi Kivilinna --- mpi/amd64/mpih-add1.S | 3 ++- mpi/amd64/mpih-lshift.S | 3 ++- mpi/amd64/mpih-mul2.S | 1 + mpi/amd64/mpih-mul3.S | 1 + mpi/amd64/mpih-rshift.S | 3 ++- mpi/amd64/mpih-sub1.S | 3 ++- 6 files changed, 10 insertions(+), 4 deletions(-) diff --git a/mpi/amd64/mpih-add1.S b/mpi/amd64/mpih-add1.S index 39c00c52..e3b57ede 100644 --- a/mpi/amd64/mpih-add1.S +++ b/mpi/amd64/mpih-add1.S @@ -40,7 +40,8 @@ * mpi_size_t size) rcx */ -.text + TEXT + ALIGN(5) .globl C_SYMBOL_NAME(_gcry_mpih_add_n) C_SYMBOL_NAME(_gcry_mpih_add_n:) FUNC_ENTRY() diff --git a/mpi/amd64/mpih-lshift.S b/mpi/amd64/mpih-lshift.S index a9c7d7e1..07e73a1a 100644 --- a/mpi/amd64/mpih-lshift.S +++ b/mpi/amd64/mpih-lshift.S @@ -39,7 +39,8 @@ * unsigned cnt) rcx */ -.text + TEXT + ALIGN(5) .globl C_SYMBOL_NAME(_gcry_mpih_lshift) C_SYMBOL_NAME(_gcry_mpih_lshift:) FUNC_ENTRY() diff --git a/mpi/amd64/mpih-mul2.S b/mpi/amd64/mpih-mul2.S index 07913586..1badc024 100644 --- a/mpi/amd64/mpih-mul2.S +++ b/mpi/amd64/mpih-mul2.S @@ -39,6 +39,7 @@ * mpi_limb_t s2_limb) (rcx) */ TEXT + ALIGN(5) GLOBL C_SYMBOL_NAME(_gcry_mpih_addmul_1) C_SYMBOL_NAME(_gcry_mpih_addmul_1:) FUNC_ENTRY() diff --git a/mpi/amd64/mpih-mul3.S b/mpi/amd64/mpih-mul3.S index f8889eb2..21a518db 100644 --- a/mpi/amd64/mpih-mul3.S +++ b/mpi/amd64/mpih-mul3.S @@ -40,6 +40,7 @@ * mpi_limb_t s2_limb) (rcx) */ TEXT + ALIGN(5) GLOBL C_SYMBOL_NAME(_gcry_mpih_submul_1) C_SYMBOL_NAME(_gcry_mpih_submul_1:) FUNC_ENTRY() diff --git a/mpi/amd64/mpih-rshift.S b/mpi/amd64/mpih-rshift.S index 8ecf155f..5f117cb1 100644 --- a/mpi/amd64/mpih-rshift.S +++ b/mpi/amd64/mpih-rshift.S @@ -39,7 +39,8 @@ * unsigned cnt) rcx */ -.text + TEXT + ALIGN(5) .globl C_SYMBOL_NAME(_gcry_mpih_rshift) C_SYMBOL_NAME(_gcry_mpih_rshift:) FUNC_ENTRY() diff --git a/mpi/amd64/mpih-sub1.S b/mpi/amd64/mpih-sub1.S index d60b58a5..dd645314 100644 --- a/mpi/amd64/mpih-sub1.S +++ b/mpi/amd64/mpih-sub1.S @@ -39,7 +39,8 @@ * mpi_ptr_t s2_ptr, rdx * mpi_size_t size) rcx */ -.text + TEXT + ALIGN(5) .globl C_SYMBOL_NAME(_gcry_mpih_sub_n) C_SYMBOL_NAME(_gcry_mpih_sub_n:) FUNC_ENTRY() -- 2.37.2 From jussi.kivilinna at iki.fi Tue Jan 17 20:17:40 2023 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Tue, 17 Jan 2023 21:17:40 +0200 Subject: [PATCH 6/7] s390x-asm: move constant data to read-only section In-Reply-To: <20230117191741.718995-1-jussi.kivilinna@iki.fi> References: <20230117191741.718995-1-jussi.kivilinna@iki.fi> Message-ID: <20230117191741.718995-6-jussi.kivilinna@iki.fi> * 
cipher/chacha20-s390x.S: Move constant data to read-only section; Align functions to 16 bytes. * cipher/poly1305-s390x.S: Likewise. -- Signed-off-by: Jussi Kivilinna --- cipher/chacha20-s390x.S | 15 ++++++++++----- cipher/poly1305-s390x.S | 2 +- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/cipher/chacha20-s390x.S b/cipher/chacha20-s390x.S index 9b1d59c6..5a931998 100644 --- a/cipher/chacha20-s390x.S +++ b/cipher/chacha20-s390x.S @@ -26,9 +26,12 @@ #include "asm-poly1305-s390x.h" .machine "z13+vx" -.text +.section .rodata + +ELF(.type _gcry_chacha20_s390x_vx_constants, at function;) .balign 16 +_gcry_chacha20_s390x_vx_constants: .Lconsts: .Lwordswap: .byte 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3 @@ -302,7 +305,9 @@ 4-way && 2-way && 1-way chacha20 ("horizontal") **********************************************************************/ -.balign 8 +.text + +.balign 16 .globl _gcry_chacha20_s390x_vx_blocks4_2_1 ELF(.type _gcry_chacha20_s390x_vx_blocks4_2_1, at function;) @@ -578,7 +583,7 @@ ELF(.size _gcry_chacha20_s390x_vx_blocks4_2_1, 4-way && 2-way && 1-way stitched chacha20-poly1305 ("horizontal") **********************************************************************/ -.balign 8 +.balign 16 .globl _gcry_chacha20_poly1305_s390x_vx_blocks4_2_1 ELF(.type _gcry_chacha20_poly1305_s390x_vx_blocks4_2_1, at function;) @@ -1058,7 +1063,7 @@ ELF(.size _gcry_chacha20_poly1305_s390x_vx_blocks4_2_1, vpdi vc, tmpc, vd, 0; \ vpdi vd, tmpc, vd, 5; -.balign 8 +.balign 16 .globl _gcry_chacha20_s390x_vx_blocks8 ELF(.type _gcry_chacha20_s390x_vx_blocks8, at function;) @@ -1276,7 +1281,7 @@ ELF(.size _gcry_chacha20_s390x_vx_blocks8, 8-way stitched chacha20-poly1305 ("vertical") **********************************************************************/ -.balign 8 +.balign 16 .globl _gcry_chacha20_poly1305_s390x_vx_blocks8 ELF(.type _gcry_chacha20_poly1305_s390x_vx_blocks8, at function;) diff --git a/cipher/poly1305-s390x.S b/cipher/poly1305-s390x.S index 28bed560..5ba424e4 100644 --- a/cipher/poly1305-s390x.S +++ b/cipher/poly1305-s390x.S @@ -26,7 +26,7 @@ .text -.balign 8 +.balign 16 .globl _gcry_poly1305_s390x_blocks1 ELF(.type _gcry_poly1305_s390x_blocks1, at function;) -- 2.37.2 From jussi.kivilinna at iki.fi Tue Jan 17 20:17:36 2023 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Tue, 17 Jan 2023 21:17:36 +0200 Subject: [PATCH 2/7] amd64-asm: move constant data to read-only section for hash/mac algos In-Reply-To: <20230117191741.718995-1-jussi.kivilinna@iki.fi> References: <20230117191741.718995-1-jussi.kivilinna@iki.fi> Message-ID: <20230117191741.718995-2-jussi.kivilinna@iki.fi> * cipher/asm-common-amd64.h (SECTION_RODATA): New. * cipher/blake2b-amd64-avx2.S: Use read-only section for constant data; Align text section to 64 bytes. * cipher/blake2b-amd64-avx512.S: Likewise. * cipher/blake2s-amd64-avx.S: Likewise. * cipher/blake2s-amd64-avx512.S: Likewise. * cipher/poly1305-amd64-avx512.S: Likewise. * cipher/sha1-avx-amd64.S: Likewise. * cipher/sha1-avx-bmi2-amd64.S: Likewise. * cipher/sha1-avx2-bmi2-amd64.S: Likewise. * cipher/sha1-ssse3-amd64.S: Likewise. * cipher/sha256-avx-amd64.S: Likewise. * cipher/sha256-avx2-bmi2-amd64.S: Likewise. * cipher/sha256-ssse3-amd64.S: Likewise. * cipher/sha512-avx-amd64.S: Likewise. * cipher/sha512-avx2-bmi2-amd64.S: Likewise. * cipher/sha512-avx512-amd64.S: Likewise. * cipher/sha512-ssse3-amd64.S: Likewise. * cipher/sha3-avx-bmi2-amd64.S: Likewise. * cipher/whirlpool-sse2-amd64.S: Likewise. 
-- Signed-off-by: Jussi Kivilinna --- cipher/asm-common-amd64.h | 6 ++++++ cipher/blake2b-amd64-avx2.S | 7 ++++--- cipher/blake2b-amd64-avx512.S | 10 ++++++---- cipher/blake2s-amd64-avx.S | 9 ++++++--- cipher/blake2s-amd64-avx512.S | 10 ++++++---- cipher/poly1305-amd64-avx512.S | 7 +++++-- cipher/sha1-avx-amd64.S | 8 ++++++-- cipher/sha1-avx-bmi2-amd64.S | 9 +++++++-- cipher/sha1-avx2-bmi2-amd64.S | 9 +++++++-- cipher/sha1-ssse3-amd64.S | 9 +++++++-- cipher/sha256-avx-amd64.S | 7 ++++++- cipher/sha256-avx2-bmi2-amd64.S | 8 +++++++- cipher/sha256-ssse3-amd64.S | 7 ++++++- cipher/sha512-avx-amd64.S | 7 ++++++- cipher/sha512-avx2-bmi2-amd64.S | 7 ++++++- cipher/sha512-avx512-amd64.S | 4 +++- cipher/sha512-ssse3-amd64.S | 7 ++++++- cipher/sm3-avx-bmi2-amd64.S | 6 ++++-- cipher/whirlpool-sse2-amd64.S | 2 +- 19 files changed, 105 insertions(+), 34 deletions(-) diff --git a/cipher/asm-common-amd64.h b/cipher/asm-common-amd64.h index d9bbc01b..870fef9a 100644 --- a/cipher/asm-common-amd64.h +++ b/cipher/asm-common-amd64.h @@ -29,6 +29,12 @@ # define ELF(...) /*_*/ #endif +#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS +# define SECTION_RODATA .section .rdata +#else +# define SECTION_RODATA .section .rodata +#endif + #ifdef __PIC__ # define rRIP (%rip) #else diff --git a/cipher/blake2b-amd64-avx2.S b/cipher/blake2b-amd64-avx2.S index 3601b65f..43c2cce1 100644 --- a/cipher/blake2b-amd64-avx2.S +++ b/cipher/blake2b-amd64-avx2.S @@ -31,8 +31,6 @@ #include "asm-common-amd64.h" -.text - /* register macros */ #define RSTATE %rdi #define RINBLKS %rsi @@ -185,8 +183,10 @@ G2(ROW1, ROW2, ROW3, ROW4, m4); \ UNDIAGONALIZE(ROW1, ROW2, ROW3, ROW4); -blake2b_data: +SECTION_RODATA .align 32 +ELF(.type _blake2b_avx2_data, at object;) +_blake2b_avx2_data: .Liv: .quad 0x6a09e667f3bcc908, 0xbb67ae8584caa73b .quad 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1 @@ -197,6 +197,7 @@ blake2b_data: .Lshuf_ror24: .byte 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10 +.text .align 64 .globl _gcry_blake2b_transform_amd64_avx2 ELF(.type _gcry_blake2b_transform_amd64_avx2, at function;) diff --git a/cipher/blake2b-amd64-avx512.S b/cipher/blake2b-amd64-avx512.S index 18b0c3ad..fe938730 100644 --- a/cipher/blake2b-amd64-avx512.S +++ b/cipher/blake2b-amd64-avx512.S @@ -31,8 +31,6 @@ #include "asm-common-amd64.h" -.text - /* register macros */ #define RSTATE %rdi #define RINBLKS %rsi @@ -180,9 +178,11 @@ G2(ROW1, ROW2, ROW3, ROW4, m4); \ UNDIAGONALIZE(ROW1, ROW2, ROW3, ROW4) -ELF(.type blake2b_data, at object;) -blake2b_data: +SECTION_RODATA + .align 32 +ELF(.type _blake2b_avx512_data, at object;) +_blake2b_avx512_data: .Liv: .quad 0x6a09e667f3bcc908, 0xbb67ae8584caa73b .quad 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1 @@ -209,6 +209,8 @@ blake2b_data: .Lgmask9: GEN_GMASK(10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0) +.text + .align 64 .globl _gcry_blake2b_transform_amd64_avx512 ELF(.type _gcry_blake2b_transform_amd64_avx512, at function;) diff --git a/cipher/blake2s-amd64-avx.S b/cipher/blake2s-amd64-avx.S index 5094b4c1..44b82ab2 100644 --- a/cipher/blake2s-amd64-avx.S +++ b/cipher/blake2s-amd64-avx.S @@ -31,8 +31,6 @@ #include "asm-common-amd64.h" -.text - /* register macros */ #define RSTATE %rdi #define RINBLKS %rsi @@ -171,8 +169,11 @@ G2(ROW1, ROW2, ROW3, ROW4, m4); \ UNDIAGONALIZE(ROW1, ROW2, ROW3, ROW4); -blake2s_data: +SECTION_RODATA + .align 16 +ELF(.type _blake2s_avx_data, at object;) +_blake2s_avx_data: .Liv: .long 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A .long 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 @@ 
-181,6 +182,8 @@ blake2s_data: .Lshuf_ror8: .byte 1,2,3,0,5,6,7,4,9,10,11,8,13,14,15,12 +.text + .align 64 .globl _gcry_blake2s_transform_amd64_avx ELF(.type _gcry_blake2s_transform_amd64_avx, at function;) diff --git a/cipher/blake2s-amd64-avx512.S b/cipher/blake2s-amd64-avx512.S index ddcdfd67..e2da2a18 100644 --- a/cipher/blake2s-amd64-avx512.S +++ b/cipher/blake2s-amd64-avx512.S @@ -31,8 +31,6 @@ #include "asm-common-amd64.h" -.text - /* register macros */ #define RSTATE %rdi #define RINBLKS %rsi @@ -164,13 +162,17 @@ G2(ROW1, ROW2, ROW3, ROW4, m4); \ UNDIAGONALIZE(ROW1, ROW2, ROW3, ROW4); -ELF(.type blake2s_data, at object;) -blake2s_data: +SECTION_RODATA + +ELF(.type _blake2s_avx512_data, at object;) .align 16 +_blake2s_avx512_data: .Liv: .long 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A .long 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 +.text + .align 64 .globl _gcry_blake2s_transform_amd64_avx512 ELF(.type _gcry_blake2s_transform_amd64_avx512, at function;) diff --git a/cipher/poly1305-amd64-avx512.S b/cipher/poly1305-amd64-avx512.S index 9beed8ad..cf176129 100644 --- a/cipher/poly1305-amd64-avx512.S +++ b/cipher/poly1305-amd64-avx512.S @@ -44,7 +44,7 @@ .intel_syntax noprefix -.text +SECTION_RODATA ELF(.type _gcry_poly1305_avx512_consts, at object) _gcry_poly1305_avx512_consts: @@ -1575,7 +1575,10 @@ ELF(.size _gcry_poly1305_avx512_consts,.-_gcry_poly1305_avx512_consts) ;; arg3 - Input/output hash ;; arg4 - Poly1305 key */ -.align 32 + +.text + +.align 64 .globl _gcry_poly1305_amd64_avx512_blocks ELF(.type _gcry_poly1305_amd64_avx512_blocks, at function;) _gcry_poly1305_amd64_avx512_blocks: diff --git a/cipher/sha1-avx-amd64.S b/cipher/sha1-avx-amd64.S index acada960..5b9e0500 100644 --- a/cipher/sha1-avx-amd64.S +++ b/cipher/sha1-avx-amd64.S @@ -47,7 +47,10 @@ /* Constants */ -.text +SECTION_RODATA + +ELF(.type _sha1_avx_consts, at object) +_sha1_avx_consts: #define K1 0x5A827999 #define K2 0x6ED9EBA1 #define K3 0x8F1BBCDC @@ -195,6 +198,7 @@ vpaddd (.LK_XMM + ((i)/20)*16) rRIP, W, tmp0; \ vmovdqa tmp0, WK((i)&~3); +.text /* * Transform nblks*64 bytes (nblks*16 32-bit words) at DATA. @@ -205,7 +209,7 @@ */ .globl _gcry_sha1_transform_amd64_avx ELF(.type _gcry_sha1_transform_amd64_avx, at function) -.align 16 +.align 64 _gcry_sha1_transform_amd64_avx: /* input: * %rdi: ctx, CTX diff --git a/cipher/sha1-avx-bmi2-amd64.S b/cipher/sha1-avx-bmi2-amd64.S index 5f4b9e69..9df147c2 100644 --- a/cipher/sha1-avx-bmi2-amd64.S +++ b/cipher/sha1-avx-bmi2-amd64.S @@ -48,7 +48,11 @@ /* Constants */ -.text +SECTION_RODATA + +ELF(.type _sha1_avx_bmi2_consts, at object) +_sha1_avx_bmi2_consts: + .align 16 .Lbswap_shufb_ctl: .long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f @@ -194,6 +198,7 @@ vpaddd K, W, tmp0; \ vmovdqa tmp0, WK((i)&~3); +.text /* * Transform nblks*64 bytes (nblks*16 32-bit words) at DATA. 
@@ -204,7 +209,7 @@ */ .globl _gcry_sha1_transform_amd64_avx_bmi2 ELF(.type _gcry_sha1_transform_amd64_avx_bmi2, at function) -.align 16 +.align 64 _gcry_sha1_transform_amd64_avx_bmi2: /* input: * %rdi: ctx, CTX diff --git a/cipher/sha1-avx2-bmi2-amd64.S b/cipher/sha1-avx2-bmi2-amd64.S index ed52761b..0db1d9b9 100644 --- a/cipher/sha1-avx2-bmi2-amd64.S +++ b/cipher/sha1-avx2-bmi2-amd64.S @@ -48,9 +48,13 @@ /* Constants */ +SECTION_RODATA + #define WK_STACK_WORDS (80 * 2) -.text +ELF(.type _sha1_avx2_bmi2_consts, at object) +_sha1_avx2_bmi2_consts: + .align 16 .Lbswap_shufb_ctl: .long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f @@ -200,6 +204,7 @@ vpaddd K, W, tmp0; \ vmovdqa tmp0, PRE_WK((i)&~3); +.text /* * Transform 2*nblks*64 bytes (2*nblks*16 32-bit words) at DATA. @@ -210,7 +215,7 @@ */ .globl _gcry_sha1_transform_amd64_avx2_bmi2 ELF(.type _gcry_sha1_transform_amd64_avx2_bmi2, at function) -.align 16 +.align 64 _gcry_sha1_transform_amd64_avx2_bmi2: /* input: * %rdi: ctx, CTX diff --git a/cipher/sha1-ssse3-amd64.S b/cipher/sha1-ssse3-amd64.S index f09b1de1..afea6501 100644 --- a/cipher/sha1-ssse3-amd64.S +++ b/cipher/sha1-ssse3-amd64.S @@ -47,7 +47,11 @@ /* Constants */ -.text +SECTION_RODATA + +ELF(.type _sha1_ssse3_consts, at object) +_sha1_ssse3_consts: + #define K1 0x5A827999 #define K2 0x6ED9EBA1 #define K3 0x8F1BBCDC @@ -207,6 +211,7 @@ #define CLEAR_REG(reg) pxor reg, reg; +.text /* * Transform nblks*64 bytes (nblks*16 32-bit words) at DATA. @@ -217,7 +222,7 @@ */ .globl _gcry_sha1_transform_amd64_ssse3 ELF(.type _gcry_sha1_transform_amd64_ssse3, at function) -.align 16 +.align 64 _gcry_sha1_transform_amd64_ssse3: /* input: * %rdi: ctx, CTX diff --git a/cipher/sha256-avx-amd64.S b/cipher/sha256-avx-amd64.S index be8a799d..8b2cbfe8 100644 --- a/cipher/sha256-avx-amd64.S +++ b/cipher/sha256-avx-amd64.S @@ -342,7 +342,7 @@ .text .globl _gcry_sha256_transform_amd64_avx ELF(.type _gcry_sha256_transform_amd64_avx, at function;) -.align 16 +.align 64 _gcry_sha256_transform_amd64_avx: CFI_STARTPROC() vzeroupper @@ -475,6 +475,11 @@ _gcry_sha256_transform_amd64_avx: CFI_ENDPROC() +SECTION_RODATA + +ELF(.type _sha256_avx_consts, at object) +_sha256_avx_consts: + .align 16 .LK256: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 diff --git a/cipher/sha256-avx2-bmi2-amd64.S b/cipher/sha256-avx2-bmi2-amd64.S index 60ad442c..93919ead 100644 --- a/cipher/sha256-avx2-bmi2-amd64.S +++ b/cipher/sha256-avx2-bmi2-amd64.S @@ -247,7 +247,7 @@ .text .globl _gcry_sha256_transform_amd64_avx2 ELF(.type _gcry_sha256_transform_amd64_avx2, at function) -.align 32 +.align 64 _gcry_sha256_transform_amd64_avx2: CFI_STARTPROC() xor eax, eax @@ -477,6 +477,12 @@ _gcry_sha256_transform_amd64_avx2: ret_spec_stop CFI_ENDPROC() + +SECTION_RODATA + +ELF(.type _sha256_avx2_consts, at object) +_sha256_avx2_consts: + .align 64 .LK256: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 diff --git a/cipher/sha256-ssse3-amd64.S b/cipher/sha256-ssse3-amd64.S index 401ff6f4..41c15420 100644 --- a/cipher/sha256-ssse3-amd64.S +++ b/cipher/sha256-ssse3-amd64.S @@ -349,7 +349,7 @@ .text .globl _gcry_sha256_transform_amd64_ssse3 ELF(.type _gcry_sha256_transform_amd64_ssse3, at function;) -.align 16 +.align 64 _gcry_sha256_transform_amd64_ssse3: CFI_STARTPROC() push rbx @@ -497,6 +497,11 @@ _gcry_sha256_transform_amd64_ssse3: CFI_ENDPROC() +SECTION_RODATA + +ELF(.type _sha256_ssse3_consts, at object) +_sha256_ssse3_consts: + .align 16 .LK256: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 diff --git 
a/cipher/sha512-avx-amd64.S b/cipher/sha512-avx-amd64.S index bfc4435d..e8663756 100644 --- a/cipher/sha512-avx-amd64.S +++ b/cipher/sha512-avx-amd64.S @@ -246,7 +246,7 @@ */ .globl _gcry_sha512_transform_amd64_avx ELF(.type _gcry_sha512_transform_amd64_avx, at function;) -.align 16 +.align 64 _gcry_sha512_transform_amd64_avx: CFI_STARTPROC() xor eax, eax @@ -408,6 +408,11 @@ _gcry_sha512_transform_amd64_avx: ;;; Binary Data */ +SECTION_RODATA + +ELF(.type _sha512_avx_consts, at object) +_sha512_avx_consts: + .align 16 /* Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. */ diff --git a/cipher/sha512-avx2-bmi2-amd64.S b/cipher/sha512-avx2-bmi2-amd64.S index a431e196..6e6e1e43 100644 --- a/cipher/sha512-avx2-bmi2-amd64.S +++ b/cipher/sha512-avx2-bmi2-amd64.S @@ -274,7 +274,7 @@ */ .globl _gcry_sha512_transform_amd64_avx2 ELF(.type _gcry_sha512_transform_amd64_avx2, at function;) -.align 16 +.align 64 _gcry_sha512_transform_amd64_avx2: CFI_STARTPROC() xor eax, eax @@ -445,6 +445,11 @@ _gcry_sha512_transform_amd64_avx2: /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ /*;; Binary Data */ +SECTION_RODATA + +ELF(.type _sha512_avx2_consts, at object) +_sha512_avx2_consts: + .align 64 /* K[t] used in SHA512 hashing */ .LK512: diff --git a/cipher/sha512-avx512-amd64.S b/cipher/sha512-avx512-amd64.S index 431fb3e9..f113824c 100644 --- a/cipher/sha512-avx512-amd64.S +++ b/cipher/sha512-avx512-amd64.S @@ -256,7 +256,7 @@ */ .globl _gcry_sha512_transform_amd64_avx512 ELF(.type _gcry_sha512_transform_amd64_avx512, at function;) -.align 16 +.align 64 _gcry_sha512_transform_amd64_avx512: CFI_STARTPROC() xor eax, eax @@ -404,6 +404,8 @@ ELF(.size _gcry_sha512_transform_amd64_avx512,.-_gcry_sha512_transform_amd64_avx /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ /*;; Binary Data */ +SECTION_RODATA + ELF(.type _gcry_sha512_avx512_consts, at object) _gcry_sha512_avx512_consts: .align 64 diff --git a/cipher/sha512-ssse3-amd64.S b/cipher/sha512-ssse3-amd64.S index 9cc30892..0a26f215 100644 --- a/cipher/sha512-ssse3-amd64.S +++ b/cipher/sha512-ssse3-amd64.S @@ -249,7 +249,7 @@ */ .globl _gcry_sha512_transform_amd64_ssse3 ELF(.type _gcry_sha512_transform_amd64_ssse3, at function;) -.align 16 +.align 64 _gcry_sha512_transform_amd64_ssse3: CFI_STARTPROC() xor eax, eax @@ -414,6 +414,11 @@ _gcry_sha512_transform_amd64_ssse3: ;;; Binary Data */ +SECTION_RODATA + +ELF(.type _sha512_ssse3_consts, at object) +_sha512_ssse3_consts: + .align 16 /* Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. */ diff --git a/cipher/sm3-avx-bmi2-amd64.S b/cipher/sm3-avx-bmi2-amd64.S index d9b6206a..9066be33 100644 --- a/cipher/sm3-avx-bmi2-amd64.S +++ b/cipher/sm3-avx-bmi2-amd64.S @@ -41,7 +41,7 @@ /* Constants */ -.text +SECTION_RODATA .align 16 ELF(.type _gcry_sm3_avx2_consts, at object) _gcry_sm3_avx2_consts: @@ -334,6 +334,8 @@ ELF(.size _gcry_sm3_avx2_consts,.-_gcry_sm3_avx2_consts) vpxor w0, XTMP4, XTMP1; \ vmovdqa XTMP1, XW_W1W2_ADDR((round), 0); +.text + /* * Transform nblks*64 bytes (nblks*16 32-bit words) at DATA. 
* @@ -343,7 +345,7 @@ ELF(.size _gcry_sm3_avx2_consts,.-_gcry_sm3_avx2_consts) */ .globl _gcry_sm3_transform_amd64_avx_bmi2 ELF(.type _gcry_sm3_transform_amd64_avx_bmi2, at function) -.align 16 +.align 64 _gcry_sm3_transform_amd64_avx_bmi2: /* input: * %rdi: ctx, CTX diff --git a/cipher/whirlpool-sse2-amd64.S b/cipher/whirlpool-sse2-amd64.S index 37648faa..b26dfed2 100644 --- a/cipher/whirlpool-sse2-amd64.S +++ b/cipher/whirlpool-sse2-amd64.S @@ -152,7 +152,7 @@ #define RB_ADD6 RB6, RB7, RB0, RB1, RB2, RB3, RB4, RB5 #define RB_ADD7 RB7, RB0, RB1, RB2, RB3, RB4, RB5, RB6 -.align 8 +.align 64 .globl _gcry_whirlpool_transform_amd64 ELF(.type _gcry_whirlpool_transform_amd64, at function;) -- 2.37.2 From jussi.kivilinna at iki.fi Tue Jan 17 20:17:39 2023 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Tue, 17 Jan 2023 21:17:39 +0200 Subject: [PATCH 5/7] powerpc-asm: move constant data to read-only section In-Reply-To: <20230117191741.718995-1-jussi.kivilinna@iki.fi> References: <20230117191741.718995-1-jussi.kivilinna@iki.fi> Message-ID: <20230117191741.718995-5-jussi.kivilinna@iki.fi> * cipher/chacha20-p10le-8x.s: Move constant data to read-only section. -- Signed-off-by: Jussi Kivilinna --- cipher/chacha20-p10le-8x.s | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cipher/chacha20-p10le-8x.s b/cipher/chacha20-p10le-8x.s index 427c6310..ff68c9ef 100644 --- a/cipher/chacha20-p10le-8x.s +++ b/cipher/chacha20-p10le-8x.s @@ -848,7 +848,7 @@ Out_no_chacha: li 3, 0 blr -.data +.section .rodata .align 4 sigma: .long 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574 -- 2.37.2 From jussi.kivilinna at iki.fi Tue Jan 17 20:17:41 2023 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Tue, 17 Jan 2023 21:17:41 +0200 Subject: [PATCH 7/7] aarch64-asm: move constant data to read-only section In-Reply-To: <20230117191741.718995-1-jussi.kivilinna@iki.fi> References: <20230117191741.718995-1-jussi.kivilinna@iki.fi> Message-ID: <20230117191741.718995-7-jussi.kivilinna@iki.fi> * cipher/asm-common-aarch64.h (SECTION_RODATA) (GET_DATA_POINTER): New. (GET_LOCAL_POINTER): Remove. * cipher/camellia-aarch64.S: Likewise. * cipher/chacha20-aarch64.S: Likewise. * cipher/cipher-gcm-armv8-aarch64-ce.S: Likewise. * cipher/crc-armv8-aarch64-ce.S: Likewise. * cipher/rijndael-aarch64.S: Likewise. * cipher/sha1-armv8-aarch64-ce.S: Likewise. * cipher/sha256-armv8-aarch64-ce.S: Likewise. * cipher/sm3-aarch64.S: Likewise. * cipher/sm3-armv8-aarch64-ce.S: Likewise. * cipher/sm4-aarch64.S: Likewise. * cipher/sm4-armv9-aarch64-sve-ce.S: Likewise. -- Signed-off-by: Jussi Kivilinna --- cipher/asm-common-aarch64.h | 13 +++++++++++-- cipher/camellia-aarch64.S | 11 ++++------- cipher/chacha20-aarch64.S | 16 ++++++++++------ cipher/cipher-gcm-armv8-aarch64-ce.S | 13 ++++++++----- cipher/crc-armv8-aarch64-ce.S | 11 +++++++---- cipher/rijndael-aarch64.S | 4 ---- cipher/sha1-armv8-aarch64-ce.S | 9 ++++++--- cipher/sha256-armv8-aarch64-ce.S | 9 ++++++--- cipher/sm3-aarch64.S | 7 +++++-- cipher/sm3-armv8-aarch64-ce.S | 7 +++++-- cipher/sm4-aarch64.S | 6 ++++-- cipher/sm4-armv9-aarch64-sve-ce.S | 6 +++--- 12 files changed, 69 insertions(+), 43 deletions(-) diff --git a/cipher/asm-common-aarch64.h b/cipher/asm-common-aarch64.h index b38b17a6..8e8bf8e7 100644 --- a/cipher/asm-common-aarch64.h +++ b/cipher/asm-common-aarch64.h @@ -29,8 +29,17 @@ # define ELF(...) 
/*_*/ #endif -#define GET_LOCAL_POINTER(reg, label) \ - adr reg, label; +#define SECTION_RODATA .section .rodata + +#ifdef __APPLE__ +#define GET_DATA_POINTER(reg, name) \ + adrp reg, name at GOTPAGE ; \ + add reg, reg, name at GOTPAGEOFF ; +#else +#define GET_DATA_POINTER(reg, name) \ + adrp reg, name ; \ + add reg, reg, #:lo12:name ; +#endif #ifdef HAVE_GCC_ASM_CFI_DIRECTIVES /* CFI directives to emit DWARF stack unwinding information. */ diff --git a/cipher/camellia-aarch64.S b/cipher/camellia-aarch64.S index c019c168..d53c595c 100644 --- a/cipher/camellia-aarch64.S +++ b/cipher/camellia-aarch64.S @@ -214,7 +214,7 @@ _gcry_camellia_arm_encrypt_block: * w3: keybitlen */ - GET_LOCAL_POINTER(RTAB1, _gcry_camellia_arm_tables); + GET_DATA_POINTER(RTAB1, _gcry_camellia_arm_tables); mov RMASK, #(0xff<<4); /* byte mask */ add RTAB2, RTAB1, #(1 * 4); add RTAB3, RTAB1, #(2 * 4); @@ -240,7 +240,6 @@ _gcry_camellia_arm_encrypt_block: CFI_RESTORE(x30) ret_spec_stop; CFI_RESTORE_STATE() -.ltorg .Lenc_256: enc_fls(24); @@ -254,7 +253,6 @@ _gcry_camellia_arm_encrypt_block: CFI_RESTORE(x30) ret_spec_stop; CFI_ENDPROC() -.ltorg ELF(.size _gcry_camellia_arm_encrypt_block,.-_gcry_camellia_arm_encrypt_block;) .globl _gcry_camellia_arm_decrypt_block @@ -274,7 +272,7 @@ _gcry_camellia_arm_decrypt_block: * w3: keybitlen */ - GET_LOCAL_POINTER(RTAB1, _gcry_camellia_arm_tables); + GET_DATA_POINTER(RTAB1, _gcry_camellia_arm_tables); mov RMASK, #(0xff<<4); /* byte mask */ add RTAB2, RTAB1, #(1 * 4); add RTAB3, RTAB1, #(2 * 4); @@ -301,7 +299,6 @@ _gcry_camellia_arm_decrypt_block: CFI_RESTORE(x30) ret_spec_stop; CFI_RESTORE_STATE() -.ltorg .Ldec_256: inpack(32); @@ -310,11 +307,11 @@ _gcry_camellia_arm_decrypt_block: b .Ldec_128; CFI_ENDPROC() -.ltorg ELF(.size _gcry_camellia_arm_decrypt_block,.-_gcry_camellia_arm_decrypt_block;) /* Encryption/Decryption tables */ -ELF(.type _gcry_camellia_arm_tables, at object;) +SECTION_RODATA +ELF(.type _gcry_camellia_arm_tables,%object;) .balign 32 _gcry_camellia_arm_tables: .Lcamellia_sp1110: diff --git a/cipher/chacha20-aarch64.S b/cipher/chacha20-aarch64.S index 540f892b..c07fbede 100644 --- a/cipher/chacha20-aarch64.S +++ b/cipher/chacha20-aarch64.S @@ -36,8 +36,6 @@ .cpu generic+simd -.text - #include "asm-poly1305-aarch64.h" /* register macros */ @@ -180,12 +178,16 @@ ROTATE4(b1, b2, b3, b4, 7, tmp1, tmp2, tmp3, tmp4, \ _(iop27), _(iop28), _(iop29)); +SECTION_RODATA + .align 4 +ELF(.type _gcry_chacha20_aarch64_blocks4_data_inc_counter,%object;) .globl _gcry_chacha20_aarch64_blocks4_data_inc_counter _gcry_chacha20_aarch64_blocks4_data_inc_counter: .long 0,1,2,3 .align 4 +ELF(.type _gcry_chacha20_aarch64_blocks4_data_rot8,%object;) .globl _gcry_chacha20_aarch64_blocks4_data_rot8 _gcry_chacha20_aarch64_blocks4_data_rot8: .byte 3,0,1,2 @@ -193,6 +195,8 @@ _gcry_chacha20_aarch64_blocks4_data_rot8: .byte 11,8,9,10 .byte 15,12,13,14 +.text + .align 3 .globl _gcry_chacha20_aarch64_blocks4 ELF(.type _gcry_chacha20_aarch64_blocks4,%function;) @@ -206,10 +210,10 @@ _gcry_chacha20_aarch64_blocks4: */ CFI_STARTPROC() - GET_LOCAL_POINTER(CTR, _gcry_chacha20_aarch64_blocks4_data_rot8); + GET_DATA_POINTER(CTR, _gcry_chacha20_aarch64_blocks4_data_rot8); add INPUT_CTR, INPUT, #(12*4); ld1 {ROT8.16b}, [CTR]; - GET_LOCAL_POINTER(CTR, _gcry_chacha20_aarch64_blocks4_data_inc_counter); + GET_DATA_POINTER(CTR, _gcry_chacha20_aarch64_blocks4_data_inc_counter); mov INPUT_POS, INPUT; ld1 {VCTR.16b}, [CTR]; @@ -383,10 +387,10 @@ _gcry_chacha20_poly1305_aarch64_blocks4: mov POLY_RSTATE, x4; mov 
POLY_RSRC, x5; - GET_LOCAL_POINTER(CTR, _gcry_chacha20_aarch64_blocks4_data_rot8); + GET_DATA_POINTER(CTR, _gcry_chacha20_aarch64_blocks4_data_rot8); add INPUT_CTR, INPUT, #(12*4); ld1 {ROT8.16b}, [CTR]; - GET_LOCAL_POINTER(CTR, _gcry_chacha20_aarch64_blocks4_data_inc_counter); + GET_DATA_POINTER(CTR, _gcry_chacha20_aarch64_blocks4_data_inc_counter); mov INPUT_POS, INPUT; ld1 {VCTR.16b}, [CTR]; diff --git a/cipher/cipher-gcm-armv8-aarch64-ce.S b/cipher/cipher-gcm-armv8-aarch64-ce.S index 78f3ad2d..8fd5d24a 100644 --- a/cipher/cipher-gcm-armv8-aarch64-ce.S +++ b/cipher/cipher-gcm-armv8-aarch64-ce.S @@ -25,12 +25,13 @@ .cpu generic+simd+crypto -.text - /* Constants */ +SECTION_RODATA + .align 4 +ELF(.type gcry_gcm_reduction_constant,%object;) gcry_gcm_reduction_constant: .Lrconst: .quad 0x87 @@ -149,6 +150,8 @@ gcry_gcm_reduction_constant: #define _(...) __VA_ARGS__ #define __ _() +.text + /* * unsigned int _gcry_ghash_armv8_ce_pmull (void *gcm_key, byte *result, * const byte *buf, size_t nblocks, @@ -169,7 +172,7 @@ _gcry_ghash_armv8_ce_pmull: cbz x3, .Ldo_nothing; - GET_LOCAL_POINTER(x5, .Lrconst) + GET_DATA_POINTER(x5, .Lrconst) eor vZZ.16b, vZZ.16b, vZZ.16b ld1 {rhash.16b}, [x1] @@ -368,7 +371,7 @@ _gcry_polyval_armv8_ce_pmull: cbz x3, .Lpolyval_do_nothing; - GET_LOCAL_POINTER(x5, .Lrconst) + GET_DATA_POINTER(x5, .Lrconst) eor vZZ.16b, vZZ.16b, vZZ.16b ld1 {rhash.16b}, [x1] @@ -589,7 +592,7 @@ _gcry_ghash_setup_armv8_ce_pmull: */ CFI_STARTPROC() - GET_LOCAL_POINTER(x2, .Lrconst) + GET_DATA_POINTER(x2, .Lrconst) eor vZZ.16b, vZZ.16b, vZZ.16b diff --git a/cipher/crc-armv8-aarch64-ce.S b/cipher/crc-armv8-aarch64-ce.S index b6cdbb3d..5609e368 100644 --- a/cipher/crc-armv8-aarch64-ce.S +++ b/cipher/crc-armv8-aarch64-ce.S @@ -25,8 +25,6 @@ .cpu generic+simd+crypto -.text - /* Structure of crc32_consts_s */ @@ -35,7 +33,11 @@ /* Constants */ +SECTION_RODATA + .align 6 +ELF(.type _crc32_aarch64_ce_constants,%object;) +_crc32_aarch64_ce_constants: .Lcrc32_constants: .Lcrc32_partial_fold_input_mask: .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 @@ -54,6 +56,7 @@ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff +.text /* * void _gcry_crc32r_armv8_ce_bulk (u32 *pcrc, const byte *inbuf, size_t inlen, @@ -71,7 +74,7 @@ _gcry_crc32r_armv8_ce_bulk: */ CFI_STARTPROC() - GET_LOCAL_POINTER(x7, .Lcrc32_constants) + GET_DATA_POINTER(x7, .Lcrc32_constants) add x9, x3, #consts_k(5 - 1) cmp x2, #128 @@ -280,7 +283,7 @@ _gcry_crc32_armv8_ce_bulk: */ CFI_STARTPROC() - GET_LOCAL_POINTER(x7, .Lcrc32_constants) + GET_DATA_POINTER(x7, .Lcrc32_constants) add x4, x7, #.Lcrc32_bswap_shuf - .Lcrc32_constants cmp x2, #128 ld1 {v7.16b}, [x4] diff --git a/cipher/rijndael-aarch64.S b/cipher/rijndael-aarch64.S index 184fcd20..dcb82382 100644 --- a/cipher/rijndael-aarch64.S +++ b/cipher/rijndael-aarch64.S @@ -265,7 +265,6 @@ _gcry_aes_arm_encrypt_block: mov x0, #(0); ret_spec_stop; -.ltorg .Lenc_not_128: beq .Lenc_192 @@ -278,7 +277,6 @@ _gcry_aes_arm_encrypt_block: b .Lenc_done; -.ltorg .Lenc_192: encround(8, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key); encround(9, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key); @@ -488,7 +486,6 @@ _gcry_aes_arm_decrypt_block: mov x0, #(0); ret_spec_stop; -.ltorg .Ldec_256: beq .Ldec_192; @@ -500,7 +497,6 @@ _gcry_aes_arm_decrypt_block: b .Ldec_tail; -.ltorg .Ldec_192: firstdecround(11, RA, RB, RC, RD, RNA, RNB, RNC, RND); decround(10, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key); diff --git 
a/cipher/sha1-armv8-aarch64-ce.S b/cipher/sha1-armv8-aarch64-ce.S index f95717ee..28fb1c14 100644 --- a/cipher/sha1-armv8-aarch64-ce.S +++ b/cipher/sha1-armv8-aarch64-ce.S @@ -25,16 +25,17 @@ .cpu generic+simd+crypto -.text - /* Constants */ +SECTION_RODATA + #define K1 0x5A827999 #define K2 0x6ED9EBA1 #define K3 0x8F1BBCDC #define K4 0xCA62C1D6 .align 4 +ELF(.type gcry_sha1_aarch64_ce_K_VEC,%object;) gcry_sha1_aarch64_ce_K_VEC: .LK_VEC: .LK1: .long K1, K1, K1, K1 @@ -91,6 +92,8 @@ gcry_sha1_aarch64_ce_K_VEC: #define CLEAR_REG(reg) movi reg.16b, #0; +.text + /* * unsigned int * _gcry_sha1_transform_armv8_ce (void *ctx, const unsigned char *data, @@ -109,7 +112,7 @@ _gcry_sha1_transform_armv8_ce: cbz x2, .Ldo_nothing; - GET_LOCAL_POINTER(x4, .LK_VEC); + GET_DATA_POINTER(x4, .LK_VEC); ld1 {vH0123.4s}, [x0] /* load h0,h1,h2,h3 */ ld1 {vK1.4s-vK4.4s}, [x4] /* load K1,K2,K3,K4 */ diff --git a/cipher/sha256-armv8-aarch64-ce.S b/cipher/sha256-armv8-aarch64-ce.S index 5616eada..43b941b6 100644 --- a/cipher/sha256-armv8-aarch64-ce.S +++ b/cipher/sha256-armv8-aarch64-ce.S @@ -25,12 +25,13 @@ .cpu generic+simd+crypto -.text - /* Constants */ +SECTION_RODATA + .align 4 +ELF(.type gcry_sha256_aarch64_ce_K,%object;) gcry_sha256_aarch64_ce_K: .LK: .long 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 @@ -101,6 +102,8 @@ gcry_sha256_aarch64_ce_K: #define CLEAR_REG(reg) movi reg.16b, #0; +.text + /* * unsigned int * _gcry_sha256_transform_armv8_ce (u32 state[8], const void *input_data, @@ -119,7 +122,7 @@ _gcry_sha256_transform_armv8_ce: cbz x2, .Ldo_nothing; - GET_LOCAL_POINTER(x3, .LK); + GET_DATA_POINTER(x3, .LK); mov x4, x3 ld1 {vH0123.4s-vH4567.4s}, [x0] /* load state */ diff --git a/cipher/sm3-aarch64.S b/cipher/sm3-aarch64.S index 0e58254b..a4c132d3 100644 --- a/cipher/sm3-aarch64.S +++ b/cipher/sm3-aarch64.S @@ -29,7 +29,7 @@ /* Constants */ -.text +SECTION_RODATA .align 4 ELF(.type _gcry_sm3_aarch64_consts, at object) _gcry_sm3_aarch64_consts: @@ -383,6 +383,9 @@ ELF(.size _gcry_sm3_aarch64_consts,.-_gcry_sm3_aarch64_consts) #define SCHED_W_W5W0W1W2W3W4_3(iop_num, round) \ SCHED_W_3_##iop_num(round, W5, W0, W1, W2, W3, W4) + +.text + /* * Transform nblks*64 bytes (nblks*16 32-bit words) at DATA. * @@ -425,7 +428,7 @@ _gcry_sm3_transform_aarch64: CFI_DEF_CFA_REGISTER(RFRAME); sub addr0, sp, #STACK_SIZE; - GET_LOCAL_POINTER(RKPTR, .LKtable); + GET_DATA_POINTER(RKPTR, .LKtable); and sp, addr0, #(~63); /* Preload first block. 
*/ diff --git a/cipher/sm3-armv8-aarch64-ce.S b/cipher/sm3-armv8-aarch64-ce.S index d592d08a..fdee3ccb 100644 --- a/cipher/sm3-armv8-aarch64-ce.S +++ b/cipher/sm3-armv8-aarch64-ce.S @@ -73,7 +73,7 @@ /* Constants */ -.text +SECTION_RODATA .align 4 ELF(.type _gcry_sm3_armv8_ce_consts, at object) _gcry_sm3_armv8_ce_consts: @@ -152,6 +152,9 @@ ELF(.size _gcry_sm3_armv8_ce_consts,.-_gcry_sm3_armv8_ce_consts) #define R1(s0, s1, s2, s3, s4, IOP) R(a, s0, s1, s2, s3, s4, IOP) #define R2(s0, s1, s2, s3, s4, IOP) R(b, s0, s1, s2, s3, s4, IOP) + +.text + .align 3 .global _gcry_sm3_transform_armv8_ce ELF(.type _gcry_sm3_transform_armv8_ce,%function;) @@ -170,7 +173,7 @@ _gcry_sm3_transform_armv8_ce: ext CTX2.16b, CTX2.16b, CTX2.16b, #8; .Lloop: - GET_LOCAL_POINTER(x3, .Lsm3_Ktable); + GET_DATA_POINTER(x3, .Lsm3_Ktable); ld1 {v0.16b-v3.16b}, [x1], #64; sub x2, x2, #1; diff --git a/cipher/sm4-aarch64.S b/cipher/sm4-aarch64.S index 8d06991b..30a19358 100644 --- a/cipher/sm4-aarch64.S +++ b/cipher/sm4-aarch64.S @@ -30,7 +30,7 @@ /* Constants */ -.text +SECTION_RODATA .align 4 ELF(.type _gcry_sm4_aarch64_consts, at object) _gcry_sm4_aarch64_consts: @@ -84,7 +84,7 @@ ELF(.size _gcry_sm4_aarch64_consts,.-_gcry_sm4_aarch64_consts) /* Helper macros. */ #define preload_sbox(ptr) \ - GET_LOCAL_POINTER(ptr, .Lsm4_sbox); \ + GET_DATA_POINTER(ptr, .Lsm4_sbox); \ ld1 {v16.16b-v19.16b}, [ptr], #64; \ ld1 {v20.16b-v23.16b}, [ptr], #64; \ ld1 {v24.16b-v27.16b}, [ptr], #64; \ @@ -111,6 +111,8 @@ ELF(.size _gcry_sm4_aarch64_consts,.-_gcry_sm4_aarch64_consts) zip2 s3.2d, RTMP3.2d, RTMP1.2d; +.text + .align 3 ELF(.type sm4_aarch64_crypt_blk1_4,%function;) sm4_aarch64_crypt_blk1_4: diff --git a/cipher/sm4-armv9-aarch64-sve-ce.S b/cipher/sm4-armv9-aarch64-sve-ce.S index 21e34e6f..f180cfdb 100644 --- a/cipher/sm4-armv9-aarch64-sve-ce.S +++ b/cipher/sm4-armv9-aarch64-sve-ce.S @@ -32,7 +32,7 @@ /* Constants */ -.text +SECTION_RODATA .align 4 ELF(.type _gcry_sm4_armv9_svesm4_consts, at object) _gcry_sm4_armv9_svesm4_consts: @@ -167,7 +167,7 @@ ELF(.size _gcry_sm4_armv9_svesm4_consts,.-_gcry_sm4_armv9_svesm4_consts) /* Helper macros. */ #define PREPARE() \ - GET_LOCAL_POINTER(x7, .Lbswap128_mask); \ + GET_DATA_POINTER(x7, .Lbswap128_mask); \ ptrue p0.b, ALL; \ rdvl x5, #1; \ ld1b {RSWAP128.b}, p0/z, [x7]; \ @@ -811,7 +811,7 @@ _gcry_sm4_armv9_sve_ce_ctr_enc: PREPARE(); dup RZERO.d, #0; - GET_LOCAL_POINTER(x6, .Lle128_inc); + GET_DATA_POINTER(x6, .Lle128_inc); ld1b {RLE128_INC.b}, p0/z, [x6]; ldp x7, x8, [x3]; -- 2.37.2 From jussi.kivilinna at iki.fi Tue Jan 17 20:17:37 2023 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Tue, 17 Jan 2023 21:17:37 +0200 Subject: [PATCH 3/7] amd64-asm: move constant data to read-only section for cipher algos In-Reply-To: <20230117191741.718995-1-jussi.kivilinna@iki.fi> References: <20230117191741.718995-1-jussi.kivilinna@iki.fi> Message-ID: <20230117191741.718995-3-jussi.kivilinna@iki.fi> * cipher/arcfour-amd64.S: Move constant data to read-only section; Align text section to 64 bytes and functions to 16 bytes. * cipher/blowfish-amd64.S: Likewise. * cipher/camellia-aesni-avx-amd64.S: Likewise. * cipher/camellia-aesni-avx2-amd64.h: Likewise. * cipher/camellia-gfni-avx512-amd64.S: Likewise. * cipher/cast5-amd64.S: Likewise. * cipher/chacha20-amd64-avx2.S: Likewise. * cipher/chacha20-amd64-avx512.S: Likewise. * cipher/chacha20-amd64-ssse3.S: Likewise. * cipher/des-amd64.s: Likewise. * cipher/rijndael-amd64.S: Likewise. * cipher/rijndael-ssse3-amd64-asm.S: Likewise. 
* cipher/rijndael-vaes-avx2-amd64.S: Likewise. * cipher/salsa20-amd64.S: Likewise. * cipher/serpent-avx2-amd64.S: Likewise. * cipher/serpent-sse2-amd64.S: Likewise. * cipher/sm4-aesni-avx-amd64.S: Likewise. * cipher/sm4-aesni-avx2-amd64.S: Likewise. * cipher/sm4-gfni-avx2-amd64.S: Likewise. * cipher/sm4-gfni-avx512-amd64.S: Likewise. * cipher/twofish-amd64.S: Likewise. * cipher/twofish-avx2-amd64.S: Likewise. -- Signed-off-by: Jussi Kivilinna --- cipher/arcfour-amd64.S | 2 +- cipher/blowfish-amd64.S | 19 ++++++++-------- cipher/camellia-aesni-avx-amd64.S | 34 +++++++++++++++++++---------- cipher/camellia-aesni-avx2-amd64.h | 26 ++++++++++++---------- cipher/camellia-gfni-avx512-amd64.S | 23 ++++++++++--------- cipher/cast5-amd64.S | 15 +++++++------ cipher/chacha20-amd64-avx2.S | 14 +++++++----- cipher/chacha20-amd64-avx512.S | 8 ++++--- cipher/chacha20-amd64-ssse3.S | 18 +++++++++------ cipher/des-amd64.S | 17 ++++++++++----- cipher/rijndael-amd64.S | 5 +++-- cipher/rijndael-ssse3-amd64-asm.S | 8 ++++++- cipher/rijndael-vaes-avx2-amd64.S | 3 +++ cipher/salsa20-amd64.S | 7 +++--- cipher/serpent-avx2-amd64.S | 25 ++++++++++++--------- cipher/serpent-sse2-amd64.S | 19 ++++++++-------- cipher/sm4-aesni-avx-amd64.S | 28 ++++++++++++++---------- cipher/sm4-aesni-avx2-amd64.S | 24 ++++++++++++-------- cipher/sm4-gfni-avx2-amd64.S | 32 ++++++++++++++++----------- cipher/sm4-gfni-avx512-amd64.S | 5 ++++- cipher/twofish-amd64.S | 23 +++++++++---------- cipher/twofish-avx2-amd64.S | 24 +++++++++++--------- 22 files changed, 229 insertions(+), 150 deletions(-) diff --git a/cipher/arcfour-amd64.S b/cipher/arcfour-amd64.S index 2abd90a7..d4cd6083 100644 --- a/cipher/arcfour-amd64.S +++ b/cipher/arcfour-amd64.S @@ -21,7 +21,7 @@ #include "asm-common-amd64.h" .text -.align 16 +.align 64 .globl _gcry_arcfour_amd64 ELF(.type _gcry_arcfour_amd64, at function) _gcry_arcfour_amd64: diff --git a/cipher/blowfish-amd64.S b/cipher/blowfish-amd64.S index 2b4ffa1a..9db3dc1b 100644 --- a/cipher/blowfish-amd64.S +++ b/cipher/blowfish-amd64.S @@ -27,6 +27,7 @@ #include "asm-common-amd64.h" .text +.align 64 /* structure of BLOWFISH_context: */ #define s0 0 @@ -123,7 +124,7 @@ bswapq RX0; \ movq RX0, (RIO); -.align 8 +.align 16 ELF(.type __blowfish_enc_blk1, at function;) __blowfish_enc_blk1: @@ -155,7 +156,7 @@ __blowfish_enc_blk1: CFI_ENDPROC(); ELF(.size __blowfish_enc_blk1,.-__blowfish_enc_blk1;) -.align 8 +.align 16 .globl _gcry_blowfish_amd64_do_encrypt ELF(.type _gcry_blowfish_amd64_do_encrypt, at function;) @@ -186,7 +187,7 @@ _gcry_blowfish_amd64_do_encrypt: CFI_ENDPROC(); ELF(.size _gcry_blowfish_amd64_do_encrypt,.-_gcry_blowfish_amd64_do_encrypt;) -.align 8 +.align 16 .globl _gcry_blowfish_amd64_encrypt_block ELF(.type _gcry_blowfish_amd64_encrypt_block, at function;) @@ -214,7 +215,7 @@ _gcry_blowfish_amd64_encrypt_block: CFI_ENDPROC(); ELF(.size _gcry_blowfish_amd64_encrypt_block,.-_gcry_blowfish_amd64_encrypt_block;) -.align 8 +.align 16 .globl _gcry_blowfish_amd64_decrypt_block ELF(.type _gcry_blowfish_amd64_decrypt_block, at function;) @@ -342,7 +343,7 @@ ELF(.size _gcry_blowfish_amd64_decrypt_block,.-_gcry_blowfish_amd64_decrypt_bloc bswapq RX2; \ bswapq RX3; -.align 8 +.align 16 ELF(.type __blowfish_enc_blk4, at function;) __blowfish_enc_blk4: @@ -371,7 +372,7 @@ __blowfish_enc_blk4: CFI_ENDPROC(); ELF(.size __blowfish_enc_blk4,.-__blowfish_enc_blk4;) -.align 8 +.align 16 ELF(.type __blowfish_dec_blk4, at function;) __blowfish_dec_blk4: @@ -402,7 +403,7 @@ __blowfish_dec_blk4: CFI_ENDPROC(); ELF(.size 
__blowfish_dec_blk4,.-__blowfish_dec_blk4;) -.align 8 +.align 16 .globl _gcry_blowfish_amd64_ctr_enc ELF(.type _gcry_blowfish_amd64_ctr_enc, at function;) _gcry_blowfish_amd64_ctr_enc: @@ -472,7 +473,7 @@ _gcry_blowfish_amd64_ctr_enc: CFI_ENDPROC(); ELF(.size _gcry_blowfish_amd64_ctr_enc,.-_gcry_blowfish_amd64_ctr_enc;) -.align 8 +.align 16 .globl _gcry_blowfish_amd64_cbc_dec ELF(.type _gcry_blowfish_amd64_cbc_dec, at function;) _gcry_blowfish_amd64_cbc_dec: @@ -533,7 +534,7 @@ _gcry_blowfish_amd64_cbc_dec: CFI_ENDPROC(); ELF(.size _gcry_blowfish_amd64_cbc_dec,.-_gcry_blowfish_amd64_cbc_dec;) -.align 8 +.align 16 .globl _gcry_blowfish_amd64_cfb_dec ELF(.type _gcry_blowfish_amd64_cfb_dec, at function;) _gcry_blowfish_amd64_cfb_dec: diff --git a/cipher/camellia-aesni-avx-amd64.S b/cipher/camellia-aesni-avx-amd64.S index 5c304e57..9240d70b 100644 --- a/cipher/camellia-aesni-avx-amd64.S +++ b/cipher/camellia-aesni-avx-amd64.S @@ -619,7 +619,10 @@ vmovdqu y6, 14 * 16(rio); \ vmovdqu y7, 15 * 16(rio); -.text +SECTION_RODATA + +ELF(.type _camellia_aesni_avx_data, at object;) +_camellia_aesni_avx_data: .align 16 #define SHUFB_BYTES(idx) \ @@ -763,9 +766,11 @@ .L0f0f0f0f: .long 0x0f0f0f0f +.text +.align 64 -.align 8 ELF(.type __camellia_enc_blk16, at function;) +.align 16 __camellia_enc_blk16: /* input: @@ -826,7 +831,7 @@ __camellia_enc_blk16: CFI_ENDPROC(); ELF(.size __camellia_enc_blk16,.-__camellia_enc_blk16;) -.align 8 +.align 16 ELF(.type __camellia_dec_blk16, at function;) __camellia_dec_blk16: @@ -897,7 +902,7 @@ ELF(.size __camellia_dec_blk16,.-__camellia_dec_blk16;) vpslldq $8, tmp, tmp; \ vpsubq tmp, x, x; -.align 8 +.align 16 .globl _gcry_camellia_aesni_avx_ctr_enc ELF(.type _gcry_camellia_aesni_avx_ctr_enc, at function;) @@ -1025,7 +1030,7 @@ _gcry_camellia_aesni_avx_ctr_enc: CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx_ctr_enc,.-_gcry_camellia_aesni_avx_ctr_enc;) -.align 8 +.align 16 .globl _gcry_camellia_aesni_avx_cbc_dec ELF(.type _gcry_camellia_aesni_avx_cbc_dec, at function;) @@ -1098,7 +1103,7 @@ _gcry_camellia_aesni_avx_cbc_dec: CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx_cbc_dec,.-_gcry_camellia_aesni_avx_cbc_dec;) -.align 8 +.align 16 .globl _gcry_camellia_aesni_avx_cfb_dec ELF(.type _gcry_camellia_aesni_avx_cfb_dec, at function;) @@ -1180,7 +1185,7 @@ _gcry_camellia_aesni_avx_cfb_dec: CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx_cfb_dec,.-_gcry_camellia_aesni_avx_cfb_dec;) -.align 8 +.align 16 .globl _gcry_camellia_aesni_avx_ocb_enc ELF(.type _gcry_camellia_aesni_avx_ocb_enc, at function;) @@ -1332,7 +1337,7 @@ _gcry_camellia_aesni_avx_ocb_enc: CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx_ocb_enc,.-_gcry_camellia_aesni_avx_ocb_enc;) -.align 8 +.align 16 .globl _gcry_camellia_aesni_avx_ocb_dec ELF(.type _gcry_camellia_aesni_avx_ocb_dec, at function;) @@ -1503,7 +1508,7 @@ _gcry_camellia_aesni_avx_ocb_dec: CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx_ocb_dec,.-_gcry_camellia_aesni_avx_ocb_dec;) -.align 8 +.align 16 .globl _gcry_camellia_aesni_avx_ocb_auth ELF(.type _gcry_camellia_aesni_avx_ocb_auth, at function;) @@ -1720,6 +1725,10 @@ ELF(.size _gcry_camellia_aesni_avx_ocb_auth,.-_gcry_camellia_aesni_avx_ocb_auth; vpsllq $(64-(nror)), out, out; \ vpaddd t0, out, out; +SECTION_RODATA + +ELF(.type _camellia_aesni_avx_keysetup_data, at object;) +_camellia_aesni_avx_keysetup_data: .align 16 .Linv_shift_row_and_unpcklbw: @@ -1752,8 +1761,9 @@ ELF(.size _gcry_camellia_aesni_avx_ocb_auth,.-_gcry_camellia_aesni_avx_ocb_auth; .Lsigma6: .long 0xB3E6C1FD, 
0xB05688C2; +.text -.align 8 +.align 16 ELF(.type __camellia_avx_setup128, at function;) __camellia_avx_setup128: /* input: @@ -2100,7 +2110,7 @@ __camellia_avx_setup128: CFI_ENDPROC(); ELF(.size __camellia_avx_setup128,.-__camellia_avx_setup128;) -.align 8 +.align 16 ELF(.type __camellia_avx_setup256, at function;) __camellia_avx_setup256: @@ -2580,7 +2590,7 @@ __camellia_avx_setup256: CFI_ENDPROC(); ELF(.size __camellia_avx_setup256,.-__camellia_avx_setup256;) -.align 8 +.align 16 .globl _gcry_camellia_aesni_avx_keygen ELF(.type _gcry_camellia_aesni_avx_keygen, at function;) diff --git a/cipher/camellia-aesni-avx2-amd64.h b/cipher/camellia-aesni-avx2-amd64.h index 411e790f..46c2be81 100644 --- a/cipher/camellia-aesni-avx2-amd64.h +++ b/cipher/camellia-aesni-avx2-amd64.h @@ -784,7 +784,8 @@ vmovdqu y6, 14 * 32(rio); \ vmovdqu y7, 15 * 32(rio); -.text +SECTION_RODATA + .align 32 #define SHUFB_BYTES(idx) \ @@ -997,7 +998,10 @@ ELF(.type FUNC_NAME(_constants), at object;) ELF(.size FUNC_NAME(_constants),.-FUNC_NAME(_constants);) -.align 8 +.text +.align 64 + +.align 16 ELF(.type FUNC_NAME(enc_blk32), at function;) FUNC_NAME(enc_blk32): @@ -1059,7 +1063,7 @@ FUNC_NAME(enc_blk32): CFI_ENDPROC(); ELF(.size FUNC_NAME(enc_blk32),.-FUNC_NAME(enc_blk32);) -.align 8 +.align 16 ELF(.type FUNC_NAME(dec_blk32), at function;) FUNC_NAME(dec_blk32): @@ -1130,7 +1134,7 @@ ELF(.size FUNC_NAME(dec_blk32),.-FUNC_NAME(dec_blk32);) vpslldq $8, tmp, tmp; \ vpsubq tmp, x, x; -.align 8 +.align 16 .globl FUNC_NAME(ctr_enc) ELF(.type FUNC_NAME(ctr_enc), at function;) @@ -1325,7 +1329,7 @@ FUNC_NAME(ctr_enc): CFI_ENDPROC(); ELF(.size FUNC_NAME(ctr_enc),.-FUNC_NAME(ctr_enc);) -.align 8 +.align 16 .globl FUNC_NAME(cbc_dec) ELF(.type FUNC_NAME(cbc_dec), at function;) @@ -1400,7 +1404,7 @@ FUNC_NAME(cbc_dec): CFI_ENDPROC(); ELF(.size FUNC_NAME(cbc_dec),.-FUNC_NAME(cbc_dec);) -.align 8 +.align 16 .globl FUNC_NAME(cfb_dec) ELF(.type FUNC_NAME(cfb_dec), at function;) @@ -1482,7 +1486,7 @@ FUNC_NAME(cfb_dec): CFI_ENDPROC(); ELF(.size FUNC_NAME(cfb_dec),.-FUNC_NAME(cfb_dec);) -.align 8 +.align 16 .globl FUNC_NAME(ocb_enc) ELF(.type FUNC_NAME(ocb_enc), at function;) @@ -1654,7 +1658,7 @@ FUNC_NAME(ocb_enc): CFI_ENDPROC(); ELF(.size FUNC_NAME(ocb_enc),.-FUNC_NAME(ocb_enc);) -.align 8 +.align 16 .globl FUNC_NAME(ocb_dec) ELF(.type FUNC_NAME(ocb_dec), at function;) @@ -1849,7 +1853,7 @@ FUNC_NAME(ocb_dec): CFI_ENDPROC(); ELF(.size FUNC_NAME(ocb_dec),.-FUNC_NAME(ocb_dec);) -.align 8 +.align 16 .globl FUNC_NAME(ocb_auth) ELF(.type FUNC_NAME(ocb_auth), at function;) @@ -2018,7 +2022,7 @@ FUNC_NAME(ocb_auth): CFI_ENDPROC(); ELF(.size FUNC_NAME(ocb_auth),.-FUNC_NAME(ocb_auth);) -.align 8 +.align 16 .globl FUNC_NAME(enc_blk1_32) ELF(.type FUNC_NAME(enc_blk1_32), at function;) @@ -2126,7 +2130,7 @@ FUNC_NAME(enc_blk1_32): CFI_ENDPROC(); ELF(.size FUNC_NAME(enc_blk1_32),.-FUNC_NAME(enc_blk1_32);) -.align 8 +.align 16 .globl FUNC_NAME(dec_blk1_32) ELF(.type FUNC_NAME(dec_blk1_32), at function;) diff --git a/cipher/camellia-gfni-avx512-amd64.S b/cipher/camellia-gfni-avx512-amd64.S index 14725b4a..7a98a3ce 100644 --- a/cipher/camellia-gfni-avx512-amd64.S +++ b/cipher/camellia-gfni-avx512-amd64.S @@ -584,7 +584,7 @@ vmovdqu64 y6, 14 * 64(rio); \ vmovdqu64 y7, 15 * 64(rio); -.text +SECTION_RODATA #define SHUFB_BYTES(idx) \ 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx) @@ -691,7 +691,10 @@ ELF(.type _gcry_camellia_gfni_avx512__constants, at object;) ELF(.size _gcry_camellia_gfni_avx512__constants,.-_gcry_camellia_gfni_avx512__constants;) -.align 
8 +.text +.align 64 + +.align 16 ELF(.type __camellia_gfni_avx512_enc_blk64, at function;) __camellia_gfni_avx512_enc_blk64: @@ -751,7 +754,7 @@ __camellia_gfni_avx512_enc_blk64: CFI_ENDPROC(); ELF(.size __camellia_gfni_avx512_enc_blk64,.-__camellia_gfni_avx512_enc_blk64;) -.align 8 +.align 16 ELF(.type __camellia_gfni_avx512_dec_blk64, at function;) __camellia_gfni_avx512_dec_blk64: @@ -820,7 +823,7 @@ ELF(.size __camellia_gfni_avx512_dec_blk64,.-__camellia_gfni_avx512_dec_blk64;) kaddb %k1, %k1, %k1; \ vpaddq hi_counter1, out, out{%k1}; -.align 8 +.align 16 .globl _gcry_camellia_gfni_avx512_ctr_enc ELF(.type _gcry_camellia_gfni_avx512_ctr_enc, at function;) @@ -973,7 +976,7 @@ _gcry_camellia_gfni_avx512_ctr_enc: CFI_ENDPROC(); ELF(.size _gcry_camellia_gfni_avx512_ctr_enc,.-_gcry_camellia_gfni_avx512_ctr_enc;) -.align 8 +.align 16 .globl _gcry_camellia_gfni_avx512_cbc_dec ELF(.type _gcry_camellia_gfni_avx512_cbc_dec, at function;) @@ -1035,7 +1038,7 @@ _gcry_camellia_gfni_avx512_cbc_dec: CFI_ENDPROC(); ELF(.size _gcry_camellia_gfni_avx512_cbc_dec,.-_gcry_camellia_gfni_avx512_cbc_dec;) -.align 8 +.align 16 .globl _gcry_camellia_gfni_avx512_cfb_dec ELF(.type _gcry_camellia_gfni_avx512_cfb_dec, at function;) @@ -1108,7 +1111,7 @@ _gcry_camellia_gfni_avx512_cfb_dec: CFI_ENDPROC(); ELF(.size _gcry_camellia_gfni_avx512_cfb_dec,.-_gcry_camellia_gfni_avx512_cfb_dec;) -.align 8 +.align 16 .globl _gcry_camellia_gfni_avx512_ocb_enc ELF(.type _gcry_camellia_gfni_avx512_ocb_enc, at function;) @@ -1271,7 +1274,7 @@ _gcry_camellia_gfni_avx512_ocb_enc: CFI_ENDPROC(); ELF(.size _gcry_camellia_gfni_avx512_ocb_enc,.-_gcry_camellia_gfni_avx512_ocb_enc;) -.align 8 +.align 16 .globl _gcry_camellia_gfni_avx512_ocb_dec ELF(.type _gcry_camellia_gfni_avx512_ocb_dec, at function;) @@ -1440,7 +1443,7 @@ _gcry_camellia_gfni_avx512_ocb_dec: CFI_ENDPROC(); ELF(.size _gcry_camellia_gfni_avx512_ocb_dec,.-_gcry_camellia_gfni_avx512_ocb_dec;) -.align 8 +.align 16 .globl _gcry_camellia_gfni_avx512_enc_blk64 ELF(.type _gcry_camellia_gfni_avx512_enc_blk64, at function;) @@ -1504,7 +1507,7 @@ _gcry_camellia_gfni_avx512_enc_blk64: CFI_ENDPROC(); ELF(.size _gcry_camellia_gfni_avx512_enc_blk64,.-_gcry_camellia_gfni_avx512_enc_blk64;) -.align 8 +.align 16 .globl _gcry_camellia_gfni_avx512_dec_blk64 ELF(.type _gcry_camellia_gfni_avx512_dec_blk64, at function;) diff --git a/cipher/cast5-amd64.S b/cipher/cast5-amd64.S index a804654c..39171587 100644 --- a/cipher/cast5-amd64.S +++ b/cipher/cast5-amd64.S @@ -26,6 +26,7 @@ #include "asm-common-amd64.h" .text +.align 64 .extern _gcry_cast5_s1to4; @@ -173,7 +174,7 @@ rorq $32, RLR0; \ movq RLR0, (RIO); -.align 8 +.align 16 .globl _gcry_cast5_amd64_encrypt_block ELF(.type _gcry_cast5_amd64_encrypt_block, at function;) @@ -223,7 +224,7 @@ _gcry_cast5_amd64_encrypt_block: CFI_ENDPROC(); ELF(.size _gcry_cast5_amd64_encrypt_block,.-_gcry_cast5_amd64_encrypt_block;) -.align 8 +.align 16 .globl _gcry_cast5_amd64_decrypt_block ELF(.type _gcry_cast5_amd64_decrypt_block, at function;) @@ -373,7 +374,7 @@ ELF(.size _gcry_cast5_amd64_decrypt_block,.-_gcry_cast5_amd64_decrypt_block;) rorq $32, c; \ rorq $32, d; -.align 8 +.align 16 ELF(.type __cast5_enc_blk4, at function;) __cast5_enc_blk4: @@ -403,7 +404,7 @@ __cast5_enc_blk4: CFI_ENDPROC(); ELF(.size __cast5_enc_blk4,.-__cast5_enc_blk4;) -.align 8 +.align 16 ELF(.type __cast5_dec_blk4, at function;) __cast5_dec_blk4: @@ -435,7 +436,7 @@ __cast5_dec_blk4: ret_spec_stop; ELF(.size __cast5_dec_blk4,.-__cast5_dec_blk4;) -.align 8 +.align 16 .globl 
_gcry_cast5_amd64_ctr_enc ELF(.type _gcry_cast5_amd64_ctr_enc, at function;) _gcry_cast5_amd64_ctr_enc: @@ -512,7 +513,7 @@ _gcry_cast5_amd64_ctr_enc: CFI_ENDPROC(); ELF(.size _gcry_cast5_amd64_ctr_enc,.-_gcry_cast5_amd64_ctr_enc;) -.align 8 +.align 16 .globl _gcry_cast5_amd64_cbc_dec ELF(.type _gcry_cast5_amd64_cbc_dec, at function;) _gcry_cast5_amd64_cbc_dec: @@ -586,7 +587,7 @@ _gcry_cast5_amd64_cbc_dec: CFI_ENDPROC(); ELF(.size _gcry_cast5_amd64_cbc_dec,.-_gcry_cast5_amd64_cbc_dec;) -.align 8 +.align 16 .globl _gcry_cast5_amd64_cfb_dec ELF(.type _gcry_cast5_amd64_cfb_dec, at function;) _gcry_cast5_amd64_cfb_dec: diff --git a/cipher/chacha20-amd64-avx2.S b/cipher/chacha20-amd64-avx2.S index 9f2a036a..99ff7469 100644 --- a/cipher/chacha20-amd64-avx2.S +++ b/cipher/chacha20-amd64-avx2.S @@ -33,8 +33,6 @@ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) -.text - #include "asm-common-amd64.h" #include "asm-poly1305-amd64.h" @@ -157,8 +155,11 @@ PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \ ROTATE2(b1, b2, 7, tmp1); +SECTION_RODATA + +ELF(.type _chacha20_avx2_data, at object;) .align 32 -chacha20_data: +_chacha20_avx2_data: .Lshuf_rol16: .byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 .Lshuf_rol8: @@ -168,7 +169,10 @@ chacha20_data: .Lunsigned_cmp: .long 0x80000000 -.align 8 +.text +.align 64 + +.align 16 .globl _gcry_chacha20_amd64_avx2_blocks8 ELF(.type _gcry_chacha20_amd64_avx2_blocks8, at function;) @@ -333,7 +337,7 @@ ELF(.size _gcry_chacha20_amd64_avx2_blocks8, #define _ /*_*/ -.align 8 +.align 16 .globl _gcry_chacha20_poly1305_amd64_avx2_blocks8 ELF(.type _gcry_chacha20_poly1305_amd64_avx2_blocks8, at function;) diff --git a/cipher/chacha20-amd64-avx512.S b/cipher/chacha20-amd64-avx512.S index 4b183528..e39a505a 100644 --- a/cipher/chacha20-amd64-avx512.S +++ b/cipher/chacha20-amd64-avx512.S @@ -33,8 +33,6 @@ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) -.text - #include "asm-common-amd64.h" /* register macros */ @@ -269,6 +267,8 @@ ROTATE(x1, 7); ROTATE(y1, 7); \ WORD_SHUF(x1, shuf_x1); WORD_SHUF(y1, shuf_x1); +SECTION_RODATA + .align 64 ELF(.type _gcry_chacha20_amd64_avx512_data, at object;) _gcry_chacha20_amd64_avx512_data: @@ -286,7 +286,9 @@ _gcry_chacha20_amd64_avx512_data: .byte 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 ELF(.size _gcry_chacha20_amd64_avx512_data,.-_gcry_chacha20_amd64_avx512_data) -.align 16 +.text + +.align 64 .globl _gcry_chacha20_amd64_avx512_blocks ELF(.type _gcry_chacha20_amd64_avx512_blocks, at function;) _gcry_chacha20_amd64_avx512_blocks: diff --git a/cipher/chacha20-amd64-ssse3.S b/cipher/chacha20-amd64-ssse3.S index 6c737978..50c4755e 100644 --- a/cipher/chacha20-amd64-ssse3.S +++ b/cipher/chacha20-amd64-ssse3.S @@ -33,8 +33,6 @@ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) -.text - #include "asm-common-amd64.h" #include "asm-poly1305-amd64.h" @@ -151,7 +149,10 @@ PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \ ROTATE2(b1, b2, 7, tmp1, tmp2); -chacha20_data: +SECTION_RODATA + +ELF(.type _chacha20_ssse3_data, at object;) +_chacha20_ssse3_data: .align 16 .Lshuf_rol16: .byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 @@ -164,7 +165,10 @@ chacha20_data: .Lunsigned_cmp: .long 0x80000000,0x80000000,0x80000000,0x80000000 -.align 8 +.text +.align 64 + +.align 16 .globl _gcry_chacha20_amd64_ssse3_blocks4 ELF(.type _gcry_chacha20_amd64_ssse3_blocks4, at function;) @@ -366,7 +370,7 @@ ELF(.size 
_gcry_chacha20_amd64_ssse3_blocks4, ROTATE(x1, 7, tmp1); \ WORD_SHUF(x1, shuf_x1); -.align 8 +.align 16 .globl _gcry_chacha20_amd64_ssse3_blocks1 ELF(.type _gcry_chacha20_amd64_ssse3_blocks1, at function;) @@ -513,7 +517,7 @@ ELF(.size _gcry_chacha20_amd64_ssse3_blocks1, #define _ /*_*/ -.align 8 +.align 16 .globl _gcry_chacha20_poly1305_amd64_ssse3_blocks4 ELF(.type _gcry_chacha20_poly1305_amd64_ssse3_blocks4, at function;) @@ -781,7 +785,7 @@ ELF(.size _gcry_chacha20_poly1305_amd64_ssse3_blocks4, 2-way && 1-way stitched chacha20-poly1305 **********************************************************************/ -.align 8 +.align 16 .globl _gcry_chacha20_poly1305_amd64_ssse3_blocks1 ELF(.type _gcry_chacha20_poly1305_amd64_ssse3_blocks1, at function;) diff --git a/cipher/des-amd64.S b/cipher/des-amd64.S index c1bf9f29..44a8a90c 100644 --- a/cipher/des-amd64.S +++ b/cipher/des-amd64.S @@ -26,6 +26,7 @@ #include "asm-common-amd64.h" .text +.align 64 #define s1 0 #define s2 ((s1) + (64*8)) @@ -180,7 +181,7 @@ movl left##d, (io); \ movl right##d, 4(io); -.align 8 +.align 16 .globl _gcry_3des_amd64_crypt_block ELF(.type _gcry_3des_amd64_crypt_block, at function;) @@ -473,7 +474,7 @@ ELF(.size _gcry_3des_amd64_crypt_block,.-_gcry_3des_amd64_crypt_block;) movl left##d, (io); \ movl right##d, 4(io); -.align 8 +.align 16 ELF(.type _gcry_3des_amd64_crypt_blk3, at function;) _gcry_3des_amd64_crypt_blk3: /* input: @@ -548,7 +549,7 @@ _gcry_3des_amd64_crypt_blk3: CFI_ENDPROC(); ELF(.size _gcry_3des_amd64_crypt_blk3,.-_gcry_3des_amd64_crypt_blk3;) -.align 8 +.align 16 .globl _gcry_3des_amd64_cbc_dec ELF(.type _gcry_3des_amd64_cbc_dec, at function;) _gcry_3des_amd64_cbc_dec: @@ -603,6 +604,7 @@ _gcry_3des_amd64_cbc_dec: popq %rdx; /*src*/ CFI_POP_TMP_REG(); popq %rsi; /*dst*/ +.align 8 CFI_POP_TMP_REG(); bswapl RR0d; @@ -646,7 +648,7 @@ _gcry_3des_amd64_cbc_dec: CFI_ENDPROC(); ELF(.size _gcry_3des_amd64_cbc_dec,.-_gcry_3des_amd64_cbc_dec;) -.align 8 +.align 16 .globl _gcry_3des_amd64_ctr_enc ELF(.type _gcry_3des_amd64_ctr_enc, at function;) _gcry_3des_amd64_ctr_enc: @@ -744,7 +746,7 @@ _gcry_3des_amd64_ctr_enc: CFI_ENDPROC(); ELF(.size _gcry_3des_amd64_cbc_dec,.-_gcry_3des_amd64_cbc_dec;) -.align 8 +.align 16 .globl _gcry_3des_amd64_cfb_dec ELF(.type _gcry_3des_amd64_cfb_dec, at function;) _gcry_3des_amd64_cfb_dec: @@ -841,7 +843,12 @@ _gcry_3des_amd64_cfb_dec: CFI_ENDPROC(); ELF(.size _gcry_3des_amd64_cfb_dec,.-_gcry_3des_amd64_cfb_dec;) + +SECTION_RODATA +ELF(.type _des_amd64_data, at object;) + .align 16 +_des_amd64_data: .L_s1: .quad 0x0010100001010400, 0x0000000000000000 .quad 0x0000100000010000, 0x0010100001010404 diff --git a/cipher/rijndael-amd64.S b/cipher/rijndael-amd64.S index 6e3cc819..3d5a0bd2 100644 --- a/cipher/rijndael-amd64.S +++ b/cipher/rijndael-amd64.S @@ -26,6 +26,7 @@ #include "asm-common-amd64.h" .text +.align 64 /* table macros */ #define E0 (0) @@ -200,7 +201,7 @@ #define lastencround(round) \ do_lastencround((round) + 1); -.align 8 +.align 16 .globl _gcry_aes_amd64_encrypt_block ELF(.type _gcry_aes_amd64_encrypt_block, at function;) @@ -377,7 +378,7 @@ ELF(.size _gcry_aes_amd64_encrypt_block,.-_gcry_aes_amd64_encrypt_block;) #define lastdecround(round) \ do_lastdecround(round); -.align 8 +.align 16 .globl _gcry_aes_amd64_decrypt_block ELF(.type _gcry_aes_amd64_decrypt_block, at function;) diff --git a/cipher/rijndael-ssse3-amd64-asm.S b/cipher/rijndael-ssse3-amd64-asm.S index b98dca26..52cd0db2 100644 --- a/cipher/rijndael-ssse3-amd64-asm.S +++ b/cipher/rijndael-ssse3-amd64-asm.S 
@@ -43,10 +43,12 @@ #include "asm-common-amd64.h" .text +.align 64 ## ## _gcry_aes_ssse3_enc_preload ## +.align 16 ELF(.type _gcry_aes_ssse3_enc_preload, at function) .globl _gcry_aes_ssse3_enc_preload _gcry_aes_ssse3_enc_preload: @@ -68,6 +70,7 @@ ELF(.size _gcry_aes_ssse3_enc_preload,.-_gcry_aes_ssse3_enc_preload) ## ## _gcry_aes_ssse3_dec_preload ## +.align 16 ELF(.type _gcry_aes_ssse3_dec_preload, at function) .globl _gcry_aes_ssse3_dec_preload _gcry_aes_ssse3_dec_preload: @@ -689,8 +692,11 @@ ELF(.size _gcry_aes_ssse3_schedule_core,.-_gcry_aes_ssse3_schedule_core) ## ## ######################################################## +SECTION_RODATA + .align 16 -ELF(.type _aes_consts, at object) +ELF(.type _aes_ssse3_consts, at object) +_aes_ssse3_consts: .Laes_consts: _aes_consts: # s0F diff --git a/cipher/rijndael-vaes-avx2-amd64.S b/cipher/rijndael-vaes-avx2-amd64.S index 13fe7ab0..a801ad90 100644 --- a/cipher/rijndael-vaes-avx2-amd64.S +++ b/cipher/rijndael-vaes-avx2-amd64.S @@ -27,6 +27,7 @@ #include "asm-common-amd64.h" .text +.align 64 /********************************************************************** helper macros @@ -3313,6 +3314,8 @@ ELF(.size _gcry_vaes_avx2_ecb_crypt_amd64,.-_gcry_vaes_avx2_ecb_crypt_amd64) /********************************************************************** constants **********************************************************************/ +SECTION_RODATA + ELF(.type _gcry_vaes_consts, at object) _gcry_vaes_consts: .align 32 diff --git a/cipher/salsa20-amd64.S b/cipher/salsa20-amd64.S index 64626063..b681a060 100644 --- a/cipher/salsa20-amd64.S +++ b/cipher/salsa20-amd64.S @@ -31,8 +31,9 @@ #include "asm-common-amd64.h" .text +.align 64 -.align 8 +.align 16 .globl _gcry_salsa20_amd64_keysetup ELF(.type _gcry_salsa20_amd64_keysetup, at function;) _gcry_salsa20_amd64_keysetup: @@ -86,7 +87,7 @@ _gcry_salsa20_amd64_keysetup: ret_spec_stop CFI_ENDPROC(); -.align 8 +.align 16 .globl _gcry_salsa20_amd64_ivsetup ELF(.type _gcry_salsa20_amd64_ivsetup, at function;) _gcry_salsa20_amd64_ivsetup: @@ -102,7 +103,7 @@ _gcry_salsa20_amd64_ivsetup: ret_spec_stop CFI_ENDPROC(); -.align 8 +.align 16 .globl _gcry_salsa20_amd64_encrypt_blocks ELF(.type _gcry_salsa20_amd64_encrypt_blocks, at function;) _gcry_salsa20_amd64_encrypt_blocks: diff --git a/cipher/serpent-avx2-amd64.S b/cipher/serpent-avx2-amd64.S index 54ff61e4..4da0a228 100644 --- a/cipher/serpent-avx2-amd64.S +++ b/cipher/serpent-avx2-amd64.S @@ -400,8 +400,9 @@ BLOCK_XOR_KEY (nb0, nb1, nb2, nb3, nb4, round); .text +.align 64 -.align 8 +.align 16 ELF(.type __serpent_enc_blk16, at function;) __serpent_enc_blk16: /* input: @@ -491,7 +492,7 @@ __serpent_enc_blk16: CFI_ENDPROC(); ELF(.size __serpent_enc_blk16,.-__serpent_enc_blk16;) -.align 8 +.align 16 ELF(.type __serpent_dec_blk16, at function;) __serpent_dec_blk16: /* input: @@ -583,7 +584,7 @@ __serpent_dec_blk16: CFI_ENDPROC(); ELF(.size __serpent_dec_blk16,.-__serpent_dec_blk16;) -.align 8 +.align 16 .globl _gcry_serpent_avx2_blk16 ELF(.type _gcry_serpent_avx2_blk16, at function;) _gcry_serpent_avx2_blk16: @@ -639,7 +640,7 @@ ELF(.size _gcry_serpent_avx2_blk16,.-_gcry_serpent_avx2_blk16;) vpslldq $8, tmp, tmp; \ vpsubq tmp, x, x; -.align 8 +.align 16 .globl _gcry_serpent_avx2_ctr_enc ELF(.type _gcry_serpent_avx2_ctr_enc, at function;) _gcry_serpent_avx2_ctr_enc: @@ -751,7 +752,7 @@ _gcry_serpent_avx2_ctr_enc: CFI_ENDPROC(); ELF(.size _gcry_serpent_avx2_ctr_enc,.-_gcry_serpent_avx2_ctr_enc;) -.align 8 +.align 16 .globl _gcry_serpent_avx2_cbc_dec ELF(.type 
_gcry_serpent_avx2_cbc_dec, at function;) _gcry_serpent_avx2_cbc_dec: @@ -804,7 +805,7 @@ _gcry_serpent_avx2_cbc_dec: CFI_ENDPROC(); ELF(.size _gcry_serpent_avx2_cbc_dec,.-_gcry_serpent_avx2_cbc_dec;) -.align 8 +.align 16 .globl _gcry_serpent_avx2_cfb_dec ELF(.type _gcry_serpent_avx2_cfb_dec, at function;) _gcry_serpent_avx2_cfb_dec: @@ -859,7 +860,7 @@ _gcry_serpent_avx2_cfb_dec: CFI_ENDPROC(); ELF(.size _gcry_serpent_avx2_cfb_dec,.-_gcry_serpent_avx2_cfb_dec;) -.align 8 +.align 16 .globl _gcry_serpent_avx2_ocb_enc ELF(.type _gcry_serpent_avx2_ocb_enc, at function;) @@ -973,7 +974,7 @@ _gcry_serpent_avx2_ocb_enc: CFI_ENDPROC(); ELF(.size _gcry_serpent_avx2_ocb_enc,.-_gcry_serpent_avx2_ocb_enc;) -.align 8 +.align 16 .globl _gcry_serpent_avx2_ocb_dec ELF(.type _gcry_serpent_avx2_ocb_dec, at function;) @@ -1097,7 +1098,7 @@ _gcry_serpent_avx2_ocb_dec: CFI_ENDPROC(); ELF(.size _gcry_serpent_avx2_ocb_dec,.-_gcry_serpent_avx2_ocb_dec;) -.align 8 +.align 16 .globl _gcry_serpent_avx2_ocb_auth ELF(.type _gcry_serpent_avx2_ocb_auth, at function;) @@ -1200,9 +1201,13 @@ _gcry_serpent_avx2_ocb_auth: CFI_ENDPROC(); ELF(.size _gcry_serpent_avx2_ocb_auth,.-_gcry_serpent_avx2_ocb_auth;) -.align 16 + +SECTION_RODATA +ELF(.type _serpent_avx2_consts, at object) +_serpent_avx2_consts: /* For CTR-mode IV byteswap */ +.align 16 .Lbswap128_mask: .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 diff --git a/cipher/serpent-sse2-amd64.S b/cipher/serpent-sse2-amd64.S index 01723a2a..e7a250d9 100644 --- a/cipher/serpent-sse2-amd64.S +++ b/cipher/serpent-sse2-amd64.S @@ -422,8 +422,9 @@ BLOCK_XOR_KEY (nb0, nb1, nb2, nb3, nb4, round); .text +.align 64 -.align 8 +.align 16 ELF(.type __serpent_enc_blk8, at function;) __serpent_enc_blk8: /* input: @@ -513,7 +514,7 @@ __serpent_enc_blk8: CFI_ENDPROC(); ELF(.size __serpent_enc_blk8,.-__serpent_enc_blk8;) -.align 8 +.align 16 ELF(.type __serpent_dec_blk8, at function;) __serpent_dec_blk8: /* input: @@ -605,7 +606,7 @@ __serpent_dec_blk8: CFI_ENDPROC(); ELF(.size __serpent_dec_blk8,.-__serpent_dec_blk8;) -.align 8 +.align 16 .globl _gcry_serpent_sse2_blk8 ELF(.type _gcry_serpent_sse2_blk8, at function;) _gcry_serpent_sse2_blk8: @@ -670,7 +671,7 @@ _gcry_serpent_sse2_blk8: CFI_ENDPROC(); ELF(.size _gcry_serpent_sse2_blk8,.-_gcry_serpent_sse2_blk8;) -.align 8 +.align 16 .globl _gcry_serpent_sse2_ctr_enc ELF(.type _gcry_serpent_sse2_ctr_enc, at function;) _gcry_serpent_sse2_ctr_enc: @@ -802,7 +803,7 @@ _gcry_serpent_sse2_ctr_enc: CFI_ENDPROC(); ELF(.size _gcry_serpent_sse2_ctr_enc,.-_gcry_serpent_sse2_ctr_enc;) -.align 8 +.align 16 .globl _gcry_serpent_sse2_cbc_dec ELF(.type _gcry_serpent_sse2_cbc_dec, at function;) _gcry_serpent_sse2_cbc_dec: @@ -865,7 +866,7 @@ _gcry_serpent_sse2_cbc_dec: CFI_ENDPROC(); ELF(.size _gcry_serpent_sse2_cbc_dec,.-_gcry_serpent_sse2_cbc_dec;) -.align 8 +.align 16 .globl _gcry_serpent_sse2_cfb_dec ELF(.type _gcry_serpent_sse2_cfb_dec, at function;) _gcry_serpent_sse2_cfb_dec: @@ -931,7 +932,7 @@ _gcry_serpent_sse2_cfb_dec: CFI_ENDPROC(); ELF(.size _gcry_serpent_sse2_cfb_dec,.-_gcry_serpent_sse2_cfb_dec;) -.align 8 +.align 16 .globl _gcry_serpent_sse2_ocb_enc ELF(.type _gcry_serpent_sse2_ocb_enc, at function;) @@ -1045,7 +1046,7 @@ _gcry_serpent_sse2_ocb_enc: CFI_ENDPROC(); ELF(.size _gcry_serpent_sse2_ocb_enc,.-_gcry_serpent_sse2_ocb_enc;) -.align 8 +.align 16 .globl _gcry_serpent_sse2_ocb_dec ELF(.type _gcry_serpent_sse2_ocb_dec, at function;) @@ -1169,7 +1170,7 @@ _gcry_serpent_sse2_ocb_dec: CFI_ENDPROC(); ELF(.size 
_gcry_serpent_sse2_ocb_dec,.-_gcry_serpent_sse2_ocb_dec;) -.align 8 +.align 16 .globl _gcry_serpent_sse2_ocb_auth ELF(.type _gcry_serpent_sse2_ocb_auth, at function;) diff --git a/cipher/sm4-aesni-avx-amd64.S b/cipher/sm4-aesni-avx-amd64.S index 7a99e070..bb0d20c6 100644 --- a/cipher/sm4-aesni-avx-amd64.S +++ b/cipher/sm4-aesni-avx-amd64.S @@ -97,9 +97,12 @@ 4-way && 8-way SM4 with AES-NI and AVX **********************************************************************/ -.text +SECTION_RODATA .align 16 +ELF(.type _sm4_aesni_avx_consts, at object) +_sm4_aesni_avx_consts: + /* * Following four affine transform look-up tables are from work by * Markku-Juhani O. Saarinen, at https://github.com/mjosaarinen/sm4ni @@ -152,7 +155,10 @@ .L0f0f0f0f: .long 0x0f0f0f0f -.align 8 +.text +.align 64 + +.align 16 .globl _gcry_sm4_aesni_avx_expand_key ELF(.type _gcry_sm4_aesni_avx_expand_key, at function;) _gcry_sm4_aesni_avx_expand_key: @@ -244,7 +250,7 @@ _gcry_sm4_aesni_avx_expand_key: CFI_ENDPROC(); ELF(.size _gcry_sm4_aesni_avx_expand_key,.-_gcry_sm4_aesni_avx_expand_key;) -.align 8 +.align 16 ELF(.type sm4_aesni_avx_crypt_blk1_4, at function;) sm4_aesni_avx_crypt_blk1_4: /* input: @@ -349,7 +355,7 @@ sm4_aesni_avx_crypt_blk1_4: CFI_ENDPROC(); ELF(.size sm4_aesni_avx_crypt_blk1_4,.-sm4_aesni_avx_crypt_blk1_4;) -.align 8 +.align 16 ELF(.type __sm4_crypt_blk8, at function;) __sm4_crypt_blk8: /* input: @@ -458,7 +464,7 @@ __sm4_crypt_blk8: CFI_ENDPROC(); ELF(.size __sm4_crypt_blk8,.-__sm4_crypt_blk8;) -.align 8 +.align 16 .globl _gcry_sm4_aesni_avx_crypt_blk1_8 ELF(.type _gcry_sm4_aesni_avx_crypt_blk1_8, at function;) _gcry_sm4_aesni_avx_crypt_blk1_8: @@ -512,7 +518,7 @@ _gcry_sm4_aesni_avx_crypt_blk1_8: CFI_ENDPROC(); ELF(.size _gcry_sm4_aesni_avx_crypt_blk1_8,.-_gcry_sm4_aesni_avx_crypt_blk1_8;) -.align 8 +.align 16 .globl _gcry_sm4_aesni_avx_ctr_enc ELF(.type _gcry_sm4_aesni_avx_ctr_enc, at function;) _gcry_sm4_aesni_avx_ctr_enc: @@ -586,7 +592,7 @@ _gcry_sm4_aesni_avx_ctr_enc: CFI_ENDPROC(); ELF(.size _gcry_sm4_aesni_avx_ctr_enc,.-_gcry_sm4_aesni_avx_ctr_enc;) -.align 8 +.align 16 .globl _gcry_sm4_aesni_avx_cbc_dec ELF(.type _gcry_sm4_aesni_avx_cbc_dec, at function;) _gcry_sm4_aesni_avx_cbc_dec: @@ -635,7 +641,7 @@ _gcry_sm4_aesni_avx_cbc_dec: CFI_ENDPROC(); ELF(.size _gcry_sm4_aesni_avx_cbc_dec,.-_gcry_sm4_aesni_avx_cbc_dec;) -.align 8 +.align 16 .globl _gcry_sm4_aesni_avx_cfb_dec ELF(.type _gcry_sm4_aesni_avx_cfb_dec, at function;) _gcry_sm4_aesni_avx_cfb_dec: @@ -687,7 +693,7 @@ _gcry_sm4_aesni_avx_cfb_dec: CFI_ENDPROC(); ELF(.size _gcry_sm4_aesni_avx_cfb_dec,.-_gcry_sm4_aesni_avx_cfb_dec;) -.align 8 +.align 16 .globl _gcry_sm4_aesni_avx_ocb_enc ELF(.type _gcry_sm4_aesni_avx_ocb_enc, at function;) @@ -786,7 +792,7 @@ _gcry_sm4_aesni_avx_ocb_enc: CFI_ENDPROC(); ELF(.size _gcry_sm4_aesni_avx_ocb_enc,.-_gcry_sm4_aesni_avx_ocb_enc;) -.align 8 +.align 16 .globl _gcry_sm4_aesni_avx_ocb_dec ELF(.type _gcry_sm4_aesni_avx_ocb_dec, at function;) @@ -895,7 +901,7 @@ _gcry_sm4_aesni_avx_ocb_dec: CFI_ENDPROC(); ELF(.size _gcry_sm4_aesni_avx_ocb_dec,.-_gcry_sm4_aesni_avx_ocb_dec;) -.align 8 +.align 16 .globl _gcry_sm4_aesni_avx_ocb_auth ELF(.type _gcry_sm4_aesni_avx_ocb_auth, at function;) diff --git a/cipher/sm4-aesni-avx2-amd64.S b/cipher/sm4-aesni-avx2-amd64.S index e09fed8f..db94be90 100644 --- a/cipher/sm4-aesni-avx2-amd64.S +++ b/cipher/sm4-aesni-avx2-amd64.S @@ -118,9 +118,12 @@ 16-way SM4 with AES-NI and AVX **********************************************************************/ -.text +SECTION_RODATA 
.align 16 +ELF(.type _sm4_aesni_avx2_consts, at object) +_sm4_aesni_avx2_consts: + /* * Following four affine transform look-up tables are from work by * Markku-Juhani O. Saarinen, at https://github.com/mjosaarinen/sm4ni @@ -173,7 +176,10 @@ .L0f0f0f0f: .long 0x0f0f0f0f -.align 8 +.text +.align 64 + +.align 16 ELF(.type __sm4_crypt_blk16, at function;) __sm4_crypt_blk16: /* input: @@ -288,7 +294,7 @@ __sm4_crypt_blk16: CFI_ENDPROC(); ELF(.size __sm4_crypt_blk16,.-__sm4_crypt_blk16;) -.align 8 +.align 16 .globl _gcry_sm4_aesni_avx2_crypt_blk1_16 ELF(.type _gcry_sm4_aesni_avx2_crypt_blk1_16, at function;) _gcry_sm4_aesni_avx2_crypt_blk1_16: @@ -354,7 +360,7 @@ ELF(.size _gcry_sm4_aesni_avx2_crypt_blk1_16,.-_gcry_sm4_aesni_avx2_crypt_blk1_1 vpslldq $8, tmp, tmp; \ vpsubq tmp, x, x; -.align 8 +.align 16 .globl _gcry_sm4_aesni_avx2_ctr_enc ELF(.type _gcry_sm4_aesni_avx2_ctr_enc, at function;) _gcry_sm4_aesni_avx2_ctr_enc: @@ -464,7 +470,7 @@ _gcry_sm4_aesni_avx2_ctr_enc: CFI_ENDPROC(); ELF(.size _gcry_sm4_aesni_avx2_ctr_enc,.-_gcry_sm4_aesni_avx2_ctr_enc;) -.align 8 +.align 16 .globl _gcry_sm4_aesni_avx2_cbc_dec ELF(.type _gcry_sm4_aesni_avx2_cbc_dec, at function;) _gcry_sm4_aesni_avx2_cbc_dec: @@ -515,7 +521,7 @@ _gcry_sm4_aesni_avx2_cbc_dec: CFI_ENDPROC(); ELF(.size _gcry_sm4_aesni_avx2_cbc_dec,.-_gcry_sm4_aesni_avx2_cbc_dec;) -.align 8 +.align 16 .globl _gcry_sm4_aesni_avx2_cfb_dec ELF(.type _gcry_sm4_aesni_avx2_cfb_dec, at function;) _gcry_sm4_aesni_avx2_cfb_dec: @@ -568,7 +574,7 @@ _gcry_sm4_aesni_avx2_cfb_dec: CFI_ENDPROC(); ELF(.size _gcry_sm4_aesni_avx2_cfb_dec,.-_gcry_sm4_aesni_avx2_cfb_dec;) -.align 8 +.align 16 .globl _gcry_sm4_aesni_avx2_ocb_enc ELF(.type _gcry_sm4_aesni_avx2_ocb_enc, at function;) @@ -680,7 +686,7 @@ _gcry_sm4_aesni_avx2_ocb_enc: CFI_ENDPROC(); ELF(.size _gcry_sm4_aesni_avx2_ocb_enc,.-_gcry_sm4_aesni_avx2_ocb_enc;) -.align 8 +.align 16 .globl _gcry_sm4_aesni_avx2_ocb_dec ELF(.type _gcry_sm4_aesni_avx2_ocb_dec, at function;) @@ -802,7 +808,7 @@ _gcry_sm4_aesni_avx2_ocb_dec: CFI_ENDPROC(); ELF(.size _gcry_sm4_aesni_avx2_ocb_dec,.-_gcry_sm4_aesni_avx2_ocb_dec;) -.align 8 +.align 16 .globl _gcry_sm4_aesni_avx2_ocb_auth ELF(.type _gcry_sm4_aesni_avx2_ocb_auth, at function;) diff --git a/cipher/sm4-gfni-avx2-amd64.S b/cipher/sm4-gfni-avx2-amd64.S index 4ec0ea39..7c87400e 100644 --- a/cipher/sm4-gfni-avx2-amd64.S +++ b/cipher/sm4-gfni-avx2-amd64.S @@ -87,9 +87,12 @@ #define RB2x %xmm14 #define RB3x %xmm15 -.text +SECTION_RODATA .align 32 +ELF(.type _sm4_gfni_avx2_consts, at object) +_sm4_gfni_avx2_consts: + /* Affine transform, SM4 field to AES field */ .Lpre_affine_s: .byte 0x52, 0xbc, 0x2d, 0x02, 0x9e, 0x25, 0xac, 0x34 @@ -133,7 +136,10 @@ .Lbswap32_mask: .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 -.align 8 +.text +.align 64 + +.align 16 .globl _gcry_sm4_gfni_avx2_expand_key ELF(.type _gcry_sm4_gfni_avx2_expand_key, at function;) _gcry_sm4_gfni_avx2_expand_key: @@ -216,7 +222,7 @@ _gcry_sm4_gfni_avx2_expand_key: CFI_ENDPROC(); ELF(.size _gcry_sm4_gfni_avx2_expand_key,.-_gcry_sm4_gfni_avx2_expand_key;) -.align 8 +.align 16 ELF(.type sm4_gfni_avx2_crypt_blk1_4, at function;) sm4_gfni_avx2_crypt_blk1_4: /* input: @@ -314,7 +320,7 @@ sm4_gfni_avx2_crypt_blk1_4: CFI_ENDPROC(); ELF(.size sm4_gfni_avx2_crypt_blk1_4,.-sm4_gfni_avx2_crypt_blk1_4;) -.align 8 +.align 16 ELF(.type __sm4_gfni_crypt_blk8, at function;) __sm4_gfni_crypt_blk8: /* input: @@ -415,7 +421,7 @@ __sm4_gfni_crypt_blk8: CFI_ENDPROC(); ELF(.size __sm4_gfni_crypt_blk8,.-__sm4_gfni_crypt_blk8;) 
-.align 8 +.align 16 ELF(.type _gcry_sm4_gfni_avx2_crypt_blk1_8, at function;) _gcry_sm4_gfni_avx2_crypt_blk1_8: /* input: @@ -472,7 +478,7 @@ ELF(.size _gcry_sm4_gfni_avx2_crypt_blk1_8,.-_gcry_sm4_gfni_avx2_crypt_blk1_8;) 16-way SM4 with GFNI and AVX2 **********************************************************************/ -.align 8 +.align 16 ELF(.type __sm4_gfni_crypt_blk16, at function;) __sm4_gfni_crypt_blk16: /* input: @@ -573,7 +579,7 @@ __sm4_gfni_crypt_blk16: CFI_ENDPROC(); ELF(.size __sm4_gfni_crypt_blk16,.-__sm4_gfni_crypt_blk16;) -.align 8 +.align 16 .globl _gcry_sm4_gfni_avx2_crypt_blk1_16 ELF(.type _gcry_sm4_gfni_avx2_crypt_blk1_16, at function;) _gcry_sm4_gfni_avx2_crypt_blk1_16: @@ -641,7 +647,7 @@ ELF(.size _gcry_sm4_gfni_avx2_crypt_blk1_16,.-_gcry_sm4_gfni_avx2_crypt_blk1_16; vpslldq $8, tmp, tmp; \ vpsubq tmp, x, x; -.align 8 +.align 16 .globl _gcry_sm4_gfni_avx2_ctr_enc ELF(.type _gcry_sm4_gfni_avx2_ctr_enc, at function;) _gcry_sm4_gfni_avx2_ctr_enc: @@ -751,7 +757,7 @@ _gcry_sm4_gfni_avx2_ctr_enc: CFI_ENDPROC(); ELF(.size _gcry_sm4_gfni_avx2_ctr_enc,.-_gcry_sm4_gfni_avx2_ctr_enc;) -.align 8 +.align 16 .globl _gcry_sm4_gfni_avx2_cbc_dec ELF(.type _gcry_sm4_gfni_avx2_cbc_dec, at function;) _gcry_sm4_gfni_avx2_cbc_dec: @@ -802,7 +808,7 @@ _gcry_sm4_gfni_avx2_cbc_dec: CFI_ENDPROC(); ELF(.size _gcry_sm4_gfni_avx2_cbc_dec,.-_gcry_sm4_gfni_avx2_cbc_dec;) -.align 8 +.align 16 .globl _gcry_sm4_gfni_avx2_cfb_dec ELF(.type _gcry_sm4_gfni_avx2_cfb_dec, at function;) _gcry_sm4_gfni_avx2_cfb_dec: @@ -855,7 +861,7 @@ _gcry_sm4_gfni_avx2_cfb_dec: CFI_ENDPROC(); ELF(.size _gcry_sm4_gfni_avx2_cfb_dec,.-_gcry_sm4_gfni_avx2_cfb_dec;) -.align 8 +.align 16 .globl _gcry_sm4_gfni_avx2_ocb_enc ELF(.type _gcry_sm4_gfni_avx2_ocb_enc, at function;) @@ -967,7 +973,7 @@ _gcry_sm4_gfni_avx2_ocb_enc: CFI_ENDPROC(); ELF(.size _gcry_sm4_gfni_avx2_ocb_enc,.-_gcry_sm4_gfni_avx2_ocb_enc;) -.align 8 +.align 16 .globl _gcry_sm4_gfni_avx2_ocb_dec ELF(.type _gcry_sm4_gfni_avx2_ocb_dec, at function;) @@ -1089,7 +1095,7 @@ _gcry_sm4_gfni_avx2_ocb_dec: CFI_ENDPROC(); ELF(.size _gcry_sm4_gfni_avx2_ocb_dec,.-_gcry_sm4_gfni_avx2_ocb_dec;) -.align 8 +.align 16 .globl _gcry_sm4_gfni_avx2_ocb_auth ELF(.type _gcry_sm4_gfni_avx2_ocb_auth, at function;) diff --git a/cipher/sm4-gfni-avx512-amd64.S b/cipher/sm4-gfni-avx512-amd64.S index 0f9899d4..00a1c921 100644 --- a/cipher/sm4-gfni-avx512-amd64.S +++ b/cipher/sm4-gfni-avx512-amd64.S @@ -103,7 +103,7 @@ #define RB2z %zmm14 #define RB3z %zmm15 -.text +SECTION_RODATA .align 32 /* Affine transform, SM4 field to AES field */ @@ -146,6 +146,9 @@ .quad 2, 0 .quad 3, 0 +.text +.align 64 + .align 16 .globl _gcry_sm4_gfni_avx512_expand_key ELF(.type _gcry_sm4_gfni_avx512_expand_key, at function;) diff --git a/cipher/twofish-amd64.S b/cipher/twofish-amd64.S index 8998d296..b19a5b1b 100644 --- a/cipher/twofish-amd64.S +++ b/cipher/twofish-amd64.S @@ -26,6 +26,7 @@ #include "asm-common-amd64.h" .text +.align 64 /* structure of TWOFISH_context: */ #define s0 0 @@ -161,7 +162,7 @@ xorl (w + 4 * (m))(CTX), x; \ movl x, (4 * (n))(out); -.align 8 +.align 16 .globl _gcry_twofish_amd64_encrypt_block ELF(.type _gcry_twofish_amd64_encrypt_block, at function;) @@ -215,7 +216,7 @@ _gcry_twofish_amd64_encrypt_block: CFI_ENDPROC(); ELF(.size _gcry_twofish_amd64_encrypt_block,.-_gcry_twofish_amd64_encrypt_block;) -.align 8 +.align 16 .globl _gcry_twofish_amd64_decrypt_block ELF(.type _gcry_twofish_amd64_decrypt_block, at function;) @@ -486,7 +487,7 @@ ELF(.size 
_gcry_twofish_amd64_encrypt_block,.-_gcry_twofish_amd64_encrypt_block; rorq $32, RAB2; \ outunpack3(RAB, 2); -.align 8 +.align 16 ELF(.type __twofish_enc_blk3, at function;) __twofish_enc_blk3: @@ -515,7 +516,7 @@ __twofish_enc_blk3: CFI_ENDPROC(); ELF(.size __twofish_enc_blk3,.-__twofish_enc_blk3;) -.align 8 +.align 16 ELF(.type __twofish_dec_blk3, at function;) __twofish_dec_blk3: @@ -544,7 +545,7 @@ __twofish_dec_blk3: CFI_ENDPROC(); ELF(.size __twofish_dec_blk3,.-__twofish_dec_blk3;) -.align 8 +.align 16 .globl _gcry_twofish_amd64_blk3 ELF(.type _gcry_twofish_amd64_blk3, at function;) _gcry_twofish_amd64_blk3: @@ -618,7 +619,7 @@ _gcry_twofish_amd64_blk3: CFI_ENDPROC(); ELF(.size _gcry_twofish_amd64_blk3,.-_gcry_twofish_amd64_blk3;) -.align 8 +.align 16 .globl _gcry_twofish_amd64_ctr_enc ELF(.type _gcry_twofish_amd64_ctr_enc, at function;) _gcry_twofish_amd64_ctr_enc: @@ -719,7 +720,7 @@ _gcry_twofish_amd64_ctr_enc: CFI_ENDPROC(); ELF(.size _gcry_twofish_amd64_ctr_enc,.-_gcry_twofish_amd64_ctr_enc;) -.align 8 +.align 16 .globl _gcry_twofish_amd64_cbc_dec ELF(.type _gcry_twofish_amd64_cbc_dec, at function;) _gcry_twofish_amd64_cbc_dec: @@ -804,7 +805,7 @@ _gcry_twofish_amd64_cbc_dec: CFI_ENDPROC(); ELF(.size _gcry_twofish_amd64_cbc_dec,.-_gcry_twofish_amd64_cbc_dec;) -.align 8 +.align 16 .globl _gcry_twofish_amd64_cfb_dec ELF(.type _gcry_twofish_amd64_cfb_dec, at function;) _gcry_twofish_amd64_cfb_dec: @@ -889,7 +890,7 @@ _gcry_twofish_amd64_cfb_dec: CFI_ENDPROC(); ELF(.size _gcry_twofish_amd64_cfb_dec,.-_gcry_twofish_amd64_cfb_dec;) -.align 8 +.align 16 .globl _gcry_twofish_amd64_ocb_enc ELF(.type _gcry_twofish_amd64_ocb_enc, at function;) _gcry_twofish_amd64_ocb_enc: @@ -1015,7 +1016,7 @@ _gcry_twofish_amd64_ocb_enc: CFI_ENDPROC(); ELF(.size _gcry_twofish_amd64_ocb_enc,.-_gcry_twofish_amd64_ocb_enc;) -.align 8 +.align 16 .globl _gcry_twofish_amd64_ocb_dec ELF(.type _gcry_twofish_amd64_ocb_dec, at function;) _gcry_twofish_amd64_ocb_dec: @@ -1149,7 +1150,7 @@ _gcry_twofish_amd64_ocb_dec: CFI_ENDPROC(); ELF(.size _gcry_twofish_amd64_ocb_dec,.-_gcry_twofish_amd64_ocb_dec;) -.align 8 +.align 16 .globl _gcry_twofish_amd64_ocb_auth ELF(.type _gcry_twofish_amd64_ocb_auth, at function;) _gcry_twofish_amd64_ocb_auth: diff --git a/cipher/twofish-avx2-amd64.S b/cipher/twofish-avx2-amd64.S index 0cb9a64c..19fe0d9c 100644 --- a/cipher/twofish-avx2-amd64.S +++ b/cipher/twofish-avx2-amd64.S @@ -27,6 +27,7 @@ #include "asm-common-amd64.h" .text +.align 64 /* structure of TWOFISH_context: */ #define s0 0 @@ -402,7 +403,7 @@ outunpack_dec8(a ## 0, b ## 0, c ## 0, d ## 0); \ outunpack_dec8(a ## 1, b ## 1, c ## 1, d ## 1); -.align 8 +.align 16 ELF(.type __twofish_enc_blk16, at function;) __twofish_enc_blk16: /* input: @@ -435,7 +436,7 @@ __twofish_enc_blk16: CFI_ENDPROC(); ELF(.size __twofish_enc_blk16,.-__twofish_enc_blk16;) -.align 8 +.align 16 ELF(.type __twofish_dec_blk16, at function;) __twofish_dec_blk16: /* input: @@ -468,7 +469,7 @@ __twofish_dec_blk16: CFI_ENDPROC(); ELF(.size __twofish_dec_blk16,.-__twofish_dec_blk16;) -.align 8 +.align 16 .globl _gcry_twofish_avx2_blk16 ELF(.type _gcry_twofish_avx2_blk16, at function;) _gcry_twofish_avx2_blk16: @@ -520,7 +521,7 @@ ELF(.size _gcry_twofish_avx2_blk16,.-_gcry_twofish_avx2_blk16;) vpslldq $8, tmp, tmp; \ vpsubq tmp, x, x; -.align 8 +.align 16 .globl _gcry_twofish_avx2_ctr_enc ELF(.type _gcry_twofish_avx2_ctr_enc, at function;) _gcry_twofish_avx2_ctr_enc: @@ -632,7 +633,7 @@ _gcry_twofish_avx2_ctr_enc: CFI_ENDPROC(); ELF(.size 
_gcry_twofish_avx2_ctr_enc,.-_gcry_twofish_avx2_ctr_enc;) -.align 8 +.align 16 .globl _gcry_twofish_avx2_cbc_dec ELF(.type _gcry_twofish_avx2_cbc_dec, at function;) _gcry_twofish_avx2_cbc_dec: @@ -685,7 +686,7 @@ _gcry_twofish_avx2_cbc_dec: CFI_ENDPROC(); ELF(.size _gcry_twofish_avx2_cbc_dec,.-_gcry_twofish_avx2_cbc_dec;) -.align 8 +.align 16 .globl _gcry_twofish_avx2_cfb_dec ELF(.type _gcry_twofish_avx2_cfb_dec, at function;) _gcry_twofish_avx2_cfb_dec: @@ -740,7 +741,7 @@ _gcry_twofish_avx2_cfb_dec: CFI_ENDPROC(); ELF(.size _gcry_twofish_avx2_cfb_dec,.-_gcry_twofish_avx2_cfb_dec;) -.align 8 +.align 16 .globl _gcry_twofish_avx2_ocb_enc ELF(.type _gcry_twofish_avx2_ocb_enc, at function;) @@ -854,7 +855,7 @@ _gcry_twofish_avx2_ocb_enc: CFI_ENDPROC(); ELF(.size _gcry_twofish_avx2_ocb_enc,.-_gcry_twofish_avx2_ocb_enc;) -.align 8 +.align 16 .globl _gcry_twofish_avx2_ocb_dec ELF(.type _gcry_twofish_avx2_ocb_dec, at function;) @@ -979,7 +980,7 @@ _gcry_twofish_avx2_ocb_dec: CFI_ENDPROC(); ELF(.size _gcry_twofish_avx2_ocb_dec,.-_gcry_twofish_avx2_ocb_dec;) -.align 8 +.align 16 .globl _gcry_twofish_avx2_ocb_auth ELF(.type _gcry_twofish_avx2_ocb_auth, at function;) @@ -1082,10 +1083,13 @@ _gcry_twofish_avx2_ocb_auth: CFI_ENDPROC(); ELF(.size _gcry_twofish_avx2_ocb_auth,.-_gcry_twofish_avx2_ocb_auth;) +SECTION_RODATA + .align 16 /* For CTR-mode IV byteswap */ - _gcry_twofish_bswap128_mask: +ELF(.type _gcry_twofish_bswap128_mask, at object) +_gcry_twofish_bswap128_mask: .Lbswap128_mask: .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 ELF(.size _gcry_twofish_bswap128_mask,.-_gcry_twofish_bswap128_mask;) -- 2.37.2 From jussi.kivilinna at iki.fi Sat Jan 21 13:27:43 2023 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 21 Jan 2023 14:27:43 +0200 Subject: [PATCH] asm-common-aarch64: fix read-only section for Windows target Message-ID: <20230121122743.1125643-1-jussi.kivilinna@iki.fi> * cipher/asm-common-aarch64.h (SECTION_RODATA): Use .rdata for _WIN32. -- Signed-off-by: Jussi Kivilinna --- cipher/asm-common-aarch64.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cipher/asm-common-aarch64.h b/cipher/asm-common-aarch64.h index 8e8bf8e7..3a72d7c4 100644 --- a/cipher/asm-common-aarch64.h +++ b/cipher/asm-common-aarch64.h @@ -29,7 +29,11 @@ # define ELF(...) /*_*/ #endif -#define SECTION_RODATA .section .rodata +#ifdef _WIN32 +# define SECTION_RODATA .section .rdata +#else +# define SECTION_RODATA .section .rodata +#endif #ifdef __APPLE__ #define GET_DATA_POINTER(reg, name) \ -- 2.37.2
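For reference, the sketch below illustrates the layout convention the two patches above establish: constants are emitted into the read-only data section selected by the SECTION_RODATA macro (".rodata" for ELF targets, ".rdata" for the Windows target per the aarch64 fix), while ".text" keeps only code, with the section start aligned to 64 bytes and per-function alignment raised from 8 to 16. This is an illustrative fragment only, not part of either patch; the module and symbol names (_example_consts, _gcry_example_nop) are hypothetical, and the helper macros (SECTION_RODATA, ELF, CFI_STARTPROC, CFI_ENDPROC, ret_spec_stop) are assumed to come from cipher/asm-common-amd64.h as in the diffs above.

    #include "asm-common-amd64.h"

    SECTION_RODATA                  /* read-only constants, kept out of .text */

    ELF(.type _example_consts,@object)
    _example_consts:
    .align 16
    .Lexample_bswap128_mask:        /* e.g. a byte-swap mask as used for CTR mode */
            .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
    ELF(.size _example_consts,.-_example_consts)

    .text
    .align 64                       /* align start of the code section to a cache line */

    .align 16                       /* per-function alignment, raised from 8 to 16 */
    .globl _gcry_example_nop
    ELF(.type _gcry_example_nop,@function;)
    _gcry_example_nop:
            CFI_STARTPROC();
            /* a real module would load constants here, e.g. via .Lexample_bswap128_mask(%rip) */
            ret_spec_stop;
            CFI_ENDPROC();
    ELF(.size _gcry_example_nop,.-_gcry_example_nop;)

Keeping constants in a dedicated read-only section is also what makes the Windows fix above necessary: PE/COFF uses ".rdata" for read-only data rather than ".rodata", which the second patch selects through the _WIN32 conditional.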