From gniibe at fsij.org Mon Dec 2 03:51:46 2013 From: gniibe at fsij.org (NIIBE Yutaka) Date: Mon, 02 Dec 2013 11:51:46 +0900 Subject: Tasks for 1.6 In-Reply-To: <87iow47hsy.fsf@vigenere.g10code.de> References: <87iow47hsy.fsf@vigenere.g10code.de> Message-ID: <1385952706.3225.2.camel@cfw2.gniibe.org> On 2013-11-07 at 09:11 +0100, Werner Koch wrote: > The important thing is that we keep the ABI stable after 1.6.0. I have one thing not yet in the repository. It is a test case when gcry_mpi_powm is called with negative BASE (and a fix). I think that it is the bug of old implementation of gcry_mpi_powm. Here is the patch for the code before new implementation. On 2013-09-12 at 10:17 +0900, NIIBE Yutaka wrote: > Well, I wrote a test case to share the issue. Here is a possible > patch. In the test case, it calculates (-17)^6 mod 19. Result should > be 7. Mathematically, it's also correct for powm to return -12 in > this case, but it checks against positive 7. > > In the current implementation it returns -7, which is wrong. diff --git a/tests/mpitests.c b/tests/mpitests.c index e1c51d1..ae206d7 100644 --- a/tests/mpitests.c +++ b/tests/mpitests.c @@ -379,6 +379,25 @@ test_powm (void) if (gcry_mpi_cmp (res, base)) die ("test_powm failed at %d\n", __LINE__); + /* Check for a case: base is negative and expo is even. */ + gcry_mpi_set_ui (base, b_int); + gcry_mpi_neg (base, base); + gcry_mpi_set_ui (exp, e_int * 2); + gcry_mpi_set_ui(mod, m_int); + gcry_mpi_powm (res, base, exp, mod); + /* Result should be positive and it's 7 = (-17)^6 mod 19. 
*/ + if (gcry_mpi_is_neg (res) || gcry_mpi_cmp_ui (res, 7)) + { + if (verbose) + { + fprintf (stderr, "is_neg: %d\n", gcry_mpi_is_neg (res)); + fprintf (stderr, "mpi: "); + gcry_mpi_dump (res); + putc ('\n', stderr); + } + die ("test_powm failed for negative base at %d\n", __LINE__); + } + gcry_mpi_release (base); gcry_mpi_release (exp); gcry_mpi_release (mod); diff --git a/mpi/mpi-pow.c b/mpi/mpi-pow.c index 85d6fd8..4955fa5 100644 --- a/mpi/mpi-pow.c +++ b/mpi/mpi-pow.c @@ -169,7 +169,7 @@ gcry_mpi_powm (gcry_mpi_t res, } MPN_COPY ( rp, bp, bsize ); rsize = bsize; - rsign = bsign; + rsign = 0; /* Main processing. */ { @@ -184,7 +184,7 @@ gcry_mpi_powm (gcry_mpi_t res, xp = xp_marker = mpi_alloc_limb_space( 2 * (msize + 1), msec ); memset( &karactx, 0, sizeof karactx ); - negative_result = (ep[0] & 1) && base->sign; + negative_result = (ep[0] & 1) && bsign; i = esize - 1; e = ep[i]; -- From cvs at cvs.gnupg.org Mon Dec 2 19:09:15 2013 From: cvs at cvs.gnupg.org (by Werner Koch) Date: Mon, 02 Dec 2013 19:09:15 +0100 Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.5.0-411-gd4ce0cf Message-ID: This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "The GNU crypto library". The branch, master has been updated via d4ce0cfe0d35d7ec69c115456848b5b735c928ea (commit) via 14ae6224b1b17abbfc80c26ad0f4c60f1e8635e2 (commit) via 485f35124b1a74af0bad321ed70be3a79d8d11d7 (commit) via ecb90f8e7c6f2516080d27ed7da6a25f2314da3c (commit) via 29eddc2558d4cf39995f66d5fccd62f584d5b203 (commit) from 3b1cc9e6c357574f54160298d731c18f3d717b6c (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. 
- Log ----------------------------------------------------------------- commit d4ce0cfe0d35d7ec69c115456848b5b735c928ea Author: Werner Koch Date: Mon Dec 2 17:09:04 2013 +0100 ecc: Use constant time point operation for Twisted Edwards. * mpi/ec.c (_gcry_mpi_ec_mul_point): Try to do a constant time operation if needed. * tests/benchmark.c (main): Add option --use-secmem. Signed-off-by: Werner Koch diff --git a/mpi/ec.c b/mpi/ec.c index 565644a..6fca95b 100644 --- a/mpi/ec.c +++ b/mpi/ec.c @@ -1117,11 +1117,30 @@ _gcry_mpi_ec_mul_point (mpi_point_t result, mpi_set_ui (result->y, 1); mpi_set_ui (result->z, 1); - for (j=nbits-1; j >= 0; j--) + if (mpi_is_secure (scalar)) { - _gcry_mpi_ec_dup_point (result, result, ctx); - if (mpi_test_bit (scalar, j) == 1) - _gcry_mpi_ec_add_points (result, result, point, ctx); + /* If SCALAR is in secure memory we assume that it is the + secret key we use constant time operation. */ + mpi_point_struct tmppnt; + + point_init (&tmppnt); + for (j=nbits-1; j >= 0; j--) + { + _gcry_mpi_ec_dup_point (result, result, ctx); + _gcry_mpi_ec_add_points (&tmppnt, result, point, ctx); + if (mpi_test_bit (scalar, j)) + point_set (result, &tmppnt); + } + point_free (&tmppnt); + } + else + { + for (j=nbits-1; j >= 0; j--) + { + _gcry_mpi_ec_dup_point (result, result, ctx); + if (mpi_test_bit (scalar, j)) + _gcry_mpi_ec_add_points (result, result, point, ctx); + } } return; } diff --git a/tests/benchmark.c b/tests/benchmark.c index 8bb8584..3f44e33 100644 --- a/tests/benchmark.c +++ b/tests/benchmark.c @@ -1286,6 +1286,7 @@ main( int argc, char **argv ) int last_argc = -1; int no_blinding = 0; int use_random_daemon = 0; + int use_secmem = 0; int with_progress = 0; int debug = 0; int pk_count = 100; @@ -1331,6 +1332,11 @@ main( int argc, char **argv ) use_random_daemon = 1; argc--; argv++; } + else if (!strcmp (*argv, "--use-secmem")) + { + use_secmem = 1; + argc--; argv++; + } else if (!strcmp (*argv, "--prefer-standard-rng")) { /* This is anyway the 
default, but we may want to use it for @@ -1449,7 +1455,7 @@ main( int argc, char **argv ) if (gcry_fips_mode_active ()) in_fips_mode = 1; - else + else if (!use_secmem) gcry_control (GCRYCTL_DISABLE_SECMEM, 0); if (use_random_daemon) commit 14ae6224b1b17abbfc80c26ad0f4c60f1e8635e2 Author: Werner Koch Date: Mon Dec 2 16:18:25 2013 +0100 ecc: Make gcry_pk_testkey work for Ed25519. * cipher/ecc-misc.c (_gcry_ecc_compute_public): Add optional args G and d. Change all callers. * cipher/ecc.c (gen_y_2): Remove. (check_secret_key): Use generic public key compute function. Adjust for use with Ed25519 and EdDSA. (nist_generate_key): Do not use the compliant key thingy for Ed25519. (ecc_check_secret_key): Make parameter parsing similar to the other functions. * cipher/ecc-curves.c (domain_parms): Zero prefix some parameters so that _gcry_ecc_update_curve_param works correctly. * tests/keygen.c (check_ecc_keys): Add "param" flag. Check all Ed25519 keys. diff --git a/cipher/ecc-common.h b/cipher/ecc-common.h index 0cecdc3..74adaec 100644 --- a/cipher/ecc-common.h +++ b/cipher/ecc-common.h @@ -91,7 +91,8 @@ const char *_gcry_ecc_dialect2str (enum ecc_dialects dialect); gcry_mpi_t _gcry_ecc_ec2os (gcry_mpi_t x, gcry_mpi_t y, gcry_mpi_t p); gcry_error_t _gcry_ecc_os2ec (mpi_point_t result, gcry_mpi_t value); -mpi_point_t _gcry_ecc_compute_public (mpi_point_t Q, mpi_ec_t ec); +mpi_point_t _gcry_ecc_compute_public (mpi_point_t Q, mpi_ec_t ec, + mpi_point_t G, gcry_mpi_t d); /*-- ecc.c --*/ diff --git a/cipher/ecc-curves.c b/cipher/ecc-curves.c index 5815e55..f7c1c6d 100644 --- a/cipher/ecc-curves.c +++ b/cipher/ecc-curves.c @@ -174,10 +174,10 @@ static const ecc_domain_parms_t domain_parms[] = "0x1fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff" "ffa51868783bf2f966b7fcc0148f709a5d03bb5c9b8899c47aebb6fb71e91386409", - "0xc6858e06b70404e9cd9e3ecb662395b4429c648139053fb521f828af606b4d3d" - "baa14b5e77efe75928fe1dc127a2ffa8de3348b3c1856a429bf97e7e31c2e5bd66", - 
"0x11839296a789a3bc0045c8a5fb42c7d1bd998f54449579b446817afbd17273e6" - "62c97ee72995ef42640c550b9013fad0761353c7086a272c24088be94769fd16650" + "0x00c6858e06b70404e9cd9e3ecb662395b4429c648139053fb521f828af606b4d" + "3dbaa14b5e77efe75928fe1dc127a2ffa8de3348b3c1856a429bf97e7e31c2e5bd66", + "0x011839296a789a3bc0045c8a5fb42c7d1bd998f54449579b446817afbd17273e" + "662c97ee72995ef42640c550b9013fad0761353c7086a272c24088be94769fd16650" }, { "brainpoolP160r1", 160, 0, @@ -442,7 +442,7 @@ _gcry_ecc_fill_in_curve (unsigned int nbits, const char *name, /* Give the name of the curve NAME, store the curve parameters into P, - A, B, G, and N if they pint to NULL value. Note that G is returned + A, B, G, and N if they point to NULL value. Note that G is returned in standard uncompressed format. Also update MODEL and DIALECT if they are not NULL. */ gpg_err_code_t @@ -1030,7 +1030,7 @@ _gcry_ecc_get_mpi (const char *name, mpi_ec_t ec, int copy) { /* If only the private key is given, compute the public key. */ if (!ec->Q) - ec->Q = _gcry_ecc_compute_public (NULL, ec); + ec->Q = _gcry_ecc_compute_public (NULL, ec, NULL, NULL); if (!ec->Q) return NULL; @@ -1063,7 +1063,7 @@ _gcry_ecc_get_point (const char *name, mpi_ec_t ec) { /* If only the private key is given, compute the public key. */ if (!ec->Q) - ec->Q = _gcry_ecc_compute_public (NULL, ec); + ec->Q = _gcry_ecc_compute_public (NULL, ec, NULL, NULL); if (ec->Q) return point_copy (ec->Q); diff --git a/cipher/ecc-misc.c b/cipher/ecc-misc.c index 0eb3391..1633d32 100644 --- a/cipher/ecc-misc.c +++ b/cipher/ecc-misc.c @@ -253,13 +253,20 @@ reverse_buffer (unsigned char *buffer, unsigned int length) /* Compute the public key from the the context EC. Obviously a requirement is that the secret key is available in EC. On success Q is returned; on error NULL. If Q is NULL a newly allocated point - is returned. */ + is returned. If G or D are given they override the values taken + from EC. 
*/ mpi_point_t -_gcry_ecc_compute_public (mpi_point_t Q, mpi_ec_t ec) +_gcry_ecc_compute_public (mpi_point_t Q, mpi_ec_t ec, + mpi_point_t G, gcry_mpi_t d) { int rc; - if (!ec->d || !ec->G || !ec->p || !ec->a) + if (!G) + G = ec->G; + if (!d) + d = ec->d; + + if (!d || !G || !ec->p || !ec->a) return NULL; if (ec->model == MPI_EC_TWISTEDEDWARDS && !ec->b) return NULL; @@ -280,7 +287,7 @@ _gcry_ecc_compute_public (mpi_point_t Q, mpi_ec_t ec) return NULL; memset (hvec, 0, sizeof hvec); - rawmpi = _gcry_mpi_get_buffer (ec->d, 0, &rawmpilen, NULL); + rawmpi = _gcry_mpi_get_buffer (d, 0, &rawmpilen, NULL); if (!rawmpi) return NULL; memset (digest, 0, b); @@ -311,7 +318,7 @@ _gcry_ecc_compute_public (mpi_point_t Q, mpi_ec_t ec) if (!Q) Q = gcry_mpi_point_new (0); if (Q) - _gcry_mpi_ec_mul_point (Q, a, ec->G, ec); + _gcry_mpi_ec_mul_point (Q, a, G, ec); mpi_free (a); } else @@ -319,7 +326,7 @@ _gcry_ecc_compute_public (mpi_point_t Q, mpi_ec_t ec) if (!Q) Q = gcry_mpi_point_new (0); if (Q) - _gcry_mpi_ec_mul_point (Q, ec->d, ec->G, ec); + _gcry_mpi_ec_mul_point (Q, d, G, ec); } return Q; diff --git a/cipher/ecc.c b/cipher/ecc.c index 72ca726..bda2a86 100644 --- a/cipher/ecc.c +++ b/cipher/ecc.c @@ -84,8 +84,6 @@ static void *progress_cb_data; /* Local prototypes. */ static void test_keys (ECC_secret_key * sk, unsigned int nbits); -static int check_secret_key (ECC_secret_key * sk); -static gcry_mpi_t gen_y_2 (gcry_mpi_t x, elliptic_curve_t * base); static unsigned int ecc_get_nbits (gcry_sexp_t parms); @@ -109,32 +107,6 @@ _gcry_register_pk_ecc_progress (void (*cb) (void *, const char *, - -/* - * Solve the right side of the Weierstrass equation. 
- */ -static gcry_mpi_t -gen_y_2 (gcry_mpi_t x, elliptic_curve_t *base) -{ - gcry_mpi_t three, x_3, axb, y; - - three = mpi_alloc_set_ui (3); - x_3 = mpi_new (0); - axb = mpi_new (0); - y = mpi_new (0); - - mpi_powm (x_3, x, three, base->p); - mpi_mulm (axb, base->a, x, base->p); - mpi_addm (axb, axb, base->b, base->p); - mpi_addm (y, x_3, axb, base->p); - - mpi_free (x_3); - mpi_free (axb); - mpi_free (three); - return y; /* The quadratic value of the coordinate if it exist. */ -} - - /* Standard version of the key generation. */ static gpg_err_code_t nist_generate_key (ECC_secret_key *sk, elliptic_curve_t *E, mpi_ec_t ctx, @@ -181,55 +153,62 @@ nist_generate_key (ECC_secret_key *sk, elliptic_curve_t *E, mpi_ec_t ctx, * end up with the min(y,p-y) as the y coordinate. Such a public * key allows the most efficient compression: y can simply be * dropped because we know that it's a minimum of the two - * possibilities without any loss of security. */ - { - gcry_mpi_t x, y, negative; - const unsigned int pbits = mpi_get_nbits (E->p); + * possibilities without any loss of security. Note that we don't + * do that for Ed25519 so that we do not violate the special + * construction of the secret key. 
*/ + if (E->dialect == ECC_DIALECT_ED25519) + point_set (&sk->Q, &Q); + else + { + gcry_mpi_t x, y, negative; + const unsigned int pbits = mpi_get_nbits (E->p); - x = mpi_new (pbits); - y = mpi_new (pbits); - negative = mpi_new (pbits); + x = mpi_new (pbits); + y = mpi_new (pbits); + negative = mpi_new (pbits); - if (_gcry_mpi_ec_get_affine (x, y, &Q, ctx)) - log_fatal ("ecgen: Failed to get affine coordinates for %s\n", "Q"); + if (_gcry_mpi_ec_get_affine (x, y, &Q, ctx)) + log_fatal ("ecgen: Failed to get affine coordinates for %s\n", "Q"); - if (E->model == MPI_EC_WEIERSTRASS) - mpi_sub (negative, E->p, y); /* negative = p - y */ - else - mpi_sub (negative, E->p, x); /* negative = p - x */ + if (E->model == MPI_EC_WEIERSTRASS) + mpi_sub (negative, E->p, y); /* negative = p - y */ + else + mpi_sub (negative, E->p, x); /* negative = p - x */ - if (mpi_cmp (negative, y) < 0) /* p - y < p */ - { - /* We need to end up with -Q; this assures that new Q's y is - the smallest one */ - mpi_sub (sk->d, E->n, sk->d); /* d = order - d */ - if (E->model == MPI_EC_WEIERSTRASS) - gcry_mpi_point_snatch_set (&sk->Q, x, negative, mpi_alloc_set_ui (1)); - else - gcry_mpi_point_snatch_set (&sk->Q, negative, y, mpi_alloc_set_ui (1)); - - if (DBG_CIPHER) - log_debug ("ecgen converted Q to a compliant point\n"); - } - else /* p - y >= p */ - { - /* No change is needed exactly 50% of the time: just copy. 
*/ - point_set (&sk->Q, &Q); - if (DBG_CIPHER) - log_debug ("ecgen didn't need to convert Q to a compliant point\n"); - - mpi_free (negative); - if (E->model == MPI_EC_WEIERSTRASS) - mpi_free (x); - else - mpi_free (y); - } + if (mpi_cmp (negative, y) < 0) /* p - y < p */ + { + /* We need to end up with -Q; this assures that new Q's y is + the smallest one */ + mpi_sub (sk->d, E->n, sk->d); /* d = order - d */ + if (E->model == MPI_EC_WEIERSTRASS) + gcry_mpi_point_snatch_set (&sk->Q, x, negative, + mpi_alloc_set_ui (1)); + else + gcry_mpi_point_snatch_set (&sk->Q, negative, y, + mpi_alloc_set_ui (1)); - if (E->model == MPI_EC_WEIERSTRASS) - mpi_free (y); - else - mpi_free (x); - } + if (DBG_CIPHER) + log_debug ("ecgen converted Q to a compliant point\n"); + } + else /* p - y >= p */ + { + /* No change is needed exactly 50% of the time: just copy. */ + point_set (&sk->Q, &Q); + if (DBG_CIPHER) + log_debug ("ecgen didn't need to convert Q to a compliant point\n"); + + mpi_free (negative); + if (E->model == MPI_EC_WEIERSTRASS) + mpi_free (x); + else + mpi_free (y); + } + + if (E->model == MPI_EC_WEIERSTRASS) + mpi_free (y); + else + mpi_free (x); + } point_free (&Q); /* Now we can test our keys (this should never fail!). */ @@ -295,30 +274,26 @@ test_keys (ECC_secret_key *sk, unsigned int nbits) * between the public value and the secret one. */ static int -check_secret_key (ECC_secret_key * sk) +check_secret_key (ECC_secret_key *sk, mpi_ec_t ec, int flags) { int rc = 1; mpi_point_struct Q; - gcry_mpi_t y_2, y2; - gcry_mpi_t x1, x2; - mpi_ec_t ctx = NULL; + gcry_mpi_t x1, y1; + gcry_mpi_t x2 = NULL; + gcry_mpi_t y2 = NULL; point_init (&Q); + x1 = mpi_new (0); + y1 = mpi_new (0); - /* ?primarity test of 'p' */ - /* (...) //!! 
*/ /* G in E(F_p) */ - y_2 = gen_y_2 (sk->E.G.x, &sk->E); /* y^2=x^3+a*x+b */ - y2 = mpi_alloc (0); - x1 = mpi_alloc (0); - x2 = mpi_alloc (0); - mpi_mulm (y2, sk->E.G.y, sk->E.G.y, sk->E.p); /* y^2=y*y */ - if (mpi_cmp (y_2, y2)) + if (!_gcry_mpi_ec_curve_point (&sk->E.G, ec)) { if (DBG_CIPHER) log_debug ("Bad check: Point 'G' does not belong to curve 'E'!\n"); goto leave; } + /* G != PaI */ if (!mpi_cmp_ui (sk->E.G.z, 0)) { @@ -327,37 +302,46 @@ check_secret_key (ECC_secret_key * sk) goto leave; } - ctx = _gcry_mpi_ec_p_internal_new (sk->E.model, sk->E.dialect, 0, - sk->E.p, sk->E.a, sk->E.b); - - _gcry_mpi_ec_mul_point (&Q, sk->E.n, &sk->E.G, ctx); - if (mpi_cmp_ui (Q.z, 0)) + /* Check order of curve. */ + if (sk->E.dialect != ECC_DIALECT_ED25519) { - if (DBG_CIPHER) - log_debug ("check_secret_key: E is not a curve of order n\n"); - goto leave; + _gcry_mpi_ec_mul_point (&Q, sk->E.n, &sk->E.G, ec); + if (mpi_cmp_ui (Q.z, 0)) + { + if (DBG_CIPHER) + log_debug ("check_secret_key: E is not a curve of order n\n"); + goto leave; + } } - /* pubkey cannot be PaI */ + + /* Pubkey cannot be PaI */ if (!mpi_cmp_ui (sk->Q.z, 0)) { if (DBG_CIPHER) log_debug ("Bad check: Q can not be a Point at Infinity!\n"); goto leave; } - /* pubkey = [d]G over E */ - _gcry_mpi_ec_mul_point (&Q, sk->d, &sk->E.G, ctx); - if (_gcry_mpi_ec_get_affine (x1, y_2, &Q, ctx)) + /* pubkey = [d]G over E */ + if (!_gcry_ecc_compute_public (&Q, ec, &sk->E.G, sk->d)) + { + if (DBG_CIPHER) + log_debug ("Bad check: computation of dG failed\n"); + goto leave; + } + if (_gcry_mpi_ec_get_affine (x1, y1, &Q, ec)) { if (DBG_CIPHER) log_debug ("Bad check: Q can not be a Point at Infinity!\n"); goto leave; } - /* Fast path for loaded secret keys - Q is already in affine coordinates */ - if (!mpi_cmp_ui (sk->Q.z, 1)) + if ((flags & PUBKEY_FLAG_EDDSA)) + ; /* Fixme: EdDSA is special. 
*/ + else if (!mpi_cmp_ui (sk->Q.z, 1)) { - if (mpi_cmp (x1, sk->Q.x) || mpi_cmp (y_2, sk->Q.y)) + /* Fast path if Q is already in affine coordinates. */ + if (mpi_cmp (x1, sk->Q.x) || mpi_cmp (y1, sk->Q.y)) { if (DBG_CIPHER) log_debug @@ -367,14 +351,16 @@ check_secret_key (ECC_secret_key * sk) } else { - if (_gcry_mpi_ec_get_affine (x2, y2, &sk->Q, ctx)) + x2 = mpi_new (0); + y2 = mpi_new (0); + if (_gcry_mpi_ec_get_affine (x2, y2, &sk->Q, ec)) { if (DBG_CIPHER) log_debug ("Bad check: Q can not be a Point at Infinity!\n"); goto leave; } - if (mpi_cmp (x1, x2) || mpi_cmp (y_2, y2)) + if (mpi_cmp (x1, x2) || mpi_cmp (y1, y2)) { if (DBG_CIPHER) log_debug @@ -385,11 +371,10 @@ check_secret_key (ECC_secret_key * sk) rc = 0; /* Okay. */ leave: - _gcry_mpi_ec_free (ctx); mpi_free (x2); mpi_free (x1); + mpi_free (y1); mpi_free (y2); - mpi_free (y_2); point_free (&Q); return rc; } @@ -601,28 +586,35 @@ ecc_check_secret_key (gcry_sexp_t keyparms) { gcry_err_code_t rc; gcry_sexp_t l1 = NULL; + int flags = 0; char *curvename = NULL; gcry_mpi_t mpi_g = NULL; gcry_mpi_t mpi_q = NULL; ECC_secret_key sk; + mpi_ec_t ec = NULL; memset (&sk, 0, sizeof sk); - /* - * Extract the key. - */ - rc = _gcry_sexp_extract_param (keyparms, NULL, "-p?a?b?g?n?/q?+d", - &sk.E.p, &sk.E.a, &sk.E.b, &mpi_g, &sk.E.n, - &mpi_q, &sk.d, NULL); - if (rc) - goto leave; - if (mpi_g) + /* Look for flags. */ + l1 = gcry_sexp_find_token (keyparms, "flags", 0); + if (l1) { - point_init (&sk.E.G); - rc = _gcry_ecc_os2ec (&sk.E.G, mpi_g); + rc = _gcry_pk_util_parse_flaglist (l1, &flags, NULL); if (rc) goto leave; } + + /* Extract the parameters. */ + if ((flags & PUBKEY_FLAG_PARAM)) + rc = _gcry_sexp_extract_param (keyparms, NULL, "-p?a?b?g?n?/q?+d", + &sk.E.p, &sk.E.a, &sk.E.b, &mpi_g, &sk.E.n, + &mpi_q, &sk.d, NULL); + else + rc = _gcry_sexp_extract_param (keyparms, NULL, "/q?+d", + &mpi_q, &sk.d, NULL); + if (rc) + goto leave; + /* Add missing parameters using the optional curve parameter. 
*/ gcry_sexp_release (l1); l1 = gcry_sexp_find_token (keyparms, "curve", 5); @@ -631,17 +623,32 @@ ecc_check_secret_key (gcry_sexp_t keyparms) curvename = gcry_sexp_nth_string (l1, 1); if (curvename) { - rc = _gcry_ecc_fill_in_curve (0, curvename, &sk.E, NULL); + rc = _gcry_ecc_update_curve_param (curvename, + &sk.E.model, &sk.E.dialect, + &sk.E.p, &sk.E.a, &sk.E.b, + &mpi_g, &sk.E.n); if (rc) return rc; } } + if (mpi_g) + { + point_init (&sk.E.G); + rc = _gcry_ecc_os2ec (&sk.E.G, mpi_g); + if (rc) + goto leave; + } + /* Guess required fields if a curve parameter has not been given. FIXME: This is a crude hacks. We need to fix that. */ if (!curvename) { - sk.E.model = MPI_EC_WEIERSTRASS; - sk.E.dialect = ECC_DIALECT_STANDARD; + sk.E.model = ((flags & PUBKEY_FLAG_EDDSA) + ? MPI_EC_TWISTEDEDWARDS + : MPI_EC_WEIERSTRASS); + sk.E.dialect = ((flags & PUBKEY_FLAG_EDDSA) + ? ECC_DIALECT_ED25519 + : ECC_DIALECT_STANDARD); } if (DBG_CIPHER) { @@ -665,24 +672,31 @@ ecc_check_secret_key (gcry_sexp_t keyparms) goto leave; } + ec = _gcry_mpi_ec_p_internal_new (sk.E.model, sk.E.dialect, 0, + sk.E.p, sk.E.a, sk.E.b); + if (mpi_q) { point_init (&sk.Q); - rc = _gcry_ecc_os2ec (&sk.Q, mpi_q); + if (ec->dialect == ECC_DIALECT_ED25519) + rc = _gcry_ecc_eddsa_decodepoint (mpi_q, ec, &sk.Q, NULL, NULL); + else + rc = _gcry_ecc_os2ec (&sk.Q, mpi_q); if (rc) goto leave; } else { - /* The current test requires Q. */ + /* The secret key test requires Q. */ rc = GPG_ERR_NO_OBJ; goto leave; } - if (check_secret_key (&sk)) + if (check_secret_key (&sk, ec, flags)) rc = GPG_ERR_BAD_SECKEY; leave: + _gcry_mpi_ec_free (ec); gcry_mpi_release (sk.E.p); gcry_mpi_release (sk.E.a); gcry_mpi_release (sk.E.b); @@ -1623,7 +1637,7 @@ _gcry_pk_ecc_get_sexp (gcry_sexp_t *r_sexp, int mode, mpi_ec_t ec) /* Compute the public point if it is missing. */ if (!ec->Q && ec->d) - ec->Q = _gcry_ecc_compute_public (NULL, ec); + ec->Q = _gcry_ecc_compute_public (NULL, ec, NULL, NULL); /* Encode G and Q. 
*/ mpi_G = _gcry_mpi_ec_ec2os (ec->G, ec); diff --git a/tests/keygen.c b/tests/keygen.c index 48663d4..e8cf7c5 100644 --- a/tests/keygen.c +++ b/tests/keygen.c @@ -90,21 +90,21 @@ show (const char *format, ...) } -static void -show_note (const char *format, ...) -{ - va_list arg_ptr; - - if (!verbose && getenv ("srcdir")) - fputs (" ", stderr); /* To align above "PASS: ". */ - else - fprintf (stderr, "%s: ", PGM); - va_start (arg_ptr, format); - vfprintf (stderr, format, arg_ptr); - if (*format && format[strlen(format)-1] != '\n') - putc ('\n', stderr); - va_end (arg_ptr); -} +/* static void */ +/* show_note (const char *format, ...) */ +/* { */ +/* va_list arg_ptr; */ + +/* if (!verbose && getenv ("srcdir")) */ +/* fputs (" ", stderr); /\* To align above "PASS: ". *\/ */ +/* else */ +/* fprintf (stderr, "%s: ", PGM); */ +/* va_start (arg_ptr, format); */ +/* vfprintf (stderr, format, arg_ptr); */ +/* if (*format && format[strlen(format)-1] != '\n') */ +/* putc ('\n', stderr); */ +/* va_end (arg_ptr); */ +/* } */ static void @@ -376,11 +376,11 @@ check_ecc_keys (void) show ("creating ECC key using curve %s\n", curves[testno]); if (!strcmp (curves[testno], "Ed25519")) rc = gcry_sexp_build (&keyparm, NULL, - "(genkey(ecc(curve %s)(flags eddsa)))", + "(genkey(ecc(curve %s)(flags param eddsa)))", curves[testno]); else rc = gcry_sexp_build (&keyparm, NULL, - "(genkey(ecc(curve %s)(flags )))", + "(genkey(ecc(curve %s)(flags param)))", curves[testno]); if (rc) die ("error creating S-expression: %s\n", gpg_strerror (rc)); @@ -393,10 +393,7 @@ check_ecc_keys (void) if (verbose > 1) show_sexp ("ECC key:\n", key); - if (!strcmp (curves[testno], "Ed25519")) - show_note ("note: gcry_pk_testkey does not yet work for Ed25519\n"); - else - check_generated_ecc_key (key); + check_generated_ecc_key (key); gcry_sexp_release (key); } @@ -415,6 +412,8 @@ check_ecc_keys (void) if (verbose > 1) show_sexp ("ECC key:\n", key); + check_generated_ecc_key (key); + if (verbose) show ("creating 
ECC key using curve Ed25519 for ECDSA (nocomp)\n"); rc = gcry_sexp_build (&keyparm, NULL, @@ -431,6 +430,8 @@ check_ecc_keys (void) if (verbose > 1) show_sexp ("ECC key:\n", key); + check_generated_ecc_key (key); + gcry_sexp_release (key); } commit 485f35124b1a74af0bad321ed70be3a79d8d11d7 Author: Werner Koch Date: Mon Dec 2 16:06:40 2013 +0100 ecc: Fix eddsa point decompression. * cipher/ecc-eddsa.c (_gcry_ecc_eddsa_recover_x): Fix the negative case. Signed-off-by: Werner Koch diff --git a/cipher/ecc-eddsa.c b/cipher/ecc-eddsa.c index 92c0713..29145f8 100644 --- a/cipher/ecc-eddsa.c +++ b/cipher/ecc-eddsa.c @@ -252,7 +252,7 @@ _gcry_ecc_eddsa_recover_x (gcry_mpi_t x, gcry_mpi_t y, int sign, mpi_ec_t ec) /* Choose the desired square root according to parity */ if (mpi_test_bit (x, 0) != !!sign) - gcry_mpi_neg (x, x); + mpi_sub (x, ec->p, x); mpi_free (t); mpi_free (v3); @@ -267,7 +267,7 @@ _gcry_ecc_eddsa_recover_x (gcry_mpi_t x, gcry_mpi_t y, int sign, mpi_ec_t ec) the usual curve context. If R_ENCPK is not NULL, the encoded PK is stored at that address; this is a new copy to be released by the caller. In contrast to the supplied PK, this is not an MPI and - thus guarnateed to be properly padded. R_ENCPKLEN received the + thus guarnateed to be properly padded. R_ENCPKLEN receives the length of that encoded key. */ gpg_err_code_t _gcry_ecc_eddsa_decodepoint (gcry_mpi_t pk, mpi_ec_t ctx, mpi_point_t result, commit ecb90f8e7c6f2516080d27ed7da6a25f2314da3c Author: Werner Koch Date: Fri Nov 29 17:14:33 2013 +0100 ecc: Fix gcry_mpi_ec_curve_point for Weierstrass. * mpi/ec.c (_gcry_mpi_ec_curve_point): Use correct equation. (ec_pow3): New. (ec_p_init): Always copy B. -- The code path was obviously never tested. 
Signed-off-by: Werner Koch diff --git a/mpi/ec.c b/mpi/ec.c index 57396ce..565644a 100644 --- a/mpi/ec.c +++ b/mpi/ec.c @@ -288,6 +288,16 @@ ec_pow2 (gcry_mpi_t w, const gcry_mpi_t b, mpi_ec_t ctx) } +/* Shortcut for + ec_powm (B, B, mpi_const (MPI_C_THREE), ctx); + for easier optimization. */ +static void +ec_pow3 (gcry_mpi_t w, const gcry_mpi_t b, mpi_ec_t ctx) +{ + mpi_powm (w, b, mpi_const (MPI_C_THREE), ctx->p); +} + + static void ec_invm (gcry_mpi_t x, gcry_mpi_t a, mpi_ec_t ctx) { @@ -375,8 +385,7 @@ ec_p_init (mpi_ec_t ctx, enum gcry_mpi_ec_models model, ctx->nbits = mpi_get_nbits (p); ctx->p = mpi_copy (p); ctx->a = mpi_copy (a); - if (b && model == MPI_EC_TWISTEDEDWARDS) - ctx->b = mpi_copy (b); + ctx->b = mpi_copy (b); ctx->t.p_barrett = use_barrett > 0? _gcry_mpi_barrett_init (ctx->p, 0):NULL; @@ -469,7 +478,7 @@ _gcry_mpi_ec_p_internal_new (enum gcry_mpi_ec_models model, /* This is a variant of _gcry_mpi_ec_p_internal_new which returns an - public contect and does some error checking on the supplied + public context and does some error checking on the supplied arguments. On success the new context is stored at R_CTX and 0 is returned; on error NULL is stored at R_CTX and an error code is returned. @@ -1221,21 +1230,20 @@ _gcry_mpi_ec_curve_point (gcry_mpi_point_t point, mpi_ec_t ctx) { case MPI_EC_WEIERSTRASS: { - gcry_mpi_t xx = mpi_new (0); + gcry_mpi_t xxx = mpi_new (0); - /* y^2 == x^3 + a?x^2 + b */ + /* y^2 == x^3 + a?x + b */ ec_pow2 (y, y, ctx); - ec_pow2 (xx, x, ctx); - ec_mulm (w, ctx->a, xx, ctx); + ec_pow3 (xxx, x, ctx); + ec_mulm (w, ctx->a, x, ctx); ec_addm (w, w, ctx->b, ctx); - ec_mulm (xx, xx, x, ctx); - ec_addm (w, w, xx, ctx); + ec_addm (w, w, xxx, ctx); if (!mpi_cmp (y, w)) res = 1; - gcry_mpi_release (xx); + gcry_mpi_release (xxx); } break; case MPI_EC_MONTGOMERY: commit 29eddc2558d4cf39995f66d5fccd62f584d5b203 Author: Werner Koch Date: Thu Nov 28 09:07:15 2013 +0100 mpi: Introduce 4 user flags for gcry_mpi_t. 
* src/gcrypt.h.in (GCRYMPI_FLAG_USER1, GCRYMPI_FLAG_USER2) (GCRYMPI_FLAG_USER3, GCRYMPI_FLAG_USER4): New. * mpi/mpiutil.c (gcry_mpi_set_flag, gcry_mpi_clear_flag) (gcry_mpi_get_flag, _gcry_mpi_free): Implement them. (gcry_mpi_set_opaque): Keep user flags. -- The space for the flags in the MPI struct is free and thus we can help applications to make use of some flags. This is for example useful to indicate that an MPI needs special processing before use. Signed-off-by: Werner Koch diff --git a/NEWS b/NEWS index 00435e2..ec853c9 100644 --- a/NEWS +++ b/NEWS @@ -101,6 +101,10 @@ Noteworthy changes in version 1.6.0 (unreleased) gcry_mpi_ec_curve_point NEW. GCRYMPI_FLAG_IMMUTABLE NEW. GCRYMPI_FLAG_CONST NEW. + GCRYMPI_FLAG_USER1 NEW. + GCRYMPI_FLAG_USER2 NEW. + GCRYMPI_FLAG_USER3 NEW. + GCRYMPI_FLAG_USER4 NEW. GCRYMPI_CONST_ONE NEW. GCRYMPI_CONST_TWO NEW. GCRYMPI_CONST_THREE NEW. diff --git a/doc/gcrypt.texi b/doc/gcrypt.texi index 4870a1c..91168a8 100644 --- a/doc/gcrypt.texi +++ b/doc/gcrypt.texi @@ -4822,6 +4822,11 @@ commonly used constants are pre-defined and accessible using the macros @code{GCRYMPI_CONST_ONE}, @code{GCRYMPI_CONST_TWO}, @code{GCRYMPI_CONST_THREE}, @code{GCRYMPI_CONST_FOUR}, and @code{GCRYMPI_CONST_EIGHT}. + at item GCRYMPI_FLAG_USER1 + at itemx GCRYMPI_FLAG_USER2 + at itemx GCRYMPI_FLAG_USER3 + at itemx GCRYMPI_FLAG_USER4 +These flags are reserved for use by the application. @end table @deftypefun void gcry_mpi_set_flag (@w{gcry_mpi_t @var{a}}, @ diff --git a/mpi/mpiutil.c b/mpi/mpiutil.c index a82a8e7..c4259ac 100644 --- a/mpi/mpiutil.c +++ b/mpi/mpiutil.c @@ -220,7 +220,11 @@ _gcry_mpi_free( gcry_mpi_t a ) } /* Check that the flags makes sense. We better allow for bit 1 (value 2) for backward ABI compatibility. 
*/ - if ((a->flags & ~(1|2|4|16))) + if ((a->flags & ~(1|2|4|16 + |GCRYMPI_FLAG_USER1 + |GCRYMPI_FLAG_USER2 + |GCRYMPI_FLAG_USER3 + |GCRYMPI_FLAG_USER4))) log_bug("invalid flag value in mpi_free\n"); gcry_free(a); } @@ -275,7 +279,8 @@ gcry_mpi_set_opaque( gcry_mpi_t a, void *p, unsigned int nbits ) a->alloced = 0; a->nlimbs = 0; a->sign = nbits; - a->flags = 4; + a->flags = 4 | (a->flags & (GCRYMPI_FLAG_USER1|GCRYMPI_FLAG_USER2 + |GCRYMPI_FLAG_USER3|GCRYMPI_FLAG_USER4)); if (gcry_is_secure (a->d)) a->flags |= 1; return a; @@ -603,6 +608,12 @@ gcry_mpi_set_flag (gcry_mpi_t a, enum gcry_mpi_flag flag) case GCRYMPI_FLAG_SECURE: mpi_set_secure(a); break; case GCRYMPI_FLAG_CONST: a->flags |= (16|32); break; case GCRYMPI_FLAG_IMMUTABLE: a->flags |= 16; break; + + case GCRYMPI_FLAG_USER1: + case GCRYMPI_FLAG_USER2: + case GCRYMPI_FLAG_USER3: + case GCRYMPI_FLAG_USER4: a->flags |= flag; break; + case GCRYMPI_FLAG_OPAQUE: default: log_bug("invalid flag value\n"); } @@ -619,6 +630,14 @@ gcry_mpi_clear_flag (gcry_mpi_t a, enum gcry_mpi_flag flag) if (!(a->flags & 32)) a->flags &= ~16; break; + + case GCRYMPI_FLAG_USER1: + case GCRYMPI_FLAG_USER2: + case GCRYMPI_FLAG_USER3: + case GCRYMPI_FLAG_USER4: + a->flags &= ~flag; + break; + case GCRYMPI_FLAG_CONST: case GCRYMPI_FLAG_SECURE: case GCRYMPI_FLAG_OPAQUE: @@ -635,6 +654,10 @@ gcry_mpi_get_flag (gcry_mpi_t a, enum gcry_mpi_flag flag) case GCRYMPI_FLAG_OPAQUE: return !!(a->flags & 4); case GCRYMPI_FLAG_IMMUTABLE: return !!(a->flags & 16); case GCRYMPI_FLAG_CONST: return !!(a->flags & 32); + case GCRYMPI_FLAG_USER1: + case GCRYMPI_FLAG_USER2: + case GCRYMPI_FLAG_USER3: + case GCRYMPI_FLAG_USER4: return !!(a->flags & flag); default: log_bug("invalid flag value\n"); } /*NOTREACHED*/ diff --git a/src/gcrypt.h.in b/src/gcrypt.h.in index 625fa9f..53133bf 100644 --- a/src/gcrypt.h.in +++ b/src/gcrypt.h.in @@ -501,7 +501,11 @@ enum gcry_mpi_flag a way to store some bytes. This is useful for encrypted big integers. 
*/ GCRYMPI_FLAG_IMMUTABLE = 4, /* Mark the MPI as immutable. */ - GCRYMPI_FLAG_CONST = 8 /* Mark the MPI as a constant. */ + GCRYMPI_FLAG_CONST = 8, /* Mark the MPI as a constant. */ + GCRYMPI_FLAG_USER1 = 0x0100,/* User flag 1. */ + GCRYMPI_FLAG_USER2 = 0x0200,/* User flag 2. */ + GCRYMPI_FLAG_USER3 = 0x0400,/* User flag 3. */ + GCRYMPI_FLAG_USER4 = 0x0800,/* User flag 4. */ }; ----------------------------------------------------------------------- Summary of changes: NEWS | 4 + cipher/ecc-common.h | 3 +- cipher/ecc-curves.c | 14 +-- cipher/ecc-eddsa.c | 4 +- cipher/ecc-misc.c | 19 ++-- cipher/ecc.c | 256 +++++++++++++++++++++++++++------------------------ doc/gcrypt.texi | 5 + mpi/ec.c | 55 ++++++++--- mpi/mpiutil.c | 27 +++++- src/gcrypt.h.in | 6 +- tests/benchmark.c | 8 +- tests/keygen.c | 43 ++++----- 12 files changed, 268 insertions(+), 176 deletions(-) hooks/post-receive -- The GNU crypto library http://git.gnupg.org _______________________________________________ Gnupg-commits mailing list Gnupg-commits at gnupg.org http://lists.gnupg.org/mailman/listinfo/gnupg-commits From cvs at cvs.gnupg.org Tue Dec 3 13:17:04 2013 From: cvs at cvs.gnupg.org (by Jussi Kivilinna) Date: Tue, 03 Dec 2013 13:17:04 +0100 Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.5.0-413-g59b1a1b Message-ID: This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "The GNU crypto library". The branch, master has been updated via 59b1a1b7ee2923e1bf091071ae716d180c6c6006 (commit) via 80896bc8f5e6ed9a627374e34f040ad5f3617584 (commit) from d4ce0cfe0d35d7ec69c115456848b5b735c928ea (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. 
- Log ----------------------------------------------------------------- commit 59b1a1b7ee2923e1bf091071ae716d180c6c6006 Author: Jussi Kivilinna Date: Tue Dec 3 14:03:09 2013 +0200 rijndael: fix compiler warning on aarch64 * cipher/rijndael.c (do_setkey): Use braces for empty if statement instead of semicolon. -- Patch fixes following warning: rijndael.c: In function 'do_setkey': rijndael.c:507:9: warning: suggest braces around empty body in an 'if' statement [-Wempty-body] ; ^ Signed-off-by: Jussi Kivilinna diff --git a/cipher/rijndael.c b/cipher/rijndael.c index 257a162..8019f0a 100644 --- a/cipher/rijndael.c +++ b/cipher/rijndael.c @@ -504,7 +504,9 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen) KC = 4; if (0) - ; + { + ; + } #ifdef USE_PADLOCK else if (hwfeatures & HWF_PADLOCK_AES) { @@ -559,7 +561,9 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen) /* NB: We don't yet support Padlock hardware key generation. */ if (0) - ; + { + ; + } #ifdef USE_AESNI else if (ctx->use_aesni) aesni_do_setkey(ctx, key); commit 80896bc8f5e6ed9a627374e34f040ad5f3617584 Author: Jussi Kivilinna Date: Tue Dec 3 13:57:02 2013 +0200 Add aarch64 (arm64) mpi assembly * mpi/aarch64/mpi-asm-defs.h: New. * mpi/aarch64/mpih-add1.S: New. * mpi/aarch64/mpih-mul1.S: New. * mpi/aarch64/mpih-mul2.S: New. * mpi/aarch64/mpih-mul3.S: New. * mpi/aarch64/mpih-sub1.S: New. * mpi/config.links [host=aarch64-*-*]: Add configuration for aarch64 assembly. * mpi/longlong.h [__aarch64__] (add_ssaaaa, sub_ddmmss, umul_ppmm) (count_leading_zeros): New. -- Add preliminary aarch64 assembly implementations for mpi. Signed-off-by: Jussi Kivilinna diff --git a/mpi/aarch64/mpi-asm-defs.h b/mpi/aarch64/mpi-asm-defs.h new file mode 100644 index 0000000..6519065 --- /dev/null +++ b/mpi/aarch64/mpi-asm-defs.h @@ -0,0 +1,4 @@ +/* This file defines some basic constants for the MPI machinery.
We + * need to define the types on a per-CPU basis, so it is done with + * this file here. */ +#define BYTES_PER_MPI_LIMB (SIZEOF_UNSIGNED_LONG_LONG) diff --git a/mpi/aarch64/mpih-add1.S b/mpi/aarch64/mpih-add1.S new file mode 100644 index 0000000..9f7e2e6 --- /dev/null +++ b/mpi/aarch64/mpih-add1.S @@ -0,0 +1,71 @@ +/* ARM64 add_n -- Add two limb vectors of the same length > 0 and store + * sum in a third limb vector. + * + * Copyright (C) 2013 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ * + */ + +#include "sysdep.h" +#include "asm-syntax.h" + +/******************* + * mpi_limb_t + * _gcry_mpih_add_n( mpi_ptr_t res_ptr, x0 + * mpi_ptr_t s1_ptr, x1 + * mpi_ptr_t s2_ptr, x2 + * mpi_size_t size) x3 + */ + +.text + +.globl _gcry_mpih_add_n +.type _gcry_mpih_add_n,%function +_gcry_mpih_add_n: + and x5, x3, #3; + adds xzr, xzr, xzr; /* clear carry flag */ + + cbz x5, .Large_loop; + +.Loop: + ldr x4, [x1], #8; + sub x3, x3, #1; + ldr x11, [x2], #8; + and x5, x3, #3; + adcs x4, x4, x11; + str x4, [x0], #8; + cbz x3, .Lend; + cbnz x5, .Loop; + +.Large_loop: + ldp x4, x6, [x1], #16; + ldp x5, x7, [x2], #16; + ldp x8, x10, [x1], #16; + ldp x9, x11, [x2], #16; + sub x3, x3, #4; + adcs x4, x4, x5; + adcs x6, x6, x7; + adcs x8, x8, x9; + adcs x10, x10, x11; + stp x4, x6, [x0], #16; + stp x8, x10, [x0], #16; + cbnz x3, .Large_loop; + +.Lend: + adc x0, xzr, xzr; + ret; +.size _gcry_mpih_add_n,.-_gcry_mpih_add_n; diff --git a/mpi/aarch64/mpih-mul1.S b/mpi/aarch64/mpih-mul1.S new file mode 100644 index 0000000..cbb333f --- /dev/null +++ b/mpi/aarch64/mpih-mul1.S @@ -0,0 +1,96 @@ +/* ARM64 mul_1 -- Multiply a limb vector with a limb and store the result in + * a second limb vector. + * + * Copyright (C) 2013 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ * + */ + +#include "sysdep.h" +#include "asm-syntax.h" + +/******************* + * mpi_limb_t + * _gcry_mpih_mul_1( mpi_ptr_t res_ptr, x0 + * mpi_ptr_t s1_ptr, x1 + * mpi_size_t s1_size, x2 + * mpi_limb_t s2_limb) x3 + */ + +.text + +.globl _gcry_mpih_mul_1 +.type _gcry_mpih_mul_1,%function +_gcry_mpih_mul_1: + and x5, x2, #3; + mov x4, xzr; + + cbz x5, .Large_loop; + +.Loop: + ldr x5, [x1], #8; + sub x2, x2, #1; + mul x9, x5, x3; + umulh x10, x5, x3; + and x5, x2, #3; + adds x4, x4, x9; + str x4, [x0], #8; + adc x4, x10, xzr; + + cbz x2, .Lend; + cbnz x5, .Loop; + +.Large_loop: + ldp x5, x6, [x1]; + sub x2, x2, #4; + + mul x9, x5, x3; + ldp x7, x8, [x1, #16]; + umulh x10, x5, x3; + add x1, x1, #32; + + adds x4, x4, x9; + str x4, [x0], #8; + mul x11, x6, x3; + adc x4, x10, xzr; + + umulh x12, x6, x3; + + adds x4, x4, x11; + str x4, [x0], #8; + mul x13, x7, x3; + adc x4, x12, xzr; + + umulh x14, x7, x3; + + adds x4, x4, x13; + str x4, [x0], #8; + mul x15, x8, x3; + adc x4, x14, xzr; + + umulh x16, x8, x3; + + adds x4, x4, x15; + str x4, [x0], #8; + adc x4, x16, xzr; + + cbnz x2, .Large_loop; + +.Lend: + mov x0, x4; + ret; +.size _gcry_mpih_mul_1,.-_gcry_mpih_mul_1; diff --git a/mpi/aarch64/mpih-mul2.S b/mpi/aarch64/mpih-mul2.S new file mode 100644 index 0000000..bfb3571 --- /dev/null +++ b/mpi/aarch64/mpih-mul2.S @@ -0,0 +1,108 @@ +/* ARM64 mul_2 -- Multiply a limb vector with a limb and add the result to + * a second limb vector. + * + * Copyright (C) 2013 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see <http://www.gnu.org/licenses/>. + * + */ + +#include "sysdep.h" +#include "asm-syntax.h" + +/******************* + * mpi_limb_t + * _gcry_mpih_addmul_1( mpi_ptr_t res_ptr, x0 + * mpi_ptr_t s1_ptr, x1 + * mpi_size_t s1_size, x2 + * mpi_limb_t s2_limb) x3 + */ + +.text + +.globl _gcry_mpih_addmul_1 +.type _gcry_mpih_addmul_1,%function +_gcry_mpih_addmul_1: + and x5, x2, #3; + mov x6, xzr; + mov x7, xzr; + + cbz x5, .Large_loop; + +.Loop: + ldr x5, [x1], #8; + + mul x12, x5, x3; + ldr x4, [x0]; + umulh x13, x5, x3; + sub x2, x2, #1; + + adds x12, x12, x4; + and x5, x2, #3; + adc x13, x13, x7; + adds x12, x12, x6; + str x12, [x0], #8; + adc x6, x7, x13; + + cbz x2, .Lend; + cbnz x5, .Loop; + +.Large_loop: + ldp x5, x9, [x1], #16; + sub x2, x2, #4; + ldp x4, x8, [x0]; + + mul x12, x5, x3; + umulh x13, x5, x3; + + adds x12, x12, x4; + mul x14, x9, x3; + adc x13, x13, x7; + adds x12, x12, x6; + umulh x15, x9, x3; + str x12, [x0], #8; + adc x6, x7, x13; + + adds x14, x14, x8; + ldp x5, x9, [x1], #16; + adc x15, x15, x7; + adds x14, x14, x6; + mul x12, x5, x3; + str x14, [x0], #8; + ldp x4, x8, [x0]; + umulh x13, x5, x3; + adc x6, x7, x15; + + adds x12, x12, x4; + mul x14, x9, x3; + adc x13, x13, x7; + adds x12, x12, x6; + umulh x15, x9, x3; + str x12, [x0], #8; + adc x6, x7, x13; + + adds x14, x14, x8; + adc x15, x15, x7; + adds x14, x14, x6; + str x14, [x0], #8; + adc x6, x7, x15; + + cbnz x2, .Large_loop; + +.Lend: + mov x0, x6; + ret; +.size _gcry_mpih_addmul_1,.-_gcry_mpih_addmul_1; diff --git a/mpi/aarch64/mpih-mul3.S b/mpi/aarch64/mpih-mul3.S new file mode 100644 index 0000000..6f12b7b --- /dev/null +++ b/mpi/aarch64/mpih-mul3.S @@ -0,0 +1,121 @@ +/* ARM64 mul_3 -- Multiply a limb vector with a limb and subtract the result + * from a second limb vector. + * + * Copyright (C)
2013 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see <http://www.gnu.org/licenses/>. + * + */ + +#include "sysdep.h" +#include "asm-syntax.h" + +/******************* + * mpi_limb_t + * _gcry_mpih_submul_1( mpi_ptr_t res_ptr, x0 + * mpi_ptr_t s1_ptr, x1 + * mpi_size_t s1_size, x2 + * mpi_limb_t s2_limb) x3 + */ + +.text + +.globl _gcry_mpih_submul_1 +.type _gcry_mpih_submul_1,%function +_gcry_mpih_submul_1: + and x5, x2, #3; + mov x7, xzr; + cbz x5, .Large_loop; + + subs xzr, xzr, xzr; + +.Loop: + ldr x4, [x1], #8; + cinc x7, x7, cc; + ldr x5, [x0]; + sub x2, x2, #1; + + mul x6, x4, x3; + subs x5, x5, x7; + umulh x4, x4, x3; + and x10, x2, #3; + + cset x7, cc; + subs x5, x5, x6; + add x7, x7, x4; + str x5, [x0], #8; + + cbz x2, .Loop_end; + cbnz x10, .Loop; + + cinc x7, x7, cc; + +.Large_loop: + ldp x4, x8, [x1], #16; + sub x2, x2, #4; + ldp x5, x9, [x0]; + + mul x6, x4, x3; + subs x5, x5, x7; + umulh x4, x4, x3; + + cset x7, cc; + subs x5, x5, x6; + mul x6, x8, x3; + add x7, x7, x4; + str x5, [x0], #8; + cinc x7, x7, cc; + + umulh x8, x8, x3; + + subs x9, x9, x7; + cset x7, cc; + subs x9, x9, x6; + ldp x4, x10, [x1], #16; + str x9, [x0], #8; + add x7, x7, x8; + ldp x5, x9, [x0]; + cinc x7, x7, cc; + + mul x6, x4, x3; + subs x5, x5, x7; + umulh x4, x4, x3; + + cset x7, cc; + subs x5, x5, x6; + mul x6, x10, x3; + add x7, x7, x4; + str x5, [x0], #8; + cinc
x7, x7, cc; + + umulh x10, x10, x3; + + subs x9, x9, x7; + cset x7, cc; + subs x9, x9, x6; + add x7, x7, x10; + str x9, [x0], #8; + cinc x7, x7, cc; + + cbnz x2, .Large_loop; + + mov x0, x7; + ret; + +.Loop_end: + cinc x0, x7, cc; + ret; +.size _gcry_mpih_submul_1,.-_gcry_mpih_submul_1; diff --git a/mpi/aarch64/mpih-sub1.S b/mpi/aarch64/mpih-sub1.S new file mode 100644 index 0000000..f18b1cd --- /dev/null +++ b/mpi/aarch64/mpih-sub1.S @@ -0,0 +1,71 @@ +/* ARM64 sub_n -- Subtract two limb vectors of the same length > 0 and store + * difference in a third limb vector. + * + * Copyright (C) 2013 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ * + */ + +#include "sysdep.h" +#include "asm-syntax.h" + +/******************* + * mpi_limb_t + * _gcry_mpih_sub_n( mpi_ptr_t res_ptr, x0 + * mpi_ptr_t s1_ptr, x1 + * mpi_ptr_t s2_ptr, x2 + * mpi_size_t size) x3 + */ + +.text + +.globl _gcry_mpih_sub_n +.type _gcry_mpih_sub_n,%function +_gcry_mpih_sub_n: + and x5, x3, #3; + subs xzr, xzr, xzr; /* prepare carry flag for sub */ + + cbz x5, .Large_loop; + +.Loop: + ldr x4, [x1], #8; + sub x3, x3, #1; + ldr x11, [x2], #8; + and x5, x3, #3; + sbcs x4, x4, x11; + str x4, [x0], #8; + cbz x3, .Lend; + cbnz x5, .Loop; + +.Large_loop: + ldp x4, x6, [x1], #16; + ldp x5, x7, [x2], #16; + ldp x8, x10, [x1], #16; + ldp x9, x11, [x2], #16; + sub x3, x3, #4; + sbcs x4, x4, x5; + sbcs x6, x6, x7; + sbcs x8, x8, x9; + sbcs x10, x10, x11; + stp x4, x6, [x0], #16; + stp x8, x10, [x0], #16; + cbnz x3, .Large_loop; + +.Lend: + cset x0, cc; + ret; +.size _gcry_mpih_sub_n,.-_gcry_mpih_sub_n; diff --git a/mpi/config.links b/mpi/config.links index 90d1077..a79b03b 100644 --- a/mpi/config.links +++ b/mpi/config.links @@ -136,6 +136,11 @@ case "${host}" in mpi_extra_modules="udiv-qrnnd" mpi_cpu_arch="alpha" ;; + aarch64-*-*) + echo '/* configured for aarch64 */' >>./mpi/asm-syntax.h + path="aarch64" + mpi_cpu_arch="aarch64" + ;; arm*-*-*) if test "$gcry_cv_gcc_arm_platform_as_ok" = "yes" ; then echo '/* configured for arm */' >>./mpi/asm-syntax.h diff --git a/mpi/longlong.h b/mpi/longlong.h index 146830b..8dd8fe8 100644 --- a/mpi/longlong.h +++ b/mpi/longlong.h @@ -268,6 +268,44 @@ extern UDItype __udiv_qrnnd (); #endif /* __arm__ */ /*************************************** + ********** ARM64 / Aarch64 ********** + ***************************************/ +#if defined(__aarch64__) && W_TYPE_SIZE == 64 +#define add_ssaaaa(sh, sl, ah, al, bh, bl) \ + __asm__ ("adds %1, %4, %5\n" \ + "adc %0, %2, %3\n" \ + : "=r" ((sh)), \ + "=&r" ((sl)) \ + : "r" ((UDItype)(ah)), \ + "r" ((UDItype)(bh)), \ + "r" ((UDItype)(al)), \ + "r" ((UDItype)(bl)) 
__CLOBBER_CC) +#define sub_ddmmss(sh, sl, ah, al, bh, bl) \ + __asm__ ("subs %1, %4, %5\n" \ + "sbc %0, %2, %3\n" \ + : "=r" ((sh)), \ + "=&r" ((sl)) \ + : "r" ((UDItype)(ah)), \ + "r" ((UDItype)(bh)), \ + "r" ((UDItype)(al)), \ + "r" ((UDItype)(bl)) __CLOBBER_CC) +#define umul_ppmm(ph, pl, m0, m1) \ + do { \ + UDItype __m0 = (m0), __m1 = (m1), __ph; \ + (pl) = __m0 * __m1; \ + __asm__ ("umulh %0,%1,%2" \ + : "=r" (__ph) \ + : "r" (__m0), \ + "r" (__m1)); \ + (ph) = __ph; \ + } while (0) +#define count_leading_zeros(count, x) \ + __asm__ ("clz %0, %1\n" \ + : "=r" ((count)) \ + : "r" ((UDItype)(x))) +#endif /* __aarch64__ */ + +/*************************************** ************** CLIPPER ************** ***************************************/ #if defined (__clipper__) && W_TYPE_SIZE == 32 ----------------------------------------------------------------------- Summary of changes: cipher/rijndael.c | 8 ++- mpi/{amd64 => aarch64}/mpi-asm-defs.h | 0 mpi/{arm => aarch64}/mpih-add1.S | 67 +++++++++--------- mpi/{arm => aarch64}/mpih-mul1.S | 96 +++++++++++++++----------- mpi/aarch64/mpih-mul2.S | 108 +++++++++++++++++++++++++++++ mpi/aarch64/mpih-mul3.S | 121 +++++++++++++++++++++++++++++++++ mpi/{arm => aarch64}/mpih-sub1.S | 68 +++++++++--------- mpi/config.links | 5 ++ mpi/longlong.h | 38 +++++++++++ 9 files changed, 396 insertions(+), 115 deletions(-) copy mpi/{amd64 => aarch64}/mpi-asm-defs.h (100%) copy mpi/{arm => aarch64}/mpih-add1.S (53%) copy mpi/{arm => aarch64}/mpih-mul1.S (50%) create mode 100644 mpi/aarch64/mpih-mul2.S create mode 100644 mpi/aarch64/mpih-mul3.S copy mpi/{arm => aarch64}/mpih-sub1.S (52%) hooks/post-receive -- The GNU crypto library http://git.gnupg.org _______________________________________________ Gnupg-commits mailing list Gnupg-commits at gnupg.org http://lists.gnupg.org/mailman/listinfo/gnupg-commits From jussi.kivilinna at iki.fi Tue Dec 3 13:42:31 2013 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Tue, 03 Dec 2013 
14:42:31 +0200 Subject: [PATCH] mpi: add inline assembly for x86-64 Message-ID: <20131203124231.13444.45353.stgit@localhost6.localdomain6> * mpi/longlong.h [__x86_64] (add_ssaaaa, sub_ddmmss, umul_ppmm) (udiv_qrnnd, count_leading_zeros, count_trailing_zeros): New. -- Signed-off-by: Jussi Kivilinna --- mpi/longlong.h | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/mpi/longlong.h b/mpi/longlong.h index 8dd8fe8..4f33937 100644 --- a/mpi/longlong.h +++ b/mpi/longlong.h @@ -555,6 +555,69 @@ extern USItype __udiv_qrnnd (); #endif #endif /* 80x86 */ +/*************************************** + *********** AMD64 / x86-64 ************ + ***************************************/ +#if defined(__x86_64) && W_TYPE_SIZE == 64 +#define add_ssaaaa(sh, sl, ah, al, bh, bl) \ + __asm__ ("addq %5,%1\n" \ + "adcq %3,%0" \ + : "=r" ((sh)), \ + "=&r" ((sl)) \ + : "0" ((UDItype)(ah)), \ + "g" ((UDItype)(bh)), \ + "1" ((UDItype)(al)), \ + "g" ((UDItype)(bl)) \ + __CLOBBER_CC) +#define sub_ddmmss(sh, sl, ah, al, bh, bl) \ + __asm__ ("subq %5,%1\n" \ + "sbbq %3,%0" \ + : "=r" ((sh)), \ + "=&r" ((sl)) \ + : "0" ((UDItype)(ah)), \ + "g" ((UDItype)(bh)), \ + "1" ((UDItype)(al)), \ + "g" ((UDItype)(bl)) \ + __CLOBBER_CC) +#define umul_ppmm(w1, w0, u, v) \ + __asm__ ("mulq %3" \ + : "=a" ((w0)), \ + "=d" ((w1)) \ + : "0" ((UDItype)(u)), \ + "rm" ((UDItype)(v)) \ + __CLOBBER_CC) +#define udiv_qrnnd(q, r, n1, n0, d) \ + __asm__ ("divq %4" \ + : "=a" ((q)), \ + "=d" ((r)) \ + : "0" ((UDItype)(n0)), \ + "1" ((UDItype)(n1)), \ + "rm" ((UDItype)(d)) \ + __CLOBBER_CC) +#define count_leading_zeros(count, x) \ + do { \ + UDItype __cbtmp; \ + __asm__ ("bsrq %1,%0" \ + : "=r" (__cbtmp) : "rm" ((UDItype)(x)) \ + __CLOBBER_CC); \ + (count) = __cbtmp ^ 63; \ + } while (0) +#define count_trailing_zeros(count, x) \ + do { \ + UDItype __cbtmp; \ + __asm__ ("bsfq %1,%0" \ + : "=r" (__cbtmp) : "rm" ((UDItype)(x)) \ + __CLOBBER_CC); \ + (count) = __cbtmp; \ + 
} while (0) +#ifndef UMUL_TIME +#define UMUL_TIME 40 +#endif +#ifndef UDIV_TIME +#define UDIV_TIME 40 +#endif +#endif /* __x86_64 */ + /*************************************** ************** I860 ***************** From cvs at cvs.gnupg.org Tue Dec 3 13:51:10 2013 From: cvs at cvs.gnupg.org (by Werner Koch) Date: Tue, 03 Dec 2013 13:51:10 +0100 Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.5.0-414-g2ff86db Message-ID: This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "The GNU crypto library". The branch, master has been updated via 2ff86db2e1b0f6cc22a1ca86037b526c5fa3be51 (commit) from 59b1a1b7ee2923e1bf091071ae716d180c6c6006 (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. - Log ----------------------------------------------------------------- commit 2ff86db2e1b0f6cc22a1ca86037b526c5fa3be51 Author: Werner Koch Date: Tue Oct 22 14:26:53 2013 +0200 Add build support for ppc64le. * config.guess, config.sub: Update to latest version (2013-11-29). * m4/libtool.m4: Add patches for ppc64le. -- We don't want to update libtool, thus we use patches supplied by IBM. Signed-off-by: Werner Koch diff --git a/config.guess b/config.guess index ad5f74a..9afd676 100755 --- a/config.guess +++ b/config.guess @@ -1,14 +1,12 @@ #! /bin/sh # Attempt to guess a canonical system name. -# Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, -# 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, -# 2011, 2012 Free Software Foundation, Inc. +# Copyright 1992-2013 Free Software Foundation, Inc. 
-timestamp='2012-07-31' +timestamp='2013-11-29' # This file is free software; you can redistribute it and/or modify it # under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or +# the Free Software Foundation; either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, but @@ -22,19 +20,17 @@ timestamp='2012-07-31' # As a special exception to the GNU General Public License, if you # distribute this file as part of a program that contains a # configuration script generated by Autoconf, you may include it under -# the same distribution terms that you use for the rest of that program. - - -# Originally written by Per Bothner. Please send patches (context -# diff format) to and include a ChangeLog -# entry. +# the same distribution terms that you use for the rest of that +# program. This Exception is an additional permission under section 7 +# of the GNU General Public License, version 3 ("GPLv3"). # -# This script attempts to guess a canonical system name similar to -# config.sub. If it succeeds, it prints the system name on stdout, and -# exits with 0. Otherwise, it exits with 1. +# Originally written by Per Bothner. # # You can get the latest version of this script from: # http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess;hb=HEAD +# +# Please send patches with a ChangeLog entry to config-patches at gnu.org. + me=`echo "$0" | sed -e 's,.*/,,'` @@ -54,9 +50,7 @@ version="\ GNU config.guess ($timestamp) Originally written by Per Bothner. -Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, -2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012 -Free Software Foundation, Inc. +Copyright 1992-2013 Free Software Foundation, Inc. This is free software; see the source for copying conditions. 
There is NO warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." @@ -138,6 +132,27 @@ UNAME_RELEASE=`(uname -r) 2>/dev/null` || UNAME_RELEASE=unknown UNAME_SYSTEM=`(uname -s) 2>/dev/null` || UNAME_SYSTEM=unknown UNAME_VERSION=`(uname -v) 2>/dev/null` || UNAME_VERSION=unknown +case "${UNAME_SYSTEM}" in +Linux|GNU|GNU/*) + # If the system lacks a compiler, then just pick glibc. + # We could probably try harder. + LIBC=gnu + + eval $set_cc_for_build + cat <<-EOF > $dummy.c + #include <features.h> + #if defined(__UCLIBC__) + LIBC=uclibc + #elif defined(__dietlibc__) + LIBC=dietlibc + #else + LIBC=gnu + #endif + EOF + eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^LIBC'` + ;; +esac + # Note: order is significant - the case branches are not exclusive. case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in @@ -306,7 +321,7 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in arm:RISC*:1.[012]*:*|arm:riscix:1.[012]*:*) echo arm-acorn-riscix${UNAME_RELEASE} exit ;; - arm:riscos:*:*|arm:RISCOS:*:*) + arm*:riscos:*:*|arm*:RISCOS:*:*) echo arm-unknown-riscos exit ;; SR2?01:HI-UX/MPP:*:* | SR8000:HI-UX/MPP:*:*) @@ -859,21 +874,21 @@ EOF exit ;; *:GNU:*:*) # the GNU system - echo `echo ${UNAME_MACHINE}|sed -e 's,[-/].*$,,'`-unknown-gnu`echo ${UNAME_RELEASE}|sed -e 's,/.*$,,'` + echo `echo ${UNAME_MACHINE}|sed -e 's,[-/].*$,,'`-unknown-${LIBC}`echo ${UNAME_RELEASE}|sed -e 's,/.*$,,'` exit ;; *:GNU/*:*:*) # other systems with GNU libc and userland - echo ${UNAME_MACHINE}-unknown-`echo ${UNAME_SYSTEM} | sed 's,^[^/]*/,,' | tr '[A-Z]' '[a-z]'``echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`-gnu + echo ${UNAME_MACHINE}-unknown-`echo ${UNAME_SYSTEM} | sed 's,^[^/]*/,,' | tr '[A-Z]' '[a-z]'``echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`-${LIBC} exit ;; i*86:Minix:*:*) echo ${UNAME_MACHINE}-pc-minix exit ;; aarch64:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} exit ;;
aarch64_be:Linux:*:*) UNAME_MACHINE=aarch64_be - echo ${UNAME_MACHINE}-unknown-linux-gnu + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} exit ;; alpha:Linux:*:*) case `sed -n '/^cpu model/s/^.*: $.*$/\1/p' < /proc/cpuinfo` in @@ -886,59 +901,54 @@ EOF EV68*) UNAME_MACHINE=alphaev68 ;; esac objdump --private-headers /bin/sh | grep -q ld.so.1 - if test "$?" = 0 ; then LIBC="libc1" ; else LIBC="" ; fi - echo ${UNAME_MACHINE}-unknown-linux-gnu${LIBC} + if test "$?" = 0 ; then LIBC="gnulibc1" ; fi + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + exit ;; + arc:Linux:*:* | arceb:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} exit ;; arm*:Linux:*:*) eval $set_cc_for_build if echo __ARM_EABI__ | $CC_FOR_BUILD -E - 2>/dev/null \ | grep -q __ARM_EABI__ then - echo ${UNAME_MACHINE}-unknown-linux-gnu + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} else if echo __ARM_PCS_VFP | $CC_FOR_BUILD -E - 2>/dev/null \ | grep -q __ARM_PCS_VFP then - echo ${UNAME_MACHINE}-unknown-linux-gnueabi + echo ${UNAME_MACHINE}-unknown-linux-${LIBC}eabi else - echo ${UNAME_MACHINE}-unknown-linux-gnueabihf + echo ${UNAME_MACHINE}-unknown-linux-${LIBC}eabihf fi fi exit ;; avr32*:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} exit ;; cris:Linux:*:*) - echo ${UNAME_MACHINE}-axis-linux-gnu + echo ${UNAME_MACHINE}-axis-linux-${LIBC} exit ;; crisv32:Linux:*:*) - echo ${UNAME_MACHINE}-axis-linux-gnu + echo ${UNAME_MACHINE}-axis-linux-${LIBC} exit ;; frv:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} exit ;; hexagon:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} exit ;; i*86:Linux:*:*) - LIBC=gnu - eval $set_cc_for_build - sed 's/^ //' << EOF >$dummy.c - #ifdef __dietlibc__ - LIBC=dietlibc - #endif -EOF - eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^LIBC'` - echo "${UNAME_MACHINE}-pc-linux-${LIBC}" + echo ${UNAME_MACHINE}-pc-linux-${LIBC} exit 
;; ia64:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} exit ;; m32r*:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} exit ;; m68*:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} exit ;; mips:Linux:*:* | mips64:Linux:*:*) eval $set_cc_for_build @@ -957,54 +967,63 @@ EOF #endif EOF eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^CPU'` - test x"${CPU}" != x && { echo "${CPU}-unknown-linux-gnu"; exit; } + test x"${CPU}" != x && { echo "${CPU}-unknown-linux-${LIBC}"; exit; } ;; + or1k:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + exit ;; or32:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} exit ;; padre:Linux:*:*) - echo sparc-unknown-linux-gnu + echo sparc-unknown-linux-${LIBC} exit ;; parisc64:Linux:*:* | hppa64:Linux:*:*) - echo hppa64-unknown-linux-gnu + echo hppa64-unknown-linux-${LIBC} exit ;; parisc:Linux:*:* | hppa:Linux:*:*) # Look for CPU level case `grep '^cpu[^a-z]*:' /proc/cpuinfo 2>/dev/null | cut -d' ' -f2` in - PA7*) echo hppa1.1-unknown-linux-gnu ;; - PA8*) echo hppa2.0-unknown-linux-gnu ;; - *) echo hppa-unknown-linux-gnu ;; + PA7*) echo hppa1.1-unknown-linux-${LIBC} ;; + PA8*) echo hppa2.0-unknown-linux-${LIBC} ;; + *) echo hppa-unknown-linux-${LIBC} ;; esac exit ;; ppc64:Linux:*:*) - echo powerpc64-unknown-linux-gnu + echo powerpc64-unknown-linux-${LIBC} exit ;; ppc:Linux:*:*) - echo powerpc-unknown-linux-gnu + echo powerpc-unknown-linux-${LIBC} + exit ;; + ppc64le:Linux:*:*) + echo powerpc64le-unknown-linux-${LIBC} + exit ;; + ppcle:Linux:*:*) + echo powerpcle-unknown-linux-${LIBC} exit ;; s390:Linux:*:* | s390x:Linux:*:*) - echo ${UNAME_MACHINE}-ibm-linux + echo ${UNAME_MACHINE}-ibm-linux-${LIBC} exit ;; sh64*:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} exit ;; sh*:Linux:*:*) 
- echo ${UNAME_MACHINE}-unknown-linux-gnu + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} exit ;; sparc:Linux:*:* | sparc64:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} exit ;; tile*:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} exit ;; vax:Linux:*:*) - echo ${UNAME_MACHINE}-dec-linux-gnu + echo ${UNAME_MACHINE}-dec-linux-${LIBC} exit ;; x86_64:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} exit ;; xtensa*:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} exit ;; i*86:DYNIX/ptx:4*:*) # ptx 4.0 does uname -s correctly, with DYNIX/ptx in there. @@ -1208,6 +1227,9 @@ EOF BePC:Haiku:*:*) # Haiku running on Intel PC compatible. echo i586-pc-haiku exit ;; + x86_64:Haiku:*:*) + echo x86_64-unknown-haiku + exit ;; SX-4:SUPER-UX:*:*) echo sx4-nec-superux${UNAME_RELEASE} exit ;; @@ -1234,19 +1256,31 @@ EOF exit ;; *:Darwin:*:*) UNAME_PROCESSOR=`uname -p` || UNAME_PROCESSOR=unknown - case $UNAME_PROCESSOR in - i386) - eval $set_cc_for_build - if [ "$CC_FOR_BUILD" != 'no_compiler_found' ]; then - if (echo '#ifdef __LP64__'; echo IS_64BIT_ARCH; echo '#endif') | \ - (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | \ - grep IS_64BIT_ARCH >/dev/null - then - UNAME_PROCESSOR="x86_64" - fi - fi ;; - unknown) UNAME_PROCESSOR=powerpc ;; - esac + eval $set_cc_for_build + if test "$UNAME_PROCESSOR" = unknown ; then + UNAME_PROCESSOR=powerpc + fi + if test `echo "$UNAME_RELEASE" | sed -e 's/\..*//'` -le 10 ; then + if [ "$CC_FOR_BUILD" != 'no_compiler_found' ]; then + if (echo '#ifdef __LP64__'; echo IS_64BIT_ARCH; echo '#endif') | \ + (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | \ + grep IS_64BIT_ARCH >/dev/null + then + case $UNAME_PROCESSOR in + i386) UNAME_PROCESSOR=x86_64 ;; + powerpc) UNAME_PROCESSOR=powerpc64 ;; + esac + fi + fi + elif test "$UNAME_PROCESSOR" = i386 ; then + # Avoid 
executing cc on OS X 10.9, as it ships with a stub + # that puts up a graphical alert prompting to install + # developer tools. Any system running Mac OS X 10.7 or + # later (Darwin 11 and later) is required to have a 64-bit + # processor. This is not true of the ARM version of Darwin + # that Apple uses in portable devices. + UNAME_PROCESSOR=x86_64 + fi echo ${UNAME_PROCESSOR}-apple-darwin${UNAME_RELEASE} exit ;; *:procnto*:*:* | *:QNX:[0123456789]*:*) diff --git a/config.sub b/config.sub index b15df57..61cb4bc 100755 --- a/config.sub +++ b/config.sub @@ -1,24 +1,18 @@ #! /bin/sh # Configuration validation subroutine script. -# Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, -# 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, -# 2011, 2012 Free Software Foundation, Inc. +# Copyright 1992-2013 Free Software Foundation, Inc. -timestamp='2012-07-31' +timestamp='2013-10-01' -# This file is (in principle) common to ALL GNU software. -# The presence of a machine in this file suggests that SOME GNU software -# can handle that machine. It does not imply ALL GNU software can. -# -# This file is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or +# This file is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or # (at your option) any later version. # -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. 
+# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, see . @@ -26,11 +20,12 @@ timestamp='2012-07-31' # As a special exception to the GNU General Public License, if you # distribute this file as part of a program that contains a # configuration script generated by Autoconf, you may include it under -# the same distribution terms that you use for the rest of that program. +# the same distribution terms that you use for the rest of that +# program. This Exception is an additional permission under section 7 +# of the GNU General Public License, version 3 ("GPLv3"). -# Please send patches to . Submit a context -# diff and a properly formatted GNU ChangeLog entry. +# Please send patches with a ChangeLog entry to config-patches at gnu.org. # # Configuration subroutine to validate and canonicalize a configuration type. # Supply the specified configuration type as an argument. @@ -73,9 +68,7 @@ Report bugs and patches to ." version="\ GNU config.sub ($timestamp) -Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, -2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012 -Free Software Foundation, Inc. +Copyright 1992-2013 Free Software Foundation, Inc. This is free software; see the source for copying conditions. There is NO warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." 
@@ -123,7 +116,7 @@ esac maybe_os=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\2/'` case $maybe_os in nto-qnx* | linux-gnu* | linux-android* | linux-dietlibc | linux-newlib* | \ - linux-uclibc* | uclinux-uclibc* | uclinux-gnu* | kfreebsd*-gnu* | \ + linux-musl* | linux-uclibc* | uclinux-uclibc* | uclinux-gnu* | kfreebsd*-gnu* | \ knetbsd*-gnu* | netbsd*-gnu* | \ kopensolaris*-gnu* | \ storm-chaos* | os2-emx* | rtmk-nova*) @@ -156,7 +149,7 @@ case $os in -convergent* | -ncr* | -news | -32* | -3600* | -3100* | -hitachi* |\ -c[123]* | -convex* | -sun | -crds | -omron* | -dg | -ultra | -tti* | \ -harris | -dolphin | -highlevel | -gould | -cbm | -ns | -masscomp | \ - -apple | -axis | -knuth | -cray | -microblaze) + -apple | -axis | -knuth | -cray | -microblaze*) os= basic_machine=$1 ;; @@ -259,10 +252,12 @@ case $basic_machine in | alpha | alphaev[4-8] | alphaev56 | alphaev6[78] | alphapca5[67] \ | alpha64 | alpha64ev[4-8] | alpha64ev56 | alpha64ev6[78] | alpha64pca5[67] \ | am33_2.0 \ - | arc | arm | arm[bl]e | arme[lb] | armv[2345] | armv[345][lb] | avr | avr32 \ - | be32 | be64 \ + | arc | arceb \ + | arm | arm[bl]e | arme[lb] | armv[2-8] | armv[3-8][lb] | armv7[arm] \ + | avr | avr32 \ + | be32 | be64 \ | bfin \ - | c4x | clipper \ + | c4x | c8051 | clipper \ | d10v | d30v | dlx | dsp16xx \ | epiphany \ | fido | fr30 | frv \ @@ -270,10 +265,11 @@ case $basic_machine in | hexagon \ | i370 | i860 | i960 | ia64 \ | ip2k | iq2000 \ + | k1om \ | le32 | le64 \ | lm32 \ | m32c | m32r | m32rle | m68000 | m68k | m88k \ - | maxq | mb | microblaze | mcore | mep | metag \ + | maxq | mb | microblaze | microblazeel | mcore | mep | metag \ | mips | mipsbe | mipseb | mipsel | mipsle \ | mips16 \ | mips64 | mips64el \ @@ -291,16 +287,17 @@ case $basic_machine in | mipsisa64r2 | mipsisa64r2el \ | mipsisa64sb1 | mipsisa64sb1el \ | mipsisa64sr71k | mipsisa64sr71kel \ + | mipsr5900 | mipsr5900el \ | mipstx39 | mipstx39el \ | mn10200 | mn10300 \ | moxie \ | mt \ | msp430 \ | nds32 | nds32le |
nds32be \ - | nios | nios2 \ + | nios | nios2 | nios2eb | nios2el \ | ns16k | ns32k \ | open8 \ - | or32 \ + | or1k | or32 \ | pdp10 | pdp11 | pj | pjl \ | powerpc | powerpc64 | powerpc64le | powerpcle \ | pyramid \ @@ -328,7 +325,7 @@ case $basic_machine in c6x) basic_machine=tic6x-unknown ;; - m6811 | m68hc11 | m6812 | m68hc12 | m68hcs12x | picochip) + m6811 | m68hc11 | m6812 | m68hc12 | m68hcs12x | nvptx | picochip) basic_machine=$basic_machine-unknown os=-none ;; @@ -370,13 +367,13 @@ case $basic_machine in | aarch64-* | aarch64_be-* \ | alpha-* | alphaev[4-8]-* | alphaev56-* | alphaev6[78]-* \ | alpha64-* | alpha64ev[4-8]-* | alpha64ev56-* | alpha64ev6[78]-* \ - | alphapca5[67]-* | alpha64pca5[67]-* | arc-* \ + | alphapca5[67]-* | alpha64pca5[67]-* | arc-* | arceb-* \ | arm-* | armbe-* | armle-* | armeb-* | armv*-* \ | avr-* | avr32-* \ | be32-* | be64-* \ | bfin-* | bs2000-* \ | c[123]* | c30-* | [cjt]90-* | c4x-* \ - | clipper-* | craynv-* | cydra-* \ + | c8051-* | clipper-* | craynv-* | cydra-* \ | d10v-* | d30v-* | dlx-* \ | elxsi-* \ | f30[01]-* | f700-* | fido-* | fr30-* | frv-* | fx80-* \ @@ -385,11 +382,13 @@ case $basic_machine in | hexagon-* \ | i*86-* | i860-* | i960-* | ia64-* \ | ip2k-* | iq2000-* \ + | k1om-* \ | le32-* | le64-* \ | lm32-* \ | m32c-* | m32r-* | m32rle-* \ | m68000-* | m680[012346]0-* | m68360-* | m683?2-* | m68k-* \ - | m88110-* | m88k-* | maxq-* | mcore-* | metag-* | microblaze-* \ + | m88110-* | m88k-* | maxq-* | mcore-* | metag-* \ + | microblaze-* | microblazeel-* \ | mips-* | mipsbe-* | mipseb-* | mipsel-* | mipsle-* \ | mips16-* \ | mips64-* | mips64el-* \ @@ -407,12 +406,13 @@ case $basic_machine in | mipsisa64r2-* | mipsisa64r2el-* \ | mipsisa64sb1-* | mipsisa64sb1el-* \ | mipsisa64sr71k-* | mipsisa64sr71kel-* \ + | mipsr5900-* | mipsr5900el-* \ | mipstx39-* | mipstx39el-* \ | mmix-* \ | mt-* \ | msp430-* \ | nds32-* | nds32le-* | nds32be-* \ - | nios-* | nios2-* \ + | nios-* | nios2-* | nios2eb-* | nios2el-* \ | none-* | 
np1-* | ns16k-* | ns32k-* \ | open8-* \ | orion-* \ @@ -788,7 +788,7 @@ case $basic_machine in basic_machine=ns32k-utek os=-sysv ;; - microblaze) + microblaze*) basic_machine=microblaze-xilinx ;; mingw64) @@ -796,7 +796,7 @@ case $basic_machine in os=-mingw64 ;; mingw32) - basic_machine=i386-pc + basic_machine=i686-pc os=-mingw32 ;; mingw32ce) @@ -832,7 +832,7 @@ case $basic_machine in basic_machine=`echo $basic_machine | sed -e 's/ms1-/mt-/'` ;; msys) - basic_machine=i386-pc + basic_machine=i686-pc os=-msys ;; mvs) @@ -1023,7 +1023,11 @@ case $basic_machine in basic_machine=i586-unknown os=-pw32 ;; - rdos) + rdos | rdos64) + basic_machine=x86_64-pc + os=-rdos + ;; + rdos32) basic_machine=i386-pc os=-rdos ;; @@ -1350,7 +1354,7 @@ case $os in -gnu* | -bsd* | -mach* | -minix* | -genix* | -ultrix* | -irix* \ | -*vms* | -sco* | -esix* | -isc* | -aix* | -cnk* | -sunos | -sunos[34]*\ | -hpux* | -unos* | -osf* | -luna* | -dgux* | -auroraux* | -solaris* \ - | -sym* | -kopensolaris* \ + | -sym* | -kopensolaris* | -plan9* \ | -amigaos* | -amigados* | -msdos* | -newsos* | -unicos* | -aof* \ | -aos* | -aros* \ | -nindy* | -vxsim* | -vxworks* | -ebmon* | -hms* | -mvs* \ @@ -1364,7 +1368,7 @@ case $os in | -chorusos* | -chorusrdb* | -cegcc* \ | -cygwin* | -msys* | -pe* | -psos* | -moss* | -proelf* | -rtems* \ | -mingw32* | -mingw64* | -linux-gnu* | -linux-android* \ - | -linux-newlib* | -linux-uclibc* \ + | -linux-newlib* | -linux-musl* | -linux-uclibc* \ | -uxpv* | -beos* | -mpeix* | -udk* \ | -interix* | -uwin* | -mks* | -rhapsody* | -darwin* | -opened* \ | -openstep* | -oskit* | -conix* | -pw32* | -nonstopux* \ @@ -1496,9 +1500,6 @@ case $os in -aros*) os=-aros ;; - -kaos*) - os=-kaos - ;; -zvmoe) os=-zvmoe ;; @@ -1547,6 +1548,9 @@ case $basic_machine in c4x-* | tic4x-*) os=-coff ;; + c8051-*) + os=-elf + ;; hexagon-*) os=-elf ;; @@ -1590,6 +1594,9 @@ case $basic_machine in mips*-*) os=-elf ;; + or1k-*) + os=-elf + ;; or32-*) os=-coff ;; diff --git a/m4/libtool.m4 
b/m4/libtool.m4 index ff871a0..f0a2752 100644 --- a/m4/libtool.m4 +++ b/m4/libtool.m4 @@ -1326,7 +1326,10 @@ s390*-*linux*|s390*-*tpf*|sparc*-*linux*) x86_64-*linux*) LD="${LD-ld} -m elf_i386" ;; - ppc64-*linux*|powerpc64-*linux*) + powerpc64le-*) + LD="${LD-ld} -m elf32lppclinux" + ;; + powerpc64-*) LD="${LD-ld} -m elf32ppclinux" ;; s390x-*linux*) @@ -1345,7 +1348,10 @@ s390*-*linux*|s390*-*tpf*|sparc*-*linux*) x86_64-*linux*) LD="${LD-ld} -m elf_x86_64" ;; - ppc*-*linux*|powerpc*-*linux*) + powerpcle-*) + LD="${LD-ld} -m elf64lppc" + ;; + powerpc-*) LD="${LD-ld} -m elf64ppc" ;; s390*-*linux*|s390*-*tpf*) ----------------------------------------------------------------------- Summary of changes: config.guess | 182 ++++++++++++++++++++++++++++++++++----------------------- config.sub | 93 +++++++++++++++-------------- m4/libtool.m4 | 10 +++- 3 files changed, 166 insertions(+), 119 deletions(-) hooks/post-receive -- The GNU crypto library http://git.gnupg.org _______________________________________________ Gnupg-commits mailing list Gnupg-commits at gnupg.org http://lists.gnupg.org/mailman/listinfo/gnupg-commits From cvs at cvs.gnupg.org Wed Dec 4 04:02:01 2013 From: cvs at cvs.gnupg.org (by NIIBE Yutaka) Date: Wed, 04 Dec 2013 04:02:01 +0100 Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.5.0-415-gc56080c Message-ID: This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "The GNU crypto library". The branch, master has been updated via c56080c26186d25dec05f01831494c77d8d07e13 (commit) from 2ff86db2e1b0f6cc22a1ca86037b526c5fa3be51 (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. 
- Log ----------------------------------------------------------------- commit c56080c26186d25dec05f01831494c77d8d07e13 Author: NIIBE Yutaka Date: Wed Dec 4 10:03:57 2013 +0900 mpi: fix gcry_mpi_powm for negative base. * mpi/mpi-pow.c (gcry_mpi_powm) [USE_ALGORITHM_SIMPLE_EXPONENTIATION]: Fix for the case where BASE is negative. * tests/mpitests.c (test_powm): Add a test case of (-17)^6 mod 19. Signed-off-by: NIIBE Yutaka diff --git a/mpi/mpi-pow.c b/mpi/mpi-pow.c index 469c382..4bf0233 100644 --- a/mpi/mpi-pow.c +++ b/mpi/mpi-pow.c @@ -177,7 +177,7 @@ gcry_mpi_powm (gcry_mpi_t res, } MPN_COPY ( rp, bp, bsize ); rsize = bsize; - rsign = bsign; + rsign = 0; /* Main processing. */ { @@ -192,7 +192,7 @@ gcry_mpi_powm (gcry_mpi_t res, xp = xp_marker = mpi_alloc_limb_space( 2 * (msize + 1), msec ); memset( &karactx, 0, sizeof karactx ); - negative_result = (ep[0] & 1) && base->sign; + negative_result = (ep[0] & 1) && bsign; i = esize - 1; e = ep[i]; diff --git a/tests/mpitests.c b/tests/mpitests.c index 1c4edb6..9d1206e 100644 --- a/tests/mpitests.c +++ b/tests/mpitests.c @@ -520,6 +520,25 @@ test_powm (void) if (gcry_mpi_cmp (res, base)) die ("test_powm failed at %d\n", __LINE__); + /* Check for a case: base is negative and expo is even. */ + gcry_mpi_set_ui (base, b_int); + gcry_mpi_neg (base, base); + gcry_mpi_set_ui (exp, e_int * 2); + gcry_mpi_set_ui(mod, m_int); + gcry_mpi_powm (res, base, exp, mod); + /* Result should be positive and it's 7 = (-17)^6 mod 19. 
*/ + if (gcry_mpi_is_neg (res) || gcry_mpi_cmp_ui (res, 7)) + { + if (verbose) + { + fprintf (stderr, "is_neg: %d\n", gcry_mpi_is_neg (res)); + fprintf (stderr, "mpi: "); + gcry_mpi_dump (res); + putc ('\n', stderr); + } + die ("test_powm failed for negative base at %d\n", __LINE__); + } + gcry_mpi_release (base); gcry_mpi_release (exp); gcry_mpi_release (mod); ----------------------------------------------------------------------- Summary of changes: mpi/mpi-pow.c | 4 ++-- tests/mpitests.c | 19 +++++++++++++++++++ 2 files changed, 21 insertions(+), 2 deletions(-) hooks/post-receive -- The GNU crypto library http://git.gnupg.org _______________________________________________ Gnupg-commits mailing list Gnupg-commits at gnupg.org http://lists.gnupg.org/mailman/listinfo/gnupg-commits From cvs at cvs.gnupg.org Wed Dec 4 17:23:07 2013 From: cvs at cvs.gnupg.org (by Jussi Kivilinna) Date: Wed, 04 Dec 2013 17:23:07 +0100 Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.5.0-416-g85bb0a9 Message-ID: This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "The GNU crypto library". The branch, master has been updated via 85bb0a98ea5add0296cbcc415d557eaa1f6bd294 (commit) from c56080c26186d25dec05f01831494c77d8d07e13 (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. - Log ----------------------------------------------------------------- commit 85bb0a98ea5add0296cbcc415d557eaa1f6bd294 Author: Jussi Kivilinna Date: Wed Dec 4 18:17:22 2013 +0200 mpi: add inline assembly for x86-64 * mpi/longlong.h [__x86_64] (add_ssaaaa, sub_ddmmss, umul_ppmm) (udiv_qrnnd, count_leading_zeros, count_trailing_zeros): New. 
-- Signed-off-by: Jussi Kivilinna diff --git a/mpi/longlong.h b/mpi/longlong.h index 8dd8fe8..4f33937 100644 --- a/mpi/longlong.h +++ b/mpi/longlong.h @@ -555,6 +555,69 @@ extern USItype __udiv_qrnnd (); #endif #endif /* 80x86 */ +/*************************************** + *********** AMD64 / x86-64 ************ + ***************************************/ +#if defined(__x86_64) && W_TYPE_SIZE == 64 +#define add_ssaaaa(sh, sl, ah, al, bh, bl) \ + __asm__ ("addq %5,%1\n" \ + "adcq %3,%0" \ + : "=r" ((sh)), \ + "=&r" ((sl)) \ + : "0" ((UDItype)(ah)), \ + "g" ((UDItype)(bh)), \ + "1" ((UDItype)(al)), \ + "g" ((UDItype)(bl)) \ + __CLOBBER_CC) +#define sub_ddmmss(sh, sl, ah, al, bh, bl) \ + __asm__ ("subq %5,%1\n" \ + "sbbq %3,%0" \ + : "=r" ((sh)), \ + "=&r" ((sl)) \ + : "0" ((UDItype)(ah)), \ + "g" ((UDItype)(bh)), \ + "1" ((UDItype)(al)), \ + "g" ((UDItype)(bl)) \ + __CLOBBER_CC) +#define umul_ppmm(w1, w0, u, v) \ + __asm__ ("mulq %3" \ + : "=a" ((w0)), \ + "=d" ((w1)) \ + : "0" ((UDItype)(u)), \ + "rm" ((UDItype)(v)) \ + __CLOBBER_CC) +#define udiv_qrnnd(q, r, n1, n0, d) \ + __asm__ ("divq %4" \ + : "=a" ((q)), \ + "=d" ((r)) \ + : "0" ((UDItype)(n0)), \ + "1" ((UDItype)(n1)), \ + "rm" ((UDItype)(d)) \ + __CLOBBER_CC) +#define count_leading_zeros(count, x) \ + do { \ + UDItype __cbtmp; \ + __asm__ ("bsrq %1,%0" \ + : "=r" (__cbtmp) : "rm" ((UDItype)(x)) \ + __CLOBBER_CC); \ + (count) = __cbtmp ^ 63; \ + } while (0) +#define count_trailing_zeros(count, x) \ + do { \ + UDItype __cbtmp; \ + __asm__ ("bsfq %1,%0" \ + : "=r" (__cbtmp) : "rm" ((UDItype)(x)) \ + __CLOBBER_CC); \ + (count) = __cbtmp; \ + } while (0) +#ifndef UMUL_TIME +#define UMUL_TIME 40 +#endif +#ifndef UDIV_TIME +#define UDIV_TIME 40 +#endif +#endif /* __x86_64 */ + /*************************************** ************** I860 ***************** ----------------------------------------------------------------------- Summary of changes: mpi/longlong.h | 63 
++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) hooks/post-receive -- The GNU crypto library http://git.gnupg.org _______________________________________________ Gnupg-commits mailing list Gnupg-commits at gnupg.org http://lists.gnupg.org/mailman/listinfo/gnupg-commits From cvs at cvs.gnupg.org Thu Dec 5 12:10:53 2013 From: cvs at cvs.gnupg.org (by Werner Koch) Date: Thu, 05 Dec 2013 12:10:53 +0100 Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.5.0-418-g7ef43d1 Message-ID: This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "The GNU crypto library". The branch, master has been updated via 7ef43d1eebb4f8226e860982dfe5fa2e2c82ad0f (commit) from 7bacf1812b55fa78db63abaa1f5a9220e9c6cccc (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. - Log ----------------------------------------------------------------- commit 7ef43d1eebb4f8226e860982dfe5fa2e2c82ad0f Author: Werner Koch Date: Tue Oct 22 14:26:53 2013 +0200 ecc: Change OID for Ed25519. * cipher/ecc-curves.c (curve_aliased): Add more suitable OID for Ed25519. -- The formerly used OID has been assigned by Peter Gutmann for Curve25519. We better keep them distinct and assign a separate one for Ed25519. Signed-off-by: Werner Koch diff --git a/cipher/ecc-curves.c b/cipher/ecc-curves.c index 5c418d9..8125f24 100644 --- a/cipher/ecc-curves.c +++ b/cipher/ecc-curves.c @@ -40,7 +40,8 @@ static const struct const char *other; /* Other name. */ } curve_aliases[] = { - { "Ed25519", "1.3.6.1.4.1.3029.1.5.1" }, + /*{ "Curve25519", "1.3.6.1.4.1.3029.1.5.1" },*/ + { "Ed25519", "1.3.6.1.4.1.11591.15.1" }, { "NIST P-192", "1.2.840.10045.3.1.1" }, /* X9.62 OID */ { "NIST P-192", "prime192v1" }, /* X9.62 name. 
*/ ----------------------------------------------------------------------- Summary of changes: cipher/ecc-curves.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) hooks/post-receive -- The GNU crypto library http://git.gnupg.org _______________________________________________ Gnupg-commits mailing list Gnupg-commits at gnupg.org http://lists.gnupg.org/mailman/listinfo/gnupg-commits From cvs at cvs.gnupg.org Fri Dec 6 01:04:10 2013 From: cvs at cvs.gnupg.org (by Jussi Kivilinna) Date: Fri, 06 Dec 2013 01:04:10 +0100 Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.5.0-419-g5917ce3 Message-ID: This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "The GNU crypto library". The branch, master has been updated via 5917ce34e3b3eac4c15f62577e4723974024f818 (commit) from 7ef43d1eebb4f8226e860982dfe5fa2e2c82ad0f (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. - Log ----------------------------------------------------------------- commit 5917ce34e3b3eac4c15f62577e4723974024f818 Author: Jussi Kivilinna Date: Fri Dec 6 02:02:06 2013 +0200 Fix building on mingw32 * src/gcrypt-int.h: Include "types.h". -- 'ulong' is not defined on W32, so we need to include "types.h" in 'gcrypt-int.h'. In file included from ../src/visibility.h:53:0, from ../src/g10lib.h:39, from compat.c:22: ../src/gcrypt-int.h:365:49: error: unknown type name 'ulong' Signed-off-by: Jussi Kivilinna diff --git a/src/gcrypt-int.h b/src/gcrypt-int.h index dabc9df..e22baf6 100644 --- a/src/gcrypt-int.h +++ b/src/gcrypt-int.h @@ -25,6 +25,7 @@ #endif #include "gcrypt.h" +#include "types.h" /* These error codes are used but not defined in the required libgpg-error 1.11. Define them here.
*/ ----------------------------------------------------------------------- Summary of changes: src/gcrypt-int.h | 1 + 1 file changed, 1 insertion(+) hooks/post-receive -- The GNU crypto library http://git.gnupg.org _______________________________________________ Gnupg-commits mailing list Gnupg-commits at gnupg.org http://lists.gnupg.org/mailman/listinfo/gnupg-commits From cvs at cvs.gnupg.org Sat Dec 7 23:11:23 2013 From: cvs at cvs.gnupg.org (by Werner Koch) Date: Sat, 07 Dec 2013 23:11:23 +0100 Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.5.0-422-gd455543 Message-ID: This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "The GNU crypto library". The branch, master has been updated via d4555433b6e422fa69a85cae99961f513e55d82b (commit) via 405021cb6d4e470337302c65dec5bc91491a89c1 (commit) via 4cf2c65fe15173c8d68a141a01b34fc1fb9080b7 (commit) from 5917ce34e3b3eac4c15f62577e4723974024f818 (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. - Log ----------------------------------------------------------------- commit d4555433b6e422fa69a85cae99961f513e55d82b Author: Werner Koch Date: Tue Oct 22 14:26:53 2013 +0200 sexp: Allow long names and white space in gcry_sexp_extract_param. * src/sexp.c (_gcry_sexp_vextract_param): Skip white space. Support long parameter names. * tests/tsexp.c (check_extract_param): Add test cases for long parameter names and white space. Signed-off-by: Werner Koch diff --git a/doc/gcrypt.texi b/doc/gcrypt.texi index 91168a8..927634f 100644 --- a/doc/gcrypt.texi +++ b/doc/gcrypt.texi @@ -4145,10 +4145,10 @@ likely want to use @code{GCRYMPI_FMT_USG}. @w{const char *@var{path}}, @ @w{const char *@var{list}}, ...) -Extract parameters from an S-expression using a list of single letter -parameter names. 
The names of these parameters are specified in -LIST. Some special characters may be given to control the -conversion: +Extract parameters from an S-expression using a list of parameter +names. The names of these parameters are specified in LIST. White +space between the parameter names are ignored. Some special characters +may be given to control the conversion: @table @samp @item + @@ -4162,21 +4162,25 @@ computations; see @code{gcry_mpi_get_opaque} for details. @item & Switch to buffer descriptor mode. See below for details. @item ? -If immediately following a parameter letter, that parameter is -considered optional. +If immediately following a parameter letter (no white space allowed), +that parameter is considered optional. @end table +In general parameter names are single letters. To use a string for a +parameter name, enclose the name in single quotes. + Unless in buffer descriptor mode for each parameter name a pointer to an @code{gcry_mpi_t} variable is expected finally followed by a @code{NULL}. For example @example - _gcry_sexp_extract_param (key, NULL, "n/x+ed", - &mpi_n, &mpi_x, &mpi_e, NULL) + _gcry_sexp_extract_param (key, NULL, "n/x+e d-'foo'", + &mpi_n, &mpi_x, &mpi_e, &mpi_foo, NULL) @end example stores the parameter 'n' from @var{key} as an unsigned MPI into -@var{mpi_n}, the parameter 'x' as an opaque MPI into @var{mpi_x}, and -the parameter 'e' again as an unsigned MPI into @var{mpi_e}. +@var{mpi_n}, the parameter 'x' as an opaque MPI into @var{mpi_x}, the +parameter 'e' again as an unsigned MPI into @var{mpi_e}, and the +parameter 'foo' as a signed MPI. @var{path} is an optional string used to locate a token. The exclamation mark separated tokens are used via diff --git a/src/sexp.c b/src/sexp.c index 16def5b..7ff4c0a 100644 --- a/src/sexp.c +++ b/src/sexp.c @@ -2113,10 +2113,9 @@ _gcry_sexp_canon_len (const unsigned char *buffer, size_t length, } -/* Extract MPIs from an s-expression using a list of one letter - * parameters.
The names of these parameters are given by the string - * LIST. Some special characters may be given to control the - * conversion: +/* Extract MPIs from an s-expression using a list of parameters. The + * names of these parameters are given by the string LIST. Some + * special characters may be given to control the conversion: * * + :: Switch to unsigned integer format (default). * - :: Switch to standard signed format. @@ -2124,6 +2123,9 @@ _gcry_sexp_canon_len (const unsigned char *buffer, size_t length, * & :: Switch to buffer descriptor mode - see below. * ? :: The previous parameter is optional. * + * In general parameter names are single letters. To use a string for + * a parameter name, enclose the name in single quotes. + * * Unless in gcry_buffer_t mode for each parameter name a pointer to * an MPI variable is expected and finally a NULL is expected. * Example: @@ -2158,7 +2160,7 @@ _gcry_sexp_vextract_param (gcry_sexp_t sexp, const char *path, const char *list, va_list arg_ptr) { gpg_err_code_t rc; - const char *s; + const char *s, *s2; gcry_mpi_t *array[20]; char arrayisdesc[20]; int idx; @@ -2173,10 +2175,23 @@ _gcry_sexp_vextract_param (gcry_sexp_t sexp, const char *path, was found. */ for (s=list, idx=0; *s && idx < DIM (array); s++) { - if (*s == '&' || *s == '+' || *s == '-' || *s == '/' || *s == '?' ) + if (*s == '&' || *s == '+' || *s == '-' || *s == '/' || *s == '?') + ; + else if (whitespacep (s)) ; else { + if (*s == '\'') + { + s++; + s2 = strchr (s, '\''); + if (!s2 || s2 == s) + { + /* Closing quote not found or empty string. */ + return GPG_ERR_SYNTAX; + } + s = s2; + } array[idx] = va_arg (arg_ptr, gcry_mpi_t *); if (!array[idx]) return GPG_ERR_MISSING_VALUE; /* NULL pointer given. */ @@ -2221,11 +2236,29 @@ _gcry_sexp_vextract_param (gcry_sexp_t sexp, const char *path, { if (*s == '&' || *s == '+' || *s == '-' || *s == '/') mode = *s; + else if (whitespacep (s)) + ; else if (*s == '?') ; /* Only used via lookahead. 
*/ else { - l1 = _gcry_sexp_find_token (sexp, s, 1); + if (*s == '\'') + { + /* Find closing quote, find token, set S to closing quote. */ + s++; + s2 = strchr (s, '\''); + if (!s2 || s2 == s) + { + /* Closing quote not found or empty string. */ + rc = GPG_ERR_SYNTAX; + goto cleanup; + } + l1 = _gcry_sexp_find_token (sexp, s, s2 - s); + s = s2; + } + else + l1 = _gcry_sexp_find_token (sexp, s, 1); + if (!l1 && s[1] == '?') { /* Optional element not found. */ diff --git a/tests/tsexp.c b/tests/tsexp.c index afa79ff..2f6ad8f 100644 --- a/tests/tsexp.c +++ b/tests/tsexp.c @@ -684,6 +684,18 @@ check_extract_param (void) { sample1, NULL, + "pab'gnq", 7, + GPG_ERR_SYNTAX + }, + { + sample1, + NULL, + "pab''gnq", 7, + GPG_ERR_SYNTAX + }, + { + sample1, + NULL, "pabgnqd", 7, 0, sample1_px, sample1_ax, sample1_bx, sample1_gx, sample1_nx, @@ -692,6 +704,14 @@ check_extract_param (void) { sample1, NULL, + " pab\tg nq\nd ", 7, + 0, + sample1_px, sample1_ax, sample1_bx, sample1_gx, sample1_nx, + sample1_qx, sample1_d + }, + { + sample1, + NULL, "abg", 3, 0, sample1_ax, sample1_bx, sample1_gx @@ -699,6 +719,13 @@ check_extract_param (void) { sample1, NULL, + "ab'g'", 3, + 0, + sample1_ax, sample1_bx, sample1_gx + }, + { + sample1, + NULL, "x?abg", 4, 0, NULL, sample1_ax, sample1_bx, sample1_gx @@ -967,6 +994,50 @@ check_extract_param (void) } gcry_sexp_release (sxp); + + info ("checking gcry_sexp_extract_param long name\n"); + + memset (ioarray, 0, sizeof ioarray); + memset (mpis, 0, sizeof mpis); + + err = gcry_sexp_new (&sxp, sample1, 0, 1); + if (err) + die ("converting string to sexp failed: %s", gpg_strerror (err)); + + err = gcry_sexp_extract_param (sxp, "key-data!private-key", + "&'curve'+p", + ioarray+0, mpis+0, NULL); + if (err) + fail ("gcry_sexp_extract_param long name failed: %s", gpg_strerror (err)); + + if (!ioarray[0].data) + fail ("gcry_sexp_extract_param long name failed: no curve"); + else if (ioarray[0].size != 7) + fail ("gcry_sexp_extract_param long name 
failed: curve has wrong size"); + else if (ioarray[0].len != 7) + fail ("gcry_sexp_extract_param long name failed: curve has wrong length"); + else if (ioarray[0].off) + fail ("gcry_sexp_extract_param long name failed: curve has OFF set"); + else if (strncmp (ioarray[0].data, "Ed25519", 7)) + { + fail ("gcry_sexp_extract_param long name failed: curve mismatch"); + gcry_log_debug ("expected: %s\n", "Ed25519"); + gcry_log_debug (" got: %.*s\n", (int)ioarray[0].len, ioarray[0].data); + } + + if (!mpis[0]) + fail ("gcry_sexp_extract_param long name failed: p not returned"); + else if (cmp_mpihex (mpis[0], sample1_p)) + { + fail ("gcry_sexp_extract_param long name failed: p mismatch"); + gcry_log_debug ("expected: %s\n", sample1_p); + gcry_log_debugmpi (" got", mpis[0]); + } + + gcry_mpi_release (mpis[0]); + + gcry_sexp_release (sxp); + } commit 405021cb6d4e470337302c65dec5bc91491a89c1 Author: Werner Koch Date: Tue Oct 22 14:26:53 2013 +0200 ecc: Merge partly duplicated code. * cipher/ecc-eddsa.c (_gcry_ecc_eddsa_sign): Factor A hashing out to ... (_gcry_ecc_eddsa_compute_h_d): new function. * cipher/ecc-misc.c (_gcry_ecc_compute_public): Use new function. (reverse_buffer): Remove. Signed-off-by: Werner Koch diff --git a/cipher/ecc-common.h b/cipher/ecc-common.h index dc4a9d0..c407c74 100644 --- a/cipher/ecc-common.h +++ b/cipher/ecc-common.h @@ -115,6 +115,8 @@ gpg_err_code_t _gcry_ecc_eddsa_decodepoint (gcry_mpi_t pk, mpi_ec_t ctx, mpi_point_t result, unsigned char **r_encpk, unsigned int *r_encpklen); +gpg_err_code_t _gcry_ecc_eddsa_compute_h_d (unsigned char **r_digest, + gcry_mpi_t d, mpi_ec_t ec); gpg_err_code_t _gcry_ecc_eddsa_genkey (ECC_secret_key *sk, elliptic_curve_t *E, diff --git a/cipher/ecc-eddsa.c b/cipher/ecc-eddsa.c index 12cdea0..17c1f73 100644 --- a/cipher/ecc-eddsa.c +++ b/cipher/ecc-eddsa.c @@ -368,6 +368,71 @@ _gcry_ecc_eddsa_decodepoint (gcry_mpi_t pk, mpi_ec_t ctx, mpi_point_t result, } +/* Compute the A value as used by EdDSA. 
The caller needs to provide + the context EC and the actual secret D as an MPI. The function + returns a newly allocated 64 byte buffer at r_digest; the first 32 + bytes represent the A value. NULL is returned on error and NULL + stored at R_DIGEST. */ +gpg_err_code_t +_gcry_ecc_eddsa_compute_h_d (unsigned char **r_digest, + gcry_mpi_t d, mpi_ec_t ec) +{ + gpg_err_code_t rc; + unsigned char *rawmpi = NULL; + unsigned int rawmpilen; + unsigned char *digest; + gcry_buffer_t hvec[2]; + int hashalgo, b; + + *r_digest = NULL; + + hashalgo = GCRY_MD_SHA512; + if (hashalgo != GCRY_MD_SHA512) + return GPG_ERR_DIGEST_ALGO; + + b = (ec->nbits+7)/8; + if (b != 256/8) + return GPG_ERR_INTERNAL; /* We only support 256 bit. */ + + /* Note that we clear DIGEST so we can use it as input to left pad + the key with zeroes for hashing. */ + digest = gcry_calloc_secure (2, b); + if (!digest) + return gpg_err_code_from_syserror (); + + memset (hvec, 0, sizeof hvec); + + rawmpi = _gcry_mpi_get_buffer (d, 0, &rawmpilen, NULL); + if (!rawmpi) + { + gcry_free (digest); + return gpg_err_code_from_syserror (); + } + + hvec[0].data = digest; + hvec[0].off = 0; + hvec[0].len = b > rawmpilen? b - rawmpilen : 0; + hvec[1].data = rawmpi; + hvec[1].off = 0; + hvec[1].len = rawmpilen; + rc = _gcry_md_hash_buffers (hashalgo, 0, digest, hvec, 2); + gcry_free (rawmpi); + if (rc) + { + gcry_free (digest); + return rc; + } + + /* Compute the A value. */ + reverse_buffer (digest, 32); /* Only the first half of the hash. */ + digest[0] = (digest[0] & 0x7f) | 0x40; + digest[31] &= 0xf8; + + *r_digest = digest; + return 0; +} + + /* Ed25519 version of the key generation. */ gpg_err_code_t _gcry_ecc_eddsa_genkey (ECC_secret_key *sk, elliptic_curve_t *E, mpi_ec_t ctx, @@ -480,8 +545,6 @@ _gcry_ecc_eddsa_sign (gcry_mpi_t input, ECC_secret_key *skey, if (!mpi_is_opaque (input)) return GPG_ERR_INV_DATA; - if (hashalgo != GCRY_MD_SHA512) - return GPG_ERR_DIGEST_ALGO; /* Initialize some helpers. 
*/ point_init (&I); @@ -496,36 +559,9 @@ _gcry_ecc_eddsa_sign (gcry_mpi_t input, ECC_secret_key *skey, if (b != 256/8) return GPG_ERR_INTERNAL; /* We only support 256 bit. */ - digest = gcry_calloc_secure (2, b); - if (!digest) - { - rc = gpg_err_code_from_syserror (); - goto leave; - } - - /* Hash the secret key. We clear DIGEST so we can use it as input - to left pad the key with zeroes for hashing. */ - rawmpi = _gcry_mpi_get_buffer (skey->d, 0, &rawmpilen, NULL); - if (!rawmpi) - { - rc = gpg_err_code_from_syserror (); - goto leave; - } - hvec[0].data = digest; - hvec[0].off = 0; - hvec[0].len = b > rawmpilen? b - rawmpilen : 0; - hvec[1].data = rawmpi; - hvec[1].off = 0; - hvec[1].len = rawmpilen; - rc = _gcry_md_hash_buffers (hashalgo, 0, digest, hvec, 2); - gcry_free (rawmpi); rawmpi = NULL; + rc = _gcry_ecc_eddsa_compute_h_d (&digest, skey->d, ctx); if (rc) goto leave; - - /* Compute the A value (this modifies DIGEST). */ - reverse_buffer (digest, 32); /* Only the first half of the hash. */ - digest[0] = (digest[0] & 0x7f) | 0x40; - digest[31] &= 0xf8; _gcry_mpi_set_buffer (a, digest, 32, 0); /* Compute the public key if it has not been supplied as optional diff --git a/cipher/ecc-misc.c b/cipher/ecc-misc.c index 26c9e8d..ae3e4f0 100644 --- a/cipher/ecc-misc.c +++ b/cipher/ecc-misc.c @@ -236,20 +236,6 @@ _gcry_ecc_os2ec (mpi_point_t result, gcry_mpi_t value) } -static void -reverse_buffer (unsigned char *buffer, unsigned int length) -{ - unsigned int tmp, i; - - for (i=0; i < length/2; i++) - { - tmp = buffer[i]; - buffer[i] = buffer[length-1-i]; - buffer[length-1-i] = tmp; - } -} - - /* Compute the public key from the the context EC. Obviously a requirement is that the secret key is available in EC. On success Q is returned; on error NULL. 
If Q is NULL a newly allocated point @@ -259,8 +245,6 @@ mpi_point_t _gcry_ecc_compute_public (mpi_point_t Q, mpi_ec_t ec, mpi_point_t G, gcry_mpi_t d) { - int rc; - if (!G) G = ec->G; if (!d) @@ -275,41 +259,11 @@ _gcry_ecc_compute_public (mpi_point_t Q, mpi_ec_t ec, && (ec->flags & PUBKEY_FLAG_EDDSA)) { gcry_mpi_t a; - unsigned char *rawmpi = NULL; - unsigned int rawmpilen; unsigned char *digest; - gcry_buffer_t hvec[2]; - int b = (ec->nbits+7)/8; - - gcry_assert (b >= 32); - digest = gcry_calloc_secure (2, b); - if (!digest) - return NULL; - memset (hvec, 0, sizeof hvec); - rawmpi = _gcry_mpi_get_buffer (d, 0, &rawmpilen, NULL); - if (!rawmpi) + if (_gcry_ecc_eddsa_compute_h_d (&digest, d, ec)) return NULL; - memset (digest, 0, b); - hvec[0].data = digest; - hvec[0].off = 0; - hvec[0].len = b > rawmpilen? b - rawmpilen : 0; - hvec[1].data = rawmpi; - hvec[1].off = 0; - hvec[1].len = rawmpilen; - /* FIXME: Put and take the hash algo from the context. */ - rc = _gcry_md_hash_buffers (GCRY_MD_SHA512, 0, digest, hvec, 2); - gcry_free (rawmpi); - if (rc) - { - gcry_free (digest); - return NULL; - } - /* Compute the A value. */ - reverse_buffer (digest, 32); /* Only the first half of the hash. */ - digest[0] = (digest[0] & 0x7f) | 0x40; - digest[31] &= 0xf8; a = mpi_snew (0); _gcry_mpi_set_buffer (a, digest, 32, 0); gcry_free (digest); commit 4cf2c65fe15173c8d68a141a01b34fc1fb9080b7 Author: Werner Koch Date: Tue Oct 22 14:26:53 2013 +0200 ecc: Remove unused internal function. * src/cipher-proto.h (gcry_pk_spec): Remove get_param. * cipher/ecc-curves.c (_gcry_ecc_get_param_sexp): Merge in code from _gcry_ecc_get_param. (_gcry_ecc_get_param): Remove. * cipher/ecc.c (_gcry_pubkey_spec_ecc): Remove _gcry_ecc_get_param. 
Signed-off-by: Werner Koch diff --git a/cipher/ecc-common.h b/cipher/ecc-common.h index 1ee1d39..dc4a9d0 100644 --- a/cipher/ecc-common.h +++ b/cipher/ecc-common.h @@ -80,8 +80,7 @@ gpg_err_code_t _gcry_ecc_update_curve_param (const char *name, const char *_gcry_ecc_get_curve (gcry_sexp_t keyparms, int iterator, unsigned int *r_nbits); -gcry_err_code_t _gcry_ecc_get_param (const char *name, gcry_mpi_t *pkey); -gcry_sexp_t _gcry_ecc_get_param_sexp (const char *name); +gcry_sexp_t _gcry_ecc_get_param_sexp (const char *name); /*-- ecc-misc.c --*/ void _gcry_ecc_curve_free (elliptic_curve_t *E); diff --git a/cipher/ecc-curves.c b/cipher/ecc-curves.c index 8125f24..85ebdd3 100644 --- a/cipher/ecc-curves.c +++ b/cipher/ecc-curves.c @@ -926,20 +926,21 @@ _gcry_mpi_ec_new (gcry_ctx_t *r_ctx, } -/* Return the parameters of the curve NAME in an MPI array. */ -gcry_err_code_t -_gcry_ecc_get_param (const char *name, gcry_mpi_t *pkey) +/* Return the parameters of the curve NAME as an S-expression. */ +gcry_sexp_t +_gcry_ecc_get_param_sexp (const char *name) { - gpg_err_code_t err; unsigned int nbits; elliptic_curve_t E; mpi_ec_t ctx; gcry_mpi_t g_x, g_y; + gcry_mpi_t pkey[6]; + gcry_sexp_t result; + int i; memset (&E, 0, sizeof E); - err = _gcry_ecc_fill_in_curve (0, name, &E, &nbits); - if (err) - return err; + if (_gcry_ecc_fill_in_curve (0, name, &E, &nbits)) + return NULL; g_x = mpi_new (0); g_y = mpi_new (0); @@ -962,21 +963,6 @@ _gcry_ecc_get_param (const char *name, gcry_mpi_t *pkey) mpi_free (g_x); mpi_free (g_y); - return 0; -} - - -/* Return the parameters of the curve NAME as an S-expression. 
*/ -gcry_sexp_t -_gcry_ecc_get_param_sexp (const char *name) -{ - gcry_mpi_t pkey[6]; - gcry_sexp_t result; - int i; - - if (_gcry_ecc_get_param (name, pkey)) - return NULL; - if (sexp_build (&result, NULL, "(public-key(ecc(p%m)(a%m)(b%m)(g%m)(n%m)))", pkey[0], pkey[1], pkey[2], pkey[3], pkey[4])) diff --git a/cipher/ecc.c b/cipher/ecc.c index a7de254..e3c49ce 100644 --- a/cipher/ecc.c +++ b/cipher/ecc.c @@ -1755,7 +1755,6 @@ gcry_pk_spec_t _gcry_pubkey_spec_ecc = ecc_get_nbits, run_selftests, compute_keygrip, - _gcry_ecc_get_param, _gcry_ecc_get_curve, _gcry_ecc_get_param_sexp }; diff --git a/src/cipher-proto.h b/src/cipher-proto.h index 11c2862..0955ef5 100644 --- a/src/cipher-proto.h +++ b/src/cipher-proto.h @@ -84,10 +84,6 @@ typedef unsigned (*gcry_pk_get_nbits_t) (gcry_sexp_t keyparms); typedef gpg_err_code_t (*pk_comp_keygrip_t) (gcry_md_hd_t md, gcry_sexp_t keyparm); -/* The type used to query ECC curve parameters. */ -typedef gcry_err_code_t (*pk_get_param_t) (const char *name, - gcry_mpi_t *pkey); - /* The type used to query an ECC curve name. 
*/ typedef const char *(*pk_get_curve_t)(gcry_sexp_t keyparms, int iterator, unsigned int *r_nbits); @@ -121,7 +117,6 @@ typedef struct gcry_pk_spec gcry_pk_get_nbits_t get_nbits; selftest_func_t selftest; pk_comp_keygrip_t comp_keygrip; - pk_get_param_t get_param; pk_get_curve_t get_curve; pk_get_curve_param_t get_curve_param; } gcry_pk_spec_t; ----------------------------------------------------------------------- Summary of changes: cipher/ecc-common.h | 5 +-- cipher/ecc-curves.c | 30 +++++----------- cipher/ecc-eddsa.c | 96 +++++++++++++++++++++++++++++++++++---------------- cipher/ecc-misc.c | 48 +------------------------- cipher/ecc.c | 1 - doc/gcrypt.texi | 24 +++++++------ src/cipher-proto.h | 5 --- src/sexp.c | 47 +++++++++++++++++++++---- tests/tsexp.c | 71 +++++++++++++++++++++++++++++++++++++ 9 files changed, 203 insertions(+), 124 deletions(-) hooks/post-receive -- The GNU crypto library http://git.gnupg.org _______________________________________________ Gnupg-commits mailing list Gnupg-commits at gnupg.org http://lists.gnupg.org/mailman/listinfo/gnupg-commits From cvs at cvs.gnupg.org Mon Dec 9 08:38:37 2013 From: cvs at cvs.gnupg.org (by Werner Koch) Date: Mon, 09 Dec 2013 08:38:37 +0100 Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.5.0-423-g8072e9f Message-ID: This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "The GNU crypto library". The branch, master has been updated via 8072e9fa4b42ae8e65e266aa158fd903f1bb0927 (commit) from d4555433b6e422fa69a85cae99961f513e55d82b (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. 
- Log ----------------------------------------------------------------- commit 8072e9fa4b42ae8e65e266aa158fd903f1bb0927 Author: Werner Koch Date: Tue Oct 22 14:26:53 2013 +0200 tests: Add --csv option to bench-slope. * tests/bench-slope.c (STR, STR2): New. (cvs_mode): New. (num_measurement_repetitions): New. Replace use of NUM_MEASUREMENT_REPETITIONS by this. (current_section_name, current_algo_name, current_mode_name): New. (bench_print_result_csv): New. (bench_print_result_std): Rename from bench_print_result. (bench_print_result): New. Divert depending on CSV_MODE. (bench_print_header, bench_print_footer): take care of CSV_MODE. (bench_print_algo, bench_print_mode): New. Use them instead of explicit printfs. (main): Add options --csv and --repetitions. Signed-off-by: Werner Koch diff --git a/tests/bench-slope.c b/tests/bench-slope.c index fd9f64b..79314c3 100644 --- a/tests/bench-slope.c +++ b/tests/bench-slope.c @@ -1,5 +1,5 @@ /* bench-slope.c - for libgcrypt - * Copyright ? 2013 Jussi Kivilinna + * Copyright (C) 2013 Jussi Kivilinna * * This file is part of Libgcrypt. * @@ -27,21 +27,33 @@ #include #ifdef _GCRYPT_IN_LIBGCRYPT -#include "../src/gcrypt-int.h" -#include "../compat/libcompat.h" +# include "../src/gcrypt-int.h" +# include "../compat/libcompat.h" #else -#include +# include +#endif + +#ifndef STR +#define STR(v) #v +#define STR2(v) STR(v) #endif #define PGM "bench-slope" static int verbose; - +static int csv_mode; +static int num_measurement_repetitions; /* CPU Ghz value provided by user, allows constructing cycles/byte and other results. */ static double cpu_ghz = -1; +/* The name of the currently printed section. */ +static char *current_section_name; +/* The name of the currently printed algorithm. */ +static char *current_algo_name; +/* The name of the currently printed mode. */ +static char *current_mode_name; /*************************************** Default parameters for measurements. 
*/ @@ -463,7 +475,41 @@ double_to_str (char *out, size_t outlen, double value) } static void -bench_print_result (double nsecs_per_byte) +bench_print_result_csv (double nsecs_per_byte) +{ + double cycles_per_byte, mbytes_per_sec; + char nsecpbyte_buf[16]; + char mbpsec_buf[16]; + char cpbyte_buf[16]; + + *cpbyte_buf = 0; + + double_to_str (nsecpbyte_buf, sizeof (nsecpbyte_buf), nsecs_per_byte); + + /* If user didn't provide CPU speed, we cannot show cycles/byte results. */ + if (cpu_ghz > 0.0) + { + cycles_per_byte = nsecs_per_byte * cpu_ghz; + double_to_str (cpbyte_buf, sizeof (cpbyte_buf), cycles_per_byte); + } + + mbytes_per_sec = + (1000.0 * 1000.0 * 1000.0) / (nsecs_per_byte * 1024 * 1024); + double_to_str (mbpsec_buf, sizeof (mbpsec_buf), mbytes_per_sec); + + /* We print two empty fields to allow for future enhancements. */ + printf ("%s,%s,%s,,,%s,ns/B,%s,MiB/s,%s,c/B\n", + current_section_name, + current_algo_name? current_algo_name : "", + current_mode_name? current_mode_name : "", + nsecpbyte_buf, + mbpsec_buf, + cpbyte_buf); + +} + +static void +bench_print_result_std (double nsecs_per_byte) { double cycles_per_byte, mbytes_per_sec; char nsecpbyte_buf[16]; @@ -493,17 +539,85 @@ bench_print_result (double nsecs_per_byte) } static void +bench_print_result (double nsecs_per_byte) +{ + if (csv_mode) + bench_print_result_csv (nsecs_per_byte); + else + bench_print_result_std (nsecs_per_byte); +} + +static void +bench_print_section (const char *section_name, const char *print_name) +{ + if (csv_mode) + { + gcry_free (current_section_name); + current_section_name = gcry_xstrdup (section_name); + } + else + printf ("%s:\n", print_name); +} + +static void bench_print_header (int algo_width, const char *algo_name) { - printf (" %-*s | ", algo_width, algo_name); - printf ("%14s %15s %13s\n", "nanosecs/byte", "mebibytes/sec", - "cycles/byte"); + if (csv_mode) + { + gcry_free (current_algo_name); + current_algo_name = gcry_xstrdup (algo_name); + } + else + { + if 
(algo_width < 0) + printf (" %-*s | ", -algo_width, algo_name); + else + printf (" %-*s | ", algo_width, algo_name); + printf ("%14s %15s %13s\n", "nanosecs/byte", "mebibytes/sec", + "cycles/byte"); + } +} + +static void +bench_print_algo (int algo_width, const char *algo_name) +{ + if (csv_mode) + { + gcry_free (current_algo_name); + current_algo_name = gcry_xstrdup (algo_name); + } + else + { + if (algo_width < 0) + printf (" %-*s | ", -algo_width, algo_name); + else + printf (" %-*s | ", algo_width, algo_name); + } +} + +static void +bench_print_mode (int width, const char *mode_name) +{ + if (csv_mode) + { + gcry_free (current_mode_name); + current_mode_name = gcry_xstrdup (mode_name); + } + else + { + if (width < 0) + printf (" %-*s | ", -width, mode_name); + else + printf (" %*s | ", width, mode_name); + fflush (stdout); + } } static void bench_print_footer (int algo_width) { - printf (" %-*s =\n", algo_width, ""); + if (!csv_mode) + printf (" %-*s =\n", algo_width, ""); } @@ -529,7 +643,7 @@ bench_encrypt_init (struct bench_obj *obj) obj->min_bufsize = BUF_START_SIZE; obj->max_bufsize = BUF_END_SIZE; obj->step_size = BUF_STEP_SIZE; - obj->num_measure_repetitions = NUM_MEASUREMENT_REPETITIONS; + obj->num_measure_repetitions = num_measurement_repetitions; err = gcry_cipher_open (&hd, mode->algo, mode->mode, 0); if (err) @@ -962,8 +1076,7 @@ cipher_bench_one (int algo, struct bench_cipher_mode *pmode) if (mode.mode == GCRY_CIPHER_MODE_GCM && blklen != GCRY_GCM_BLOCK_LEN) return; - printf (" %14s | ", mode.name); - fflush (stdout); + bench_print_mode (14, mode.name); obj.ops = mode.ops; obj.priv = &mode; @@ -996,7 +1109,7 @@ cipher_bench (char **argv, int argc) { int i, algo; - printf ("Cipher:\n"); + bench_print_section ("cipher", "Cipher"); if (argv && argc) { @@ -1037,7 +1150,7 @@ bench_hash_init (struct bench_obj *obj) obj->min_bufsize = BUF_START_SIZE; obj->max_bufsize = BUF_END_SIZE; obj->step_size = BUF_STEP_SIZE; - obj->num_measure_repetitions = 
NUM_MEASUREMENT_REPETITIONS; + obj->num_measure_repetitions = num_measurement_repetitions; err = gcry_md_open (&hd, mode->algo, 0); if (err) @@ -1093,10 +1206,9 @@ hash_bench_one (int algo, struct bench_hash_mode *pmode) mode.algo = algo; if (mode.name[0] == '\0') - printf (" %-14s | ", gcry_md_algo_name (algo)); + bench_print_algo (-14, gcry_md_algo_name (algo)); else - printf (" %14s | ", mode.name); - fflush (stdout); + bench_print_algo (14, mode.name); obj.ops = mode.ops; obj.priv = &mode; @@ -1120,8 +1232,7 @@ hash_bench (char **argv, int argc) { int i, algo; - printf ("Hash:\n"); - + bench_print_section ("hash", "Hash"); bench_print_header (14, ""); if (argv && argc) @@ -1167,7 +1278,7 @@ bench_mac_init (struct bench_obj *obj) obj->min_bufsize = BUF_START_SIZE; obj->max_bufsize = BUF_END_SIZE; obj->step_size = BUF_STEP_SIZE; - obj->num_measure_repetitions = NUM_MEASUREMENT_REPETITIONS; + obj->num_measure_repetitions = num_measurement_repetitions; keylen = gcry_mac_get_algo_keylen (mode->algo); if (keylen == 0) @@ -1247,10 +1358,9 @@ mac_bench_one (int algo, struct bench_mac_mode *pmode) mode.algo = algo; if (mode.name[0] == '\0') - printf (" %-18s | ", gcry_mac_algo_name (algo)); + bench_print_algo (-18, gcry_mac_algo_name (algo)); else - printf (" %18s | ", mode.name); - fflush (stdout); + bench_print_algo (18, mode.name); obj.ops = mode.ops; obj.priv = &mode; @@ -1274,8 +1384,7 @@ mac_bench (char **argv, int argc) { int i, algo; - printf ("MAC:\n"); - + bench_print_section ("mac", "MAC"); bench_print_header (18, ""); if (argv && argc) @@ -1307,10 +1416,13 @@ print_help (void) "usage: bench-slope [options] [hash|mac|cipher [algonames]]", "", " options:", - " --cpu-mhz Set CPU speed for calculating cycles per bytes", - " results.", - " --disable-hwf Disable hardware acceleration feature(s) for", - " benchmarking.", + " --cpu-mhz Set CPU speed for calculating cycles", + " per bytes results.", + " --disable-hwf Disable hardware acceleration feature(s)", + " for 
benchmarking.", + " --repetitions Use N repetitions (default " + STR2(NUM_MEASUREMENT_REPETITIONS) ")", + " --csv Use CSV output format", NULL }; const char **line; @@ -1341,6 +1453,8 @@ main (int argc, char **argv) int last_argc = -1; int debug = 0; + num_measurement_repetitions = NUM_MEASUREMENT_REPETITIONS; + if (argc) { argc--; @@ -1380,6 +1494,12 @@ main (int argc, char **argv) argc--; argv++; } + else if (!strcmp (*argv, "--csv")) + { + csv_mode = 1; + argc--; + argv++; + } else if (!strcmp (*argv, "--disable-hwf")) { argc--; @@ -1408,6 +1528,25 @@ main (int argc, char **argv) argv++; } } + else if (!strcmp (*argv, "--repetitions")) + { + argc--; + argv++; + if (argc) + { + num_measurement_repetitions = atof (*argv); + if (num_measurement_repetitions < 2) + { + fprintf (stderr, + PGM + ": value for --repetitions too small - using %d\n", + NUM_MEASUREMENT_REPETITIONS); + num_measurement_repetitions = NUM_MEASUREMENT_REPETITIONS; + } + argc--; + argv++; + } + } } gcry_control (GCRYCTL_SET_VERBOSITY, (int) verbose); ----------------------------------------------------------------------- Summary of changes: tests/bench-slope.c | 199 +++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 169 insertions(+), 30 deletions(-) hooks/post-receive -- The GNU crypto library http://git.gnupg.org _______________________________________________ Gnupg-commits mailing list Gnupg-commits at gnupg.org http://lists.gnupg.org/mailman/listinfo/gnupg-commits From cvs at cvs.gnupg.org Mon Dec 9 09:10:13 2013 From: cvs at cvs.gnupg.org (by Werner Koch) Date: Mon, 09 Dec 2013 09:10:13 +0100 Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.5.0-424-g2e5354f Message-ID: This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "The GNU crypto library". 
The branch, master has been updated via 2e5354fe8db5288939733d0fb63ad4c87bc20105 (commit) from 8072e9fa4b42ae8e65e266aa158fd903f1bb0927 (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. - Log ----------------------------------------------------------------- commit 2e5354fe8db5288939733d0fb63ad4c87bc20105 Author: Werner Koch Date: Tue Oct 22 14:26:53 2013 +0200 tests: Speed up benchmarks in regression test mode. * tests/tsexp.c (check_extract_param): Fix compiler warning. * tests/Makefile.am (TESTS_ENVIRONMENT): Set GCRYPT_IN_REGRESSION_TEST. * tests/bench-slope.c (main): Speed up if in regression test mode. * tests/benchmark.c (main): Ditto. Signed-off-by: Werner Koch diff --git a/tests/Makefile.am b/tests/Makefile.am index 87283f9..3fb9fd6 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -23,10 +23,11 @@ TESTS = version mpitests tsexp t-convert \ prime basic keygen pubkey hmac hashtest t-kdf keygrip \ fips186-dsa aeswrap pkcs1v2 random dsa-rfc6979 t-ed25519 - # The last tests to run. TESTS += benchmark bench-slope +TESTS_ENVIRONMENT = GCRYPT_IN_REGRESSION_TEST=1 + # Need to include ../src in addition to top_srcdir because gcrypt.h is # a built header. diff --git a/tests/bench-slope.c b/tests/bench-slope.c index 79314c3..219e0dd 100644 --- a/tests/bench-slope.c +++ b/tests/bench-slope.c @@ -48,6 +48,9 @@ static int num_measurement_repetitions; results. */ static double cpu_ghz = -1; +/* Whether we are running as part of the regression test suite. */ +static int in_regression_test; + /* The name of the currently printed section. */ static char *current_section_name; /* The name of the currently printed algorithm. 
*/ @@ -1453,8 +1456,6 @@ main (int argc, char **argv) int last_argc = -1; int debug = 0; - num_measurement_repetitions = NUM_MEASUREMENT_REPETITIONS; - if (argc) { argc--; @@ -1466,6 +1467,14 @@ main (int argc, char **argv) if (!argc && getenv ("srcdir") && getenv ("GCRYPT_NO_BENCHMARKS")) exit (77); + if (getenv ("GCRYPT_IN_REGRESSION_TEST")) + { + in_regression_test = 1; + num_measurement_repetitions = 2; + } + else + num_measurement_repetitions = NUM_MEASUREMENT_REPETITIONS; + while (argc && last_argc != argc) { last_argc = argc; @@ -1565,6 +1574,9 @@ main (int argc, char **argv) gcry_control (GCRYCTL_INITIALIZATION_FINISHED, 0); gcry_control (GCRYCTL_ENABLE_QUICK_RANDOM, 0); + if (in_regression_test) + fputs ("Note: " PGM " running in quick regression test mode.\n", stdout); + if (!argc) { warm_up_cpu (); diff --git a/tests/benchmark.c b/tests/benchmark.c index 3f44e33..62dfc22 100644 --- a/tests/benchmark.c +++ b/tests/benchmark.c @@ -59,6 +59,9 @@ static int cipher_with_keysetup; /* Whether fips mode was active at startup. */ static int in_fips_mode; +/* Whether we are running as part of the regression test suite. */ +static int in_regression_test; + static const char sample_private_dsa_key_1024[] = "(private-key\n" @@ -266,6 +269,7 @@ die (const char *format, ...) 
exit (1); } + static void show_sexp (const char *prefix, gcry_sexp_t a) { @@ -1301,6 +1305,12 @@ main( int argc, char **argv ) if (!argc && getenv ("srcdir") && getenv ("GCRYPT_NO_BENCHMARKS")) exit (77); + if (getenv ("GCRYPT_IN_REGRESSION_TEST")) + { + in_regression_test = 1; + pk_count = 10; + } + while (argc && last_argc != argc ) { last_argc = argc; @@ -1473,6 +1483,9 @@ main( int argc, char **argv ) if (mac_repetitions < 1) mac_repetitions = 1; + if (in_regression_test) + fputs ("Note: " PGM " running in quick regression test mode.\n", stdout); + if ( !argc ) { gcry_control (GCRYCTL_ENABLE_QUICK_RANDOM, 0); diff --git a/tests/tsexp.c b/tests/tsexp.c index 2f6ad8f..1ab9802 100644 --- a/tests/tsexp.c +++ b/tests/tsexp.c @@ -1022,7 +1022,8 @@ check_extract_param (void) { fail ("gcry_sexp_extract_param long name failed: curve mismatch"); gcry_log_debug ("expected: %s\n", "Ed25519"); - gcry_log_debug (" got: %.*s\n", (int)ioarray[0].len, ioarray[0].data); + gcry_log_debug (" got: %.*s\n", + (int)ioarray[0].len, (char*)ioarray[0].data); } if (!mpis[0]) ----------------------------------------------------------------------- Summary of changes: tests/Makefile.am | 3 ++- tests/bench-slope.c | 16 ++++++++++++++-- tests/benchmark.c | 13 +++++++++++++ tests/tsexp.c | 3 ++- 4 files changed, 31 insertions(+), 4 deletions(-) hooks/post-receive -- The GNU crypto library http://git.gnupg.org _______________________________________________ Gnupg-commits mailing list Gnupg-commits at gnupg.org http://lists.gnupg.org/mailman/listinfo/gnupg-commits From wk at gnupg.org Mon Dec 9 18:04:05 2013 From: wk at gnupg.org (Werner Koch) Date: Mon, 09 Dec 2013 18:04:05 +0100 Subject: benchmarks Message-ID: <8738m2yn0a.fsf@vigenere.g10code.de> Hi, I did some benchmark charts for Libgcrypt. See ftp://ftp.g10code.com/g10code/scratch/gcrypt-bench-x220-2300.gnumeric or just the charts: ftp://ftp.g10code.com/g10code/scratch/gcrypt-bench-x220-2300.pdf this nicely shows Jussi's improvements. 
The values for Nettle have been taken from the nettle-benchmark example program. I did not add data for OpenSSL because it is a bit of work to extract the numbers from "openssl speed". It would be nice to have wrapper for openssl so that bench-slope can be used to create compatible numbers. Shalom-Salam, Werner -- Die Gedanken sind frei. Ausnahmen regelt ein Bundesgesetz. From cvs at cvs.gnupg.org Mon Dec 9 19:59:44 2013 From: cvs at cvs.gnupg.org (by David 'Digit' Turner) Date: Mon, 09 Dec 2013 19:59:44 +0100 Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.5.0-425-g2516f0b Message-ID: This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "The GNU crypto library". The branch, master has been updated via 2516f0b660b1a7181ad38c44310c627f4f498595 (commit) from 2e5354fe8db5288939733d0fb63ad4c87bc20105 (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. - Log ----------------------------------------------------------------- commit 2516f0b660b1a7181ad38c44310c627f4f498595 Author: David 'Digit' Turner Date: Tue Oct 22 14:26:53 2013 +0200 Update libtool to support Android. * m4/libtool.m4: Add "linux*android*" case. Taken from the libtool repository. -- The patch, which cleanly applies, is commit 8eeeb00daef8c4f720c9b79a0cdb89225d9909b6 Author: David 'Digit' Turner Date: Tue Oct 8 14:37:32 2013 -0700 This patch adds proper Android support to libtool. The main issues are the following: - Versioned libraries are not supported by the platform and its build/packaging tools. - The dynamic linker is not GNU ld, there is no support for DT_RUNPATH. - Similarly, there is no ldconfig. 
Signed-off-by: Werner Koch diff --git a/m4/libtool.m4 b/m4/libtool.m4 index f0a2752..1d62b05 100644 --- a/m4/libtool.m4 +++ b/m4/libtool.m4 @@ -2644,6 +2644,26 @@ linux*oldld* | linux*aout* | linux*coff*) dynamic_linker=no ;; +linux*android*) + version_type=none # Android doesn't support versioned libraries. + need_lib_prefix=no + need_version=no + library_names_spec='$libname$release$shared_ext' + soname_spec='$libname$release$shared_ext' + finish_cmds= + shlibpath_var=LD_LIBRARY_PATH + shlibpath_overrides_runpath=yes + + # This implies no fast_install, which is unacceptable. + # Some rework will be needed to allow for fast_install + # before this can be enabled. + hardcode_into_libs=yes + + dynamic_linker='Android linker' + # Don't embed -rpath directories since the linker doesn't support them. + _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-L$libdir' + ;; + # This must be glibc/ELF. linux* | k*bsd*-gnu | kopensolaris*-gnu) version_type=linux # correct to gnu/linux during the next big refactor ----------------------------------------------------------------------- Summary of changes: m4/libtool.m4 | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) hooks/post-receive -- The GNU crypto library http://git.gnupg.org _______________________________________________ Gnupg-commits mailing list Gnupg-commits at gnupg.org http://lists.gnupg.org/mailman/listinfo/gnupg-commits From cvs at cvs.gnupg.org Mon Dec 9 20:30:07 2013 From: cvs at cvs.gnupg.org (by Werner Koch) Date: Mon, 09 Dec 2013 20:30:07 +0100 Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.5.0-426-g9a37470 Message-ID: This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "The GNU crypto library". 
The branch, master has been updated via 9a37470c50ee9966cb2652617a404ddd54a9c096 (commit) from 2516f0b660b1a7181ad38c44310c627f4f498595 (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. - Log ----------------------------------------------------------------- commit 9a37470c50ee9966cb2652617a404ddd54a9c096 Author: Werner Koch Date: Tue Oct 22 14:26:53 2013 +0200 Fix one-off memory leak when build with Linux capability support. * src/secmem.c (lock_pool, secmem_init): Use cap_free. Reported by Mike Crowe . Signed-off-by: Werner Koch diff --git a/src/secmem.c b/src/secmem.c index c350bc9..fd2cdf4 100644 --- a/src/secmem.c +++ b/src/secmem.c @@ -243,11 +243,20 @@ lock_pool (void *p, size_t n) #if defined(USE_CAPABILITIES) && defined(HAVE_MLOCK) int err; - cap_set_proc (cap_from_text ("cap_ipc_lock+ep")); - err = no_mlock? 0 : mlock (p, n); - if (err && errno) - err = errno; - cap_set_proc (cap_from_text ("cap_ipc_lock+p")); + { + cap_t cap; + + cap = cap_from_text ("cap_ipc_lock+ep"); + cap_set_proc (cap); + cap_free (cap); + err = no_mlock? 
0 : mlock (p, n); + if (err && errno) + err = errno; + cap_set_proc (cap_from_text ("cap_ipc_lock+p")); + cap = cap_from_text ("cap_ipc_lock+p"); + cap_set_proc (cap); + cap_free(cap); + } if (err) { @@ -474,7 +483,13 @@ secmem_init (size_t n) { #ifdef USE_CAPABILITIES /* drop all capabilities */ - cap_set_proc (cap_from_text ("all-eip")); + { + cap_t cap; + + cap = cap_from_text ("all-eip"); + cap_set_proc (cap); + cap_free (cap); + } #elif !defined(HAVE_DOSISH_SYSTEM) uid_t uid; ----------------------------------------------------------------------- Summary of changes: src/secmem.c | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) hooks/post-receive -- The GNU crypto library http://git.gnupg.org _______________________________________________ Gnupg-commits mailing list Gnupg-commits at gnupg.org http://lists.gnupg.org/mailman/listinfo/gnupg-commits From wk at gnupg.org Mon Dec 9 20:26:03 2013 From: wk at gnupg.org (Werner Koch) Date: Mon, 09 Dec 2013 20:26:03 +0100 Subject: [PATCH] Avoid secmem leaking capability state In-Reply-To: <1385123466-30446-1-git-send-email-mac@mcrowe.com> (Mike Crowe's message of "Fri, 22 Nov 2013 12:31:06 +0000") References: <1385123466-30446-1-git-send-email-mac@mcrowe.com> Message-ID: <87ppp5ygfo.fsf@vigenere.g10code.de> On Fri, 22 Nov 2013 13:31, mac at mcrowe.com said: > The capability state returned by cap_from_text needs to be freed after it > has been used. I applied your fix. Also fixed at another place. Please send a DCO (see doc/HACKING) if you want to provide larger patches. Thanks, Werner -- Die Gedanken sind frei. Ausnahmen regelt ein Bundesgesetz. 
From nmav at gnutls.org Mon Dec 9 18:57:46 2013 From: nmav at gnutls.org (Nikos Mavrogiannopoulos) Date: Mon, 09 Dec 2013 18:57:46 +0100 Subject: benchmarks In-Reply-To: <8738m2yn0a.fsf@vigenere.g10code.de> References: <8738m2yn0a.fsf@vigenere.g10code.de> Message-ID: <1386611866.2107.5.camel@aspire.lan> On Mon, 2013-12-09 at 18:04 +0100, Werner Koch wrote: > Hi, > > I did some benchmark charts for Libgcrypt. See > ftp://ftp.g10code.com/g10code/scratch/gcrypt-bench-x220-2300.gnumeric > or just the charts: > ftp://ftp.g10code.com/g10code/scratch/gcrypt-bench-x220-2300.pdf > this nicely shows Jussi's improvements. The values for Nettle have been > taken from the nettle-benchmark example program. I did not add data for > OpenSSL because it is a bit of work to extract the numbers from "openssl > speed". It would be nice to have wrapper for openssl so that > bench-slope can be used to create compatible numbers. Could libgcrypt and nettle share the low level algorithms so improvements on one project will be shared with the other? As it is now I think there is quite some duplicate effort being spent in making the two libraries faster and there is no much reason they couldn't be combined. In the end I feel disappointed to see that because of that, even the gnulib people will use openssl's libcrypto because it is faster: http://lists.gnu.org/archive/html/bug-gnulib/2013-12/msg00058.html regards, Nikos From jussi.kivilinna at iki.fi Mon Dec 9 21:44:49 2013 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Mon, 09 Dec 2013 22:44:49 +0200 Subject: benchmarks In-Reply-To: <8738m2yn0a.fsf@vigenere.g10code.de> References: <8738m2yn0a.fsf@vigenere.g10code.de> Message-ID: <52A62BC1.4030702@iki.fi> On 09.12.2013 19:04, Werner Koch wrote: > Hi, > > I did some benchmark charts for Libgcrypt. 
See > > ftp://ftp.g10code.com/g10code/scratch/gcrypt-bench-x220-2300.gnumeric > > or just the charts: > > ftp://ftp.g10code.com/g10code/scratch/gcrypt-bench-x220-2300.pdf > > this nicely shows Jussi's improvements. The values for Nettle have been > taken from the nettle-benchmark example program. I did not add data for > OpenSSL because it is a bit of work to extract the numbers from "openssl > speed". It would be nice to have wrapper for openssl so that > bench-slope can be used to create compatible numbers. To compare with Nettle and OpenSSL, I have done some testing with a modified version of crypto-speedtest from http://panthema.net/2008/0714-cryptography-speedtest-comparison/ Graph showing differences between OpenSSL (1.0.1e), libgcrypt-1.5, libgcrypt-1.6 (ECB & CTR) and nettle (2.7.1) on Intel Haswell: http://koti.kapsi.fi/~jukivili/gcrypt/haswell-3200-ubuntu-saucy-gcrypt.pdf Above graph shows additional benefit of new bulk processing implementations for parallelizable modes of operation (CTR, CFB-dec, CBC-dec). Each measurement is for key-setup(enc)+enc+key-setup(dec)+dec for different buffer lengths. Default key-length is 256-bit; shorter used if 256-bits not supported. -Jussi > > > Shalom-Salam, > > Werner > > From wk at gnupg.org Mon Dec 9 21:48:19 2013 From: wk at gnupg.org (Werner Koch) Date: Mon, 09 Dec 2013 21:48:19 +0100 Subject: benchmarks In-Reply-To: <1386611866.2107.5.camel@aspire.lan> (Nikos Mavrogiannopoulos's message of "Mon, 09 Dec 2013 18:57:46 +0100") References: <8738m2yn0a.fsf@vigenere.g10code.de> <1386611866.2107.5.camel@aspire.lan> Message-ID: <87bo0pwy24.fsf@vigenere.g10code.de> On Mon, 9 Dec 2013 18:57, nmav at gnutls.org said: > Could libgcrypt and nettle share the low level algorithms so > improvements on one project will be shared with the other? As it is now Feel free to take whatever you need from Libgcrypt. They both use the same license me thinks.
> I think there is quite some duplicate effort being spent in making the > two libraries faster and there is no much reason they couldn't be > combined. That is what free software is about, right? > gnulib people will use openssl's libcrypto because it is faster: > http://lists.gnu.org/archive/html/bug-gnulib/2013-12/msg00058.html And thus forcing the requirement for openssl exception on every software using gnulib. Funny GNU. Salam-Shalom, Werner -- Die Gedanken sind frei. Ausnahmen regelt ein Bundesgesetz. From wk at gnupg.org Mon Dec 9 21:59:55 2013 From: wk at gnupg.org (Werner Koch) Date: Mon, 09 Dec 2013 21:59:55 +0100 Subject: benchmarks In-Reply-To: <52A62BC1.4030702@iki.fi> (Jussi Kivilinna's message of "Mon, 09 Dec 2013 22:44:49 +0200") References: <8738m2yn0a.fsf@vigenere.g10code.de> <52A62BC1.4030702@iki.fi> Message-ID: <877gbdwxis.fsf@vigenere.g10code.de> On Mon, 9 Dec 2013 21:44, jussi.kivilinna at iki.fi said: > http://koti.kapsi.fi/~jukivili/gcrypt/haswell-3200-ubuntu-saucy-gcrypt.pdf That shows that your work on Libgcrypt made a huge difference. Now for the hash functions ;-). Shalom-Salam, Werner -- Die Gedanken sind frei. Ausnahmen regelt ein Bundesgesetz. From cvs at cvs.gnupg.org Tue Dec 10 08:30:41 2013 From: cvs at cvs.gnupg.org (by Werner Koch) Date: Tue, 10 Dec 2013 08:30:41 +0100 Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.5.0-427-geae1e77 Message-ID: This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "The GNU crypto library". The branch, master has been updated via eae1e7712e1b687bd77eb37d0eb505fc9d46d93c (commit) from 9a37470c50ee9966cb2652617a404ddd54a9c096 (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. 
- Log ----------------------------------------------------------------- commit eae1e7712e1b687bd77eb37d0eb505fc9d46d93c Author: Werner Koch Date: Tue Oct 22 14:26:53 2013 +0200 Fix last commit (9a37470c) * src/secmem.c (lock_pool): Remove remaining line. Reported by Ian Goldberg. diff --git a/src/secmem.c b/src/secmem.c index fd2cdf4..2bf7d8c 100644 --- a/src/secmem.c +++ b/src/secmem.c @@ -252,7 +252,6 @@ lock_pool (void *p, size_t n) err = no_mlock? 0 : mlock (p, n); if (err && errno) err = errno; - cap_set_proc (cap_from_text ("cap_ipc_lock+p")); cap = cap_from_text ("cap_ipc_lock+p"); cap_set_proc (cap); cap_free(cap); ----------------------------------------------------------------------- Summary of changes: src/secmem.c | 1 - 1 file changed, 1 deletion(-) hooks/post-receive -- The GNU crypto library http://git.gnupg.org _______________________________________________ Gnupg-commits mailing list Gnupg-commits at gnupg.org http://lists.gnupg.org/mailman/listinfo/gnupg-commits From nmav at gnutls.org Tue Dec 10 08:58:33 2013 From: nmav at gnutls.org (Nikos Mavrogiannopoulos) Date: Tue, 10 Dec 2013 08:58:33 +0100 Subject: benchmarks In-Reply-To: <87bo0pwy24.fsf@vigenere.g10code.de> References: <8738m2yn0a.fsf@vigenere.g10code.de> <1386611866.2107.5.camel@aspire.lan> <87bo0pwy24.fsf@vigenere.g10code.de> Message-ID: On Mon, Dec 9, 2013 at 9:48 PM, Werner Koch wrote: >> Could libgcrypt and nettle share the low level algorithms so >> improvements on one project will be shared with the other? As it is now > Fell free to take whatever you need from Libgcrypt. Of course that is possible, it always was. However, copying isn't easy when different low-level interfaces are involved, and pretty much that just prolongs the current situation. > They both you the > same license me thinks. Only for the cipher/hash/mac part which was the part I suggested to be shared. Nettle's PK interface is effectively LGPLv3 due to reliance on GMP. 
So that part would be harder to benefit from sharing. >> I think there is quite some duplicate effort being spent in making the >> two libraries faster and there is no much reason they couldn't be >> combined. > That is what free software is about, right? Well, I really want to think that it is also about collaboration. While I understand that everyone has a different agenda on the things that need to be done, schedules etc., a compromise that will benefit everyone may be possible. regards, Nikos From wk at gnupg.org Tue Dec 10 11:15:35 2013 From: wk at gnupg.org (Werner Koch) Date: Tue, 10 Dec 2013 11:15:35 +0100 Subject: benchmarks In-Reply-To: (Nikos Mavrogiannopoulos's message of "Tue, 10 Dec 2013 08:58:33 +0100") References: <8738m2yn0a.fsf@vigenere.g10code.de> <1386611866.2107.5.camel@aspire.lan> <87bo0pwy24.fsf@vigenere.g10code.de> <87k3fdui48.fsf@vigenere.g10code.de> Message-ID: <87k3fdui48.fsf@vigenere.g10code.de> On Tue, 10 Dec 2013 08:58, nmav at gnutls.org said: > Well, I really want to think that it is also about collaboration. Right, I thought along the same lines but back then Niels decided to compile his own library without the need to comply with the strict GNU rules. Thus he was able to use all kinds of code while I was not. A decade or more ago I had to reject Brian Gladman's offer to use his code simply due to the CA requirement. Later the GNU project seems to have concluded that CAs are not important anymore unless they are already in use. The effect was that GNUTLS silently started to use Nettle instead of helping to convince the GNU towers to drop the CA requirement for Libgcrypt. Meanwhile I terminated my own CAs and we are now able to basically do the same as Nettle did. There are lots of crypto libraries out there and anyone may use whatever he likes. In case there is useful code in Nettle we may include that in Libgcrypt but I see no point in joining the two libraries.
> While I understand that everyone has a different agenda on the things > that need to be done, schedules etc., a compromise that will benefit > everyone may be possible. Well, you removed all support for Libgcrypt from GNUTLS. If you want to use it again, you only need to add that layer again. Salam-Shalom, Werner -- Die Gedanken sind frei. Ausnahmen regelt ein Bundesgesetz. From wachs at net.in.tum.de Tue Dec 10 12:05:46 2013 From: wachs at net.in.tum.de (Matthias Wachs) Date: Tue, 10 Dec 2013 12:05:46 +0100 Subject: fig2dev detection Message-ID: <52A6F58A.4000502@net.in.tum.de> -----BEGIN PGP SIGNED MESSAGE----- Hash: SHA1 Hi, I am trying to install the latest libgcrypt version from git on a ubuntu 12.4.3 system. I (still) have the fig2dev detection issue Making install in doc make[1]: Entering directory `/home/gnunet/svn/libgcrypt/doc' fig2dev -L eps `test -f 'libgcrypt-modules.fig' || echo './'`libgcrypt-modules.fig libgcrypt-modules.eps /bin/bash: fig2dev: command not found make[1]: *** [libgcrypt-modules.eps] Error 127 make[1]: Leaving directory `/home/gnunet/svn/libgcrypt/doc' make: *** [install-recursive] Error 1 - -Matthias -----BEGIN PGP SIGNATURE----- Version: GnuPG v1.4.11 (GNU/Linux) Comment: Using GnuPG with Thunderbird - http://www.enigmail.net/ iQEcBAEBAgAGBQJSpvWKAAoJEEFv6YGnAaM225oH/3PozJHN07wFJWBtdIVGgNK6 gT+cgOiac9dyF8OoMp6xexyd4te91Q6pclgbW8OBMr2iPVijl8kBsxQX8nee3I7F C+t2j60f14EdnMR3s5rMu2LPeJ7B/jlpjTfG+QQRen9nfk99H0c4BKDClJmKHnAy sXCdt5I08SE/GDVw5RfMxHEGvKVVBC0d24V8N9OHzNlEOT0w4ErWj2ZQMTzf8QHQ anZohAGtV89rD4ztEG1ASdesdcAqrhMfjyodeWfjwFAgZNIDNnUPcaf+PkY4Gbtk MAV3KI8e/KIAYWQqFyWOehsM2JtYLZ1yVQ0gY2sSns6yr3oeOBQRu6EpAWxey0s= =49i6 -----END PGP SIGNATURE----- From dbaryshkov at gmail.com Tue Dec 10 12:06:57 2013 From: dbaryshkov at gmail.com (Dmitry Eremin-Solenikov) Date: Tue, 10 Dec 2013 15:06:57 +0400 Subject: benchmarks In-Reply-To: <87k3fdui48.fsf@vigenere.g10code.de> References: <8738m2yn0a.fsf@vigenere.g10code.de> 
<1386611866.2107.5.camel@aspire.lan> <87bo0pwy24.fsf@vigenere.g10code.de> <87k3fdui48.fsf@vigenere.g10code.de> Message-ID: On Tue, Dec 10, 2013 at 2:15 PM, Werner Koch wrote: > On Tue, 10 Dec 2013 08:58, nmav at gnutls.org said: >> While I understand that everyone has a different agenda on the things >> that need to be done, schedules etc., a compromise that will benefit >> everyone may be possible. > > Well, you removed all support for Libgcrypt from GNUTLS. If you want to > use it again, you only need to add that layer again. I have been working on this in spare time for some time (mostly to be able to test my GOST cryptography code). Most of the code is there, but there are some test cases still failing. -- With best wishes Dmitry From nmav at gnutls.org Tue Dec 10 13:39:19 2013 From: nmav at gnutls.org (Nikos Mavrogiannopoulos) Date: Tue, 10 Dec 2013 13:39:19 +0100 Subject: benchmarks In-Reply-To: <87k3fdui48.fsf@vigenere.g10code.de> References: <8738m2yn0a.fsf@vigenere.g10code.de> <1386611866.2107.5.camel@aspire.lan> <87bo0pwy24.fsf@vigenere.g10code.de> <87k3fdui48.fsf@vigenere.g10code.de> Message-ID: On Tue, Dec 10, 2013 at 11:15 AM, Werner Koch wrote: >> Well, I really want to think that it is also about collaboration. > Right, I thought about the same lines but back then Niels decided to > compile his own library without the need to comply to the strict GNU > rules. Thus he was able to use all kind of code while I was not. A > decade or more ago I had to reject Brian Gladman's offer to use his code > due simply due to the CA requirement. Latter then the GNU project seems > to have concluded that CAs are not important anymore unless they are > already in use. The effect was that GNUTLS silently started to use > Nettle instead of helping to convince the GNU towers to drop the CA > requirement for Libgcrypt. 
I thought that this was done quite vocally :) There was quite a long discussion in this list about the issues I had back then, that if I remember well were (a) libgcrypt could not be used in setuid processes, and (b) that it was much slower -more than 2x- than openssl (and nettle). The conclusion of that discussion was that libgcrypt wouldn't change. I now understand that (b) was because you were strictly following the gnu rules, but that provided no comfort to me who was seeing gnutls being at the bottom of any benchmark. That's why I switched to nettle and as it is now we are more than 2x faster compared to openssl in public key operations. >> While I understand that everyone has a different agenda on the things >> that need to be done, schedules etc., a compromise that will benefit >> everyone may be possible. > Well, you removed all support for Libgcrypt from GNUTLS. If you want to > use it again, you only need to add that layer again. I removed it because I changed the internal API and libgcrypt support would be incomplete (gcm was missing at the time). I don't think it would be trivial to re-add, but it is not hard either. Nevertheless, even if such a switch would be possible today, it would solve nothing, as now we have a few parts where libgcrypt is better than nettle, and other parts where nettle is better than libgcrypt. My argumentation for the merging was with the intention to have the best implementations of each library merged. regards, Nikos From wk at gnupg.org Tue Dec 10 14:50:46 2013 From: wk at gnupg.org (Werner Koch) Date: Tue, 10 Dec 2013 14:50:46 +0100 Subject: fig2dev detection In-Reply-To: <52A6F58A.4000502@net.in.tum.de> (Matthias Wachs's message of "Tue, 10 Dec 2013 12:05:46 +0100") References: <52A6F58A.4000502@net.in.tum.de> Message-ID: <87fvq0vmq1.fsf@vigenere.g10code.de> On Tue, 10 Dec 2013 12:05, wachs at net.in.tum.de said: > I am trying to install the latest libgcrypt version from git on a > ubuntu 12.4.3 system.
I (still) have the fig2dev detection issue apt-get install transfig Remember that the GIT repo is not a release but work in progress - all kind of stuff may be broken. You need to have certain tools available to even build configure. Ask Captain Yossarian why an extra step to configure the automake stuff drives people crazy. Shalom-Salam, Werner -- Die Gedanken sind frei. Ausnahmen regelt ein Bundesgesetz. From jukivili at dalek.fi Tue Dec 10 14:46:43 2013 From: jukivili at dalek.fi (Jussi Kivilinna) Date: Tue, 10 Dec 2013 15:46:43 +0200 Subject: benchmarks In-Reply-To: <877gbdwxis.fsf@vigenere.g10code.de> References: <8738m2yn0a.fsf@vigenere.g10code.de> <52A62BC1.4030702@iki.fi> <877gbdwxis.fsf@vigenere.g10code.de> Message-ID: <52A71B43.4010006@dalek.fi> On 09.12.2013 22:59, Werner Koch wrote: > On Mon, 9 Dec 2013 21:44, jussi.kivilinna at iki.fi said: > >> http://koti.kapsi.fi/~jukivili/gcrypt/haswell-3200-ubuntu-saucy-gcrypt.pdf > > That shows that your work on Libgcrypt made a huge difference. Now > for the hash functions ;-). Well, actually, that's something I have had in mind. Now, if I just had more time. -Jussi > > > Shalom-Salam, > > Werner > From nisse at lysator.liu.se Mon Dec 9 21:48:53 2013 From: nisse at lysator.liu.se (Niels =?iso-8859-1?Q?M=F6ller?=) Date: Mon, 09 Dec 2013 21:48:53 +0100 Subject: benchmarks In-Reply-To: <1386611866.2107.5.camel@aspire.lan> (Nikos Mavrogiannopoulos's message of "Mon, 09 Dec 2013 18:57:46 +0100") References: <8738m2yn0a.fsf@vigenere.g10code.de> <1386611866.2107.5.camel@aspire.lan> Message-ID: Nikos Mavrogiannopoulos writes: > Could libgcrypt and nettle share the low level algorithms so > improvements on one project will be shared with the other? To me, it would make sense to have gcrypt add its interfaces on top of Nettle. I hope there should be minimal overhead (in code complexity as well as running time overhead). It was my intention from the start that it should be easy to build other frameworks on top of Nettle. 
I'm not sure what obstacles, technical or others, there are. At least Nettle is now LGPL licenced (v2 or later). Any pieces missing which are essential for gcrypt? (E.g., currently Nettle has no runtime selection of cpu-specific code). > In the end I feel disappointed to see that because of that, even the > gnulib people will use openssl's libcrypto because it is faster: > http://lists.gnu.org/archive/html/bug-gnulib/2013-12/msg00058.html I totally agree. (And OpenSSL is not always faster than Nettle. On my home machine, a low end x86_64 AMD E-350, Nettle's AES is some 130% faster than openssl (which surprises me), sha1 is 20% faster than openssl, ecdsa is 300% faster for signing and 75% faster for verification. But on many machines, Nettle is a bit slower than openssl for sha1 and md5, which might be the most important things for gnulib). Regards, /Niels -- Niels Möller. PGP-encrypted email is preferred. Keyid C0B98E26. Internet email is subject to wholesale government surveillance. From nisse at lysator.liu.se Tue Dec 10 15:01:17 2013 From: nisse at lysator.liu.se (Niels Möller) Date: Tue, 10 Dec 2013 15:01:17 +0100 Subject: benchmarks In-Reply-To: (Nikos Mavrogiannopoulos's message of "Tue, 10 Dec 2013 13:39:19 +0100") References: <8738m2yn0a.fsf@vigenere.g10code.de> <1386611866.2107.5.camel@aspire.lan> <87bo0pwy24.fsf@vigenere.g10code.de> <87k3fdui48.fsf@vigenere.g10code.de> Message-ID: Nikos Mavrogiannopoulos writes: >>> Well, I really want to think that it is also about collaboration. > On Tue, Dec 10, 2013 at 11:15 AM, Werner Koch wrote: >> Right, I thought about the same lines but back then Niels decided to >> compile his own library without the need to comply to the strict GNU >> rules. Thus he was able to use all kind of code while I was not. I'm sorry you feel like this. There are historic reasons (and the first release of Nettle happened more than a decade ago).
The code started as the Pike cryptographic toolkit which I and Henrik Grubbström did in 1996 or so; we needed this to implement SSL for the Roxen webserver, which is written in Pike (Henrik first wrote some glue to use openssl, or "ssleay" as it was known back in the day, but we then decided to write our own SSL implementation). This was GPLd, but neither Pike nor Roxen are GNU projects, copyright assignments were never considered. A few years later, 1998, I started on LSH, and reused the low-level C code. Half a year later, LSH was dubbed a GNU package, with no large changes to the way it was developed. No copyright assignment policy was imposed at that time (and since I wasn't the sole author, it wouldn't have been trivial). Then Nettle was spun off from LSH in 2001. Time went by, and in 2009 it was dubbed a GNU package on its own, despite concerns about duplication with libgcrypt. Maybe I could have done some things differently back then, but I can't feel particularly guilty. So what about today? Is FSF copyright assignment important to you, and lack of Nettle CA a main show stopper for using Nettle in any way? I'd like to know the obstacles, technical or other. It would be possible, although some amount of boring work, to transfer most nettle copyrights to the FSF. I think I understand both the advantages and disadvantages which come with FSF copyright assignment. Regards, /Niels -- Niels Möller. PGP-encrypted email is preferred. Keyid C0B98E26. Internet email is subject to wholesale government surveillance.
From wk at gnupg.org Tue Dec 10 18:04:00 2013 From: wk at gnupg.org (Werner Koch) Date: Tue, 10 Dec 2013 18:04:00 +0100 Subject: benchmarks In-Reply-To: ("Niels Möller"'s message of "Tue, 10 Dec 2013 15:01:17 +0100") References: <8738m2yn0a.fsf@vigenere.g10code.de> <1386611866.2107.5.camel@aspire.lan> <87bo0pwy24.fsf@vigenere.g10code.de> <87k3fdui48.fsf@vigenere.g10code.de> Message-ID: <8738m0vdrz.fsf@vigenere.g10code.de> On Tue, 10 Dec 2013 15:01, nisse at lysator.liu.se said: > assignments were never considered. A few years later, 1998, I started on > LSH, and reused the low-level C code. Half a year later, LSH was dubbed I recall that we met at the DKUUG spring conference that year and talked about who would write a free ssh implementation (back then ssh was proprietary) and I pointed you to the psst site. I didn't know that you had a free crypto library at hand. > a GNU package, with no large changes to the way it was developed. No > copyright assignment policy was imposed at that time (and since I wasn't Nobody understands why RMS sometimes demands a CA and sometimes not. The FSF had always communicated to me that there is no way to do GNU stuff without a CA except for existing code like TeX, X11, and Mach. > So what about today? Is FSF copyright assignment important to you, and I announced last year that there is no more need for a CA. But the whole discussion is moot; Nettle and Libgcrypt are very different and there is no need to merge them. Shalom-Salam, Werner -- Die Gedanken sind frei. Ausnahmen regelt ein Bundesgesetz.
From nisse at lysator.liu.se Tue Dec 10 20:53:58 2013 From: nisse at lysator.liu.se (Niels Möller) Date: Tue, 10 Dec 2013 20:53:58 +0100 Subject: benchmarks In-Reply-To: <8738m0vdrz.fsf@vigenere.g10code.de> (Werner Koch's message of "Tue, 10 Dec 2013 18:04:00 +0100") References: <8738m2yn0a.fsf@vigenere.g10code.de> <1386611866.2107.5.camel@aspire.lan> <87bo0pwy24.fsf@vigenere.g10code.de> <87k3fdui48.fsf@vigenere.g10code.de> <8738m0vdrz.fsf@vigenere.g10code.de> Message-ID: Werner Koch writes: > But the whole discussion is moot; Nettle and Libgcrypt are very > different and there is no need to merge them. I imagine that it would be technically possible to write libgcrypt so that parts of it are a thin layer on top of Nettle. For a simple example, in libgcrypt/cipher/md5.c, either replace the transform function with a call to _nettle_md5_compress (which is implemented in assembly on some platforms, and somewhat internal). Or write the internal (not visible in libgcrypt's ABI) MD5_CONTEXT as something like #include <nettle/md5.h> typedef struct { struct md5_ctx ctx; uint8_t digest[MD5_DIGEST_SIZE]; } MD5_CONTEXT; and redefine _gcry_digest_spec_md5 to point to corresponding nettle functions, or trivial wrappers if needed. I see that there's some impedance mismatch, e.g., (init,write,final,read), vs (init,update,digest). I can't say if that's a big deal, aesthetically or in terms of performance overhead. Now, there may sure be good reasons why that's an undesirable change, but I'd like to understand what the obstacles are. The advantage is sharing of more low-level code. And if libgcrypt is faster than nettle for some particular algorithm, we'd obviously want to get the faster code into Nettle before changing libgcrypt. Regards, /Niels -- Niels Möller. PGP-encrypted email is preferred. Keyid C0B98E26. Internet email is subject to wholesale government surveillance.
From lrn1986 at gmail.com Tue Dec 10 23:23:29 2013 From: lrn1986 at gmail.com (LRN) Date: Wed, 11 Dec 2013 02:23:29 +0400 Subject: fig2dev detection In-Reply-To: <52A6F58A.4000502@net.in.tum.de> References: <52A6F58A.4000502@net.in.tum.de> Message-ID: <52A79461.2000308@gmail.com> -----BEGIN PGP SIGNED MESSAGE----- Hash: SHA1 On 10.12.2013 15:05, Matthias Wachs wrote: > Hi, > > I am trying to install the latest libgcrypt version from git on a > ubuntu 12.4.3 system. I (still) have the fig2dev detection issue > > Making install in doc make[1]: Entering directory > `/home/gnunet/svn/libgcrypt/doc' fig2dev -L eps `test -f > 'libgcrypt-modules.fig' || echo './'`libgcrypt-modules.fig > libgcrypt-modules.eps /bin/bash: fig2dev: command not found > make[1]: *** [libgcrypt-modules.eps] Error 127 make[1]: Leaving > directory `/home/gnunet/svn/libgcrypt/doc' make: *** > [install-recursive] Error 1 > Use this patch, and may the Force be with you. Or install fig2dev, as suggested in the other message in this thread. - -- O< ascii ribbon - stop html email! 
- www.asciiribbon.org -----BEGIN PGP SIGNATURE----- Version: GnuPG v1.4.11 (MingW32) iQEcBAEBAgAGBQJSp5RgAAoJEOs4Jb6SI2CwvmEIAIABI5XWKqsvod3jIFMNWEFD WaxxMbeAVPTDos0wwG33Mm0TnU/9OjnSAXpqsqxZMyH170AYOO5fP2lA123s1Gg3 TiD0vXOAe1KMphOeELzntS4oMoDr/6dxbmN9lK1GXKeZ9Y56I+Ea/VgaxlcYnnqt fegigB5zHSoB/jo0Y2N+7PkJP/npswSuMzUZz061P+HKWgL5BtjSAvXjqJbPBhek 7iQ++3P4EAaZenck+7GujWyp9tvU7lYxQBOFkhoXzblAt2o+WHldTl3iU6VWPSVo bAJsiSpNIqFMtWA4mFXe+jeqrlt6968b9p28RSZenOdic2O4j5/uEv+HAxtrxs8= =VcqT -----END PGP SIGNATURE----- -------------- next part -------------- From 15cad0ff53ad39e569b796474ad534687ea33e32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=A0=D1=83=D1=81=D0=BB=D0=B0=D0=BD=20=D0=98=D0=B6=D0=B1=D1?= =?UTF-8?q?=83=D0=BB=D0=B0=D1=82=D0=BE=D0=B2?= Date: Wed, 20 Mar 2013 20:31:57 +0400 Subject: [PATCH 2/3] Smarter fig2dev detection --- configure.ac | 5 +++++ doc/Makefile.am | 8 ++++---- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/configure.ac b/configure.ac index 7afd83d..a6d8bdc 100644 --- a/configure.ac +++ b/configure.ac @@ -654,6 +654,11 @@ AC_SUBST(PTH_CFLAGS) AC_SUBST(PTH_LIBS) # +# Find fig2dev +# +AC_PATH_PROG([FIG2DEV], [fig2dev], [false]) + +# # Check whether pthreads is available # AC_CHECK_LIB(pthread,pthread_create,have_pthread=yes) diff --git a/doc/Makefile.am b/doc/Makefile.am index 30330bb..42c7d4f 100644 --- a/doc/Makefile.am +++ b/doc/Makefile.am @@ -45,16 +45,16 @@ yat2m: yat2m.c $(CC_FOR_BUILD) -o $@ $(srcdir)/yat2m.c .fig.png: - fig2dev -L png `test -f '$<' || echo '$(srcdir)/'`$< $@ + $(FIG2DEV) -L png `test -f '$<' || echo '$(srcdir)/'`$< $@ || touch $@ .fig.jpg: - fig2dev -L jpg `test -f '$<' || echo '$(srcdir)/'`$< $@ + $(FIG2DEV) -L jpg `test -f '$<' || echo '$(srcdir)/'`$< $@ || touch $@ .fig.eps: - fig2dev -L eps `test -f '$<' || echo '$(srcdir)/'`$< $@ + $(FIG2DEV) -L eps `test -f '$<' || echo '$(srcdir)/'`$< $@ || touch $@ .fig.pdf: - fig2dev -L pdf `test -f '$<' || echo '$(srcdir)/'`$< $@ + $(FIG2DEV) -L pdf `test -f '$<' || echo 
'$(srcdir)/'`$< $@ || touch $@ yat2m-stamp: $(myman_sources) @rm -f yat2m-stamp.tmp -- 1.7.11 From cvs at cvs.gnupg.org Wed Dec 11 17:08:57 2013 From: cvs at cvs.gnupg.org (by Werner Koch) Date: Wed, 11 Dec 2013 17:08:57 +0100 Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.5.0-428-gcd548ba Message-ID: This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "The GNU crypto library". The branch, master has been updated via cd548ba2dc777b8b27d8d33182ba733c20222120 (commit) from eae1e7712e1b687bd77eb37d0eb505fc9d46d93c (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. - Log ----------------------------------------------------------------- commit cd548ba2dc777b8b27d8d33182ba733c20222120 Author: Werner Koch Date: Wed Dec 11 16:59:41 2013 +0100 random: Add a feature to close device file descriptors. * src/gcrypt.h.in (GCRYCTL_CLOSE_RANDOM_DEVICE): New. * src/global.c (_gcry_vcontrol): Call _gcry_random_close_fds. * random/random.c (_gcry_random_close_fds): New. * random/random-csprng.c (_gcry_rngcsprng_close_fds): New. * random/random-fips.c (_gcry_rngfips_close_fds): New. * random/random-system.c (_gcry_rngsystem_close_fds): New. * random/rndlinux.c (open_device): Add arg retry. (_gcry_rndlinux_gather_random): Add mode to close open fds. * tests/random.c (check_close_random_device): New. (main): Call new test. Signed-off-by: Werner Koch diff --git a/NEWS b/NEWS index ec853c9..4c95e8a 100644 --- a/NEWS +++ b/NEWS @@ -72,6 +72,7 @@ Noteworthy changes in version 1.6.0 (unreleased) GCRYCTL_SET_ENFORCED_FIPS_FLAG NEW. GCRYCTL_SET_PREFERRED_RNG_TYPE NEW. GCRYCTL_GET_CURRENT_RNG_TYPE NEW. + GCRYCTL_CLOSE_RANDOM_DEVICE NEW. GCRY_RNG_TYPE_STANDARD NEW. GCRY_RNG_TYPE_FIPS NEW. GCRY_RNG_TYPE_SYSTEM NEW. 
diff --git a/doc/gcrypt.texi b/doc/gcrypt.texi index 927634f..97dac1c 100644 --- a/doc/gcrypt.texi +++ b/doc/gcrypt.texi @@ -766,6 +766,14 @@ not an issue when using Linux (rndlinux driver), because this one guarantees to read full 16 bytes from /dev/urandom and thus there is no way for an attacker without kernel access to control these 16 bytes. + at item GCRYCTL_CLOSE_RANDOM_DEVICE; Arguments: none +Try to close the random device. If on Unix system you call fork(), +the child process does no call exec(), and you do not intend to use +Libgcrypt in the child, it might be useful to use this control code to +close the inherited file descriptors of the random device. If +Libgcrypt is later used again by the child, the device will be +re-opened. On non-Unix systems this control code is ignored. + @item GCRYCTL_SET_VERBOSITY; Arguments: int level This command sets the verbosity of the logging. A level of 0 disables all extra logging whereas positive numbers enable more verbose logging. @@ -1355,6 +1363,10 @@ values for @var{what} are defined: Not enough entropy is available. @var{total} holds the number of required bytes. + at item wait_dev_random +Waiting to re-open a random device. @var{total} gives the number of +seconds until the next try. 
+ @item primegen Values for @var{printchar}: @table @code diff --git a/random/rand-internal.h b/random/rand-internal.h index f59a102..79b23ac 100644 --- a/random/rand-internal.h +++ b/random/rand-internal.h @@ -44,6 +44,7 @@ void _gcry_random_progress (const char *what, int printchar, /*-- random-csprng.c --*/ void _gcry_rngcsprng_initialize (int full); +void _gcry_rngcsprng_close_fds (void); void _gcry_rngcsprng_dump_stats (void); void _gcry_rngcsprng_secure_alloc (void); void _gcry_rngcsprng_enable_quick_gen (void); @@ -64,6 +65,7 @@ void _gcry_rngcsprng_fast_poll (void); /*-- random-fips.c --*/ void _gcry_rngfips_initialize (int full); +void _gcry_rngfips_close_fds (void); void _gcry_rngfips_dump_stats (void); int _gcry_rngfips_is_faked (void); gcry_error_t _gcry_rngfips_add_bytes (const void *buf, size_t buflen, @@ -89,6 +91,7 @@ void _gcry_rngfips_deinit_external_test (void *context); /*-- random-system.c --*/ void _gcry_rngsystem_initialize (int full); +void _gcry_rngsystem_close_fds (void); void _gcry_rngsystem_dump_stats (void); int _gcry_rngsystem_is_faked (void); gcry_error_t _gcry_rngsystem_add_bytes (const void *buf, size_t buflen, diff --git a/random/random-csprng.c b/random/random-csprng.c index 9921c4f..b6d7f66 100644 --- a/random/random-csprng.c +++ b/random/random-csprng.c @@ -154,7 +154,7 @@ static int allow_seed_file_update; static int secure_alloc; /* This function pointer is set to the actual entropy gathering - function during initailization. After initialization it is + function during initialization. After initialization it is guaranteed to point to function. (On systems without a random gatherer module a dummy function is used).*/ static int (*slow_gather_fnc)(void (*)(const void*, size_t, @@ -361,6 +361,20 @@ _gcry_rngcsprng_initialize (int full) } +/* Try to close the FDs of the random gather module. This is + currently only implemented for rndlinux. 
*/ +void +_gcry_rngcsprng_close_fds (void) +{ + lock_pool (); +#if USE_RNDLINUX + _gcry_rndlinux_gather_random (NULL, 0, 0, 0); + pool_filled = 0; /* Force re-open on next use. */ +#endif + unlock_pool (); +} + + void _gcry_rngcsprng_dump_stats (void) { diff --git a/random/random-fips.c b/random/random-fips.c index c8100a2..6ee52f1 100644 --- a/random/random-fips.c +++ b/random/random-fips.c @@ -780,6 +780,19 @@ _gcry_rngfips_initialize (int full) } +/* Try to close the FDs of the random gather module. This is + currently only implemented for rndlinux. */ +void +_gcry_rngfips_close_fds (void) +{ + lock_rng (); +#if USE_RNDLINUX + _gcry_rndlinux_gather_random (NULL, 0, 0, 0); +#endif + unlock_rng (); +} + + /* Print some statistics about the RNG. */ void _gcry_rngfips_dump_stats (void) diff --git a/random/random-system.c b/random/random-system.c index 0ef9d24..3962ab8 100644 --- a/random/random-system.c +++ b/random/random-system.c @@ -193,6 +193,19 @@ _gcry_rngsystem_initialize (int full) } +/* Try to close the FDs of the random gather module. This is + currently only implemented for rndlinux. */ +void +_gcry_rngsystem_close_fds (void) +{ + lock_rng (); +#if USE_RNDLINUX + _gcry_rndlinux_gather_random (NULL, 0, 0, 0); +#endif + unlock_rng (); +} + + /* Print some statistics about the RNG. */ void _gcry_rngsystem_dump_stats (void) diff --git a/random/random.c b/random/random.c index 4679301..97018c4 100644 --- a/random/random.c +++ b/random/random.c @@ -165,6 +165,27 @@ _gcry_random_initialize (int full) } +/* If possible close file descriptors used by the RNG. */ +void +_gcry_random_close_fds (void) +{ + /* Note that we can't do that directly because each random system + has its own lock functions which need to be used for accessing + the entropy gatherer. 
*/ + + if (fips_mode ()) + _gcry_rngfips_close_fds (); + else if (rng_types.standard) + _gcry_rngcsprng_close_fds (); + else if (rng_types.fips) + _gcry_rngfips_close_fds (); + else if (rng_types.system) + _gcry_rngsystem_close_fds (); + else + _gcry_rngcsprng_close_fds (); +} + + /* Return the current RNG type. IGNORE_FIPS_MODE is a flag used to skip the test for FIPS. This is useful, so that we are able to return the type of the RNG even before we have setup FIPS mode diff --git a/random/random.h b/random/random.h index aae07ab..2bc8cab 100644 --- a/random/random.h +++ b/random/random.h @@ -28,6 +28,7 @@ void _gcry_register_random_progress (void (*cb)(void *,const char*,int,int,int), void _gcry_set_preferred_rng_type (int type); void _gcry_random_initialize (int full); +void _gcry_random_close_fds (void); int _gcry_get_rng_type (int ignore_fips_mode); void _gcry_random_dump_stats(void); void _gcry_secure_random_alloc(void); diff --git a/random/rndlinux.c b/random/rndlinux.c index b304cc9..21ea8c4 100644 --- a/random/rndlinux.c +++ b/random/rndlinux.c @@ -36,7 +36,7 @@ #include "g10lib.h" #include "rand-internal.h" -static int open_device ( const char *name ); +static int open_device (const char *name, int retry); static int @@ -54,15 +54,30 @@ set_cloexec_flag (int fd) /* - * Used to open the /dev/random devices (Linux, xBSD, Solaris (if it exists)). + * Used to open the /dev/random devices (Linux, xBSD, Solaris (if it + * exists)). If RETRY is true, the function does not terminate with + * a fatal error but retries until it is able to reopen the device. 
*/ static int -open_device ( const char *name ) +open_device (const char *name, int retry) { int fd; - fd = open ( name, O_RDONLY ); - if ( fd == -1 ) + if (retry) + _gcry_random_progress ("open_dev_random", 'X', 1, 0); + again: + fd = open (name, O_RDONLY); + if (fd == -1 && retry) + { + struct timeval tv; + + tv.tv_sec = 5; + tv.tv_usec = 0; + _gcry_random_progress ("wait_dev_random", 'X', 0, (int)tv.tv_sec); + select (0, NULL, NULL, NULL, &tv); + goto again; + } + if (fd == -1) log_fatal ("can't open %s: %s\n", name, strerror(errno) ); if (set_cloexec_flag (fd)) @@ -84,6 +99,10 @@ open_device ( const char *name ) } +/* Note that the caller needs to make sure that this function is only + called by one thread at a time. The function returns 0 on success + or true on failure (in which case the caller will signal a fatal + error). */ int _gcry_rndlinux_gather_random (void (*add)(const void*, size_t, enum random_origins), @@ -92,6 +111,7 @@ _gcry_rndlinux_gather_random (void (*add)(const void*, size_t, { static int fd_urandom = -1; static int fd_random = -1; + static unsigned char ever_opened; int fd; int n; byte buffer[768]; @@ -101,6 +121,23 @@ _gcry_rndlinux_gather_random (void (*add)(const void*, size_t, int any_need_entropy = 0; int delay; + if (!add) + { + /* Special mode to close the descriptors. */ + if (fd_random != -1) + { + close (fd_random); + fd_random = -1; + } + if (fd_urandom != -1) + { + close (fd_urandom); + fd_urandom = -1; + } + return 0; + } + + /* First read from a hardware source. However let it account only for up to 50% of the requested bytes. */ n_hw = _gcry_rndhw_poll_slow (add, origin); @@ -109,17 +146,29 @@ _gcry_rndlinux_gather_random (void (*add)(const void*, size_t, if (length > 1) length -= n_hw; - /* Open the requested device. */ + /* Open the requested device. The first time a device is to be + opened we fail with a fatal error if the device does not exists. 
+ In case the device has ever been closed, further open requests + will however retry indefinitely. The rationale for this behaviour is + that we always require the device to be existent but want a more + graceful behaviour if the rarely needed close operation has been + used and the device needs to be re-opened later. */ if (level >= 2) { - if( fd_random == -1 ) - fd_random = open_device ( NAME_OF_DEV_RANDOM ); + if (fd_random == -1) + { + fd_random = open_device (NAME_OF_DEV_RANDOM, (ever_opened & 1)); + ever_opened |= 1; + } fd = fd_random; } else { - if( fd_urandom == -1 ) - fd_urandom = open_device ( NAME_OF_DEV_URANDOM ); + if (fd_urandom == -1) + { + fd_urandom = open_device (NAME_OF_DEV_URANDOM, (ever_opened & 2)); + ever_opened |= 2; + } fd = fd_urandom; } @@ -164,7 +213,7 @@ _gcry_rndlinux_gather_random (void (*add)(const void*, size_t, log_error ("select() error: %s\n", strerror(errno)); if (!delay) delay = 1; /* Use 1 second if we encounter an error before - we have ever blocked. */ + we have ever blocked. */ continue; } } diff --git a/src/gcrypt.h.in b/src/gcrypt.h.in index 53133bf..5c771e5 100644 --- a/src/gcrypt.h.in +++ b/src/gcrypt.h.in @@ -326,7 +326,8 @@ enum gcry_ctl_cmds GCRYCTL_GET_CURRENT_RNG_TYPE = 66, GCRYCTL_DISABLE_LOCKED_SECMEM = 67, GCRYCTL_DISABLE_PRIV_DROP = 68, - GCRYCTL_SET_CCM_LENGTHS = 69 + GCRYCTL_SET_CCM_LENGTHS = 69, + GCRYCTL_CLOSE_RANDOM_DEVICE = 70 }; /* Perform various operations defined by CMD. */ diff --git a/src/global.c b/src/global.c index 8521e58..8a5d310 100644 --- a/src/global.c +++ b/src/global.c @@ -540,6 +540,10 @@ _gcry_vcontrol (enum gcry_ctl_cmds cmd, va_list arg_ptr) _gcry_use_random_daemon (!! va_arg (arg_ptr, int)); break; + case GCRYCTL_CLOSE_RANDOM_DEVICE: + _gcry_random_close_fds (); + break; + /* This command dumps information pertaining to the configuration of libgcrypt to the given stream. 
It may be used before the initialization has been finished but not diff --git a/tests/random.c b/tests/random.c index ccaa3f9..10bf646 100644 --- a/tests/random.c +++ b/tests/random.c @@ -270,6 +270,54 @@ check_nonce_forking (void) } +/* Check that a closed random device os re-opened if needed. */ +static void +check_close_random_device (void) +{ +#ifdef HAVE_W32_SYSTEM + if (verbose) + inf ("check_close_random_device skipped: not applicable on Windows\n"); +#else /*!HAVE_W32_SYSTEM*/ + pid_t pid; + int i, status; + char buf[4]; + + if (verbose) + inf ("checking that close_random_device works\n"); + + gcry_randomize (buf, sizeof buf, GCRY_STRONG_RANDOM); + if (verbose) + print_hex ("parent random: ", buf, sizeof buf); + + pid = fork (); + if (pid == (pid_t)(-1)) + die ("fork failed: %s\n", strerror (errno)); + if (!pid) + { + gcry_control (GCRYCTL_CLOSE_RANDOM_DEVICE, 0); + + /* The next call will re-open the device. */ + gcry_randomize (buf, sizeof buf, GCRY_STRONG_RANDOM); + if (verbose) + { + print_hex ("child random : ", buf, sizeof buf); + fflush (stdout); + } + _exit (0); + } + + while ( (i=waitpid (pid, &status, 0)) == -1 && errno == EINTR) + ; + if (i != (pid_t)(-1) + && WIFEXITED (status) && !WEXITSTATUS (status)) + ; + else + die ("child failed\n"); + +#endif /*!HAVE_W32_SYSTEM*/ +} + + static int rng_type (void) { @@ -529,6 +577,7 @@ main (int argc, char **argv) { check_forking (); check_nonce_forking (); + check_close_random_device (); } check_rng_type_switching (); ----------------------------------------------------------------------- Summary of changes: NEWS | 1 + doc/gcrypt.texi | 12 ++++++++ random/rand-internal.h | 3 ++ random/random-csprng.c | 16 ++++++++++- random/random-fips.c | 13 +++++++++ random/random-system.c | 13 +++++++++ random/random.c | 21 ++++++++++++++ random/random.h | 1 + random/rndlinux.c | 71 ++++++++++++++++++++++++++++++++++++++++-------- src/gcrypt.h.in | 3 +- src/global.c | 4 +++ tests/random.c | 49 
+++++++++++++++++++++++++++++++++ 12 files changed, 194 insertions(+), 13 deletions(-) hooks/post-receive -- The GNU crypto library http://git.gnupg.org _______________________________________________ Gnupg-commits mailing list Gnupg-commits at gnupg.org http://lists.gnupg.org/mailman/listinfo/gnupg-commits From jussi.kivilinna at iki.fi Wed Dec 11 20:49:14 2013 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Wed, 11 Dec 2013 21:49:14 +0200 Subject: [PATCH] SHA-256: Add SSSE3 implementation for x86-64 Message-ID: <20131211194914.944.7490.stgit@localhost6.localdomain6> * cipher/Makefile.am: Add 'sha256-ssse3-amd64.S'. * cipher/sha256-ssse3-amd64.S: New. * cipher/sha256.c (USE_SSSE3): New. (SHA256_CONTEXT) [USE_SSSE3]: Add 'use_ssse3'. (sha256_init, sha224_init) [USE_SSSE3]: Initialize 'use_ssse3'. (transform): Rename to... (_transform): This. [USE_SSSE3] (_gcry_sha256_transform_amd64_ssse3): New. (transform): New. * configure.ac (HAVE_INTEL_SYNTAX_PLATFORM_AS): New check. (sha256): Add 'sha256-ssse3-amd64.lo'. * src/g10lib.h (HWF_INTEL_SSSE3): New. * src/global.c (hwflist): Add "intel-ssse3". * src/hwf-x86.c (detect_x86_gnuc): Test for SSSE3. -- Patch adds fast SSSE3 implementation of SHA-256 by Intel Corporation. The assembly source is licensed under 3-clause BSD license, thus compatible with LGPL2.1+. Original source can be accessed at: http://www.intel.com/p/en_US/embedded/hwsw/technology/packet-processing#docs Implementation is described in white paper "Fast SHA - 256 Implementations on Intel® 
Architecture Processors" http://www.intel.com/content/www/us/en/intelligent-systems/intel-technology/sha-256-implementations-paper.html Benchmarks: cpu Old New Diff Intel i5-4570 13.99 c/B 10.66 c/B 1.31x Intel i5-2450M 21.53 c/B 15.79 c/B 1.36x Intel Core2 T8100 20.84 c/B 15.07 c/B 1.38x Signed-off-by: Jussi Kivilinna --- cipher/Makefile.am | 2 cipher/sha256-ssse3-amd64.S | 526 +++++++++++++++++++++++++++++++++++++++++++ cipher/sha256.c | 45 ++++ configure.ac | 45 ++++ src/g10lib.h | 1 src/global.c | 1 src/hwf-x86.c | 3 7 files changed, 621 insertions(+), 2 deletions(-) create mode 100644 cipher/sha256-ssse3-amd64.S diff --git a/cipher/Makefile.am b/cipher/Makefile.am index ff9deca..34f74e2 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -78,7 +78,7 @@ scrypt.c \ seed.c \ serpent.c serpent-sse2-amd64.S serpent-avx2-amd64.S \ sha1.c \ -sha256.c \ +sha256.c sha256-ssse3-amd64.S \ sha512.c sha512-armv7-neon.S \ stribog.c \ tiger.c \ diff --git a/cipher/sha256-ssse3-amd64.S b/cipher/sha256-ssse3-amd64.S new file mode 100644 index 0000000..d7d5980 --- /dev/null +++ b/cipher/sha256-ssse3-amd64.S @@ -0,0 +1,526 @@ +/* +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright (c) 2012, Intel Corporation +; +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are +; met: +; +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in the +; documentation and/or other materials provided with the +; distribution. +; +; * Neither the name of the Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived from +; this software without specific prior written permission. 
+; +; +; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY +; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR +; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; +; This code is described in an Intel White-Paper: +; "Fast SHA-256 Implementations on Intel Architecture Processors" +; +; To find it, surf to http://www.intel.com/p/en_US/embedded +; and search for that title. +; The paper is expected to be released roughly at the end of April, 2012 +; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; This code schedules 1 blocks at a time, with 4 lanes per block +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +*/ +/* + * Conversion to GAS assembly and integration to libgcrypt + * by Jussi Kivilinna + * + * Note: original implementation was named as SHA256-SSE4. However, only SSSE3 + * is required. 
+ */ + +#ifdef __x86_64 +#include +#if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \ + defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_SSSE3) && defined(USE_SHA256) + +#ifdef __PIC__ +# define ADD_RIP +rip +#else +# define ADD_RIP +#endif + +.intel_syntax noprefix + +#define MOVDQ movdqu /* assume buffers not aligned */ + +/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros*/ + +/* addm [mem], reg + * Add reg to mem using reg-mem add and store */ +.macro addm p1 p2 + add \p2, \p1 + mov \p1, \p2 +.endm + +/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/ + +/* COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask + * Load xmm with mem and byte swap each dword */ +.macro COPY_XMM_AND_BSWAP p1 p2 p3 + MOVDQ \p1, \p2 + pshufb \p1, \p3 +.endm + +/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/ + +X0 = xmm4 +X1 = xmm5 +X2 = xmm6 +X3 = xmm7 + +XTMP0 = xmm0 +XTMP1 = xmm1 +XTMP2 = xmm2 +XTMP3 = xmm3 +XTMP4 = xmm8 +XFER = xmm9 + +SHUF_00BA = xmm10 /* shuffle xBxA -> 00BA */ +SHUF_DC00 = xmm11 /* shuffle xDxC -> DC00 */ +BYTE_FLIP_MASK = xmm12 + +NUM_BLKS = rdx /* 3rd arg */ +CTX = rsi /* 2nd arg */ +INP = rdi /* 1st arg */ + +SRND = rdi /* clobbers INP */ +c = ecx +d = r8d +e = edx + +TBL = rbp +a = eax +b = ebx + +f = r9d +g = r10d +h = r11d + +y0 = r13d +y1 = r14d +y2 = r15d + + + +#define _INP_END_SIZE 8 +#define _INP_SIZE 8 +#define _XFER_SIZE 8 +#define _XMM_SAVE_SIZE 0 +/* STACK_SIZE plus pushes must be an odd multiple of 8 */ +#define _ALIGN_SIZE 8 + +#define _INP_END 0 +#define _INP (_INP_END + _INP_END_SIZE) +#define _XFER (_INP + _INP_SIZE) +#define _XMM_SAVE (_XFER + _XFER_SIZE + _ALIGN_SIZE) +#define STACK_SIZE (_XMM_SAVE + _XMM_SAVE_SIZE) + +/* rotate_Xs + * Rotate values of symbols X0...X3 */ +.macro rotate_Xs +X_ = X0 +X0 = X1 +X1 = X2 +X2 = X3 +X3 = X_ +.endm + +/* ROTATE_ARGS + * Rotate values of symbols a...h */ +.macro ROTATE_ARGS +TMP_ = h +h = g +g = f +f = e +e = d +d = c +c = b +b = a +a = TMP_ +.endm + +.macro FOUR_ROUNDS_AND_SCHED + /* compute s0 four at 
a time and s1 two at a time + * compute W[-16] + W[-7] 4 at a time */ + movdqa XTMP0, X3 + mov y0, e /* y0 = e */ + ror y0, (25-11) /* y0 = e >> (25-11) */ + mov y1, a /* y1 = a */ + palignr XTMP0, X2, 4 /* XTMP0 = W[-7] */ + ror y1, (22-13) /* y1 = a >> (22-13) */ + xor y0, e /* y0 = e ^ (e >> (25-11)) */ + mov y2, f /* y2 = f */ + ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */ + movdqa XTMP1, X1 + xor y1, a /* y1 = a ^ (a >> (22-13) */ + xor y2, g /* y2 = f^g */ + paddd XTMP0, X0 /* XTMP0 = W[-7] + W[-16] */ + xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */ + and y2, e /* y2 = (f^g)&e */ + ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */ + /* compute s0 */ + palignr XTMP1, X0, 4 /* XTMP1 = W[-15] */ + xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */ + ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */ + xor y2, g /* y2 = CH = ((f^g)&e)^g */ + movdqa XTMP2, XTMP1 /* XTMP2 = W[-15] */ + ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */ + add y2, y0 /* y2 = S1 + CH */ + add y2, [rsp + _XFER + 0*4] /* y2 = k + w + S1 + CH */ + movdqa XTMP3, XTMP1 /* XTMP3 = W[-15] */ + mov y0, a /* y0 = a */ + add h, y2 /* h = h + S1 + CH + k + w */ + mov y2, a /* y2 = a */ + pslld XTMP1, (32-7) + or y0, c /* y0 = a|c */ + add d, h /* d = d + h + S1 + CH + k + w */ + and y2, c /* y2 = a&c */ + psrld XTMP2, 7 + and y0, b /* y0 = (a|c)&b */ + add h, y1 /* h = h + S1 + CH + k + w + S0 */ + por XTMP1, XTMP2 /* XTMP1 = W[-15] ror 7 */ + or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */ + add h, y0 /* h = h + S1 + CH + k + w + S0 + MAJ */ + +ROTATE_ARGS + movdqa XTMP2, XTMP3 /* XTMP2 = W[-15] */ + mov y0, e /* y0 = e */ + mov y1, a /* y1 = a */ + movdqa XTMP4, XTMP3 /* XTMP4 = W[-15] */ + ror y0, (25-11) /* y0 = e >> (25-11) */ + xor y0, e /* y0 = e ^ (e >> (25-11)) */ + mov y2, f /* y2 = f */ + ror y1, (22-13) /* y1 = a >> (22-13) */ + pslld XTMP3, (32-18) + xor y1, a /* y1 = a ^ (a >> (22-13) */ + ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */ + xor y2, 
g /* y2 = f^g */ + psrld XTMP2, 18 + ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */ + xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */ + and y2, e /* y2 = (f^g)&e */ + ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */ + pxor XTMP1, XTMP3 + xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */ + xor y2, g /* y2 = CH = ((f^g)&e)^g */ + psrld XTMP4, 3 /* XTMP4 = W[-15] >> 3 */ + add y2, y0 /* y2 = S1 + CH */ + add y2, [rsp + _XFER + 1*4] /* y2 = k + w + S1 + CH */ + ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */ + pxor XTMP1, XTMP2 /* XTMP1 = W[-15] ror 7 ^ W[-15] ror 18 */ + mov y0, a /* y0 = a */ + add h, y2 /* h = h + S1 + CH + k + w */ + mov y2, a /* y2 = a */ + pxor XTMP1, XTMP4 /* XTMP1 = s0 */ + or y0, c /* y0 = a|c */ + add d, h /* d = d + h + S1 + CH + k + w */ + and y2, c /* y2 = a&c */ + /* compute low s1 */ + pshufd XTMP2, X3, 0b11111010 /* XTMP2 = W[-2] {BBAA} */ + and y0, b /* y0 = (a|c)&b */ + add h, y1 /* h = h + S1 + CH + k + w + S0 */ + paddd XTMP0, XTMP1 /* XTMP0 = W[-16] + W[-7] + s0 */ + or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */ + add h, y0 /* h = h + S1 + CH + k + w + S0 + MAJ */ + +ROTATE_ARGS + movdqa XTMP3, XTMP2 /* XTMP3 = W[-2] {BBAA} */ + mov y0, e /* y0 = e */ + mov y1, a /* y1 = a */ + ror y0, (25-11) /* y0 = e >> (25-11) */ + movdqa XTMP4, XTMP2 /* XTMP4 = W[-2] {BBAA} */ + xor y0, e /* y0 = e ^ (e >> (25-11)) */ + ror y1, (22-13) /* y1 = a >> (22-13) */ + mov y2, f /* y2 = f */ + xor y1, a /* y1 = a ^ (a >> (22-13) */ + ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */ + psrlq XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xBxA} */ + xor y2, g /* y2 = f^g */ + psrlq XTMP3, 19 /* XTMP3 = W[-2] ror 19 {xBxA} */ + xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */ + and y2, e /* y2 = (f^g)&e */ + psrld XTMP4, 10 /* XTMP4 = W[-2] >> 10 {BBAA} */ + ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */ + xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */ + xor y2, g /* y2 = CH = ((f^g)&e)^g */ + ror y0, 6 /* y0 = 
S1 = (e>>6) & (e>>11) ^ (e>>25) */ + pxor XTMP2, XTMP3 + add y2, y0 /* y2 = S1 + CH */ + ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */ + add y2, [rsp + _XFER + 2*4] /* y2 = k + w + S1 + CH */ + pxor XTMP4, XTMP2 /* XTMP4 = s1 {xBxA} */ + mov y0, a /* y0 = a */ + add h, y2 /* h = h + S1 + CH + k + w */ + mov y2, a /* y2 = a */ + pshufb XTMP4, SHUF_00BA /* XTMP4 = s1 {00BA} */ + or y0, c /* y0 = a|c */ + add d, h /* d = d + h + S1 + CH + k + w */ + and y2, c /* y2 = a&c */ + paddd XTMP0, XTMP4 /* XTMP0 = {..., ..., W[1], W[0]} */ + and y0, b /* y0 = (a|c)&b */ + add h, y1 /* h = h + S1 + CH + k + w + S0 */ + /* compute high s1 */ + pshufd XTMP2, XTMP0, 0b01010000 /* XTMP2 = W[-2] {DDCC} */ + or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */ + add h, y0 /* h = h + S1 + CH + k + w + S0 + MAJ */ + +ROTATE_ARGS + movdqa XTMP3, XTMP2 /* XTMP3 = W[-2] {DDCC} */ + mov y0, e /* y0 = e */ + ror y0, (25-11) /* y0 = e >> (25-11) */ + mov y1, a /* y1 = a */ + movdqa X0, XTMP2 /* X0 = W[-2] {DDCC} */ + ror y1, (22-13) /* y1 = a >> (22-13) */ + xor y0, e /* y0 = e ^ (e >> (25-11)) */ + mov y2, f /* y2 = f */ + ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */ + psrlq XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xDxC} */ + xor y1, a /* y1 = a ^ (a >> (22-13) */ + xor y2, g /* y2 = f^g */ + psrlq XTMP3, 19 /* XTMP3 = W[-2] ror 19 {xDxC} */ + xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */ + and y2, e /* y2 = (f^g)&e */ + ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */ + psrld X0, 10 /* X0 = W[-2] >> 10 {DDCC} */ + xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */ + ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */ + xor y2, g /* y2 = CH = ((f^g)&e)^g */ + pxor XTMP2, XTMP3 + ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */ + add y2, y0 /* y2 = S1 + CH */ + add y2, [rsp + _XFER + 3*4] /* y2 = k + w + S1 + CH */ + pxor X0, XTMP2 /* X0 = s1 {xDxC} */ + mov y0, a /* y0 = a */ + add h, y2 /* h = h + S1 + CH + k + w */ + mov y2, a /* y2 = a */ + pshufb X0, SHUF_DC00 
/* X0 = s1 {DC00} */ + or y0, c /* y0 = a|c */ + add d, h /* d = d + h + S1 + CH + k + w */ + and y2, c /* y2 = a&c */ + paddd X0, XTMP0 /* X0 = {W[3], W[2], W[1], W[0]} */ + and y0, b /* y0 = (a|c)&b */ + add h, y1 /* h = h + S1 + CH + k + w + S0 */ + or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */ + add h, y0 /* h = h + S1 + CH + k + w + S0 + MAJ */ + +ROTATE_ARGS +rotate_Xs +.endm + +/* input is [rsp + _XFER + %1 * 4] */ +.macro DO_ROUND i1 + mov y0, e /* y0 = e */ + ror y0, (25-11) /* y0 = e >> (25-11) */ + mov y1, a /* y1 = a */ + xor y0, e /* y0 = e ^ (e >> (25-11)) */ + ror y1, (22-13) /* y1 = a >> (22-13) */ + mov y2, f /* y2 = f */ + xor y1, a /* y1 = a ^ (a >> (22-13) */ + ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */ + xor y2, g /* y2 = f^g */ + xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */ + ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */ + and y2, e /* y2 = (f^g)&e */ + xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */ + ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */ + xor y2, g /* y2 = CH = ((f^g)&e)^g */ + add y2, y0 /* y2 = S1 + CH */ + ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */ + add y2, [rsp + _XFER + \i1 * 4] /* y2 = k + w + S1 + CH */ + mov y0, a /* y0 = a */ + add h, y2 /* h = h + S1 + CH + k + w */ + mov y2, a /* y2 = a */ + or y0, c /* y0 = a|c */ + add d, h /* d = d + h + S1 + CH + k + w */ + and y2, c /* y2 = a&c */ + and y0, b /* y0 = (a|c)&b */ + add h, y1 /* h = h + S1 + CH + k + w + S0 */ + or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */ + add h, y0 /* h = h + S1 + CH + k + w + S0 + MAJ */ + ROTATE_ARGS +.endm + +/* +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; void sha256_sse4(void *input_data, UINT32 digest[8], UINT64 num_blks) +;; arg 1 : pointer to input data +;; arg 2 : pointer to digest +;; arg 3 : Num blocks +*/ +.text +.globl _gcry_sha256_transform_amd64_ssse3 +.type 
_gcry_sha256_transform_amd64_ssse3, at function; +.align 32 +_gcry_sha256_transform_amd64_ssse3: + push rbx + push rbp + push r13 + push r14 + push r15 + + sub rsp, STACK_SIZE + + shl NUM_BLKS, 6 /* convert to bytes */ + jz .Ldone_hash + add NUM_BLKS, INP /* pointer to end of data */ + mov [rsp + _INP_END], NUM_BLKS + + /* load initial digest */ + mov a,[4*0 + CTX] + mov b,[4*1 + CTX] + mov c,[4*2 + CTX] + mov d,[4*3 + CTX] + mov e,[4*4 + CTX] + mov f,[4*5 + CTX] + mov g,[4*6 + CTX] + mov h,[4*7 + CTX] + + movdqa BYTE_FLIP_MASK, [.LPSHUFFLE_BYTE_FLIP_MASK ADD_RIP] + movdqa SHUF_00BA, [.L_SHUF_00BA ADD_RIP] + movdqa SHUF_DC00, [.L_SHUF_DC00 ADD_RIP] + +.Loop0: + lea TBL, [.LK256 ADD_RIP] + + /* byte swap first 16 dwords */ + COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK + + mov [rsp + _INP], INP + + /* schedule 48 input dwords, by doing 3 rounds of 16 each */ + mov SRND, 3 +.align 16 +.Loop1: + movdqa XFER, [TBL + 0*16] + paddd XFER, X0 + movdqa [rsp + _XFER], XFER + FOUR_ROUNDS_AND_SCHED + + movdqa XFER, [TBL + 1*16] + paddd XFER, X0 + movdqa [rsp + _XFER], XFER + FOUR_ROUNDS_AND_SCHED + + movdqa XFER, [TBL + 2*16] + paddd XFER, X0 + movdqa [rsp + _XFER], XFER + FOUR_ROUNDS_AND_SCHED + + movdqa XFER, [TBL + 3*16] + paddd XFER, X0 + movdqa [rsp + _XFER], XFER + add TBL, 4*16 + FOUR_ROUNDS_AND_SCHED + + sub SRND, 1 + jne .Loop1 + + mov SRND, 2 +.Loop2: + paddd X0, [TBL + 0*16] + movdqa [rsp + _XFER], X0 + DO_ROUND 0 + DO_ROUND 1 + DO_ROUND 2 + DO_ROUND 3 + paddd X1, [TBL + 1*16] + movdqa [rsp + _XFER], X1 + add TBL, 2*16 + DO_ROUND 0 + DO_ROUND 1 + DO_ROUND 2 + DO_ROUND 3 + + movdqa X0, X2 + movdqa X1, X3 + + sub SRND, 1 + jne .Loop2 + + addm [4*0 + CTX],a + addm [4*1 + CTX],b + addm [4*2 + CTX],c + addm [4*3 + CTX],d + addm [4*4 + CTX],e + addm [4*5 + CTX],f + addm [4*6 + CTX],g + addm [4*7 + CTX],h + 
+ mov INP, [rsp + _INP] + add INP, 64 + cmp INP, [rsp + _INP_END] + jne .Loop0 + +.Ldone_hash: + add rsp, STACK_SIZE + + pop r15 + pop r14 + pop r13 + pop rbp + pop rbx + + mov rax, STACK_SIZE + ret + + +.data +.align 64 +.LK256: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.LPSHUFFLE_BYTE_FLIP_MASK: .octa 0x0c0d0e0f08090a0b0405060700010203 + +/* shuffle xBxA -> 00BA */ +.L_SHUF_00BA: .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100 + +/* shuffle xDxC -> DC00 */ +.L_SHUF_DC00: .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF + +#endif +#endif diff --git a/cipher/sha256.c b/cipher/sha256.c index bd5a412..f3c1d62 100644 --- a/cipher/sha256.c +++ b/cipher/sha256.c @@ -46,11 +46,25 @@ #include "cipher.h" #include "hash-common.h" + +/* USE_SSSE3 indicates whether to compile with Intel SSSE3 code. 
*/ +#undef USE_SSSE3 +#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_SSSE3) && \ + defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) +# define USE_SSSE3 1 +#endif + + typedef struct { gcry_md_block_ctx_t bctx; u32 h0,h1,h2,h3,h4,h5,h6,h7; +#ifdef USE_SSSE3 + unsigned int use_ssse3:1; +#endif } SHA256_CONTEXT; + static unsigned int transform (void *c, const unsigned char *data); @@ -74,6 +88,10 @@ sha256_init (void *context) hd->bctx.count = 0; hd->bctx.blocksize = 64; hd->bctx.bwrite = transform; + +#ifdef USE_SSSE3 + hd->use_ssse3 = (_gcry_get_hw_features () & HWF_INTEL_SSSE3) != 0; +#endif } @@ -96,6 +114,10 @@ sha224_init (void *context) hd->bctx.count = 0; hd->bctx.blocksize = 64; hd->bctx.bwrite = transform; + +#ifdef USE_SSSE3 + hd->use_ssse3 = (_gcry_get_hw_features () & HWF_INTEL_SSSE3) != 0; +#endif } @@ -148,7 +170,7 @@ Sum1 (u32 x) static unsigned int -transform (void *ctx, const unsigned char *data) +_transform (void *ctx, const unsigned char *data) { SHA256_CONTEXT *hd = ctx; static const u32 K[64] = { @@ -254,6 +276,27 @@ transform (void *ctx, const unsigned char *data) #undef R +#ifdef USE_SSSE3 +unsigned int _gcry_sha256_transform_amd64_ssse3(const void *input_data, + u32 state[8], size_t num_blks); +#endif + + +static unsigned int +transform (void *ctx, const unsigned char *data) +{ + SHA256_CONTEXT *hd = ctx; + +#ifdef USE_SSSE3 + if (hd->use_ssse3) + return _gcry_sha256_transform_amd64_ssse3 (data, &hd->h0, 1) + + 4 * sizeof(void*); +#endif + + return _transform (hd, data); +} + + /* The routine finally terminates the computation and returns the digest. 
The handle is prepared for a new cycle, but adding bytes diff --git a/configure.ac b/configure.ac index 6d40343..eb0dd29 100644 --- a/configure.ac +++ b/configure.ac @@ -1077,6 +1077,44 @@ fi # +# Check whether GCC assembler supports features needed for our amd64 assembly +# implementations that use Intel syntax +# +if test $amd64_as_feature_detection = yes; then + if test $gcry_cv_gcc_amd64_platform_as_ok = yes; then + AC_CACHE_CHECK([whether GCC assembler is compatible for Intel syntax assembly implementations], + [gcry_cv_gcc_amd64_platform_as_ok_for_intel_syntax], + [gcry_cv_gcc_amd64_platform_as_ok_for_intel_syntax=no + AC_COMPILE_IFELSE([AC_LANG_SOURCE( + [[__asm__( + ".intel_syntax noprefix\n\t" + "pxor xmm1, xmm7;\n\t" + /* Intel syntax implementation also use GAS macros, so check + * for them here. */ + "VAL_A = xmm8\n\t" + "VAL_B = xmm9\n\t" + ".macro SET_VAL_A p1\n\t" + " VAL_A = \\\\p1 \n\t" + ".endm\n\t" + ".macro SET_VAL_B p1\n\t" + " VAL_B = \\\\p1 \n\t" + ".endm\n\t" + "vmovdqa VAL_A, VAL_B;\n\t" + "SET_VAL_A eax\n\t" + "SET_VAL_B r8d\n\t" + "add VAL_A, VAL_B;\n\t" + "add VAL_B, 0b10101;\n\t" + );]])], + [gcry_cv_gcc_amd64_platform_as_ok_for_intel_syntax=yes])]) + if test "$gcry_cv_gcc_amd64_platform_as_ok_for_intel_syntax" = "yes" ; then + AC_DEFINE(HAVE_INTEL_SYNTAX_PLATFORM_AS,1, + [Defined if underlying assembler is compatible with Intel syntax assembly implementations]) + fi + fi +fi + + +# # Check whether compiler is configured for ARMv6 or newer architecture # AC_CACHE_CHECK([whether compiler is configured for ARMv6 or newer architecture], @@ -1676,6 +1714,13 @@ LIST_MEMBER(sha256, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha256.lo" AC_DEFINE(USE_SHA256, 1, [Defined if this module should be included]) + + case "${host}" in + x86_64-*-*) + # Build with the assembly implementation + GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha256-ssse3-amd64.lo" + ;; + esac fi LIST_MEMBER(sha512, $enabled_digests) diff --git 
a/src/g10lib.h b/src/g10lib.h index a326ad5..1dcadfa 100644 --- a/src/g10lib.h +++ b/src/g10lib.h @@ -167,6 +167,7 @@ int _gcry_log_verbosity( int level ); #define HWF_PADLOCK_SHA 4 #define HWF_PADLOCK_MMUL 8 +#define HWF_INTEL_SSSE3 64 #define HWF_INTEL_PCLMUL 128 #define HWF_INTEL_AESNI 256 #define HWF_INTEL_RDRAND 512 diff --git a/src/global.c b/src/global.c index 8a5d310..de99286 100644 --- a/src/global.c +++ b/src/global.c @@ -66,6 +66,7 @@ static struct { HWF_PADLOCK_AES, "padlock-aes" }, { HWF_PADLOCK_SHA, "padlock-sha" }, { HWF_PADLOCK_MMUL,"padlock-mmul"}, + { HWF_INTEL_SSSE3, "intel-ssse3" }, { HWF_INTEL_PCLMUL,"intel-pclmul" }, { HWF_INTEL_AESNI, "intel-aesni" }, { HWF_INTEL_RDRAND,"intel-rdrand" }, diff --git a/src/hwf-x86.c b/src/hwf-x86.c index 784fe2a..ab6dacd 100644 --- a/src/hwf-x86.c +++ b/src/hwf-x86.c @@ -206,6 +206,9 @@ detect_x86_gnuc (void) if (features & 0x00000002) result |= HWF_INTEL_PCLMUL; #endif + /* Test bit 9 for SSSE3. */ + if (features & 0x00000200) + result |= HWF_INTEL_SSSE3; #ifdef ENABLE_AESNI_SUPPORT /* Test bit 25 for AES-NI. */ if (features & 0x02000000) From wk at gnupg.org Thu Dec 12 14:08:46 2013 From: wk at gnupg.org (Werner Koch) Date: Thu, 12 Dec 2013 14:08:46 +0100 Subject: [PATCH] SHA-256: Add SSSE3 implementation for x86-64 In-Reply-To: <20131211194914.944.7490.stgit@localhost6.localdomain6> (Jussi Kivilinna's message of "Wed, 11 Dec 2013 21:49:14 +0200") References: <20131211194914.944.7490.stgit@localhost6.localdomain6> Message-ID: <87r49iqkrl.fsf@vigenere.g10code.de> On Wed, 11 Dec 2013 20:49, jussi.kivilinna at iki.fi said: > Intel i5-4570 13.99 c/B 10.66 c/B 1.31x > Intel i5-2450M 21.53 c/B 15.79 c/B 1.36x > Intel Core2 T8100 20.84 c/B 15.07 c/B 1.38x That sounds useful enough to put it into the 1.6 release. I would like to release early next week, though. I expect that we will get a few build problem reports anyway, so better put this into 1.6 than to wait for 1.7. 
Salam-Shalom, Werner -- Die Gedanken sind frei. Ausnahmen regelt ein Bundesgesetz. From jussi.kivilinna at iki.fi Thu Dec 12 14:59:58 2013 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Thu, 12 Dec 2013 15:59:58 +0200 Subject: [PATCH 1/2] SHA-512: Add SSSE3 implementation for x86-64 Message-ID: <20131212135958.15095.22118.stgit@localhost6.localdomain6> * cipher/Makefile.am: Add 'sha512-ssse3-amd64.S'. * cipher/sha512-ssse3-amd64.S: New. * cipher/sha512.c (USE_SSSE3): New. (SHA512_CONTEXT) [USE_SSSE3]: Add 'use_ssse3'. (sha512_init, sha384_init) [USE_SSSE3]: Initialize 'use_ssse3'. [USE_SSSE3] (_gcry_sha512_transform_amd64_ssse3): New. (transform) [USE_SSSE3]: Call SSSE3 implementation. * configure.ac (sha512): Add 'sha512-ssse3-amd64.lo'. -- Patch adds fast SSSE3 implementation of SHA-512 by Intel Corporation. The assembly source is licensed under 3-clause BSD license, thus compatible with LGPL2.1+. Original source can be accessed at: http://www.intel.com/p/en_US/embedded/hwsw/technology/packet-processing#docs Implementation is described in white paper "Fast SHA512 Implementations on Intel® 
Architecture Processors" http://www.intel.com/content/www/us/en/intelligent-systems/intel-technology/fast-sha512-implementations-ia-processors-paper.html Benchmarks: cpu Old New Diff Intel i5-4570 10.11 c/B 7.56 c/B 1.33x Intel i5-2450M 14.11 c/B 10.53 c/B 1.33x Intel Core2 T8100 11.92 c/B 10.22 c/B 1.16x Signed-off-by: Jussi Kivilinna --- cipher/Makefile.am | 2 cipher/sha512-ssse3-amd64.S | 414 +++++++++++++++++++++++++++++++++++++++++++ cipher/sha512.c | 34 +++- configure.ac | 7 + 4 files changed, 454 insertions(+), 3 deletions(-) create mode 100644 cipher/sha512-ssse3-amd64.S diff --git a/cipher/Makefile.am b/cipher/Makefile.am index 34f74e2..88c288a 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -79,7 +79,7 @@ seed.c \ serpent.c serpent-sse2-amd64.S serpent-avx2-amd64.S \ sha1.c \ sha256.c sha256-ssse3-amd64.S \ -sha512.c sha512-armv7-neon.S \ +sha512.c sha512-ssse3-amd64.S sha512-armv7-neon.S \ stribog.c \ tiger.c \ whirlpool.c \ diff --git a/cipher/sha512-ssse3-amd64.S b/cipher/sha512-ssse3-amd64.S new file mode 100644 index 0000000..2a719e5 --- /dev/null +++ b/cipher/sha512-ssse3-amd64.S @@ -0,0 +1,414 @@ +/* +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright (c) 2012, Intel Corporation +; +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are +; met: +; +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in the +; documentation and/or other materials provided with the +; distribution. 
+; +; * Neither the name of the Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived from +; this software without specific prior written permission. +; +; +; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY +; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR +; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +*/ +/* + * Conversion to GAS assembly and integration to libgcrypt + * by Jussi Kivilinna + * + * Note: original implementation was named as SHA512-SSE4. However, only SSSE3 + * is required. 
+ */ + +#ifdef __x86_64 +#include +#if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \ + defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_SSSE3) && defined(USE_SHA512) + +#ifdef __PIC__ +# define ADD_RIP +rip +#else +# define ADD_RIP +#endif + +.intel_syntax noprefix + +.text + +/* Virtual Registers */ +msg = rdi /* ARG1 */ +digest = rsi /* ARG2 */ +msglen = rdx /* ARG3 */ +T1 = rcx +T2 = r8 +a_64 = r9 +b_64 = r10 +c_64 = r11 +d_64 = r12 +e_64 = r13 +f_64 = r14 +g_64 = r15 +h_64 = rbx +tmp0 = rax + +/* +; Local variables (stack frame) +; Note: frame_size must be an odd multiple of 8 bytes to XMM align RSP +*/ +frame_W = 0 /* Message Schedule */ +frame_W_size = (80 * 8) +frame_WK = ((frame_W) + (frame_W_size)) /* W[t] + K[t] | W[t+1] + K[t+1] */ +frame_WK_size = (2 * 8) +frame_GPRSAVE = ((frame_WK) + (frame_WK_size)) +frame_GPRSAVE_size = (5 * 8) +frame_size = ((frame_GPRSAVE) + (frame_GPRSAVE_size)) + + +/* Useful QWORD "arrays" for simpler memory references */ +#define MSG(i) msg + 8*(i) /* Input message (arg1) */ +#define DIGEST(i) digest + 8*(i) /* Output Digest (arg2) */ +#define K_t(i) .LK512 + 8*(i) ADD_RIP /* SHA Constants (static mem) */ +#define W_t(i) rsp + frame_W + 8*(i) /* Message Schedule (stack frame) */ +#define WK_2(i) rsp + frame_WK + 8*((i) % 2) /* W[t]+K[t] (stack frame) */ +/* MSG, DIGEST, K_t, W_t are arrays */ +/* WK_2(t) points to 1 of 2 qwords at frame.WK depdending on t being odd/even */ + +.macro RotateState + /* Rotate symbles a..h right */ + __TMP = h_64 + h_64 = g_64 + g_64 = f_64 + f_64 = e_64 + e_64 = d_64 + d_64 = c_64 + c_64 = b_64 + b_64 = a_64 + a_64 = __TMP +.endm + +.macro SHA512_Round t + /* Compute Round %%t */ + mov T1, f_64 /* T1 = f */ + mov tmp0, e_64 /* tmp = e */ + xor T1, g_64 /* T1 = f ^ g */ + ror tmp0, 23 /* 41 ; tmp = e ror 23 */ + and T1, e_64 /* T1 = (f ^ g) & e */ + xor tmp0, e_64 /* tmp = (e ror 23) ^ e */ + xor T1, g_64 /* T1 = ((f ^ g) & e) ^ g = CH(e,f,g) */ + add T1, [WK_2(\t)] 
/* W[t] + K[t] from message scheduler */ + ror tmp0, 4 /* 18 ; tmp = ((e ror 23) ^ e) ror 4 */ + xor tmp0, e_64 /* tmp = (((e ror 23) ^ e) ror 4) ^ e */ + mov T2, a_64 /* T2 = a */ + add T1, h_64 /* T1 = CH(e,f,g) + W[t] + K[t] + h */ + ror tmp0, 14 /* 14 ; tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e) */ + add T1, tmp0 /* T1 = CH(e,f,g) + W[t] + K[t] + h + S1(e) */ + mov tmp0, a_64 /* tmp = a */ + xor T2, c_64 /* T2 = a ^ c */ + and tmp0, c_64 /* tmp = a & c */ + and T2, b_64 /* T2 = (a ^ c) & b */ + xor T2, tmp0 /* T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c) */ + mov tmp0, a_64 /* tmp = a */ + ror tmp0, 5 /* 39 ; tmp = a ror 5 */ + xor tmp0, a_64 /* tmp = (a ror 5) ^ a */ + add d_64, T1 /* e(next_state) = d + T1 */ + ror tmp0, 6 /* 34 ; tmp = ((a ror 5) ^ a) ror 6 */ + xor tmp0, a_64 /* tmp = (((a ror 5) ^ a) ror 6) ^ a */ + lea h_64, [T1 + T2] /* a(next_state) = T1 + Maj(a,b,c) */ + ror tmp0, 28 /* 28 ; tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a) */ + add h_64, tmp0 /* a(next_state) = T1 + Maj(a,b,c) + S0(a) */ + RotateState +.endm + +.macro SHA512_2Sched_2Round_sse t +/* ; Compute rounds %%t-2 and %%t-1 + ; Compute message schedule QWORDS %%t and %%t+1 + + ; Two rounds are computed based on the values for K[t-2]+W[t-2] and + ; K[t-1]+W[t-1] which were previously stored at WK_2 by the message + ; scheduler. + ; The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)]. + ; They are then added to their respective SHA512 constants at + ; [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)] + ; For brevity, the comments following vectored instructions only refer to + ; the first of a pair of QWORDS. + ; Eg. XMM2=W[t-2] really means XMM2={W[t-2]|W[t-1]} + ; The computation of the message schedule and the rounds are tightly + ; stitched to take advantage of instruction-level parallelism. + ; For clarity, integer instructions (for the rounds calculation) are indented + ; by one tab. 
Vectored instructions (for the message scheduler) are indented + ; by two tabs. */ + + mov T1, f_64 + movdqa xmm2, [W_t(\t-2)] /* XMM2 = W[t-2] */ + xor T1, g_64 + and T1, e_64 + movdqa xmm0, xmm2 /* XMM0 = W[t-2] */ + xor T1, g_64 + add T1, [WK_2(\t)] + movdqu xmm5, [W_t(\t-15)] /* XMM5 = W[t-15] */ + mov tmp0, e_64 + ror tmp0, 23 /* 41 */ + movdqa xmm3, xmm5 /* XMM3 = W[t-15] */ + xor tmp0, e_64 + ror tmp0, 4 /* 18 */ + psrlq xmm0, 61 - 19 /* XMM0 = W[t-2] >> 42 */ + xor tmp0, e_64 + ror tmp0, 14 /* 14 */ + psrlq xmm3, (8 - 7) /* XMM3 = W[t-15] >> 1 */ + add T1, tmp0 + add T1, h_64 + pxor xmm0, xmm2 /* XMM0 = (W[t-2] >> 42) ^ W[t-2] */ + mov T2, a_64 + xor T2, c_64 + pxor xmm3, xmm5 /* XMM3 = (W[t-15] >> 1) ^ W[t-15] */ + and T2, b_64 + mov tmp0, a_64 + psrlq xmm0, 19 - 6 /* XMM0 = ((W[t-2]>>42)^W[t-2])>>13 */ + and tmp0, c_64 + xor T2, tmp0 + psrlq xmm3, (7 - 1) /* XMM3 = ((W[t-15]>>1)^W[t-15])>>6 */ + mov tmp0, a_64 + ror tmp0, 5 /* 39 */ + pxor xmm0, xmm2 /* XMM0 = (((W[t-2]>>42)^W[t-2])>>13)^W[t-2] */ + xor tmp0, a_64 + ror tmp0, 6 /* 34 */ + pxor xmm3, xmm5 /* XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15] */ + xor tmp0, a_64 + ror tmp0, 28 /* 28 */ + psrlq xmm0, 6 /* XMM0 = ((((W[t-2]>>42)^W[t-2])>>13)^W[t-2])>>6 */ + add T2, tmp0 + add d_64, T1 + psrlq xmm3, 1 /* XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]>>1 */ + lea h_64, [T1 + T2] + RotateState + movdqa xmm1, xmm2 /* XMM1 = W[t-2] */ + mov T1, f_64 + xor T1, g_64 + movdqa xmm4, xmm5 /* XMM4 = W[t-15] */ + and T1, e_64 + xor T1, g_64 + psllq xmm1, (64 - 19) - (64 - 61) /* XMM1 = W[t-2] << 42 */ + add T1, [WK_2(\t+1)] + mov tmp0, e_64 + psllq xmm4, (64 - 1) - (64 - 8) /* XMM4 = W[t-15] << 7 */ + ror tmp0, 23 /* 41 */ + xor tmp0, e_64 + pxor xmm1, xmm2 /* XMM1 = (W[t-2] << 42)^W[t-2] */ + ror tmp0, 4 /* 18 */ + xor tmp0, e_64 + pxor xmm4, xmm5 /* XMM4 = (W[t-15]<<7)^W[t-15] */ + ror tmp0, 14 /* 14 */ + add T1, tmp0 + psllq xmm1, (64 - 61) /* XMM1 = ((W[t-2] << 42)^W[t-2])<<3 */ + add T1, h_64 + mov T2, a_64 + 
psllq xmm4, (64 - 8) /* XMM4 = ((W[t-15]<<7)^W[t-15])<<56 */ + xor T2, c_64 + and T2, b_64 + pxor xmm0, xmm1 /* XMM0 = s1(W[t-2]) */ + mov tmp0, a_64 + and tmp0, c_64 + movdqu xmm1, [W_t(\t- 7)] /* XMM1 = W[t-7] */ + xor T2, tmp0 + pxor xmm3, xmm4 /* XMM3 = s0(W[t-15]) */ + mov tmp0, a_64 + paddq xmm0, xmm3 /* XMM0 = s1(W[t-2]) + s0(W[t-15]) */ + ror tmp0, 5 /* 39 */ + paddq xmm0, [W_t(\t-16)] /* XMM0 = s1(W[t-2]) + s0(W[t-15]) + W[t-16] */ + xor tmp0, a_64 + paddq xmm0, xmm1 /* XMM0 = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16] */ + ror tmp0, 6 /* 34 */ + movdqa [W_t(\t)], xmm0 /* Store scheduled qwords */ + xor tmp0, a_64 + paddq xmm0, [K_t(t)] /* Compute W[t]+K[t] */ + ror tmp0, 28 /* 28 */ + movdqa [WK_2(t)], xmm0 /* Store W[t]+K[t] for next rounds */ + add T2, tmp0 + add d_64, T1 + lea h_64, [T1 + T2] + RotateState +.endm + +/* +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; void sha512_sse4(const void* M, void* D, uint64_t L); +; Purpose: Updates the SHA512 digest stored at D with the message stored in M. +; The size of the message pointed to by M must be an integer multiple of SHA512 +; message blocks. +; L is the message length in SHA512 blocks. 
+*/ +.globl _gcry_sha512_transform_amd64_ssse3 +.type _gcry_sha512_transform_amd64_ssse3, at function; +.align 16 +_gcry_sha512_transform_amd64_ssse3: + xor eax, eax + + cmp msglen, 0 + je .Lnowork + + /* Allocate Stack Space */ + sub rsp, frame_size + + /* Save GPRs */ + mov [rsp + frame_GPRSAVE + 8 * 0], rbx + mov [rsp + frame_GPRSAVE + 8 * 1], r12 + mov [rsp + frame_GPRSAVE + 8 * 2], r13 + mov [rsp + frame_GPRSAVE + 8 * 3], r14 + mov [rsp + frame_GPRSAVE + 8 * 4], r15 + +.Lupdateblock: + + /* Load state variables */ + mov a_64, [DIGEST(0)] + mov b_64, [DIGEST(1)] + mov c_64, [DIGEST(2)] + mov d_64, [DIGEST(3)] + mov e_64, [DIGEST(4)] + mov f_64, [DIGEST(5)] + mov g_64, [DIGEST(6)] + mov h_64, [DIGEST(7)] + + t = 0 + .rept 80/2 + 1 + /* (80 rounds) / (2 rounds/iteration) + (1 iteration) */ + /* +1 iteration because the scheduler leads hashing by 1 iteration */ + .if t < 2 + /* BSWAP 2 QWORDS */ + movdqa xmm1, [.LXMM_QWORD_BSWAP ADD_RIP] + movdqu xmm0, [MSG(t)] + pshufb xmm0, xmm1 /* BSWAP */ + movdqa [W_t(t)], xmm0 /* Store Scheduled Pair */ + paddq xmm0, [K_t(t)] /* Compute W[t]+K[t] */ + movdqa [WK_2(t)], xmm0 /* Store into WK for rounds */ + .elseif t < 16 + /* BSWAP 2 QWORDS; Compute 2 Rounds */ + movdqu xmm0, [MSG(t)] + pshufb xmm0, xmm1 /* BSWAP */ + SHA512_Round (t - 2) /* Round t-2 */ + movdqa [W_t(t)], xmm0 /* Store Scheduled Pair */ + paddq xmm0, [K_t(t)] /* Compute W[t]+K[t] */ + SHA512_Round (t - 1) /* Round t-1 */ + movdqa [WK_2(t)], xmm0 /* Store W[t]+K[t] into WK */ + .elseif t < 79 + /* Schedule 2 QWORDS; Compute 2 Rounds */ + SHA512_2Sched_2Round_sse t + .else + /* Compute 2 Rounds */ + SHA512_Round (t - 2) + SHA512_Round (t - 1) + .endif + t = (t)+2 + .endr + + /* Update digest */ + add [DIGEST(0)], a_64 + add [DIGEST(1)], b_64 + add [DIGEST(2)], c_64 + add [DIGEST(3)], d_64 + add [DIGEST(4)], e_64 + add [DIGEST(5)], f_64 + add [DIGEST(6)], g_64 + add [DIGEST(7)], h_64 + + /* Advance to next message block */ + add msg, 16*8 + dec msglen + jnz 
.Lupdateblock + + /* Restore GPRs */ + mov rbx, [rsp + frame_GPRSAVE + 8 * 0] + mov r12, [rsp + frame_GPRSAVE + 8 * 1] + mov r13, [rsp + frame_GPRSAVE + 8 * 2] + mov r14, [rsp + frame_GPRSAVE + 8 * 3] + mov r15, [rsp + frame_GPRSAVE + 8 * 4] + + /* Restore Stack Pointer */ + add rsp, frame_size + + /* Return stack burn depth */ + mov rax, frame_size + +.Lnowork: + ret + +/* +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Binary Data +*/ + +.data + +.align 16 + +/* Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. */ +.LXMM_QWORD_BSWAP: + .octa 0x08090a0b0c0d0e0f0001020304050607 + +/* K[t] used in SHA512 hashing */ +.LK512: + .quad 0x428a2f98d728ae22,0x7137449123ef65cd + .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + .quad 0x3956c25bf348b538,0x59f111f1b605d019 + .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + .quad 0xd807aa98a3030242,0x12835b0145706fbe + .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + .quad 0x9bdc06a725c71235,0xc19bf174cf692694 + .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + .quad 0x983e5152ee66dfab,0xa831c66d2db43210 + .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 + .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 + .quad 0x06ca6351e003826f,0x142929670a0e6e70 + .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 + .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 + .quad 0x81c2c92e47edaee6,0x92722c851482353b + .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 + .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 + .quad 0xd192e819d6ef5218,0xd69906245565a910 + .quad 0xf40e35855771202a,0x106aa07032bbd1b8 + .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + .quad 
0x748f82ee5defb2fc,0x78a5636f43172f60 + .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec + .quad 0x90befffa23631e28,0xa4506cebde82bde9 + .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b + .quad 0xca273eceea26619c,0xd186b8c721c0c207 + .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 + .quad 0x113f9804bef90dae,0x1b710b35131c471b + .quad 0x28db77f523047d84,0x32caab7b40c72493 + .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 + +#endif +#endif diff --git a/cipher/sha512.c b/cipher/sha512.c index 14608dc..34b3215 100644 --- a/cipher/sha512.c +++ b/cipher/sha512.c @@ -64,6 +64,16 @@ # endif #endif + +/* USE_SSSE3 indicates whether to compile with Intel SSSE3 code. */ +#undef USE_SSSE3 +#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_SSSE3) && \ + defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) +# define USE_SSSE3 1 +#endif + + typedef struct { u64 h0, h1, h2, h3, h4, h5, h6, h7; @@ -74,7 +84,10 @@ typedef struct gcry_md_block_ctx_t bctx; SHA512_STATE state; #ifdef USE_ARM_NEON_ASM - int use_neon; + unsigned int use_neon:1; +#endif +#ifdef USE_SSSE3 + unsigned int use_ssse3:1; #endif } SHA512_CONTEXT; @@ -105,6 +118,9 @@ sha512_init (void *context) #ifdef USE_ARM_NEON_ASM ctx->use_neon = (_gcry_get_hw_features () & HWF_ARM_NEON) != 0; #endif +#ifdef USE_SSSE3 + ctx->use_ssse3 = (_gcry_get_hw_features () & HWF_INTEL_SSSE3) != 0; +#endif } static void @@ -131,6 +147,9 @@ sha384_init (void *context) #ifdef USE_ARM_NEON_ASM ctx->use_neon = (_gcry_get_hw_features () & HWF_ARM_NEON) != 0; #endif +#ifdef USE_SSSE3 + ctx->use_ssse3 = (_gcry_get_hw_features () & HWF_INTEL_SSSE3) != 0; +#endif } @@ -483,16 +502,27 @@ void _gcry_sha512_transform_armv7_neon (SHA512_STATE *hd, const u64 k[]); #endif +#ifdef USE_SSSE3 +unsigned int _gcry_sha512_transform_amd64_ssse3(const void *input_data, + void *state, size_t 
num_blks); +#endif + static unsigned int transform (void *context, const unsigned char *data) { SHA512_CONTEXT *ctx = context; +#ifdef USE_SSSE3 + if (ctx->use_ssse3) + return _gcry_sha512_transform_amd64_ssse3 (data, &ctx->state, 1) + + 4 * sizeof(void*); +#endif + #ifdef USE_ARM_NEON_ASM if (ctx->use_neon) { - _gcry_sha512_transform_armv7_neon(&ctx->state, data, k); + _gcry_sha512_transform_armv7_neon (&ctx->state, data, k); /* _gcry_sha512_transform_armv7_neon does not store sensitive data * to stack. */ diff --git a/configure.ac b/configure.ac index eb0dd29..b930f4e 100644 --- a/configure.ac +++ b/configure.ac @@ -1728,6 +1728,13 @@ if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha512.lo" AC_DEFINE(USE_SHA512, 1, [Defined if this module should be included]) + case "${host}" in + x86_64-*-*) + # Build with the assembly implementation + GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha512-ssse3-amd64.lo" + ;; + esac + if test x"$neonsupport" = xyes ; then # Build with the NEON implementation GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha512-armv7-neon.lo" From jussi.kivilinna at iki.fi Thu Dec 12 15:09:15 2013 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Thu, 12 Dec 2013 16:09:15 +0200 Subject: [PATCH] SHA-256: Add SSSE3 implementation for x86-64 In-Reply-To: <87r49iqkrl.fsf@vigenere.g10code.de> References: <20131211194914.944.7490.stgit@localhost6.localdomain6> <87r49iqkrl.fsf@vigenere.g10code.de> Message-ID: <52A9C38B.108@iki.fi> On 12.12.2013 15:08, Werner Koch wrote: > On Wed, 11 Dec 2013 20:49, jussi.kivilinna at iki.fi said: > >> Intel i5-4570 13.99 c/B 10.66 c/B 1.31x >> Intel i5-2450M 21.53 c/B 15.79 c/B 1.36x >> Intel Core2 T8100 20.84 c/B 15.07 c/B 1.38x > > That sounds useful enough to put it into the 1.6 release. I would like > to release early next week, though. I expect that we will get a few > build problem reports anyway, so better put this into 1.6 that to wait > for 1.7. Ok, and I assume that SHA-512 SSSE3/AVX/AVX2 patches are useful too. 
I'll push SHA-256/512 patches to repository tomorrow. -Jussi > > > Salam-Shalom, > > Werner > From jussi.kivilinna at iki.fi Thu Dec 12 15:00:03 2013 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Thu, 12 Dec 2013 16:00:03 +0200 Subject: [PATCH 2/2] SHA-512: Add AVX and AVX2 implementations for x86-64 In-Reply-To: <20131212135958.15095.22118.stgit@localhost6.localdomain6> References: <20131212135958.15095.22118.stgit@localhost6.localdomain6> Message-ID: <20131212140003.15095.73005.stgit@localhost6.localdomain6> * cipher/Makefile.am: Add 'sha512-avx-amd64.S' and 'sha512-avx2-bmi2-amd64.S'. * cipher/sha512-avx-amd64.S: New. * cipher/sha512-avx2-bmi2-amd64.S: New. * cipher/sha512.c (USE_AVX, USE_AVX2): New. (SHA512_CONTEXT) [USE_AVX]: Add 'use_avx'. (SHA512_CONTEXT) [USE_AVX2]: Add 'use_avx2'. (sha512_init, sha384_init) [USE_AVX]: Initialize 'use_avx'. (sha512_init, sha384_init) [USE_AVX2]: Initialize 'use_avx2'. [USE_AVX] (_gcry_sha512_transform_amd64_avx): New. [USE_AVX2] (_gcry_sha512_transform_amd64_avx2): New. (transform) [USE_AVX2]: Add call for AVX2 implementation. (transform) [USE_AVX]: Add call for AVX implementation. * configure.ac (HAVE_GCC_INLINE_ASM_BMI2): New check. (sha512): Add 'sha512-avx-amd64.lo' and 'sha512-avx2-bmi2-amd64.lo'. * src/g10lib.h (HWF_INTEL_CPU, HWF_INTEL_BMI2): New. * src/global.c (hwflist): Add "intel-cpu" and "intel-bmi2". * src/hwf-x86.c (detect_x86_gnuc): Check for HWF_INTEL_CPU and HWF_INTEL_BMI2. -- Patch adds fast AVX and AVX2 implementation of SHA-512 by Intel Corporation. The assembly source is licensed under 3-clause BSD license, thus compatible with LGPL2.1+. Original source can be accessed at: http://www.intel.com/p/en_US/embedded/hwsw/technology/packet-processing#docs Implementation is described in white paper "Fast SHA512 Implementations on Intel® 
Architecture Processors" http://www.intel.com/content/www/us/en/intelligent-systems/intel-technology/fast-sha512-implementat$ Note: AVX implementation uses SHLD instruction to emulate RORQ, since it's faster on Intel Sandy-Bridge. However, on non-Intel CPUs SHLD is much slower than RORQ, so therefore AVX implementation is (for now) limited to Intel CPUs. Note: AVX2 implementation also uses BMI2 instruction rorx, thus additional HWF flag. Benchmarks: cpu Old SSSE3 AVX/AVX2 Old vs AVX/AVX2 vs SSSE3 Intel i5-4570 10.11 c/B 7.56 c/B 6.72 c/B 1.50x 1.12x Intel i5-2450M 14.11 c/B 10.53 c/B 8.88 c/B 1.58x 1.18x Signed-off-by: Jussi Kivilinna --- cipher/Makefile.am | 2 cipher/sha512-avx-amd64.S | 412 +++++++++++++++++++++ cipher/sha512-avx2-bmi2-amd64.S | 783 +++++++++++++++++++++++++++++++++++++++ cipher/sha512.c | 72 +++- configure.ac | 19 + src/g10lib.h | 2 src/global.c | 2 src/hwf-x86.c | 7 8 files changed, 1293 insertions(+), 6 deletions(-) create mode 100644 cipher/sha512-avx-amd64.S create mode 100644 cipher/sha512-avx2-bmi2-amd64.S diff --git a/cipher/Makefile.am b/cipher/Makefile.am index 88c288a..7c85af2 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -78,7 +78,7 @@ scrypt.c \ seed.c \ serpent.c serpent-sse2-amd64.S serpent-avx2-amd64.S \ sha1.c \ -sha256.c sha256-ssse3-amd64.S \ +sha256.c sha256-ssse3-amd64.S sha256-avx-amd64.S sha256-avx2-bmi2-amd64.S \ sha512.c sha512-ssse3-amd64.S sha512-armv7-neon.S \ stribog.c \ tiger.c \ diff --git a/cipher/sha512-avx-amd64.S b/cipher/sha512-avx-amd64.S new file mode 100644 index 0000000..01c1daa --- /dev/null +++ b/cipher/sha512-avx-amd64.S @@ -0,0 +1,412 @@ +/* +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright (c) 2012, Intel Corporation +; +; All rights reserved. 
+; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are +; met: +; +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in the +; documentation and/or other materials provided with the +; distribution. +; +; * Neither the name of the Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived from +; this software without specific prior written permission. +; +; +; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY +; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR +; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +*/ +/* + * Conversion to GAS assembly and integration to libgcrypt + * by Jussi Kivilinna + */ + +#ifdef __x86_64 +#include +#if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \ + defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_AVX) && defined(USE_SHA512) + +#ifdef __PIC__ +# define ADD_RIP +rip +#else +# define ADD_RIP +#endif + +.intel_syntax noprefix + +.text + +/* Virtual Registers */ +msg = rdi /* ARG1 */ +digest = rsi /* ARG2 */ +msglen = rdx /* ARG3 */ +T1 = rcx +T2 = r8 +a_64 = r9 +b_64 = r10 +c_64 = r11 +d_64 = r12 +e_64 = r13 +f_64 = r14 +g_64 = r15 +h_64 = rbx +tmp0 = rax + +/* +; Local variables (stack frame) +; Note: frame_size must be an odd multiple of 8 bytes to XMM align RSP +*/ +frame_W = 0 /* Message Schedule */ +frame_W_size = (80 * 8) +frame_WK = ((frame_W) + (frame_W_size)) /* W[t] + K[t] | W[t+1] + K[t+1] */ +frame_WK_size = (2 * 8) +frame_GPRSAVE = ((frame_WK) + (frame_WK_size)) +frame_GPRSAVE_size = (5 * 8) +frame_size = ((frame_GPRSAVE) + (frame_GPRSAVE_size)) + + +/* Useful QWORD "arrays" for simpler memory references */ +#define MSG(i) msg + 8*(i) /* Input message (arg1) */ +#define DIGEST(i) digest + 8*(i) /* Output Digest (arg2) */ +#define K_t(i) .LK512 + 8*(i) ADD_RIP /* SHA Constants (static mem) */ +#define W_t(i) rsp + frame_W + 8*(i) /* Message Schedule (stack frame) */ +#define WK_2(i) rsp + frame_WK + 8*((i) % 2) /* W[t]+K[t] (stack frame) */ +/* MSG, DIGEST, K_t, W_t are arrays */ +/* WK_2(t) points to 1 of 2 qwords at frame.WK depdending on t being odd/even */ + +.macro RotateState + /* Rotate symbles a..h right */ + __TMP = h_64 + h_64 = g_64 + g_64 = f_64 + f_64 = e_64 + e_64 = d_64 + d_64 = c_64 + c_64 = b_64 + b_64 = a_64 + a_64 = __TMP +.endm + +.macro RORQ p1 p2 + /* shld is faster than ror on Intel Sandybridge */ + shld \p1, \p1, (64 - \p2) +.endm + +.macro SHA512_Round t + /* Compute Round %%t */ + mov T1, f_64 /* 
T1 = f */ + mov tmp0, e_64 /* tmp = e */ + xor T1, g_64 /* T1 = f ^ g */ + RORQ tmp0, 23 /* 41 ; tmp = e ror 23 */ + and T1, e_64 /* T1 = (f ^ g) & e */ + xor tmp0, e_64 /* tmp = (e ror 23) ^ e */ + xor T1, g_64 /* T1 = ((f ^ g) & e) ^ g = CH(e,f,g) */ + add T1, [WK_2(\t)] /* W[t] + K[t] from message scheduler */ + RORQ tmp0, 4 /* 18 ; tmp = ((e ror 23) ^ e) ror 4 */ + xor tmp0, e_64 /* tmp = (((e ror 23) ^ e) ror 4) ^ e */ + mov T2, a_64 /* T2 = a */ + add T1, h_64 /* T1 = CH(e,f,g) + W[t] + K[t] + h */ + RORQ tmp0, 14 /* 14 ; tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e) */ + add T1, tmp0 /* T1 = CH(e,f,g) + W[t] + K[t] + h + S1(e) */ + mov tmp0, a_64 /* tmp = a */ + xor T2, c_64 /* T2 = a ^ c */ + and tmp0, c_64 /* tmp = a & c */ + and T2, b_64 /* T2 = (a ^ c) & b */ + xor T2, tmp0 /* T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c) */ + mov tmp0, a_64 /* tmp = a */ + RORQ tmp0, 5 /* 39 ; tmp = a ror 5 */ + xor tmp0, a_64 /* tmp = (a ror 5) ^ a */ + add d_64, T1 /* e(next_state) = d + T1 */ + RORQ tmp0, 6 /* 34 ; tmp = ((a ror 5) ^ a) ror 6 */ + xor tmp0, a_64 /* tmp = (((a ror 5) ^ a) ror 6) ^ a */ + lea h_64, [T1 + T2] /* a(next_state) = T1 + Maj(a,b,c) */ + RORQ tmp0, 28 /* 28 ; tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a) */ + add h_64, tmp0 /* a(next_state) = T1 + Maj(a,b,c) + S0(a) */ + RotateState +.endm + +.macro SHA512_2Sched_2Round_avx t +/* ; Compute rounds %%t-2 and %%t-1 + ; Compute message schedule QWORDS %%t and %%t+1 + + ; Two rounds are computed based on the values for K[t-2]+W[t-2] and + ; K[t-1]+W[t-1] which were previously stored at WK_2 by the message + ; scheduler. + ; The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)]. + ; They are then added to their respective SHA512 constants at + ; [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)] + ; For brevity, the comments following vectored instructions only refer to + ; the first of a pair of QWORDS. + ; Eg. 
XMM4=W[t-2] really means XMM4={W[t-2]|W[t-1]} + ; The computation of the message schedule and the rounds are tightly + ; stitched to take advantage of instruction-level parallelism. + ; For clarity, integer instructions (for the rounds calculation) are indented + ; by one tab. Vectored instructions (for the message scheduler) are indented + ; by two tabs. */ + + vmovdqa xmm4, [W_t(\t-2)] /* XMM4 = W[t-2] */ + vmovdqu xmm5, [W_t(\t-15)] /* XMM5 = W[t-15] */ + mov T1, f_64 + vpsrlq xmm0, xmm4, 61 /* XMM0 = W[t-2]>>61 */ + mov tmp0, e_64 + vpsrlq xmm6, xmm5, 1 /* XMM6 = W[t-15]>>1 */ + xor T1, g_64 + RORQ tmp0, 23 /* 41 */ + vpsrlq xmm1, xmm4, 19 /* XMM1 = W[t-2]>>19 */ + and T1, e_64 + xor tmp0, e_64 + vpxor xmm0, xmm0, xmm1 /* XMM0 = W[t-2]>>61 ^ W[t-2]>>19 */ + xor T1, g_64 + add T1, [WK_2(\t)]; + vpsrlq xmm7, xmm5, 8 /* XMM7 = W[t-15]>>8 */ + RORQ tmp0, 4 /* 18 */ + vpsrlq xmm2, xmm4, 6 /* XMM2 = W[t-2]>>6 */ + xor tmp0, e_64 + mov T2, a_64 + add T1, h_64 + vpxor xmm6, xmm6, xmm7 /* XMM6 = W[t-15]>>1 ^ W[t-15]>>8 */ + RORQ tmp0, 14 /* 14 */ + add T1, tmp0 + vpsrlq xmm8, xmm5, 7 /* XMM8 = W[t-15]>>7 */ + mov tmp0, a_64 + xor T2, c_64 + vpsllq xmm3, xmm4, (64-61) /* XMM3 = W[t-2]<<3 */ + and tmp0, c_64 + and T2, b_64 + vpxor xmm2, xmm2, xmm3 /* XMM2 = W[t-2]>>6 ^ W[t-2]<<3 */ + xor T2, tmp0 + mov tmp0, a_64 + vpsllq xmm9, xmm5, (64-1) /* XMM9 = W[t-15]<<63 */ + RORQ tmp0, 5 /* 39 */ + vpxor xmm8, xmm8, xmm9 /* XMM8 = W[t-15]>>7 ^ W[t-15]<<63 */ + xor tmp0, a_64 + add d_64, T1 + RORQ tmp0, 6 /* 34 */ + xor tmp0, a_64 + vpxor xmm6, xmm6, xmm8 /* XMM6 = W[t-15]>>1 ^ W[t-15]>>8 ^ W[t-15]>>7 ^ W[t-15]<<63 */ + lea h_64, [T1 + T2] + RORQ tmp0, 28 /* 28 */ + vpsllq xmm4, xmm4, (64-19) /* XMM4 = W[t-2]<<45 */ + add h_64, tmp0 + RotateState + vpxor xmm0, xmm0, xmm4 /* XMM0 = W[t-2]>>61 ^ W[t-2]>>19 ^ W[t-2]<<45 */ + mov T1, f_64 + vpxor xmm0, xmm0, xmm2 /* XMM0 = s1(W[t-2]) */ + mov tmp0, e_64 + xor T1, g_64 + vpaddq xmm0, xmm0, [W_t(\t-16)] /* XMM0 = s1(W[t-2]) + W[t-16] */ 
+ vmovdqu xmm1, [W_t(\t- 7)] /* XMM1 = W[t-7] */ + RORQ tmp0, 23 /* 41 */ + and T1, e_64 + xor tmp0, e_64 + xor T1, g_64 + vpsllq xmm5, xmm5, (64-8) /* XMM5 = W[t-15]<<56 */ + add T1, [WK_2(\t+1)] + vpxor xmm6, xmm6, xmm5 /* XMM6 = s0(W[t-15]) */ + RORQ tmp0, 4 /* 18 */ + vpaddq xmm0, xmm0, xmm6 /* XMM0 = s1(W[t-2]) + W[t-16] + s0(W[t-15]) */ + xor tmp0, e_64 + vpaddq xmm0, xmm0, xmm1 /* XMM0 = W[t] = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16] */ + mov T2, a_64 + add T1, h_64 + RORQ tmp0, 14 /* 14 */ + add T1, tmp0 + vmovdqa [W_t(\t)], xmm0 /* Store W[t] */ + vpaddq xmm0, xmm0, [K_t(t)] /* Compute W[t]+K[t] */ + vmovdqa [WK_2(t)], xmm0 /* Store W[t]+K[t] for next rounds */ + mov tmp0, a_64 + xor T2, c_64 + and tmp0, c_64 + and T2, b_64 + xor T2, tmp0 + mov tmp0, a_64 + RORQ tmp0, 5 /* 39 */ + xor tmp0, a_64 + add d_64, T1 + RORQ tmp0, 6 /* 34 */ + xor tmp0, a_64 + lea h_64, [T1 + T2] + RORQ tmp0, 28 /* 28 */ + add h_64, tmp0 + RotateState +.endm + +/* +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; void sha512_avx(const void* M, void* D, uint64_t L); +; Purpose: Updates the SHA512 digest stored at D with the message stored in M. +; The size of the message pointed to by M must be an integer multiple of SHA512 +; message blocks. 
+; L is the message length in SHA512 blocks +*/ +.globl _gcry_sha512_transform_amd64_avx +.type _gcry_sha512_transform_amd64_avx, at function; +.align 16 +_gcry_sha512_transform_amd64_avx: + xor eax, eax + + cmp msglen, 0 + je .Lnowork + + /* Allocate Stack Space */ + sub rsp, frame_size + + /* Save GPRs */ + mov [rsp + frame_GPRSAVE + 8 * 0], rbx + mov [rsp + frame_GPRSAVE + 8 * 1], r12 + mov [rsp + frame_GPRSAVE + 8 * 2], r13 + mov [rsp + frame_GPRSAVE + 8 * 3], r14 + mov [rsp + frame_GPRSAVE + 8 * 4], r15 + +.Lupdateblock: + + /* Load state variables */ + mov a_64, [DIGEST(0)] + mov b_64, [DIGEST(1)] + mov c_64, [DIGEST(2)] + mov d_64, [DIGEST(3)] + mov e_64, [DIGEST(4)] + mov f_64, [DIGEST(5)] + mov g_64, [DIGEST(6)] + mov h_64, [DIGEST(7)] + + t = 0 + .rept 80/2 + 1 + /* (80 rounds) / (2 rounds/iteration) + (1 iteration) */ + /* +1 iteration because the scheduler leads hashing by 1 iteration */ + .if t < 2 + /* BSWAP 2 QWORDS */ + vmovdqa xmm1, [.LXMM_QWORD_BSWAP ADD_RIP] + vmovdqu xmm0, [MSG(t)] + vpshufb xmm0, xmm0, xmm1 /* BSWAP */ + vmovdqa [W_t(t)], xmm0 /* Store Scheduled Pair */ + vpaddq xmm0, xmm0, [K_t(t)] /* Compute W[t]+K[t] */ + vmovdqa [WK_2(t)], xmm0 /* Store into WK for rounds */ + .elseif t < 16 + /* BSWAP 2 QWORDS, Compute 2 Rounds */ + vmovdqu xmm0, [MSG(t)] + vpshufb xmm0, xmm0, xmm1 /* BSWAP */ + SHA512_Round (t - 2) /* Round t-2 */ + vmovdqa [W_t(t)], xmm0 /* Store Scheduled Pair */ + vpaddq xmm0, xmm0, [K_t(t)] /* Compute W[t]+K[t] */ + SHA512_Round (t - 1) /* Round t-1 */ + vmovdqa [WK_2(t)], xmm0 /* W[t]+K[t] into WK */ + .elseif t < 79 + /* Schedule 2 QWORDS; Compute 2 Rounds */ + SHA512_2Sched_2Round_avx t + .else + /* Compute 2 Rounds */ + SHA512_Round (t - 2) + SHA512_Round (t - 1) + .endif + t = ((t)+2) + .endr + + /* Update digest */ + add [DIGEST(0)], a_64 + add [DIGEST(1)], b_64 + add [DIGEST(2)], c_64 + add [DIGEST(3)], d_64 + add [DIGEST(4)], e_64 + add [DIGEST(5)], f_64 + add [DIGEST(6)], g_64 + add [DIGEST(7)], h_64 + + /* 
Advance to next message block */ + add msg, 16*8 + dec msglen + jnz .Lupdateblock + + /* Restore GPRs */ + mov rbx, [rsp + frame_GPRSAVE + 8 * 0] + mov r12, [rsp + frame_GPRSAVE + 8 * 1] + mov r13, [rsp + frame_GPRSAVE + 8 * 2] + mov r14, [rsp + frame_GPRSAVE + 8 * 3] + mov r15, [rsp + frame_GPRSAVE + 8 * 4] + + /* Restore Stack Pointer */ + add rsp, frame_size + + /* Return stack burn depth */ + mov rax, frame_size + +.Lnowork: + ret + +/* +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Binary Data +*/ + +.data + +.align 16 + +/* Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. */ +.LXMM_QWORD_BSWAP: + .octa 0x08090a0b0c0d0e0f0001020304050607 + +/* K[t] used in SHA512 hashing */ +.LK512: + .quad 0x428a2f98d728ae22,0x7137449123ef65cd + .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + .quad 0x3956c25bf348b538,0x59f111f1b605d019 + .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + .quad 0xd807aa98a3030242,0x12835b0145706fbe + .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + .quad 0x9bdc06a725c71235,0xc19bf174cf692694 + .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + .quad 0x983e5152ee66dfab,0xa831c66d2db43210 + .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 + .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 + .quad 0x06ca6351e003826f,0x142929670a0e6e70 + .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 + .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 + .quad 0x81c2c92e47edaee6,0x92722c851482353b + .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 + .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 + .quad 0xd192e819d6ef5218,0xd69906245565a910 + .quad 0xf40e35855771202a,0x106aa07032bbd1b8 + .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb 
+ .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 + .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec + .quad 0x90befffa23631e28,0xa4506cebde82bde9 + .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b + .quad 0xca273eceea26619c,0xd186b8c721c0c207 + .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 + .quad 0x113f9804bef90dae,0x1b710b35131c471b + .quad 0x28db77f523047d84,0x32caab7b40c72493 + .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 + +#endif +#endif diff --git a/cipher/sha512-avx2-bmi2-amd64.S b/cipher/sha512-avx2-bmi2-amd64.S new file mode 100644 index 0000000..9573c8d --- /dev/null +++ b/cipher/sha512-avx2-bmi2-amd64.S @@ -0,0 +1,783 @@ +/* +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright (c) 2012, Intel Corporation +; +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are +; met: +; +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in the +; documentation and/or other materials provided with the +; distribution. +; +; * Neither the name of the Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived from +; this software without specific prior written permission. +; +; +; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY +; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +; PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL CORPORATION OR +; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; This code schedules 1 blocks at a time, with 4 lanes per block +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +*/ +/* + * Conversion to GAS assembly and integration to libgcrypt + * by Jussi Kivilinna + */ + +#ifdef __x86_64 +#include +#if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \ + defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_AVX2) && defined(HAVE_GCC_INLINE_ASM_BMI2) && \ + defined(USE_SHA512) + +#ifdef __PIC__ +# define ADD_RIP +rip +#else +# define ADD_RIP +#endif + +.intel_syntax noprefix + +.text + +/* Virtual Registers */ +Y_0 = ymm4 +Y_1 = ymm5 +Y_2 = ymm6 +Y_3 = ymm7 + +YTMP0 = ymm0 +YTMP1 = ymm1 +YTMP2 = ymm2 +YTMP3 = ymm3 +YTMP4 = ymm8 +XFER = YTMP0 + +BYTE_FLIP_MASK = ymm9 + +INP = rdi /* 1st arg */ +CTX = rsi /* 2nd arg */ +NUM_BLKS = rdx /* 3rd arg */ +c = rcx +d = r8 +e = rdx +y3 = rdi + +TBL = rbp + +a = rax +b = rbx + +f = r9 +g = r10 +h = r11 +old_h = r11 + +T1 = r12 +y0 = r13 +y1 = r14 +y2 = r15 + +y4 = r12 + +/* Local variables (stack frame) */ +#define frame_XFER 0 +#define frame_XFER_size (4*8) +#define frame_SRND (frame_XFER + frame_XFER_size) +#define frame_SRND_size (1*8) +#define frame_INP (frame_SRND + frame_SRND_size) +#define frame_INP_size (1*8) +#define frame_INPEND (frame_INP + frame_INP_size) +#define frame_INPEND_size (1*8) +#define frame_RSPSAVE (frame_INPEND + 
frame_INPEND_size) +#define frame_RSPSAVE_size (1*8) +#define frame_GPRSAVE (frame_RSPSAVE + frame_RSPSAVE_size) +#define frame_GPRSAVE_size (6*8) +#define frame_size (frame_GPRSAVE + frame_GPRSAVE_size) + +#define VMOVDQ vmovdqu /*; assume buffers not aligned */ + +/* addm [mem], reg */ +/* Add reg to mem using reg-mem add and store */ +.macro addm p1 p2 + add \p2, \p1 + mov \p1, \p2 +.endm + + +/* COPY_YMM_AND_BSWAP ymm, [mem], byte_flip_mask */ +/* Load ymm with mem and byte swap each dword */ +.macro COPY_YMM_AND_BSWAP p1 p2 p3 + VMOVDQ \p1, \p2 + vpshufb \p1, \p1, \p3 +.endm +/* rotate_Ys */ +/* Rotate values of symbols Y0...Y3 */ +.macro rotate_Ys + __Y_ = Y_0 + Y_0 = Y_1 + Y_1 = Y_2 + Y_2 = Y_3 + Y_3 = __Y_ +.endm + +/* RotateState */ +.macro RotateState + /* Rotate symbles a..h right */ + old_h = h + __TMP_ = h + h = g + g = f + f = e + e = d + d = c + c = b + b = a + a = __TMP_ +.endm + +/* %macro MY_VPALIGNR YDST, YSRC1, YSRC2, RVAL */ +/* YDST = {YSRC1, YSRC2} >> RVAL*8 */ +.macro MY_VPALIGNR YDST, YSRC1, YSRC2, RVAL + vperm2f128 \YDST, \YSRC1, \YSRC2, 0x3 /* YDST = {YS1_LO, YS2_HI} */ + vpalignr \YDST, \YDST, \YSRC2, \RVAL /* YDST = {YDS1, YS2} >> RVAL*8 */ +.endm + +.macro FOUR_ROUNDS_AND_SCHED +/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ + + /* Extract w[t-7] */ + MY_VPALIGNR YTMP0, Y_3, Y_2, 8 /* YTMP0 = W[-7] */ + /* Calculate w[t-16] + w[t-7] */ + vpaddq YTMP0, YTMP0, Y_0 /* YTMP0 = W[-7] + W[-16] */ + /* Extract w[t-15] */ + MY_VPALIGNR YTMP1, Y_1, Y_0, 8 /* YTMP1 = W[-15] */ + + /* Calculate sigma0 */ + + /* Calculate w[t-15] ror 1 */ + vpsrlq YTMP2, YTMP1, 1 + vpsllq YTMP3, YTMP1, (64-1) + vpor YTMP3, YTMP3, YTMP2 /* YTMP3 = W[-15] ror 1 */ + /* Calculate w[t-15] shr 7 */ + vpsrlq YTMP4, YTMP1, 7 /* YTMP4 = W[-15] >> 7 */ + + mov y3, a /* y3 = a ; MAJA */ + rorx y0, e, 41 /* y0 = e >> 41 ; S1A */ + rorx y1, e, 18 /* y1 = e >> 18 ; S1B */ + + add h, [rsp+frame_XFER+0*8] /* h = k + w + h ; -- */ + or 
y3, c /* y3 = a|c ; MAJA */ + mov y2, f /* y2 = f ; CH */ + rorx T1, a, 34 /* T1 = a >> 34 ; S0B */ + + xor y0, y1 /* y0 = (e>>41) ^ (e>>18) ; S1 */ + xor y2, g /* y2 = f^g ; CH */ + rorx y1, e, 14 /* y1 = (e >> 14) ; S1 */ + + and y2, e /* y2 = (f^g)&e ; CH */ + xor y0, y1 /* y0 = (e>>41) ^ (e>>18) ^ (e>>14) ; S1 */ + rorx y1, a, 39 /* y1 = a >> 39 ; S0A */ + add d, h /* d = k + w + h + d ; -- */ + + and y3, b /* y3 = (a|c)&b ; MAJA */ + xor y1, T1 /* y1 = (a>>39) ^ (a>>34) ; S0 */ + rorx T1, a, 28 /* T1 = (a >> 28) ; S0 */ + + xor y2, g /* y2 = CH = ((f^g)&e)^g ; CH */ + xor y1, T1 /* y1 = (a>>39) ^ (a>>34) ^ (a>>28) ; S0 */ + mov T1, a /* T1 = a ; MAJB */ + and T1, c /* T1 = a&c ; MAJB */ + + add y2, y0 /* y2 = S1 + CH ; -- */ + or y3, T1 /* y3 = MAJ = (a|c)&b)|(a&c) ; MAJ */ + add h, y1 /* h = k + w + h + S0 ; -- */ + + add d, y2 /* d = k + w + h + d + S1 + CH = d + t1 ; -- */ + + add h, y2 /* h = k + w + h + S0 + S1 + CH = t1 + S0 ; -- */ + add h, y3 /* h = t1 + S0 + MAJ ; -- */ + +RotateState + +/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ + +/*;;;;;;;;;;;;;;;;;;;;;;;;; */ + + /* Calculate w[t-15] ror 8 */ + vpsrlq YTMP2, YTMP1, 8 + vpsllq YTMP1, YTMP1, (64-8) + vpor YTMP1, YTMP1, YTMP2 /* YTMP1 = W[-15] ror 8 */ + /* XOR the three components */ + vpxor YTMP3, YTMP3, YTMP4 /* YTMP3 = W[-15] ror 1 ^ W[-15] >> 7 */ + vpxor YTMP1, YTMP3, YTMP1 /* YTMP1 = s0 */ + + + /* Add three components, w[t-16], w[t-7] and sigma0 */ + vpaddq YTMP0, YTMP0, YTMP1 /* YTMP0 = W[-16] + W[-7] + s0 */ + /* Move to appropriate lanes for calculating w[16] and w[17] */ + vperm2f128 Y_0, YTMP0, YTMP0, 0x0 /* Y_0 = W[-16] + W[-7] + s0 {BABA} */ + /* Move to appropriate lanes for calculating w[18] and w[19] */ + vpand YTMP0, YTMP0, [.LMASK_YMM_LO ADD_RIP] /* YTMP0 = W[-16] + W[-7] + s0 {DC00} */ + + /* Calculate w[16] and w[17] in both 128 bit lanes */ + + /* Calculate sigma1 for w[16] and w[17] on both 128 bit lanes */ + vperm2f128 YTMP2, 
Y_3, Y_3, 0x11 /* YTMP2 = W[-2] {BABA} */ + vpsrlq YTMP4, YTMP2, 6 /* YTMP4 = W[-2] >> 6 {BABA} */ + + + mov y3, a /* y3 = a ; MAJA */ + rorx y0, e, 41 /* y0 = e >> 41 ; S1A */ + rorx y1, e, 18 /* y1 = e >> 18 ; S1B */ + add h, [rsp+frame_XFER+1*8] /* h = k + w + h ; -- */ + or y3, c /* y3 = a|c ; MAJA */ + + + mov y2, f /* y2 = f ; CH */ + rorx T1, a, 34 /* T1 = a >> 34 ; S0B */ + xor y0, y1 /* y0 = (e>>41) ^ (e>>18) ; S1 */ + xor y2, g /* y2 = f^g ; CH */ + + + rorx y1, e, 14 /* y1 = (e >> 14) ; S1 */ + xor y0, y1 /* y0 = (e>>41) ^ (e>>18) ^ (e>>14) ; S1 */ + rorx y1, a, 39 /* y1 = a >> 39 ; S0A */ + and y2, e /* y2 = (f^g)&e ; CH */ + add d, h /* d = k + w + h + d ; -- */ + + and y3, b /* y3 = (a|c)&b ; MAJA */ + xor y1, T1 /* y1 = (a>>39) ^ (a>>34) ; S0 */ + + rorx T1, a, 28 /* T1 = (a >> 28) ; S0 */ + xor y2, g /* y2 = CH = ((f^g)&e)^g ; CH */ + + xor y1, T1 /* y1 = (a>>39) ^ (a>>34) ^ (a>>28) ; S0 */ + mov T1, a /* T1 = a ; MAJB */ + and T1, c /* T1 = a&c ; MAJB */ + add y2, y0 /* y2 = S1 + CH ; -- */ + + or y3, T1 /* y3 = MAJ = (a|c)&b)|(a&c) ; MAJ */ + add h, y1 /* h = k + w + h + S0 ; -- */ + + add d, y2 /* d = k + w + h + d + S1 + CH = d + t1 ; -- */ + add h, y2 /* h = k + w + h + S0 + S1 + CH = t1 + S0 ; -- */ + add h, y3 /* h = t1 + S0 + MAJ ; -- */ + +RotateState + + + + +/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ + +/*;;;;;;;;;;;;;;;;;;;;;;;;; */ + + + vpsrlq YTMP3, YTMP2, 19 /* YTMP3 = W[-2] >> 19 {BABA} */ + vpsllq YTMP1, YTMP2, (64-19) /* YTMP1 = W[-2] << 19 {BABA} */ + vpor YTMP3, YTMP3, YTMP1 /* YTMP3 = W[-2] ror 19 {BABA} */ + vpxor YTMP4, YTMP4, YTMP3 /* YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {BABA} */ + vpsrlq YTMP3, YTMP2, 61 /* YTMP3 = W[-2] >> 61 {BABA} */ + vpsllq YTMP1, YTMP2, (64-61) /* YTMP1 = W[-2] << 61 {BABA} */ + vpor YTMP3, YTMP3, YTMP1 /* YTMP3 = W[-2] ror 61 {BABA} */ + vpxor YTMP4, YTMP4, YTMP3 /* YTMP4 = s1 = (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6) {BABA} */ + + /* Add sigma1 
to the other compunents to get w[16] and w[17] */ + vpaddq Y_0, Y_0, YTMP4 /* Y_0 = {W[1], W[0], W[1], W[0]} */ + + /* Calculate sigma1 for w[18] and w[19] for upper 128 bit lane */ + vpsrlq YTMP4, Y_0, 6 /* YTMP4 = W[-2] >> 6 {DC--} */ + + mov y3, a /* y3 = a ; MAJA */ + rorx y0, e, 41 /* y0 = e >> 41 ; S1A */ + add h, [rsp+frame_XFER+2*8] /* h = k + w + h ; -- */ + + rorx y1, e, 18 /* y1 = e >> 18 ; S1B */ + or y3, c /* y3 = a|c ; MAJA */ + mov y2, f /* y2 = f ; CH */ + xor y2, g /* y2 = f^g ; CH */ + + rorx T1, a, 34 /* T1 = a >> 34 ; S0B */ + xor y0, y1 /* y0 = (e>>41) ^ (e>>18) ; S1 */ + and y2, e /* y2 = (f^g)&e ; CH */ + + rorx y1, e, 14 /* y1 = (e >> 14) ; S1 */ + add d, h /* d = k + w + h + d ; -- */ + and y3, b /* y3 = (a|c)&b ; MAJA */ + + xor y0, y1 /* y0 = (e>>41) ^ (e>>18) ^ (e>>14) ; S1 */ + rorx y1, a, 39 /* y1 = a >> 39 ; S0A */ + xor y2, g /* y2 = CH = ((f^g)&e)^g ; CH */ + + xor y1, T1 /* y1 = (a>>39) ^ (a>>34) ; S0 */ + rorx T1, a, 28 /* T1 = (a >> 28) ; S0 */ + + xor y1, T1 /* y1 = (a>>39) ^ (a>>34) ^ (a>>28) ; S0 */ + mov T1, a /* T1 = a ; MAJB */ + and T1, c /* T1 = a&c ; MAJB */ + add y2, y0 /* y2 = S1 + CH ; -- */ + + or y3, T1 /* y3 = MAJ = (a|c)&b)|(a&c) ; MAJ */ + add h, y1 /* h = k + w + h + S0 ; -- */ + add d, y2 /* d = k + w + h + d + S1 + CH = d + t1 ; -- */ + add h, y2 /* h = k + w + h + S0 + S1 + CH = t1 + S0 ; -- */ + + add h, y3 /* h = t1 + S0 + MAJ ; -- */ + +RotateState + +/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ + +/*;;;;;;;;;;;;;;;;;;;;;;;;; */ + + vpsrlq YTMP3, Y_0, 19 /* YTMP3 = W[-2] >> 19 {DC--} */ + vpsllq YTMP1, Y_0, (64-19) /* YTMP1 = W[-2] << 19 {DC--} */ + vpor YTMP3, YTMP3, YTMP1 /* YTMP3 = W[-2] ror 19 {DC--} */ + vpxor YTMP4, YTMP4, YTMP3 /* YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {DC--} */ + vpsrlq YTMP3, Y_0, 61 /* YTMP3 = W[-2] >> 61 {DC--} */ + vpsllq YTMP1, Y_0, (64-61) /* YTMP1 = W[-2] << 61 {DC--} */ + vpor YTMP3, YTMP3, YTMP1 /* YTMP3 = W[-2] ror 61 {DC--} */ + 
vpxor YTMP4, YTMP4, YTMP3 /* YTMP4 = s1 = (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6) {DC--} */ + + /* Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19] to newly calculated sigma1 to get w[18] and w[19] */ + vpaddq YTMP2, YTMP0, YTMP4 /* YTMP2 = {W[3], W[2], --, --} */ + + /* Form w[19, w[18], w17], w[16] */ + vpblendd Y_0, Y_0, YTMP2, 0xF0 /* Y_0 = {W[3], W[2], W[1], W[0]} */ +/* vperm2f128 Y_0, Y_0, YTMP2, 0x30 */ + + mov y3, a /* y3 = a ; MAJA */ + rorx y0, e, 41 /* y0 = e >> 41 ; S1A */ + rorx y1, e, 18 /* y1 = e >> 18 ; S1B */ + add h, [rsp+frame_XFER+3*8] /* h = k + w + h ; -- */ + or y3, c /* y3 = a|c ; MAJA */ + + + mov y2, f /* y2 = f ; CH */ + rorx T1, a, 34 /* T1 = a >> 34 ; S0B */ + xor y0, y1 /* y0 = (e>>41) ^ (e>>18) ; S1 */ + xor y2, g /* y2 = f^g ; CH */ + + + rorx y1, e, 14 /* y1 = (e >> 14) ; S1 */ + and y2, e /* y2 = (f^g)&e ; CH */ + add d, h /* d = k + w + h + d ; -- */ + and y3, b /* y3 = (a|c)&b ; MAJA */ + + xor y0, y1 /* y0 = (e>>41) ^ (e>>18) ^ (e>>14) ; S1 */ + xor y2, g /* y2 = CH = ((f^g)&e)^g ; CH */ + + rorx y1, a, 39 /* y1 = a >> 39 ; S0A */ + add y2, y0 /* y2 = S1 + CH ; -- */ + + xor y1, T1 /* y1 = (a>>39) ^ (a>>34) ; S0 */ + add d, y2 /* d = k + w + h + d + S1 + CH = d + t1 ; -- */ + + rorx T1, a, 28 /* T1 = (a >> 28) ; S0 */ + + xor y1, T1 /* y1 = (a>>39) ^ (a>>34) ^ (a>>28) ; S0 */ + mov T1, a /* T1 = a ; MAJB */ + and T1, c /* T1 = a&c ; MAJB */ + or y3, T1 /* y3 = MAJ = (a|c)&b)|(a&c) ; MAJ */ + + add h, y1 /* h = k + w + h + S0 ; -- */ + add h, y2 /* h = k + w + h + S0 + S1 + CH = t1 + S0 ; -- */ + add h, y3 /* h = t1 + S0 + MAJ ; -- */ + +RotateState + +rotate_Ys +.endm + +.macro DO_4ROUNDS + +/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ + + mov y2, f /* y2 = f ; CH */ + rorx y0, e, 41 /* y0 = e >> 41 ; S1A */ + rorx y1, e, 18 /* y1 = e >> 18 ; S1B */ + xor y2, g /* y2 = f^g ; CH */ + + xor y0, y1 /* y0 = (e>>41) ^ (e>>18) ; S1 */ + rorx y1, e, 14 /* y1 = (e >> 14) ; S1 
*/ + and y2, e /* y2 = (f^g)&e ; CH */ + + xor y0, y1 /* y0 = (e>>41) ^ (e>>18) ^ (e>>14) ; S1 */ + rorx T1, a, 34 /* T1 = a >> 34 ; S0B */ + xor y2, g /* y2 = CH = ((f^g)&e)^g ; CH */ + rorx y1, a, 39 /* y1 = a >> 39 ; S0A */ + mov y3, a /* y3 = a ; MAJA */ + + xor y1, T1 /* y1 = (a>>39) ^ (a>>34) ; S0 */ + rorx T1, a, 28 /* T1 = (a >> 28) ; S0 */ + add h, [rsp + frame_XFER + 8*0] /* h = k + w + h ; -- */ + or y3, c /* y3 = a|c ; MAJA */ + + xor y1, T1 /* y1 = (a>>39) ^ (a>>34) ^ (a>>28) ; S0 */ + mov T1, a /* T1 = a ; MAJB */ + and y3, b /* y3 = (a|c)&b ; MAJA */ + and T1, c /* T1 = a&c ; MAJB */ + add y2, y0 /* y2 = S1 + CH ; -- */ + + + add d, h /* d = k + w + h + d ; -- */ + or y3, T1 /* y3 = MAJ = (a|c)&b)|(a&c) ; MAJ */ + add h, y1 /* h = k + w + h + S0 ; -- */ + + add d, y2 /* d = k + w + h + d + S1 + CH = d + t1 ; -- */ + + + /*add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0 ; -- */ + + /*add h, y3 ; h = t1 + S0 + MAJ ; -- */ + + RotateState + +/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ + + add old_h, y2 /* h = k + w + h + S0 + S1 + CH = t1 + S0 ; -- */ + mov y2, f /* y2 = f ; CH */ + rorx y0, e, 41 /* y0 = e >> 41 ; S1A */ + rorx y1, e, 18 /* y1 = e >> 18 ; S1B */ + xor y2, g /* y2 = f^g ; CH */ + + xor y0, y1 /* y0 = (e>>41) ^ (e>>18) ; S1 */ + rorx y1, e, 14 /* y1 = (e >> 14) ; S1 */ + and y2, e /* y2 = (f^g)&e ; CH */ + add old_h, y3 /* h = t1 + S0 + MAJ ; -- */ + + xor y0, y1 /* y0 = (e>>41) ^ (e>>18) ^ (e>>14) ; S1 */ + rorx T1, a, 34 /* T1 = a >> 34 ; S0B */ + xor y2, g /* y2 = CH = ((f^g)&e)^g ; CH */ + rorx y1, a, 39 /* y1 = a >> 39 ; S0A */ + mov y3, a /* y3 = a ; MAJA */ + + xor y1, T1 /* y1 = (a>>39) ^ (a>>34) ; S0 */ + rorx T1, a, 28 /* T1 = (a >> 28) ; S0 */ + add h, [rsp + frame_XFER + 8*1] /* h = k + w + h ; -- */ + or y3, c /* y3 = a|c ; MAJA */ + + xor y1, T1 /* y1 = (a>>39) ^ (a>>34) ^ (a>>28) ; S0 */ + mov T1, a /* T1 = a ; MAJB */ + and y3, b /* y3 = (a|c)&b ; MAJA */ + and T1, c /* 
T1 = a&c ; MAJB */ + add y2, y0 /* y2 = S1 + CH ; -- */ + + + add d, h /* d = k + w + h + d ; -- */ + or y3, T1 /* y3 = MAJ = (a|c)&b)|(a&c) ; MAJ */ + add h, y1 /* h = k + w + h + S0 ; -- */ + + add d, y2 /* d = k + w + h + d + S1 + CH = d + t1 ; -- */ + + + /*add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0 ; -- */ + + /*add h, y3 ; h = t1 + S0 + MAJ ; -- */ + + RotateState + +/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ + + add old_h, y2 /* h = k + w + h + S0 + S1 + CH = t1 + S0 ; -- */ + mov y2, f /* y2 = f ; CH */ + rorx y0, e, 41 /* y0 = e >> 41 ; S1A */ + rorx y1, e, 18 /* y1 = e >> 18 ; S1B */ + xor y2, g /* y2 = f^g ; CH */ + + xor y0, y1 /* y0 = (e>>41) ^ (e>>18) ; S1 */ + rorx y1, e, 14 /* y1 = (e >> 14) ; S1 */ + and y2, e /* y2 = (f^g)&e ; CH */ + add old_h, y3 /* h = t1 + S0 + MAJ ; -- */ + + xor y0, y1 /* y0 = (e>>41) ^ (e>>18) ^ (e>>14) ; S1 */ + rorx T1, a, 34 /* T1 = a >> 34 ; S0B */ + xor y2, g /* y2 = CH = ((f^g)&e)^g ; CH */ + rorx y1, a, 39 /* y1 = a >> 39 ; S0A */ + mov y3, a /* y3 = a ; MAJA */ + + xor y1, T1 /* y1 = (a>>39) ^ (a>>34) ; S0 */ + rorx T1, a, 28 /* T1 = (a >> 28) ; S0 */ + add h, [rsp + frame_XFER + 8*2] /* h = k + w + h ; -- */ + or y3, c /* y3 = a|c ; MAJA */ + + xor y1, T1 /* y1 = (a>>39) ^ (a>>34) ^ (a>>28) ; S0 */ + mov T1, a /* T1 = a ; MAJB */ + and y3, b /* y3 = (a|c)&b ; MAJA */ + and T1, c /* T1 = a&c ; MAJB */ + add y2, y0 /* y2 = S1 + CH ; -- */ + + + add d, h /* d = k + w + h + d ; -- */ + or y3, T1 /* y3 = MAJ = (a|c)&b)|(a&c) ; MAJ */ + add h, y1 /* h = k + w + h + S0 ; -- */ + + add d, y2 /* d = k + w + h + d + S1 + CH = d + t1 ; -- */ + + + /*add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0 ; -- */ + + /*add h, y3 ; h = t1 + S0 + MAJ ; -- */ + + RotateState + +/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ + + add old_h, y2 /* h = k + w + h + S0 + S1 + CH = t1 + S0 ; -- */ + mov y2, f /* y2 = f ; CH */ + rorx y0, e, 
41 /* y0 = e >> 41 ; S1A */ + rorx y1, e, 18 /* y1 = e >> 18 ; S1B */ + xor y2, g /* y2 = f^g ; CH */ + + xor y0, y1 /* y0 = (e>>41) ^ (e>>18) ; S1 */ + rorx y1, e, 14 /* y1 = (e >> 14) ; S1 */ + and y2, e /* y2 = (f^g)&e ; CH */ + add old_h, y3 /* h = t1 + S0 + MAJ ; -- */ + + xor y0, y1 /* y0 = (e>>41) ^ (e>>18) ^ (e>>14) ; S1 */ + rorx T1, a, 34 /* T1 = a >> 34 ; S0B */ + xor y2, g /* y2 = CH = ((f^g)&e)^g ; CH */ + rorx y1, a, 39 /* y1 = a >> 39 ; S0A */ + mov y3, a /* y3 = a ; MAJA */ + + xor y1, T1 /* y1 = (a>>39) ^ (a>>34) ; S0 */ + rorx T1, a, 28 /* T1 = (a >> 28) ; S0 */ + add h, [rsp + frame_XFER + 8*3] /* h = k + w + h ; -- */ + or y3, c /* y3 = a|c ; MAJA */ + + xor y1, T1 /* y1 = (a>>39) ^ (a>>34) ^ (a>>28) ; S0 */ + mov T1, a /* T1 = a ; MAJB */ + and y3, b /* y3 = (a|c)&b ; MAJA */ + and T1, c /* T1 = a&c ; MAJB */ + add y2, y0 /* y2 = S1 + CH ; -- */ + + + add d, h /* d = k + w + h + d ; -- */ + or y3, T1 /* y3 = MAJ = (a|c)&b)|(a&c) ; MAJ */ + add h, y1 /* h = k + w + h + S0 ; -- */ + + add d, y2 /* d = k + w + h + d + S1 + CH = d + t1 ; -- */ + + + add h, y2 /* h = k + w + h + S0 + S1 + CH = t1 + S0 ; -- */ + + add h, y3 /* h = t1 + S0 + MAJ ; -- */ + + RotateState + +.endm + +/* +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; void sha512_rorx(const void* M, void* D, uint64_t L); +; Purpose: Updates the SHA512 digest stored at D with the message stored in M. +; The size of the message pointed to by M must be an integer multiple of SHA512 +; message blocks. 
+; L is the message length in SHA512 blocks +*/ +.globl _gcry_sha512_transform_amd64_avx2 +.type _gcry_sha512_transform_amd64_avx2,@function; +.align 16 +_gcry_sha512_transform_amd64_avx2: + xor eax, eax + + cmp rdx, 0 + je .Lnowork + + /* Allocate Stack Space */ + mov rax, rsp + sub rsp, frame_size + and rsp, ~(0x20 - 1) + mov [rsp + frame_RSPSAVE], rax + + /* Save GPRs */ + mov [rsp + frame_GPRSAVE + 8 * 0], rbp + mov [rsp + frame_GPRSAVE + 8 * 1], rbx + mov [rsp + frame_GPRSAVE + 8 * 2], r12 + mov [rsp + frame_GPRSAVE + 8 * 3], r13 + mov [rsp + frame_GPRSAVE + 8 * 4], r14 + mov [rsp + frame_GPRSAVE + 8 * 5], r15 + + vpblendd xmm0, xmm0, xmm1, 0xf0 + vpblendd ymm0, ymm0, ymm1, 0xf0 + + shl NUM_BLKS, 7 /* convert to bytes */ + jz .Ldone_hash + add NUM_BLKS, INP /* pointer to end of data */ + mov [rsp + frame_INPEND], NUM_BLKS + + /*; load initial digest */ + mov a,[8*0 + CTX] + mov b,[8*1 + CTX] + mov c,[8*2 + CTX] + mov d,[8*3 + CTX] + mov e,[8*4 + CTX] + mov f,[8*5 + CTX] + mov g,[8*6 + CTX] + mov h,[8*7 + CTX] + + vmovdqa BYTE_FLIP_MASK, [.LPSHUFFLE_BYTE_FLIP_MASK ADD_RIP] + +.Loop0: + lea TBL,[.LK512 ADD_RIP] + + /*; byte swap first 16 dwords */ + COPY_YMM_AND_BSWAP Y_0, [INP + 0*32], BYTE_FLIP_MASK + COPY_YMM_AND_BSWAP Y_1, [INP + 1*32], BYTE_FLIP_MASK + COPY_YMM_AND_BSWAP Y_2, [INP + 2*32], BYTE_FLIP_MASK + COPY_YMM_AND_BSWAP Y_3, [INP + 3*32], BYTE_FLIP_MASK + + mov [rsp + frame_INP], INP + + /*; schedule 64 input dwords, by doing 12 rounds of 4 each */ + movq [rsp + frame_SRND],4 + +.align 16 +.Loop1: + vpaddq XFER, Y_0, [TBL + 0*32] + vmovdqa [rsp + frame_XFER], XFER + FOUR_ROUNDS_AND_SCHED + + vpaddq XFER, Y_0, [TBL + 1*32] + vmovdqa [rsp + frame_XFER], XFER + FOUR_ROUNDS_AND_SCHED + + vpaddq XFER, Y_0, [TBL + 2*32] + vmovdqa [rsp + frame_XFER], XFER + FOUR_ROUNDS_AND_SCHED + + vpaddq XFER, Y_0, [TBL + 3*32] + vmovdqa [rsp + frame_XFER], XFER + add TBL, 4*32 + FOUR_ROUNDS_AND_SCHED + + subq [rsp + frame_SRND], 1 + jne .Loop1 + + movq [rsp +
frame_SRND], 2 +.Loop2: + vpaddq XFER, Y_0, [TBL + 0*32] + vmovdqa [rsp + frame_XFER], XFER + DO_4ROUNDS + vpaddq XFER, Y_1, [TBL + 1*32] + vmovdqa [rsp + frame_XFER], XFER + add TBL, 2*32 + DO_4ROUNDS + + vmovdqa Y_0, Y_2 + vmovdqa Y_1, Y_3 + + subq [rsp + frame_SRND], 1 + jne .Loop2 + + addm [8*0 + CTX],a + addm [8*1 + CTX],b + addm [8*2 + CTX],c + addm [8*3 + CTX],d + addm [8*4 + CTX],e + addm [8*5 + CTX],f + addm [8*6 + CTX],g + addm [8*7 + CTX],h + + mov INP, [rsp + frame_INP] + add INP, 128 + cmp INP, [rsp + frame_INPEND] + jne .Loop0 + +.Ldone_hash: + + /* Restore GPRs */ + mov rbp, [rsp + frame_GPRSAVE + 8 * 0] + mov rbx, [rsp + frame_GPRSAVE + 8 * 1] + mov r12, [rsp + frame_GPRSAVE + 8 * 2] + mov r13, [rsp + frame_GPRSAVE + 8 * 3] + mov r14, [rsp + frame_GPRSAVE + 8 * 4] + mov r15, [rsp + frame_GPRSAVE + 8 * 5] + + /* Restore Stack Pointer */ + mov rsp, [rsp + frame_RSPSAVE] + + mov rax, frame_size +.Lnowork: + ret + +/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ +/*;; Binary Data */ + +.data + +.align 64 +/* K[t] used in SHA512 hashing */ +.LK512: + .quad 0x428a2f98d728ae22,0x7137449123ef65cd + .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + .quad 0x3956c25bf348b538,0x59f111f1b605d019 + .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + .quad 0xd807aa98a3030242,0x12835b0145706fbe + .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + .quad 0x9bdc06a725c71235,0xc19bf174cf692694 + .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + .quad 0x983e5152ee66dfab,0xa831c66d2db43210 + .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 + .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 + .quad 0x06ca6351e003826f,0x142929670a0e6e70 + .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 + .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 + .quad 
0x81c2c92e47edaee6,0x92722c851482353b + .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 + .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 + .quad 0xd192e819d6ef5218,0xd69906245565a910 + .quad 0xf40e35855771202a,0x106aa07032bbd1b8 + .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 + .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec + .quad 0x90befffa23631e28,0xa4506cebde82bde9 + .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b + .quad 0xca273eceea26619c,0xd186b8c721c0c207 + .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 + .quad 0x113f9804bef90dae,0x1b710b35131c471b + .quad 0x28db77f523047d84,0x32caab7b40c72493 + .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 + +.align 32 + +/* Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. */ +.LPSHUFFLE_BYTE_FLIP_MASK: .octa 0x08090a0b0c0d0e0f0001020304050607 + .octa 0x18191a1b1c1d1e1f1011121314151617 + +.LMASK_YMM_LO: .octa 0x00000000000000000000000000000000 + .octa 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF + +#endif +#endif diff --git a/cipher/sha512.c b/cipher/sha512.c index 34b3215..586c809 100644 --- a/cipher/sha512.c +++ b/cipher/sha512.c @@ -74,6 +74,24 @@ #endif +/* USE_AVX indicates whether to compile with Intel AVX code. */ +#undef USE_AVX +#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_AVX) && \ + defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) +# define USE_AVX 1 +#endif + + +/* USE_AVX2 indicates whether to compile with Intel AVX2/rorx code. 
*/ +#undef USE_AVX2 +#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_AVX2) && defined(HAVE_GCC_INLINE_ASM_BMI2) && \ + defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) +# define USE_AVX2 1 +#endif + + typedef struct { u64 h0, h1, h2, h3, h4, h5, h6, h7; @@ -89,6 +107,12 @@ typedef struct #ifdef USE_SSSE3 unsigned int use_ssse3:1; #endif +#ifdef USE_AVX + unsigned int use_avx:1; +#endif +#ifdef USE_AVX2 + unsigned int use_avx2:1; +#endif } SHA512_CONTEXT; static unsigned int @@ -99,6 +123,7 @@ sha512_init (void *context) { SHA512_CONTEXT *ctx = context; SHA512_STATE *hd = &ctx->state; + unsigned int features = _gcry_get_hw_features (); hd->h0 = U64_C(0x6a09e667f3bcc908); hd->h1 = U64_C(0xbb67ae8584caa73b); @@ -116,11 +141,19 @@ sha512_init (void *context) ctx->bctx.bwrite = transform; #ifdef USE_ARM_NEON_ASM - ctx->use_neon = (_gcry_get_hw_features () & HWF_ARM_NEON) != 0; + ctx->use_neon = (features & HWF_ARM_NEON) != 0; #endif #ifdef USE_SSSE3 - ctx->use_ssse3 = (_gcry_get_hw_features () & HWF_INTEL_SSSE3) != 0; + ctx->use_ssse3 = (features & HWF_INTEL_SSSE3) != 0; +#endif +#ifdef USE_AVX + ctx->use_avx = (features & HWF_INTEL_AVX) && (features & HWF_INTEL_CPU); +#endif +#ifdef USE_AVX2 + ctx->use_avx2 = (features & HWF_INTEL_AVX2) && (features & HWF_INTEL_BMI2); #endif + + (void)features; } static void @@ -128,6 +161,7 @@ sha384_init (void *context) { SHA512_CONTEXT *ctx = context; SHA512_STATE *hd = &ctx->state; + unsigned int features = _gcry_get_hw_features (); hd->h0 = U64_C(0xcbbb9d5dc1059ed8); hd->h1 = U64_C(0x629a292a367cd507); @@ -145,11 +179,19 @@ sha384_init (void *context) ctx->bctx.bwrite = transform; #ifdef USE_ARM_NEON_ASM - ctx->use_neon = (_gcry_get_hw_features () & HWF_ARM_NEON) != 0; + ctx->use_neon = (features & HWF_ARM_NEON) != 0; #endif #ifdef USE_SSSE3 - ctx->use_ssse3 = (_gcry_get_hw_features () & HWF_INTEL_SSSE3) != 0; + ctx->use_ssse3 = (features & HWF_INTEL_SSSE3) != 0; +#endif +#ifdef 
USE_AVX + ctx->use_avx = (features & HWF_INTEL_AVX) && (features & HWF_INTEL_CPU); #endif +#ifdef USE_AVX2 + ctx->use_avx2 = (features & HWF_INTEL_AVX2) && (features & HWF_INTEL_BMI2); +#endif + + (void)features; } @@ -507,12 +549,34 @@ unsigned int _gcry_sha512_transform_amd64_ssse3(const void *input_data, void *state, size_t num_blks); #endif +#ifdef USE_AVX +unsigned int _gcry_sha512_transform_amd64_avx(const void *input_data, + void *state, size_t num_blks); +#endif + +#ifdef USE_AVX2 +unsigned int _gcry_sha512_transform_amd64_avx2(const void *input_data, + void *state, size_t num_blks); +#endif + static unsigned int transform (void *context, const unsigned char *data) { SHA512_CONTEXT *ctx = context; +#ifdef USE_AVX2 + if (ctx->use_avx2) + return _gcry_sha512_transform_amd64_avx2 (data, &ctx->state, 1) + + 4 * sizeof(void*); +#endif + +#ifdef USE_AVX + if (ctx->use_avx) + return _gcry_sha512_transform_amd64_avx (data, &ctx->state, 1) + + 4 * sizeof(void*); +#endif + #ifdef USE_SSSE3 if (ctx->use_ssse3) return _gcry_sha512_transform_amd64_ssse3 (data, &ctx->state, 1) diff --git a/configure.ac b/configure.ac index b930f4e..5a83160 100644 --- a/configure.ac +++ b/configure.ac @@ -1050,6 +1050,23 @@ fi # +# Check whether GCC inline assembler supports BMI2 instructions +# +AC_CACHE_CHECK([whether GCC inline assembler supports BMI2 instructions], + [gcry_cv_gcc_inline_asm_bmi2], + [gcry_cv_gcc_inline_asm_bmi2=no + AC_COMPILE_IFELSE([AC_LANG_SOURCE( + [[void a(void) { + __asm__("rorxq \$23, %%rax, %%rdx\\n\\t":::"memory"); + }]])], + [gcry_cv_gcc_inline_asm_bmi2=yes])]) +if test "$gcry_cv_gcc_inline_asm_bmi2" = "yes" ; then + AC_DEFINE(HAVE_GCC_INLINE_ASM_BMI2,1, + [Defined if inline assembler supports BMI2 instructions]) +fi + + +# # Check whether GCC assembler supports features needed for our amd64 # implementations # @@ -1732,6 +1749,8 @@ if test "$found" = "1" ; then x86_64-*-*) # Build with the assembly implementation GCRYPT_DIGESTS="$GCRYPT_DIGESTS 
sha512-ssse3-amd64.lo" + GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha512-avx-amd64.lo" + GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha512-avx2-bmi2-amd64.lo" ;; esac diff --git a/src/g10lib.h b/src/g10lib.h index 1dcadfa..98ed92b 100644 --- a/src/g10lib.h +++ b/src/g10lib.h @@ -167,6 +167,8 @@ int _gcry_log_verbosity( int level ); #define HWF_PADLOCK_SHA 4 #define HWF_PADLOCK_MMUL 8 +#define HWF_INTEL_CPU 16 +#define HWF_INTEL_BMI2 32 #define HWF_INTEL_SSSE3 64 #define HWF_INTEL_PCLMUL 128 #define HWF_INTEL_AESNI 256 diff --git a/src/global.c b/src/global.c index de99286..909adec 100644 --- a/src/global.c +++ b/src/global.c @@ -66,6 +66,8 @@ static struct { HWF_PADLOCK_AES, "padlock-aes" }, { HWF_PADLOCK_SHA, "padlock-sha" }, { HWF_PADLOCK_MMUL,"padlock-mmul"}, + { HWF_INTEL_CPU, "intel-cpu" }, + { HWF_INTEL_BMI2, "intel-bmi2" }, { HWF_INTEL_SSSE3, "intel-ssse3" }, { HWF_INTEL_PCLMUL,"intel-pclmul" }, { HWF_INTEL_AESNI, "intel-aesni" }, diff --git a/src/hwf-x86.c b/src/hwf-x86.c index ab6dacd..4e82558 100644 --- a/src/hwf-x86.c +++ b/src/hwf-x86.c @@ -189,6 +189,7 @@ detect_x86_gnuc (void) else if (!strcmp (vendor_id, "GenuineIntel")) { /* This is an Intel CPU. */ + result |= HWF_INTEL_CPU; } else if (!strcmp (vendor_id, "AuthenticAMD")) { @@ -231,10 +232,14 @@ detect_x86_gnuc (void) * Source: http://www.sandpile.org/x86/cpuid.htm */ if (max_cpuid_level >= 7 && (features & 0x00000001)) { -#ifdef ENABLE_AVX2_SUPPORT /* Get CPUID:7 contains further Intel feature flags. */ get_cpuid(7, NULL, &features, NULL, NULL); + /* Test bit 8 for BMI2. */ + if (features & 0x00000100) + result |= HWF_INTEL_BMI2; + +#ifdef ENABLE_AVX2_SUPPORT /* Test bit 5 for AVX2. */ if (features & 0x00000020) result |= HWF_INTEL_AVX2; From cvs at cvs.gnupg.org Thu Dec 12 18:56:05 2013 From: cvs at cvs.gnupg.org (by Werner Koch) Date: Thu, 12 Dec 2013 18:56:05 +0100 Subject: [git] GCRYPT - branch, master, updated. 
libgcrypt-1.5.0-430-g4ae7732 Message-ID: This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "The GNU crypto library". The branch, master has been updated via 4ae77322b681a13da62d01274bcab25be2af12d0 (commit) from 3b30e9840d4b351c4de73b126e561154cb7df4cc (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. - Log ----------------------------------------------------------------- commit 4ae77322b681a13da62d01274bcab25be2af12d0 Author: Werner Koch Date: Thu Dec 12 18:53:39 2013 +0100 Move list of hardware features to hwfeatures.c. * src/global.c (hwflist, disabled_hw_features): Move to .. * src/hwfeatures.c: here. (_gcry_disable_hw_feature): New. (_gcry_enum_hw_features): New. (_gcry_detect_hw_features): Remove arg DISABLED_FEATURES. * src/global.c (print_config, _gcry_vcontrol, global_init): Adjust accordingly. -- It is better to keep the hardware feature infor at one place. Signed-off-by: Werner Koch diff --git a/src/g10lib.h b/src/g10lib.h index 4e083b8..1e58ef6 100644 --- a/src/g10lib.h +++ b/src/g10lib.h @@ -206,8 +206,10 @@ int _gcry_log_verbosity( int level ); #define HWF_ARM_NEON 4096 +gpg_err_code_t _gcry_disable_hw_feature (const char *name); +void _gcry_detect_hw_features (void); unsigned int _gcry_get_hw_features (void); -void _gcry_detect_hw_features (unsigned int); +const char *_gcry_enum_hw_features (int idx, unsigned int *r_feature); /*-- mpi/mpiutil.c --*/ diff --git a/src/global.c b/src/global.c index cb66d37..9af499e 100644 --- a/src/global.c +++ b/src/global.c @@ -55,31 +55,6 @@ static int force_fips_mode; /* Controlled by global_init(). */ static int any_init_done; -/* A table to map hardware features to a string. 
*/ -static struct -{ - unsigned int flag; - const char *desc; -} hwflist[] = - { - { HWF_PADLOCK_RNG, "padlock-rng" }, - { HWF_PADLOCK_AES, "padlock-aes" }, - { HWF_PADLOCK_SHA, "padlock-sha" }, - { HWF_PADLOCK_MMUL,"padlock-mmul"}, - { HWF_INTEL_PCLMUL,"intel-pclmul" }, - { HWF_INTEL_AESNI, "intel-aesni" }, - { HWF_INTEL_RDRAND,"intel-rdrand" }, - { HWF_INTEL_AVX, "intel-avx" }, - { HWF_INTEL_AVX2, "intel-avx2" }, - { HWF_ARM_NEON, "arm-neon" }, - { 0, NULL} - }; - -/* A bit vector with the hardware features which shall not be used. - This variable must be set prior to any initialization. */ -static unsigned int disabled_hw_features; - - /* Memory management. */ static gcry_handler_alloc_t alloc_func; @@ -125,7 +100,7 @@ global_init (void) /* Before we do any other initialization we need to test available hardware features. */ - _gcry_detect_hw_features (disabled_hw_features); + _gcry_detect_hw_features (); /* Initialize the modules - this is mainly allocating some memory and creating mutexes. 
*/ @@ -289,8 +264,9 @@ _gcry_check_version (const char *req_version) static void print_config ( int (*fnc)(FILE *fp, const char *format, ...), FILE *fp) { - unsigned int hwf; + unsigned int hwfeatures, afeature; int i; + const char *s; fnc (fp, "version:%s:\n", VERSION); fnc (fp, "ciphers:%s:\n", LIBGCRYPT_CIPHERS); @@ -329,11 +305,11 @@ print_config ( int (*fnc)(FILE *fp, const char *format, ...), FILE *fp) ":\n"); fnc (fp, "mpi-asm:%s:\n", _gcry_mpi_get_hw_config ()); fnc (fp, "threads:%s:\n", ath_get_model (NULL)); - hwf = _gcry_get_hw_features (); + hwfeatures = _gcry_get_hw_features (); fnc (fp, "hwflist:"); - for (i=0; hwflist[i].desc; i++) - if ( (hwf & hwflist[i].flag) ) - fnc (fp, "%s:", hwflist[i].desc); + for (i=0; (s = _gcry_enum_hw_features (i, &afeature)); i++) + if ((hwfeatures & afeature)) + fnc (fp, "%s:", s); fnc (fp, "\n"); /* We use y/n instead of 1/0 for the simple reason that Emacsen's compile error parser would accidently flag that line when printed @@ -343,8 +319,6 @@ print_config ( int (*fnc)(FILE *fp, const char *format, ...), FILE *fp) _gcry_enforced_fips_mode ()? 'y':'n' ); /* The currently used RNG type. 
*/ { - const char *s; - i = _gcry_get_rng_type (0); switch (i) { @@ -653,16 +627,7 @@ _gcry_vcontrol (enum gcry_ctl_cmds cmd, va_list arg_ptr) case GCRYCTL_DISABLE_HWF: { const char *name = va_arg (arg_ptr, const char *); - int i; - - for (i=0; hwflist[i].desc; i++) - if (!strcmp (hwflist[i].desc, name)) - { - disabled_hw_features |= hwflist[i].flag; - break; - } - if (!hwflist[i].desc) - rc = GPG_ERR_INV_NAME; + rc = _gcry_disable_hw_feature (name); } break; diff --git a/src/hwf-arm.c b/src/hwf-arm.c index 9ab4cd0..5c99a1d 100644 --- a/src/hwf-arm.c +++ b/src/hwf-arm.c @@ -39,7 +39,8 @@ #define AT_HWCAP 16 #define HWCAP_NEON 4096 -static int get_hwcap(unsigned int *hwcap) +static int +get_hwcap(unsigned int *hwcap) { struct { unsigned int a_type; unsigned int a_val; } auxv; FILE *f; @@ -76,7 +77,8 @@ static int get_hwcap(unsigned int *hwcap) return err; } -static unsigned int detect_arm_at_hwcap(void) +static unsigned int +detect_arm_at_hwcap(void) { unsigned int hwcap; unsigned int features = 0; diff --git a/src/hwfeatures.c b/src/hwfeatures.c index 1e3c27d..43847d2 100644 --- a/src/hwfeatures.c +++ b/src/hwfeatures.c @@ -29,11 +29,51 @@ #include "hwf-common.h" +/* A table to map hardware features to a string. */ +static struct +{ + unsigned int flag; + const char *desc; +} hwflist[] = + { + { HWF_PADLOCK_RNG, "padlock-rng" }, + { HWF_PADLOCK_AES, "padlock-aes" }, + { HWF_PADLOCK_SHA, "padlock-sha" }, + { HWF_PADLOCK_MMUL,"padlock-mmul"}, + { HWF_INTEL_PCLMUL,"intel-pclmul" }, + { HWF_INTEL_AESNI, "intel-aesni" }, + { HWF_INTEL_RDRAND,"intel-rdrand" }, + { HWF_INTEL_AVX, "intel-avx" }, + { HWF_INTEL_AVX2, "intel-avx2" }, + { HWF_ARM_NEON, "arm-neon" } + }; + +/* A bit vector with the hardware features which shall not be used. + This variable must be set prior to any initialization. */ +static unsigned int disabled_hw_features; + /* A bit vector describing the hardware features currently available. */ static unsigned int hw_features; +/* Disable a feature by name. 
This function must be called *before* + _gcry_detect_hw_features is called. */ +gpg_err_code_t +_gcry_disable_hw_feature (const char *name) +{ + int i; + + for (i=0; i < DIM (hwflist); i++) + if (!strcmp (hwflist[i].desc, name)) + { + disabled_hw_features |= hwflist[i].flag; + return 0; + } + return GPG_ERR_INV_NAME; +} + + /* Return a bit vector describing the available hardware features. The HWF_ constants are used to test for them. */ unsigned int @@ -43,11 +83,24 @@ _gcry_get_hw_features (void) } +/* Enumerate all features. The caller is expected to start with an + IDX of 0 and then increment IDX until NULL is returned. */ +const char * +_gcry_enum_hw_features (int idx, unsigned int *r_feature) +{ + if (idx < 0 || idx >= DIM (hwflist)) + return NULL; + if (r_feature) + *r_feature = hwflist[idx].flag; + return hwflist[idx].desc; +} + + /* Detect the available hardware features. This function is called once right at startup and we assume that no other threads are running. */ void -_gcry_detect_hw_features (unsigned int disabled_features) +_gcry_detect_hw_features (void) { hw_features = 0; @@ -65,5 +118,5 @@ _gcry_detect_hw_features (unsigned int disabled_features) } #endif /* HAVE_CPU_ARCH_ARM */ - hw_features &= ~disabled_features; + hw_features &= ~disabled_hw_features; } ----------------------------------------------------------------------- Summary of changes: src/g10lib.h | 4 +++- src/global.c | 51 ++++++++---------------------------------------- src/hwf-arm.c | 6 ++++-- src/hwfeatures.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 70 insertions(+), 48 deletions(-) hooks/post-receive -- The GNU crypto library http://git.gnupg.org _______________________________________________ Gnupg-commits mailing list Gnupg-commits at gnupg.org http://lists.gnupg.org/mailman/listinfo/gnupg-commits From cvs at cvs.gnupg.org Thu Dec 12 20:29:11 2013 From: cvs at cvs.gnupg.org (by Werner Koch) Date: Thu, 12 Dec 2013 20:29:11 +0100 Subject: 
[git] GCRYPT - branch, master, updated. libgcrypt-1.5.0-431-g5e1239b Message-ID: This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "The GNU crypto library". The branch, master has been updated via 5e1239b1e2948211ff2675f45cce2b28c3379cfb (commit) from 4ae77322b681a13da62d01274bcab25be2af12d0 (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. - Log ----------------------------------------------------------------- commit 5e1239b1e2948211ff2675f45cce2b28c3379cfb Author: Werner Koch Date: Thu Dec 12 20:26:56 2013 +0100 Add a configuration file to disable hardware features. * src/hwfeatures.c: Include syslog.h and ctype.h. (HWF_DENY_FILE): New. (my_isascii): New. (parse_hwf_deny_file): New. (_gcry_detect_hw_features): Call it. * src/mpicalc.c (main): Correctly initialize Libgcrypt. Add options "--print-config" and "--disable-hwf". Signed-off-by: Werner Koch diff --git a/NEWS b/NEWS index 4c95e8a..978047f 100644 --- a/NEWS +++ b/NEWS @@ -46,6 +46,8 @@ Noteworthy changes in version 1.6.0 (unreleased) * Changed gcry_pk_genkey for "ecc" to only include the curve name and not the parameters. The flag "param" may be used to revert this. + * Added a feature to globally disable selected hardware features. + * Interface changes relative to the 1.5.0 release: ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ gcry_ac_* REMOVED. diff --git a/doc/gcrypt.texi b/doc/gcrypt.texi index 97dac1c..f3af29f 100644 --- a/doc/gcrypt.texi +++ b/doc/gcrypt.texi @@ -210,6 +210,7 @@ of the library are verified. * Initializing the library:: How to initialize the library. * Multi-Threading:: How Libgcrypt can be used in a MT environment. * Enabling FIPS mode:: How to enable the FIPS mode. +* Hardware features:: How to disable hardware features.
@end menu @@ -609,6 +610,50 @@ switch back to standard mode without terminating the process first. If the logging verbosity level of Libgcrypt has been set to at least 2, the state transitions and the self-tests are logged. +@node Hardware features +@section How to disable hardware features +@cindex hardware features + +Libgcrypt makes use of certain hardware features. If the use of a +feature is not desired it may either be disabled by a program or +globally using a configuration file. The currently supported features +are + +@table @code +@item padlock-rng +@item padlock-aes +@item padlock-sha +@item padlock-mmul +@item intel-pclmul +@item intel-aesni +@item intel-rdrand +@item intel-avx +@item intel-avx2 +@item arm-neon +@end table + +To disable a feature for all processes using Libgcrypt 1.6 or newer, +create the file @file{/etc/gcrypt/hwf.deny} and put each feature not +to be used on a single line. Empty lines, white space, and lines +prefixed with a hash mark are ignored. The file should be world +readable. + +To disable a feature specifically for a program, that program must tell +Libgcrypt so before calling @code{gcry_check_version}. +Example:@footnote{NB. Libgcrypt uses the RDRAND feature only as one +source of entropy. A CPU with a broken RDRAND will thus not +compromise the random number generator.} + +@example + gcry_control (GCRYCTL_DISABLE_HWF, "intel-rdrand", NULL); +@end example + +@noindent +To print the list of active features you may use this command: + +@example + mpicalc --print-config | grep ^hwflist: | tr : '\n' | tail -n +2 +@end example @c ********************************************************** diff --git a/src/fips.c b/src/fips.c index 1d7a6a4..8148dcd 100644 --- a/src/fips.c +++ b/src/fips.c @@ -36,7 +36,7 @@ #include "hmac256.h" -/* The name of the file used to foce libgcrypt into fips mode. */ +/* The name of the file used to force libgcrypt into fips mode.
*/ #define FIPS_FORCE_FILE "/etc/gcrypt/fips_enabled" diff --git a/src/hwfeatures.c b/src/hwfeatures.c index 43847d2..6699816 100644 --- a/src/hwfeatures.c +++ b/src/hwfeatures.c @@ -20,14 +20,20 @@ #include #include +#include #include #include #include #include +#ifdef HAVE_SYSLOG +# include +#endif /*HAVE_SYSLOG*/ #include "g10lib.h" #include "hwf-common.h" +/* The name of a file used to globally disable selected features. */ +#define HWF_DENY_FILE "/etc/gcrypt/hwf.deny" /* A table to map hardware features to a string. */ static struct @@ -56,7 +62,11 @@ static unsigned int disabled_hw_features; available. */ static unsigned int hw_features; +/* Convenience macros. */ +#define my_isascii(c) (!((c) & 0x80)) + + /* Disable a feature by name. This function must be called *before* _gcry_detect_hw_features is called. */ gpg_err_code_t @@ -96,6 +106,70 @@ _gcry_enum_hw_features (int idx, unsigned int *r_feature) } +/* Read a file with features which shall not be used. The file is a + simple text file where empty lines and lines with the first non + white-space character being '#' are ignored. */ +static void +parse_hwf_deny_file (void) +{ + const char *fname = HWF_DENY_FILE; + FILE *fp; + char buffer[256]; + char *p, *pend; + int i, lnr = 0; + + fp = fopen (fname, "r"); + if (!fp) + return; + + for (;;) + { + if (!fgets (buffer, sizeof buffer, fp)) + { + if (!feof (fp)) + { +#ifdef HAVE_SYSLOG + syslog (LOG_USER|LOG_WARNING, + "Libgcrypt warning: error reading '%s', line %d", + fname, lnr); +#endif /*HAVE_SYSLOG*/ + } + fclose (fp); + return; + } + lnr++; + for (p=buffer; my_isascii (*p) && isspace (*p); p++) + ; + pend = strchr (p, '\n'); + if (pend) + *pend = 0; + pend = p + (*p? 
(strlen (p)-1):0); + for ( ;pend > p; pend--) + if (my_isascii (*pend) && isspace (*pend)) + *pend = 0; + if (!*p || *p == '#') + continue; + + for (i=0; i < DIM (hwflist); i++) + { + if (!strcmp (hwflist[i].desc, p)) + { + disabled_hw_features |= hwflist[i].flag; + break; + } + } + if (i == DIM (hwflist)) + { +#ifdef HAVE_SYSLOG + syslog (LOG_USER|LOG_WARNING, + "Libgcrypt warning: unknown feature in '%s', line %d", + fname, lnr); +#endif /*HAVE_SYSLOG*/ + } + } +} + + /* Detect the available hardware features. This function is called once right at startup and we assume that no other threads are running. */ @@ -107,6 +181,8 @@ _gcry_detect_hw_features (void) if (fips_mode ()) return; /* Hardware support is not to be evaluated. */ + parse_hwf_deny_file (); + #if defined (HAVE_CPU_ARCH_X86) { hw_features = _gcry_hwf_detect_x86 (); diff --git a/src/mpicalc.c b/src/mpicalc.c index 335b7c3..b2b4335 100644 --- a/src/mpicalc.c +++ b/src/mpicalc.c @@ -40,6 +40,7 @@ #define MPICALC_VERSION "2.0" +#define NEED_LIBGCRYPT_VERSION "1.6.0" #define STACKSIZE 500 static gcry_mpi_t stack[STACKSIZE]; @@ -309,6 +310,7 @@ main (int argc, char **argv) { const char *pgm; int last_argc = -1; + int print_config = 0; int i, c; int state = 0; char strbuf[1000]; @@ -350,10 +352,28 @@ main (int argc, char **argv) "Simple interactive big integer RPN calculator\n" "\n" "Options:\n" - " --version print version information\n", + " --version print version information\n" + " --print-config print the Libgcrypt config\n" + " --disable-hwf NAME disable feature NAME\n", pgm, gcry_check_version (NULL)); exit (0); } + else if (!strcmp (*argv, "--print-config")) + { + argc--; argv++; + print_config = 1; + } + else if (!strcmp (*argv, "--disable-hwf")) + { + argc--; argv++; + if (argc) + { + if (gcry_control (GCRYCTL_DISABLE_HWF, *argv, NULL)) + fprintf (stderr, "%s: unknown hardware feature `%s'" + " - option ignored\n", pgm, *argv); + argc--; argv++; + } + } } if (argc) @@ -362,6 +382,20 @@ main (int 
argc, char **argv) exit (1); } + if (!gcry_check_version (NEED_LIBGCRYPT_VERSION)) + { + fprintf (stderr, "%s: Libgcrypt is too old (need %s, have %s)\n", + pgm, NEED_LIBGCRYPT_VERSION, gcry_check_version (NULL) ); + exit (1); + } + gcry_control (GCRYCTL_DISABLE_SECMEM, 0); + gcry_control (GCRYCTL_INITIALIZATION_FINISHED, 0); + if (print_config) + { + gcry_control (GCRYCTL_PRINT_CONFIG, stdout); + exit (0); + } + for (i = 0; i < STACKSIZE; i++) stack[i] = NULL; stackidx = 0; ----------------------------------------------------------------------- Summary of changes: NEWS | 2 ++ doc/gcrypt.texi | 45 ++++++++++++++++++++++++++++++++ src/fips.c | 2 +- src/hwfeatures.c | 76 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/mpicalc.c | 36 +++++++++++++++++++++++++- 5 files changed, 159 insertions(+), 2 deletions(-) hooks/post-receive -- The GNU crypto library http://git.gnupg.org _______________________________________________ Gnupg-commits mailing list Gnupg-commits at gnupg.org http://lists.gnupg.org/mailman/listinfo/gnupg-commits From cvs at cvs.gnupg.org Fri Dec 13 10:25:41 2013 From: cvs at cvs.gnupg.org (by Jussi Kivilinna) Date: Fri, 13 Dec 2013 10:25:41 +0100 Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.5.0-436-ge41d605 Message-ID: This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "The GNU crypto library". The branch, master has been updated via e41d605ee41469e8a33cdc4d38f742cfb931f835 (commit) via a71b810ddd67ca3a1773d8f929d162551abb58eb (commit) via 2e4253dc8eb512cd0e807360926dc6ba912c95b4 (commit) via 69a6d0f9562fcd26112a589318c13de66ce1700e (commit) via e1a3931263e67aacec3c0bfcaa86c7d1441d5c6a (commit) from 5e1239b1e2948211ff2675f45cce2b28c3379cfb (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. 
- Log ----------------------------------------------------------------- commit e41d605ee41469e8a33cdc4d38f742cfb931f835 Author: Jussi Kivilinna Date: Fri Dec 13 00:00:08 2013 +0200 Fix empty clobber in AVX2 assembly check * configure.ac (gcry_cv_gcc_inline_asm_avx2): Add "cc" as assembly clobber. -- Apparently empty clobbers only work in some cases on linux, and fail on mingw32. Signed-off-by: Jussi Kivilinna diff --git a/configure.ac b/configure.ac index bc86956..b91240b 100644 --- a/configure.ac +++ b/configure.ac @@ -1040,7 +1040,7 @@ AC_CACHE_CHECK([whether GCC inline assembler supports AVX2 instructions], [gcry_cv_gcc_inline_asm_avx2=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void a(void) { - __asm__("vpbroadcastb %%xmm7,%%ymm1\n\t":::); + __asm__("vpbroadcastb %%xmm7,%%ymm1\n\t":::"cc"); }]])], [gcry_cv_gcc_inline_asm_avx2=yes])]) if test "$gcry_cv_gcc_inline_asm_avx2" = "yes" ; then commit a71b810ddd67ca3a1773d8f929d162551abb58eb Author: Jussi Kivilinna Date: Thu Dec 12 23:53:28 2013 +0200 Fix W32 build * random/rndw32.c (registry_poll, slow_gatherer): Change gcry_xmalloc to xmalloc, and gcry_xrealloc to xrealloc.
-- Patch fixes following errors: ../random/.libs/librandom.a(rndw32.o): In function `registry_poll': .../libgcrypt/random/rndw32.c:434: undefined reference to `__gcry_USE_THE_UNDERSCORED_FUNCTION' .../libgcrypt/random/rndw32.c:454: undefined reference to `__gcry_USE_THE_UNDERSCORED_FUNCTION' ../random/.libs/librandom.a(rndw32.o): In function `slow_gatherer': .../random/rndw32.c:658: undefined reference to `__gcry_USE_THE_UNDERSCORED_FUNCTION' Signed-off-by: Jussi Kivilinna diff --git a/random/rndw32.c b/random/rndw32.c index 03dffaf..c495131 100644 --- a/random/rndw32.c +++ b/random/rndw32.c @@ -431,7 +431,7 @@ registry_poll (void (*add)(const void*, size_t, enum random_origins), } else { - pPerfData = gcry_xmalloc (cbPerfData); + pPerfData = xmalloc (cbPerfData); for (iterations=0; iterations < 10; iterations++) { dwSize = cbPerfData; @@ -451,7 +451,7 @@ registry_poll (void (*add)(const void*, size_t, enum random_origins), else if (status == ERROR_MORE_DATA) { cbPerfData += PERFORMANCE_BUFFER_STEP; - pPerfData = gcry_xrealloc (pPerfData, cbPerfData); + pPerfData = xrealloc (pPerfData, cbPerfData); } else { @@ -655,7 +655,7 @@ slow_gatherer ( void (*add)(const void*, size_t, enum random_origins), This scan typically yields around 20 pieces of data, there's nothing in the range 65...128 so chances are there won't be anything above there either. */ - buffer = gcry_xmalloc (PERFORMANCE_BUFFER_SIZE); + buffer = xmalloc (PERFORMANCE_BUFFER_SIZE); for (dwType = 0; dwType < 64; dwType++) { switch (dwType) commit 2e4253dc8eb512cd0e807360926dc6ba912c95b4 Author: Jussi Kivilinna Date: Thu Dec 12 13:56:13 2013 +0200 SHA-512: Add AVX and AVX2 implementations for x86-64 * cipher/Makefile.am: Add 'sha512-avx-amd64.S' and 'sha512-avx2-bmi2-amd64.S'. * cipher/sha512-avx-amd64.S: New. * cipher/sha512-avx2-bmi2-amd64.S: New. * cipher/sha512.c (USE_AVX, USE_AVX2): New. (SHA512_CONTEXT) [USE_AVX]: Add 'use_avx'. (SHA512_CONTEXT) [USE_AVX2]: Add 'use_avx2'. 
(sha512_init, sha384_init) [USE_AVX]: Initialize 'use_avx'. (sha512_init, sha384_init) [USE_AVX2]: Initialize 'use_avx2'. [USE_AVX] (_gcry_sha512_transform_amd64_avx): New. [USE_AVX2] (_gcry_sha512_transform_amd64_avx2): New. (transform) [USE_AVX2]: Add call for AVX2 implementation. (transform) [USE_AVX]: Add call for AVX implementation. * configure.ac (HAVE_GCC_INLINE_ASM_BMI2): New check. (sha512): Add 'sha512-avx-amd64.lo' and 'sha512-avx2-bmi2-amd64.lo'. * doc/gcrypt.texi: Document 'intel-cpu' and 'intel-bmi2'. * src/g10lib.h (HWF_INTEL_CPU, HWF_INTEL_BMI2): New. * src/hwfeatures.c (hwflist): Add "intel-cpu" and "intel-bmi2". * src/hwf-x86.c (detect_x86_gnuc): Check for HWF_INTEL_CPU and HWF_INTEL_BMI2. -- Patch adds fast AVX and AVX2 implementation of SHA-512 by Intel Corporation. The assembly source is licensed under 3-clause BSD license, thus compatible with LGPL2.1+. Original source can be accessed at: http://www.intel.com/p/en_US/embedded/hwsw/technology/packet-processing#docs Implementation is described in white paper "Fast SHA512 Implementations on Intel® Architecture Processors" http://www.intel.com/content/www/us/en/intelligent-systems/intel-technology/fast-sha512-implementat$ Note: AVX implementation uses SHLD instruction to emulate RORQ, since it's faster on Intel Sandy-Bridge. However, on non-Intel CPUs SHLD is much slower than RORQ, so AVX implementation is (for now) limited to Intel CPUs. Note: AVX2 implementation also uses BMI2 instruction rorx, thus additional HWF flag.
Benchmarks: cpu Old SSSE3 AVX/AVX2 Old vs AVX/AVX2 vs SSSE3 Intel i5-4570 10.11 c/B 7.56 c/B 6.72 c/B 1.50x 1.12x Intel i5-2450M 14.11 c/B 10.53 c/B 8.88 c/B 1.58x 1.18x Signed-off-by: Jussi Kivilinna diff --git a/cipher/Makefile.am b/cipher/Makefile.am index 88c288a..7c85af2 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -78,7 +78,7 @@ scrypt.c \ seed.c \ serpent.c serpent-sse2-amd64.S serpent-avx2-amd64.S \ sha1.c \ -sha256.c sha256-ssse3-amd64.S \ +sha256.c sha256-ssse3-amd64.S sha256-avx-amd64.S sha256-avx2-bmi2-amd64.S \ sha512.c sha512-ssse3-amd64.S sha512-armv7-neon.S \ stribog.c \ tiger.c \ diff --git a/cipher/sha512-avx-amd64.S b/cipher/sha512-avx-amd64.S new file mode 100644 index 0000000..691d771 --- /dev/null +++ b/cipher/sha512-avx-amd64.S @@ -0,0 +1,412 @@ +/* +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright (c) 2012, Intel Corporation +; +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are +; met: +; +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in the +; documentation and/or other materials provided with the +; distribution. +; +; * Neither the name of the Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived from +; this software without specific prior written permission. +; +; +; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY +; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +; PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL CORPORATION OR +; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +*/ +/* + * Conversion to GAS assembly and integration to libgcrypt + * by Jussi Kivilinna + */ + +#ifdef __x86_64 +#include +#if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \ + defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_AVX) && defined(USE_SHA512) + +#ifdef __PIC__ +# define ADD_RIP +rip +#else +# define ADD_RIP +#endif + +.intel_syntax noprefix + +.text + +/* Virtual Registers */ +msg = rdi /* ARG1 */ +digest = rsi /* ARG2 */ +msglen = rdx /* ARG3 */ +T1 = rcx +T2 = r8 +a_64 = r9 +b_64 = r10 +c_64 = r11 +d_64 = r12 +e_64 = r13 +f_64 = r14 +g_64 = r15 +h_64 = rbx +tmp0 = rax + +/* +; Local variables (stack frame) +; Note: frame_size must be an odd multiple of 8 bytes to XMM align RSP +*/ +frame_W = 0 /* Message Schedule */ +frame_W_size = (80 * 8) +frame_WK = ((frame_W) + (frame_W_size)) /* W[t] + K[t] | W[t+1] + K[t+1] */ +frame_WK_size = (2 * 8) +frame_GPRSAVE = ((frame_WK) + (frame_WK_size)) +frame_GPRSAVE_size = (5 * 8) +frame_size = ((frame_GPRSAVE) + (frame_GPRSAVE_size)) + + +/* Useful QWORD "arrays" for simpler memory references */ +#define MSG(i) msg + 8*(i) /* Input message (arg1) */ +#define DIGEST(i) digest + 8*(i) /* Output Digest (arg2) */ +#define K_t(i) .LK512 + 8*(i) ADD_RIP /* SHA Constants (static mem) */ +#define W_t(i) rsp + frame_W + 8*(i) /* Message Schedule (stack frame) */ +#define WK_2(i) rsp + frame_WK + 
8*((i) % 2) /* W[t]+K[t] (stack frame) */ +/* MSG, DIGEST, K_t, W_t are arrays */ +/* WK_2(t) points to 1 of 2 qwords at frame.WK depdending on t being odd/even */ + +.macro RotateState + /* Rotate symbles a..h right */ + __TMP = h_64 + h_64 = g_64 + g_64 = f_64 + f_64 = e_64 + e_64 = d_64 + d_64 = c_64 + c_64 = b_64 + b_64 = a_64 + a_64 = __TMP +.endm + +.macro RORQ p1 p2 + /* shld is faster than ror on Intel Sandybridge */ + shld \p1, \p1, (64 - \p2) +.endm + +.macro SHA512_Round t + /* Compute Round %%t */ + mov T1, f_64 /* T1 = f */ + mov tmp0, e_64 /* tmp = e */ + xor T1, g_64 /* T1 = f ^ g */ + RORQ tmp0, 23 /* 41 ; tmp = e ror 23 */ + and T1, e_64 /* T1 = (f ^ g) & e */ + xor tmp0, e_64 /* tmp = (e ror 23) ^ e */ + xor T1, g_64 /* T1 = ((f ^ g) & e) ^ g = CH(e,f,g) */ + add T1, [WK_2(\t)] /* W[t] + K[t] from message scheduler */ + RORQ tmp0, 4 /* 18 ; tmp = ((e ror 23) ^ e) ror 4 */ + xor tmp0, e_64 /* tmp = (((e ror 23) ^ e) ror 4) ^ e */ + mov T2, a_64 /* T2 = a */ + add T1, h_64 /* T1 = CH(e,f,g) + W[t] + K[t] + h */ + RORQ tmp0, 14 /* 14 ; tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e) */ + add T1, tmp0 /* T1 = CH(e,f,g) + W[t] + K[t] + S1(e) */ + mov tmp0, a_64 /* tmp = a */ + xor T2, c_64 /* T2 = a ^ c */ + and tmp0, c_64 /* tmp = a & c */ + and T2, b_64 /* T2 = (a ^ c) & b */ + xor T2, tmp0 /* T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c) */ + mov tmp0, a_64 /* tmp = a */ + RORQ tmp0, 5 /* 39 ; tmp = a ror 5 */ + xor tmp0, a_64 /* tmp = (a ror 5) ^ a */ + add d_64, T1 /* e(next_state) = d + T1 */ + RORQ tmp0, 6 /* 34 ; tmp = ((a ror 5) ^ a) ror 6 */ + xor tmp0, a_64 /* tmp = (((a ror 5) ^ a) ror 6) ^ a */ + lea h_64, [T1 + T2] /* a(next_state) = T1 + Maj(a,b,c) */ + RORQ tmp0, 28 /* 28 ; tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a) */ + add h_64, tmp0 /* a(next_state) = T1 + Maj(a,b,c) S0(a) */ + RotateState +.endm + +.macro SHA512_2Sched_2Round_avx t +/* ; Compute rounds %%t-2 and %%t-1 + ; Compute message schedule QWORDS %%t and %%t+1 + + ; Two rounds are 
computed based on the values for K[t-2]+W[t-2] and + ; K[t-1]+W[t-1] which were previously stored at WK_2 by the message + ; scheduler. + ; The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)]. + ; They are then added to their respective SHA512 constants at + ; [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)] + ; For brievity, the comments following vectored instructions only refer to + ; the first of a pair of QWORDS. + ; Eg. XMM4=W[t-2] really means XMM4={W[t-2]|W[t-1]} + ; The computation of the message schedule and the rounds are tightly + ; stitched to take advantage of instruction-level parallelism. + ; For clarity, integer instructions (for the rounds calculation) are indented + ; by one tab. Vectored instructions (for the message scheduler) are indented + ; by two tabs. */ + + vmovdqa xmm4, [W_t(\t-2)] /* XMM4 = W[t-2] */ + vmovdqu xmm5, [W_t(\t-15)] /* XMM5 = W[t-15] */ + mov T1, f_64 + vpsrlq xmm0, xmm4, 61 /* XMM0 = W[t-2]>>61 */ + mov tmp0, e_64 + vpsrlq xmm6, xmm5, 1 /* XMM6 = W[t-15]>>1 */ + xor T1, g_64 + RORQ tmp0, 23 /* 41 */ + vpsrlq xmm1, xmm4, 19 /* XMM1 = W[t-2]>>19 */ + and T1, e_64 + xor tmp0, e_64 + vpxor xmm0, xmm0, xmm1 /* XMM0 = W[t-2]>>61 ^ W[t-2]>>19 */ + xor T1, g_64 + add T1, [WK_2(\t)]; + vpsrlq xmm7, xmm5, 8 /* XMM7 = W[t-15]>>8 */ + RORQ tmp0, 4 /* 18 */ + vpsrlq xmm2, xmm4, 6 /* XMM2 = W[t-2]>>6 */ + xor tmp0, e_64 + mov T2, a_64 + add T1, h_64 + vpxor xmm6, xmm6, xmm7 /* XMM6 = W[t-15]>>1 ^ W[t-15]>>8 */ + RORQ tmp0, 14 /* 14 */ + add T1, tmp0 + vpsrlq xmm8, xmm5, 7 /* XMM8 = W[t-15]>>7 */ + mov tmp0, a_64 + xor T2, c_64 + vpsllq xmm3, xmm4, (64-61) /* XMM3 = W[t-2]<<3 */ + and tmp0, c_64 + and T2, b_64 + vpxor xmm2, xmm2, xmm3 /* XMM2 = W[t-2]>>6 ^ W[t-2]<<3 */ + xor T2, tmp0 + mov tmp0, a_64 + vpsllq xmm9, xmm5, (64-1) /* XMM9 = W[t-15]<<63 */ + RORQ tmp0, 5 /* 39 */ + vpxor xmm8, xmm8, xmm9 /* XMM8 = W[t-15]>>7 ^ W[t-15]<<63 */ + xor tmp0, a_64 + add d_64, T1 + RORQ tmp0, 6 /* 34 */ + xor tmp0, 
a_64 + vpxor xmm6, xmm6, xmm8 /* XMM6 = W[t-15]>>1 ^ W[t-15]>>8 ^ W[t-15]>>7 ^ W[t-15]<<63 */ + lea h_64, [T1 + T2] + RORQ tmp0, 28 /* 28 */ + vpsllq xmm4, xmm4, (64-19) /* XMM4 = W[t-2]<<25 */ + add h_64, tmp0 + RotateState + vpxor xmm0, xmm0, xmm4 /* XMM0 = W[t-2]>>61 ^ W[t-2]>>19 ^ W[t-2]<<25 */ + mov T1, f_64 + vpxor xmm0, xmm0, xmm2 /* XMM0 = s1(W[t-2]) */ + mov tmp0, e_64 + xor T1, g_64 + vpaddq xmm0, xmm0, [W_t(\t-16)] /* XMM0 = s1(W[t-2]) + W[t-16] */ + vmovdqu xmm1, [W_t(\t- 7)] /* XMM1 = W[t-7] */ + RORQ tmp0, 23 /* 41 */ + and T1, e_64 + xor tmp0, e_64 + xor T1, g_64 + vpsllq xmm5, xmm5, (64-8) /* XMM5 = W[t-15]<<56 */ + add T1, [WK_2(\t+1)] + vpxor xmm6, xmm6, xmm5 /* XMM6 = s0(W[t-15]) */ + RORQ tmp0, 4 /* 18 */ + vpaddq xmm0, xmm0, xmm6 /* XMM0 = s1(W[t-2]) + W[t-16] + s0(W[t-15]) */ + xor tmp0, e_64 + vpaddq xmm0, xmm0, xmm1 /* XMM0 = W[t] = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16] */ + mov T2, a_64 + add T1, h_64 + RORQ tmp0, 14 /* 14 */ + add T1, tmp0 + vmovdqa [W_t(\t)], xmm0 /* Store W[t] */ + vpaddq xmm0, xmm0, [K_t(t)] /* Compute W[t]+K[t] */ + vmovdqa [WK_2(t)], xmm0 /* Store W[t]+K[t] for next rounds */ + mov tmp0, a_64 + xor T2, c_64 + and tmp0, c_64 + and T2, b_64 + xor T2, tmp0 + mov tmp0, a_64 + RORQ tmp0, 5 /* 39 */ + xor tmp0, a_64 + add d_64, T1 + RORQ tmp0, 6 /* 34 */ + xor tmp0, a_64 + lea h_64, [T1 + T2] + RORQ tmp0, 28 /* 28 */ + add h_64, tmp0 + RotateState +.endm + +/* +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; void sha512_avx(const void* M, void* D, uint64_t L); +; Purpose: Updates the SHA512 digest stored at D with the message stored in M. +; The size of the message pointed to by M must be an integer multiple of SHA512 +; message blocks. 
+; L is the message length in SHA512 blocks +*/ +.globl _gcry_sha512_transform_amd64_avx +.type _gcry_sha512_transform_amd64_avx, at function; +.align 16 +_gcry_sha512_transform_amd64_avx: + xor eax, eax + + cmp msglen, 0 + je .Lnowork + + /* Allocate Stack Space */ + sub rsp, frame_size + + /* Save GPRs */ + mov [rsp + frame_GPRSAVE + 8 * 0], rbx + mov [rsp + frame_GPRSAVE + 8 * 1], r12 + mov [rsp + frame_GPRSAVE + 8 * 2], r13 + mov [rsp + frame_GPRSAVE + 8 * 3], r14 + mov [rsp + frame_GPRSAVE + 8 * 4], r15 + +.Lupdateblock: + + /* Load state variables */ + mov a_64, [DIGEST(0)] + mov b_64, [DIGEST(1)] + mov c_64, [DIGEST(2)] + mov d_64, [DIGEST(3)] + mov e_64, [DIGEST(4)] + mov f_64, [DIGEST(5)] + mov g_64, [DIGEST(6)] + mov h_64, [DIGEST(7)] + + t = 0 + .rept 80/2 + 1 + /* (80 rounds) / (2 rounds/iteration) + (1 iteration) */ + /* +1 iteration because the scheduler leads hashing by 1 iteration */ + .if t < 2 + /* BSWAP 2 QWORDS */ + vmovdqa xmm1, [.LXMM_QWORD_BSWAP ADD_RIP] + vmovdqu xmm0, [MSG(t)] + vpshufb xmm0, xmm0, xmm1 /* BSWAP */ + vmovdqa [W_t(t)], xmm0 /* Store Scheduled Pair */ + vpaddq xmm0, xmm0, [K_t(t)] /* Compute W[t]+K[t] */ + vmovdqa [WK_2(t)], xmm0 /* Store into WK for rounds */ + .elseif t < 16 + /* BSWAP 2 QWORDS, Compute 2 Rounds */ + vmovdqu xmm0, [MSG(t)] + vpshufb xmm0, xmm0, xmm1 /* BSWAP */ + SHA512_Round (t - 2) /* Round t-2 */ + vmovdqa [W_t(t)], xmm0 /* Store Scheduled Pair */ + vpaddq xmm0, xmm0, [K_t(t)] /* Compute W[t]+K[t] */ + SHA512_Round (t - 1) /* Round t-1 */ + vmovdqa [WK_2(t)], xmm0 /* W[t]+K[t] into WK */ + .elseif t < 79 + /* Schedule 2 QWORDS; Compute 2 Rounds */ + SHA512_2Sched_2Round_avx t + .else + /* Compute 2 Rounds */ + SHA512_Round (t - 2) + SHA512_Round (t - 1) + .endif + t = ((t)+2) + .endr + + /* Update digest */ + add [DIGEST(0)], a_64 + add [DIGEST(1)], b_64 + add [DIGEST(2)], c_64 + add [DIGEST(3)], d_64 + add [DIGEST(4)], e_64 + add [DIGEST(5)], f_64 + add [DIGEST(6)], g_64 + add [DIGEST(7)], h_64 + + /* 
Advance to next message block */ + add msg, 16*8 + dec msglen + jnz .Lupdateblock + + /* Restore GPRs */ + mov rbx, [rsp + frame_GPRSAVE + 8 * 0] + mov r12, [rsp + frame_GPRSAVE + 8 * 1] + mov r13, [rsp + frame_GPRSAVE + 8 * 2] + mov r14, [rsp + frame_GPRSAVE + 8 * 3] + mov r15, [rsp + frame_GPRSAVE + 8 * 4] + + /* Restore Stack Pointer */ + add rsp, frame_size + + /* Return stack burn depth */ + mov rax, frame_size + +.Lnowork: + ret + +/* +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Binary Data +*/ + +.data + +.align 16 + +/* Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. */ +.LXMM_QWORD_BSWAP: + .octa 0x08090a0b0c0d0e0f0001020304050607 + +/* K[t] used in SHA512 hashing */ +.LK512: + .quad 0x428a2f98d728ae22,0x7137449123ef65cd + .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + .quad 0x3956c25bf348b538,0x59f111f1b605d019 + .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + .quad 0xd807aa98a3030242,0x12835b0145706fbe + .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + .quad 0x9bdc06a725c71235,0xc19bf174cf692694 + .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + .quad 0x983e5152ee66dfab,0xa831c66d2db43210 + .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 + .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 + .quad 0x06ca6351e003826f,0x142929670a0e6e70 + .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 + .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 + .quad 0x81c2c92e47edaee6,0x92722c851482353b + .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 + .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 + .quad 0xd192e819d6ef5218,0xd69906245565a910 + .quad 0xf40e35855771202a,0x106aa07032bbd1b8 + .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb 
+ .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 + .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec + .quad 0x90befffa23631e28,0xa4506cebde82bde9 + .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b + .quad 0xca273eceea26619c,0xd186b8c721c0c207 + .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 + .quad 0x113f9804bef90dae,0x1b710b35131c471b + .quad 0x28db77f523047d84,0x32caab7b40c72493 + .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 + +#endif +#endif diff --git a/cipher/sha512-avx2-bmi2-amd64.S b/cipher/sha512-avx2-bmi2-amd64.S new file mode 100644 index 0000000..878c41b --- /dev/null +++ b/cipher/sha512-avx2-bmi2-amd64.S @@ -0,0 +1,783 @@ +/* +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright (c) 2012, Intel Corporation +; +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are +; met: +; +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in the +; documentation and/or other materials provided with the +; distribution. +; +; * Neither the name of the Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived from +; this software without specific prior written permission. +; +; +; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY +; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +; PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL CORPORATION OR +; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; This code schedules 1 blocks at a time, with 4 lanes per block +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +*/ +/* + * Conversion to GAS assembly and integration to libgcrypt + * by Jussi Kivilinna + */ + +#ifdef __x86_64 +#include +#if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \ + defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_AVX2) && defined(HAVE_GCC_INLINE_ASM_BMI2) && \ + defined(USE_SHA512) + +#ifdef __PIC__ +# define ADD_RIP +rip +#else +# define ADD_RIP +#endif + +.intel_syntax noprefix + +.text + +/* Virtual Registers */ +Y_0 = ymm4 +Y_1 = ymm5 +Y_2 = ymm6 +Y_3 = ymm7 + +YTMP0 = ymm0 +YTMP1 = ymm1 +YTMP2 = ymm2 +YTMP3 = ymm3 +YTMP4 = ymm8 +XFER = YTMP0 + +BYTE_FLIP_MASK = ymm9 + +INP = rdi /* 1st arg */ +CTX = rsi /* 2nd arg */ +NUM_BLKS = rdx /* 3rd arg */ +c = rcx +d = r8 +e = rdx +y3 = rdi + +TBL = rbp + +a = rax +b = rbx + +f = r9 +g = r10 +h = r11 +old_h = r11 + +T1 = r12 +y0 = r13 +y1 = r14 +y2 = r15 + +y4 = r12 + +/* Local variables (stack frame) */ +#define frame_XFER 0 +#define frame_XFER_size (4*8) +#define frame_SRND (frame_XFER + frame_XFER_size) +#define frame_SRND_size (1*8) +#define frame_INP (frame_SRND + frame_SRND_size) +#define frame_INP_size (1*8) +#define frame_INPEND (frame_INP + frame_INP_size) +#define frame_INPEND_size (1*8) +#define frame_RSPSAVE (frame_INPEND + 
frame_INPEND_size) +#define frame_RSPSAVE_size (1*8) +#define frame_GPRSAVE (frame_RSPSAVE + frame_RSPSAVE_size) +#define frame_GPRSAVE_size (6*8) +#define frame_size (frame_GPRSAVE + frame_GPRSAVE_size) + +#define VMOVDQ vmovdqu /*; assume buffers not aligned */ + +/* addm [mem], reg */ +/* Add reg to mem using reg-mem add and store */ +.macro addm p1 p2 + add \p2, \p1 + mov \p1, \p2 +.endm + + +/* COPY_YMM_AND_BSWAP ymm, [mem], byte_flip_mask */ +/* Load ymm with mem and byte swap each dword */ +.macro COPY_YMM_AND_BSWAP p1 p2 p3 + VMOVDQ \p1, \p2 + vpshufb \p1, \p1, \p3 +.endm +/* rotate_Ys */ +/* Rotate values of symbols Y0...Y3 */ +.macro rotate_Ys + __Y_ = Y_0 + Y_0 = Y_1 + Y_1 = Y_2 + Y_2 = Y_3 + Y_3 = __Y_ +.endm + +/* RotateState */ +.macro RotateState + /* Rotate symbles a..h right */ + old_h = h + __TMP_ = h + h = g + g = f + f = e + e = d + d = c + c = b + b = a + a = __TMP_ +.endm + +/* %macro MY_VPALIGNR YDST, YSRC1, YSRC2, RVAL */ +/* YDST = {YSRC1, YSRC2} >> RVAL*8 */ +.macro MY_VPALIGNR YDST, YSRC1, YSRC2, RVAL + vperm2f128 \YDST, \YSRC1, \YSRC2, 0x3 /* YDST = {YS1_LO, YS2_HI} */ + vpalignr \YDST, \YDST, \YSRC2, \RVAL /* YDST = {YDS1, YS2} >> RVAL*8 */ +.endm + +.macro FOUR_ROUNDS_AND_SCHED +/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ + + /* Extract w[t-7] */ + MY_VPALIGNR YTMP0, Y_3, Y_2, 8 /* YTMP0 = W[-7] */ + /* Calculate w[t-16] + w[t-7] */ + vpaddq YTMP0, YTMP0, Y_0 /* YTMP0 = W[-7] + W[-16] */ + /* Extract w[t-15] */ + MY_VPALIGNR YTMP1, Y_1, Y_0, 8 /* YTMP1 = W[-15] */ + + /* Calculate sigma0 */ + + /* Calculate w[t-15] ror 1 */ + vpsrlq YTMP2, YTMP1, 1 + vpsllq YTMP3, YTMP1, (64-1) + vpor YTMP3, YTMP3, YTMP2 /* YTMP3 = W[-15] ror 1 */ + /* Calculate w[t-15] shr 7 */ + vpsrlq YTMP4, YTMP1, 7 /* YTMP4 = W[-15] >> 7 */ + + mov y3, a /* y3 = a ; MAJA */ + rorx y0, e, 41 /* y0 = e >> 41 ; S1A */ + rorx y1, e, 18 /* y1 = e >> 18 ; S1B */ + + add h, [rsp+frame_XFER+0*8] /* h = k + w + h ; -- */ + or 
y3, c /* y3 = a|c ; MAJA */ + mov y2, f /* y2 = f ; CH */ + rorx T1, a, 34 /* T1 = a >> 34 ; S0B */ + + xor y0, y1 /* y0 = (e>>41) ^ (e>>18) ; S1 */ + xor y2, g /* y2 = f^g ; CH */ + rorx y1, e, 14 /* y1 = (e >> 14) ; S1 */ + + and y2, e /* y2 = (f^g)&e ; CH */ + xor y0, y1 /* y0 = (e>>41) ^ (e>>18) ^ (e>>14) ; S1 */ + rorx y1, a, 39 /* y1 = a >> 39 ; S0A */ + add d, h /* d = k + w + h + d ; -- */ + + and y3, b /* y3 = (a|c)&b ; MAJA */ + xor y1, T1 /* y1 = (a>>39) ^ (a>>34) ; S0 */ + rorx T1, a, 28 /* T1 = (a >> 28) ; S0 */ + + xor y2, g /* y2 = CH = ((f^g)&e)^g ; CH */ + xor y1, T1 /* y1 = (a>>39) ^ (a>>34) ^ (a>>28) ; S0 */ + mov T1, a /* T1 = a ; MAJB */ + and T1, c /* T1 = a&c ; MAJB */ + + add y2, y0 /* y2 = S1 + CH ; -- */ + or y3, T1 /* y3 = MAJ = (a|c)&b)|(a&c) ; MAJ */ + add h, y1 /* h = k + w + h + S0 ; -- */ + + add d, y2 /* d = k + w + h + d + S1 + CH = d + t1 ; -- */ + + add h, y2 /* h = k + w + h + S0 + S1 + CH = t1 + S0 ; -- */ + add h, y3 /* h = t1 + S0 + MAJ ; -- */ + +RotateState + +/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ + +/*;;;;;;;;;;;;;;;;;;;;;;;;; */ + + /* Calculate w[t-15] ror 8 */ + vpsrlq YTMP2, YTMP1, 8 + vpsllq YTMP1, YTMP1, (64-8) + vpor YTMP1, YTMP1, YTMP2 /* YTMP1 = W[-15] ror 8 */ + /* XOR the three components */ + vpxor YTMP3, YTMP3, YTMP4 /* YTMP3 = W[-15] ror 1 ^ W[-15] >> 7 */ + vpxor YTMP1, YTMP3, YTMP1 /* YTMP1 = s0 */ + + + /* Add three components, w[t-16], w[t-7] and sigma0 */ + vpaddq YTMP0, YTMP0, YTMP1 /* YTMP0 = W[-16] + W[-7] + s0 */ + /* Move to appropriate lanes for calculating w[16] and w[17] */ + vperm2f128 Y_0, YTMP0, YTMP0, 0x0 /* Y_0 = W[-16] + W[-7] + s0 {BABA} */ + /* Move to appropriate lanes for calculating w[18] and w[19] */ + vpand YTMP0, YTMP0, [.LMASK_YMM_LO ADD_RIP] /* YTMP0 = W[-16] + W[-7] + s0 {DC00} */ + + /* Calculate w[16] and w[17] in both 128 bit lanes */ + + /* Calculate sigma1 for w[16] and w[17] on both 128 bit lanes */ + vperm2f128 YTMP2, 
Y_3, Y_3, 0x11 /* YTMP2 = W[-2] {BABA} */ + vpsrlq YTMP4, YTMP2, 6 /* YTMP4 = W[-2] >> 6 {BABA} */ + + + mov y3, a /* y3 = a ; MAJA */ + rorx y0, e, 41 /* y0 = e >> 41 ; S1A */ + rorx y1, e, 18 /* y1 = e >> 18 ; S1B */ + add h, [rsp+frame_XFER+1*8] /* h = k + w + h ; -- */ + or y3, c /* y3 = a|c ; MAJA */ + + + mov y2, f /* y2 = f ; CH */ + rorx T1, a, 34 /* T1 = a >> 34 ; S0B */ + xor y0, y1 /* y0 = (e>>41) ^ (e>>18) ; S1 */ + xor y2, g /* y2 = f^g ; CH */ + + + rorx y1, e, 14 /* y1 = (e >> 14) ; S1 */ + xor y0, y1 /* y0 = (e>>41) ^ (e>>18) ^ (e>>14) ; S1 */ + rorx y1, a, 39 /* y1 = a >> 39 ; S0A */ + and y2, e /* y2 = (f^g)&e ; CH */ + add d, h /* d = k + w + h + d ; -- */ + + and y3, b /* y3 = (a|c)&b ; MAJA */ + xor y1, T1 /* y1 = (a>>39) ^ (a>>34) ; S0 */ + + rorx T1, a, 28 /* T1 = (a >> 28) ; S0 */ + xor y2, g /* y2 = CH = ((f^g)&e)^g ; CH */ + + xor y1, T1 /* y1 = (a>>39) ^ (a>>34) ^ (a>>28) ; S0 */ + mov T1, a /* T1 = a ; MAJB */ + and T1, c /* T1 = a&c ; MAJB */ + add y2, y0 /* y2 = S1 + CH ; -- */ + + or y3, T1 /* y3 = MAJ = (a|c)&b)|(a&c) ; MAJ */ + add h, y1 /* h = k + w + h + S0 ; -- */ + + add d, y2 /* d = k + w + h + d + S1 + CH = d + t1 ; -- */ + add h, y2 /* h = k + w + h + S0 + S1 + CH = t1 + S0 ; -- */ + add h, y3 /* h = t1 + S0 + MAJ ; -- */ + +RotateState + + + + +/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ + +/*;;;;;;;;;;;;;;;;;;;;;;;;; */ + + + vpsrlq YTMP3, YTMP2, 19 /* YTMP3 = W[-2] >> 19 {BABA} */ + vpsllq YTMP1, YTMP2, (64-19) /* YTMP1 = W[-2] << 19 {BABA} */ + vpor YTMP3, YTMP3, YTMP1 /* YTMP3 = W[-2] ror 19 {BABA} */ + vpxor YTMP4, YTMP4, YTMP3 /* YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {BABA} */ + vpsrlq YTMP3, YTMP2, 61 /* YTMP3 = W[-2] >> 61 {BABA} */ + vpsllq YTMP1, YTMP2, (64-61) /* YTMP1 = W[-2] << 61 {BABA} */ + vpor YTMP3, YTMP3, YTMP1 /* YTMP3 = W[-2] ror 61 {BABA} */ + vpxor YTMP4, YTMP4, YTMP3 /* YTMP4 = s1 = (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6) {BABA} */ + + /* Add sigma1 
to the other compunents to get w[16] and w[17] */ + vpaddq Y_0, Y_0, YTMP4 /* Y_0 = {W[1], W[0], W[1], W[0]} */ + + /* Calculate sigma1 for w[18] and w[19] for upper 128 bit lane */ + vpsrlq YTMP4, Y_0, 6 /* YTMP4 = W[-2] >> 6 {DC--} */ + + mov y3, a /* y3 = a ; MAJA */ + rorx y0, e, 41 /* y0 = e >> 41 ; S1A */ + add h, [rsp+frame_XFER+2*8] /* h = k + w + h ; -- */ + + rorx y1, e, 18 /* y1 = e >> 18 ; S1B */ + or y3, c /* y3 = a|c ; MAJA */ + mov y2, f /* y2 = f ; CH */ + xor y2, g /* y2 = f^g ; CH */ + + rorx T1, a, 34 /* T1 = a >> 34 ; S0B */ + xor y0, y1 /* y0 = (e>>41) ^ (e>>18) ; S1 */ + and y2, e /* y2 = (f^g)&e ; CH */ + + rorx y1, e, 14 /* y1 = (e >> 14) ; S1 */ + add d, h /* d = k + w + h + d ; -- */ + and y3, b /* y3 = (a|c)&b ; MAJA */ + + xor y0, y1 /* y0 = (e>>41) ^ (e>>18) ^ (e>>14) ; S1 */ + rorx y1, a, 39 /* y1 = a >> 39 ; S0A */ + xor y2, g /* y2 = CH = ((f^g)&e)^g ; CH */ + + xor y1, T1 /* y1 = (a>>39) ^ (a>>34) ; S0 */ + rorx T1, a, 28 /* T1 = (a >> 28) ; S0 */ + + xor y1, T1 /* y1 = (a>>39) ^ (a>>34) ^ (a>>28) ; S0 */ + mov T1, a /* T1 = a ; MAJB */ + and T1, c /* T1 = a&c ; MAJB */ + add y2, y0 /* y2 = S1 + CH ; -- */ + + or y3, T1 /* y3 = MAJ = (a|c)&b)|(a&c) ; MAJ */ + add h, y1 /* h = k + w + h + S0 ; -- */ + add d, y2 /* d = k + w + h + d + S1 + CH = d + t1 ; -- */ + add h, y2 /* h = k + w + h + S0 + S1 + CH = t1 + S0 ; -- */ + + add h, y3 /* h = t1 + S0 + MAJ ; -- */ + +RotateState + +/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ + +/*;;;;;;;;;;;;;;;;;;;;;;;;; */ + + vpsrlq YTMP3, Y_0, 19 /* YTMP3 = W[-2] >> 19 {DC--} */ + vpsllq YTMP1, Y_0, (64-19) /* YTMP1 = W[-2] << 19 {DC--} */ + vpor YTMP3, YTMP3, YTMP1 /* YTMP3 = W[-2] ror 19 {DC--} */ + vpxor YTMP4, YTMP4, YTMP3 /* YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {DC--} */ + vpsrlq YTMP3, Y_0, 61 /* YTMP3 = W[-2] >> 61 {DC--} */ + vpsllq YTMP1, Y_0, (64-61) /* YTMP1 = W[-2] << 61 {DC--} */ + vpor YTMP3, YTMP3, YTMP1 /* YTMP3 = W[-2] ror 61 {DC--} */ + 
vpxor YTMP4, YTMP4, YTMP3 /* YTMP4 = s1 = (W[-2] ror 19) ^ (W[-2] ror 61) ^ (W[-2] >> 6) {DC--} */ + + /* Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19] to newly calculated sigma1 to get w[18] and w[19] */ + vpaddq YTMP2, YTMP0, YTMP4 /* YTMP2 = {W[3], W[2], --, --} */ + + /* Form w[19, w[18], w17], w[16] */ + vpblendd Y_0, Y_0, YTMP2, 0xF0 /* Y_0 = {W[3], W[2], W[1], W[0]} */ +/* vperm2f128 Y_0, Y_0, YTMP2, 0x30 */ + + mov y3, a /* y3 = a ; MAJA */ + rorx y0, e, 41 /* y0 = e >> 41 ; S1A */ + rorx y1, e, 18 /* y1 = e >> 18 ; S1B */ + add h, [rsp+frame_XFER+3*8] /* h = k + w + h ; -- */ + or y3, c /* y3 = a|c ; MAJA */ + + + mov y2, f /* y2 = f ; CH */ + rorx T1, a, 34 /* T1 = a >> 34 ; S0B */ + xor y0, y1 /* y0 = (e>>41) ^ (e>>18) ; S1 */ + xor y2, g /* y2 = f^g ; CH */ + + + rorx y1, e, 14 /* y1 = (e >> 14) ; S1 */ + and y2, e /* y2 = (f^g)&e ; CH */ + add d, h /* d = k + w + h + d ; -- */ + and y3, b /* y3 = (a|c)&b ; MAJA */ + + xor y0, y1 /* y0 = (e>>41) ^ (e>>18) ^ (e>>14) ; S1 */ + xor y2, g /* y2 = CH = ((f^g)&e)^g ; CH */ + + rorx y1, a, 39 /* y1 = a >> 39 ; S0A */ + add y2, y0 /* y2 = S1 + CH ; -- */ + + xor y1, T1 /* y1 = (a>>39) ^ (a>>34) ; S0 */ + add d, y2 /* d = k + w + h + d + S1 + CH = d + t1 ; -- */ + + rorx T1, a, 28 /* T1 = (a >> 28) ; S0 */ + + xor y1, T1 /* y1 = (a>>39) ^ (a>>34) ^ (a>>28) ; S0 */ + mov T1, a /* T1 = a ; MAJB */ + and T1, c /* T1 = a&c ; MAJB */ + or y3, T1 /* y3 = MAJ = (a|c)&b)|(a&c) ; MAJ */ + + add h, y1 /* h = k + w + h + S0 ; -- */ + add h, y2 /* h = k + w + h + S0 + S1 + CH = t1 + S0 ; -- */ + add h, y3 /* h = t1 + S0 + MAJ ; -- */ + +RotateState + +rotate_Ys +.endm + +.macro DO_4ROUNDS + +/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ + + mov y2, f /* y2 = f ; CH */ + rorx y0, e, 41 /* y0 = e >> 41 ; S1A */ + rorx y1, e, 18 /* y1 = e >> 18 ; S1B */ + xor y2, g /* y2 = f^g ; CH */ + + xor y0, y1 /* y0 = (e>>41) ^ (e>>18) ; S1 */ + rorx y1, e, 14 /* y1 = (e >> 14) ; S1 
*/ + and y2, e /* y2 = (f^g)&e ; CH */ + + xor y0, y1 /* y0 = (e>>41) ^ (e>>18) ^ (e>>14) ; S1 */ + rorx T1, a, 34 /* T1 = a >> 34 ; S0B */ + xor y2, g /* y2 = CH = ((f^g)&e)^g ; CH */ + rorx y1, a, 39 /* y1 = a >> 39 ; S0A */ + mov y3, a /* y3 = a ; MAJA */ + + xor y1, T1 /* y1 = (a>>39) ^ (a>>34) ; S0 */ + rorx T1, a, 28 /* T1 = (a >> 28) ; S0 */ + add h, [rsp + frame_XFER + 8*0] /* h = k + w + h ; -- */ + or y3, c /* y3 = a|c ; MAJA */ + + xor y1, T1 /* y1 = (a>>39) ^ (a>>34) ^ (a>>28) ; S0 */ + mov T1, a /* T1 = a ; MAJB */ + and y3, b /* y3 = (a|c)&b ; MAJA */ + and T1, c /* T1 = a&c ; MAJB */ + add y2, y0 /* y2 = S1 + CH ; -- */ + + + add d, h /* d = k + w + h + d ; -- */ + or y3, T1 /* y3 = MAJ = (a|c)&b)|(a&c) ; MAJ */ + add h, y1 /* h = k + w + h + S0 ; -- */ + + add d, y2 /* d = k + w + h + d + S1 + CH = d + t1 ; -- */ + + + /*add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0 ; -- */ + + /*add h, y3 ; h = t1 + S0 + MAJ ; -- */ + + RotateState + +/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ + + add old_h, y2 /* h = k + w + h + S0 + S1 + CH = t1 + S0 ; -- */ + mov y2, f /* y2 = f ; CH */ + rorx y0, e, 41 /* y0 = e >> 41 ; S1A */ + rorx y1, e, 18 /* y1 = e >> 18 ; S1B */ + xor y2, g /* y2 = f^g ; CH */ + + xor y0, y1 /* y0 = (e>>41) ^ (e>>18) ; S1 */ + rorx y1, e, 14 /* y1 = (e >> 14) ; S1 */ + and y2, e /* y2 = (f^g)&e ; CH */ + add old_h, y3 /* h = t1 + S0 + MAJ ; -- */ + + xor y0, y1 /* y0 = (e>>41) ^ (e>>18) ^ (e>>14) ; S1 */ + rorx T1, a, 34 /* T1 = a >> 34 ; S0B */ + xor y2, g /* y2 = CH = ((f^g)&e)^g ; CH */ + rorx y1, a, 39 /* y1 = a >> 39 ; S0A */ + mov y3, a /* y3 = a ; MAJA */ + + xor y1, T1 /* y1 = (a>>39) ^ (a>>34) ; S0 */ + rorx T1, a, 28 /* T1 = (a >> 28) ; S0 */ + add h, [rsp + frame_XFER + 8*1] /* h = k + w + h ; -- */ + or y3, c /* y3 = a|c ; MAJA */ + + xor y1, T1 /* y1 = (a>>39) ^ (a>>34) ^ (a>>28) ; S0 */ + mov T1, a /* T1 = a ; MAJB */ + and y3, b /* y3 = (a|c)&b ; MAJA */ + and T1, c /* 
T1 = a&c ; MAJB */ + add y2, y0 /* y2 = S1 + CH ; -- */ + + + add d, h /* d = k + w + h + d ; -- */ + or y3, T1 /* y3 = MAJ = (a|c)&b)|(a&c) ; MAJ */ + add h, y1 /* h = k + w + h + S0 ; -- */ + + add d, y2 /* d = k + w + h + d + S1 + CH = d + t1 ; -- */ + + + /*add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0 ; -- */ + + /*add h, y3 ; h = t1 + S0 + MAJ ; -- */ + + RotateState + +/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ + + add old_h, y2 /* h = k + w + h + S0 + S1 + CH = t1 + S0 ; -- */ + mov y2, f /* y2 = f ; CH */ + rorx y0, e, 41 /* y0 = e >> 41 ; S1A */ + rorx y1, e, 18 /* y1 = e >> 18 ; S1B */ + xor y2, g /* y2 = f^g ; CH */ + + xor y0, y1 /* y0 = (e>>41) ^ (e>>18) ; S1 */ + rorx y1, e, 14 /* y1 = (e >> 14) ; S1 */ + and y2, e /* y2 = (f^g)&e ; CH */ + add old_h, y3 /* h = t1 + S0 + MAJ ; -- */ + + xor y0, y1 /* y0 = (e>>41) ^ (e>>18) ^ (e>>14) ; S1 */ + rorx T1, a, 34 /* T1 = a >> 34 ; S0B */ + xor y2, g /* y2 = CH = ((f^g)&e)^g ; CH */ + rorx y1, a, 39 /* y1 = a >> 39 ; S0A */ + mov y3, a /* y3 = a ; MAJA */ + + xor y1, T1 /* y1 = (a>>39) ^ (a>>34) ; S0 */ + rorx T1, a, 28 /* T1 = (a >> 28) ; S0 */ + add h, [rsp + frame_XFER + 8*2] /* h = k + w + h ; -- */ + or y3, c /* y3 = a|c ; MAJA */ + + xor y1, T1 /* y1 = (a>>39) ^ (a>>34) ^ (a>>28) ; S0 */ + mov T1, a /* T1 = a ; MAJB */ + and y3, b /* y3 = (a|c)&b ; MAJA */ + and T1, c /* T1 = a&c ; MAJB */ + add y2, y0 /* y2 = S1 + CH ; -- */ + + + add d, h /* d = k + w + h + d ; -- */ + or y3, T1 /* y3 = MAJ = (a|c)&b)|(a&c) ; MAJ */ + add h, y1 /* h = k + w + h + S0 ; -- */ + + add d, y2 /* d = k + w + h + d + S1 + CH = d + t1 ; -- */ + + + /*add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0 ; -- */ + + /*add h, y3 ; h = t1 + S0 + MAJ ; -- */ + + RotateState + +/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ + + add old_h, y2 /* h = k + w + h + S0 + S1 + CH = t1 + S0 ; -- */ + mov y2, f /* y2 = f ; CH */ + rorx y0, e, 
41 /* y0 = e >> 41 ; S1A */ + rorx y1, e, 18 /* y1 = e >> 18 ; S1B */ + xor y2, g /* y2 = f^g ; CH */ + + xor y0, y1 /* y0 = (e>>41) ^ (e>>18) ; S1 */ + rorx y1, e, 14 /* y1 = (e >> 14) ; S1 */ + and y2, e /* y2 = (f^g)&e ; CH */ + add old_h, y3 /* h = t1 + S0 + MAJ ; -- */ + + xor y0, y1 /* y0 = (e>>41) ^ (e>>18) ^ (e>>14) ; S1 */ + rorx T1, a, 34 /* T1 = a >> 34 ; S0B */ + xor y2, g /* y2 = CH = ((f^g)&e)^g ; CH */ + rorx y1, a, 39 /* y1 = a >> 39 ; S0A */ + mov y3, a /* y3 = a ; MAJA */ + + xor y1, T1 /* y1 = (a>>39) ^ (a>>34) ; S0 */ + rorx T1, a, 28 /* T1 = (a >> 28) ; S0 */ + add h, [rsp + frame_XFER + 8*3] /* h = k + w + h ; -- */ + or y3, c /* y3 = a|c ; MAJA */ + + xor y1, T1 /* y1 = (a>>39) ^ (a>>34) ^ (a>>28) ; S0 */ + mov T1, a /* T1 = a ; MAJB */ + and y3, b /* y3 = (a|c)&b ; MAJA */ + and T1, c /* T1 = a&c ; MAJB */ + add y2, y0 /* y2 = S1 + CH ; -- */ + + + add d, h /* d = k + w + h + d ; -- */ + or y3, T1 /* y3 = MAJ = (a|c)&b)|(a&c) ; MAJ */ + add h, y1 /* h = k + w + h + S0 ; -- */ + + add d, y2 /* d = k + w + h + d + S1 + CH = d + t1 ; -- */ + + + add h, y2 /* h = k + w + h + S0 + S1 + CH = t1 + S0 ; -- */ + + add h, y3 /* h = t1 + S0 + MAJ ; -- */ + + RotateState + +.endm + +/* +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; void sha512_rorx(const void* M, void* D, uint64_t L); +; Purpose: Updates the SHA512 digest stored at D with the message stored in M. +; The size of the message pointed to by M must be an integer multiple of SHA512 +; message blocks. 
+; L is the message length in SHA512 blocks +*/ +.globl _gcry_sha512_transform_amd64_avx2 +.type _gcry_sha512_transform_amd64_avx2,@function; +.align 16 +_gcry_sha512_transform_amd64_avx2: + xor eax, eax + + cmp rdx, 0 + je .Lnowork + + /* Allocate Stack Space */ + mov rax, rsp + sub rsp, frame_size + and rsp, ~(0x20 - 1) + mov [rsp + frame_RSPSAVE], rax + + /* Save GPRs */ + mov [rsp + frame_GPRSAVE + 8 * 0], rbp + mov [rsp + frame_GPRSAVE + 8 * 1], rbx + mov [rsp + frame_GPRSAVE + 8 * 2], r12 + mov [rsp + frame_GPRSAVE + 8 * 3], r13 + mov [rsp + frame_GPRSAVE + 8 * 4], r14 + mov [rsp + frame_GPRSAVE + 8 * 5], r15 + + vpblendd xmm0, xmm0, xmm1, 0xf0 + vpblendd ymm0, ymm0, ymm1, 0xf0 + + shl NUM_BLKS, 7 /* convert to bytes */ + jz .Ldone_hash + add NUM_BLKS, INP /* pointer to end of data */ + mov [rsp + frame_INPEND], NUM_BLKS + + /*; load initial digest */ + mov a,[8*0 + CTX] + mov b,[8*1 + CTX] + mov c,[8*2 + CTX] + mov d,[8*3 + CTX] + mov e,[8*4 + CTX] + mov f,[8*5 + CTX] + mov g,[8*6 + CTX] + mov h,[8*7 + CTX] + + vmovdqa BYTE_FLIP_MASK, [.LPSHUFFLE_BYTE_FLIP_MASK ADD_RIP] + +.Loop0: + lea TBL,[.LK512 ADD_RIP] + + /*; byte swap first 16 dwords */ + COPY_YMM_AND_BSWAP Y_0, [INP + 0*32], BYTE_FLIP_MASK + COPY_YMM_AND_BSWAP Y_1, [INP + 1*32], BYTE_FLIP_MASK + COPY_YMM_AND_BSWAP Y_2, [INP + 2*32], BYTE_FLIP_MASK + COPY_YMM_AND_BSWAP Y_3, [INP + 3*32], BYTE_FLIP_MASK + + mov [rsp + frame_INP], INP + + /*; schedule 64 input dwords, by doing 12 rounds of 4 each */ + movq [rsp + frame_SRND],4 + +.align 16 +.Loop1: + vpaddq XFER, Y_0, [TBL + 0*32] + vmovdqa [rsp + frame_XFER], XFER + FOUR_ROUNDS_AND_SCHED + + vpaddq XFER, Y_0, [TBL + 1*32] + vmovdqa [rsp + frame_XFER], XFER + FOUR_ROUNDS_AND_SCHED + + vpaddq XFER, Y_0, [TBL + 2*32] + vmovdqa [rsp + frame_XFER], XFER + FOUR_ROUNDS_AND_SCHED + + vpaddq XFER, Y_0, [TBL + 3*32] + vmovdqa [rsp + frame_XFER], XFER + add TBL, 4*32 + FOUR_ROUNDS_AND_SCHED + + subq [rsp + frame_SRND], 1 + jne .Loop1 + + movq [rsp + 
frame_SRND], 2 +.Loop2: + vpaddq XFER, Y_0, [TBL + 0*32] + vmovdqa [rsp + frame_XFER], XFER + DO_4ROUNDS + vpaddq XFER, Y_1, [TBL + 1*32] + vmovdqa [rsp + frame_XFER], XFER + add TBL, 2*32 + DO_4ROUNDS + + vmovdqa Y_0, Y_2 + vmovdqa Y_1, Y_3 + + subq [rsp + frame_SRND], 1 + jne .Loop2 + + addm [8*0 + CTX],a + addm [8*1 + CTX],b + addm [8*2 + CTX],c + addm [8*3 + CTX],d + addm [8*4 + CTX],e + addm [8*5 + CTX],f + addm [8*6 + CTX],g + addm [8*7 + CTX],h + + mov INP, [rsp + frame_INP] + add INP, 128 + cmp INP, [rsp + frame_INPEND] + jne .Loop0 + +.Ldone_hash: + + /* Restore GPRs */ + mov rbp, [rsp + frame_GPRSAVE + 8 * 0] + mov rbx, [rsp + frame_GPRSAVE + 8 * 1] + mov r12, [rsp + frame_GPRSAVE + 8 * 2] + mov r13, [rsp + frame_GPRSAVE + 8 * 3] + mov r14, [rsp + frame_GPRSAVE + 8 * 4] + mov r15, [rsp + frame_GPRSAVE + 8 * 5] + + /* Restore Stack Pointer */ + mov rsp, [rsp + frame_RSPSAVE] + + mov rax, frame_size +.Lnowork: + ret + +/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ +/*;; Binary Data */ + +.data + +.align 64 +/* K[t] used in SHA512 hashing */ +.LK512: + .quad 0x428a2f98d728ae22,0x7137449123ef65cd + .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + .quad 0x3956c25bf348b538,0x59f111f1b605d019 + .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + .quad 0xd807aa98a3030242,0x12835b0145706fbe + .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + .quad 0x9bdc06a725c71235,0xc19bf174cf692694 + .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + .quad 0x983e5152ee66dfab,0xa831c66d2db43210 + .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 + .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 + .quad 0x06ca6351e003826f,0x142929670a0e6e70 + .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 + .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 + .quad 
0x81c2c92e47edaee6,0x92722c851482353b + .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 + .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 + .quad 0xd192e819d6ef5218,0xd69906245565a910 + .quad 0xf40e35855771202a,0x106aa07032bbd1b8 + .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 + .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec + .quad 0x90befffa23631e28,0xa4506cebde82bde9 + .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b + .quad 0xca273eceea26619c,0xd186b8c721c0c207 + .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 + .quad 0x113f9804bef90dae,0x1b710b35131c471b + .quad 0x28db77f523047d84,0x32caab7b40c72493 + .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 + +.align 32 + +/* Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. */ +.LPSHUFFLE_BYTE_FLIP_MASK: .octa 0x08090a0b0c0d0e0f0001020304050607 + .octa 0x18191a1b1c1d1e1f1011121314151617 + +.LMASK_YMM_LO: .octa 0x00000000000000000000000000000000 + .octa 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF + +#endif +#endif diff --git a/cipher/sha512.c b/cipher/sha512.c index 34b3215..586c809 100644 --- a/cipher/sha512.c +++ b/cipher/sha512.c @@ -74,6 +74,24 @@ #endif +/* USE_AVX indicates whether to compile with Intel AVX code. */ +#undef USE_AVX +#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_AVX) && \ + defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) +# define USE_AVX 1 +#endif + + +/* USE_AVX2 indicates whether to compile with Intel AVX2/rorx code. 
*/ +#undef USE_AVX2 +#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_AVX2) && defined(HAVE_GCC_INLINE_ASM_BMI2) && \ + defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) +# define USE_AVX2 1 +#endif + + typedef struct { u64 h0, h1, h2, h3, h4, h5, h6, h7; @@ -89,6 +107,12 @@ typedef struct #ifdef USE_SSSE3 unsigned int use_ssse3:1; #endif +#ifdef USE_AVX + unsigned int use_avx:1; +#endif +#ifdef USE_AVX2 + unsigned int use_avx2:1; +#endif } SHA512_CONTEXT; static unsigned int @@ -99,6 +123,7 @@ sha512_init (void *context) { SHA512_CONTEXT *ctx = context; SHA512_STATE *hd = &ctx->state; + unsigned int features = _gcry_get_hw_features (); hd->h0 = U64_C(0x6a09e667f3bcc908); hd->h1 = U64_C(0xbb67ae8584caa73b); @@ -116,11 +141,19 @@ sha512_init (void *context) ctx->bctx.bwrite = transform; #ifdef USE_ARM_NEON_ASM - ctx->use_neon = (_gcry_get_hw_features () & HWF_ARM_NEON) != 0; + ctx->use_neon = (features & HWF_ARM_NEON) != 0; #endif #ifdef USE_SSSE3 - ctx->use_ssse3 = (_gcry_get_hw_features () & HWF_INTEL_SSSE3) != 0; + ctx->use_ssse3 = (features & HWF_INTEL_SSSE3) != 0; +#endif +#ifdef USE_AVX + ctx->use_avx = (features & HWF_INTEL_AVX) && (features & HWF_INTEL_CPU); +#endif +#ifdef USE_AVX2 + ctx->use_avx2 = (features & HWF_INTEL_AVX2) && (features & HWF_INTEL_BMI2); #endif + + (void)features; } static void @@ -128,6 +161,7 @@ sha384_init (void *context) { SHA512_CONTEXT *ctx = context; SHA512_STATE *hd = &ctx->state; + unsigned int features = _gcry_get_hw_features (); hd->h0 = U64_C(0xcbbb9d5dc1059ed8); hd->h1 = U64_C(0x629a292a367cd507); @@ -145,11 +179,19 @@ sha384_init (void *context) ctx->bctx.bwrite = transform; #ifdef USE_ARM_NEON_ASM - ctx->use_neon = (_gcry_get_hw_features () & HWF_ARM_NEON) != 0; + ctx->use_neon = (features & HWF_ARM_NEON) != 0; #endif #ifdef USE_SSSE3 - ctx->use_ssse3 = (_gcry_get_hw_features () & HWF_INTEL_SSSE3) != 0; + ctx->use_ssse3 = (features & HWF_INTEL_SSSE3) != 0; +#endif +#ifdef 
USE_AVX + ctx->use_avx = (features & HWF_INTEL_AVX) && (features & HWF_INTEL_CPU); #endif +#ifdef USE_AVX2 + ctx->use_avx2 = (features & HWF_INTEL_AVX2) && (features & HWF_INTEL_BMI2); +#endif + + (void)features; } @@ -507,12 +549,34 @@ unsigned int _gcry_sha512_transform_amd64_ssse3(const void *input_data, void *state, size_t num_blks); #endif +#ifdef USE_AVX +unsigned int _gcry_sha512_transform_amd64_avx(const void *input_data, + void *state, size_t num_blks); +#endif + +#ifdef USE_AVX2 +unsigned int _gcry_sha512_transform_amd64_avx2(const void *input_data, + void *state, size_t num_blks); +#endif + static unsigned int transform (void *context, const unsigned char *data) { SHA512_CONTEXT *ctx = context; +#ifdef USE_AVX2 + if (ctx->use_avx2) + return _gcry_sha512_transform_amd64_avx2 (data, &ctx->state, 1) + + 4 * sizeof(void*); +#endif + +#ifdef USE_AVX + if (ctx->use_avx) + return _gcry_sha512_transform_amd64_avx (data, &ctx->state, 1) + + 4 * sizeof(void*); +#endif + #ifdef USE_SSSE3 if (ctx->use_ssse3) return _gcry_sha512_transform_amd64_ssse3 (data, &ctx->state, 1) diff --git a/configure.ac b/configure.ac index 5990429..bc86956 100644 --- a/configure.ac +++ b/configure.ac @@ -1050,6 +1050,23 @@ fi # +# Check whether GCC inline assembler supports BMI2 instructions +# +AC_CACHE_CHECK([whether GCC inline assembler supports BMI2 instructions], + [gcry_cv_gcc_inline_asm_bmi2], + [gcry_cv_gcc_inline_asm_bmi2=no + AC_COMPILE_IFELSE([AC_LANG_SOURCE( + [[void a(void) { + __asm__("rorxl \$23, %%eax, %%edx\\n\\t":::"memory"); + }]])], + [gcry_cv_gcc_inline_asm_bmi2=yes])]) +if test "$gcry_cv_gcc_inline_asm_bmi2" = "yes" ; then + AC_DEFINE(HAVE_GCC_INLINE_ASM_BMI2,1, + [Defined if inline assembler supports BMI2 instructions]) +fi + + +# # Check whether GCC assembler supports features needed for our amd64 # implementations # @@ -1728,6 +1745,8 @@ if test "$found" = "1" ; then x86_64-*-*) # Build with the assembly implementation GCRYPT_DIGESTS="$GCRYPT_DIGESTS 
sha512-ssse3-amd64.lo" + GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha512-avx-amd64.lo" + GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha512-avx2-bmi2-amd64.lo" ;; esac diff --git a/doc/gcrypt.texi b/doc/gcrypt.texi index 0f7557d..410c45d 100644 --- a/doc/gcrypt.texi +++ b/doc/gcrypt.texi @@ -624,6 +624,8 @@ are @item padlock-aes @item padlock-sha @item padlock-mmul +@item intel-cpu +@item intel-bmi2 @item intel-ssse3 @item intel-pclmul @item intel-aesni diff --git a/src/g10lib.h b/src/g10lib.h index 6b2bafb..43dc011 100644 --- a/src/g10lib.h +++ b/src/g10lib.h @@ -197,6 +197,8 @@ int _gcry_log_verbosity( int level ); #define HWF_PADLOCK_SHA 4 #define HWF_PADLOCK_MMUL 8 +#define HWF_INTEL_CPU 16 +#define HWF_INTEL_BMI2 32 #define HWF_INTEL_SSSE3 64 #define HWF_INTEL_PCLMUL 128 #define HWF_INTEL_AESNI 256 diff --git a/src/hwf-x86.c b/src/hwf-x86.c index ab6dacd..4e82558 100644 --- a/src/hwf-x86.c +++ b/src/hwf-x86.c @@ -189,6 +189,7 @@ detect_x86_gnuc (void) else if (!strcmp (vendor_id, "GenuineIntel")) { /* This is an Intel CPU. */ + result |= HWF_INTEL_CPU; } else if (!strcmp (vendor_id, "AuthenticAMD")) { @@ -231,10 +232,14 @@ detect_x86_gnuc (void) * Source: http://www.sandpile.org/x86/cpuid.htm */ if (max_cpuid_level >= 7 && (features & 0x00000001)) { -#ifdef ENABLE_AVX2_SUPPORT /* Get CPUID:7 contains further Intel feature flags. */ get_cpuid(7, NULL, &features, NULL, NULL); + /* Test bit 8 for BMI2. */ + if (features & 0x00000100) + result |= HWF_INTEL_BMI2; + +#ifdef ENABLE_AVX2_SUPPORT /* Test bit 5 for AVX2. 
*/ if (features & 0x00000020) result |= HWF_INTEL_AVX2; diff --git a/src/hwfeatures.c b/src/hwfeatures.c index 5c3bb18..58099c4 100644 --- a/src/hwfeatures.c +++ b/src/hwfeatures.c @@ -46,6 +46,8 @@ static struct { HWF_PADLOCK_AES, "padlock-aes" }, { HWF_PADLOCK_SHA, "padlock-sha" }, { HWF_PADLOCK_MMUL,"padlock-mmul"}, + { HWF_INTEL_CPU, "intel-cpu" }, + { HWF_INTEL_BMI2, "intel-bmi2" }, { HWF_INTEL_SSSE3, "intel-ssse3" }, { HWF_INTEL_PCLMUL,"intel-pclmul" }, { HWF_INTEL_AESNI, "intel-aesni" }, commit 69a6d0f9562fcd26112a589318c13de66ce1700e Author: Jussi Kivilinna Date: Thu Dec 12 12:43:08 2013 +0200 SHA-512: Add SSSE3 implementation for x86-64 * cipher/Makefile.am: Add 'sha512-ssse3-amd64.S'. * cipher/sha512-ssse3-amd64.S: New. * cipher/sha512.c (USE_SSSE3): New. (SHA512_CONTEXT) [USE_SSSE3]: Add 'use_ssse3'. (sha512_init, sha384_init) [USE_SSSE3]: Initialize 'use_ssse3'. [USE_SSSE3] (_gcry_sha512_transform_amd64_ssse3): New. (transform) [USE_SSSE3]: Call SSSE3 implementation. * configure.ac (sha512): Add 'sha512-ssse3-amd64.lo'. -- Patch adds fast SSSE3 implementation of SHA-512 by Intel Corporation. The assembly source is licensed under 3-clause BSD license, thus compatible with LGPL2.1+. Original source can be accessed at: http://www.intel.com/p/en_US/embedded/hwsw/technology/packet-processing#docs Implementation is described in white paper "Fast SHA512 Implementations on Intel®
Architecture Processors" http://www.intel.com/content/www/us/en/intelligent-systems/intel-technology/fast-sha512-implementations-ia-processors-paper.html Benchmarks: cpu Old New Diff Intel i5-4570 10.11 c/B 7.56 c/B 1.33x Intel i5-2450M 14.11 c/B 10.53 c/B 1.33x Intel Core2 T8100 11.92 c/B 10.22 c/B 1.16x Signed-off-by: Jussi Kivilinna diff --git a/cipher/Makefile.am b/cipher/Makefile.am index 34f74e2..88c288a 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -79,7 +79,7 @@ seed.c \ serpent.c serpent-sse2-amd64.S serpent-avx2-amd64.S \ sha1.c \ sha256.c sha256-ssse3-amd64.S \ -sha512.c sha512-armv7-neon.S \ +sha512.c sha512-ssse3-amd64.S sha512-armv7-neon.S \ stribog.c \ tiger.c \ whirlpool.c \ diff --git a/cipher/sha512-ssse3-amd64.S b/cipher/sha512-ssse3-amd64.S new file mode 100644 index 0000000..2a719e5 --- /dev/null +++ b/cipher/sha512-ssse3-amd64.S @@ -0,0 +1,414 @@ +/* +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright (c) 2012, Intel Corporation +; +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are +; met: +; +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in the +; documentation and/or other materials provided with the +; distribution. +; +; * Neither the name of the Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived from +; this software without specific prior written permission. +; +; +; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY +; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +; PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL CORPORATION OR +; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +*/ +/* + * Conversion to GAS assembly and integration to libgcrypt + * by Jussi Kivilinna + * + * Note: original implementation was named as SHA512-SSE4. However, only SSSE3 + * is required. + */ + +#ifdef __x86_64 +#include <config.h> +#if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \ + defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_SSSE3) && defined(USE_SHA512) + +#ifdef __PIC__ +# define ADD_RIP +rip +#else +# define ADD_RIP +#endif + +.intel_syntax noprefix + +.text + +/* Virtual Registers */ +msg = rdi /* ARG1 */ +digest = rsi /* ARG2 */ +msglen = rdx /* ARG3 */ +T1 = rcx +T2 = r8 +a_64 = r9 +b_64 = r10 +c_64 = r11 +d_64 = r12 +e_64 = r13 +f_64 = r14 +g_64 = r15 +h_64 = rbx +tmp0 = rax + +/* +; Local variables (stack frame) +; Note: frame_size must be an odd multiple of 8 bytes to XMM align RSP +*/ +frame_W = 0 /* Message Schedule */ +frame_W_size = (80 * 8) +frame_WK = ((frame_W) + (frame_W_size)) /* W[t] + K[t] | W[t+1] + K[t+1] */ +frame_WK_size = (2 * 8) +frame_GPRSAVE = ((frame_WK) + (frame_WK_size)) +frame_GPRSAVE_size = (5 * 8) +frame_size = ((frame_GPRSAVE) + (frame_GPRSAVE_size)) + + +/* Useful QWORD "arrays" for simpler memory references */ +#define MSG(i) msg + 8*(i) /* Input message (arg1) */ +#define DIGEST(i) digest + 8*(i) /* Output Digest (arg2) */ +#define K_t(i) .LK512 + 8*(i) ADD_RIP /* SHA Constants (static mem) */ 
+#define W_t(i) rsp + frame_W + 8*(i) /* Message Schedule (stack frame) */ +#define WK_2(i) rsp + frame_WK + 8*((i) % 2) /* W[t]+K[t] (stack frame) */ +/* MSG, DIGEST, K_t, W_t are arrays */ +/* WK_2(t) points to 1 of 2 qwords at frame.WK depdending on t being odd/even */ + +.macro RotateState + /* Rotate symbles a..h right */ + __TMP = h_64 + h_64 = g_64 + g_64 = f_64 + f_64 = e_64 + e_64 = d_64 + d_64 = c_64 + c_64 = b_64 + b_64 = a_64 + a_64 = __TMP +.endm + +.macro SHA512_Round t + /* Compute Round %%t */ + mov T1, f_64 /* T1 = f */ + mov tmp0, e_64 /* tmp = e */ + xor T1, g_64 /* T1 = f ^ g */ + ror tmp0, 23 /* 41 ; tmp = e ror 23 */ + and T1, e_64 /* T1 = (f ^ g) & e */ + xor tmp0, e_64 /* tmp = (e ror 23) ^ e */ + xor T1, g_64 /* T1 = ((f ^ g) & e) ^ g = CH(e,f,g) */ + add T1, [WK_2(\t)] /* W[t] + K[t] from message scheduler */ + ror tmp0, 4 /* 18 ; tmp = ((e ror 23) ^ e) ror 4 */ + xor tmp0, e_64 /* tmp = (((e ror 23) ^ e) ror 4) ^ e */ + mov T2, a_64 /* T2 = a */ + add T1, h_64 /* T1 = CH(e,f,g) + W[t] + K[t] + h */ + ror tmp0, 14 /* 14 ; tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e) */ + add T1, tmp0 /* T1 = CH(e,f,g) + W[t] + K[t] + S1(e) */ + mov tmp0, a_64 /* tmp = a */ + xor T2, c_64 /* T2 = a ^ c */ + and tmp0, c_64 /* tmp = a & c */ + and T2, b_64 /* T2 = (a ^ c) & b */ + xor T2, tmp0 /* T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c) */ + mov tmp0, a_64 /* tmp = a */ + ror tmp0, 5 /* 39 ; tmp = a ror 5 */ + xor tmp0, a_64 /* tmp = (a ror 5) ^ a */ + add d_64, T1 /* e(next_state) = d + T1 */ + ror tmp0, 6 /* 34 ; tmp = ((a ror 5) ^ a) ror 6 */ + xor tmp0, a_64 /* tmp = (((a ror 5) ^ a) ror 6) ^ a */ + lea h_64, [T1 + T2] /* a(next_state) = T1 + Maj(a,b,c) */ + ror tmp0, 28 /* 28 ; tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a) */ + add h_64, tmp0 /* a(next_state) = T1 + Maj(a,b,c) S0(a) */ + RotateState +.endm + +.macro SHA512_2Sched_2Round_sse t +/* ; Compute rounds %%t-2 and %%t-1 + ; Compute message schedule QWORDS %%t and %%t+1 + + ; Two rounds are computed 
based on the values for K[t-2]+W[t-2] and + ; K[t-1]+W[t-1] which were previously stored at WK_2 by the message + ; scheduler. + ; The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)]. + ; They are then added to their respective SHA512 constants at + ; [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)] + ; For brievity, the comments following vectored instructions only refer to + ; the first of a pair of QWORDS. + ; Eg. XMM2=W[t-2] really means XMM2={W[t-2]|W[t-1]} + ; The computation of the message schedule and the rounds are tightly + ; stitched to take advantage of instruction-level parallelism. + ; For clarity, integer instructions (for the rounds calculation) are indented + ; by one tab. Vectored instructions (for the message scheduler) are indented + ; by two tabs. */ + + mov T1, f_64 + movdqa xmm2, [W_t(\t-2)] /* XMM2 = W[t-2] */ + xor T1, g_64 + and T1, e_64 + movdqa xmm0, xmm2 /* XMM0 = W[t-2] */ + xor T1, g_64 + add T1, [WK_2(\t)] + movdqu xmm5, [W_t(\t-15)] /* XMM5 = W[t-15] */ + mov tmp0, e_64 + ror tmp0, 23 /* 41 */ + movdqa xmm3, xmm5 /* XMM3 = W[t-15] */ + xor tmp0, e_64 + ror tmp0, 4 /* 18 */ + psrlq xmm0, 61 - 19 /* XMM0 = W[t-2] >> 42 */ + xor tmp0, e_64 + ror tmp0, 14 /* 14 */ + psrlq xmm3, (8 - 7) /* XMM3 = W[t-15] >> 1 */ + add T1, tmp0 + add T1, h_64 + pxor xmm0, xmm2 /* XMM0 = (W[t-2] >> 42) ^ W[t-2] */ + mov T2, a_64 + xor T2, c_64 + pxor xmm3, xmm5 /* XMM3 = (W[t-15] >> 1) ^ W[t-15] */ + and T2, b_64 + mov tmp0, a_64 + psrlq xmm0, 19 - 6 /* XMM0 = ((W[t-2]>>42)^W[t-2])>>13 */ + and tmp0, c_64 + xor T2, tmp0 + psrlq xmm3, (7 - 1) /* XMM3 = ((W[t-15]>>1)^W[t-15])>>6 */ + mov tmp0, a_64 + ror tmp0, 5 /* 39 */ + pxor xmm0, xmm2 /* XMM0 = (((W[t-2]>>42)^W[t-2])>>13)^W[t-2] */ + xor tmp0, a_64 + ror tmp0, 6 /* 34 */ + pxor xmm3, xmm5 /* XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15] */ + xor tmp0, a_64 + ror tmp0, 28 /* 28 */ + psrlq xmm0, 6 /* XMM0 = ((((W[t-2]>>42)^W[t-2])>>13)^W[t-2])>>6 */ + add T2, tmp0 + add d_64, T1 
+ psrlq xmm3, 1 /* XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]>>1 */ + lea h_64, [T1 + T2] + RotateState + movdqa xmm1, xmm2 /* XMM1 = W[t-2] */ + mov T1, f_64 + xor T1, g_64 + movdqa xmm4, xmm5 /* XMM4 = W[t-15] */ + and T1, e_64 + xor T1, g_64 + psllq xmm1, (64 - 19) - (64 - 61) /* XMM1 = W[t-2] << 42 */ + add T1, [WK_2(\t+1)] + mov tmp0, e_64 + psllq xmm4, (64 - 1) - (64 - 8) /* XMM4 = W[t-15] << 7 */ + ror tmp0, 23 /* 41 */ + xor tmp0, e_64 + pxor xmm1, xmm2 /* XMM1 = (W[t-2] << 42)^W[t-2] */ + ror tmp0, 4 /* 18 */ + xor tmp0, e_64 + pxor xmm4, xmm5 /* XMM4 = (W[t-15]<<7)^W[t-15] */ + ror tmp0, 14 /* 14 */ + add T1, tmp0 + psllq xmm1, (64 - 61) /* XMM1 = ((W[t-2] << 42)^W[t-2])<<3 */ + add T1, h_64 + mov T2, a_64 + psllq xmm4, (64 - 8) /* XMM4 = ((W[t-15]<<7)^W[t-15])<<56 */ + xor T2, c_64 + and T2, b_64 + pxor xmm0, xmm1 /* XMM0 = s1(W[t-2]) */ + mov tmp0, a_64 + and tmp0, c_64 + movdqu xmm1, [W_t(\t- 7)] /* XMM1 = W[t-7] */ + xor T2, tmp0 + pxor xmm3, xmm4 /* XMM3 = s0(W[t-15]) */ + mov tmp0, a_64 + paddq xmm0, xmm3 /* XMM0 = s1(W[t-2]) + s0(W[t-15]) */ + ror tmp0, 5 /* 39 */ + paddq xmm0, [W_t(\t-16)] /* XMM0 = s1(W[t-2]) + s0(W[t-15]) + W[t-16] */ + xor tmp0, a_64 + paddq xmm0, xmm1 /* XMM0 = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16] */ + ror tmp0, 6 /* 34 */ + movdqa [W_t(\t)], xmm0 /* Store scheduled qwords */ + xor tmp0, a_64 + paddq xmm0, [K_t(t)] /* Compute W[t]+K[t] */ + ror tmp0, 28 /* 28 */ + movdqa [WK_2(t)], xmm0 /* Store W[t]+K[t] for next rounds */ + add T2, tmp0 + add d_64, T1 + lea h_64, [T1 + T2] + RotateState +.endm + +/* +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; void sha512_sse4(const void* M, void* D, uint64_t L); +; Purpose: Updates the SHA512 digest stored at D with the message stored in M. +; The size of the message pointed to by M must be an integer multiple of SHA512 +; message blocks. +; L is the message length in SHA512 blocks. 
+*/ +.globl _gcry_sha512_transform_amd64_ssse3 +.type _gcry_sha512_transform_amd64_ssse3, @function; +.align 16 +_gcry_sha512_transform_amd64_ssse3: + xor eax, eax + + cmp msglen, 0 + je .Lnowork + + /* Allocate Stack Space */ + sub rsp, frame_size + + /* Save GPRs */ + mov [rsp + frame_GPRSAVE + 8 * 0], rbx + mov [rsp + frame_GPRSAVE + 8 * 1], r12 + mov [rsp + frame_GPRSAVE + 8 * 2], r13 + mov [rsp + frame_GPRSAVE + 8 * 3], r14 + mov [rsp + frame_GPRSAVE + 8 * 4], r15 + +.Lupdateblock: + + /* Load state variables */ + mov a_64, [DIGEST(0)] + mov b_64, [DIGEST(1)] + mov c_64, [DIGEST(2)] + mov d_64, [DIGEST(3)] + mov e_64, [DIGEST(4)] + mov f_64, [DIGEST(5)] + mov g_64, [DIGEST(6)] + mov h_64, [DIGEST(7)] + + t = 0 + .rept 80/2 + 1 + /* (80 rounds) / (2 rounds/iteration) + (1 iteration) */ + /* +1 iteration because the scheduler leads hashing by 1 iteration */ + .if t < 2 + /* BSWAP 2 QWORDS */ + movdqa xmm1, [.LXMM_QWORD_BSWAP ADD_RIP] + movdqu xmm0, [MSG(t)] + pshufb xmm0, xmm1 /* BSWAP */ + movdqa [W_t(t)], xmm0 /* Store Scheduled Pair */ + paddq xmm0, [K_t(t)] /* Compute W[t]+K[t] */ + movdqa [WK_2(t)], xmm0 /* Store into WK for rounds */ + .elseif t < 16 + /* BSWAP 2 QWORDS; Compute 2 Rounds */ + movdqu xmm0, [MSG(t)] + pshufb xmm0, xmm1 /* BSWAP */ + SHA512_Round (t - 2) /* Round t-2 */ + movdqa [W_t(t)], xmm0 /* Store Scheduled Pair */ + paddq xmm0, [K_t(t)] /* Compute W[t]+K[t] */ + SHA512_Round (t - 1) /* Round t-1 */ + movdqa [WK_2(t)], xmm0 /* Store W[t]+K[t] into WK */ + .elseif t < 79 + /* Schedule 2 QWORDS; Compute 2 Rounds */ + SHA512_2Sched_2Round_sse t + .else + /* Compute 2 Rounds */ + SHA512_Round (t - 2) + SHA512_Round (t - 1) + .endif + t = (t)+2 + .endr + + /* Update digest */ + add [DIGEST(0)], a_64 + add [DIGEST(1)], b_64 + add [DIGEST(2)], c_64 + add [DIGEST(3)], d_64 + add [DIGEST(4)], e_64 + add [DIGEST(5)], f_64 + add [DIGEST(6)], g_64 + add [DIGEST(7)], h_64 + + /* Advance to next message block */ + add msg, 16*8 + dec msglen + jnz
.Lupdateblock + + /* Restore GPRs */ + mov rbx, [rsp + frame_GPRSAVE + 8 * 0] + mov r12, [rsp + frame_GPRSAVE + 8 * 1] + mov r13, [rsp + frame_GPRSAVE + 8 * 2] + mov r14, [rsp + frame_GPRSAVE + 8 * 3] + mov r15, [rsp + frame_GPRSAVE + 8 * 4] + + /* Restore Stack Pointer */ + add rsp, frame_size + + /* Return stack burn depth */ + mov rax, frame_size + +.Lnowork: + ret + +/* +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Binary Data +*/ + +.data + +.align 16 + +/* Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. */ +.LXMM_QWORD_BSWAP: + .octa 0x08090a0b0c0d0e0f0001020304050607 + +/* K[t] used in SHA512 hashing */ +.LK512: + .quad 0x428a2f98d728ae22,0x7137449123ef65cd + .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + .quad 0x3956c25bf348b538,0x59f111f1b605d019 + .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + .quad 0xd807aa98a3030242,0x12835b0145706fbe + .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + .quad 0x9bdc06a725c71235,0xc19bf174cf692694 + .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + .quad 0x983e5152ee66dfab,0xa831c66d2db43210 + .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 + .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 + .quad 0x06ca6351e003826f,0x142929670a0e6e70 + .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 + .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 + .quad 0x81c2c92e47edaee6,0x92722c851482353b + .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 + .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 + .quad 0xd192e819d6ef5218,0xd69906245565a910 + .quad 0xf40e35855771202a,0x106aa07032bbd1b8 + .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + .quad 
0x748f82ee5defb2fc,0x78a5636f43172f60 + .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec + .quad 0x90befffa23631e28,0xa4506cebde82bde9 + .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b + .quad 0xca273eceea26619c,0xd186b8c721c0c207 + .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 + .quad 0x113f9804bef90dae,0x1b710b35131c471b + .quad 0x28db77f523047d84,0x32caab7b40c72493 + .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 + +#endif +#endif diff --git a/cipher/sha512.c b/cipher/sha512.c index 14608dc..34b3215 100644 --- a/cipher/sha512.c +++ b/cipher/sha512.c @@ -64,6 +64,16 @@ # endif #endif + +/* USE_SSSE3 indicates whether to compile with Intel SSSE3 code. */ +#undef USE_SSSE3 +#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_SSSE3) && \ + defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) +# define USE_SSSE3 1 +#endif + + typedef struct { u64 h0, h1, h2, h3, h4, h5, h6, h7; @@ -74,7 +84,10 @@ typedef struct gcry_md_block_ctx_t bctx; SHA512_STATE state; #ifdef USE_ARM_NEON_ASM - int use_neon; + unsigned int use_neon:1; +#endif +#ifdef USE_SSSE3 + unsigned int use_ssse3:1; #endif } SHA512_CONTEXT; @@ -105,6 +118,9 @@ sha512_init (void *context) #ifdef USE_ARM_NEON_ASM ctx->use_neon = (_gcry_get_hw_features () & HWF_ARM_NEON) != 0; #endif +#ifdef USE_SSSE3 + ctx->use_ssse3 = (_gcry_get_hw_features () & HWF_INTEL_SSSE3) != 0; +#endif } static void @@ -131,6 +147,9 @@ sha384_init (void *context) #ifdef USE_ARM_NEON_ASM ctx->use_neon = (_gcry_get_hw_features () & HWF_ARM_NEON) != 0; #endif +#ifdef USE_SSSE3 + ctx->use_ssse3 = (_gcry_get_hw_features () & HWF_INTEL_SSSE3) != 0; +#endif } @@ -483,16 +502,27 @@ void _gcry_sha512_transform_armv7_neon (SHA512_STATE *hd, const u64 k[]); #endif +#ifdef USE_SSSE3 +unsigned int _gcry_sha512_transform_amd64_ssse3(const void *input_data, + void *state, size_t 
num_blks); +#endif + static unsigned int transform (void *context, const unsigned char *data) { SHA512_CONTEXT *ctx = context; +#ifdef USE_SSSE3 + if (ctx->use_ssse3) + return _gcry_sha512_transform_amd64_ssse3 (data, &ctx->state, 1) + + 4 * sizeof(void*); +#endif + #ifdef USE_ARM_NEON_ASM if (ctx->use_neon) { - _gcry_sha512_transform_armv7_neon(&ctx->state, data, k); + _gcry_sha512_transform_armv7_neon (&ctx->state, data, k); /* _gcry_sha512_transform_armv7_neon does not store sensitive data * to stack. */ diff --git a/configure.ac b/configure.ac index 97401c3..5990429 100644 --- a/configure.ac +++ b/configure.ac @@ -1724,6 +1724,13 @@ if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha512.lo" AC_DEFINE(USE_SHA512, 1, [Defined if this module should be included]) + case "${host}" in + x86_64-*-*) + # Build with the assembly implementation + GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha512-ssse3-amd64.lo" + ;; + esac + if test x"$neonsupport" = xyes ; then # Build with the NEON implementation GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha512-armv7-neon.lo" commit e1a3931263e67aacec3c0bfcaa86c7d1441d5c6a Author: Jussi Kivilinna Date: Wed Dec 11 19:32:08 2013 +0200 SHA-256: Add SSSE3 implementation for x86-64 * cipher/Makefile.am: Add 'sha256-ssse3-amd64.S'. * cipher/sha256-ssse3-amd64.S: New. * cipher/sha256.c (USE_SSSE3): New. (SHA256_CONTEXT) [USE_SSSE3]: Add 'use_ssse3'. (sha256_init, sha224_init) [USE_SSSE3]: Initialize 'use_ssse3'. (transform): Rename to... (_transform): This. [USE_SSSE3] (_gcry_sha256_transform_amd64_ssse3): New. (transform): New. * configure.ac (HAVE_INTEL_SYNTAX_PLATFORM_AS): New check. (sha256): Add 'sha256-ssse3-amd64.lo'. * doc/gcrypt.texi: Document 'intel-ssse3'. * src/g10lib.h (HWF_INTEL_SSSE3): New. * src/hwfeatures.c (hwflist): Add "intel-ssse3". * src/hwf-x86.c (detect_x86_gnuc): Test for SSSE3. -- Patch adds fast SSSE3 implementation of SHA-256 by Intel Corporation. 
The assembly source is licensed under 3-clause BSD license, thus compatible with LGPL2.1+. Original source can be accessed at: http://www.intel.com/p/en_US/embedded/hwsw/technology/packet-processing#docs Implementation is described in white paper "Fast SHA - 256 Implementations on Intel® Architecture Processors" http://www.intel.com/content/www/us/en/intelligent-systems/intel-technology/sha-256-implementations-paper.html Benchmarks: cpu Old New Diff Intel i5-4570 13.99 c/B 10.66 c/B 1.31x Intel i5-2450M 21.53 c/B 15.79 c/B 1.36x Intel Core2 T8100 20.84 c/B 15.07 c/B 1.38x Signed-off-by: Jussi Kivilinna diff --git a/cipher/Makefile.am b/cipher/Makefile.am index ff9deca..34f74e2 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -78,7 +78,7 @@ scrypt.c \ seed.c \ serpent.c serpent-sse2-amd64.S serpent-avx2-amd64.S \ sha1.c \ -sha256.c \ +sha256.c sha256-ssse3-amd64.S \ sha512.c sha512-armv7-neon.S \ stribog.c \ tiger.c \ diff --git a/cipher/sha256-ssse3-amd64.S b/cipher/sha256-ssse3-amd64.S new file mode 100644 index 0000000..06070d1 --- /dev/null +++ b/cipher/sha256-ssse3-amd64.S @@ -0,0 +1,526 @@ +/* +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright (c) 2012, Intel Corporation +; +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are +; met: +; +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in the +; documentation and/or other materials provided with the +; distribution. +; +; * Neither the name of the Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived from +; this software without specific prior written permission.
+; +; +; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY +; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR +; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; +; This code is described in an Intel White-Paper: +; "Fast SHA-256 Implementations on Intel Architecture Processors" +; +; To find it, surf to http://www.intel.com/p/en_US/embedded +; and search for that title. +; The paper is expected to be released roughly at the end of April, 2012 +; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; This code schedules 1 blocks at a time, with 4 lanes per block +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +*/ +/* + * Conversion to GAS assembly and integration to libgcrypt + * by Jussi Kivilinna + * + * Note: original implementation was named as SHA256-SSE4. However, only SSSE3 + * is required. 
+ */ + +#ifdef __x86_64 +#include +#if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \ + defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_SSSE3) && defined(USE_SHA256) + +#ifdef __PIC__ +# define ADD_RIP +rip +#else +# define ADD_RIP +#endif + +.intel_syntax noprefix + +#define MOVDQ movdqu /* assume buffers not aligned */ + +/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros*/ + +/* addm [mem], reg + * Add reg to mem using reg-mem add and store */ +.macro addm p1 p2 + add \p2, \p1 + mov \p1, \p2 +.endm + +/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/ + +/* COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask + * Load xmm with mem and byte swap each dword */ +.macro COPY_XMM_AND_BSWAP p1 p2 p3 + MOVDQ \p1, \p2 + pshufb \p1, \p3 +.endm + +/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/ + +X0 = xmm4 +X1 = xmm5 +X2 = xmm6 +X3 = xmm7 + +XTMP0 = xmm0 +XTMP1 = xmm1 +XTMP2 = xmm2 +XTMP3 = xmm3 +XTMP4 = xmm8 +XFER = xmm9 + +SHUF_00BA = xmm10 /* shuffle xBxA -> 00BA */ +SHUF_DC00 = xmm11 /* shuffle xDxC -> DC00 */ +BYTE_FLIP_MASK = xmm12 + +NUM_BLKS = rdx /* 3rd arg */ +CTX = rsi /* 2nd arg */ +INP = rdi /* 1st arg */ + +SRND = rdi /* clobbers INP */ +c = ecx +d = r8d +e = edx + +TBL = rbp +a = eax +b = ebx + +f = r9d +g = r10d +h = r11d + +y0 = r13d +y1 = r14d +y2 = r15d + + + +#define _INP_END_SIZE 8 +#define _INP_SIZE 8 +#define _XFER_SIZE 8 +#define _XMM_SAVE_SIZE 0 +/* STACK_SIZE plus pushes must be an odd multiple of 8 */ +#define _ALIGN_SIZE 8 + +#define _INP_END 0 +#define _INP (_INP_END + _INP_END_SIZE) +#define _XFER (_INP + _INP_SIZE) +#define _XMM_SAVE (_XFER + _XFER_SIZE + _ALIGN_SIZE) +#define STACK_SIZE (_XMM_SAVE + _XMM_SAVE_SIZE) + +/* rotate_Xs + * Rotate values of symbols X0...X3 */ +.macro rotate_Xs +X_ = X0 +X0 = X1 +X1 = X2 +X2 = X3 +X3 = X_ +.endm + +/* ROTATE_ARGS + * Rotate values of symbols a...h */ +.macro ROTATE_ARGS +TMP_ = h +h = g +g = f +f = e +e = d +d = c +c = b +b = a +a = TMP_ +.endm + +.macro FOUR_ROUNDS_AND_SCHED + /* compute s0 four at 
a time and s1 two at a time + * compute W[-16] + W[-7] 4 at a time */ + movdqa XTMP0, X3 + mov y0, e /* y0 = e */ + ror y0, (25-11) /* y0 = e >> (25-11) */ + mov y1, a /* y1 = a */ + palignr XTMP0, X2, 4 /* XTMP0 = W[-7] */ + ror y1, (22-13) /* y1 = a >> (22-13) */ + xor y0, e /* y0 = e ^ (e >> (25-11)) */ + mov y2, f /* y2 = f */ + ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */ + movdqa XTMP1, X1 + xor y1, a /* y1 = a ^ (a >> (22-13) */ + xor y2, g /* y2 = f^g */ + paddd XTMP0, X0 /* XTMP0 = W[-7] + W[-16] */ + xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */ + and y2, e /* y2 = (f^g)&e */ + ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */ + /* compute s0 */ + palignr XTMP1, X0, 4 /* XTMP1 = W[-15] */ + xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */ + ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */ + xor y2, g /* y2 = CH = ((f^g)&e)^g */ + movdqa XTMP2, XTMP1 /* XTMP2 = W[-15] */ + ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */ + add y2, y0 /* y2 = S1 + CH */ + add y2, [rsp + _XFER + 0*4] /* y2 = k + w + S1 + CH */ + movdqa XTMP3, XTMP1 /* XTMP3 = W[-15] */ + mov y0, a /* y0 = a */ + add h, y2 /* h = h + S1 + CH + k + w */ + mov y2, a /* y2 = a */ + pslld XTMP1, (32-7) + or y0, c /* y0 = a|c */ + add d, h /* d = d + h + S1 + CH + k + w */ + and y2, c /* y2 = a&c */ + psrld XTMP2, 7 + and y0, b /* y0 = (a|c)&b */ + add h, y1 /* h = h + S1 + CH + k + w + S0 */ + por XTMP1, XTMP2 /* XTMP1 = W[-15] ror 7 */ + or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */ + add h, y0 /* h = h + S1 + CH + k + w + S0 + MAJ */ + +ROTATE_ARGS + movdqa XTMP2, XTMP3 /* XTMP2 = W[-15] */ + mov y0, e /* y0 = e */ + mov y1, a /* y1 = a */ + movdqa XTMP4, XTMP3 /* XTMP4 = W[-15] */ + ror y0, (25-11) /* y0 = e >> (25-11) */ + xor y0, e /* y0 = e ^ (e >> (25-11)) */ + mov y2, f /* y2 = f */ + ror y1, (22-13) /* y1 = a >> (22-13) */ + pslld XTMP3, (32-18) + xor y1, a /* y1 = a ^ (a >> (22-13) */ + ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */ + xor y2, 
g /* y2 = f^g */ + psrld XTMP2, 18 + ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */ + xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */ + and y2, e /* y2 = (f^g)&e */ + ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */ + pxor XTMP1, XTMP3 + xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */ + xor y2, g /* y2 = CH = ((f^g)&e)^g */ + psrld XTMP4, 3 /* XTMP4 = W[-15] >> 3 */ + add y2, y0 /* y2 = S1 + CH */ + add y2, [rsp + _XFER + 1*4] /* y2 = k + w + S1 + CH */ + ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */ + pxor XTMP1, XTMP2 /* XTMP1 = W[-15] ror 7 ^ W[-15] ror 18 */ + mov y0, a /* y0 = a */ + add h, y2 /* h = h + S1 + CH + k + w */ + mov y2, a /* y2 = a */ + pxor XTMP1, XTMP4 /* XTMP1 = s0 */ + or y0, c /* y0 = a|c */ + add d, h /* d = d + h + S1 + CH + k + w */ + and y2, c /* y2 = a&c */ + /* compute low s1 */ + pshufd XTMP2, X3, 0b11111010 /* XTMP2 = W[-2] {BBAA} */ + and y0, b /* y0 = (a|c)&b */ + add h, y1 /* h = h + S1 + CH + k + w + S0 */ + paddd XTMP0, XTMP1 /* XTMP0 = W[-16] + W[-7] + s0 */ + or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */ + add h, y0 /* h = h + S1 + CH + k + w + S0 + MAJ */ + +ROTATE_ARGS + movdqa XTMP3, XTMP2 /* XTMP3 = W[-2] {BBAA} */ + mov y0, e /* y0 = e */ + mov y1, a /* y1 = a */ + ror y0, (25-11) /* y0 = e >> (25-11) */ + movdqa XTMP4, XTMP2 /* XTMP4 = W[-2] {BBAA} */ + xor y0, e /* y0 = e ^ (e >> (25-11)) */ + ror y1, (22-13) /* y1 = a >> (22-13) */ + mov y2, f /* y2 = f */ + xor y1, a /* y1 = a ^ (a >> (22-13) */ + ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */ + psrlq XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xBxA} */ + xor y2, g /* y2 = f^g */ + psrlq XTMP3, 19 /* XTMP3 = W[-2] ror 19 {xBxA} */ + xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */ + and y2, e /* y2 = (f^g)&e */ + psrld XTMP4, 10 /* XTMP4 = W[-2] >> 10 {BBAA} */ + ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */ + xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */ + xor y2, g /* y2 = CH = ((f^g)&e)^g */ + ror y0, 6 /* y0 = 
S1 = (e>>6) & (e>>11) ^ (e>>25) */ + pxor XTMP2, XTMP3 + add y2, y0 /* y2 = S1 + CH */ + ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */ + add y2, [rsp + _XFER + 2*4] /* y2 = k + w + S1 + CH */ + pxor XTMP4, XTMP2 /* XTMP4 = s1 {xBxA} */ + mov y0, a /* y0 = a */ + add h, y2 /* h = h + S1 + CH + k + w */ + mov y2, a /* y2 = a */ + pshufb XTMP4, SHUF_00BA /* XTMP4 = s1 {00BA} */ + or y0, c /* y0 = a|c */ + add d, h /* d = d + h + S1 + CH + k + w */ + and y2, c /* y2 = a&c */ + paddd XTMP0, XTMP4 /* XTMP0 = {..., ..., W[1], W[0]} */ + and y0, b /* y0 = (a|c)&b */ + add h, y1 /* h = h + S1 + CH + k + w + S0 */ + /* compute high s1 */ + pshufd XTMP2, XTMP0, 0b01010000 /* XTMP2 = W[-2] {DDCC} */ + or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */ + add h, y0 /* h = h + S1 + CH + k + w + S0 + MAJ */ + +ROTATE_ARGS + movdqa XTMP3, XTMP2 /* XTMP3 = W[-2] {DDCC} */ + mov y0, e /* y0 = e */ + ror y0, (25-11) /* y0 = e >> (25-11) */ + mov y1, a /* y1 = a */ + movdqa X0, XTMP2 /* X0 = W[-2] {DDCC} */ + ror y1, (22-13) /* y1 = a >> (22-13) */ + xor y0, e /* y0 = e ^ (e >> (25-11)) */ + mov y2, f /* y2 = f */ + ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */ + psrlq XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xDxC} */ + xor y1, a /* y1 = a ^ (a >> (22-13) */ + xor y2, g /* y2 = f^g */ + psrlq XTMP3, 19 /* XTMP3 = W[-2] ror 19 {xDxC} */ + xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */ + and y2, e /* y2 = (f^g)&e */ + ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */ + psrld X0, 10 /* X0 = W[-2] >> 10 {DDCC} */ + xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */ + ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */ + xor y2, g /* y2 = CH = ((f^g)&e)^g */ + pxor XTMP2, XTMP3 + ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */ + add y2, y0 /* y2 = S1 + CH */ + add y2, [rsp + _XFER + 3*4] /* y2 = k + w + S1 + CH */ + pxor X0, XTMP2 /* X0 = s1 {xDxC} */ + mov y0, a /* y0 = a */ + add h, y2 /* h = h + S1 + CH + k + w */ + mov y2, a /* y2 = a */ + pshufb X0, SHUF_DC00 
/* X0 = s1 {DC00} */ + or y0, c /* y0 = a|c */ + add d, h /* d = d + h + S1 + CH + k + w */ + and y2, c /* y2 = a&c */ + paddd X0, XTMP0 /* X0 = {W[3], W[2], W[1], W[0]} */ + and y0, b /* y0 = (a|c)&b */ + add h, y1 /* h = h + S1 + CH + k + w + S0 */ + or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */ + add h, y0 /* h = h + S1 + CH + k + w + S0 + MAJ */ + +ROTATE_ARGS +rotate_Xs +.endm + +/* input is [rsp + _XFER + %1 * 4] */ +.macro DO_ROUND i1 + mov y0, e /* y0 = e */ + ror y0, (25-11) /* y0 = e >> (25-11) */ + mov y1, a /* y1 = a */ + xor y0, e /* y0 = e ^ (e >> (25-11)) */ + ror y1, (22-13) /* y1 = a >> (22-13) */ + mov y2, f /* y2 = f */ + xor y1, a /* y1 = a ^ (a >> (22-13) */ + ror y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */ + xor y2, g /* y2 = f^g */ + xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */ + ror y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */ + and y2, e /* y2 = (f^g)&e */ + xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */ + ror y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */ + xor y2, g /* y2 = CH = ((f^g)&e)^g */ + add y2, y0 /* y2 = S1 + CH */ + ror y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */ + add y2, [rsp + _XFER + \i1 * 4] /* y2 = k + w + S1 + CH */ + mov y0, a /* y0 = a */ + add h, y2 /* h = h + S1 + CH + k + w */ + mov y2, a /* y2 = a */ + or y0, c /* y0 = a|c */ + add d, h /* d = d + h + S1 + CH + k + w */ + and y2, c /* y2 = a&c */ + and y0, b /* y0 = (a|c)&b */ + add h, y1 /* h = h + S1 + CH + k + w + S0 */ + or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */ + add h, y0 /* h = h + S1 + CH + k + w + S0 + MAJ */ + ROTATE_ARGS +.endm + +/* +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; void sha256_sse4(void *input_data, UINT32 digest[8], UINT64 num_blks) +;; arg 1 : pointer to input data +;; arg 2 : pointer to digest +;; arg 3 : Num blocks +*/ +.text +.globl _gcry_sha256_transform_amd64_ssse3 +.type 
_gcry_sha256_transform_amd64_ssse3, @function; +.align 16 +_gcry_sha256_transform_amd64_ssse3: + push rbx + push rbp + push r13 + push r14 + push r15 + + sub rsp, STACK_SIZE + + shl NUM_BLKS, 6 /* convert to bytes */ + jz .Ldone_hash + add NUM_BLKS, INP /* pointer to end of data */ + mov [rsp + _INP_END], NUM_BLKS + + /* load initial digest */ + mov a,[4*0 + CTX] + mov b,[4*1 + CTX] + mov c,[4*2 + CTX] + mov d,[4*3 + CTX] + mov e,[4*4 + CTX] + mov f,[4*5 + CTX] + mov g,[4*6 + CTX] + mov h,[4*7 + CTX] + + movdqa BYTE_FLIP_MASK, [.LPSHUFFLE_BYTE_FLIP_MASK ADD_RIP] + movdqa SHUF_00BA, [.L_SHUF_00BA ADD_RIP] + movdqa SHUF_DC00, [.L_SHUF_DC00 ADD_RIP] + +.Loop0: + lea TBL, [.LK256 ADD_RIP] + + /* byte swap first 16 dwords */ + COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK + + mov [rsp + _INP], INP + + /* schedule 48 input dwords, by doing 3 rounds of 16 each */ + mov SRND, 3 +.align 16 +.Loop1: + movdqa XFER, [TBL + 0*16] + paddd XFER, X0 + movdqa [rsp + _XFER], XFER + FOUR_ROUNDS_AND_SCHED + + movdqa XFER, [TBL + 1*16] + paddd XFER, X0 + movdqa [rsp + _XFER], XFER + FOUR_ROUNDS_AND_SCHED + + movdqa XFER, [TBL + 2*16] + paddd XFER, X0 + movdqa [rsp + _XFER], XFER + FOUR_ROUNDS_AND_SCHED + + movdqa XFER, [TBL + 3*16] + paddd XFER, X0 + movdqa [rsp + _XFER], XFER + add TBL, 4*16 + FOUR_ROUNDS_AND_SCHED + + sub SRND, 1 + jne .Loop1 + + mov SRND, 2 +.Loop2: + paddd X0, [TBL + 0*16] + movdqa [rsp + _XFER], X0 + DO_ROUND 0 + DO_ROUND 1 + DO_ROUND 2 + DO_ROUND 3 + paddd X1, [TBL + 1*16] + movdqa [rsp + _XFER], X1 + add TBL, 2*16 + DO_ROUND 0 + DO_ROUND 1 + DO_ROUND 2 + DO_ROUND 3 + + movdqa X0, X2 + movdqa X1, X3 + + sub SRND, 1 + jne .Loop2 + + addm [4*0 + CTX],a + addm [4*1 + CTX],b + addm [4*2 + CTX],c + addm [4*3 + CTX],d + addm [4*4 + CTX],e + addm [4*5 + CTX],f + addm [4*6 + CTX],g + addm [4*7 + CTX],h +
+ mov INP, [rsp + _INP] + add INP, 64 + cmp INP, [rsp + _INP_END] + jne .Loop0 + +.Ldone_hash: + add rsp, STACK_SIZE + + pop r15 + pop r14 + pop r13 + pop rbp + pop rbx + + mov rax, STACK_SIZE + ret + + +.data +.align 16 +.LK256: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.LPSHUFFLE_BYTE_FLIP_MASK: .octa 0x0c0d0e0f08090a0b0405060700010203 + +/* shuffle xBxA -> 00BA */ +.L_SHUF_00BA: .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100 + +/* shuffle xDxC -> DC00 */ +.L_SHUF_DC00: .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF + +#endif +#endif diff --git a/cipher/sha256.c b/cipher/sha256.c index bd5a412..f3c1d62 100644 --- a/cipher/sha256.c +++ b/cipher/sha256.c @@ -46,11 +46,25 @@ #include "cipher.h" #include "hash-common.h" + +/* USE_SSSE3 indicates whether to compile with Intel SSSE3 code. 
*/ +#undef USE_SSSE3 +#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_SSSE3) && \ + defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) +# define USE_SSSE3 1 +#endif + + typedef struct { gcry_md_block_ctx_t bctx; u32 h0,h1,h2,h3,h4,h5,h6,h7; +#ifdef USE_SSSE3 + unsigned int use_ssse3:1; +#endif } SHA256_CONTEXT; + static unsigned int transform (void *c, const unsigned char *data); @@ -74,6 +88,10 @@ sha256_init (void *context) hd->bctx.count = 0; hd->bctx.blocksize = 64; hd->bctx.bwrite = transform; + +#ifdef USE_SSSE3 + hd->use_ssse3 = (_gcry_get_hw_features () & HWF_INTEL_SSSE3) != 0; +#endif } @@ -96,6 +114,10 @@ sha224_init (void *context) hd->bctx.count = 0; hd->bctx.blocksize = 64; hd->bctx.bwrite = transform; + +#ifdef USE_SSSE3 + hd->use_ssse3 = (_gcry_get_hw_features () & HWF_INTEL_SSSE3) != 0; +#endif } @@ -148,7 +170,7 @@ Sum1 (u32 x) static unsigned int -transform (void *ctx, const unsigned char *data) +_transform (void *ctx, const unsigned char *data) { SHA256_CONTEXT *hd = ctx; static const u32 K[64] = { @@ -254,6 +276,27 @@ transform (void *ctx, const unsigned char *data) #undef R +#ifdef USE_SSSE3 +unsigned int _gcry_sha256_transform_amd64_ssse3(const void *input_data, + u32 state[8], size_t num_blks); +#endif + + +static unsigned int +transform (void *ctx, const unsigned char *data) +{ + SHA256_CONTEXT *hd = ctx; + +#ifdef USE_SSSE3 + if (hd->use_ssse3) + return _gcry_sha256_transform_amd64_ssse3 (data, &hd->h0, 1) + + 4 * sizeof(void*); +#endif + + return _transform (hd, data); +} + + /* The routine finally terminates the computation and returns the digest. 
The handle is prepared for a new cycle, but adding bytes diff --git a/configure.ac b/configure.ac index 6d40343..97401c3 100644 --- a/configure.ac +++ b/configure.ac @@ -1077,6 +1077,40 @@ fi # +# Check whether GCC assembler supports features needed for assembly +# implementations that use Intel syntax +# +AC_CACHE_CHECK([whether GCC assembler is compatible for Intel syntax assembly implementations], + [gcry_cv_gcc_platform_as_ok_for_intel_syntax], + [gcry_cv_gcc_platform_as_ok_for_intel_syntax=no + AC_COMPILE_IFELSE([AC_LANG_SOURCE( + [[__asm__( + ".intel_syntax noprefix\n\t" + "pxor xmm1, xmm7;\n\t" + /* Intel syntax implementation also use GAS macros, so check + * for them here. */ + "VAL_A = xmm4\n\t" + "VAL_B = xmm2\n\t" + ".macro SET_VAL_A p1\n\t" + " VAL_A = \\\\p1 \n\t" + ".endm\n\t" + ".macro SET_VAL_B p1\n\t" + " VAL_B = \\\\p1 \n\t" + ".endm\n\t" + "vmovdqa VAL_A, VAL_B;\n\t" + "SET_VAL_A eax\n\t" + "SET_VAL_B ebp\n\t" + "add VAL_A, VAL_B;\n\t" + "add VAL_B, 0b10101;\n\t" + );]])], + [gcry_cv_gcc_platform_as_ok_for_intel_syntax=yes])]) +if test "$gcry_cv_gcc_platform_as_ok_for_intel_syntax" = "yes" ; then + AC_DEFINE(HAVE_INTEL_SYNTAX_PLATFORM_AS,1, + [Defined if underlying assembler is compatible with Intel syntax assembly implementations]) +fi + + +# # Check whether compiler is configured for ARMv6 or newer architecture # AC_CACHE_CHECK([whether compiler is configured for ARMv6 or newer architecture], @@ -1676,6 +1710,13 @@ LIST_MEMBER(sha256, $enabled_digests) if test "$found" = "1" ; then GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha256.lo" AC_DEFINE(USE_SHA256, 1, [Defined if this module should be included]) + + case "${host}" in + x86_64-*-*) + # Build with the assembly implementation + GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha256-ssse3-amd64.lo" + ;; + esac fi LIST_MEMBER(sha512, $enabled_digests) diff --git a/doc/gcrypt.texi b/doc/gcrypt.texi index f3af29f..0f7557d 100644 --- a/doc/gcrypt.texi +++ b/doc/gcrypt.texi @@ -624,6 +624,7 @@ are @item padlock-aes @item 
padlock-sha @item padlock-mmul +@item intel-ssse3 @item intel-pclmul @item intel-aesni @item intel-rdrand diff --git a/src/g10lib.h b/src/g10lib.h index 1e58ef6..6b2bafb 100644 --- a/src/g10lib.h +++ b/src/g10lib.h @@ -197,6 +197,7 @@ int _gcry_log_verbosity( int level ); #define HWF_PADLOCK_SHA 4 #define HWF_PADLOCK_MMUL 8 +#define HWF_INTEL_SSSE3 64 #define HWF_INTEL_PCLMUL 128 #define HWF_INTEL_AESNI 256 #define HWF_INTEL_RDRAND 512 diff --git a/src/hwf-x86.c b/src/hwf-x86.c index 784fe2a..ab6dacd 100644 --- a/src/hwf-x86.c +++ b/src/hwf-x86.c @@ -206,6 +206,9 @@ detect_x86_gnuc (void) if (features & 0x00000002) result |= HWF_INTEL_PCLMUL; #endif + /* Test bit 9 for SSSE3. */ + if (features & 0x00000200) + result |= HWF_INTEL_SSSE3; #ifdef ENABLE_AESNI_SUPPORT /* Test bit 25 for AES-NI. */ if (features & 0x02000000) diff --git a/src/hwfeatures.c b/src/hwfeatures.c index 6699816..5c3bb18 100644 --- a/src/hwfeatures.c +++ b/src/hwfeatures.c @@ -46,6 +46,7 @@ static struct { HWF_PADLOCK_AES, "padlock-aes" }, { HWF_PADLOCK_SHA, "padlock-sha" }, { HWF_PADLOCK_MMUL,"padlock-mmul"}, + { HWF_INTEL_SSSE3, "intel-ssse3" }, { HWF_INTEL_PCLMUL,"intel-pclmul" }, { HWF_INTEL_AESNI, "intel-aesni" }, { HWF_INTEL_RDRAND,"intel-rdrand" }, ----------------------------------------------------------------------- Summary of changes: cipher/Makefile.am | 4 +- cipher/sha256-ssse3-amd64.S | 526 ++++++++++++++++++++++++++ cipher/sha256.c | 45 ++- cipher/sha512-avx-amd64.S | 412 ++++++++++++++++++++ cipher/sha512-avx2-bmi2-amd64.S | 783 +++++++++++++++++++++++++++++++++++++++ cipher/sha512-ssse3-amd64.S | 414 +++++++++++++++++++++ cipher/sha512.c | 102 ++++- configure.ac | 69 +++- doc/gcrypt.texi | 3 + random/rndw32.c | 6 +- src/g10lib.h | 3 + src/hwf-x86.c | 10 +- src/hwfeatures.c | 3 + 13 files changed, 2368 insertions(+), 12 deletions(-) create mode 100644 cipher/sha256-ssse3-amd64.S create mode 100644 cipher/sha512-avx-amd64.S create mode 100644 cipher/sha512-avx2-bmi2-amd64.S
create mode 100644 cipher/sha512-ssse3-amd64.S hooks/post-receive -- The GNU crypto library http://git.gnupg.org _______________________________________________ Gnupg-commits mailing list Gnupg-commits at gnupg.org http://lists.gnupg.org/mailman/listinfo/gnupg-commits From jussi.kivilinna at iki.fi Fri Dec 13 11:09:20 2013 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Fri, 13 Dec 2013 12:09:20 +0200 Subject: tests/basic fails sometimes... Message-ID: <52AADCD0.5040409@iki.fi> Hello, 'basic' tests appear to fail sometimes, with the following error: ~/libgcrypt$ ( count=1; while tests/basic; do count=$(expr $count "+" 1); done; echo "count: $count"; ) gcry_pk_decrypt failed: expected 155 (Encoding problem), got 0 (Success) gcry_pk_encrypt/gcry_pk_decrypt do not roundtrip count: 34 ~/libgcrypt$ ( count=1; while tests/basic; do count=$(expr $count "+" 1); done; echo "count: $count"; ) gcry_pk_decrypt failed: expected 155 (Encoding problem), got 0 (Success) gcry_pk_encrypt/gcry_pk_decrypt do not roundtrip count: 44 Is this a known problem? -Jussi From cvs at cvs.gnupg.org Fri Dec 13 14:56:09 2013 From: cvs at cvs.gnupg.org (by Werner Koch) Date: Fri, 13 Dec 2013 14:56:09 +0100 Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.5.0-438-g764643a Message-ID: This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "The GNU crypto library". The branch, master has been updated via 764643a3d5634bcbc47790bd8505f6a1a5280d9c (commit) via 413bfc4dd358be4728f856292e545fc67078fed8 (commit) from e41d605ee41469e8a33cdc4d38f742cfb931f835 (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below.
- Log ----------------------------------------------------------------- commit 764643a3d5634bcbc47790bd8505f6a1a5280d9c Author: Werner Koch Date: Fri Dec 13 14:52:21 2013 +0100 Update license information * LICENSES: New. * Makefile.am (EXTRA_DIST): Add LICENSES. * AUTHORS: Add list of copyright holders. * README: Reference AUTHORS. Signed-off-by: Werner Koch diff --git a/AUTHORS b/AUTHORS index fcc992c..dc933dc 100644 --- a/AUTHORS +++ b/AUTHORS @@ -6,8 +6,32 @@ Security related bug reports: License (library): LGPLv2.1+ License (manual and tools): GPLv2+ -Libgcrypt used to be part of GnuPG but has been taken out into its own -package on 2000-12-21. +Libgcrypt is free software. See the files COPYING.LIB and COPYING for +copying conditions, and LICENSES for notices about a few contributions +that require these additional notices to be distributed. License +copyright years may be listed using range notation, e.g., 2000-2013, +indicating that every year in the range, inclusive, is a copyrightable +year that would otherwise be listed individually. + + +List of Copyright holders +========================= + + Copyright (C) 1989,1991-2012 Free Software Foundation, Inc. + Copyright (C) 1994 X Consortium + Copyright (C) 1996 L. 
Peter Deutsch + Copyright (C) 1997 Werner Koch + Copyright (C) 1998 The Internet Society + Copyright (C) 1996-1999 Peter Gutmann, Paul Kendall, and Chris Wedgwood + Copyright (C) 1996-2006 Peter Gutmann, Matt Thomlinson and Blake Coverett + Copyright (C) 2003 Nikos Mavroyanopoulos + Copyright (C) 2006-2007 NTT (Nippon Telegraph and Telephone Corporation) + Copyright (C) 2012-2013 g10 Code GmbH + Copyright (C) 2012 Simon Josefsson, Niels Möller + Copyright (c) 2012 Intel Corporation + Copyright (C) 2013 Christian Grothoff + Copyright (C) 2013 Jussi Kivilinna + Copyright (C) 2013 Dmitry Eremin-Solenikov Authors with a FSF copyright assignment @@ -145,6 +169,10 @@ Werner Koch (g10 Code GmbH) More credits ============ + +Libgcrypt used to be part of GnuPG but has been taken out into its own +package on 2000-12-21. + The ATH implementation (src/ath*) has been taken from GPGME and relicensed to the LGPL by the copyright holder of GPGME (g10 Code GmbH); it is now considered to be a part of Libgcrypt. @@ -168,7 +196,7 @@ It has a permissive license and is copyrighted by atsec information security corporation. See the file for details. The file salsa20.c is based on D.J. Bernstein's public domain code and -taken from Nettle. Copyright 2007 Simon Josefsson and Niels Möller. +taken from Nettle. Copyright 2012 Simon Josefsson and Niels Möller. This file is free software; as a special exception the author gives diff --git a/LICENSES b/LICENSES new file mode 100644 index 0000000..8594cfd --- /dev/null +++ b/LICENSES @@ -0,0 +1,134 @@ +Additional license notices for Libgcrypt. -*- org -*- + +This file contains the copying permission notices for various files in +the Libgcrypt distribution which are not covered by the GNU Lesser +General Public License (LGPL) or the GNU General Public License (GPL).
+ +These notices all require that a copy of the notice be included +in the accompanying documentation and be distributed with binary +distributions of the code, so be sure to include this file along +with any binary distributions derived from the GNU C Library. + +* BSD_3Clause + + For files: + - cipher/sha256-ssse3-amd64.S + - cipher/sha512-avx-amd64.S + - cipher/sha512-avx2-bmi2-amd64.S + - cipher/sha512-ssse3-amd64.S + +#+begin_quote + Copyright (c) 2012, Intel Corporation + + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the + distribution. + + * Neither the name of the Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY + EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR + CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+#+end_quote + +* Simple permissive licenses + + For files: + - cipher/crc.c + +#+begin_quote + Copyright (c) 1996 L. Peter Deutsch + + Permission is granted to copy and distribute this document for + any purpose and without charge, including translations into + other languages and incorporation into compilations, provided + that the copyright notice and this notice are preserved, and + that any substantive changes or deletions from the original are + clearly marked. +#+end_quote + +* IETF permissive licenses + + For files: + - cipher/crc.c + +#+begin_quote + Copyright (C) The Internet Society (1998). All Rights Reserved. + + This document and translations of it may be copied and furnished + to others, and derivative works that comment on or otherwise + explain it or assist in its implementation may be prepared, + copied, published and distributed, in whole or in part, without + restriction of any kind, provided that the above copyright notice + and this paragraph are included on all such copies and derivative + works. However, this document itself may not be modified in any + way, such as by removing the copyright notice or references to + the Internet Society or other Internet organizations, except as + needed for the purpose of developing Internet standards in which + case the procedures for copyrights defined in the Internet + Standards process must be followed, or as required to translate + it into languages other than English. + + The limited permissions granted above are perpetual and will not be + revoked by the Internet Society or its successors or assigns. + + This document and the information contained herein is provided on + an "AS IS" basis and THE INTERNET SOCIETY AND THE INTERNET + ENGINEERING TASK FORCE DISCLAIMS ALL WARRANTIES, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO ANY WARRANTY THAT THE USE + OF THE INFORMATION HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY + IMPLIED WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR + PURPOSE. 
+#+end_quote + +* X License + + For files: + - install.sh + +#+begin_quote + Copyright (C) 1994 X Consortium + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to + deal in the Software without restriction, including without limitation the + rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + sell copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + X CONSORTIUM BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNEC- + TION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + Except as contained in this notice, the name of the X Consortium shall not + be used in advertising or otherwise to promote the sale, use or other deal- + ings in this Software without prior written authorization from the X Consor- + tium. 
+#+end_quote diff --git a/Makefile.am b/Makefile.am index f1dbd09..7fb7b50 100644 --- a/Makefile.am +++ b/Makefile.am @@ -27,7 +27,7 @@ GITLOG_TO_CHANGELOG=gitlog-to-changelog DIST_SUBDIRS = m4 compat mpi cipher random src doc tests SUBDIRS = compat mpi cipher random src doc tests -EXTRA_DIST = autogen.sh README.GIT \ +EXTRA_DIST = autogen.sh README.GIT LICENSES \ ChangeLog-2011 scripts/ChangeLog-2011 doc/ChangeLog-2011 \ m4/ChangeLog-2011 cipher/ChangeLog-2011 src/ChangeLog-2011 \ random/ChangeLog-2011 tests/ChangeLog-2011 mpi/ChangeLog-2011 \ diff --git a/README b/README index 1778951..ff3ce9c 100644 --- a/README +++ b/README @@ -2,19 +2,11 @@ ------------------------------------ Version 1.6 - !!! THIS IS A DEVELOPMENT VERSION VERSION !!! - - Copyright 2000, 2002, 2003, 2004, 2007, 2008, 2009, - 2011, 2012 Free Software Foundation, Inc. - - This file is free software; as a special exception the author gives - unlimited permission to copy and/or distribute it, with or without - modifications, as long as this notice is preserved. - - This file is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY, to the extent permitted by law; without even the - implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + Copyright (C) 1989,1991-2012 Free Software Foundation, Inc. + Libgcrypt is free software. See the file AUTHORS for full copying + notices, and LICENSES for notices about contributions that require + these additional notices to be distributed. Overview commit 413bfc4dd358be4728f856292e545fc67078fed8 Author: Werner Koch Date: Fri Dec 13 10:53:26 2013 +0100 doc: Minor manual fix. 
-- diff --git a/doc/gcrypt.texi b/doc/gcrypt.texi index 410c45d..dc42950 100644 --- a/doc/gcrypt.texi +++ b/doc/gcrypt.texi @@ -4922,7 +4922,7 @@ may be used: @deftypefun void gcry_mpi_randomize (@w{gcry_mpi_t @var{w}}, @w{unsigned int @var{nbits}}, @w{enum gcry_random_level @var{level}}) -Set the multi-precision-integers @var{w} to a random value of +Set the multi-precision-integers @var{w} to a random non-negative number of @var{nbits}, using random data quality of level @var{level}. In case @var{nbits} is not a multiple of a byte, @var{nbits} is rounded up to the next byte boundary. When using a @var{level} of ----------------------------------------------------------------------- Summary of changes: AUTHORS | 34 ++++++++++++-- LICENSES | 134 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ Makefile.am | 2 +- README | 16 ++----- doc/gcrypt.texi | 2 +- 5 files changed, 171 insertions(+), 17 deletions(-) create mode 100644 LICENSES hooks/post-receive -- The GNU crypto library http://git.gnupg.org _______________________________________________ Gnupg-commits mailing list Gnupg-commits at gnupg.org http://lists.gnupg.org/mailman/listinfo/gnupg-commits From jussi.kivilinna at iki.fi Fri Dec 13 15:08:57 2013 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Fri, 13 Dec 2013 16:08:57 +0200 Subject: [PATCH] Add missing register clearing to SHA-256 and SHA-512 assembly Message-ID: <20131213140857.30855.86713.stgit@localhost6.localdomain6> * cipher/sha256-ssse3-amd64.S: Clear used XMM/YMM registers at return. * cipher/sha512-avx-amd64.S: Ditto. * cipher/sha512-avx2-bmi2-amd64.S: Ditto. * cipher/sha512-ssse3-amd64.S: Ditto.
-- Signed-off-by: Jussi Kivilinna --- cipher/sha256-ssse3-amd64.S | 14 ++++++++++++++ cipher/sha512-avx-amd64.S | 14 ++++++++++++++ cipher/sha512-avx2-bmi2-amd64.S | 14 ++++++++++++++ cipher/sha512-ssse3-amd64.S | 7 +++++++ 4 files changed, 49 insertions(+) diff --git a/cipher/sha256-ssse3-amd64.S b/cipher/sha256-ssse3-amd64.S index 06070d1..bcf0e19 100644 --- a/cipher/sha256-ssse3-amd64.S +++ b/cipher/sha256-ssse3-amd64.S @@ -481,6 +481,20 @@ _gcry_sha256_transform_amd64_ssse3: cmp INP, [rsp + _INP_END] jne .Loop0 + pxor xmm0, xmm0 + pxor xmm1, xmm1 + pxor xmm2, xmm2 + pxor xmm3, xmm3 + pxor xmm4, xmm4 + pxor xmm5, xmm5 + pxor xmm6, xmm6 + pxor xmm7, xmm7 + pxor xmm8, xmm8 + pxor xmm9, xmm9 + pxor xmm10, xmm10 + pxor xmm11, xmm11 + pxor xmm12, xmm12 + .Ldone_hash: add rsp, STACK_SIZE diff --git a/cipher/sha512-avx-amd64.S b/cipher/sha512-avx-amd64.S index 691d771..900936e 100644 --- a/cipher/sha512-avx-amd64.S +++ b/cipher/sha512-avx-amd64.S @@ -267,6 +267,8 @@ _gcry_sha512_transform_amd64_avx: cmp msglen, 0 je .Lnowork + vzeroupper + /* Allocate Stack Space */ sub rsp, frame_size @@ -346,6 +348,18 @@ _gcry_sha512_transform_amd64_avx: /* Restore Stack Pointer */ add rsp, frame_size + vzeroupper + pxor xmm0, xmm0 + pxor xmm1, xmm1 + pxor xmm2, xmm2 + pxor xmm3, xmm3 + pxor xmm4, xmm4 + pxor xmm5, xmm5 + pxor xmm6, xmm6 + pxor xmm7, xmm7 + pxor xmm8, xmm8 + pxor xmm9, xmm9 + /* Return stack burn depth */ mov rax, frame_size diff --git a/cipher/sha512-avx2-bmi2-amd64.S b/cipher/sha512-avx2-bmi2-amd64.S index 878c41b..9ece4fd 100644 --- a/cipher/sha512-avx2-bmi2-amd64.S +++ b/cipher/sha512-avx2-bmi2-amd64.S @@ -604,6 +604,8 @@ _gcry_sha512_transform_amd64_avx2: cmp rdx, 0 je .Lnowork + vzeroupper + /* Allocate Stack Space */ mov rax, rsp sub rsp, frame_size @@ -717,6 +719,18 @@ _gcry_sha512_transform_amd64_avx2: /* Restore Stack Pointer */ mov rsp, [rsp + frame_RSPSAVE] + vzeroupper + pxor xmm0, xmm0 + pxor xmm1, xmm1 + pxor xmm2, xmm2 + pxor xmm3, xmm3 + pxor xmm4, 
xmm4 + pxor xmm5, xmm5 + pxor xmm6, xmm6 + pxor xmm7, xmm7 + pxor xmm8, xmm8 + pxor xmm9, xmm9 + mov rax, frame_size .Lnowork: ret diff --git a/cipher/sha512-ssse3-amd64.S b/cipher/sha512-ssse3-amd64.S index 2a719e5..4c80baa 100644 --- a/cipher/sha512-ssse3-amd64.S +++ b/cipher/sha512-ssse3-amd64.S @@ -348,6 +348,13 @@ _gcry_sha512_transform_amd64_ssse3: /* Restore Stack Pointer */ add rsp, frame_size + pxor xmm0, xmm0 + pxor xmm1, xmm1 + pxor xmm2, xmm2 + pxor xmm3, xmm3 + pxor xmm4, xmm4 + pxor xmm5, xmm5 + /* Return stack burn depth */ mov rax, frame_size From jussi.kivilinna at iki.fi Fri Dec 13 15:10:32 2013 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Fri, 13 Dec 2013 16:10:32 +0200 Subject: [PATCH] SHA-1: Add SSSE3 implementation Message-ID: <20131213141032.31174.48181.stgit@localhost6.localdomain6> * cipher/Makefile.am: Add 'sha1-ssse3-amd64.c'. * cipher/sha1-ssse3-amd64.c: New. * cipher/sha1.c (USE_SSSE3): New. (SHA1_CONTEXT) [USE_SSSE3]: Add 'use_ssse3'. (sha1_init) [USE_SSSE3]: Initialize 'use_ssse3'. (transform): Rename to... (_transform): this. (transform): New. * configure.ac [host=x86_64]: Add 'sha1-ssse3-amd64.lo'. 
-- Patch adds SSSE3 implementation based on white paper "Improving the Performance of the Secure Hash Algorithm (SHA-1)" at http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1 Benchmarks: cpu Old New Diff Intel i5-4570 9.02 c/B 5.22 c/B 1.72x Intel i5-2450M 12.27 c/B 7.24 c/B 1.69x Intel Core2 T8100 7.94 c/B 6.76 c/B 1.17x Signed-off-by: Jussi Kivilinna --- cipher/Makefile.am | 2 cipher/sha1-ssse3-amd64.c | 319 +++++++++++++++++++++++++++++++++++++++++++++ cipher/sha1.c | 39 +++++- configure.ac | 7 + 4 files changed, 365 insertions(+), 2 deletions(-) create mode 100644 cipher/sha1-ssse3-amd64.c diff --git a/cipher/Makefile.am b/cipher/Makefile.am index 7c85af2..0477772 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -77,7 +77,7 @@ salsa20.c salsa20-amd64.S salsa20-armv7-neon.S \ scrypt.c \ seed.c \ serpent.c serpent-sse2-amd64.S serpent-avx2-amd64.S \ -sha1.c \ +sha1.c sha1-ssse3-amd64.c \ sha256.c sha256-ssse3-amd64.S sha256-avx-amd64.S sha256-avx2-bmi2-amd64.S \ sha512.c sha512-ssse3-amd64.S sha512-armv7-neon.S \ stribog.c \ diff --git a/cipher/sha1-ssse3-amd64.c b/cipher/sha1-ssse3-amd64.c new file mode 100644 index 0000000..1342235 --- /dev/null +++ b/cipher/sha1-ssse3-amd64.c @@ -0,0 +1,319 @@ +/* sha1-ssse3-amd64.c - Intel SSSE3 accelerated SHA-1 transform function + * Copyright © 2013 Jussi Kivilinna + * + * Based on sha1.c: + * Copyright (C) 1998, 2001, 2002, 2003, 2008 Free Software Foundation, Inc. + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +/* + * Intel SSSE3 accelerated SHA-1 implementation based on white paper: + * "Improving the Performance of the Secure Hash Algorithm (SHA-1)" + * http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1 + */ + +#ifdef __x86_64__ +#include <config.h> + +#if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \ + defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_SSSE3) && defined(USE_SHA1) + +#ifdef HAVE_STDINT_H +# include <stdint.h> /* uintptr_t */ +#elif defined(HAVE_INTTYPES_H) +# include <inttypes.h> +#else +/* In this case, uintptr_t is provided by config.h. */ +#endif + +#include "bithelp.h" + + +/* Helper macro to force alignment to 16 bytes. */ +#ifdef HAVE_GCC_ATTRIBUTE_ALIGNED +# define ATTR_ALIGNED_16 __attribute__ ((aligned (16))) +#else +# define ATTR_ALIGNED_16 +#endif + + +typedef struct +{ + u32 h0,h1,h2,h3,h4; +} SHA1_STATE; + + +/* Round function macros. */ +#define K1 0x5A827999L +#define K2 0x6ED9EBA1L +#define K3 0x8F1BBCDCL +#define K4 0xCA62C1D6L +#define F1(x,y,z) ( z ^ ( x & ( y ^ z ) ) ) +#define F2(x,y,z) ( x ^ y ^ z ) +#define F3(x,y,z) ( ( x & y ) | ( z & ( x | y ) ) ) +#define F4(x,y,z) ( x ^ y ^ z ) +#define R(a,b,c,d,e,f,wk) do { e += rol( a, 5 ) \ + + f( b, c, d ) \ + + wk; \ + b = rol( b, 30 ); \ + } while(0) + +#define WK(i) (wk[i & 15]) + + +static const u32 K_XMM[4][4] ATTR_ALIGNED_16 = + { + { K1, K1, K1, K1 }, + { K2, K2, K2, K2 }, + { K3, K3, K3, K3 }, + { K4, K4, K4, K4 }, + }; +static const u32 bswap_shufb_ctl[4] ATTR_ALIGNED_16 = + { 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f }; + + +/* + * Transform 64 bytes (16 32-bit words) at DATA.
+ */ +unsigned int +_gcry_sha1_transform_amd64_ssse3 (void *ctx, const unsigned char *data) +{ + SHA1_STATE *state = ctx; + register u32 a, b, c, d, e; /* Local copies of the chaining variables. */ + byte wk_unaligned[4*16+15]; /* The array we work on. */ + u32 *wk = (u32 *)(wk_unaligned + + ((16 - ((uintptr_t)wk_unaligned & 15)) & 15)); + + /* Get the values of the chaining variables. */ + a = state->h0; + b = state->h1; + c = state->h2; + d = state->h3; + e = state->h4; + +#define Wtmp0 "xmm0" +#define Wtmp1 "xmm1" + +#define W0 "xmm2" +#define W1 "xmm3" +#define W2 "xmm4" +#define W3 "xmm5" +#define W4 "xmm6" +#define W5 "xmm7" +#define W6 "xmm8" +#define W7 "xmm9" + +#define BSWAP_REG "xmm10" + + __asm__ volatile ("movdqa %[bswap], %%"BSWAP_REG";\n\t" + :: [bswap] "m" (bswap_shufb_ctl[0])); + +#define W_PRECALC_00_15_0(i, W, tmp0) \ + __asm__ volatile ("movdqu %[data], %%"tmp0";\n\t" \ + ::[data] "m" (*(data+4*(i)))); + +#define W_PRECALC_00_15_1(i, W, tmp0) \ + __asm__ volatile ("pshufb %%"BSWAP_REG", %%"tmp0";\n\t" \ + "movdqa %%"tmp0", %%"W";\n\t" \ + ::: "cc"); + +#define W_PRECALC_00_15_2(i, W, tmp0) \ + __asm__ volatile ("paddd %[k_xmm], %%"tmp0";\n\t" \ + ::[k_xmm] "m" (K_XMM[i / 20][0])); + +#define W_PRECALC_00_15_3(i, W, tmp0) \ + __asm__ volatile ("movdqa %%"tmp0", %[wk];\n\t" \ + :[wk] "=m" (WK(i&~3))); + + /* Precalc 0-15. 
*/ + W_PRECALC_00_15_0(0, W0, Wtmp0); + W_PRECALC_00_15_1(1, W0, Wtmp0); + W_PRECALC_00_15_2(2, W0, Wtmp0); + W_PRECALC_00_15_3(3, W0, Wtmp0); + W_PRECALC_00_15_0(4, W7, Wtmp0); + W_PRECALC_00_15_1(5, W7, Wtmp0); + W_PRECALC_00_15_2(6, W7, Wtmp0); + W_PRECALC_00_15_3(7, W7, Wtmp0); + W_PRECALC_00_15_0(8, W6, Wtmp0); + W_PRECALC_00_15_1(9, W6, Wtmp0); + W_PRECALC_00_15_2(10, W6, Wtmp0); + W_PRECALC_00_15_3(11, W6, Wtmp0); + W_PRECALC_00_15_0(12, W5, Wtmp0); + W_PRECALC_00_15_1(13, W5, Wtmp0); + W_PRECALC_00_15_2(14, W5, Wtmp0); + W_PRECALC_00_15_3(15, W5, Wtmp0); + +#define W_PRECALC_16_31_0(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \ + __asm__ volatile ("movdqa %%"W_m12", %%"W";\n\t" \ + "palignr $8, %%"W_m16", %%"W";\n\t" \ + "movdqa %%"W_m04", %%"tmp0";\n\t" \ + "psrldq $4, %%"tmp0";\n\t" \ + "pxor %%"W_m08", %%"W";\n\t" \ + :::"cc"); + +#define W_PRECALC_16_31_1(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \ + __asm__ volatile ("pxor %%"W_m16", %%"tmp0";\n\t" \ + "pxor %%"tmp0", %%"W";\n\t" \ + "movdqa %%"W", %%"tmp1";\n\t" \ + "movdqa %%"W", %%"tmp0";\n\t" \ + "pslldq $12, %%"tmp1";\n\t" \ + :::"cc"); + +#define W_PRECALC_16_31_2(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \ + __asm__ volatile ("psrld $31, %%"W";\n\t" \ + "pslld $1, %%"tmp0";\n\t" \ + "por %%"W", %%"tmp0";\n\t" \ + "movdqa %%"tmp1", %%"W";\n\t" \ + "psrld $30, %%"tmp1";\n\t" \ + "pslld $2, %%"W";\n\t" \ + :::"cc"); + +#define W_PRECALC_16_31_3(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \ + __asm__ volatile ("pxor %%"W", %%"tmp0";\n\t" \ + "pxor %%"tmp1", %%"tmp0";\n\t" \ + "movdqa %%"tmp0", %%"W";\n\t" \ + "paddd %[k_xmm], %%"tmp0";\n\t" \ + "movdqa %%"tmp0", %[wk];\n\t" \ + : [wk] "=m" (WK(i&~3)) \ + : [k_xmm] "m" (K_XMM[i / 20][0])); + + /* Transform 0-15 + Precalc 16-31. 
*/ + R( a, b, c, d, e, F1, WK( 0) ); W_PRECALC_16_31_0(16, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); + R( e, a, b, c, d, F1, WK( 1) ); W_PRECALC_16_31_1(17, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); + R( d, e, a, b, c, F1, WK( 2) ); W_PRECALC_16_31_2(18, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); + R( c, d, e, a, b, F1, WK( 3) ); W_PRECALC_16_31_3(19, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); + R( b, c, d, e, a, F1, WK( 4) ); W_PRECALC_16_31_0(20, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); + R( a, b, c, d, e, F1, WK( 5) ); W_PRECALC_16_31_1(21, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); + R( e, a, b, c, d, F1, WK( 6) ); W_PRECALC_16_31_2(22, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); + R( d, e, a, b, c, F1, WK( 7) ); W_PRECALC_16_31_3(23, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); + R( c, d, e, a, b, F1, WK( 8) ); W_PRECALC_16_31_0(24, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); + R( b, c, d, e, a, F1, WK( 9) ); W_PRECALC_16_31_1(25, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); + R( a, b, c, d, e, F1, WK(10) ); W_PRECALC_16_31_2(26, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); + R( e, a, b, c, d, F1, WK(11) ); W_PRECALC_16_31_3(27, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); + R( d, e, a, b, c, F1, WK(12) ); W_PRECALC_16_31_0(28, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); + R( c, d, e, a, b, F1, WK(13) ); W_PRECALC_16_31_1(29, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); + R( b, c, d, e, a, F1, WK(14) ); W_PRECALC_16_31_2(30, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); + R( a, b, c, d, e, F1, WK(15) ); W_PRECALC_16_31_3(31, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); + +#define W_PRECALC_32_79_0(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ + __asm__ volatile ("movdqa %%"W_m04", %%"tmp0";\n\t" \ + "pxor %%"W_m28", %%"W";\n\t" \ + "palignr $8, %%"W_m08", %%"tmp0";\n\t" \ + :::"cc"); + +#define W_PRECALC_32_79_1(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ + __asm__ volatile ("pxor %%"W_m16", %%"W";\n\t" \ + "pxor %%"tmp0", %%"W";\n\t" \ + "movdqa %%"W", %%"tmp0";\n\t" \ + :::"cc"); + +#define W_PRECALC_32_79_2(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, 
tmp0) \ + __asm__ volatile ("psrld $30, %%"W";\n\t" \ + "pslld $2, %%"tmp0";\n\t" \ + "por %%"W", %%"tmp0";\n\t" \ + :::"cc"); + +#define W_PRECALC_32_79_3(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ + __asm__ volatile ("movdqa %%"tmp0", %%"W";\n\t" \ + "paddd %[k_xmm], %%"tmp0";\n\t" \ + "movdqa %%"tmp0", %[wk];\n\t" \ + : [wk] "=m" (WK(i&~3)) \ + : [k_xmm] "m" (K_XMM[i / 20][0])); + + /* Transform 16-63 + Precalc 32-79. */ + R( e, a, b, c, d, F1, WK(16) ); W_PRECALC_32_79_0(32, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); + R( d, e, a, b, c, F1, WK(17) ); W_PRECALC_32_79_1(33, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); + R( c, d, e, a, b, F1, WK(18) ); W_PRECALC_32_79_2(34, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); + R( b, c, d, e, a, F1, WK(19) ); W_PRECALC_32_79_3(35, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); + R( a, b, c, d, e, F2, WK(20) ); W_PRECALC_32_79_0(36, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); + R( e, a, b, c, d, F2, WK(21) ); W_PRECALC_32_79_1(37, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); + R( d, e, a, b, c, F2, WK(22) ); W_PRECALC_32_79_2(38, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); + R( c, d, e, a, b, F2, WK(23) ); W_PRECALC_32_79_3(39, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); + R( b, c, d, e, a, F2, WK(24) ); W_PRECALC_32_79_0(40, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); + R( a, b, c, d, e, F2, WK(25) ); W_PRECALC_32_79_1(41, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); + R( e, a, b, c, d, F2, WK(26) ); W_PRECALC_32_79_2(42, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); + R( d, e, a, b, c, F2, WK(27) ); W_PRECALC_32_79_3(43, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); + R( c, d, e, a, b, F2, WK(28) ); W_PRECALC_32_79_0(44, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); + R( b, c, d, e, a, F2, WK(29) ); W_PRECALC_32_79_1(45, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); + R( a, b, c, d, e, F2, WK(30) ); W_PRECALC_32_79_2(46, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); + R( e, a, b, c, d, F2, WK(31) ); W_PRECALC_32_79_3(47, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); + R( d, e, a, 
b, c, F2, WK(32) ); W_PRECALC_32_79_0(48, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); + R( c, d, e, a, b, F2, WK(33) ); W_PRECALC_32_79_1(49, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); + R( b, c, d, e, a, F2, WK(34) ); W_PRECALC_32_79_2(50, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); + R( a, b, c, d, e, F2, WK(35) ); W_PRECALC_32_79_3(51, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); + R( e, a, b, c, d, F2, WK(36) ); W_PRECALC_32_79_0(52, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0); + R( d, e, a, b, c, F2, WK(37) ); W_PRECALC_32_79_1(53, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0); + R( c, d, e, a, b, F2, WK(38) ); W_PRECALC_32_79_2(54, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0); + R( b, c, d, e, a, F2, WK(39) ); W_PRECALC_32_79_3(55, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0); + R( a, b, c, d, e, F3, WK(40) ); W_PRECALC_32_79_0(56, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); + R( e, a, b, c, d, F3, WK(41) ); W_PRECALC_32_79_1(57, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); + R( d, e, a, b, c, F3, WK(42) ); W_PRECALC_32_79_2(58, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); + R( c, d, e, a, b, F3, WK(43) ); W_PRECALC_32_79_3(59, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); + R( b, c, d, e, a, F3, WK(44) ); W_PRECALC_32_79_0(60, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0); + R( a, b, c, d, e, F3, WK(45) ); W_PRECALC_32_79_1(61, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0); + R( e, a, b, c, d, F3, WK(46) ); W_PRECALC_32_79_2(62, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0); + R( d, e, a, b, c, F3, WK(47) ); W_PRECALC_32_79_3(63, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0); + R( c, d, e, a, b, F3, WK(48) ); W_PRECALC_32_79_0(64, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); + R( b, c, d, e, a, F3, WK(49) ); W_PRECALC_32_79_1(65, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); + R( a, b, c, d, e, F3, WK(50) ); W_PRECALC_32_79_2(66, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); + R( e, a, b, c, d, F3, WK(51) ); W_PRECALC_32_79_3(67, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); + R( d, e, a, b, c, F3, WK(52) ); W_PRECALC_32_79_0(68, W7, W0, W1, W2, W3, W4, W5, W6, 
Wtmp0); + R( c, d, e, a, b, F3, WK(53) ); W_PRECALC_32_79_1(69, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); + R( b, c, d, e, a, F3, WK(54) ); W_PRECALC_32_79_2(70, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); + R( a, b, c, d, e, F3, WK(55) ); W_PRECALC_32_79_3(71, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); + R( e, a, b, c, d, F3, WK(56) ); W_PRECALC_32_79_0(72, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); + R( d, e, a, b, c, F3, WK(57) ); W_PRECALC_32_79_1(73, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); + R( c, d, e, a, b, F3, WK(58) ); W_PRECALC_32_79_2(74, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); + R( b, c, d, e, a, F3, WK(59) ); W_PRECALC_32_79_3(75, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); + R( a, b, c, d, e, F4, WK(60) ); W_PRECALC_32_79_0(76, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); + R( e, a, b, c, d, F4, WK(61) ); W_PRECALC_32_79_1(77, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); + R( d, e, a, b, c, F4, WK(62) ); W_PRECALC_32_79_2(78, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); + R( c, d, e, a, b, F4, WK(63) ); W_PRECALC_32_79_3(79, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); + +#define CLEAR_REG(reg) __asm__ volatile ("pxor %%"reg", %%"reg";\n\t":::"cc"); + + /* Transform 64-79 + Clear XMM registers. */ + R( b, c, d, e, a, F4, WK(64) ); CLEAR_REG(BSWAP_REG); + R( a, b, c, d, e, F4, WK(65) ); CLEAR_REG(Wtmp0); + R( e, a, b, c, d, F4, WK(66) ); CLEAR_REG(Wtmp1); + R( d, e, a, b, c, F4, WK(67) ); CLEAR_REG(W0); + R( c, d, e, a, b, F4, WK(68) ); CLEAR_REG(W1); + R( b, c, d, e, a, F4, WK(69) ); CLEAR_REG(W2); + R( a, b, c, d, e, F4, WK(70) ); CLEAR_REG(W3); + R( e, a, b, c, d, F4, WK(71) ); CLEAR_REG(W4); + R( d, e, a, b, c, F4, WK(72) ); CLEAR_REG(W5); + R( c, d, e, a, b, F4, WK(73) ); CLEAR_REG(W6); + R( b, c, d, e, a, F4, WK(74) ); CLEAR_REG(W7); + R( a, b, c, d, e, F4, WK(75) ); + R( e, a, b, c, d, F4, WK(76) ); + R( d, e, a, b, c, F4, WK(77) ); + R( c, d, e, a, b, F4, WK(78) ); + R( b, c, d, e, a, F4, WK(79) ); + + /* Update the chaining variables. 
*/ + state->h0 += a; + state->h1 += b; + state->h2 += c; + state->h3 += d; + state->h4 += e; + + return /* burn_stack */ 84+15; +} + +#endif +#endif diff --git a/cipher/sha1.c b/cipher/sha1.c index 025b3ab..af57b19 100644 --- a/cipher/sha1.c +++ b/cipher/sha1.c @@ -43,6 +43,15 @@ #include "hash-common.h" +/* USE_SSSE3 indicates whether to compile with Intel SSSE3 code. */ +#undef USE_SSSE3 +#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_SSSE3) && \ + defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) +# define USE_SSSE3 1 +#endif + + /* A macro to test whether P is properly aligned for an u32 type. Note that config.h provides a suitable replacement for uintptr_t if it does not exist in stdint.h. */ @@ -56,6 +65,9 @@ typedef struct { gcry_md_block_ctx_t bctx; u32 h0,h1,h2,h3,h4; +#ifdef USE_SSSE3 + unsigned int use_ssse3:1; +#endif } SHA1_CONTEXT; static unsigned int @@ -78,6 +90,10 @@ sha1_init (void *context) hd->bctx.count = 0; hd->bctx.blocksize = 64; hd->bctx.bwrite = transform; + +#ifdef USE_SSSE3 + hd->use_ssse3 = (_gcry_get_hw_features () & HWF_INTEL_SSSE3) != 0; +#endif } @@ -107,7 +123,7 @@ sha1_init (void *context) * Transform NBLOCKS of each 64 bytes (16 32-bit words) at DATA. */ static unsigned int -transform (void *ctx, const unsigned char *data) +_transform (void *ctx, const unsigned char *data) { SHA1_CONTEXT *hd = ctx; const u32 *idata = (const void *)data; @@ -217,6 +233,27 @@ transform (void *ctx, const unsigned char *data) } +#ifdef USE_SSSE3 +unsigned int +_gcry_sha1_transform_amd64_ssse3 (void *state, const unsigned char *data); +#endif + + +static unsigned int +transform (void *ctx, const unsigned char *data) +{ + SHA1_CONTEXT *hd = ctx; + +#ifdef USE_SSSE3 + if (hd->use_ssse3) + return _gcry_sha1_transform_amd64_ssse3 (&hd->h0, data) + + 4 * sizeof(void*); +#endif + + return _transform (hd, data); +} + + /* The routine final terminates the computation and * returns the digest. 
* The handle is prepared for a new cycle, but adding bytes to the diff --git a/configure.ac b/configure.ac index b91240b..9088d02 100644 --- a/configure.ac +++ b/configure.ac @@ -1773,6 +1773,13 @@ GCRYPT_DIGESTS="$GCRYPT_DIGESTS rmd160.lo sha1.lo" AC_DEFINE(USE_RMD160, 1, [Defined if this module should be included]) AC_DEFINE(USE_SHA1, 1, [Defined if this module should be included]) +case "${host}" in + x86_64-*-*) + # Build with the assembly implementation + GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1-ssse3-amd64.lo" + ;; +esac + LIST_MEMBER(scrypt, $enabled_kdfs) if test "$found" = "1" ; then GCRYPT_KDFS="$GCRYPT_KDFS scrypt.lo" From cvs at cvs.gnupg.org Fri Dec 13 15:19:06 2013 From: cvs at cvs.gnupg.org (by Jussi Kivilinna) Date: Fri, 13 Dec 2013 15:19:06 +0100 Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.5.0-439-g04615cc Message-ID: This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "The GNU crypto library". The branch, master has been updated via 04615cc6803cdede25fa92e3ff697e252a23cd7a (commit) from 764643a3d5634bcbc47790bd8505f6a1a5280d9c (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. - Log ----------------------------------------------------------------- commit 04615cc6803cdede25fa92e3ff697e252a23cd7a Author: Jussi Kivilinna Date: Fri Dec 13 16:14:05 2013 +0200 Add missing register clearing in to SHA-256 and SHA-512 assembly * cipher/sha256-ssse3-amd64.S: Clear used XMM/YMM registers at return. * cipher/sha512-avx-amd64.S: Ditto. * cipher/sha512-avx2-bmi2-amd64.S: Ditto. * cipher/sha512-ssse3-amd64.S: Ditto. 
-- Signed-off-by: Jussi Kivilinna diff --git a/cipher/sha256-ssse3-amd64.S b/cipher/sha256-ssse3-amd64.S index 06070d1..bcf0e19 100644 --- a/cipher/sha256-ssse3-amd64.S +++ b/cipher/sha256-ssse3-amd64.S @@ -481,6 +481,20 @@ _gcry_sha256_transform_amd64_ssse3: cmp INP, [rsp + _INP_END] jne .Loop0 + pxor xmm0, xmm0 + pxor xmm1, xmm1 + pxor xmm2, xmm2 + pxor xmm3, xmm3 + pxor xmm4, xmm4 + pxor xmm5, xmm5 + pxor xmm6, xmm6 + pxor xmm7, xmm7 + pxor xmm8, xmm8 + pxor xmm9, xmm9 + pxor xmm10, xmm10 + pxor xmm11, xmm11 + pxor xmm12, xmm12 + .Ldone_hash: add rsp, STACK_SIZE diff --git a/cipher/sha512-avx-amd64.S b/cipher/sha512-avx-amd64.S index 691d771..900936e 100644 --- a/cipher/sha512-avx-amd64.S +++ b/cipher/sha512-avx-amd64.S @@ -267,6 +267,8 @@ _gcry_sha512_transform_amd64_avx: cmp msglen, 0 je .Lnowork + vzeroupper + /* Allocate Stack Space */ sub rsp, frame_size @@ -346,6 +348,18 @@ _gcry_sha512_transform_amd64_avx: /* Restore Stack Pointer */ add rsp, frame_size + vzeroupper + pxor xmm0, xmm0 + pxor xmm1, xmm1 + pxor xmm2, xmm2 + pxor xmm3, xmm3 + pxor xmm4, xmm4 + pxor xmm5, xmm5 + pxor xmm6, xmm6 + pxor xmm7, xmm7 + pxor xmm8, xmm8 + pxor xmm9, xmm9 + /* Return stack burn depth */ mov rax, frame_size diff --git a/cipher/sha512-avx2-bmi2-amd64.S b/cipher/sha512-avx2-bmi2-amd64.S index 878c41b..9ece4fd 100644 --- a/cipher/sha512-avx2-bmi2-amd64.S +++ b/cipher/sha512-avx2-bmi2-amd64.S @@ -604,6 +604,8 @@ _gcry_sha512_transform_amd64_avx2: cmp rdx, 0 je .Lnowork + vzeroupper + /* Allocate Stack Space */ mov rax, rsp sub rsp, frame_size @@ -717,6 +719,18 @@ _gcry_sha512_transform_amd64_avx2: /* Restore Stack Pointer */ mov rsp, [rsp + frame_RSPSAVE] + vzeroupper + pxor xmm0, xmm0 + pxor xmm1, xmm1 + pxor xmm2, xmm2 + pxor xmm3, xmm3 + pxor xmm4, xmm4 + pxor xmm5, xmm5 + pxor xmm6, xmm6 + pxor xmm7, xmm7 + pxor xmm8, xmm8 + pxor xmm9, xmm9 + mov rax, frame_size .Lnowork: ret diff --git a/cipher/sha512-ssse3-amd64.S b/cipher/sha512-ssse3-amd64.S index 2a719e5..4c80baa 
100644 --- a/cipher/sha512-ssse3-amd64.S +++ b/cipher/sha512-ssse3-amd64.S @@ -348,6 +348,13 @@ _gcry_sha512_transform_amd64_ssse3: /* Restore Stack Pointer */ add rsp, frame_size + pxor xmm0, xmm0 + pxor xmm1, xmm1 + pxor xmm2, xmm2 + pxor xmm3, xmm3 + pxor xmm4, xmm4 + pxor xmm5, xmm5 + /* Return stack burn depth */ mov rax, frame_size ----------------------------------------------------------------------- Summary of changes: cipher/sha256-ssse3-amd64.S | 14 ++++++++++++++ cipher/sha512-avx-amd64.S | 14 ++++++++++++++ cipher/sha512-avx2-bmi2-amd64.S | 14 ++++++++++++++ cipher/sha512-ssse3-amd64.S | 7 +++++++ 4 files changed, 49 insertions(+) hooks/post-receive -- The GNU crypto library http://git.gnupg.org _______________________________________________ Gnupg-commits mailing list Gnupg-commits at gnupg.org http://lists.gnupg.org/mailman/listinfo/gnupg-commits From cvs at cvs.gnupg.org Fri Dec 13 15:51:09 2013 From: cvs at cvs.gnupg.org (by Jussi Kivilinna) Date: Fri, 13 Dec 2013 15:51:09 +0100 Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.5.0-440-gbe2238f Message-ID: This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "The GNU crypto library". The branch, master has been updated via be2238f68abcc6f2b4e8c38ad9141376ce622a22 (commit) from 04615cc6803cdede25fa92e3ff697e252a23cd7a (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. - Log ----------------------------------------------------------------- commit be2238f68abcc6f2b4e8c38ad9141376ce622a22 Author: Jussi Kivilinna Date: Fri Dec 13 12:47:56 2013 +0200 SHA-1: Add SSSE3 implementation * cipher/Makefile.am: Add 'sha1-ssse3-amd64.c'. * cipher/sha1-ssse3-amd64.c: New. * cipher/sha1.c (USE_SSSE3): New. (SHA1_CONTEXT) [USE_SSSE3]: Add 'use_ssse3'. 
(sha1_init) [USE_SSSE3]: Initialize 'use_ssse3'. (transform): Rename to... (_transform): this. (transform): New. * configure.ac [host=x86_64]: Add 'sha1-ssse3-amd64.lo'. -- Patch adds SSSE3 implementation based on white paper "Improving the Performance of the Secure Hash Algorithm (SHA-1)" at http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1 Benchmarks: cpu Old New Diff Intel i5-4570 9.02 c/B 5.22 c/B 1.72x Intel i5-2450M 12.27 c/B 7.24 c/B 1.69x Intel Core2 T8100 7.94 c/B 6.76 c/B 1.17x Signed-off-by: Jussi Kivilinna diff --git a/cipher/Makefile.am b/cipher/Makefile.am index 7c85af2..0477772 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -77,7 +77,7 @@ salsa20.c salsa20-amd64.S salsa20-armv7-neon.S \ scrypt.c \ seed.c \ serpent.c serpent-sse2-amd64.S serpent-avx2-amd64.S \ -sha1.c \ +sha1.c sha1-ssse3-amd64.c \ sha256.c sha256-ssse3-amd64.S sha256-avx-amd64.S sha256-avx2-bmi2-amd64.S \ sha512.c sha512-ssse3-amd64.S sha512-armv7-neon.S \ stribog.c \ diff --git a/cipher/sha1-ssse3-amd64.c b/cipher/sha1-ssse3-amd64.c new file mode 100644 index 0000000..1342235 --- /dev/null +++ b/cipher/sha1-ssse3-amd64.c @@ -0,0 +1,319 @@ +/* sha1-ssse3-amd64.c - Intel SSSE3 accelerated SHA-1 transform function + * Copyright (C) 2013 Jussi Kivilinna + * + * Based on sha1.c: + * Copyright (C) 1998, 2001, 2002, 2003, 2008 Free Software Foundation, Inc. + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +/* + * Intel SSSE3 accelerated SHA-1 implementation based on white paper: + * "Improving the Performance of the Secure Hash Algorithm (SHA-1)" + * http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1 + */ + +#ifdef __x86_64__ +#include <config.h> + +#if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \ + defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_SSSE3) && defined(USE_SHA1) + +#ifdef HAVE_STDINT_H +# include <stdint.h> /* uintptr_t */ +#elif defined(HAVE_INTTYPES_H) +# include <inttypes.h> +#else +/* In this case, uintptr_t is provided by config.h. */ +#endif + +#include "bithelp.h" + + +/* Helper macro to force alignment to 16 bytes. */ +#ifdef HAVE_GCC_ATTRIBUTE_ALIGNED +# define ATTR_ALIGNED_16 __attribute__ ((aligned (16))) +#else +# define ATTR_ALIGNED_16 +#endif + + +typedef struct +{ + u32 h0,h1,h2,h3,h4; +} SHA1_STATE; + + +/* Round function macros. */ +#define K1 0x5A827999L +#define K2 0x6ED9EBA1L +#define K3 0x8F1BBCDCL +#define K4 0xCA62C1D6L +#define F1(x,y,z) ( z ^ ( x & ( y ^ z ) ) ) +#define F2(x,y,z) ( x ^ y ^ z ) +#define F3(x,y,z) ( ( x & y ) | ( z & ( x | y ) ) ) +#define F4(x,y,z) ( x ^ y ^ z ) +#define R(a,b,c,d,e,f,wk) do { e += rol( a, 5 ) \ + + f( b, c, d ) \ + + wk; \ + b = rol( b, 30 ); \ + } while(0) + +#define WK(i) (wk[i & 15]) + + +static const u32 K_XMM[4][4] ATTR_ALIGNED_16 = + { + { K1, K1, K1, K1 }, + { K2, K2, K2, K2 }, + { K3, K3, K3, K3 }, + { K4, K4, K4, K4 }, + }; +static const u32 bswap_shufb_ctl[4] ATTR_ALIGNED_16 = + { 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f }; + + +/* + * Transform 64 bytes (16 32-bit words) at DATA. + */ +unsigned int +_gcry_sha1_transform_amd64_ssse3 (void *ctx, const unsigned char *data) +{ + SHA1_STATE *state = ctx; + register u32 a, b, c, d, e; /* Local copies of the chaining variables. 
*/ + byte wk_unaligned[4*16+15]; /* The array we work on. */ + u32 *wk = (u32 *)(wk_unaligned + + ((16 - ((uintptr_t)wk_unaligned & 15)) & 15)); + + /* Get the values of the chaining variables. */ + a = state->h0; + b = state->h1; + c = state->h2; + d = state->h3; + e = state->h4; + +#define Wtmp0 "xmm0" +#define Wtmp1 "xmm1" + +#define W0 "xmm2" +#define W1 "xmm3" +#define W2 "xmm4" +#define W3 "xmm5" +#define W4 "xmm6" +#define W5 "xmm7" +#define W6 "xmm8" +#define W7 "xmm9" + +#define BSWAP_REG "xmm10" + + __asm__ volatile ("movdqa %[bswap], %%"BSWAP_REG";\n\t" + :: [bswap] "m" (bswap_shufb_ctl[0])); + +#define W_PRECALC_00_15_0(i, W, tmp0) \ + __asm__ volatile ("movdqu %[data], %%"tmp0";\n\t" \ + ::[data] "m" (*(data+4*(i)))); + +#define W_PRECALC_00_15_1(i, W, tmp0) \ + __asm__ volatile ("pshufb %%"BSWAP_REG", %%"tmp0";\n\t" \ + "movdqa %%"tmp0", %%"W";\n\t" \ + ::: "cc"); + +#define W_PRECALC_00_15_2(i, W, tmp0) \ + __asm__ volatile ("paddd %[k_xmm], %%"tmp0";\n\t" \ + ::[k_xmm] "m" (K_XMM[i / 20][0])); + +#define W_PRECALC_00_15_3(i, W, tmp0) \ + __asm__ volatile ("movdqa %%"tmp0", %[wk];\n\t" \ + :[wk] "=m" (WK(i&~3))); + + /* Precalc 0-15. 
*/ + W_PRECALC_00_15_0(0, W0, Wtmp0); + W_PRECALC_00_15_1(1, W0, Wtmp0); + W_PRECALC_00_15_2(2, W0, Wtmp0); + W_PRECALC_00_15_3(3, W0, Wtmp0); + W_PRECALC_00_15_0(4, W7, Wtmp0); + W_PRECALC_00_15_1(5, W7, Wtmp0); + W_PRECALC_00_15_2(6, W7, Wtmp0); + W_PRECALC_00_15_3(7, W7, Wtmp0); + W_PRECALC_00_15_0(8, W6, Wtmp0); + W_PRECALC_00_15_1(9, W6, Wtmp0); + W_PRECALC_00_15_2(10, W6, Wtmp0); + W_PRECALC_00_15_3(11, W6, Wtmp0); + W_PRECALC_00_15_0(12, W5, Wtmp0); + W_PRECALC_00_15_1(13, W5, Wtmp0); + W_PRECALC_00_15_2(14, W5, Wtmp0); + W_PRECALC_00_15_3(15, W5, Wtmp0); + +#define W_PRECALC_16_31_0(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \ + __asm__ volatile ("movdqa %%"W_m12", %%"W";\n\t" \ + "palignr $8, %%"W_m16", %%"W";\n\t" \ + "movdqa %%"W_m04", %%"tmp0";\n\t" \ + "psrldq $4, %%"tmp0";\n\t" \ + "pxor %%"W_m08", %%"W";\n\t" \ + :::"cc"); + +#define W_PRECALC_16_31_1(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \ + __asm__ volatile ("pxor %%"W_m16", %%"tmp0";\n\t" \ + "pxor %%"tmp0", %%"W";\n\t" \ + "movdqa %%"W", %%"tmp1";\n\t" \ + "movdqa %%"W", %%"tmp0";\n\t" \ + "pslldq $12, %%"tmp1";\n\t" \ + :::"cc"); + +#define W_PRECALC_16_31_2(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \ + __asm__ volatile ("psrld $31, %%"W";\n\t" \ + "pslld $1, %%"tmp0";\n\t" \ + "por %%"W", %%"tmp0";\n\t" \ + "movdqa %%"tmp1", %%"W";\n\t" \ + "psrld $30, %%"tmp1";\n\t" \ + "pslld $2, %%"W";\n\t" \ + :::"cc"); + +#define W_PRECALC_16_31_3(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \ + __asm__ volatile ("pxor %%"W", %%"tmp0";\n\t" \ + "pxor %%"tmp1", %%"tmp0";\n\t" \ + "movdqa %%"tmp0", %%"W";\n\t" \ + "paddd %[k_xmm], %%"tmp0";\n\t" \ + "movdqa %%"tmp0", %[wk];\n\t" \ + : [wk] "=m" (WK(i&~3)) \ + : [k_xmm] "m" (K_XMM[i / 20][0])); + + /* Transform 0-15 + Precalc 16-31. 
*/ + R( a, b, c, d, e, F1, WK( 0) ); W_PRECALC_16_31_0(16, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); + R( e, a, b, c, d, F1, WK( 1) ); W_PRECALC_16_31_1(17, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); + R( d, e, a, b, c, F1, WK( 2) ); W_PRECALC_16_31_2(18, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); + R( c, d, e, a, b, F1, WK( 3) ); W_PRECALC_16_31_3(19, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); + R( b, c, d, e, a, F1, WK( 4) ); W_PRECALC_16_31_0(20, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); + R( a, b, c, d, e, F1, WK( 5) ); W_PRECALC_16_31_1(21, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); + R( e, a, b, c, d, F1, WK( 6) ); W_PRECALC_16_31_2(22, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); + R( d, e, a, b, c, F1, WK( 7) ); W_PRECALC_16_31_3(23, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); + R( c, d, e, a, b, F1, WK( 8) ); W_PRECALC_16_31_0(24, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); + R( b, c, d, e, a, F1, WK( 9) ); W_PRECALC_16_31_1(25, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); + R( a, b, c, d, e, F1, WK(10) ); W_PRECALC_16_31_2(26, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); + R( e, a, b, c, d, F1, WK(11) ); W_PRECALC_16_31_3(27, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); + R( d, e, a, b, c, F1, WK(12) ); W_PRECALC_16_31_0(28, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); + R( c, d, e, a, b, F1, WK(13) ); W_PRECALC_16_31_1(29, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); + R( b, c, d, e, a, F1, WK(14) ); W_PRECALC_16_31_2(30, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); + R( a, b, c, d, e, F1, WK(15) ); W_PRECALC_16_31_3(31, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); + +#define W_PRECALC_32_79_0(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ + __asm__ volatile ("movdqa %%"W_m04", %%"tmp0";\n\t" \ + "pxor %%"W_m28", %%"W";\n\t" \ + "palignr $8, %%"W_m08", %%"tmp0";\n\t" \ + :::"cc"); + +#define W_PRECALC_32_79_1(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ + __asm__ volatile ("pxor %%"W_m16", %%"W";\n\t" \ + "pxor %%"tmp0", %%"W";\n\t" \ + "movdqa %%"W", %%"tmp0";\n\t" \ + :::"cc"); + +#define W_PRECALC_32_79_2(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, 
tmp0) \ + __asm__ volatile ("psrld $30, %%"W";\n\t" \ + "pslld $2, %%"tmp0";\n\t" \ + "por %%"W", %%"tmp0";\n\t" \ + :::"cc"); + +#define W_PRECALC_32_79_3(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ + __asm__ volatile ("movdqa %%"tmp0", %%"W";\n\t" \ + "paddd %[k_xmm], %%"tmp0";\n\t" \ + "movdqa %%"tmp0", %[wk];\n\t" \ + : [wk] "=m" (WK(i&~3)) \ + : [k_xmm] "m" (K_XMM[i / 20][0])); + + /* Transform 16-63 + Precalc 32-79. */ + R( e, a, b, c, d, F1, WK(16) ); W_PRECALC_32_79_0(32, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); + R( d, e, a, b, c, F1, WK(17) ); W_PRECALC_32_79_1(33, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); + R( c, d, e, a, b, F1, WK(18) ); W_PRECALC_32_79_2(34, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); + R( b, c, d, e, a, F1, WK(19) ); W_PRECALC_32_79_3(35, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); + R( a, b, c, d, e, F2, WK(20) ); W_PRECALC_32_79_0(36, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); + R( e, a, b, c, d, F2, WK(21) ); W_PRECALC_32_79_1(37, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); + R( d, e, a, b, c, F2, WK(22) ); W_PRECALC_32_79_2(38, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); + R( c, d, e, a, b, F2, WK(23) ); W_PRECALC_32_79_3(39, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); + R( b, c, d, e, a, F2, WK(24) ); W_PRECALC_32_79_0(40, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); + R( a, b, c, d, e, F2, WK(25) ); W_PRECALC_32_79_1(41, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); + R( e, a, b, c, d, F2, WK(26) ); W_PRECALC_32_79_2(42, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); + R( d, e, a, b, c, F2, WK(27) ); W_PRECALC_32_79_3(43, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); + R( c, d, e, a, b, F2, WK(28) ); W_PRECALC_32_79_0(44, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); + R( b, c, d, e, a, F2, WK(29) ); W_PRECALC_32_79_1(45, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); + R( a, b, c, d, e, F2, WK(30) ); W_PRECALC_32_79_2(46, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); + R( e, a, b, c, d, F2, WK(31) ); W_PRECALC_32_79_3(47, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); + R( d, e, a, 
b, c, F2, WK(32) ); W_PRECALC_32_79_0(48, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); + R( c, d, e, a, b, F2, WK(33) ); W_PRECALC_32_79_1(49, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); + R( b, c, d, e, a, F2, WK(34) ); W_PRECALC_32_79_2(50, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); + R( a, b, c, d, e, F2, WK(35) ); W_PRECALC_32_79_3(51, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); + R( e, a, b, c, d, F2, WK(36) ); W_PRECALC_32_79_0(52, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0); + R( d, e, a, b, c, F2, WK(37) ); W_PRECALC_32_79_1(53, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0); + R( c, d, e, a, b, F2, WK(38) ); W_PRECALC_32_79_2(54, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0); + R( b, c, d, e, a, F2, WK(39) ); W_PRECALC_32_79_3(55, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0); + R( a, b, c, d, e, F3, WK(40) ); W_PRECALC_32_79_0(56, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); + R( e, a, b, c, d, F3, WK(41) ); W_PRECALC_32_79_1(57, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); + R( d, e, a, b, c, F3, WK(42) ); W_PRECALC_32_79_2(58, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); + R( c, d, e, a, b, F3, WK(43) ); W_PRECALC_32_79_3(59, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); + R( b, c, d, e, a, F3, WK(44) ); W_PRECALC_32_79_0(60, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0); + R( a, b, c, d, e, F3, WK(45) ); W_PRECALC_32_79_1(61, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0); + R( e, a, b, c, d, F3, WK(46) ); W_PRECALC_32_79_2(62, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0); + R( d, e, a, b, c, F3, WK(47) ); W_PRECALC_32_79_3(63, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0); + R( c, d, e, a, b, F3, WK(48) ); W_PRECALC_32_79_0(64, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); + R( b, c, d, e, a, F3, WK(49) ); W_PRECALC_32_79_1(65, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); + R( a, b, c, d, e, F3, WK(50) ); W_PRECALC_32_79_2(66, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); + R( e, a, b, c, d, F3, WK(51) ); W_PRECALC_32_79_3(67, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); + R( d, e, a, b, c, F3, WK(52) ); W_PRECALC_32_79_0(68, W7, W0, W1, W2, W3, W4, W5, W6, 
Wtmp0); + R( c, d, e, a, b, F3, WK(53) ); W_PRECALC_32_79_1(69, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); + R( b, c, d, e, a, F3, WK(54) ); W_PRECALC_32_79_2(70, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); + R( a, b, c, d, e, F3, WK(55) ); W_PRECALC_32_79_3(71, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); + R( e, a, b, c, d, F3, WK(56) ); W_PRECALC_32_79_0(72, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); + R( d, e, a, b, c, F3, WK(57) ); W_PRECALC_32_79_1(73, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); + R( c, d, e, a, b, F3, WK(58) ); W_PRECALC_32_79_2(74, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); + R( b, c, d, e, a, F3, WK(59) ); W_PRECALC_32_79_3(75, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); + R( a, b, c, d, e, F4, WK(60) ); W_PRECALC_32_79_0(76, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); + R( e, a, b, c, d, F4, WK(61) ); W_PRECALC_32_79_1(77, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); + R( d, e, a, b, c, F4, WK(62) ); W_PRECALC_32_79_2(78, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); + R( c, d, e, a, b, F4, WK(63) ); W_PRECALC_32_79_3(79, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); + +#define CLEAR_REG(reg) __asm__ volatile ("pxor %%"reg", %%"reg";\n\t":::"cc"); + + /* Transform 64-79 + Clear XMM registers. */ + R( b, c, d, e, a, F4, WK(64) ); CLEAR_REG(BSWAP_REG); + R( a, b, c, d, e, F4, WK(65) ); CLEAR_REG(Wtmp0); + R( e, a, b, c, d, F4, WK(66) ); CLEAR_REG(Wtmp1); + R( d, e, a, b, c, F4, WK(67) ); CLEAR_REG(W0); + R( c, d, e, a, b, F4, WK(68) ); CLEAR_REG(W1); + R( b, c, d, e, a, F4, WK(69) ); CLEAR_REG(W2); + R( a, b, c, d, e, F4, WK(70) ); CLEAR_REG(W3); + R( e, a, b, c, d, F4, WK(71) ); CLEAR_REG(W4); + R( d, e, a, b, c, F4, WK(72) ); CLEAR_REG(W5); + R( c, d, e, a, b, F4, WK(73) ); CLEAR_REG(W6); + R( b, c, d, e, a, F4, WK(74) ); CLEAR_REG(W7); + R( a, b, c, d, e, F4, WK(75) ); + R( e, a, b, c, d, F4, WK(76) ); + R( d, e, a, b, c, F4, WK(77) ); + R( c, d, e, a, b, F4, WK(78) ); + R( b, c, d, e, a, F4, WK(79) ); + + /* Update the chaining variables. 
*/ + state->h0 += a; + state->h1 += b; + state->h2 += c; + state->h3 += d; + state->h4 += e; + + return /* burn_stack */ 84+15; +} + +#endif +#endif diff --git a/cipher/sha1.c b/cipher/sha1.c index 025b3ab..af57b19 100644 --- a/cipher/sha1.c +++ b/cipher/sha1.c @@ -43,6 +43,15 @@ #include "hash-common.h" +/* USE_SSSE3 indicates whether to compile with Intel SSSE3 code. */ +#undef USE_SSSE3 +#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_SSSE3) && \ + defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) +# define USE_SSSE3 1 +#endif + + /* A macro to test whether P is properly aligned for an u32 type. Note that config.h provides a suitable replacement for uintptr_t if it does not exist in stdint.h. */ @@ -56,6 +65,9 @@ typedef struct { gcry_md_block_ctx_t bctx; u32 h0,h1,h2,h3,h4; +#ifdef USE_SSSE3 + unsigned int use_ssse3:1; +#endif } SHA1_CONTEXT; static unsigned int @@ -78,6 +90,10 @@ sha1_init (void *context) hd->bctx.count = 0; hd->bctx.blocksize = 64; hd->bctx.bwrite = transform; + +#ifdef USE_SSSE3 + hd->use_ssse3 = (_gcry_get_hw_features () & HWF_INTEL_SSSE3) != 0; +#endif } @@ -107,7 +123,7 @@ sha1_init (void *context) * Transform NBLOCKS of each 64 bytes (16 32-bit words) at DATA. */ static unsigned int -transform (void *ctx, const unsigned char *data) +_transform (void *ctx, const unsigned char *data) { SHA1_CONTEXT *hd = ctx; const u32 *idata = (const void *)data; @@ -217,6 +233,27 @@ transform (void *ctx, const unsigned char *data) } +#ifdef USE_SSSE3 +unsigned int +_gcry_sha1_transform_amd64_ssse3 (void *state, const unsigned char *data); +#endif + + +static unsigned int +transform (void *ctx, const unsigned char *data) +{ + SHA1_CONTEXT *hd = ctx; + +#ifdef USE_SSSE3 + if (hd->use_ssse3) + return _gcry_sha1_transform_amd64_ssse3 (&hd->h0, data) + + 4 * sizeof(void*); +#endif + + return _transform (hd, data); +} + + /* The routine final terminates the computation and * returns the digest. 
* The handle is prepared for a new cycle, but adding bytes to the diff --git a/configure.ac b/configure.ac index b91240b..9088d02 100644 --- a/configure.ac +++ b/configure.ac @@ -1773,6 +1773,13 @@ GCRYPT_DIGESTS="$GCRYPT_DIGESTS rmd160.lo sha1.lo" AC_DEFINE(USE_RMD160, 1, [Defined if this module should be included]) AC_DEFINE(USE_SHA1, 1, [Defined if this module should be included]) +case "${host}" in + x86_64-*-*) + # Build with the assembly implementation + GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1-ssse3-amd64.lo" + ;; +esac + LIST_MEMBER(scrypt, $enabled_kdfs) if test "$found" = "1" ; then GCRYPT_KDFS="$GCRYPT_KDFS scrypt.lo" ----------------------------------------------------------------------- Summary of changes: cipher/Makefile.am | 2 +- cipher/sha1-ssse3-amd64.c | 319 +++++++++++++++++++++++++++++++++++++++++++++ cipher/sha1.c | 39 +++++- configure.ac | 7 + 4 files changed, 365 insertions(+), 2 deletions(-) create mode 100644 cipher/sha1-ssse3-amd64.c hooks/post-receive -- The GNU crypto library http://git.gnupg.org _______________________________________________ Gnupg-commits mailing list Gnupg-commits at gnupg.org http://lists.gnupg.org/mailman/listinfo/gnupg-commits From yumkam at gmail.com Fri Dec 13 20:12:23 2013 From: yumkam at gmail.com (Yuriy Kaminskiy) Date: Fri, 13 Dec 2013 23:12:23 +0400 Subject: sha1 hash using libgcrypt different from what returns sha1sum In-Reply-To: <87iovwznob.fsf@vigenere.g10code.de> References: <20131111175258.B2868140A04@edrusb.is-a-geek.org> <87iovx1q3j.fsf@vigenere.g10code.de> <87iovwznob.fsf@vigenere.g10code.de> Message-ID: Werner Koch wrote: > On Tue, 12 Nov 2013 22:17, yumkam at gmail.com said: > >> And I'm not sure, but cipher-ccm.c also feels suspicious in this respect (won't >> it fail after SIZE_T_MAX bytes?). > > We need to look at it. 
GCRYCTL_SET_CCM_LENGTHS (and everything below - _gcry_cipher_ccm_set_lengths, gcry_cipher_handle.u_mode.ccm.encryptlen, etc) pass encryptlen (expected size of payload) as size_t, thus on 32-bit platform it's not possible to encrypt more than (2**32) bytes with CCM. According to SP800-38C, maximum payload size for CCM is 2**64 octets (and there are no problem with encrypting more than 4G on 64-bit platforms with libgcrypt). In this case, fixing this problem would also require changing API/ABI (which should be possible, since CCM is not in any released version, so ABI is not set in stone yet?) From jussi.kivilinna at iki.fi Fri Dec 13 21:51:09 2013 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Fri, 13 Dec 2013 22:51:09 +0200 Subject: sha1 hash using libgcrypt different from what returns sha1sum In-Reply-To: References: <20131111175258.B2868140A04@edrusb.is-a-geek.org> <87iovx1q3j.fsf@vigenere.g10code.de> <87iovwznob.fsf@vigenere.g10code.de> Message-ID: <52AB733D.7080302@iki.fi> On 13.12.2013 21:12, Yuriy Kaminskiy wrote: > Werner Koch wrote: >> On Tue, 12 Nov 2013 22:17, yumkam at gmail.com said: >> >>> And I'm not sure, but cipher-ccm.c also feels suspicious in this respect (won't >>> it fail after SIZE_T_MAX bytes?). >> >> We need to look at it. > > GCRYCTL_SET_CCM_LENGTHS (and everything below - _gcry_cipher_ccm_set_lengths, > gcry_cipher_handle.u_mode.ccm.encryptlen, etc) pass encryptlen (expected size of > payload) as size_t, thus on 32-bit platform it's not possible to encrypt more > than (2**32) bytes with CCM. > According to SP800-38C, maximum payload size for CCM is 2**64 octets (and there > are no problem with encrypting more than 4G on 64-bit platforms with libgcrypt). > In this case, fixing this problem would also require changing API/ABI (which > should be possible, since CCM is not in any released version, so ABI is not set > in stone yet?) That's right, size_t for lengths is not enough on 32-bit. Could we use uint64_t instead of size_t? 
-Jussi > > > _______________________________________________ > Gcrypt-devel mailing list > Gcrypt-devel at gnupg.org > http://lists.gnupg.org/mailman/listinfo/gcrypt-devel > From wk at gnupg.org Fri Dec 13 22:06:20 2013 From: wk at gnupg.org (Werner Koch) Date: Fri, 13 Dec 2013 22:06:20 +0100 Subject: tests/basic fails sometimes... In-Reply-To: <52AADCD0.5040409@iki.fi> (Jussi Kivilinna's message of "Fri, 13 Dec 2013 12:09:20 +0200") References: <52AADCD0.5040409@iki.fi> Message-ID: <8738lwo3zn.fsf@vigenere.g10code.de> On Fri, 13 Dec 2013 11:09, jussi.kivilinna at iki.fi said: > 'basic' tests appear to fail sometimes, with following error. I have not seen that. You can add the option --verbose to see which test fails. What platform are you using? Shalom-Salam, Werner -- Die Gedanken sind frei. Ausnahmen regelt ein Bundesgesetz. From wk at gnupg.org Fri Dec 13 22:13:14 2013 From: wk at gnupg.org (Werner Koch) Date: Fri, 13 Dec 2013 22:13:14 +0100 Subject: [PATCH] SHA-1: Add SSSE3 implementation In-Reply-To: <20131213141032.31174.48181.stgit@localhost6.localdomain6> (Jussi Kivilinna's message of "Fri, 13 Dec 2013 16:10:32 +0200") References: <20131213141032.31174.48181.stgit@localhost6.localdomain6> Message-ID: <87vbysmp3p.fsf@vigenere.g10code.de> On Fri, 13 Dec 2013 15:10, jussi.kivilinna at iki.fi said: > cpu Old New Diff > Intel i5-4570 9.02 c/B 5.22 c/B 1.72x > Intel i5-2450M 12.27 c/B 7.24 c/B 1.69x > Intel Core2 T8100 7.94 c/B 6.76 c/B 1.17x I just drafted an article which says For the important SHA-1 algorithm 1.6 improves over 1.5 but is not yet up to what Nettle does. The better improvement is likely due to changes in helper functions (stack burning) and not due to changes in the actual implementation. There is definitely room for improvements. You are a really fast hacker :-). Guess I need to re-run the benchmarks. Yes, please add this. If there is ever a problem with this code the admin can disable it using /etc/gcrypt/hwf.deny. 
Salam-Shalom, Werner -- Die Gedanken sind frei. Ausnahmen regelt ein Bundesgesetz. From cvs at cvs.gnupg.org Fri Dec 13 21:49:36 2013 From: cvs at cvs.gnupg.org (by Jussi Kivilinna) Date: Fri, 13 Dec 2013 21:49:36 +0100 Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.5.0-441-gd2b8532 Message-ID: This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "The GNU crypto library". The branch, master has been updated via d2b853246c2ed056a92096d89c3ca057e45c9c92 (commit) from be2238f68abcc6f2b4e8c38ad9141376ce622a22 (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. - Log ----------------------------------------------------------------- commit d2b853246c2ed056a92096d89c3ca057e45c9c92 Author: Jussi Kivilinna Date: Fri Dec 13 21:07:41 2013 +0200 Convert SHA-1 SSSE3 implementation from mixed asm&C to pure asm * cipher/Makefile.am: Change 'sha1-ssse3-amd64.c' to 'sha1-ssse3-amd64.S'. * cipher/sha1-ssse3-amd64.c: Remove. * cipher/sha1-ssse3-amd64.S: New. -- Mixed C&asm implementation appears to trigger GCC bugs easily. Therefore convert SSSE3 implementation to pure assembly for safety. Benchmarks also show a smallish speed improvement. 
cpu C&asm asm Intel i5-4570 5.22 c/B 5.09 c/B Intel i5-2450M 7.24 c/B 7.00 c/B Signed-off-by: Jussi Kivilinna diff --git a/cipher/Makefile.am b/cipher/Makefile.am index 0477772..7d737e2 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -77,7 +77,7 @@ salsa20.c salsa20-amd64.S salsa20-armv7-neon.S \ scrypt.c \ seed.c \ serpent.c serpent-sse2-amd64.S serpent-avx2-amd64.S \ -sha1.c sha1-ssse3-amd64.c \ +sha1.c sha1-ssse3-amd64.S \ sha256.c sha256-ssse3-amd64.S sha256-avx-amd64.S sha256-avx2-bmi2-amd64.S \ sha512.c sha512-ssse3-amd64.S sha512-armv7-neon.S \ stribog.c \ diff --git a/cipher/sha1-ssse3-amd64.S b/cipher/sha1-ssse3-amd64.S new file mode 100644 index 0000000..5165f3f --- /dev/null +++ b/cipher/sha1-ssse3-amd64.S @@ -0,0 +1,378 @@ +/* sha1-ssse3-amd64.c - Intel SSSE3 accelerated SHA-1 transform function + * Copyright ? 2013 Jussi Kivilinna + * + * Based on sha1.c: + * Copyright (C) 1998, 2001, 2002, 2003, 2008 Free Software Foundation, Inc. + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . 
+ */ + +/* + * Intel SSSE3 accelerated SHA-1 implementation based on white paper: + * "Improving the Performance of the Secure Hash Algorithm (SHA-1)" + * http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1 + */ + +#ifdef __x86_64__ +#include + +#if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \ + defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_SSSE3) && defined(USE_SHA1) + +#ifdef __PIC__ +# define RIP (%rip) +#else +# define RIP +#endif + + +/* Context structure */ + +#define state_h0 0 +#define state_h1 4 +#define state_h2 8 +#define state_h3 12 +#define state_h4 16 + + +/* Constants */ + +.data +#define K1 0x5A827999 +#define K2 0x6ED9EBA1 +#define K3 0x8F1BBCDC +#define K4 0xCA62C1D6 +.align 16 +.LK_XMM: +.LK1: .long K1, K1, K1, K1 +.LK2: .long K2, K2, K2, K2 +.LK3: .long K3, K3, K3, K3 +.LK4: .long K4, K4, K4, K4 + +.Lbswap_shufb_ctl: + .long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f + + +/* Register macros */ + +#define RSTATE %r8 +#define RDATA %r9 +#define ROLDSTACK %r10 + +#define a %eax +#define b %ebx +#define c %ecx +#define d %edx +#define e %edi + +#define RT0 %esi +#define RT1 %ebp + +#define Wtmp0 %xmm0 +#define Wtmp1 %xmm1 + +#define W0 %xmm2 +#define W1 %xmm3 +#define W2 %xmm4 +#define W3 %xmm5 +#define W4 %xmm6 +#define W5 %xmm7 +#define W6 %xmm8 +#define W7 %xmm9 + +#define BSWAP_REG %xmm10 + + +/* Round function macros. 
*/ + +#define WK(i) (((i) & 15) * 4)(%rsp) + +#define R_F1(a,b,c,d,e,i) \ + movl c, RT0; \ + addl WK(i), e; \ + xorl d, RT0; \ + movl a, RT1; \ + andl b, RT0; \ + roll $30, b; \ + xorl d, RT0; \ + leal (RT0,e), e; \ + roll $5, RT1; \ + addl RT1, e; + +#define R_F2(a,b,c,d,e,i) \ + movl c, RT0; \ + addl WK(i), e; \ + xorl b, RT0; \ + roll $30, b; \ + xorl d, RT0; \ + movl a, RT1; \ + leal (RT0,e), e; \ + roll $5, RT1; \ + addl RT1, e; + +#define R_F3(a,b,c,d,e,i) \ + movl c, RT0; \ + movl b, RT1; \ + xorl b, RT0; \ + andl c, RT1; \ + andl d, RT0; \ + addl RT1, e; \ + addl WK(i), e; \ + roll $30, b; \ + movl a, RT1; \ + leal (RT0,e), e; \ + roll $5, RT1; \ + addl RT1, e; + +#define R_F4(a,b,c,d,e,i) R_F2(a,b,c,d,e,i) + +#define R(a,b,c,d,e,f,i) \ + R_##f(a,b,c,d,e,i) + + +/* Input expansion macros. */ + +#define W_PRECALC_00_15_0(i, W, tmp0) \ + movdqu (4*(i))(RDATA), tmp0; + +#define W_PRECALC_00_15_1(i, W, tmp0) \ + pshufb BSWAP_REG, tmp0; \ + movdqa tmp0, W; + +#define W_PRECALC_00_15_2(i, W, tmp0) \ + paddd (.LK_XMM + ((i)/20)*16) RIP, tmp0; + +#define W_PRECALC_00_15_3(i, W, tmp0) \ + movdqa tmp0, WK(i&~3); + +#define W_PRECALC_16_31_0(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \ + movdqa W_m12, W; \ + palignr $8, W_m16, W; \ + movdqa W_m04, tmp0; \ + psrldq $4, tmp0; \ + pxor W_m08, W; + +#define W_PRECALC_16_31_1(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \ + pxor W_m16, tmp0; \ + pxor tmp0, W; \ + movdqa W, tmp1; \ + movdqa W, tmp0; \ + pslldq $12, tmp1; + +#define W_PRECALC_16_31_2(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \ + psrld $31, W; \ + pslld $1, tmp0; \ + por W, tmp0; \ + movdqa tmp1, W; \ + psrld $30, tmp1; \ + pslld $2, W; + +#define W_PRECALC_16_31_3(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \ + pxor W, tmp0; \ + pxor tmp1, tmp0; \ + movdqa tmp0, W; \ + paddd (.LK_XMM + ((i)/20)*16) RIP, tmp0; \ + movdqa tmp0, WK((i)&~3); + +#define W_PRECALC_32_79_0(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ + movdqa W_m04, 
tmp0; \ + pxor W_m28, W; \ + palignr $8, W_m08, tmp0; + +#define W_PRECALC_32_79_1(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ + pxor W_m16, W; \ + pxor tmp0, W; \ + movdqa W, tmp0; + +#define W_PRECALC_32_79_2(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ + psrld $30, W; \ + pslld $2, tmp0; \ + por W, tmp0; + +#define W_PRECALC_32_79_3(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ + movdqa tmp0, W; \ + paddd (.LK_XMM + ((i)/20)*16) RIP, tmp0; \ + movdqa tmp0, WK((i)&~3); + +#define CLEAR_REG(reg) pxor reg, reg; + + +/* + * Transform 64 bytes (16 32-bit words) at DATA. + * + * unsigned int + * _gcry_sha1_transform_amd64_ssse3 (void *ctx, const unsigned char *data) + */ +.text +.globl _gcry_sha1_transform_amd64_ssse3 +.type _gcry_sha1_transform_amd64_ssse3, at function +.align 16 +_gcry_sha1_transform_amd64_ssse3: + /* input: + * %rdi: ctx, CTX + * %rsi: data (64 bytes) + * %rdx: ... + */ + + movq %rdi, RSTATE; + movq %rsi, RDATA; + pushq %rbx; + pushq %rbp; + + movq %rsp, ROLDSTACK; + + subq $(16*4), %rsp; + andq $(~31), %rsp; + + /* Get the values of the chaining variables. */ + movl state_h0(RSTATE), a; + movl state_h1(RSTATE), b; + movl state_h2(RSTATE), c; + movl state_h3(RSTATE), d; + movl state_h4(RSTATE), e; + + movdqa .Lbswap_shufb_ctl RIP, BSWAP_REG; + + /* Precalc 0-15. */ + W_PRECALC_00_15_0(0, W0, Wtmp0); + W_PRECALC_00_15_1(1, W0, Wtmp0); + W_PRECALC_00_15_2(2, W0, Wtmp0); + W_PRECALC_00_15_3(3, W0, Wtmp0); + W_PRECALC_00_15_0(4, W7, Wtmp0); + W_PRECALC_00_15_1(5, W7, Wtmp0); + W_PRECALC_00_15_2(6, W7, Wtmp0); + W_PRECALC_00_15_3(7, W7, Wtmp0); + W_PRECALC_00_15_0(8, W6, Wtmp0); + W_PRECALC_00_15_1(9, W6, Wtmp0); + W_PRECALC_00_15_2(10, W6, Wtmp0); + W_PRECALC_00_15_3(11, W6, Wtmp0); + W_PRECALC_00_15_0(12, W5, Wtmp0); + W_PRECALC_00_15_1(13, W5, Wtmp0); + W_PRECALC_00_15_2(14, W5, Wtmp0); + W_PRECALC_00_15_3(15, W5, Wtmp0); + + /* Transform 0-15 + Precalc 16-31. 
*/ + R( a, b, c, d, e, F1, 0 ); W_PRECALC_16_31_0(16, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); + R( e, a, b, c, d, F1, 1 ); W_PRECALC_16_31_1(17, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); + R( d, e, a, b, c, F1, 2 ); W_PRECALC_16_31_2(18, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); + R( c, d, e, a, b, F1, 3 ); W_PRECALC_16_31_3(19, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); + R( b, c, d, e, a, F1, 4 ); W_PRECALC_16_31_0(20, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); + R( a, b, c, d, e, F1, 5 ); W_PRECALC_16_31_1(21, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); + R( e, a, b, c, d, F1, 6 ); W_PRECALC_16_31_2(22, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); + R( d, e, a, b, c, F1, 7 ); W_PRECALC_16_31_3(23, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); + R( c, d, e, a, b, F1, 8 ); W_PRECALC_16_31_0(24, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); + R( b, c, d, e, a, F1, 9 ); W_PRECALC_16_31_1(25, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); + R( a, b, c, d, e, F1, 10 ); W_PRECALC_16_31_2(26, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); + R( e, a, b, c, d, F1, 11 ); W_PRECALC_16_31_3(27, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); + R( d, e, a, b, c, F1, 12 ); W_PRECALC_16_31_0(28, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); + R( c, d, e, a, b, F1, 13 ); W_PRECALC_16_31_1(29, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); + R( b, c, d, e, a, F1, 14 ); W_PRECALC_16_31_2(30, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); + R( a, b, c, d, e, F1, 15 ); W_PRECALC_16_31_3(31, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); + + /* Transform 16-63 + Precalc 32-79. 
*/ + R( e, a, b, c, d, F1, 16 ); W_PRECALC_32_79_0(32, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); + R( d, e, a, b, c, F1, 17 ); W_PRECALC_32_79_1(33, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); + R( c, d, e, a, b, F1, 18 ); W_PRECALC_32_79_2(34, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); + R( b, c, d, e, a, F1, 19 ); W_PRECALC_32_79_3(35, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); + R( a, b, c, d, e, F2, 20 ); W_PRECALC_32_79_0(36, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); + R( e, a, b, c, d, F2, 21 ); W_PRECALC_32_79_1(37, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); + R( d, e, a, b, c, F2, 22 ); W_PRECALC_32_79_2(38, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); + R( c, d, e, a, b, F2, 23 ); W_PRECALC_32_79_3(39, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); + R( b, c, d, e, a, F2, 24 ); W_PRECALC_32_79_0(40, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); + R( a, b, c, d, e, F2, 25 ); W_PRECALC_32_79_1(41, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); + R( e, a, b, c, d, F2, 26 ); W_PRECALC_32_79_2(42, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); + R( d, e, a, b, c, F2, 27 ); W_PRECALC_32_79_3(43, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); + R( c, d, e, a, b, F2, 28 ); W_PRECALC_32_79_0(44, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); + R( b, c, d, e, a, F2, 29 ); W_PRECALC_32_79_1(45, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); + R( a, b, c, d, e, F2, 30 ); W_PRECALC_32_79_2(46, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); + R( e, a, b, c, d, F2, 31 ); W_PRECALC_32_79_3(47, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); + R( d, e, a, b, c, F2, 32 ); W_PRECALC_32_79_0(48, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); + R( c, d, e, a, b, F2, 33 ); W_PRECALC_32_79_1(49, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); + R( b, c, d, e, a, F2, 34 ); W_PRECALC_32_79_2(50, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); + R( a, b, c, d, e, F2, 35 ); W_PRECALC_32_79_3(51, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); + R( e, a, b, c, d, F2, 36 ); W_PRECALC_32_79_0(52, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0); + R( d, e, a, b, c, F2, 37 ); W_PRECALC_32_79_1(53, W3, W4, W5, 
W6, W7, W0, W1, W2, Wtmp0); + R( c, d, e, a, b, F2, 38 ); W_PRECALC_32_79_2(54, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0); + R( b, c, d, e, a, F2, 39 ); W_PRECALC_32_79_3(55, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0); + R( a, b, c, d, e, F3, 40 ); W_PRECALC_32_79_0(56, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); + R( e, a, b, c, d, F3, 41 ); W_PRECALC_32_79_1(57, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); + R( d, e, a, b, c, F3, 42 ); W_PRECALC_32_79_2(58, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); + R( c, d, e, a, b, F3, 43 ); W_PRECALC_32_79_3(59, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); + R( b, c, d, e, a, F3, 44 ); W_PRECALC_32_79_0(60, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0); + R( a, b, c, d, e, F3, 45 ); W_PRECALC_32_79_1(61, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0); + R( e, a, b, c, d, F3, 46 ); W_PRECALC_32_79_2(62, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0); + R( d, e, a, b, c, F3, 47 ); W_PRECALC_32_79_3(63, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0); + R( c, d, e, a, b, F3, 48 ); W_PRECALC_32_79_0(64, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); + R( b, c, d, e, a, F3, 49 ); W_PRECALC_32_79_1(65, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); + R( a, b, c, d, e, F3, 50 ); W_PRECALC_32_79_2(66, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); + R( e, a, b, c, d, F3, 51 ); W_PRECALC_32_79_3(67, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); + R( d, e, a, b, c, F3, 52 ); W_PRECALC_32_79_0(68, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); + R( c, d, e, a, b, F3, 53 ); W_PRECALC_32_79_1(69, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); + R( b, c, d, e, a, F3, 54 ); W_PRECALC_32_79_2(70, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); + R( a, b, c, d, e, F3, 55 ); W_PRECALC_32_79_3(71, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); + R( e, a, b, c, d, F3, 56 ); W_PRECALC_32_79_0(72, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); + R( d, e, a, b, c, F3, 57 ); W_PRECALC_32_79_1(73, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); + R( c, d, e, a, b, F3, 58 ); W_PRECALC_32_79_2(74, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); + R( b, c, d, e, a, F3, 59 ); 
W_PRECALC_32_79_3(75, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); + R( a, b, c, d, e, F4, 60 ); W_PRECALC_32_79_0(76, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); + R( e, a, b, c, d, F4, 61 ); W_PRECALC_32_79_1(77, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); + R( d, e, a, b, c, F4, 62 ); W_PRECALC_32_79_2(78, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); + R( c, d, e, a, b, F4, 63 ); W_PRECALC_32_79_3(79, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); + + /* Transform 64-79 + Clear XMM registers. */ + R( b, c, d, e, a, F4, 64 ); CLEAR_REG(BSWAP_REG); + R( a, b, c, d, e, F4, 65 ); CLEAR_REG(Wtmp0); + R( e, a, b, c, d, F4, 66 ); CLEAR_REG(Wtmp1); + R( d, e, a, b, c, F4, 67 ); CLEAR_REG(W0); + R( c, d, e, a, b, F4, 68 ); CLEAR_REG(W1); + R( b, c, d, e, a, F4, 69 ); CLEAR_REG(W2); + R( a, b, c, d, e, F4, 70 ); CLEAR_REG(W3); + R( e, a, b, c, d, F4, 71 ); CLEAR_REG(W4); + R( d, e, a, b, c, F4, 72 ); CLEAR_REG(W5); + R( c, d, e, a, b, F4, 73 ); CLEAR_REG(W6); + R( b, c, d, e, a, F4, 74 ); CLEAR_REG(W7); + R( a, b, c, d, e, F4, 75 ); + R( e, a, b, c, d, F4, 76 ); + R( d, e, a, b, c, F4, 77 ); + R( c, d, e, a, b, F4, 78 ); + R( b, c, d, e, a, F4, 79 ); + + /* Update the chaining variables. */ + addl state_h0(RSTATE), a; + addl state_h1(RSTATE), b; + addl state_h2(RSTATE), c; + addl state_h3(RSTATE), d; + addl state_h4(RSTATE), e; + + movl a, state_h0(RSTATE); + movl b, state_h1(RSTATE); + movl c, state_h2(RSTATE); + movl d, state_h3(RSTATE); + movl e, state_h4(RSTATE); + + movq ROLDSTACK, %rsp; + + popq %rbp; + popq %rbx; + + /* burn_stack */ + movl $(16*4 + 2*8 + 31), %eax; + + ret; + +#endif +#endif diff --git a/cipher/sha1-ssse3-amd64.c b/cipher/sha1-ssse3-amd64.c deleted file mode 100644 index 1342235..0000000 --- a/cipher/sha1-ssse3-amd64.c +++ /dev/null @@ -1,319 +0,0 @@ -/* sha1-ssse3-amd64.c - Intel SSSE3 accelerated SHA-1 transform function - * Copyright ? 2013 Jussi Kivilinna - * - * Based on sha1.c: - * Copyright (C) 1998, 2001, 2002, 2003, 2008 Free Software Foundation, Inc. 
- * - * This file is part of Libgcrypt. - * - * Libgcrypt is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as - * published by the Free Software Foundation; either version 2.1 of - * the License, or (at your option) any later version. - * - * Libgcrypt is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this program; if not, see . - */ - -/* - * Intel SSSE3 accelerated SHA-1 implementation based on white paper: - * "Improving the Performance of the Secure Hash Algorithm (SHA-1)" - * http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1 - */ - -#ifdef __x86_64__ -#include - -#if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \ - defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \ - defined(HAVE_GCC_INLINE_ASM_SSSE3) && defined(USE_SHA1) - -#ifdef HAVE_STDINT_H -# include /* uintptr_t */ -#elif defined(HAVE_INTTYPES_H) -# include -#else -/* In this case, uintptr_t is provided by config.h. */ -#endif - -#include "bithelp.h" - - -/* Helper macro to force alignment to 16 bytes. */ -#ifdef HAVE_GCC_ATTRIBUTE_ALIGNED -# define ATTR_ALIGNED_16 __attribute__ ((aligned (16))) -#else -# define ATTR_ALIGNED_16 -#endif - - -typedef struct -{ - u32 h0,h1,h2,h3,h4; -} SHA1_STATE; - - -/* Round function macros. 
*/ -#define K1 0x5A827999L -#define K2 0x6ED9EBA1L -#define K3 0x8F1BBCDCL -#define K4 0xCA62C1D6L -#define F1(x,y,z) ( z ^ ( x & ( y ^ z ) ) ) -#define F2(x,y,z) ( x ^ y ^ z ) -#define F3(x,y,z) ( ( x & y ) | ( z & ( x | y ) ) ) -#define F4(x,y,z) ( x ^ y ^ z ) -#define R(a,b,c,d,e,f,wk) do { e += rol( a, 5 ) \ - + f( b, c, d ) \ - + wk; \ - b = rol( b, 30 ); \ - } while(0) - -#define WK(i) (wk[i & 15]) - - -static const u32 K_XMM[4][4] ATTR_ALIGNED_16 = - { - { K1, K1, K1, K1 }, - { K2, K2, K2, K2 }, - { K3, K3, K3, K3 }, - { K4, K4, K4, K4 }, - }; -static const u32 bswap_shufb_ctl[4] ATTR_ALIGNED_16 = - { 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f }; - - -/* - * Transform 64 bytes (16 32-bit words) at DATA. - */ -unsigned int -_gcry_sha1_transform_amd64_ssse3 (void *ctx, const unsigned char *data) -{ - SHA1_STATE *state = ctx; - register u32 a, b, c, d, e; /* Local copies of the chaining variables. */ - byte wk_unaligned[4*16+15]; /* The array we work on. */ - u32 *wk = (u32 *)(wk_unaligned - + ((16 - ((uintptr_t)wk_unaligned & 15)) & 15)); - - /* Get the values of the chaining variables. 
*/ - a = state->h0; - b = state->h1; - c = state->h2; - d = state->h3; - e = state->h4; - -#define Wtmp0 "xmm0" -#define Wtmp1 "xmm1" - -#define W0 "xmm2" -#define W1 "xmm3" -#define W2 "xmm4" -#define W3 "xmm5" -#define W4 "xmm6" -#define W5 "xmm7" -#define W6 "xmm8" -#define W7 "xmm9" - -#define BSWAP_REG "xmm10" - - __asm__ volatile ("movdqa %[bswap], %%"BSWAP_REG";\n\t" - :: [bswap] "m" (bswap_shufb_ctl[0])); - -#define W_PRECALC_00_15_0(i, W, tmp0) \ - __asm__ volatile ("movdqu %[data], %%"tmp0";\n\t" \ - ::[data] "m" (*(data+4*(i)))); - -#define W_PRECALC_00_15_1(i, W, tmp0) \ - __asm__ volatile ("pshufb %%"BSWAP_REG", %%"tmp0";\n\t" \ - "movdqa %%"tmp0", %%"W";\n\t" \ - ::: "cc"); - -#define W_PRECALC_00_15_2(i, W, tmp0) \ - __asm__ volatile ("paddd %[k_xmm], %%"tmp0";\n\t" \ - ::[k_xmm] "m" (K_XMM[i / 20][0])); - -#define W_PRECALC_00_15_3(i, W, tmp0) \ - __asm__ volatile ("movdqa %%"tmp0", %[wk];\n\t" \ - :[wk] "=m" (WK(i&~3))); - - /* Precalc 0-15. */ - W_PRECALC_00_15_0(0, W0, Wtmp0); - W_PRECALC_00_15_1(1, W0, Wtmp0); - W_PRECALC_00_15_2(2, W0, Wtmp0); - W_PRECALC_00_15_3(3, W0, Wtmp0); - W_PRECALC_00_15_0(4, W7, Wtmp0); - W_PRECALC_00_15_1(5, W7, Wtmp0); - W_PRECALC_00_15_2(6, W7, Wtmp0); - W_PRECALC_00_15_3(7, W7, Wtmp0); - W_PRECALC_00_15_0(8, W6, Wtmp0); - W_PRECALC_00_15_1(9, W6, Wtmp0); - W_PRECALC_00_15_2(10, W6, Wtmp0); - W_PRECALC_00_15_3(11, W6, Wtmp0); - W_PRECALC_00_15_0(12, W5, Wtmp0); - W_PRECALC_00_15_1(13, W5, Wtmp0); - W_PRECALC_00_15_2(14, W5, Wtmp0); - W_PRECALC_00_15_3(15, W5, Wtmp0); - -#define W_PRECALC_16_31_0(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \ - __asm__ volatile ("movdqa %%"W_m12", %%"W";\n\t" \ - "palignr $8, %%"W_m16", %%"W";\n\t" \ - "movdqa %%"W_m04", %%"tmp0";\n\t" \ - "psrldq $4, %%"tmp0";\n\t" \ - "pxor %%"W_m08", %%"W";\n\t" \ - :::"cc"); - -#define W_PRECALC_16_31_1(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \ - __asm__ volatile ("pxor %%"W_m16", %%"tmp0";\n\t" \ - "pxor %%"tmp0", %%"W";\n\t" \ - "movdqa 
%%"W", %%"tmp1";\n\t" \ - "movdqa %%"W", %%"tmp0";\n\t" \ - "pslldq $12, %%"tmp1";\n\t" \ - :::"cc"); - -#define W_PRECALC_16_31_2(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \ - __asm__ volatile ("psrld $31, %%"W";\n\t" \ - "pslld $1, %%"tmp0";\n\t" \ - "por %%"W", %%"tmp0";\n\t" \ - "movdqa %%"tmp1", %%"W";\n\t" \ - "psrld $30, %%"tmp1";\n\t" \ - "pslld $2, %%"W";\n\t" \ - :::"cc"); - -#define W_PRECALC_16_31_3(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \ - __asm__ volatile ("pxor %%"W", %%"tmp0";\n\t" \ - "pxor %%"tmp1", %%"tmp0";\n\t" \ - "movdqa %%"tmp0", %%"W";\n\t" \ - "paddd %[k_xmm], %%"tmp0";\n\t" \ - "movdqa %%"tmp0", %[wk];\n\t" \ - : [wk] "=m" (WK(i&~3)) \ - : [k_xmm] "m" (K_XMM[i / 20][0])); - - /* Transform 0-15 + Precalc 16-31. */ - R( a, b, c, d, e, F1, WK( 0) ); W_PRECALC_16_31_0(16, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); - R( e, a, b, c, d, F1, WK( 1) ); W_PRECALC_16_31_1(17, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); - R( d, e, a, b, c, F1, WK( 2) ); W_PRECALC_16_31_2(18, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); - R( c, d, e, a, b, F1, WK( 3) ); W_PRECALC_16_31_3(19, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); - R( b, c, d, e, a, F1, WK( 4) ); W_PRECALC_16_31_0(20, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); - R( a, b, c, d, e, F1, WK( 5) ); W_PRECALC_16_31_1(21, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); - R( e, a, b, c, d, F1, WK( 6) ); W_PRECALC_16_31_2(22, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); - R( d, e, a, b, c, F1, WK( 7) ); W_PRECALC_16_31_3(23, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); - R( c, d, e, a, b, F1, WK( 8) ); W_PRECALC_16_31_0(24, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); - R( b, c, d, e, a, F1, WK( 9) ); W_PRECALC_16_31_1(25, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); - R( a, b, c, d, e, F1, WK(10) ); W_PRECALC_16_31_2(26, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); - R( e, a, b, c, d, F1, WK(11) ); W_PRECALC_16_31_3(27, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); - R( d, e, a, b, c, F1, WK(12) ); W_PRECALC_16_31_0(28, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); - R( c, d, e, a, b, F1, WK(13) ); 
W_PRECALC_16_31_1(29, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); - R( b, c, d, e, a, F1, WK(14) ); W_PRECALC_16_31_2(30, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); - R( a, b, c, d, e, F1, WK(15) ); W_PRECALC_16_31_3(31, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); - -#define W_PRECALC_32_79_0(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ - __asm__ volatile ("movdqa %%"W_m04", %%"tmp0";\n\t" \ - "pxor %%"W_m28", %%"W";\n\t" \ - "palignr $8, %%"W_m08", %%"tmp0";\n\t" \ - :::"cc"); - -#define W_PRECALC_32_79_1(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ - __asm__ volatile ("pxor %%"W_m16", %%"W";\n\t" \ - "pxor %%"tmp0", %%"W";\n\t" \ - "movdqa %%"W", %%"tmp0";\n\t" \ - :::"cc"); - -#define W_PRECALC_32_79_2(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ - __asm__ volatile ("psrld $30, %%"W";\n\t" \ - "pslld $2, %%"tmp0";\n\t" \ - "por %%"W", %%"tmp0";\n\t" \ - :::"cc"); - -#define W_PRECALC_32_79_3(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ - __asm__ volatile ("movdqa %%"tmp0", %%"W";\n\t" \ - "paddd %[k_xmm], %%"tmp0";\n\t" \ - "movdqa %%"tmp0", %[wk];\n\t" \ - : [wk] "=m" (WK(i&~3)) \ - : [k_xmm] "m" (K_XMM[i / 20][0])); - - /* Transform 16-63 + Precalc 32-79. 
*/ - R( e, a, b, c, d, F1, WK(16) ); W_PRECALC_32_79_0(32, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); - R( d, e, a, b, c, F1, WK(17) ); W_PRECALC_32_79_1(33, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); - R( c, d, e, a, b, F1, WK(18) ); W_PRECALC_32_79_2(34, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); - R( b, c, d, e, a, F1, WK(19) ); W_PRECALC_32_79_3(35, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); - R( a, b, c, d, e, F2, WK(20) ); W_PRECALC_32_79_0(36, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); - R( e, a, b, c, d, F2, WK(21) ); W_PRECALC_32_79_1(37, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); - R( d, e, a, b, c, F2, WK(22) ); W_PRECALC_32_79_2(38, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); - R( c, d, e, a, b, F2, WK(23) ); W_PRECALC_32_79_3(39, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); - R( b, c, d, e, a, F2, WK(24) ); W_PRECALC_32_79_0(40, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); - R( a, b, c, d, e, F2, WK(25) ); W_PRECALC_32_79_1(41, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); - R( e, a, b, c, d, F2, WK(26) ); W_PRECALC_32_79_2(42, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); - R( d, e, a, b, c, F2, WK(27) ); W_PRECALC_32_79_3(43, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); - R( c, d, e, a, b, F2, WK(28) ); W_PRECALC_32_79_0(44, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); - R( b, c, d, e, a, F2, WK(29) ); W_PRECALC_32_79_1(45, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); - R( a, b, c, d, e, F2, WK(30) ); W_PRECALC_32_79_2(46, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); - R( e, a, b, c, d, F2, WK(31) ); W_PRECALC_32_79_3(47, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); - R( d, e, a, b, c, F2, WK(32) ); W_PRECALC_32_79_0(48, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); - R( c, d, e, a, b, F2, WK(33) ); W_PRECALC_32_79_1(49, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); - R( b, c, d, e, a, F2, WK(34) ); W_PRECALC_32_79_2(50, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); - R( a, b, c, d, e, F2, WK(35) ); W_PRECALC_32_79_3(51, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); - R( e, a, b, c, d, F2, WK(36) ); W_PRECALC_32_79_0(52, W3, W4, W5, W6, W7, 
W0, W1, W2, Wtmp0); - R( d, e, a, b, c, F2, WK(37) ); W_PRECALC_32_79_1(53, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0); - R( c, d, e, a, b, F2, WK(38) ); W_PRECALC_32_79_2(54, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0); - R( b, c, d, e, a, F2, WK(39) ); W_PRECALC_32_79_3(55, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0); - R( a, b, c, d, e, F3, WK(40) ); W_PRECALC_32_79_0(56, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); - R( e, a, b, c, d, F3, WK(41) ); W_PRECALC_32_79_1(57, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); - R( d, e, a, b, c, F3, WK(42) ); W_PRECALC_32_79_2(58, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); - R( c, d, e, a, b, F3, WK(43) ); W_PRECALC_32_79_3(59, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); - R( b, c, d, e, a, F3, WK(44) ); W_PRECALC_32_79_0(60, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0); - R( a, b, c, d, e, F3, WK(45) ); W_PRECALC_32_79_1(61, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0); - R( e, a, b, c, d, F3, WK(46) ); W_PRECALC_32_79_2(62, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0); - R( d, e, a, b, c, F3, WK(47) ); W_PRECALC_32_79_3(63, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0); - R( c, d, e, a, b, F3, WK(48) ); W_PRECALC_32_79_0(64, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); - R( b, c, d, e, a, F3, WK(49) ); W_PRECALC_32_79_1(65, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); - R( a, b, c, d, e, F3, WK(50) ); W_PRECALC_32_79_2(66, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); - R( e, a, b, c, d, F3, WK(51) ); W_PRECALC_32_79_3(67, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); - R( d, e, a, b, c, F3, WK(52) ); W_PRECALC_32_79_0(68, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); - R( c, d, e, a, b, F3, WK(53) ); W_PRECALC_32_79_1(69, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); - R( b, c, d, e, a, F3, WK(54) ); W_PRECALC_32_79_2(70, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); - R( a, b, c, d, e, F3, WK(55) ); W_PRECALC_32_79_3(71, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); - R( e, a, b, c, d, F3, WK(56) ); W_PRECALC_32_79_0(72, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); - R( d, e, a, b, c, F3, WK(57) ); W_PRECALC_32_79_1(73, W6, 
W7, W0, W1, W2, W3, W4, W5, Wtmp0); - R( c, d, e, a, b, F3, WK(58) ); W_PRECALC_32_79_2(74, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); - R( b, c, d, e, a, F3, WK(59) ); W_PRECALC_32_79_3(75, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); - R( a, b, c, d, e, F4, WK(60) ); W_PRECALC_32_79_0(76, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); - R( e, a, b, c, d, F4, WK(61) ); W_PRECALC_32_79_1(77, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); - R( d, e, a, b, c, F4, WK(62) ); W_PRECALC_32_79_2(78, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); - R( c, d, e, a, b, F4, WK(63) ); W_PRECALC_32_79_3(79, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); - -#define CLEAR_REG(reg) __asm__ volatile ("pxor %%"reg", %%"reg";\n\t":::"cc"); - - /* Transform 64-79 + Clear XMM registers. */ - R( b, c, d, e, a, F4, WK(64) ); CLEAR_REG(BSWAP_REG); - R( a, b, c, d, e, F4, WK(65) ); CLEAR_REG(Wtmp0); - R( e, a, b, c, d, F4, WK(66) ); CLEAR_REG(Wtmp1); - R( d, e, a, b, c, F4, WK(67) ); CLEAR_REG(W0); - R( c, d, e, a, b, F4, WK(68) ); CLEAR_REG(W1); - R( b, c, d, e, a, F4, WK(69) ); CLEAR_REG(W2); - R( a, b, c, d, e, F4, WK(70) ); CLEAR_REG(W3); - R( e, a, b, c, d, F4, WK(71) ); CLEAR_REG(W4); - R( d, e, a, b, c, F4, WK(72) ); CLEAR_REG(W5); - R( c, d, e, a, b, F4, WK(73) ); CLEAR_REG(W6); - R( b, c, d, e, a, F4, WK(74) ); CLEAR_REG(W7); - R( a, b, c, d, e, F4, WK(75) ); - R( e, a, b, c, d, F4, WK(76) ); - R( d, e, a, b, c, F4, WK(77) ); - R( c, d, e, a, b, F4, WK(78) ); - R( b, c, d, e, a, F4, WK(79) ); - - /* Update the chaining variables. 
*/ - state->h0 += a; - state->h1 += b; - state->h2 += c; - state->h3 += d; - state->h4 += e; - - return /* burn_stack */ 84+15; -} - -#endif -#endif ----------------------------------------------------------------------- Summary of changes: cipher/Makefile.am | 2 +- cipher/sha1-ssse3-amd64.S | 378 +++++++++++++++++++++++++++++++++++++++++++++ cipher/sha1-ssse3-amd64.c | 319 -------------------------------------- 3 files changed, 379 insertions(+), 320 deletions(-) create mode 100644 cipher/sha1-ssse3-amd64.S delete mode 100644 cipher/sha1-ssse3-amd64.c hooks/post-receive -- The GNU crypto library http://git.gnupg.org _______________________________________________ Gnupg-commits mailing list Gnupg-commits at gnupg.org http://lists.gnupg.org/mailman/listinfo/gnupg-commits From jussi.kivilinna at iki.fi Fri Dec 13 23:44:53 2013 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 14 Dec 2013 00:44:53 +0200 Subject: tests/basic fails sometimes... In-Reply-To: <8738lwo3zn.fsf@vigenere.g10code.de> References: <52AADCD0.5040409@iki.fi> <8738lwo3zn.fsf@vigenere.g10code.de> Message-ID: <52AB8DE5.9060409@iki.fi> On 13.12.2013 23:06, Werner Koch wrote: > On Fri, 13 Dec 2013 11:09, jussi.kivilinna at iki.fi said: > >> 'basic' tests appear to fail sometimes, with following error. > > I have not seen that. You can add the option --verbose to see which > test fails. Here's few outputs... 
On x86-64 Ubuntu 13.10: encryption/decryption test 12 (algo 1) data: (data (flags oaep) (value #11223344556677889900AA#) ) ciph: (enc-val (flags pkcs1) (rsa (a #3285AB8496E0AFCAF101A3D37B7ACE1EA4D70E1ED01084E95259539A4AD87C17E005A876ED72A6809F76FC46A18D13B9A50E06C7FE8BE41E0C58724A926653C9CA1B23AF645DE9A01F0F0C3C91AD79F64E09868587E3A2D942F95C0E6529AA83ADE0DFE36CA7C242F1AEF04E6F8918BA3D3CB6C6835609C24FCC0D9FA2B0C304#) ) ) key: (private-key (rsa (n #00C02CCF63987B270C20B6C35D3164C485110087C7CFE2A9E24E6D88AF04762D5B62400EE4F7384DA222224D3DAE1F1A76A763561FBC7F9D17293BD86F1DDC25C43574290C2BE1C2FAFE52E8A2031D7FE8E9E01B87A1EDA67C8F4883C54F514243AF73A3CD1507454A618420637C3108F92F88134D2DA71D02753B108660DC2419#) (e #010001#) (d #097803C171654B059DD487807F632ACC302BE5A4CE9D4957D75BAE070A9A1C8EC2C2EB80F45DEBB454087AE411C0E253CB9859495FC72D69335CC9A3A2F5E28DE7ED8C3EA494614ACE35BAD2D253A575A9FBB1F20AD1663470E34561D77E3D2007F3AD7842A527A25EBEC93FA8C930E4DF20A063650E380B782D35AEE9DAF371#) (p #00C0A09C6F81CF6C3A5A620BEE1B4CA31A019CAA9216BC02D6E87106DE3AD14CAB3FEDB87FFEA6457367FF894676F2A3363D70B01E1B5A45BD019AF4A6285464B1#) (q #00FF661A028063A1174FB6F25266A4E3CCBDD7406C03AE948C1B582CD0051A80D8B1D2AFCC3387C62C305111D7391579BE9F9009BF0698FADA35C39DEEAA902FE9#) (u #008896DA905EB7D39DC940DB3CFB261970B388735D679A19E04B5F3805220BF5B3F550AA2A4432011BE060AE20145E7D76226AAA5A2CBE0E7997F6B5303B807F33#) ) ) gcry_pk_decrypt failed: expected 155 (Encoding problem), got 0 (Success) gcry_pk_encrypt/gcry_pk_decrypt do not roundtrip On i386 Debian 7.2: encryption/decryption test 12 (algo 1) data: (data (flags oaep) (value #11223344556677889900AA#) ) ciph: (enc-val (flags pkcs1) (rsa (a #98517CDB7942C179BAFDC77EC9FF60801F2DA70059F46B19BEFACC1EA2945388892A4AB22A3F16EF306F1D7B997AB81823D471445B50F2E46A8FD1DCE912BCC0CC0A3901A37D66580665788544C87FF176F6427D6D0A92046F0933477C2C6E2D7C5939C56E2C38A4998825BF3754351DC944B5000742F51CC14307C4334629C8#) ) ) key: (private-key (rsa (n 
#00B910CA8FC0B4D48F7BF395B1F1CC10597F7FB14FF13F3E09FA36EFBDEB5B6061A6A8DCA77287B5707AA7E51D199E5E00536C9091D9CEF641AF8389502725B9AF383760789E26E0EC05094254D44E3431FDAFA9DA63AC22AEB07DD0F864BB34BE423D0A18E51EEA93A5A1AA26B7B1CC976B9C9E2A025D2F8A20F012B2CB12EBE9#) (e #010001#) (d #3345B8A91D1DBC6A485013202ABA7BA9200ED3A42F55732DB679F3EFED3DF2B4FB447B6594F08E4F2C5C49B8C7393E17DF050C04AC908F5F616E12E3F70497ADBFF88CA2A2BC3569E93108918F386C9E5D1A434BDD11C9EEA644F6DF4E1997C5D6E0207C52E9CD0A5EFD7A899D42D5F2C97D13CD9FBB75E8669F213DBAD137C5#) (p #00CA2C03EA8833365BBB20730895F802D34533B77B3D520D824A800C18D194AF32C8CD6787A32EA88241D5363DE24287FB45807718C1AB3166787DDBCDAE0B88EF#) (q #00EA56CFE995AA3C3C3FA8F8C4FA72F5241993AB9040987192DE6EB751F65E4935432EC2620FD05DC1A4E20465B31184E867D07A53E0D1B89A9045C68A41CCE8A7#) (u #1CF4F9CF570F57292385912FA5A7EC16B519B92CF680935C3793ABD24C885969846D6981C22A37007AA9BC266431C705878EDEA8B65D86FD592BAC4899A9CFA7#) ) ) gcry_pk_decrypt failed: expected 155 (Encoding problem), got 0 (Success) gcry_pk_encrypt/gcry_pk_decrypt do not roundtrip > What platform are you using? 
So far I have tested: - x86-64 machines running 64-bit Ubuntu 13.10 and 12.04 - mingw32 build on Ubuntu 13.10 - i386 build on Ubuntu 13.10 - "--disable-asm" with x86-64 Ubuntu 13.10 - i386 machine with Debian 7.2 - armhf cross-compile build on Ubuntu 13.10 and run tests/basic with QEMU -Jussi > > > Shalom-Salam, > > Werner > From jussi.kivilinna at iki.fi Fri Dec 13 23:49:34 2013 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 14 Dec 2013 00:49:34 +0200 Subject: [PATCH] SHA-1: Add SSSE3 implementation In-Reply-To: <87vbysmp3p.fsf@vigenere.g10code.de> References: <20131213141032.31174.48181.stgit@localhost6.localdomain6> <87vbysmp3p.fsf@vigenere.g10code.de> Message-ID: <52AB8EFE.6040905@iki.fi> On 13.12.2013 23:13, Werner Koch wrote: > On Fri, 13 Dec 2013 15:10, jussi.kivilinna at iki.fi said: > >> cpu Old New Diff >> Intel i5-4570 9.02 c/B 5.22 c/B 1.72x >> Intel i5-2450M 12.27 c/B 7.24 c/B 1.69x >> Intel Core2 T8100 7.94 c/B 6.76 c/B 1.17x > > I justed drafted an article which says > > For the important SHA-1 algorithm 1.6 improves over 1.5 but is not > yet up to what Nettle does. The better improvement is likely due to > changes in helper functions (stack burning) and not due to changes in > the actual implementation. There is definitely room for > improvements. > > You are a really fast hacker :-). Guess I need to re-run the > benchmarks. Yes, please add this. If there is ever a problem with this > code the admin can disable it using /etc/gcrypt/hwf.deny. Heh.. and I now have hacked up BMI2/AVX optimized SHA-1 which does 3.85 c/B on Intel Haswell. But lets leave these for 1.7, because of dependency on largeish 'bulk hash transform' patch. -Jussi > > > Salam-Shalom, > > Werner > From cvs at cvs.gnupg.org Sat Dec 14 00:16:38 2013 From: cvs at cvs.gnupg.org (by Jussi Kivilinna) Date: Sat, 14 Dec 2013 00:16:38 +0100 Subject: [git] GCRYPT - branch, master, updated. 
libgcrypt-1.5.0-442-gc86c355 Message-ID: This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "The GNU crypto library". The branch, master has been updated via c86c35534a153b13e880d0bb0ea3e48e1c0ecaf9 (commit) from d2b853246c2ed056a92096d89c3ca057e45c9c92 (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. - Log ----------------------------------------------------------------- commit c86c35534a153b13e880d0bb0ea3e48e1c0ecaf9 Author: Jussi Kivilinna Date: Sat Dec 14 01:11:32 2013 +0200 SHA-1/SSSE3: Do not check for Intel syntax assembly support * cipher/sha1-ssse3-amd64.S: Remove check for HAVE_INTEL_SYNTAX_PLATFORM_AS. * cipher/sha1.c [USE_SSSE3]: Ditto. -- SHA-1 SSSE3 implementation uses AT&T syntax so check for HAVE_INTEL_SYNTAX_PLATFORM_AS is unnecessary. Signed-off-by: Jussi Kivilinna diff --git a/cipher/sha1-ssse3-amd64.S b/cipher/sha1-ssse3-amd64.S index 5165f3f..cf8c079 100644 --- a/cipher/sha1-ssse3-amd64.S +++ b/cipher/sha1-ssse3-amd64.S @@ -30,7 +30,6 @@ #include #if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \ - defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \ defined(HAVE_GCC_INLINE_ASM_SSSE3) && defined(USE_SHA1) #ifdef __PIC__ diff --git a/cipher/sha1.c b/cipher/sha1.c index af57b19..18b6daa 100644 --- a/cipher/sha1.c +++ b/cipher/sha1.c @@ -46,8 +46,7 @@ /* USE_SSSE3 indicates whether to compile with Intel SSSE3 code. 
*/ #undef USE_SSSE3 #if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \ - defined(HAVE_GCC_INLINE_ASM_SSSE3) && \ - defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) + defined(HAVE_GCC_INLINE_ASM_SSSE3) # define USE_SSSE3 1 #endif ----------------------------------------------------------------------- Summary of changes: cipher/sha1-ssse3-amd64.S | 1 - cipher/sha1.c | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) hooks/post-receive -- The GNU crypto library http://git.gnupg.org _______________________________________________ Gnupg-commits mailing list Gnupg-commits at gnupg.org http://lists.gnupg.org/mailman/listinfo/gnupg-commits From wk at gnupg.org Sat Dec 14 00:54:58 2013 From: wk at gnupg.org (Werner Koch) Date: Sat, 14 Dec 2013 00:54:58 +0100 Subject: sha1 hash using libgcrypt different from what returns sha1sum In-Reply-To: <52AB733D.7080302@iki.fi> (Jussi Kivilinna's message of "Fri, 13 Dec 2013 22:51:09 +0200") References: <20131111175258.B2868140A04@edrusb.is-a-geek.org> <87iovx1q3j.fsf@vigenere.g10code.de> <87iovwznob.fsf@vigenere.g10code.de> <52AB733D.7080302@iki.fi> Message-ID: <878uvomhm5.fsf@vigenere.g10code.de> On Fri, 13 Dec 2013 21:51, jussi.kivilinna at iki.fi said: > That's right, size_t for lengths is not enough on 32-bit. Could we use uint64_t > instead of size_t? uint64_t is not really portable because C99 is not deployed everywhere. However, this is a minor issue because we could enable CCM only if we have that type. We do the same for some algorithm (TIGER comes to mind). We could document a certain limit on CCM or we look for a more general solution. In Windows this has traditionally be solved using unions and structs. It's not nice API, though. off_t would be a natural choice but it has problems as well. For example the two defined ABI variants and I have not checked whether it is suitable for a size_t replacement (I guess not). What do we we need to change in the API to correctly support it. 
What side-effects will we have if we use uint64_t and provide the interfaces only if uint64 is defined? Shalom-Salam, Werner -- Die Gedanken sind frei. Ausnahmen regelt ein Bundesgesetz. From wk at gnupg.org Sat Dec 14 00:58:06 2013 From: wk at gnupg.org (Werner Koch) Date: Sat, 14 Dec 2013 00:58:06 +0100 Subject: tests/basic fails sometimes... In-Reply-To: <52AB8DE5.9060409@iki.fi> (Jussi Kivilinna's message of "Sat, 14 Dec 2013 00:44:53 +0200") References: <52AADCD0.5040409@iki.fi> <8738lwo3zn.fsf@vigenere.g10code.de> <52AB8DE5.9060409@iki.fi> Message-ID: <874n6cmhgx.fsf@vigenere.g10code.de> On Fri, 13 Dec 2013 23:44, jussi.kivilinna at iki.fi said: > Here's few outputs... > > On x86-64 Ubuntu 13.10: Thanks. Meanwhile I was able to trigger that bug myself. I'll look at it. Salam-Shalom, Werner -- Die Gedanken sind frei. Ausnahmen regelt ein Bundesgesetz. From funman at videolan.org Sat Dec 14 02:37:01 2013 From: funman at videolan.org (=?ISO-8859-1?Q?Rafa=EBl_Carr=E9?=) Date: Fri, 13 Dec 2013 20:37:01 -0500 Subject: sha1 hash using libgcrypt different from what returns sha1sum In-Reply-To: <878uvomhm5.fsf@vigenere.g10code.de> References: <20131111175258.B2868140A04@edrusb.is-a-geek.org> <87iovx1q3j.fsf@vigenere.g10code.de> <87iovwznob.fsf@vigenere.g10code.de> <52AB733D.7080302@iki.fi> <878uvomhm5.fsf@vigenere.g10code.de> <1a35af1f-dbad-4a92-bbda-7f0f0c95b7ea@email.android.com> Message-ID: <1a35af1f-dbad-4a92-bbda-7f0f0c95b7ea@email.android.com> Hello, Werner Koch a écrit : >On Fri, 13 Dec 2013 21:51, jussi.kivilinna at iki.fi said: > >> That's right, size_t for lengths is not enough on 32-bit. Could we >use uint64_t >> instead of size_t? > >uint64_t is not really portable because C99 is not deployed everywhere. Are you thinking about Visual Studio and its incomplete C99 support? I hope that's the only compiler left that doesn't support C99. And I'm not sure you should keep gcrypt off of 15 years old features just because of it.
If there are other compilers I'd like to know because I've been using C99 extensively for a few years. In any case we've had these types in mingw for a while and I think there are copies of a MSVC compatible stdint.h in the wild. >However, this is a minor issue because we could enable CCM only if we >have that type. We do the same for some algorithm (TIGER comes to >mind). > >We could document a certain limit on CCM or we look for a more general >solution. In Windows this has traditionally be solved using unions and >structs. It's not nice API, though. > >off_t would be a natural choice but it has problems as well. For >example the two defined ABI variants and I have not checked whether it >is suitable for a size_t replacement (I guess not). I think off_t depends on __USE_FILE_OFFSET64 at least on 32 bits so it doesn't sound like a good choice. >What do we we need to change in the API to correctly support it. What >side-effects will be have if we use uint64_t and provide the interfaces >only if uint64 is defined? Applications failing to link correctly when built with the non-C99-capable compilers I guess, I can't think of something else. >Shalom-Salam, > > Werner Regards, From jussi.kivilinna at iki.fi Sat Dec 14 09:19:48 2013 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 14 Dec 2013 10:19:48 +0200 Subject: sha1 hash using libgcrypt different from what returns sha1sum In-Reply-To: <1a35af1f-dbad-4a92-bbda-7f0f0c95b7ea@email.android.com> References: <20131111175258.B2868140A04@edrusb.is-a-geek.org> <87iovx1q3j.fsf@vigenere.g10code.de> <87iovwznob.fsf@vigenere.g10code.de> <52AB733D.7080302@iki.fi> <878uvomhm5.fsf@vigenere.g10code.de> <1a35af1f-dbad-4a92-bbda-7f0f0c95b7ea@email.android.com> Message-ID: <52AC14A4.1040906@iki.fi> On 14.12.2013 03:37, Rafaël Carré wrote: > > > Hello, > > Werner Koch a écrit : >> On Fri, 13 Dec 2013 21:51, jussi.kivilinna at iki.fi said: >> >>> That's right, size_t for lengths is not enough on 32-bit.
Could we >> use uint64_t >>> instead of size_t? >> >> uint64_t is not really portable because C99 is not deployed everywhere. > > Are you thinking about Visual Studio and its incomplete C99 support? > > I hope that's the only compiler left that doesn't support C99. And I'm not sure you should keep gcrypt off of 15 years old features just because of it. > > If there're others compilers I'd like to know because I've been using C99 extensively for a few years. > > In any case we've had these types in mingw for a while and I think there are copies of a MSVC compatible stdint.h in the wild. > >> However, this is a minor issue because we could enable CCM only if we >> have that type. We do the same for some algorithm (TIGER comes to >> mind). >> >> We could document a certain limit on CCM or we look for a more general >> solution. In Windows this has traditionally be solved using unions and >> structs. It's not nice API, though. >> >> off_t would be a natural choice but it has problems as well. For >> example the two defined ABI variants and I have not checked whether it >> is suitable for a size_t replacement (I guess not). > > I think off_t depends on __USE_FILE_OFFSET64 at least on 32 bits so it doesn't sound like a good choice. > >> What do we we need to change in the API to correctly support it. What >> side-effects will be have if we use uint64_t and provide the interfaces >> only if uint64 is defined? > > Applications failing to link correctly when built with the non C99 able compilers I guess, I can't think of something else. With GCRYCTL_SET_CCM_LENGTHS, linking should not be a problem since CCM data lengths are passed in through: gcry_error_t gcry_cipher_ctl (gcry_cipher_hd_t h, int cmd, void *buffer, size_t buflen); So uint64_t would not be used for any function prototype or any types. When libgcrypt is built without uint64_t, GCRYCTL_SET_CCM_LENGTHS would return an error.
-Jussi > >> Shalom-Salam, >> >> Werner > > Regards, > > > _______________________________________________ > Gcrypt-devel mailing list > Gcrypt-devel at gnupg.org > http://lists.gnupg.org/mailman/listinfo/gcrypt-devel > From cvs at cvs.gnupg.org Sat Dec 14 10:35:06 2013 From: cvs at cvs.gnupg.org (by Jussi Kivilinna) Date: Sat, 14 Dec 2013 10:35:06 +0100 Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.5.0-443-gffd9b2a Message-ID: This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "The GNU crypto library". The branch, master has been updated via ffd9b2aa5abda7f4d7790ed48116ed5d71ab9995 (commit) from c86c35534a153b13e880d0bb0ea3e48e1c0ecaf9 (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. - Log ----------------------------------------------------------------- commit ffd9b2aa5abda7f4d7790ed48116ed5d71ab9995 Author: Jussi Kivilinna Date: Sat Dec 14 11:23:03 2013 +0200 Minor fixes to SHA assembly implementations * cipher/Makefile.am: Correct 'sha256-avx*.S' to 'sha512-avx*.S'. * cipher/sha1-ssse3-amd64.S: First line, correct filename. * cipher/sha256-ssse3-amd64.S: Return correct stack burn depth. * cipher/sha512-avx-amd64.S: Use 'vzeroall' to clear registers. * cipher/sha512-avx2-bmi2-amd64.S: Ditto and return correct stack burn depth. 
-- Signed-off-by: Jussi Kivilinna diff --git a/cipher/Makefile.am b/cipher/Makefile.am index 7d737e2..a1718c5 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -78,8 +78,9 @@ scrypt.c \ seed.c \ serpent.c serpent-sse2-amd64.S serpent-avx2-amd64.S \ sha1.c sha1-ssse3-amd64.S \ -sha256.c sha256-ssse3-amd64.S sha256-avx-amd64.S sha256-avx2-bmi2-amd64.S \ -sha512.c sha512-ssse3-amd64.S sha512-armv7-neon.S \ +sha256.c sha256-ssse3-amd64.S \ +sha512.c sha512-ssse3-amd64.S sha512-avx-amd64.S sha512-avx2-bmi2-amd64.S \ + sha512-armv7-neon.S \ stribog.c \ tiger.c \ whirlpool.c \ diff --git a/cipher/sha1-ssse3-amd64.S b/cipher/sha1-ssse3-amd64.S index cf8c079..5e5716b 100644 --- a/cipher/sha1-ssse3-amd64.S +++ b/cipher/sha1-ssse3-amd64.S @@ -1,4 +1,4 @@ -/* sha1-ssse3-amd64.c - Intel SSSE3 accelerated SHA-1 transform function +/* sha1-ssse3-amd64.S - Intel SSSE3 accelerated SHA-1 transform function * Copyright ? 2013 Jussi Kivilinna * * Based on sha1.c: diff --git a/cipher/sha256-ssse3-amd64.S b/cipher/sha256-ssse3-amd64.S index bcf0e19..9b27f8f 100644 --- a/cipher/sha256-ssse3-amd64.S +++ b/cipher/sha256-ssse3-amd64.S @@ -504,7 +504,7 @@ _gcry_sha256_transform_amd64_ssse3: pop rbp pop rbx - mov rax, STACK_SIZE + mov eax, STACK_SIZE + 5*8 ret diff --git a/cipher/sha512-avx-amd64.S b/cipher/sha512-avx-amd64.S index 900936e..3449b87 100644 --- a/cipher/sha512-avx-amd64.S +++ b/cipher/sha512-avx-amd64.S @@ -348,17 +348,7 @@ _gcry_sha512_transform_amd64_avx: /* Restore Stack Pointer */ add rsp, frame_size - vzeroupper - pxor xmm0, xmm0 - pxor xmm1, xmm1 - pxor xmm2, xmm2 - pxor xmm3, xmm3 - pxor xmm4, xmm4 - pxor xmm5, xmm5 - pxor xmm6, xmm6 - pxor xmm7, xmm7 - pxor xmm8, xmm8 - pxor xmm9, xmm9 + vzeroall /* Return stack burn depth */ mov rax, frame_size diff --git a/cipher/sha512-avx2-bmi2-amd64.S b/cipher/sha512-avx2-bmi2-amd64.S index 9ece4fd..d6301f3 100644 --- a/cipher/sha512-avx2-bmi2-amd64.S +++ b/cipher/sha512-avx2-bmi2-amd64.S @@ -719,19 +719,9 @@ 
_gcry_sha512_transform_amd64_avx2: /* Restore Stack Pointer */ mov rsp, [rsp + frame_RSPSAVE] - vzeroupper - pxor xmm0, xmm0 - pxor xmm1, xmm1 - pxor xmm2, xmm2 - pxor xmm3, xmm3 - pxor xmm4, xmm4 - pxor xmm5, xmm5 - pxor xmm6, xmm6 - pxor xmm7, xmm7 - pxor xmm8, xmm8 - pxor xmm9, xmm9 - - mov rax, frame_size + vzeroall + + mov eax, frame_size + 31 .Lnowork: ret ----------------------------------------------------------------------- Summary of changes: cipher/Makefile.am | 5 +++-- cipher/sha1-ssse3-amd64.S | 2 +- cipher/sha256-ssse3-amd64.S | 2 +- cipher/sha512-avx-amd64.S | 12 +----------- cipher/sha512-avx2-bmi2-amd64.S | 16 +++------------- 5 files changed, 9 insertions(+), 28 deletions(-) hooks/post-receive -- The GNU crypto library http://git.gnupg.org _______________________________________________ Gnupg-commits mailing list Gnupg-commits at gnupg.org http://lists.gnupg.org/mailman/listinfo/gnupg-commits From wk at gnupg.org Sat Dec 14 18:44:31 2013 From: wk at gnupg.org (Werner Koch) Date: Sat, 14 Dec 2013 18:44:31 +0100 Subject: sha1 hash using libgcrypt different from what returns sha1sum In-Reply-To: <52AC14A4.1040906@iki.fi> (Jussi Kivilinna's message of "Sat, 14 Dec 2013 10:19:48 +0200") References: <20131111175258.B2868140A04@edrusb.is-a-geek.org> <87iovx1q3j.fsf@vigenere.g10code.de> <87iovwznob.fsf@vigenere.g10code.de> <52AB733D.7080302@iki.fi> <878uvomhm5.fsf@vigenere.g10code.de> <1a35af1f-dbad-4a92-bbda-7f0f0c95b7ea@email.android.com> <52AC14A4.1040906@iki.fi> Message-ID: <87wqj7l43k.fsf@vigenere.g10code.de> On Sat, 14 Dec 2013 09:19, jussi.kivilinna at iki.fi said: > So uint64_t would not be used for any function prototype or any types. When > libgcrypt is build without uint64_t, GCRYCTL_SET_CCM_LENGTHS would return > error. Right, I realized that only after I wrote my reply. Thus we can do it quite easy: We only need to document that uint64_t is required. If it is not available, there will be no CCM support (GPG_ERR_NOT_SUPPORTED). 
Shalom-Salam, Werner -- Die Gedanken sind frei. Ausnahmen regelt ein Bundesgesetz. From wk at gnupg.org Sat Dec 14 18:48:05 2013 From: wk at gnupg.org (Werner Koch) Date: Sat, 14 Dec 2013 18:48:05 +0100 Subject: sha1 hash using libgcrypt different from what returns sha1sum In-Reply-To: <1a35af1f-dbad-4a92-bbda-7f0f0c95b7ea@email.android.com> (=?utf-8?Q?=22Rafa=C3=ABl=09Carr=C3=A9=22's?= message of "Fri, 13 Dec 2013 20:37:01 -0500") References: <20131111175258.B2868140A04@edrusb.is-a-geek.org> <87iovx1q3j.fsf@vigenere.g10code.de> <87iovwznob.fsf@vigenere.g10code.de> <52AB733D.7080302@iki.fi> <878uvomhm5.fsf@vigenere.g10code.de> <1a35af1f-dbad-4a92-bbda-7f0f0c95b7ea@email.android.com> Message-ID: <87sitvl3xm.fsf@vigenere.g10code.de> On Sat, 14 Dec 2013 02:37, funman at videolan.org said: > Are you thinking about Visual Studio and its incomplete C99 support? No. > I hope that's the only compiler left that doesn't support C99. And I'm not sure you should keep gcrypt off of 15 years old features just because of it. There are a lot of old Unix systems in use which don't have a decent toolchain. We go to great lengths for portability and thus we should not throw old systems out only for the sake of the awkward CCM. > I think off_t depends on __USE_FILE_OFFSET64 at least on 32 bits so it doesn't sound like a good choice. Yes, it is troublesome. At times most support requests for GPGME were just about this. Salam-Shalom, Werner -- Die Gedanken sind frei. Ausnahmen regelt ein Bundesgesetz.
From funman at videolan.org Sat Dec 14 19:05:31 2013 From: funman at videolan.org (=?ISO-8859-1?Q?Rafa=EBl_Carr=E9?=) Date: Sat, 14 Dec 2013 13:05:31 -0500 Subject: sha1 hash using libgcrypt different from what returns sha1sum In-Reply-To: <87sitvl3xm.fsf@vigenere.g10code.de> References: <20131111175258.B2868140A04@edrusb.is-a-geek.org> <87iovx1q3j.fsf@vigenere.g10code.de> <87iovwznob.fsf@vigenere.g10code.de> <52AB733D.7080302@iki.fi> <878uvomhm5.fsf@vigenere.g10code.de> <1a35af1f-dbad-4a92-bbda-7f0f0c95b7ea@email.android.com> <87sitvl3xm.fsf@vigenere.g10code.de> Message-ID: <52AC9DEB.5050701@videolan.org> Le 14/12/2013 12:48, Werner Koch a écrit : > On Sat, 14 Dec 2013 02:37, funman at videolan.org said: > >> Are you thinking about Visual Studio and its incomplete C99 support? > > No. Good. >> I hope that's the only compiler left that doesn't support C99. And I'm not sure you should keep gcrypt off of 15 years old features just because of it. > > There are lot of old Unix systems in use which don't have a decent > toolchain. We go into great lengths for portability and thus we should > not throw old systems out only for that the awkward CCM. Thanks for the details, From wk at gnupg.org Sat Dec 14 21:17:59 2013 From: wk at gnupg.org (Werner Koch) Date: Sat, 14 Dec 2013 21:17:59 +0100 Subject: tests/basic fails sometimes... In-Reply-To: <874n6cmhgx.fsf@vigenere.g10code.de> (Werner Koch's message of "Sat, 14 Dec 2013 00:58:06 +0100") References: <52AADCD0.5040409@iki.fi> <8738lwo3zn.fsf@vigenere.g10code.de> <52AB8DE5.9060409@iki.fi> <874n6cmhgx.fsf@vigenere.g10code.de> Message-ID: <87y53njifc.fsf@vigenere.g10code.de> On Sat, 14 Dec 2013 00:58, wk at gnupg.org said: > Thanks. Meanwhile I was abale to trigger that bug myself. I'll look at > it. The reason for the problem is that we check whether the decryption returns a specific error code if OAEP encrypted data is decrypted as PKCS#1 encrypted data.
Now if the decrypted frame starts with 0x02 followed by a few random bytes, a 0x00 and more bytes, this looks like correct PKCS#1 encrypted data and Libgcrypt returns with success and not with the expected error code. Later the value is checked and that triggers the second error - as expected. I'll add a fix to the test. Shalom-Salam, Werner -- Die Gedanken sind frei. Ausnahmen regelt ein Bundesgesetz. From cvs at cvs.gnupg.org Sat Dec 14 21:44:36 2013 From: cvs at cvs.gnupg.org (by Werner Koch) Date: Sat, 14 Dec 2013 21:44:36 +0100 Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.5.0-444-gbfb43a1 Message-ID: This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "The GNU crypto library". The branch, master has been updated via bfb43a17d8db571fca4ed433ee8be5c366745844 (commit) from ffd9b2aa5abda7f4d7790ed48116ed5d71ab9995 (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. - Log ----------------------------------------------------------------- commit bfb43a17d8db571fca4ed433ee8be5c366745844 Author: Werner Koch Date: Sat Dec 14 21:40:36 2013 +0100 tests: Prevent rare failure of gcry_pk_decrypt test. * tests/basic.c (check_pubkey_crypt): Add special mode 1. (main): Add option --loop. -- This failure has been reported by Jussi Kivilinna. The new loop option was needed to track that down. It took me up to 100 iterations to trigger the bug. With the fix applied I am currently at 1000 iteration with no problems.
Command line to evoke the problem was: ./basic --pubkey --verbose --loop -1 --die Signed-off-by: Werner Koch diff --git a/tests/basic.c b/tests/basic.c index 789297f..6ffc3f5 100644 --- a/tests/basic.c +++ b/tests/basic.c @@ -5302,6 +5302,7 @@ check_pubkey_crypt (int n, gcry_sexp_t skey, gcry_sexp_t pkey, int algo) int unpadded; int encrypt_expected_rc; int decrypt_expected_rc; + int special; } datas[] = { { GCRY_PK_RSA, @@ -5385,14 +5386,14 @@ check_pubkey_crypt (int n, gcry_sexp_t skey, gcry_sexp_t pkey, int algo) "(flags oaep)", 1, 0, - GPG_ERR_ENCODING_PROBLEM }, + GPG_ERR_ENCODING_PROBLEM, 1 }, { GCRY_PK_RSA, "(data\n (flags oaep)\n" " (value #11223344556677889900AA#))\n", "(flags pkcs1)", 1, 0, - GPG_ERR_ENCODING_PROBLEM }, + GPG_ERR_ENCODING_PROBLEM, 1 }, { 0, "(data\n (flags pss)\n" " (value #11223344556677889900AA#))\n", @@ -5424,6 +5425,8 @@ check_pubkey_crypt (int n, gcry_sexp_t skey, gcry_sexp_t pkey, int algo) if (!rc) { + int expect_mismatch = 0; + /* Insert decoding hint to CIPH. */ if (datas[dataidx].hint) { @@ -5460,7 +5463,16 @@ check_pubkey_crypt (int n, gcry_sexp_t skey, gcry_sexp_t pkey, int algo) ciph = list; } rc = gcry_pk_decrypt (&plain, ciph, skey); - if (gcry_err_code (rc) != datas[dataidx].decrypt_expected_rc) + if (!rc && datas[dataidx].special == 1) + { + /* It may happen that OAEP formatted data which is + decrypted as pkcs#1 data returns a valid pkcs#1 + frame. However, the returned value will not be + identical - thus we expect a mismatch and test further on + whether this mismatch actually happened. 
*/ + expect_mismatch = 1; + } + else if (gcry_err_code (rc) != datas[dataidx].decrypt_expected_rc) { if (verbose) { @@ -5488,8 +5500,19 @@ check_pubkey_crypt (int n, gcry_sexp_t skey, gcry_sexp_t pkey, int algo) s1 = gcry_sexp_nth_data (p1, 1, &n1); s2 = gcry_sexp_nth_data (p2, 1, &n2); if (n1 != n2 || memcmp (s1, s2, n1)) - fail ("gcry_pk_encrypt/gcry_pk_decrypt do not roundtrip\n"); + { + if (expect_mismatch) + expect_mismatch = 0; + else + fail ("gcry_pk_encrypt/gcry_pk_decrypt " + "do not roundtrip\n"); + } } + + if (expect_mismatch) + fail ("gcry_pk_encrypt/gcry_pk_decrypt " + "expected mismatch did not happen\n"); + gcry_sexp_release (p1); gcry_sexp_release (p2); } @@ -5925,6 +5948,8 @@ main (int argc, char **argv) int use_fips = 0; int selftest_only = 0; int pubkey_only = 0; + int loop = 0; + unsigned int loopcount = 0; if (argc) { argc--; argv++; } @@ -5961,7 +5986,6 @@ main (int argc, char **argv) else if (!strcmp (*argv, "--pubkey")) { pubkey_only = 1; - verbose += 2; argc--; argv++; } else if (!strcmp (*argv, "--die")) @@ -5969,6 +5993,15 @@ main (int argc, char **argv) die_on_error = 1; argc--; argv++; } + else if (!strcmp (*argv, "--loop")) + { + argc--; argv++; + if (argc) + { + loop = atoi (*argv); + argc--; argv++; + } + } } gcry_control (GCRYCTL_SET_VERBOSITY, (int)verbose); @@ -5996,19 +6029,29 @@ main (int argc, char **argv) /* No valuable keys are create, so we can speed up our RNG. 
*/ gcry_control (GCRYCTL_ENABLE_QUICK_RANDOM, 0); - if (pubkey_only) - check_pubkey (); - else if (!selftest_only) + do { - check_ciphers (); - check_cipher_modes (); - check_bulk_cipher_modes (); - check_digests (); - check_hmac (); - check_mac (); - check_pubkey (); + if (pubkey_only) + check_pubkey (); + else if (!selftest_only) + { + check_ciphers (); + check_cipher_modes (); + check_bulk_cipher_modes (); + check_digests (); + check_hmac (); + check_mac (); + check_pubkey (); + } + loopcount++; + if (loop) + { + fprintf (stderr, "Test iteration %u completed.\n", loopcount); + if (loop != -1) + loop--; + } } - + while (loop); if (in_fips_mode && !selftest_only) { ----------------------------------------------------------------------- Summary of changes: tests/basic.c | 75 +++++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 59 insertions(+), 16 deletions(-) hooks/post-receive -- The GNU crypto library http://git.gnupg.org _______________________________________________ Gnupg-commits mailing list Gnupg-commits at gnupg.org http://lists.gnupg.org/mailman/listinfo/gnupg-commits From jussi.kivilinna at iki.fi Sun Dec 15 15:04:17 2013 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sun, 15 Dec 2013 16:04:17 +0200 Subject: [PATCH] Use u64 for CCM data lengths Message-ID: <20131215140417.5653.38117.stgit@localhost6.localdomain6> * cipher/cipher-ccm.c: Move code inside [HAVE_U64_TYPEDEF]. [!HAVE_U64_TYPEDEF] (_gcry_cipher_ccm_encrypt) (_gcry_cipher_ccm_decrypt, _gcry_cipher_ccm_set_nonce) (_gcry_cipher_ccm_authenticate, _gcry_cipher_ccm_get_tag) (_gcry_cipher_ccm_check_tag): Dummy functions returning GPG_ERROR_NOT_SUPPORTED. * cipher/cipher-internal.h (gcry_cipher_handle.u_mode.ccm) (_gcry_cipher_ccm_set_lengths): Move inside [HAVE_U64_TYPEDEF] and use u64 instead of size_t for CCM data lengths. * cipher/cipher.c (_gcry_cipher_open_internal, cipher_reset) (_gcry_cipher_ctl) [!HAVE_U64_TYPEDEF]: Return GPG_ERR_NOT_SUPPORTED for CCM. 
(_gcry_cipher_ctl) [HAVE_U64_TYPEDEF]: Use u64 for GCRYCTL_SET_CCM_LENGTHS length parameters. -- Signed-off-by: Jussi Kivilinna --- cipher/cipher-ccm.c | 88 +++++++++++++++++++++++++++++++++++++++++++--- cipher/cipher-internal.h | 11 ++++-- cipher/cipher.c | 12 ++++++ 3 files changed, 100 insertions(+), 11 deletions(-) diff --git a/cipher/cipher-ccm.c b/cipher/cipher-ccm.c index d2b8841..47f2162 100644 --- a/cipher/cipher-ccm.c +++ b/cipher/cipher-ccm.c @@ -29,6 +29,9 @@ #include "bufhelp.h" #include "./cipher-internal.h" +/* We need a 64 bit type for this code. */ +#ifdef HAVE_U64_TYPEDEF + #define set_burn(burn, nburn) do { \ unsigned int __nburn = (nburn); \ @@ -149,14 +152,14 @@ _gcry_cipher_ccm_set_nonce (gcry_cipher_hd_t c, const unsigned char *nonce, gcry_err_code_t -_gcry_cipher_ccm_set_lengths (gcry_cipher_hd_t c, size_t encryptlen, - size_t aadlen, size_t taglen) +_gcry_cipher_ccm_set_lengths (gcry_cipher_hd_t c, u64 encryptlen, u64 aadlen, + u64 taglen) { unsigned int burn = 0; unsigned char b0[16]; size_t noncelen = 15 - (c->u_iv.iv[0] + 1); - size_t M = taglen; - size_t M_; + u64 M = taglen; + u64 M_; int i; M_ = (M - 2) / 2; @@ -203,7 +206,6 @@ _gcry_cipher_ccm_set_lengths (gcry_cipher_hd_t c, size_t encryptlen, buf_put_be32(&b0[2], aadlen); set_burn (burn, do_cbc_mac (c, b0, 6, 0)); } -#ifdef HAVE_U64_TYPEDEF else if (aadlen > (unsigned int)0xffffffff) { b0[0] = 0xff; @@ -211,7 +213,6 @@ _gcry_cipher_ccm_set_lengths (gcry_cipher_hd_t c, size_t encryptlen, buf_put_be64(&b0[2], aadlen); set_burn (burn, do_cbc_mac (c, b0, 10, 0)); } -#endif /* Generate S_0 and increase counter. */ set_burn (burn, c->spec->encrypt ( &c->context.c, c->u_mode.ccm.s0, @@ -364,3 +365,78 @@ _gcry_cipher_ccm_decrypt (gcry_cipher_hd_t c, unsigned char *outbuf, return err; } + +#else + +/* + * Provide dummy functions so that we avoid adding too much #ifdefs in + * cipher.c. 
+ */ + +gcry_err_code_t +_gcry_cipher_ccm_encrypt(gcry_cipher_hd_t c, unsigned char *outbuf, + size_t outbuflen, const unsigned char *inbuf, + size_t inbuflen) +{ + (void)c; + (void)outbuf; + (void)outbuflen; + (void)inbuf; + (void)inbuflen; + return GPG_ERR_NOT_SUPPORTED; +} + +gcry_err_code_t +_gcry_cipher_ccm_decrypt(gcry_cipher_hd_t c, unsigned char *outbuf, + size_t outbuflen, const unsigned char *inbuf, + size_t inbuflen) +{ + (void)c; + (void)outbuf; + (void)outbuflen; + (void)inbuf; + (void)inbuflen; + return GPG_ERR_NOT_SUPPORTED; +} + +gcry_err_code_t +_gcry_cipher_ccm_set_nonce(gcry_cipher_hd_t c, const unsigned char *nonce, + size_t noncelen) +{ + (void)c; + (void)nonce; + (void)noncelen; + return GPG_ERR_NOT_SUPPORTED; +} + +gcry_err_code_t +_gcry_cipher_ccm_authenticate(gcry_cipher_hd_t c, const unsigned char *abuf, + size_t abuflen) +{ + (void)c; + (void)abuf; + (void)abuflen; + return GPG_ERR_NOT_SUPPORTED; +} + +gcry_err_code_t +_gcry_cipher_ccm_get_tag(gcry_cipher_hd_t c, unsigned char *outtag, + size_t taglen) +{ + (void)c; + (void)outtag; + (void)taglen; + return GPG_ERR_NOT_SUPPORTED; +} + +gcry_err_code_t +_gcry_cipher_ccm_check_tag(gcry_cipher_hd_t c, const unsigned char *intag, + size_t taglen) +{ + (void)c; + (void)intag; + (void)taglen; + return GPG_ERR_NOT_SUPPORTED; +} + +#endif /*HAVE_U64_TYPEDEF*/ diff --git a/cipher/cipher-internal.h b/cipher/cipher-internal.h index 6fb3bac..cdac445 100644 --- a/cipher/cipher-internal.h +++ b/cipher/cipher-internal.h @@ -135,10 +135,11 @@ struct gcry_cipher_handle int unused; /* Number of unused bytes in LASTIV. */ union { +#ifdef HAVE_U64_TYPEDEF /* Mode specific storage for CCM mode. */ struct { - size_t encryptlen; - size_t aadlen; + u64 encryptlen; + u64 aadlen; unsigned int authlen; /* Space to save partial input lengths for MAC. */ @@ -151,6 +152,7 @@ struct gcry_cipher_handle unsigned int lengths:1; /* Set to 1 if CCM length parameters has been processed. 
*/ } ccm; +#endif /* Mode specific storage for CMAC mode. */ struct { @@ -280,9 +282,10 @@ gcry_err_code_t _gcry_cipher_ccm_set_nonce size_t noncelen); gcry_err_code_t _gcry_cipher_ccm_authenticate /* */ (gcry_cipher_hd_t c, const unsigned char *abuf, size_t abuflen); +#ifdef HAVE_U64_TYPEDEF gcry_err_code_t _gcry_cipher_ccm_set_lengths -/* */ (gcry_cipher_hd_t c, size_t encryptedlen, size_t aadlen, - size_t taglen); +/* */ (gcry_cipher_hd_t c, u64 encryptedlen, u64 aadlen, u64 taglen); +#endif gcry_err_code_t _gcry_cipher_ccm_get_tag /* */ (gcry_cipher_hd_t c, unsigned char *outtag, size_t taglen); diff --git a/cipher/cipher.c b/cipher/cipher.c index 8b47abc..4c7b5b5 100644 --- a/cipher/cipher.c +++ b/cipher/cipher.c @@ -394,11 +394,15 @@ _gcry_cipher_open_internal (gcry_cipher_hd_t *handle, switch (mode) { case GCRY_CIPHER_MODE_CCM: +#ifdef HAVE_U64_TYPEDEF if (spec->blocksize != GCRY_CCM_BLOCK_LEN) err = GPG_ERR_INV_CIPHER_MODE; if (!spec->encrypt || !spec->decrypt) err = GPG_ERR_INV_CIPHER_MODE; break; +#else + err = GPG_ERR_NOT_SUPPORTED; +#endif case GCRY_CIPHER_MODE_ECB: case GCRY_CIPHER_MODE_CBC: @@ -686,9 +690,11 @@ cipher_reset (gcry_cipher_hd_t c) } break; +#ifdef HAVE_U64_TYPEDEF case GCRY_CIPHER_MODE_CCM: memset (&c->u_mode.ccm, 0, sizeof c->u_mode.ccm); break; +#endif default: break; /* u_mode unused by other modes. 
*/ @@ -1139,8 +1145,9 @@ _gcry_cipher_ctl (gcry_cipher_hd_t h, int cmd, void *buffer, size_t buflen) break; case GCRYCTL_SET_CCM_LENGTHS: +#ifdef HAVE_U64_TYPEDEF { - size_t params[3]; + u64 params[3]; size_t encryptedlen; size_t aadlen; size_t authtaglen; @@ -1160,6 +1167,9 @@ _gcry_cipher_ctl (gcry_cipher_hd_t h, int cmd, void *buffer, size_t buflen) rc = _gcry_cipher_ccm_set_lengths (h, encryptedlen, aadlen, authtaglen); } +#else + rc = GPG_ERR_NOT_SUPPORTED; +#endif break; case GCRYCTL_DISABLE_ALGO: From cvs at cvs.gnupg.org Sun Dec 15 19:11:03 2013 From: cvs at cvs.gnupg.org (by Jussi Kivilinna) Date: Sun, 15 Dec 2013 19:11:03 +0100 Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.5.0-445-g110fed2 Message-ID: This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "The GNU crypto library". The branch, master has been updated via 110fed2d6b0bbc97cb5cc0a3a564e05fc42afa2d (commit) from bfb43a17d8db571fca4ed433ee8be5c366745844 (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. - Log ----------------------------------------------------------------- commit 110fed2d6b0bbc97cb5cc0a3a564e05fc42afa2d Author: Jussi Kivilinna Date: Sun Dec 15 20:07:54 2013 +0200 Use u64 for CCM data lengths * cipher/cipher-ccm.c: Move code inside [HAVE_U64_TYPEDEF]. [HAVE_U64_TYPEDEF] (_gcry_cipher_ccm_set_lengths): Use 'u64' for data lengths. [!HAVE_U64_TYPEDEF] (_gcry_cipher_ccm_encrypt) (_gcry_cipher_ccm_decrypt, _gcry_cipher_ccm_set_nonce) (_gcry_cipher_ccm_authenticate, _gcry_cipher_ccm_get_tag) (_gcry_cipher_ccm_check_tag): Dummy functions returning GPG_ERROR_NOT_SUPPORTED. * cipher/cipher-internal.h (gcry_cipher_handle.u_mode.ccm) (_gcry_cipher_ccm_set_lengths): Move inside [HAVE_U64_TYPEDEF] and use u64 instead of size_t for CCM data lengths. 
* cipher/cipher.c (_gcry_cipher_open_internal, cipher_reset) (_gcry_cipher_ctl) [!HAVE_U64_TYPEDEF]: Return GPG_ERR_NOT_SUPPORTED for CCM. (_gcry_cipher_ctl) [HAVE_U64_TYPEDEF]: Use u64 for GCRYCTL_SET_CCM_LENGTHS length parameters. * tests/basic.c: Do not use CCM if !HAVE_U64_TYPEDEF. * tests/bench-slope.c: Ditto. * tests/benchmark.c: Ditto. -- Signed-off-by: Jussi Kivilinna diff --git a/cipher/cipher-ccm.c b/cipher/cipher-ccm.c index d2b8841..47f2162 100644 --- a/cipher/cipher-ccm.c +++ b/cipher/cipher-ccm.c @@ -29,6 +29,9 @@ #include "bufhelp.h" #include "./cipher-internal.h" +/* We need a 64 bit type for this code. */ +#ifdef HAVE_U64_TYPEDEF + #define set_burn(burn, nburn) do { \ unsigned int __nburn = (nburn); \ @@ -149,14 +152,14 @@ _gcry_cipher_ccm_set_nonce (gcry_cipher_hd_t c, const unsigned char *nonce, gcry_err_code_t -_gcry_cipher_ccm_set_lengths (gcry_cipher_hd_t c, size_t encryptlen, - size_t aadlen, size_t taglen) +_gcry_cipher_ccm_set_lengths (gcry_cipher_hd_t c, u64 encryptlen, u64 aadlen, + u64 taglen) { unsigned int burn = 0; unsigned char b0[16]; size_t noncelen = 15 - (c->u_iv.iv[0] + 1); - size_t M = taglen; - size_t M_; + u64 M = taglen; + u64 M_; int i; M_ = (M - 2) / 2; @@ -203,7 +206,6 @@ _gcry_cipher_ccm_set_lengths (gcry_cipher_hd_t c, size_t encryptlen, buf_put_be32(&b0[2], aadlen); set_burn (burn, do_cbc_mac (c, b0, 6, 0)); } -#ifdef HAVE_U64_TYPEDEF else if (aadlen > (unsigned int)0xffffffff) { b0[0] = 0xff; @@ -211,7 +213,6 @@ _gcry_cipher_ccm_set_lengths (gcry_cipher_hd_t c, size_t encryptlen, buf_put_be64(&b0[2], aadlen); set_burn (burn, do_cbc_mac (c, b0, 10, 0)); } -#endif /* Generate S_0 and increase counter. */ set_burn (burn, c->spec->encrypt ( &c->context.c, c->u_mode.ccm.s0, @@ -364,3 +365,78 @@ _gcry_cipher_ccm_decrypt (gcry_cipher_hd_t c, unsigned char *outbuf, return err; } + +#else + +/* + * Provide dummy functions so that we avoid adding too much #ifdefs in + * cipher.c. 
+ */ + +gcry_err_code_t +_gcry_cipher_ccm_encrypt(gcry_cipher_hd_t c, unsigned char *outbuf, + size_t outbuflen, const unsigned char *inbuf, + size_t inbuflen) +{ + (void)c; + (void)outbuf; + (void)outbuflen; + (void)inbuf; + (void)inbuflen; + return GPG_ERR_NOT_SUPPORTED; +} + +gcry_err_code_t +_gcry_cipher_ccm_decrypt(gcry_cipher_hd_t c, unsigned char *outbuf, + size_t outbuflen, const unsigned char *inbuf, + size_t inbuflen) +{ + (void)c; + (void)outbuf; + (void)outbuflen; + (void)inbuf; + (void)inbuflen; + return GPG_ERR_NOT_SUPPORTED; +} + +gcry_err_code_t +_gcry_cipher_ccm_set_nonce(gcry_cipher_hd_t c, const unsigned char *nonce, + size_t noncelen) +{ + (void)c; + (void)nonce; + (void)noncelen; + return GPG_ERR_NOT_SUPPORTED; +} + +gcry_err_code_t +_gcry_cipher_ccm_authenticate(gcry_cipher_hd_t c, const unsigned char *abuf, + size_t abuflen) +{ + (void)c; + (void)abuf; + (void)abuflen; + return GPG_ERR_NOT_SUPPORTED; +} + +gcry_err_code_t +_gcry_cipher_ccm_get_tag(gcry_cipher_hd_t c, unsigned char *outtag, + size_t taglen) +{ + (void)c; + (void)outtag; + (void)taglen; + return GPG_ERR_NOT_SUPPORTED; +} + +gcry_err_code_t +_gcry_cipher_ccm_check_tag(gcry_cipher_hd_t c, const unsigned char *intag, + size_t taglen) +{ + (void)c; + (void)intag; + (void)taglen; + return GPG_ERR_NOT_SUPPORTED; +} + +#endif /*HAVE_U64_TYPEDEF*/ diff --git a/cipher/cipher-internal.h b/cipher/cipher-internal.h index 6fb3bac..cdac445 100644 --- a/cipher/cipher-internal.h +++ b/cipher/cipher-internal.h @@ -135,10 +135,11 @@ struct gcry_cipher_handle int unused; /* Number of unused bytes in LASTIV. */ union { +#ifdef HAVE_U64_TYPEDEF /* Mode specific storage for CCM mode. */ struct { - size_t encryptlen; - size_t aadlen; + u64 encryptlen; + u64 aadlen; unsigned int authlen; /* Space to save partial input lengths for MAC. */ @@ -151,6 +152,7 @@ struct gcry_cipher_handle unsigned int lengths:1; /* Set to 1 if CCM length parameters has been processed. 
*/ } ccm; +#endif /* Mode specific storage for CMAC mode. */ struct { @@ -280,9 +282,10 @@ gcry_err_code_t _gcry_cipher_ccm_set_nonce size_t noncelen); gcry_err_code_t _gcry_cipher_ccm_authenticate /* */ (gcry_cipher_hd_t c, const unsigned char *abuf, size_t abuflen); +#ifdef HAVE_U64_TYPEDEF gcry_err_code_t _gcry_cipher_ccm_set_lengths -/* */ (gcry_cipher_hd_t c, size_t encryptedlen, size_t aadlen, - size_t taglen); +/* */ (gcry_cipher_hd_t c, u64 encryptedlen, u64 aadlen, u64 taglen); +#endif gcry_err_code_t _gcry_cipher_ccm_get_tag /* */ (gcry_cipher_hd_t c, unsigned char *outtag, size_t taglen); diff --git a/cipher/cipher.c b/cipher/cipher.c index 8b47abc..8c5a0b4 100644 --- a/cipher/cipher.c +++ b/cipher/cipher.c @@ -394,11 +394,15 @@ _gcry_cipher_open_internal (gcry_cipher_hd_t *handle, switch (mode) { case GCRY_CIPHER_MODE_CCM: +#ifdef HAVE_U64_TYPEDEF if (spec->blocksize != GCRY_CCM_BLOCK_LEN) err = GPG_ERR_INV_CIPHER_MODE; if (!spec->encrypt || !spec->decrypt) err = GPG_ERR_INV_CIPHER_MODE; break; +#else + err = GPG_ERR_NOT_SUPPORTED; +#endif case GCRY_CIPHER_MODE_ECB: case GCRY_CIPHER_MODE_CBC: @@ -686,9 +690,11 @@ cipher_reset (gcry_cipher_hd_t c) } break; +#ifdef HAVE_U64_TYPEDEF case GCRY_CIPHER_MODE_CCM: memset (&c->u_mode.ccm, 0, sizeof c->u_mode.ccm); break; +#endif default: break; /* u_mode unused by other modes. 
*/ @@ -1139,8 +1145,9 @@ _gcry_cipher_ctl (gcry_cipher_hd_t h, int cmd, void *buffer, size_t buflen) break; case GCRYCTL_SET_CCM_LENGTHS: +#ifdef HAVE_U64_TYPEDEF { - size_t params[3]; + u64 params[3]; size_t encryptedlen; size_t aadlen; size_t authtaglen; @@ -1148,7 +1155,7 @@ _gcry_cipher_ctl (gcry_cipher_hd_t h, int cmd, void *buffer, size_t buflen) if (h->mode != GCRY_CIPHER_MODE_CCM) return gcry_error (GPG_ERR_INV_CIPHER_MODE); - if (!buffer || buflen != 3 * sizeof(size_t)) + if (!buffer || buflen != 3 * sizeof(u64)) return gcry_error (GPG_ERR_INV_ARG); /* This command is used to pass additional length parameters needed @@ -1160,6 +1167,9 @@ _gcry_cipher_ctl (gcry_cipher_hd_t h, int cmd, void *buffer, size_t buflen) rc = _gcry_cipher_ccm_set_lengths (h, encryptedlen, aadlen, authtaglen); } +#else + rc = GPG_ERR_NOT_SUPPORTED; +#endif break; case GCRYCTL_DISABLE_ALGO: diff --git a/tests/basic.c b/tests/basic.c index 6ffc3f5..0eb8215 100644 --- a/tests/basic.c +++ b/tests/basic.c @@ -1545,6 +1545,7 @@ check_gcm_cipher (void) static void check_ccm_cipher (void) { +#ifdef HAVE_U64_TYPEDEF static const struct tv { int algo; @@ -1950,7 +1951,7 @@ check_ccm_cipher (void) static const int cut[] = { 0, 1, 8, 10, 16, 19, -1 }; gcry_cipher_hd_t hde, hdd; unsigned char out[MAX_DATA_LEN]; - size_t ctl_params[3]; + u64 ctl_params[3]; int split, aadsplit; size_t j, i, keylen, blklen, authlen; gcry_error_t err = 0; @@ -2305,10 +2306,11 @@ check_ccm_cipher (void) if (memcmp (buf, tag, taglen) != 0) fail ("cipher-ccm-huge, encrypt mismatch entry\n"); } -#endif if (verbose) fprintf (stderr, " Completed CCM checks.\n"); +#endif +#endif /*HAVE_U64_TYPEDEF*/ } diff --git a/tests/bench-slope.c b/tests/bench-slope.c index 219e0dd..bd05064 100644 --- a/tests/bench-slope.c +++ b/tests/bench-slope.c @@ -740,7 +740,7 @@ static struct bench_ops decrypt_ops = { }; - +#ifdef HAVE_U64_TYPEDEF static void bench_ccm_encrypt_do_bench (struct bench_obj *obj, void *buf, size_t buflen) { @@ -748,7 
+748,7 @@ bench_ccm_encrypt_do_bench (struct bench_obj *obj, void *buf, size_t buflen) int err; char tag[8]; char nonce[11] = { 0x80, 0x01, }; - size_t params[3]; + u64 params[3]; gcry_cipher_setiv (hd, nonce, sizeof (nonce)); @@ -792,7 +792,7 @@ bench_ccm_decrypt_do_bench (struct bench_obj *obj, void *buf, size_t buflen) int err; char tag[8] = { 0, }; char nonce[11] = { 0x80, 0x01, }; - size_t params[3]; + u64 params[3]; gcry_cipher_setiv (hd, nonce, sizeof (nonce)); @@ -839,7 +839,7 @@ bench_ccm_authenticate_do_bench (struct bench_obj *obj, void *buf, int err; char tag[8] = { 0, }; char nonce[11] = { 0x80, 0x01, }; - size_t params[3]; + u64 params[3]; char data = 0xff; gcry_cipher_setiv (hd, nonce, sizeof (nonce)); @@ -903,6 +903,8 @@ static struct bench_ops ccm_authenticate_ops = { &bench_encrypt_free, &bench_ccm_authenticate_do_bench }; +#endif /*HAVE_U64_TYPEDEF*/ + static void bench_gcm_encrypt_do_bench (struct bench_obj *obj, void *buf, size_t buflen) @@ -1037,9 +1039,11 @@ static struct bench_cipher_mode cipher_modes[] = { {GCRY_CIPHER_MODE_OFB, "OFB dec", &decrypt_ops}, {GCRY_CIPHER_MODE_CTR, "CTR enc", &encrypt_ops}, {GCRY_CIPHER_MODE_CTR, "CTR dec", &decrypt_ops}, +#ifdef HAVE_U64_TYPEDEF {GCRY_CIPHER_MODE_CCM, "CCM enc", &ccm_encrypt_ops}, {GCRY_CIPHER_MODE_CCM, "CCM dec", &ccm_decrypt_ops}, {GCRY_CIPHER_MODE_CCM, "CCM auth", &ccm_authenticate_ops}, +#endif {GCRY_CIPHER_MODE_GCM, "GCM enc", &gcm_encrypt_ops}, {GCRY_CIPHER_MODE_GCM, "GCM dec", &gcm_decrypt_ops}, {GCRY_CIPHER_MODE_GCM, "GCM auth", &gcm_authenticate_ops}, diff --git a/tests/benchmark.c b/tests/benchmark.c index 62dfc22..e009c22 100644 --- a/tests/benchmark.c +++ b/tests/benchmark.c @@ -562,13 +562,13 @@ mac_bench ( const char *algoname ) } - +#ifdef HAVE_U64_TYPEDEF static void ccm_aead_init(gcry_cipher_hd_t hd, size_t buflen, int authlen) { const int _L = 4; const int noncelen = 15 - _L; char nonce[noncelen]; - size_t params[3]; + u64 params[3]; gcry_error_t err = GPG_ERR_NO_ERROR; memset 
(nonce, 0x33, noncelen); @@ -594,6 +594,7 @@ static void ccm_aead_init(gcry_cipher_hd_t hd, size_t buflen, int authlen) exit (1); } } +#endif static void @@ -622,8 +623,10 @@ cipher_bench ( const char *algoname ) { GCRY_CIPHER_MODE_CFB, " CFB", 0 }, { GCRY_CIPHER_MODE_OFB, " OFB", 0 }, { GCRY_CIPHER_MODE_CTR, " CTR", 0 }, +#ifdef HAVE_U64_TYPEDEF { GCRY_CIPHER_MODE_CCM, " CCM", 0, ccm_aead_init, GCRY_CCM_BLOCK_LEN, 8 }, +#endif { GCRY_CIPHER_MODE_GCM, " GCM", 0, NULL, GCRY_GCM_BLOCK_LEN, GCRY_GCM_BLOCK_LEN }, { GCRY_CIPHER_MODE_STREAM, "", 0 }, ----------------------------------------------------------------------- Summary of changes: cipher/cipher-ccm.c | 88 ++++++++++++++++++++++++++++++++++++++++++---- cipher/cipher-internal.h | 11 +++--- cipher/cipher.c | 14 ++++++-- tests/basic.c | 6 ++-- tests/bench-slope.c | 12 ++++--- tests/benchmark.c | 7 ++-- 6 files changed, 118 insertions(+), 20 deletions(-) hooks/post-receive -- The GNU crypto library http://git.gnupg.org _______________________________________________ Gnupg-commits mailing list Gnupg-commits at gnupg.org http://lists.gnupg.org/mailman/listinfo/gnupg-commits From saranchiw at gmail.com Sun Dec 15 15:29:43 2013 From: saranchiw at gmail.com (Saran Chiwtanasuntorn) Date: Sun, 15 Dec 2013 21:29:43 +0700 Subject: Where is libgcrypt.a (version 1.5.3)? Message-ID: Hi, First, I want to tell you that I am very new with Linux (Ubuntu 12.04). Due to the crash in* gcry_mpi_powm* link: http://www.gossamer-threads.com/lists/gnupg/gcrypt/61993. I wanted to compile the latest libgcrypt version. I got libgcrypt-1.5.3.tar.gz source code from ftp://ftp.gnupg.org/gcrypt/libgcrypt. After I compiled the source code with: ./configure make && make install I found only - libgcrypt.la - libgcrypt.so - libgcrypt.so.11 - libgcrypt.so.11.8.2 There is no libgcrypt.a at all. How can I get it? Thanks, Saran -------------- next part -------------- An HTML attachment was scrubbed... 
URL: From funman at videolan.org Sun Dec 15 20:43:26 2013 From: funman at videolan.org (=?ISO-8859-1?Q?Rafa=EBl_Carr=E9?=) Date: Sun, 15 Dec 2013 14:43:26 -0500 Subject: Where is libgcrypt.a (version 1.5.3)? In-Reply-To: References: Message-ID: <52AE065E.1030100@videolan.org> Hello, Le 15/12/2013 09:29, Saran Chiwtanasuntorn a écrit : > Hi, > > First, I want to tell you that I am very new with Linux (Ubuntu 12.04). > > Due to the crash in* gcry_mpi_powm* link: > http://www.gossamer-threads.com/lists/gnupg/gcrypt/61993. > I wanted to compile the latest libgcrypt version. > I got libgcrypt-1.5.3.tar.gz source code from > ftp://ftp.gnupg.org/gcrypt/libgcrypt. > After I compiled the source code with: > > ./configure Here's your problem, you need to add --enable-static to ./configure arguments Here's an extract of ./configure --help : --enable-static[=PKGS] build static libraries [default=no] --enable-shared[=PKGS] build shared libraries [default=yes] From wk at gnupg.org Sun Dec 15 20:50:27 2013 From: wk at gnupg.org (Werner Koch) Date: Sun, 15 Dec 2013 20:50:27 +0100 Subject: Where is libgcrypt.a (version 1.5.3)? In-Reply-To: (Saran Chiwtanasuntorn's message of "Sun, 15 Dec 2013 21:29:43 +0700") References: Message-ID: <87wqj5j3lo.fsf@vigenere.g10code.de> On Sun, 15 Dec 2013 15:29, saranchiw at gmail.com said: > There is no libgcrypt.a at all. How can I get it? ./configure --enable-static && make Shalom-Salam, Werner -- Die Gedanken sind frei. Ausnahmen regelt ein Bundesgesetz. From dbaryshkov at gmail.com Sun Dec 15 22:02:40 2013 From: dbaryshkov at gmail.com (Dmitry Eremin-Solenikov) Date: Mon, 16 Dec 2013 01:02:40 +0400 Subject: [PATCH] Use u64 for CCM data lengths In-Reply-To: <20131215140417.5653.38117.stgit@localhost6.localdomain6> References: <20131215140417.5653.38117.stgit@localhost6.localdomain6> Message-ID: Hello, On Sun, Dec 15, 2013 at 6:04 PM, Jussi Kivilinna wrote: > > * cipher/cipher-ccm.c: Move code inside [HAVE_U64_TYPEDEF].
> [!HAVE_U64_TYPEDEF] (_gcry_cipher_ccm_encrypt) > (_gcry_cipher_ccm_decrypt, _gcry_cipher_ccm_set_nonce) > (_gcry_cipher_ccm_authenticate, _gcry_cipher_ccm_get_tag) > (_gcry_cipher_ccm_check_tag): Dummy functions returning > GPG_ERROR_NOT_SUPPORTED. > * cipher/cipher-internal.h (gcry_cipher_handle.u_mode.ccm) > (_gcry_cipher_ccm_set_lengths): Move inside [HAVE_U64_TYPEDEF] and use > u64 instead of size_t for CCM data lengths. > * cipher/cipher.c (_gcry_cipher_open_internal, cipher_reset) > (_gcry_cipher_ctl) [!HAVE_U64_TYPEDEF]: Return GPG_ERR_NOT_SUPPORTED > for CCM. > (_gcry_cipher_ctl) [HAVE_U64_TYPEDEF]: Use u64 for > GCRYCTL_SET_CCM_LENGTHS length parameters. > -- > It's probably a little bit too late, but I'd also suggest to add a testcase with really long data, so that in future it can catch possible u64 vs. u32 length issues. -- With best wishes Dmitry From saranchiw at gmail.com Mon Dec 16 03:24:29 2013 From: saranchiw at gmail.com (Saran Chiwtanasuntorn) Date: Mon, 16 Dec 2013 09:24:29 +0700 Subject: Where is libgcrypt.a (version 1.5.3)? In-Reply-To: <87wqj5j3lo.fsf@vigenere.g10code.de> References: <87wqj5j3lo.fsf@vigenere.g10code.de> Message-ID: Thanks, Saran On 16 December 2013 02:50, Werner Koch wrote: > On Sun, 15 Dec 2013 15:29, saranchiw at gmail.com said: > > > There is no libgcrypt.a at all. How can I get it? > > ./configure --enable-static && make > > > Shalom-Salam, > > Werner > > -- > Die Gedanken sind frei. Ausnahmen regelt ein Bundesgesetz. > > -- ?????? ??????????? -------------- next part -------------- An HTML attachment was scrubbed... 
URL: From jussi.kivilinna at iki.fi Mon Dec 16 09:58:33 2013 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Mon, 16 Dec 2013 10:58:33 +0200 Subject: [PATCH] Use u64 for CCM data lengths In-Reply-To: References: <20131215140417.5653.38117.stgit@localhost6.localdomain6> Message-ID: <52AEC0B9.6080400@iki.fi> On 15.12.2013 23:02, Dmitry Eremin-Solenikov wrote: > Hello, > > > On Sun, Dec 15, 2013 at 6:04 PM, Jussi Kivilinna wrote: >> >> * cipher/cipher-ccm.c: Move code inside [HAVE_U64_TYPEDEF]. >> [!HAVE_U64_TYPEDEF] (_gcry_cipher_ccm_encrypt) >> (_gcry_cipher_ccm_decrypt, _gcry_cipher_ccm_set_nonce) >> (_gcry_cipher_ccm_authenticate, _gcry_cipher_ccm_get_tag) >> (_gcry_cipher_ccm_check_tag): Dummy functions returning >> GPG_ERROR_NOT_SUPPORTED. >> * cipher/cipher-internal.h (gcry_cipher_handle.u_mode.ccm) >> (_gcry_cipher_ccm_set_lengths): Move inside [HAVE_U64_TYPEDEF] and use >> u64 instead of size_t for CCM data lengths. >> * cipher/cipher.c (_gcry_cipher_open_internal, cipher_reset) >> (_gcry_cipher_ctl) [!HAVE_U64_TYPEDEF]: Return GPG_ERR_NOT_SUPPORTED >> for CCM. >> (_gcry_cipher_ctl) [HAVE_U64_TYPEDEF]: Use u64 for >> GCRYCTL_SET_CCM_LENGTHS length parameters. >> -- >> > > It's probably a little bit too late, but I'd also suggest to add a > testcase with really long > data, so that in future it can catch possible u64 vs. u32 length issues. > Yes, tests with very long data would be nice. Maybe add 'tests/hugedata' which checks different cipher modes, MACs and hashes and would test if data lengths over ((1 << 32) * "cipher/hash blocksize") work. This test would be skipped on normal 'make check' run. 
-Jussi From wk at gnupg.org Mon Dec 16 09:51:39 2013 From: wk at gnupg.org (Werner Koch) Date: Mon, 16 Dec 2013 09:51:39 +0100 Subject: [PATCH] Use u64 for CCM data lengths In-Reply-To: (Dmitry Eremin-Solenikov's message of "Mon, 16 Dec 2013 01:02:40 +0400") References: <20131215140417.5653.38117.stgit@localhost6.localdomain6> Message-ID: <87lhzli3fo.fsf@vigenere.g10code.de> On Sun, 15 Dec 2013 22:02, dbaryshkov at gmail.com said: > testcase with really long > data, so that in future it can catch possible u64 vs. u32 length issues. Well, we need to have test vectors for that. For the hash algorithms we can manually run those tests but for CCM I need a hint on how to create test vectors. It would be nice if we could reuse tests/genhashtest.c to create them. Shalom-Salam, Werner -- Die Gedanken sind frei. Ausnahmen regelt ein Bundesgesetz. From wachs at net.in.tum.de Mon Dec 16 10:28:01 2013 From: wachs at net.in.tum.de (Matthias Wachs) Date: Mon, 16 Dec 2013 10:28:01 +0100 Subject: Compile error with latest git Message-ID: <52AEC7A1.4050706@net.in.tum.de> Hi, On a freebsd 9.1 amd64 and a debian Lenny x86 system: In file included from mac.c:27: mac-internal.h:22: error: redefinition of typedef 'gcry_mac_hd_t' ../src/gcrypt.h:1301: error: previous declaration of 'gcry_mac_hd_t' was here *** [mac.lo] Error code 1 -Matthias -- Dipl.-Inf. Matthias Wachs Free Secure Network Systems Group Technische Universitaet Muenchen Chair for Network Architectures and Services Institute for Informatics / I8 Tel: +49 89 289 18037 Boltzmannstr. 3 / Room 03.05.042 Fax: +49 89 289 18033 D-85748 Garching b. Muenchen, Germany Email: wachs at net.in.tum.de -------------- next part -------------- A non-text attachment was scrubbed... 
Name: signature.asc Type: application/pgp-signature Size: 555 bytes Desc: OpenPGP digital signature URL: From wachs at net.in.tum.de Mon Dec 16 10:41:21 2013 From: wachs at net.in.tum.de (Matthias Wachs) Date: Mon, 16 Dec 2013 10:41:21 +0100 Subject: compile errors continued Message-ID: <52AECAC1.4030500@net.in.tum.de> Hi, while updating our buildbots I got another compile error: On a OS X machine: Darwin luke.net.in.tum.de 11.3.0 Darwin Kernel Version 11.3.0: Thu Jan 12 18:47:41 PST 2012; root:xnu-1699.24.23~1/RELEASE_X86_64 x86_64 /bin/sh ../libtool --tag=CC --mode=compile gcc -DHAVE_CONFIG_H -I. -I.. -I../src -I../src -I/opt/local/include -I/opt/local/include -g -O2 -Wall -MT mpih-div.lo -MD -MP -MF .deps/mpih-div.Tpo -c -o mpih-div.lo mpih-div.c libtool: compile: gcc -DHAVE_CONFIG_H -I. -I.. -I../src -I../src -I/opt/local/include -I/opt/local/include -g -O2 -Wall -MT mpih-div.lo -MD -MP -MF .deps/mpih-div.Tpo -c mpih-div.c -fno-common -DPIC -o .libs/mpih-div.o mpih-div.c: In function '_gcry_mpih_mod_1': mpih-div.c:183: error: unsupported inline asm: input constraint with a matching output constraint of incompatible type! make[2]: *** [mpih-div.lo] Error 1 make[1]: *** [all-recursive] Error 1 make: *** [all] Error 2 -Matthias -- Dipl.-Inf. Matthias Wachs Free Secure Network Systems Group Technische Universitaet Muenchen Chair for Network Architectures and Services Institute for Informatics / I8 Tel: +49 89 289 18037 Boltzmannstr. 3 / Room 03.05.042 Fax: +49 89 289 18033 D-85748 Garching b. Muenchen, Germany Email: wachs at net.in.tum.de -------------- next part -------------- A non-text attachment was scrubbed... Name: signature.asc Type: application/pgp-signature Size: 555 bytes Desc: OpenPGP digital signature URL: From cvs at cvs.gnupg.org Mon Dec 16 11:20:47 2013 From: cvs at cvs.gnupg.org (by Jussi Kivilinna) Date: Mon, 16 Dec 2013 11:20:47 +0100 Subject: [git] GCRYPT - branch, master, updated. 
libgcrypt-1.5.0-447-g953535a Message-ID: This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "The GNU crypto library". The branch, master has been updated via 953535a7de68cf62b5b1ad6f96ea3a9edd83762c (commit) via 5c31990214b58c4e17edb01fbbe6d9f573975a22 (commit) from 110fed2d6b0bbc97cb5cc0a3a564e05fc42afa2d (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. - Log ----------------------------------------------------------------- commit 953535a7de68cf62b5b1ad6f96ea3a9edd83762c Author: Jussi Kivilinna Date: Mon Dec 16 12:15:37 2013 +0200 Change dummy variable in mpih-div.c to mpi_limb_t type * mpi/mpih-div.c (_gcry_mpih_mod_1, _gcry_mpih_divmod_1): Change dummy variable to 'mpi_limb_t' type from 'int'. -- Patch attempts to fix problem reported by Matthias Wachs: while updating our buildbots I got another compile error: On a OS X machine: Darwin luke.net.in.tum.de 11.3.0 Darwin Kernel Version 11.3.0: Thu Jan 12 18:47:41 PST 2012; root:xnu-1699.24.23~1/RELEASE_X86_64 x86_64 /bin/sh ../libtool --tag=CC --mode=compile gcc -DHAVE_CONFIG_H -I. -I.. -I../src -I../src -I/opt/local/include -I/opt/local/include -g -O2 -Wall -MT mpih-div.lo -MD -MP -MF .deps/mpih-div.Tpo -c -o mpih-div.lo mpih-div.c libtool: compile: gcc -DHAVE_CONFIG_H -I. -I.. -I../src -I../src -I/opt/local/include -I/opt/local/include -g -O2 -Wall -MT mpih-div.lo -MD -MP -MF .deps/mpih-div.Tpo -c mpih-div.c -fno-common -DPIC -o .libs/mpih-div.o mpih-div.c: In function '_gcry_mpih_mod_1': mpih-div.c:183: error: unsupported inline asm: input constraint with a matching output constraint of incompatible type! 
make[2]: *** [mpih-div.lo] Error 1 make[1]: *** [all-recursive] Error 1 make: *** [all] Error 2 The new x86-64 inline assembly for MPI expects outputs to be limb sized variables (64-bit), but mpi/mpih-div.c was using a 32-bit dummy variable. Apparently this mismatch between assembly output and variable sizes does not fail on every platform. Signed-off-by: Jussi Kivilinna diff --git a/mpi/mpih-div.c b/mpi/mpih-div.c index 0bddd22..57c1b58 100644 --- a/mpi/mpih-div.c +++ b/mpi/mpih-div.c @@ -48,7 +48,7 @@ _gcry_mpih_mod_1(mpi_ptr_t dividend_ptr, mpi_size_t dividend_size, { mpi_size_t i; mpi_limb_t n1, n0, r; - int dummy GCC_ATTR_UNUSED; + mpi_limb_t dummy GCC_ATTR_UNUSED; /* Botch: Should this be handled at all? Rely on callers? */ if( !dividend_size ) @@ -396,7 +396,7 @@ _gcry_mpih_divmod_1( mpi_ptr_t quot_ptr, { mpi_size_t i; mpi_limb_t n1, n0, r; - int dummy GCC_ATTR_UNUSED; + mpi_limb_t dummy GCC_ATTR_UNUSED; if( !dividend_size ) return 0; commit 5c31990214b58c4e17edb01fbbe6d9f573975a22 Author: Jussi Kivilinna Date: Mon Dec 16 11:54:37 2013 +0200 Remove duplicate gcry_mac_hd_t typedef * cipher/mac-internal.h (gcry_mac_hd_t): Remove. -- Attempt to fix problem reported by Matthias Wachs: On a freebsd 9.1 amd64 and a debian Lenny x86 system: In file included from mac.c:27: mac-internal.h:22: error: redefinition of typedef 'gcry_mac_hd_t' ../src/gcrypt.h:1301: error: previous declaration of 'gcry_mac_hd_t' was here *** [mac.lo] Error code 1 Signed-off-by: Jussi Kivilinna diff --git a/cipher/mac-internal.h b/cipher/mac-internal.h index 2a5e7c6..6fc304b 100644 --- a/cipher/mac-internal.h +++ b/cipher/mac-internal.h @@ -19,7 +19,6 @@ /* The data object used to hold a handle to an encryption object.
*/ struct gcry_mac_handle; -typedef struct gcry_mac_handle *gcry_mac_hd_t; /* ----------------------------------------------------------------------- Summary of changes: cipher/mac-internal.h | 1 - mpi/mpih-div.c | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) hooks/post-receive -- The GNU crypto library http://git.gnupg.org _______________________________________________ Gnupg-commits mailing list Gnupg-commits at gnupg.org http://lists.gnupg.org/mailman/listinfo/gnupg-commits From jussi.kivilinna at iki.fi Mon Dec 16 11:22:32 2013 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Mon, 16 Dec 2013 12:22:32 +0200 Subject: compile errors continued In-Reply-To: <52AECAC1.4030500@net.in.tum.de> References: <52AECAC1.4030500@net.in.tum.de> Message-ID: <52AED468.9060205@iki.fi> On 16.12.2013 11:41, Matthias Wachs wrote: > Hi, > > while updating our buildbots I got another compile error: Hello, I've pushed two patches to repository which attempt to fix the reported build problems. -Jussi > > On a OS X machine: > > Darwin luke.net.in.tum.de 11.3.0 Darwin Kernel Version 11.3.0: Thu Jan > 12 18:47:41 PST 2012; root:xnu-1699.24.23~1/RELEASE_X86_64 x86_64 > > /bin/sh ../libtool --tag=CC --mode=compile gcc -DHAVE_CONFIG_H -I. > -I.. -I../src -I../src -I/opt/local/include -I/opt/local/include -g -O2 > -Wall -MT mpih-div.lo -MD -MP -MF .deps/mpih-div.Tpo -c -o mpih-div.lo > mpih-div.c > libtool: compile: gcc -DHAVE_CONFIG_H -I. -I.. -I../src -I../src > -I/opt/local/include -I/opt/local/include -g -O2 -Wall -MT mpih-div.lo > -MD -MP -MF .deps/mpih-div.Tpo -c mpih-div.c -fno-common -DPIC -o > .libs/mpih-div.o > mpih-div.c: In function '_gcry_mpih_mod_1': > mpih-div.c:183: error: unsupported inline asm: input constraint with a > matching output constraint of incompatible type! 
> make[2]: *** [mpih-div.lo] Error 1 > make[1]: *** [all-recursive] Error 1 > make: *** [all] Error 2 > > > -Matthias > > > > _______________________________________________ > Gcrypt-devel mailing list > Gcrypt-devel at gnupg.org > http://lists.gnupg.org/mailman/listinfo/gcrypt-devel > From wachs at net.in.tum.de Mon Dec 16 11:45:17 2013 From: wachs at net.in.tum.de (Matthias Wachs) Date: Mon, 16 Dec 2013 11:45:17 +0100 Subject: compile errors continued In-Reply-To: <52AED468.9060205@iki.fi> References: <52AECAC1.4030500@net.in.tum.de> <52AED468.9060205@iki.fi> Message-ID: <52AED9BD.1030108@net.in.tum.de> It now compiles on all machines... Thanks a lot.. -Matthias On 12/16/2013 11:22 AM, Jussi Kivilinna wrote: > On 16.12.2013 11:41, Matthias Wachs wrote: >> Hi, >> >> while updating our buildbots I got another compile error: > > Hello, > > I've pushed two patches to repository which attempt to fix the reported > build problems. > > -Jussi > >> >> On a OS X machine: >> >> Darwin luke.net.in.tum.de 11.3.0 Darwin Kernel Version 11.3.0: Thu Jan >> 12 18:47:41 PST 2012; root:xnu-1699.24.23~1/RELEASE_X86_64 x86_64 >> >> /bin/sh ../libtool --tag=CC --mode=compile gcc -DHAVE_CONFIG_H -I. >> -I.. -I../src -I../src -I/opt/local/include -I/opt/local/include -g -O2 >> -Wall -MT mpih-div.lo -MD -MP -MF .deps/mpih-div.Tpo -c -o mpih-div.lo >> mpih-div.c >> libtool: compile: gcc -DHAVE_CONFIG_H -I. -I.. -I../src -I../src >> -I/opt/local/include -I/opt/local/include -g -O2 -Wall -MT mpih-div.lo >> -MD -MP -MF .deps/mpih-div.Tpo -c mpih-div.c -fno-common -DPIC -o >> .libs/mpih-div.o >> mpih-div.c: In function '_gcry_mpih_mod_1': >> mpih-div.c:183: error: unsupported inline asm: input constraint with a >> matching output constraint of incompatible type! 
>> make[2]: *** [mpih-div.lo] Error 1 >> make[1]: *** [all-recursive] Error 1 >> make: *** [all] Error 2 >> >> >> -Matthias >> >> >> >> _______________________________________________ >> Gcrypt-devel mailing list >> Gcrypt-devel at gnupg.org >> http://lists.gnupg.org/mailman/listinfo/gcrypt-devel >> > > > _______________________________________________ > Gcrypt-devel mailing list > Gcrypt-devel at gnupg.org > http://lists.gnupg.org/mailman/listinfo/gcrypt-devel > -- Dipl.-Inf. Matthias Wachs Free Secure Network Systems Group Technische Universitaet Muenchen Chair for Network Architectures and Services Institute for Informatics / I8 Tel: +49 89 289 18037 Boltzmannstr. 3 / Room 03.05.042 Fax: +49 89 289 18033 D-85748 Garching b. Muenchen, Germany Email: wachs at net.in.tum.de -------------- next part -------------- A non-text attachment was scrubbed... Name: signature.asc Type: application/pgp-signature Size: 555 bytes Desc: OpenPGP digital signature URL: From cvs at cvs.gnupg.org Mon Dec 16 11:56:36 2013 From: cvs at cvs.gnupg.org (by Werner Koch) Date: Mon, 16 Dec 2013 11:56:36 +0100 Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.5.0-450-ga6b9304 Message-ID: This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "The GNU crypto library". The branch, master has been updated via a6b9304a889397ac98e1c2c4ac3e178669d94492 (commit) via 5a7ce59396fe56f0d681df314bfbdb5f7732d4b1 (commit) via dec048b2ec79271a2f4405be5b87b1e768b3f1a9 (commit) from 953535a7de68cf62b5b1ad6f96ea3a9edd83762c (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. 
- Log ----------------------------------------------------------------- commit a6b9304a889397ac98e1c2c4ac3e178669d94492 Author: Werner Koch Date: Mon Dec 16 11:43:22 2013 +0100 Add configure option --enable-large-data-tests. * configure.ac: Add option --enable-large-data-tests. * tests/hashtest-256g.in: New. * tests/Makefile.am (EXTRA_DIST): Add hashtest-256g.in. (TESTS): Split up into tests_bin, tests_bin_last, tests_sh, and tests_sh_last. (tests_sh_last): Add hashtest-256g (noinst_PROGRAMS): Add only tests_bin and tests_bin_last. (bench-slope.log, hashtest-256g.log): New rules to enforce serial run. Signed-off-by: Werner Koch diff --git a/README b/README index ff3ce9c..558e008 100644 --- a/README +++ b/README @@ -80,6 +80,10 @@ Here is a list of configure options which are sometimes useful for installation. + --enable-large-data-tests + With this option a "make check" will take really + long due to extra checks for the hash algorithms. + --enable-m-guard Enable the integrated malloc checking code. Please note that this feature does not work on all CPUs diff --git a/configure.ac b/configure.ac index 9088d02..571e156 100644 --- a/configure.ac +++ b/configure.ac @@ -524,6 +524,16 @@ if test "$use_m_guard" = yes ; then AC_DEFINE(M_GUARD,1,[Define to use the (obsolete) malloc guarding feature]) fi +# Implementation of the --enable-large-data-tests switch. +AC_MSG_CHECKING([whether to run large data tests]) +AC_ARG_ENABLE(large-data-tests, + AC_HELP_STRING([--enable-large-data-tests], + [Enable the real long ruinning large data tests]), + large_data_tests=$enableval,large_data_tests=no) +AC_MSG_RESULT($large_data_tests) +AC_SUBST(RUN_LARGE_DATA_TESTS, $large_data_tests) + + # Implementation of the --with-capabilities switch. 
# Check whether we want to use Linux capabilities AC_MSG_CHECKING([whether use of capabilities is requested]) @@ -1914,6 +1924,7 @@ src/libgcrypt-config src/versioninfo.rc tests/Makefile ]) +AC_CONFIG_FILES([tests/hashtest-256g], [chmod +x tests/hashtest-256g]) AC_OUTPUT diff --git a/tests/Makefile.am b/tests/Makefile.am index 3fb9fd6..f5b5b9f 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -18,13 +18,24 @@ ## Process this file with automake to produce Makefile.in -TESTS = version mpitests tsexp t-convert \ +tests_bin = \ + version mpitests tsexp t-convert \ t-mpi-bit t-mpi-point curves \ prime basic keygen pubkey hmac hashtest t-kdf keygrip \ fips186-dsa aeswrap pkcs1v2 random dsa-rfc6979 t-ed25519 -# The last tests to run. -TESTS += benchmark bench-slope +tests_bin_last = benchmark bench-slope + +tests_sh = + +tests_sh_last = hashtest-256g + +TESTS = $(tests_bin) $(tests_sh) $(tests_bin_last) $(tests_sh_last) + +# Force sequential run of some tests. +bench-slope.log: benchmark.log +hashtest-256g.log: bench-slope.log + TESTS_ENVIRONMENT = GCRYPT_IN_REGRESSION_TEST=1 @@ -37,8 +48,8 @@ AM_CFLAGS = $(GPG_ERROR_CFLAGS) LDADD = ../src/libgcrypt.la $(DL_LIBS) ../compat/libcompat.la $(GPG_ERROR_LIBS) EXTRA_PROGRAMS = testapi pkbench -noinst_PROGRAMS = $(TESTS) fipsdrv rsacvt genhashdata +noinst_PROGRAMS = $(tests_bin) $(tests_bin_last) fipsdrv rsacvt genhashdata EXTRA_DIST = README rsa-16k.key cavs_tests.sh cavs_driver.pl \ pkcs1v2-oaep.h pkcs1v2-pss.h pkcs1v2-v15c.h pkcs1v2-v15s.h \ - t-ed25519.inp stopwatch.h + t-ed25519.inp stopwatch.h hashtest-256g.in diff --git a/tests/hashtest-256g.in b/tests/hashtest-256g.in new file mode 100755 index 0000000..02ab3f6 --- /dev/null +++ b/tests/hashtest-256g.in @@ -0,0 +1,7 @@ +#!/bin/sh + +algos="SHA1 SHA256" + +test "@RUN_LARGE_DATA_TESTS@" = yes || exit 77 +echo " now running 256 GiB tests for $algos - this takes looong" +exec ./hashtest --gigs 256 $algos commit 5a7ce59396fe56f0d681df314bfbdb5f7732d4b1 Author: Werner 
Koch Date: Mon Dec 16 09:45:02 2013 +0100 random: Call random progress handler more often. * random/rndlinux.c (_gcry_rndlinux_gather_random): Update progress indicator earlier. -- GnuPG-bug-id: 1531 Signed-off-by: Werner Koch diff --git a/random/rndlinux.c b/random/rndlinux.c index 21ea8c4..89ac203 100644 --- a/random/rndlinux.c +++ b/random/rndlinux.c @@ -183,6 +183,18 @@ _gcry_rndlinux_gather_random (void (*add)(const void*, size_t, struct timeval tv; int rc; + /* If we collected some bytes update the progress indicator. We + do this always and not just if the select timed out because + often just a few bytes are gathered within the timeout + period. */ + if (any_need_entropy || last_so_far != (want - length) ) + { + last_so_far = want - length; + _gcry_random_progress ("need_entropy", 'X', + (int)last_so_far, (int)want); + any_need_entropy = 1; + } + /* If the system has no limit on the number of file descriptors and we encounter an fd which is larger than the fd_set size, we don't use the select at all. The select code is only used @@ -198,13 +210,7 @@ _gcry_rndlinux_gather_random (void (*add)(const void*, size_t, tv.tv_usec = delay? 0 : 100000; if ( !(rc=select(fd+1, &rfds, NULL, NULL, &tv)) ) { - if (!any_need_entropy || last_so_far != (want - length) ) - { - last_so_far = want - length; - _gcry_random_progress ("need_entropy", 'X', - (int)last_so_far, (int)want); - any_need_entropy = 1; - } + any_need_entropy = 1; delay = 3; /* Use 3 seconds henceforth. */ continue; } commit dec048b2ec79271a2f4405be5b87b1e768b3f1a9 Author: Werner Koch Date: Mon Dec 16 09:22:10 2013 +0100 cipher: Normalize the MPIs used as input to secret key functions. * cipher/dsa.c (sign): Normalize INPUT. * cipher/elgamal.c (decrypt): Normalize A and B. * cipher/rsa.c (secret): Normalize the INPUT. (rsa_decrypt): Reduce DATA before passing to secret. -- mpi_normalize is in general not required because extra leading zeroes do not harm the computation. 
However, adding extra all zero limbs or padding with multiples of N may be useful in side-channel attacks. This is an extra pre-caution in case RSA blinding has been disabled. CVE-id: CVE-2013-4576 Signed-off-by: Werner Koch diff --git a/cipher/dsa.c b/cipher/dsa.c index 5d29ba4..50bdab1 100644 --- a/cipher/dsa.c +++ b/cipher/dsa.c @@ -583,7 +583,10 @@ sign (gcry_mpi_t r, gcry_mpi_t s, gcry_mpi_t input, DSA_secret_key *skey, mpi_rshift (hash, hash, abits - qbits); } else - hash = input; + { + mpi_normalize (input); + hash = input; + } again: /* Create the K value. */ diff --git a/cipher/elgamal.c b/cipher/elgamal.c index 3645e7d..a71a9bc 100644 --- a/cipher/elgamal.c +++ b/cipher/elgamal.c @@ -497,10 +497,13 @@ do_encrypt(gcry_mpi_t a, gcry_mpi_t b, gcry_mpi_t input, ELG_public_key *pkey ) static void -decrypt(gcry_mpi_t output, gcry_mpi_t a, gcry_mpi_t b, ELG_secret_key *skey ) +decrypt (gcry_mpi_t output, gcry_mpi_t a, gcry_mpi_t b, ELG_secret_key *skey ) { gcry_mpi_t t1 = mpi_alloc_secure( mpi_get_nlimbs( skey->p ) ); + mpi_normalize (a); + mpi_normalize (b); + /* output = b/(a^x) mod p */ mpi_powm( t1, a, skey->x, skey->p ); mpi_invm( t1, t1, skey->p ); diff --git a/cipher/rsa.c b/cipher/rsa.c index a97dcfd..e595e38 100644 --- a/cipher/rsa.c +++ b/cipher/rsa.c @@ -712,6 +712,9 @@ stronger_key_check ( RSA_secret_key *skey ) static void secret (gcry_mpi_t output, gcry_mpi_t input, RSA_secret_key *skey ) { + /* Remove superfluous leading zeroes from INPUT. */ + mpi_normalize (input); + if (!skey->p || !skey->q || !skey->u) { mpi_powm (output, input, skey->d, skey->n); @@ -997,6 +1000,13 @@ rsa_decrypt (gcry_sexp_t *r_plain, gcry_sexp_t s_data, gcry_sexp_t keyparms) } } + /* Better make sure that there are no superfluous leading zeroes in + the input and it has not been "padded" using multiples of N. + This mitigates side-channel attacks (CVE-2013-4576). */ + mpi_normalize (data); + mpi_fdiv_r (data, data, sk.n); + + /* Allocate MPI for the plaintext. 
*/ plain = mpi_snew (ctx.nbits); /* We use blinding by default to mitigate timing attacks which can ----------------------------------------------------------------------- Summary of changes: README | 4 ++++ cipher/dsa.c | 5 ++++- cipher/elgamal.c | 5 ++++- cipher/rsa.c | 10 ++++++++++ configure.ac | 11 +++++++++++ random/rndlinux.c | 20 +++++++++++++------- tests/Makefile.am | 21 ++++++++++++++++----- tests/hashtest-256g.in | 7 +++++++ 8 files changed, 69 insertions(+), 14 deletions(-) create mode 100755 tests/hashtest-256g.in hooks/post-receive -- The GNU crypto library http://git.gnupg.org _______________________________________________ Gnupg-commits mailing list Gnupg-commits at gnupg.org http://lists.gnupg.org/mailman/listinfo/gnupg-commits From wk at gnupg.org Mon Dec 16 12:42:21 2013 From: wk at gnupg.org (Werner Koch) Date: Mon, 16 Dec 2013 12:42:21 +0100 Subject: compile errors continued In-Reply-To: <52AED9BD.1030108@net.in.tum.de> (Matthias Wachs's message of "Mon, 16 Dec 2013 11:45:17 +0100") References: <52AECAC1.4030500@net.in.tum.de> <52AED468.9060205@iki.fi> <52AED9BD.1030108@net.in.tum.de> Message-ID: <874n69hvj6.fsf@vigenere.g10code.de> On Mon, 16 Dec 2013 11:45, wachs at net.in.tum.de said: > It now compiles on all machines... Is it okay to do a release now? Salam-Shalom, Werner -- Die Gedanken sind frei. Ausnahmen regelt ein Bundesgesetz. 
From wachs at net.in.tum.de Mon Dec 16 13:04:08 2013 From: wachs at net.in.tum.de (Matthias Wachs) Date: Mon, 16 Dec 2013 13:04:08 +0100 Subject: compile errors continued In-Reply-To: <874n69hvj6.fsf@vigenere.g10code.de> References: <52AECAC1.4030500@net.in.tum.de> <52AED468.9060205@iki.fi> <52AED9BD.1030108@net.in.tum.de> <874n69hvj6.fsf@vigenere.g10code.de> Message-ID: <52AEEC38.6060800@net.in.tum.de> One of our buildbots (the infamous sparc machine) is offline at the moment, but on the rest it compiles without issues -Matthias On 12/16/2013 12:42 PM, Werner Koch wrote: > On Mon, 16 Dec 2013 11:45, wachs at net.in.tum.de said: >> It now compiles on all machines... > > Is it okay to do a release now? > > > Salam-Shalom, > > Werner > -- Dipl.-Inf. Matthias Wachs Free Secure Network Systems Group Technische Universitaet Muenchen Chair for Network Architectures and Services Institute for Informatics / I8 Tel: +49 89 289 18037 Boltzmannstr. 3 / Room 03.05.042 Fax: +49 89 289 18033 D-85748 Garching b. Muenchen, Germany Email: wachs at net.in.tum.de -------------- next part -------------- A non-text attachment was scrubbed... Name: signature.asc Type: application/pgp-signature Size: 555 bytes Desc: OpenPGP digital signature URL: From cvs at cvs.gnupg.org Mon Dec 16 13:03:46 2013 From: cvs at cvs.gnupg.org (by Werner Koch) Date: Mon, 16 Dec 2013 13:03:46 +0100 Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.5.0-451-g0d3bd23 Message-ID: This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "The GNU crypto library". The branch, master has been updated via 0d3bd23d7f730b9bbc81fc8da8d99f4853c36020 (commit) from a6b9304a889397ac98e1c2c4ac3e178669d94492 (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. 
- Log ----------------------------------------------------------------- commit 0d3bd23d7f730b9bbc81fc8da8d99f4853c36020 Author: Werner Koch Date: Mon Dec 16 12:43:50 2013 +0100 tests: Add SHA-512 to the long hash test. * tests/hashtest.c (testvectors): Add vectors for 256GiB SHA-512. * tests/hashtest-256g.in (algos): Add test for SHA-512. Signed-off-by: Werner Koch diff --git a/tests/genhashdata.c b/tests/genhashdata.c index 8777f9c..e16c49b 100644 --- a/tests/genhashdata.c +++ b/tests/genhashdata.c @@ -40,6 +40,19 @@ $ for i in -64 -1 0 1 64; do ./genhashdata --gigs 256 --bytes $i|sha256sum;done 5a2e21b1e79cd866acf53a2a18ca76bd4e02c4b01bf4627354171824c812d95f - 34444808af8e9d995e67f9e155ed94bf55f195a51dc1d8a989e6bcf95511c8a2 - + +$ for i in -64 -1 0 1 64; do ./genhashdata --gigs 256 --bytes $i|sha512sum;done +e01bf8140874bf240e8426cb2bcbc377cbed2e6037334116637149e1cd8cd462 \ +96828b71f32b9f002771d4cb51172ce578b73b7939221e4df655ecd08601e655 - +4917ff94514b1757705c289fdc3e7d6ffcce5771b20ae237ebc03d2ec9eb435f \ +b7ce9f0e27272be8cced77a5edae1a01a0ad62b0a44169d88bbee45474a17734 - +1e28e8b3c79f2f47da11f3c0b7da4e7981e7d932db6d17d528a31e191922edda \ +8fc4bb2df10ea876232db5a1c606bc41886e8b2c570a3e721221f60c8c7dc4ab - +027d3324dd1cf127770ceb53681f4c70937c9bca4e3acd5fd76cb266c7d4527d \ +58140290a1822e8d60c4d3ae9725fb923183230d6dfd2d7d73c0d74a4757f34a - +49920704ea9d6ee19f0742d6c868110fa3eda8ac09f026e9ef22cc731af53020 \ +de40eedef66cb1afd94c61e285fa9327e01336e804903740a9145ab1f065c2d5 - + */ #include diff --git a/tests/hashtest-256g.in b/tests/hashtest-256g.in index 02ab3f6..e897c54 100755 --- a/tests/hashtest-256g.in +++ b/tests/hashtest-256g.in @@ -1,6 +1,6 @@ #!/bin/sh -algos="SHA1 SHA256" +algos="SHA1 SHA256 SHA512" test "@RUN_LARGE_DATA_TESTS@" = yes || exit 77 echo " now running 256 GiB tests for $algos - this takes looong" diff --git a/tests/hashtest.c b/tests/hashtest.c index 15310d0..6fbce0c 100644 --- a/tests/hashtest.c +++ b/tests/hashtest.c @@ -63,6 +63,7 @@ 
static struct { { GCRY_MD_SHA1, 256, -0, "71b923afde1c8c040884c723a2e3335b333e64c6" }, { GCRY_MD_SHA1, 256, 1, "2d99f9b5b86e9c9c937104f4242bd6b8bc0927ef" }, { GCRY_MD_SHA1, 256, 64, "a60dabe8d749f798b7ec3a684cc3eab487451482" }, + { GCRY_MD_SHA224, 256, -64, "b5672b54d2480a5688a2dc727a1ad4db7a81ef31ce8999e0bbaeffdc" }, { GCRY_MD_SHA224, 256, -1, @@ -73,6 +74,7 @@ static struct { "e578d5d523320876565bbbc892511a485427caee6dd754d57e3e58c2" }, { GCRY_MD_SHA224, 256, 64, "ff0464df248cd298b63765bc4f87f21e25c93c657fdf3656d3c878e5" }, + { GCRY_MD_SHA256, 256, -64, "87a9828d3de78d55d252341db2a622908c4e0ceaee9961ecf9768700fc799ec8" }, { GCRY_MD_SHA256, 256, -1, @@ -83,10 +85,25 @@ static struct { "5a2e21b1e79cd866acf53a2a18ca76bd4e02c4b01bf4627354171824c812d95f" }, { GCRY_MD_SHA256, 256, 64, "34444808af8e9d995e67f9e155ed94bf55f195a51dc1d8a989e6bcf95511c8a2" }, - { 0 } -}; + { GCRY_MD_SHA512, 256, -64, + "e01bf8140874bf240e8426cb2bcbc377cbed2e6037334116637149e1cd8cd462" + "96828b71f32b9f002771d4cb51172ce578b73b7939221e4df655ecd08601e655" }, + { GCRY_MD_SHA512, 256, -1, + "4917ff94514b1757705c289fdc3e7d6ffcce5771b20ae237ebc03d2ec9eb435f" + "b7ce9f0e27272be8cced77a5edae1a01a0ad62b0a44169d88bbee45474a17734" }, + { GCRY_MD_SHA512, 256, 0, + "1e28e8b3c79f2f47da11f3c0b7da4e7981e7d932db6d17d528a31e191922edda" + "8fc4bb2df10ea876232db5a1c606bc41886e8b2c570a3e721221f60c8c7dc4ab" }, + { GCRY_MD_SHA512, 256, 1, + "027d3324dd1cf127770ceb53681f4c70937c9bca4e3acd5fd76cb266c7d4527d" + "58140290a1822e8d60c4d3ae9725fb923183230d6dfd2d7d73c0d74a4757f34a" }, + { GCRY_MD_SHA512, 256, 64, + "49920704ea9d6ee19f0742d6c868110fa3eda8ac09f026e9ef22cc731af53020" + "de40eedef66cb1afd94c61e285fa9327e01336e804903740a9145ab1f065c2d5" }, + { 0 } +}; ----------------------------------------------------------------------- Summary of changes: tests/genhashdata.c | 13 +++++++++++++ tests/hashtest-256g.in | 2 +- tests/hashtest.c | 21 +++++++++++++++++++-- 3 files changed, 33 insertions(+), 3 deletions(-) 
hooks/post-receive -- The GNU crypto library http://git.gnupg.org _______________________________________________ Gnupg-commits mailing list Gnupg-commits at gnupg.org http://lists.gnupg.org/mailman/listinfo/gnupg-commits From wk at gnupg.org Mon Dec 16 13:44:22 2013 From: wk at gnupg.org (Werner Koch) Date: Mon, 16 Dec 2013 13:44:22 +0100 Subject: compile errors continued In-Reply-To: <52AEEC38.6060800@net.in.tum.de> (Matthias Wachs's message of "Mon, 16 Dec 2013 13:04:08 +0100") References: <52AECAC1.4030500@net.in.tum.de> <52AED468.9060205@iki.fi> <52AED9BD.1030108@net.in.tum.de> <874n69hvj6.fsf@vigenere.g10code.de> <52AEEC38.6060800@net.in.tum.de> Message-ID: <87r49dge3d.fsf@vigenere.g10code.de> On Mon, 16 Dec 2013 13:04, wachs at net.in.tum.de said: > One of our buildbots (the infamous sparc machine) is offline at the > moment, but on the rest it compiles without issues Well, we know that it worked on the sparc. I just did a test on AIX with Power7 w/o problems. Now, if I only could find the disk for my VIA board to test the padlock stuff. Salam-Shalom, Werner -- Die Gedanken sind frei. Ausnahmen regelt ein Bundesgesetz. From dkg at fifthhorseman.net Mon Dec 16 15:05:18 2013 From: dkg at fifthhorseman.net (Daniel Kahn Gillmor) Date: Mon, 16 Dec 2013 09:05:18 -0500 Subject: Compile error with latest git In-Reply-To: <52AEC7A1.4050706@net.in.tum.de> References: <52AEC7A1.4050706@net.in.tum.de> Message-ID: <52AF089E.7020408@fifthhorseman.net> On 12/16/2013 04:28 AM, Matthias Wachs wrote: > On a freebsd 9.1 amd64 and a debian Lenny x86 system: fwiw, debian lenny is *very* old, and no longer supported by anyone that i'm aware of. Have you tried against a more recent version of debian? --dkg -------------- next part -------------- A non-text attachment was scrubbed...
Name: signature.asc Type: application/pgp-signature Size: 1027 bytes Desc: OpenPGP digital signature URL: From wk at gnupg.org Mon Dec 16 16:37:22 2013 From: wk at gnupg.org (Werner Koch) Date: Mon, 16 Dec 2013 16:37:22 +0100 Subject: Compile error with latest git In-Reply-To: <52AF089E.7020408@fifthhorseman.net> (Daniel Kahn Gillmor's message of "Mon, 16 Dec 2013 09:05:18 -0500") References: <52AEC7A1.4050706@net.in.tum.de> <52AF089E.7020408@fifthhorseman.net> Message-ID: <877gb4hknh.fsf@vigenere.g10code.de> On Mon, 16 Dec 2013 15:05, dkg at fifthhorseman.net said: > fwiw, debian lenny is *very* old, and no longer supported by anyone that > i'm aware of. Have you tried against a more recent version of debian? Libgcrypt is designed to be portable and shall run on all Unix platforms supporting a C89 compiler. However, with the long list of changes it is quite clear that we will experience portability bugs. They will be addressed by the next versions. Salam-Shalom, Werner -- Die Gedanken sind frei. Ausnahmen regelt ein Bundesgesetz. From dbaryshkov at gmail.com Mon Dec 16 17:40:44 2013 From: dbaryshkov at gmail.com (Dmitry Eremin-Solenikov) Date: Mon, 16 Dec 2013 20:40:44 +0400 Subject: [PATCH] Add a simple PKCS#1 padding mode Message-ID: <1387212044-19849-1-git-send-email-dbaryshkov@gmail.com> * cipher/rsa-common.c (_gcry_rsa_pkcs1_encode_simple_for_sig): PKCS#1-encode data with embedded hash OID for signature verification. * cipher/pubkey-util.c (_gcry_pk_util_data_to_mpi): handle s-exp like (data (flags pkcs1) (value xxxxx)) * tests/basic.c (check_pubkey_sign): fix the return value for this kind of s-exp. -- Allow user to specify (flags pkcs1) to enable pkcs1 padding of raw value (no hash algorithm is specified). It is up to the user to verify that passed value is properly formatted and includes DER-encoded ASN OID of the hash function. This is required to enable GnuTLS to use gcrypt library as a crypto backend. 
Signed-off-by: Dmitry Eremin-Solenikov --- cipher/pubkey-internal.h | 4 +++ cipher/pubkey-util.c | 15 +++++++++++ cipher/rsa-common.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++++ tests/basic.c | 2 +- 4 files changed, 85 insertions(+), 1 deletion(-) diff --git a/cipher/pubkey-internal.h b/cipher/pubkey-internal.h index db1399d..4b5ee90 100644 --- a/cipher/pubkey-internal.h +++ b/cipher/pubkey-internal.h @@ -56,6 +56,10 @@ gpg_err_code_t _gcry_rsa_pkcs1_decode_for_enc (unsigned char **r_result, size_t *r_resultlen, unsigned int nbits, gcry_mpi_t value); gpg_err_code_t +_gcry_rsa_pkcs1_encode_simple_for_sig (gcry_mpi_t *r_result, unsigned int nbits, + const unsigned char *value, size_t valuelen); + +gpg_err_code_t _gcry_rsa_pkcs1_encode_for_sig (gcry_mpi_t *r_result, unsigned int nbits, const unsigned char *value, size_t valuelen, int algo); diff --git a/cipher/pubkey-util.c b/cipher/pubkey-util.c index 616b499..01d10ed 100644 --- a/cipher/pubkey-util.c +++ b/cipher/pubkey-util.c @@ -850,6 +850,21 @@ _gcry_pk_util_data_to_mpi (gcry_sexp_t input, gcry_mpi_t *ret_mpi, ctx->hash_algo); } } + else if (ctx->encoding == PUBKEY_ENC_PKCS1 && lvalue + && (ctx->op == PUBKEY_OP_SIGN || ctx->op == PUBKEY_OP_VERIFY)) + { + const void * value; + size_t valuelen; + + if (sexp_length (lvalue) != 2) + rc = GPG_ERR_INV_OBJ; + else if ( !(value=sexp_nth_data (lvalue, 1, &valuelen)) + || !valuelen ) + rc = GPG_ERR_INV_OBJ; + else + rc = _gcry_rsa_pkcs1_encode_simple_for_sig (ret_mpi, ctx->nbits, + value, valuelen); + } else if (ctx->encoding == PUBKEY_ENC_OAEP && lvalue && ctx->op == PUBKEY_OP_ENCRYPT) { diff --git a/cipher/rsa-common.c b/cipher/rsa-common.c index 4f5a659..0c8df3a 100644 --- a/cipher/rsa-common.c +++ b/cipher/rsa-common.c @@ -319,6 +319,71 @@ _gcry_rsa_pkcs1_encode_for_sig (gcry_mpi_t *r_result, unsigned int nbits, return rc; } +/* Encode {VALUE,VALUELEN} for an NBITS keys using the pkcs#1 block + type 1 padding. 
On success the result is stored as a new MPI at + R_RESULT. On error the value at R_RESULT is undefined. + + We encode the value in this way: + + 0 1 PAD(n bytes) 0 VALUE(valuelen bytes) + + 0 is a marker we unfortunately can't encode because we return an + MPI which strips all leading zeroes. + 1 is the block type. + PAD consists of 0xff bytes. + 0 marks the end of the padding. + + (Note that PGP prior to version 2.3 encoded the message digest as: + 0 1 MD(16 bytes) 0 PAD(n bytes) 1 + The MD is always 16 bytes here because it's always MD5. GnuPG + does not not support pre-v2.3 signatures, but I'm including this + comment so the information is easily found if needed.) +*/ +gpg_err_code_t +_gcry_rsa_pkcs1_encode_simple_for_sig (gcry_mpi_t *r_result, unsigned int nbits, + const unsigned char *value, size_t valuelen) +{ + gcry_err_code_t rc = 0; + gcry_error_t err; + byte *frame = NULL; + size_t nframe = (nbits+7) / 8; + int i; + size_t n; + + if ( !valuelen || valuelen + 4 > nframe) + { + /* Can't encode an DLEN byte digest MD into an NFRAME byte + frame. */ + return GPG_ERR_TOO_SHORT; + } + + if ( !(frame = xtrymalloc (nframe)) ) + return gpg_err_code_from_syserror (); + + /* Assemble the pkcs#1 block type 1. */ + n = 0; + frame[n++] = 0; + frame[n++] = 1; /* block type */ + i = nframe - valuelen - 3 ; + gcry_assert (i > 1); + memset (frame+n, 0xff, i ); + n += i; + frame[n++] = 0; + memcpy (frame+n, value, valuelen ); + n += valuelen; + gcry_assert (n == nframe); + + /* Convert it into an MPI. */ + err = _gcry_mpi_scan (r_result, GCRYMPI_FMT_USG, frame, n, &nframe); + if (err) + rc = gcry_err_code (err); + else if (DBG_CIPHER) + log_mpidump ("PKCS#1 block type 1 encoded data", *r_result); + xfree (frame); + + return rc; +} + /* Mask generation function for OAEP. See RFC-3447 B.2.1. 
*/ static gcry_err_code_t diff --git a/tests/basic.c b/tests/basic.c index 84a2f60..719fb0e 100644 --- a/tests/basic.c +++ b/tests/basic.c @@ -5361,7 +5361,7 @@ check_pubkey_sign (int n, gcry_sexp_t skey, gcry_sexp_t pkey, int algo) { "(data\n (flags pkcs1)\n" " (value #11223344556677889900AA#))\n", GCRY_PK_RSA, - GPG_ERR_CONFLICT }, + 0 }, { "(data\n (flags raw foo)\n" " (value #11223344556677889900AA#))\n", 0, -- 1.8.5.1 From dbaryshkov at gmail.com Mon Dec 16 17:41:24 2013 From: dbaryshkov at gmail.com (Dmitry Eremin-Solenikov) Date: Mon, 16 Dec 2013 20:41:24 +0400 Subject: [PATCH] Add an utility to calculate hashes over a set of files Message-ID: <1387212084-19904-1-git-send-email-dbaryshkov@gmail.com> * src/gchash.c: New. -- An utility like rhash that has the ability to calculate different hashes over a set of files it usefull. Add gchash utility to calculate hashes supported by libgcrypt. Signed-off-by: Dmitry Eremin-Solenikov --- .gitignore | 1 + src/Makefile.am | 6 ++- src/gchash.c | 120 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 126 insertions(+), 1 deletion(-) create mode 100644 src/gchash.c diff --git a/.gitignore b/.gitignore index 3a94fbc..82c8d4a 100644 --- a/.gitignore +++ b/.gitignore @@ -81,6 +81,7 @@ random/Makefile random/librandom.la src/Makefile src/dumpsexp +src/gchash src/gcrypt.h src/hmac256 src/libgcrypt-config diff --git a/src/Makefile.am b/src/Makefile.am index c020239..4259424 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -29,7 +29,7 @@ m4data_DATA = libgcrypt.m4 include_HEADERS = gcrypt.h lib_LTLIBRARIES = libgcrypt.la -bin_PROGRAMS = dumpsexp hmac256 mpicalc +bin_PROGRAMS = dumpsexp hmac256 mpicalc gchash if USE_RANDOM_DAEMON sbin_PROGRAMS = gcryptrnd bin_PROGRAMS += getrandom @@ -128,6 +128,10 @@ dumpsexp_SOURCES = dumpsexp.c dumpsexp_CFLAGS = $(arch_gpg_error_cflags) dumpsexp_LDADD = $(arch_gpg_error_libs) +gchash_SOURCES = gchash.c +gchash_CFLAGS = $(GPG_ERROR_CFLAGS) +gchash_LDADD = 
libgcrypt.la $(GPG_ERROR_LIBS) + mpicalc_SOURCES = mpicalc.c mpicalc_CFLAGS = $(GPG_ERROR_CFLAGS) mpicalc_LDADD = libgcrypt.la $(GPG_ERROR_LIBS) diff --git a/src/gchash.c b/src/gchash.c new file mode 100644 index 0000000..7a2aad6 --- /dev/null +++ b/src/gchash.c @@ -0,0 +1,120 @@ +/* gchash.c - Calculate hash values + * Copyright (C) 2013 Dmitry Eremin-Solenikov + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . + */ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include + +#ifdef _GCRYPT_IN_LIBGCRYPT +# undef _GCRYPT_IN_LIBGCRYPT +# include "gcrypt.h" +#else +# include +#endif + + +void +init_gcrypt (void) +{ + if (!gcry_check_version (GCRYPT_VERSION)) { + fputs ("libgcrypt version mismatch\n", stderr); + exit (2); + } + + gcry_control (GCRYCTL_SUSPEND_SECMEM_WARN); + + /* Allocate a pool of 16k secure memory. This make the secure memory + * available and also drops privileges where needed. 
*/ + gcry_control (GCRYCTL_INIT_SECMEM, 16384, 0); + + gcry_control (GCRYCTL_RESUME_SECMEM_WARN); + + gcry_control (GCRYCTL_INITIALIZATION_FINISHED, 0); +} + +int +main (int argc, char **argv) +{ + gcry_md_hd_t hd; + gcry_error_t err; + int algo; + + init_gcrypt(); + + if (argc < 2 || (argv[1] && !strcmp(argv[1], "--help"))) + { + fprintf (stderr, "Usage: %s ...\n", argv[0]); + return 1; + } + + algo = gcry_md_map_name (argv[1]); + if (algo == GCRY_MD_NONE) + { + fprintf (stderr, "Unknown algorithm '%s'\n", argv[1]); + return 1; + } + + err = gcry_md_open(&hd, algo, 0); + if (err) + { + fprintf (stderr, "LibGCrypt error %s/%s\n", + gcry_strsource (err), + gcry_strerror (err)); + exit (1); + } + + for (argv += 2; *argv; argv++) + { + FILE *fp; + unsigned char buf[1024]; + size_t size; + int i; + unsigned char *h; + if (!strcmp (*argv, "-")) + fp = stdin; + else + fp = fopen (*argv, "r"); + + if (fp == NULL) + { + perror ("fopen"); + return 1; + } + + while (!feof (fp)) + { + size = fread (buf, 1, sizeof(buf), fp); + gcry_md_write (hd, buf, size); + } + + h = gcry_md_read(hd, 0); + + for (i = 0; i < gcry_md_get_algo_dlen (algo); i++) + printf("%02hhx", h[i]); + printf(" %s\n", *argv); + + gcry_md_reset(hd); + } + + gcry_md_close(hd); + return 0; +} -- 1.8.5.1 From cvs at cvs.gnupg.org Mon Dec 16 17:54:13 2013 From: cvs at cvs.gnupg.org (by Werner Koch) Date: Mon, 16 Dec 2013 17:54:13 +0100 Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.5.0-454-g93919d6 Message-ID: This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "The GNU crypto library". 
The branch, master has been updated via 93919d63c887bdef31f94aeccac94f108fe129d3 (commit) via 0ea9731e1c93a962f6266004ab0e7418c19d6277 (commit) via 9a912f8c4f366c53f1cdb94513b67b937e87178b (commit) from 0d3bd23d7f730b9bbc81fc8da8d99f4853c36020 (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. - Log ----------------------------------------------------------------- commit 93919d63c887bdef31f94aeccac94f108fe129d3 Author: Werner Koch Date: Mon Dec 16 17:49:56 2013 +0100 Post release updates. -- diff --git a/NEWS b/NEWS index 48f8a21..8f43923 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,7 @@ +Noteworthy changes in version 1.6.1 (unreleased) +------------------------------------------------ + + Noteworthy changes in version 1.6.0 (2013-12-16) ------------------------------------------------ diff --git a/configure.ac b/configure.ac index 66c3120..c99765c 100644 --- a/configure.ac +++ b/configure.ac @@ -30,7 +30,7 @@ min_automake_version="1.10" # for the LT versions. m4_define(mym4_version_major, [1]) m4_define(mym4_version_minor, [6]) -m4_define(mym4_version_micro, [0]) +m4_define(mym4_version_micro, [1]) # Below is m4 magic to extract and compute the revision number, the # decimalized short revision number, a beta version string, and a flag commit 0ea9731e1c93a962f6266004ab0e7418c19d6277 Author: Werner Koch Date: Mon Dec 16 17:38:55 2013 +0100 Release 1.6.0. diff --git a/NEWS b/NEWS index 978047f..48f8a21 100644 --- a/NEWS +++ b/NEWS @@ -1,4 +1,4 @@ -Noteworthy changes in version 1.6.0 (unreleased) +Noteworthy changes in version 1.6.0 (2013-12-16) ------------------------------------------------ * Removed the long deprecated gcry_ac interface. Thus Libgcrypt is @@ -12,6 +12,9 @@ Noteworthy changes in version 1.6.0 (unreleased) * Removed deprecated control codes. + * Improved performance of most cipher algorithms as well as for the + SHA family of hash functions. 
+ * Added support for the IDEA cipher algorithm. * Added support for the Salsa20 and reduced Salsa20/12 stream ciphers. @@ -48,6 +51,8 @@ Noteworthy changes in version 1.6.0 (unreleased) * Added a feature to globally disable selected hardware features. + * Added debug helper functions. + * Interface changes relative to the 1.5.0 release: ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ gcry_ac_* REMOVED. diff --git a/README b/README index 558e008..43f9094 100644 --- a/README +++ b/README @@ -3,6 +3,8 @@ Version 1.6 Copyright (C) 1989,1991-2012 Free Software Foundation, Inc. + Copyright (C) 2012-2013 g10 Code GmbH + Copyright (C) 2013 Jussi Kivilinna Libgcrypt is free software. See the file AUTHORS for full copying notices, and LICENSES for notices about contributions that require @@ -33,7 +35,7 @@ You should get the latest versions of course. After building and installing the libgpg-error package, you may - continue with Libgcrypt installation As with allmost all GNU + continue with Libgcrypt installation as with allmost all GNU packages, you just have to do ./configure @@ -237,7 +239,8 @@ actual terms. The helper programs (e.g. gcryptrnd and getrandom) as well as the documentation are distributed under the terms of the GNU General Public License (GPL); see the file COPYING for the - actual terms. + actual terms. The file LICENSES has notices about contributions + that require these additional notices are distributed. This library used to be available under the GPL - this was changed with version 1.1.7 with the rationale that there are now many free @@ -263,6 +266,14 @@ Commercial grade support for Libgcrypt is available; please see http://www.gnupg.org/service.html . + Commercial grade support for Libgcrypt is available; for a listing + of offers see http://www.gnupg.org/service.html . The driving + force behind the development of Libgcrypt is the company of its + principal author, Werner Koch. 
Maintenance and improvement of + Libgcrypt takes up a lot resources. To allow him to continue his + work, he asks to either purchase a support contract, engage them + for custom enhancements, or to donate money. See http://g10code.com . + This file is Free Software; as a special exception the authors gives unlimited permission to copy and/or distribute it, with or without diff --git a/compat/compat.c b/compat/compat.c index d259130..5678067 100644 --- a/compat/compat.c +++ b/compat/compat.c @@ -25,12 +25,14 @@ const char * _gcry_compat_identification (void) { + /* For complete list of copyright holders see the file AUTHORS in + the source distribution. */ static const char blurb[] = "\n\n" "This is Libgcrypt " PACKAGE_VERSION " - The GNU Crypto Library\n" - "Copyright 2000, 2002, 2003, 2004, 2007, 2008, 2009,\n" - " 2010, 2011, 2012 Free Software Foundation, Inc.\n" - "Copyright 2012, 2013 g10 Code GmbH\n" + "Copyright (C) 2000-2012 Free Software Foundation, Inc.\n" + "Copyright (C) 2012-2013 g10 Code GmbH\n" + "Copyright (C) 2013 Jussi Kivilinna\n" "\n" "(" BUILD_REVISION " " BUILD_TIMESTAMP ")\n" "\n\n"; diff --git a/configure.ac b/configure.ac index 571e156..66c3120 100644 --- a/configure.ac +++ b/configure.ac @@ -54,15 +54,15 @@ AC_INIT([libgcrypt],[mym4_full_version],[http://bugs.gnupg.org]) # (Interfaces removed: CURRENT++, AGE=0, REVISION=0) # (Interfaces added: CURRENT++, AGE++, REVISION=0) # (No interfaces changed: REVISION++) -# CAUTION: Due to the ABI change in 1.6 the LT version numbers below have -# already been set for the next release. Thus don't update them for -# the 1.6.0 release. LIBGCRYPT_LT_CURRENT=20 LIBGCRYPT_LT_AGE=0 LIBGCRYPT_LT_REVISION=0 # If the API is changed in an incompatible way: increment the next counter. +# +# 1.6: ABI and API change but the change is to most users irrelevant +# and thus the API version number has not been incremented. 
LIBGCRYPT_CONFIG_API_VERSION=1 # If you change the required gpg-error version, please remove commit 9a912f8c4f366c53f1cdb94513b67b937e87178b Author: Werner Koch Date: Mon Dec 16 16:54:53 2013 +0100 doc: Change yat2m to allow arbitrary condition names. * doc/yat2m.c (MAX_CONDITION_NESTING): New. (gpgone_defined): Remove. (condition_s, condition_stack, condition_stack_idx): New. (cond_is_active, cond_in_verbatim): New. (add_predefined_macro, set_macro, macro_set_p): New. (evaluate_conditions, push_condition, pop_condition): New. (parse_file): Rewrite to use the condition stack. (top_parse_file): Set prefined macros. (main): Change -D to define arbitrary macros. -- This change allows the use of other conditionals than "gpgone" and thus make "gpgtwoone" et al. actually work. It does now also track conditionals over included files. Signed-off-by: Werner Koch From GnuPG master commit a15c35f37ed2b58805adc213029998aa3e52f038 diff --git a/doc/yat2m.c b/doc/yat2m.c index 9d7bdec..2ac4390 100644 --- a/doc/yat2m.c +++ b/doc/yat2m.c @@ -1,6 +1,6 @@ /* yat2m.c - Yet Another Texi 2 Man converter - * Copyright (C) 2005 g10 Code GmbH - * Copyright (C) 2006, 2008 Free Software Foundation, Inc. + * Copyright (C) 2005, 2013 g10 Code GmbH + * Copyright (C) 2006, 2008, 2011 Free Software Foundation, Inc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -17,7 +17,7 @@ */ /* - This is a simple textinfo to man page converter. It needs some + This is a simple texinfo to man page converter. It needs some special markup in th e texinfo and tries best to get a create man page. It has been designed for the GnuPG man pages and thus only a few texinfo commands are supported. @@ -72,7 +72,21 @@ extracted from one file, either using the --store or the --select option. 
- + If you want to indent tables in the source use this style: + + @table foo + @item + @item + @table + @item + @end + @end + + Don't change the indentation within a table and keep the same + number of white space at the start of the line. yat2m simply + detects the number of white spaces in front of an @item and remove + this number of spaces from all following lines until a new @item + is found or there are less spaces than for the last @item. */ #include @@ -93,6 +107,9 @@ character. */ #define LINESIZE 1024 +/* Number of allowed condition nestings. */ +#define MAX_CONDITION_NESTING 10 + /* Option flags. */ static int verbose; static int quiet; @@ -103,10 +120,6 @@ static const char *opt_select; static const char *opt_include; static int opt_store; -/* The only define we understand is -D gpgone. Thus we need a simple - boolean tro track it. */ -static int gpgone_defined; - /* Flag to keep track whether any error occurred. */ static int any_error; @@ -115,7 +128,7 @@ static int any_error; struct macro_s { struct macro_s *next; - char *value; /* Malloced value. */ + char *value; /* Malloced value. */ char name[1]; }; typedef struct macro_s *macro_t; @@ -123,6 +136,24 @@ typedef struct macro_s *macro_t; /* List of all defined macros. */ static macro_t macrolist; +/* List of global macro names. The value part is not used. */ +static macro_t predefinedmacrolist; + +/* Object to keep track of @isset and @ifclear. */ +struct condition_s +{ + int manverb; /* "manverb" needs special treatment. */ + int isset; /* This is an @isset condition. */ + char name[1]; /* Name of the condition macro. */ +}; +typedef struct condition_s *condition_t; + +/* The stack used to evaluate conditions. And the current states. */ +static condition_t condition_stack[MAX_CONDITION_NESTING]; +static int condition_stack_idx; +static int cond_is_active; /* State of ifset/ifclear */ +static int cond_in_verbatim; /* State of "manverb". */ + /* Object to store one line of content. 
*/ struct line_buffer_s @@ -299,7 +330,158 @@ isodatestring (void) } +/* Add NAME to the list of predefined macros which are global for all + files. */ +static void +add_predefined_macro (const char *name) +{ + macro_t m; + + for (m=predefinedmacrolist; m; m = m->next) + if (!strcmp (m->name, name)) + break; + if (!m) + { + m = xcalloc (1, sizeof *m + strlen (name)); + strcpy (m->name, name); + m->next = predefinedmacrolist; + predefinedmacrolist = m; + } +} + + +/* Create or update a macro with name MACRONAME and set its values TO + MACROVALUE. Note that ownership of the macro value is transferred + to this function. */ +static void +set_macro (const char *macroname, char *macrovalue) +{ + macro_t m; + + for (m=macrolist; m; m = m->next) + if (!strcmp (m->name, macroname)) + break; + if (m) + free (m->value); + else + { + m = xcalloc (1, sizeof *m + strlen (macroname)); + strcpy (m->name, macroname); + m->next = macrolist; + macrolist = m; + } + m->value = macrovalue; + macrovalue = NULL; +} + + +/* Return true if the macro NAME is set, i.e. not the empty string and + not evaluating to 0. */ +static int +macro_set_p (const char *name) +{ + macro_t m; + + for (m = macrolist; m ; m = m->next) + if (!strcmp (m->name, name)) + break; + if (!m || !m->value || !*m->value) + return 0; + if ((*m->value & 0x80) || !isdigit (*m->value)) + return 1; /* Not a digit but some other string. */ + return !!atoi (m->value); +} + + +/* Evaluate the current conditions. */ +static void +evaluate_conditions (const char *fname, int lnr) +{ + int i; + + /* for (i=0; i < condition_stack_idx; i++) */ + /* inf ("%s:%d: stack[%d] %s %s %c", */ + /* fname, lnr, i, condition_stack[i]->isset? "set":"clr", */ + /* condition_stack[i]->name, */ + /* (macro_set_p (condition_stack[i]->name) */ + /* ^ !condition_stack[i]->isset)? 
't':'f'); */ + + cond_is_active = 1; + cond_in_verbatim = 0; + if (condition_stack_idx) + { + for (i=0; i < condition_stack_idx; i++) + { + if (condition_stack[i]->manverb) + cond_in_verbatim = (macro_set_p (condition_stack[i]->name) + ^ !condition_stack[i]->isset); + else if (!(macro_set_p (condition_stack[i]->name) + ^ !condition_stack[i]->isset)) + { + cond_is_active = 0; + break; + } + } + } + + /* inf ("%s:%d: active=%d verbatim=%d", */ + /* fname, lnr, cond_is_active, cond_in_verbatim); */ +} + +/* Push a condition with condition macro NAME onto the stack. If + ISSET is true, a @isset condition is pushed. */ +static void +push_condition (const char *name, int isset, const char *fname, int lnr) +{ + condition_t cond; + int manverb = 0; + + if (condition_stack_idx >= MAX_CONDITION_NESTING) + { + err ("%s:%d: condition nested too deep", fname, lnr); + return; + } + + if (!strcmp (name, "manverb")) + { + if (!isset) + { + err ("%s:%d: using \"@ifclear manverb\" is not allowed", fname, lnr); + return; + } + manverb = 1; + } + + cond = xcalloc (1, sizeof *cond + strlen (name)); + cond->manverb = manverb; + cond->isset = isset; + strcpy (cond->name, name); + + condition_stack[condition_stack_idx++] = cond; + evaluate_conditions (fname, lnr); +} + + +/* Remove the last condition from the stack. ISSET is used for error + reporting. */ +static void +pop_condition (int isset, const char *fname, int lnr) +{ + if (!condition_stack_idx) + { + err ("%s:%d: unbalanced \"@end %s\"", + fname, lnr, isset?"isset":"isclear"); + return; + } + condition_stack_idx--; + free (condition_stack[condition_stack_idx]); + condition_stack[condition_stack_idx] = NULL; + evaluate_conditions (fname, lnr); +} + + + /* Return a section buffer for the section NAME. Allocate a new buffer if this is a new section. Keep track of the sections in THEPAGE. This function may reallocate the section array in THEPAGE. 
*/ @@ -400,7 +582,7 @@ static void start_page (char *name) { if (verbose) - inf ("starting page `%s'", name); + inf ("starting page '%s'", name); assert (!thepage.name); thepage.name = xstrdup (name); thepage.n_sections = 0; @@ -420,7 +602,7 @@ write_th (FILE *fp) p = strrchr (name, '.'); if (!p || !p[1]) { - err ("no section name in man page `%s'", thepage.name); + err ("no section name in man page '%s'", thepage.name); free (name); return -1; } @@ -577,7 +759,7 @@ proc_texi_cmd (FILE *fp, const char *command, const char *rest, size_t len, ignore_args = 1; /* Parameterized macros are not yet supported. */ } else - inf ("texinfo command `%s' not supported (%.*s)", command, + inf ("texinfo command '%s' not supported (%.*s)", command, ((s = memchr (rest, '\n', len)), (s? (s-rest) : len)), rest); } @@ -591,7 +773,7 @@ proc_texi_cmd (FILE *fp, const char *command, const char *rest, size_t len, i--; if (i) { - err ("closing brace for command `%s' not found", command); + err ("closing brace for command '%s' not found", command); return len; } if (n > 2 && !ignore_args) @@ -766,13 +948,13 @@ finish_page (void) return; /* No page active. */ if (verbose) - inf ("finishing page `%s'", thepage.name); + inf ("finishing page '%s'", thepage.name); if (opt_select) { if (!strcmp (opt_select, thepage.name)) { - inf ("selected `%s'", thepage.name ); + inf ("selected '%s'", thepage.name ); fp = stdout; } else @@ -784,10 +966,10 @@ finish_page (void) } else if (opt_store) { - inf ("writing `%s'", thepage.name ); + inf ("writing '%s'", thepage.name ); fp = fopen ( thepage.name, "w" ); if (!fp) - die ("failed to create `%s': %s\n", thepage.name, strerror (errno)); + die ("failed to create '%s': %s\n", thepage.name, strerror (errno)); } else fp = stdout; @@ -848,14 +1030,9 @@ parse_file (const char *fname, FILE *fp, char **section_name, int in_pause) int lnr = 0; /* Fixme: The following state variables don't carry over to include files. 
*/ - int in_verbatim = 0; int skip_to_end = 0; /* Used to skip over menu entries. */ int skip_sect_line = 0; /* Skip after @mansect. */ - int ifset_nesting = 0; /* How often a ifset has been seen. */ - int ifclear_nesting = 0; /* How often a ifclear has been seen. */ - int in_gpgone = 0; /* Keep track of "@ifset gpgone" parts. */ - int not_in_gpgone = 0; /* Keep track of "@ifclear gpgone" parts. */ - int not_in_man = 0; /* Keep track of "@ifclear isman" parts. */ + int item_indent = 0; /* How far is the current @item indented. */ /* Helper to define a macro. */ char *macroname = NULL; @@ -868,7 +1045,7 @@ parse_file (const char *fname, FILE *fp, char **section_name, int in_pause) { size_t n = strlen (line); int got_line = 0; - char *p; + char *p, *pend; lnr++; if (!n || line[n-1] != '\n') @@ -879,6 +1056,24 @@ parse_file (const char *fname, FILE *fp, char **section_name, int in_pause) } line[--n] = 0; + /* Kludge to allow indentation of tables. */ + for (p=line; *p == ' ' || *p == '\t'; p++) + ; + if (*p) + { + if (*p == '@' && !strncmp (p+1, "item", 4)) + item_indent = p - line; /* Set a new indent level. */ + else if (p - line < item_indent) + item_indent = 0; /* Switch off indention. */ + + if (item_indent) + { + memmove (line, line+item_indent, n - item_indent + 1); + n -= item_indent; + } + } + + if (*line == '@') { for (p=line+1, n=1; *p && *p != ' ' && *p != '\t'; p++) @@ -897,26 +1092,12 @@ parse_file (const char *fname, FILE *fp, char **section_name, int in_pause) && !strncmp (p, "macro", 5) && (p[5]==' '||p[5]=='\t'||!p[5])) { - macro_t m; - if (macrovalueused) macrovalue[--macrovalueused] = 0; /* Kill the last LF. */ macrovalue[macrovalueused] = 0; /* Terminate macro. 
*/ macrovalue = xrealloc (macrovalue, macrovalueused+1); - for (m= macrolist; m; m = m->next) - if (!strcmp (m->name, macroname)) - break; - if (m) - free (m->value); - else - { - m = xcalloc (1, sizeof *m + strlen (macroname)); - strcpy (m->name, macroname); - m->next = macrolist; - macrolist = m; - } - m->value = macrovalue; + set_macro (macroname, macrovalue); macrovalue = NULL; free (macroname); macroname = NULL; @@ -964,23 +1145,33 @@ parse_file (const char *fname, FILE *fp, char **section_name, int in_pause) if (n == 6 && !memcmp (line, "@ifset", 6) && (line[6]==' '||line[6]=='\t')) { - ifset_nesting++; - - if (!strncmp (p, "manverb", 7) && (p[7]==' '||p[7]=='\t'||!p[7])) + for (p=line+7; *p == ' ' || *p == '\t'; p++) + ; + if (!*p) { - if (in_verbatim) - err ("%s:%d: nested \"@ifset manverb\"", fname, lnr); - else - in_verbatim = ifset_nesting; + err ("%s:%d: name missing after \"@ifset\"", fname, lnr); + continue; } - else if (!strncmp (p, "gpgone", 6) - && (p[6]==' '||p[6]=='\t'||!p[6])) + for (pend=p; *pend && *pend != ' ' && *pend != '\t'; pend++) + ; + *pend = 0; /* Ignore rest of the line. */ + push_condition (p, 1, fname, lnr); + continue; + } + else if (n == 8 && !memcmp (line, "@ifclear", 8) + && (line[8]==' '||line[8]=='\t')) + { + for (p=line+9; *p == ' ' || *p == '\t'; p++) + ; + if (!*p) { - if (in_gpgone) - err ("%s:%d: nested \"@ifset gpgone\"", fname, lnr); - else - in_gpgone = ifset_nesting; + err ("%s:%d: name missing after \"@ifsclear\"", fname, lnr); + continue; } + for (pend=p; *pend && *pend != ' ' && *pend != '\t'; pend++) + ; + *pend = 0; /* Ignore rest of the line. 
*/ + push_condition (p, 0, fname, lnr); continue; } else if (n == 4 && !memcmp (line, "@end", 4) @@ -988,40 +1179,7 @@ parse_file (const char *fname, FILE *fp, char **section_name, int in_pause) && !strncmp (p, "ifset", 5) && (p[5]==' '||p[5]=='\t'||!p[5])) { - if (in_verbatim && ifset_nesting == in_verbatim) - in_verbatim = 0; - if (in_gpgone && ifset_nesting == in_gpgone) - in_gpgone = 0; - - if (ifset_nesting) - ifset_nesting--; - else - err ("%s:%d: unbalanced \"@end ifset\"", fname, lnr); - continue; - } - else if (n == 8 && !memcmp (line, "@ifclear", 8) - && (line[8]==' '||line[8]=='\t')) - { - ifclear_nesting++; - - if (!strncmp (p, "gpgone", 6) - && (p[6]==' '||p[6]=='\t'||!p[6])) - { - if (not_in_gpgone) - err ("%s:%d: nested \"@ifclear gpgone\"", fname, lnr); - else - not_in_gpgone = ifclear_nesting; - } - - else if (!strncmp (p, "isman", 5) - && (p[5]==' '||p[5]=='\t'||!p[5])) - { - if (not_in_man) - err ("%s:%d: nested \"@ifclear isman\"", fname, lnr); - else - not_in_man = ifclear_nesting; - } - + pop_condition (1, fname, lnr); continue; } else if (n == 4 && !memcmp (line, "@end", 4) @@ -1029,23 +1187,13 @@ parse_file (const char *fname, FILE *fp, char **section_name, int in_pause) && !strncmp (p, "ifclear", 7) && (p[7]==' '||p[7]=='\t'||!p[7])) { - if (not_in_gpgone && ifclear_nesting == not_in_gpgone) - not_in_gpgone = 0; - if (not_in_man && ifclear_nesting == not_in_man) - not_in_man = 0; - - if (ifclear_nesting) - ifclear_nesting--; - else - err ("%s:%d: unbalanced \"@end ifclear\"", fname, lnr); + pop_condition (0, fname, lnr); continue; } } /* Take action on ifset/ifclear. */ - if ( (in_gpgone && !gpgone_defined) - || (not_in_gpgone && gpgone_defined) - || not_in_man) + if (!cond_is_active) continue; /* Process commands. 
*/ @@ -1057,7 +1205,7 @@ parse_file (const char *fname, FILE *fp, char **section_name, int in_pause) { skip_to_end = 0; } - else if (in_verbatim) + else if (cond_in_verbatim) { got_line = 1; } @@ -1129,7 +1277,7 @@ parse_file (const char *fname, FILE *fp, char **section_name, int in_pause) } if (!incfp) - err ("can't open include file `%s':%s", + err ("can't open include file '%s':%s", incname, strerror (errno)); else { @@ -1149,7 +1297,7 @@ parse_file (const char *fname, FILE *fp, char **section_name, int in_pause) else if (!skip_to_end) got_line = 1; - if (got_line && in_verbatim) + if (got_line && cond_in_verbatim) add_content (*section_name, line, 1); else if (got_line && thepage.name && *section_name && !in_pause) add_content (*section_name, line, 0); @@ -1168,13 +1316,19 @@ top_parse_file (const char *fname, FILE *fp) { char *section_name = NULL; /* Name of the current section or NULL if not in a section. */ + macro_t m; + while (macrolist) { - macro_t m = macrolist->next; - free (m->value); - free (m); - macrolist = m; + macro_t next = macrolist->next; + free (macrolist->value); + free (macrolist); + macrolist = next; } + for (m=predefinedmacrolist; m; m = m->next) + set_macro (m->name, xstrdup ("1")); + cond_is_active = 1; + cond_in_verbatim = 0; parse_file (fname, fp, §ion_name, 0); free (section_name); @@ -1190,6 +1344,12 @@ main (int argc, char **argv) opt_source = "GNU"; opt_release = ""; + /* Define default macros. The trick is that these macros are not + defined when using the actual texinfo renderer. */ + add_predefined_macro ("isman"); + add_predefined_macro ("manverb"); + + /* Option parsing. 
*/ if (argc) { argc--; argv++; @@ -1294,8 +1454,7 @@ main (int argc, char **argv) argc--; argv++; if (argc) { - if (!strcmp (*argv, "gpgone")) - gpgone_defined = 1; + add_predefined_macro (*argv); argc--; argv++; } } ----------------------------------------------------------------------- Summary of changes: NEWS | 11 +- README | 15 ++- compat/compat.c | 8 +- configure.ac | 8 +- doc/yat2m.c | 373 +++++++++++++++++++++++++++++++++++++++---------------- 5 files changed, 298 insertions(+), 117 deletions(-) hooks/post-receive -- The GNU crypto library http://git.gnupg.org _______________________________________________ Gnupg-commits mailing list Gnupg-commits at gnupg.org http://lists.gnupg.org/mailman/listinfo/gnupg-commits From dbaryshkov at gmail.com Mon Dec 16 17:34:46 2013 From: dbaryshkov at gmail.com (Dmitry Eremin-Solenikov) Date: Mon, 16 Dec 2013 20:34:46 +0400 Subject: [PATCH] Truncate hash values for ECDSA signature scheme Message-ID: <1387211686-19661-1-git-send-email-dbaryshkov@gmail.com> * cipher/ecc-ecdsa.c (_gcry_ecc_ecdsa_sign, _gcry_ecc_ecdsa_verify): as required by ECDSA scheme, truncate hash values to bitlength of used curve. * tests/pubkey.c (check_ecc_sample_key): add a testcase for hash truncation. 
Signed-off-by: Dmitry Eremin-Solenikov --- cipher/ecc-ecdsa.c | 22 ++++++++++++++++++++-- tests/pubkey.c | 45 ++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 64 insertions(+), 3 deletions(-) diff --git a/cipher/ecc-ecdsa.c b/cipher/ecc-ecdsa.c index b4bbe2c..0cdd12b 100644 --- a/cipher/ecc-ecdsa.c +++ b/cipher/ecc-ecdsa.c @@ -67,7 +67,16 @@ _gcry_ecc_ecdsa_sign (gcry_mpi_t input, ECC_secret_key *skey, mpi_rshift (hash, hash, abits - qbits); } else - hash = input; + { + abits = mpi_get_nbits (input); + if (abits > qbits) + { + hash = _gcry_mpi_copy (input); + mpi_rshift (hash, hash, mpi_get_nbits (input) - qbits); + } + else + hash = input; + } k = NULL; @@ -164,6 +173,7 @@ _gcry_ecc_ecdsa_verify (gcry_mpi_t input, ECC_public_key *pkey, gcry_mpi_t h, h1, h2, x; mpi_point_struct Q, Q1, Q2; mpi_ec_t ctx; + unsigned int abits, nbits; if( !(mpi_cmp_ui (r, 0) > 0 && mpi_cmp (r, pkey->E.n) < 0) ) return GPG_ERR_BAD_SIGNATURE; /* Assertion 0 < r < n failed. */ @@ -183,8 +193,16 @@ _gcry_ecc_ecdsa_verify (gcry_mpi_t input, ECC_public_key *pkey, /* h = s^(-1) (mod n) */ mpi_invm (h, s, pkey->E.n); + abits = mpi_get_nbits (input); + nbits = mpi_get_nbits (pkey->E.n); /* h1 = hash * s^(-1) (mod n) */ - mpi_mulm (h1, input, h, pkey->E.n); + if (abits > nbits) + { + mpi_rshift (h1, input, abits - nbits); + mpi_mulm (h1, h1, h, pkey->E.n); + } + else + mpi_mulm (h1, input, h, pkey->E.n); /* Q1 = [ hash * s^(-1) ]G */ _gcry_mpi_ec_mul_point (&Q1, h1, &pkey->E.G, ctx); /* h2 = r * s^(-1) (mod n) */ diff --git a/tests/pubkey.c b/tests/pubkey.c index 4e12dfd..3150669 100644 --- a/tests/pubkey.c +++ b/tests/pubkey.c @@ -980,9 +980,23 @@ check_ecc_sample_key (void) "(data (flags raw)\n" " (value #00112233445566778899AABBCCDDEEFF" /* */ "000102030405060708090A0B0C0D0E0F#))"; + static const char hash2_string[] = + "(data (flags raw)\n" + " (value #00112233445566778899AABBCCDDEEFF" + /* */ "000102030405060708090A0B0C0D0E0F" + /* */ "000102030405060708090A0B0C0D0E0F" + /* */ 
"00112233445566778899AABBCCDDEEFF#))"; + /* hash2, but longer than curve length, so it will be truncated */ + static const char hash3_string[] = + "(data (flags raw)\n" + " (value #00112233445566778899AABBCCDDEEFF" + /* */ "000102030405060708090A0B0C0D0E0F" + /* */ "000102030405060708090A0B0C0D0E0F" + /* */ "00112233445566778899AABBCCDDEEFF" + /* */ "000102030405060708090A0B0C0D0E0F#))"; gpg_error_t err; - gcry_sexp_t key, hash, sig; + gcry_sexp_t key, hash, hash2, hash3, sig, sig2; if (verbose) fprintf (stderr, "Checking sample ECC key.\n"); @@ -990,6 +1004,12 @@ check_ecc_sample_key (void) if ((err = gcry_sexp_new (&hash, hash_string, 0, 1))) die ("line %d: %s", __LINE__, gpg_strerror (err)); + if ((err = gcry_sexp_new (&hash2, hash2_string, 0, 1))) + die ("line %d: %s", __LINE__, gpg_strerror (err)); + + if ((err = gcry_sexp_new (&hash3, hash3_string, 0, 1))) + die ("line %d: %s", __LINE__, gpg_strerror (err)); + if ((err = gcry_sexp_new (&key, ecc_private_key, 0, 1))) die ("line %d: %s", __LINE__, gpg_strerror (err)); @@ -1003,6 +1023,27 @@ check_ecc_sample_key (void) if ((err = gcry_pk_verify (sig, hash, key))) die ("gcry_pk_verify failed: %s", gpg_strerror (err)); + /* Verify hash truncation */ + gcry_sexp_release (key); + if ((err = gcry_sexp_new (&key, ecc_private_key, 0, 1))) + die ("line %d: %s", __LINE__, gpg_strerror (err)); + + if ((err = gcry_pk_sign (&sig2, hash2, key))) + die ("gcry_pk_sign failed: %s", gpg_strerror (err)); + + if ((err = gcry_pk_sign (&sig, hash3, key))) + die ("gcry_pk_sign failed: %s", gpg_strerror (err)); + + gcry_sexp_release (key); + if ((err = gcry_sexp_new (&key, ecc_public_key, 0, 1))) + die ("line %d: %s", __LINE__, gpg_strerror (err)); + + if ((err = gcry_pk_verify (sig, hash2, key))) + die ("gcry_pk_verify failed 2: %s", gpg_strerror (err)); + + if ((err = gcry_pk_verify (sig2, hash3, key))) + die ("gcry_pk_verify failed 3: %s", gpg_strerror (err)); + /* Now try signing without the Q parameter. 
*/ gcry_sexp_release (key); @@ -1021,8 +1062,10 @@ check_ecc_sample_key (void) die ("gcry_pk_verify signed without Q failed: %s", gpg_strerror (err)); gcry_sexp_release (sig); + gcry_sexp_release (sig2); gcry_sexp_release (key); gcry_sexp_release (hash); + gcry_sexp_release (hash2); } -- 1.8.5.1 From cvs at cvs.gnupg.org Mon Dec 16 18:00:26 2013 From: cvs at cvs.gnupg.org (by Werner Koch) Date: Mon, 16 Dec 2013 18:00:26 +0100 Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.6.0-2-g210b723 Message-ID: This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "The GNU crypto library". The branch, master has been updated via 210b7237706f6ad5cbc1a3362707f63db2c8a780 (commit) from 93919d63c887bdef31f94aeccac94f108fe129d3 (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. - Log ----------------------------------------------------------------- commit 210b7237706f6ad5cbc1a3362707f63db2c8a780 Author: Werner Koch Date: Mon Dec 16 17:58:42 2013 +0100 Open new development branch. -- diff --git a/NEWS b/NEWS index 8f43923..dbe30ee 100644 --- a/NEWS +++ b/NEWS @@ -1,4 +1,4 @@ -Noteworthy changes in version 1.6.1 (unreleased) +Noteworthy changes in version 1.7.0 (unreleased) ------------------------------------------------ diff --git a/README b/README index 43f9094..938c6c6 100644 --- a/README +++ b/README @@ -1,6 +1,8 @@ Libgcrypt - The GNU Crypto Library ------------------------------------ - Version 1.6 + Version 1.7 + + ==== THIS IS A DEVELOPMENT VERSION - NOT FOR REAL USE ==== Copyright (C) 1989,1991-2012 Free Software Foundation, Inc. 
Copyright (C) 2012-2013 g10 Code GmbH diff --git a/configure.ac b/configure.ac index c99765c..ed14d89 100644 --- a/configure.ac +++ b/configure.ac @@ -29,8 +29,8 @@ min_automake_version="1.10" # commit and push so that the git magic is able to work. See below # for the LT versions. m4_define(mym4_version_major, [1]) -m4_define(mym4_version_minor, [6]) -m4_define(mym4_version_micro, [1]) +m4_define(mym4_version_minor, [7]) +m4_define(mym4_version_micro, [0]) # Below is m4 magic to extract and compute the revision number, the # decimalized short revision number, a beta version string, and a flag ----------------------------------------------------------------------- Summary of changes: NEWS | 2 +- README | 4 +++- configure.ac | 4 ++-- 3 files changed, 6 insertions(+), 4 deletions(-) hooks/post-receive -- The GNU crypto library http://git.gnupg.org _______________________________________________ Gnupg-commits mailing list Gnupg-commits at gnupg.org http://lists.gnupg.org/mailman/listinfo/gnupg-commits From wk at gnupg.org Mon Dec 16 18:03:10 2013 From: wk at gnupg.org (Werner Koch) Date: Mon, 16 Dec 2013 18:03:10 +0100 Subject: [PATCH] Truncate hash values for ECDSA signature scheme In-Reply-To: <1387211686-19661-1-git-send-email-dbaryshkov@gmail.com> (Dmitry Eremin-Solenikov's message of "Mon, 16 Dec 2013 20:34:46 +0400") References: <1387211686-19661-1-git-send-email-dbaryshkov@gmail.com> Message-ID: <87y53kg241.fsf@vigenere.g10code.de> On Mon, 16 Dec 2013 17:34, dbaryshkov at gmail.com said: > * cipher/ecc-ecdsa.c (_gcry_ecc_ecdsa_sign, _gcry_ecc_ecdsa_verify): > as required by ECDSA scheme, truncate hash values to bitlength of > used curve. Please explain and name the specs. In particular I wonder about truncating the less significant bits. Salam-Shalom, Werner -- Die Gedanken sind frei. Ausnahmen regelt ein Bundesgesetz. 
From wk at gnupg.org Mon Dec 16 18:05:51 2013 From: wk at gnupg.org (Werner Koch) Date: Mon, 16 Dec 2013 18:05:51 +0100 Subject: [PATCH] Add a simple PKCS#1 padding mode In-Reply-To: <1387212044-19849-1-git-send-email-dbaryshkov@gmail.com> (Dmitry Eremin-Solenikov's message of "Mon, 16 Dec 2013 20:40:44 +0400") References: <1387212044-19849-1-git-send-email-dbaryshkov@gmail.com> Message-ID: <87txe8g1zk.fsf@vigenere.g10code.de> On Mon, 16 Dec 2013 17:40, dbaryshkov at gmail.com said: > Allow user to specify (flags pkcs1) to enable pkcs1 padding of raw value > (no hash algorithm is specified). It is up to the user to verify that > passed value is properly formatted and includes DER-encoded ASN OID of > the hash function. Please explain this too. Why shall we add a crippled pkcs#1 mode? Is this for the SHA1+MD5 hash of TLS? If so, we should add a special hash-algo string. Shalom-Salam, Werner -- Die Gedanken sind frei. Ausnahmen regelt ein Bundesgesetz. From wk at gnupg.org Mon Dec 16 18:10:10 2013 From: wk at gnupg.org (Werner Koch) Date: Mon, 16 Dec 2013 18:10:10 +0100 Subject: [PATCH] Add an utility to calculate hashes over a set of files In-Reply-To: <1387212084-19904-1-git-send-email-dbaryshkov@gmail.com> (Dmitry Eremin-Solenikov's message of "Mon, 16 Dec 2013 20:41:24 +0400") References: <1387212084-19904-1-git-send-email-dbaryshkov@gmail.com> Message-ID: <87ppowg1sd.fsf@vigenere.g10code.de> On Mon, 16 Dec 2013 17:41, dbaryshkov at gmail.com said: > An utility like rhash that has the ability to calculate different hashes > over a set of files it usefull. Add gchash utility to calculate hashes > supported by libgcrypt. I can't see why Libgcrypt needs to provide such a tool. It is a library and applications should provide user interfaces. For example gpg has a feature to create digests for all Libgcrypt supported algorithms. Salam-Shalom, Werner ps. Sorry, for rejecting/commenting the 3 patches at a time. -- Die Gedanken sind frei. 
Ausnahmen regelt ein Bundesgesetz. From dbaryshkov at gmail.com Mon Dec 16 18:30:45 2013 From: dbaryshkov at gmail.com (Dmitry Eremin-Solenikov) Date: Mon, 16 Dec 2013 21:30:45 +0400 Subject: [PATCH] Add an utility to calculate hashes over a set of files In-Reply-To: <87ppowg1sd.fsf@vigenere.g10code.de> References: <1387212084-19904-1-git-send-email-dbaryshkov@gmail.com> <87ppowg1sd.fsf@vigenere.g10code.de> Message-ID: On Mon, Dec 16, 2013 at 9:10 PM, Werner Koch wrote: > On Mon, 16 Dec 2013 17:41, dbaryshkov at gmail.com said: > >> An utility like rhash that has the ability to calculate different hashes >> over a set of files it usefull. Add gchash utility to calculate hashes >> supported by libgcrypt. > > I can't see why Libgcrypt needs to provide such a tool. It is a library > and applications should provide user interafces. For example gpg has a > feature to create digests for all Libgcrypt supported algorithms. This is mostly used for testing/verification, like mpicalc is. I used that to check gost hashes implementation (to compare file caches with hashes generated by other tools). When I had several similar test programs locally, I thought that libgcrypt can benefit from having such standalone tool. > ps. > Sorry, for rejecting/commenting the 3 patches at a time. N/p. I got used to that in the kernel world. 
-- With best wishes Dmitry From dbaryshkov at gmail.com Mon Dec 16 18:32:33 2013 From: dbaryshkov at gmail.com (Dmitry Eremin-Solenikov) Date: Mon, 16 Dec 2013 21:32:33 +0400 Subject: [PATCH] Add an utility to calculate hashes over a set of files In-Reply-To: References: <1387212084-19904-1-git-send-email-dbaryshkov@gmail.com> <87ppowg1sd.fsf@vigenere.g10code.de> Message-ID: On Mon, Dec 16, 2013 at 9:30 PM, Dmitry Eremin-Solenikov wrote: > On Mon, Dec 16, 2013 at 9:10 PM, Werner Koch wrote: >> On Mon, 16 Dec 2013 17:41, dbaryshkov at gmail.com said: >> >>> An utility like rhash that has the ability to calculate different hashes >>> over a set of files it usefull. Add gchash utility to calculate hashes >>> supported by libgcrypt. >> >> I can't see why Libgcrypt needs to provide such a tool. It is a library >> and applications should provide user interafces. For example gpg has a >> feature to create digests for all Libgcrypt supported algorithms. > > This is mostly used for testing/verification, like mpicalc is. I used that > to check gost hashes implementation (to compare file caches with > hashes generated by other tools). When I had several similar > test programs locally, I thought that libgcrypt can benefit from having > such standalone tool. Addition: I did not think about using gpg to generate digests. I wanted to minimize dependencies in my development queue. -- With best wishes Dmitry From wk at gnupg.org Mon Dec 16 18:49:01 2013 From: wk at gnupg.org (Werner Koch) Date: Mon, 16 Dec 2013 18:49:01 +0100 Subject: Libgcrypt 1.6.0 released Message-ID: <87haa8fzzm.fsf@vigenere.g10code.de> Hello! The GNU project is pleased to announce the availability of Libgcrypt version 1.6.0. This is the new stable version of Libgcrypt with the API being mostly compatible to previous versions. Due to the removal of certain long deprecated functions this version introduces an ABI change. Libgcrypt is a general purpose library of cryptographic building blocks. 
It is originally based on code used by GnuPG. It does not provide any implementation of OpenPGP or other protocols. Thorough understanding of applied cryptography is required to use Libgcrypt. The main features of this version are performance improvements [3], better support for elliptic curves, new algorithms and modes, as well as API and internal cleanups. Better performance of public key algorithms, in particular for Curve25519, is planned for forthcoming releases. Note that the 1.5 series will enter end of life state on 2016-12-31. Noteworthy changes between version 1.5.0 and 1.6.0: =================================================== * Removed the long deprecated gcry_ac interface. Thus Libgcrypt is not anymore ABI compatible to previous versions if they used the ac interface. * Removed the module register subsystem. * The deprecated message digest debug macros have been removed. Use gcry_md_debug instead. * Removed deprecated control codes. * Improved performance of most cipher algorithms as well as for the SHA family of hash functions. * Added support for the IDEA cipher algorithm. * Added support for the Salsa20 and reduced Salsa20/12 stream ciphers. * Added limited support for the GOST 28147-89 cipher algorithm. * Added support for the GOST R 34.11-94 and R 34.11-2012 (Stribog) hash algorithms. * Added a random number generator to directly use the system's RNG. Also added an interface to prefer the use of a specified RNG. * Added support for the SCRYPT algorithm. * Mitigated the Yarom/Falkner flush+reload side-channel attack on RSA secret keys. See [CVE-2013-4242]. * Added support for Deterministic DSA as per RFC-6979. * Added support for curve Ed25519. * Added a scatter gather hash convenience function. * Added several MPI and SEXP helper functions. * Added support for negative numbers to gcry_mpi_print, gcry_mpi_aprint and gcry_mpi_scan. * The algorithm ids GCRY_PK_ECDSA and GCRY_PK_ECDH are now deprecated. Use GCRY_PK_ECC if you need an algorithm id. 
* Changed gcry_pk_genkey for "ecc" to only include the curve name and not the parameters. The flag "param" may be used to revert this. * Added a feature to globally disable selected hardware features. * Added debug helper functions. For Interface changes relative to the 1.5.0 release see below [4]. Download ======== Source code is hosted at the GnuPG FTP server and its mirrors as listed at http://www.gnupg.org/download/mirrors.html . On the primary server the source file and its digital signature are: ftp://ftp.gnupg.org/gcrypt/libgcrypt/libgcrypt-1.6.0.tar.bz2 (2441k) ftp://ftp.gnupg.org/gcrypt/libgcrypt/libgcrypt-1.6.0.tar.bz2.sig This file is bzip2 compressed. A gzip compressed version is also available: ftp://ftp.gnupg.org/gcrypt/libgcrypt/libgcrypt-1.6.0.tar.gz (2866k) ftp://ftp.gnupg.org/gcrypt/libgcrypt/libgcrypt-1.6.0.tar.gz.sig Due to the amount of changes we don't provide a patch file against 1.5.x. The SHA-1 checksums are: 43283c0b41c41e3d3bc13c2d8f937dfe2aaa1a77 libgcrypt-1.6.0.tar.bz2 03551121fe5b706532158667699f63b6e2606755 libgcrypt-1.6.0.tar.gz Copying ======= Libgcrypt is distributed under the terms of the GNU Lesser General Public License (LGPLv2.1+). The helper programs as well as the documentation are distributed under the terms of the GNU General Public License (GPLv2+). The file LICENSES has notices about contributions that require that these additional notices are distributed. Support ======= For help on developing with Libgcrypt you should read the included manual and optionally ask on the gcrypt-devel mailing list [1]. A listing with commercial support offers for Libgcrypt and related software is available at the GnuPG web site [2]. The driving force behind the development of Libgcrypt is my company g10 Code. Maintenance and improvement of Libgcrypt and related software takes up most of our resources. 
To allow us to continue our work on free software, we ask to either purchase a support contract, engage us for custom enhancements, or to donate money: http://g10code.com/gnupg-donation.html Thanks ====== Many thanks to all who contributed to Libgcrypt development, be it bug fixes, code, documentation, testing or helping users. Special thanks to Jussi Kivilinna who did most of the performance improvement work. Happy hacking, Werner [1] http://www.gnupg.org/documentation/mailing-lists.html [2] http://www.gnupg.org/service.html [3] http://blog.gnupg.org/20131215-gcrypt-bench.html [4] Interface changes relative to the 1.5.0 release: ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ gcry_ac_* REMOVED. GCRY_AC_* REMOVED. gcry_module_t REMOVED. gcry_cipher_register REMOVED. gcry_cipher_unregister REMOVED. gcry_cipher_list REMOVED. gcry_pk_register REMOVED. gcry_pk_unregister REMOVED. gcry_pk_list REMOVED. gcry_md_register REMOVED. gcry_md_unregister REMOVED. gcry_md_list REMOVED. gcry_md_start_debug REMOVED (macro). gcry_md_stop_debug REMOVED (macro). GCRYCTL_SET_KEY REMOVED. GCRYCTL_SET_IV REMOVED. GCRYCTL_SET_CTR REMOVED. GCRYCTL_DISABLE_ALGO CHANGED: Not anymore thread-safe. gcry_pk_genkey CHANGED: ECC curve params not returned. gcry_md_hash_buffers NEW. gcry_buffer_t NEW. GCRYCTL_SET_ENFORCED_FIPS_FLAG NEW. GCRYCTL_SET_PREFERRED_RNG_TYPE NEW. GCRYCTL_GET_CURRENT_RNG_TYPE NEW. GCRYCTL_CLOSE_RANDOM_DEVICE NEW. GCRY_RNG_TYPE_STANDARD NEW. GCRY_RNG_TYPE_FIPS NEW. GCRY_RNG_TYPE_SYSTEM NEW. gcry_mpi_is_neg NEW. gcry_mpi_neg NEW. gcry_mpi_abs NEW. gcry_mpi_snatch NEW. gcry_mpi_set_opaque_copy NEW. gcry_mpi_point_t NEW. gcry_mpi_point_new NEW. gcry_mpi_point_release NEW. gcry_mpi_point_get NEW. gcry_mpi_point_snatch_get NEW. gcry_mpi_point_set NEW. gcry_mpi_point_snatch_set NEW. gcry_ctx_t NEW. gcry_ctx_release NEW. gcry_mpi_ec_new NEW. gcry_mpi_ec_get_mpi NEW. gcry_mpi_ec_get_point NEW. gcry_mpi_ec_set_mpi NEW. gcry_mpi_ec_set_point NEW. gcry_mpi_ec_get_affine NEW. 
gcry_mpi_ec_dup NEW. gcry_mpi_ec_add NEW. gcry_mpi_ec_mul NEW. gcry_mpi_ec_curve_point NEW. GCRYMPI_FLAG_IMMUTABLE NEW. GCRYMPI_FLAG_CONST NEW. GCRYMPI_FLAG_USER1 NEW. GCRYMPI_FLAG_USER2 NEW. GCRYMPI_FLAG_USER3 NEW. GCRYMPI_FLAG_USER4 NEW. GCRYMPI_CONST_ONE NEW. GCRYMPI_CONST_TWO NEW. GCRYMPI_CONST_THREE NEW. GCRYMPI_CONST_FOUR NEW. GCRYMPI_CONST_EIGHT NEW. GCRYMPI_FMT_OPAQUE NEW. GCRYPT_VERSION_NUMBER NEW. GCRY_KDF_SCRYPT NEW. gcry_pubkey_get_sexp NEW. GCRYCTL_DISABLE_LOCKED_SECMEM NEW. GCRYCTL_DISABLE_PRIV_DROP NEW. GCRY_CIPHER_SALSA20 NEW. gcry_sexp_nth_buffer NEW. gcry_sexp_extract_param NEW. GCRY_CIPHER_SALSA20R12 NEW. GCRY_CIPHER_GOST28147 NEW. GCRY_MD_GOSTR3411_94 NEW. GCRY_MD_STRIBOG256 NEW. GCRY_MD_STRIBOG512 NEW. GCRY_PK_ECC NEW. gcry_log_debug NEW. gcry_log_debughex NEW. gcry_log_debugmpi NEW. gcry_log_debugpnt NEW. -- Die Gedanken sind frei. Ausnahmen regelt ein Bundesgesetz. -------------- next part -------------- A non-text attachment was scrubbed... Name: not available Type: application/pgp-signature Size: 204 bytes Desc: not available URL: From dbaryshkov at gmail.com Mon Dec 16 19:05:22 2013 From: dbaryshkov at gmail.com (Dmitry Eremin-Solenikov) Date: Mon, 16 Dec 2013 22:05:22 +0400 Subject: [PATCH] Truncate hash values for ECDSA signature scheme In-Reply-To: <87y53kg241.fsf@vigenere.g10code.de> References: <1387211686-19661-1-git-send-email-dbaryshkov@gmail.com> <87y53kg241.fsf@vigenere.g10code.de> Message-ID: On Mon, Dec 16, 2013 at 9:03 PM, Werner Koch wrote: > On Mon, 16 Dec 2013 17:34, dbaryshkov at gmail.com said: >> * cipher/ecc-ecdsa.c (_gcry_ecc_ecdsa_sign, _gcry_ecc_ecdsa_verify): >> as required by ECDSA scheme, truncate hash values to bitlength of >> used curve. > > Please explain and name the specs. In particular I wonder about > truncating the less significant bits. I don't have access to specs (thanks ANSI), I'm still researching this topic. 
Wikipedia slightly mentions that: https://en.wikipedia.org/wiki/ECDSA At least this is what other libraries do: OpenSSL http://git.openssl.org/gitweb/?p=openssl.git;a=blob;f=crypto/ecdsa/ecs_ossl.c;h=adab1f74b41daf6e719ca1fdae1ba817085c7802;hb=HEAD#l309 Nettle: http://git.lysator.liu.se/nettle/nettle/blobs/master/ecc-ecdsa-sign.c#line86 http://git.lysator.liu.se/nettle/nettle/blobs/master/ecc-hash.c NSS: https://hg.mozilla.org/projects/nss/file/49360b638350/lib/freebl/ec.c#l746 Note: we are truncating hash, so there should be no difference in truncating LSB or MSB. Both should be equally distributed. -- With best wishes Dmitry From dbaryshkov at gmail.com Mon Dec 16 19:10:34 2013 From: dbaryshkov at gmail.com (Dmitry Eremin-Solenikov) Date: Mon, 16 Dec 2013 22:10:34 +0400 Subject: Libgcrypt 1.6.0 released In-Reply-To: <87haa8fzzm.fsf@vigenere.g10code.de> References: <87haa8fzzm.fsf@vigenere.g10code.de> Message-ID: Hello, On Mon, Dec 16, 2013 at 9:49 PM, Werner Koch wrote: > * Added limited support for the GOST 28147-89 cipher algorithm. > > * Added support for the GOST R 34.11-94 and R 34.11-2012 (Stribog) > hash algorithms. Missing a note regarding GOST R 34.10-2001/34.10-2012 ECC signatures support. This makes libgcrypt the second F/LOSS library to support all gost algorithms (with limitations, but...) and the first to support -2012 versions of algorithms. -- With best wishes Dmitry From dbaryshkov at gmail.com Mon Dec 16 19:17:26 2013 From: dbaryshkov at gmail.com (Dmitry Eremin-Solenikov) Date: Mon, 16 Dec 2013 22:17:26 +0400 Subject: [PATCH] Add a simple PKCS#1 padding mode In-Reply-To: <87txe8g1zk.fsf@vigenere.g10code.de> References: <1387212044-19849-1-git-send-email-dbaryshkov@gmail.com> <87txe8g1zk.fsf@vigenere.g10code.de> Message-ID: Hello, On Mon, Dec 16, 2013 at 9:05 PM, Werner Koch wrote: > On Mon, 16 Dec 2013 17:40, dbaryshkov at gmail.com said: > >> Allow user to specify (flags pkcs1) to enable pkcs1 padding of raw value >> (no hash algorithm is specified).
It is up to the user to verify that >> passed value is properly formatted and includes DER-encoded ASN OID of >> the hash function. > > Please explain this too. Why shall we add a crippled pkcs#1 mode? Is > this for the SHA1+MD5 hash of TLS? If so, we should add a special > hash-algo string. No, this is purely for users to verify hash algorithm OID on their own. Basically that is the way how GnuTLS expects to sign/verify to work (ATM). This can be changed, with cooperation from GnuTLS, but for now I wanted to be as non-intrusive as possible. -- With best wishes Dmitry From wk at gnupg.org Mon Dec 16 20:14:03 2013 From: wk at gnupg.org (Werner Koch) Date: Mon, 16 Dec 2013 20:14:03 +0100 Subject: [PATCH] Add an utility to calculate hashes over a set of files In-Reply-To: (Dmitry Eremin-Solenikov's message of "Mon, 16 Dec 2013 21:30:45 +0400") References: <1387212084-19904-1-git-send-email-dbaryshkov@gmail.com> <87ppowg1sd.fsf@vigenere.g10code.de> Message-ID: <87bo0gfw1w.fsf@vigenere.g10code.de> On Mon, 16 Dec 2013 18:30, dbaryshkov at gmail.com said: > This is mostly used for testing/verification, like mpicalc is. I used that > to check gost hashes implementation (to compare file caches with > hashes generated by other tools). When I had several similar I understand. For these cases I usually add options to the test programs. For example to tests/hashtest.c. Salam-Shalom, Werner -- Die Gedanken sind frei. Ausnahmen regelt ein Bundesgesetz.
From wk at gnupg.org Mon Dec 16 20:19:47 2013 From: wk at gnupg.org (Werner Koch) Date: Mon, 16 Dec 2013 20:19:47 +0100 Subject: [PATCH] Add a simple PKCS#1 padding mode In-Reply-To: (Dmitry Eremin-Solenikov's message of "Mon, 16 Dec 2013 22:17:26 +0400") References: <1387212044-19849-1-git-send-email-dbaryshkov@gmail.com> <87txe8g1zk.fsf@vigenere.g10code.de> Message-ID: <877gb4fvsc.fsf@vigenere.g10code.de> On Mon, 16 Dec 2013 19:17, dbaryshkov at gmail.com said: > No, this is purely for users to verify hash algorithm OID on their own. > Basically that is the way how GnuTLS expects to sign/verify to work I would prefer to use a flag for this or maybe better a separate encoding "pkcs1-raw". Shalom-Salam, Werner -- Die Gedanken sind frei. Ausnahmen regelt ein Bundesgesetz. From wk at gnupg.org Mon Dec 16 20:23:15 2013 From: wk at gnupg.org (Werner Koch) Date: Mon, 16 Dec 2013 20:23:15 +0100 Subject: [PATCH] Truncate hash values for ECDSA signature scheme In-Reply-To: (Dmitry Eremin-Solenikov's message of "Mon, 16 Dec 2013 22:05:22 +0400") References: <1387211686-19661-1-git-send-email-dbaryshkov@gmail.com> <87y53kg241.fsf@vigenere.g10code.de> Message-ID: <8738lsfvmk.fsf@vigenere.g10code.de> On Mon, 16 Dec 2013 19:05, dbaryshkov at gmail.com said: > Note: we are truncating hash, so there should be no difference in truncating > LSB or MSB. Both should be equally distributed. But that would be incompatible. I have not checked but it might be that we do the truncation in GnuPG. Needs to be further researched. Would you mind to open a ticket for this? Salam-Shalom, Werner -- Die Gedanken sind frei. Ausnahmen regelt ein Bundesgesetz.
From dbaryshkov at gmail.com Mon Dec 16 20:57:00 2013 From: dbaryshkov at gmail.com (Dmitry Eremin-Solenikov) Date: Mon, 16 Dec 2013 23:57:00 +0400 Subject: [PATCH] Truncate hash values for ECDSA signature scheme In-Reply-To: <8738lsfvmk.fsf@vigenere.g10code.de> References: <1387211686-19661-1-git-send-email-dbaryshkov@gmail.com> <87y53kg241.fsf@vigenere.g10code.de> <8738lsfvmk.fsf@vigenere.g10code.de> Message-ID: On Mon, Dec 16, 2013 at 11:23 PM, Werner Koch wrote: > On Mon, 16 Dec 2013 19:05, dbaryshkov at gmail.com said: > >> Note: we are truncating hash, so there should be no difference in truncating >> LSB or MSB. Both should be equally distributed. > > But that would we incompatible. I have not checked but it might be that > we do the truncation in GnuPG. Needs to be further researched. > > Would you mind to open a ticket for this? https://bugs.g10code.com/gnupg/issue1583 -- With best wishes Dmitry From dbaryshkov at gmail.com Mon Dec 16 23:47:10 2013 From: dbaryshkov at gmail.com (Dmitry Eremin-Solenikov) Date: Tue, 17 Dec 2013 02:47:10 +0400 Subject: [PATCH] Truncate hash values for ECDSA signature scheme In-Reply-To: <1387230490.32327.6.camel@aspire.lan> References: <1387211686-19661-1-git-send-email-dbaryshkov@gmail.com> <87y53kg241.fsf@vigenere.g10code.de> <1387230490.32327.6.camel@aspire.lan> Message-ID: On Tue, Dec 17, 2013 at 1:48 AM, Nikos Mavrogiannopoulos wrote: > On Mon, 2013-12-16 at 22:05 +0400, Dmitry Eremin-Solenikov wrote: > >> >> * cipher/ecc-ecdsa.c (_gcry_ecc_ecdsa_sign, _gcry_ecc_ecdsa_verify): >> >> as required by ECDSA scheme, truncate hash values to bitlength of >> >> used curve. >> > Please explain and name the specs. In particular I wonder about >> > truncating the less significant bits. >> >> I don't have access to specs (thanks ANSI), I'm still researching this topic. >> Wikipedia slighlty mentions that: https://en.wikipedia.org/wiki/ECDSA > > The spec for ECDSA (and DSA) is FIPS-186-4 [0]. 
I believe the text you > are looking for is: "When the length of the output of the hash function > is greater than the bit length of n, then the leftmost n bits of the > hash function output block shall be used in any calculation using the > hash function output during the generation or verification of a digital > signature." > > [0]. http://csrc.nist.gov/publications/PubsFIPS.html Ah, I see, I skimmed FIPS 186-4, but I was mostly paying attention to ECDSA paragraphs, not to the generic ones. However your quote broadens my question. I checked dsa_sign() function and its sign() part - it looks like gcrypt shoud also truncate an mpi there (it is done only for originally-opaque mpis, not for 'normal' ones). Should it or should it not? -- With best wishes Dmitry From nmav at gnutls.org Mon Dec 16 22:48:10 2013 From: nmav at gnutls.org (Nikos Mavrogiannopoulos) Date: Mon, 16 Dec 2013 22:48:10 +0100 Subject: [PATCH] Truncate hash values for ECDSA signature scheme In-Reply-To: References: <1387211686-19661-1-git-send-email-dbaryshkov@gmail.com> <87y53kg241.fsf@vigenere.g10code.de> Message-ID: <1387230490.32327.6.camel@aspire.lan> On Mon, 2013-12-16 at 22:05 +0400, Dmitry Eremin-Solenikov wrote: > >> * cipher/ecc-ecdsa.c (_gcry_ecc_ecdsa_sign, _gcry_ecc_ecdsa_verify): > >> as required by ECDSA scheme, truncate hash values to bitlength of > >> used curve. > > Please explain and name the specs. In particular I wonder about > > truncating the less significant bits. > > I don't have access to specs (thanks ANSI), I'm still researching this topic. > Wikipedia slighlty mentions that: https://en.wikipedia.org/wiki/ECDSA The spec for ECDSA (and DSA) is FIPS-186-4 [0]. I believe the text you are looking for is: "When the length of the output of the hash function is greater than the bit length of n, then the leftmost n bits of the hash function output block shall be used in any calculation using the hash function output during the generation or verification of a digital signature." 
[0]. http://csrc.nist.gov/publications/PubsFIPS.html regards, Nikos From dbaryshkov at gmail.com Tue Dec 17 00:09:46 2013 From: dbaryshkov at gmail.com (Dmitry Eremin-Solenikov) Date: Tue, 17 Dec 2013 03:09:46 +0400 Subject: [PATCH] Add an utility to calculate hashes over a set of files In-Reply-To: <87bo0gfw1w.fsf@vigenere.g10code.de> References: <1387212084-19904-1-git-send-email-dbaryshkov@gmail.com> <87ppowg1sd.fsf@vigenere.g10code.de> <87bo0gfw1w.fsf@vigenere.g10code.de> Message-ID: On Mon, Dec 16, 2013 at 11:14 PM, Werner Koch wrote: > On Mon, 16 Dec 2013 18:30, dbaryshkov at gmail.com said: > >> This is mostly used for testing/verification, like mpicalc is. I used that >> to check gost hashes implementation (to compare file caches with >> hashes generated by other tools). When I had several similar > > I understand. For these cases I usually add options to the test > programs. For example to tests/hashtest.c. I don't think it's worth doing that. What about just moving gchash to tests/? Or switching it to noinst_PROGRAMS ? -- With best wishes Dmitry From nmav at gnutls.org Tue Dec 17 08:37:35 2013 From: nmav at gnutls.org (Nikos Mavrogiannopoulos) Date: Tue, 17 Dec 2013 08:37:35 +0100 Subject: [PATCH] Truncate hash values for ECDSA signature scheme In-Reply-To: References: <1387211686-19661-1-git-send-email-dbaryshkov@gmail.com> <87y53kg241.fsf@vigenere.g10code.de> <1387230490.32327.6.camel@aspire.lan> Message-ID: <1387265855.1948.4.camel@aspire.lan> On Tue, 2013-12-17 at 02:47 +0400, Dmitry Eremin-Solenikov wrote: > Ah, I see, I skimmed FIPS 186-4, but I was mostly paying attention > to ECDSA paragraphs, not to the generic ones. However your > quote broadens my question. I checked dsa_sign() function and its > sign() part - it looks like gcrypt shoud also truncate an mpi there (it is done > only for originally-opaque mpis, not for 'normal' ones). > Should it or should it not? 
My understanding is that truncation applies to both DSA and ECDSA (I'm not aware of the difference in opaque-mpis and normal ones though). It is more interesting that truncation should also apply on the bit-level (i.e., on a curve of 255 bits, the truncation of SHA256 should be done by a single bit), but I don't think any implementation does that. regards, Nikos From wk at gnupg.org Tue Dec 17 09:28:21 2013 From: wk at gnupg.org (Werner Koch) Date: Tue, 17 Dec 2013 09:28:21 +0100 Subject: [PATCH] Add an utility to calculate hashes over a set of files In-Reply-To: (Dmitry Eremin-Solenikov's message of "Tue, 17 Dec 2013 03:09:46 +0400") References: <1387212084-19904-1-git-send-email-dbaryshkov@gmail.com> <87ppowg1sd.fsf@vigenere.g10code.de> <87bo0gfw1w.fsf@vigenere.g10code.de> Message-ID: <87haa7eva2.fsf@vigenere.g10code.de> On Tue, 17 Dec 2013 00:09, dbaryshkov at gmail.com said: > I don't think it's worth doing that. What about just moving gchash to tests/? > Or switching it to noinst_PROGRAMS ? tests/ and noinst_PROGRAMS is fine. That is actually the place were I would look for such tools. Salam-Shalom, Werner -- Die Gedanken sind frei. Ausnahmen regelt ein Bundesgesetz. From wk at gnupg.org Tue Dec 17 09:52:10 2013 From: wk at gnupg.org (Werner Koch) Date: Tue, 17 Dec 2013 09:52:10 +0100 Subject: [PATCH] Truncate hash values for ECDSA signature scheme In-Reply-To: <1387265855.1948.4.camel@aspire.lan> (Nikos Mavrogiannopoulos's message of "Tue, 17 Dec 2013 08:37:35 +0100") References: <1387211686-19661-1-git-send-email-dbaryshkov@gmail.com> <87y53kg241.fsf@vigenere.g10code.de> <1387230490.32327.6.camel@aspire.lan> <1387265855.1948.4.camel@aspire.lan> Message-ID: <87zjnzdflx.fsf@vigenere.g10code.de> On Tue, 17 Dec 2013 08:37, nmav at gnutls.org said: > My understanding is that truncation applies to both DSA and ECDSA (I'm Right. > not aware of the difference in opaque-mpis and normal ones though). 
It > is more interesting that truncation should also apply on the bit-level > (i.e., on a curve of 255 bits, the truncation of SHA256 should be done > by a single bit), but I don't think any implementation does that. Libgcrypt does that; it was not readily accessible by my brain, given that I mostly did EdDSA stuff the last weeks: /* Convert the INPUT into an MPI if needed. */ if (mpi_is_opaque (input)) { abuf = mpi_get_opaque (input, &abits); rc = _gcry_mpi_scan (&hash, GCRYMPI_FMT_USG, abuf, (abits+7)/8, NULL); if (rc) return rc; if (abits > qbits) mpi_rshift (hash, hash, abits - qbits); } else hash = input; I am not 100% sure that the conversion to an unsigned integer and then shifting the MPI is the right solution. However, given that the same code is in the DSA code and that passed the FIPS validation, it should be okay. In general I prefer to use opaque MPIs for hash values because that avoids the leading zero problems and is anyway better for data which is not a number. For historic reasons an MPI is sometimes passed to the functions and thus we need to implement the two cases. Having said this, I think it is okay to apply Dmitry's patch to the master (1.7). Salam-Shalom, Werner -- Die Gedanken sind frei. Ausnahmen regelt ein Bundesgesetz.
From wk at gnupg.org Tue Dec 17 09:57:04 2013 From: wk at gnupg.org (Werner Koch) Date: Tue, 17 Dec 2013 09:57:04 +0100 Subject: [PATCH] Truncate hash values for ECDSA signature scheme In-Reply-To: <1387211686-19661-1-git-send-email-dbaryshkov@gmail.com> (Dmitry Eremin-Solenikov's message of "Mon, 16 Dec 2013 20:34:46 +0400") References: <1387211686-19661-1-git-send-email-dbaryshkov@gmail.com> Message-ID: <87vbyndfdr.fsf@vigenere.g10code.de> On Mon, 16 Dec 2013 17:34, dbaryshkov at gmail.com said: > @@ -164,6 +173,7 @@ _gcry_ecc_ecdsa_verify (gcry_mpi_t input, ECC_public_key *pkey, > + if (abits > nbits) > + { > + mpi_rshift (h1, input, abits - nbits); > + mpi_mulm (h1, h1, h, pkey->E.n); > + } > + else > + mpi_mulm (h1, input, h, pkey->E.n); Either move this to ecc.c:ecc_verify or move the opaque code handling used there to here. Shalom-Salam, Werner -- Die Gedanken sind frei. Ausnahmen regelt ein Bundesgesetz. From dbaryshkov at gmail.com Tue Dec 17 12:37:11 2013 From: dbaryshkov at gmail.com (Dmitry Eremin-Solenikov) Date: Tue, 17 Dec 2013 15:37:11 +0400 Subject: [PATCH] Truncate hash values for ECDSA signature scheme In-Reply-To: <87zjnzdflx.fsf@vigenere.g10code.de> References: <1387211686-19661-1-git-send-email-dbaryshkov@gmail.com> <87y53kg241.fsf@vigenere.g10code.de> <1387230490.32327.6.camel@aspire.lan> <1387265855.1948.4.camel@aspire.lan> <87zjnzdflx.fsf@vigenere.g10code.de> Message-ID: On Tue, Dec 17, 2013 at 12:52 PM, Werner Koch wrote: > On Tue, 17 Dec 2013 08:37, nmav at gnutls.org said: > >> My understanding is that truncation applies to both DSA and ECDSA (I'm > > Right. > >> not aware of the difference in opaque-mpis and normal ones though). It >> is more interesting that truncation should also apply on the bit-level >> (i.e., on a curve of 255 bits, the truncation of SHA256 should be done >> by a single bit), but I don't think any implementation does that. 
> > Libgcrypt does that; itwas not readily accesible by my brain, given that > I mostly did EdDSA stuff the last weeks: > > /* Convert the INPUT into an MPI if needed. */ > if (mpi_is_opaque (input)) > { > abuf = mpi_get_opaque (input, &abits); > rc = _gcry_mpi_scan (&hash, GCRYMPI_FMT_USG, abuf, (abits+7)/8, NULL); > if (rc) > return rc; > if (abits > qbits) > mpi_rshift (hash, hash, abits - qbits); > } > else > hash = input; I was looking onto this code. If mpi is opaque, it will be shifted. Hopefully. If not, it will be used as is - not shifted! I think this code should be reimplemented as following (more or less) if (mpi_is_opaque(input)) { abuf = mpi_get_opaque (input, &abits); rc = _gcry_mpi_scan (&hash, GCRYMPI_FMT_USG, abuf, (abits+7)/8, NULL); } else { hash = mpi_copy(input) abits = mpi_get_nbits(input); } if (abits > qbits) mpi_rshift (hash, hash, abits - qbits); This would be more correct, isn't it? > I am not 100% that the conversion to an unsigned integer and then > shifting the MPI is the right solution. However, given that the same > code is in the DSA code and that passed the FIPS validation, it should > be okay. > > In general I prefer to use opaque MPIs for hash values because that > avoids the leading zero problems and is anyway better for data which is > not a number. For historic reasons an MPI is sometimes to passed to the > fucntions and thus we need to implement the two cases. This makes sense. 
-- With best wishes Dmitry From dbaryshkov at gmail.com Tue Dec 17 12:38:04 2013 From: dbaryshkov at gmail.com (Dmitry Eremin-Solenikov) Date: Tue, 17 Dec 2013 15:38:04 +0400 Subject: [PATCH] Truncate hash values for ECDSA signature scheme In-Reply-To: <87vbyndfdr.fsf@vigenere.g10code.de> References: <1387211686-19661-1-git-send-email-dbaryshkov@gmail.com> <87vbyndfdr.fsf@vigenere.g10code.de> Message-ID: On Tue, Dec 17, 2013 at 12:57 PM, Werner Koch wrote: > On Mon, 16 Dec 2013 17:34, dbaryshkov at gmail.com said: > >> @@ -164,6 +173,7 @@ _gcry_ecc_ecdsa_verify (gcry_mpi_t input, ECC_public_key *pkey, > >> + if (abits > nbits) >> + { >> + mpi_rshift (h1, input, abits - nbits); >> + mpi_mulm (h1, h1, h, pkey->E.n); >> + } >> + else >> + mpi_mulm (h1, input, h, pkey->E.n); > > Either move this to ecc.c:ecc_verify or move the opaque code handling > used there to here. What about Ed25519 and truncation of hashes? -- With best wishes Dmitry From wk at gnupg.org Tue Dec 17 14:53:47 2013 From: wk at gnupg.org (Werner Koch) Date: Tue, 17 Dec 2013 14:53:47 +0100 Subject: [PATCH] Truncate hash values for ECDSA signature scheme In-Reply-To: (Dmitry Eremin-Solenikov's message of "Tue, 17 Dec 2013 15:37:11 +0400") References: <1387211686-19661-1-git-send-email-dbaryshkov@gmail.com> <87y53kg241.fsf@vigenere.g10code.de> <1387230490.32327.6.camel@aspire.lan> <1387265855.1948.4.camel@aspire.lan> <87zjnzdflx.fsf@vigenere.g10code.de> Message-ID: <87wqj3bn2s.fsf@vigenere.g10code.de> On Tue, 17 Dec 2013 12:37, dbaryshkov at gmail.com said: > if (mpi_is_opaque(input)) > { > abuf = mpi_get_opaque (input, &abits); > rc = _gcry_mpi_scan (&hash, GCRYMPI_FMT_USG, abuf, (abits+7)/8, NULL); > } > else > { > hash = mpi_copy(input) > abits = mpi_get_nbits(input); > } > if (abits > qbits) > mpi_rshift (hash, hash, abits - qbits); > > This would be more correct, isn't it? Yes. Shalom-Salam, Werner -- Die Gedanken sind frei. Ausnahmen regelt ein Bundesgesetz. 
From wk at gnupg.org Tue Dec 17 14:55:44 2013 From: wk at gnupg.org (Werner Koch) Date: Tue, 17 Dec 2013 14:55:44 +0100 Subject: [PATCH] Truncate hash values for ECDSA signature scheme In-Reply-To: (Dmitry Eremin-Solenikov's message of "Tue, 17 Dec 2013 15:38:04 +0400") References: <1387211686-19661-1-git-send-email-dbaryshkov@gmail.com> <87vbyndfdr.fsf@vigenere.g10code.de> Message-ID: <87sitrbmzj.fsf@vigenere.g10code.de> On Tue, 17 Dec 2013 12:38, dbaryshkov at gmail.com said: > What about Ed25519 and truncation of hashes? Ed25519 is used with EdDSA which is its own signature scheme. If you use it with ECDSA, the same ECDSA code is used - but its use is not suggested. The code has only been implemented for a special use case in GNUnet. Salam-Shalom, Werner -- Die Gedanken sind frei. Ausnahmen regelt ein Bundesgesetz. From jussi.kivilinna at iki.fi Tue Dec 17 15:37:11 2013 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Tue, 17 Dec 2013 16:37:11 +0200 Subject: [PATCH 1/6] Add bulk processing for hash transform functions Message-ID: <20131217143711.31473.21328.stgit@localhost6.localdomain6> * cipher/hash-common.c (_gcry_md_block_write): Preload 'hd->blocksize' to stack, pass number of blocks to 'hd->bwrite'. * cipher/hash-common.c (_gcry_md_block_write_t): Add 'nblks'. * cipher/gostr3411-94.c: Rename 'transform' function to 'transform_blk', add new 'transform' function with 'nblks' as additional input. * cipher/md4.c: Ditto. * cipher/md5.c: Ditto. * cipher/md4.c: Ditto. * cipher/rmd160.c: Ditto. * cipher/sha1.c: Ditto. * cipher/sha256.c: Ditto. * cipher/sha512.c: Ditto. * cipher/stribog.c: Ditto. * cipher/tiger.c: Ditto. * cipher/whirlpool.c: Ditto. -- Pass number of blocks to algorithm for further optimizations.
Signed-off-by: Jussi Kivilinna --- cipher/gostr3411-94.c | 21 +++++++++++++++++++-- cipher/hash-common.c | 25 ++++++++++++++----------- cipher/hash-common.h | 3 ++- cipher/md4.c | 21 ++++++++++++++++++--- cipher/md5.c | 23 +++++++++++++++++++---- cipher/rmd160.c | 24 ++++++++++++++++++++---- cipher/sha1.c | 30 +++++++++++++++++++++++------- cipher/sha256.c | 20 ++++++++++++++------ cipher/sha512.c | 31 ++++++++++++++++++++++--------- cipher/stribog.c | 27 +++++++++++++++++++++------ cipher/tiger.c | 22 +++++++++++++++++++--- cipher/whirlpool.c | 19 +++++++++++++++++-- 12 files changed, 208 insertions(+), 58 deletions(-) diff --git a/cipher/gostr3411-94.c b/cipher/gostr3411-94.c index 1267216..b3326aa 100644 --- a/cipher/gostr3411-94.c +++ b/cipher/gostr3411-94.c @@ -41,7 +41,7 @@ typedef struct { } GOSTR3411_CONTEXT; static unsigned int -transform (void *c, const unsigned char *data); +transform (void *c, const unsigned char *data, size_t nblks); static void gost3411_init (void *context) @@ -211,7 +211,7 @@ do_hash_step (GOST28147_context *hd, unsigned char *h, unsigned char *m) static unsigned int -transform (void *ctx, const unsigned char *data) +transform_blk (void *ctx, const unsigned char *data) { GOSTR3411_CONTEXT *hd = ctx; byte m[32]; @@ -224,6 +224,23 @@ transform (void *ctx, const unsigned char *data) return /* burn_stack */ burn + 3 * sizeof(void*) + 32 + 2 * sizeof(void*); } + +static unsigned int +transform ( void *c, const unsigned char *data, size_t nblks ) +{ + unsigned int burn; + + do + { + burn = transform_blk (c, data); + data += 32; + } + while (--nblks); + + return burn; +} + + /* The routine finally terminates the computation and returns the digest. 
The handle is prepared for a new cycle, but adding bytes diff --git a/cipher/hash-common.c b/cipher/hash-common.c index ffbc39e..ed63a0b 100644 --- a/cipher/hash-common.c +++ b/cipher/hash-common.c @@ -102,16 +102,18 @@ _gcry_md_block_write (void *context, const void *inbuf_arg, size_t inlen) const unsigned char *inbuf = inbuf_arg; gcry_md_block_ctx_t *hd = context; unsigned int stack_burn = 0; + const unsigned int blocksize = hd->blocksize; + size_t inblocks; - if (sizeof(hd->buf) < hd->blocksize) + if (sizeof(hd->buf) < blocksize) BUG(); if (hd->buf == NULL || hd->bwrite == NULL) return; - if (hd->count == hd->blocksize) /* Flush the buffer. */ + if (hd->count == blocksize) /* Flush the buffer. */ { - stack_burn = hd->bwrite (hd, hd->buf); + stack_burn = hd->bwrite (hd, hd->buf, 1); _gcry_burn_stack (stack_burn); stack_burn = 0; hd->count = 0; @@ -123,23 +125,24 @@ _gcry_md_block_write (void *context, const void *inbuf_arg, size_t inlen) if (hd->count) { - for (; inlen && hd->count < hd->blocksize; inlen--) + for (; inlen && hd->count < blocksize; inlen--) hd->buf[hd->count++] = *inbuf++; _gcry_md_block_write (hd, NULL, 0); if (!inlen) return; } - while (inlen >= hd->blocksize) + if (inlen >= blocksize) { - stack_burn = hd->bwrite (hd, inbuf); + inblocks = inlen / blocksize; + stack_burn = hd->bwrite (hd, inbuf, inblocks); hd->count = 0; - if (!++hd->nblocks) - hd->nblocks_high++; - inlen -= hd->blocksize; - inbuf += hd->blocksize; + hd->nblocks_high += (hd->nblocks + inblocks < inblocks); + hd->nblocks += inblocks; + inlen -= inblocks * blocksize; + inbuf += inblocks * blocksize; } _gcry_burn_stack (stack_burn); - for (; inlen && hd->count < hd->blocksize; inlen--) + for (; inlen && hd->count < blocksize; inlen--) hd->buf[hd->count++] = *inbuf++; } diff --git a/cipher/hash-common.h b/cipher/hash-common.h index aa95365..c949191 100644 --- a/cipher/hash-common.h +++ b/cipher/hash-common.h @@ -30,7 +30,8 @@ const char * _gcry_hash_selftest_check_one /* Type for the 
md_write helper function. */ typedef unsigned int (*_gcry_md_block_write_t) (void *c, - const unsigned char *buf); + const unsigned char *blks, + size_t nblks); #if defined(HAVE_U64_TYPEDEF) && (defined(USE_SHA512) || defined(USE_WHIRLPOOL)) /* SHA-512 needs u64 and larger buffer. Whirlpool needs u64. */ diff --git a/cipher/md4.c b/cipher/md4.c index b9a1a95..40dc058 100644 --- a/cipher/md4.c +++ b/cipher/md4.c @@ -66,7 +66,7 @@ typedef struct { } MD4_CONTEXT; static unsigned int -transform ( void *c, const unsigned char *data ); +transform ( void *c, const unsigned char *data, size_t nblks ); static void md4_init( void *context ) @@ -94,7 +94,7 @@ md4_init( void *context ) * transform 64 bytes */ static unsigned int -transform ( void *c, const unsigned char *data ) +transform_blk ( void *c, const unsigned char *data ) { MD4_CONTEXT *ctx = c; u32 in[16]; @@ -181,6 +181,21 @@ transform ( void *c, const unsigned char *data ) } +static unsigned int +transform ( void *c, const unsigned char *data, size_t nblks ) +{ + unsigned int burn; + + do + { + burn = transform_blk (c, data); + data += 64; + } + while (--nblks); + + return burn; +} + /* The routine final terminates the message-digest computation and * ends with the desired message digest in mdContext->digest[0...15]. 
@@ -234,7 +249,7 @@ md4_final( void *context ) /* append the 64 bit count */ buf_put_le32(hd->bctx.buf + 56, lsb); buf_put_le32(hd->bctx.buf + 60, msb); - burn = transform( hd, hd->bctx.buf ); + burn = transform ( hd, hd->bctx.buf, 1 ); _gcry_burn_stack (burn); p = hd->bctx.buf; diff --git a/cipher/md5.c b/cipher/md5.c index 79b6e87..d06d3f7 100644 --- a/cipher/md5.c +++ b/cipher/md5.c @@ -50,7 +50,7 @@ typedef struct { } MD5_CONTEXT; static unsigned int -transform ( void *ctx, const unsigned char *data ); +transform ( void *ctx, const unsigned char *data, size_t datalen ); static void md5_init( void *context ) @@ -81,10 +81,10 @@ md5_init( void *context ) /**************** - * transform n*64 bytes + * transform 64 bytes */ static unsigned int -transform ( void *c, const unsigned char *data ) +transform_blk ( void *c, const unsigned char *data ) { MD5_CONTEXT *ctx = c; u32 correct_words[16]; @@ -205,6 +205,21 @@ transform ( void *c, const unsigned char *data ) } +static unsigned int +transform ( void *c, const unsigned char *data, size_t nblks ) +{ + unsigned int burn; + + do + { + burn = transform_blk (c, data); + data += 64; + } + while (--nblks); + + return burn; +} + /* The routine final terminates the message-digest computation and * ends with the desired message digest in mdContext->digest[0...15]. 
@@ -258,7 +273,7 @@ md5_final( void *context) /* append the 64 bit count */ buf_put_le32(hd->bctx.buf + 56, lsb); buf_put_le32(hd->bctx.buf + 60, msb); - burn = transform( hd, hd->bctx.buf ); + burn = transform ( hd, hd->bctx.buf, 1 ); _gcry_burn_stack (burn); p = hd->bctx.buf; diff --git a/cipher/rmd160.c b/cipher/rmd160.c index a6d9a80..224694f 100644 --- a/cipher/rmd160.c +++ b/cipher/rmd160.c @@ -141,7 +141,7 @@ */ static unsigned int -transform ( void *ctx, const unsigned char *data ); +transform ( void *ctx, const unsigned char *data, size_t nblks ); void _gcry_rmd160_init (void *context) @@ -167,7 +167,7 @@ _gcry_rmd160_init (void *context) * Transform the message X which consists of 16 32-bit-words */ static unsigned int -transform ( void *ctx, const unsigned char *data ) +transform_blk ( void *ctx, const unsigned char *data ) { RMD160_CONTEXT *hd = ctx; register u32 a,b,c,d,e; @@ -386,6 +386,22 @@ transform ( void *ctx, const unsigned char *data ) } +static unsigned int +transform ( void *c, const unsigned char *data, size_t nblks ) +{ + unsigned int burn; + + do + { + burn = transform_blk (c, data); + data += 64; + } + while (--nblks); + + return burn; +} + + /**************** * Apply the rmd160 transform function on the buffer which must have * a length 64 bytes. 
Do not use this function together with the @@ -397,7 +413,7 @@ _gcry_rmd160_mixblock ( RMD160_CONTEXT *hd, void *blockof64byte ) { char *p = blockof64byte; - transform ( hd, blockof64byte ); + transform ( hd, blockof64byte, 1 ); /* NBLKS is a count of 64-byte blocks, not a byte length: mix exactly one block. */ #define X(a) do { *(u32*)p = hd->h##a ; p += 4; } while(0) X(0); X(1); @@ -457,7 +473,7 @@ rmd160_final( void *context ) /* append the 64 bit count */ buf_put_le32(hd->bctx.buf + 56, lsb); buf_put_le32(hd->bctx.buf + 60, msb); - burn = transform( hd, hd->bctx.buf ); + burn = transform ( hd, hd->bctx.buf, 1 ); _gcry_burn_stack (burn); p = hd->bctx.buf; diff --git a/cipher/sha1.c b/cipher/sha1.c index 18b6daa..53f7538 100644 --- a/cipher/sha1.c +++ b/cipher/sha1.c @@ -70,7 +70,7 @@ typedef struct } SHA1_CONTEXT; static unsigned int -transform (void *c, const unsigned char *data); +transform (void *c, const unsigned char *data, size_t nblks); static void @@ -122,7 +122,7 @@ sha1_init (void *context) * Transform NBLOCKS of each 64 bytes (16 32-bit words) at DATA. */ static unsigned int -_transform (void *ctx, const unsigned char *data) +transform_blk (void *ctx, const unsigned char *data) { SHA1_CONTEXT *hd = ctx; const u32 *idata = (const void *)data; @@ -239,17 +239,33 @@ _gcry_sha1_transform_amd64_ssse3 (void *state, const unsigned char *data); static unsigned int -transform (void *ctx, const unsigned char *data) +transform (void *ctx, const unsigned char *data, size_t nblks) { SHA1_CONTEXT *hd = ctx; + unsigned int burn; #ifdef USE_SSSE3 if (hd->use_ssse3) - return _gcry_sha1_transform_amd64_ssse3 (&hd->h0, data) - + 4 * sizeof(void*); + { + do + { + burn = _gcry_sha1_transform_amd64_ssse3 (&hd->h0, data); + data += 64; + } + while (--nblks); + + return burn + 4 * sizeof(void*); + } #endif - return _transform (hd, data); + do + { + burn = transform_blk (ctx, data); + data += 64; + } + while (--nblks); + + return burn; } @@ -306,7 +322,7 @@ sha1_final(void *context) /* append the 64 bit count */ buf_put_be32(hd->bctx.buf + 56, msb);
buf_put_be32(hd->bctx.buf + 60, lsb); - burn = transform( hd, hd->bctx.buf ); + burn = transform( hd, hd->bctx.buf, 1 ); _gcry_burn_stack (burn); p = hd->bctx.buf; diff --git a/cipher/sha256.c b/cipher/sha256.c index f3c1d62..c2045b8 100644 --- a/cipher/sha256.c +++ b/cipher/sha256.c @@ -66,7 +66,7 @@ typedef struct { static unsigned int -transform (void *c, const unsigned char *data); +transform (void *c, const unsigned char *data, size_t nblks); static void @@ -170,7 +170,7 @@ Sum1 (u32 x) static unsigned int -_transform (void *ctx, const unsigned char *data) +transform_blk (void *ctx, const unsigned char *data) { SHA256_CONTEXT *hd = ctx; static const u32 K[64] = { @@ -283,17 +283,25 @@ unsigned int _gcry_sha256_transform_amd64_ssse3(const void *input_data, static unsigned int -transform (void *ctx, const unsigned char *data) +transform (void *ctx, const unsigned char *data, size_t nblks) { SHA256_CONTEXT *hd = ctx; + unsigned int burn; #ifdef USE_SSSE3 if (hd->use_ssse3) - return _gcry_sha256_transform_amd64_ssse3 (data, &hd->h0, 1) + return _gcry_sha256_transform_amd64_ssse3 (data, &hd->h0, nblks) + 4 * sizeof(void*); #endif - return _transform (hd, data); + do + { + burn = transform_blk (hd, data); + data += 64; + } + while (--nblks); + + return burn; } @@ -348,7 +356,7 @@ sha256_final(void *context) /* append the 64 bit count */ buf_put_be32(hd->bctx.buf + 56, msb); buf_put_be32(hd->bctx.buf + 60, lsb); - burn = transform (hd, hd->bctx.buf); + burn = transform (hd, hd->bctx.buf, 1); _gcry_burn_stack (burn); p = hd->bctx.buf; diff --git a/cipher/sha512.c b/cipher/sha512.c index 586c809..215e8ed 100644 --- a/cipher/sha512.c +++ b/cipher/sha512.c @@ -116,7 +116,7 @@ typedef struct } SHA512_CONTEXT; static unsigned int -transform (void *context, const unsigned char *data); +transform (void *context, const unsigned char *data, size_t nblks); static void sha512_init (void *context) @@ -273,7 +273,7 @@ static const u64 k[] = * Transform the message W which consists 
of 16 64-bit-words */ static unsigned int -__transform (SHA512_STATE *hd, const unsigned char *data) +transform_blk (SHA512_STATE *hd, const unsigned char *data) { u64 a, b, c, d, e, f, g, h; u64 w[16]; @@ -561,32 +561,38 @@ unsigned int _gcry_sha512_transform_amd64_avx2(const void *input_data, static unsigned int -transform (void *context, const unsigned char *data) +transform (void *context, const unsigned char *data, size_t nblks) { SHA512_CONTEXT *ctx = context; + unsigned int burn; #ifdef USE_AVX2 if (ctx->use_avx2) - return _gcry_sha512_transform_amd64_avx2 (data, &ctx->state, 1) + return _gcry_sha512_transform_amd64_avx2 (data, &ctx->state, nblks) + 4 * sizeof(void*); #endif #ifdef USE_AVX if (ctx->use_avx) - return _gcry_sha512_transform_amd64_avx (data, &ctx->state, 1) + return _gcry_sha512_transform_amd64_avx (data, &ctx->state, nblks) + 4 * sizeof(void*); #endif #ifdef USE_SSSE3 if (ctx->use_ssse3) - return _gcry_sha512_transform_amd64_ssse3 (data, &ctx->state, 1) + return _gcry_sha512_transform_amd64_ssse3 (data, &ctx->state, nblks) + 4 * sizeof(void*); #endif #ifdef USE_ARM_NEON_ASM if (ctx->use_neon) { - _gcry_sha512_transform_armv7_neon (&ctx->state, data, k); + do + { + _gcry_sha512_transform_armv7_neon (&ctx->state, data, k); + data += 128; + } + while (--nblks); /* _gcry_sha512_transform_armv7_neon does not store sensitive data * to stack. 
*/ @@ -594,7 +600,14 @@ transform (void *context, const unsigned char *data) } #endif - return __transform (&ctx->state, data) + 3 * sizeof(void*); + do + { + burn = transform_blk (&ctx->state, data) + 3 * sizeof(void*); + data += 128; + } + while (--nblks); + + return burn; } @@ -652,7 +665,7 @@ sha512_final (void *context) /* append the 128 bit count */ buf_put_be64(hd->bctx.buf + 112, msb); buf_put_be64(hd->bctx.buf + 120, lsb); - stack_burn_depth = transform (hd, hd->bctx.buf); + stack_burn_depth = transform (hd, hd->bctx.buf, 1); _gcry_burn_stack (stack_burn_depth); p = hd->bctx.buf; diff --git a/cipher/stribog.c b/cipher/stribog.c index a3cab36..6d1d342 100644 --- a/cipher/stribog.c +++ b/cipher/stribog.c @@ -1194,7 +1194,7 @@ static inline void g (u64 *h, u64 *m, u64 *N) static unsigned int -transform64 (void *context, const unsigned char *inbuf_arg); +transform (void *context, const unsigned char *inbuf_arg, size_t datalen); static void @@ -1205,7 +1205,7 @@ stribog_init_512 (void *context) memset (hd, 0, sizeof (*hd)); hd->bctx.blocksize = 64; - hd->bctx.bwrite = transform64; + hd->bctx.bwrite = transform; } static void @@ -1217,7 +1217,7 @@ stribog_init_256 (void *context) } static void -transform (STRIBOG_CONTEXT *hd, const unsigned char *data, unsigned count) +transform_bits (STRIBOG_CONTEXT *hd, const unsigned char *data, unsigned count) { u64 M[8]; u64 l; @@ -1248,15 +1248,30 @@ transform (STRIBOG_CONTEXT *hd, const unsigned char *data, unsigned count) } static unsigned int -transform64 (void *context, const unsigned char *inbuf_arg) +transform_blk (void *context, const unsigned char *inbuf_arg) { STRIBOG_CONTEXT *hd = context; - transform (hd, inbuf_arg, 64 * 8); + transform_bits (hd, inbuf_arg, 64 * 8); return /* burn_stack */ 768; } +static unsigned int +transform ( void *c, const unsigned char *data, size_t nblks ) +{ + unsigned int burn; + + do + { + burn = transform_blk (c, data); + data += 64; + } + while (--nblks); + + return burn; +} + /* The 
routine finally terminates the computation and returns the digest. The handle is prepared for a new cycle, but adding bytes @@ -1276,7 +1291,7 @@ stribog_final (void *context) hd->bctx.buf[i++] = 1; while (i < 64) hd->bctx.buf[i++] = 0; - transform (hd, hd->bctx.buf, hd->bctx.count * 8); + transform_bits (hd, hd->bctx.buf, hd->bctx.count * 8); g (hd->h, hd->N, Z); g (hd->h, hd->Sigma, Z); diff --git a/cipher/tiger.c b/cipher/tiger.c index 9b8d0ef..17c4119 100644 --- a/cipher/tiger.c +++ b/cipher/tiger.c @@ -590,7 +590,7 @@ static u64 sbox4[256] = { }; static unsigned int -transform ( void *ctx, const unsigned char *data ); +transform ( void *ctx, const unsigned char *data, size_t nblks ); static void do_init (void *context, int variant) @@ -695,7 +695,7 @@ key_schedule( u64 *x ) * Transform the message DATA which consists of 512 bytes (8 words) */ static unsigned int -transform ( void *ctx, const unsigned char *data ) +transform_blk ( void *ctx, const unsigned char *data ) { TIGER_CONTEXT *hd = ctx; u64 a,b,c,aa,bb,cc; @@ -729,6 +729,22 @@ transform ( void *ctx, const unsigned char *data ) } +static unsigned int +transform ( void *c, const unsigned char *data, size_t nblks ) +{ + unsigned int burn; + + do + { + burn = transform_blk (c, data); + data += 64; + } + while (--nblks); + + return burn; +} + + /* The routine terminates the computation */ @@ -779,7 +795,7 @@ tiger_final( void *context ) /* append the 64 bit count */ buf_put_le32(hd->bctx.buf + 56, lsb); buf_put_le32(hd->bctx.buf + 60, msb); - burn = transform( hd, hd->bctx.buf ); + burn = transform( hd, hd->bctx.buf, 1 ); _gcry_burn_stack (burn); p = hd->bctx.buf; diff --git a/cipher/whirlpool.c b/cipher/whirlpool.c index e562781..57ca882 100644 --- a/cipher/whirlpool.c +++ b/cipher/whirlpool.c @@ -1161,7 +1161,7 @@ static const u64 C7[256] = static unsigned int -whirlpool_transform (void *ctx, const unsigned char *data); +whirlpool_transform (void *ctx, const unsigned char *data, size_t nblks); @@ -1181,7 
+1181,7 @@ whirlpool_init (void *ctx) * Transform block. */ static unsigned int -whirlpool_transform (void *ctx, const unsigned char *data) +whirlpool_transform_blk (void *ctx, const unsigned char *data) { whirlpool_context_t *context = ctx; whirlpool_block_t data_block; @@ -1280,6 +1280,21 @@ whirlpool_transform (void *ctx, const unsigned char *data) 4 * sizeof(void*); } +static unsigned int +whirlpool_transform ( void *c, const unsigned char *data, size_t nblks ) +{ + unsigned int burn; + + do + { + burn = whirlpool_transform_blk (c, data); + data += BLOCK_SIZE; + } + while (--nblks); + + return burn; +} + static void whirlpool_write (void *ctx, const void *buffer, size_t buffer_n) { From jussi.kivilinna at iki.fi Tue Dec 17 15:37:21 2013 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Tue, 17 Dec 2013 16:37:21 +0200 Subject: [PATCH 3/6] Add AVX and AVX/BMI2 implementations for SHA-1 In-Reply-To: <20131217143711.31473.21328.stgit@localhost6.localdomain6> References: <20131217143711.31473.21328.stgit@localhost6.localdomain6> Message-ID: <20131217143721.31473.61442.stgit@localhost6.localdomain6> * cipher/Makefile.am: Add 'sha1-avx-amd64.S' and 'sha1-avx-bmi2-amd64.S'. * cipher/sha1-avx-amd64.S: New. * cipher/sha1-avx-bmi2-amd64.S: New. * cipher/sha1.c (USE_AVX, USE_BMI2): New. (SHA1_CONTEXT) [USE_AVX]: Add 'use_avx'. (SHA1_CONTEXT) [USE_BMI2]: Add 'use_bmi2'. (sha1_init): Initialize 'use_avx' and 'use_bmi2'. [USE_AVX] (_gcry_sha1_transform_amd64_avx): New. [USE_BMI2] (_gcry_sha1_transform_amd64_avx_bmi2): New. (transform) [USE_BMI2]: Use BMI2 assembly if enabled. (transform) [USE_AVX]: Use AVX assembly if enabled. * configure.ac: Add 'sha1-avx-amd64.lo' and 'sha1-avx-bmi2-amd64.lo'. -- Patch adds AVX (for Sandybridge and Ivybridge) and AVX/BMI2 (for Haswell) optimized implementations of SHA-1. Note: AVX implementation is currently limited to Intel CPUs due to use of SHLD instruction for faster rotations on Sandybridge.
Benchmarks: cpu C-version SSSE3 AVX/(SHLD|BMI2) New vs C New vs SSSE3 Intel i5-4570 8.84 c/B 4.61 c/B 3.86 c/B 2.29x 1.19x Intel i5-2450M 9.45 c/B 5.30 c/B 4.39 c/B 2.15x 1.20x Signed-off-by: Jussi Kivilinna --- cipher/Makefile.am | 2 cipher/sha1-avx-amd64.S | 420 ++++++++++++++++++++++++++++++++++++++++++ cipher/sha1-avx-bmi2-amd64.S | 417 ++++++++++++++++++++++++++++++++++++++++++ cipher/sha1.c | 54 +++++ configure.ac | 2 5 files changed, 893 insertions(+), 2 deletions(-) create mode 100644 cipher/sha1-avx-amd64.S create mode 100644 cipher/sha1-avx-bmi2-amd64.S diff --git a/cipher/Makefile.am b/cipher/Makefile.am index a1718c5..3ec651f 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -77,7 +77,7 @@ salsa20.c salsa20-amd64.S salsa20-armv7-neon.S \ scrypt.c \ seed.c \ serpent.c serpent-sse2-amd64.S serpent-avx2-amd64.S \ -sha1.c sha1-ssse3-amd64.S \ +sha1.c sha1-ssse3-amd64.S sha1-avx-amd64.S sha1-avx-bmi2-amd64.S \ sha256.c sha256-ssse3-amd64.S \ sha512.c sha512-ssse3-amd64.S sha512-avx-amd64.S sha512-avx2-bmi2-amd64.S \ sha512-armv7-neon.S \ diff --git a/cipher/sha1-avx-amd64.S b/cipher/sha1-avx-amd64.S new file mode 100644 index 0000000..233ad51 --- /dev/null +++ b/cipher/sha1-avx-amd64.S @@ -0,0 +1,420 @@ +/* sha1-avx-amd64.S - Intel AVX accelerated SHA-1 transform function + * Copyright © 2013 Jussi Kivilinna + * + * Based on sha1.c: + * Copyright (C) 1998, 2001, 2002, 2003, 2008 Free Software Foundation, Inc. + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +/* + * Intel SSSE3 accelerated SHA-1 implementation based on white paper: + * "Improving the Performance of the Secure Hash Algorithm (SHA-1)" + * http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1 + */ + +#ifdef __x86_64__ +#include <config.h> + +#if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_BMI2) && \ + defined(HAVE_GCC_INLINE_ASM_AVX2) && defined(USE_SHA1) + +#ifdef __PIC__ +# define RIP (%rip) +#else +# define RIP +#endif + + +/* Context structure */ + +#define state_h0 0 +#define state_h1 4 +#define state_h2 8 +#define state_h3 12 +#define state_h4 16 + + +/* Constants */ + +.data +#define K1 0x5A827999 +#define K2 0x6ED9EBA1 +#define K3 0x8F1BBCDC +#define K4 0xCA62C1D6 +.align 16 +.LK_XMM: +.LK1: .long K1, K1, K1, K1 +.LK2: .long K2, K2, K2, K2 +.LK3: .long K3, K3, K3, K3 +.LK4: .long K4, K4, K4, K4 + +.Lbswap_shufb_ctl: + .long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f + + +/* Register macros */ + +#define RSTATE %r8 +#define RDATA %r9 +#define ROLDSTACK %r10 +#define RNBLKS %r11 + +#define a %eax +#define b %ebx +#define c %ecx +#define d %edx +#define e %edi + +#define RT0 %esi +#define RT1 %ebp + +#define Wtmp0 %xmm0 +#define Wtmp1 %xmm1 + +#define W0 %xmm2 +#define W1 %xmm3 +#define W2 %xmm4 +#define W3 %xmm5 +#define W4 %xmm6 +#define W5 %xmm7 +#define W6 %xmm8 +#define W7 %xmm9 + +#define BSWAP_REG %xmm10 + + +/* Round function macros.
*/ + +#define WK(i) (((i) & 15) * 4)(%rsp) + +#define R_F1(a,b,c,d,e,i) \ + movl c, RT0; \ + addl WK(i), e; \ + xorl d, RT0; \ + movl a, RT1; \ + andl b, RT0; \ + shldl $30, b, b; \ + xorl d, RT0; \ + leal (RT0,e), e; \ + shldl $5, RT1, RT1; \ + addl RT1, e; + +#define R_F2(a,b,c,d,e,i) \ + movl c, RT0; \ + addl WK(i), e; \ + xorl b, RT0; \ + shldl $30, b, b; \ + xorl d, RT0; \ + movl a, RT1; \ + leal (RT0,e), e; \ + shldl $5, RT1, RT1; \ + addl RT1, e; + +#define R_F3(a,b,c,d,e,i) \ + movl c, RT0; \ + movl b, RT1; \ + xorl b, RT0; \ + andl c, RT1; \ + andl d, RT0; \ + addl RT1, e; \ + addl WK(i), e; \ + shldl $30, b, b; \ + movl a, RT1; \ + leal (RT0,e), e; \ + shldl $5, RT1, RT1; \ + addl RT1, e; + +#define R_F4(a,b,c,d,e,i) R_F2(a,b,c,d,e,i) + +#define R(a,b,c,d,e,f,i) \ + R_##f(a,b,c,d,e,i) + + +/* Input expansion macros. */ + +#define W_PRECALC_00_15_0(i, W, tmp0) \ + vmovdqu (4*(i))(RDATA), tmp0; + +#define W_PRECALC_00_15_1(i, W, tmp0) \ + vpshufb BSWAP_REG, tmp0, W; + +#define W_PRECALC_00_15_2(i, W, tmp0) \ + vpaddd (.LK_XMM + ((i)/20)*16) RIP, W, tmp0; + +#define W_PRECALC_00_15_3(i, W, tmp0) \ + vmovdqa tmp0, WK(i&~3); + +#define W_PRECALC_16_31_0(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \ + vpalignr $8, W_m16, W_m12, W; \ + vpsrldq $4, W_m04, tmp0; \ + vpxor W_m08, W, W; + +#define W_PRECALC_16_31_1(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \ + vpxor W_m16, tmp0, tmp0; \ + vpxor tmp0, W, W; \ + vpslld $1, W, tmp0; \ + vpslldq $12, W, tmp1; \ + vpsrld $31, W, W; + +#define W_PRECALC_16_31_2(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \ + vpor W, tmp0, tmp0; \ + vpsrld $30, tmp1, W; \ + vpslld $2, tmp1, tmp1; + +#define W_PRECALC_16_31_3(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \ + vpxor W, tmp0, tmp0; \ + vpxor tmp1, tmp0, W; \ + vpaddd (.LK_XMM + ((i)/20)*16) RIP, W, tmp0; \ + vmovdqa tmp0, WK((i)&~3); + +#define W_PRECALC_32_79_0(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ + vpxor W_m28, W, W; \ + vpalignr $8, W_m08, 
W_m04, tmp0; + +#define W_PRECALC_32_79_1(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ + vpxor W_m16, W, W; \ + vpxor tmp0, W, W; + +#define W_PRECALC_32_79_2(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ + vpsrld $30, W, tmp0; \ + vpslld $2, W, W; + +#define W_PRECALC_32_79_3(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ + vpor W, tmp0, W; \ + vpaddd (.LK_XMM + ((i)/20)*16) RIP, W, tmp0; \ + vmovdqa tmp0, WK((i)&~3); + + +/* + * Transform nblks*64 bytes (nblks*16 32-bit words) at DATA. + * + * unsigned int + * _gcry_sha1_transform_amd64_avx (void *ctx, const unsigned char *data, + * size_t nblks) + */ +.text +.globl _gcry_sha1_transform_amd64_avx +.type _gcry_sha1_transform_amd64_avx, at function +.align 16 +_gcry_sha1_transform_amd64_avx: + /* input: + * %rdi: ctx, CTX + * %rsi: data (64*nblks bytes) + * %rdx: nblks + */ + + xorl %eax, %eax; + cmpq $0, %rdx; + jz .Lret; + + vzeroupper; + + movq %rdx, RNBLKS; + movq %rdi, RSTATE; + movq %rsi, RDATA; + pushq %rbx; + pushq %rbp; + + movq %rsp, ROLDSTACK; + + subq $(16*4), %rsp; + andq $(~31), %rsp; + + /* Get the values of the chaining variables. */ + movl state_h0(RSTATE), a; + movl state_h1(RSTATE), b; + movl state_h2(RSTATE), c; + movl state_h3(RSTATE), d; + movl state_h4(RSTATE), e; + + movdqa .Lbswap_shufb_ctl RIP, BSWAP_REG; + + /* Precalc 0-15. 
*/ + W_PRECALC_00_15_0(0, W0, Wtmp0); + W_PRECALC_00_15_1(1, W0, Wtmp0); + W_PRECALC_00_15_2(2, W0, Wtmp0); + W_PRECALC_00_15_3(3, W0, Wtmp0); + W_PRECALC_00_15_0(4, W7, Wtmp0); + W_PRECALC_00_15_1(5, W7, Wtmp0); + W_PRECALC_00_15_2(6, W7, Wtmp0); + W_PRECALC_00_15_3(7, W7, Wtmp0); + W_PRECALC_00_15_0(8, W6, Wtmp0); + W_PRECALC_00_15_1(9, W6, Wtmp0); + W_PRECALC_00_15_2(10, W6, Wtmp0); + W_PRECALC_00_15_3(11, W6, Wtmp0); + W_PRECALC_00_15_0(12, W5, Wtmp0); + W_PRECALC_00_15_1(13, W5, Wtmp0); + W_PRECALC_00_15_2(14, W5, Wtmp0); + W_PRECALC_00_15_3(15, W5, Wtmp0); + +.align 8 +.Loop: + addq $64, RDATA; + + /* Transform 0-15 + Precalc 16-31. */ + R( a, b, c, d, e, F1, 0 ); W_PRECALC_16_31_0(16, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); + R( e, a, b, c, d, F1, 1 ); W_PRECALC_16_31_1(17, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); + R( d, e, a, b, c, F1, 2 ); W_PRECALC_16_31_2(18, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); + R( c, d, e, a, b, F1, 3 ); W_PRECALC_16_31_3(19, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); + R( b, c, d, e, a, F1, 4 ); W_PRECALC_16_31_0(20, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); + R( a, b, c, d, e, F1, 5 ); W_PRECALC_16_31_1(21, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); + R( e, a, b, c, d, F1, 6 ); W_PRECALC_16_31_2(22, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); + R( d, e, a, b, c, F1, 7 ); W_PRECALC_16_31_3(23, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); + R( c, d, e, a, b, F1, 8 ); W_PRECALC_16_31_0(24, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); + R( b, c, d, e, a, F1, 9 ); W_PRECALC_16_31_1(25, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); + R( a, b, c, d, e, F1, 10 ); W_PRECALC_16_31_2(26, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); + R( e, a, b, c, d, F1, 11 ); W_PRECALC_16_31_3(27, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); + R( d, e, a, b, c, F1, 12 ); W_PRECALC_16_31_0(28, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); + R( c, d, e, a, b, F1, 13 ); W_PRECALC_16_31_1(29, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); + R( b, c, d, e, a, F1, 14 ); W_PRECALC_16_31_2(30, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); + R( a, b, c, d, e, F1, 15 ); W_PRECALC_16_31_3(31, 
W1, W2, W3, W4, W5, Wtmp0, Wtmp1); + + /* Transform 16-63 + Precalc 32-79. */ + R( e, a, b, c, d, F1, 16 ); W_PRECALC_32_79_0(32, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); + R( d, e, a, b, c, F1, 17 ); W_PRECALC_32_79_1(33, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); + R( c, d, e, a, b, F1, 18 ); W_PRECALC_32_79_2(34, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); + R( b, c, d, e, a, F1, 19 ); W_PRECALC_32_79_3(35, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); + R( a, b, c, d, e, F2, 20 ); W_PRECALC_32_79_0(36, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); + R( e, a, b, c, d, F2, 21 ); W_PRECALC_32_79_1(37, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); + R( d, e, a, b, c, F2, 22 ); W_PRECALC_32_79_2(38, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); + R( c, d, e, a, b, F2, 23 ); W_PRECALC_32_79_3(39, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); + R( b, c, d, e, a, F2, 24 ); W_PRECALC_32_79_0(40, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); + R( a, b, c, d, e, F2, 25 ); W_PRECALC_32_79_1(41, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); + R( e, a, b, c, d, F2, 26 ); W_PRECALC_32_79_2(42, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); + R( d, e, a, b, c, F2, 27 ); W_PRECALC_32_79_3(43, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); + R( c, d, e, a, b, F2, 28 ); W_PRECALC_32_79_0(44, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); + R( b, c, d, e, a, F2, 29 ); W_PRECALC_32_79_1(45, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); + R( a, b, c, d, e, F2, 30 ); W_PRECALC_32_79_2(46, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); + R( e, a, b, c, d, F2, 31 ); W_PRECALC_32_79_3(47, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); + R( d, e, a, b, c, F2, 32 ); W_PRECALC_32_79_0(48, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); + R( c, d, e, a, b, F2, 33 ); W_PRECALC_32_79_1(49, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); + R( b, c, d, e, a, F2, 34 ); W_PRECALC_32_79_2(50, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); + R( a, b, c, d, e, F2, 35 ); W_PRECALC_32_79_3(51, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); + R( e, a, b, c, d, F2, 36 ); W_PRECALC_32_79_0(52, W3, W4, W5, W6, W7, W0, W1, 
W2, Wtmp0); + R( d, e, a, b, c, F2, 37 ); W_PRECALC_32_79_1(53, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0); + R( c, d, e, a, b, F2, 38 ); W_PRECALC_32_79_2(54, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0); + R( b, c, d, e, a, F2, 39 ); W_PRECALC_32_79_3(55, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0); + R( a, b, c, d, e, F3, 40 ); W_PRECALC_32_79_0(56, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); + R( e, a, b, c, d, F3, 41 ); W_PRECALC_32_79_1(57, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); + R( d, e, a, b, c, F3, 42 ); W_PRECALC_32_79_2(58, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); + R( c, d, e, a, b, F3, 43 ); W_PRECALC_32_79_3(59, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); + R( b, c, d, e, a, F3, 44 ); W_PRECALC_32_79_0(60, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0); + R( a, b, c, d, e, F3, 45 ); W_PRECALC_32_79_1(61, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0); + R( e, a, b, c, d, F3, 46 ); W_PRECALC_32_79_2(62, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0); + R( d, e, a, b, c, F3, 47 ); W_PRECALC_32_79_3(63, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0); + R( c, d, e, a, b, F3, 48 ); W_PRECALC_32_79_0(64, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); + R( b, c, d, e, a, F3, 49 ); W_PRECALC_32_79_1(65, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); + R( a, b, c, d, e, F3, 50 ); W_PRECALC_32_79_2(66, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); + R( e, a, b, c, d, F3, 51 ); W_PRECALC_32_79_3(67, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); + R( d, e, a, b, c, F3, 52 ); W_PRECALC_32_79_0(68, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); + R( c, d, e, a, b, F3, 53 ); W_PRECALC_32_79_1(69, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); + R( b, c, d, e, a, F3, 54 ); W_PRECALC_32_79_2(70, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); + R( a, b, c, d, e, F3, 55 ); W_PRECALC_32_79_3(71, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); + R( e, a, b, c, d, F3, 56 ); W_PRECALC_32_79_0(72, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); + R( d, e, a, b, c, F3, 57 ); W_PRECALC_32_79_1(73, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); + R( c, d, e, a, b, F3, 58 ); W_PRECALC_32_79_2(74, W6, 
W7, W0, W1, W2, W3, W4, W5, Wtmp0); + R( b, c, d, e, a, F3, 59 ); W_PRECALC_32_79_3(75, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); + R( a, b, c, d, e, F4, 60 ); W_PRECALC_32_79_0(76, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); + R( e, a, b, c, d, F4, 61 ); W_PRECALC_32_79_1(77, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); + R( d, e, a, b, c, F4, 62 ); W_PRECALC_32_79_2(78, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); + R( c, d, e, a, b, F4, 63 ); W_PRECALC_32_79_3(79, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); + + decq RNBLKS; + jz .Lend; + + /* Transform 64-79 + Precalc 0-15 of next block. */ + R( b, c, d, e, a, F4, 64 ); W_PRECALC_00_15_0(0, W0, Wtmp0); + R( a, b, c, d, e, F4, 65 ); W_PRECALC_00_15_1(1, W0, Wtmp0); + R( e, a, b, c, d, F4, 66 ); W_PRECALC_00_15_2(2, W0, Wtmp0); + R( d, e, a, b, c, F4, 67 ); W_PRECALC_00_15_3(3, W0, Wtmp0); + R( c, d, e, a, b, F4, 68 ); W_PRECALC_00_15_0(4, W7, Wtmp0); + R( b, c, d, e, a, F4, 69 ); W_PRECALC_00_15_1(5, W7, Wtmp0); + R( a, b, c, d, e, F4, 70 ); W_PRECALC_00_15_2(6, W7, Wtmp0); + R( e, a, b, c, d, F4, 71 ); W_PRECALC_00_15_3(7, W7, Wtmp0); + R( d, e, a, b, c, F4, 72 ); W_PRECALC_00_15_0(8, W6, Wtmp0); + R( c, d, e, a, b, F4, 73 ); W_PRECALC_00_15_1(9, W6, Wtmp0); + R( b, c, d, e, a, F4, 74 ); W_PRECALC_00_15_2(10, W6, Wtmp0); + R( a, b, c, d, e, F4, 75 ); W_PRECALC_00_15_3(11, W6, Wtmp0); + R( e, a, b, c, d, F4, 76 ); W_PRECALC_00_15_0(12, W5, Wtmp0); + R( d, e, a, b, c, F4, 77 ); W_PRECALC_00_15_1(13, W5, Wtmp0); + R( c, d, e, a, b, F4, 78 ); + addl state_h0(RSTATE), a; W_PRECALC_00_15_2(14, W5, Wtmp0); + R( b, c, d, e, a, F4, 79 ); W_PRECALC_00_15_3(15, W5, Wtmp0); + + /* Update the chaining variables. */ + addl state_h3(RSTATE), d; + addl state_h2(RSTATE), c; + addl state_h1(RSTATE), b; + addl state_h4(RSTATE), e; + + movl d, state_h3(RSTATE); + movl c, state_h2(RSTATE); + movl b, state_h1(RSTATE); + movl a, state_h0(RSTATE); + movl e, state_h4(RSTATE); + + jmp .Loop; + +.align 16 +.Lend: + vzeroall; + + /* Transform 64-79. 
*/ + R( b, c, d, e, a, F4, 64 ); + R( a, b, c, d, e, F4, 65 ); + R( e, a, b, c, d, F4, 66 ); + R( d, e, a, b, c, F4, 67 ); + R( c, d, e, a, b, F4, 68 ); + R( b, c, d, e, a, F4, 69 ); + R( a, b, c, d, e, F4, 70 ); + R( e, a, b, c, d, F4, 71 ); + R( d, e, a, b, c, F4, 72 ); + R( c, d, e, a, b, F4, 73 ); + R( b, c, d, e, a, F4, 74 ); + R( a, b, c, d, e, F4, 75 ); + R( e, a, b, c, d, F4, 76 ); + R( d, e, a, b, c, F4, 77 ); + R( c, d, e, a, b, F4, 78 ); + addl state_h0(RSTATE), a; + R( b, c, d, e, a, F4, 79 ); + + /* Update the chaining variables. */ + addl state_h3(RSTATE), d; + addl state_h2(RSTATE), c; + addl state_h1(RSTATE), b; + addl state_h4(RSTATE), e; + + movl d, state_h3(RSTATE); + movl c, state_h2(RSTATE); + movl b, state_h1(RSTATE); + movl a, state_h0(RSTATE); + movl e, state_h4(RSTATE); + + movq ROLDSTACK, %rsp; + + popq %rbp; + popq %rbx; + + /* burn_stack */ + movl $(16*4 + 2*8 + 31), %eax; + +.Lret: + ret; + +#endif +#endif diff --git a/cipher/sha1-avx-bmi2-amd64.S b/cipher/sha1-avx-bmi2-amd64.S new file mode 100644 index 0000000..a9075ff --- /dev/null +++ b/cipher/sha1-avx-bmi2-amd64.S @@ -0,0 +1,417 @@ +/* sha1-avx-bmi2-amd64.S - Intel AVX/BMI2 accelerated SHA-1 transform function + * Copyright © 2013 Jussi Kivilinna + * + * Based on sha1.c: + * Copyright (C) 1998, 2001, 2002, 2003, 2008 Free Software Foundation, Inc. + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +/* + * Intel SSSE3 accelerated SHA-1 implementation based on white paper: + * "Improving the Performance of the Secure Hash Algorithm (SHA-1)" + * http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1 + */ + +#ifdef __x86_64__ +#include <config.h> + +#if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_BMI2) && \ + defined(HAVE_GCC_INLINE_ASM_AVX) && defined(USE_SHA1) + +#ifdef __PIC__ +# define RIP (%rip) +#else +# define RIP +#endif + + +/* Context structure */ + +#define state_h0 0 +#define state_h1 4 +#define state_h2 8 +#define state_h3 12 +#define state_h4 16 + + +/* Constants */ + +.data +#define K1 0x5A827999 +#define K2 0x6ED9EBA1 +#define K3 0x8F1BBCDC +#define K4 0xCA62C1D6 +.align 16 +.LK_XMM: +.LK1: .long K1, K1, K1, K1 +.LK2: .long K2, K2, K2, K2 +.LK3: .long K3, K3, K3, K3 +.LK4: .long K4, K4, K4, K4 + +.Lbswap_shufb_ctl: + .long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f + + +/* Register macros */ + +#define RSTATE %r8 +#define RDATA %r9 +#define ROLDSTACK %r10 +#define RNBLKS %r11 + +#define a %eax +#define b %ebx +#define c %ecx +#define d %edx +#define e %edi + +#define RT0 %esi +#define RT1 %ebp + +#define Wtmp0 %xmm0 +#define Wtmp1 %xmm1 + +#define W0 %xmm2 +#define W1 %xmm3 +#define W2 %xmm4 +#define W3 %xmm5 +#define W4 %xmm6 +#define W5 %xmm7 +#define W6 %xmm8 +#define W7 %xmm9 + +#define BSWAP_REG %xmm10 + + +/* Round function macros.
*/ + +#define WK(i) (((i) & 15) * 4)(%rsp) + +#define R_F1(a,b,c,d,e,i) \ + movl c, RT0; \ + andn d, b, RT1; \ + addl WK(i), e; \ + andl b, RT0; \ + rorxl $2, b, b; \ + addl RT1, e; \ + leal (RT0,e), e; \ + rorxl $27, a, RT1; \ + addl RT1, e; + +#define R_F2(a,b,c,d,e,i) \ + movl c, RT0; \ + addl WK(i), e; \ + xorl b, RT0; \ + rorxl $2, b, b; \ + xorl d, RT0; \ + leal (RT0,e), e; \ + rorxl $27, a, RT1; \ + addl RT1, e; + +#define R_F3(a,b,c,d,e,i) \ + movl c, RT0; \ + movl b, RT1; \ + xorl b, RT0; \ + andl c, RT1; \ + andl d, RT0; \ + addl RT1, e; \ + addl WK(i), e; \ + rorxl $2, b, b; \ + leal (RT0,e), e; \ + rorxl $27, a, RT1; \ + addl RT1, e; + +#define R_F4(a,b,c,d,e,i) R_F2(a,b,c,d,e,i) + +#define R(a,b,c,d,e,f,i) \ + R_##f(a,b,c,d,e,i) + + +/* Input expansion macros. */ + +#define W_PRECALC_00_15_0(i, W, tmp0) \ + vmovdqu (4*(i))(RDATA), tmp0; + +#define W_PRECALC_00_15_1(i, W, tmp0) \ + vpshufb BSWAP_REG, tmp0, W; + +#define W_PRECALC_00_15_2(i, W, tmp0) \ + vpaddd (.LK_XMM + ((i)/20)*16) RIP, W, tmp0; + +#define W_PRECALC_00_15_3(i, W, tmp0) \ + vmovdqa tmp0, WK(i&~3); + +#define W_PRECALC_16_31_0(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \ + vpalignr $8, W_m16, W_m12, W; \ + vpsrldq $4, W_m04, tmp0; \ + vpxor W_m08, W, W; + +#define W_PRECALC_16_31_1(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \ + vpxor W_m16, tmp0, tmp0; \ + vpxor tmp0, W, W; \ + vpslld $1, W, tmp0; \ + vpslldq $12, W, tmp1; \ + vpsrld $31, W, W; + +#define W_PRECALC_16_31_2(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \ + vpor W, tmp0, tmp0; \ + vpsrld $30, tmp1, W; \ + vpslld $2, tmp1, tmp1; + +#define W_PRECALC_16_31_3(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \ + vpxor W, tmp0, tmp0; \ + vpxor tmp1, tmp0, W; \ + vpaddd (.LK_XMM + ((i)/20)*16) RIP, W, tmp0; \ + vmovdqa tmp0, WK((i)&~3); + +#define W_PRECALC_32_79_0(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ + vpxor W_m28, W, W; \ + vpalignr $8, W_m08, W_m04, tmp0; + +#define W_PRECALC_32_79_1(i, W, W_m04, 
W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ + vpxor W_m16, W, W; \ + vpxor tmp0, W, W; + +#define W_PRECALC_32_79_2(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ + vpsrld $30, W, tmp0; \ + vpslld $2, W, W; + +#define W_PRECALC_32_79_3(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ + vpor W, tmp0, W; \ + vpaddd (.LK_XMM + ((i)/20)*16) RIP, W, tmp0; \ + vmovdqa tmp0, WK((i)&~3); + + +/* + * Transform nblks*64 bytes (nblks*16 32-bit words) at DATA. + * + * unsigned int + * _gcry_sha1_transform_amd64_avx_bmi2 (void *ctx, const unsigned char *data, + * size_t nblks) + */ +.text +.globl _gcry_sha1_transform_amd64_avx_bmi2 +.type _gcry_sha1_transform_amd64_avx_bmi2,@function +.align 16 +_gcry_sha1_transform_amd64_avx_bmi2: + /* input: + * %rdi: ctx, CTX + * %rsi: data (64*nblks bytes) + * %rdx: nblks + */ + + xorl %eax, %eax; + cmpq $0, %rdx; + jz .Lret; + + vzeroupper; + + movq %rdx, RNBLKS; + movq %rdi, RSTATE; + movq %rsi, RDATA; + pushq %rbx; + pushq %rbp; + + movq %rsp, ROLDSTACK; + + subq $(16*4), %rsp; + andq $(~31), %rsp; + + /* Get the values of the chaining variables. */ + movl state_h0(RSTATE), a; + movl state_h1(RSTATE), b; + movl state_h2(RSTATE), c; + movl state_h3(RSTATE), d; + movl state_h4(RSTATE), e; + + movdqa .Lbswap_shufb_ctl RIP, BSWAP_REG; + + /* Precalc 0-15. */ + W_PRECALC_00_15_0(0, W0, Wtmp0); + W_PRECALC_00_15_1(1, W0, Wtmp0); + W_PRECALC_00_15_2(2, W0, Wtmp0); + W_PRECALC_00_15_3(3, W0, Wtmp0); + W_PRECALC_00_15_0(4, W7, Wtmp0); + W_PRECALC_00_15_1(5, W7, Wtmp0); + W_PRECALC_00_15_2(6, W7, Wtmp0); + W_PRECALC_00_15_3(7, W7, Wtmp0); + W_PRECALC_00_15_0(8, W6, Wtmp0); + W_PRECALC_00_15_1(9, W6, Wtmp0); + W_PRECALC_00_15_2(10, W6, Wtmp0); + W_PRECALC_00_15_3(11, W6, Wtmp0); + W_PRECALC_00_15_0(12, W5, Wtmp0); + W_PRECALC_00_15_1(13, W5, Wtmp0); + W_PRECALC_00_15_2(14, W5, Wtmp0); + W_PRECALC_00_15_3(15, W5, Wtmp0); + +.align 8 +.Loop: + addq $64, RDATA; + + /* Transform 0-15 + Precalc 16-31. 
*/ + R( a, b, c, d, e, F1, 0 ); W_PRECALC_16_31_0(16, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); + R( e, a, b, c, d, F1, 1 ); W_PRECALC_16_31_1(17, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); + R( d, e, a, b, c, F1, 2 ); W_PRECALC_16_31_2(18, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); + R( c, d, e, a, b, F1, 3 ); W_PRECALC_16_31_3(19, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); + R( b, c, d, e, a, F1, 4 ); W_PRECALC_16_31_0(20, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); + R( a, b, c, d, e, F1, 5 ); W_PRECALC_16_31_1(21, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); + R( e, a, b, c, d, F1, 6 ); W_PRECALC_16_31_2(22, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); + R( d, e, a, b, c, F1, 7 ); W_PRECALC_16_31_3(23, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); + R( c, d, e, a, b, F1, 8 ); W_PRECALC_16_31_0(24, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); + R( b, c, d, e, a, F1, 9 ); W_PRECALC_16_31_1(25, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); + R( a, b, c, d, e, F1, 10 ); W_PRECALC_16_31_2(26, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); + R( e, a, b, c, d, F1, 11 ); W_PRECALC_16_31_3(27, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); + R( d, e, a, b, c, F1, 12 ); W_PRECALC_16_31_0(28, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); + R( c, d, e, a, b, F1, 13 ); W_PRECALC_16_31_1(29, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); + R( b, c, d, e, a, F1, 14 ); W_PRECALC_16_31_2(30, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); + R( a, b, c, d, e, F1, 15 ); W_PRECALC_16_31_3(31, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); + + /* Transform 16-63 + Precalc 32-79. 
*/ + R( e, a, b, c, d, F1, 16 ); W_PRECALC_32_79_0(32, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); + R( d, e, a, b, c, F1, 17 ); W_PRECALC_32_79_1(33, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); + R( c, d, e, a, b, F1, 18 ); W_PRECALC_32_79_2(34, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); + R( b, c, d, e, a, F1, 19 ); W_PRECALC_32_79_3(35, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); + R( a, b, c, d, e, F2, 20 ); W_PRECALC_32_79_0(36, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); + R( e, a, b, c, d, F2, 21 ); W_PRECALC_32_79_1(37, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); + R( d, e, a, b, c, F2, 22 ); W_PRECALC_32_79_2(38, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); + R( c, d, e, a, b, F2, 23 ); W_PRECALC_32_79_3(39, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); + R( b, c, d, e, a, F2, 24 ); W_PRECALC_32_79_0(40, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); + R( a, b, c, d, e, F2, 25 ); W_PRECALC_32_79_1(41, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); + R( e, a, b, c, d, F2, 26 ); W_PRECALC_32_79_2(42, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); + R( d, e, a, b, c, F2, 27 ); W_PRECALC_32_79_3(43, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); + R( c, d, e, a, b, F2, 28 ); W_PRECALC_32_79_0(44, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); + R( b, c, d, e, a, F2, 29 ); W_PRECALC_32_79_1(45, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); + R( a, b, c, d, e, F2, 30 ); W_PRECALC_32_79_2(46, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); + R( e, a, b, c, d, F2, 31 ); W_PRECALC_32_79_3(47, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); + R( d, e, a, b, c, F2, 32 ); W_PRECALC_32_79_0(48, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); + R( c, d, e, a, b, F2, 33 ); W_PRECALC_32_79_1(49, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); + R( b, c, d, e, a, F2, 34 ); W_PRECALC_32_79_2(50, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); + R( a, b, c, d, e, F2, 35 ); W_PRECALC_32_79_3(51, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); + R( e, a, b, c, d, F2, 36 ); W_PRECALC_32_79_0(52, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0); + R( d, e, a, b, c, F2, 37 ); W_PRECALC_32_79_1(53, W3, W4, W5, 
W6, W7, W0, W1, W2, Wtmp0); + R( c, d, e, a, b, F2, 38 ); W_PRECALC_32_79_2(54, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0); + R( b, c, d, e, a, F2, 39 ); W_PRECALC_32_79_3(55, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0); + R( a, b, c, d, e, F3, 40 ); W_PRECALC_32_79_0(56, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); + R( e, a, b, c, d, F3, 41 ); W_PRECALC_32_79_1(57, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); + R( d, e, a, b, c, F3, 42 ); W_PRECALC_32_79_2(58, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); + R( c, d, e, a, b, F3, 43 ); W_PRECALC_32_79_3(59, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); + R( b, c, d, e, a, F3, 44 ); W_PRECALC_32_79_0(60, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0); + R( a, b, c, d, e, F3, 45 ); W_PRECALC_32_79_1(61, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0); + R( e, a, b, c, d, F3, 46 ); W_PRECALC_32_79_2(62, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0); + R( d, e, a, b, c, F3, 47 ); W_PRECALC_32_79_3(63, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0); + R( c, d, e, a, b, F3, 48 ); W_PRECALC_32_79_0(64, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); + R( b, c, d, e, a, F3, 49 ); W_PRECALC_32_79_1(65, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); + R( a, b, c, d, e, F3, 50 ); W_PRECALC_32_79_2(66, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); + R( e, a, b, c, d, F3, 51 ); W_PRECALC_32_79_3(67, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); + R( d, e, a, b, c, F3, 52 ); W_PRECALC_32_79_0(68, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); + R( c, d, e, a, b, F3, 53 ); W_PRECALC_32_79_1(69, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); + R( b, c, d, e, a, F3, 54 ); W_PRECALC_32_79_2(70, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); + R( a, b, c, d, e, F3, 55 ); W_PRECALC_32_79_3(71, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); + R( e, a, b, c, d, F3, 56 ); W_PRECALC_32_79_0(72, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); + R( d, e, a, b, c, F3, 57 ); W_PRECALC_32_79_1(73, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); + R( c, d, e, a, b, F3, 58 ); W_PRECALC_32_79_2(74, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); + R( b, c, d, e, a, F3, 59 ); 
W_PRECALC_32_79_3(75, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); + R( a, b, c, d, e, F4, 60 ); W_PRECALC_32_79_0(76, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); + R( e, a, b, c, d, F4, 61 ); W_PRECALC_32_79_1(77, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); + R( d, e, a, b, c, F4, 62 ); W_PRECALC_32_79_2(78, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); + R( c, d, e, a, b, F4, 63 ); W_PRECALC_32_79_3(79, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); + + decq RNBLKS; + jz .Lend; + + /* Transform 64-79 + Precalc 0-15 of next block. */ + R( b, c, d, e, a, F4, 64 ); W_PRECALC_00_15_0(0, W0, Wtmp0); + R( a, b, c, d, e, F4, 65 ); W_PRECALC_00_15_1(1, W0, Wtmp0); + R( e, a, b, c, d, F4, 66 ); W_PRECALC_00_15_2(2, W0, Wtmp0); + R( d, e, a, b, c, F4, 67 ); W_PRECALC_00_15_3(3, W0, Wtmp0); + R( c, d, e, a, b, F4, 68 ); W_PRECALC_00_15_0(4, W7, Wtmp0); + R( b, c, d, e, a, F4, 69 ); W_PRECALC_00_15_1(5, W7, Wtmp0); + R( a, b, c, d, e, F4, 70 ); W_PRECALC_00_15_2(6, W7, Wtmp0); + R( e, a, b, c, d, F4, 71 ); W_PRECALC_00_15_3(7, W7, Wtmp0); + R( d, e, a, b, c, F4, 72 ); W_PRECALC_00_15_0(8, W6, Wtmp0); + R( c, d, e, a, b, F4, 73 ); W_PRECALC_00_15_1(9, W6, Wtmp0); + R( b, c, d, e, a, F4, 74 ); W_PRECALC_00_15_2(10, W6, Wtmp0); + R( a, b, c, d, e, F4, 75 ); W_PRECALC_00_15_3(11, W6, Wtmp0); + R( e, a, b, c, d, F4, 76 ); W_PRECALC_00_15_0(12, W5, Wtmp0); + R( d, e, a, b, c, F4, 77 ); W_PRECALC_00_15_1(13, W5, Wtmp0); + R( c, d, e, a, b, F4, 78 ); + addl state_h0(RSTATE), a; W_PRECALC_00_15_2(14, W5, Wtmp0); + R( b, c, d, e, a, F4, 79 ); W_PRECALC_00_15_3(15, W5, Wtmp0); + + /* Update the chaining variables. */ + addl state_h3(RSTATE), d; + addl state_h2(RSTATE), c; + addl state_h1(RSTATE), b; + addl state_h4(RSTATE), e; + + movl d, state_h3(RSTATE); + movl c, state_h2(RSTATE); + movl b, state_h1(RSTATE); + movl a, state_h0(RSTATE); + movl e, state_h4(RSTATE); + + jmp .Loop; + +.align 16 +.Lend: + vzeroall; + + /* Transform 64-79. 
*/ + R( b, c, d, e, a, F4, 64 ); + R( a, b, c, d, e, F4, 65 ); + R( e, a, b, c, d, F4, 66 ); + R( d, e, a, b, c, F4, 67 ); + R( c, d, e, a, b, F4, 68 ); + R( b, c, d, e, a, F4, 69 ); + R( a, b, c, d, e, F4, 70 ); + R( e, a, b, c, d, F4, 71 ); + R( d, e, a, b, c, F4, 72 ); + R( c, d, e, a, b, F4, 73 ); + R( b, c, d, e, a, F4, 74 ); + R( a, b, c, d, e, F4, 75 ); + R( e, a, b, c, d, F4, 76 ); + R( d, e, a, b, c, F4, 77 ); + R( c, d, e, a, b, F4, 78 ); + addl state_h0(RSTATE), a; + R( b, c, d, e, a, F4, 79 ); + + /* Update the chaining variables. */ + addl state_h3(RSTATE), d; + addl state_h2(RSTATE), c; + addl state_h1(RSTATE), b; + addl state_h4(RSTATE), e; + + movl d, state_h3(RSTATE); + movl c, state_h2(RSTATE); + movl b, state_h1(RSTATE); + movl a, state_h0(RSTATE); + movl e, state_h4(RSTATE); + + movq ROLDSTACK, %rsp; + + popq %rbp; + popq %rbx; + + /* burn_stack */ + movl $(16*4 + 2*8 + 31), %eax; + +.Lret: + ret; + +#endif +#endif diff --git a/cipher/sha1.c b/cipher/sha1.c index 8040e76..a55ff93 100644 --- a/cipher/sha1.c +++ b/cipher/sha1.c @@ -50,6 +50,20 @@ # define USE_SSSE3 1 #endif +/* USE_AVX indicates whether to compile with Intel AVX code. */ +#undef USE_AVX +#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_AVX) +# define USE_AVX 1 +#endif + +/* USE_BMI2 indicates whether to compile with Intel AVX/BMI2 code. */ +#undef USE_BMI2 +#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_AVX) && defined(HAVE_GCC_INLINE_ASM_BMI2) +# define USE_BMI2 1 +#endif + /* A macro to test whether P is properly aligned for an u32 type. 
Note that config.h provides a suitable replacement for uintptr_t if @@ -67,6 +81,12 @@ typedef struct #ifdef USE_SSSE3 unsigned int use_ssse3:1; #endif +#ifdef USE_AVX + unsigned int use_avx:1; +#endif +#ifdef USE_BMI2 + unsigned int use_bmi2:1; +#endif } SHA1_CONTEXT; static unsigned int @@ -77,6 +97,7 @@ static void sha1_init (void *context) { SHA1_CONTEXT *hd = context; + unsigned int features = _gcry_get_hw_features (); hd->h0 = 0x67452301; hd->h1 = 0xefcdab89; @@ -91,8 +112,17 @@ sha1_init (void *context) hd->bctx.bwrite = transform; #ifdef USE_SSSE3 - hd->use_ssse3 = (_gcry_get_hw_features () & HWF_INTEL_SSSE3) != 0; + hd->use_ssse3 = (features & HWF_INTEL_SSSE3) != 0; +#endif +#ifdef USE_AVX + /* AVX implementation uses SHLD which is known to be slow on non-Intel CPUs. + * Therefore use this implementation on Intel CPUs only. */ + hd->use_avx = (features & HWF_INTEL_AVX) && (features & HWF_INTEL_CPU); +#endif +#ifdef USE_BMI2 + hd->use_bmi2 = (features & HWF_INTEL_AVX) && (features & HWF_INTEL_BMI2); #endif + (void)features; } @@ -238,6 +268,18 @@ _gcry_sha1_transform_amd64_ssse3 (void *state, const unsigned char *data, size_t nblks); #endif +#ifdef USE_AVX +unsigned int +_gcry_sha1_transform_amd64_avx (void *state, const unsigned char *data, + size_t nblks); +#endif + +#ifdef USE_BMI2 +unsigned int +_gcry_sha1_transform_amd64_avx_bmi2 (void *state, const unsigned char *data, + size_t nblks); +#endif + static unsigned int transform (void *ctx, const unsigned char *data, size_t nblks) @@ -245,6 +287,16 @@ transform (void *ctx, const unsigned char *data, size_t nblks) SHA1_CONTEXT *hd = ctx; unsigned int burn; +#ifdef USE_BMI2 + if (hd->use_bmi2) + return _gcry_sha1_transform_amd64_avx_bmi2 (&hd->h0, data, nblks) + + 4 * sizeof(void*); +#endif +#ifdef USE_AVX + if (hd->use_avx) + return _gcry_sha1_transform_amd64_avx (&hd->h0, data, nblks) + + 4 * sizeof(void*); +#endif #ifdef USE_SSSE3 if (hd->use_ssse3) return _gcry_sha1_transform_amd64_ssse3 (&hd->h0, data, 
nblks) diff --git a/configure.ac b/configure.ac index ed14d89..231e3d3 100644 --- a/configure.ac +++ b/configure.ac @@ -1787,6 +1787,8 @@ case "${host}" in x86_64-*-*) # Build with the assembly implementation GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1-ssse3-amd64.lo" + GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1-avx-amd64.lo" + GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1-avx-bmi2-amd64.lo" ;; esac From jussi.kivilinna at iki.fi Tue Dec 17 15:37:16 2013 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Tue, 17 Dec 2013 16:37:16 +0200 Subject: [PATCH 2/6] SHA-1/SSSE3: Improve performance on large buffers In-Reply-To: <20131217143711.31473.21328.stgit@localhost6.localdomain6> References: <20131217143711.31473.21328.stgit@localhost6.localdomain6> Message-ID: <20131217143716.31473.35849.stgit@localhost6.localdomain6> * cipher/sha1-ssse3-amd64.S (RNBLKS): New. (_gcry_sha1_transform_amd64_ssse3): Handle multiple input blocks, with software pipelining of next data block processing. * cipher/sha1.c [USE_SSSE3] (_gcry_sha1_transform_amd64_ssse3): Add 'nblks'. (transform) [USE_SSSE3]: Pass nblks to assembly function. -- Patch gives small improvement for large buffer processing, on Intel i5-4570 speed goes from 4.80 c/B to 4.61 c/B. Signed-off-by: Jussi Kivilinna --- cipher/sha1-ssse3-amd64.S | 70 +++++++++++++++++++++++++++++++++++++++------ cipher/sha1.c | 15 +++------- 2 files changed, 64 insertions(+), 21 deletions(-) diff --git a/cipher/sha1-ssse3-amd64.S b/cipher/sha1-ssse3-amd64.S index 5e5716b..d80631d 100644 --- a/cipher/sha1-ssse3-amd64.S +++ b/cipher/sha1-ssse3-amd64.S @@ -71,6 +71,7 @@ #define RSTATE %r8 #define RDATA %r9 #define ROLDSTACK %r10 +#define RNBLKS %r11 #define a %eax #define b %ebx @@ -211,10 +212,11 @@ /* - * Transform 64 bytes (16 32-bit words) at DATA. + * Transform nblks*64 bytes (nblks*16 32-bit words) at DATA. 
* * unsigned int - * _gcry_sha1_transform_amd64_ssse3 (void *ctx, const unsigned char *data) + * _gcry_sha1_transform_amd64_ssse3 (void *ctx, const unsigned char *data, + * size_t nblks) */ .text .globl _gcry_sha1_transform_amd64_ssse3 @@ -223,10 +225,15 @@ _gcry_sha1_transform_amd64_ssse3: /* input: * %rdi: ctx, CTX - * %rsi: data (64 bytes) - * %rdx: ... + * %rsi: data (64*nblks bytes) + * %rdx: nblks */ + xorl %eax, %eax; + cmpq $0, %rdx; + jz .Lret; + + movq %rdx, RNBLKS; movq %rdi, RSTATE; movq %rsi, RDATA; pushq %rbx; @@ -264,6 +271,10 @@ _gcry_sha1_transform_amd64_ssse3: W_PRECALC_00_15_2(14, W5, Wtmp0); W_PRECALC_00_15_3(15, W5, Wtmp0); +.align 8 +.Loop: + addq $64, RDATA; + /* Transform 0-15 + Precalc 16-31. */ R( a, b, c, d, e, F1, 0 ); W_PRECALC_16_31_0(16, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); R( e, a, b, c, d, F1, 1 ); W_PRECALC_16_31_1(17, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); @@ -332,6 +343,44 @@ _gcry_sha1_transform_amd64_ssse3: R( d, e, a, b, c, F4, 62 ); W_PRECALC_32_79_2(78, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); R( c, d, e, a, b, F4, 63 ); W_PRECALC_32_79_3(79, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); + decq RNBLKS; + jz .Lend; + + /* Transform 64-79 + Precalc 0-15 of next block. 
*/ + R( b, c, d, e, a, F4, 64 ); W_PRECALC_00_15_0(0, W0, Wtmp0); + R( a, b, c, d, e, F4, 65 ); W_PRECALC_00_15_1(1, W0, Wtmp0); + R( e, a, b, c, d, F4, 66 ); W_PRECALC_00_15_2(2, W0, Wtmp0); + R( d, e, a, b, c, F4, 67 ); W_PRECALC_00_15_3(3, W0, Wtmp0); + R( c, d, e, a, b, F4, 68 ); W_PRECALC_00_15_0(4, W7, Wtmp0); + R( b, c, d, e, a, F4, 69 ); W_PRECALC_00_15_1(5, W7, Wtmp0); + R( a, b, c, d, e, F4, 70 ); W_PRECALC_00_15_2(6, W7, Wtmp0); + R( e, a, b, c, d, F4, 71 ); W_PRECALC_00_15_3(7, W7, Wtmp0); + R( d, e, a, b, c, F4, 72 ); W_PRECALC_00_15_0(8, W6, Wtmp0); + R( c, d, e, a, b, F4, 73 ); W_PRECALC_00_15_1(9, W6, Wtmp0); + R( b, c, d, e, a, F4, 74 ); W_PRECALC_00_15_2(10, W6, Wtmp0); + R( a, b, c, d, e, F4, 75 ); W_PRECALC_00_15_3(11, W6, Wtmp0); + R( e, a, b, c, d, F4, 76 ); W_PRECALC_00_15_0(12, W5, Wtmp0); + R( d, e, a, b, c, F4, 77 ); W_PRECALC_00_15_1(13, W5, Wtmp0); + R( c, d, e, a, b, F4, 78 ); + addl state_h0(RSTATE), a; W_PRECALC_00_15_2(14, W5, Wtmp0); + R( b, c, d, e, a, F4, 79 ); W_PRECALC_00_15_3(15, W5, Wtmp0); + + /* Update the chaining variables. */ + addl state_h3(RSTATE), d; + addl state_h2(RSTATE), c; + addl state_h1(RSTATE), b; + addl state_h4(RSTATE), e; + + movl d, state_h3(RSTATE); + movl c, state_h2(RSTATE); + movl b, state_h1(RSTATE); + movl a, state_h0(RSTATE); + movl e, state_h4(RSTATE); + + jmp .Loop; + +.align 16 +.Lend: /* Transform 64-79 + Clear XMM registers. */ R( b, c, d, e, a, F4, 64 ); CLEAR_REG(BSWAP_REG); R( a, b, c, d, e, F4, 65 ); CLEAR_REG(Wtmp0); @@ -348,19 +397,19 @@ _gcry_sha1_transform_amd64_ssse3: R( e, a, b, c, d, F4, 76 ); R( d, e, a, b, c, F4, 77 ); R( c, d, e, a, b, F4, 78 ); + addl state_h0(RSTATE), a; R( b, c, d, e, a, F4, 79 ); /* Update the chaining variables. 
*/ - addl state_h0(RSTATE), a; - addl state_h1(RSTATE), b; - addl state_h2(RSTATE), c; addl state_h3(RSTATE), d; + addl state_h2(RSTATE), c; + addl state_h1(RSTATE), b; addl state_h4(RSTATE), e; - movl a, state_h0(RSTATE); - movl b, state_h1(RSTATE); - movl c, state_h2(RSTATE); movl d, state_h3(RSTATE); + movl c, state_h2(RSTATE); + movl b, state_h1(RSTATE); + movl a, state_h0(RSTATE); movl e, state_h4(RSTATE); movq ROLDSTACK, %rsp; @@ -371,6 +420,7 @@ _gcry_sha1_transform_amd64_ssse3: /* burn_stack */ movl $(16*4 + 2*8 + 31), %eax; +.Lret: ret; #endif diff --git a/cipher/sha1.c b/cipher/sha1.c index 53f7538..8040e76 100644 --- a/cipher/sha1.c +++ b/cipher/sha1.c @@ -234,7 +234,8 @@ transform_blk (void *ctx, const unsigned char *data) #ifdef USE_SSSE3 unsigned int -_gcry_sha1_transform_amd64_ssse3 (void *state, const unsigned char *data); +_gcry_sha1_transform_amd64_ssse3 (void *state, const unsigned char *data, + size_t nblks); #endif @@ -246,16 +247,8 @@ transform (void *ctx, const unsigned char *data, size_t nblks) #ifdef USE_SSSE3 if (hd->use_ssse3) - { - do - { - burn = _gcry_sha1_transform_amd64_ssse3 (&hd->h0, data); - data += 64; - } - while (--nblks); - - return burn + 4 * sizeof(void*); - } + return _gcry_sha1_transform_amd64_ssse3 (&hd->h0, data, nblks) + + 4 * sizeof(void*); #endif do From jussi.kivilinna at iki.fi Tue Dec 17 15:37:31 2013 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Tue, 17 Dec 2013 16:37:31 +0200 Subject: [PATCH 5/6] Improve performance of SHA-512/ARM/NEON implementation In-Reply-To: <20131217143711.31473.21328.stgit@localhost6.localdomain6> References: <20131217143711.31473.21328.stgit@localhost6.localdomain6> Message-ID: <20131217143731.31473.41503.stgit@localhost6.localdomain6> * cipher/sha512-armv7-neon.S (RT01q, RT23q, RT45q, RT67q): New. (round_0_63, round_64_79): Remove. (rounds2_0_63, rounds2_64_79): New. (_gcry_sha512_transform_armv7_neon): Add 'nblks' input; Handle multiple input blocks; Use new round macros. 
* cipher/sha512.c [USE_ARM_NEON_ASM] (_gcry_sha512_transform_armv7_neon): Add 'num_blks'. (transform) [USE_ARM_NEON_ASM]: Pass nblks to assembly. -- Benchmarks on ARM Cortex-A8: C-language: 139.1 c/B Old ARM/NEON: 34.30 c/B New ARM/NEON: 24.46 c/B New vs C: 5.68x New vs Old: 1.40x Signed-off-by: Jussi Kivilinna --- cipher/sha512-armv7-neon.S | 367 ++++++++++++++++++++++++++++++-------------- cipher/sha512.c | 9 - 2 files changed, 252 insertions(+), 124 deletions(-) diff --git a/cipher/sha512-armv7-neon.S b/cipher/sha512-armv7-neon.S index 042b15a..0a6e86b 100644 --- a/cipher/sha512-armv7-neon.S +++ b/cipher/sha512-armv7-neon.S @@ -60,6 +60,11 @@ #define RT6 d14 #define RT7 d15 +#define RT01q q4 +#define RT23q q5 +#define RT45q q6 +#define RT67q q7 + #define RW0 d16 #define RW1 d17 #define RW2 d18 @@ -89,114 +94,190 @@ /*********************************************************************** * ARM assembly implementation of sha512 transform ***********************************************************************/ -#define round_0_63(ra, rb, rc, rd, re, rf, rg, rh, rw0, rw14, rw9, rw1) \ +#define rounds2_0_63(ra, rb, rc, rd, re, rf, rg, rh, rw0, rw1, rw01q, rw2, rw23q, rw1415q, rw9, rw10, interleave_op, arg1) \ /* t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t]; */ \ - vshr.u64 RT1, re, #14; \ + vshr.u64 RT2, re, #14; \ vshl.u64 RT3, re, #64 - 14; \ + interleave_op(arg1); \ vshr.u64 RT4, re, #18; \ vshl.u64 RT5, re, #64 - 18; \ - veor.64 RT1, RT1, RT3; \ vld1.64 {RT0}, [RK]!; \ - veor.64 RT1, RT1, RT4; \ - vshr.u64 RT3, re, #41; \ - vshl.u64 RT4, re, #64 - 41; \ - veor.64 RT1, RT1, RT5; \ + veor.64 RT23q, RT23q, RT45q; \ + vshr.u64 RT4, re, #41; \ + vshl.u64 RT5, re, #64 - 41; \ vadd.u64 RT0, RT0, rw0; \ - veor.64 RT1, RT1, RT3; \ - vand.64 RT2, re, rf; \ - veor.64 RT1, RT1, RT4; \ - vbic.64 RT6, rg, re; \ + veor.64 RT23q, RT23q, RT45q; \ + vmov.64 RT7, re; \ + veor.64 RT1, RT2, RT3; \ + vbsl.64 RT7, rf, rg; \ \ vadd.u64 RT1, RT1, rh; \ - veor.64 RT2, RT2, RT6; \ - 
vshr.u64 rh, ra, #28; \ + vshr.u64 RT2, ra, #28; \ vshl.u64 RT3, ra, #64 - 28; \ vadd.u64 RT1, RT1, RT0; \ vshr.u64 RT4, ra, #34; \ - veor.64 rh, rh, RT3; \ vshl.u64 RT5, ra, #64 - 34; \ - vadd.u64 RT1, RT1, RT2; \ + vadd.u64 RT1, RT1, RT7; \ \ /* h = Sum0 (a) + Maj (a, b, c); */ \ - veor.64 rh, rh, RT4; \ - vshr.u64 RT3, ra, #39; \ - vshl.u64 RT4, ra, #64 - 39; \ - vorr.64 RT6, ra, rb; \ - vand.64 RT0, ra, rb; \ - veor.64 rh, rh, RT5; \ - vand.64 RT6, RT6, rc; \ - veor.64 rh, rh, RT3; \ - vorr.64 RT0, RT0, RT6; \ - veor.64 rh, rh, RT4; \ - vshr.u64 RT4, rw14, #19; \ + veor.64 RT23q, RT23q, RT45q; \ + vshr.u64 RT4, ra, #39; \ + vshl.u64 RT5, ra, #64 - 39; \ + veor.64 RT0, ra, rb; \ + veor.64 RT23q, RT23q, RT45q; \ + vbsl.64 RT0, rc, rb; \ + vadd.u64 rd, rd, RT1; /* d+=t1; */ \ + veor.64 rh, RT2, RT3; \ + \ + /* t1 = g + Sum1 (d) + Ch (d, e, f) + k[t] + w[t]; */ \ + vshr.u64 RT2, rd, #14; \ + vshl.u64 RT3, rd, #64 - 14; \ vadd.u64 rh, rh, RT0; \ - vshl.u64 RT2, rw14, #64 - 19; \ + vshr.u64 RT4, rd, #18; \ + vshl.u64 RT5, rd, #64 - 18; \ + vadd.u64 rh, rh, RT1; /* h+=t1; */ \ + vld1.64 {RT0}, [RK]!; \ + veor.64 RT23q, RT23q, RT45q; \ + vshr.u64 RT4, rd, #41; \ + vshl.u64 RT5, rd, #64 - 41; \ + vadd.u64 RT0, RT0, rw1; \ + veor.64 RT23q, RT23q, RT45q; \ + vmov.64 RT7, rd; \ + veor.64 RT1, RT2, RT3; \ + vbsl.64 RT7, re, rf; \ + \ + vadd.u64 RT1, RT1, rg; \ + vshr.u64 RT2, rh, #28; \ + vshl.u64 RT3, rh, #64 - 28; \ + vadd.u64 RT1, RT1, RT0; \ + vshr.u64 RT4, rh, #34; \ + vshl.u64 RT5, rh, #64 - 34; \ + vadd.u64 RT1, RT1, RT7; \ + \ + /* g = Sum0 (h) + Maj (h, a, b); */ \ + veor.64 RT23q, RT23q, RT45q; \ + vshr.u64 RT4, rh, #39; \ + vshl.u64 RT5, rh, #64 - 39; \ + veor.64 RT0, rh, ra; \ + veor.64 RT23q, RT23q, RT45q; \ + vbsl.64 RT0, rb, ra; \ + vadd.u64 rc, rc, RT1; /* c+=t1; */ \ + veor.64 rg, RT2, RT3; \ \ /* w[0] += S1 (w[14]) + w[9] + S0 (w[1]); */ \ - vshr.u64 RT3, rw14, #61; \ - vshl.u64 RT6, rw14, #64 - 61; \ - veor.64 RT0, RT4, RT2; \ - vshr.u64 RT2, rw14, 6; \ - 
veor.64 RT0, RT0, RT3; \ - vshr.u64 RT7, rw1, #1; \ - veor.64 RT0, RT0, RT6; \ - vshl.u64 RT4, rw1, #64 - 1; \ - veor.64 RT0, RT0, RT2; \ - vshr.u64 RT5, rw1, #8; \ - vadd.u64 rw0, rw0, RT0; \ - vshl.u64 RT6, rw1, #64 - 8; \ - veor.64 RT7, RT7, RT4; \ - vshr.u64 RT4, rw1, 7; \ - veor.64 RT7, RT7, RT5; \ - vadd.u64 rw0, rw0, rw9; /* w[0]+=w[9]; */\ - veor.64 RT7, RT7, RT6; \ - vadd.u64 rd, rd, RT1; /* d+=t1; */ \ - veor.64 RT7, RT7, RT4; \ - vadd.u64 rh, rh, RT1; /* h+=t1; */ \ - vadd.u64 rw0, rw0, RT7; \ + /* w[1] += S1 (w[15]) + w[10] + S0 (w[2]); */ \ + \ + /**** S0(w[1:2]) */ \ + \ + /* w[0:1] += w[9:10] */ \ + /* RT23q = rw1:rw2 */ \ + vext.u64 RT23q, rw01q, rw23q, #1; \ + vadd.u64 rw0, rw9; \ + vadd.u64 rg, rg, RT0; \ + vadd.u64 rw1, rw10;\ + vadd.u64 rg, rg, RT1; /* g+=t1; */ \ + \ + vshr.u64 RT45q, RT23q, #1; \ + vshl.u64 RT67q, RT23q, #64 - 1; \ + vshr.u64 RT01q, RT23q, #8; \ + veor.u64 RT45q, RT45q, RT67q; \ + vshl.u64 RT67q, RT23q, #64 - 8; \ + veor.u64 RT45q, RT45q, RT01q; \ + vshr.u64 RT01q, RT23q, #7; \ + veor.u64 RT45q, RT45q, RT67q; \ + \ + /**** S1(w[14:15]) */ \ + vshr.u64 RT23q, rw1415q, #6; \ + veor.u64 RT01q, RT01q, RT45q; \ + vshr.u64 RT45q, rw1415q, #19; \ + vshl.u64 RT67q, rw1415q, #64 - 19; \ + veor.u64 RT23q, RT23q, RT45q; \ + vshr.u64 RT45q, rw1415q, #61; \ + veor.u64 RT23q, RT23q, RT67q; \ + vshl.u64 RT67q, rw1415q, #64 - 61; \ + veor.u64 RT23q, RT23q, RT45q; \ + vadd.u64 rw01q, RT01q; /* w[0:1] += S(w[1:2]) */ \ + veor.u64 RT01q, RT23q, RT67q; +#define vadd_RT01q(rw01q) \ + /* w[0:1] += S(w[14:15]) */ \ + vadd.u64 rw01q, RT01q; + +#define dummy(_) /*_*/ -#define round_64_79(ra, rb, rc, rd, re, rf, rg, rh, rw0) \ +#define rounds2_64_79(ra, rb, rc, rd, re, rf, rg, rh, rw0, rw1, interleave_op1, arg1, interleave_op2, arg2) \ /* t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t]; */ \ - vld1.64 {RT0}, [RK]!; \ - vshr.u64 RT1, re, #14; \ + vshr.u64 RT2, re, #14; \ vshl.u64 RT3, re, #64 - 14; \ + interleave_op1(arg1); \ vshr.u64 RT4, re, #18; \ 
vshl.u64 RT5, re, #64 - 18; \ - veor.64 RT1, RT1, RT3; \ - vshr.u64 RT7, ra, #28; \ - veor.64 RT1, RT1, RT4; \ - vshr.u64 RT3, re, #41; \ - vshl.u64 RT4, re, #64 - 41; \ - veor.64 RT1, RT1, RT5; \ + interleave_op2(arg2); \ + vld1.64 {RT0}, [RK]!; \ + veor.64 RT23q, RT23q, RT45q; \ + vshr.u64 RT4, re, #41; \ + vshl.u64 RT5, re, #64 - 41; \ vadd.u64 RT0, RT0, rw0; \ - veor.64 RT1, RT1, RT3; \ - vand.64 RT2, re, rf; \ - veor.64 RT1, RT1, RT4; \ - vbic.64 RT6, rg, re; \ + veor.64 RT23q, RT23q, RT45q; \ + vmov.64 RT7, re; \ + veor.64 RT1, RT2, RT3; \ + vbsl.64 RT7, rf, rg; \ \ vadd.u64 RT1, RT1, rh; \ - veor.64 RT2, RT2, RT6; \ + vshr.u64 RT2, ra, #28; \ + vshl.u64 RT3, ra, #64 - 28; \ vadd.u64 RT1, RT1, RT0; \ vshr.u64 RT4, ra, #34; \ vshl.u64 RT5, ra, #64 - 34; \ + vadd.u64 RT1, RT1, RT7; \ \ - /* t7 = Sum0 (a) + Maj (a, b, c); */ \ - vshl.u64 RT6, ra, #64 - 28; \ - veor.64 RT7, RT7, RT4; \ - vshr.u64 RT3, ra, #39; \ - veor.64 RT7, RT7, RT6; \ - vshl.u64 RT4, ra, #64 - 39; \ - vorr.64 RT6, ra, rb; \ - vand.64 RT0, ra, rb; \ - veor.64 RT7, RT7, RT5; \ - vand.64 RT6, RT6, rc; \ - veor.64 RT7, RT7, RT3; \ - vorr.64 RT0, RT0, RT6; \ - veor.64 RT7, RT7, RT4; \ - vadd.u64 RT1, RT1, RT2; \ - vadd.u64 RT7, RT7, RT0; \ + /* h = Sum0 (a) + Maj (a, b, c); */ \ + veor.64 RT23q, RT23q, RT45q; \ + vshr.u64 RT4, ra, #39; \ + vshl.u64 RT5, ra, #64 - 39; \ + veor.64 RT0, ra, rb; \ + veor.64 RT23q, RT23q, RT45q; \ + vbsl.64 RT0, rc, rb; \ vadd.u64 rd, rd, RT1; /* d+=t1; */ \ - vadd.u64 rh, RT7, RT1; /* h=t7+t1; */ + veor.64 rh, RT2, RT3; \ + \ + /* t1 = g + Sum1 (d) + Ch (d, e, f) + k[t] + w[t]; */ \ + vshr.u64 RT2, rd, #14; \ + vshl.u64 RT3, rd, #64 - 14; \ + vadd.u64 rh, rh, RT0; \ + vshr.u64 RT4, rd, #18; \ + vshl.u64 RT5, rd, #64 - 18; \ + vadd.u64 rh, rh, RT1; /* h+=t1; */ \ + vld1.64 {RT0}, [RK]!; \ + veor.64 RT23q, RT23q, RT45q; \ + vshr.u64 RT4, rd, #41; \ + vshl.u64 RT5, rd, #64 - 41; \ + vadd.u64 RT0, RT0, rw1; \ + veor.64 RT23q, RT23q, RT45q; \ + vmov.64 RT7, rd; \ + veor.64 
RT1, RT2, RT3; \ + vbsl.64 RT7, re, rf; \ + \ + vadd.u64 RT1, RT1, rg; \ + vshr.u64 RT2, rh, #28; \ + vshl.u64 RT3, rh, #64 - 28; \ + vadd.u64 RT1, RT1, RT0; \ + vshr.u64 RT4, rh, #34; \ + vshl.u64 RT5, rh, #64 - 34; \ + vadd.u64 RT1, RT1, RT7; \ + \ + /* g = Sum0 (h) + Maj (h, a, b); */ \ + veor.64 RT23q, RT23q, RT45q; \ + vshr.u64 RT4, rh, #39; \ + vshl.u64 RT5, rh, #64 - 39; \ + veor.64 RT0, rh, ra; \ + veor.64 RT23q, RT23q, RT45q; \ + vbsl.64 RT0, rb, ra; \ + vadd.u64 rc, rc, RT1; /* c+=t1; */ \ + veor.64 rg, RT2, RT3; +#define vadd_rg_RT0(rg) \ + vadd.u64 rg, rg, RT0; +#define vadd_rg_RT1(rg) \ + vadd.u64 rg, rg, RT1; /* g+=t1; */ .align 3 .globl _gcry_sha512_transform_armv7_neon @@ -207,8 +288,11 @@ _gcry_sha512_transform_armv7_neon: * %r0: SHA512_CONTEXT * %r1: data * %r2: u64 k[] constants + * %r3: nblks */ - mov %r3, #0; + push {%lr}; + + mov %lr, #0; /* Load context to d0-d7 */ vld1.64 {RA-RD}, [%r0]!; @@ -220,7 +304,7 @@ _gcry_sha512_transform_armv7_neon: vld1.64 {RW0-RW3}, [%r1]!; vld1.64 {RW4-RW7}, [%r1]!; vld1.64 {RW8-RW11}, [%r1]!; - vld1.64 {RW12-RW15}, [%r1]; + vld1.64 {RW12-RW15}, [%r1]!; #ifdef __ARMEL__ /* byteswap */ vrev64.8 RW01q, RW01q; @@ -237,46 +321,95 @@ _gcry_sha512_transform_armv7_neon: vpush {RT0-RT7}; .Loop: - add %r3, #16; - round_0_63(RA, RB, RC, RD, RE, RF, RG, RH, RW0, RW14, RW9, RW1); - cmp %r3, #64; - round_0_63(RH, RA, RB, RC, RD, RE, RF, RG, RW1, RW15, RW10, RW2); - round_0_63(RG, RH, RA, RB, RC, RD, RE, RF, RW2, RW0, RW11, RW3); - round_0_63(RF, RG, RH, RA, RB, RC, RD, RE, RW3, RW1, RW12, RW4); - round_0_63(RE, RF, RG, RH, RA, RB, RC, RD, RW4, RW2, RW13, RW5); - round_0_63(RD, RE, RF, RG, RH, RA, RB, RC, RW5, RW3, RW14, RW6); - round_0_63(RC, RD, RE, RF, RG, RH, RA, RB, RW6, RW4, RW15, RW7); - round_0_63(RB, RC, RD, RE, RF, RG, RH, RA, RW7, RW5, RW0, RW8); - round_0_63(RA, RB, RC, RD, RE, RF, RG, RH, RW8, RW6, RW1, RW9); - round_0_63(RH, RA, RB, RC, RD, RE, RF, RG, RW9, RW7, RW2, RW10); - round_0_63(RG, RH, RA, RB, RC, RD, 
RE, RF, RW10, RW8, RW3, RW11); - round_0_63(RF, RG, RH, RA, RB, RC, RD, RE, RW11, RW9, RW4, RW12); - round_0_63(RE, RF, RG, RH, RA, RB, RC, RD, RW12, RW10, RW5, RW13); - round_0_63(RD, RE, RF, RG, RH, RA, RB, RC, RW13, RW11, RW6, RW14); - round_0_63(RC, RD, RE, RF, RG, RH, RA, RB, RW14, RW12, RW7, RW15); - round_0_63(RB, RC, RD, RE, RF, RG, RH, RA, RW15, RW13, RW8, RW0); - bne .Loop; - - round_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW0); - round_64_79(RH, RA, RB, RC, RD, RE, RF, RG, RW1); - round_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW2); - round_64_79(RF, RG, RH, RA, RB, RC, RD, RE, RW3); - round_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW4); - round_64_79(RD, RE, RF, RG, RH, RA, RB, RC, RW5); - round_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW6); - round_64_79(RB, RC, RD, RE, RF, RG, RH, RA, RW7); - round_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW8); - round_64_79(RH, RA, RB, RC, RD, RE, RF, RG, RW9); - round_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW10); - round_64_79(RF, RG, RH, RA, RB, RC, RD, RE, RW11); - round_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW12); - round_64_79(RD, RE, RF, RG, RH, RA, RB, RC, RW13); - round_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW14); - round_64_79(RB, RC, RD, RE, RF, RG, RH, RA, RW15); + rounds2_0_63(RA, RB, RC, RD, RE, RF, RG, RH, RW0, RW1, RW01q, RW2, RW23q, RW1415q, RW9, RW10, dummy, _); + b .Lenter_rounds; + +.Loop_rounds: + rounds2_0_63(RA, RB, RC, RD, RE, RF, RG, RH, RW0, RW1, RW01q, RW2, RW23q, RW1415q, RW9, RW10, vadd_RT01q, RW1415q); +.Lenter_rounds: + rounds2_0_63(RG, RH, RA, RB, RC, RD, RE, RF, RW2, RW3, RW23q, RW4, RW45q, RW01q, RW11, RW12, vadd_RT01q, RW01q); + rounds2_0_63(RE, RF, RG, RH, RA, RB, RC, RD, RW4, RW5, RW45q, RW6, RW67q, RW23q, RW13, RW14, vadd_RT01q, RW23q); + rounds2_0_63(RC, RD, RE, RF, RG, RH, RA, RB, RW6, RW7, RW67q, RW8, RW89q, RW45q, RW15, RW0, vadd_RT01q, RW45q); + rounds2_0_63(RA, RB, RC, RD, RE, RF, RG, RH, RW8, RW9, RW89q, RW10, RW1011q, RW67q, RW1, RW2, vadd_RT01q, RW67q); + rounds2_0_63(RG, RH, RA, RB, 
RC, RD, RE, RF, RW10, RW11, RW1011q, RW12, RW1213q, RW89q, RW3, RW4, vadd_RT01q, RW89q); + add %lr, #16; + rounds2_0_63(RE, RF, RG, RH, RA, RB, RC, RD, RW12, RW13, RW1213q, RW14, RW1415q, RW1011q, RW5, RW6, vadd_RT01q, RW1011q); + cmp %lr, #64; + rounds2_0_63(RC, RD, RE, RF, RG, RH, RA, RB, RW14, RW15, RW1415q, RW0, RW01q, RW1213q, RW7, RW8, vadd_RT01q, RW1213q); + bne .Loop_rounds; + + subs %r3, #1; + + rounds2_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW0, RW1, vadd_RT01q, RW1415q, dummy, _); + rounds2_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW2, RW3, vadd_rg_RT0, RG, vadd_rg_RT1, RG); + beq .Lhandle_tail; + vld1.64 {RW0-RW3}, [%r1]!; + rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW4, RW5, vadd_rg_RT0, RE, vadd_rg_RT1, RE); + rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW6, RW7, vadd_rg_RT0, RC, vadd_rg_RT1, RC); +#ifdef __ARMEL__ + vrev64.8 RW01q, RW01q; + vrev64.8 RW23q, RW23q; +#endif + vld1.64 {RW4-RW7}, [%r1]!; + rounds2_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW8, RW9, vadd_rg_RT0, RA, vadd_rg_RT1, RA); + rounds2_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW10, RW11, vadd_rg_RT0, RG, vadd_rg_RT1, RG); +#ifdef __ARMEL__ + vrev64.8 RW45q, RW45q; + vrev64.8 RW67q, RW67q; +#endif + vld1.64 {RW8-RW11}, [%r1]!; + rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW12, RW13, vadd_rg_RT0, RE, vadd_rg_RT1, RE); + rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW14, RW15, vadd_rg_RT0, RC, vadd_rg_RT1, RC); +#ifdef __ARMEL__ + vrev64.8 RW89q, RW89q; + vrev64.8 RW1011q, RW1011q; +#endif + vld1.64 {RW12-RW15}, [%r1]!; + vadd_rg_RT0(RA); + vadd_rg_RT1(RA); + + /* Load context */ + vld1.64 {RT0-RT3}, [%r0]!; + vld1.64 {RT4-RT7}, [%r0]; + sub %r0, #(4*8); + +#ifdef __ARMEL__ + vrev64.8 RW1213q, RW1213q; + vrev64.8 RW1415q, RW1415q; +#endif + + vadd.u64 RA, RT0; + vadd.u64 RB, RT1; + vadd.u64 RC, RT2; + vadd.u64 RD, RT3; + vadd.u64 RE, RT4; + vadd.u64 RF, RT5; + vadd.u64 RG, RT6; + vadd.u64 RH, RT7; + + /* Store the first half of context */ + vst1.64 {RA-RD}, [%r0]!; + sub RK, $(8*80); +
vst1.64 {RE-RH}, [%r0]; /* Store the last half of context */ + mov %lr, #0; + sub %r0, #(4*8); + + b .Loop; +.ltorg + +.Lhandle_tail: + rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW4, RW5, vadd_rg_RT0, RE, vadd_rg_RT1, RE); + rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW6, RW7, vadd_rg_RT0, RC, vadd_rg_RT1, RC); + rounds2_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW8, RW9, vadd_rg_RT0, RA, vadd_rg_RT1, RA); + rounds2_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW10, RW11, vadd_rg_RT0, RG, vadd_rg_RT1, RG); + rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW12, RW13, vadd_rg_RT0, RE, vadd_rg_RT1, RE); + rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW14, RW15, vadd_rg_RT0, RC, vadd_rg_RT1, RC); /* Load context to d16-d23 */ vld1.64 {RW0-RW3}, [%r0]!; + vadd_rg_RT0(RA); vld1.64 {RW4-RW7}, [%r0]; + vadd_rg_RT1(RA); sub %r0, #(4*8); vadd.u64 RA, RW0; @@ -310,7 +443,7 @@ _gcry_sha512_transform_armv7_neon: veor.u64 %q2, %q2; veor.u64 %q3, %q3; - bx %lr; + pop {%pc}; .size _gcry_sha512_transform_armv7_neon,.-_gcry_sha512_transform_armv7_neon; #endif diff --git a/cipher/sha512.c b/cipher/sha512.c index 215e8ed..3474694 100644 --- a/cipher/sha512.c +++ b/cipher/sha512.c @@ -541,7 +541,7 @@ transform_blk (SHA512_STATE *hd, const unsigned char *data) #ifdef USE_ARM_NEON_ASM void _gcry_sha512_transform_armv7_neon (SHA512_STATE *hd, const unsigned char *data, - const u64 k[]); + const u64 k[], size_t num_blks); #endif #ifdef USE_SSSE3 @@ -587,12 +587,7 @@ transform (void *context, const unsigned char *data, size_t nblks) #ifdef USE_ARM_NEON_ASM if (ctx->use_neon) { - do - { - _gcry_sha512_transform_armv7_neon (&ctx->state, data, k); - data += 128; - } - while (--nblks); + _gcry_sha512_transform_armv7_neon (&ctx->state, data, k, nblks); /* _gcry_sha512_transform_armv7_neon does not store sensitive data * to stack. 
*/ From jussi.kivilinna at iki.fi Tue Dec 17 15:37:36 2013 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Tue, 17 Dec 2013 16:37:36 +0200 Subject: [PATCH 6/6] Add ARM/NEON implementation for SHA-1 In-Reply-To: <20131217143711.31473.21328.stgit@localhost6.localdomain6> References: <20131217143711.31473.21328.stgit@localhost6.localdomain6> Message-ID: <20131217143736.31473.37991.stgit@localhost6.localdomain6> * cipher/Makefile.am: Add 'sha1-armv7-neon.S'. * cipher/sha1-armv7-neon.S: New. * cipher/sha1.c (USE_NEON): New. (SHA1_CONTEXT, sha1_init) [USE_NEON]: Add and initialize 'use_neon'. [USE_NEON] (_gcry_sha1_transform_armv7_neon): New. (transform) [USE_NEON]: Use ARM/NEON assembly if enabled. * configure.ac: Add 'sha1-armv7-neon.lo'. -- Patch adds ARM/NEON implementation for SHA-1. Benchmarks show 1.72x improvement on ARM Cortex-A8, 1008 Mhz: jussi at cubie:~/libgcrypt$ tests/bench-slope --cpu-mhz 1008 hash sha1 Hash: | nanosecs/byte mebibytes/sec cycles/byte SHA1 | 7.80 ns/B 122.3 MiB/s 7.86 c/B = jussi at cubie:~/libgcrypt$ tests/bench-slope --disable-hwf arm-neon --cpu-mhz 1008 hash sha1 Hash: | nanosecs/byte mebibytes/sec cycles/byte SHA1 | 13.41 ns/B 71.10 MiB/s 13.52 c/B = Signed-off-by: Jussi Kivilinna --- cipher/Makefile.am | 1 cipher/sha1-armv7-neon.S | 501 ++++++++++++++++++++++++++++++++++++++++++++++ cipher/sha1.c | 29 +++ configure.ac | 4 4 files changed, 534 insertions(+), 1 deletion(-) create mode 100644 cipher/sha1-armv7-neon.S diff --git a/cipher/Makefile.am b/cipher/Makefile.am index 575df38..98c6254 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -78,6 +78,7 @@ scrypt.c \ seed.c \ serpent.c serpent-sse2-amd64.S serpent-avx2-amd64.S \ sha1.c sha1-ssse3-amd64.S sha1-avx-amd64.S sha1-avx-bmi2-amd64.S \ + sha1-armv7-neon.S \ sha256.c sha256-ssse3-amd64.S sha256-avx-amd64.S sha256-avx2-bmi2-amd64.S \ sha512.c sha512-ssse3-amd64.S sha512-avx-amd64.S sha512-avx2-bmi2-amd64.S \ sha512-armv7-neon.S \ diff --git 
a/cipher/sha1-armv7-neon.S b/cipher/sha1-armv7-neon.S new file mode 100644 index 0000000..cbb437a --- /dev/null +++ b/cipher/sha1-armv7-neon.S @@ -0,0 +1,501 @@ +/* sha1-armv7-neon.S - ARM/NEON accelerated SHA-1 transform function + * Copyright (C) 2013 Jussi Kivilinna + * + * Based on sha1.c: + * Copyright (C) 1998, 2001, 2002, 2003, 2008 Free Software Foundation, Inc. + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */ + +#include <config.h> + +#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) && \ + defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_NEON) && defined(USE_SHA1) + +.text + +.syntax unified +.fpu neon +.arm + +/* Context structure */ + +#define state_h0 0 +#define state_h1 4 +#define state_h2 8 +#define state_h3 12 +#define state_h4 16 + + +/* Constants */ + +#define K1 0x5A827999 +#define K2 0x6ED9EBA1 +#define K3 0x8F1BBCDC +#define K4 0xCA62C1D6 +.align 4 +gcry_sha1_armv7_neon_K_VEC: +.LK_VEC: +.LK1: .long K1, K1, K1, K1 +.LK2: .long K2, K2, K2, K2 +.LK3: .long K3, K3, K3, K3 +.LK4: .long K4, K4, K4, K4 + + +/* Register macros */ + +#define RSTATE r0 +#define RDATA r1 +#define RNBLKS r2 +#define ROLDSTACK r3 +#define RK lr +#define RWK r12 + +#define _a r4 +#define _b r5 +#define _c r6 +#define _d r7 +#define _e r8 + +#define RT0 r9 +#define RT1 r10 +#define RT2 r11 + +#define W0 q0 +#define W1 q1 +#define W2 q2 +#define W3 q3 +#define W4 q4 +#define W5 q5 +#define W6 q6 +#define W7 q7 + +#define tmp0 q8 +#define tmp1 q9 +#define tmp2 q10 +#define tmp3 q11 + +#define curK q12 + + +/* Round function macros.
*/ + +#define WK_offs(i) (((i) & 15) * 4) + +#define _R_F1(a,b,c,d,e,i,pre1,pre2,pre3,i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + and RT0, c, b; \ + pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \ + add e, e, a, ror #(32 - 5); \ + ldr RT2, [sp, WK_offs(i)]; \ + bic RT1, d, b; \ + add e, RT2; \ + pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \ + ror b, #(32 - 30); \ + eor RT0, RT1; \ + pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \ + add e, RT0; + +#define _R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + eor RT0, c, b; \ + pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \ + add e, e, a, ror #(32 - 5); \ + ldr RT2, [sp, WK_offs(i)]; \ + eor RT0, d; \ + pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \ + add e, RT2; \ + ror b, #(32 - 30); \ + pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \ + add e, RT0; \ + +#define _R_F3(a,b,c,d,e,i,pre1,pre2,pre3,i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + eor RT0, c, b; \ + pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \ + add e, e, a, ror #(32 - 5); \ + ldr RT2, [sp, WK_offs(i)]; \ + and RT1, c, b; \ + and RT0, d; \ + add e, RT2; \ + pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \ + ror b, #(32 - 30); \ + add e, RT1; \ + pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \ + add e, RT0; + +#define _R_F4(a,b,c,d,e,i,pre1,pre2,pre3,i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + _R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) + +#define _R(a,b,c,d,e,f,i,pre1,pre2,pre3,i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \ + _R_##f(a,b,c,d,e,i,pre1,pre2,pre3,i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) + +#define R(a,b,c,d,e,f,i) \ + _R_##f(a,b,c,d,e,i,dummy,dummy,dummy,i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) + +#define dummy(...) + + +/* Input expansion macros. 
*/ + +/********* Precalc macros for rounds 0-15 *************************************/ + +#define W_PRECALC_00_15() \ + add RWK, sp, #(WK_offs(0)); \ + \ + vld1.32 {tmp0, tmp1}, [RDATA]!; \ + vrev32.8 W0, tmp0; /* big => little */ \ + vld1.32 {tmp2, tmp3}, [RDATA]!; \ + vadd.u32 tmp0, W0, curK; \ + vrev32.8 W7, tmp1; /* big => little */ \ + vrev32.8 W6, tmp2; /* big => little */ \ + vadd.u32 tmp1, W7, curK; \ + vrev32.8 W5, tmp3; /* big => little */ \ + vadd.u32 tmp2, W6, curK; \ + vst1.32 {tmp0, tmp1}, [RWK]!; \ + vadd.u32 tmp3, W5, curK; \ + vst1.32 {tmp2, tmp3}, [RWK]; \ + +#define WPRECALC_00_15_0(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \ + add RWK, sp, #(WK_offs(0)); \ + +#define WPRECALC_00_15_1(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \ + vld1.32 {tmp0, tmp1}, [RDATA]!; \ + +#define WPRECALC_00_15_2(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \ + vrev32.8 W0, tmp0; /* big => little */ \ + +#define WPRECALC_00_15_3(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \ + vld1.32 {tmp2, tmp3}, [RDATA]!; \ + +#define WPRECALC_00_15_4(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \ + vadd.u32 tmp0, W0, curK; \ + +#define WPRECALC_00_15_5(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \ + vrev32.8 W7, tmp1; /* big => little */ \ + +#define WPRECALC_00_15_6(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \ + vrev32.8 W6, tmp2; /* big => little */ \ + +#define WPRECALC_00_15_7(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \ + vadd.u32 tmp1, W7, curK; \ + +#define WPRECALC_00_15_8(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \ + vrev32.8 W5, tmp3; /* big => little */ \ + +#define WPRECALC_00_15_9(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \ + vadd.u32 tmp2, W6, curK; \ + +#define WPRECALC_00_15_10(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \ + vst1.32 {tmp0, tmp1}, [RWK]!; \ + +#define WPRECALC_00_15_11(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \ 
+ vadd.u32 tmp3, W5, curK; \ + +#define WPRECALC_00_15_12(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \ + vst1.32 {tmp2, tmp3}, [RWK]; \ + + +/********* Precalc macros for rounds 16-31 ************************************/ + +#define WPRECALC_16_31_0(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \ + add RWK, sp, #(WK_offs(i)); \ + +#define WPRECALC_16_31_1(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \ + veor tmp0, tmp0; \ + vext.8 W, W_m16, W_m12, #8; \ + +#define WPRECALC_16_31_2(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \ + vext.8 tmp0, W_m04, tmp0, #4; \ + veor.32 W, W, W_m08; \ + +#define WPRECALC_16_31_3(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \ + veor tmp0, tmp0, W_m16; \ + veor tmp1, tmp1; \ + +#define WPRECALC_16_31_4(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \ + veor W, W, tmp0; \ + +#define WPRECALC_16_31_5(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \ + vshl.u32 tmp0, W, #1; \ + vext.8 tmp1, tmp1, W, #(16-12); \ + vshr.u32 W, W, #31; \ + +#define WPRECALC_16_31_6(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \ + vorr tmp0, tmp0, W; \ + vshr.u32 W, tmp1, #30; \ + +#define WPRECALC_16_31_7(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \ + vshl.u32 tmp1, tmp1, #2; \ + +#define WPRECALC_16_31_8(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \ + veor tmp0, tmp0, W; \ + +#define WPRECALC_16_31_9(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \ + veor W, tmp0, tmp1; \ + +#define WPRECALC_16_31_10(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \ + vadd.u32 tmp0, W, curK; \ + +#define WPRECALC_16_31_11(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \ + vst1.32 {tmp0}, [RWK]; + + +/********* Precalc macros for rounds 32-79 ************************************/ + +#define WPRECALC_32_79_0(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \ + add RWK, sp, #(WK_offs(i&~3)); \ + +#define WPRECALC_32_79_1(i, W, W_m04, W_m08, W_m12, 
W_m16, W_m20, W_m24, W_m28) \ + veor W, W_m28; \ + +#define WPRECALC_32_79_2(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \ + vext.8 tmp0, W_m08, W_m04, #8; \ + +#define WPRECALC_32_79_3(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \ + veor W, W_m16; \ + +#define WPRECALC_32_79_4(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \ + veor W, tmp0; \ + +#define WPRECALC_32_79_5(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \ + vshr.u32 tmp0, W, #30; \ + +#define WPRECALC_32_79_6(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \ + vshl.u32 W, W, #2; \ + +#define WPRECALC_32_79_7(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \ + vorr W, tmp0, W; \ + +#define WPRECALC_32_79_8(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \ + vadd.u32 tmp0, W, curK; \ + +#define WPRECALC_32_79_9(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28) \ + vst1.32 {tmp0}, [RWK]; + + +/* Other functional macros */ + +#define CLEAR_REG(reg) veor reg, reg; + + +/* + * Transform nblks*64 bytes (nblks*16 32-bit words) at DATA. + * + * unsigned int + * _gcry_sha1_transform_armv7_neon (void *ctx, const unsigned char *data, + * size_t nblks) + */ +.align 3 +.globl _gcry_sha1_transform_armv7_neon +.type _gcry_sha1_transform_armv7_neon,%function; +_gcry_sha1_transform_armv7_neon: + /* input: + * r0: ctx, CTX + * r1: data (64*nblks bytes) + * r2: nblks + */ + + cmp RNBLKS, #0; + beq .Ldo_nothing; + + push {r4-r12, lr}; + vpush {q4-q7}; + + mov ROLDSTACK, sp; + ldr RK, =.LK_VEC; + + /* Align stack. */ + sub sp, #(16*4); + and sp, #(~(16-1)); + + /* Get the values of the chaining variables. */ + ldm RSTATE, {_a-_e}; + + /* Precalc 0-15. */ + vld1.32 {curK}, [RK]!; /* Load K1. */ + W_PRECALC_00_15(); + + b .Loop; + +.ltorg +.Loop: + /* Transform 0-15 + Precalc 16-31. 
*/ + _R( _a, _b, _c, _d, _e, F1, 0, WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 16, W4, W5, W6, W7, W0, _, _, _ ); + _R( _e, _a, _b, _c, _d, F1, 1, WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 16, W4, W5, W6, W7, W0, _, _, _ ); + _R( _d, _e, _a, _b, _c, F1, 2, WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 16, W4, W5, W6, W7, W0, _, _, _ ); + _R( _c, _d, _e, _a, _b, F1, 3, WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,16, W4, W5, W6, W7, W0, _, _, _ ); + + vld1.32 {curK}, [RK]!; /* Load K2. */ + _R( _b, _c, _d, _e, _a, F1, 4, WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 20, W3, W4, W5, W6, W7, _, _, _ ); + _R( _a, _b, _c, _d, _e, F1, 5, WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 20, W3, W4, W5, W6, W7, _, _, _ ); + _R( _e, _a, _b, _c, _d, F1, 6, WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 20, W3, W4, W5, W6, W7, _, _, _ ); + _R( _d, _e, _a, _b, _c, F1, 7, WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,20, W3, W4, W5, W6, W7, _, _, _ ); + + _R( _c, _d, _e, _a, _b, F1, 8, WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 24, W2, W3, W4, W5, W6, _, _, _ ); + _R( _b, _c, _d, _e, _a, F1, 9, WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 24, W2, W3, W4, W5, W6, _, _, _ ); + _R( _a, _b, _c, _d, _e, F1, 10, WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 24, W2, W3, W4, W5, W6, _, _, _ ); + _R( _e, _a, _b, _c, _d, F1, 11, WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,24, W2, W3, W4, W5, W6, _, _, _ ); + + _R( _d, _e, _a, _b, _c, F1, 12, WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 28, W1, W2, W3, W4, W5, _, _, _ ); + _R( _c, _d, _e, _a, _b, F1, 13, WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 28, W1, W2, W3, W4, W5, _, _, _ ); + _R( _b, _c, _d, _e, _a, F1, 14, WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 28, W1, W2, W3, W4, W5, _, _, _ ); + _R( _a, _b, _c, _d, _e, F1, 15, WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,28, W1, W2, W3, W4, W5, _, 
_, _ ); + + /* Transform 16-63 + Precalc 32-79. */ + _R( _e, _a, _b, _c, _d, F1, 16, WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 32, W0, W1, W2, W3, W4, W5, W6, W7); + _R( _d, _e, _a, _b, _c, F1, 17, WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 32, W0, W1, W2, W3, W4, W5, W6, W7); + _R( _c, _d, _e, _a, _b, F1, 18, WPRECALC_32_79_6, WPRECALC_32_79_7, dummy, 32, W0, W1, W2, W3, W4, W5, W6, W7); + _R( _b, _c, _d, _e, _a, F1, 19, WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 32, W0, W1, W2, W3, W4, W5, W6, W7); + + _R( _a, _b, _c, _d, _e, F2, 20, WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 36, W7, W0, W1, W2, W3, W4, W5, W6); + _R( _e, _a, _b, _c, _d, F2, 21, WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 36, W7, W0, W1, W2, W3, W4, W5, W6); + _R( _d, _e, _a, _b, _c, F2, 22, WPRECALC_32_79_6, WPRECALC_32_79_7, dummy, 36, W7, W0, W1, W2, W3, W4, W5, W6); + _R( _c, _d, _e, _a, _b, F2, 23, WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 36, W7, W0, W1, W2, W3, W4, W5, W6); + + vld1.32 {curK}, [RK]!; /* Load K3. 
*/ + _R( _b, _c, _d, _e, _a, F2, 24, WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 40, W6, W7, W0, W1, W2, W3, W4, W5); + _R( _a, _b, _c, _d, _e, F2, 25, WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 40, W6, W7, W0, W1, W2, W3, W4, W5); + _R( _e, _a, _b, _c, _d, F2, 26, WPRECALC_32_79_6, WPRECALC_32_79_7, dummy, 40, W6, W7, W0, W1, W2, W3, W4, W5); + _R( _d, _e, _a, _b, _c, F2, 27, WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 40, W6, W7, W0, W1, W2, W3, W4, W5); + + _R( _c, _d, _e, _a, _b, F2, 28, WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 44, W5, W6, W7, W0, W1, W2, W3, W4); + _R( _b, _c, _d, _e, _a, F2, 29, WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 44, W5, W6, W7, W0, W1, W2, W3, W4); + _R( _a, _b, _c, _d, _e, F2, 30, WPRECALC_32_79_6, WPRECALC_32_79_7, dummy, 44, W5, W6, W7, W0, W1, W2, W3, W4); + _R( _e, _a, _b, _c, _d, F2, 31, WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 44, W5, W6, W7, W0, W1, W2, W3, W4); + + _R( _d, _e, _a, _b, _c, F2, 32, WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 48, W4, W5, W6, W7, W0, W1, W2, W3); + _R( _c, _d, _e, _a, _b, F2, 33, WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 48, W4, W5, W6, W7, W0, W1, W2, W3); + _R( _b, _c, _d, _e, _a, F2, 34, WPRECALC_32_79_6, WPRECALC_32_79_7, dummy, 48, W4, W5, W6, W7, W0, W1, W2, W3); + _R( _a, _b, _c, _d, _e, F2, 35, WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 48, W4, W5, W6, W7, W0, W1, W2, W3); + + _R( _e, _a, _b, _c, _d, F2, 36, WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 52, W3, W4, W5, W6, W7, W0, W1, W2); + _R( _d, _e, _a, _b, _c, F2, 37, WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 52, W3, W4, W5, W6, W7, W0, W1, W2); + _R( _c, _d, _e, _a, _b, F2, 38, WPRECALC_32_79_6, WPRECALC_32_79_7, dummy, 52, W3, W4, W5, W6, W7, W0, W1, W2); + _R( _b, _c, _d, _e, _a, F2, 39, WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 52, W3, W4, W5, W6, W7, W0, W1, W2); + + _R( _a, _b, _c, _d, _e, F3, 40, WPRECALC_32_79_0, WPRECALC_32_79_1, 
WPRECALC_32_79_2, 56, W2, W3, W4, W5, W6, W7, W0, W1); + _R( _e, _a, _b, _c, _d, F3, 41, WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 56, W2, W3, W4, W5, W6, W7, W0, W1); + _R( _d, _e, _a, _b, _c, F3, 42, WPRECALC_32_79_6, WPRECALC_32_79_7, dummy, 56, W2, W3, W4, W5, W6, W7, W0, W1); + _R( _c, _d, _e, _a, _b, F3, 43, WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 56, W2, W3, W4, W5, W6, W7, W0, W1); + + vld1.32 {curK}, [RK]!; /* Load K4. */ + _R( _b, _c, _d, _e, _a, F3, 44, WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 60, W1, W2, W3, W4, W5, W6, W7, W0); + _R( _a, _b, _c, _d, _e, F3, 45, WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 60, W1, W2, W3, W4, W5, W6, W7, W0); + _R( _e, _a, _b, _c, _d, F3, 46, WPRECALC_32_79_6, WPRECALC_32_79_7, dummy, 60, W1, W2, W3, W4, W5, W6, W7, W0); + _R( _d, _e, _a, _b, _c, F3, 47, WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 60, W1, W2, W3, W4, W5, W6, W7, W0); + + _R( _c, _d, _e, _a, _b, F3, 48, WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 64, W0, W1, W2, W3, W4, W5, W6, W7); + _R( _b, _c, _d, _e, _a, F3, 49, WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 64, W0, W1, W2, W3, W4, W5, W6, W7); + _R( _a, _b, _c, _d, _e, F3, 50, WPRECALC_32_79_6, WPRECALC_32_79_7, dummy, 64, W0, W1, W2, W3, W4, W5, W6, W7); + _R( _e, _a, _b, _c, _d, F3, 51, WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 64, W0, W1, W2, W3, W4, W5, W6, W7); + + _R( _d, _e, _a, _b, _c, F3, 52, WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 68, W7, W0, W1, W2, W3, W4, W5, W6); + _R( _c, _d, _e, _a, _b, F3, 53, WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 68, W7, W0, W1, W2, W3, W4, W5, W6); + _R( _b, _c, _d, _e, _a, F3, 54, WPRECALC_32_79_6, WPRECALC_32_79_7, dummy, 68, W7, W0, W1, W2, W3, W4, W5, W6); + _R( _a, _b, _c, _d, _e, F3, 55, WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 68, W7, W0, W1, W2, W3, W4, W5, W6); + + _R( _e, _a, _b, _c, _d, F3, 56, WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 72, W6, W7, W0, W1, 
W2, W3, W4, W5); + _R( _d, _e, _a, _b, _c, F3, 57, WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 72, W6, W7, W0, W1, W2, W3, W4, W5); + _R( _c, _d, _e, _a, _b, F3, 58, WPRECALC_32_79_6, WPRECALC_32_79_7, dummy, 72, W6, W7, W0, W1, W2, W3, W4, W5); + _R( _b, _c, _d, _e, _a, F3, 59, WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 72, W6, W7, W0, W1, W2, W3, W4, W5); + + sub RK, #64; + _R( _a, _b, _c, _d, _e, F4, 60, WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 76, W5, W6, W7, W0, W1, W2, W3, W4); + _R( _e, _a, _b, _c, _d, F4, 61, WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 76, W5, W6, W7, W0, W1, W2, W3, W4); + _R( _d, _e, _a, _b, _c, F4, 62, WPRECALC_32_79_6, WPRECALC_32_79_7, dummy, 76, W5, W6, W7, W0, W1, W2, W3, W4); + _R( _c, _d, _e, _a, _b, F4, 63, WPRECALC_32_79_8, dummy, WPRECALC_32_79_9, 76, W5, W6, W7, W0, W1, W2, W3, W4); + + subs RNBLKS, #1; + beq .Lend; + + /* Transform 64-79 + Precalc 0-15 of next block. */ + vld1.32 {curK}, [RK]!; /* Load K1. */ + _R( _b, _c, _d, _e, _a, F4, 64, WPRECALC_00_15_0, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + _R( _a, _b, _c, _d, _e, F4, 65, WPRECALC_00_15_1, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + _R( _e, _a, _b, _c, _d, F4, 66, WPRECALC_00_15_2, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + _R( _d, _e, _a, _b, _c, F4, 67, WPRECALC_00_15_3, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + + _R( _c, _d, _e, _a, _b, F4, 68, dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + _R( _b, _c, _d, _e, _a, F4, 69, dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + _R( _a, _b, _c, _d, _e, F4, 70, WPRECALC_00_15_4, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + _R( _e, _a, _b, _c, _d, F4, 71, WPRECALC_00_15_5, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + + _R( _d, _e, _a, _b, _c, F4, 72, dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + _R( _c, _d, _e, _a, _b, F4, 73, dummy, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + _R( _b, _c, _d, _e, _a, F4, 74, WPRECALC_00_15_6, dummy, dummy, _, _, _, _, _, _, _, _, _ ); 
+ _R( _a, _b, _c, _d, _e, F4, 75, WPRECALC_00_15_7, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + + _R( _e, _a, _b, _c, _d, F4, 76, WPRECALC_00_15_8, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + _R( _d, _e, _a, _b, _c, F4, 77, WPRECALC_00_15_9, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + _R( _c, _d, _e, _a, _b, F4, 78, WPRECALC_00_15_10, dummy, dummy, _, _, _, _, _, _, _, _, _ ); + _R( _b, _c, _d, _e, _a, F4, 79, WPRECALC_00_15_11, dummy, WPRECALC_00_15_12, _, _, _, _, _, _, _, _, _ ); + + /* Update the chaining variables. */ + ldm RSTATE, {RT0-RT2}; + add _a, RT0; + ldr RT0, [RSTATE, #state_h3]; + add _b, RT1; + ldr RT1, [RSTATE, #state_h4]; + add _c, RT2; + add _d, RT0; + add _e, RT1; + stm RSTATE, {_a-_e}; + + b .Loop; + +.ltorg +.Lend: + /* Transform 64-79 + Clear XMM registers. */ + R( _b, _c, _d, _e, _a, F4, 64 ); + R( _a, _b, _c, _d, _e, F4, 65 ); CLEAR_REG(tmp0); + R( _e, _a, _b, _c, _d, F4, 66 ); CLEAR_REG(tmp1); + R( _d, _e, _a, _b, _c, F4, 67 ); CLEAR_REG(W0); + R( _c, _d, _e, _a, _b, F4, 68 ); CLEAR_REG(W1); + R( _b, _c, _d, _e, _a, F4, 69 ); CLEAR_REG(W2); + R( _a, _b, _c, _d, _e, F4, 70 ); CLEAR_REG(W3); + R( _e, _a, _b, _c, _d, F4, 71 ); CLEAR_REG(W4); + R( _d, _e, _a, _b, _c, F4, 72 ); CLEAR_REG(W5); + R( _c, _d, _e, _a, _b, F4, 73 ); CLEAR_REG(W6); + R( _b, _c, _d, _e, _a, F4, 74 ); CLEAR_REG(W7); + R( _a, _b, _c, _d, _e, F4, 75 ); + R( _e, _a, _b, _c, _d, F4, 76 ); + R( _d, _e, _a, _b, _c, F4, 77 ); + R( _c, _d, _e, _a, _b, F4, 78 ); + R( _b, _c, _d, _e, _a, F4, 79 ); + + mov sp, ROLDSTACK; + + /* Update the chaining variables. 
*/ + ldm RSTATE, {RT0-RT2}; + add _a, RT0; + ldr RT0, [RSTATE, #state_h3]; + add _b, RT1; + ldr RT1, [RSTATE, #state_h4]; + add _c, RT2; + add _d, RT0; + vpop {q4-q7}; + add _e, RT1; + stm RSTATE, {_a-_e}; + + /* burn_stack */ + mov r0, #(16*4 + 16*4 + 15); + + pop {r4-r12, pc}; + +.Ldo_nothing: + mov r0, #0; + bx lr + +#endif diff --git a/cipher/sha1.c b/cipher/sha1.c index a55ff93..889a7ea 100644 --- a/cipher/sha1.c +++ b/cipher/sha1.c @@ -64,6 +64,15 @@ # define USE_BMI2 1 #endif +/* USE_NEON indicates whether to enable ARM NEON assembly code. */ +#undef USE_NEON +#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) +# if defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_NEON) +# define USE_NEON 1 +# endif +#endif + /* A macro to test whether P is properly aligned for an u32 type. Note that config.h provides a suitable replacement for uintptr_t if @@ -87,6 +96,9 @@ typedef struct #ifdef USE_BMI2 unsigned int use_bmi2:1; #endif +#ifdef USE_NEON + unsigned int use_neon:1; +#endif } SHA1_CONTEXT; static unsigned int @@ -122,6 +134,9 @@ sha1_init (void *context) #ifdef USE_BMI2 hd->use_bmi2 = (features & HWF_INTEL_AVX) && (features & HWF_INTEL_BMI2); #endif +#ifdef USE_NEON + hd->use_neon = (features & HWF_ARM_NEON) != 0; +#endif (void)features; } @@ -148,6 +163,13 @@ sha1_init (void *context) } while(0) + +#ifdef USE_NEON +unsigned int +_gcry_sha1_transform_armv7_neon (void *state, const unsigned char *data, + size_t nblks); +#endif + /* * Transform NBLOCKS of each 64 bytes (16 32-bit words) at DATA. 
*/ @@ -302,10 +324,15 @@ transform (void *ctx, const unsigned char *data, size_t nblks) return _gcry_sha1_transform_amd64_ssse3 (&hd->h0, data, nblks) + 4 * sizeof(void*); #endif +#ifdef USE_NEON + if (hd->use_neon) + return _gcry_sha1_transform_armv7_neon (&hd->h0, data, nblks) + + 4 * sizeof(void*); +#endif do { - burn = transform_blk (ctx, data); + burn = transform_blk (hd, data); data += 64; } while (--nblks); diff --git a/configure.ac b/configure.ac index 2f24863..27de850 100644 --- a/configure.ac +++ b/configure.ac @@ -1792,6 +1792,10 @@ case "${host}" in GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1-avx-amd64.lo" GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1-avx-bmi2-amd64.lo" ;; + arm*-*-*) + # Build with the assembly implementation + GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1-armv7-neon.lo" + ;; esac LIST_MEMBER(scrypt, $enabled_kdfs) From jussi.kivilinna at iki.fi Tue Dec 17 15:37:26 2013 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Tue, 17 Dec 2013 16:37:26 +0200 Subject: [PATCH 4/6] Add AVX and AVX2/BMI implementations for SHA-256 In-Reply-To: <20131217143711.31473.21328.stgit@localhost6.localdomain6> References: <20131217143711.31473.21328.stgit@localhost6.localdomain6> Message-ID: <20131217143726.31473.28326.stgit@localhost6.localdomain6> * LICENSES: Add 'cipher/sha256-avx-amd64.S' and 'cipher/sha256-avx2-bmi2-amd64.S'. * cipher/Makefile.am: Add 'sha256-avx-amd64.S' and 'sha256-avx2-bmi2-amd64.S'. * cipher/sha256-avx-amd64.S: New. * cipher/sha256-avx2-bmi2-amd64.S: New. * cipher/sha256-ssse3-amd64.S: Use 'lea' instead of 'add' in few places for tiny speed improvement. * cipher/sha256.c (USE_AVX, USE_AVX2): New. (SHA256_CONTEXT) [USE_AVX, USE_AVX2]: Add 'use_avx' and 'use_avx2'. (sha256_init, sha224_init) [USE_AVX, USE_AVX2]: Initialize above new context members. [USE_AVX] (_gcry_sha256_transform_amd64_avx): New. [USE_AVX2] (_gcry_sha256_transform_amd64_avx2): New. (transform) [USE_AVX2]: Use AVX2 assembly if enabled. 
(transform) [USE_AVX]: Use AVX assembly if enabled. * configure.ac: Add 'sha256-avx-amd64.lo' and 'sha256-avx2-bmi2-amd64.lo'. -- Patch adds fast AVX and AVX2/BMI2 implementations of SHA-256 by Intel Corporation. The assembly source is licensed under 3-clause BSD license, thus compatible with LGPL2.1+. Original source can be accessed at: http://www.intel.com/p/en_US/embedded/hwsw/technology/packet-processing#docs Implementation is described in white paper "Fast SHA-256 Implementations on Intel(R) Architecture Processors" http://www.intel.com/content/www/us/en/intelligent-systems/intel-technology/sha-256-implementations-paper.html Note: AVX implementation uses SHLD instruction to emulate RORQ, since it's faster on Intel Sandy-Bridge. However, on non-Intel CPUs SHLD is much slower than RORQ, so therefore AVX implementation is (for now) limited to Intel CPUs. Note: AVX2 implementation also uses BMI2 instruction rorx, thus additional HWF flag. Benchmarks: cpu C-lang SSSE3 AVX/AVX2 C vs AVX/AVX2 vs SSSE3 Intel i5-4570 13.86 c/B 10.27 c/B 8.70 c/B 1.59x 1.18x Intel i5-2450M 17.25 c/B 12.36 c/B 10.31 c/B 1.67x 1.19x Signed-off-by: Jussi Kivilinna --- LICENSES | 2 cipher/Makefile.am | 2 cipher/sha256-avx-amd64.S | 521 +++++++++++++++++++++++++ cipher/sha256-avx2-bmi2-amd64.S | 807 +++++++++++++++++++++++++++++++++++++++ cipher/sha256-ssse3-amd64.S | 11 - cipher/sha256.c | 68 +++ configure.ac | 2 7 files changed, 1405 insertions(+), 8 deletions(-) create mode 100644 cipher/sha256-avx-amd64.S create mode 100644 cipher/sha256-avx2-bmi2-amd64.S diff --git a/LICENSES b/LICENSES index 8594cfd..6c09e1f 100644 --- a/LICENSES +++ b/LICENSES @@ -12,6 +12,8 @@ with any binary distributions derived from the GNU C Library.
* BSD_3Clause For files: + - cipher/sha256-avx-amd64.S + - cipher/sha256-avx2-bmi2-amd64.S - cipher/sha256-ssse3-amd64.S - cipher/sha512-avx-amd64.S - cipher/sha512-avx2-bmi2-amd64.S diff --git a/cipher/Makefile.am b/cipher/Makefile.am index 3ec651f..575df38 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -78,7 +78,7 @@ scrypt.c \ seed.c \ serpent.c serpent-sse2-amd64.S serpent-avx2-amd64.S \ sha1.c sha1-ssse3-amd64.S sha1-avx-amd64.S sha1-avx-bmi2-amd64.S \ -sha256.c sha256-ssse3-amd64.S \ +sha256.c sha256-ssse3-amd64.S sha256-avx-amd64.S sha256-avx2-bmi2-amd64.S \ sha512.c sha512-ssse3-amd64.S sha512-avx-amd64.S sha512-avx2-bmi2-amd64.S \ sha512-armv7-neon.S \ stribog.c \ diff --git a/cipher/sha256-avx-amd64.S b/cipher/sha256-avx-amd64.S new file mode 100644 index 0000000..3912db7 --- /dev/null +++ b/cipher/sha256-avx-amd64.S @@ -0,0 +1,521 @@ +/* +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright (c) 2012, Intel Corporation +; +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are +; met: +; +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in the +; documentation and/or other materials provided with the +; distribution. +; +; * Neither the name of the Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived from +; this software without specific prior written permission. +; +; +; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY +; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +; PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL CORPORATION OR +; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; +; This code is described in an Intel White-Paper: +; "Fast SHA-256 Implementations on Intel Architecture Processors" +; +; To find it, surf to http://www.intel.com/p/en_US/embedded +; and search for that title. +; The paper is expected to be released roughly at the end of April, 2012 +; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; This code schedules 1 blocks at a time, with 4 lanes per block +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +*/ +/* + * Conversion to GAS assembly and integration to libgcrypt + * by Jussi Kivilinna + * + * Note: Based on the SSSE3 implementation. 
+ */ + +#ifdef __x86_64 +#include <config.h> +#if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \ + defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_AVX) && defined(USE_SHA256) + +#ifdef __PIC__ +# define ADD_RIP +rip +#else +# define ADD_RIP +#endif + +.intel_syntax noprefix + +#define VMOVDQ vmovdqu /* assume buffers not aligned */ + +.macro ROR p1 p2 + /* shld is faster than ror on Intel Sandybridge */ + shld \p1, \p1, (32 - \p2) +.endm + +/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros*/ + +/* addm [mem], reg + * Add reg to mem using reg-mem add and store */ +.macro addm p1 p2 + add \p2, \p1 + mov \p1, \p2 +.endm + +/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/ + +/* COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask + * Load xmm with mem and byte swap each dword */ +.macro COPY_XMM_AND_BSWAP p1 p2 p3 + VMOVDQ \p1, \p2 + vpshufb \p1, \p1, \p3 +.endm + +/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/ + +X0 = xmm4 +X1 = xmm5 +X2 = xmm6 +X3 = xmm7 + +XTMP0 = xmm0 +XTMP1 = xmm1 +XTMP2 = xmm2 +XTMP3 = xmm3 +XTMP4 = xmm8 +XFER = xmm9 + +SHUF_00BA = xmm10 /* shuffle xBxA -> 00BA */ +SHUF_DC00 = xmm11 /* shuffle xDxC -> DC00 */ +BYTE_FLIP_MASK = xmm12 + +NUM_BLKS = rdx /* 3rd arg */ +CTX = rsi /* 2nd arg */ +INP = rdi /* 1st arg */ + +SRND = rdi /* clobbers INP */ +c = ecx +d = r8d +e = edx + +TBL = rbp +a = eax +b = ebx + +f = r9d +g = r10d +h = r11d + +y0 = r13d +y1 = r14d +y2 = r15d + + + +#define _INP_END_SIZE 8 +#define _INP_SIZE 8 +#define _XFER_SIZE 8 +#define _XMM_SAVE_SIZE 0 +/* STACK_SIZE plus pushes must be an odd multiple of 8 */ +#define _ALIGN_SIZE 8 + +#define _INP_END 0 +#define _INP (_INP_END + _INP_END_SIZE) +#define _XFER (_INP + _INP_SIZE) +#define _XMM_SAVE (_XFER + _XFER_SIZE + _ALIGN_SIZE) +#define STACK_SIZE (_XMM_SAVE + _XMM_SAVE_SIZE) + +/* rotate_Xs + * Rotate values of symbols X0...X3 */ +.macro rotate_Xs +X_ = X0 +X0 = X1 +X1 = X2 +X2 = X3 +X3 = X_ +.endm + +/* ROTATE_ARGS + * Rotate values of symbols a...h */ +.macro ROTATE_ARGS +TMP_ = h +h = g
+g = f +f = e +e = d +d = c +c = b +b = a +a = TMP_ +.endm + +.macro FOUR_ROUNDS_AND_SCHED + /* compute s0 four at a time and s1 two at a time + * compute W[-16] + W[-7] 4 at a time */ + mov y0, e /* y0 = e */ + ROR y0, (25-11) /* y0 = e >> (25-11) */ + mov y1, a /* y1 = a */ + vpalignr XTMP0, X3, X2, 4 /* XTMP0 = W[-7] */ + ROR y1, (22-13) /* y1 = a >> (22-13) */ + xor y0, e /* y0 = e ^ (e >> (25-11)) */ + mov y2, f /* y2 = f */ + ROR y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */ + xor y1, a /* y1 = a ^ (a >> (22-13) */ + xor y2, g /* y2 = f^g */ + vpaddd XTMP0, XTMP0, X0 /* XTMP0 = W[-7] + W[-16] */ + xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */ + and y2, e /* y2 = (f^g)&e */ + ROR y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */ + /* compute s0 */ + vpalignr XTMP1, X1, X0, 4 /* XTMP1 = W[-15] */ + xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */ + ROR y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */ + xor y2, g /* y2 = CH = ((f^g)&e)^g */ + ROR y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */ + add y2, y0 /* y2 = S1 + CH */ + add y2, [rsp + _XFER + 0*4] /* y2 = k + w + S1 + CH */ + mov y0, a /* y0 = a */ + add h, y2 /* h = h + S1 + CH + k + w */ + mov y2, a /* y2 = a */ + vpslld XTMP2, XTMP1, (32-7) + or y0, c /* y0 = a|c */ + add d, h /* d = d + h + S1 + CH + k + w */ + and y2, c /* y2 = a&c */ + vpsrld XTMP3, XTMP1, 7 + and y0, b /* y0 = (a|c)&b */ + add h, y1 /* h = h + S1 + CH + k + w + S0 */ + vpor XTMP3, XTMP3, XTMP2 /* XTMP1 = W[-15] ror 7 */ + or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */ + lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */ + +ROTATE_ARGS + mov y0, e /* y0 = e */ + mov y1, a /* y1 = a */ + ROR y0, (25-11) /* y0 = e >> (25-11) */ + xor y0, e /* y0 = e ^ (e >> (25-11)) */ + mov y2, f /* y2 = f */ + ROR y1, (22-13) /* y1 = a >> (22-13) */ + vpslld XTMP2, XTMP1, (32-18) + xor y1, a /* y1 = a ^ (a >> (22-13) */ + ROR y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */ + xor y2, g /* y2 = f^g */ + vpsrld XTMP4, 
XTMP1, 18 + ROR y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */ + xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */ + and y2, e /* y2 = (f^g)&e */ + ROR y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */ + vpxor XTMP4, XTMP4, XTMP3 + xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */ + xor y2, g /* y2 = CH = ((f^g)&e)^g */ + vpsrld XTMP1, XTMP1, 3 /* XTMP4 = W[-15] >> 3 */ + add y2, y0 /* y2 = S1 + CH */ + add y2, [rsp + _XFER + 1*4] /* y2 = k + w + S1 + CH */ + ROR y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */ + vpxor XTMP1, XTMP1, XTMP2 /* XTMP1 = W[-15] ror 7 ^ W[-15] ror 18 */ + mov y0, a /* y0 = a */ + add h, y2 /* h = h + S1 + CH + k + w */ + mov y2, a /* y2 = a */ + vpxor XTMP1, XTMP1, XTMP4 /* XTMP1 = s0 */ + or y0, c /* y0 = a|c */ + add d, h /* d = d + h + S1 + CH + k + w */ + and y2, c /* y2 = a&c */ + /* compute low s1 */ + vpshufd XTMP2, X3, 0b11111010 /* XTMP2 = W[-2] {BBAA} */ + and y0, b /* y0 = (a|c)&b */ + add h, y1 /* h = h + S1 + CH + k + w + S0 */ + vpaddd XTMP0, XTMP0, XTMP1 /* XTMP0 = W[-16] + W[-7] + s0 */ + or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */ + lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */ + +ROTATE_ARGS + mov y0, e /* y0 = e */ + mov y1, a /* y1 = a */ + ROR y0, (25-11) /* y0 = e >> (25-11) */ + xor y0, e /* y0 = e ^ (e >> (25-11)) */ + ROR y1, (22-13) /* y1 = a >> (22-13) */ + mov y2, f /* y2 = f */ + xor y1, a /* y1 = a ^ (a >> (22-13) */ + ROR y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */ + vpsrlq XTMP3, XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xBxA} */ + xor y2, g /* y2 = f^g */ + vpsrlq XTMP4, XTMP2, 19 /* XTMP3 = W[-2] ror 19 {xBxA} */ + xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */ + and y2, e /* y2 = (f^g)&e */ + vpsrld XTMP2, XTMP2, 10 /* XTMP4 = W[-2] >> 10 {BBAA} */ + ROR y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */ + xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */ + xor y2, g /* y2 = CH = ((f^g)&e)^g */ + ROR y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */ + vpxor XTMP2, 
XTMP2, XTMP3 + add y2, y0 /* y2 = S1 + CH */ + ROR y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */ + add y2, [rsp + _XFER + 2*4] /* y2 = k + w + S1 + CH */ + vpxor XTMP4, XTMP4, XTMP2 /* XTMP4 = s1 {xBxA} */ + mov y0, a /* y0 = a */ + add h, y2 /* h = h + S1 + CH + k + w */ + mov y2, a /* y2 = a */ + vpshufb XTMP4, XTMP4, SHUF_00BA /* XTMP4 = s1 {00BA} */ + or y0, c /* y0 = a|c */ + add d, h /* d = d + h + S1 + CH + k + w */ + and y2, c /* y2 = a&c */ + vpaddd XTMP0, XTMP0, XTMP4 /* XTMP0 = {..., ..., W[1], W[0]} */ + and y0, b /* y0 = (a|c)&b */ + add h, y1 /* h = h + S1 + CH + k + w + S0 */ + /* compute high s1 */ + vpshufd XTMP2, XTMP0, 0b01010000 /* XTMP2 = W[-2] {DDCC} */ + or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */ + lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */ + +ROTATE_ARGS + mov y0, e /* y0 = e */ + ROR y0, (25-11) /* y0 = e >> (25-11) */ + mov y1, a /* y1 = a */ + ROR y1, (22-13) /* y1 = a >> (22-13) */ + xor y0, e /* y0 = e ^ (e >> (25-11)) */ + mov y2, f /* y2 = f */ + ROR y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */ + vpsrlq XTMP3, XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xDxC} */ + xor y1, a /* y1 = a ^ (a >> (22-13) */ + xor y2, g /* y2 = f^g */ + vpsrlq X0, XTMP2, 19 /* XTMP3 = W[-2] ror 19 {xDxC} */ + xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */ + and y2, e /* y2 = (f^g)&e */ + ROR y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */ + vpsrld XTMP2, XTMP2, 10 /* X0 = W[-2] >> 10 {DDCC} */ + xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */ + ROR y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */ + xor y2, g /* y2 = CH = ((f^g)&e)^g */ + vpxor XTMP2, XTMP2, XTMP3 + ROR y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */ + add y2, y0 /* y2 = S1 + CH */ + add y2, [rsp + _XFER + 3*4] /* y2 = k + w + S1 + CH */ + vpxor X0, X0, XTMP2 /* X0 = s1 {xDxC} */ + mov y0, a /* y0 = a */ + add h, y2 /* h = h + S1 + CH + k + w */ + mov y2, a /* y2 = a */ + vpshufb X0, X0, SHUF_DC00 /* X0 = s1 {DC00} */ + or y0, c /* y0 = a|c */ + add d, h /* 
d = d + h + S1 + CH + k + w */ + and y2, c /* y2 = a&c */ + vpaddd X0, X0, XTMP0 /* X0 = {W[3], W[2], W[1], W[0]} */ + and y0, b /* y0 = (a|c)&b */ + add h, y1 /* h = h + S1 + CH + k + w + S0 */ + or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */ + lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */ + +ROTATE_ARGS +rotate_Xs +.endm + +/* input is [rsp + _XFER + %1 * 4] */ +.macro DO_ROUND i1 + mov y0, e /* y0 = e */ + ROR y0, (25-11) /* y0 = e >> (25-11) */ + mov y1, a /* y1 = a */ + xor y0, e /* y0 = e ^ (e >> (25-11)) */ + ROR y1, (22-13) /* y1 = a >> (22-13) */ + mov y2, f /* y2 = f */ + xor y1, a /* y1 = a ^ (a >> (22-13) */ + ROR y0, (11-6) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */ + xor y2, g /* y2 = f^g */ + xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */ + ROR y1, (13-2) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */ + and y2, e /* y2 = (f^g)&e */ + xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */ + ROR y0, 6 /* y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) */ + xor y2, g /* y2 = CH = ((f^g)&e)^g */ + add y2, y0 /* y2 = S1 + CH */ + ROR y1, 2 /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */ + add y2, [rsp + _XFER + \i1 * 4] /* y2 = k + w + S1 + CH */ + mov y0, a /* y0 = a */ + add h, y2 /* h = h + S1 + CH + k + w */ + mov y2, a /* y2 = a */ + or y0, c /* y0 = a|c */ + add d, h /* d = d + h + S1 + CH + k + w */ + and y2, c /* y2 = a&c */ + and y0, b /* y0 = (a|c)&b */ + add h, y1 /* h = h + S1 + CH + k + w + S0 */ + or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */ + lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */ + ROTATE_ARGS +.endm + +/* +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; void sha256_avx(void *input_data, UINT32 digest[8], UINT64 num_blks) +;; arg 1 : pointer to input data +;; arg 2 : pointer to digest +;; arg 3 : Num blocks +*/ +.text +.globl _gcry_sha256_transform_amd64_avx +.type _gcry_sha256_transform_amd64_avx,@function; +.align 16
+_gcry_sha256_transform_amd64_avx: + vzeroupper + + push rbx + push rbp + push r13 + push r14 + push r15 + + sub rsp, STACK_SIZE + + shl NUM_BLKS, 6 /* convert to bytes */ + jz .Ldone_hash + add NUM_BLKS, INP /* pointer to end of data */ + mov [rsp + _INP_END], NUM_BLKS + + /* load initial digest */ + mov a,[4*0 + CTX] + mov b,[4*1 + CTX] + mov c,[4*2 + CTX] + mov d,[4*3 + CTX] + mov e,[4*4 + CTX] + mov f,[4*5 + CTX] + mov g,[4*6 + CTX] + mov h,[4*7 + CTX] + + vmovdqa BYTE_FLIP_MASK, [.LPSHUFFLE_BYTE_FLIP_MASK ADD_RIP] + vmovdqa SHUF_00BA, [.L_SHUF_00BA ADD_RIP] + vmovdqa SHUF_DC00, [.L_SHUF_DC00 ADD_RIP] + +.Loop0: + lea TBL, [.LK256 ADD_RIP] + + /* byte swap first 16 dwords */ + COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK + + mov [rsp + _INP], INP + + /* schedule 48 input dwords, by doing 3 rounds of 16 each */ + mov SRND, 3 +.align 16 +.Loop1: + vpaddd XFER, X0, [TBL + 0*16] + vmovdqa [rsp + _XFER], XFER + FOUR_ROUNDS_AND_SCHED + + vpaddd XFER, X0, [TBL + 1*16] + vmovdqa [rsp + _XFER], XFER + FOUR_ROUNDS_AND_SCHED + + vpaddd XFER, X0, [TBL + 2*16] + vmovdqa [rsp + _XFER], XFER + FOUR_ROUNDS_AND_SCHED + + vpaddd XFER, X0, [TBL + 3*16] + vmovdqa [rsp + _XFER], XFER + add TBL, 4*16 + FOUR_ROUNDS_AND_SCHED + + sub SRND, 1 + jne .Loop1 + + mov SRND, 2 +.Loop2: + vpaddd X0, X0, [TBL + 0*16] + vmovdqa [rsp + _XFER], X0 + DO_ROUND 0 + DO_ROUND 1 + DO_ROUND 2 + DO_ROUND 3 + vpaddd X1, X1, [TBL + 1*16] + vmovdqa [rsp + _XFER], X1 + add TBL, 2*16 + DO_ROUND 0 + DO_ROUND 1 + DO_ROUND 2 + DO_ROUND 3 + + vmovdqa X0, X2 + vmovdqa X1, X3 + + sub SRND, 1 + jne .Loop2 + + addm [4*0 + CTX],a + addm [4*1 + CTX],b + addm [4*2 + CTX],c + addm [4*3 + CTX],d + addm [4*4 + CTX],e + addm [4*5 + CTX],f + addm [4*6 + CTX],g + addm [4*7 + CTX],h + + mov INP, [rsp + _INP] + add INP, 64 + cmp INP, [rsp + _INP_END] + jne 
.Loop0 + + vzeroall + +.Ldone_hash: + add rsp, STACK_SIZE + + pop r15 + pop r14 + pop r13 + pop rbp + pop rbx + + mov eax, STACK_SIZE + 5*8 + + ret + + +.data +.align 16 +.LK256: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.LPSHUFFLE_BYTE_FLIP_MASK: .octa 0x0c0d0e0f08090a0b0405060700010203 + +/* shuffle xBxA -> 00BA */ +.L_SHUF_00BA: .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100 + +/* shuffle xDxC -> DC00 */ +.L_SHUF_DC00: .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF + +#endif +#endif diff --git a/cipher/sha256-avx2-bmi2-amd64.S b/cipher/sha256-avx2-bmi2-amd64.S new file mode 100644 index 0000000..09df711 --- /dev/null +++ b/cipher/sha256-avx2-bmi2-amd64.S @@ -0,0 +1,807 @@ +/* +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright (c) 2012, Intel Corporation +; +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are +; met: +; +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. 
+; +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in the +; documentation and/or other materials provided with the +; distribution. +; +; * Neither the name of the Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived from +; this software without specific prior written permission. +; +; +; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY +; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR +; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; +; This code is described in an Intel White-Paper: +; "Fast SHA-256 Implementations on Intel Architecture Processors" +; +; To find it, surf to http://www.intel.com/p/en_US/embedded +; and search for that title. 
+; The paper is expected to be released roughly at the end of April, 2012 +; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; This code schedules 2 blocks at a time, with 4 lanes per block +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +*/ +/* + * Conversion to GAS assembly and integration to libgcrypt + * by Jussi Kivilinna + */ + +#ifdef __x86_64 +#include <config.h> +#if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \ + defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_AVX2) && defined(HAVE_GCC_INLINE_ASM_BMI2) && \ + defined(USE_SHA256) + +#ifdef __PIC__ +# define ADD_RIP +rip +#else +# define ADD_RIP +#endif + +.intel_syntax noprefix + +#define VMOVDQ vmovdqu /* ; assume buffers not aligned */ + +/* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros */ + +/* addm [mem], reg */ +/* Add reg to mem using reg-mem add and store */ +.macro addm p1 p2 + add \p2, \p1 + mov \p1, \p2 +.endm + +/* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ + +X0 = ymm4 +X1 = ymm5 +X2 = ymm6 +X3 = ymm7 + +/* XMM versions of above */ +XWORD0 = xmm4 +XWORD1 = xmm5 +XWORD2 = xmm6 +XWORD3 = xmm7 + +XTMP0 = ymm0 +XTMP1 = ymm1 +XTMP2 = ymm2 +XTMP3 = ymm3 +XTMP4 = ymm8 +XFER = ymm9 +XTMP5 = ymm11 + +SHUF_00BA = ymm10 /* shuffle xBxA -> 00BA */ +SHUF_DC00 = ymm12 /* shuffle xDxC -> DC00 */ +BYTE_FLIP_MASK = ymm13 + +X_BYTE_FLIP_MASK = xmm13 /* XMM version of BYTE_FLIP_MASK */ + +NUM_BLKS = rdx /* 3rd arg */ +CTX = rsi /* 2nd arg */ +INP = rdi /* 1st arg */ +c = ecx +d = r8d +e = edx /* clobbers NUM_BLKS */ +y3 = edi /* clobbers INP */ + +TBL = rbp +SRND = CTX /* SRND is same register as CTX */ + +a = eax +b = ebx +f = r9d +g = r10d +h = r11d +old_h = r11d + +T1 = r12d +y0 = r13d +y1 = r14d +y2 = r15d + + +_XFER_SIZE = 2*64*4 /* 2 blocks, 64 rounds, 4 bytes/round */ +_XMM_SAVE_SIZE = 0 +_INP_END_SIZE = 8 +_INP_SIZE = 8 +_CTX_SIZE = 8 +_RSP_SIZE = 8 + +_XFER = 0 +_XMM_SAVE = _XFER + _XFER_SIZE +_INP_END = _XMM_SAVE + _XMM_SAVE_SIZE +_INP =
_INP_END + _INP_END_SIZE +_CTX = _INP + _INP_SIZE +_RSP = _CTX + _CTX_SIZE +STACK_SIZE = _RSP + _RSP_SIZE + +/* rotate_Xs */ +/* Rotate values of symbols X0...X3 */ +.macro rotate_Xs +X_ = X0 +X0 = X1 +X1 = X2 +X2 = X3 +X3 = X_ +.endm + +/* ROTATE_ARGS */ +/* Rotate values of symbols a...h */ +.macro ROTATE_ARGS +old_h = h +TMP_ = h +h = g +g = f +f = e +e = d +d = c +c = b +b = a +a = TMP_ +.endm + +.macro FOUR_ROUNDS_AND_SCHED XFER +/* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ + + mov y3, a /* y3 = a ; MAJA */ + rorx y0, e, 25 /* y0 = e >> 25 ; S1A */ + rorx y1, e, 11 /* y1 = e >> 11 ; S1B */ + + add h, [\XFER+0*4] /* h = k + w + h ; -- */ + or y3, c /* y3 = a|c ; MAJA */ + vpalignr XTMP0, X3, X2, 4 /* XTMP0 = W[-7] */ + mov y2, f /* y2 = f ; CH */ + rorx T1, a, 13 /* T1 = a >> 13 ; S0B */ + + xor y0, y1 /* y0 = (e>>25) ^ (e>>11) ; S1 */ + xor y2, g /* y2 = f^g ; CH */ + vpaddd XTMP0, XTMP0, X0 /* XTMP0 = W[-7] + W[-16]; y1 = (e >> 6) ; S1 */ + rorx y1, e, 6 /* y1 = (e >> 6) ; S1 */ + + and y2, e /* y2 = (f^g)&e ; CH */ + xor y0, y1 /* y0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1 */ + rorx y1, a, 22 /* y1 = a >> 22 ; S0A */ + add d, h /* d = k + w + h + d ; -- */ + + and y3, b /* y3 = (a|c)&b ; MAJA */ + vpalignr XTMP1, X1, X0, 4 /* XTMP1 = W[-15] */ + xor y1, T1 /* y1 = (a>>22) ^ (a>>13) ; S0 */ + rorx T1, a, 2 /* T1 = (a >> 2) ; S0 */ + + xor y2, g /* y2 = CH = ((f^g)&e)^g ; CH */ + vpsrld XTMP2, XTMP1, 7 + xor y1, T1 /* y1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0 */ + mov T1, a /* T1 = a ; MAJB */ + and T1, c /* T1 = a&c ; MAJB */ + + add y2, y0 /* y2 = S1 + CH ; -- */ + vpslld XTMP3, XTMP1, (32-7) + or y3, T1 /* y3 = MAJ = (a|c)&b)|(a&c) ; MAJ */ + add h, y1 /* h = k + w + h + S0 ; -- */ + + add d, y2 /* d = k + w + h + d + S1 + CH = d + t1 ; -- */ + vpor XTMP3, XTMP3, XTMP2 /* XTMP3 = W[-15] ror 7 */ + + vpsrld XTMP2, XTMP1,18 + add h, y2 /* h = k + w + h + S0 + S1 + CH = t1 + S0; -- */ + lea h, [h + y3] /* h = t1 + S0 + MAJ ; -- */ + + 
+ROTATE_ARGS + +/* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ + + + mov y3, a /* y3 = a ; MAJA */ + rorx y0, e, 25 /* y0 = e >> 25 ; S1A */ + rorx y1, e, 11 /* y1 = e >> 11 ; S1B */ + add h, [\XFER+1*4] /* h = k + w + h ; -- */ + or y3, c /* y3 = a|c ; MAJA */ + + + vpsrld XTMP4, XTMP1, 3 /* XTMP4 = W[-15] >> 3 */ + mov y2, f /* y2 = f ; CH */ + rorx T1, a, 13 /* T1 = a >> 13 ; S0B */ + xor y0, y1 /* y0 = (e>>25) ^ (e>>11) ; S1 */ + xor y2, g /* y2 = f^g ; CH */ + + + rorx y1, e, 6 /* y1 = (e >> 6) ; S1 */ + xor y0, y1 /* y0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1 */ + rorx y1, a, 22 /* y1 = a >> 22 ; S0A */ + and y2, e /* y2 = (f^g)&e ; CH */ + add d, h /* d = k + w + h + d ; -- */ + + vpslld XTMP1, XTMP1, (32-18) + and y3, b /* y3 = (a|c)&b ; MAJA */ + xor y1, T1 /* y1 = (a>>22) ^ (a>>13) ; S0 */ + + vpxor XTMP3, XTMP3, XTMP1 + rorx T1, a, 2 /* T1 = (a >> 2) ; S0 */ + xor y2, g /* y2 = CH = ((f^g)&e)^g ; CH */ + + vpxor XTMP3, XTMP3, XTMP2 /* XTMP3 = W[-15] ror 7 ^ W[-15] ror 18 */ + xor y1, T1 /* y1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0 */ + mov T1, a /* T1 = a ; MAJB */ + and T1, c /* T1 = a&c ; MAJB */ + add y2, y0 /* y2 = S1 + CH ; -- */ + + vpxor XTMP1, XTMP3, XTMP4 /* XTMP1 = s0 */ + vpshufd XTMP2, X3, 0b11111010 /* XTMP2 = W[-2] {BBAA} */ + or y3, T1 /* y3 = MAJ = (a|c)&b)|(a&c) ; MAJ */ + add h, y1 /* h = k + w + h + S0 ; -- */ + + vpaddd XTMP0, XTMP0, XTMP1 /* XTMP0 = W[-16] + W[-7] + s0 */ + add d, y2 /* d = k + w + h + d + S1 + CH = d + t1 ; -- */ + add h, y2 /* h = k + w + h + S0 + S1 + CH = t1 + S0; -- */ + lea h, [h + y3] /* h = t1 + S0 + MAJ ; -- */ + + vpsrld XTMP4, XTMP2, 10 /* XTMP4 = W[-2] >> 10 {BBAA} */ + + +ROTATE_ARGS + +/* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ + + mov y3, a /* y3 = a ; MAJA */ + rorx y0, e, 25 /* y0 = e >> 25 ; S1A */ + add h, [\XFER+2*4] /* h = k + w + h ; -- */ + + vpsrlq XTMP3, XTMP2, 19 /* XTMP3 = W[-2] ror 19 {xBxA} */ + rorx y1, e, 11 /* y1 = e >> 11 ; 
S1B */ + or y3, c /* y3 = a|c ; MAJA */ + mov y2, f /* y2 = f ; CH */ + xor y2, g /* y2 = f^g ; CH */ + + rorx T1, a, 13 /* T1 = a >> 13 ; S0B */ + xor y0, y1 /* y0 = (e>>25) ^ (e>>11) ; S1 */ + vpsrlq XTMP2, XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xBxA} */ + and y2, e /* y2 = (f^g)&e ; CH */ + + rorx y1, e, 6 /* y1 = (e >> 6) ; S1 */ + vpxor XTMP2, XTMP2, XTMP3 + add d, h /* d = k + w + h + d ; -- */ + and y3, b /* y3 = (a|c)&b ; MAJA */ + + xor y0, y1 /* y0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1 */ + rorx y1, a, 22 /* y1 = a >> 22 ; S0A */ + vpxor XTMP4, XTMP4, XTMP2 /* XTMP4 = s1 {xBxA} */ + xor y2, g /* y2 = CH = ((f^g)&e)^g ; CH */ + + vpshufb XTMP4, XTMP4, SHUF_00BA /* XTMP4 = s1 {00BA} */ + xor y1, T1 /* y1 = (a>>22) ^ (a>>13) ; S0 */ + rorx T1, a, 2 /* T1 = (a >> 2) ; S0 */ + vpaddd XTMP0, XTMP0, XTMP4 /* XTMP0 = {..., ..., W[1], W[0]} */ + + xor y1, T1 /* y1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0 */ + mov T1, a /* T1 = a ; MAJB */ + and T1, c /* T1 = a&c ; MAJB */ + add y2, y0 /* y2 = S1 + CH ; -- */ + vpshufd XTMP2, XTMP0, 0b1010000 /* XTMP2 = W[-2] {DDCC} */ + + or y3, T1 /* y3 = MAJ = (a|c)&b)|(a&c) ; MAJ */ + add h, y1 /* h = k + w + h + S0 ; -- */ + add d, y2 /* d = k + w + h + d + S1 + CH = d + t1 ; -- */ + add h, y2 /* h = k + w + h + S0 + S1 + CH = t1 + S0; -- */ + + lea h, [h + y3] /* h = t1 + S0 + MAJ ; -- */ + + +ROTATE_ARGS + +/* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ + + mov y3, a /* y3 = a ; MAJA */ + rorx y0, e, 25 /* y0 = e >> 25 ; S1A */ + rorx y1, e, 11 /* y1 = e >> 11 ; S1B */ + add h, [\XFER+3*4] /* h = k + w + h ; -- */ + or y3, c /* y3 = a|c ; MAJA */ + + + vpsrld XTMP5, XTMP2, 10 /* XTMP5 = W[-2] >> 10 {DDCC} */ + mov y2, f /* y2 = f ; CH */ + rorx T1, a, 13 /* T1 = a >> 13 ; S0B */ + xor y0, y1 /* y0 = (e>>25) ^ (e>>11) ; S1 */ + xor y2, g /* y2 = f^g ; CH */ + + + vpsrlq XTMP3, XTMP2, 19 /* XTMP3 = W[-2] ror 19 {xDxC} */ + rorx y1, e, 6 /* y1 = (e >> 6) ; S1 */ + and y2, e /* y2 = (f^g)&e ; CH */ + add d, h 
/* d = k + w + h + d ; -- */ + and y3, b /* y3 = (a|c)&b ; MAJA */ + + vpsrlq XTMP2, XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xDxC} */ + xor y0, y1 /* y0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1 */ + xor y2, g /* y2 = CH = ((f^g)&e)^g ; CH */ + + vpxor XTMP2, XTMP2, XTMP3 + rorx y1, a, 22 /* y1 = a >> 22 ; S0A */ + add y2, y0 /* y2 = S1 + CH ; -- */ + + vpxor XTMP5, XTMP5, XTMP2 /* XTMP5 = s1 {xDxC} */ + xor y1, T1 /* y1 = (a>>22) ^ (a>>13) ; S0 */ + add d, y2 /* d = k + w + h + d + S1 + CH = d + t1 ; -- */ + + rorx T1, a, 2 /* T1 = (a >> 2) ; S0 */ + vpshufb XTMP5, XTMP5, SHUF_DC00 /* XTMP5 = s1 {DC00} */ + + vpaddd X0, XTMP5, XTMP0 /* X0 = {W[3], W[2], W[1], W[0]} */ + xor y1, T1 /* y1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0 */ + mov T1, a /* T1 = a ; MAJB */ + and T1, c /* T1 = a&c ; MAJB */ + or y3, T1 /* y3 = MAJ = (a|c)&b)|(a&c) ; MAJ */ + + add h, y1 /* h = k + w + h + S0 ; -- */ + add h, y2 /* h = k + w + h + S0 + S1 + CH = t1 + S0; -- */ + lea h, [h + y3] /* h = t1 + S0 + MAJ ; -- */ + +ROTATE_ARGS +rotate_Xs +.endm + +.macro DO_4ROUNDS XFER +/* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;; */ + + mov y2, f /* y2 = f ; CH */ + rorx y0, e, 25 /* y0 = e >> 25 ; S1A */ + rorx y1, e, 11 /* y1 = e >> 11 ; S1B */ + xor y2, g /* y2 = f^g ; CH */ + + xor y0, y1 /* y0 = (e>>25) ^ (e>>11) ; S1 */ + rorx y1, e, 6 /* y1 = (e >> 6) ; S1 */ + and y2, e /* y2 = (f^g)&e ; CH */ + + xor y0, y1 /* y0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1 */ + rorx T1, a, 13 /* T1 = a >> 13 ; S0B */ + xor y2, g /* y2 = CH = ((f^g)&e)^g ; CH */ + rorx y1, a, 22 /* y1 = a >> 22 ; S0A */ + mov y3, a /* y3 = a ; MAJA */ + + xor y1, T1 /* y1 = (a>>22) ^ (a>>13) ; S0 */ + rorx T1, a, 2 /* T1 = (a >> 2) ; S0 */ + add h, [\XFER + 4*0] /* h = k + w + h ; -- */ + or y3, c /* y3 = a|c ; MAJA */ + + xor y1, T1 /* y1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0 */ + mov T1, a /* T1 = a ; MAJB */ + and y3, b /* y3 = (a|c)&b ; MAJA */ + and T1, c /* T1 = a&c ; MAJB */ + add y2, y0 /* y2 = S1 + CH ; -- */ + + + 
add d, h /* d = k + w + h + d ; -- */ + or y3, T1 /* y3 = MAJ = (a|c)&b)|(a&c) ; MAJ */ + add h, y1 /* h = k + w + h + S0 ; -- */ + + add d, y2 /* d = k + w + h + d + S1 + CH = d + t1 ; -- */ + + + /* add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0; -- */ + + /* lea h, [h + y3] ; h = t1 + S0 + MAJ ; -- */ + + ROTATE_ARGS + +/* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;; */ + + add old_h, y2 /* h = k + w + h + S0 + S1 + CH = t1 + S0; -- */ + mov y2, f /* y2 = f ; CH */ + rorx y0, e, 25 /* y0 = e >> 25 ; S1A */ + rorx y1, e, 11 /* y1 = e >> 11 ; S1B */ + xor y2, g /* y2 = f^g ; CH */ + + xor y0, y1 /* y0 = (e>>25) ^ (e>>11) ; S1 */ + rorx y1, e, 6 /* y1 = (e >> 6) ; S1 */ + and y2, e /* y2 = (f^g)&e ; CH */ + add old_h, y3 /* h = t1 + S0 + MAJ ; -- */ + + xor y0, y1 /* y0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1 */ + rorx T1, a, 13 /* T1 = a >> 13 ; S0B */ + xor y2, g /* y2 = CH = ((f^g)&e)^g ; CH */ + rorx y1, a, 22 /* y1 = a >> 22 ; S0A */ + mov y3, a /* y3 = a ; MAJA */ + + xor y1, T1 /* y1 = (a>>22) ^ (a>>13) ; S0 */ + rorx T1, a, 2 /* T1 = (a >> 2) ; S0 */ + add h, [\XFER + 4*1] /* h = k + w + h ; -- */ + or y3, c /* y3 = a|c ; MAJA */ + + xor y1, T1 /* y1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0 */ + mov T1, a /* T1 = a ; MAJB */ + and y3, b /* y3 = (a|c)&b ; MAJA */ + and T1, c /* T1 = a&c ; MAJB */ + add y2, y0 /* y2 = S1 + CH ; -- */ + + + add d, h /* d = k + w + h + d ; -- */ + or y3, T1 /* y3 = MAJ = (a|c)&b)|(a&c) ; MAJ */ + add h, y1 /* h = k + w + h + S0 ; -- */ + + add d, y2 /* d = k + w + h + d + S1 + CH = d + t1 ; -- */ + + + /* add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0; -- */ + + /* lea h, [h + y3] ; h = t1 + S0 + MAJ ; -- */ + + ROTATE_ARGS + +/* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ + + add old_h, y2 /* h = k + w + h + S0 + S1 + CH = t1 + S0; -- */ + mov y2, f /* y2 = f ; CH */ + rorx y0, e, 25 /* y0 = e >> 25 ; S1A */ + rorx y1, e, 11 /* y1 = e >> 11 ; S1B */ + xor y2, g /* y2 = 
f^g ; CH */ + + xor y0, y1 /* y0 = (e>>25) ^ (e>>11) ; S1 */ + rorx y1, e, 6 /* y1 = (e >> 6) ; S1 */ + and y2, e /* y2 = (f^g)&e ; CH */ + add old_h, y3 /* h = t1 + S0 + MAJ ; -- */ + + xor y0, y1 /* y0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1 */ + rorx T1, a, 13 /* T1 = a >> 13 ; S0B */ + xor y2, g /* y2 = CH = ((f^g)&e)^g ; CH */ + rorx y1, a, 22 /* y1 = a >> 22 ; S0A */ + mov y3, a /* y3 = a ; MAJA */ + + xor y1, T1 /* y1 = (a>>22) ^ (a>>13) ; S0 */ + rorx T1, a, 2 /* T1 = (a >> 2) ; S0 */ + add h, [\XFER + 4*2] /* h = k + w + h ; -- */ + or y3, c /* y3 = a|c ; MAJA */ + + xor y1, T1 /* y1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0 */ + mov T1, a /* T1 = a ; MAJB */ + and y3, b /* y3 = (a|c)&b ; MAJA */ + and T1, c /* T1 = a&c ; MAJB */ + add y2, y0 /* y2 = S1 + CH ; -- */ + + + add d, h /* d = k + w + h + d ; -- */ + or y3, T1 /* y3 = MAJ = (a|c)&b)|(a&c) ; MAJ */ + add h, y1 /* h = k + w + h + S0 ; -- */ + + add d, y2 /* d = k + w + h + d + S1 + CH = d + t1 ; -- */ + + + /* add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0; -- */ + + /* lea h, [h + y3] ; h = t1 + S0 + MAJ ; -- */ + + ROTATE_ARGS + +/* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;; */ + + add old_h, y2 /* h = k + w + h + S0 + S1 + CH = t1 + S0; -- */ + mov y2, f /* y2 = f ; CH */ + rorx y0, e, 25 /* y0 = e >> 25 ; S1A */ + rorx y1, e, 11 /* y1 = e >> 11 ; S1B */ + xor y2, g /* y2 = f^g ; CH */ + + xor y0, y1 /* y0 = (e>>25) ^ (e>>11) ; S1 */ + rorx y1, e, 6 /* y1 = (e >> 6) ; S1 */ + and y2, e /* y2 = (f^g)&e ; CH */ + add old_h, y3 /* h = t1 + S0 + MAJ ; -- */ + + xor y0, y1 /* y0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1 */ + rorx T1, a, 13 /* T1 = a >> 13 ; S0B */ + xor y2, g /* y2 = CH = ((f^g)&e)^g ; CH */ + rorx y1, a, 22 /* y1 = a >> 22 ; S0A */ + mov y3, a /* y3 = a ; MAJA */ + + xor y1, T1 /* y1 = (a>>22) ^ (a>>13) ; S0 */ + rorx T1, a, 2 /* T1 = (a >> 2) ; S0 */ + add h, [\XFER + 4*3] /* h = k + w + h ; -- */ + or y3, c /* y3 = a|c ; MAJA */ + + xor y1, T1 /* y1 = (a>>22) ^ 
(a>>13) ^ (a>>2) ; S0 */ + mov T1, a /* T1 = a ; MAJB */ + and y3, b /* y3 = (a|c)&b ; MAJA */ + and T1, c /* T1 = a&c ; MAJB */ + add y2, y0 /* y2 = S1 + CH ; -- */ + + + add d, h /* d = k + w + h + d ; -- */ + or y3, T1 /* y3 = MAJ = (a|c)&b)|(a&c) ; MAJ */ + add h, y1 /* h = k + w + h + S0 ; -- */ + + add d, y2 /* d = k + w + h + d + S1 + CH = d + t1 ; -- */ + + + add h, y2 /* h = k + w + h + S0 + S1 + CH = t1 + S0; -- */ + + lea h, [h + y3] /* h = t1 + S0 + MAJ ; -- */ + + ROTATE_ARGS +.endm + +/* +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; void sha256_rorx(void *input_data, UINT32 digest[8], UINT64 num_blks) +;; arg 1 : pointer to input data +;; arg 2 : pointer to digest +;; arg 3 : Num blocks +*/ +.text +.globl _gcry_sha256_transform_amd64_avx2 +.type _gcry_sha256_transform_amd64_avx2,@function +.align 32 +_gcry_sha256_transform_amd64_avx2: + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + + vzeroupper + + mov rax, rsp + sub rsp, STACK_SIZE + and rsp, -32 + mov [rsp + _RSP], rax + + shl NUM_BLKS, 6 /* convert to bytes */ + jz .Ldone_hash + lea NUM_BLKS, [NUM_BLKS + INP - 64] /* pointer to last block */ + mov [rsp + _INP_END], NUM_BLKS + + cmp INP, NUM_BLKS + je .Lonly_one_block + + /* ; load initial digest */ + mov a,[4*0 + CTX] + mov b,[4*1 + CTX] + mov c,[4*2 + CTX] + mov d,[4*3 + CTX] + mov e,[4*4 + CTX] + mov f,[4*5 + CTX] + mov g,[4*6 + CTX] + mov h,[4*7 + CTX] + + vmovdqa BYTE_FLIP_MASK, [.LPSHUFFLE_BYTE_FLIP_MASK ADD_RIP] + vmovdqa SHUF_00BA, [.L_SHUF_00BA ADD_RIP] + vmovdqa SHUF_DC00, [.L_SHUF_DC00 ADD_RIP] + + mov [rsp + _CTX], CTX + +.Loop0: + lea TBL, [.LK256 ADD_RIP] + + /* ; Load first 16 dwords from two blocks */ + VMOVDQ XTMP0, [INP + 0*32] + VMOVDQ XTMP1, [INP + 1*32] + VMOVDQ XTMP2, [INP + 2*32] + VMOVDQ XTMP3, [INP + 3*32] + + /* ; byte swap data */ + vpshufb XTMP0, XTMP0, BYTE_FLIP_MASK + vpshufb XTMP1, XTMP1,
BYTE_FLIP_MASK + vpshufb XTMP2, XTMP2, BYTE_FLIP_MASK + vpshufb XTMP3, XTMP3, BYTE_FLIP_MASK + + /* ; transpose data into high/low halves */ + vperm2i128 X0, XTMP0, XTMP2, 0x20 + vperm2i128 X1, XTMP0, XTMP2, 0x31 + vperm2i128 X2, XTMP1, XTMP3, 0x20 + vperm2i128 X3, XTMP1, XTMP3, 0x31 + +.Last_block_enter: + add INP, 64 + mov [rsp + _INP], INP + + /* ; schedule 48 input dwords, by doing 3 rounds of 16 each */ + xor SRND, SRND + +.align 16 +.Loop1: + vpaddd XFER, X0, [TBL + SRND + 0*32] + vmovdqa [rsp + _XFER + SRND + 0*32], XFER + FOUR_ROUNDS_AND_SCHED rsp + _XFER + SRND + 0*32 + + vpaddd XFER, X0, [TBL + SRND + 1*32] + vmovdqa [rsp + _XFER + SRND + 1*32], XFER + FOUR_ROUNDS_AND_SCHED rsp + _XFER + SRND + 1*32 + + vpaddd XFER, X0, [TBL + SRND + 2*32] + vmovdqa [rsp + _XFER + SRND + 2*32], XFER + FOUR_ROUNDS_AND_SCHED rsp + _XFER + SRND + 2*32 + + vpaddd XFER, X0, [TBL + SRND + 3*32] + vmovdqa [rsp + _XFER + SRND + 3*32], XFER + FOUR_ROUNDS_AND_SCHED rsp + _XFER + SRND + 3*32 + + add SRND, 4*32 + cmp SRND, 3 * 4*32 + jb .Loop1 + +.Loop2: + /* ; Do last 16 rounds with no scheduling */ + vpaddd XFER, X0, [TBL + SRND + 0*32] + vmovdqa [rsp + _XFER + SRND + 0*32], XFER + DO_4ROUNDS rsp + _XFER + SRND + 0*32 + vpaddd XFER, X1, [TBL + SRND + 1*32] + vmovdqa [rsp + _XFER + SRND + 1*32], XFER + DO_4ROUNDS rsp + _XFER + SRND + 1*32 + add SRND, 2*32 + + vmovdqa X0, X2 + vmovdqa X1, X3 + + cmp SRND, 4 * 4*32 + jb .Loop2 + + mov CTX, [rsp + _CTX] + mov INP, [rsp + _INP] + + addm [4*0 + CTX],a + addm [4*1 + CTX],b + addm [4*2 + CTX],c + addm [4*3 + CTX],d + addm [4*4 + CTX],e + addm [4*5 + CTX],f + addm [4*6 + CTX],g + addm [4*7 + CTX],h + + cmp INP, [rsp + _INP_END] + ja .Ldone_hash + + /* ;;; Do second block using previously scheduled results */ + xor SRND, SRND +.align 16 +.Loop3: + DO_4ROUNDS rsp + _XFER + SRND + 0*32 + 16 + DO_4ROUNDS rsp + _XFER + SRND + 1*32 + 16 + add SRND, 2*32 + cmp SRND, 4 * 4*32 + jb .Loop3 + + mov CTX, [rsp + _CTX] + mov INP, [rsp + _INP] + add INP,
64 + + addm [4*0 + CTX],a + addm [4*1 + CTX],b + addm [4*2 + CTX],c + addm [4*3 + CTX],d + addm [4*4 + CTX],e + addm [4*5 + CTX],f + addm [4*6 + CTX],g + addm [4*7 + CTX],h + + cmp INP, [rsp + _INP_END] + jb .Loop0 + ja .Ldone_hash + +.Ldo_last_block: + /* ;;; do last block */ + lea TBL, [.LK256 ADD_RIP] + + VMOVDQ XWORD0, [INP + 0*16] + VMOVDQ XWORD1, [INP + 1*16] + VMOVDQ XWORD2, [INP + 2*16] + VMOVDQ XWORD3, [INP + 3*16] + + vpshufb XWORD0, XWORD0, X_BYTE_FLIP_MASK + vpshufb XWORD1, XWORD1, X_BYTE_FLIP_MASK + vpshufb XWORD2, XWORD2, X_BYTE_FLIP_MASK + vpshufb XWORD3, XWORD3, X_BYTE_FLIP_MASK + + jmp .Last_block_enter + +.Lonly_one_block: + + /* ; load initial digest */ + mov a,[4*0 + CTX] + mov b,[4*1 + CTX] + mov c,[4*2 + CTX] + mov d,[4*3 + CTX] + mov e,[4*4 + CTX] + mov f,[4*5 + CTX] + mov g,[4*6 + CTX] + mov h,[4*7 + CTX] + + vmovdqa BYTE_FLIP_MASK, [.LPSHUFFLE_BYTE_FLIP_MASK ADD_RIP] + vmovdqa SHUF_00BA, [.L_SHUF_00BA ADD_RIP] + vmovdqa SHUF_DC00, [.L_SHUF_DC00 ADD_RIP] + + mov [rsp + _CTX], CTX + jmp .Ldo_last_block + +.Ldone_hash: + mov rsp, [rsp + _RSP] + + vzeroall + + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + + /* stack burn depth */ + mov eax, STACK_SIZE + 6*8 + 31 + + ret + +.data +.align 64 +.LK256: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 
0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.LPSHUFFLE_BYTE_FLIP_MASK: + .octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203 + +/* shuffle xBxA -> 00BA */ +.L_SHUF_00BA: + .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100 + +/* shuffle xDxC -> DC00 */ +.L_SHUF_DC00: + .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF + +#endif +#endif diff --git a/cipher/sha256-ssse3-amd64.S b/cipher/sha256-ssse3-amd64.S index 9b27f8f..80b1cec 100644 --- a/cipher/sha256-ssse3-amd64.S +++ b/cipher/sha256-ssse3-amd64.S @@ -206,7 +206,7 @@ a = TMP_ add h, y1 /* h = h + S1 + CH + k + w + S0 */ por XTMP1, XTMP2 /* XTMP1 = W[-15] ror 7 */ or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */ - add h, y0 /* h = h + S1 + CH + k + w + S0 + MAJ */ + lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */ ROTATE_ARGS movdqa XTMP2, XTMP3 /* XTMP2 = W[-15] */ @@ -247,7 +247,7 @@ ROTATE_ARGS add h, y1 /* h = h + S1 + CH + k + w + S0 */ paddd XTMP0, XTMP1 /* XTMP0 = W[-16] + W[-7] + s0 */ or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */ - add h, y0 /* h = h + S1 + CH + 
k + w + S0 + MAJ */ + lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */ ROTATE_ARGS movdqa XTMP3, XTMP2 /* XTMP3 = W[-2] {BBAA} */ @@ -288,7 +288,7 @@ ROTATE_ARGS /* compute high s1 */ pshufd XTMP2, XTMP0, 0b01010000 /* XTMP2 = W[-2] {DDCC} */ or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */ - add h, y0 /* h = h + S1 + CH + k + w + S0 + MAJ */ + lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */ ROTATE_ARGS movdqa XTMP3, XTMP2 /* XTMP3 = W[-2] {DDCC} */ @@ -327,7 +327,7 @@ ROTATE_ARGS and y0, b /* y0 = (a|c)&b */ add h, y1 /* h = h + S1 + CH + k + w + S0 */ or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */ - add h, y0 /* h = h + S1 + CH + k + w + S0 + MAJ */ + lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */ ROTATE_ARGS rotate_Xs @@ -362,7 +362,7 @@ rotate_Xs and y0, b /* y0 = (a|c)&b */ add h, y1 /* h = h + S1 + CH + k + w + S0 */ or y0, y2 /* y0 = MAJ = (a|c)&b)|(a&c) */ - add h, y0 /* h = h + S1 + CH + k + w + S0 + MAJ */ + lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */ ROTATE_ARGS .endm @@ -505,6 +505,7 @@ _gcry_sha256_transform_amd64_ssse3: pop rbx mov eax, STACK_SIZE + 5*8 + ret diff --git a/cipher/sha256.c b/cipher/sha256.c index c2045b8..601e9c0 100644 --- a/cipher/sha256.c +++ b/cipher/sha256.c @@ -55,6 +55,22 @@ # define USE_SSSE3 1 #endif +/* USE_AVX indicates whether to compile with Intel AVX code. */ +#undef USE_AVX +#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_AVX) && \ + defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) +# define USE_AVX 1 +#endif + +/* USE_AVX2 indicates whether to compile with Intel AVX2/BMI2 code. 
*/ +#undef USE_AVX2 +#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_AVX2) && defined(HAVE_GCC_INLINE_ASM_BMI2) && \ + defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) +# define USE_AVX2 1 +#endif + typedef struct { gcry_md_block_ctx_t bctx; @@ -62,6 +78,12 @@ typedef struct { #ifdef USE_SSSE3 unsigned int use_ssse3:1; #endif +#ifdef USE_AVX + unsigned int use_avx:1; +#endif +#ifdef USE_AVX2 + unsigned int use_avx2:1; +#endif } SHA256_CONTEXT; @@ -73,6 +95,7 @@ static void sha256_init (void *context) { SHA256_CONTEXT *hd = context; + unsigned int features = _gcry_get_hw_features (); hd->h0 = 0x6a09e667; hd->h1 = 0xbb67ae85; @@ -90,8 +113,17 @@ sha256_init (void *context) hd->bctx.bwrite = transform; #ifdef USE_SSSE3 - hd->use_ssse3 = (_gcry_get_hw_features () & HWF_INTEL_SSSE3) != 0; + hd->use_ssse3 = (features & HWF_INTEL_SSSE3) != 0; +#endif +#ifdef USE_AVX + /* AVX implementation uses SHLD which is known to be slow on non-Intel CPUs. + * Therefore use this implementation on Intel CPUs only. */ + hd->use_avx = (features & HWF_INTEL_AVX) && (features & HWF_INTEL_CPU); +#endif +#ifdef USE_AVX2 + hd->use_avx2 = (features & HWF_INTEL_AVX2) && (features & HWF_INTEL_BMI2); #endif + (void)features; } @@ -99,6 +131,7 @@ static void sha224_init (void *context) { SHA256_CONTEXT *hd = context; + unsigned int features = _gcry_get_hw_features (); hd->h0 = 0xc1059ed8; hd->h1 = 0x367cd507; @@ -116,8 +149,17 @@ sha224_init (void *context) hd->bctx.bwrite = transform; #ifdef USE_SSSE3 - hd->use_ssse3 = (_gcry_get_hw_features () & HWF_INTEL_SSSE3) != 0; + hd->use_ssse3 = (features & HWF_INTEL_SSSE3) != 0; #endif +#ifdef USE_AVX + /* AVX implementation uses SHLD which is known to be slow on non-Intel CPUs. + * Therefore use this implementation on Intel CPUs only. 
*/ + hd->use_avx = (features & HWF_INTEL_AVX) && (features & HWF_INTEL_CPU); +#endif +#ifdef USE_AVX2 + hd->use_avx2 = (features & HWF_INTEL_AVX2) && (features & HWF_INTEL_BMI2); +#endif + (void)features; } @@ -281,6 +323,16 @@ unsigned int _gcry_sha256_transform_amd64_ssse3(const void *input_data, u32 state[8], size_t num_blks); #endif +#ifdef USE_AVX +unsigned int _gcry_sha256_transform_amd64_avx(const void *input_data, + u32 state[8], size_t num_blks); +#endif + +#ifdef USE_AVX2 +unsigned int _gcry_sha256_transform_amd64_avx2(const void *input_data, + u32 state[8], size_t num_blks); +#endif + static unsigned int transform (void *ctx, const unsigned char *data, size_t nblks) @@ -288,6 +340,18 @@ transform (void *ctx, const unsigned char *data, size_t nblks) SHA256_CONTEXT *hd = ctx; unsigned int burn; +#ifdef USE_AVX2 + if (hd->use_avx2) + return _gcry_sha256_transform_amd64_avx2 (data, &hd->h0, nblks) + + 4 * sizeof(void*); +#endif + +#ifdef USE_AVX + if (hd->use_avx) + return _gcry_sha256_transform_amd64_avx (data, &hd->h0, nblks) + + 4 * sizeof(void*); +#endif + #ifdef USE_SSSE3 if (hd->use_ssse3) return _gcry_sha256_transform_amd64_ssse3 (data, &hd->h0, nblks) diff --git a/configure.ac b/configure.ac index 231e3d3..2f24863 100644 --- a/configure.ac +++ b/configure.ac @@ -1742,6 +1742,8 @@ if test "$found" = "1" ; then x86_64-*-*) # Build with the assembly implementation GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha256-ssse3-amd64.lo" + GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha256-avx-amd64.lo" + GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha256-avx2-bmi2-amd64.lo" ;; esac fi From outer at interlog.com Wed Dec 18 17:15:39 2013 From: outer at interlog.com (Richard Outerbridge) Date: Wed, 18 Dec 2013 11:15:39 -0500 Subject: Syntax bug in libgcrypt 1.6.0 src/fips.c Message-ID: In make check, the basic test throws ?_gcry_USE_THE_UNDERSCORED_FUNCTION? error at fips.c line 605 (from visibility.h line 308). 
fips.c 605 old: fname = gcry_malloc (strlen (info.dli_fname) + 1 + 5 + 1 ); new: fname = _gcry_malloc (strlen (info.dli_fname) + 1 + 5 + 1 ); MacBookPro9,1; Mavericks OS X 10.9.1 (13B42) $ gcc --version Configured with: --prefix=/Applications/Xcode.app/Contents/Developer/usr --with-gxx-include-dir=/usr/include/c++/4.2.1 Apple LLVM version 5.0 (clang-500.2.79) (based on LLVM 3.3svn) Target: x86_64-apple-darwin13.0.0 Thread model: posix __outer From dbaryshkov at gmail.com Thu Dec 19 00:06:06 2013 From: dbaryshkov at gmail.com (Dmitry Eremin-Solenikov) Date: Thu, 19 Dec 2013 03:06:06 +0400 Subject: [PATCH] Truncate hash values for ECDSA signature scheme In-Reply-To: <87wqj3bn2s.fsf@vigenere.g10code.de> References: <1387211686-19661-1-git-send-email-dbaryshkov@gmail.com> <87y53kg241.fsf@vigenere.g10code.de> <1387230490.32327.6.camel@aspire.lan> <1387265855.1948.4.camel@aspire.lan> <87zjnzdflx.fsf@vigenere.g10code.de> <87wqj3bn2s.fsf@vigenere.g10code.de> Message-ID: Hello, On Tue, Dec 17, 2013 at 5:53 PM, Werner Koch wrote: > On Tue, 17 Dec 2013 12:37, dbaryshkov at gmail.com said: > >> if (mpi_is_opaque(input)) >> { >> abuf = mpi_get_opaque (input, &abits); >> rc = _gcry_mpi_scan (&hash, GCRYMPI_FMT_USG, abuf, (abits+7)/8, NULL); >> } >> else >> { >> hash = mpi_copy(input) >> abits = mpi_get_nbits(input); >> } >> if (abits > qbits) >> mpi_rshift (hash, hash, abits - qbits); >> >> This would be more correct, isn't it? > > Yes. I was wrong, this code is incorrect. I think I now understand how to handle different cases wrt. hash truncation. I will post new patch iteration after the Weekend.
-- With best wishes Dmitry From wk at gnupg.org Thu Dec 19 00:30:00 2013 From: wk at gnupg.org (Werner Koch) Date: Thu, 19 Dec 2013 00:30:00 +0100 Subject: Syntax bug in libgcrypt 1.6.0 src/fips.c In-Reply-To: (Richard Outerbridge's message of "Wed, 18 Dec 2013 11:15:39 -0500") References: Message-ID: <8738lp4u13.fsf@vigenere.g10code.de> On Wed, 18 Dec 2013 17:15, outer at interlog.com said: > In make check, the basic test throws '_gcry_USE_THE_UNDERSCORED_FUNCTION' error at fips.c line 605 (from visibility.h line 308). You caught me. I did not test the configure option --enable-hmac-binary-check. Why are you using it - that thing is anyway not fips validated ;-). Thanks for noting. Will be fixed in the next release. Replace gcry_malloc by xtrymalloc. Salam-Shalom, Werner -- Die Gedanken sind frei. Ausnahmen regelt ein Bundesgesetz. From mvyskocil at suse.cz Thu Dec 19 11:42:57 2013 From: mvyskocil at suse.cz (Michal Vyskocil) Date: Thu, 19 Dec 2013 11:42:57 +0100 Subject: [PATCH] Use internall malloc in fips.c Message-ID: <20131219104253.GA19714@linux-xtv2.site> Hi, I've found this minor issue in 1.6.0 - gcry_malloc is not available in fips.c, so an internal one is used instead. Regards Michal Vyskocil -------------- next part -------------- A non-text attachment was scrubbed... Name: libgcrypt-1.6.0-use-intenal-functions.patch Type: text/x-patch Size: 673 bytes Desc: not available URL: -------------- next part -------------- A non-text attachment was scrubbed... Name: signature.asc Type: application/pgp-signature Size: 198 bytes Desc: Digital signature URL: From alanh at fairlite.co.uk Thu Dec 19 16:57:06 2013 From: alanh at fairlite.co.uk (Alan) Date: Thu, 19 Dec 2013 15:57:06 +0000 Subject: [PATCH] Just use m68k for m68k-atari-mint platform Message-ID: <52B31752.4010304@fairlite.co.uk> A small patch to limit the atari build to m68k only. Thanks, Alan -------------- next part -------------- A non-text attachment was scrubbed...
Name: libgcrypt-1.5.3-mint.patch Type: text/x-patch Size: 362 bytes Desc: not available URL: From jussi.kivilinna at iki.fi Fri Dec 20 15:05:45 2013 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Fri, 20 Dec 2013 16:05:45 +0200 Subject: [PATCH] Add AMD64 assembly implementation for arcfour Message-ID: <20131220140545.1830.39389.stgit@localhost6.localdomain6> * cipher/Makefile.am: Add 'arcfour-amd64.S'. * cipher/arcfour-amd64.S: New. * cipher/arcfour.c (USE_AMD64_ASM): New. [USE_AMD64_ASM] (ARCFOUR_context, _gcry_arcfour_amd64) (encrypt_stream): New. * configure.ac [host=x86_64]: Add 'arcfour-amd64.lo'. -- Patch adds Marc Bevand's public-domain AMD64 assembly implementation of RC4 to libgcrypt. Original implementation is at: http://www.zorinaq.com/papers/rc4-amd64.html Benchmarks on Intel i5-4570 (3200 Mhz): New: ARCFOUR | nanosecs/byte mebibytes/sec cycles/byte STREAM enc | 1.29 ns/B 737.7 MiB/s 4.14 c/B STREAM dec | 1.31 ns/B 730.6 MiB/s 4.18 c/B Old (C-language): ARCFOUR | nanosecs/byte mebibytes/sec cycles/byte STREAM enc | 2.09 ns/B 457.4 MiB/s 6.67 c/B STREAM dec | 2.09 ns/B 457.2 MiB/s 6.68 c/B Signed-off-by: Jussi Kivilinna --- cipher/Makefile.am | 2 - cipher/arcfour-amd64.S | 97 ++++++++++++++++++++++++++++++++++++++++++++++++ cipher/arcfour.c | 28 ++++++++++++++ configure.ac | 7 +++ 4 files changed, 133 insertions(+), 1 deletion(-) create mode 100644 cipher/arcfour-amd64.S diff --git a/cipher/Makefile.am b/cipher/Makefile.am index 98c6254..15400e5 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -56,7 +56,7 @@ dsa-common.c rsa-common.c \ rmd.h EXTRA_libcipher_la_SOURCES = \ -arcfour.c \ +arcfour.c arcfour-amd64.S \ blowfish.c blowfish-amd64.S \ cast5.c cast5-amd64.S cast5-arm.S \ crc.c \ diff --git a/cipher/arcfour-amd64.S b/cipher/arcfour-amd64.S new file mode 100644 index 0000000..c32cd6f --- /dev/null +++ b/cipher/arcfour-amd64.S @@ -0,0 +1,97 @@ +/* +** RC4 implementation optimized for AMD64. 
+** +** Author: Marc Bevand +** Licence: I hereby disclaim the copyright on this code and place it +** in the public domain. +** +** The throughput achieved by this code is about 320 MBytes/sec, on +** a 1.8 GHz AMD Opteron (rev C0) processor. +** +** 2013/12/20 : +** - Integrated to libgcrypt +** - 4.18 cycles/byte on Intel i5-4570 +*/ + +#ifdef __x86_64__ +#include +#if defined(USE_ARCFOUR) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) + +.text +.align 16 +.globl _gcry_arcfour_amd64 +.type _gcry_arcfour_amd64, at function +_gcry_arcfour_amd64: + push %rbp + push %rbx + mov %rdi, %rbp # key = ARG(key) + mov %rsi, %rbx # rbx = ARG(len) + mov %rdx, %rsi # in = ARG(in) + mov %rcx, %rdi # out = ARG(out) + mov (4*256)(%rbp), %ecx # x = key->x + mov (4*256+4)(%rbp),%edx # y = key->y + inc %rcx # x++ + and $255, %rcx # x &= 0xff + lea -8(%rbx,%rsi), %rbx # rbx = in+len-8 + mov %rbx, %r9 # tmp = in+len-8 + mov (%rbp,%rcx,4), %eax # tx = d[x] + cmp %rsi, %rbx # cmp in with in+len-8 + jl .Lend # jump if (in+len-8 < in) + +.Lstart: + add $8, %rsi # increment in + add $8, %rdi # increment out + + # generate the next 8 bytes of the rc4 stream into %r8 + mov $8, %r11 # byte counter +1: add %al, %dl # y += tx + mov (%rbp,%rdx,4), %ebx # ty = d[y] + mov %ebx, (%rbp,%rcx,4) # d[x] = ty + add %al, %bl # val = ty + tx + mov %eax, (%rbp,%rdx,4) # d[y] = tx + inc %cl # x++ (NEXT ROUND) + mov (%rbp,%rcx,4), %eax # tx = d[x] (NEXT ROUND) + shl $8, %r8 + movb (%rbp,%rbx,4), %r8b # val = d[val] + dec %r11b + jnz 1b + + # xor 8 bytes + bswap %r8 + xor -8(%rsi), %r8 + cmp %r9, %rsi # cmp in+len-8 with in + mov %r8, -8(%rdi) + jle .Lstart # jump if (in <= in+len-8) + +.Lend: + add $8, %r9 # tmp = in+len + + # handle the last bytes, one by one +1: cmp %rsi, %r9 # cmp in with in+len + jle .Lfinished # jump if (in+len <= in) + add %al, %dl # y += tx + mov (%rbp,%rdx,4), %ebx # ty = d[y] + mov %ebx, (%rbp,%rcx,4) # d[x] = ty + add %al, %bl # val = ty + tx + mov %eax, (%rbp,%rdx,4) # d[y] = 
tx + inc %cl # x++ (NEXT ROUND) + mov (%rbp,%rcx,4), %eax # tx = d[x] (NEXT ROUND) + movb (%rbp,%rbx,4), %r8b # val = d[val] + xor (%rsi), %r8b # xor 1 byte + movb %r8b, (%rdi) + inc %rsi # in++ + inc %rdi # out++ + jmp 1b + +.Lfinished: + dec %rcx # x-- + movb %dl, (4*256)(%rbp) # key->y = y + movb %cl, (4*256+4)(%rbp) # key->x = x + pop %rbx + pop %rbp + ret +.L__gcry_arcfour_amd64_end: +.size _gcry_arcfour_amd64,.L__gcry_arcfour_amd64_end-_gcry_arcfour_amd64 + +#endif +#endif diff --git a/cipher/arcfour.c b/cipher/arcfour.c index d692c84..7488637 100644 --- a/cipher/arcfour.c +++ b/cipher/arcfour.c @@ -31,8 +31,34 @@ #include "g10lib.h" #include "cipher.h" +/* USE_AMD64_ASM indicates whether to use AMD64 assembly code. */ +#undef USE_AMD64_ASM +#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) +# define USE_AMD64_ASM 1 +#endif + static const char *selftest(void); +#ifdef USE_AMD64_ASM + +typedef struct { + u32 sbox[256]; + u32 idx_i, idx_j; +} ARCFOUR_context; + +void _gcry_arcfour_amd64(void *key, size_t len, const byte *indata, + byte *outdata); + +static void +encrypt_stream (void *context, + byte *outbuf, const byte *inbuf, size_t length) +{ + ARCFOUR_context *ctx = (ARCFOUR_context *) context; + _gcry_arcfour_amd64 (context, length, inbuf, outbuf ); +} + +#else /*!USE_AMD64_ASM*/ + typedef struct { byte sbox[256]; int idx_i, idx_j; @@ -96,6 +122,8 @@ encrypt_stream (void *context, _gcry_burn_stack (64); } +#endif /*!USE_AMD64_ASM*/ + static gcry_err_code_t do_arcfour_setkey (void *context, const byte *key, unsigned int keylen) diff --git a/configure.ac b/configure.ac index 27de850..d97dd33 100644 --- a/configure.ac +++ b/configure.ac @@ -1505,6 +1505,13 @@ LIST_MEMBER(arcfour, $enabled_ciphers) if test "$found" = "1"; then GCRYPT_CIPHERS="$GCRYPT_CIPHERS arcfour.lo" AC_DEFINE(USE_ARCFOUR, 1, [Defined if this module should be included]) + + case "${host}" in + x86_64-*-*) + # Build with the assembly implementation + 
GCRYPT_CIPHERS="$GCRYPT_CIPHERS arcfour-amd64.lo" + ;; + esac fi LIST_MEMBER(blowfish, $enabled_ciphers) From wk at gnupg.org Fri Dec 20 15:33:24 2013 From: wk at gnupg.org (Werner Koch) Date: Fri, 20 Dec 2013 15:33:24 +0100 Subject: [PATCH] Add AMD64 assembly implementation for arcfour In-Reply-To: <20131220140545.1830.39389.stgit@localhost6.localdomain6> (Jussi Kivilinna's message of "Fri, 20 Dec 2013 16:05:45 +0200") References: <20131220140545.1830.39389.stgit@localhost6.localdomain6> Message-ID: <87wqizpp6z.fsf@vigenere.g10code.de> On Fri, 20 Dec 2013 15:05, jussi.kivilinna at iki.fi said: > Patch adds Marc Bevand's public-domain AMD64 assembly implementation of RC4 to > libgcrypt. Original implementation is at: > http://www.zorinaq.com/papers/rc4-amd64.html Do we really want to improve a broken cipher? But well, if you like it. But please no NEWS entry - it might give users a bad hint to use it. Shalom-Salam, Werner -- Die Gedanken sind frei. Ausnahmen regelt ein Bundesgesetz. From ludo at gnu.org Sat Dec 21 12:42:41 2013 From: ludo at gnu.org (Ludovic =?utf-8?Q?Court=C3=A8s?=) Date: Sat, 21 Dec 2013 12:42:41 +0100 Subject: Sexp changes between 1.5.3 and 1.6.0 Message-ID: <87txe2o2fi.fsf@gnu.org> Hi, While writing Guile bindings to the public key API and the corresponding tests for Guix [0], we noticed a couple of differences between 1.5.3 and 1.6.0: 1. 'gcry_sexp_sprint', in GCRYSEXP_FMT_ADVANCED mode, renders the sexp '#C0FFEE#' as '#C0FFEE#' with 1.6, but as a binary byte sequence in 1.5. 2. 'gcry_sexp_nth (a (b 3:pqr) (c 3:456) (d 3:xyz)) 0' returns the complete list in 1.5, whereas in 1.6 it returns NULL (which makes more sense IMO.) I would consider these bugs in the 1.5 series, WDYT? Thanks, Ludo'.
[0] http://git.savannah.gnu.org/cgit/guix.git/commit/?id=3476ded934dc0beab1801d7fcdcc37b5c17bbf01 From wk at gnupg.org Sat Dec 21 20:24:41 2013 From: wk at gnupg.org (Werner Koch) Date: Sat, 21 Dec 2013 20:24:41 +0100 Subject: Sexp changes between 1.5.3 and 1.6.0 In-Reply-To: <87txe2o2fi.fsf@gnu.org> ("Ludovic =?utf-8?Q?Court=C3=A8s=22'?= =?utf-8?Q?s?= message of "Sat, 21 Dec 2013 12:42:41 +0100") References: <87txe2o2fi.fsf@gnu.org> Message-ID: <87iouiknwm.fsf@vigenere.g10code.de> On Sat, 21 Dec 2013 12:42, ludo at gnu.org said: > 1. 'gcry_sexp_sprint', in GCRYSEXP_FMT_ADVANCED mode, renders the > sexp '#C0FFEE#' as '#C0FFEE#' with 1.6, but as a binary byte > sequence in 1.5. Right, I changed the heuristics to better cope with the data we are commonly using with Libgcrypt. > 2. 'gcry_sexp_nth (a (b 3:pqr) (c 3:456) (d 3:xyz)) 0' returns the > complete list in 1.5, whereas in 1.6 it returns NULL (which makes > more sense IMO.) Are you sure, that it was wrong in 1.5? A quick check does not reveal code changes. But there might have been changes creating the sexp in the first place. Shalom-Salam, Werner -- Die Gedanken sind frei. Ausnahmen regelt ein Bundesgesetz. From ludo at gnu.org Sat Dec 21 21:15:16 2013 From: ludo at gnu.org (Ludovic =?utf-8?Q?Court=C3=A8s?=) Date: Sat, 21 Dec 2013 21:15:16 +0100 Subject: Sexp changes between 1.5.3 and 1.6.0 In-Reply-To: <87iouiknwm.fsf@vigenere.g10code.de> (Werner Koch's message of "Sat, 21 Dec 2013 20:24:41 +0100") References: <87txe2o2fi.fsf@gnu.org> <87iouiknwm.fsf@vigenere.g10code.de> Message-ID: <87ob4aklkb.fsf@gnu.org> Werner Koch skribis: > On Sat, 21 Dec 2013 12:42, ludo at gnu.org said: [...] >> 2. 'gcry_sexp_nth (a (b 3:pqr) (c 3:456) (d 3:xyz)) 0' returns the >> complete list in 1.5, whereas in 1.6 it returns NULL (which makes >> more sense IMO.) > > Are you sure, that it was wrong in 1.5? A quick check does not reveal code > changes. But there might have been changes creating the sexp in the > first place.
I'm creating sexps with 'gcry_sexp_new', with 'autodetect' set, FWIW. (BTW, as a Schemer, it took me a while to understand that car and nth do not correspond to their traditional Lisp counterparts: the Lisp car and nth return the given element, regardless of whether it is a list or an 'atom'.) Ludo'. From ludo at gnu.org Sat Dec 21 21:24:38 2013 From: ludo at gnu.org (Ludovic =?utf-8?Q?Court=C3=A8s?=) Date: Sat, 21 Dec 2013 21:24:38 +0100 Subject: Storing keys and signatures as sexps Message-ID: <877gaykl4p.fsf@gnu.org> For the purposes of signing package binaries exported from Guix, I am considering storing both key pairs and signatures using the sexp 'advanced' external representation [0]. AFAICS the format is generic, stable, and not libgcrypt-specific, so this looks like a reasonable choice. Nevertheless, is there anything you would caution about? Thanks, Ludo'. [0] https://lists.gnu.org/archive/html/guix-devel/2013-12/msg00115.html From dbaryshkov at gmail.com Sun Dec 22 14:13:45 2013 From: dbaryshkov at gmail.com (Dmitry Eremin-Solenikov) Date: Sun, 22 Dec 2013 17:13:45 +0400 Subject: [PATCH v2] Add an utility to calculate hashes over a set of files Message-ID: <1387718025-16209-1-git-send-email-dbaryshkov@gmail.com> * tests/gchash.c: New. -- A utility like rhash that has the ability to calculate different hashes over a set of files is useful. Add gchash utility to calculate hashes supported by libgcrypt.
Signed-off-by: Dmitry Eremin-Solenikov --- .gitignore | 1 + tests/Makefile.am | 3 +- tests/gchash.c | 120 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 123 insertions(+), 1 deletion(-) create mode 100644 tests/gchash.c diff --git a/.gitignore b/.gitignore index ec7f8bb..8b235f9 100644 --- a/.gitignore +++ b/.gitignore @@ -69,6 +69,7 @@ tests/basic tests/benchmark tests/fips186-dsa tests/fipsdrv +tests/gchash tests/hmac tests/keygen tests/keygrip diff --git a/tests/Makefile.am b/tests/Makefile.am index f5b5b9f..46d2d81 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -48,7 +48,8 @@ AM_CFLAGS = $(GPG_ERROR_CFLAGS) LDADD = ../src/libgcrypt.la $(DL_LIBS) ../compat/libcompat.la $(GPG_ERROR_LIBS) EXTRA_PROGRAMS = testapi pkbench -noinst_PROGRAMS = $(tests_bin) $(tests_bin_last) fipsdrv rsacvt genhashdata +noinst_PROGRAMS = $(tests_bin) $(tests_bin_last) fipsdrv rsacvt genhashdata \ + gchash EXTRA_DIST = README rsa-16k.key cavs_tests.sh cavs_driver.pl \ pkcs1v2-oaep.h pkcs1v2-pss.h pkcs1v2-v15c.h pkcs1v2-v15s.h \ diff --git a/tests/gchash.c b/tests/gchash.c new file mode 100644 index 0000000..7a2aad6 --- /dev/null +++ b/tests/gchash.c @@ -0,0 +1,120 @@ +/* gchash.c - Calculate hash values + * Copyright (C) 2013 Dmitry Eremin-Solenikov + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . 
+ */ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include + +#ifdef _GCRYPT_IN_LIBGCRYPT +# undef _GCRYPT_IN_LIBGCRYPT +# include "gcrypt.h" +#else +# include +#endif + + +void +init_gcrypt (void) +{ + if (!gcry_check_version (GCRYPT_VERSION)) { + fputs ("libgcrypt version mismatch\n", stderr); + exit (2); + } + + gcry_control (GCRYCTL_SUSPEND_SECMEM_WARN); + + /* Allocate a pool of 16k secure memory. This make the secure memory + * available and also drops privileges where needed. */ + gcry_control (GCRYCTL_INIT_SECMEM, 16384, 0); + + gcry_control (GCRYCTL_RESUME_SECMEM_WARN); + + gcry_control (GCRYCTL_INITIALIZATION_FINISHED, 0); +} + +int +main (int argc, char **argv) +{ + gcry_md_hd_t hd; + gcry_error_t err; + int algo; + + init_gcrypt(); + + if (argc < 2 || (argv[1] && !strcmp(argv[1], "--help"))) + { + fprintf (stderr, "Usage: %s ...\n", argv[0]); + return 1; + } + + algo = gcry_md_map_name (argv[1]); + if (algo == GCRY_MD_NONE) + { + fprintf (stderr, "Unknown algorithm '%s'\n", argv[1]); + return 1; + } + + err = gcry_md_open(&hd, algo, 0); + if (err) + { + fprintf (stderr, "LibGCrypt error %s/%s\n", + gcry_strsource (err), + gcry_strerror (err)); + exit (1); + } + + for (argv += 2; *argv; argv++) + { + FILE *fp; + unsigned char buf[1024]; + size_t size; + int i; + unsigned char *h; + if (!strcmp (*argv, "-")) + fp = stdin; + else + fp = fopen (*argv, "r"); + + if (fp == NULL) + { + perror ("fopen"); + return 1; + } + + while (!feof (fp)) + { + size = fread (buf, 1, sizeof(buf), fp); + gcry_md_write (hd, buf, size); + } + + h = gcry_md_read(hd, 0); + + for (i = 0; i < gcry_md_get_algo_dlen (algo); i++) + printf("%02hhx", h[i]); + printf(" %s\n", *argv); + + gcry_md_reset(hd); + } + + gcry_md_close(hd); + return 0; +} -- 1.8.5.1 From dbaryshkov at gmail.com Sun Dec 22 14:15:52 2013 From: dbaryshkov at gmail.com (Dmitry Eremin-Solenikov) Date: Sun, 22 Dec 2013 17:15:52 +0400 Subject: [PATCH v2] Truncate hash values for ECDSA signature 
scheme Message-ID: <1387718152-16275-1-git-send-email-dbaryshkov@gmail.com> * cipher/dsa-common (_gcry_dsa_normalize_hash): New. Truncate opaque mpis as required for DSA and ECDSA signature schemas. * cipher/dsa.c (verify): Return gpg_err_code_t value from verify() to behave like the rest of internal sign/verify functions. * cipher/dsa.c (sign, verify, dsa_verify): Factor out hash truncation. * cipher/ecc-ecdsa.c (_gcry_ecc_ecdsa_sign): Factor out hash truncation. * cipher/ecc-ecdsa.c (_gcry_ecc_ecdsa_verify): as required by ECDSA scheme, truncate hash values to bitlength of used curve. * tests/pubkey.c (check_ecc_sample_key): add a testcase for hash truncation. Signed-off-by: Dmitry Eremin-Solenikov --- cipher/dsa-common.c | 33 ++++++++++++++++++++ cipher/dsa.c | 81 ++++++++++++++++++++---------------------------- cipher/ecc-ecdsa.c | 28 ++++++++--------- cipher/pubkey-internal.h | 3 ++ tests/pubkey.c | 47 +++++++++++++++++++++++++++- 5 files changed, 129 insertions(+), 63 deletions(-) diff --git a/cipher/dsa-common.c b/cipher/dsa-common.c index d251eae..a5e42a2 100644 --- a/cipher/dsa-common.c +++ b/cipher/dsa-common.c @@ -359,3 +359,36 @@ _gcry_dsa_gen_rfc6979_k (gcry_mpi_t *r_k, *r_k = k; return rc; } + +/* + * Truncate opaque hash value to qbits for DSA. + * Non-opaque input is not truncated, in hope that user + * knows what is passed. It is not possible to correctly + * trucate non-opaque inputs. 
+ */ +gpg_err_code_t +_gcry_dsa_normalize_hash (gcry_mpi_t input, + gcry_mpi_t *out, + unsigned int qbits) +{ + gpg_err_code_t rc = 0; + const void *abuf; + unsigned int abits; + gcry_mpi_t hash; + + if (mpi_is_opaque (input)) + { + abuf = mpi_get_opaque (input, &abits); + rc = _gcry_mpi_scan (&hash, GCRYMPI_FMT_USG, abuf, (abits+7)/8, NULL); + if (rc) + return rc; + if (abits > qbits) + mpi_rshift (hash, hash, abits - qbits); + } + else + hash = input; + + *out = hash; + + return rc; +} diff --git a/cipher/dsa.c b/cipher/dsa.c index 50bdab1..1707d8c 100644 --- a/cipher/dsa.c +++ b/cipher/dsa.c @@ -115,7 +115,7 @@ static gpg_err_code_t generate (DSA_secret_key *sk, gcry_mpi_t **ret_factors); static gpg_err_code_t sign (gcry_mpi_t r, gcry_mpi_t s, gcry_mpi_t input, DSA_secret_key *skey, int flags, int hashalgo); -static int verify (gcry_mpi_t r, gcry_mpi_t s, gcry_mpi_t input, +static gpg_err_code_t verify (gcry_mpi_t r, gcry_mpi_t s, gcry_mpi_t input, DSA_public_key *pkey); static unsigned int dsa_get_nbits (gcry_sexp_t parms); @@ -165,12 +165,12 @@ test_keys (DSA_secret_key *sk, unsigned int qbits) sign (sig_a, sig_b, data, sk, 0, 0); /* Verify the signature using the public key. */ - if ( !verify (sig_a, sig_b, data, &pk) ) + if ( verify (sig_a, sig_b, data, &pk) ) goto leave; /* Signature does not match. */ /* Modify the data and check that the signing fails. */ mpi_add_ui (data, data, 1); - if ( verify (sig_a, sig_b, data, &pk) ) + if ( !verify (sig_a, sig_b, data, &pk) ) goto leave; /* Signature matches but should not. */ result = 0; /* The test succeeded. */ @@ -573,20 +573,9 @@ sign (gcry_mpi_t r, gcry_mpi_t s, gcry_mpi_t input, DSA_secret_key *skey, qbits = mpi_get_nbits (skey->q); /* Convert the INPUT into an MPI. 
*/ - if (mpi_is_opaque (input)) - { - abuf = mpi_get_opaque (input, &abits); - rc = _gcry_mpi_scan (&hash, GCRYMPI_FMT_USG, abuf, (abits+7)/8, NULL); - if (rc) - return rc; - if (abits > qbits) - mpi_rshift (hash, hash, abits - qbits); - } - else - { - mpi_normalize (input); - hash = input; - } + rc = _gcry_dsa_normalize_hash (input, &hash, qbits); + if (rc) + return rc; again: /* Create the K value. */ @@ -651,18 +640,25 @@ sign (gcry_mpi_t r, gcry_mpi_t s, gcry_mpi_t input, DSA_secret_key *skey, /* Returns true if the signature composed from R and S is valid. */ -static int -verify (gcry_mpi_t r, gcry_mpi_t s, gcry_mpi_t hash, DSA_public_key *pkey ) +static gpg_err_code_t +verify (gcry_mpi_t r, gcry_mpi_t s, gcry_mpi_t input, DSA_public_key *pkey ) { - int rc; + gpg_err_code_t rc = 0; gcry_mpi_t w, u1, u2, v; gcry_mpi_t base[3]; gcry_mpi_t ex[3]; + gcry_mpi_t hash; + unsigned int nbits; if( !(mpi_cmp_ui( r, 0 ) > 0 && mpi_cmp( r, pkey->q ) < 0) ) - return 0; /* assertion 0 < r < q failed */ + return GPG_ERR_BAD_SIGNATURE; /* Assertion 0 < r < n failed. */ if( !(mpi_cmp_ui( s, 0 ) > 0 && mpi_cmp( s, pkey->q ) < 0) ) - return 0; /* assertion 0 < s < q failed */ + return GPG_ERR_BAD_SIGNATURE; /* Assertion 0 < s < n failed. 
*/ + + nbits = mpi_get_nbits (pkey->q); + rc = _gcry_dsa_normalize_hash (input, &hash, nbits); + if (rc) + return rc; w = mpi_alloc( mpi_get_nlimbs(pkey->q) ); u1 = mpi_alloc( mpi_get_nlimbs(pkey->q) ); @@ -685,12 +681,25 @@ verify (gcry_mpi_t r, gcry_mpi_t s, gcry_mpi_t hash, DSA_public_key *pkey ) mpi_mulpowm( v, base, ex, pkey->p ); mpi_fdiv_r( v, v, pkey->q ); - rc = !mpi_cmp( v, r ); + if (mpi_cmp( v, r )) + { + if (DBG_CIPHER) + { + log_mpidump (" i", input); + log_mpidump (" h", hash); + log_mpidump (" v", v); + log_mpidump (" r", r); + log_mpidump (" s", s); + } + rc = GPG_ERR_BAD_SIGNATURE; + } mpi_free(w); mpi_free(u1); mpi_free(u2); mpi_free(v); + if (hash != input) + mpi_free (hash); return rc; } @@ -1090,31 +1099,7 @@ dsa_verify (gcry_sexp_t s_sig, gcry_sexp_t s_data, gcry_sexp_t s_keyparms) } /* Verify the signature. */ - if (mpi_is_opaque (data)) - { - const void *abuf; - unsigned int abits, qbits; - gcry_mpi_t a; - - qbits = mpi_get_nbits (pk.q); - - abuf = mpi_get_opaque (data, &abits); - rc = _gcry_mpi_scan (&a, GCRYMPI_FMT_USG, abuf, (abits+7)/8, NULL); - if (!rc) - { - if (abits > qbits) - mpi_rshift (a, a, abits - qbits); - - if (!verify (sig_r, sig_s, a, &pk)) - rc = GPG_ERR_BAD_SIGNATURE; - _gcry_mpi_release (a); - } - } - else - { - if (!verify (sig_r, sig_s, data, &pk)) - rc = GPG_ERR_BAD_SIGNATURE; - } + rc = verify (sig_r, sig_s, data, &pk); leave: _gcry_mpi_release (pk.p); diff --git a/cipher/ecc-ecdsa.c b/cipher/ecc-ecdsa.c index b4bbe2c..1484830 100644 --- a/cipher/ecc-ecdsa.c +++ b/cipher/ecc-ecdsa.c @@ -57,18 +57,9 @@ _gcry_ecc_ecdsa_sign (gcry_mpi_t input, ECC_secret_key *skey, qbits = mpi_get_nbits (skey->E.n); /* Convert the INPUT into an MPI if needed. 
*/ - if (mpi_is_opaque (input)) - { - abuf = mpi_get_opaque (input, &abits); - rc = _gcry_mpi_scan (&hash, GCRYMPI_FMT_USG, abuf, (abits+7)/8, NULL); - if (rc) - return rc; - if (abits > qbits) - mpi_rshift (hash, hash, abits - qbits); - } - else - hash = input; - + rc = _gcry_dsa_normalize_hash (input, &hash, qbits); + if (rc) + return rc; k = NULL; dr = mpi_alloc (0); @@ -161,15 +152,21 @@ _gcry_ecc_ecdsa_verify (gcry_mpi_t input, ECC_public_key *pkey, gcry_mpi_t r, gcry_mpi_t s) { gpg_err_code_t err = 0; - gcry_mpi_t h, h1, h2, x; + gcry_mpi_t hash, h, h1, h2, x; mpi_point_struct Q, Q1, Q2; mpi_ec_t ctx; + unsigned int nbits; if( !(mpi_cmp_ui (r, 0) > 0 && mpi_cmp (r, pkey->E.n) < 0) ) return GPG_ERR_BAD_SIGNATURE; /* Assertion 0 < r < n failed. */ if( !(mpi_cmp_ui (s, 0) > 0 && mpi_cmp (s, pkey->E.n) < 0) ) return GPG_ERR_BAD_SIGNATURE; /* Assertion 0 < s < n failed. */ + nbits = mpi_get_nbits (pkey->E.n); + err = _gcry_dsa_normalize_hash (input, &hash, nbits); + if (err) + return err; + h = mpi_alloc (0); h1 = mpi_alloc (0); h2 = mpi_alloc (0); @@ -184,7 +181,7 @@ _gcry_ecc_ecdsa_verify (gcry_mpi_t input, ECC_public_key *pkey, /* h = s^(-1) (mod n) */ mpi_invm (h, s, pkey->E.n); /* h1 = hash * s^(-1) (mod n) */ - mpi_mulm (h1, input, h, pkey->E.n); + mpi_mulm (h1, hash, h, pkey->E.n); /* Q1 = [ hash * s^(-1) ]G */ _gcry_mpi_ec_mul_point (&Q1, h1, &pkey->E.G, ctx); /* h2 = r * s^(-1) (mod n) */ @@ -230,5 +227,8 @@ _gcry_ecc_ecdsa_verify (gcry_mpi_t input, ECC_public_key *pkey, mpi_free (h2); mpi_free (h1); mpi_free (h); + if (hash != input) + mpi_free (hash); + return err; } diff --git a/cipher/pubkey-internal.h b/cipher/pubkey-internal.h index 96fe0e4..b8167c7 100644 --- a/cipher/pubkey-internal.h +++ b/cipher/pubkey-internal.h @@ -92,6 +92,9 @@ gpg_err_code_t _gcry_dsa_gen_rfc6979_k (gcry_mpi_t *r_k, int halgo, unsigned int extraloops); +gpg_err_code_t _gcry_dsa_normalize_hash (gcry_mpi_t input, + gcry_mpi_t *out, + unsigned int qbits); /*-- ecc.c --*/ 
gpg_err_code_t _gcry_pk_ecc_get_sexp (gcry_sexp_t *r_sexp, int mode, diff --git a/tests/pubkey.c b/tests/pubkey.c index 4e12dfd..2e60da9 100644 --- a/tests/pubkey.c +++ b/tests/pubkey.c @@ -980,9 +980,23 @@ check_ecc_sample_key (void) "(data (flags raw)\n" " (value #00112233445566778899AABBCCDDEEFF" /* */ "000102030405060708090A0B0C0D0E0F#))"; + static const char hash2_string[] = + "(data (flags raw)\n" + " (value #00112233445566778899AABBCCDDEEFF" + /* */ "000102030405060708090A0B0C0D0E0F" + /* */ "000102030405060708090A0B0C0D0E0F" + /* */ "00112233445566778899AABBCCDDEEFF#))"; + /* hash2, but longer than curve length, so it will be truncated */ + static const char hash3_string[] = + "(data (flags raw)\n" + " (value #00112233445566778899AABBCCDDEEFF" + /* */ "000102030405060708090A0B0C0D0E0F" + /* */ "000102030405060708090A0B0C0D0E0F" + /* */ "00112233445566778899AABBCCDDEEFF" + /* */ "000102030405060708090A0B0C0D0E0F#))"; gpg_error_t err; - gcry_sexp_t key, hash, sig; + gcry_sexp_t key, hash, hash2, hash3, sig, sig2; if (verbose) fprintf (stderr, "Checking sample ECC key.\n"); @@ -990,6 +1004,12 @@ check_ecc_sample_key (void) if ((err = gcry_sexp_new (&hash, hash_string, 0, 1))) die ("line %d: %s", __LINE__, gpg_strerror (err)); + if ((err = gcry_sexp_new (&hash2, hash2_string, 0, 1))) + die ("line %d: %s", __LINE__, gpg_strerror (err)); + + if ((err = gcry_sexp_new (&hash3, hash3_string, 0, 1))) + die ("line %d: %s", __LINE__, gpg_strerror (err)); + if ((err = gcry_sexp_new (&key, ecc_private_key, 0, 1))) die ("line %d: %s", __LINE__, gpg_strerror (err)); @@ -1003,6 +1023,28 @@ check_ecc_sample_key (void) if ((err = gcry_pk_verify (sig, hash, key))) die ("gcry_pk_verify failed: %s", gpg_strerror (err)); + /* Verify hash truncation */ + gcry_sexp_release (key); + if ((err = gcry_sexp_new (&key, ecc_private_key, 0, 1))) + die ("line %d: %s", __LINE__, gpg_strerror (err)); + + if ((err = gcry_pk_sign (&sig2, hash2, key))) + die ("gcry_pk_sign failed: %s", 
gpg_strerror (err)); + + gcry_sexp_release (sig); + if ((err = gcry_pk_sign (&sig, hash3, key))) + die ("gcry_pk_sign failed: %s", gpg_strerror (err)); + + gcry_sexp_release (key); + if ((err = gcry_sexp_new (&key, ecc_public_key, 0, 1))) + die ("line %d: %s", __LINE__, gpg_strerror (err)); + + if ((err = gcry_pk_verify (sig, hash2, key))) + die ("gcry_pk_verify failed: %s", gpg_strerror (err)); + + if ((err = gcry_pk_verify (sig2, hash3, key))) + die ("gcry_pk_verify failed: %s", gpg_strerror (err)); + /* Now try signing without the Q parameter. */ gcry_sexp_release (key); @@ -1021,8 +1063,11 @@ check_ecc_sample_key (void) die ("gcry_pk_verify signed without Q failed: %s", gpg_strerror (err)); gcry_sexp_release (sig); + gcry_sexp_release (sig2); gcry_sexp_release (key); gcry_sexp_release (hash); + gcry_sexp_release (hash2); + gcry_sexp_release (hash3); } -- 1.8.5.1 From dbaryshkov at gmail.com Sun Dec 22 14:12:28 2013 From: dbaryshkov at gmail.com (Dmitry Eremin-Solenikov) Date: Sun, 22 Dec 2013 17:12:28 +0400 Subject: [PATCH v2] Add a simple (raw) PKCS#1 padding mode Message-ID: <1387717948-16135-1-git-send-email-dbaryshkov@gmail.com> * src/cipher.h (PUBKEY_ENC_PKCS1_RAW): New. * cipher/pubkey-util.c (_gcry_pk_util_parse_flaglist): Handle pkcs1-raw flag. * cipher/pubkey-util.c (_gcry_pk_util_data_to_mpi): Handle s-exp like (data (flags pkcs1-raw) (value xxxxx)) * cipher/rsa-common.c (_gcry_rsa_pkcs1_encode_raw_for_sig): PKCS#1-encode data with embedded hash OID for signature verification. * tests/basic.c (check_pubkey_sign): Add tests for s-exps with pkcs1-raw flag. -- Allow user to specify (flags pkcs1-raw) to enable pkcs1 padding of raw value (no hash algorithm is specified). It is up to the user to verify that the passed value is properly formatted and includes DER-encoded ASN OID of the used hash function. 
Signed-off-by: Dmitry Eremin-Solenikov --- cipher/pubkey-internal.h | 4 +++ cipher/pubkey-util.c | 25 +++++++++++++++++++ cipher/rsa-common.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++++ src/cipher.h | 1 + tests/basic.c | 8 ++++++ 5 files changed, 103 insertions(+) diff --git a/cipher/pubkey-internal.h b/cipher/pubkey-internal.h index db1399d..96fe0e4 100644 --- a/cipher/pubkey-internal.h +++ b/cipher/pubkey-internal.h @@ -56,6 +56,10 @@ gpg_err_code_t _gcry_rsa_pkcs1_decode_for_enc (unsigned char **r_result, size_t *r_resultlen, unsigned int nbits, gcry_mpi_t value); gpg_err_code_t +_gcry_rsa_pkcs1_encode_raw_for_sig (gcry_mpi_t *r_result, unsigned int nbits, + const unsigned char *value, size_t valuelen); + +gpg_err_code_t _gcry_rsa_pkcs1_encode_for_sig (gcry_mpi_t *r_result, unsigned int nbits, const unsigned char *value, size_t valuelen, int algo); diff --git a/cipher/pubkey-util.c b/cipher/pubkey-util.c index 616b499..514f1eb 100644 --- a/cipher/pubkey-util.c +++ b/cipher/pubkey-util.c @@ -142,6 +142,16 @@ _gcry_pk_util_parse_flaglist (gcry_sexp_t list, rc = GPG_ERR_INV_FLAG; break; + case 9: + if (!memcmp (s, "pkcs1-raw", 9) && encoding == PUBKEY_ENC_UNKNOWN) + { + encoding = PUBKEY_ENC_PKCS1_RAW; + flags |= PUBKEY_FLAG_FIXEDLEN; + } + else if (!igninvflag) + rc = GPG_ERR_INV_FLAG; + break; + case 10: if (!memcmp (s, "igninvflag", 10)) igninvflag = 1; @@ -850,6 +860,21 @@ _gcry_pk_util_data_to_mpi (gcry_sexp_t input, gcry_mpi_t *ret_mpi, ctx->hash_algo); } } + else if (ctx->encoding == PUBKEY_ENC_PKCS1_RAW && lvalue + && (ctx->op == PUBKEY_OP_SIGN || ctx->op == PUBKEY_OP_VERIFY)) + { + const void * value; + size_t valuelen; + + if (sexp_length (lvalue) != 2) + rc = GPG_ERR_INV_OBJ; + else if ( !(value=sexp_nth_data (lvalue, 1, &valuelen)) + || !valuelen ) + rc = GPG_ERR_INV_OBJ; + else + rc = _gcry_rsa_pkcs1_encode_raw_for_sig (ret_mpi, ctx->nbits, + value, valuelen); + } else if (ctx->encoding == PUBKEY_ENC_OAEP && lvalue && ctx->op == 
PUBKEY_OP_ENCRYPT) { diff --git a/cipher/rsa-common.c b/cipher/rsa-common.c index 4f5a659..f56e989 100644 --- a/cipher/rsa-common.c +++ b/cipher/rsa-common.c @@ -319,6 +319,71 @@ _gcry_rsa_pkcs1_encode_for_sig (gcry_mpi_t *r_result, unsigned int nbits, return rc; } +/* Encode {VALUE,VALUELEN} for an NBITS keys using the pkcs#1 block + type 1 padding. On success the result is stored as a new MPI at + R_RESULT. On error the value at R_RESULT is undefined. + + We encode the value in this way: + + 0 1 PAD(n bytes) 0 VALUE(valuelen bytes) + + 0 is a marker we unfortunately can't encode because we return an + MPI which strips all leading zeroes. + 1 is the block type. + PAD consists of 0xff bytes. + 0 marks the end of the padding. + + (Note that PGP prior to version 2.3 encoded the message digest as: + 0 1 MD(16 bytes) 0 PAD(n bytes) 1 + The MD is always 16 bytes here because it's always MD5. GnuPG + does not not support pre-v2.3 signatures, but I'm including this + comment so the information is easily found if needed.) +*/ +gpg_err_code_t +_gcry_rsa_pkcs1_encode_raw_for_sig (gcry_mpi_t *r_result, unsigned int nbits, + const unsigned char *value, size_t valuelen) +{ + gcry_err_code_t rc = 0; + gcry_error_t err; + byte *frame = NULL; + size_t nframe = (nbits+7) / 8; + int i; + size_t n; + + if ( !valuelen || valuelen + 4 > nframe) + { + /* Can't encode an DLEN byte digest MD into an NFRAME byte + frame. */ + return GPG_ERR_TOO_SHORT; + } + + if ( !(frame = xtrymalloc (nframe)) ) + return gpg_err_code_from_syserror (); + + /* Assemble the pkcs#1 block type 1. */ + n = 0; + frame[n++] = 0; + frame[n++] = 1; /* block type */ + i = nframe - valuelen - 3 ; + gcry_assert (i > 1); + memset (frame+n, 0xff, i ); + n += i; + frame[n++] = 0; + memcpy (frame+n, value, valuelen ); + n += valuelen; + gcry_assert (n == nframe); + + /* Convert it into an MPI. 
*/ + err = _gcry_mpi_scan (r_result, GCRYMPI_FMT_USG, frame, n, &nframe); + if (err) + rc = gcry_err_code (err); + else if (DBG_CIPHER) + log_mpidump ("PKCS#1 block type 1 encoded data", *r_result); + xfree (frame); + + return rc; +} + /* Mask generation function for OAEP. See RFC-3447 B.2.1. */ static gcry_err_code_t diff --git a/src/cipher.h b/src/cipher.h index 10bfe0c..26ffddc 100644 --- a/src/cipher.h +++ b/src/cipher.h @@ -54,6 +54,7 @@ enum pk_encoding { PUBKEY_ENC_RAW, PUBKEY_ENC_PKCS1, + PUBKEY_ENC_PKCS1_RAW, PUBKEY_ENC_OAEP, PUBKEY_ENC_PSS, PUBKEY_ENC_UNKNOWN diff --git a/tests/basic.c b/tests/basic.c index 0eb8215..055b7b6 100644 --- a/tests/basic.c +++ b/tests/basic.c @@ -5092,6 +5092,10 @@ check_pubkey_sign (int n, gcry_sexp_t skey, gcry_sexp_t pkey, int algo) " (hash sha1 #11223344556677889900AABBCCDDEEFF10203040#))\n", GCRY_PK_RSA, 0 }, + { "(data\n (flags pkcs1-raw)\n" + " (hash sha1 #11223344556677889900AABBCCDDEEFF10203040#))\n", + GCRY_PK_RSA, + GPG_ERR_CONFLICT }, { "(data\n (flags oaep)\n" " (hash sha1 #11223344556677889900AABBCCDDEEFF10203040#))\n", 0, @@ -5124,6 +5128,10 @@ check_pubkey_sign (int n, gcry_sexp_t skey, gcry_sexp_t pkey, int algo) " (value #11223344556677889900AA#))\n", GCRY_PK_RSA, GPG_ERR_CONFLICT }, + { "(data\n (flags pkcs1-raw)\n" + " (value #11223344556677889900AA#))\n", + GCRY_PK_RSA, + 0 }, { "(data\n (flags raw foo)\n" " (value #11223344556677889900AA#))\n", 0, -- 1.8.5.1 From dbaryshkov at gmail.com Mon Dec 23 11:05:47 2013 From: dbaryshkov at gmail.com (Dmitry Eremin-Solenikov) Date: Mon, 23 Dec 2013 14:05:47 +0400 Subject: Libgcrypt threads model Message-ID: Hello, While adapting old GnuTLS gcrypt backend to current libgcrypt codebase, I stumbled upon current Libgcrypt threading code. After carving through ath.c, I have the following impression: Despite all definitions in ath.c gcrypt does not really support either W32 threads or pthreads on systems which do not support weak symbols (are there any of them?). 
Is it true? Is it intentional? How would ath mutexes behave if libgcrypt is linked into an application with static pthreads? With pth or npth? With w32? Is it correct that GCRYCTL_SET_THREAD_CBS gcry_control is also deprecated and should not be used in contemporary code? -- With best wishes Dmitry From dbaryshkov at gmail.com Mon Dec 23 12:21:13 2013 From: dbaryshkov at gmail.com (Dmitry Eremin-Solenikov) Date: Mon, 23 Dec 2013 15:21:13 +0400 Subject: [PATCH] Correct formatting of gcry_mac_get_algo_keylen documentation Message-ID: <1387797673-23547-1-git-send-email-dbaryshkov@gmail.com> * doc/gcrypt.texi: add braces near gcry_mac_get_algo_keylen documentation. Use braces around unsigned int in gcry_mac_get_algo_keylen documentation, otherwise texinfo breaks that and uses 'int' as a function definition. Signed-off-by: Dmitry Eremin-Solenikov --- doc/gcrypt.texi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/gcrypt.texi b/doc/gcrypt.texi index ea4b161..405c8fe 100644 --- a/doc/gcrypt.texi +++ b/doc/gcrypt.texi @@ -3773,7 +3773,7 @@ for the MAC value. On error @code{0} is returned. @end deftypefun - at deftypefun unsigned int gcry_mac_get_algo_keylen (@var{algo}) + at deftypefun {unsigned int} gcry_mac_get_algo_keylen (@var{algo}) This function returns length of the key for MAC algorithm @var{algo}. 
If the algorithm supports multiple key lengths, the default supported key -- 1.8.5.1 From dbaryshkov at gmail.com Wed Dec 25 12:23:01 2013 From: dbaryshkov at gmail.com (Dmitry Eremin-Solenikov) Date: Wed, 25 Dec 2013 15:23:01 +0400 Subject: [PATCH v2] Truncate hash values for ECDSA signature scheme In-Reply-To: <1387718152-16275-1-git-send-email-dbaryshkov@gmail.com> References: <1387718152-16275-1-git-send-email-dbaryshkov@gmail.com> Message-ID: On Sun, Dec 22, 2013 at 5:15 PM, Dmitry Eremin-Solenikov wrote: > --- a/tests/pubkey.c > +++ b/tests/pubkey.c > @@ -980,9 +980,23 @@ check_ecc_sample_key (void) > "(data (flags raw)\n" > " (value #00112233445566778899AABBCCDDEEFF" > /* */ "000102030405060708090A0B0C0D0E0F#))"; > + static const char hash2_string[] = > + "(data (flags raw)\n" > + " (value #00112233445566778899AABBCCDDEEFF" > + /* */ "000102030405060708090A0B0C0D0E0F" > + /* */ "000102030405060708090A0B0C0D0E0F" > + /* */ "00112233445566778899AABBCCDDEEFF#))"; > + /* hash2, but longer than curve length, so it will be truncated */ > + static const char hash3_string[] = > + "(data (flags raw)\n" > + " (value #00112233445566778899AABBCCDDEEFF" > + /* */ "000102030405060708090A0B0C0D0E0F" > + /* */ "000102030405060708090A0B0C0D0E0F" > + /* */ "00112233445566778899AABBCCDDEEFF" > + /* */ "000102030405060708090A0B0C0D0E0F#))"; I forgot about the test. It needs to be fixed. s/value/hash sha1/ in both lines. I will post patch in the evening. -- With best wishes Dmitry From dbaryshkov at gmail.com Fri Dec 27 09:37:12 2013 From: dbaryshkov at gmail.com (Dmitry Eremin-Solenikov) Date: Fri, 27 Dec 2013 12:37:12 +0400 Subject: [PATCH 3/3] Add MD2-HMAC calculation support In-Reply-To: <1388133432-6743-1-git-send-email-dbaryshkov@gmail.com> References: <1388133432-6743-1-git-send-email-dbaryshkov@gmail.com> Message-ID: <1388133432-6743-3-git-send-email-dbaryshkov@gmail.com> * src/gcrypt.h.in (GCRY_MAC_HMAC_MD2): New. * cipher/mac-hmac.c: Support GCRY_MAC_HMAC_MD2. 
Signed-off-by: Dmitry Eremin-Solenikov --- cipher/mac-hmac.c | 8 ++++++++ src/gcrypt.h.in | 1 + 2 files changed, 9 insertions(+) diff --git a/cipher/mac-hmac.c b/cipher/mac-hmac.c index 15c613d..930f2c5 100644 --- a/cipher/mac-hmac.c +++ b/cipher/mac-hmac.c @@ -35,6 +35,8 @@ map_mac_algo_to_md (int mac_algo) { default: return GCRY_MD_NONE; + case GCRY_MAC_HMAC_MD2: + return GCRY_MD_MD2; case GCRY_MAC_HMAC_MD4: return GCRY_MD_MD4; case GCRY_MAC_HMAC_MD5: @@ -270,3 +272,9 @@ gcry_mac_spec_t _gcry_mac_type_spec_hmac_md4 = { &hmac_ops }; #endif +#if USE_MD2 +gcry_mac_spec_t _gcry_mac_type_spec_hmac_md2 = { + GCRY_MAC_HMAC_MD2, {0, 0}, "HMAC_MD2", + &hmac_ops +}; +#endif diff --git a/src/gcrypt.h.in b/src/gcrypt.h.in index 23195af..3479bd1 100644 --- a/src/gcrypt.h.in +++ b/src/gcrypt.h.in @@ -1320,6 +1320,7 @@ enum gcry_mac_algos GCRY_MAC_HMAC_GOSTR3411_94 = 111, GCRY_MAC_HMAC_STRIBOG256 = 112, GCRY_MAC_HMAC_STRIBOG512 = 113, + GCRY_MAC_HMAC_MD2 = 114, GCRY_MAC_CMAC_AES = 201, GCRY_MAC_CMAC_3DES = 202, -- 1.8.5.1 From dbaryshkov at gmail.com Fri Dec 27 09:37:11 2013 From: dbaryshkov at gmail.com (Dmitry Eremin-Solenikov) Date: Fri, 27 Dec 2013 12:37:11 +0400 Subject: [PATCH 2/3] Add a function to retrieve algorithm used by MAC handler In-Reply-To: <1388133432-6743-1-git-send-email-dbaryshkov@gmail.com> References: <1388133432-6743-1-git-send-email-dbaryshkov@gmail.com> Message-ID: <1388133432-6743-2-git-send-email-dbaryshkov@gmail.com> * cipher/mac.c (_gcry_mac_get_algo): New function, returns used algo. * src/visibility.c (gcry_mac_get_algo): New wrapper. * src/visibility.h: Handle gcry_mac_get_algo. * src/gcrypt-int.h (_gcry_mac_get_algo): New. * src/gcrypt.h.in (gcry_mac_get_algo): New. * src/libgcrypt.def (gcry_mac_get_algo): New. * src/libgcrypt.vers (gcry_mac_get_algo): New. * doc/gcrypt.texi: Document gcry_mac_get_algo. * tests/basic.c (check_one_mac): Verify gcry_mac_get_algo.
Signed-off-by: Dmitry Eremin-Solenikov --- cipher/mac.c | 7 +++++++ doc/gcrypt.texi | 10 ++++++++++ src/gcrypt-int.h | 1 + src/gcrypt.h.in | 3 +++ src/libgcrypt.def | 1 + src/libgcrypt.vers | 2 +- src/visibility.c | 6 ++++++ src/visibility.h | 2 ++ tests/basic.c | 6 ++++++ 9 files changed, 37 insertions(+), 1 deletion(-) diff --git a/cipher/mac.c b/cipher/mac.c index fa36c7d..e675b49 100644 --- a/cipher/mac.c +++ b/cipher/mac.c @@ -369,6 +369,13 @@ _gcry_mac_verify (gcry_mac_hd_t hd, const void *buf, size_t buflen) } +int +_gcry_mac_get_algo (gcry_mac_hd_t hd) +{ + return hd->algo; +} + + unsigned int _gcry_mac_get_algo_maclen (int algo) { diff --git a/doc/gcrypt.texi b/doc/gcrypt.texi index 405c8fe..bb081da 100644 --- a/doc/gcrypt.texi +++ b/doc/gcrypt.texi @@ -3728,6 +3728,16 @@ the MAC calculated in object @var{h}. @end deftypefun +In some situations it might be hard to remember the algorithm used for +the MAC calculation. The following function might be used to get that +information: + + at deftypefun {int} gcry_mac_get_algo (gcry_mac_hd_t @var{h}) + +Retrieve the algorithm used with the handle @var{h}. 
+ at end deftypefun + + @c *********************************** @c ***** MAC info functions ********** @c *********************************** diff --git a/src/gcrypt-int.h b/src/gcrypt-int.h index 65dcb4d..8a6df84 100644 --- a/src/gcrypt-int.h +++ b/src/gcrypt-int.h @@ -180,6 +180,7 @@ gpg_err_code_t _gcry_mac_write (gcry_mac_hd_t hd, const void *buffer, gpg_err_code_t _gcry_mac_read (gcry_mac_hd_t hd, void *buffer, size_t *buflen); gpg_err_code_t _gcry_mac_verify (gcry_mac_hd_t hd, const void *buffer, size_t buflen); +int _gcry_mac_get_algo (gcry_mac_hd_t hd); unsigned int _gcry_mac_get_algo_maclen (int algo); unsigned int _gcry_mac_get_algo_keylen (int algo); const char *_gcry_mac_algo_name (int algorithm) _GCRY_GCC_ATTR_PURE; diff --git a/src/gcrypt.h.in b/src/gcrypt.h.in index bfb1338..23195af 100644 --- a/src/gcrypt.h.in +++ b/src/gcrypt.h.in @@ -1383,6 +1383,9 @@ gcry_error_t gcry_mac_read (gcry_mac_hd_t hd, void *buffer, size_t *buflen); gcry_error_t gcry_mac_verify (gcry_mac_hd_t hd, const void *buffer, size_t buflen); +/* Retrieve the algorithm used with MAC. */ +int gcry_mac_get_algo (gcry_mac_hd_t hd); + /* Retrieve the length in bytes of the MAC yielded by algorithm ALGO. */ unsigned int gcry_mac_get_algo_maclen (int algo); diff --git a/src/libgcrypt.def b/src/libgcrypt.def index a90efce..57ed490 100644 --- a/src/libgcrypt.def +++ b/src/libgcrypt.def @@ -274,6 +274,7 @@ EXPORTS gcry_mac_read @240 gcry_mac_verify @241 gcry_mac_ctl @242 + gcry_mac_get_algo @243 ;; end of file with public symbols for Windows. 
diff --git a/src/libgcrypt.vers b/src/libgcrypt.vers index 5118c81..7ee0541 100644 --- a/src/libgcrypt.vers +++ b/src/libgcrypt.vers @@ -54,7 +54,7 @@ GCRYPT_1.6 { gcry_cipher_authenticate; gcry_cipher_gettag; gcry_cipher_checktag; gcry_mac_algo_info; gcry_mac_algo_name; gcry_mac_map_name; - gcry_mac_get_algo_maclen; gcry_mac_get_algo_keylen; + gcry_mac_get_algo_maclen; gcry_mac_get_algo_keylen; gcry_mac_get_algo; gcry_mac_open; gcry_mac_close; gcry_mac_setkey; gcry_mac_setiv; gcry_mac_write; gcry_mac_read; gcry_mac_verify; gcry_mac_ctl; diff --git a/src/visibility.c b/src/visibility.c index 2989498..6ed57ca 100644 --- a/src/visibility.c +++ b/src/visibility.c @@ -855,6 +855,12 @@ gcry_mac_map_name (const char *string) return _gcry_mac_map_name (string); } +int +gcry_mac_get_algo (gcry_mac_hd_t hd) +{ + return _gcry_mac_get_algo (hd); +} + unsigned int gcry_mac_get_algo_maclen (int algo) { diff --git a/src/visibility.h b/src/visibility.h index 4127a43..96b5235 100644 --- a/src/visibility.h +++ b/src/visibility.h @@ -137,6 +137,7 @@ MARK_VISIBLEX (gcry_cipher_open) MARK_VISIBLEX (gcry_mac_algo_info) MARK_VISIBLEX (gcry_mac_algo_name) MARK_VISIBLEX (gcry_mac_map_name) +MARK_VISIBLEX (gcry_mac_get_algo) MARK_VISIBLEX (gcry_mac_get_algo_maclen) MARK_VISIBLEX (gcry_mac_get_algo_keylen) MARK_VISIBLEX (gcry_mac_open) @@ -380,6 +381,7 @@ MARK_VISIBLEX (_gcry_mpi_get_const) #define gcry_mac_algo_info _gcry_USE_THE_UNDERSCORED_FUNCTION #define gcry_mac_algo_name _gcry_USE_THE_UNDERSCORED_FUNCTION #define gcry_mac_map_name _gcry_USE_THE_UNDERSCORED_FUNCTION +#define gcry_mac_get_algo _gcry_USE_THE_UNDERSCORED_FUNCTION #define gcry_mac_get_algo_maclen _gcry_USE_THE_UNDERSCORED_FUNCTION #define gcry_mac_get_algo_keylen _gcry_USE_THE_UNDERSCORED_FUNCTION #define gcry_mac_open _gcry_USE_THE_UNDERSCORED_FUNCTION diff --git a/tests/basic.c b/tests/basic.c index 517daa2..80a9c86 100644 --- a/tests/basic.c +++ b/tests/basic.c @@ -4707,6 +4707,12 @@ check_one_mac (int algo, const char 
*data, int datalen, return; } + i = gcry_mac_get_algo (hd); + if (i != algo) + { + fail ("algo %d, gcry_mac_get_algo failed: %d\n", algo, i); + } + maclen = gcry_mac_get_algo_maclen (algo); if (maclen < 1 || maclen > 500) { -- 1.8.5.1 From dbaryshkov at gmail.com Fri Dec 27 09:37:10 2013 From: dbaryshkov at gmail.com (Dmitry Eremin-Solenikov) Date: Fri, 27 Dec 2013 12:37:10 +0400 Subject: [PATCH 1/3] Correct formatting of gcry_mac_get_algo_keylen documentation Message-ID: <1388133432-6743-1-git-send-email-dbaryshkov@gmail.com> * doc/gcrypt.texi: add braces near gcry_mac_get_algo_keylen documentation. Use braces around unsigned int in gcry_mac_get_algo_keylen documentation, otherwise texinfo breaks that and uses 'int' as a function definition. Signed-off-by: Dmitry Eremin-Solenikov --- doc/gcrypt.texi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/gcrypt.texi b/doc/gcrypt.texi index ea4b161..405c8fe 100644 --- a/doc/gcrypt.texi +++ b/doc/gcrypt.texi @@ -3773,7 +3773,7 @@ for the MAC value. On error @code{0} is returned. @end deftypefun - at deftypefun unsigned int gcry_mac_get_algo_keylen (@var{algo}) + at deftypefun {unsigned int} gcry_mac_get_algo_keylen (@var{algo}) This function returns length of the key for MAC algorithm @var{algo}. If the algorithm supports multiple key lengths, the default supported key -- 1.8.5.1 From ametzler at bebt.de Sat Dec 28 15:04:22 2013 From: ametzler at bebt.de (Andreas Metzler) Date: Sat, 28 Dec 2013 15:04:22 +0100 Subject: Libgcrypt 1.6.0 released References: <87haa8fzzm.fsf@vigenere.g10code.de> Message-ID: <4c23pa-nl4.ln1@argenau.downhill.at.eu.org> Werner Koch wrote: > The GNU project is pleased to announce the availability of Libgcrypt > version 1.6.0. This is the new stable version of Libgcrypt with the API > being mostly compatible to previous versions. Due to the removal of > certain long deprecated functions this version introduces an ABI change. [...] 
Hello, FYI libgcrypt 1.6.0 is now available in Debian/experimental. Initial buildability/test-suite success looks good. No errors yet, arm*, ia64 and mipsel still have to try. cu Andreas https://buildd.debian.org/status/package.php?p=libgcrypt20&suite=experimental http://buildd.debian-ports.org/status/package.php?p=libgcrypt20&suite=experimental -- `What a good friend you are to him, Dr. Maturin. His other friends are so grateful to you.' `I sew his ears on from time to time, sure' From jussi.kivilinna at iki.fi Sun Dec 29 18:27:12 2013 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sun, 29 Dec 2013 19:27:12 +0200 Subject: [PATCH] Fix 'make check' with parallel test-suite Message-ID: <20131229172712.20514.24366.stgit@localhost6.localdomain6> * configure.ac: Change 'hashtest-256g' to 'hashtest-256g.sh'. * tests/Makefile.am: Add '.sh' to hashtest-256g and '.test' to benchmark and bench-slope. * tests/hashtest-256g.in: Rename to... * tests/hashtest-256g.sh.in: ...this. -- Patch fixes following problem with parallel test-suite run on Ubuntu 13.10: fatal: making test-suite.log: failed to create bench-slope.trs fatal: making test-suite.log: failed to create bench-slope.log fatal: making test-suite.log: failed to create hashtest-256g.trs fatal: making test-suite.log: failed to create hashtest-256g.log make[3]: *** [test-suite.log] Error 1 Signed-off-by: Jussi Kivilinna --- configure.ac | 2 +- tests/Makefile.am | 6 +++--- tests/hashtest-256g.in | 7 ------- tests/hashtest-256g.sh.in | 7 +++++++ 4 files changed, 11 insertions(+), 11 deletions(-) delete mode 100755 tests/hashtest-256g.in create mode 100755 tests/hashtest-256g.sh.in diff --git a/configure.ac b/configure.ac index 27de850..5120337 100644 --- a/configure.ac +++ b/configure.ac @@ -1932,7 +1932,7 @@ src/libgcrypt-config src/versioninfo.rc tests/Makefile ]) -AC_CONFIG_FILES([tests/hashtest-256g], [chmod +x tests/hashtest-256g]) +AC_CONFIG_FILES([tests/hashtest-256g.sh], [chmod +x tests/hashtest-256g.sh])
AC_OUTPUT diff --git a/tests/Makefile.am b/tests/Makefile.am index f5b5b9f..86f516d 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -24,11 +24,11 @@ tests_bin = \ prime basic keygen pubkey hmac hashtest t-kdf keygrip \ fips186-dsa aeswrap pkcs1v2 random dsa-rfc6979 t-ed25519 -tests_bin_last = benchmark bench-slope +tests_bin_last = benchmark.test bench-slope.test tests_sh = -tests_sh_last = hashtest-256g +tests_sh_last = hashtest-256g.sh TESTS = $(tests_bin) $(tests_sh) $(tests_bin_last) $(tests_sh_last) @@ -52,4 +52,4 @@ noinst_PROGRAMS = $(tests_bin) $(tests_bin_last) fipsdrv rsacvt genhashdata EXTRA_DIST = README rsa-16k.key cavs_tests.sh cavs_driver.pl \ pkcs1v2-oaep.h pkcs1v2-pss.h pkcs1v2-v15c.h pkcs1v2-v15s.h \ - t-ed25519.inp stopwatch.h hashtest-256g.in + t-ed25519.inp stopwatch.h hashtest-256g.sh.in diff --git a/tests/hashtest-256g.in b/tests/hashtest-256g.in deleted file mode 100755 index e897c54..0000000 --- a/tests/hashtest-256g.in +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/sh - -algos="SHA1 SHA256 SHA512" - -test "@RUN_LARGE_DATA_TESTS@" = yes || exit 77 -echo " now running 256 GiB tests for $algos - this takes looong" -exec ./hashtest --gigs 256 $algos diff --git a/tests/hashtest-256g.sh.in b/tests/hashtest-256g.sh.in new file mode 100755 index 0000000..e897c54 --- /dev/null +++ b/tests/hashtest-256g.sh.in @@ -0,0 +1,7 @@ +#!/bin/sh + +algos="SHA1 SHA256 SHA512" + +test "@RUN_LARGE_DATA_TESTS@" = yes || exit 77 +echo " now running 256 GiB tests for $algos - this takes looong" +exec ./hashtest --gigs 256 $algos From jussi.kivilinna at iki.fi Sun Dec 29 18:41:15 2013 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sun, 29 Dec 2013 19:41:15 +0200 Subject: [PATCH] Parse /proc/cpuinfo for ARM HW features Message-ID: <20131229174115.17873.57578.stgit@localhost6.localdomain6> * src/hwf-arm.c [__linux__] (HAS_PROC_CPUINFO) (detect_arm_proc_cpuinfo): New. (_gcry_hwf_detect_arm) [HAS_PROC_CPUINFO]: Check '/proc/cpuinfo' for HW features. 
-- Some Linux platforms (read: Android) block read access to '/proc/self/auxv', which prevents NEON HW detection. Patch adds alternative check which parses '/proc/cpuinfo' which should be accessible by Android applications. Signed-off-by: Jussi Kivilinna --- src/hwf-arm.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 53 insertions(+), 2 deletions(-) diff --git a/src/hwf-arm.c b/src/hwf-arm.c index 8071894..dbbb607 100644 --- a/src/hwf-arm.c +++ b/src/hwf-arm.c @@ -32,6 +32,7 @@ #endif #undef HAS_SYS_AT_HWCAP +#undef HAS_PROC_CPUINFO #ifdef __linux__ #define HAS_SYS_AT_HWCAP 1 @@ -94,6 +95,54 @@ detect_arm_at_hwcap(void) return features; } +#define HAS_PROC_CPUINFO 1 + +static unsigned int +detect_arm_proc_cpuinfo(void) +{ + char buf[1024]; /* large enough */ + char *str_features, *str_neon; + FILE *f; + int readlen, i; + static int cpuinfo_initialized = 0; + static unsigned int stored_cpuinfo_features; + + if (cpuinfo_initialized) + return stored_cpuinfo_features; + + f = fopen("/proc/cpuinfo", "r"); + if (!f) + return 0; + + memset (buf, 0, sizeof(buf)); + readlen = fread (buf, 1, sizeof(buf), f); + fclose (f); + if (readlen <= 0 || readlen > sizeof(buf)) + return 0; + + buf[sizeof(buf) - 1] = '\0'; + + cpuinfo_initialized = 1; + stored_cpuinfo_features = 0; + + /* Find features line. */ + str_features = strstr(buf, "Features"); + if (!str_features) + return stored_cpuinfo_features; + + /* Lines to strings. */ + for (i = 0; i < sizeof(buf); i++) + if (buf[i] == '\n') + buf[i] = '\0'; + + /* Check for NEON.
*/ + str_neon = strstr(str_features, " neon"); + if (str_neon && (str_neon[5] == ' ' || str_neon[5] == '\0')) + stored_cpuinfo_features |= HWF_ARM_NEON; + + return stored_cpuinfo_features; +} + #endif /* __linux__ */ unsigned int @@ -103,8 +152,10 @@ _gcry_hwf_detect_arm (void) #if defined (HAS_SYS_AT_HWCAP) ret |= detect_arm_at_hwcap (); -#else - ret |= 0; +#endif + +#if defined (HAS_PROC_CPUINFO) + ret |= detect_arm_proc_cpuinfo (); #endif #if defined(__ARM_NEON__) && defined(ENABLE_NEON_SUPPORT) From pchrist at gentoo.org Mon Dec 30 00:41:13 2013 From: pchrist at gentoo.org (Panagiotis Christopoulos (pchrist)) Date: Mon, 30 Dec 2013 01:41:13 +0200 Subject: [PATCH] Adding better detection of AVX/AVX2 support Message-ID: <20131229234113.GA3647@earth.members.linode.com> After upgrading libgcrypt from 1.5.3 to 1.6.0 on a remote XEN system (linode) my gpg2 stopped working properly, throwing SIGILL signals when doing sha512 operations etc. I managed to debug this with the help of Doublas Freed (dwfreed at mtu.edu) and it seems that the current AVX detection just checks for bit 28 on cpuid but the check still works on systems that have disabled the avx/avx2 instructions for some reason (eg. performance/unstability) resulting in SIGILLs (eg. when trying _gcry_sha512_transform_amd64_avx() ). From Intel resources[1][2], I found additional checks for better AVX detection and applied them in the following patch. Please review/change accordingly and commit some better AVX detection mechanism. The AVX part is tested but could not test the AVX2 one, because I lack proper hardware. I can provide additional information upon request. Use the patch only as a guideline, as it's not thoroughly tested. 
[1] http://software.intel.com/en-us/blogs/2011/04/14/is-avx-enabled [2] http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-manual-325462.pdf (sections 14.3 and 14.7.1) Signed-off-by: Panagiotis Christopoulos (pchrist) --- src/hwf-x86.c | 39 +++++++++++++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 4 deletions(-) diff --git a/src/hwf-x86.c b/src/hwf-x86.c index 4e82558..e58641e 100644 --- a/src/hwf-x86.c +++ b/src/hwf-x86.c @@ -129,6 +129,22 @@ get_cpuid(unsigned int in, unsigned int *eax, unsigned int *ebx, if (edx) *edx = regs[3]; } + +static void +get_xgetbv(unsigned int *eax) +{ + unsigned int t_eax; + + asm volatile + ("xor %%ecx, %%ecx; xgetbv\n\t" + : "=a" (t_eax) + ); + + if (eax) + *eax = t_eax; +} + + #endif /* x86-64 && GNUC */ @@ -138,6 +154,7 @@ detect_x86_gnuc (void) { char vendor_id[12+1]; unsigned int features; + unsigned int xgetbv_result; unsigned int max_cpuid_level; unsigned int result = 0; @@ -216,9 +233,12 @@ detect_x86_gnuc (void) result |= HWF_INTEL_AESNI; #endif /*ENABLE_AESNI_SUPPORT*/ #ifdef ENABLE_AVX_SUPPORT - /* Test bit 28 for AVX. */ - if (features & 0x10000000) - result |= HWF_INTEL_AVX; + /* Test bits 27 and 28 for AVX. */ + if ((features & 0x18000000) == 0x18000000 ) { + get_xgetbv(&xgetbv_result); + if ((xgetbv_result & 0x6) == 0x6) + result |= HWF_INTEL_AVX; + } #endif /*ENABLE_AVX_SUPPORT*/ #ifdef ENABLE_DRNG_SUPPORT /* Test bit 30 for RDRAND. */ @@ -241,8 +261,19 @@ detect_x86_gnuc (void) #ifdef ENABLE_AVX2_SUPPORT /* Test bit 5 for AVX2. 
*/ - if (features & 0x00000020) + if (features & 0x00000020) { +#ifndef ENABLE_AVX_SUPPORT + get_cpuid(1, NULL, NULL, &features, NULL); + if ((features & 0x18000000) == 0x18000000 ) { + get_xgetbv(&xgetbv_result); + if ((xgetbv_result & 0x6) == 0x6) + result |= HWF_INTEL_AVX2; + } +#else + if (result & HWF_INTEL_AVX) result |= HWF_INTEL_AVX2; +#endif + } #endif /*ENABLE_AVX_SUPPORT*/ } -- 1.8.5.2 -- Panagiotis Christopoulos ( pchrist ) ( Gentoo Lisp Project ) -------------- next part -------------- A non-text attachment was scrubbed... Name: not available Type: application/pgp-signature Size: 291 bytes Desc: not available URL: From jussi.kivilinna at iki.fi Mon Dec 30 10:54:49 2013 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Mon, 30 Dec 2013 11:54:49 +0200 Subject: [PATCH] [for-1.6] Fix buggy/incomplete detection of AVX/AVX2 support In-Reply-To: <20131229234113.GA3647@earth.members.linode.com> References: <20131229234113.GA3647@earth.members.linode.com> Message-ID: <20131230095449.27363.42713.stgit@localhost6.localdomain6> * configure.ac: Also check for 'xgetbv' instruction in AVX and AVX2 inline assembly checks. * src/hwf-x86.c [__i386__] (get_xgetbv): New function. [__x86_64__] (get_xgetbv): New function. [HAS_X86_CPUID] (detect_x86_gnuc): Check for OSXSAVE and OS support for XMM&YMM registers and enable AVX/AVX2 only if XMM&YMM registers are supported by OS. -- This patch is based on original patch and bug report by Panagiotis Christopoulos: Adding better detection of AVX/AVX2 support After upgrading libgcrypt from 1.5.3 to 1.6.0 on a remote XEN system (linode) my gpg2 stopped working properly, throwing SIGILL signals when doing sha512 operations etc. I managed to debug this with the help of Doublas Freed (dwfreed at mtu.edu) and it seems that the current AVX detection just checks for bit 28 on cpuid but the check still works on systems that have disabled the avx/avx2 instructions for some reason (eg. 
performance/unstability) resulting in SIGILLs (eg. when trying _gcry_sha512_transform_amd64_avx() ). From Intel resources[1][2], I found additional checks for better AVX detection and applied them in the following patch. Please review/change accordingly and commit some better AVX detection mechanism. The AVX part is tested but could not test the AVX2 one, because I lack proper hardware. I can provide additional information upon request. Use the patch only as a guideline, as it's not thoroughly tested. [1] http://software.intel.com/en-us/blogs/2011/04/14/is-avx-enabled [2] http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-manual-325462.pdf (sections 14.3 and 14.7.1) Reported-by: Panagiotis Christopoulos (pchrist) Cc: Doublas Freed Cc: Tim Harder Signed-off-by: Jussi Kivilinna --- configure.ac | 4 ++-- src/hwf-x86.c | 46 +++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 47 insertions(+), 3 deletions(-) diff --git a/configure.ac b/configure.ac index c99765c..e98fa4e 100644 --- a/configure.ac +++ b/configure.ac @@ -1033,7 +1033,7 @@ AC_CACHE_CHECK([whether GCC inline assembler supports AVX instructions], [gcry_cv_gcc_inline_asm_avx=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void a(void) { - __asm__("vaesdeclast (%[mem]),%%xmm0,%%xmm7\n\t"::[mem]"r"(0):); + __asm__("xgetbv; vaesdeclast (%[mem]),%%xmm0,%%xmm7\n\t"::[mem]"r"(0):); }]])], [gcry_cv_gcc_inline_asm_avx=yes])]) if test "$gcry_cv_gcc_inline_asm_avx" = "yes" ; then @@ -1050,7 +1050,7 @@ AC_CACHE_CHECK([whether GCC inline assembler supports AVX2 instructions], [gcry_cv_gcc_inline_asm_avx2=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void a(void) { - __asm__("vpbroadcastb %%xmm7,%%ymm1\n\t":::"cc"); + __asm__("xgetbv; vpbroadcastb %%xmm7,%%ymm1\n\t":::"cc"); }]])], [gcry_cv_gcc_inline_asm_avx2=yes])]) if test "$gcry_cv_gcc_inline_asm_avx2" = "yes" ; then diff --git a/src/hwf-x86.c b/src/hwf-x86.c index 4e82558..0591b4f 100644 --- a/src/hwf-x86.c +++ 
b/src/hwf-x86.c @@ -95,6 +95,21 @@ get_cpuid(unsigned int in, unsigned int *eax, unsigned int *ebx, if (edx) *edx = regs[3]; } + +static unsigned int +get_xgetbv(void) +{ + unsigned int t_eax; + + asm volatile + ("xgetbv\n\t" + : "=a" (t_eax) + : "c" (0) + ); + + return t_eax; +} + #endif /* i386 && GNUC */ @@ -129,6 +144,21 @@ get_cpuid(unsigned int in, unsigned int *eax, unsigned int *ebx, if (edx) *edx = regs[3]; } + +static unsigned int +get_xgetbv(void) +{ + unsigned int t_eax; + + asm volatile + ("xgetbv\n\t" + : "=a" (t_eax) + : "c" (0) + ); + + return t_eax; +} + #endif /* x86-64 && GNUC */ @@ -138,9 +168,12 @@ detect_x86_gnuc (void) { char vendor_id[12+1]; unsigned int features; + unsigned int os_supports_avx_avx2_registers = 0; unsigned int max_cpuid_level; unsigned int result = 0; + (void)os_supports_avx_avx2_registers; + if (!is_cpuid_available()) return 0; @@ -215,10 +248,20 @@ detect_x86_gnuc (void) if (features & 0x02000000) result |= HWF_INTEL_AESNI; #endif /*ENABLE_AESNI_SUPPORT*/ +#if defined(ENABLE_AVX_SUPPORT) || defined(ENABLE_AVX2_SUPPORT) + /* Test bit 27 for OSXSAVE (required for AVX/AVX2). */ + if (features & 0x08000000) + { + /* Check that OS has enabled both XMM and YMM state support. */ + if ((get_xgetbv() & 0x6) == 0x6) + os_supports_avx_avx2_registers = 1; + } +#endif #ifdef ENABLE_AVX_SUPPORT /* Test bit 28 for AVX. */ if (features & 0x10000000) - result |= HWF_INTEL_AVX; + if (os_supports_avx_avx2_registers) + result |= HWF_INTEL_AVX; #endif /*ENABLE_AVX_SUPPORT*/ #ifdef ENABLE_DRNG_SUPPORT /* Test bit 30 for RDRAND. */ @@ -242,6 +285,7 @@ detect_x86_gnuc (void) #ifdef ENABLE_AVX2_SUPPORT /* Test bit 5 for AVX2. */ if (features & 0x00000020) + if (os_supports_avx_avx2_registers) result |= HWF_INTEL_AVX2; #endif /*ENABLE_AVX_SUPPORT*/ } From cvs at cvs.gnupg.org Mon Dec 30 10:58:41 2013 From: cvs at cvs.gnupg.org (by Jussi Kivilinna) Date: Mon, 30 Dec 2013 10:58:41 +0100 Subject: [git] GCRYPT - branch, master, updated. 
libgcrypt-1.6.0-10-gbbcb121 Message-ID: This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "The GNU crypto library". The branch, master has been updated via bbcb12187afb1756cb27296166b57fa19ee45d4d (commit) from b7e814f93ee40fcfe17a187a8989c07fde2ba0cd (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. - Log ----------------------------------------------------------------- commit bbcb12187afb1756cb27296166b57fa19ee45d4d Author: Jussi Kivilinna Date: Mon Dec 30 11:57:57 2013 +0200 Fix buggy/incomplete detection of AVX/AVX2 support * configure.ac: Also check for 'xgetbv' instruction in AVX and AVX2 inline assembly checks. * src/hwf-x86.c [__i386__] (get_xgetbv): New function. [__x86_64__] (get_xgetbv): New function. [HAS_X86_CPUID] (detect_x86_gnuc): Check for OSXSAVE and OS support for XMM&YMM registers and enable AVX/AVX2 only if XMM&YMM registers are supported by OS. -- This patch is based on original patch and bug report by Panagiotis Christopoulos: Adding better detection of AVX/AVX2 support After upgrading libgcrypt from 1.5.3 to 1.6.0 on a remote XEN system (linode) my gpg2 stopped working properly, throwing SIGILL signals when doing sha512 operations etc. I managed to debug this with the help of Doublas Freed (dwfreed at mtu.edu) and it seems that the current AVX detection just checks for bit 28 on cpuid but the check still works on systems that have disabled the avx/avx2 instructions for some reason (eg. performance/unstability) resulting in SIGILLs (eg. when trying _gcry_sha512_transform_amd64_avx() ). From Intel resources[1][2], I found additional checks for better AVX detection and applied them in the following patch. Please review/change accordingly and commit some better AVX detection mechanism. 
The AVX part is tested but could not test the AVX2 one, because I lack proper hardware. I can provide additional information upon request. Use the patch only as a guideline, as it's not thoroughly tested. [1] http://software.intel.com/en-us/blogs/2011/04/14/is-avx-enabled [2] http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-manual-325462.pdf (sections 14.3 and 14.7.1) Reported-by: Panagiotis Christopoulos (pchrist) Cc: Doublas Freed Cc: Tim Harder Signed-off-by: Jussi Kivilinna diff --git a/configure.ac b/configure.ac index 27de850..8b43d9a 100644 --- a/configure.ac +++ b/configure.ac @@ -1033,7 +1033,7 @@ AC_CACHE_CHECK([whether GCC inline assembler supports AVX instructions], [gcry_cv_gcc_inline_asm_avx=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void a(void) { - __asm__("vaesdeclast (%[mem]),%%xmm0,%%xmm7\n\t"::[mem]"r"(0):); + __asm__("xgetbv; vaesdeclast (%[mem]),%%xmm0,%%xmm7\n\t"::[mem]"r"(0):); }]])], [gcry_cv_gcc_inline_asm_avx=yes])]) if test "$gcry_cv_gcc_inline_asm_avx" = "yes" ; then @@ -1050,7 +1050,7 @@ AC_CACHE_CHECK([whether GCC inline assembler supports AVX2 instructions], [gcry_cv_gcc_inline_asm_avx2=no AC_COMPILE_IFELSE([AC_LANG_SOURCE( [[void a(void) { - __asm__("vpbroadcastb %%xmm7,%%ymm1\n\t":::"cc"); + __asm__("xgetbv; vpbroadcastb %%xmm7,%%ymm1\n\t":::"cc"); }]])], [gcry_cv_gcc_inline_asm_avx2=yes])]) if test "$gcry_cv_gcc_inline_asm_avx2" = "yes" ; then diff --git a/src/hwf-x86.c b/src/hwf-x86.c index 4e82558..0591b4f 100644 --- a/src/hwf-x86.c +++ b/src/hwf-x86.c @@ -95,6 +95,21 @@ get_cpuid(unsigned int in, unsigned int *eax, unsigned int *ebx, if (edx) *edx = regs[3]; } + +static unsigned int +get_xgetbv(void) +{ + unsigned int t_eax; + + asm volatile + ("xgetbv\n\t" + : "=a" (t_eax) + : "c" (0) + ); + + return t_eax; +} + #endif /* i386 && GNUC */ @@ -129,6 +144,21 @@ get_cpuid(unsigned int in, unsigned int *eax, unsigned int *ebx, if (edx) *edx = regs[3]; } + +static unsigned 
int +get_xgetbv(void) +{ + unsigned int t_eax; + + asm volatile + ("xgetbv\n\t" + : "=a" (t_eax) + : "c" (0) + ); + + return t_eax; +} + #endif /* x86-64 && GNUC */ @@ -138,9 +168,12 @@ detect_x86_gnuc (void) { char vendor_id[12+1]; unsigned int features; + unsigned int os_supports_avx_avx2_registers = 0; unsigned int max_cpuid_level; unsigned int result = 0; + (void)os_supports_avx_avx2_registers; + if (!is_cpuid_available()) return 0; @@ -215,10 +248,20 @@ detect_x86_gnuc (void) if (features & 0x02000000) result |= HWF_INTEL_AESNI; #endif /*ENABLE_AESNI_SUPPORT*/ +#if defined(ENABLE_AVX_SUPPORT) || defined(ENABLE_AVX2_SUPPORT) + /* Test bit 27 for OSXSAVE (required for AVX/AVX2). */ + if (features & 0x08000000) + { + /* Check that OS has enabled both XMM and YMM state support. */ + if ((get_xgetbv() & 0x6) == 0x6) + os_supports_avx_avx2_registers = 1; + } +#endif #ifdef ENABLE_AVX_SUPPORT /* Test bit 28 for AVX. */ if (features & 0x10000000) - result |= HWF_INTEL_AVX; + if (os_supports_avx_avx2_registers) + result |= HWF_INTEL_AVX; #endif /*ENABLE_AVX_SUPPORT*/ #ifdef ENABLE_DRNG_SUPPORT /* Test bit 30 for RDRAND. */ @@ -242,6 +285,7 @@ detect_x86_gnuc (void) #ifdef ENABLE_AVX2_SUPPORT /* Test bit 5 for AVX2. 
*/ if (features & 0x00000020) + if (os_supports_avx_avx2_registers) result |= HWF_INTEL_AVX2; #endif /*ENABLE_AVX_SUPPORT*/ } ----------------------------------------------------------------------- Summary of changes: configure.ac | 4 ++-- src/hwf-x86.c | 46 +++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 47 insertions(+), 3 deletions(-) hooks/post-receive -- The GNU crypto library http://git.gnupg.org _______________________________________________ Gnupg-commits mailing list Gnupg-commits at gnupg.org http://lists.gnupg.org/mailman/listinfo/gnupg-commits From jussi.kivilinna at iki.fi Mon Dec 30 14:02:43 2013 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Mon, 30 Dec 2013 15:02:43 +0200 Subject: [PATCH] Fix 'make check' with parallel test-suite In-Reply-To: <20131229172712.20514.24366.stgit@localhost6.localdomain6> References: <20131229172712.20514.24366.stgit@localhost6.localdomain6> Message-ID: <52C16EF3.9080106@iki.fi> On 29.12.2013 19:27, Jussi Kivilinna wrote: > * configure.ac: Change 'hashtest-256g' to 'hashtest-256g.sh'. > * tests/Makefile.am: Add '.sh' to hashtest-256g and '.test' to > benchmark and bench-slope. > * tests/hashtest-256g.in: Rename to... > * tests/hashtest-256g.sh.in: ...this. > -- > > Patch fixes following problem with parallel test-suite run on Ubuntu 13.10: Problem with this patch is that hashtest-256g.log was not updated to hashtest-256g.sh.log in tests/Makefile.am and so hashtest-256g is being run in parallel with benchmark tests. With 'hashtest-256g.sh.log:' change, I get same kind of error as below. Any ideas for workaround? Problem seems to happen with automake-1.13, not earlier.
-Jussi > > fatal: making test-suite.log: failed to create bench-slope.trs > fatal: making test-suite.log: failed to create bench-slope.log > fatal: making test-suite.log: failed to create hashtest-256g.trs > fatal: making test-suite.log: failed to create hashtest-256g.log > make[3]: *** [test-suite.log] Error 1 > > Signed-off-by: Jussi Kivilinna > --- > configure.ac | 2 +- > tests/Makefile.am | 6 +++--- > tests/hashtest-256g.in | 7 ------- > tests/hashtest-256g.sh.in | 7 +++++++ > 4 files changed, 11 insertions(+), 11 deletions(-) > delete mode 100755 tests/hashtest-256g.in > create mode 100755 tests/hashtest-256g.sh.in > > diff --git a/configure.ac b/configure.ac > index 27de850..5120337 100644 > --- a/configure.ac > +++ b/configure.ac > @@ -1932,7 +1932,7 @@ src/libgcrypt-config > src/versioninfo.rc > tests/Makefile > ]) > -AC_CONFIG_FILES([tests/hashtest-256g], [chmod +x tests/hashtest-256g]) > +AC_CONFIG_FILES([tests/hashtest-256g.sh], [chmod +x tests/hashtest-256g.sh]) > AC_OUTPUT > > > diff --git a/tests/Makefile.am b/tests/Makefile.am > index f5b5b9f..86f516d 100644 > --- a/tests/Makefile.am > +++ b/tests/Makefile.am > @@ -24,11 +24,11 @@ tests_bin = \ > prime basic keygen pubkey hmac hashtest t-kdf keygrip \ > fips186-dsa aeswrap pkcs1v2 random dsa-rfc6979 t-ed25519 > > -tests_bin_last = benchmark bench-slope > +tests_bin_last = benchmark.test bench-slope.test > > tests_sh = > > -tests_sh_last = hashtest-256g > +tests_sh_last = hashtest-256g.sh > > TESTS = $(tests_bin) $(tests_sh) $(tests_bin_last) $(tests_sh_last) > > @@ -52,4 +52,4 @@ noinst_PROGRAMS = $(tests_bin) $(tests_bin_last) fipsdrv rsacvt genhashdata > > EXTRA_DIST = README rsa-16k.key cavs_tests.sh cavs_driver.pl \ > pkcs1v2-oaep.h pkcs1v2-pss.h pkcs1v2-v15c.h pkcs1v2-v15s.h \ > - t-ed25519.inp stopwatch.h hashtest-256g.in > + t-ed25519.inp stopwatch.h hashtest-256g.sh.in > diff --git a/tests/hashtest-256g.in b/tests/hashtest-256g.in > deleted file mode 100755 > index e897c54..0000000 > --- 
a/tests/hashtest-256g.in > +++ /dev/null > @@ -1,7 +0,0 @@ > -#!/bin/sh > - > -algos="SHA1 SHA256 SHA512" > - > -test "@RUN_LARGE_DATA_TESTS@" = yes || exit 77 > -echo " now running 256 GiB tests for $algos - this takes looong" > -exec ./hashtest --gigs 256 $algos > diff --git a/tests/hashtest-256g.sh.in b/tests/hashtest-256g.sh.in > new file mode 100755 > index 0000000..e897c54 > --- /dev/null > +++ b/tests/hashtest-256g.sh.in > @@ -0,0 +1,7 @@ > +#!/bin/sh > + > +algos="SHA1 SHA256 SHA512" > + > +test "@RUN_LARGE_DATA_TESTS@" = yes || exit 77 > +echo " now running 256 GiB tests for $algos - this takes looong" > +exec ./hashtest --gigs 256 $algos > > > _______________________________________________ > Gcrypt-devel mailing list > Gcrypt-devel at gnupg.org > http://lists.gnupg.org/mailman/listinfo/gcrypt-devel > From cvs at cvs.gnupg.org Mon Dec 30 14:13:59 2013 From: cvs at cvs.gnupg.org (by Jussi Kivilinna) Date: Mon, 30 Dec 2013 14:13:59 +0100 Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.6.0-12-g7547898 Message-ID: This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "The GNU crypto library". The branch, master has been updated via 7547898109c72a97e3102b2a045ee4fdb2aa40bf (commit) via a05be441d8cd89b90d8d58e3a343a436dae377d0 (commit) from bbcb12187afb1756cb27296166b57fa19ee45d4d (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. - Log ----------------------------------------------------------------- commit 7547898109c72a97e3102b2a045ee4fdb2aa40bf Author: Jussi Kivilinna