From dbaryshkov at gmail.com  Fri Aug  2 09:14:14 2013
From: dbaryshkov at gmail.com (Dmitry Eremin-Solenikov)
Date: Fri, 2 Aug 2013 11:14:14 +0400
Subject: [RFC 1/2] Add API to support AEAD cipher modes
In-Reply-To: <1375427655-5894-1-git-send-email-dbaryshkov@gmail.com>
References: <1375427655-5894-1-git-send-email-dbaryshkov@gmail.com>
Message-ID: <1375427655-5894-2-git-send-email-dbaryshkov@gmail.com>

* cipher/cipher.c (_gcry_cipher_authenticate, _gcry_cipher_tag): New.
* src/visibility.c (gcry_cipher_authenticate, gcry_cipher_tag): New.
* src/gcrypt.h.in, src/visibility.h: Add declarations of these functions.
* src/libgcrypt.def, src/libgcrypt.vers: Export the new functions.
--
Authenticated Encryption with Associated Data (AEAD) cipher modes
provide an authentication tag that can be used to authenticate the
message. At the same time, they allow one to specify additional
(unencrypted) data that will be authenticated together with the
message. This class of cipher modes requires the additional API
added in this commit.

Signed-off-by: Dmitry Eremin-Solenikov
---
 cipher/cipher.c    | 15 +++++++++++++++
 src/gcrypt.h.in    |  7 +++++++
 src/libgcrypt.def  |  2 ++
 src/libgcrypt.vers |  1 +
 src/visibility.c   | 18 ++++++++++++++++++
 src/visibility.h   |  6 ++++++
 6 files changed, 49 insertions(+)

diff --git a/cipher/cipher.c b/cipher/cipher.c
index 08d6165..99bd3cd 100644
--- a/cipher/cipher.c
+++ b/cipher/cipher.c
@@ -1174,6 +1174,21 @@ _gcry_cipher_setctr (gcry_cipher_hd_t hd, const void *ctr, size_t ctrlen)
   return 0;
 }

+gcry_error_t
+_gcry_cipher_authenticate (gcry_cipher_hd_t hd,
+                           const void *aad, size_t aadsize)
+{
+  log_fatal ("gcry_cipher_authenticate: invalid mode %d\n", hd->mode );
+  return gpg_error (GPG_ERR_INV_CIPHER_MODE);
+}
+
+gcry_error_t
+_gcry_cipher_tag (gcry_cipher_hd_t hd, void *out, size_t outsize)
+{
+  log_fatal ("gcry_cipher_tag: invalid mode %d\n", hd->mode );
+  return gpg_error (GPG_ERR_INV_CIPHER_MODE);
+}
+
 gcry_error_t
 gcry_cipher_ctl( gcry_cipher_hd_t h, int cmd, void *buffer, size_t buflen)

diff --git a/src/gcrypt.h.in b/src/gcrypt.h.in
index 06d6663..faedb33 100644
--- a/src/gcrypt.h.in
+++ b/src/gcrypt.h.in
@@ -910,6 +910,13 @@ gcry_error_t gcry_cipher_setkey (gcry_cipher_hd_t hd,
 gcry_error_t gcry_cipher_setiv (gcry_cipher_hd_t hd,
                                 const void *iv, size_t ivlen);

+/* Provide additional authentication data for AEAD modes/ciphers. */
+gcry_error_t gcry_cipher_authenticate (gcry_cipher_hd_t h,
+                                       const void *aad, size_t aadlen);
+
+/* Get the authentication tag for AEAD modes/ciphers. */
+gcry_error_t gcry_cipher_tag (gcry_cipher_hd_t h,
+                              void *out, size_t outsize);

 /* Reset the handle to the state after open. */
 #define gcry_cipher_reset(h)  gcry_cipher_ctl ((h), GCRYCTL_RESET, NULL, 0)

diff --git a/src/libgcrypt.def b/src/libgcrypt.def
index bbc8f43..58695fa 100644
--- a/src/libgcrypt.def
+++ b/src/libgcrypt.def
@@ -239,5 +239,7 @@ EXPORTS
       gcry_sexp_get_buffer  @214

+      gcry_cipher_authenticate @215
+      gcry_cipher_tag @216

 ;; end of file with public symbols for Windows.
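For orientation, here is the call sequence these new entry points are designed for, distilled from the tests added in patch 2/2. This is a minimal sketch only, not part of the patch: key, IV, and AAD contents plus all error checking are elided, and GCRY_CIPHER_MODE_GCM itself only becomes available with patch 2/2.

  #include <gcrypt.h>

  /* Sketch: seal PTEXT under an AEAD mode via the API added here. */
  static void
  aead_seal_sketch (const unsigned char key[16], const unsigned char iv[12],
                    const unsigned char *aad, size_t aadlen,
                    const unsigned char *ptext, size_t len,
                    unsigned char *ctext, unsigned char tag[16])
  {
    gcry_cipher_hd_t hd;

    gcry_cipher_open (&hd, GCRY_CIPHER_AES, GCRY_CIPHER_MODE_GCM, 0);
    gcry_cipher_setkey (hd, key, 16);
    gcry_cipher_setiv (hd, iv, 12);
    gcry_cipher_authenticate (hd, aad, aadlen);  /* new in this patch */
    gcry_cipher_encrypt (hd, ctext, len, ptext, len);
    gcry_cipher_tag (hd, tag, 16);               /* new in this patch */
    gcry_cipher_close (hd);
  }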
diff --git a/src/libgcrypt.vers b/src/libgcrypt.vers
index 473ee68..94235f4 100644
--- a/src/libgcrypt.vers
+++ b/src/libgcrypt.vers
@@ -50,6 +50,7 @@ GCRYPT_1.6 {
     gcry_cipher_info; gcry_cipher_map_name;
     gcry_cipher_mode_from_oid; gcry_cipher_open; gcry_cipher_setkey;
     gcry_cipher_setiv; gcry_cipher_setctr;
+    gcry_cipher_authenticate; gcry_cipher_tag;

     gcry_pk_algo_info; gcry_pk_algo_name; gcry_pk_ctl; gcry_pk_decrypt;
     gcry_pk_encrypt; gcry_pk_genkey;

diff --git a/src/visibility.c b/src/visibility.c
index bb51d58..6f69ed3 100644
--- a/src/visibility.c
+++ b/src/visibility.c
@@ -658,6 +658,24 @@ gcry_cipher_setiv (gcry_cipher_hd_t hd, const void *iv, size_t ivlen)
   return _gcry_cipher_setiv (hd, iv, ivlen);
 }

+gcry_error_t
+gcry_cipher_tag (gcry_cipher_hd_t hd, void *out, size_t outsize)
+{
+  if (!fips_is_operational ())
+    return gpg_error (fips_not_operational ());
+
+  return _gcry_cipher_tag (hd, out, outsize);
+}
+
+gcry_error_t
+gcry_cipher_authenticate (gcry_cipher_hd_t hd, const void *aad, size_t aadsize)
+{
+  if (!fips_is_operational ())
+    return gpg_error (fips_not_operational ());
+
+  return _gcry_cipher_authenticate (hd, aad, aadsize);
+}
+
 gpg_error_t
 gcry_cipher_setctr (gcry_cipher_hd_t hd, const void *ctr, size_t ctrlen)
 {

diff --git a/src/visibility.h b/src/visibility.h
index 54da016..9b8065a 100644
--- a/src/visibility.h
+++ b/src/visibility.h
@@ -80,6 +80,8 @@
 #define gcry_cipher_setkey          _gcry_cipher_setkey
 #define gcry_cipher_setiv           _gcry_cipher_setiv
 #define gcry_cipher_setctr          _gcry_cipher_setctr
+#define gcry_cipher_authenticate    _gcry_cipher_authenticate
+#define gcry_cipher_tag             _gcry_cipher_tag
 #define gcry_cipher_ctl             _gcry_cipher_ctl
 #define gcry_cipher_decrypt         _gcry_cipher_decrypt
 #define gcry_cipher_encrypt         _gcry_cipher_encrypt
@@ -296,6 +298,8 @@ gcry_err_code_t gcry_md_get (gcry_md_hd_t hd, int algo,
 #undef gcry_cipher_setkey
 #undef gcry_cipher_setiv
 #undef gcry_cipher_setctr
+#undef gcry_cipher_authenticate
+#undef gcry_cipher_tag
 #undef gcry_cipher_ctl
 #undef gcry_cipher_decrypt
 #undef gcry_cipher_encrypt
@@ -472,6 +476,8 @@ MARK_VISIBLE (gcry_cipher_close)
 MARK_VISIBLE (gcry_cipher_setkey)
 MARK_VISIBLE (gcry_cipher_setiv)
 MARK_VISIBLE (gcry_cipher_setctr)
+MARK_VISIBLE (gcry_cipher_authenticate)
+MARK_VISIBLE (gcry_cipher_tag)
 MARK_VISIBLE (gcry_cipher_ctl)
 MARK_VISIBLE (gcry_cipher_decrypt)
 MARK_VISIBLE (gcry_cipher_encrypt)

-- 
1.7.10.4

From dbaryshkov at gmail.com  Fri Aug  2 09:14:13 2013
From: dbaryshkov at gmail.com (Dmitry Eremin-Solenikov)
Date: Fri, 2 Aug 2013 11:14:13 +0400
Subject: [RFC 0/2] Draft implementation of GCM
Message-ID: <1375427655-5894-1-git-send-email-dbaryshkov@gmail.com>

Hello,

I have been working on a GCM (Galois Counter Mode) implementation. It is
not yet finished (the ghash/multiplication functions are very slow,
cipher-gcm.c contains lots of magic values of '16', etc.). However, I
would like to present these patches for review. Could you please provide
feedback?

-- 
With best wishes
Dmitry

From dbaryshkov at gmail.com  Fri Aug  2 09:14:15 2013
From: dbaryshkov at gmail.com (Dmitry Eremin-Solenikov)
Date: Fri, 2 Aug 2013 11:14:15 +0400
Subject: [RFC 2/2] FIXME: initial implementation of GCM
In-Reply-To: <1375427655-5894-1-git-send-email-dbaryshkov@gmail.com>
References: <1375427655-5894-1-git-send-email-dbaryshkov@gmail.com>
Message-ID: <1375427655-5894-3-git-send-email-dbaryshkov@gmail.com>

Currently it is very slow.
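(For context on the slowness: the bottleneck is the bit-at-a-time GF(2^128) multiplication in ghash() below. Each 16-byte block X_i is folded into the running digest as

  Y_0 = 0
  Y_i = (Y_{i-1} XOR X_i) * H

where H is the hash subkey and the product is reduced modulo x^128 + x^7 + x^2 + x + 1. This draft evaluates the product with up to 128 conditional 16-byte XORs plus 128 one-bit shifts of the 16-byte accumulator per block. Table-driven variants that precompute multiples of H, or a hardware carry-less multiply such as PCLMULQDQ, are the usual remedies; neither is attempted here.)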
Signed-off-by: Dmitry Eremin-Solenikov
---
 cipher/Makefile.am       |   1 +
 cipher/cipher-gcm.c      | 282 +++++++++++++++++++++++++++++++++++++++
 cipher/cipher-internal.h |  27 ++++
 cipher/cipher.c          |  30 +++++
 src/gcrypt.h.in          |   6 +-
 tests/basic.c            | 332 ++++++++++++++++++++++++++++++++++++++++++++++
 tests/benchmark.c        |   5 +-
 7 files changed, 681 insertions(+), 2 deletions(-)
 create mode 100644 cipher/cipher-gcm.c

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 75ad987..e185975 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -40,6 +40,7 @@ libcipher_la_LIBADD = $(GCRYPT_MODULES)
 libcipher_la_SOURCES = \
 cipher.c cipher-internal.h \
 cipher-cbc.c cipher-cfb.c cipher-ofb.c cipher-ctr.c cipher-aeswrap.c \
+cipher-gcm.c \
 cipher-selftest.c cipher-selftest.h \
 pubkey.c pubkey-internal.h \
 md.c \

diff --git a/cipher/cipher-gcm.c b/cipher/cipher-gcm.c
new file mode 100644
index 0000000..c216b90
--- /dev/null
+++ b/cipher/cipher-gcm.c
@@ -0,0 +1,282 @@
+/* cipher-gcm.c - Generic Galois Counter Mode implementation
+ * Copyright (C) 2013 Dmitry Eremin-Solenikov
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "g10lib.h"
+#include "cipher.h"
+#include "ath.h"
+#include "bufhelp.h"
+#include "./cipher-internal.h"
+
+static unsigned bshift(unsigned char *b)
+{
+  unsigned char c;
+  int i;
+  c = b[15] & 1;
+  for (i = 15; i > 0; i--)
+    {
+      b[i] = (b[i] >> 1) | (b[i-1] << 7);
+    }
+  b[i] >>= 1;
+  return c;
+}
+
+static void ghash(unsigned char *hsub, unsigned char *result, const unsigned char *buf)
+{
+  unsigned char V[16];
+  int i, j;
+
+  memcpy(V, result, 16);
+  buf_xor(V, V, buf, 16);
+
+  memset(result, 0, 16);
+
+  for (i = 0; i < 16; i++)
+    {
+      for (j = 0x80; j ; j >>= 1)
+        {
+          if (hsub[i] & j)
+            buf_xor(result, result, V, 16);
+          if (bshift(V))
+            V[0] ^= 0xe1;
+        }
+    }
+}
+
+
+gcry_err_code_t
+_gcry_cipher_gcm_encrypt (gcry_cipher_hd_t c,
+                          byte *outbuf, unsigned int outbuflen,
+                          const byte *inbuf, unsigned int inbuflen)
+{
+  unsigned int n;
+  int i;
+  unsigned int blocksize = c->cipher->blocksize;
+  unsigned char tmp[MAX_BLOCKSIZE];
+
+  if (blocksize >= 0x20)
+    return GPG_ERR_CIPHER_ALGO;
+  if (blocksize != 0x10)
+    return GPG_ERR_CIPHER_ALGO;
+  if (outbuflen < inbuflen)
+    return GPG_ERR_BUFFER_TOO_SHORT;
+
+  if (!c->marks.iv)
+    {
+      memset(tmp, 0, 16);
+      _gcry_cipher_gcm_setiv(c, tmp, 16);
+    }
+
+  while (inbuflen)
+    {
+      for (i = blocksize; i > blocksize - 4; i--)
+        {
+          c->u_ctr.ctr[i-1]++;
+          if (c->u_ctr.ctr[i-1] != 0)
+            break;
+        }
+
+      n = blocksize < inbuflen ?
blocksize : inbuflen; + + i = blocksize - 1; + c->length[i] += n * 8; + for ( ; c->length[i] == 0 && i > blocksize / 2; i --) + c->length[i - 1]++; + + c->cipher->encrypt (&c->context.c, tmp, c->u_ctr.ctr); + if (n < blocksize) + { + buf_xor_2dst (outbuf, tmp, inbuf, n); + memset(tmp + n, 0, blocksize - n); + ghash (c->u_iv.iv, c->u_tag.tag, tmp); + } else { + buf_xor (outbuf, tmp, inbuf, n); + ghash (c->u_iv.iv, c->u_tag.tag, outbuf); + } + + inbuflen -= n; + outbuf += n; + inbuf += n; + } + + return 0; +} + +gcry_err_code_t +_gcry_cipher_gcm_decrypt (gcry_cipher_hd_t c, + byte *outbuf, unsigned int outbuflen, + const byte *inbuf, unsigned int inbuflen) +{ + unsigned int n; + int i; + unsigned int blocksize = c->cipher->blocksize; + unsigned char tmp[MAX_BLOCKSIZE]; + + if (blocksize >= 0x20) + return GPG_ERR_CIPHER_ALGO; + if (blocksize != 0x10) + return GPG_ERR_CIPHER_ALGO; + if (outbuflen < inbuflen) + return GPG_ERR_BUFFER_TOO_SHORT; + + if (!c->marks.iv) + { + memset(tmp, 0, 16); + _gcry_cipher_gcm_setiv(c, tmp, 16); + } + + while (inbuflen) + { + for (i = blocksize; i > blocksize - 4; i--) + { + c->u_ctr.ctr[i-1]++; + if (c->u_ctr.ctr[i-1] != 0) + break; + } + + n = blocksize < inbuflen ? blocksize : inbuflen; + if (n < blocksize) + { + memcpy (tmp, inbuf, n); + memset(tmp + n, 0, blocksize - n); + ghash (c->u_iv.iv, c->u_tag.tag, tmp); + } else { + ghash (c->u_iv.iv, c->u_tag.tag, inbuf); + } + + i = blocksize - 1; + c->length[i] += n * 8; + for ( ; c->length[i] == 0 && i > blocksize / 2; i --) + c->length[i - 1]++; + + c->cipher->encrypt (&c->context.c, tmp, c->u_ctr.ctr); + + buf_xor (outbuf, inbuf, tmp, n); + + inbuflen -= n; + outbuf += n; + inbuf += n; + } + + return 0; +} + +gcry_err_code_t +_gcry_cipher_gcm_authenticate (gcry_cipher_hd_t c, + const byte *aadbuf, unsigned int aadbuflen) +{ + unsigned int n; + int i; + unsigned int blocksize = c->cipher->blocksize; + unsigned char tmp[MAX_BLOCKSIZE]; + + if (!c->marks.iv) + { + memset(tmp, 0, 16); + _gcry_cipher_gcm_setiv(c, tmp, 16); + } + + n = aadbuflen; + i = blocksize / 2; + c->length[i-1] = (n % 0x20) * 8; + n /= 0x20; + for (; n && i > 0; i--, n >>= 8) + c->length[i-1] = n & 0xff; + + while (aadbuflen >= blocksize) + { + ghash (c->u_iv.iv, c->u_tag.tag, aadbuf); + + aadbuflen -= blocksize; + aadbuf += blocksize; + } + + if (aadbuflen != 0) + { + memcpy(tmp, aadbuf, aadbuflen); + memset(tmp + aadbuflen, 0, blocksize - aadbuflen); + + ghash (c->u_iv.iv, c->u_tag.tag, tmp); + } + + return 0; +} + +void +_gcry_cipher_gcm_setiv (gcry_cipher_hd_t c, + const byte *iv, unsigned int ivlen) +{ + memset (c->length, 0, 16); + memset (c->u_tag.tag, 0, 16); + c->cipher->encrypt ( &c->context.c, c->u_iv.iv, c->u_tag.tag ); + + if (ivlen != 16 - 4) + { + unsigned char tmp[MAX_BLOCKSIZE]; + unsigned n; + memset(c->u_ctr.ctr, 0, 16); + for (n = ivlen; n >= 16; n -= 16, iv += 16) + ghash (c->u_iv.iv, c->u_ctr.ctr, iv); + if (n != 0) + { + memcpy(tmp, iv, n); + memset(tmp + n, 0, 16 - n); + ghash (c->u_iv.iv, c->u_ctr.ctr, tmp); + } + memset(tmp, 0, 16); + n = 16; + tmp[n-1] = (ivlen % 0x20) * 8; + ivlen /= 0x20; + n--; + for (; n > 0; n--, ivlen >>= 8) + tmp[n-1] = ivlen & 0xff; + ghash (c->u_iv.iv, c->u_ctr.ctr, tmp); + } else { + memcpy (c->u_ctr.ctr, iv, ivlen); + c->u_ctr.ctr[12] = c->u_ctr.ctr[13] = c->u_ctr.ctr[14] = 0; + c->u_ctr.ctr[15] = 1; + } + + c->cipher->encrypt ( &c->context.c, c->lastiv, c->u_ctr.ctr ); + c->marks.iv = 1; + +} + +gcry_err_code_t +_gcry_cipher_gcm_tag (gcry_cipher_hd_t c, + byte *outbuf, unsigned int 
outbuflen) +{ + if (outbuflen < 16) + return GPG_ERR_BUFFER_TOO_SHORT; + + if (!c->marks.tag) + { + ghash (c->u_iv.iv, c->u_tag.tag, c->length); + buf_xor (c->u_tag.tag, c->lastiv, c->u_tag.tag, 16); + c->marks.tag = 1; + } + memcpy (outbuf, c->u_tag.tag, 16); + + return 0; +} diff --git a/cipher/cipher-internal.h b/cipher/cipher-internal.h index 025bf2e..76d8540 100644 --- a/cipher/cipher-internal.h +++ b/cipher/cipher-internal.h @@ -98,6 +98,7 @@ struct gcry_cipher_handle struct { unsigned int key:1; /* Set to 1 if a key has been set. */ unsigned int iv:1; /* Set to 1 if a IV has been set. */ + unsigned int tag:1; /* Set to 1 if a tag is finalized. */ } marks; /* The initialization vector. For best performance we make sure @@ -115,9 +116,16 @@ struct gcry_cipher_handle unsigned char ctr[MAX_BLOCKSIZE]; } u_ctr; + /* The interim tag for GCM mode. */ + union { + cipher_context_alignment_t iv_align; + unsigned char tag[MAX_BLOCKSIZE]; + } u_tag; + /* Space to save an IV or CTR for chaining operations. */ unsigned char lastiv[MAX_BLOCKSIZE]; int unused; /* Number of unused bytes in LASTIV. */ + unsigned char length[MAX_BLOCKSIZE]; /* bit counters for GCM */ /* What follows are two contexts of the cipher in use. The first one needs to be aligned well enough for the cipher operation @@ -177,5 +185,24 @@ gcry_err_code_t _gcry_cipher_aeswrap_decrypt const byte *inbuf, unsigned int inbuflen); +/*-- cipher-gcm.c --*/ +gcry_err_code_t _gcry_cipher_gcm_encrypt +/* */ (gcry_cipher_hd_t c, + byte *outbuf, unsigned int outbuflen, + const byte *inbuf, unsigned int inbuflen); +gcry_err_code_t _gcry_cipher_gcm_decrypt +/* */ (gcry_cipher_hd_t c, + byte *outbuf, unsigned int outbuflen, + const byte *inbuf, unsigned int inbuflen); +void _gcry_cipher_gcm_setiv +/* */ (gcry_cipher_hd_t c, + const byte *iv, unsigned int ivlen); +gcry_err_code_t _gcry_cipher_gcm_authenticate +/* */ (gcry_cipher_hd_t c, + const byte *aadbuf, unsigned int aadbuflen); +gcry_err_code_t _gcry_cipher_gcm_tag +/* */ (gcry_cipher_hd_t c, + byte *outbuf, unsigned int outbuflen); + #endif /*G10_CIPHER_INTERNAL_H*/ diff --git a/cipher/cipher.c b/cipher/cipher.c index 99bd3cd..e61f576 100644 --- a/cipher/cipher.c +++ b/cipher/cipher.c @@ -638,6 +638,7 @@ gcry_cipher_open (gcry_cipher_hd_t *handle, case GCRY_CIPHER_MODE_OFB: case GCRY_CIPHER_MODE_CTR: case GCRY_CIPHER_MODE_AESWRAP: + case GCRY_CIPHER_MODE_GCM: if ((cipher->encrypt == dummy_encrypt_block) || (cipher->decrypt == dummy_decrypt_block)) err = GPG_ERR_INV_CIPHER_MODE; @@ -851,6 +852,13 @@ cipher_setkey (gcry_cipher_hd_t c, byte *key, unsigned int keylen) static void cipher_setiv (gcry_cipher_hd_t c, const byte *iv, unsigned ivlen) { + /* GCM has its own IV handler */ + if (c->mode == GCRY_CIPHER_MODE_GCM) + { + _gcry_cipher_gcm_setiv (c, iv, ivlen); + return; + } + /* If the cipher has its own IV handler, we use only this one. This is currently used for stream ciphers requiring a nonce. 
*/ if (c->extraspec && c->extraspec->setiv) @@ -891,6 +899,8 @@ cipher_reset (gcry_cipher_hd_t c) memset (c->u_iv.iv, 0, c->cipher->blocksize); memset (c->lastiv, 0, c->cipher->blocksize); memset (c->u_ctr.ctr, 0, c->cipher->blocksize); + memset (c->u_tag.tag, 0, c->cipher->blocksize); + memset (c->length, 0, c->cipher->blocksize); } @@ -982,6 +992,11 @@ cipher_encrypt (gcry_cipher_hd_t c, byte *outbuf, unsigned int outbuflen, inbuf, inbuflen); break; + case GCRY_CIPHER_MODE_GCM: + rc = _gcry_cipher_gcm_encrypt (c, outbuf, outbuflen, + inbuf, inbuflen); + break; + case GCRY_CIPHER_MODE_STREAM: c->cipher->stencrypt (&c->context.c, outbuf, (byte*)/*arggg*/inbuf, inbuflen); @@ -1075,6 +1090,11 @@ cipher_decrypt (gcry_cipher_hd_t c, byte *outbuf, unsigned int outbuflen, inbuf, inbuflen); break; + case GCRY_CIPHER_MODE_GCM: + rc = _gcry_cipher_gcm_decrypt (c, outbuf, outbuflen, + inbuf, inbuflen); + break; + case GCRY_CIPHER_MODE_STREAM: c->cipher->stdecrypt (&c->context.c, outbuf, (byte*)/*arggg*/inbuf, inbuflen); @@ -1178,6 +1198,11 @@ gcry_error_t _gcry_cipher_authenticate (gcry_cipher_hd_t hd, const void *aad, size_t aadsize) { + if (hd->mode == GCRY_CIPHER_MODE_GCM) + { + return gpg_error (_gcry_cipher_gcm_authenticate (hd, aad, aadsize)); + } + log_fatal ("gcry_cipher_tag: invalid mode %d\n", hd->mode ); return gpg_error (GPG_ERR_INV_CIPHER_MODE); } @@ -1185,6 +1210,11 @@ _gcry_cipher_authenticate (gcry_cipher_hd_t hd, gcry_error_t _gcry_cipher_tag (gcry_cipher_hd_t hd, void *out, size_t outsize) { + if (hd->mode == GCRY_CIPHER_MODE_GCM) + { + return gpg_error (_gcry_cipher_gcm_tag (hd, out, outsize)); + } + log_fatal ("gcry_cipher_tag: invalid mode %d\n", hd->mode ); return gpg_error (GPG_ERR_INV_CIPHER_MODE); } diff --git a/src/gcrypt.h.in b/src/gcrypt.h.in index faedb33..3fa3410 100644 --- a/src/gcrypt.h.in +++ b/src/gcrypt.h.in @@ -841,9 +841,13 @@ enum gcry_cipher_modes GCRY_CIPHER_MODE_STREAM = 4, /* Used with stream ciphers. */ GCRY_CIPHER_MODE_OFB = 5, /* Outer feedback. */ GCRY_CIPHER_MODE_CTR = 6, /* Counter. */ - GCRY_CIPHER_MODE_AESWRAP= 7 /* AES-WRAP algorithm. */ + GCRY_CIPHER_MODE_AESWRAP= 7, /* AES-WRAP algorithm. */ + GCRY_CIPHER_MODE_GCM = 8 /* Galois Counter Mode. */ }; +/* GCM works only with blocks of 128 bits */ +#define GCRY_GCM_BLOCK_LEN (128 / 8) + /* Flags used with the open function. 
*/ enum gcry_cipher_flags { diff --git a/tests/basic.c b/tests/basic.c index 46e213c..c0fc0f9 100644 --- a/tests/basic.c +++ b/tests/basic.c @@ -1137,6 +1137,335 @@ check_ofb_cipher (void) fprintf (stderr, " Completed OFB checks.\n"); } +static void +check_gcm_cipher (void) +{ + struct tv + { + int algo; + char key[MAX_DATA_LEN]; + char iv[MAX_DATA_LEN]; + int ivlen; + unsigned char aad[MAX_DATA_LEN]; + int aadlen; + unsigned char plaintext[MAX_DATA_LEN]; + int inlen; + char out[MAX_DATA_LEN]; + char tag[MAX_DATA_LEN]; + } tv[] = + { + /* http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf */ + { GCRY_CIPHER_AES, + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", 12, + "", 0, + "", + 0, + "", + "\x58\xe2\xfc\xce\xfa\x7e\x30\x61\x36\x7f\x1d\x57\xa4\xe7\x45\x5a" }, + { GCRY_CIPHER_AES, + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", 12, + "", 0, + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + 16, + "\x03\x88\xda\xce\x60\xb6\xa3\x92\xf3\x28\xc2\xb9\x71\xb2\xfe\x78", + "\xab\x6e\x47\xd4\x2c\xec\x13\xbd\xf5\x3a\x67\xb2\x12\x57\xbd\xdf" }, + { GCRY_CIPHER_AES, + "\xfe\xff\xe9\x92\x86\x65\x73\x1c\x6d\x6a\x8f\x94\x67\x30\x83\x08", + "\xca\xfe\xba\xbe\xfa\xce\xdb\xad\xde\xca\xf8\x88", 12, + "", 0, + "\xd9\x31\x32\x25\xf8\x84\x06\xe5\xa5\x59\x09\xc5\xaf\xf5\x26\x9a" + "\x86\xa7\xa9\x53\x15\x34\xf7\xda\x2e\x4c\x30\x3d\x8a\x31\x8a\x72" + "\x1c\x3c\x0c\x95\x95\x68\x09\x53\x2f\xcf\x0e\x24\x49\xa6\xb5\x25" + "\xb1\x6a\xed\xf5\xaa\x0d\xe6\x57\xba\x63\x7b\x39\x1a\xaf\xd2\x55", + 64, + "\x42\x83\x1e\xc2\x21\x77\x74\x24\x4b\x72\x21\xb7\x84\xd0\xd4\x9c" + "\xe3\xaa\x21\x2f\x2c\x02\xa4\xe0\x35\xc1\x7e\x23\x29\xac\xa1\x2e" + "\x21\xd5\x14\xb2\x54\x66\x93\x1c\x7d\x8f\x6a\x5a\xac\x84\xaa\x05" + "\x1b\xa3\x0b\x39\x6a\x0a\xac\x97\x3d\x58\xe0\x91\x47\x3f\x59\x85", + "\x4d\x5c\x2a\xf3\x27\xcd\x64\xa6\x2c\xf3\x5a\xbd\x2b\xa6\xfa\xb4" }, + { GCRY_CIPHER_AES, + "\xfe\xff\xe9\x92\x86\x65\x73\x1c\x6d\x6a\x8f\x94\x67\x30\x83\x08", + "\xca\xfe\xba\xbe\xfa\xce\xdb\xad\xde\xca\xf8\x88", 12, + "\xfe\xed\xfa\xce\xde\xad\xbe\xef\xfe\xed\xfa\xce\xde\xad\xbe\xef" + "\xab\xad\xda\xd2", 20, + "\xd9\x31\x32\x25\xf8\x84\x06\xe5\xa5\x59\x09\xc5\xaf\xf5\x26\x9a" + "\x86\xa7\xa9\x53\x15\x34\xf7\xda\x2e\x4c\x30\x3d\x8a\x31\x8a\x72" + "\x1c\x3c\x0c\x95\x95\x68\x09\x53\x2f\xcf\x0e\x24\x49\xa6\xb5\x25" + "\xb1\x6a\xed\xf5\xaa\x0d\xe6\x57\xba\x63\x7b\x39", + 60, + "\x42\x83\x1e\xc2\x21\x77\x74\x24\x4b\x72\x21\xb7\x84\xd0\xd4\x9c" + "\xe3\xaa\x21\x2f\x2c\x02\xa4\xe0\x35\xc1\x7e\x23\x29\xac\xa1\x2e" + "\x21\xd5\x14\xb2\x54\x66\x93\x1c\x7d\x8f\x6a\x5a\xac\x84\xaa\x05" + "\x1b\xa3\x0b\x39\x6a\x0a\xac\x97\x3d\x58\xe0\x91\x47\x3f\x59\x85", + "\x5b\xc9\x4f\xbc\x32\x21\xa5\xdb\x94\xfa\xe9\x5a\xe7\x12\x1a\x47" }, + { GCRY_CIPHER_AES, + "\xfe\xff\xe9\x92\x86\x65\x73\x1c\x6d\x6a\x8f\x94\x67\x30\x83\x08", + "\xca\xfe\xba\xbe\xfa\xce\xdb\xad", 8, + "\xfe\xed\xfa\xce\xde\xad\xbe\xef\xfe\xed\xfa\xce\xde\xad\xbe\xef" + "\xab\xad\xda\xd2", 20, + "\xd9\x31\x32\x25\xf8\x84\x06\xe5\xa5\x59\x09\xc5\xaf\xf5\x26\x9a" + "\x86\xa7\xa9\x53\x15\x34\xf7\xda\x2e\x4c\x30\x3d\x8a\x31\x8a\x72" + "\x1c\x3c\x0c\x95\x95\x68\x09\x53\x2f\xcf\x0e\x24\x49\xa6\xb5\x25" + "\xb1\x6a\xed\xf5\xaa\x0d\xe6\x57\xba\x63\x7b\x39", + 60, + "\x61\x35\x3b\x4c\x28\x06\x93\x4a\x77\x7f\xf5\x1f\xa2\x2a\x47\x55" + "\x69\x9b\x2a\x71\x4f\xcd\xc6\xf8\x37\x66\xe5\xf9\x7b\x6c\x74\x23" + 
"\x73\x80\x69\x00\xe4\x9f\x24\xb2\x2b\x09\x75\x44\xd4\x89\x6b\x42" + "\x49\x89\xb5\xe1\xeb\xac\x0f\x07\xc2\x3f\x45\x98", + "\x36\x12\xd2\xe7\x9e\x3b\x07\x85\x56\x1b\xe1\x4a\xac\xa2\xfc\xcb" }, + { GCRY_CIPHER_AES, + "\xfe\xff\xe9\x92\x86\x65\x73\x1c\x6d\x6a\x8f\x94\x67\x30\x83\x08", + "\x93\x13\x22\x5d\xf8\x84\x06\xe5\x55\x90\x9c\x5a\xff\x52\x69\xaa" + "\x6a\x7a\x95\x38\x53\x4f\x7d\xa1\xe4\xc3\x03\xd2\xa3\x18\xa7\x28" + "\xc3\xc0\xc9\x51\x56\x80\x95\x39\xfc\xf0\xe2\x42\x9a\x6b\x52\x54" + "\x16\xae\xdb\xf5\xa0\xde\x6a\x57\xa6\x37\xb3\x9b", 60, + "\xfe\xed\xfa\xce\xde\xad\xbe\xef\xfe\xed\xfa\xce\xde\xad\xbe\xef" + "\xab\xad\xda\xd2", 20, + "\xd9\x31\x32\x25\xf8\x84\x06\xe5\xa5\x59\x09\xc5\xaf\xf5\x26\x9a" + "\x86\xa7\xa9\x53\x15\x34\xf7\xda\x2e\x4c\x30\x3d\x8a\x31\x8a\x72" + "\x1c\x3c\x0c\x95\x95\x68\x09\x53\x2f\xcf\x0e\x24\x49\xa6\xb5\x25" + "\xb1\x6a\xed\xf5\xaa\x0d\xe6\x57\xba\x63\x7b\x39", + 60, + "\x8c\xe2\x49\x98\x62\x56\x15\xb6\x03\xa0\x33\xac\xa1\x3f\xb8\x94" + "\xbe\x91\x12\xa5\xc3\xa2\x11\xa8\xba\x26\x2a\x3c\xca\x7e\x2c\xa7" + "\x01\xe4\xa9\xa4\xfb\xa4\x3c\x90\xcc\xdc\xb2\x81\xd4\x8c\x7c\x6f" + "\xd6\x28\x75\xd2\xac\xa4\x17\x03\x4c\x34\xae\xe5", + "\x61\x9c\xc5\xae\xff\xfe\x0b\xfa\x46\x2a\xf4\x3c\x16\x99\xd0\x50" }, + { GCRY_CIPHER_AES192, + "\xfe\xff\xe9\x92\x86\x65\x73\x1c\x6d\x6a\x8f\x94\x67\x30\x83\x08" + "\xfe\xff\xe9\x92\x86\x65\x73\x1c", + "\x93\x13\x22\x5d\xf8\x84\x06\xe5\x55\x90\x9c\x5a\xff\x52\x69\xaa" + "\x6a\x7a\x95\x38\x53\x4f\x7d\xa1\xe4\xc3\x03\xd2\xa3\x18\xa7\x28" + "\xc3\xc0\xc9\x51\x56\x80\x95\x39\xfc\xf0\xe2\x42\x9a\x6b\x52\x54" + "\x16\xae\xdb\xf5\xa0\xde\x6a\x57\xa6\x37\xb3\x9b", 60, + "\xfe\xed\xfa\xce\xde\xad\xbe\xef\xfe\xed\xfa\xce\xde\xad\xbe\xef" + "\xab\xad\xda\xd2", 20, + "\xd9\x31\x32\x25\xf8\x84\x06\xe5\xa5\x59\x09\xc5\xaf\xf5\x26\x9a" + "\x86\xa7\xa9\x53\x15\x34\xf7\xda\x2e\x4c\x30\x3d\x8a\x31\x8a\x72" + "\x1c\x3c\x0c\x95\x95\x68\x09\x53\x2f\xcf\x0e\x24\x49\xa6\xb5\x25" + "\xb1\x6a\xed\xf5\xaa\x0d\xe6\x57\xba\x63\x7b\x39", + 60, + "\xd2\x7e\x88\x68\x1c\xe3\x24\x3c\x48\x30\x16\x5a\x8f\xdc\xf9\xff" + "\x1d\xe9\xa1\xd8\xe6\xb4\x47\xef\x6e\xf7\xb7\x98\x28\x66\x6e\x45" + "\x81\xe7\x90\x12\xaf\x34\xdd\xd9\xe2\xf0\x37\x58\x9b\x29\x2d\xb3" + "\xe6\x7c\x03\x67\x45\xfa\x22\xe7\xe9\xb7\x37\x3b", + "\xdc\xf5\x66\xff\x29\x1c\x25\xbb\xb8\x56\x8f\xc3\xd3\x76\xa6\xd9" }, + { GCRY_CIPHER_AES256, + "\xfe\xff\xe9\x92\x86\x65\x73\x1c\x6d\x6a\x8f\x94\x67\x30\x83\x08" + "\xfe\xff\xe9\x92\x86\x65\x73\x1c\x6d\x6a\x8f\x94\x67\x30\x83\x08", + "\x93\x13\x22\x5d\xf8\x84\x06\xe5\x55\x90\x9c\x5a\xff\x52\x69\xaa" + "\x6a\x7a\x95\x38\x53\x4f\x7d\xa1\xe4\xc3\x03\xd2\xa3\x18\xa7\x28" + "\xc3\xc0\xc9\x51\x56\x80\x95\x39\xfc\xf0\xe2\x42\x9a\x6b\x52\x54" + "\x16\xae\xdb\xf5\xa0\xde\x6a\x57\xa6\x37\xb3\x9b", 60, + "\xfe\xed\xfa\xce\xde\xad\xbe\xef\xfe\xed\xfa\xce\xde\xad\xbe\xef" + "\xab\xad\xda\xd2", 20, + "\xd9\x31\x32\x25\xf8\x84\x06\xe5\xa5\x59\x09\xc5\xaf\xf5\x26\x9a" + "\x86\xa7\xa9\x53\x15\x34\xf7\xda\x2e\x4c\x30\x3d\x8a\x31\x8a\x72" + "\x1c\x3c\x0c\x95\x95\x68\x09\x53\x2f\xcf\x0e\x24\x49\xa6\xb5\x25" + "\xb1\x6a\xed\xf5\xaa\x0d\xe6\x57\xba\x63\x7b\x39", + 60, + "\x5a\x8d\xef\x2f\x0c\x9e\x53\xf1\xf7\x5d\x78\x53\x65\x9e\x2a\x20" + "\xee\xb2\xb2\x2a\xaf\xde\x64\x19\xa0\x58\xab\x4f\x6f\x74\x6b\xf4" + "\x0f\xc0\xc3\xb7\x80\xf2\x44\x45\x2d\xa3\xeb\xf1\xc5\xd8\x2c\xde" + "\xa2\x41\x89\x97\x20\x0e\xf8\x2e\x44\xae\x7e\x3f", + "\xa4\x4a\x82\x66\xee\x1c\x8e\xb0\xc8\xb5\xd4\xcf\x5a\xe9\xf1\x9a" } + }; + + gcry_cipher_hd_t hde, hdd; + unsigned char out[MAX_DATA_LEN]; + int i, keylen; + 
gcry_error_t err = 0; + + if (verbose) + fprintf (stderr, " Starting GCM checks.\n"); + + for (i = 0; i < sizeof (tv) / sizeof (tv[0]); i++) + { + if (verbose) + fprintf (stderr, " checking GCM mode for %s [%i]\n", + gcry_cipher_algo_name (tv[i].algo), + tv[i].algo); + err = gcry_cipher_open (&hde, tv[i].algo, GCRY_CIPHER_MODE_GCM, 0); + if (!err) + err = gcry_cipher_open (&hdd, tv[i].algo, GCRY_CIPHER_MODE_GCM, 0); + if (err) + { + fail ("aes-gcm, gcry_cipher_open failed: %s\n", gpg_strerror (err)); + return; + } + + keylen = gcry_cipher_get_algo_keylen(tv[i].algo); + if (!keylen) + { + fail ("aes-gcm, gcry_cipher_get_algo_keylen failed\n"); + return; + } + + err = gcry_cipher_setkey (hde, tv[i].key, keylen); + if (!err) + err = gcry_cipher_setkey (hdd, tv[i].key, keylen); + if (err) + { + fail ("aes-gcm, gcry_cipher_setkey failed: %s\n", + gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + + err = gcry_cipher_setiv (hde, tv[i].iv, tv[i].ivlen); + if (!err) + err = gcry_cipher_setiv (hdd, tv[i].iv, tv[i].ivlen); + if (err) + { + fail ("aes-gcm, gcry_cipher_setiv failed: %s\n", + gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + + err = gcry_cipher_authenticate(hde, tv[i].aad, tv[i].aadlen); + if (!err) + err = gcry_cipher_authenticate(hdd, tv[i].aad, tv[i].aadlen); + if (err) + { + fail ("aes-gcm, gcry_cipher_authenticate (%d) failed: %s\n", + i, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + + err = gcry_cipher_encrypt (hde, out, MAX_DATA_LEN, + tv[i].plaintext, + tv[i].inlen); + if (err) + { + fail ("aes-gcm, gcry_cipher_encrypt (%d) failed: %s\n", + i, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + + if (memcmp (tv[i].out, out, tv[i].inlen)) + fail ("aes-gcm, encrypt mismatch entry %d\n", i); + + err = gcry_cipher_decrypt (hdd, out, tv[i].inlen, NULL, 0); + if (err) + { + fail ("aes-gcm, gcry_cipher_decrypt (%d) failed: %s\n", + i, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + + if (memcmp (tv[i].plaintext, out, tv[i].inlen)) + fail ("aes-gcm, decrypt mismatch entry %d\n", i); + +#define TAGLEN 16 + err = gcry_cipher_tag (hde, out, TAGLEN); /* FIXME */ + if (err) + { + fail ("aes-gcm, gcry_cipher_tag(%d) failed: %s\n", + i, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + + if (memcmp (tv[i].tag, out, TAGLEN)) + fail ("aes-gcm, encrypt tag mismatch entry %d\n", i); + + + err = gcry_cipher_tag (hdd, out, TAGLEN); /* FIXME */ + if (err) + { + fail ("aes-gcm, gcry_cipher_tag(%d) failed: %s\n", + i, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + + if (memcmp (tv[i].tag, out, TAGLEN)) + fail ("aes-gcm, decrypt tag mismatch entry %d\n", i); + + err = gcry_cipher_reset(hde); + if (!err) + err = gcry_cipher_reset(hdd); + if (err) + { + fail ("aes-gcm, gcry_cipher_reset (%d) failed: %s\n", + i, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + + +#if 0 + /* gcry_cipher_reset clears the IV */ + err = gcry_cipher_setiv (hde, tv[i].iv, tv[i].ivlen); + if (!err) + err = gcry_cipher_setiv (hdd, tv[i].iv, tv[i].ivlen); + if (err) + { + fail ("aes-gcm, gcry_cipher_setiv failed: %s\n", + gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + + /* this time we encrypt and decrypt one byte at a time */ + int byteNum; + for (byteNum = 
0; byteNum < tv[i].inlen; ++byteNum) + { + err = gcry_cipher_encrypt (hde, out+byteNum, 1, + (tv[i].plaintext) + byteNum, + 1); + if (err) + { + fail ("aes-gcm, gcry_cipher_encrypt (%d) failed: %s\n", + i, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + } + + if (memcmp (tv[i].out, out, tv[i].inlen)) + fail ("aes-gcm, encrypt mismatch entry %d\n", i); + + for (byteNum = 0; byteNum < tv[i].inlen; ++byteNum) + { + err = gcry_cipher_decrypt (hdd, out+byteNum, 1, NULL, 0); + if (err) + { + fail ("aes-gcm, gcry_cipher_decrypt (%d) failed: %s\n", + i, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + } + + if (memcmp (tv[i].plaintext, out, tv[i].inlen)) + fail ("aes-gcm, decrypt mismatch entry %d\n", i); +#endif + + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + } + if (verbose) + fprintf (stderr, " Completed GCM checks.\n"); +} + static void check_stream_cipher (void) @@ -2197,6 +2526,8 @@ check_ciphers (void) check_one_cipher (algos[i], GCRY_CIPHER_MODE_CBC, 0); check_one_cipher (algos[i], GCRY_CIPHER_MODE_CBC, GCRY_CIPHER_CBC_CTS); check_one_cipher (algos[i], GCRY_CIPHER_MODE_CTR, 0); + if (gcry_cipher_get_algo_blklen (algos[i]) == GCRY_GCM_BLOCK_LEN) + check_one_cipher (algos[i], GCRY_CIPHER_MODE_GCM, 0); } for (i = 0; algos2[i]; i++) @@ -2234,6 +2565,7 @@ check_cipher_modes(void) check_ctr_cipher (); check_cfb_cipher (); check_ofb_cipher (); + check_gcm_cipher (); check_stream_cipher (); check_stream_cipher_large_block (); diff --git a/tests/benchmark.c b/tests/benchmark.c index 79048a3..2f0e20b 100644 --- a/tests/benchmark.c +++ b/tests/benchmark.c @@ -536,6 +536,7 @@ cipher_bench ( const char *algoname ) { GCRY_CIPHER_MODE_CFB, " CFB", 0 }, { GCRY_CIPHER_MODE_OFB, " OFB", 0 }, { GCRY_CIPHER_MODE_CTR, " CTR", 0 }, + { GCRY_CIPHER_MODE_GCM, " GCM", 0 }, { GCRY_CIPHER_MODE_STREAM, "", 0 }, {0} }; @@ -624,7 +625,9 @@ cipher_bench ( const char *algoname ) for (modeidx=0; modes[modeidx].mode; modeidx++) { if ((blklen > 1 && modes[modeidx].mode == GCRY_CIPHER_MODE_STREAM) - | (blklen == 1 && modes[modeidx].mode != GCRY_CIPHER_MODE_STREAM)) + || (blklen == 1 && modes[modeidx].mode != GCRY_CIPHER_MODE_STREAM) + || (blklen != GCRY_GCM_BLOCK_LEN && modes[modeidx].mode == + GCRY_CIPHER_MODE_GCM)) continue; for (i=0; i < sizeof buf; i++) -- 1.7.10.4 From smueller at chronox.de Fri Aug 2 16:10:54 2013 From: smueller at chronox.de (Stephan Mueller) Date: Fri, 02 Aug 2013 16:10:54 +0200 Subject: [RFC 2/2] FIXME: initial implementation of GCM In-Reply-To: <1375427655-5894-3-git-send-email-dbaryshkov@gmail.com> References: <1375427655-5894-1-git-send-email-dbaryshkov@gmail.com> <1375427655-5894-3-git-send-email-dbaryshkov@gmail.com> Message-ID: <3440978.kUROICP7lO@tauon> Am Freitag, 2. August 2013, 11:14:15 schrieb Dmitry Eremin-Solenikov: Hi Dmitry, >+void >+_gcry_cipher_gcm_setiv (gcry_cipher_hd_t c, >+ const byte *iv, unsigned int ivlen) >+{ The IV handling in GCM is a special beast. SP800-38D section 8.2 defines exactly two ways how IVs are to be constructed. The current implementation seems to leave that issue to the caller. However, a caller may not understand that there is a specific requirement on how to set up the IV. In case this implementation shall get through a successful FIPS 140-2 validation, meeting SP800-38D section 8.2 (meeting either one or both of the outlined construction types is fine) is mandatory. Ciao Stephan -- | Cui bono? 
|

From smueller at chronox.de  Mon Aug  5 14:31:29 2013
From: smueller at chronox.de (Stephan Mueller)
Date: Mon, 05 Aug 2013 14:31:29 +0200
Subject: [RFC 2/2] FIXME: initial implementation of GCM
In-Reply-To:
References: <1375427655-5894-1-git-send-email-dbaryshkov@gmail.com> <3440978.kUROICP7lO@tauon>
Message-ID: <16024041.dq35RrqeEh@tauon>

Am Montag, 5. August 2013, 16:28:09 schrieb Dmitry Eremin-Solenikov:

Hi Dmitry,

>Hi Stephan,
>
>On Fri, Aug 2, 2013 at 6:10 PM, Stephan Mueller wrote:
>> Am Freitag, 2. August 2013, 11:14:15 schrieb Dmitry Eremin-Solenikov:
>>
>> Hi Dmitry,
>>
>>>+void
>>>+_gcry_cipher_gcm_setiv (gcry_cipher_hd_t c,
>>>+ const byte *iv, unsigned int ivlen)
>>>+{
>>>
>> The IV handling in GCM is a special beast. SP800-38D section 8.2
>> defines exactly two ways how IVs are to be constructed. The current
>> implementation seems to leave that issue to the caller. However, a
>> caller may not understand that there is a specific requirement on how
>> to set up the IV.
>
>Thanks for the pointing to the issue. In my opinion, we should not
>mandate any special form of IV in setiv interface. IV block could be
>already constructed by the caller according to the rules of SP800-38D.
>I might be wrong, but judging from quick glance on OpenSSL, Nettle or
>NSS, no library implements these IV requirements in basic interface.
>If that would be required by FIPS certification, we can probably
>extend API. However I don't think
>that basic setiv should have any additional complexity.

As I am working in that field of FIPS 140-2, I know that NIST has some
change of heart in that area in recent times. If you leave it like this,
a successful validation is in question in the future.

>
>I will probably add a note that to be fully compatible with NIST
>recommendations,
>one have to generate IV according to the specification.
>
>What do you think?

Ciao
Stephan
-- 
| Cui bono?
|

From dbaryshkov at gmail.com  Mon Aug  5 15:13:13 2013
From: dbaryshkov at gmail.com (Dmitry Eremin-Solenikov)
Date: Mon, 5 Aug 2013 17:13:13 +0400
Subject: [RFC 2/2] FIXME: initial implementation of GCM
In-Reply-To: <16024041.dq35RrqeEh@tauon>
References: <1375427655-5894-1-git-send-email-dbaryshkov@gmail.com> <3440978.kUROICP7lO@tauon> <16024041.dq35RrqeEh@tauon>
Message-ID:

On Mon, Aug 5, 2013 at 4:31 PM, Stephan Mueller wrote:
> Am Montag, 5.
August 2013, 16:28:09 schrieb Dmitry Eremin-Solenikov: > > Hi Dmitry, > >>Hi Stephan, >> >>On Fri, Aug 2, 2013 at 6:10 PM, Stephan Mueller > wrote: >>> Am Freitag, 2. August 2013, 11:14:15 schrieb Dmitry Eremin-Solenikov: >>> >>> Hi Dmitry, >>> >>>>+void >>>>+_gcry_cipher_gcm_setiv (gcry_cipher_hd_t c, >>>>+ const byte *iv, unsigned int ivlen) >>>>+{ >>>> >>> The IV handling in GCM is a special beast. SP800-38D section 8.2 >>> defines exactly two ways how IVs are to be constructed. The current >>> implementation seems to leave that issue to the caller. However, a >>> caller may not understand that there is a specific requirement on how >>> to set up the IV. >> >>Thanks for the pointing to the issue. In my opinion, we should not >>mandate any special form of IV in setiv interface. IV block could be >>already constructed by the caller according to the rules of SP800-38D. >>I might be wrong, but judging from quick glance on OpenSSL, Nettle or >>NSS, no library implements these IV requirements in basic interface. >>If that would be required by FIPS certification, we can probably >>extend API. However I don't think >>that basic setiv should have any additional complexity. > > As I am working in that field of FIPS 140-2, I know that NIST has some > change of heart in that area in recent times. If you leave it like this, > a successful validation is in question in the future. What would be your proposal? >> >>I will probably add a note that to be fully compatible with NIST >>recommendations, >>one have to generate IV according to the specification. >> >>What do you think? -- With best wishes Dmitry From dbaryshkov at gmail.com Mon Aug 5 14:28:09 2013 From: dbaryshkov at gmail.com (Dmitry Eremin-Solenikov) Date: Mon, 5 Aug 2013 16:28:09 +0400 Subject: [RFC 2/2] FIXME: initial implementation of GCM In-Reply-To: <3440978.kUROICP7lO@tauon> References: <1375427655-5894-1-git-send-email-dbaryshkov@gmail.com> <1375427655-5894-3-git-send-email-dbaryshkov@gmail.com> <3440978.kUROICP7lO@tauon> Message-ID: Hi Stephan, On Fri, Aug 2, 2013 at 6:10 PM, Stephan Mueller wrote: > Am Freitag, 2. August 2013, 11:14:15 schrieb Dmitry Eremin-Solenikov: > > Hi Dmitry, > > >>+void >>+_gcry_cipher_gcm_setiv (gcry_cipher_hd_t c, >>+ const byte *iv, unsigned int ivlen) >>+{ > > The IV handling in GCM is a special beast. SP800-38D section 8.2 defines > exactly two ways how IVs are to be constructed. The current > implementation seems to leave that issue to the caller. However, a > caller may not understand that there is a specific requirement on how to > set up the IV. Thanks for the pointing to the issue. In my opinion, we should not mandate any special form of IV in setiv interface. IV block could be already constructed by the caller according to the rules of SP800-38D. I might be wrong, but judging from quick glance on OpenSSL, Nettle or NSS, no library implements these IV requirements in basic interface. If that would be required by FIPS certification, we can probably extend API. However I don't think that basic setiv should have any additional complexity. I will probably add a note that to be fully compatible with NIST recommendations, one have to generate IV according to the specification. What do you think? 
-- With best wishes Dmitry From wk at gnupg.org Mon Aug 5 16:01:30 2013 From: wk at gnupg.org (Werner Koch) Date: Mon, 05 Aug 2013 16:01:30 +0200 Subject: [RFC 2/2] FIXME: initial implementation of GCM In-Reply-To: (Dmitry Eremin-Solenikov's message of "Mon, 5 Aug 2013 16:28:09 +0400") References: <1375427655-5894-1-git-send-email-dbaryshkov@gmail.com> <1375427655-5894-3-git-send-email-dbaryshkov@gmail.com> <3440978.kUROICP7lO@tauon> Message-ID: <87k3k0me2t.fsf@vigenere.g10code.de> On Mon, 5 Aug 2013 14:28, dbaryshkov at gmail.com said: > I will probably add a note that to be fully compatible with NIST > recommendations, > one have to generate IV according to the specification. If the standard requires a certain way to generate an IV we need to make sure that it is done in this way. If there is an application which can't work with such a high-level approach we may revisit this and decide whether to add a low-level API for this case. Shalom-Salam, Werner -- Die Gedanken sind frei. Ausnahmen regelt ein Bundesgesetz. From smueller at chronox.de Mon Aug 5 16:13:15 2013 From: smueller at chronox.de (Stephan Mueller) Date: Mon, 05 Aug 2013 16:13:15 +0200 Subject: [RFC 2/2] FIXME: initial implementation of GCM In-Reply-To: References: <1375427655-5894-1-git-send-email-dbaryshkov@gmail.com> <16024041.dq35RrqeEh@tauon> Message-ID: <38241683.OIdXLog9rX@tauon> Am Montag, 5. August 2013, 17:13:13 schrieb Dmitry Eremin-Solenikov: Hi Dmitry, >On Mon, Aug 5, 2013 at 4:31 PM, Stephan Mueller wrote: >> Am Montag, 5. August 2013, 16:28:09 schrieb Dmitry Eremin-Solenikov: >> >> Hi Dmitry, >> >>>Hi Stephan, >>> >>>On Fri, Aug 2, 2013 at 6:10 PM, Stephan Mueller >>> >> wrote: >>>> Am Freitag, 2. August 2013, 11:14:15 schrieb Dmitry >>>> Eremin-Solenikov: >>>> >>>> Hi Dmitry, >>>> >>>>>+void >>>>>+_gcry_cipher_gcm_setiv (gcry_cipher_hd_t c, >>>>>+ const byte *iv, unsigned int ivlen) >>>>>+{ >>>>> >>>> The IV handling in GCM is a special beast. SP800-38D section 8.2 >>>> defines exactly two ways how IVs are to be constructed. The current >>>> implementation seems to leave that issue to the caller. However, a >>>> caller may not understand that there is a specific requirement on >>>> how >>>> to set up the IV. >>> >>>Thanks for the pointing to the issue. In my opinion, we should not >>>mandate any special form of IV in setiv interface. IV block could be >>>already constructed by the caller according to the rules of >>>SP800-38D. >>>I might be wrong, but judging from quick glance on OpenSSL, Nettle or >>>NSS, no library implements these IV requirements in basic interface. >>>If that would be required by FIPS certification, we can probably >>>extend API. However I don't think >>>that basic setiv should have any additional complexity. >>> >> As I am working in that field of FIPS 140-2, I know that NIST has >> some >> change of heart in that area in recent times. If you leave it like >> this, a successful validation is in question in the future. > >What would be your proposal? Unfortunately, I am not entitled to a proposal as this would taint me although I would have some suggestions. :-( NIST and their rules... > >>>I will probably add a note that to be fully compatible with NIST >>>recommendations, >>>one have to generate IV according to the specification. >>> >>>What do you think? Ciao Stephan -- | Cui bono? 
|

From dbaryshkov at gmail.com  Mon Aug  5 17:17:58 2013
From: dbaryshkov at gmail.com (Dmitry Eremin-Solenikov)
Date: Mon, 5 Aug 2013 19:17:58 +0400
Subject: [RFC 2/2] FIXME: initial implementation of GCM
In-Reply-To: <87k3k0me2t.fsf@vigenere.g10code.de>
References: <1375427655-5894-1-git-send-email-dbaryshkov@gmail.com> <1375427655-5894-3-git-send-email-dbaryshkov@gmail.com> <3440978.kUROICP7lO@tauon> <87k3k0me2t.fsf@vigenere.g10code.de>
Message-ID:

On Mon, Aug 5, 2013 at 6:01 PM, Werner Koch wrote:
> On Mon, 5 Aug 2013 14:28, dbaryshkov at gmail.com said:
>
>> I will probably add a note that to be fully compatible with NIST
>> recommendations,
>> one have to generate IV according to the specification.
>
> If the standard requires a certain way to generate an IV we need to make
> sure that it is done in this way.  If there is an application which
> can't work with such a high-level approach we may revisit this and
> decide whether to add a low-level API for this case.

The problem is that IETF standards (RFC 5116/5288) are not _that_ strict.
As far as I understand the text, it only proposes a way to generate the
nonce (which looks like 8.2.1). And 5288 requires a distinct explicit
nonce, but then it says that it may be a TLS sequence number (and thus
might not be).

To make things worse: libgcrypt should permit using any IV for
decryption. Thus we should not impose any additional requirements on
setiv() if the user wants only to decrypt a message.

What about the following construction:
* setiv() permits any iv, but then sets a special flag 'explicit IV' (?)
* We add a special API like gcry_cipher_generate_nonce(), which will
generate a nonce. In case of GCM it will use additional data and a
counter (or clock_gettime(?)) to generate an IV according to SP800-38D,
8.2.1.
* GCM decrypt() works with any IV set or generated.
* In FIPS mode GCM encrypt() fails if the 'explicit IV' flag is set (and
works if IV was generated).

-- 
With best wishes
Dmitry

From smueller at chronox.de  Mon Aug  5 17:55:14 2013
From: smueller at chronox.de (Stephan Mueller)
Date: Mon, 05 Aug 2013 17:55:14 +0200
Subject: [RFC 2/2] FIXME: initial implementation of GCM
In-Reply-To:
References: <1375427655-5894-1-git-send-email-dbaryshkov@gmail.com> <87k3k0me2t.fsf@vigenere.g10code.de>
Message-ID: <22956096.puAdeCYO86@tauon>

Am Montag, 5. August 2013, 19:17:58 schrieb Dmitry Eremin-Solenikov:

Hi Dmitry,

>On Mon, Aug 5, 2013 at 6:01 PM, Werner Koch wrote:
>> On Mon, 5 Aug 2013 14:28, dbaryshkov at gmail.com said:
>>> I will probably add a note that to be fully compatible with NIST
>>> recommendations,
>>> one have to generate IV according to the specification.
>>
>> If the standard requires a certain way to generate an IV we need to
>> make sure that it is done in this way. If there is an application
>> which can't work with such a high-level approach we may revisit this
>> and decide whether to add a low-level API for this case.
>
>The problem is that IETF standards (RFC 5116/5288) are not
>_that_ strict. As far as I understand the text, it only proposes a way
>to generate the nonce (which looks like 8.2.1). And 5288 requires a
>distinct explicit nonce, but then it says that it may be a TLS sequence
>number (and thus might not be).
>
>To make things worse:
>libgcrypt should permit using any IV for decryption. Thus we should not
>impose any additional requirements on setiv() if the user wants only
>to decrypt a message.
It is always a hassle when mechanisms are defined in two different
places :-(

>
>What about the following construction:
>* setiv() permits any iv, but then sets a special flag 'explicit IV'
>(?) * We add a special API like gcry_cipher_generate_nonce(), which
>will generate a nonce. In case of GCM it will use additional data and a
>counter (or clock_gettime(?))
> to generate an IV according to SP800-38D, 8.2.1.
>* GCM decrypt() works with any IV set or generated.

The enc/dec functions are not affected by the issue, because the setiv
function handles the IV. And I guess setiv would need the enforcement
logic to comply with 800-38D.

>* In FIPS mode GCM encrypt() fails if the 'explicit IV' flag is set (and
>works if IV was generated).

It is definitely ok to limit the solution to FIPS mode, from a FIPS
perspective. However, even in FIPS mode, I guess you want to accept IVs
from the outside (e.g. when you set up an encryption channel where two
sides need to agree on IVs). Therefore, there are valid reasons why an
externally generated IV may make sense in FIPS mode. Thus, the
enforcement of the flag in FIPS mode may cripple the implementation in a
way that it may not be useful (just thinking aloud here).

As you only refer to 8.2.1, why not consider 8.2.2 as well?

Ciao
Stephan
-- 
| Cui bono?
|

From dbaryshkov at gmail.com  Mon Aug  5 18:06:39 2013
From: dbaryshkov at gmail.com (Dmitry Eremin-Solenikov)
Date: Mon, 5 Aug 2013 20:06:39 +0400
Subject: [RFC 2/2] FIXME: initial implementation of GCM
In-Reply-To: <22956096.puAdeCYO86@tauon>
References: <1375427655-5894-1-git-send-email-dbaryshkov@gmail.com> <87k3k0me2t.fsf@vigenere.g10code.de> <22956096.puAdeCYO86@tauon>
Message-ID:

Hi Stephan,

On Mon, Aug 5, 2013 at 7:55 PM, Stephan Mueller wrote:
> Am Montag, 5. August 2013, 19:17:58 schrieb Dmitry Eremin-Solenikov:
>>What about the following construction:
>>* setiv() permits any iv, but then sets a special flag 'explicit IV'
>>(?) * We add a special API like gcry_cipher_generate_nonce(), which
>>will generate a nonce. In case of GCM it will use additional data and a
>>counter (or clock_gettime(?))
>> to generate an IV according to SP800-38D, 8.2.1.
>>* GCM decrypt() works with any IV set or generated.
>
> The enc/dec functions are not affected by the issue, because the setiv
> function handles the IV. And I guess setiv would need the enforcement
> logic to comply with 800-38D.

Let me rephrase the issue in a little bit different way, so that you
see my point:
'GCM should not use the same IV for encryption more than once with a
given key'. 800-38D talks about calling 'authenticated encryption'
function with same IV. 800-38D does not have any requirements about
'authenticate decryption' function with respect to IV vectors.

Thus we should not allow using 'bad' IV vectors for encryption only.
Adding a check
to gcm_encrypt() looks like a sufficient protection.

>
>>* In FIPS mode GCM encrypt() fails if the 'explicit IV' flag is set (and
>>works if IV was generated).
>
> It is definitely ok to limit the solution to FIPS mode, from a FIPS
> perspective.

Good!

> However, even in FIPS mode, I guess you want to accept IVs
> from the outside (e.g. when you set up an encryption channel where two
> sides need to agree on IVs). Therefore, there are valid reasons why an
> externally generated IV may make sense in FIPS mode. Thus, the
> enforcement of the flag in FIPS mode may cripple the implementation in a
> way that it may not be useful (just thinking aloud here).

See above.
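In code terms, the check being proposed could be as small as the following sketch. The marks.iv_explicit flag is hypothetical -- it does not exist in libgcrypt -- and the error code is only a placeholder for whatever a final patch would choose:

  /* In _gcry_cipher_gcm_encrypt (): refuse to encrypt in FIPS mode when
     the IV was supplied via setiv () instead of being produced by the
     proposed gcry_cipher_generate_nonce ().  Decryption paths would not
     perform this test. */
  if (fips_mode () && c->marks.iv_explicit)
    return GPG_ERR_INV_CIPHER_MODE;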
> > As you only refer to 8.2.1, why not consider 8.2.2 as well?
>
>Because TLS uses the 8.2.1 version of IV generation.

-- 
With best wishes
Dmitry

From smueller at chronox.de  Mon Aug  5 19:12:31 2013
From: smueller at chronox.de (Stephan Mueller)
Date: Mon, 05 Aug 2013 19:12:31 +0200
Subject: [RFC 2/2] FIXME: initial implementation of GCM
In-Reply-To:
References: <1375427655-5894-1-git-send-email-dbaryshkov@gmail.com> <22956096.puAdeCYO86@tauon>
Message-ID: <14210169.HvuJmyEayS@tauon>

Am Montag, 5. August 2013, 20:06:39 schrieb Dmitry Eremin-Solenikov:

Hi Dmitry,

>Hi Stephan,
>
>On Mon, Aug 5, 2013 at 7:55 PM, Stephan Mueller wrote:
>> Am Montag, 5. August 2013, 19:17:58 schrieb Dmitry Eremin-Solenikov:
>>>What about the following construction:
>>>* setiv() permits any iv, but then sets a special flag 'explicit IV'
>>>(?) * We add a special API like gcry_cipher_generate_nonce(), which
>>>will generate a nonce. In case of GCM it will use additional data and
>>>a
>>>counter (or clock_gettime(?))
>>>
>>> to generate an IV according to SP800-38D, 8.2.1.
>>>
>>>* GCM decrypt() works with any IV set or generated.
>>>
>> The enc/dec functions are not affected by the issue, because the setiv
>> function handles the IV. And I guess setiv would need the enforcement
>> logic to comply with 800-38D.
>
>Let me rephrase the issue in a little bit different way, so that you
>see my point:
>'GCM should not use the same IV for encryption more than once with a
>given key'. 800-38D talks about calling 'authenticated encryption'
>function with same IV. 800-38D does not have any requirements about
>'authenticate decryption' function with respect to IV vectors.
>
>Thus we should not allow using 'bad' IV vectors for encryption only.
>Adding a check
>to gcm_encrypt() looks like a sufficient protection.

Of course, here I am with you. Covering the encrypt operation is
sufficient.

Yet I fail to understand your proposal to handle 8.2.1 compliance. When
you use clock_gettime, there is a chance greater than zero that you get
two identical values. Note, CLOCK_REALTIME/CLOCK_MONOTONIC is affected
by ntp drift and ntp updates, CLOCK_MONOTONIC_RAW may not be available
on every system.

In general: how do you propose you want to construct or verify the
"fixed field" and the "invocation field"?

The invocation field must be modified after each authentication. That
modification must still ensure that the invocation field is unique. How
shall that be done?

Moreover, there is a specific length requirement for the IV.

>
>>>* In FIPS mode GCM encrypt() fails if the 'explicit IV' flag is set (and
>>>works if IV was generated).
>>>
>> It is definitely ok to limit the solution to FIPS mode, from a FIPS
>> perspective.
>
>Good!
>
>> However, even in FIPS mode, I guess you want to accept IVs
>> from the outside (e.g. when you set up an encryption channel where
>> two
>> sides need to agree on IVs). Therefore, there are valid reasons why
>> an
>> externally generated IV may make sense in FIPS mode. Thus, the
>> enforcement of the flag in FIPS mode may cripple the implementation
>> in a way that it may not be useful (just thinking aloud here).
>
>See above.
>
>> As you only refer to 8.2.1, why not consider 8.2.2 as well?
>
>Because TLS uses the 8.2.1 version of IV generation.

Thanks for the hint.

(PS: yes, I know: all are pointing out the issues, without telling that
you did a good job in creating the GCM implementation in the first place
:-) )

Ciao
Stephan
-- 
| Cui bono?
| From dbaryshkov at gmail.com Mon Aug 5 19:41:30 2013 From: dbaryshkov at gmail.com (Dmitry Eremin-Solenikov) Date: Mon, 5 Aug 2013 21:41:30 +0400 Subject: [RFC 2/2] FIXME: initial implementation of GCM In-Reply-To: <14210169.HvuJmyEayS@tauon> References: <1375427655-5894-1-git-send-email-dbaryshkov@gmail.com> <22956096.puAdeCYO86@tauon> <14210169.HvuJmyEayS@tauon> Message-ID: On Mon, Aug 5, 2013 at 9:12 PM, Stephan Mueller wrote: > Am Montag, 5. August 2013, 20:06:39 schrieb Dmitry Eremin-Solenikov: >>On Mon, Aug 5, 2013 at 7:55 PM, Stephan Mueller > wrote: >>> Am Montag, 5. August 2013, 19:17:58 schrieb Dmitry Eremin-Solenikov: >> >>Let me rephrase the issue in a little bit different way, so that you >>see my point: >>'GCM should not use the same IV for encryption more than once with a >>given key'. 800-38D talks about calling 'authenticated encryption' >>function with same IV. 800-38D does not have any requirements about >>'authenticate decryption' function with respect to IV vectors. >> >>Thus we should not allow using 'bad' IV vectors for encryption only. >>Adding a check >>to gcm_encrypt() looks like a sufficient protection. > > Of course, here I am with you. Covering the encrypt operation is > sufficient. > > Yet I fail to understand your proposal to handle 8.2.1 compliance. When > you use clock_gettime, there is a chance greater than zero that you get > two identical values. Note, CLOCK_REALTIME/CLOCK_MONOTONIC is affected > by ntp drift and ntp updates, CLOCK_MONOTONIC_RAW may not be available > on every system. I forgot about clock_monotonic being affected by ntp :( In the end this can be as simple, as static counter, being incremented on each call to generate_iv(). Or (if that would still satisfy 800-38D) have that counter in cipher context (oh, my, another one). Thus one can assume that if a key is set on the cipher context, all further generate_iv() invocations would increment that counter. > In general: how do you propose you want to construct or verify the > "fixed field" and the "invocation field"? "fixed field" can be specified during generate_iv() invocation. "invocation field" is a copy of a counter. > The invocation field must be modified after each authentication. That > modification must still ensure that the invocation field is unique. How > shall that be done? Hmm. I would increment that during generation. You have to reset internal GCM counter to 0 values (so that your tag is correct). Currently it is done either via setiv() (reset() will zero a marks.iv flag, thus next encryption/decryption operation will call gcm_setiv() internally). > Moreover, there is a specific length requirement for the IV. I think there are not. Spec says, that for |IV| < 96 bits, one must use 8.2.1, for |IV| >= 96, it should use one of 8.2.1 or 8.2.2 (but only one of them). [skipped] >>> As you only refer to 8.2.1, why not considering 8.2.2 as well? >> >>Because TLS uses 8.2.1 version of IV generation. > > Thanks for the hint. > > (PS: yes, I know: all are pointing out the issues, without telling that > you did a good job in creating the GCM implementation in the first place > :-) ) That is not a problem, I'm used to community reviews of patches. There is one issue/topic, I'd like to especially ask for review/suggestions: Is my usage/extension of handler fields correct, or I should somehow improve that? 
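To make the 8.2.1 construction under discussion concrete, here is one possible shape of such a generator: a 96-bit deterministic IV built from a caller-supplied 32-bit fixed field and a per-context 64-bit invocation counter. This is purely illustrative -- the struct and function names are not proposed API:

  #include <stdint.h>
  #include <string.h>

  struct gcm_iv_state
  {
    unsigned char fixed[4];   /* caller-supplied fixed field */
    uint64_t invocations;     /* bumped on every authenticated encryption */
  };

  /* Emit IV = fixed field || big-endian invocation counter, then
     increment the counter.  The counter must never repeat (and hence
     never wrap) while the same key is in use. */
  static void
  gcm_next_iv (struct gcm_iv_state *s, unsigned char iv[12])
  {
    int i;

    memcpy (iv, s->fixed, 4);
    for (i = 0; i < 8; i++)
      iv[4 + i] = (unsigned char)(s->invocations >> (56 - 8 * i));
    s->invocations++;
  }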
-- With best wishes Dmitry From smueller at chronox.de Mon Aug 5 20:05:31 2013 From: smueller at chronox.de (Stephan Mueller) Date: Mon, 05 Aug 2013 20:05:31 +0200 Subject: [RFC 2/2] FIXME: initial implementation of GCM In-Reply-To: References: <1375427655-5894-1-git-send-email-dbaryshkov@gmail.com> <14210169.HvuJmyEayS@tauon> Message-ID: <1410100.jm8NpcopG3@tauon> Am Montag, 5. August 2013, 21:41:30 schrieb Dmitry Eremin-Solenikov: Hi Dmitry, >On Mon, Aug 5, 2013 at 9:12 PM, Stephan Mueller wrote: >> Am Montag, 5. August 2013, 20:06:39 schrieb Dmitry Eremin-Solenikov: >>>On Mon, Aug 5, 2013 at 7:55 PM, Stephan Mueller >>> >> wrote: >>>> Am Montag, 5. August 2013, 19:17:58 schrieb Dmitry Eremin- Solenikov: >>>Let me rephrase the issue in a little bit different way, so that you >>>see my point: >>>'GCM should not use the same IV for encryption more than once with a >>>given key'. 800-38D talks about calling 'authenticated encryption' >>>function with same IV. 800-38D does not have any requirements about >>>'authenticate decryption' function with respect to IV vectors. >>> >>>Thus we should not allow using 'bad' IV vectors for encryption only. >>>Adding a check >>>to gcm_encrypt() looks like a sufficient protection. >>> >> Of course, here I am with you. Covering the encrypt operation is >> sufficient. >> >> Yet I fail to understand your proposal to handle 8.2.1 compliance. >> When you use clock_gettime, there is a chance greater than zero that >> you get two identical values. Note, CLOCK_REALTIME/CLOCK_MONOTONIC >> is affected by ntp drift and ntp updates, CLOCK_MONOTONIC_RAW may >> not be available on every system. > >I forgot about clock_monotonic being affected by ntp :( >In the end this can be as simple, as static counter, being incremented >on each call to generate_iv(). > >Or (if that would still satisfy 800-38D) have that counter in cipher >context (oh, my, another one). Thus one can assume that if a key is >set on the cipher context, >all further generate_iv() invocations would increment that counter. That solution is explicitly allowed in 8.2.1. "The invocation field typically is either 1) an integer counter or 2) a linear feedback shift register that is driven by a primitive polynomial to ensure a maximal cycle length. In either case, the invocation field increments upon each invocation of the authenticated encryption function." > >> In general: how do you propose you want to construct or verify the >> "fixed field" and the "invocation field"? > >"fixed field" can be specified during generate_iv() invocation. >"invocation field" is a copy of a counter. Well, shouldn't there be a service offered to generate an iv that even generates the field if the caller has no clue (e.g. provides NULL for the fixed field)? I mean, how shall a caller come up with a unique fixed field value? > >> The invocation field must be modified after each authentication. That >> modification must still ensure that the invocation field is unique. >> How shall that be done? > >Hmm. I would increment that during generation. You have to reset right, or use a shift register as specified in 8.2.1. >internal GCM counter to 0 values (so that your tag is correct). >Currently it is done either via setiv() (reset() will zero a marks.iv >flag, thus next encryption/decryption >operation will call gcm_setiv() internally). > >> Moreover, there is a specific length requirement for the IV. > >I think there are not. 
Spec says, that for |IV| < 96 bits, one must use >8.2.1, for |IV| >= 96, it should use one of 8.2.1 or 8.2.2 (but only >one of them). Right, scratch my remark. [...] Ciao Stephan -- | Cui bono? | From gniibe at fsij.org Tue Aug 6 04:28:12 2013 From: gniibe at fsij.org (NIIBE Yutaka) Date: Tue, 06 Aug 2013 11:28:12 +0900 Subject: cipher: fix memory leaks Message-ID: <1375756092.3199.2.camel@cfw2.gniibe.org> I found memory leaks, and following is my fix for master. Still, there is a leak for sexp_elements_extract_ecc when ELEMENT_NAMES is "pabgnqd" and WANT_PRIVATE is true, and Q is optional. In this case, elements[6] won't be freed, as elements[5] is NULL (for Q). We need to fix caller. ------------------------------------------ cipher: fix memory leak. * cipher/elgamal.c (elg_generate_ext): Free XVALUE. * cipher/pubkey.c (sexp_elements_extract): Don't use IDX for loop. Call mpi_free. (sexp_elements_extract_ecc): Call mpi_free. diff --git a/cipher/elgamal.c b/cipher/elgamal.c index b40d132..7540e3f 100644 --- a/cipher/elgamal.c +++ b/cipher/elgamal.c @@ -641,7 +641,10 @@ elg_generate_ext (int algo, unsigned int nbits, unsigned long evalue, } if (xvalue) - ec = generate_using_x (&sk, nbits, xvalue, retfactors); + { + ec = generate_using_x (&sk, nbits, xvalue, retfactors); + mpi_free (xvalue); + } else { generate (&sk, nbits, retfactors); diff --git a/cipher/pubkey.c b/cipher/pubkey.c index b540bd5..e867169 100644 --- a/cipher/pubkey.c +++ b/cipher/pubkey.c @@ -1832,8 +1832,8 @@ sexp_elements_extract (gcry_sexp_t key_sexp, const char *element_names, if (!err) { /* Check that all elements are available. */ - for (name = element_names, idx = 0; *name; name++, idx++) - if (!elements[idx]) + for (name = element_names, i = 0; *name; name++, i++) + if (!elements[i]) break; if (*name) { @@ -1857,7 +1857,7 @@ sexp_elements_extract (gcry_sexp_t key_sexp, const char *element_names, { for (i = 0; i < idx; i++) if (elements[i]) - gcry_free (elements[i]); + mpi_free (elements[i]); } return err; } @@ -1963,7 +1963,7 @@ sexp_elements_extract_ecc (gcry_sexp_t key_sexp, const char *element_names, { for (name = element_names, idx = 0; *name; name++, idx++) if (elements[idx]) - gcry_free (elements[idx]); + mpi_free (elements[idx]); } return err; } From gniibe at fsij.org Tue Aug 6 06:03:18 2013 From: gniibe at fsij.org (NIIBE Yutaka) Date: Tue, 06 Aug 2013 13:03:18 +0900 Subject: tests: fix memory leaks Message-ID: <1375761798.3199.4.camel@cfw2.gniibe.org> I found memory leaks in test code. It's better to fix these as people may use the code as example. -------------------------- tests: fix memory leaks. * tests/pubkey.c (check_keys_crypt): Release L, X0, and X1. (check_keys): Release X. diff --git a/tests/pubkey.c b/tests/pubkey.c index ffaecb3..baf234c 100644 --- a/tests/pubkey.c +++ b/tests/pubkey.c @@ -144,6 +144,7 @@ check_keys_crypt (gcry_sexp_t pkey, gcry_sexp_t skey, /* Extract data from plaintext. */ l = gcry_sexp_find_token (plain0, "value", 0); x0 = gcry_sexp_nth_mpi (l, 1, GCRYMPI_FMT_USG); + gcry_sexp_release (l); /* Encrypt data. */ rc = gcry_pk_encrypt (&cipher, plain0, pkey); @@ -160,7 +161,10 @@ check_keys_crypt (gcry_sexp_t pkey, gcry_sexp_t skey, if (rc) { if (decrypt_fail_code && gpg_err_code (rc) == decrypt_fail_code) - return; /* This is the expected failure code. */ + { + gcry_mpi_release (x0); + return; /* This is the expected failure code. */ + } die ("decryption failed: %s\n", gcry_strerror (rc)); } @@ -189,6 +193,8 @@ check_keys_crypt (gcry_sexp_t pkey, gcry_sexp_t skey, /* Compare. 
*/ if (gcry_mpi_cmp (x0, x1)) die ("data corrupted\n"); + gcry_mpi_release (x0); + gcry_mpi_release (x1); } static void @@ -218,6 +224,7 @@ check_keys (gcry_sexp_t pkey, gcry_sexp_t skey, unsigned int nbits_data, rc = gcry_sexp_build (&plain, NULL, "(data (flags raw no-blinding) (value %m))", x); + gcry_mpi_release (x); if (rc) die ("converting data for encryption failed: %s\n", gcry_strerror (rc)); -- From gniibe at fsij.org Tue Aug 6 07:38:05 2013 From: gniibe at fsij.org (NIIBE Yutaka) Date: Tue, 06 Aug 2013 14:38:05 +0900 Subject: cipher: fix memory leaks In-Reply-To: <1375756092.3199.2.camel@cfw2.gniibe.org> References: <1375756092.3199.2.camel@cfw2.gniibe.org> Message-ID: <1375767485.3199.6.camel@cfw2.gniibe.org> On 2013-08-06 at 11:28 +0900, NIIBE Yutaka wrote: > I found memory leaks, and following is my fix for master. Another fix. diff --git a/cipher/ecc.c b/cipher/ecc.c index 375eeaf..b694d76 100644 --- a/cipher/ecc.c +++ b/cipher/ecc.c @@ -1319,6 +1319,7 @@ ecc_get_curve (gcry_mpi_t *pkey, int iterator, unsigned int *r_nbits) tmp = scanval (domain_parms[idx].g_y); if (!mpi_cmp (tmp, E.G.y)) { + mpi_free (tmp); result = domain_parms[idx].desc; if (r_nbits) *r_nbits = domain_parms[idx].nbits; -- From v.chandla at samsung.com Tue Aug 6 08:43:21 2013 From: v.chandla at samsung.com (Vishal Chandla) Date: Tue, 06 Aug 2013 06:43:21 +0000 (GMT) Subject: Fix memory leaks. Message-ID: <9A.B0.07473.A0B90025@epcpsbgx2.samsung.com> An HTML attachment was scrubbed... URL: -------------- next part -------------- A non-text attachment was scrubbed... Name: 201308061213652_R2OCEO92.gif Type: image/gif Size: 14036 bytes Desc: not available URL: From wk at gnupg.org Tue Aug 6 09:48:08 2013 From: wk at gnupg.org (Werner Koch) Date: Tue, 06 Aug 2013 09:48:08 +0200 Subject: cipher: fix memory leaks In-Reply-To: <1375756092.3199.2.camel@cfw2.gniibe.org> (NIIBE Yutaka's message of "Tue, 06 Aug 2013 11:28:12 +0900") References: <1375756092.3199.2.camel@cfw2.gniibe.org> Message-ID: <87zjsvl0p3.fsf@vigenere.g10code.de> On Tue, 6 Aug 2013 04:28, gniibe at fsij.org said: > I found memory leaks, and following is my fix for master. Please push your fixes; you may want to merge them first, though. Shalom-Salam, Werner -- Die Gedanken sind frei. Ausnahmen regelt ein Bundesgesetz. From gniibe at fsij.org Tue Aug 6 10:35:48 2013 From: gniibe at fsij.org (NIIBE Yutaka) Date: Tue, 06 Aug 2013 17:35:48 +0900 Subject: cipher: fix memory leaks In-Reply-To: <87zjsvl0p3.fsf@vigenere.g10code.de> References: <1375756092.3199.2.camel@cfw2.gniibe.org> <87zjsvl0p3.fsf@vigenere.g10code.de> Message-ID: <1375778148.3199.7.camel@cfw2.gniibe.org> On 2013-08-06 at 09:48 +0200, Werner Koch wrote: > Please push your fixes; you may want to merge them first, though. Done. For ECC ("pabgnqd", with no Q, but D), I try following patch. It works, but doesn't look so good. 
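The core of the patch below, in isolation: the NULL-terminated walk
stops at the first hole, so with "pabgnqd" and an absent Q the trailing
D is never freed; walking an explicit element count survives interior
NULLs. A simplified standalone sketch of the two styles (gcry_mpi_t and
mpi_free as in libgcrypt, everything else hypothetical):

#include <stddef.h>

/* Old style: stops at the first NULL, so elements after an interior
   hole (e.g. a missing Q before D) leak.  */
static void
release_by_null_walk (gcry_mpi_t *array)
{
  for (; *array; array++)
    {
      mpi_free (*array);
      *array = NULL;
    }
}

/* New style: an explicit count visits every slot, NULL or not.  */
static void
release_by_count (gcry_mpi_t *array, size_t n)
{
  size_t i;

  for (i = 0; i < n; i++)
    if (array[i])
      {
        mpi_free (array[i]);
        array[i] = NULL;
      }
}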
diff --git a/cipher/pubkey.c b/cipher/pubkey.c index e867169..4bf74d7 100644 --- a/cipher/pubkey.c +++ b/cipher/pubkey.c @@ -304,13 +304,16 @@ _gcry_pk_unregister (gcry_module_t module) } static void -release_mpi_array (gcry_mpi_t *array) +release_mpi_array (gcry_mpi_t *array, size_t n) { - for (; *array; array++) - { - mpi_free(*array); - *array = NULL; - } + int i; + + for (i = 0; i < n; i++) + if (array[i]) + { + mpi_free (array[i]); + array[i] = NULL; + } } /**************** @@ -1854,11 +1857,7 @@ sexp_elements_extract (gcry_sexp_t key_sexp, const char *element_names, if (err) - { - for (i = 0; i < idx; i++) - if (elements[i]) - mpi_free (elements[i]); - } + release_mpi_array (elements, idx); return err; } @@ -1875,6 +1874,7 @@ sexp_elements_extract_ecc (gcry_sexp_t key_sexp, const char *element_names, int idx; const char *name; gcry_sexp_t list; + size_t element_size; /* Clear the array for easier error cleanup. */ for (name = element_names, idx = 0; *name; name++, idx++) @@ -1883,7 +1883,7 @@ sexp_elements_extract_ecc (gcry_sexp_t key_sexp, const char *element_names, (params only) or 6 (full public key). */ if (idx == 5) elements[5] = NULL; /* Extra clear for the params only case. */ - + element_size = idx; /* Init the array with the available curve parameters. */ for (name = element_names, idx = 0; *name && !err; name++, idx++) @@ -1960,11 +1960,7 @@ sexp_elements_extract_ecc (gcry_sexp_t key_sexp, const char *element_names, leave: if (err) - { - for (name = element_names, idx = 0; *name; name++, idx++) - if (elements[idx]) - mpi_free (elements[idx]); - } + release_mpi_array (elements, element_size); return err; } @@ -2009,8 +2005,8 @@ sexp_elements_extract_ecc (gcry_sexp_t key_sexp, const char *element_names, */ static gcry_err_code_t sexp_to_key (gcry_sexp_t sexp, int want_private, int use, - const char *override_elems, - gcry_mpi_t **retarray, gcry_module_t *retalgo, int *r_is_ecc) + const char *override_elems, gcry_mpi_t **retarray, + size_t *retsize, gcry_module_t *retalgo, int *r_is_ecc) { gcry_err_code_t err = 0; gcry_sexp_t list, l2; @@ -2087,7 +2083,8 @@ sexp_to_key (gcry_sexp_t sexp, int want_private, int use, elems = pubkey->elements_skey; else elems = pubkey->elements_pkey; - array = gcry_calloc (strlen (elems) + 1, sizeof (*array)); + *retsize = strlen (elems) + 1; /* We need +1 for ECC. */ + array = gcry_calloc (*retsize, sizeof (*array)); if (!array) err = gpg_err_code_from_syserror (); if (!err) @@ -2122,7 +2119,7 @@ sexp_to_key (gcry_sexp_t sexp, int want_private, int use, static gcry_err_code_t -sexp_to_sig (gcry_sexp_t sexp, gcry_mpi_t **retarray, +sexp_to_sig (gcry_sexp_t sexp, gcry_mpi_t **retarray, size_t *retsize, gcry_module_t *retalgo) { gcry_err_code_t err = 0; @@ -2182,7 +2179,8 @@ sexp_to_sig (gcry_sexp_t sexp, gcry_mpi_t **retarray, pubkey = (gcry_pk_spec_t *) module->spec; elems = pubkey->elements_sig; - array = gcry_calloc (strlen (elems) + 1 , sizeof *array ); + *retsize = strlen (elems); + array = gcry_calloc (*retsize, sizeof *array ); if (!array) err = gpg_err_code_from_syserror (); @@ -2279,8 +2277,9 @@ get_hash_algo (const char *s, size_t n) * case raw encoding is used. 
*/ static gcry_err_code_t -sexp_to_enc (gcry_sexp_t sexp, gcry_mpi_t **retarray, gcry_module_t *retalgo, - int *ret_modern, int *flags, struct pk_encoding_ctx *ctx) +sexp_to_enc (gcry_sexp_t sexp, gcry_mpi_t **retarray, size_t *retsize, + gcry_module_t *retalgo, int *ret_modern, int *flags, + struct pk_encoding_ctx *ctx) { gcry_err_code_t err = 0; gcry_sexp_t list = NULL, l2 = NULL; @@ -2442,7 +2441,8 @@ sexp_to_enc (gcry_sexp_t sexp, gcry_mpi_t **retarray, gcry_module_t *retalgo, pubkey = (gcry_pk_spec_t *) module->spec; elems = pubkey->elements_enc; - array = gcry_calloc (strlen (elems) + 1, sizeof (*array)); + *retsize = strlen (elems); + array = gcry_calloc (*retsize, sizeof (*array)); if (!array) { err = gpg_err_code_from_syserror (); @@ -2929,13 +2929,16 @@ gcry_pk_encrypt (gcry_sexp_t *r_ciph, gcry_sexp_t s_data, gcry_sexp_t s_pkey) gcry_err_code_t rc; gcry_pk_spec_t *pubkey = NULL; gcry_module_t module = NULL; + size_t pkey_size = 0; + size_t ciph_size = 0; *r_ciph = NULL; REGISTER_DEFAULT_PUBKEYS; /* Get the key. */ - rc = sexp_to_key (s_pkey, 0, GCRY_PK_USAGE_ENCR, NULL, &pkey, &module, NULL); + rc = sexp_to_key (s_pkey, 0, GCRY_PK_USAGE_ENCR, NULL, &pkey, &pkey_size, + &module, NULL); if (rc) goto leave; @@ -2960,7 +2963,8 @@ gcry_pk_encrypt (gcry_sexp_t *r_ciph, gcry_sexp_t s_data, gcry_sexp_t s_pkey) goto leave; /* Now we can encrypt DATA to CIPH. */ - ciph = gcry_calloc (strlen (algo_elems) + 1, sizeof (*ciph)); + ciph_size = strlen (algo_elems); + ciph = gcry_calloc (ciph_size, sizeof (*ciph)); if (!ciph) { rc = gpg_err_code_from_syserror (); @@ -3042,13 +3046,13 @@ gcry_pk_encrypt (gcry_sexp_t *r_ciph, gcry_sexp_t s_data, gcry_sexp_t s_pkey) leave: if (pkey) { - release_mpi_array (pkey); + release_mpi_array (pkey, pkey_size); gcry_free (pkey); } if (ciph) { - release_mpi_array (ciph); + release_mpi_array (ciph, ciph_size); gcry_free (ciph); } @@ -3102,6 +3106,8 @@ gcry_pk_decrypt (gcry_sexp_t *r_plain, gcry_sexp_t s_data, gcry_sexp_t s_skey) struct pk_encoding_ctx ctx; gcry_err_code_t rc; gcry_module_t module_enc = NULL, module_key = NULL; + size_t skey_size = 0; + size_t data_size = 0; *r_plain = NULL; ctx.label = NULL; @@ -3109,12 +3115,13 @@ gcry_pk_decrypt (gcry_sexp_t *r_plain, gcry_sexp_t s_data, gcry_sexp_t s_skey) REGISTER_DEFAULT_PUBKEYS; rc = sexp_to_key (s_skey, 1, GCRY_PK_USAGE_ENCR, NULL, - &skey, &module_key, NULL); + &skey, &skey_size, &module_key, NULL); if (rc) goto leave; init_encoding_ctx (&ctx, PUBKEY_OP_DECRYPT, gcry_pk_get_nbits (s_skey)); - rc = sexp_to_enc (s_data, &data, &module_enc, &modern, &flags, &ctx); + rc = sexp_to_enc (s_data, &data, &data_size, &module_enc, &modern, + &flags, &ctx); if (rc) goto leave; @@ -3165,7 +3172,7 @@ gcry_pk_decrypt (gcry_sexp_t *r_plain, gcry_sexp_t s_data, gcry_sexp_t s_skey) if (skey) { - release_mpi_array (skey); + release_mpi_array (skey, skey_size); gcry_free (skey); } @@ -3173,7 +3180,7 @@ gcry_pk_decrypt (gcry_sexp_t *r_plain, gcry_sexp_t s_data, gcry_sexp_t s_skey) if (data) { - release_mpi_array (data); + release_mpi_array (data, data_size); gcry_free (data); } @@ -3233,13 +3240,15 @@ gcry_pk_sign (gcry_sexp_t *r_sig, gcry_sexp_t s_hash, gcry_sexp_t s_skey) int i; int is_ecc; gcry_err_code_t rc; + size_t skey_size = 0; + size_t result_size = 0; *r_sig = NULL; REGISTER_DEFAULT_PUBKEYS; rc = sexp_to_key (s_skey, 1, GCRY_PK_USAGE_SIGN, NULL, - &skey, &module, &is_ecc); + &skey, &skey_size, &module, &is_ecc); if (rc) goto leave; @@ -3260,7 +3269,8 @@ gcry_pk_sign (gcry_sexp_t *r_sig, gcry_sexp_t s_hash, 
gcry_sexp_t s_skey) if (rc) goto leave; - result = gcry_calloc (strlen (algo_elems) + 1, sizeof (*result)); + result_size = strlen (algo_elems); + result = gcry_calloc (result_size, sizeof (*result)); if (!result) { rc = gpg_err_code_from_syserror (); @@ -3339,7 +3349,7 @@ gcry_pk_sign (gcry_sexp_t *r_sig, gcry_sexp_t s_hash, gcry_sexp_t s_skey) leave: if (skey) { - release_mpi_array (skey); + release_mpi_array (skey, skey_size); gcry_free (skey); } @@ -3348,7 +3358,7 @@ gcry_pk_sign (gcry_sexp_t *r_sig, gcry_sexp_t s_hash, gcry_sexp_t s_skey) if (result) { - release_mpi_array (result); + release_mpi_array (result, result_size); gcry_free (result); } @@ -3370,15 +3380,17 @@ gcry_pk_verify (gcry_sexp_t s_sig, gcry_sexp_t s_hash, gcry_sexp_t s_pkey) gcry_mpi_t *pkey = NULL, hash = NULL, *sig = NULL; struct pk_encoding_ctx ctx; gcry_err_code_t rc; + size_t pkey_size = 0; + size_t sig_size = 0; REGISTER_DEFAULT_PUBKEYS; rc = sexp_to_key (s_pkey, 0, GCRY_PK_USAGE_SIGN, NULL, - &pkey, &module_key, NULL); + &pkey, &pkey_size, &module_key, NULL); if (rc) goto leave; - rc = sexp_to_sig (s_sig, &sig, &module_sig); + rc = sexp_to_sig (s_sig, &sig, &sig_size, &module_sig); if (rc) goto leave; @@ -3403,12 +3415,12 @@ gcry_pk_verify (gcry_sexp_t s_sig, gcry_sexp_t s_hash, gcry_sexp_t s_pkey) leave: if (pkey) { - release_mpi_array (pkey); + release_mpi_array (pkey, pkey_size); gcry_free (pkey); } if (sig) { - release_mpi_array (sig); + release_mpi_array (sig, sig_size); gcry_free (sig); } if (hash) @@ -3443,15 +3455,16 @@ gcry_pk_testkey (gcry_sexp_t s_key) gcry_module_t module = NULL; gcry_mpi_t *key = NULL; gcry_err_code_t rc; + size_t key_size = 0; REGISTER_DEFAULT_PUBKEYS; /* Note we currently support only secret key checking. */ - rc = sexp_to_key (s_key, 1, 0, NULL, &key, &module, NULL); + rc = sexp_to_key (s_key, 1, 0, NULL, &key, &key_size, &module, NULL); if (! rc) { rc = pubkey_check_secret_key (module->mod_id, key); - release_mpi_array (key); + release_mpi_array (key, key_size); gcry_free (key); } return gcry_error (rc); @@ -3511,6 +3524,8 @@ gcry_pk_genkey (gcry_sexp_t *r_key, gcry_sexp_t s_parms) gcry_sexp_t extrainfo = NULL; unsigned int nbits = 0; unsigned long use_e = 0; + size_t skey_size = 0; + size_t factors_size = 0; skey[0] = NULL; *r_key = NULL; @@ -3618,7 +3633,7 @@ gcry_pk_genkey (gcry_sexp_t *r_key, gcry_sexp_t s_parms) /* Key generation succeeded: Build an S-expression. */ { char *string, *p; - size_t nelem=0, nelem_cp = 0, needed=0; + size_t nelem = 0, nelem_cp = 0, needed=0; gcry_mpi_t mpis[30]; int percent_s_idx = -1; @@ -3628,6 +3643,7 @@ gcry_pk_genkey (gcry_sexp_t *r_key, gcry_sexp_t s_parms) { for (i = 0; factors[i]; i++) nelem++; + factors_size = i; } nelem_cp = nelem; @@ -3674,8 +3690,7 @@ gcry_pk_genkey (gcry_sexp_t *r_key, gcry_sexp_t s_parms) } p = stpcpy (p, "))"); - /* Hack to make release_mpi_array() work. */ - skey[i] = NULL; + skey_size = i; if (extrainfo && percent_s_idx == -1) { @@ -3736,12 +3751,12 @@ gcry_pk_genkey (gcry_sexp_t *r_key, gcry_sexp_t s_parms) leave: gcry_free (name); gcry_sexp_release (extrainfo); - release_mpi_array (skey); + release_mpi_array (skey, skey_size); /* Don't free SKEY itself, it is an stack allocated array. 
*/ if (factors) { - release_mpi_array ( factors ); + release_mpi_array (factors, factors_size); gcry_free (factors); } @@ -3773,6 +3788,7 @@ gcry_pk_get_nbits (gcry_sexp_t key) gcry_mpi_t *keyarr = NULL; unsigned int nbits = 0; gcry_err_code_t rc; + size_t keyarr_size = 0; REGISTER_DEFAULT_PUBKEYS; @@ -3780,9 +3796,9 @@ gcry_pk_get_nbits (gcry_sexp_t key) ECC we would only need to look at P and stop parsing right away. */ - rc = sexp_to_key (key, 0, 0, NULL, &keyarr, &module, NULL); + rc = sexp_to_key (key, 0, 0, NULL, &keyarr, &keyarr_size, &module, NULL); if (rc == GPG_ERR_INV_OBJ) - rc = sexp_to_key (key, 1, 0, NULL, &keyarr, &module, NULL); + rc = sexp_to_key (key, 1, 0, NULL, &keyarr, &keyarr_size, &module, NULL); if (rc) return 0; /* Error - 0 is a suitable indication for that. */ @@ -3793,7 +3809,7 @@ gcry_pk_get_nbits (gcry_sexp_t key) _gcry_module_release (module); ath_mutex_unlock (&pubkeys_registered_lock); - release_mpi_array (keyarr); + release_mpi_array (keyarr, keyarr_size); gcry_free (keyarr); return nbits; @@ -3922,6 +3938,7 @@ gcry_pk_get_curve (gcry_sexp_t key, int iterator, unsigned int *r_nbits) char *name = NULL; const char *result = NULL; int want_private = 1; + size_t pkey_size = 0; if (r_nbits) *r_nbits = 0; @@ -3953,7 +3970,8 @@ gcry_pk_get_curve (gcry_sexp_t key, int iterator, unsigned int *r_nbits) /* Get the key. We pass the names of the parameters for override_elems; this allows to call this function without the actual public key parameter. */ - if (sexp_to_key (key, want_private, 0, "pabgn", &pkey, &module, NULL)) + if (sexp_to_key (key, want_private, 0, "pabgn", &pkey, &pkey_size, + &module, NULL)) goto leave; } else @@ -3974,7 +3992,7 @@ gcry_pk_get_curve (gcry_sexp_t key, int iterator, unsigned int *r_nbits) leave: if (pkey) { - release_mpi_array (pkey); + release_mpi_array (pkey, pkey_size); gcry_free (pkey); } if (module) -- From wk at gnupg.org Tue Aug 6 14:10:20 2013 From: wk at gnupg.org (Werner Koch) Date: Tue, 06 Aug 2013 14:10:20 +0200 Subject: Fix memory leaks. In-Reply-To: <9A.B0.07473.A0B90025@epcpsbgx2.samsung.com> (Vishal Chandla's message of "Tue, 06 Aug 2013 06:43:21 +0000 (GMT)") References: <9A.B0.07473.A0B90025@epcpsbgx2.samsung.com> Message-ID: <87a9kvkok3.fsf@vigenere.g10code.de> On Tue, 6 Aug 2013 08:43, v.chandla at samsung.com said: > I found some more memory leaks, and following is my fix for master. > > --------------------------------------------------- > > diff --git a/cipher/ac.c b/cipher/ac.c Well, ac.c has actually be removed from master. Shalom-Salam, Werner -- Die Gedanken sind frei. Ausnahmen regelt ein Bundesgesetz. From smueller at chronox.de Wed Aug 7 01:04:52 2013 From: smueller at chronox.de (Stephan Mueller) Date: Wed, 07 Aug 2013 01:04:52 +0200 Subject: [RFC 2/2] FIXME: initial implementation of GCM In-Reply-To: <1410100.jm8NpcopG3@tauon> References: <1375427655-5894-1-git-send-email-dbaryshkov@gmail.com> <1410100.jm8NpcopG3@tauon> Message-ID: <4517473.et8hzpF21q@tauon> Am Montag, 5. August 2013, 20:05:31 schrieb Stephan Mueller: Hi Stephan, >Am Montag, 5. August 2013, 21:41:30 schrieb Dmitry Eremin-Solenikov: > >Hi Dmitry, > >>On Mon, Aug 5, 2013 at 9:12 PM, Stephan Mueller > >wrote: >>> Am Montag, 5. August 2013, 20:06:39 schrieb Dmitry Eremin-Solenikov: >>>>On Mon, Aug 5, 2013 at 7:55 PM, Stephan Mueller >>>> >>>> >>> wrote: >>>>> Am Montag, 5. 
August 2013, 19:17:58 schrieb Dmitry Eremin- > >Solenikov: >>>>Let me rephrase the issue in a little bit different way, so that you >>>>see my point: >>>>'GCM should not use the same IV for encryption more than once with a >>>>given key'. 800-38D talks about calling 'authenticated encryption' >>>>function with same IV. 800-38D does not have any requirements about >>>>'authenticate decryption' function with respect to IV vectors. >>>> >>>>Thus we should not allow using 'bad' IV vectors for encryption only. >>>>Adding a check >>>>to gcm_encrypt() looks like a sufficient protection. >>>> >>> Of course, here I am with you. Covering the encrypt operation is >>> sufficient. >>> >>> Yet I fail to understand your proposal to handle 8.2.1 compliance. >>> When you use clock_gettime, there is a chance greater than zero that >>> you get two identical values. Note, CLOCK_REALTIME/CLOCK_MONOTONIC >>> is affected by ntp drift and ntp updates, CLOCK_MONOTONIC_RAW may >>> not be available on every system. >> >>I forgot about clock_monotonic being affected by ntp :( >>In the end this can be as simple, as static counter, being incremented >>on each call to generate_iv(). >> >>Or (if that would still satisfy 800-38D) have that counter in cipher >>context (oh, my, another one). Thus one can assume that if a key is >>set on the cipher context, >>all further generate_iv() invocations would increment that counter. > >That solution is explicitly allowed in 8.2.1. There is one way that has been accepted that you may want to consider: have the setiv function that hardly can check anything wrt section 8.2.1. Maybe it can check minimum lengths. For example, OpenSSL checks for a min length of 96 bits. But in addition, have a geniv function where the implementation generates an iv. As a rule of thumb, the fixed field is a value that shall be under control of the caller. The invocation field shall be under control of the cipher. I.e. when generating an IV, allow the caller to provide the fixed field. Another catch with the IV: SP800-38D requires that one key shall only be used with at most 2**32 different IVs. If you hit that threshold, you must not continue with the crypto operation, but inform the caller to set up a new key. Do you have such a check? > >"The invocation field typically is either 1) an integer counter or 2) a >linear feedback shift register >that is driven by a primitive polynomial to ensure a maximal cycle >length. In either case, the >invocation field increments upon each invocation of the authenticated >encryption function." > >>> In general: how do you propose you want to construct or verify the >>> "fixed field" and the "invocation field"? >> >>"fixed field" can be specified during generate_iv() invocation. >>"invocation field" is a copy of a counter. > >Well, shouldn't there be a service offered to generate an iv that even >generates the field if the caller has no clue (e.g. provides NULL for >the fixed field)? I mean, how shall a caller come up with a unique >fixed field value? > >>> The invocation field must be modified after each authentication. >>> That >>> modification must still ensure that the invocation field is unique. >>> How shall that be done? >> >>Hmm. I would increment that during generation. You have to reset > >right, or use a shift register as specified in 8.2.1. > >>internal GCM counter to 0 values (so that your tag is correct). 
>>Currently it is done either via setiv() (reset() will zero a marks.iv >>flag, thus next encryption/decryption >>operation will call gcm_setiv() internally). >> >>> Moreover, there is a specific length requirement for the IV. >> >>I think there are not. Spec says, that for |IV| < 96 bits, one must >>use 8.2.1, for |IV| >= 96, it should use one of 8.2.1 or 8.2.2 (but >>only one of them). > >Right, scratch my remark. > > >[...] > >Ciao >Stephan Ciao Stephan -- | Cui bono? | From gniibe at fsij.org Wed Aug 7 02:05:26 2013 From: gniibe at fsij.org (NIIBE Yutaka) Date: Wed, 07 Aug 2013 09:05:26 +0900 Subject: tests: fix memory leaks In-Reply-To: <1375761798.3199.4.camel@cfw2.gniibe.org> References: <1375761798.3199.4.camel@cfw2.gniibe.org> Message-ID: <1375833926.3237.0.camel@cfw2.gniibe.org> On 2013-08-06 at 13:03 +0900, NIIBE Yutaka wrote: > I found memory leaks in test code. It's better to fix these > as people may use the code as example. Here is another fix for test code. Another reason of these fixes is that when we will find some memory leaks in future running test code, we will be able to distinguish them as bugs. OK to commit? -------------------------------- tests: fix memory leaks. * tests/benchmark.c (dsa_bench): Release SIG. * tests/mpitests.c (test_powm): Release BASE, EXP, MOD, and RES. * tests/prime.c (check_primes): Release PRIME. * tests/tsexp.c (basic): Use intermediate variable M for constant. Release S1, S2 and A. diff --git a/tests/benchmark.c b/tests/benchmark.c index 79048a3..f332003 100644 --- a/tests/benchmark.c +++ b/tests/benchmark.c @@ -864,7 +864,7 @@ dsa_bench (int iterations, int print_header) int p_sizes[3] = { 1024, 2048, 3072 }; int q_sizes[3] = { 160, 224, 256 }; gcry_sexp_t data; - gcry_sexp_t sig; + gcry_sexp_t sig = NULL; int i, j; err = gcry_sexp_sscan (pub_key+0, NULL, sample_public_dsa_key_1024, @@ -916,6 +916,7 @@ dsa_bench (int iterations, int print_header) start_timer (); for (j=0; j < iterations; j++) { + gcry_sexp_release (sig); err = gcry_pk_sign (&sig, data, sec_key[i]); if (err) { @@ -947,6 +948,7 @@ dsa_bench (int iterations, int print_header) gcry_sexp_release (sig); gcry_sexp_release (data); + sig = NULL; } diff --git a/tests/mpitests.c b/tests/mpitests.c index 432f3e8..03c15b9 100644 --- a/tests/mpitests.c +++ b/tests/mpitests.c @@ -362,6 +362,10 @@ test_powm (void) if (gcry_mpi_cmp (res, base)) die ("test_powm failed at %d\n", __LINE__); + gcry_mpi_release (base); + gcry_mpi_release (exp); + gcry_mpi_release (mod); + gcry_mpi_release (res); /* Fixme: We should add the rest of the cases of course. 
*/ diff --git a/tests/prime.c b/tests/prime.c index 6e825ae..89800e8 100644 --- a/tests/prime.c +++ b/tests/prime.c @@ -95,6 +95,7 @@ check_primes (void) gcry_mpi_add_ui (prime, prime, 1); err = gcry_prime_check (prime, 0); assert (err); + gcry_mpi_release (prime); prime = NULL; } } diff --git a/tests/tsexp.c b/tests/tsexp.c index cef3ed1..7c4f7c8 100644 --- a/tests/tsexp.c +++ b/tests/tsexp.c @@ -89,44 +89,52 @@ basic (void) for (pass=0;;pass++) { + gcry_mpi_t m; + switch (pass) { case 0: string = ("(public-key (dsa (p #41424344#) (y this_is_y) " "(q #61626364656667#) (g %m)))"); - if ( gcry_sexp_build (&sexp, NULL, string, - gcry_mpi_set_ui (NULL, 42)) ) + m = gcry_mpi_set_ui (NULL, 42); + if ( gcry_sexp_build (&sexp, NULL, string, m ) ) { + gcry_mpi_release (m); fail (" scanning `%s' failed\n", string); return; } + gcry_mpi_release (m); break; case 1: string = ("(public-key (dsa (p #41424344#) (y this_is_y) " "(q %b) (g %m)))"); + m = gcry_mpi_set_ui (NULL, 42); if ( gcry_sexp_build (&sexp, NULL, string, - 15, "foo\0\x01\0x02789012345", - gcry_mpi_set_ui (NULL, 42)) ) + 15, "foo\0\x01\0x02789012345", m) ) { + gcry_mpi_release (m); fail (" scanning `%s' failed\n", string); return; } + gcry_mpi_release (m); break; case 2: string = ("(public-key (dsa (p #41424344#) (y silly_y_value) " "(q %b) (g %m)))"); + m = gcry_mpi_set_ui (NULL, 17); if ( gcry_sexp_build (&sexp, NULL, string, - secure_buffer_len, secure_buffer, - gcry_mpi_set_ui (NULL, 17)) ) + secure_buffer_len, secure_buffer, m) ) { + gcry_mpi_release (m); fail (" scanning `%s' failed\n", string); return; } + gcry_mpi_release (m); if (!gcry_is_secure (sexp)) fail ("gcry_sexp_build did not switch to secure memory\n"); break; @@ -144,13 +152,15 @@ basic (void) string = ("(public-key (dsa (p #41424344#) (parm %S) " "(y dummy)(q %b) (g %m)))"); + m = gcry_mpi_set_ui (NULL, 17); if ( gcry_sexp_build (&sexp, NULL, string, help_sexp, - secure_buffer_len, secure_buffer, - gcry_mpi_set_ui (NULL, 17)) ) + secure_buffer_len, secure_buffer, m) ) { + gcry_mpi_release (m); fail (" scanning `%s' failed\n", string); return; } + gcry_mpi_release (m); gcry_sexp_release (help_sexp); } break; @@ -181,6 +191,7 @@ basic (void) p = gcry_sexp_nth_data (s1, 0, &n); if (!p) { + gcry_sexp_release (s1); fail ("no car for `%s'\n", token); continue; } @@ -189,13 +200,16 @@ basic (void) s2 = gcry_sexp_cdr (s1); if (!s2) { + gcry_sexp_release (s1); fail ("no cdr for `%s'\n", token); continue; } p = gcry_sexp_nth_data (s2, 0, &n); + gcry_sexp_release (s2); if (p) { + gcry_sexp_release (s1); fail ("data at car of `%s'\n", token); continue; } @@ -203,6 +217,7 @@ basic (void) if (parm) { s2 = gcry_sexp_find_token (s1, parm, strlen (parm)); + gcry_sexp_release (s1); if (!s2) { fail ("didn't found `%s'\n", parm); @@ -211,6 +226,7 @@ basic (void) p = gcry_sexp_nth_data (s2, 0, &n); if (!p) { + gcry_sexp_release (s2); fail("no car for `%s'\n", parm ); continue; } @@ -218,18 +234,23 @@ basic (void) p = gcry_sexp_nth_data (s2, 1, &n); if (!p) { + gcry_sexp_release (s2); fail("no cdr for `%s'\n", parm ); continue; } info ("cdr=`%.*s'\n", (int)n, p); a = gcry_sexp_nth_mpi (s2, 0, GCRYMPI_FMT_USG); + gcry_sexp_release (s2); if (!a) { fail("failed to cdr the mpi for `%s'\n", parm); continue; } + gcry_mpi_release (a); } + else + gcry_sexp_release (s1); } gcry_sexp_release (sexp); -- From dbaryshkov at gmail.com Wed Aug 7 08:28:31 2013 From: dbaryshkov at gmail.com (Dmitry Eremin-Solenikov) Date: Wed, 7 Aug 2013 10:28:31 +0400 Subject: [RFC 2/2] FIXME: initial implementation of 
GCM In-Reply-To: <4517473.et8hzpF21q@tauon> References: <1375427655-5894-1-git-send-email-dbaryshkov@gmail.com> <1410100.jm8NpcopG3@tauon> <4517473.et8hzpF21q@tauon> Message-ID: Him On Wed, Aug 7, 2013 at 3:04 AM, Stephan Mueller wrote: > Am Montag, 5. August 2013, 20:05:31 schrieb Stephan Mueller: >>Am Montag, 5. August 2013, 21:41:30 schrieb Dmitry Eremin-Solenikov: > Another catch with the IV: SP800-38D requires that one key shall only be > used with at most 2**32 different IVs. If you hit that threshold, you > must not continue with the crypto operation, but inform the caller to > set up a new key. Do you have such a check? No, good catch. -- With best wishes Dmitry From wk at gnupg.org Wed Aug 7 08:45:26 2013 From: wk at gnupg.org (Werner Koch) Date: Wed, 07 Aug 2013 08:45:26 +0200 Subject: tests: fix memory leaks In-Reply-To: <1375833926.3237.0.camel@cfw2.gniibe.org> (NIIBE Yutaka's message of "Wed, 07 Aug 2013 09:05:26 +0900") References: <1375761798.3199.4.camel@cfw2.gniibe.org> <1375833926.3237.0.camel@cfw2.gniibe.org> Message-ID: <87ob9ahud5.fsf@vigenere.g10code.de> On Wed, 7 Aug 2013 02:05, gniibe at fsij.org said: > Here is another fix for test code. Another reason of these fixes is > that when we will find some memory leaks in future running test code, > we will be able to distinguish them as bugs. > > OK to commit? Sure. Salam-Shalom, Werner -- Die Gedanken sind frei. Ausnahmen regelt ein Bundesgesetz. From jussi.kivilinna at iki.fi Wed Aug 7 09:59:44 2013 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Wed, 07 Aug 2013 10:59:44 +0300 Subject: [RFC PATCH] Prepare random/win32.c fast poll for 64-bit Windows Message-ID: <20130807075944.5233.46101.stgit@localhost6.localdomain6> * random/win32.c (_gcry_rndw32_gather_random_fast) [ADD]: Rename to ADDINT. (_gcry_rndw32_gather_random_fast): Add ADDPTR. (_gcry_rndw32_gather_random_fast): Disable entropy gathering from GetQueueStatus(QS_ALLEVENTS). (_gcry_rndw32_gather_random_fast): Change minimumWorkingSetSize and maximumWorkingSetSize to SIZE_T from DWORD. (_gcry_rndw32_gather_random_fast): Only add lower 32-bits of minimumWorkingSetSize and maximumWorkingSetSize to random poll. (_gcry_rndw32_gather_random_fast) [__WIN64__]: Read TSC directly using intrinsic. -- Introduce entropy gatherer changes related to 64-bit Windows platform as done in cryptlib fast poll: - Change ADD macro to ADDPTR/ADDINT to handle pointer values. ADDPTR discards high 32-bits of 64-bit pointer values. - minimum/maximumWorkingSetSize changed to SIZE_T type to avoid stack corruption on 64-bit; only low 32-bits are used for entropy. - Use __rdtsc() intrinsic on 64-bit (as TSC is always available). Signed-off-by: Jussi Kivilinna -- Would this be enough for 64-bit Windows support? Slow poll differences cryptlib vs libgcrypt appear to be limited to sensor data reading from third party programs and reading of PnP data. --- random/rndw32.c | 83 ++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 52 insertions(+), 31 deletions(-) diff --git a/random/rndw32.c b/random/rndw32.c index 5c5d6c6..7e78b50 100644 --- a/random/rndw32.c +++ b/random/rndw32.c @@ -826,39 +826,47 @@ _gcry_rndw32_gather_random_fast (void (*add)(const void*, size_t, cursor position for last message, 1 ms time for last message, handle of window with clipboard open, handle of process heap, handle of procs window station, types of events in input queue, - and milliseconds since Windows was started. */ + and milliseconds since Windows was started. 
On 64-bit platform + some of these return values are pointers and thus 64-bit wide. + We discard the upper 32-bit of those values. */ { byte buffer[20*sizeof(ulong)], *bufptr; bufptr = buffer; -#define ADD(f) do { ulong along = (ulong)(f); \ - memcpy (bufptr, &along, sizeof (along) ); \ - bufptr += sizeof (along); \ - } while (0) - - ADD ( GetActiveWindow ()); - ADD ( GetCapture ()); - ADD ( GetClipboardOwner ()); - ADD ( GetClipboardViewer ()); - ADD ( GetCurrentProcess ()); - ADD ( GetCurrentProcessId ()); - ADD ( GetCurrentThread ()); - ADD ( GetCurrentThreadId ()); - ADD ( GetDesktopWindow ()); - ADD ( GetFocus ()); - ADD ( GetInputState ()); - ADD ( GetMessagePos ()); - ADD ( GetMessageTime ()); - ADD ( GetOpenClipboardWindow ()); - ADD ( GetProcessHeap ()); - ADD ( GetProcessWindowStation ()); - ADD ( GetQueueStatus (QS_ALLEVENTS)); - ADD ( GetTickCount ()); +#define ADDINT(f) do { ulong along = (ulong)(f); \ + memcpy (bufptr, &along, sizeof (along) ); \ + bufptr += sizeof (along); \ + } while (0) +#define ADDPTR(f) do { void *aptr = (f); \ + ADDINT((SIZE_T)aptr); \ + } while (0) + + ADDPTR ( GetActiveWindow ()); + ADDPTR ( GetCapture ()); + ADDPTR ( GetClipboardOwner ()); + ADDPTR ( GetClipboardViewer ()); + ADDPTR ( GetCurrentProcess ()); + ADDINT ( GetCurrentProcessId ()); + ADDPTR ( GetCurrentThread ()); + ADDINT ( GetCurrentThreadId ()); + ADDPTR ( GetDesktopWindow ()); + ADDPTR ( GetFocus ()); + ADDINT ( GetInputState ()); + ADDINT ( GetMessagePos ()); + ADDINT ( GetMessageTime ()); + ADDPTR ( GetOpenClipboardWindow ()); + ADDPTR ( GetProcessHeap ()); + ADDPTR ( GetProcessWindowStation ()); + /* Following function in some cases stops returning events, and cannot + be used as an entropy source. */ + /*ADDINT ( GetQueueStatus (QS_ALLEVENTS));*/ + ADDINT ( GetTickCount ()); gcry_assert ( bufptr-buffer < sizeof (buffer) ); (*add) ( buffer, bufptr-buffer, origin ); -#undef ADD +#undef ADDINT +#undef ADDPTR } /* Get multiword system information: Current caret position, current @@ -888,7 +896,7 @@ _gcry_rndw32_gather_random_fast (void (*add)(const void*, size_t, { HANDLE handle; FILETIME creationTime, exitTime, kernelTime, userTime; - DWORD minimumWorkingSetSize, maximumWorkingSetSize; + SIZE_T minimumWorkingSetSize, maximumWorkingSetSize; handle = GetCurrentThread (); GetThreadTimes (handle, &creationTime, &exitTime, @@ -910,10 +918,9 @@ _gcry_rndw32_gather_random_fast (void (*add)(const void*, size_t, process. */ GetProcessWorkingSetSize (handle, &minimumWorkingSetSize, &maximumWorkingSetSize); - (*add) ( &minimumWorkingSetSize, - sizeof (minimumWorkingSetSize), origin ); - (*add) ( &maximumWorkingSetSize, - sizeof (maximumWorkingSetSize), origin ); + /* On 64-bit system, discard the high 32-bits. */ + (*add) ( &minimumWorkingSetSize, sizeof (int), origin ); + (*add) ( &maximumWorkingSetSize, sizeof (int), origin ); } @@ -961,7 +968,20 @@ _gcry_rndw32_gather_random_fast (void (*add)(const void*, size_t, To make things unambiguous, we detect a CPU new enough to call RDTSC directly by checking for CPUID capabilities, and fall back to QPC if - this isn't present. */ + this isn't present. + + On AMD64, TSC is always available and intrinsic is provided for accessing + it. */ +#ifdef __WIN64__ + { + unsigned __int64 aint64; + + /* Note: cryptlib does not discard upper 32 bits of TSC on WIN64, but does + * on WIN32. Is this correct? 
*/ + aint64 = __rdtsc(); + (*add) (&aint64, sizeof(aint64), origin); + } +#else #ifdef __GNUC__ /* FIXME: We would need to implement the CPU feature tests first. */ /* if (cpu_has_feature_rdtsc) */ @@ -990,6 +1010,7 @@ _gcry_rndw32_gather_random_fast (void (*add)(const void*, size_t, (*add) (&aword, sizeof (aword), origin ); } } +#endif /*__WIN64__*/ } From gniibe at fsij.org Wed Aug 7 11:05:33 2013 From: gniibe at fsij.org (NIIBE Yutaka) Date: Wed, 07 Aug 2013 18:05:33 +0900 Subject: tests: fix memory leaks In-Reply-To: <87ob9ahud5.fsf@vigenere.g10code.de> References: <1375761798.3199.4.camel@cfw2.gniibe.org> <1375833926.3237.0.camel@cfw2.gniibe.org> <87ob9ahud5.fsf@vigenere.g10code.de> Message-ID: <1375866333.14725.1.camel@cfw2.gniibe.org> On 2013-08-07 at 08:45 +0200, Werner Koch wrote: > On Wed, 7 Aug 2013 02:05, gniibe at fsij.org said: > > > Here is another fix for test code. Another reason of these fixes is > > that when we will find some memory leaks in future running test code, > > we will be able to distinguish them as bugs. > > > > OK to commit? > > Sure. Done. -- From gniibe at fsij.org Thu Aug 8 00:57:59 2013 From: gniibe at fsij.org (NIIBE Yutaka) Date: Thu, 08 Aug 2013 07:57:59 +0900 Subject: cipher: fix memory leaks In-Reply-To: <1375778148.3199.7.camel@cfw2.gniibe.org> References: <1375756092.3199.2.camel@cfw2.gniibe.org> <87zjsvl0p3.fsf@vigenere.g10code.de> <1375778148.3199.7.camel@cfw2.gniibe.org> Message-ID: <1375916279.3172.2.camel@cfw2.gniibe.org> On 2013-08-06 at 17:35 +0900, NIIBE Yutaka wrote: > For ECC ("pabgnqd", with no Q, but D), I try following patch. It > works, but doesn't look so good. This is too large, as it includes API changes: pubkey.c | 124 ++++++++++++++++++++++++++++++++++++--------------------------- 1 file changed, 71 insertions(+), 53 deletions(-) Instead, following is a makeshift solution, but small and local, and enough to fix the specific problem. I'll commit this change. I think that this is the last one of my memory leak series. diff --git a/cipher/pubkey.c b/cipher/pubkey.c index e867169..f9c2193 100644 --- a/cipher/pubkey.c +++ b/cipher/pubkey.c @@ -3339,7 +3339,15 @@ gcry_pk_sign (gcry_sexp_t *r_sig, gcry_sexp_t s_hash, gcry_sexp_t s_skey) leave: if (skey) { - release_mpi_array (skey); + if (is_ecc) + for (i = 0; i < 7; i++) + { + if (skey[i]) + mpi_free (skey[i]); + skey[i] = NULL; + } + else + release_mpi_array (skey); gcry_free (skey); } -- From gniibe at fsij.org Thu Aug 8 07:20:00 2013 From: gniibe at fsij.org (NIIBE Yutaka) Date: Thu, 08 Aug 2013 14:20:00 +0900 Subject: Comments on the change: Mitigate a flush+reload cache attack on RSA secret exponents Message-ID: <1375939200.3172.6.camel@cfw2.gniibe.org> For the commit of 287bf0e543f244d784cf8b58340bf0ab3c6aba97, I add my git notes below. I realized that git notes are basically local stuff, and we need some practice to share notes. If this is useful, I'd like to push my notes using the namespace 'gniibe', that is, refs/notes/gniibe (to avoid conflict). Or should I use refs/notes/commits? -------------------------------------- Here are results in my notebook PC (Pentium M 1.10GHz). We have performance regression. But possible change of SQR->MUL will be comparable to original. Original: Call SQR and then, call MUL only when E's bit is 1. 
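Schematically, the variants compared below differ as follows. This is
an illustrative toy with word-sized integers (products must fit the
type, and the helper names are hypothetical); the real code in
mpi-pow.c works on multi-precision integers:

/* Original: MUL is called only for 1-bits of the exponent, so a
   flush+reload probe on the multiply code learns the bit pattern.  */
static unsigned long
powm_orig (unsigned long b, unsigned long e, unsigned long n)
{
  unsigned long res = 1;
  int i;

  for (i = 63; i >= 0; i--)
    {
      res = (res * res) % n;            /* SQR on every iteration  */
      if ((e >> i) & 1)
        res = (res * b) % n;            /* MUL only for 1-bits     */
    }
  return res;
}

/* Current fix: MUL is called on every iteration so the SQR/MUL call
   sequence is the same for all exponents; the product is kept only
   when the bit is set.  */
static unsigned long
powm_fixed (unsigned long b, unsigned long e, unsigned long n)
{
  unsigned long res = 1, tmp;
  int i;

  for (i = 63; i >= 0; i--)
    {
      res = (res * res) % n;
      tmp = (res * b) % n;              /* MUL on every iteration  */
      if ((e >> i) & 1)
        res = tmp;
    }
  return res;
}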
====================== original =====================
$ ./tests/benchmark rsa
Algorithm       generate  100*sign  100*verify
------------------------------------------------
RSA 1024 bit       340ms     860ms        30ms
RSA 2048 bit       870ms    5510ms       110ms
RSA 3072 bit      6440ms   16930ms       210ms
RSA 4096 bit     17470ms   37270ms       360ms

Current fix: Call MUL always, regardless of E's bit.

====================== Always MUL ===================
$ ./tests/benchmark rsa
Algorithm       generate  100*sign  100*verify
------------------------------------------------
RSA 1024 bit       210ms    1180ms        30ms
RSA 2048 bit      2040ms    7450ms       110ms
RSA 3072 bit     21720ms   21960ms       210ms
RSA 4096 bit     25290ms   49680ms       360ms

Possible change to recover the performance regression: for the first
SQR, use MUL instead. Then call MUL only when E's bit is 1.

====================== SQR->MUL =====================
$ ./tests/benchmark rsa
Algorithm       generate  100*sign  100*verify
------------------------------------------------
RSA 1024 bit       100ms     870ms        30ms
RSA 2048 bit       860ms    5570ms       100ms
RSA 3072 bit     12430ms   16600ms       210ms
RSA 4096 bit     32000ms   37470ms       360ms
--

From wk at gnupg.org  Thu Aug  8 09:36:56 2013
From: wk at gnupg.org (Werner Koch)
Date: Thu, 08 Aug 2013 09:36:56 +0200
Subject: Comments on the change: Mitigate a flush+reload cache attack on
	RSA secret exponents
In-Reply-To: <1375939200.3172.6.camel@cfw2.gniibe.org> (NIIBE Yutaka's
	message of "Thu, 08 Aug 2013 14:20:00 +0900")
References: <1375939200.3172.6.camel@cfw2.gniibe.org>
Message-ID: <871u64iqg7.fsf@vigenere.g10code.de>

On Thu,  8 Aug 2013 07:20, gniibe at fsij.org said:

> If this is useful, I'd like to push my notes using the namespace
> 'gniibe', that is, refs/notes/gniibe (to avoid conflict). Or should I

Does "git log" show them if you use this namespace?  If so, using the
playfair account name seems to be a good idea.

Salam-Shalom,

   Werner

-- 
Die Gedanken sind frei.  Ausnahmen regelt ein Bundesgesetz.

From gniibe at fsij.org  Thu Aug  8 10:05:09 2013
From: gniibe at fsij.org (NIIBE Yutaka)
Date: Thu, 08 Aug 2013 17:05:09 +0900
Subject: Comments on the change: Mitigate a flush+reload cache attack on
	RSA secret exponents
In-Reply-To: <871u64iqg7.fsf@vigenere.g10code.de>
References: <1375939200.3172.6.camel@cfw2.gniibe.org>
	<871u64iqg7.fsf@vigenere.g10code.de>
Message-ID: <1375949109.3172.12.camel@cfw2.gniibe.org>

On 2013-08-08 at 09:36 +0200, Werner Koch wrote:
> Does "git log" show them if you use this namespace?  If so, using the
> playfair account name seems to be a good idea.

That's the issue.  Normal "git log" doesn't show them.  We need to
explicitly specify the namespace by --show-notes (--show-notes=* for
all namespaces).

Users could do that, but I don't know if gitweb can be configured to
do that.

If we don't use a namespace (but use the normal "refs/notes/commits"),
we will have conflicts and merge problems.
-- 

From gniibe at fsij.org  Fri Aug  9 01:37:02 2013
From: gniibe at fsij.org (NIIBE Yutaka)
Date: Fri, 09 Aug 2013 08:37:02 +0900
Subject: cipher: fix memory leaks
In-Reply-To: <1375916279.3172.2.camel@cfw2.gniibe.org>
References: <1375756092.3199.2.camel@cfw2.gniibe.org>
	<87zjsvl0p3.fsf@vigenere.g10code.de>
	<1375778148.3199.7.camel@cfw2.gniibe.org>
	<1375916279.3172.2.camel@cfw2.gniibe.org>
Message-ID: <1376005022.3177.1.camel@cfw2.gniibe.org>

On 2013-08-08 at 07:57 +0900, NIIBE Yutaka wrote:
> I'll commit this change.  I think that this is the last one of
> my memory leak series.

Committed and pushed.
--

From gniibe at fsij.org  Fri Aug  9 02:30:30 2013
From: gniibe at fsij.org (NIIBE Yutaka)
Date: Fri, 09 Aug 2013 09:30:30 +0900
Subject: Comments on the change: Mitigate a flush+reload cache attack on
	RSA secret exponents
In-Reply-To: <1375949109.3172.12.camel@cfw2.gniibe.org>
References: <1375939200.3172.6.camel@cfw2.gniibe.org>
	<871u64iqg7.fsf@vigenere.g10code.de>
	<1375949109.3172.12.camel@cfw2.gniibe.org>
Message-ID: <1376008230.3177.2.camel@cfw2.gniibe.org>

On 2013-08-08 at 17:05 +0900, NIIBE Yutaka wrote:
> Users could do that, but I don't know if gitweb can be configured to
> do that.

I realized that gitweb doesn't have support for showing notes, while
cgit seems to have it.

Given the situation, my opinion is that it's not a good idea, for now,
to share useful information with git notes (for libgcrypt, gnupg,
etc.).

	*	*	*

If we still want to share git notes, we need some configuration at
playfair.gnupg.org [0] and have to ask everyone to follow the notes [1].

[0] About the configuration at playfair.gnupg.org

Currently, there is an issue that stops us from sharing refs/notes/*.
Here's the error log:
--------------------
remote: *** Update hook: unknown type of update to ref refs/notes/commits of type commit
remote: error: hook declined to update refs/notes/commits
To ssh://playfair.gnupg.org/git/libgcrypt.git
 ! [remote rejected] refs/notes/commits -> refs/notes/commits (hook declined)
error: failed to push some refs to 'ssh://playfair.gnupg.org/git/libgcrypt.git'
--------------------

[1] Configuration for each person

I will use something like this to follow remote git notes:
---------------
[remote "origin"]
	fetch = +refs/heads/*:refs/remotes/origin/*
	fetch = +refs/notes/*:refs/notes/*     <---- this
	url = ssh://playfair.gnupg.org/git/libgcrypt.git
---------------
--

From simon at josefsson.org  Sun Aug 11 23:45:29 2013
From: simon at josefsson.org (Simon Josefsson)
Date: Sun, 11 Aug 2013 23:45:29 +0200
Subject: [PATCH] Add support for Salsa20/12 - 12 round version of Salsa20
In-Reply-To: <87bo5lxiqu.fsf@vigenere.g10code.de> (Werner Koch's message of
	"Mon, 29 Jul 2013 21:34:49 +0200")
References: <1374418402-25336-1-git-send-email-dbaryshkov@gmail.com>
	<8738r25z5f.fsf@vigenere.g10code.de> <87fvv1w2x1.fsf@latte.josefsson.org>
	<87bo5lxiqu.fsf@vigenere.g10code.de>
Message-ID: <87a9kn52bq.fsf@latte.josefsson.org>

Werner Koch writes:

> On Fri, 26 Jul 2013 21:12, simon at josefsson.org said:
>
>> eSTREAM picked 12-rounds Salsa, and not the 20-round version, so it
>
> I see.
>
>> I would recommend against implementing 12-rounds without also
>> implementing 20-rounds -- DJB specified 20-rounds and I would personally
>
> Well, we implemented 20 rounds and not yet 12 rounds.

Ah.

> In how far is eSTREAM relevant; why do we need to care about it?

People have a tendency to defer to authorities, so I guess there will be
a set of projects that look for a non-cracked stream cipher, end up
choosing Salsa20, and then go with the variant of Salsa20 that eSTREAM
recommends because they don't know better.

> Is there any project already using Salsa20r12 or is there still time to
> ignore this variant?  In other words, would you mind to change your I-D
> to "Standard" 20 rounds Salsa?

Yes, this is still an open question and under discussion.  The only
thing I am strongly against is standardizing only on 12 rounds, so the
remaining options would then be 1) both 12 and 20, or 2) only 20 rounds.
I'm not sure how strong the 12-rounds crowd is; whether it is big enough
to warrant including both, or whether we can swing them over to 20
rounds.
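For reference, the two variants share the entire design and differ only
in the number of rounds, so a single rounds-parameterized core covers
both. A sketch along the lines of DJB's public reference code, not of
libgcrypt's salsa20.c:

#include <stdint.h>
#include <string.h>

#define R(v,n) (((v) << (n)) | ((v) >> (32 - (n))))

static void
salsa20_core (uint32_t out[16], const uint32_t in[16], int rounds)
{
  uint32_t x[16];
  int i;

  memcpy (x, in, sizeof x);
  for (i = 0; i < rounds; i += 2)
    {
      /* Column round.  */
      x[ 4] ^= R(x[ 0]+x[12], 7);  x[ 8] ^= R(x[ 4]+x[ 0], 9);
      x[12] ^= R(x[ 8]+x[ 4],13);  x[ 0] ^= R(x[12]+x[ 8],18);
      x[ 9] ^= R(x[ 5]+x[ 1], 7);  x[13] ^= R(x[ 9]+x[ 5], 9);
      x[ 1] ^= R(x[13]+x[ 9],13);  x[ 5] ^= R(x[ 1]+x[13],18);
      x[14] ^= R(x[10]+x[ 6], 7);  x[ 2] ^= R(x[14]+x[10], 9);
      x[ 6] ^= R(x[ 2]+x[14],13);  x[10] ^= R(x[ 6]+x[ 2],18);
      x[ 3] ^= R(x[15]+x[11], 7);  x[ 7] ^= R(x[ 3]+x[15], 9);
      x[11] ^= R(x[ 7]+x[ 3],13);  x[15] ^= R(x[11]+x[ 7],18);
      /* Row round.  */
      x[ 1] ^= R(x[ 0]+x[ 3], 7);  x[ 2] ^= R(x[ 1]+x[ 0], 9);
      x[ 3] ^= R(x[ 2]+x[ 1],13);  x[ 0] ^= R(x[ 3]+x[ 2],18);
      x[ 6] ^= R(x[ 5]+x[ 4], 7);  x[ 7] ^= R(x[ 6]+x[ 5], 9);
      x[ 4] ^= R(x[ 7]+x[ 6],13);  x[ 5] ^= R(x[ 4]+x[ 7],18);
      x[11] ^= R(x[10]+x[ 9], 7);  x[ 8] ^= R(x[11]+x[10], 9);
      x[ 9] ^= R(x[ 8]+x[11],13);  x[10] ^= R(x[ 9]+x[ 8],18);
      x[12] ^= R(x[15]+x[14], 7);  x[13] ^= R(x[12]+x[15], 9);
      x[14] ^= R(x[13]+x[12],13);  x[15] ^= R(x[14]+x[13],18);
    }
  /* Feed-forward.  Salsa20/20 passes rounds = 20, Salsa20/12 passes
     rounds = 12; nothing else changes.  */
  for (i = 0; i < 16; i++)
    out[i] = x[i] + in[i];
}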
> It is not that I am against adding this variant, but I try to keep the > number of implemented algorithms low. We already had to add a couple of > algorithms simply for political reasons. I would appreciate if we could > avoid that (and thus make IanG happy). Yes, I fully sympathise with this concern. /Simon From n3npq at me.com Mon Aug 12 00:03:50 2013 From: n3npq at me.com (Jeffrey Johnson) Date: Sun, 11 Aug 2013 18:03:50 -0400 Subject: [PATCH] Add support for Salsa20/12 - 12 round version of Salsa20 In-Reply-To: <87a9kn52bq.fsf@latte.josefsson.org> References: <1374418402-25336-1-git-send-email-dbaryshkov@gmail.com> <8738r25z5f.fsf@vigenere.g10code.de> <87fvv1w2x1.fsf@latte.josefsson.org> <87bo5lxiqu.fsf@vigenere.g10code.de> <87a9kn52bq.fsf@latte.josefsson.org> Message-ID: The design/naming issue is that newer hashes refer to a family with parameter(s). Retrofitting a given parameter choice into existing implementations is necessary because there is no easy/obvious means to associate the parameter with signature/hash metadata except by assigning a name to the algorithm+parameter choices. 73 de Jeff Sent from my iPhone On Aug 11, 2013, at 5:45 PM, Simon Josefsson wrote: > Werner Koch writes: > >> On Fri, 26 Jul 2013 21:12, simon at josefsson.org said: >> >>> eSTREAM picked 12-rounds Salsa, and not the 20-round version, so it >> >> I see. >> >>> I would recommend against implementing 12-rounds without also >>> implementing 20-rounds -- DJB specified 20-rounds and I would personally >> >> Well, we implemented 20 rounds and not yet 12 rounds. > > Ah. > >> In how far is eSTREAM relevant; why do we need to care about it? > > People has a tendency to defer to authorities, so I guess there will be > a set of projects that will look for a non-cracked stream cipher and > ends up chosing Salsa20, and then goes with the variant of Salsa20 that > eSTREAM recommends because they don't know better. > >> Is there any project already using Salsa20r12 or is there still time to >> ignore this variant? In other words, would you mind to change your I-D >> to ?Standard? 20 rounds Salsa? > > Yes, this is still an open question and under discussion. The only > thing I am strongly against is to only standardize on 12-rounds, so the > remaining options would then be 1) both 12 and 20 or, 2) only 20 rounds. > I'm not sure how strong the 12-rounds crowd is; if it is big enough to > warrant including both or if we can swing them over to 20 rounds. > >> It is not that I am against adding this variant, but I try to keep the >> number of implemented algorithms low. We already had to add a couple of >> algorithms simply for political reasons. I would appreciate if we could >> avoid that (and thus make IanG happy). > > Yes, I fully sympathise with this concern. > > /Simon > > _______________________________________________ > Gcrypt-devel mailing list > Gcrypt-devel at gnupg.org > http://lists.gnupg.org/mailman/listinfo/gcrypt-devel From jussi.kivilinna at iki.fi Thu Aug 15 13:44:34 2013 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Thu, 15 Aug 2013 14:44:34 +0300 Subject: [PATCH 0/2] Add ARMv6 assembly implementations of AES and CAST5 Message-ID: <20130815114434.27965.44657.stgit@localhost6.localdomain6> These two patches add ARMv6 implementations of AES and CAST5, tuned for Cortex-A8. 
--- Jussi Kivilinna (2): rinjdael: add ARMv6 assembly implementation cast5: add ARMv6 assembly implementation cipher/Makefile.am | 4 cipher/cast5-armv6.S | 708 +++++++++++++++++++++++++++++++++++++++ cipher/cast5.c | 137 +++++++ cipher/rijndael-armv6.S | 860 +++++++++++++++++++++++++++++++++++++++++++++++ cipher/rijndael.c | 48 ++- configure.ac | 36 ++ 6 files changed, 1781 insertions(+), 12 deletions(-) create mode 100644 cipher/cast5-armv6.S create mode 100644 cipher/rijndael-armv6.S From jussi.kivilinna at iki.fi Thu Aug 15 13:44:44 2013 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Thu, 15 Aug 2013 14:44:44 +0300 Subject: [PATCH 2/2] cast5: add ARMv6 assembly implementation In-Reply-To: <20130815114434.27965.44657.stgit@localhost6.localdomain6> References: <20130815114434.27965.44657.stgit@localhost6.localdomain6> Message-ID: <20130815114444.27965.28729.stgit@localhost6.localdomain6> * cipher/Makefile.am: Add 'cast5-armv6.S'. * cipher/cast5-armv6.S: New file. * cipher/cast5.c (USE_ARMV6_ASM): New macro. (CAST5_context) [USE_ARMV6_ASM]: New members 'Kr_arm_enc' and 'Kr_arm_dec'. [USE_ARMV6_ASM] (_gcry_cast5_armv6_encrypt_block) (_gcry_cast5_armv6_decrypt_block, _gcry_cast5_armv6_ctr_enc) (_gcry_cast5_armv6_cbc_dec, _gcry_cast5_armv6_cfb_dec): New prototypes. [USE_ARMV6_ASM] (do_encrypt_block, do_decrypt_block, encrypt_block) (decrypt_block): New functions. (_gcry_cast5_ctr_enc) [USE_ARMV6_ASM]: Use ARMv6 assembly function. (_gcry_cast5_cbc_dec) [USE_ARMV6_ASM]: Use ARMv6 assembly function. (_gcry_cast5_cfb_dec) [USE_ARMV6_ASM]: Use ARMv6 assembly function. (do_cast_setkey) [USE_ARMV6_ASM]: Initialize 'Kr_arm_enc' and 'Kr_arm_dec'. * configure.ac (cast5) [arm]: Add 'cast5-armv6.lo'. -- Provides non-parallel implementations for small speed-up and 2-way parallel implementations that gets accelerated on multi-issue CPUs (hand-tuned for in-order dual-issue Cortex-A8). Unaligned access handling is done in assembly. For now, only enable this on little-endian systems as big-endian correctness have not been tested yet. Old vs new (Cortex-A8, Debian Wheezy/armhf): ECB/Stream CBC CFB OFB CTR --------------- --------------- --------------- --------------- --------------- CAST5 1.15x 1.12x 1.12x 2.07x 1.14x 1.60x 1.12x 1.13x 1.62x 1.63x Signed-off-by: Jussi Kivilinna --- cipher/Makefile.am | 2 cipher/cast5-armv6.S | 708 ++++++++++++++++++++++++++++++++++++++++++++++++++ cipher/cast5.c | 137 ++++++++++ configure.ac | 4 4 files changed, 848 insertions(+), 3 deletions(-) create mode 100644 cipher/cast5-armv6.S diff --git a/cipher/Makefile.am b/cipher/Makefile.am index 931675c..11bfda5 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -55,7 +55,7 @@ rmd.h EXTRA_libcipher_la_SOURCES = \ arcfour.c \ blowfish.c blowfish-amd64.S \ -cast5.c cast5-amd64.S \ +cast5.c cast5-amd64.S cast5-armv6.S \ crc.c \ des.c \ dsa.c \ diff --git a/cipher/cast5-armv6.S b/cipher/cast5-armv6.S new file mode 100644 index 0000000..df3c5fc --- /dev/null +++ b/cipher/cast5-armv6.S @@ -0,0 +1,708 @@ +/* cast5-armv6.S - ARM assembly implementation of CAST5 cipher + * + * Copyright ? 2013 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. 
+ * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . + */ + +#include + +#if defined(__arm__) && defined(__ARMEL__) && \ + ((defined(__ARM_ARCH) && __ARM_ARCH >= 6) \ + || defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) \ + || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) \ + || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6T2__) \ + || defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) \ + || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) \ + || defined(__ARM_ARCH_7EM__)) +#ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS + +.text + +.syntax unified +.arm + +.extern _gcry_cast5_s1to4; + +/* structure of crypto context */ +#define Km 0 +#define Kr (Km + (16 * 4)) +#define Kr_arm_enc (Kr + (16)) +#define Kr_arm_dec (Kr_arm_enc + (16)) + +/* register macros */ +#define CTX %r0 +#define Rs1 %r7 +#define Rs2 %r8 +#define Rs3 %r9 +#define Rs4 %r10 +#define RMASK %r11 +#define RKM %r1 +#define RKR %r2 + +#define RL0 %r3 +#define RR0 %r4 + +#define RL1 %r9 +#define RR1 %r10 + +#define RT0 %lr +#define RT1 %ip +#define RT2 %r5 +#define RT3 %r6 + +/* helper macros */ +#define ldr_unaligned_le(rout, rsrc, offs, rtmp) \ + ldrb rout, [rsrc, #((offs) + 0)]; \ + ldrb rtmp, [rsrc, #((offs) + 1)]; \ + orr rout, rout, rtmp, lsl #8; \ + ldrb rtmp, [rsrc, #((offs) + 2)]; \ + orr rout, rout, rtmp, lsl #16; \ + ldrb rtmp, [rsrc, #((offs) + 3)]; \ + orr rout, rout, rtmp, lsl #24; + +#define str_unaligned_le(rin, rdst, offs, rtmp0, rtmp1) \ + mov rtmp0, rin, lsr #8; \ + strb rin, [rdst, #((offs) + 0)]; \ + mov rtmp1, rin, lsr #16; \ + strb rtmp0, [rdst, #((offs) + 1)]; \ + mov rtmp0, rin, lsr #24; \ + strb rtmp1, [rdst, #((offs) + 2)]; \ + strb rtmp0, [rdst, #((offs) + 3)]; + +#define ldr_unaligned_be(rout, rsrc, offs, rtmp) \ + ldrb rout, [rsrc, #((offs) + 3)]; \ + ldrb rtmp, [rsrc, #((offs) + 2)]; \ + orr rout, rout, rtmp, lsl #8; \ + ldrb rtmp, [rsrc, #((offs) + 1)]; \ + orr rout, rout, rtmp, lsl #16; \ + ldrb rtmp, [rsrc, #((offs) + 0)]; \ + orr rout, rout, rtmp, lsl #24; + +#define str_unaligned_be(rin, rdst, offs, rtmp0, rtmp1) \ + mov rtmp0, rin, lsr #8; \ + strb rin, [rdst, #((offs) + 3)]; \ + mov rtmp1, rin, lsr #16; \ + strb rtmp0, [rdst, #((offs) + 2)]; \ + mov rtmp0, rin, lsr #24; \ + strb rtmp1, [rdst, #((offs) + 1)]; \ + strb rtmp0, [rdst, #((offs) + 0)]; + +#ifdef __ARMEL__ + #define ldr_unaligned_host ldr_unaligned_le + #define str_unaligned_host str_unaligned_le + + /* bswap on little-endian */ + #define host_to_be(reg) \ + rev reg, reg; + #define be_to_host(reg) \ + rev reg, reg; +#else + #define ldr_unaligned_host ldr_unaligned_be + #define str_unaligned_host str_unaligned_be + + /* nop on big-endian */ + #define host_to_be(reg) /*_*/ + #define be_to_host(reg) /*_*/ +#endif + +#define host_to_host(x) /*_*/ + +/********************************************************************** + 1-way cast5 + **********************************************************************/ + +#define dummy(n) /*_*/ + +#define load_kr(n) \ + ldr RKR, [CTX, #(Kr_arm_enc + (n))]; /* Kr[n] */ + +#define load_dec_kr(n) \ + ldr RKR, [CTX, #(Kr_arm_dec + (n) - 3)]; /* Kr[n] */ + +#define load_km(n) \ + ldr RKM, [CTX, #(Km + (n) * 4)]; /* Km[n] */ + +#define shift_kr(dummy) \ + mov RKR, 
RKR, lsr #8; + +#define F(n, rl, rr, op1, op2, op3, op4, dec, loadkm, shiftkr, loadkr) \ + op1 RKM, rr; \ + mov RKM, RKM, ror RKR; \ + \ + and RT0, RMASK, RKM, ror #(24); \ + and RT1, RMASK, RKM, lsr #(16); \ + and RT2, RMASK, RKM, lsr #(8); \ + ldr RT0, [Rs1, RT0]; \ + and RT3, RMASK, RKM; \ + ldr RT1, [Rs2, RT1]; \ + shiftkr(RKR); \ + \ + ldr RT2, [Rs3, RT2]; \ + \ + op2 RT0, RT1; \ + ldr RT3, [Rs4, RT3]; \ + op3 RT0, RT2; \ + loadkm((n) + (1 - ((dec) * 2))); \ + op4 RT0, RT3; \ + loadkr((n) + (1 - ((dec) * 2))); \ + eor rl, RT0; + +#define F1(n, rl, rr, dec, loadkm, shiftkr, loadkr) \ + F(n, rl, rr, add, eor, sub, add, dec, loadkm, shiftkr, loadkr) +#define F2(n, rl, rr, dec, loadkm, shiftkr, loadkr) \ + F(n, rl, rr, eor, sub, add, eor, dec, loadkm, shiftkr, loadkr) +#define F3(n, rl, rr, dec, loadkm, shiftkr, loadkr) \ + F(n, rl, rr, sub, add, eor, sub, dec, loadkm, shiftkr, loadkr) + +#define enc_round(n, Fx, rl, rr, loadkm, shiftkr, loadkr) \ + Fx(n, rl, rr, 0, loadkm, shiftkr, loadkr) + +#define dec_round(n, Fx, rl, rr, loadkm, shiftkr, loadkr) \ + Fx(n, rl, rr, 1, loadkm, shiftkr, loadkr) + +#define read_block_aligned(rin, offs, l0, r0, convert) \ + ldr l0, [rin, #((offs) + 0)]; \ + ldr r0, [rin, #((offs) + 4)]; \ + convert(l0); \ + convert(r0); + +#define write_block_aligned(rout, offs, l0, r0, convert) \ + convert(l0); \ + convert(r0); \ + str l0, [rout, #((offs) + 0)]; \ + str r0, [rout, #((offs) + 4)]; + +#ifdef __ARM_FEATURE_UNALIGNED + /* unaligned word reads allowed */ + #define read_block(rin, offs, l0, r0, rtmp0) \ + read_block_aligned(rin, offs, l0, r0, host_to_be) + + #define write_block(rout, offs, r0, l0, rtmp0, rtmp1) \ + write_block_aligned(rout, offs, r0, l0, be_to_host) + + #define read_block_host(rin, offs, l0, r0, rtmp0) \ + read_block_aligned(rin, offs, l0, r0, host_to_host) + + #define write_block_host(rout, offs, r0, l0, rtmp0, rtmp1) \ + write_block_aligned(rout, offs, r0, l0, host_to_host) +#else + /* need to handle unaligned reads by byte reads */ + #define read_block(rin, offs, l0, r0, rtmp0) \ + tst rin, #3; \ + beq 1f; \ + ldr_unaligned_be(l0, rin, (offs) + 0, rtmp0); \ + ldr_unaligned_be(r0, rin, (offs) + 4, rtmp0); \ + b 2f; \ + 1:;\ + read_block_aligned(rin, offs, l0, r0, host_to_be); \ + 2:; + + #define write_block(rout, offs, l0, r0, rtmp0, rtmp1) \ + tst rout, #3; \ + beq 1f; \ + str_unaligned_be(l0, rout, (offs) + 0, rtmp0, rtmp1); \ + str_unaligned_be(r0, rout, (offs) + 4, rtmp0, rtmp1); \ + b 2f; \ + 1:;\ + write_block_aligned(rout, offs, l0, r0, be_to_host); \ + 2:; + + #define read_block_host(rin, offs, l0, r0, rtmp0) \ + tst rin, #3; \ + beq 1f; \ + ldr_unaligned_host(l0, rin, (offs) + 0, rtmp0); \ + ldr_unaligned_host(r0, rin, (offs) + 4, rtmp0); \ + b 2f; \ + 1:;\ + read_block_aligned(rin, offs, l0, r0, host_to_host); \ + 2:; + + #define write_block_host(rout, offs, l0, r0, rtmp0, rtmp1) \ + tst rout, #3; \ + beq 1f; \ + str_unaligned_host(l0, rout, (offs) + 0, rtmp0, rtmp1); \ + str_unaligned_host(r0, rout, (offs) + 4, rtmp0, rtmp1); \ + b 2f; \ + 1:;\ + write_block_aligned(rout, offs, l0, r0, host_to_host); \ + 2:; +#endif + +.align 3 +.globl _gcry_cast5_armv6_encrypt_block +.type _gcry_cast5_armv6_encrypt_block,%function; + +_gcry_cast5_armv6_encrypt_block: + /* input: + * %r0: CTX + * %r1: dst + * %r2: src + */ + push {%r1, %r4-%r11, %ip, %lr}; + + ldr Rs1, =_gcry_cast5_s1to4; + mov RMASK, #(0xff << 2); + add Rs2, Rs1, #(0x100*4); + add Rs3, Rs1, #(0x100*4*2); + add Rs4, Rs1, #(0x100*4*3); + + read_block(%r2, 0, RL0, RR0, RT0); + + 
load_km(0); + load_kr(0); + enc_round(0, F1, RL0, RR0, load_km, shift_kr, dummy); + enc_round(1, F2, RR0, RL0, load_km, shift_kr, dummy); + enc_round(2, F3, RL0, RR0, load_km, shift_kr, dummy); + enc_round(3, F1, RR0, RL0, load_km, dummy, load_kr); + enc_round(4, F2, RL0, RR0, load_km, shift_kr, dummy); + enc_round(5, F3, RR0, RL0, load_km, shift_kr, dummy); + enc_round(6, F1, RL0, RR0, load_km, shift_kr, dummy); + enc_round(7, F2, RR0, RL0, load_km, dummy, load_kr); + enc_round(8, F3, RL0, RR0, load_km, shift_kr, dummy); + enc_round(9, F1, RR0, RL0, load_km, shift_kr, dummy); + enc_round(10, F2, RL0, RR0, load_km, shift_kr, dummy); + enc_round(11, F3, RR0, RL0, load_km, dummy, load_kr); + enc_round(12, F1, RL0, RR0, load_km, shift_kr, dummy); + enc_round(13, F2, RR0, RL0, load_km, shift_kr, dummy); + enc_round(14, F3, RL0, RR0, load_km, shift_kr, dummy); + enc_round(15, F1, RR0, RL0, dummy, dummy, dummy); + + ldr %r1, [%sp], #4; + write_block(%r1, 0, RR0, RL0, RT0, RT1); + + pop {%r4-%r11, %ip, %pc}; +.ltorg +.size _gcry_cast5_armv6_encrypt_block,.-_gcry_cast5_armv6_encrypt_block; + +.align 3 +.globl _gcry_cast5_armv6_decrypt_block +.type _gcry_cast5_armv6_decrypt_block,%function; + +_gcry_cast5_armv6_decrypt_block: + /* input: + * %r0: CTX + * %r1: dst + * %r2: src + */ + push {%r1, %r4-%r11, %ip, %lr}; + + ldr Rs1, =_gcry_cast5_s1to4; + mov RMASK, #(0xff << 2); + add Rs2, Rs1, #(0x100 * 4); + add Rs3, Rs1, #(0x100 * 4 * 2); + add Rs4, Rs1, #(0x100 * 4 * 3); + + read_block(%r2, 0, RL0, RR0, RT0); + + load_km(15); + load_dec_kr(15); + dec_round(15, F1, RL0, RR0, load_km, shift_kr, dummy); + dec_round(14, F3, RR0, RL0, load_km, shift_kr, dummy); + dec_round(13, F2, RL0, RR0, load_km, shift_kr, dummy); + dec_round(12, F1, RR0, RL0, load_km, dummy, load_dec_kr); + dec_round(11, F3, RL0, RR0, load_km, shift_kr, dummy); + dec_round(10, F2, RR0, RL0, load_km, shift_kr, dummy); + dec_round(9, F1, RL0, RR0, load_km, shift_kr, dummy); + dec_round(8, F3, RR0, RL0, load_km, dummy, load_dec_kr); + dec_round(7, F2, RL0, RR0, load_km, shift_kr, dummy); + dec_round(6, F1, RR0, RL0, load_km, shift_kr, dummy); + dec_round(5, F3, RL0, RR0, load_km, shift_kr, dummy); + dec_round(4, F2, RR0, RL0, load_km, dummy, load_dec_kr); + dec_round(3, F1, RL0, RR0, load_km, shift_kr, dummy); + dec_round(2, F3, RR0, RL0, load_km, shift_kr, dummy); + dec_round(1, F2, RL0, RR0, load_km, shift_kr, dummy); + dec_round(0, F1, RR0, RL0, dummy, dummy, dummy); + + ldr %r1, [%sp], #4; + write_block(%r1, 0, RR0, RL0, RT0, RT1); + + pop {%r4-%r11, %ip, %pc}; +.ltorg +.size _gcry_cast5_armv6_decrypt_block,.-_gcry_cast5_armv6_decrypt_block; + +/********************************************************************** + 2-way cast5 + **********************************************************************/ + +#define F_2w(n, rl0, rr0, rl1, rr1, op1, op2, op3, op4, dec, loadkm, shiftkr, \ + loadkr) \ + op1 RT3, RKM, rr0; \ + op1 RKM, RKM, rr1; \ + mov RT3, RT3, ror RKR; \ + mov RKM, RKM, ror RKR; \ + \ + and RT0, RMASK, RT3, ror #(24); \ + and RT1, RMASK, RT3, lsr #(16); \ + and RT2, RMASK, RT3, lsr #(8); \ + and RT3, RMASK, RT3; \ + \ + ldr RT0, [Rs1, RT0]; \ + add RT2, #(0x100 * 4); \ + ldr RT1, [Rs2, RT1]; \ + add RT3, #(0x100 * 4 * 2); \ + \ + ldr RT2, [Rs2, RT2]; \ + \ + op2 RT0, RT1; \ + ldr RT3, [Rs2, RT3]; \ + and RT1, RMASK, RKM, ror #(24); \ + op3 RT0, RT2; \ + and RT2, RMASK, RKM, lsr #(16); \ + op4 RT0, RT3; \ + and RT3, RMASK, RKM, lsr #(8); \ + eor rl0, RT0; \ + add RT3, #(0x100 * 4); \ + ldr RT1, [Rs1, RT1]; \ + and RT0, 
RMASK, RKM; \ + ldr RT2, [Rs2, RT2]; \ + add RT0, #(0x100 * 4 * 2); \ + \ + ldr RT3, [Rs2, RT3]; \ + \ + op2 RT1, RT2; \ + ldr RT0, [Rs2, RT0]; \ + op3 RT1, RT3; \ + loadkm((n) + (1 - ((dec) * 2))); \ + op4 RT1, RT0; \ + loadkr((n) + (1 - ((dec) * 2))); \ + shiftkr(RKR); \ + eor rl1, RT1; + +#define F1_2w(n, rl0, rr0, rl1, rr1, dec, loadkm, shiftkr, loadkr) \ + F_2w(n, rl0, rr0, rl1, rr1, add, eor, sub, add, dec, \ + loadkm, shiftkr, loadkr) +#define F2_2w(n, rl0, rr0, rl1, rr1, dec, loadkm, shiftkr, loadkr) \ + F_2w(n, rl0, rr0, rl1, rr1, eor, sub, add, eor, dec, \ + loadkm, shiftkr, loadkr) +#define F3_2w(n, rl0, rr0, rl1, rr1, dec, loadkm, shiftkr, loadkr) \ + F_2w(n, rl0, rr0, rl1, rr1, sub, add, eor, sub, dec, \ + loadkm, shiftkr, loadkr) + +#define enc_round2(n, Fx, rl, rr, loadkm, shiftkr, loadkr) \ + Fx##_2w(n, rl##0, rr##0, rl##1, rr##1, 0, loadkm, shiftkr, loadkr) + +#define dec_round2(n, Fx, rl, rr, loadkm, shiftkr, loadkr) \ + Fx##_2w(n, rl##0, rr##0, rl##1, rr##1, 1, loadkm, shiftkr, loadkr) + +#define read_block2_aligned(rin, l0, r0, l1, r1, convert) \ + ldr l0, [rin, #(0)]; \ + ldr r0, [rin, #(4)]; \ + convert(l0); \ + ldr l1, [rin, #(8)]; \ + convert(r0); \ + ldr r1, [rin, #(12)]; \ + convert(l1); \ + convert(r1); + +#define write_block2_aligned(rout, l0, r0, l1, r1, convert) \ + convert(l0); \ + convert(r0); \ + convert(l1); \ + str l0, [rout, #(0)]; \ + convert(r1); \ + str r0, [rout, #(4)]; \ + str l1, [rout, #(8)]; \ + str r1, [rout, #(12)]; + +#ifdef __ARM_FEATURE_UNALIGNED + /* unaligned word reads allowed */ + #define read_block2(rin, l0, r0, l1, r1, rtmp0) \ + read_block2_aligned(rin, l0, r0, l1, r1, host_to_be) + + #define write_block2(rout, l0, r0, l1, r1, rtmp0, rtmp1) \ + write_block2_aligned(rout, l0, r0, l1, r1, be_to_host) + + #define read_block2_host(rin, l0, r0, l1, r1, rtmp0) \ + read_block2_aligned(rin, l0, r0, l1, r1, host_to_host) + + #define write_block2_host(rout, l0, r0, l1, r1, rtmp0, rtmp1) \ + write_block2_aligned(rout, l0, r0, l1, r1, host_to_host) +#else + /* need to handle unaligned reads by byte reads */ + #define read_block2(rin, l0, r0, l1, r1, rtmp0) \ + tst rin, #3; \ + beq 1f; \ + ldr_unaligned_be(l0, rin, 0, rtmp0); \ + ldr_unaligned_be(r0, rin, 4, rtmp0); \ + ldr_unaligned_be(l1, rin, 8, rtmp0); \ + ldr_unaligned_be(r1, rin, 12, rtmp0); \ + b 2f; \ + 1:;\ + read_block2_aligned(rin, l0, r0, l1, r1, host_to_be); \ + 2:; + + #define write_block2(rout, l0, r0, l1, r1, rtmp0, rtmp1) \ + tst rout, #3; \ + beq 1f; \ + str_unaligned_be(l0, rout, 0, rtmp0, rtmp1); \ + str_unaligned_be(r0, rout, 4, rtmp0, rtmp1); \ + str_unaligned_be(l1, rout, 8, rtmp0, rtmp1); \ + str_unaligned_be(r1, rout, 12, rtmp0, rtmp1); \ + b 2f; \ + 1:;\ + write_block2_aligned(rout, l0, r0, l1, r1, be_to_host); \ + 2:; + + #define read_block2_host(rin, l0, r0, l1, r1, rtmp0) \ + tst rin, #3; \ + beq 1f; \ + ldr_unaligned_host(l0, rin, 0, rtmp0); \ + ldr_unaligned_host(r0, rin, 4, rtmp0); \ + ldr_unaligned_host(l1, rin, 8, rtmp0); \ + ldr_unaligned_host(r1, rin, 12, rtmp0); \ + b 2f; \ + 1:;\ + read_block2_aligned(rin, l0, r0, l1, r1, host_to_host); \ + 2:; + + #define write_block2_host(rout, l0, r0, l1, r1, rtmp0, rtmp1) \ + tst rout, #3; \ + beq 1f; \ + str_unaligned_host(l0, rout, 0, rtmp0, rtmp1); \ + str_unaligned_host(r0, rout, 4, rtmp0, rtmp1); \ + str_unaligned_host(l1, rout, 8, rtmp0, rtmp1); \ + str_unaligned_host(r1, rout, 12, rtmp0, rtmp1); \ + b 2f; \ + 1:;\ + write_block2_aligned(rout, l0, r0, l1, r1, host_to_host); \ + 2:; +#endif + +.align 3 +.type 
_gcry_cast5_armv6_enc_blk2,%function;
+
+_gcry_cast5_armv6_enc_blk2:
+	/* input:
+	 *	%r0: CTX
+	 *	%r1: dst
+	 *	%r2: src
+	 */
+	push {%lr};
+
+	ldr Rs1, =_gcry_cast5_s1to4;
+	mov RMASK, #(0xff << 2);
+	add Rs2, Rs1, #(0x100 * 4);
+
+	load_km(0);
+	load_kr(0);
+	enc_round2(0, F1, RL, RR, load_km, shift_kr, dummy);
+	enc_round2(1, F2, RR, RL, load_km, shift_kr, dummy);
+	enc_round2(2, F3, RL, RR, load_km, shift_kr, dummy);
+	enc_round2(3, F1, RR, RL, load_km, dummy, load_kr);
+	enc_round2(4, F2, RL, RR, load_km, shift_kr, dummy);
+	enc_round2(5, F3, RR, RL, load_km, shift_kr, dummy);
+	enc_round2(6, F1, RL, RR, load_km, shift_kr, dummy);
+	enc_round2(7, F2, RR, RL, load_km, dummy, load_kr);
+	enc_round2(8, F3, RL, RR, load_km, shift_kr, dummy);
+	enc_round2(9, F1, RR, RL, load_km, shift_kr, dummy);
+	enc_round2(10, F2, RL, RR, load_km, shift_kr, dummy);
+	enc_round2(11, F3, RR, RL, load_km, dummy, load_kr);
+	enc_round2(12, F1, RL, RR, load_km, shift_kr, dummy);
+	enc_round2(13, F2, RR, RL, load_km, shift_kr, dummy);
+	enc_round2(14, F3, RL, RR, load_km, shift_kr, dummy);
+	enc_round2(15, F1, RR, RL, dummy, dummy, dummy);
+
+	host_to_be(RR0);
+	host_to_be(RL0);
+	host_to_be(RR1);
+	host_to_be(RL1);
+
+	pop {%pc};
+.ltorg
+.size _gcry_cast5_armv6_enc_blk2,.-_gcry_cast5_armv6_enc_blk2;
+
+.align 3
+.globl _gcry_cast5_armv6_cfb_dec;
+.type _gcry_cast5_armv6_cfb_dec,%function;
+
+_gcry_cast5_armv6_cfb_dec:
+	/* input:
+	 *	%r0: CTX
+	 *	%r1: dst (2 blocks)
+	 *	%r2: src (2 blocks)
+	 *	%r3: iv (64bit)
+	 */
+	push {%r1, %r2, %r4-%r11, %ip, %lr};
+
+	mov %lr, %r3;
+
+	/* Load input (iv/%r3 is aligned, src/%r2 might not be) */
+	ldm %r3, {RL0, RR0};
+	host_to_be(RL0);
+	host_to_be(RR0);
+	read_block(%r2, 0, RL1, RR1, %ip);
+
+	/* Update IV, load src[1] and save to iv[0] */
+	read_block_host(%r2, 8, %r5, %r6, %r7);
+	stm %lr, {%r5, %r6};
+
+	bl _gcry_cast5_armv6_enc_blk2;
+	/* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */
+
+	/* %r0: dst, %r1: src */
+	pop {%r0, %r1};
+
+	/* dst = src ^ result */
+	read_block2_host(%r1, %r5, %r6, %r7, %r8, %lr);
+	eor %r5, %r4;
+	eor %r6, %r3;
+	eor %r7, %r10;
+	eor %r8, %r9;
+	write_block2_host(%r0, %r5, %r6, %r7, %r8, %r1, %r2);
+
+	pop {%r4-%r11, %ip, %pc};
+.ltorg
+.size _gcry_cast5_armv6_cfb_dec,.-_gcry_cast5_armv6_cfb_dec;
+
+.align 3
+.globl _gcry_cast5_armv6_ctr_enc;
+.type _gcry_cast5_armv6_ctr_enc,%function;
+
+_gcry_cast5_armv6_ctr_enc:
+	/* input:
+	 *	%r0: CTX
+	 *	%r1: dst (2 blocks)
+	 *	%r2: src (2 blocks)
+	 *	%r3: iv (64bit, big-endian)
+	 */
+	push {%r1, %r2, %r4-%r11, %ip, %lr};
+
+	mov %lr, %r3;
+
+	/* Load IV (big => host endian) */
+	read_block_aligned(%lr, 0, RL0, RR0, be_to_host);
+
+	/* Construct IVs */
+	adds RR1, RR0, #1; /* +1 */
+	adc RL1, RL0, #0;
+	adds %r6, RR1, #1; /* +2 */
+	adc %r5, RL1, #0;
+
+	/* Store new IV (host => big-endian) */
+	write_block_aligned(%lr, 0, %r5, %r6, host_to_be);
+
+	bl _gcry_cast5_armv6_enc_blk2;
+	/* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */
+
+	/* %r0: dst, %r1: src */
+	pop {%r0, %r1};
+
+	/* XOR key-stream with plaintext */
+	read_block2_host(%r1, %r5, %r6, %r7, %r8, %lr);
+	eor %r5, %r4;
+	eor %r6, %r3;
+	eor %r7, %r10;
+	eor %r8, %r9;
+	write_block2_host(%r0, %r5, %r6, %r7, %r8, %r1, %r2);
+
+	pop {%r4-%r11, %ip, %pc};
+.ltorg
+.size _gcry_cast5_armv6_ctr_enc,.-_gcry_cast5_armv6_ctr_enc;
+
+.align 3
+.type _gcry_cast5_armv6_dec_blk2,%function;
+
+_gcry_cast5_armv6_dec_blk2:
+	/* input:
+	 *	preloaded: CTX
+	 *	[RL0, RR0], [RL1, RR1]: src
+	 * output:
+	 *	[RR0, RL0], [RR1, RL1]: dst
+	 */
+
+	ldr Rs1, =_gcry_cast5_s1to4;
+	mov RMASK, #(0xff << 2);
+	add Rs2, Rs1, #(0x100 * 4);
+
+	load_km(15);
+	load_dec_kr(15);
+	dec_round2(15, F1, RL, RR, load_km, shift_kr, dummy);
+	dec_round2(14, F3, RR, RL, load_km, shift_kr, dummy);
+	dec_round2(13, F2, RL, RR, load_km, shift_kr, dummy);
+	dec_round2(12, F1, RR, RL, load_km, dummy, load_dec_kr);
+	dec_round2(11, F3, RL, RR, load_km, shift_kr, dummy);
+	dec_round2(10, F2, RR, RL, load_km, shift_kr, dummy);
+	dec_round2(9, F1, RL, RR, load_km, shift_kr, dummy);
+	dec_round2(8, F3, RR, RL, load_km, dummy, load_dec_kr);
+	dec_round2(7, F2, RL, RR, load_km, shift_kr, dummy);
+	dec_round2(6, F1, RR, RL, load_km, shift_kr, dummy);
+	dec_round2(5, F3, RL, RR, load_km, shift_kr, dummy);
+	dec_round2(4, F2, RR, RL, load_km, dummy, load_dec_kr);
+	dec_round2(3, F1, RL, RR, load_km, shift_kr, dummy);
+	dec_round2(2, F3, RR, RL, load_km, shift_kr, dummy);
+	dec_round2(1, F2, RL, RR, load_km, shift_kr, dummy);
+	dec_round2(0, F1, RR, RL, dummy, dummy, dummy);
+
+	host_to_be(RR0);
+	host_to_be(RL0);
+	host_to_be(RR1);
+	host_to_be(RL1);
+
+	b .Ldec_cbc_tail;
+.ltorg
+.size _gcry_cast5_armv6_dec_blk2,.-_gcry_cast5_armv6_dec_blk2;
+
+.align 3
+.globl _gcry_cast5_armv6_cbc_dec;
+.type _gcry_cast5_armv6_cbc_dec,%function;
+
+_gcry_cast5_armv6_cbc_dec:
+	/* input:
+	 *	%r0: CTX
+	 *	%r1: dst (2 blocks)
+	 *	%r2: src (2 blocks)
+	 *	%r3: iv (64bit)
+	 */
+	push {%r1-%r11, %ip, %lr};
+
+	read_block2(%r2, RL0, RR0, RL1, RR1, RT0);
+
+	/* dec_blk2 is only used by cbc_dec, jump directly in/out instead
+	 * of function call. */
+	b _gcry_cast5_armv6_dec_blk2;
+.Ldec_cbc_tail:
+	/* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */
+
+	/* %r0: dst, %r1: src, %r2: iv */
+	pop {%r0-%r2};
+
+	/* load IV+1 (src[0]) to %r7:%r8. Might be unaligned. */
+	read_block_host(%r1, 0, %r7, %r8, %r5);
+	/* load IV (iv[0]) to %r5:%r6. 'iv' is aligned. */
+	ldm %r2, {%r5, %r6};
+
+	/* out[1] ^= IV+1 */
+	eor %r10, %r7;
+	eor %r9, %r8;
+	/* out[0] ^= IV */
+	eor %r4, %r5;
+	eor %r3, %r6;
+
+	/* load IV+2 (src[1]) to %r7:%r8. Might be unaligned. */
+	read_block_host(%r1, 8, %r7, %r8, %r5);
+	/* store IV+2 to iv[0] (aligned). */
+	stm %r2, {%r7, %r8};
+
+	/* store result to dst[0-3]. Might be unaligned. */
+	write_block2_host(%r0, %r4, %r3, %r10, %r9, %r5, %r6);
+
+	pop {%r4-%r11, %ip, %pc};
+.ltorg
+.size _gcry_cast5_armv6_cbc_dec,.-_gcry_cast5_armv6_cbc_dec;
+
+#endif /*HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS*/
+#endif /*__ARM_ARCH >= 6*/
diff --git a/cipher/cast5.c b/cipher/cast5.c
index 6017bf0..9e7b50f 100644
--- a/cipher/cast5.c
+++ b/cipher/cast5.c
@@ -51,11 +51,30 @@
 # define USE_AMD64_ASM 1
 #endif
 
+/* USE_ARMV6_ASM indicates whether to use ARMv6 assembly code.
*/ +#undef USE_ARMV6_ASM +#if defined(__arm__) && defined(__ARMEL__) && \ + ((defined(__ARM_ARCH) && __ARM_ARCH >= 6) \ + || defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) \ + || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) \ + || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6T2__) \ + || defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) \ + || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) \ + || defined(__ARM_ARCH_7EM__)) +# ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS +# define USE_ARMV6_ASM 1 +# endif +#endif + #define CAST5_BLOCKSIZE 8 typedef struct { u32 Km[16]; byte Kr[16]; +#ifdef USE_ARMV6_ASM + u32 Kr_arm_enc[16 / sizeof(u32)]; + u32 Kr_arm_dec[16 / sizeof(u32)]; +#endif } CAST5_context; static gcry_err_code_t cast_setkey (void *c, const byte *key, unsigned keylen); @@ -385,7 +404,52 @@ static void decrypt_block (void *context, byte *outbuf, const byte *inbuf) _gcry_burn_stack (2*8); } -#else /*USE_AMD64_ASM*/ +#elif defined(USE_ARMV6_ASM) + +/* ARMv6 assembly implementations of CAST5. */ +extern void _gcry_cast5_armv6_encrypt_block(CAST5_context *c, byte *outbuf, + const byte *inbuf); + +extern void _gcry_cast5_armv6_decrypt_block(CAST5_context *c, byte *outbuf, + const byte *inbuf); + +/* These assembly implementations process two blocks in parallel. */ +extern void _gcry_cast5_armv6_ctr_enc(CAST5_context *ctx, byte *out, + const byte *in, byte *ctr); + +extern void _gcry_cast5_armv6_cbc_dec(CAST5_context *ctx, byte *out, + const byte *in, byte *iv); + +extern void _gcry_cast5_armv6_cfb_dec(CAST5_context *ctx, byte *out, + const byte *in, byte *iv); + +static void +do_encrypt_block (CAST5_context *context, byte *outbuf, const byte *inbuf) +{ + _gcry_cast5_armv6_encrypt_block (context, outbuf, inbuf); +} + +static void +do_decrypt_block (CAST5_context *context, byte *outbuf, const byte *inbuf) +{ + _gcry_cast5_armv6_decrypt_block (context, outbuf, inbuf); +} + +static void encrypt_block (void *context , byte *outbuf, const byte *inbuf) +{ + CAST5_context *c = (CAST5_context *) context; + do_encrypt_block (c, outbuf, inbuf); + _gcry_burn_stack (10*4); +} + +static void decrypt_block (void *context, byte *outbuf, const byte *inbuf) +{ + CAST5_context *c = (CAST5_context *) context; + do_decrypt_block (c, outbuf, inbuf); + _gcry_burn_stack (10*4); +} + +#else /*USE_ARMV6_ASM*/ #if defined(__GNUC__) && defined(__i386__) static inline u32 @@ -520,7 +584,7 @@ decrypt_block (void *context, byte *outbuf, const byte *inbuf) _gcry_burn_stack (20+4*sizeof(void*)); } -#endif /*!USE_AMD64_ASM*/ +#endif /*!USE_ARMV6_ASM*/ /* Bulk encryption of complete blocks in CTR mode. This function is only @@ -556,6 +620,21 @@ _gcry_cast5_ctr_enc(void *context, unsigned char *ctr, void *outbuf_arg, /* Use generic code to handle smaller chunks... */ /* TODO: use caching instead? */ } +#elif defined(USE_ARMV6_ASM) + { + /* Process data in 2 block chunks. */ + while (nblocks >= 2) + { + _gcry_cast5_armv6_ctr_enc(ctx, outbuf, inbuf, ctr); + + nblocks -= 2; + outbuf += 2 * CAST5_BLOCKSIZE; + inbuf += 2 * CAST5_BLOCKSIZE; + } + + /* Use generic code to handle smaller chunks... */ + /* TODO: use caching instead? */ + } #endif for ( ;nblocks; nblocks-- ) @@ -609,6 +688,20 @@ _gcry_cast5_cbc_dec(void *context, unsigned char *iv, void *outbuf_arg, /* Use generic code to handle smaller chunks... */ } +#elif defined(USE_ARMV6_ASM) + { + /* Process data in 2 block chunks. 
 */
+    while (nblocks >= 2)
+      {
+        _gcry_cast5_armv6_cbc_dec(ctx, outbuf, inbuf, iv);
+
+        nblocks -= 2;
+        outbuf += 2 * CAST5_BLOCKSIZE;
+        inbuf += 2 * CAST5_BLOCKSIZE;
+      }
+
+    /* Use generic code to handle smaller chunks... */
+  }
 #endif
 
   for ( ;nblocks; nblocks-- )
@@ -657,6 +750,20 @@ _gcry_cast5_cfb_dec(void *context, unsigned char *iv, void *outbuf_arg,
       /* Use generic code to handle smaller chunks... */
     }
+#elif defined(USE_ARMV6_ASM)
+  {
+    /* Process data in 2 block chunks. */
+    while (nblocks >= 2)
+      {
+        _gcry_cast5_armv6_cfb_dec(ctx, outbuf, inbuf, iv);
+
+        nblocks -= 2;
+        outbuf += 2 * CAST5_BLOCKSIZE;
+        inbuf += 2 * CAST5_BLOCKSIZE;
+      }
+
+    /* Use generic code to handle smaller chunks... */
+  }
 #endif
 
   for ( ;nblocks; nblocks-- )
@@ -857,6 +964,32 @@ do_cast_setkey( CAST5_context *c, const byte *key, unsigned keylen )
   for(i=0; i < 16; i++ )
     c->Kr[i] = k[i] & 0x1f;
 
+#ifdef USE_ARMV6_ASM
+  for (i = 0; i < 4; i++)
+    {
+      byte Kr_arm[4];
+
+      /* Convert rotate left to rotate right and add shift left
+       * by 2. */
+      Kr_arm[0] = ((32 - c->Kr[4 * i + 0]) - 2) & 0x1f;
+      Kr_arm[1] = ((32 - c->Kr[4 * i + 1]) - 2) & 0x1f;
+      Kr_arm[2] = ((32 - c->Kr[4 * i + 2]) - 2) & 0x1f;
+      Kr_arm[3] = ((32 - c->Kr[4 * i + 3]) - 2) & 0x1f;
+
+      /* Endian friendly store. */
+      c->Kr_arm_enc[i] = Kr_arm[0] |
+                        (Kr_arm[1] << 8) |
+                        (Kr_arm[2] << 16) |
+                        (Kr_arm[3] << 24);
+      c->Kr_arm_dec[i] = Kr_arm[3] |
+                        (Kr_arm[2] << 8) |
+                        (Kr_arm[1] << 16) |
+                        (Kr_arm[0] << 24);
+
+      wipememory(Kr_arm, sizeof(Kr_arm));
+    }
+#endif
+
 memset(&x,0, sizeof x);
 memset(&z,0, sizeof z);
 memset(&k,0, sizeof k);
diff --git a/configure.ac b/configure.ac
index 04dee7e..f05f919 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1293,6 +1293,10 @@ if test "$found" = "1" ; then
          # Build with the assembly implementation
          GCRYPT_CIPHERS="$GCRYPT_CIPHERS cast5-amd64.lo"
       ;;
+      arm*-*-*)
+         # Build with the assembly implementation
+         GCRYPT_CIPHERS="$GCRYPT_CIPHERS cast5-armv6.lo"
+      ;;
    esac
 fi


From jussi.kivilinna at iki.fi  Thu Aug 15 13:44:39 2013
From: jussi.kivilinna at iki.fi (Jussi Kivilinna)
Date: Thu, 15 Aug 2013 14:44:39 +0300
Subject: [PATCH 1/2] rijndael: add ARMv6 assembly implementation
In-Reply-To: <20130815114434.27965.44657.stgit@localhost6.localdomain6>
References: <20130815114434.27965.44657.stgit@localhost6.localdomain6>
Message-ID: <20130815114439.27965.27385.stgit@localhost6.localdomain6>

* cipher/Makefile.am: Add 'rijndael-armv6.S'.
* cipher/rijndael-armv6.S: New file.
* cipher/rijndael.c (USE_ARMV6_ASM): New macro.
[USE_ARMV6_ASM] (_gcry_aes_armv6_encrypt_block)
(_gcry_aes_armv6_decrypt_block): New prototypes.
(do_encrypt_aligned) [USE_ARMV6_ASM]: Use ARMv6 assembly function.
(do_encrypt): Disable input/output alignment when USE_ARMV6_ASM.
(do_decrypt_aligned) [USE_ARMV6_ASM]: Use ARMv6 assembly function.
(do_decrypt): Disable input/output alignment when USE_ARMV6_ASM.
* configure.ac (HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS): New check
for gcc/as compatibility with ARM assembly implementations.
(aes) [arm]: Add 'rijndael-armv6.lo'.
--

Add an optimized ARMv6 assembly implementation for AES, tuned for
Cortex-A8.  Unaligned access handling is done in the assembly part.
For now, this is only enabled on little-endian systems, as big-endian
correctness has not been tested yet.

Old vs new.
Cortex-A8 (on Debian Wheezy/armhf):

             ECB/Stream         CBC             CFB             OFB             CTR
          --------------- --------------- --------------- --------------- ---------------
AES         2.61x  3.12x   2.16x  2.59x   2.26x  2.25x   2.08x  2.08x   2.23x  2.23x
AES192      2.60x  3.06x   2.18x  2.65x   2.29x  2.29x   2.12x  2.12x   2.25x  2.27x
AES256      2.62x  3.09x   2.24x  2.72x   2.30x  2.34x   2.17x  2.19x   2.32x  2.32x

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
---
 cipher/Makefile.am      |    2 
 cipher/rijndael-armv6.S |  860 +++++++++++++++++++++++++++++++++++++++++++++++
 cipher/rijndael.c       |   48 ++-
 configure.ac            |   32 ++
 4 files changed, 933 insertions(+), 9 deletions(-)
 create mode 100644 cipher/rijndael-armv6.S

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 75ad987..931675c 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -64,7 +64,7 @@ ecc.c \
 idea.c \
 md4.c \
 md5.c \
-rijndael.c rijndael-tables.h rijndael-amd64.S \
+rijndael.c rijndael-tables.h rijndael-amd64.S rijndael-armv6.S \
 rmd160.c \
 rsa.c \
 salsa20.c \
diff --git a/cipher/rijndael-armv6.S b/cipher/rijndael-armv6.S
new file mode 100644
index 0000000..be39df1
--- /dev/null
+++ b/cipher/rijndael-armv6.S
@@ -0,0 +1,860 @@
+/* rijndael-armv6.S - ARM assembly implementation of AES cipher
+ *
+ * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#if defined(__arm__) && defined(__ARMEL__) && \
+    ((defined(__ARM_ARCH) && __ARM_ARCH >= 6) \
+     || defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) \
+     || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) \
+     || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6T2__) \
+     || defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) \
+     || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) \
+     || defined(__ARM_ARCH_7EM__))
+#ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS
+
+.text
+
+.syntax unified
+.arm
+
+/* register macros */
+#define CTX %r0
+#define RTAB %lr
+#define RMASK %ip
+
+#define RA %r4
+#define RB %r5
+#define RC %r6
+#define RD %r7
+
+#define RNA %r8
+#define RNB %r9
+#define RNC %r10
+#define RND %r11
+
+#define RT0 %r1
+#define RT1 %r2
+#define RT2 %r3
+
+/* helper macros */
+#define ldr_unaligned_le(rout, rsrc, offs, rtmp) \
+	ldrb rout, [rsrc, #((offs) + 0)]; \
+	ldrb rtmp, [rsrc, #((offs) + 1)]; \
+	orr rout, rout, rtmp, lsl #8; \
+	ldrb rtmp, [rsrc, #((offs) + 2)]; \
+	orr rout, rout, rtmp, lsl #16; \
+	ldrb rtmp, [rsrc, #((offs) + 3)]; \
+	orr rout, rout, rtmp, lsl #24;
+
+#define str_unaligned_le(rin, rdst, offs, rtmp0, rtmp1) \
+	mov rtmp0, rin, lsr #8; \
+	strb rin, [rdst, #((offs) + 0)]; \
+	mov rtmp1, rin, lsr #16; \
+	strb rtmp0, [rdst, #((offs) + 1)]; \
+	mov rtmp0, rin, lsr #24; \
+	strb rtmp1, [rdst, #((offs) + 2)]; \
+	strb rtmp0, [rdst, #((offs) + 3)];
+
+/***********************************************************************
+ * ARM assembly implementation of the AES cipher
+ ***********************************************************************/
+#define preload_first_key(round, ra) \
+	ldr ra, [CTX, #(((round) * 16) + 0 * 4)];
+
+#define dummy(round, ra) /* nothing */
+
+#define addroundkey(ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_key) \
+	ldm CTX, {rna, rnb, rnc, rnd}; \
+	eor ra, rna; \
+	eor rb, rnb; \
+	eor rc, rnc; \
+	preload_key(1, rna); \
+	eor rd, rnd;
+
+#define do_encround(next_r, ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_key) \
+	ldr rnb, [CTX, #(((next_r) * 16) + 1 * 4)]; \
+	\
+	and RT0, RMASK, ra, lsl#3; \
+	ldr rnc, [CTX, #(((next_r) * 16) + 2 * 4)]; \
+	and RT1, RMASK, ra, lsr#(8 - 3); \
+	ldr rnd, [CTX, #(((next_r) * 16) + 3 * 4)]; \
+	and RT2, RMASK, ra, lsr#(16 - 3); \
+	ldr RT0, [RTAB, RT0]; \
+	and ra, RMASK, ra, lsr#(24 - 3); \
+	\
+	ldr RT1, [RTAB, RT1]; \
+	eor rna, rna, RT0; \
+	ldr RT2, [RTAB, RT2]; \
+	and RT0, RMASK, rd, lsl#3; \
+	ldr ra, [RTAB, ra]; \
+	\
+	eor rnd, rnd, RT1, ror #24; \
+	and RT1, RMASK, rd, lsr#(8 - 3); \
+	eor rnc, rnc, RT2, ror #16; \
+	and RT2, RMASK, rd, lsr#(16 - 3); \
+	eor rnb, rnb, ra, ror #8; \
+	ldr RT0, [RTAB, RT0]; \
+	and rd, RMASK, rd, lsr#(24 - 3); \
+	\
+	ldr RT1, [RTAB, RT1]; \
+	eor rnd, rnd, RT0; \
+	ldr RT2, [RTAB, RT2]; \
+	and RT0, RMASK, rc, lsl#3; \
+	ldr rd, [RTAB, rd]; \
+	\
+	eor rnc, rnc, RT1, ror #24; \
+	and RT1, RMASK, rc, lsr#(8 - 3); \
+	eor rnb, rnb, RT2, ror #16; \
+	and RT2, RMASK, rc, lsr#(16 - 3); \
+	eor rna, rna, rd, ror #8; \
+	ldr RT0, [RTAB, RT0]; \
+	and rc, RMASK, rc, lsr#(24 - 3); \
+	\
+	ldr RT1, [RTAB, RT1]; \
+	eor rnc, rnc, RT0; \
+	ldr RT2, [RTAB, RT2]; \
+	and RT0, RMASK, rb, lsl#3; \
+	ldr rc, [RTAB, rc]; \
+	\
+	eor rnb, rnb, RT1, ror #24; \
+	and RT1, RMASK, rb, lsr#(8 - 3); \
+	eor rna, rna, RT2, ror #16; \
+	and RT2, RMASK, rb, lsr#(16 - 3); \
+	eor rnd, rnd, rc, ror #8; \
+	ldr RT0, [RTAB, RT0]; \
+	and rb, RMASK, rb, lsr#(24 - 3); \
+	\
+	ldr RT1, [RTAB, RT1]; \
+	eor rnb, rnb, RT0; \
+	ldr RT2, [RTAB, RT2]; \
+	eor rna, rna,
RT1, ror #24; \ + ldr rb, [RTAB, rb]; \ + \ + eor rnd, rnd, RT2, ror #16; \ + preload_key((next_r) + 1, ra); \ + eor rnc, rnc, rb, ror #8; + +#define do_lastencround(ra, rb, rc, rd, rna, rnb, rnc, rnd) \ + and RT0, RMASK, ra, lsl#3; \ + and RT1, RMASK, ra, lsr#(8 - 3); \ + and RT2, RMASK, ra, lsr#(16 - 3); \ + ldr rna, [RTAB, RT0]; \ + and ra, RMASK, ra, lsr#(24 - 3); \ + ldr rnd, [RTAB, RT1]; \ + and RT0, RMASK, rd, lsl#3; \ + ldr rnc, [RTAB, RT2]; \ + mov rnd, rnd, ror #24; \ + ldr rnb, [RTAB, ra]; \ + and RT1, RMASK, rd, lsr#(8 - 3); \ + mov rnc, rnc, ror #16; \ + and RT2, RMASK, rd, lsr#(16 - 3); \ + mov rnb, rnb, ror #8; \ + ldr RT0, [RTAB, RT0]; \ + and rd, RMASK, rd, lsr#(24 - 3); \ + ldr RT1, [RTAB, RT1]; \ + \ + orr rnd, rnd, RT0; \ + ldr RT2, [RTAB, RT2]; \ + and RT0, RMASK, rc, lsl#3; \ + ldr rd, [RTAB, rd]; \ + orr rnc, rnc, RT1, ror #24; \ + and RT1, RMASK, rc, lsr#(8 - 3); \ + orr rnb, rnb, RT2, ror #16; \ + and RT2, RMASK, rc, lsr#(16 - 3); \ + orr rna, rna, rd, ror #8; \ + ldr RT0, [RTAB, RT0]; \ + and rc, RMASK, rc, lsr#(24 - 3); \ + ldr RT1, [RTAB, RT1]; \ + \ + orr rnc, rnc, RT0; \ + ldr RT2, [RTAB, RT2]; \ + and RT0, RMASK, rb, lsl#3; \ + ldr rc, [RTAB, rc]; \ + orr rnb, rnb, RT1, ror #24; \ + and RT1, RMASK, rb, lsr#(8 - 3); \ + orr rna, rna, RT2, ror #16; \ + ldr RT0, [RTAB, RT0]; \ + and RT2, RMASK, rb, lsr#(16 - 3); \ + ldr RT1, [RTAB, RT1]; \ + orr rnd, rnd, rc, ror #8; \ + ldr RT2, [RTAB, RT2]; \ + and rb, RMASK, rb, lsr#(24 - 3); \ + ldr rb, [RTAB, rb]; \ + \ + orr rnb, rnb, RT0; \ + orr rna, rna, RT1, ror #24; \ + orr rnd, rnd, RT2, ror #16; \ + orr rnc, rnc, rb, ror #8; + +#define firstencround(round, ra, rb, rc, rd, rna, rnb, rnc, rnd) \ + addroundkey(ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_first_key); \ + do_encround((round) + 1, ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_first_key); + +#define encround(round, ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_key) \ + do_encround((round) + 1, ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_key); + +#define lastencround(round, ra, rb, rc, rd, rna, rnb, rnc, rnd) \ + add CTX, #(((round) + 1) * 16); \ + add RTAB, #4; \ + do_lastencround(ra, rb, rc, rd, rna, rnb, rnc, rnd); \ + addroundkey(rna, rnb, rnc, rnd, ra, rb, rc, rd, dummy); + +.align 3 +.global _gcry_aes_armv6_encrypt_block +.type _gcry_aes_armv6_encrypt_block,%function; + +_gcry_aes_armv6_encrypt_block: + /* input: + * %r0: keysched, CTX + * %r1: dst + * %r2: src + * %r3: number of rounds.. 
10, 12 or 14
+	 */
+	push {%r4-%r11, %ip, %lr};
+
+	/* read input block */
+#ifndef __ARM_FEATURE_UNALIGNED
+	/* test if src is unaligned */
+	tst %r2, #3;
+	beq 1f;
+
+	/* unaligned load */
+	ldr_unaligned_le(RA, %r2, 0, RNA);
+	ldr_unaligned_le(RB, %r2, 4, RNB);
+	ldr_unaligned_le(RC, %r2, 8, RNA);
+	ldr_unaligned_le(RD, %r2, 12, RNB);
+	b 2f;
+.ltorg
+1:
+#endif
+	/* aligned load */
+	ldm %r2, {RA, RB, RC, RD};
+#ifndef __ARMEL__
+	rev RA, RA;
+	rev RB, RB;
+	rev RC, RC;
+	rev RD, RD;
+#endif
+2:
+	sub %sp, #16;
+
+	ldr RTAB, =.LtableE0;
+
+	str %r1, [%sp, #4];	/* dst */
+	mov RMASK, #0xff;
+	str %r3, [%sp, #8];	/* nrounds */
+	mov RMASK, RMASK, lsl#3;	/* byte mask */
+
+	firstencround(0, RA, RB, RC, RD, RNA, RNB, RNC, RND);
+	encround(1, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+	encround(2, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
+	encround(3, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+	encround(4, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
+	encround(5, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+	encround(6, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
+	encround(7, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+
+	ldr RT0, [%sp, #8];	/* nrounds */
+	cmp RT0, #12;
+	bge .Lenc_not_128;
+
+	encround(8, RA, RB, RC, RD, RNA, RNB, RNC, RND, dummy);
+	lastencround(9, RNA, RNB, RNC, RND, RA, RB, RC, RD);
+
+.Lenc_done:
+	ldr RT0, [%sp, #4];	/* dst */
+	add %sp, #16;
+
+	/* store output block */
+#ifndef __ARM_FEATURE_UNALIGNED
+	/* test if dst is unaligned */
+	tst RT0, #3;
+	beq 1f;
+
+	/* unaligned store */
+	str_unaligned_le(RA, RT0, 0, RNA, RNB);
+	str_unaligned_le(RB, RT0, 4, RNA, RNB);
+	str_unaligned_le(RC, RT0, 8, RNA, RNB);
+	str_unaligned_le(RD, RT0, 12, RNA, RNB);
+	b 2f;
+.ltorg
+1:
+#endif
+	/* aligned store */
+#ifndef __ARMEL__
+	rev RA, RA;
+	rev RB, RB;
+	rev RC, RC;
+	rev RD, RD;
+#endif
+	/* write output block */
+	stm RT0, {RA, RB, RC, RD};
+2:
+	pop {%r4-%r11, %ip, %pc};
+
+.ltorg
+.Lenc_not_128:
+	beq .Lenc_192;
+
+	encround(8, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
+	encround(9, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+	encround(10, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
+	encround(11, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+	encround(12, RA, RB, RC, RD, RNA, RNB, RNC, RND, dummy);
+	lastencround(13, RNA, RNB, RNC, RND, RA, RB, RC, RD);
+
+	b .Lenc_done;
+
+.ltorg
+.Lenc_192:
+	encround(8, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
+	encround(9, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+	encround(10, RA, RB, RC, RD, RNA, RNB, RNC, RND, dummy);
+	lastencround(11, RNA, RNB, RNC, RND, RA, RB, RC, RD);
+
+	b .Lenc_done;
+.size _gcry_aes_armv6_encrypt_block,.-_gcry_aes_armv6_encrypt_block;
+
+#define addroundkey_dec(round, ra, rb, rc, rd, rna, rnb, rnc, rnd) \
+	ldr rna, [CTX, #(((round) * 16) + 0 * 4)]; \
+	ldr rnb, [CTX, #(((round) * 16) + 1 * 4)]; \
+	eor ra, rna; \
+	ldr rnc, [CTX, #(((round) * 16) + 2 * 4)]; \
+	eor rb, rnb; \
+	ldr rnd, [CTX, #(((round) * 16) + 3 * 4)]; \
+	eor rc, rnc; \
+	preload_first_key((round) - 1, rna); \
+	eor rd, rnd;
+
+#define do_decround(next_r, ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_key) \
+	ldr rnb, [CTX, #(((next_r) * 16) + 1 * 4)]; \
+	\
+	and RT0, RMASK, ra, lsl#3; \
+	ldr rnc, [CTX, #(((next_r) * 16) + 2 * 4)]; \
+	and RT1, RMASK, ra, lsr#(8 - 3); \
+	ldr rnd, [CTX, #(((next_r) * 16) + 3 * 4)]; \
+	and RT2, RMASK, ra, lsr#(16 - 3); \
+	ldr RT0,
[RTAB, RT0]; \ + and ra, RMASK, ra, lsr#(24 - 3); \ + \ + ldr RT1, [RTAB, RT1]; \ + eor rna, rna, RT0; \ + ldr RT2, [RTAB, RT2]; \ + and RT0, RMASK, rb, lsl#3; \ + ldr ra, [RTAB, ra]; \ + \ + eor rnb, rnb, RT1, ror #24; \ + and RT1, RMASK, rb, lsr#(8 - 3); \ + eor rnc, rnc, RT2, ror #16; \ + and RT2, RMASK, rb, lsr#(16 - 3); \ + eor rnd, rnd, ra, ror #8; \ + ldr RT0, [RTAB, RT0]; \ + and rb, RMASK, rb, lsr#(24 - 3); \ + \ + ldr RT1, [RTAB, RT1]; \ + eor rnb, rnb, RT0; \ + ldr RT2, [RTAB, RT2]; \ + and RT0, RMASK, rc, lsl#3; \ + ldr rb, [RTAB, rb]; \ + \ + eor rnc, rnc, RT1, ror #24; \ + and RT1, RMASK, rc, lsr#(8 - 3); \ + eor rnd, rnd, RT2, ror #16; \ + and RT2, RMASK, rc, lsr#(16 - 3); \ + eor rna, rna, rb, ror #8; \ + ldr RT0, [RTAB, RT0]; \ + and rc, RMASK, rc, lsr#(24 - 3); \ + \ + ldr RT1, [RTAB, RT1]; \ + eor rnc, rnc, RT0; \ + ldr RT2, [RTAB, RT2]; \ + and RT0, RMASK, rd, lsl#3; \ + ldr rc, [RTAB, rc]; \ + \ + eor rnd, rnd, RT1, ror #24; \ + and RT1, RMASK, rd, lsr#(8 - 3); \ + eor rna, rna, RT2, ror #16; \ + and RT2, RMASK, rd, lsr#(16 - 3); \ + eor rnb, rnb, rc, ror #8; \ + ldr RT0, [RTAB, RT0]; \ + and rd, RMASK, rd, lsr#(24 - 3); \ + \ + ldr RT1, [RTAB, RT1]; \ + eor rnd, rnd, RT0; \ + ldr RT2, [RTAB, RT2]; \ + eor rna, rna, RT1, ror #24; \ + ldr rd, [RTAB, rd]; \ + \ + eor rnb, rnb, RT2, ror #16; \ + preload_key((next_r) - 1, ra); \ + eor rnc, rnc, rd, ror #8; + +#define do_lastdecround(ra, rb, rc, rd, rna, rnb, rnc, rnd) \ + and RT0, RMASK, ra, lsl#3; \ + and RT1, RMASK, ra, lsr#(8 - 3); \ + and RT2, RMASK, ra, lsr#(16 - 3); \ + ldr rna, [RTAB, RT0]; \ + and ra, RMASK, ra, lsr#(24 - 3); \ + ldr rnb, [RTAB, RT1]; \ + and RT0, RMASK, rb, lsl#3; \ + ldr rnc, [RTAB, RT2]; \ + mov rnb, rnb, ror #24; \ + ldr rnd, [RTAB, ra]; \ + and RT1, RMASK, rb, lsr#(8 - 3); \ + mov rnc, rnc, ror #16; \ + and RT2, RMASK, rb, lsr#(16 - 3); \ + mov rnd, rnd, ror #8; \ + ldr RT0, [RTAB, RT0]; \ + and rb, RMASK, rb, lsr#(24 - 3); \ + ldr RT1, [RTAB, RT1]; \ + \ + orr rnb, rnb, RT0; \ + ldr RT2, [RTAB, RT2]; \ + and RT0, RMASK, rc, lsl#3; \ + ldr rb, [RTAB, rb]; \ + orr rnc, rnc, RT1, ror #24; \ + and RT1, RMASK, rc, lsr#(8 - 3); \ + orr rnd, rnd, RT2, ror #16; \ + and RT2, RMASK, rc, lsr#(16 - 3); \ + orr rna, rna, rb, ror #8; \ + ldr RT0, [RTAB, RT0]; \ + and rc, RMASK, rc, lsr#(24 - 3); \ + ldr RT1, [RTAB, RT1]; \ + \ + orr rnc, rnc, RT0; \ + ldr RT2, [RTAB, RT2]; \ + and RT0, RMASK, rd, lsl#3; \ + ldr rc, [RTAB, rc]; \ + orr rnd, rnd, RT1, ror #24; \ + and RT1, RMASK, rd, lsr#(8 - 3); \ + orr rna, rna, RT2, ror #16; \ + ldr RT0, [RTAB, RT0]; \ + and RT2, RMASK, rd, lsr#(16 - 3); \ + ldr RT1, [RTAB, RT1]; \ + orr rnb, rnb, rc, ror #8; \ + ldr RT2, [RTAB, RT2]; \ + and rd, RMASK, rd, lsr#(24 - 3); \ + ldr rd, [RTAB, rd]; \ + \ + orr rnd, rnd, RT0; \ + orr rna, rna, RT1, ror #24; \ + orr rnb, rnb, RT2, ror #16; \ + orr rnc, rnc, rd, ror #8; + +#define firstdecround(round, ra, rb, rc, rd, rna, rnb, rnc, rnd) \ + addroundkey_dec(((round) + 1), ra, rb, rc, rd, rna, rnb, rnc, rnd); \ + do_decround(round, ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_first_key); + +#define decround(round, ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_key) \ + do_decround(round, ra, rb, rc, rd, rna, rnb, rnc, rnd, preload_key); + +#define lastdecround(round, ra, rb, rc, rd, rna, rnb, rnc, rnd) \ + add RTAB, #4; \ + do_lastdecround(ra, rb, rc, rd, rna, rnb, rnc, rnd); \ + addroundkey(rna, rnb, rnc, rnd, ra, rb, rc, rd, dummy); + +.align 3 +.global _gcry_aes_armv6_decrypt_block +.type _gcry_aes_armv6_decrypt_block,%function; + 
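+/* Decryption mirrors the encrypt path above: same register usage, but
+ * the key schedule is walked backwards and lookups go through the
+ * inverse tables (.LtableD0/.LtableDs0). */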
+_gcry_aes_armv6_decrypt_block:
+	/* input:
+	 *	%r0: keysched, CTX
+	 *	%r1: dst
+	 *	%r2: src
+	 *	%r3: number of rounds.. 10, 12 or 14
+	 */
+	push {%r4-%r11, %ip, %lr};
+
+	/* read input block */
+#ifndef __ARM_FEATURE_UNALIGNED
+	/* test if src is unaligned */
+	tst %r2, #3;
+	beq 1f;
+
+	/* unaligned load */
+	ldr_unaligned_le(RA, %r2, 0, RNA);
+	ldr_unaligned_le(RB, %r2, 4, RNB);
+	ldr_unaligned_le(RC, %r2, 8, RNA);
+	ldr_unaligned_le(RD, %r2, 12, RNB);
+	b 2f;
+.ltorg
+1:
+#endif
+	/* aligned load */
+	ldm %r2, {RA, RB, RC, RD};
+#ifndef __ARMEL__
+	rev RA, RA;
+	rev RB, RB;
+	rev RC, RC;
+	rev RD, RD;
+#endif
+2:
+	sub %sp, #16;
+
+	ldr RTAB, =.LtableD0;
+
+	mov RMASK, #0xff;
+	str %r1, [%sp, #4];	/* dst */
+	mov RMASK, RMASK, lsl#3;	/* byte mask */
+
+	cmp %r3, #12;
+	bge .Ldec_256;
+
+	firstdecround(9, RA, RB, RC, RD, RNA, RNB, RNC, RND);
+.Ldec_tail:
+	decround(8, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+	decround(7, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
+	decround(6, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+	decround(5, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
+	decround(4, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+	decround(3, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
+	decround(2, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+	decround(1, RA, RB, RC, RD, RNA, RNB, RNC, RND, dummy);
+	lastdecround(0, RNA, RNB, RNC, RND, RA, RB, RC, RD);
+
+	ldr RT0, [%sp, #4];	/* dst */
+	add %sp, #16;
+
+	/* store output block */
+#ifndef __ARM_FEATURE_UNALIGNED
+	/* test if dst is unaligned */
+	tst RT0, #3;
+	beq 1f;
+
+	/* unaligned store */
+	str_unaligned_le(RA, RT0, 0, RNA, RNB);
+	str_unaligned_le(RB, RT0, 4, RNA, RNB);
+	str_unaligned_le(RC, RT0, 8, RNA, RNB);
+	str_unaligned_le(RD, RT0, 12, RNA, RNB);
+	b 2f;
+.ltorg
+1:
+#endif
+	/* aligned store */
+#ifndef __ARMEL__
+	rev RA, RA;
+	rev RB, RB;
+	rev RC, RC;
+	rev RD, RD;
+#endif
+	/* write output block */
+	stm RT0, {RA, RB, RC, RD};
+2:
+	pop {%r4-%r11, %ip, %pc};
+
+.ltorg
+.Ldec_256:
+	beq .Ldec_192;
+
+	firstdecround(13, RA, RB, RC, RD, RNA, RNB, RNC, RND);
+	decround(12, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+	decround(11, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
+	decround(10, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+	decround(9, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
+
+	b .Ldec_tail;
+
+.ltorg
+.Ldec_192:
+	firstdecround(11, RA, RB, RC, RD, RNA, RNB, RNC, RND);
+	decround(10, RNA, RNB, RNC, RND, RA, RB, RC, RD, preload_first_key);
+	decround(9, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key);
+
+	b .Ldec_tail;
+.size _gcry_aes_armv6_decrypt_block,.-_gcry_aes_armv6_decrypt_block;
+
+.data
+
+/* Encryption tables */
+.align 5
+.type .LtableE0, %object
+.type .LtableEs0, %object
+.LtableE0:
+.long 0xa56363c6
+.LtableEs0:
+.long 0x00000063, 0x847c7cf8, 0x0000007c
+.long 0x997777ee, 0x00000077, 0x8d7b7bf6, 0x0000007b
+.long 0x0df2f2ff, 0x000000f2, 0xbd6b6bd6, 0x0000006b
+.long 0xb16f6fde, 0x0000006f, 0x54c5c591, 0x000000c5
+.long 0x50303060, 0x00000030, 0x03010102, 0x00000001
+.long 0xa96767ce, 0x00000067, 0x7d2b2b56, 0x0000002b
+.long 0x19fefee7, 0x000000fe, 0x62d7d7b5, 0x000000d7
+.long 0xe6abab4d, 0x000000ab, 0x9a7676ec, 0x00000076
+.long 0x45caca8f, 0x000000ca, 0x9d82821f, 0x00000082
+.long 0x40c9c989, 0x000000c9, 0x877d7dfa, 0x0000007d
+.long 0x15fafaef, 0x000000fa, 0xeb5959b2, 0x00000059
+.long 0xc947478e, 0x00000047, 0x0bf0f0fb, 0x000000f0
+.long
0xecadad41, 0x000000ad, 0x67d4d4b3, 0x000000d4 +.long 0xfda2a25f, 0x000000a2, 0xeaafaf45, 0x000000af +.long 0xbf9c9c23, 0x0000009c, 0xf7a4a453, 0x000000a4 +.long 0x967272e4, 0x00000072, 0x5bc0c09b, 0x000000c0 +.long 0xc2b7b775, 0x000000b7, 0x1cfdfde1, 0x000000fd +.long 0xae93933d, 0x00000093, 0x6a26264c, 0x00000026 +.long 0x5a36366c, 0x00000036, 0x413f3f7e, 0x0000003f +.long 0x02f7f7f5, 0x000000f7, 0x4fcccc83, 0x000000cc +.long 0x5c343468, 0x00000034, 0xf4a5a551, 0x000000a5 +.long 0x34e5e5d1, 0x000000e5, 0x08f1f1f9, 0x000000f1 +.long 0x937171e2, 0x00000071, 0x73d8d8ab, 0x000000d8 +.long 0x53313162, 0x00000031, 0x3f15152a, 0x00000015 +.long 0x0c040408, 0x00000004, 0x52c7c795, 0x000000c7 +.long 0x65232346, 0x00000023, 0x5ec3c39d, 0x000000c3 +.long 0x28181830, 0x00000018, 0xa1969637, 0x00000096 +.long 0x0f05050a, 0x00000005, 0xb59a9a2f, 0x0000009a +.long 0x0907070e, 0x00000007, 0x36121224, 0x00000012 +.long 0x9b80801b, 0x00000080, 0x3de2e2df, 0x000000e2 +.long 0x26ebebcd, 0x000000eb, 0x6927274e, 0x00000027 +.long 0xcdb2b27f, 0x000000b2, 0x9f7575ea, 0x00000075 +.long 0x1b090912, 0x00000009, 0x9e83831d, 0x00000083 +.long 0x742c2c58, 0x0000002c, 0x2e1a1a34, 0x0000001a +.long 0x2d1b1b36, 0x0000001b, 0xb26e6edc, 0x0000006e +.long 0xee5a5ab4, 0x0000005a, 0xfba0a05b, 0x000000a0 +.long 0xf65252a4, 0x00000052, 0x4d3b3b76, 0x0000003b +.long 0x61d6d6b7, 0x000000d6, 0xceb3b37d, 0x000000b3 +.long 0x7b292952, 0x00000029, 0x3ee3e3dd, 0x000000e3 +.long 0x712f2f5e, 0x0000002f, 0x97848413, 0x00000084 +.long 0xf55353a6, 0x00000053, 0x68d1d1b9, 0x000000d1 +.long 0x00000000, 0x00000000, 0x2cededc1, 0x000000ed +.long 0x60202040, 0x00000020, 0x1ffcfce3, 0x000000fc +.long 0xc8b1b179, 0x000000b1, 0xed5b5bb6, 0x0000005b +.long 0xbe6a6ad4, 0x0000006a, 0x46cbcb8d, 0x000000cb +.long 0xd9bebe67, 0x000000be, 0x4b393972, 0x00000039 +.long 0xde4a4a94, 0x0000004a, 0xd44c4c98, 0x0000004c +.long 0xe85858b0, 0x00000058, 0x4acfcf85, 0x000000cf +.long 0x6bd0d0bb, 0x000000d0, 0x2aefefc5, 0x000000ef +.long 0xe5aaaa4f, 0x000000aa, 0x16fbfbed, 0x000000fb +.long 0xc5434386, 0x00000043, 0xd74d4d9a, 0x0000004d +.long 0x55333366, 0x00000033, 0x94858511, 0x00000085 +.long 0xcf45458a, 0x00000045, 0x10f9f9e9, 0x000000f9 +.long 0x06020204, 0x00000002, 0x817f7ffe, 0x0000007f +.long 0xf05050a0, 0x00000050, 0x443c3c78, 0x0000003c +.long 0xba9f9f25, 0x0000009f, 0xe3a8a84b, 0x000000a8 +.long 0xf35151a2, 0x00000051, 0xfea3a35d, 0x000000a3 +.long 0xc0404080, 0x00000040, 0x8a8f8f05, 0x0000008f +.long 0xad92923f, 0x00000092, 0xbc9d9d21, 0x0000009d +.long 0x48383870, 0x00000038, 0x04f5f5f1, 0x000000f5 +.long 0xdfbcbc63, 0x000000bc, 0xc1b6b677, 0x000000b6 +.long 0x75dadaaf, 0x000000da, 0x63212142, 0x00000021 +.long 0x30101020, 0x00000010, 0x1affffe5, 0x000000ff +.long 0x0ef3f3fd, 0x000000f3, 0x6dd2d2bf, 0x000000d2 +.long 0x4ccdcd81, 0x000000cd, 0x140c0c18, 0x0000000c +.long 0x35131326, 0x00000013, 0x2fececc3, 0x000000ec +.long 0xe15f5fbe, 0x0000005f, 0xa2979735, 0x00000097 +.long 0xcc444488, 0x00000044, 0x3917172e, 0x00000017 +.long 0x57c4c493, 0x000000c4, 0xf2a7a755, 0x000000a7 +.long 0x827e7efc, 0x0000007e, 0x473d3d7a, 0x0000003d +.long 0xac6464c8, 0x00000064, 0xe75d5dba, 0x0000005d +.long 0x2b191932, 0x00000019, 0x957373e6, 0x00000073 +.long 0xa06060c0, 0x00000060, 0x98818119, 0x00000081 +.long 0xd14f4f9e, 0x0000004f, 0x7fdcdca3, 0x000000dc +.long 0x66222244, 0x00000022, 0x7e2a2a54, 0x0000002a +.long 0xab90903b, 0x00000090, 0x8388880b, 0x00000088 +.long 0xca46468c, 0x00000046, 0x29eeeec7, 0x000000ee +.long 0xd3b8b86b, 0x000000b8, 0x3c141428, 
0x00000014 +.long 0x79dedea7, 0x000000de, 0xe25e5ebc, 0x0000005e +.long 0x1d0b0b16, 0x0000000b, 0x76dbdbad, 0x000000db +.long 0x3be0e0db, 0x000000e0, 0x56323264, 0x00000032 +.long 0x4e3a3a74, 0x0000003a, 0x1e0a0a14, 0x0000000a +.long 0xdb494992, 0x00000049, 0x0a06060c, 0x00000006 +.long 0x6c242448, 0x00000024, 0xe45c5cb8, 0x0000005c +.long 0x5dc2c29f, 0x000000c2, 0x6ed3d3bd, 0x000000d3 +.long 0xefacac43, 0x000000ac, 0xa66262c4, 0x00000062 +.long 0xa8919139, 0x00000091, 0xa4959531, 0x00000095 +.long 0x37e4e4d3, 0x000000e4, 0x8b7979f2, 0x00000079 +.long 0x32e7e7d5, 0x000000e7, 0x43c8c88b, 0x000000c8 +.long 0x5937376e, 0x00000037, 0xb76d6dda, 0x0000006d +.long 0x8c8d8d01, 0x0000008d, 0x64d5d5b1, 0x000000d5 +.long 0xd24e4e9c, 0x0000004e, 0xe0a9a949, 0x000000a9 +.long 0xb46c6cd8, 0x0000006c, 0xfa5656ac, 0x00000056 +.long 0x07f4f4f3, 0x000000f4, 0x25eaeacf, 0x000000ea +.long 0xaf6565ca, 0x00000065, 0x8e7a7af4, 0x0000007a +.long 0xe9aeae47, 0x000000ae, 0x18080810, 0x00000008 +.long 0xd5baba6f, 0x000000ba, 0x887878f0, 0x00000078 +.long 0x6f25254a, 0x00000025, 0x722e2e5c, 0x0000002e +.long 0x241c1c38, 0x0000001c, 0xf1a6a657, 0x000000a6 +.long 0xc7b4b473, 0x000000b4, 0x51c6c697, 0x000000c6 +.long 0x23e8e8cb, 0x000000e8, 0x7cdddda1, 0x000000dd +.long 0x9c7474e8, 0x00000074, 0x211f1f3e, 0x0000001f +.long 0xdd4b4b96, 0x0000004b, 0xdcbdbd61, 0x000000bd +.long 0x868b8b0d, 0x0000008b, 0x858a8a0f, 0x0000008a +.long 0x907070e0, 0x00000070, 0x423e3e7c, 0x0000003e +.long 0xc4b5b571, 0x000000b5, 0xaa6666cc, 0x00000066 +.long 0xd8484890, 0x00000048, 0x05030306, 0x00000003 +.long 0x01f6f6f7, 0x000000f6, 0x120e0e1c, 0x0000000e +.long 0xa36161c2, 0x00000061, 0x5f35356a, 0x00000035 +.long 0xf95757ae, 0x00000057, 0xd0b9b969, 0x000000b9 +.long 0x91868617, 0x00000086, 0x58c1c199, 0x000000c1 +.long 0x271d1d3a, 0x0000001d, 0xb99e9e27, 0x0000009e +.long 0x38e1e1d9, 0x000000e1, 0x13f8f8eb, 0x000000f8 +.long 0xb398982b, 0x00000098, 0x33111122, 0x00000011 +.long 0xbb6969d2, 0x00000069, 0x70d9d9a9, 0x000000d9 +.long 0x898e8e07, 0x0000008e, 0xa7949433, 0x00000094 +.long 0xb69b9b2d, 0x0000009b, 0x221e1e3c, 0x0000001e +.long 0x92878715, 0x00000087, 0x20e9e9c9, 0x000000e9 +.long 0x49cece87, 0x000000ce, 0xff5555aa, 0x00000055 +.long 0x78282850, 0x00000028, 0x7adfdfa5, 0x000000df +.long 0x8f8c8c03, 0x0000008c, 0xf8a1a159, 0x000000a1 +.long 0x80898909, 0x00000089, 0x170d0d1a, 0x0000000d +.long 0xdabfbf65, 0x000000bf, 0x31e6e6d7, 0x000000e6 +.long 0xc6424284, 0x00000042, 0xb86868d0, 0x00000068 +.long 0xc3414182, 0x00000041, 0xb0999929, 0x00000099 +.long 0x772d2d5a, 0x0000002d, 0x110f0f1e, 0x0000000f +.long 0xcbb0b07b, 0x000000b0, 0xfc5454a8, 0x00000054 +.long 0xd6bbbb6d, 0x000000bb, 0x3a16162c, 0x00000016 + +/* Decryption tables */ +.align 5 +.type .LtableD0, %object +.type .LtableDs0, %object +.LtableD0: +.long 0x50a7f451 +.LtableDs0: +.long 0x00000052, 0x5365417e, 0x00000009 +.long 0xc3a4171a, 0x0000006a, 0x965e273a, 0x000000d5 +.long 0xcb6bab3b, 0x00000030, 0xf1459d1f, 0x00000036 +.long 0xab58faac, 0x000000a5, 0x9303e34b, 0x00000038 +.long 0x55fa3020, 0x000000bf, 0xf66d76ad, 0x00000040 +.long 0x9176cc88, 0x000000a3, 0x254c02f5, 0x0000009e +.long 0xfcd7e54f, 0x00000081, 0xd7cb2ac5, 0x000000f3 +.long 0x80443526, 0x000000d7, 0x8fa362b5, 0x000000fb +.long 0x495ab1de, 0x0000007c, 0x671bba25, 0x000000e3 +.long 0x980eea45, 0x00000039, 0xe1c0fe5d, 0x00000082 +.long 0x02752fc3, 0x0000009b, 0x12f04c81, 0x0000002f +.long 0xa397468d, 0x000000ff, 0xc6f9d36b, 0x00000087 +.long 0xe75f8f03, 0x00000034, 0x959c9215, 0x0000008e +.long 0xeb7a6dbf, 
0x00000043, 0xda595295, 0x00000044 +.long 0x2d83bed4, 0x000000c4, 0xd3217458, 0x000000de +.long 0x2969e049, 0x000000e9, 0x44c8c98e, 0x000000cb +.long 0x6a89c275, 0x00000054, 0x78798ef4, 0x0000007b +.long 0x6b3e5899, 0x00000094, 0xdd71b927, 0x00000032 +.long 0xb64fe1be, 0x000000a6, 0x17ad88f0, 0x000000c2 +.long 0x66ac20c9, 0x00000023, 0xb43ace7d, 0x0000003d +.long 0x184adf63, 0x000000ee, 0x82311ae5, 0x0000004c +.long 0x60335197, 0x00000095, 0x457f5362, 0x0000000b +.long 0xe07764b1, 0x00000042, 0x84ae6bbb, 0x000000fa +.long 0x1ca081fe, 0x000000c3, 0x942b08f9, 0x0000004e +.long 0x58684870, 0x00000008, 0x19fd458f, 0x0000002e +.long 0x876cde94, 0x000000a1, 0xb7f87b52, 0x00000066 +.long 0x23d373ab, 0x00000028, 0xe2024b72, 0x000000d9 +.long 0x578f1fe3, 0x00000024, 0x2aab5566, 0x000000b2 +.long 0x0728ebb2, 0x00000076, 0x03c2b52f, 0x0000005b +.long 0x9a7bc586, 0x000000a2, 0xa50837d3, 0x00000049 +.long 0xf2872830, 0x0000006d, 0xb2a5bf23, 0x0000008b +.long 0xba6a0302, 0x000000d1, 0x5c8216ed, 0x00000025 +.long 0x2b1ccf8a, 0x00000072, 0x92b479a7, 0x000000f8 +.long 0xf0f207f3, 0x000000f6, 0xa1e2694e, 0x00000064 +.long 0xcdf4da65, 0x00000086, 0xd5be0506, 0x00000068 +.long 0x1f6234d1, 0x00000098, 0x8afea6c4, 0x00000016 +.long 0x9d532e34, 0x000000d4, 0xa055f3a2, 0x000000a4 +.long 0x32e18a05, 0x0000005c, 0x75ebf6a4, 0x000000cc +.long 0x39ec830b, 0x0000005d, 0xaaef6040, 0x00000065 +.long 0x069f715e, 0x000000b6, 0x51106ebd, 0x00000092 +.long 0xf98a213e, 0x0000006c, 0x3d06dd96, 0x00000070 +.long 0xae053edd, 0x00000048, 0x46bde64d, 0x00000050 +.long 0xb58d5491, 0x000000fd, 0x055dc471, 0x000000ed +.long 0x6fd40604, 0x000000b9, 0xff155060, 0x000000da +.long 0x24fb9819, 0x0000005e, 0x97e9bdd6, 0x00000015 +.long 0xcc434089, 0x00000046, 0x779ed967, 0x00000057 +.long 0xbd42e8b0, 0x000000a7, 0x888b8907, 0x0000008d +.long 0x385b19e7, 0x0000009d, 0xdbeec879, 0x00000084 +.long 0x470a7ca1, 0x00000090, 0xe90f427c, 0x000000d8 +.long 0xc91e84f8, 0x000000ab, 0x00000000, 0x00000000 +.long 0x83868009, 0x0000008c, 0x48ed2b32, 0x000000bc +.long 0xac70111e, 0x000000d3, 0x4e725a6c, 0x0000000a +.long 0xfbff0efd, 0x000000f7, 0x5638850f, 0x000000e4 +.long 0x1ed5ae3d, 0x00000058, 0x27392d36, 0x00000005 +.long 0x64d90f0a, 0x000000b8, 0x21a65c68, 0x000000b3 +.long 0xd1545b9b, 0x00000045, 0x3a2e3624, 0x00000006 +.long 0xb1670a0c, 0x000000d0, 0x0fe75793, 0x0000002c +.long 0xd296eeb4, 0x0000001e, 0x9e919b1b, 0x0000008f +.long 0x4fc5c080, 0x000000ca, 0xa220dc61, 0x0000003f +.long 0x694b775a, 0x0000000f, 0x161a121c, 0x00000002 +.long 0x0aba93e2, 0x000000c1, 0xe52aa0c0, 0x000000af +.long 0x43e0223c, 0x000000bd, 0x1d171b12, 0x00000003 +.long 0x0b0d090e, 0x00000001, 0xadc78bf2, 0x00000013 +.long 0xb9a8b62d, 0x0000008a, 0xc8a91e14, 0x0000006b +.long 0x8519f157, 0x0000003a, 0x4c0775af, 0x00000091 +.long 0xbbdd99ee, 0x00000011, 0xfd607fa3, 0x00000041 +.long 0x9f2601f7, 0x0000004f, 0xbcf5725c, 0x00000067 +.long 0xc53b6644, 0x000000dc, 0x347efb5b, 0x000000ea +.long 0x7629438b, 0x00000097, 0xdcc623cb, 0x000000f2 +.long 0x68fcedb6, 0x000000cf, 0x63f1e4b8, 0x000000ce +.long 0xcadc31d7, 0x000000f0, 0x10856342, 0x000000b4 +.long 0x40229713, 0x000000e6, 0x2011c684, 0x00000073 +.long 0x7d244a85, 0x00000096, 0xf83dbbd2, 0x000000ac +.long 0x1132f9ae, 0x00000074, 0x6da129c7, 0x00000022 +.long 0x4b2f9e1d, 0x000000e7, 0xf330b2dc, 0x000000ad +.long 0xec52860d, 0x00000035, 0xd0e3c177, 0x00000085 +.long 0x6c16b32b, 0x000000e2, 0x99b970a9, 0x000000f9 +.long 0xfa489411, 0x00000037, 0x2264e947, 0x000000e8 +.long 0xc48cfca8, 0x0000001c, 0x1a3ff0a0, 0x00000075 +.long 
0xd82c7d56, 0x000000df, 0xef903322, 0x0000006e
+.long 0xc74e4987, 0x00000047, 0xc1d138d9, 0x000000f1
+.long 0xfea2ca8c, 0x0000001a, 0x360bd498, 0x00000071
+.long 0xcf81f5a6, 0x0000001d, 0x28de7aa5, 0x00000029
+.long 0x268eb7da, 0x000000c5, 0xa4bfad3f, 0x00000089
+.long 0xe49d3a2c, 0x0000006f, 0x0d927850, 0x000000b7
+.long 0x9bcc5f6a, 0x00000062, 0x62467e54, 0x0000000e
+.long 0xc2138df6, 0x000000aa, 0xe8b8d890, 0x00000018
+.long 0x5ef7392e, 0x000000be, 0xf5afc382, 0x0000001b
+.long 0xbe805d9f, 0x000000fc, 0x7c93d069, 0x00000056
+.long 0xa92dd56f, 0x0000003e, 0xb31225cf, 0x0000004b
+.long 0x3b99acc8, 0x000000c6, 0xa77d1810, 0x000000d2
+.long 0x6e639ce8, 0x00000079, 0x7bbb3bdb, 0x00000020
+.long 0x097826cd, 0x0000009a, 0xf418596e, 0x000000db
+.long 0x01b79aec, 0x000000c0, 0xa89a4f83, 0x000000fe
+.long 0x656e95e6, 0x00000078, 0x7ee6ffaa, 0x000000cd
+.long 0x08cfbc21, 0x0000005a, 0xe6e815ef, 0x000000f4
+.long 0xd99be7ba, 0x0000001f, 0xce366f4a, 0x000000dd
+.long 0xd4099fea, 0x000000a8, 0xd67cb029, 0x00000033
+.long 0xafb2a431, 0x00000088, 0x31233f2a, 0x00000007
+.long 0x3094a5c6, 0x000000c7, 0xc066a235, 0x00000031
+.long 0x37bc4e74, 0x000000b1, 0xa6ca82fc, 0x00000012
+.long 0xb0d090e0, 0x00000010, 0x15d8a733, 0x00000059
+.long 0x4a9804f1, 0x00000027, 0xf7daec41, 0x00000080
+.long 0x0e50cd7f, 0x000000ec, 0x2ff69117, 0x0000005f
+.long 0x8dd64d76, 0x00000060, 0x4db0ef43, 0x00000051
+.long 0x544daacc, 0x0000007f, 0xdf0496e4, 0x000000a9
+.long 0xe3b5d19e, 0x00000019, 0x1b886a4c, 0x000000b5
+.long 0xb81f2cc1, 0x0000004a, 0x7f516546, 0x0000000d
+.long 0x04ea5e9d, 0x0000002d, 0x5d358c01, 0x000000e5
+.long 0x737487fa, 0x0000007a, 0x2e410bfb, 0x0000009f
+.long 0x5a1d67b3, 0x00000093, 0x52d2db92, 0x000000c9
+.long 0x335610e9, 0x0000009c, 0x1347d66d, 0x000000ef
+.long 0x8c61d79a, 0x000000a0, 0x7a0ca137, 0x000000e0
+.long 0x8e14f859, 0x0000003b, 0x893c13eb, 0x0000004d
+.long 0xee27a9ce, 0x000000ae, 0x35c961b7, 0x0000002a
+.long 0xede51ce1, 0x000000f5, 0x3cb1477a, 0x000000b0
+.long 0x59dfd29c, 0x000000c8, 0x3f73f255, 0x000000eb
+.long 0x79ce1418, 0x000000bb, 0xbf37c773, 0x0000003c
+.long 0xeacdf753, 0x00000083, 0x5baafd5f, 0x00000053
+.long 0x146f3ddf, 0x00000099, 0x86db4478, 0x00000061
+.long 0x81f3afca, 0x00000017, 0x3ec468b9, 0x0000002b
+.long 0x2c342438, 0x00000004, 0x5f40a3c2, 0x0000007e
+.long 0x72c31d16, 0x000000ba, 0x0c25e2bc, 0x00000077
+.long 0x8b493c28, 0x000000d6, 0x41950dff, 0x00000026
+.long 0x7101a839, 0x000000e1, 0xdeb30c08, 0x00000069
+.long 0x9ce4b4d8, 0x00000014, 0x90c15664, 0x00000063
+.long 0x6184cb7b, 0x00000055, 0x70b632d5, 0x00000021
+.long 0x745c6c48, 0x0000000c, 0x4257b8d0, 0x0000007d
+
+#endif /*HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS*/
+#endif /*__ARM_ARCH >= 6*/
diff --git a/cipher/rijndael.c b/cipher/rijndael.c
index 35e599a..8855e0c 100644
--- a/cipher/rijndael.c
+++ b/cipher/rijndael.c
@@ -67,6 +67,21 @@
 # define USE_AMD64_ASM 1
 #endif
 
+/* USE_ARMV6_ASM indicates whether to use ARMv6 assembly code.
*/ +#undef USE_ARMV6_ASM +#if defined(__arm__) && defined(__ARMEL__) && \ + ((defined(__ARM_ARCH) && __ARM_ARCH >= 6) \ + || defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) \ + || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) \ + || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6T2__) \ + || defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) \ + || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) \ + || defined(__ARM_ARCH_7EM__)) +# ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS +# define USE_ARMV6_ASM 1 +# endif +#endif + /* USE_PADLOCK indicates whether to compile the padlock specific code. */ #undef USE_PADLOCK @@ -115,6 +130,19 @@ extern void _gcry_aes_amd64_decrypt_block(const void *keysched_dec, int rounds); #endif /*USE_AMD64_ASM*/ +#ifdef USE_ARMV6_ASM +/* ARMv6 assembly implementations of AES */ +extern void _gcry_aes_armv6_encrypt_block(const void *keysched_enc, + unsigned char *out, + const unsigned char *in, + int rounds); + +extern void _gcry_aes_armv6_decrypt_block(const void *keysched_dec, + unsigned char *out, + const unsigned char *in, + int rounds); +#endif /*USE_ARMV6_ASM*/ + /* Our context object. */ @@ -546,7 +574,9 @@ do_encrypt_aligned (const RIJNDAEL_context *ctx, { #ifdef USE_AMD64_ASM _gcry_aes_amd64_encrypt_block(ctx->keyschenc, b, a, ctx->rounds); -#else /*!USE_AMD64_ASM*/ +#elif defined(USE_ARMV6_ASM) + _gcry_aes_armv6_encrypt_block(ctx->keyschenc, b, a, ctx->rounds); +#else #define rk (ctx->keyschenc) int rounds = ctx->rounds; int r; @@ -628,7 +658,7 @@ do_encrypt_aligned (const RIJNDAEL_context *ctx, *((u32_a_t*)(b+ 8)) ^= *((u32_a_t*)rk[rounds][2]); *((u32_a_t*)(b+12)) ^= *((u32_a_t*)rk[rounds][3]); #undef rk -#endif /*!USE_AMD64_ASM*/ +#endif /*!USE_AMD64_ASM && !USE_ARMV6_ASM*/ } @@ -636,7 +666,7 @@ static void do_encrypt (const RIJNDAEL_context *ctx, unsigned char *bx, const unsigned char *ax) { -#ifndef USE_AMD64_ASM +#if !defined(USE_AMD64_ASM) && !defined(USE_ARMV6_ASM) /* BX and AX are not necessary correctly aligned. Thus we might need to copy them here. We try to align to a 16 bytes. */ if (((size_t)ax & 0x0f) || ((size_t)bx & 0x0f)) @@ -657,7 +687,7 @@ do_encrypt (const RIJNDAEL_context *ctx, memcpy (bx, b.b, 16); } else -#endif /*!USE_AMD64_ASM*/ +#endif /*!USE_AMD64_ASM && !USE_ARMV6_ASM*/ { do_encrypt_aligned (ctx, bx, ax); } @@ -1667,7 +1697,9 @@ do_decrypt_aligned (RIJNDAEL_context *ctx, { #ifdef USE_AMD64_ASM _gcry_aes_amd64_decrypt_block(ctx->keyschdec, b, a, ctx->rounds); -#else /*!USE_AMD64_ASM*/ +#elif defined(USE_ARMV6_ASM) + _gcry_aes_armv6_decrypt_block(ctx->keyschdec, b, a, ctx->rounds); +#else #define rk (ctx->keyschdec) int rounds = ctx->rounds; int r; @@ -1750,7 +1782,7 @@ do_decrypt_aligned (RIJNDAEL_context *ctx, *((u32_a_t*)(b+ 8)) ^= *((u32_a_t*)rk[0][2]); *((u32_a_t*)(b+12)) ^= *((u32_a_t*)rk[0][3]); #undef rk -#endif /*!USE_AMD64_ASM*/ +#endif /*!USE_AMD64_ASM && !USE_ARMV6_ASM*/ } @@ -1765,7 +1797,7 @@ do_decrypt (RIJNDAEL_context *ctx, byte *bx, const byte *ax) ctx->decryption_prepared = 1; } -#ifndef USE_AMD64_ASM +#if !defined(USE_AMD64_ASM) && !defined(USE_ARMV6_ASM) /* BX and AX are not necessary correctly aligned. Thus we might need to copy them here. We try to align to a 16 bytes. 
 */
   if (((size_t)ax & 0x0f) || ((size_t)bx & 0x0f))
@@ -1786,7 +1818,7 @@ do_decrypt (RIJNDAEL_context *ctx, byte *bx, const byte *ax)
       memcpy (bx, b.b, 16);
     }
   else
-#endif /*!USE_AMD64_ASM*/
+#endif /*!USE_AMD64_ASM && !USE_ARMV6_ASM*/
     {
       do_decrypt_aligned (ctx, bx, ax);
     }
diff --git a/configure.ac b/configure.ac
index 06c0b79..04dee7e 100644
--- a/configure.ac
+++ b/configure.ac
@@ -964,6 +964,34 @@ if test "$gcry_cv_gcc_amd64_platform_as_ok" = "yes" ; then
 fi
 
 
+#
+# Check whether GCC assembler supports features needed for our ARM
+# implementations
+#
+AC_CACHE_CHECK([whether GCC assembler is compatible for ARM assembly implementations],
+       [gcry_cv_gcc_arm_platform_as_ok],
+       [gcry_cv_gcc_arm_platform_as_ok=no
+        AC_COMPILE_IFELSE([AC_LANG_SOURCE(
+          [[__asm__(
+                /* Test if assembler supports UAL syntax. */
+                ".syntax unified\n\t"
+                ".thumb\n\t" /* thumb-2 in UAL, thumb-1 otherwise. */
+                ".code 16\n\t"
+                /* Following causes error if assembler ignored '.syntax unified'. */
+                "asmfunc:\n\t"
+                "add.w %r0, %r4, %r8, ror #12;\n\t"
+
+                /* Test if '.type' and '.size' are supported. */
+                ".size asmfunc,.-asmfunc;\n\t"
+                ".type asmfunc,%function;\n\t"
+          );]])],
+          [gcry_cv_gcc_arm_platform_as_ok=yes])])
+if test "$gcry_cv_gcc_arm_platform_as_ok" = "yes" ; then
+   AC_DEFINE(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS,1,
+     [Defined if underlying assembler is compatible with ARM assembly implementations])
+fi
+
+
 #######################################
 #### Checks for library functions. ####
 #######################################
@@ -1284,6 +1312,10 @@ if test "$found" = "1" ; then
          # Build with the assembly implementation
          GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-amd64.lo"
       ;;
+      arm*-*-*)
+         # Build with the assembly implementation
+         GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-armv6.lo"
+      ;;
    esac
 fi


From jussi.kivilinna at iki.fi  Fri Aug 16 15:05:42 2013
From: jussi.kivilinna at iki.fi (Jussi Kivilinna)
Date: Fri, 16 Aug 2013 16:05:42 +0300
Subject: [PATCH 1/2] blowfish: add ARMv6 assembly implementation
Message-ID: <20130816130542.23638.85628.stgit@localhost6.localdomain6>

* cipher/Makefile.am: Add 'blowfish-armv6.S'.
* cipher/blowfish-armv6.S: New file.
* cipher/blowfish.c (USE_ARMV6_ASM): New macro.
[USE_ARMV6_ASM] (_gcry_blowfish_armv6_do_encrypt)
(_gcry_blowfish_armv6_encrypt_block)
(_gcry_blowfish_armv6_decrypt_block, _gcry_blowfish_armv6_ctr_enc)
(_gcry_blowfish_armv6_cbc_dec, _gcry_blowfish_armv6_cfb_dec): New
prototypes.
[USE_ARMV6_ASM] (do_encrypt, do_encrypt_block, do_decrypt_block)
(encrypt_block, decrypt_block): New functions.
(_gcry_blowfish_ctr_enc) [USE_ARMV6_ASM]: Use ARMv6 assembly function.
(_gcry_blowfish_cbc_dec) [USE_ARMV6_ASM]: Use ARMv6 assembly function.
(_gcry_blowfish_cfb_dec) [USE_ARMV6_ASM]: Use ARMv6 assembly function.
* configure.ac (blowfish) [arm]: Add 'blowfish-armv6.lo'.
--

The patch provides non-parallel implementations for a small speed-up,
and 2-way parallel implementations that get accelerated on multi-issue
CPUs (hand-tuned for the in-order dual-issue Cortex-A8).  Unaligned
access handling is done in assembly.  For now, this is only enabled on
little-endian systems, as big-endian correctness has not been tested
yet.
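As an illustration (not part of the patch), the new 2-way entry points
plug into the bulk helpers in blowfish.c with the same 2-block-chunk
pattern as the CAST5 bulk functions earlier in this series.  Roughly,
for the USE_ARMV6_ASM branch of the CTR path (BLOWFISH_BLOCKSIZE is 8,
and leftover blocks fall through to the generic per-block code):

  while (nblocks >= 2)
    {
      /* Encrypt two counter blocks and XOR them into the input. */
      _gcry_blowfish_armv6_ctr_enc(ctx, outbuf, inbuf, ctr);

      nblocks -= 2;
      outbuf += 2 * BLOWFISH_BLOCKSIZE;
      inbuf += 2 * BLOWFISH_BLOCKSIZE;
    }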
Old vs new (Cortex-A8, Debian Wheezy/armhf): ECB/Stream CBC CFB OFB CTR --------------- --------------- --------------- --------------- --------------- BLOWFISH 1.28x 1.16x 1.21x 2.16x 1.26x 1.86x 1.21x 1.25x 1.89x 1.96x Signed-off-by: Jussi Kivilinna --- cipher/blowfish-armv6.S | 737 +++++++++++++++++++++++++++++++++++++++++++++++ cipher/blowfish.c | 115 +++++++ configure.ac | 4 3 files changed, 854 insertions(+), 2 deletions(-) create mode 100644 cipher/blowfish-armv6.S diff --git a/cipher/blowfish-armv6.S b/cipher/blowfish-armv6.S new file mode 100644 index 0000000..b11d27f --- /dev/null +++ b/cipher/blowfish-armv6.S @@ -0,0 +1,737 @@ +/* blowfish-armv6.S - ARM assembly implementation of Blowfish cipher + * + * Copyright ? 2013 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . + */ + +#include + +#if defined(__arm__) && defined(__ARMEL__) && \ + ((defined(__ARM_ARCH) && __ARM_ARCH >= 6) \ + || defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) \ + || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) \ + || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6T2__) \ + || defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) \ + || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) \ + || defined(__ARM_ARCH_7EM__)) +#ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS + +.text + +.syntax unified +.arm + +/* structure of crypto context */ +#define s0 0 +#define s1 (s0 + (1 * 256) * 4) +#define s2 (s0 + (2 * 256) * 4) +#define s3 (s0 + (3 * 256) * 4) +#define p (s3 + (1 * 256) * 4) + +/* register macros */ +#define CTXs0 %r0 +#define CTXs1 %r9 +#define CTXs2 %r8 +#define CTXs3 %r10 +#define RMASK %lr +#define RKEYL %r2 +#define RKEYR %ip + +#define RL0 %r3 +#define RR0 %r4 + +#define RL1 %r9 +#define RR1 %r10 + +#define RT0 %r11 +#define RT1 %r7 +#define RT2 %r5 +#define RT3 %r6 + +/* helper macros */ +#define ldr_unaligned_le(rout, rsrc, offs, rtmp) \ + ldrb rout, [rsrc, #((offs) + 0)]; \ + ldrb rtmp, [rsrc, #((offs) + 1)]; \ + orr rout, rout, rtmp, lsl #8; \ + ldrb rtmp, [rsrc, #((offs) + 2)]; \ + orr rout, rout, rtmp, lsl #16; \ + ldrb rtmp, [rsrc, #((offs) + 3)]; \ + orr rout, rout, rtmp, lsl #24; + +#define str_unaligned_le(rin, rdst, offs, rtmp0, rtmp1) \ + mov rtmp0, rin, lsr #8; \ + strb rin, [rdst, #((offs) + 0)]; \ + mov rtmp1, rin, lsr #16; \ + strb rtmp0, [rdst, #((offs) + 1)]; \ + mov rtmp0, rin, lsr #24; \ + strb rtmp1, [rdst, #((offs) + 2)]; \ + strb rtmp0, [rdst, #((offs) + 3)]; + +#define ldr_unaligned_be(rout, rsrc, offs, rtmp) \ + ldrb rout, [rsrc, #((offs) + 3)]; \ + ldrb rtmp, [rsrc, #((offs) + 2)]; \ + orr rout, rout, rtmp, lsl #8; \ + ldrb rtmp, [rsrc, #((offs) + 1)]; \ + orr rout, rout, rtmp, lsl #16; \ + ldrb rtmp, [rsrc, #((offs) + 0)]; \ + orr rout, rout, rtmp, lsl #24; + +#define str_unaligned_be(rin, rdst, offs, rtmp0, rtmp1) \ + mov rtmp0, rin, lsr #8; \ + strb rin, [rdst, #((offs) + 3)]; \ + mov rtmp1, rin, lsr #16; \ + strb 
rtmp0, [rdst, #((offs) + 2)]; \ + mov rtmp0, rin, lsr #24; \ + strb rtmp1, [rdst, #((offs) + 1)]; \ + strb rtmp0, [rdst, #((offs) + 0)]; + +#ifdef __ARMEL__ + #define ldr_unaligned_host ldr_unaligned_le + #define str_unaligned_host str_unaligned_le + + /* bswap on little-endian */ + #define host_to_be(reg) \ + rev reg, reg; + #define be_to_host(reg) \ + rev reg, reg; +#else + #define ldr_unaligned_host ldr_unaligned_be + #define str_unaligned_host str_unaligned_be + + /* nop on big-endian */ + #define host_to_be(reg) /*_*/ + #define be_to_host(reg) /*_*/ +#endif + +#define host_to_host(x) /*_*/ + +/*********************************************************************** + * 1-way blowfish + ***********************************************************************/ +#define F(l, r) \ + and RT0, RMASK, l, lsr#(24 - 2); \ + and RT1, RMASK, l, lsr#(16 - 2); \ + ldr RT0, [CTXs0, RT0]; \ + and RT2, RMASK, l, lsr#(8 - 2); \ + ldr RT1, [CTXs1, RT1]; \ + and RT3, RMASK, l, lsl#2; \ + ldr RT2, [CTXs2, RT2]; \ + add RT0, RT1; \ + ldr RT3, [CTXs3, RT3]; \ + eor RT0, RT2; \ + add RT0, RT3; \ + eor r, RT0; + +#define load_roundkey_enc(n) \ + ldr RKEYL, [CTXs2, #((p - s2) + (4 * (n) + 0))]; \ + ldr RKEYR, [CTXs2, #((p - s2) + (4 * (n) + 4))]; + +#define add_roundkey_enc() \ + eor RL0, RKEYL; \ + eor RR0, RKEYR; + +#define round_enc(n) \ + add_roundkey_enc(); \ + load_roundkey_enc(n); \ + \ + F(RL0, RR0); \ + F(RR0, RL0); + +#define load_roundkey_dec(n) \ + ldr RKEYL, [CTXs2, #((p - s2) + (4 * ((n) - 1) + 4))]; \ + ldr RKEYR, [CTXs2, #((p - s2) + (4 * ((n) - 1) + 0))]; + +#define add_roundkey_dec() \ + eor RL0, RKEYL; \ + eor RR0, RKEYR; + +#define round_dec(n) \ + add_roundkey_dec(); \ + load_roundkey_dec(n); \ + \ + F(RL0, RR0); \ + F(RR0, RL0); + +#define read_block_aligned(rin, offs, l0, r0, convert) \ + ldr l0, [rin, #((offs) + 0)]; \ + ldr r0, [rin, #((offs) + 4)]; \ + convert(l0); \ + convert(r0); + +#define write_block_aligned(rout, offs, l0, r0, convert) \ + convert(l0); \ + convert(r0); \ + str l0, [rout, #((offs) + 0)]; \ + str r0, [rout, #((offs) + 4)]; + +#ifdef __ARM_FEATURE_UNALIGNED + /* unaligned word reads allowed */ + #define read_block(rin, offs, l0, r0, rtmp0) \ + read_block_aligned(rin, offs, l0, r0, host_to_be) + + #define write_block(rout, offs, r0, l0, rtmp0, rtmp1) \ + write_block_aligned(rout, offs, r0, l0, be_to_host) + + #define read_block_host(rin, offs, l0, r0, rtmp0) \ + read_block_aligned(rin, offs, l0, r0, host_to_host) + + #define write_block_host(rout, offs, r0, l0, rtmp0, rtmp1) \ + write_block_aligned(rout, offs, r0, l0, host_to_host) +#else + /* need to handle unaligned reads by byte reads */ + #define read_block(rin, offs, l0, r0, rtmp0) \ + tst rin, #3; \ + beq 1f; \ + ldr_unaligned_be(l0, rin, (offs) + 0, rtmp0); \ + ldr_unaligned_be(r0, rin, (offs) + 4, rtmp0); \ + b 2f; \ + 1:;\ + read_block_aligned(rin, offs, l0, r0, host_to_be); \ + 2:; + + #define write_block(rout, offs, l0, r0, rtmp0, rtmp1) \ + tst rout, #3; \ + beq 1f; \ + str_unaligned_be(l0, rout, (offs) + 0, rtmp0, rtmp1); \ + str_unaligned_be(r0, rout, (offs) + 4, rtmp0, rtmp1); \ + b 2f; \ + 1:;\ + write_block_aligned(rout, offs, l0, r0, be_to_host); \ + 2:; + + #define read_block_host(rin, offs, l0, r0, rtmp0) \ + tst rin, #3; \ + beq 1f; \ + ldr_unaligned_host(l0, rin, (offs) + 0, rtmp0); \ + ldr_unaligned_host(r0, rin, (offs) + 4, rtmp0); \ + b 2f; \ + 1:;\ + read_block_aligned(rin, offs, l0, r0, host_to_host); \ + 2:; + + #define write_block_host(rout, offs, l0, r0, rtmp0, rtmp1) \ + tst rout, #3; \ 
+ beq 1f; \ + str_unaligned_host(l0, rout, (offs) + 0, rtmp0, rtmp1); \ + str_unaligned_host(r0, rout, (offs) + 4, rtmp0, rtmp1); \ + b 2f; \ + 1:;\ + write_block_aligned(rout, offs, l0, r0, host_to_host); \ + 2:; +#endif + +.align 3 +.type __blowfish_enc_blk1,%function; + +__blowfish_enc_blk1: + /* input: + * preloaded: CTX + * [RL0, RR0]: src + * output: + * [RR0, RL0]: dst + */ + push {%lr}; + + add CTXs1, CTXs0, #(s1 - s0); + add CTXs2, CTXs0, #(s2 - s0); + mov RMASK, #(0xff << 2); /* byte mask */ + add CTXs3, CTXs1, #(s3 - s1); + + load_roundkey_enc(0); + round_enc(2); + round_enc(4); + round_enc(6); + round_enc(8); + round_enc(10); + round_enc(12); + round_enc(14); + round_enc(16); + add_roundkey_enc(); + + pop {%pc}; +.size __blowfish_enc_blk1,.-__blowfish_enc_blk1; + +.align 8 +.globl _gcry_blowfish_armv6_do_encrypt +.type _gcry_blowfish_armv6_do_encrypt,%function; + +_gcry_blowfish_armv6_do_encrypt: + /* input: + * %r0: ctx, CTX + * %r1: u32 *ret_xl + * %r2: u32 *ret_xr + */ + push {%r2, %r4-%r11, %ip, %lr}; + + ldr RL0, [%r1]; + ldr RR0, [%r2]; + + bl __blowfish_enc_blk1; + + pop {%r2}; + str RR0, [%r1]; + str RL0, [%r2]; + + pop {%r4-%r11, %ip, %pc}; +.size _gcry_blowfish_armv6_do_encrypt,.-_gcry_blowfish_armv6_do_encrypt; + +.align 3 +.global _gcry_blowfish_armv6_encrypt_block +.type _gcry_blowfish_armv6_encrypt_block,%function; + +_gcry_blowfish_armv6_encrypt_block: + /* input: + * %r0: ctx, CTX + * %r1: dst + * %r2: src + */ + push {%r4-%r11, %ip, %lr}; + + read_block(%r2, 0, RL0, RR0, RT0); + + bl __blowfish_enc_blk1; + + write_block(%r1, 0, RR0, RL0, RT0, RT1); + + pop {%r4-%r11, %ip, %pc}; +.size _gcry_blowfish_armv6_encrypt_block,.-_gcry_blowfish_armv6_encrypt_block; + +.align 3 +.global _gcry_blowfish_armv6_decrypt_block +.type _gcry_blowfish_armv6_decrypt_block,%function; + +_gcry_blowfish_armv6_decrypt_block: + /* input: + * %r0: ctx, CTX + * %r1: dst + * %r2: src + */ + push {%r4-%r11, %ip, %lr}; + + add CTXs1, CTXs0, #(s1 - s0); + add CTXs2, CTXs0, #(s2 - s0); + mov RMASK, #(0xff << 2); /* byte mask */ + add CTXs3, CTXs1, #(s3 - s1); + + read_block(%r2, 0, RL0, RR0, RT0); + + load_roundkey_dec(17); + round_dec(15); + round_dec(13); + round_dec(11); + round_dec(9); + round_dec(7); + round_dec(5); + round_dec(3); + round_dec(1); + add_roundkey_dec(); + + write_block(%r1, 0, RR0, RL0, RT0, RT1); + + pop {%r4-%r11, %ip, %pc}; +.size _gcry_blowfish_armv6_decrypt_block,.-_gcry_blowfish_armv6_decrypt_block; + +/*********************************************************************** + * 2-way blowfish + ***********************************************************************/ +#define F2(n, l0, r0, l1, r1, set_nextk, dec) \ + \ + and RT0, RMASK, l0, lsr#(24 - 2); \ + and RT1, RMASK, l0, lsr#(16 - 2); \ + and RT2, RMASK, l0, lsr#(8 - 2); \ + add RT1, #(s1 - s0); \ + \ + ldr RT0, [CTXs0, RT0]; \ + and RT3, RMASK, l0, lsl#2; \ + ldr RT1, [CTXs0, RT1]; \ + add RT3, #(s3 - s2); \ + ldr RT2, [CTXs2, RT2]; \ + add RT0, RT1; \ + ldr RT3, [CTXs2, RT3]; \ + \ + and RT1, RMASK, l1, lsr#(24 - 2); \ + eor RT0, RT2; \ + and RT2, RMASK, l1, lsr#(16 - 2); \ + add RT0, RT3; \ + add RT2, #(s1 - s0); \ + and RT3, RMASK, l1, lsr#(8 - 2); \ + eor r0, RT0; \ + \ + ldr RT1, [CTXs0, RT1]; \ + and RT0, RMASK, l1, lsl#2; \ + ldr RT2, [CTXs0, RT2]; \ + add RT0, #(s3 - s2); \ + ldr RT3, [CTXs2, RT3]; \ + add RT1, RT2; \ + ldr RT0, [CTXs2, RT0]; \ + \ + and RT2, RMASK, r0, lsr#(24 - 2); \ + eor RT1, RT3; \ + and RT3, RMASK, r0, lsr#(16 - 2); \ + add RT1, RT0; \ + add RT3, #(s1 - s0); \ + and RT0, RMASK, 
r0, lsr#(8 - 2); \ + eor r1, RT1; \ + \ + ldr RT2, [CTXs0, RT2]; \ + and RT1, RMASK, r0, lsl#2; \ + ldr RT3, [CTXs0, RT3]; \ + add RT1, #(s3 - s2); \ + ldr RT0, [CTXs2, RT0]; \ + add RT2, RT3; \ + ldr RT1, [CTXs2, RT1]; \ + \ + and RT3, RMASK, r1, lsr#(24 - 2); \ + eor RT2, RT0; \ + and RT0, RMASK, r1, lsr#(16 - 2); \ + add RT2, RT1; \ + add RT0, #(s1 - s0); \ + and RT1, RMASK, r1, lsr#(8 - 2); \ + eor l0, RT2; \ + \ + ldr RT3, [CTXs0, RT3]; \ + and RT2, RMASK, r1, lsl#2; \ + ldr RT0, [CTXs0, RT0]; \ + add RT2, #(s3 - s2); \ + ldr RT1, [CTXs2, RT1]; \ + eor l1, RKEYL; \ + ldr RT2, [CTXs2, RT2]; \ + \ + eor r0, RKEYR; \ + add RT3, RT0; \ + eor r1, RKEYR; \ + eor RT3, RT1; \ + eor l0, RKEYL; \ + add RT3, RT2; \ + set_nextk(RKEYL, (p - s2) + (4 * (n) + ((dec) * 4))); \ + eor l1, RT3; \ + set_nextk(RKEYR, (p - s2) + (4 * (n) + (!(dec) * 4))); + +#define load_n_add_roundkey_enc2(n) \ + load_roundkey_enc(n); \ + eor RL0, RKEYL; \ + eor RR0, RKEYR; \ + eor RL1, RKEYL; \ + eor RR1, RKEYR; \ + load_roundkey_enc((n) + 2); + +#define next_key(reg, offs) \ + ldr reg, [CTXs2, #(offs)]; + +#define dummy(x, y) /* do nothing */ + +#define round_enc2(n, load_next_key) \ + F2((n) + 2, RL0, RR0, RL1, RR1, load_next_key, 0); + +#define load_n_add_roundkey_dec2(n) \ + load_roundkey_dec(n); \ + eor RL0, RKEYL; \ + eor RR0, RKEYR; \ + eor RL1, RKEYL; \ + eor RR1, RKEYR; \ + load_roundkey_dec((n) - 2); + +#define round_dec2(n, load_next_key) \ + F2((n) - 3, RL0, RR0, RL1, RR1, load_next_key, 1); + +#define read_block2_aligned(rin, l0, r0, l1, r1, convert) \ + ldr l0, [rin, #(0)]; \ + ldr r0, [rin, #(4)]; \ + convert(l0); \ + ldr l1, [rin, #(8)]; \ + convert(r0); \ + ldr r1, [rin, #(12)]; \ + convert(l1); \ + convert(r1); + +#define write_block2_aligned(rout, l0, r0, l1, r1, convert) \ + convert(l0); \ + convert(r0); \ + convert(l1); \ + str l0, [rout, #(0)]; \ + convert(r1); \ + str r0, [rout, #(4)]; \ + str l1, [rout, #(8)]; \ + str r1, [rout, #(12)]; + +#ifdef __ARM_FEATURE_UNALIGNED + /* unaligned word reads allowed */ + #define read_block2(rin, l0, r0, l1, r1, rtmp0) \ + read_block2_aligned(rin, l0, r0, l1, r1, host_to_be) + + #define write_block2(rout, l0, r0, l1, r1, rtmp0, rtmp1) \ + write_block2_aligned(rout, l0, r0, l1, r1, be_to_host) + + #define read_block2_host(rin, l0, r0, l1, r1, rtmp0) \ + read_block2_aligned(rin, l0, r0, l1, r1, host_to_host) + + #define write_block2_host(rout, l0, r0, l1, r1, rtmp0, rtmp1) \ + write_block2_aligned(rout, l0, r0, l1, r1, host_to_host) +#else + /* need to handle unaligned reads by byte reads */ + #define read_block2(rin, l0, r0, l1, r1, rtmp0) \ + tst rin, #3; \ + beq 1f; \ + ldr_unaligned_be(l0, rin, 0, rtmp0); \ + ldr_unaligned_be(r0, rin, 4, rtmp0); \ + ldr_unaligned_be(l1, rin, 8, rtmp0); \ + ldr_unaligned_be(r1, rin, 12, rtmp0); \ + b 2f; \ + 1:;\ + read_block2_aligned(rin, l0, r0, l1, r1, host_to_be); \ + 2:; + + #define write_block2(rout, l0, r0, l1, r1, rtmp0, rtmp1) \ + tst rout, #3; \ + beq 1f; \ + str_unaligned_be(l0, rout, 0, rtmp0, rtmp1); \ + str_unaligned_be(r0, rout, 4, rtmp0, rtmp1); \ + str_unaligned_be(l1, rout, 8, rtmp0, rtmp1); \ + str_unaligned_be(r1, rout, 12, rtmp0, rtmp1); \ + b 2f; \ + 1:;\ + write_block2_aligned(rout, l0, r0, l1, r1, be_to_host); \ + 2:; + + #define read_block2_host(rin, l0, r0, l1, r1, rtmp0) \ + tst rin, #3; \ + beq 1f; \ + ldr_unaligned_host(l0, rin, 0, rtmp0); \ + ldr_unaligned_host(r0, rin, 4, rtmp0); \ + ldr_unaligned_host(l1, rin, 8, rtmp0); \ + ldr_unaligned_host(r1, rin, 12, rtmp0); \ + b 2f; \ + 1:;\ + 
read_block2_aligned(rin, l0, r0, l1, r1, host_to_host); \ + 2:; + + #define write_block2_host(rout, l0, r0, l1, r1, rtmp0, rtmp1) \ + tst rout, #3; \ + beq 1f; \ + str_unaligned_host(l0, rout, 0, rtmp0, rtmp1); \ + str_unaligned_host(r0, rout, 4, rtmp0, rtmp1); \ + str_unaligned_host(l1, rout, 8, rtmp0, rtmp1); \ + str_unaligned_host(r1, rout, 12, rtmp0, rtmp1); \ + b 2f; \ + 1:;\ + write_block2_aligned(rout, l0, r0, l1, r1, host_to_host); \ + 2:; +#endif + +.align 3 +.type _gcry_blowfish_armv6_enc_blk2,%function; + +_gcry_blowfish_armv6_enc_blk2: + /* input: + * preloaded: CTX + * [RL0, RR0], [RL1, RR1]: src + * output: + * [RR0, RL0], [RR1, RL1]: dst + */ + push {%lr}; + + add CTXs2, CTXs0, #(s2 - s0); + mov RMASK, #(0xff << 2); /* byte mask */ + + load_n_add_roundkey_enc2(0); + round_enc2(2, next_key); + round_enc2(4, next_key); + round_enc2(6, next_key); + round_enc2(8, next_key); + round_enc2(10, next_key); + round_enc2(12, next_key); + round_enc2(14, next_key); + round_enc2(16, dummy); + + host_to_be(RR0); + host_to_be(RL0); + host_to_be(RR1); + host_to_be(RL1); + + pop {%pc}; +.size _gcry_blowfish_armv6_enc_blk2,.-_gcry_blowfish_armv6_enc_blk2; + +.align 3 +.globl _gcry_blowfish_armv6_cfb_dec; +.type _gcry_blowfish_armv6_cfb_dec,%function; + +_gcry_blowfish_armv6_cfb_dec: + /* input: + * %r0: CTX + * %r1: dst (2 blocks) + * %r2: src (2 blocks) + * %r3: iv (64bit) + */ + push {%r2, %r4-%r11, %ip, %lr}; + + mov %lr, %r3; + + /* Load input (iv/%r3 is aligned, src/%r2 might not be) */ + ldm %r3, {RL0, RR0}; + host_to_be(RL0); + host_to_be(RR0); + read_block(%r2, 0, RL1, RR1, RT0); + + /* Update IV, load src[1] and save to iv[0] */ + read_block_host(%r2, 8, %r5, %r6, RT0); + stm %lr, {%r5, %r6}; + + bl _gcry_blowfish_armv6_enc_blk2; + /* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */ + + /* %r1: dst, %r0: %src */ + pop {%r0}; + + /* dst = src ^ result */ + read_block2_host(%r0, %r5, %r6, %r7, %r8, %lr); + eor %r5, %r4; + eor %r6, %r3; + eor %r7, %r10; + eor %r8, %r9; + write_block2_host(%r1, %r5, %r6, %r7, %r8, %r9, %r10); + + pop {%r4-%r11, %ip, %pc}; +.ltorg +.size _gcry_blowfish_armv6_cfb_dec,.-_gcry_blowfish_armv6_cfb_dec; + +.align 3 +.globl _gcry_blowfish_armv6_ctr_enc; +.type _gcry_blowfish_armv6_ctr_enc,%function; + +_gcry_blowfish_armv6_ctr_enc: + /* input: + * %r0: CTX + * %r1: dst (2 blocks) + * %r2: src (2 blocks) + * %r3: iv (64bit, big-endian) + */ + push {%r2, %r4-%r11, %ip, %lr}; + + mov %lr, %r3; + + /* Load IV (big => host endian) */ + read_block_aligned(%lr, 0, RL0, RR0, be_to_host); + + /* Construct IVs */ + adds RR1, RR0, #1; /* +1 */ + adc RL1, RL0, #0; + adds %r6, RR1, #1; /* +2 */ + adc %r5, RL1, #0; + + /* Store new IV (host => big-endian) */ + write_block_aligned(%lr, 0, %r5, %r6, host_to_be); + + bl _gcry_blowfish_armv6_enc_blk2; + /* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */ + + /* %r1: dst, %r0: %src */ + pop {%r0}; + + /* XOR key-stream with plaintext */ + read_block2_host(%r0, %r5, %r6, %r7, %r8, %lr); + eor %r5, %r4; + eor %r6, %r3; + eor %r7, %r10; + eor %r8, %r9; + write_block2_host(%r1, %r5, %r6, %r7, %r8, %r9, %r10); + + pop {%r4-%r11, %ip, %pc}; +.ltorg +.size _gcry_blowfish_armv6_ctr_enc,.-_gcry_blowfish_armv6_ctr_enc; + +.align 3 +.type _gcry_blowfish_armv6_dec_blk2,%function; + +_gcry_blowfish_armv6_dec_blk2: + /* input: + * preloaded: CTX + * [RL0, RR0], [RL1, RR1]: src + * output: + * [RR0, RL0], [RR1, RL1]: dst + */ + add CTXs2, CTXs0, #(s2 - s0); + mov RMASK, #(0xff << 2); /* byte mask */ + + load_n_add_roundkey_dec2(17); + 
round_dec2(15, next_key); + round_dec2(13, next_key); + round_dec2(11, next_key); + round_dec2(9, next_key); + round_dec2(7, next_key); + round_dec2(5, next_key); + round_dec2(3, next_key); + round_dec2(1, dummy); + + host_to_be(RR0); + host_to_be(RL0); + host_to_be(RR1); + host_to_be(RL1); + + b .Ldec_cbc_tail; +.ltorg +.size _gcry_blowfish_armv6_dec_blk2,.-_gcry_blowfish_armv6_dec_blk2; + +.align 3 +.globl _gcry_blowfish_armv6_cbc_dec; +.type _gcry_blowfish_armv6_cbc_dec,%function; + +_gcry_blowfish_armv6_cbc_dec: + /* input: + * %r0: CTX + * %r1: dst (2 blocks) + * %r2: src (2 blocks) + * %r3: iv (64bit) + */ + push {%r2-%r11, %ip, %lr}; + + read_block2(%r2, RL0, RR0, RL1, RR1, RT0); + + /* dec_blk2 is only used by cbc_dec, jump directly in/out instead + * of function call. */ + b _gcry_blowfish_armv6_dec_blk2; +.Ldec_cbc_tail: + /* result in RR0:RL0, RR1:RL1 = %r4:%r3, %r10:%r9 */ + + /* %r0: %src, %r1: dst, %r2: iv */ + pop {%r0, %r2}; + + /* load IV+1 (src[0]) to %r7:%r8. Might be unaligned. */ + read_block_host(%r0, 0, %r7, %r8, %r5); + /* load IV (iv[0]) to %r5:%r6. 'iv' is aligned. */ + ldm %r2, {%r5, %r6}; + + /* out[1] ^= IV+1 */ + eor %r10, %r7; + eor %r9, %r8; + /* out[0] ^= IV */ + eor %r4, %r5; + eor %r3, %r6; + + /* load IV+2 (src[1]) to %r7:%r8. Might be unaligned. */ + read_block_host(%r0, 8, %r7, %r8, %r5); + /* store IV+2 to iv[0] (aligned). */ + stm %r2, {%r7, %r8}; + + /* store result to dst[0-3]. Might be unaligned. */ + write_block2_host(%r1, %r4, %r3, %r10, %r9, %r5, %r6); + + pop {%r4-%r11, %ip, %pc}; +.ltorg +.size _gcry_blowfish_armv6_cbc_dec,.-_gcry_blowfish_armv6_cbc_dec; + +#endif /*HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS*/ +#endif /*__ARM_ARCH >= 6*/ diff --git a/cipher/blowfish.c b/cipher/blowfish.c index 69baebe..fe4e280 100644 --- a/cipher/blowfish.c +++ b/cipher/blowfish.c @@ -50,6 +50,20 @@ # define USE_AMD64_ASM 1 #endif +/* USE_ARMV6_ASM indicates whether to use ARMv6 assembly code. */ +#undef USE_ARMV6_ASM +#if defined(__arm__) && defined(__ARMEL__) && \ + ((defined(__ARM_ARCH) && __ARM_ARCH >= 6) \ + || defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) \ + || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) \ + || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6T2__) \ + || defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) \ + || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) \ + || defined(__ARM_ARCH_7EM__)) +# if (BLOWFISH_ROUNDS == 16) && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) +# define USE_ARMV6_ASM 1 +# endif +#endif typedef struct { u32 s0[256]; @@ -305,7 +319,61 @@ static void decrypt_block (void *context, byte *outbuf, const byte *inbuf) _gcry_burn_stack (2*8); } -#else /*USE_AMD64_ASM*/ +#elif defined(USE_ARMV6_ASM) + +/* Assembly implementations of Blowfish. */ +extern void _gcry_blowfish_armv6_do_encrypt(BLOWFISH_context *c, u32 *ret_xl, + u32 *ret_xr); + +extern void _gcry_blowfish_armv6_encrypt_block(BLOWFISH_context *c, byte *out, + const byte *in); + +extern void _gcry_blowfish_armv6_decrypt_block(BLOWFISH_context *c, byte *out, + const byte *in); + +/* These assembly implementations process two blocks in parallel. 
*/ +extern void _gcry_blowfish_armv6_ctr_enc(BLOWFISH_context *ctx, byte *out, + const byte *in, byte *ctr); + +extern void _gcry_blowfish_armv6_cbc_dec(BLOWFISH_context *ctx, byte *out, + const byte *in, byte *iv); + +extern void _gcry_blowfish_armv6_cfb_dec(BLOWFISH_context *ctx, byte *out, + const byte *in, byte *iv); + +static void +do_encrypt ( BLOWFISH_context *bc, u32 *ret_xl, u32 *ret_xr ) +{ + _gcry_blowfish_armv6_do_encrypt (bc, ret_xl, ret_xr); +} + +static void +do_encrypt_block (BLOWFISH_context *context, byte *outbuf, const byte *inbuf) +{ + _gcry_blowfish_armv6_encrypt_block (context, outbuf, inbuf); +} + +static void +do_decrypt_block (BLOWFISH_context *context, byte *outbuf, const byte *inbuf) +{ + _gcry_blowfish_armv6_decrypt_block (context, outbuf, inbuf); +} + +static void encrypt_block (void *context , byte *outbuf, const byte *inbuf) +{ + BLOWFISH_context *c = (BLOWFISH_context *) context; + do_encrypt_block (c, outbuf, inbuf); + _gcry_burn_stack (10*4); +} + +static void decrypt_block (void *context, byte *outbuf, const byte *inbuf) +{ + BLOWFISH_context *c = (BLOWFISH_context *) context; + do_decrypt_block (c, outbuf, inbuf); + _gcry_burn_stack (10*4); +} + +#else /*USE_ARMV6_ASM*/ #if BLOWFISH_ROUNDS != 16 static inline u32 @@ -527,7 +595,7 @@ decrypt_block (void *context, byte *outbuf, const byte *inbuf) _gcry_burn_stack (64); } -#endif /*!USE_AMD64_ASM*/ +#endif /*!USE_AMD64_ASM&&!USE_ARMV6_ASM*/ /* Bulk encryption of complete blocks in CTR mode. This function is only @@ -562,6 +630,21 @@ _gcry_blowfish_ctr_enc(void *context, unsigned char *ctr, void *outbuf_arg, /* Use generic code to handle smaller chunks... */ /* TODO: use caching instead? */ } +#elif defined(USE_ARMV6_ASM) + { + /* Process data in 2 block chunks. */ + while (nblocks >= 2) + { + _gcry_blowfish_armv6_ctr_enc(ctx, outbuf, inbuf, ctr); + + nblocks -= 2; + outbuf += 2 * BLOWFISH_BLOCKSIZE; + inbuf += 2 * BLOWFISH_BLOCKSIZE; + } + + /* Use generic code to handle smaller chunks... */ + /* TODO: use caching instead? */ + } #endif for ( ;nblocks; nblocks-- ) @@ -615,6 +698,20 @@ _gcry_blowfish_cbc_dec(void *context, unsigned char *iv, void *outbuf_arg, /* Use generic code to handle smaller chunks... */ } +#elif defined(USE_ARMV6_ASM) + { + /* Process data in 2 block chunks. */ + while (nblocks >= 2) + { + _gcry_blowfish_armv6_cbc_dec(ctx, outbuf, inbuf, iv); + + nblocks -= 2; + outbuf += 2 * BLOWFISH_BLOCKSIZE; + inbuf += 2 * BLOWFISH_BLOCKSIZE; + } + + /* Use generic code to handle smaller chunks... */ + } #endif for ( ;nblocks; nblocks-- ) @@ -664,6 +761,20 @@ _gcry_blowfish_cfb_dec(void *context, unsigned char *iv, void *outbuf_arg, /* Use generic code to handle smaller chunks... */ } +#elif defined(USE_ARMV6_ASM) + { + /* Process data in 2 block chunks. */ + while (nblocks >= 2) + { + _gcry_blowfish_armv6_cfb_dec(ctx, outbuf, inbuf, iv); + + nblocks -= 2; + outbuf += 2 * BLOWFISH_BLOCKSIZE; + inbuf += 2 * BLOWFISH_BLOCKSIZE; + } + + /* Use generic code to handle smaller chunks... 
 */
+  }
 #endif
 
   for ( ;nblocks; nblocks-- )
diff --git a/configure.ac b/configure.ac
index f05f919..22dab9d 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1280,6 +1280,10 @@ if test "$found" = "1" ; then
          # Build with the assembly implementation
          GCRYPT_CIPHERS="$GCRYPT_CIPHERS blowfish-amd64.lo"
       ;;
+      arm*-*-*)
+         # Build with the assembly implementation
+         GCRYPT_CIPHERS="$GCRYPT_CIPHERS blowfish-armv6.lo"
+      ;;
    esac
 fi


From jussi.kivilinna at iki.fi  Fri Aug 16 15:05:47 2013
From: jussi.kivilinna at iki.fi (Jussi Kivilinna)
Date: Fri, 16 Aug 2013 16:05:47 +0300
Subject: [PATCH 2/2] camellia: add ARMv6 assembly implementation
In-Reply-To: <20130816130542.23638.85628.stgit@localhost6.localdomain6>
References: <20130816130542.23638.85628.stgit@localhost6.localdomain6>
Message-ID: <20130816130547.23638.73530.stgit@localhost6.localdomain6>

* cipher/Makefile.am: Add 'camellia-armv6.S'.
* cipher/camellia-armv6.S: New file.
* cipher/camellia-glue.c [USE_ARMV6_ASM]
(_gcry_camellia_armv6_encrypt_block)
(_gcry_camellia_armv6_decrypt_block): New prototypes.
[USE_ARMV6_ASM] (Camellia_EncryptBlock, Camellia_DecryptBlock)
(camellia_encrypt, camellia_decrypt): New functions.
* cipher/camellia.c [!USE_ARMV6_ASM]: Compile encryption and decryption
routines only if USE_ARMV6_ASM macro is _not_ defined.
* cipher/camellia.h (USE_ARMV6_ASM): New macro.
[!USE_ARMV6_ASM] (Camellia_EncryptBlock, Camellia_DecryptBlock): If
USE_ARMV6_ASM is defined, disable these function prototypes.
* configure.ac (camellia) [arm]: Add 'camellia-armv6.lo'.
--

Add an optimized ARMv6 assembly implementation for Camellia. The
implementation is tuned for Cortex-A8. Unaligned access handling is done
in the assembly part. For now, only enable this on little-endian systems,
as big-endian correctness has not been tested yet.

Old vs new, Cortex-A8 (on Debian Wheezy/armhf):

                ECB/Stream         CBC             CFB             OFB             CTR
             --------------- --------------- --------------- --------------- ---------------
CAMELLIA128   1.44x   1.47x   1.35x   1.34x   1.43x   1.39x   1.38x   1.36x   1.38x   1.39x
CAMELLIA192   1.60x   1.62x   1.52x   1.47x   1.56x   1.54x   1.52x   1.53x   1.52x   1.53x
CAMELLIA256   1.59x   1.60x   1.49x   1.47x   1.53x   1.54x   1.51x   1.50x   1.52x   1.53x

Signed-off-by: Jussi Kivilinna
---
 cipher/Makefile.am      |    2 
 cipher/camellia-armv6.S |  611 +++++++++++++++++++++++++++++++++++++++++++++++
 cipher/camellia-glue.c  |   51 ++++
 cipher/camellia.c       |    5 
 cipher/camellia.h       |   16 +
 configure.ac            |    7 +
 6 files changed, 691 insertions(+), 1 deletion(-)
 create mode 100644 cipher/camellia-armv6.S

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 11bfda5..c0f21fa 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -79,7 +79,7 @@ whirlpool.c \
 twofish.c twofish-amd64.S \
 rfc2268.c \
 camellia.c camellia.h camellia-glue.c camellia-aesni-avx-amd64.S \
-  camellia-aesni-avx2-amd64.S
+  camellia-aesni-avx2-amd64.S camellia-armv6.S
 
 if ENABLE_O_FLAG_MUNGING
 o_flag_munging = sed -e 's/-O\([2-9s][2-9s]*\)/-O1/' -e 's/-Ofast/-O1/g'
diff --git a/cipher/camellia-armv6.S b/cipher/camellia-armv6.S
new file mode 100644
index 0000000..769db02
--- /dev/null
+++ b/cipher/camellia-armv6.S
@@ -0,0 +1,611 @@
+/* camellia-armv6.S - ARM assembly implementation of Camellia cipher
+ *
+ * Copyright © 2013 Jussi Kivilinna
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . + */ + +#include + +#if defined(__arm__) && defined(__ARMEL__) && \ + ((defined(__ARM_ARCH) && __ARM_ARCH >= 6) \ + || defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) \ + || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) \ + || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6T2__) \ + || defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) \ + || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) \ + || defined(__ARM_ARCH_7EM__)) +#ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS + +.text + +.syntax unified +.arm + +#define CAMELLIA_TABLE_BYTE_LEN 272 + +/* struct camellia_ctx: */ +#define key_table 0 +#define key_length CAMELLIA_TABLE_BYTE_LEN + +/* register macros */ +#define CTX %r0 +#define RTAB1 %ip +#define RTAB3 %r1 +#define RMASK %lr + +#define IL %r2 +#define IR %r3 + +#define XL %r4 +#define XR %r5 +#define YL %r6 +#define YR %r7 + +#define RT0 %r8 +#define RT1 %r9 +#define RT2 %r10 +#define RT3 %r11 + +/* helper macros */ +#define ldr_unaligned_be(rout, rsrc, offs, rtmp) \ + ldrb rout, [rsrc, #((offs) + 3)]; \ + ldrb rtmp, [rsrc, #((offs) + 2)]; \ + orr rout, rout, rtmp, lsl #8; \ + ldrb rtmp, [rsrc, #((offs) + 1)]; \ + orr rout, rout, rtmp, lsl #16; \ + ldrb rtmp, [rsrc, #((offs) + 0)]; \ + orr rout, rout, rtmp, lsl #24; + +#define str_unaligned_be(rin, rdst, offs, rtmp0, rtmp1) \ + mov rtmp0, rin, lsr #8; \ + strb rin, [rdst, #((offs) + 3)]; \ + mov rtmp1, rin, lsr #16; \ + strb rtmp0, [rdst, #((offs) + 2)]; \ + mov rtmp0, rin, lsr #24; \ + strb rtmp1, [rdst, #((offs) + 1)]; \ + strb rtmp0, [rdst, #((offs) + 0)]; + +#ifdef __ARMEL__ + /* bswap on little-endian */ + #define host_to_be(reg) \ + rev reg, reg; + #define be_to_host(reg) \ + rev reg, reg; +#else + /* nop on big-endian */ + #define host_to_be(reg) /*_*/ + #define be_to_host(reg) /*_*/ +#endif + +#define ldr_input_aligned_be(rin, a, b, c, d) \ + ldr a, [rin, #0]; \ + ldr b, [rin, #4]; \ + be_to_host(a); \ + ldr c, [rin, #8]; \ + be_to_host(b); \ + ldr d, [rin, #12]; \ + be_to_host(c); \ + be_to_host(d); + +#define str_output_aligned_be(rout, a, b, c, d) \ + be_to_host(a); \ + be_to_host(b); \ + str a, [rout, #0]; \ + be_to_host(c); \ + str b, [rout, #4]; \ + be_to_host(d); \ + str c, [rout, #8]; \ + str d, [rout, #12]; + +#ifdef __ARM_FEATURE_UNALIGNED + /* unaligned word reads/writes allowed */ + #define ldr_input_be(rin, ra, rb, rc, rd, rtmp) \ + ldr_input_aligned_be(rin, ra, rb, rc, rd) + + #define str_output_be(rout, ra, rb, rc, rd, rtmp0, rtmp1) \ + str_output_aligned_be(rout, ra, rb, rc, rd) +#else + /* need to handle unaligned reads/writes by byte reads */ + #define ldr_input_be(rin, ra, rb, rc, rd, rtmp0) \ + tst rin, #3; \ + beq 1f; \ + ldr_unaligned_be(ra, rin, 0, rtmp0); \ + ldr_unaligned_be(rb, rin, 4, rtmp0); \ + ldr_unaligned_be(rc, rin, 8, rtmp0); \ + ldr_unaligned_be(rd, rin, 12, rtmp0); \ + b 2f; \ + 1:;\ + ldr_input_aligned_be(rin, ra, rb, rc, rd); \ + 2:; + + #define str_output_be(rout, ra, rb, rc, rd, rtmp0, rtmp1) \ + tst rout, #3; \ + beq 1f; \ + str_unaligned_be(ra, rout, 0, rtmp0, rtmp1); \ + str_unaligned_be(rb, rout, 4, rtmp0, rtmp1); \ + str_unaligned_be(rc, rout, 8, rtmp0, rtmp1); \ + str_unaligned_be(rd, rout, 
12, rtmp0, rtmp1); \ + b 2f; \ + 1:;\ + str_output_aligned_be(rout, ra, rb, rc, rd); \ + 2:; +#endif + +/********************************************************************** + 1-way camellia + **********************************************************************/ +#define roundsm(xl, xr, kl, kr, yl, yr) \ + ldr RT2, [CTX, #(key_table + ((kl) * 4))]; \ + and IR, RMASK, xr, lsl#(4); /*sp1110*/ \ + ldr RT3, [CTX, #(key_table + ((kr) * 4))]; \ + and IL, RMASK, xl, lsr#(24 - 4); /*sp1110*/ \ + and RT0, RMASK, xr, lsr#(16 - 4); /*sp3033*/ \ + ldr IR, [RTAB1, IR]; \ + and RT1, RMASK, xl, lsr#(8 - 4); /*sp3033*/ \ + eor yl, RT2; \ + ldr IL, [RTAB1, IL]; \ + eor yr, RT3; \ + \ + ldr RT0, [RTAB3, RT0]; \ + add RTAB1, #4; \ + ldr RT1, [RTAB3, RT1]; \ + add RTAB3, #4; \ + \ + and RT2, RMASK, xr, lsr#(24 - 4); /*sp0222*/ \ + and RT3, RMASK, xl, lsr#(16 - 4); /*sp0222*/ \ + \ + eor IR, RT0; \ + eor IL, RT1; \ + \ + ldr RT2, [RTAB1, RT2]; \ + and RT0, RMASK, xr, lsr#(8 - 4); /*sp4404*/ \ + ldr RT3, [RTAB1, RT3]; \ + and RT1, RMASK, xl, lsl#(4); /*sp4404*/ \ + \ + ldr RT0, [RTAB3, RT0]; \ + sub RTAB1, #4; \ + ldr RT1, [RTAB3, RT1]; \ + sub RTAB3, #4; \ + \ + eor IR, RT2; \ + eor IL, RT3; \ + eor IR, RT0; \ + eor IL, RT1; \ + \ + eor IR, IL; \ + eor yr, yr, IL, ror#8; \ + eor yl, IR; \ + eor yr, IR; + +#define enc_rounds(n) \ + roundsm(XL, XR, ((n) + 2) * 2 + 0, ((n) + 2) * 2 + 1, YL, YR); \ + roundsm(YL, YR, ((n) + 3) * 2 + 0, ((n) + 3) * 2 + 1, XL, XR); \ + roundsm(XL, XR, ((n) + 4) * 2 + 0, ((n) + 4) * 2 + 1, YL, YR); \ + roundsm(YL, YR, ((n) + 5) * 2 + 0, ((n) + 5) * 2 + 1, XL, XR); \ + roundsm(XL, XR, ((n) + 6) * 2 + 0, ((n) + 6) * 2 + 1, YL, YR); \ + roundsm(YL, YR, ((n) + 7) * 2 + 0, ((n) + 7) * 2 + 1, XL, XR); + +#define dec_rounds(n) \ + roundsm(XL, XR, ((n) + 7) * 2 + 0, ((n) + 7) * 2 + 1, YL, YR); \ + roundsm(YL, YR, ((n) + 6) * 2 + 0, ((n) + 6) * 2 + 1, XL, XR); \ + roundsm(XL, XR, ((n) + 5) * 2 + 0, ((n) + 5) * 2 + 1, YL, YR); \ + roundsm(YL, YR, ((n) + 4) * 2 + 0, ((n) + 4) * 2 + 1, XL, XR); \ + roundsm(XL, XR, ((n) + 3) * 2 + 0, ((n) + 3) * 2 + 1, YL, YR); \ + roundsm(YL, YR, ((n) + 2) * 2 + 0, ((n) + 2) * 2 + 1, XL, XR); + +/* perform FL and FL?? 
*/ +#define fls(ll, lr, rl, rr, kll, klr, krl, krr) \ + ldr RT0, [CTX, #(key_table + ((kll) * 4))]; \ + ldr RT2, [CTX, #(key_table + ((krr) * 4))]; \ + and RT0, ll; \ + ldr RT3, [CTX, #(key_table + ((krl) * 4))]; \ + orr RT2, rr; \ + ldr RT1, [CTX, #(key_table + ((klr) * 4))]; \ + eor rl, RT2; \ + eor lr, lr, RT0, ror#31; \ + and RT3, rl; \ + orr RT1, lr; \ + eor ll, RT1; \ + eor rr, rr, RT3, ror#31; + +#define enc_fls(n) \ + fls(XL, XR, YL, YR, \ + (n) * 2 + 0, (n) * 2 + 1, \ + (n) * 2 + 2, (n) * 2 + 3); + +#define dec_fls(n) \ + fls(XL, XR, YL, YR, \ + (n) * 2 + 2, (n) * 2 + 3, \ + (n) * 2 + 0, (n) * 2 + 1); + +#define inpack(n) \ + ldr_input_be(%r2, XL, XR, YL, YR, RT0); \ + ldr RT0, [CTX, #(key_table + ((n) * 8) + 0)]; \ + ldr RT1, [CTX, #(key_table + ((n) * 8) + 4)]; \ + eor XL, RT0; \ + eor XR, RT1; + +#define outunpack(n) \ + ldr RT0, [CTX, #(key_table + ((n) * 8) + 0)]; \ + ldr RT1, [CTX, #(key_table + ((n) * 8) + 4)]; \ + eor YL, RT0; \ + eor YR, RT1; \ + str_output_be(%r1, YL, YR, XL, XR, RT0, RT1); + +.align 3 +.global _gcry_camellia_armv6_encrypt_block +.type _gcry_camellia_armv6_encrypt_block,%function; + +_gcry_camellia_armv6_encrypt_block: + /* input: + * %r0: keytable + * %r1: dst + * %r2: src + * %r3: keybitlen + */ + push {%r1, %r4-%r11, %ip, %lr}; + + ldr RTAB1, =.Lcamellia_sp1110; + mov RMASK, #0xff; + add RTAB3, RTAB1, #(2 * 4); + push {%r3}; + mov RMASK, RMASK, lsl#4 /* byte mask */ + + inpack(0); + + enc_rounds(0); + enc_fls(8); + enc_rounds(8); + enc_fls(16); + enc_rounds(16); + + pop {RT0}; + cmp RT0, #(16 * 8); + bne .Lenc_256; + + pop {%r1}; + outunpack(24); + + pop {%r4-%r11, %ip, %pc}; +.ltorg + +.Lenc_256: + enc_fls(24); + enc_rounds(24); + + pop {%r1}; + outunpack(32); + + pop {%r4-%r11, %ip, %pc}; +.ltorg +.size _gcry_camellia_armv6_encrypt_block,.-_gcry_camellia_armv6_encrypt_block; + +.align 3 +.global _gcry_camellia_armv6_decrypt_block +.type _gcry_camellia_armv6_decrypt_block,%function; + +_gcry_camellia_armv6_decrypt_block: + /* input: + * %r0: keytable + * %r1: dst + * %r2: src + * %r3: keybitlen + */ + push {%r1, %r4-%r11, %ip, %lr}; + + ldr RTAB1, =.Lcamellia_sp1110; + mov RMASK, #0xff; + add RTAB3, RTAB1, #(2 * 4); + mov RMASK, RMASK, lsl#4 /* byte mask */ + + cmp %r3, #(16 * 8); + bne .Ldec_256; + + inpack(24); + +.Ldec_128: + dec_rounds(16); + dec_fls(16); + dec_rounds(8); + dec_fls(8); + dec_rounds(0); + + pop {%r1}; + outunpack(0); + + pop {%r4-%r11, %ip, %pc}; +.ltorg + +.Ldec_256: + inpack(32); + dec_rounds(24); + dec_fls(24); + + b .Ldec_128; +.ltorg +.size _gcry_camellia_armv6_decrypt_block,.-_gcry_camellia_armv6_decrypt_block; + +.data + +/* Encryption/Decryption tables */ +.align 5 +.Lcamellia_sp1110: +.long 0x70707000 +.Lcamellia_sp0222: + .long 0x00e0e0e0 +.Lcamellia_sp3033: + .long 0x38003838 +.Lcamellia_sp4404: + .long 0x70700070 +.long 0x82828200, 0x00050505, 0x41004141, 0x2c2c002c +.long 0x2c2c2c00, 0x00585858, 0x16001616, 0xb3b300b3 +.long 0xececec00, 0x00d9d9d9, 0x76007676, 0xc0c000c0 +.long 0xb3b3b300, 0x00676767, 0xd900d9d9, 0xe4e400e4 +.long 0x27272700, 0x004e4e4e, 0x93009393, 0x57570057 +.long 0xc0c0c000, 0x00818181, 0x60006060, 0xeaea00ea +.long 0xe5e5e500, 0x00cbcbcb, 0xf200f2f2, 0xaeae00ae +.long 0xe4e4e400, 0x00c9c9c9, 0x72007272, 0x23230023 +.long 0x85858500, 0x000b0b0b, 0xc200c2c2, 0x6b6b006b +.long 0x57575700, 0x00aeaeae, 0xab00abab, 0x45450045 +.long 0x35353500, 0x006a6a6a, 0x9a009a9a, 0xa5a500a5 +.long 0xeaeaea00, 0x00d5d5d5, 0x75007575, 0xeded00ed +.long 0x0c0c0c00, 0x00181818, 0x06000606, 0x4f4f004f +.long 
0xaeaeae00, 0x005d5d5d, 0x57005757, 0x1d1d001d +.long 0x41414100, 0x00828282, 0xa000a0a0, 0x92920092 +.long 0x23232300, 0x00464646, 0x91009191, 0x86860086 +.long 0xefefef00, 0x00dfdfdf, 0xf700f7f7, 0xafaf00af +.long 0x6b6b6b00, 0x00d6d6d6, 0xb500b5b5, 0x7c7c007c +.long 0x93939300, 0x00272727, 0xc900c9c9, 0x1f1f001f +.long 0x45454500, 0x008a8a8a, 0xa200a2a2, 0x3e3e003e +.long 0x19191900, 0x00323232, 0x8c008c8c, 0xdcdc00dc +.long 0xa5a5a500, 0x004b4b4b, 0xd200d2d2, 0x5e5e005e +.long 0x21212100, 0x00424242, 0x90009090, 0x0b0b000b +.long 0xededed00, 0x00dbdbdb, 0xf600f6f6, 0xa6a600a6 +.long 0x0e0e0e00, 0x001c1c1c, 0x07000707, 0x39390039 +.long 0x4f4f4f00, 0x009e9e9e, 0xa700a7a7, 0xd5d500d5 +.long 0x4e4e4e00, 0x009c9c9c, 0x27002727, 0x5d5d005d +.long 0x1d1d1d00, 0x003a3a3a, 0x8e008e8e, 0xd9d900d9 +.long 0x65656500, 0x00cacaca, 0xb200b2b2, 0x5a5a005a +.long 0x92929200, 0x00252525, 0x49004949, 0x51510051 +.long 0xbdbdbd00, 0x007b7b7b, 0xde00dede, 0x6c6c006c +.long 0x86868600, 0x000d0d0d, 0x43004343, 0x8b8b008b +.long 0xb8b8b800, 0x00717171, 0x5c005c5c, 0x9a9a009a +.long 0xafafaf00, 0x005f5f5f, 0xd700d7d7, 0xfbfb00fb +.long 0x8f8f8f00, 0x001f1f1f, 0xc700c7c7, 0xb0b000b0 +.long 0x7c7c7c00, 0x00f8f8f8, 0x3e003e3e, 0x74740074 +.long 0xebebeb00, 0x00d7d7d7, 0xf500f5f5, 0x2b2b002b +.long 0x1f1f1f00, 0x003e3e3e, 0x8f008f8f, 0xf0f000f0 +.long 0xcecece00, 0x009d9d9d, 0x67006767, 0x84840084 +.long 0x3e3e3e00, 0x007c7c7c, 0x1f001f1f, 0xdfdf00df +.long 0x30303000, 0x00606060, 0x18001818, 0xcbcb00cb +.long 0xdcdcdc00, 0x00b9b9b9, 0x6e006e6e, 0x34340034 +.long 0x5f5f5f00, 0x00bebebe, 0xaf00afaf, 0x76760076 +.long 0x5e5e5e00, 0x00bcbcbc, 0x2f002f2f, 0x6d6d006d +.long 0xc5c5c500, 0x008b8b8b, 0xe200e2e2, 0xa9a900a9 +.long 0x0b0b0b00, 0x00161616, 0x85008585, 0xd1d100d1 +.long 0x1a1a1a00, 0x00343434, 0x0d000d0d, 0x04040004 +.long 0xa6a6a600, 0x004d4d4d, 0x53005353, 0x14140014 +.long 0xe1e1e100, 0x00c3c3c3, 0xf000f0f0, 0x3a3a003a +.long 0x39393900, 0x00727272, 0x9c009c9c, 0xdede00de +.long 0xcacaca00, 0x00959595, 0x65006565, 0x11110011 +.long 0xd5d5d500, 0x00ababab, 0xea00eaea, 0x32320032 +.long 0x47474700, 0x008e8e8e, 0xa300a3a3, 0x9c9c009c +.long 0x5d5d5d00, 0x00bababa, 0xae00aeae, 0x53530053 +.long 0x3d3d3d00, 0x007a7a7a, 0x9e009e9e, 0xf2f200f2 +.long 0xd9d9d900, 0x00b3b3b3, 0xec00ecec, 0xfefe00fe +.long 0x01010100, 0x00020202, 0x80008080, 0xcfcf00cf +.long 0x5a5a5a00, 0x00b4b4b4, 0x2d002d2d, 0xc3c300c3 +.long 0xd6d6d600, 0x00adadad, 0x6b006b6b, 0x7a7a007a +.long 0x51515100, 0x00a2a2a2, 0xa800a8a8, 0x24240024 +.long 0x56565600, 0x00acacac, 0x2b002b2b, 0xe8e800e8 +.long 0x6c6c6c00, 0x00d8d8d8, 0x36003636, 0x60600060 +.long 0x4d4d4d00, 0x009a9a9a, 0xa600a6a6, 0x69690069 +.long 0x8b8b8b00, 0x00171717, 0xc500c5c5, 0xaaaa00aa +.long 0x0d0d0d00, 0x001a1a1a, 0x86008686, 0xa0a000a0 +.long 0x9a9a9a00, 0x00353535, 0x4d004d4d, 0xa1a100a1 +.long 0x66666600, 0x00cccccc, 0x33003333, 0x62620062 +.long 0xfbfbfb00, 0x00f7f7f7, 0xfd00fdfd, 0x54540054 +.long 0xcccccc00, 0x00999999, 0x66006666, 0x1e1e001e +.long 0xb0b0b000, 0x00616161, 0x58005858, 0xe0e000e0 +.long 0x2d2d2d00, 0x005a5a5a, 0x96009696, 0x64640064 +.long 0x74747400, 0x00e8e8e8, 0x3a003a3a, 0x10100010 +.long 0x12121200, 0x00242424, 0x09000909, 0x00000000 +.long 0x2b2b2b00, 0x00565656, 0x95009595, 0xa3a300a3 +.long 0x20202000, 0x00404040, 0x10001010, 0x75750075 +.long 0xf0f0f000, 0x00e1e1e1, 0x78007878, 0x8a8a008a +.long 0xb1b1b100, 0x00636363, 0xd800d8d8, 0xe6e600e6 +.long 0x84848400, 0x00090909, 0x42004242, 0x09090009 +.long 0x99999900, 0x00333333, 0xcc00cccc, 
0xdddd00dd +.long 0xdfdfdf00, 0x00bfbfbf, 0xef00efef, 0x87870087 +.long 0x4c4c4c00, 0x00989898, 0x26002626, 0x83830083 +.long 0xcbcbcb00, 0x00979797, 0xe500e5e5, 0xcdcd00cd +.long 0xc2c2c200, 0x00858585, 0x61006161, 0x90900090 +.long 0x34343400, 0x00686868, 0x1a001a1a, 0x73730073 +.long 0x7e7e7e00, 0x00fcfcfc, 0x3f003f3f, 0xf6f600f6 +.long 0x76767600, 0x00ececec, 0x3b003b3b, 0x9d9d009d +.long 0x05050500, 0x000a0a0a, 0x82008282, 0xbfbf00bf +.long 0x6d6d6d00, 0x00dadada, 0xb600b6b6, 0x52520052 +.long 0xb7b7b700, 0x006f6f6f, 0xdb00dbdb, 0xd8d800d8 +.long 0xa9a9a900, 0x00535353, 0xd400d4d4, 0xc8c800c8 +.long 0x31313100, 0x00626262, 0x98009898, 0xc6c600c6 +.long 0xd1d1d100, 0x00a3a3a3, 0xe800e8e8, 0x81810081 +.long 0x17171700, 0x002e2e2e, 0x8b008b8b, 0x6f6f006f +.long 0x04040400, 0x00080808, 0x02000202, 0x13130013 +.long 0xd7d7d700, 0x00afafaf, 0xeb00ebeb, 0x63630063 +.long 0x14141400, 0x00282828, 0x0a000a0a, 0xe9e900e9 +.long 0x58585800, 0x00b0b0b0, 0x2c002c2c, 0xa7a700a7 +.long 0x3a3a3a00, 0x00747474, 0x1d001d1d, 0x9f9f009f +.long 0x61616100, 0x00c2c2c2, 0xb000b0b0, 0xbcbc00bc +.long 0xdedede00, 0x00bdbdbd, 0x6f006f6f, 0x29290029 +.long 0x1b1b1b00, 0x00363636, 0x8d008d8d, 0xf9f900f9 +.long 0x11111100, 0x00222222, 0x88008888, 0x2f2f002f +.long 0x1c1c1c00, 0x00383838, 0x0e000e0e, 0xb4b400b4 +.long 0x32323200, 0x00646464, 0x19001919, 0x78780078 +.long 0x0f0f0f00, 0x001e1e1e, 0x87008787, 0x06060006 +.long 0x9c9c9c00, 0x00393939, 0x4e004e4e, 0xe7e700e7 +.long 0x16161600, 0x002c2c2c, 0x0b000b0b, 0x71710071 +.long 0x53535300, 0x00a6a6a6, 0xa900a9a9, 0xd4d400d4 +.long 0x18181800, 0x00303030, 0x0c000c0c, 0xabab00ab +.long 0xf2f2f200, 0x00e5e5e5, 0x79007979, 0x88880088 +.long 0x22222200, 0x00444444, 0x11001111, 0x8d8d008d +.long 0xfefefe00, 0x00fdfdfd, 0x7f007f7f, 0x72720072 +.long 0x44444400, 0x00888888, 0x22002222, 0xb9b900b9 +.long 0xcfcfcf00, 0x009f9f9f, 0xe700e7e7, 0xf8f800f8 +.long 0xb2b2b200, 0x00656565, 0x59005959, 0xacac00ac +.long 0xc3c3c300, 0x00878787, 0xe100e1e1, 0x36360036 +.long 0xb5b5b500, 0x006b6b6b, 0xda00dada, 0x2a2a002a +.long 0x7a7a7a00, 0x00f4f4f4, 0x3d003d3d, 0x3c3c003c +.long 0x91919100, 0x00232323, 0xc800c8c8, 0xf1f100f1 +.long 0x24242400, 0x00484848, 0x12001212, 0x40400040 +.long 0x08080800, 0x00101010, 0x04000404, 0xd3d300d3 +.long 0xe8e8e800, 0x00d1d1d1, 0x74007474, 0xbbbb00bb +.long 0xa8a8a800, 0x00515151, 0x54005454, 0x43430043 +.long 0x60606000, 0x00c0c0c0, 0x30003030, 0x15150015 +.long 0xfcfcfc00, 0x00f9f9f9, 0x7e007e7e, 0xadad00ad +.long 0x69696900, 0x00d2d2d2, 0xb400b4b4, 0x77770077 +.long 0x50505000, 0x00a0a0a0, 0x28002828, 0x80800080 +.long 0xaaaaaa00, 0x00555555, 0x55005555, 0x82820082 +.long 0xd0d0d000, 0x00a1a1a1, 0x68006868, 0xecec00ec +.long 0xa0a0a000, 0x00414141, 0x50005050, 0x27270027 +.long 0x7d7d7d00, 0x00fafafa, 0xbe00bebe, 0xe5e500e5 +.long 0xa1a1a100, 0x00434343, 0xd000d0d0, 0x85850085 +.long 0x89898900, 0x00131313, 0xc400c4c4, 0x35350035 +.long 0x62626200, 0x00c4c4c4, 0x31003131, 0x0c0c000c +.long 0x97979700, 0x002f2f2f, 0xcb00cbcb, 0x41410041 +.long 0x54545400, 0x00a8a8a8, 0x2a002a2a, 0xefef00ef +.long 0x5b5b5b00, 0x00b6b6b6, 0xad00adad, 0x93930093 +.long 0x1e1e1e00, 0x003c3c3c, 0x0f000f0f, 0x19190019 +.long 0x95959500, 0x002b2b2b, 0xca00caca, 0x21210021 +.long 0xe0e0e000, 0x00c1c1c1, 0x70007070, 0x0e0e000e +.long 0xffffff00, 0x00ffffff, 0xff00ffff, 0x4e4e004e +.long 0x64646400, 0x00c8c8c8, 0x32003232, 0x65650065 +.long 0xd2d2d200, 0x00a5a5a5, 0x69006969, 0xbdbd00bd +.long 0x10101000, 0x00202020, 0x08000808, 0xb8b800b8 +.long 0xc4c4c400, 0x00898989, 
0x62006262, 0x8f8f008f +.long 0x00000000, 0x00000000, 0x00000000, 0xebeb00eb +.long 0x48484800, 0x00909090, 0x24002424, 0xcece00ce +.long 0xa3a3a300, 0x00474747, 0xd100d1d1, 0x30300030 +.long 0xf7f7f700, 0x00efefef, 0xfb00fbfb, 0x5f5f005f +.long 0x75757500, 0x00eaeaea, 0xba00baba, 0xc5c500c5 +.long 0xdbdbdb00, 0x00b7b7b7, 0xed00eded, 0x1a1a001a +.long 0x8a8a8a00, 0x00151515, 0x45004545, 0xe1e100e1 +.long 0x03030300, 0x00060606, 0x81008181, 0xcaca00ca +.long 0xe6e6e600, 0x00cdcdcd, 0x73007373, 0x47470047 +.long 0xdadada00, 0x00b5b5b5, 0x6d006d6d, 0x3d3d003d +.long 0x09090900, 0x00121212, 0x84008484, 0x01010001 +.long 0x3f3f3f00, 0x007e7e7e, 0x9f009f9f, 0xd6d600d6 +.long 0xdddddd00, 0x00bbbbbb, 0xee00eeee, 0x56560056 +.long 0x94949400, 0x00292929, 0x4a004a4a, 0x4d4d004d +.long 0x87878700, 0x000f0f0f, 0xc300c3c3, 0x0d0d000d +.long 0x5c5c5c00, 0x00b8b8b8, 0x2e002e2e, 0x66660066 +.long 0x83838300, 0x00070707, 0xc100c1c1, 0xcccc00cc +.long 0x02020200, 0x00040404, 0x01000101, 0x2d2d002d +.long 0xcdcdcd00, 0x009b9b9b, 0xe600e6e6, 0x12120012 +.long 0x4a4a4a00, 0x00949494, 0x25002525, 0x20200020 +.long 0x90909000, 0x00212121, 0x48004848, 0xb1b100b1 +.long 0x33333300, 0x00666666, 0x99009999, 0x99990099 +.long 0x73737300, 0x00e6e6e6, 0xb900b9b9, 0x4c4c004c +.long 0x67676700, 0x00cecece, 0xb300b3b3, 0xc2c200c2 +.long 0xf6f6f600, 0x00ededed, 0x7b007b7b, 0x7e7e007e +.long 0xf3f3f300, 0x00e7e7e7, 0xf900f9f9, 0x05050005 +.long 0x9d9d9d00, 0x003b3b3b, 0xce00cece, 0xb7b700b7 +.long 0x7f7f7f00, 0x00fefefe, 0xbf00bfbf, 0x31310031 +.long 0xbfbfbf00, 0x007f7f7f, 0xdf00dfdf, 0x17170017 +.long 0xe2e2e200, 0x00c5c5c5, 0x71007171, 0xd7d700d7 +.long 0x52525200, 0x00a4a4a4, 0x29002929, 0x58580058 +.long 0x9b9b9b00, 0x00373737, 0xcd00cdcd, 0x61610061 +.long 0xd8d8d800, 0x00b1b1b1, 0x6c006c6c, 0x1b1b001b +.long 0x26262600, 0x004c4c4c, 0x13001313, 0x1c1c001c +.long 0xc8c8c800, 0x00919191, 0x64006464, 0x0f0f000f +.long 0x37373700, 0x006e6e6e, 0x9b009b9b, 0x16160016 +.long 0xc6c6c600, 0x008d8d8d, 0x63006363, 0x18180018 +.long 0x3b3b3b00, 0x00767676, 0x9d009d9d, 0x22220022 +.long 0x81818100, 0x00030303, 0xc000c0c0, 0x44440044 +.long 0x96969600, 0x002d2d2d, 0x4b004b4b, 0xb2b200b2 +.long 0x6f6f6f00, 0x00dedede, 0xb700b7b7, 0xb5b500b5 +.long 0x4b4b4b00, 0x00969696, 0xa500a5a5, 0x91910091 +.long 0x13131300, 0x00262626, 0x89008989, 0x08080008 +.long 0xbebebe00, 0x007d7d7d, 0x5f005f5f, 0xa8a800a8 +.long 0x63636300, 0x00c6c6c6, 0xb100b1b1, 0xfcfc00fc +.long 0x2e2e2e00, 0x005c5c5c, 0x17001717, 0x50500050 +.long 0xe9e9e900, 0x00d3d3d3, 0xf400f4f4, 0xd0d000d0 +.long 0x79797900, 0x00f2f2f2, 0xbc00bcbc, 0x7d7d007d +.long 0xa7a7a700, 0x004f4f4f, 0xd300d3d3, 0x89890089 +.long 0x8c8c8c00, 0x00191919, 0x46004646, 0x97970097 +.long 0x9f9f9f00, 0x003f3f3f, 0xcf00cfcf, 0x5b5b005b +.long 0x6e6e6e00, 0x00dcdcdc, 0x37003737, 0x95950095 +.long 0xbcbcbc00, 0x00797979, 0x5e005e5e, 0xffff00ff +.long 0x8e8e8e00, 0x001d1d1d, 0x47004747, 0xd2d200d2 +.long 0x29292900, 0x00525252, 0x94009494, 0xc4c400c4 +.long 0xf5f5f500, 0x00ebebeb, 0xfa00fafa, 0x48480048 +.long 0xf9f9f900, 0x00f3f3f3, 0xfc00fcfc, 0xf7f700f7 +.long 0xb6b6b600, 0x006d6d6d, 0x5b005b5b, 0xdbdb00db +.long 0x2f2f2f00, 0x005e5e5e, 0x97009797, 0x03030003 +.long 0xfdfdfd00, 0x00fbfbfb, 0xfe00fefe, 0xdada00da +.long 0xb4b4b400, 0x00696969, 0x5a005a5a, 0x3f3f003f +.long 0x59595900, 0x00b2b2b2, 0xac00acac, 0x94940094 +.long 0x78787800, 0x00f0f0f0, 0x3c003c3c, 0x5c5c005c +.long 0x98989800, 0x00313131, 0x4c004c4c, 0x02020002 +.long 0x06060600, 0x000c0c0c, 0x03000303, 0x4a4a004a +.long 0x6a6a6a00, 
0x00d4d4d4, 0x35003535, 0x33330033 +.long 0xe7e7e700, 0x00cfcfcf, 0xf300f3f3, 0x67670067 +.long 0x46464600, 0x008c8c8c, 0x23002323, 0xf3f300f3 +.long 0x71717100, 0x00e2e2e2, 0xb800b8b8, 0x7f7f007f +.long 0xbababa00, 0x00757575, 0x5d005d5d, 0xe2e200e2 +.long 0xd4d4d400, 0x00a9a9a9, 0x6a006a6a, 0x9b9b009b +.long 0x25252500, 0x004a4a4a, 0x92009292, 0x26260026 +.long 0xababab00, 0x00575757, 0xd500d5d5, 0x37370037 +.long 0x42424200, 0x00848484, 0x21002121, 0x3b3b003b +.long 0x88888800, 0x00111111, 0x44004444, 0x96960096 +.long 0xa2a2a200, 0x00454545, 0x51005151, 0x4b4b004b +.long 0x8d8d8d00, 0x001b1b1b, 0xc600c6c6, 0xbebe00be +.long 0xfafafa00, 0x00f5f5f5, 0x7d007d7d, 0x2e2e002e +.long 0x72727200, 0x00e4e4e4, 0x39003939, 0x79790079 +.long 0x07070700, 0x000e0e0e, 0x83008383, 0x8c8c008c +.long 0xb9b9b900, 0x00737373, 0xdc00dcdc, 0x6e6e006e +.long 0x55555500, 0x00aaaaaa, 0xaa00aaaa, 0x8e8e008e +.long 0xf8f8f800, 0x00f1f1f1, 0x7c007c7c, 0xf5f500f5 +.long 0xeeeeee00, 0x00dddddd, 0x77007777, 0xb6b600b6 +.long 0xacacac00, 0x00595959, 0x56005656, 0xfdfd00fd +.long 0x0a0a0a00, 0x00141414, 0x05000505, 0x59590059 +.long 0x36363600, 0x006c6c6c, 0x1b001b1b, 0x98980098 +.long 0x49494900, 0x00929292, 0xa400a4a4, 0x6a6a006a +.long 0x2a2a2a00, 0x00545454, 0x15001515, 0x46460046 +.long 0x68686800, 0x00d0d0d0, 0x34003434, 0xbaba00ba +.long 0x3c3c3c00, 0x00787878, 0x1e001e1e, 0x25250025 +.long 0x38383800, 0x00707070, 0x1c001c1c, 0x42420042 +.long 0xf1f1f100, 0x00e3e3e3, 0xf800f8f8, 0xa2a200a2 +.long 0xa4a4a400, 0x00494949, 0x52005252, 0xfafa00fa +.long 0x40404000, 0x00808080, 0x20002020, 0x07070007 +.long 0x28282800, 0x00505050, 0x14001414, 0x55550055 +.long 0xd3d3d300, 0x00a7a7a7, 0xe900e9e9, 0xeeee00ee +.long 0x7b7b7b00, 0x00f6f6f6, 0xbd00bdbd, 0x0a0a000a +.long 0xbbbbbb00, 0x00777777, 0xdd00dddd, 0x49490049 +.long 0xc9c9c900, 0x00939393, 0xe400e4e4, 0x68680068 +.long 0x43434300, 0x00868686, 0xa100a1a1, 0x38380038 +.long 0xc1c1c100, 0x00838383, 0xe000e0e0, 0xa4a400a4 +.long 0x15151500, 0x002a2a2a, 0x8a008a8a, 0x28280028 +.long 0xe3e3e300, 0x00c7c7c7, 0xf100f1f1, 0x7b7b007b +.long 0xadadad00, 0x005b5b5b, 0xd600d6d6, 0xc9c900c9 +.long 0xf4f4f400, 0x00e9e9e9, 0x7a007a7a, 0xc1c100c1 +.long 0x77777700, 0x00eeeeee, 0xbb00bbbb, 0xe3e300e3 +.long 0xc7c7c700, 0x008f8f8f, 0xe300e3e3, 0xf4f400f4 +.long 0x80808000, 0x00010101, 0x40004040, 0xc7c700c7 +.long 0x9e9e9e00, 0x003d3d3d, 0x4f004f4f, 0x9e9e009e + +#endif /*HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS*/ +#endif /*__ARM_ARCH >= 6*/ diff --git a/cipher/camellia-glue.c b/cipher/camellia-glue.c index b44bd7b..6e2319d 100644 --- a/cipher/camellia-glue.c +++ b/cipher/camellia-glue.c @@ -193,6 +193,55 @@ camellia_setkey(void *c, const byte *key, unsigned keylen) return 0; } +#ifdef USE_ARMV6_ASM + +/* Assembly implementations of CAST5. 
*/ +extern void _gcry_camellia_armv6_encrypt_block(const KEY_TABLE_TYPE keyTable, + byte *outbuf, const byte *inbuf, + const int keybits); + +extern void _gcry_camellia_armv6_decrypt_block(const KEY_TABLE_TYPE keyTable, + byte *outbuf, const byte *inbuf, + const int keybits); + +static void Camellia_EncryptBlock(const int keyBitLength, + const unsigned char *plaintext, + const KEY_TABLE_TYPE keyTable, + unsigned char *cipherText) +{ + _gcry_camellia_armv6_encrypt_block(keyTable, cipherText, plaintext, + keyBitLength); +} + +static void Camellia_DecryptBlock(const int keyBitLength, + const unsigned char *cipherText, + const KEY_TABLE_TYPE keyTable, + unsigned char *plaintext) +{ + _gcry_camellia_armv6_decrypt_block(keyTable, plaintext, cipherText, + keyBitLength); +} + +static void +camellia_encrypt(void *c, byte *outbuf, const byte *inbuf) +{ + CAMELLIA_context *ctx = c; + Camellia_EncryptBlock(ctx->keybitlength,inbuf,ctx->keytable,outbuf); +#define CAMELLIA_encrypt_stack_burn_size (15*4) + _gcry_burn_stack(CAMELLIA_encrypt_stack_burn_size); +} + +static void +camellia_decrypt(void *c, byte *outbuf, const byte *inbuf) +{ + CAMELLIA_context *ctx=c; + Camellia_DecryptBlock(ctx->keybitlength,inbuf,ctx->keytable,outbuf); +#define CAMELLIA_decrypt_stack_burn_size (15*4) + _gcry_burn_stack(CAMELLIA_decrypt_stack_burn_size); +} + +#else /*USE_ARMV6_ASM*/ + static void camellia_encrypt(void *c, byte *outbuf, const byte *inbuf) { @@ -227,6 +276,8 @@ camellia_decrypt(void *c, byte *outbuf, const byte *inbuf) _gcry_burn_stack(CAMELLIA_decrypt_stack_burn_size); } +#endif /*!USE_ARMV6_ASM*/ + /* Bulk encryption of complete blocks in CTR mode. This function is only intended for the bulk encryption feature of cipher.c. CTR is expected to be of size CAMELLIA_BLOCK_SIZE. */ diff --git a/cipher/camellia.c b/cipher/camellia.c index cd46885..038d911 100644 --- a/cipher/camellia.c +++ b/cipher/camellia.c @@ -869,6 +869,7 @@ void camellia_setup192(const unsigned char *key, u32 *subkey) } +#ifndef USE_ARMV6_ASM /** * Stuff related to camellia encryption/decryption * @@ -1328,6 +1329,8 @@ void camellia_decrypt256(const u32 *subkey, u32 *blocks) return; } +#endif /*!USE_ARMV6_ASM*/ + /*** * @@ -1354,6 +1357,7 @@ void Camellia_Ekeygen(const int keyBitLength, } +#ifndef USE_ARMV6_ASM void Camellia_EncryptBlock(const int keyBitLength, const unsigned char *plaintext, const KEY_TABLE_TYPE keyTable, @@ -1414,3 +1418,4 @@ void Camellia_DecryptBlock(const int keyBitLength, PUTU32(plaintext + 8, tmp[2]); PUTU32(plaintext + 12, tmp[3]); } +#endif /*!USE_ARMV6_ASM*/ diff --git a/cipher/camellia.h b/cipher/camellia.h index cccf786..48f9160 100644 --- a/cipher/camellia.h +++ b/cipher/camellia.h @@ -30,6 +30,20 @@ */ #ifdef HAVE_CONFIG_H #include +/* USE_ARMV6_ASM indicates whether to use ARMv6 assembly code. 
 */
+# undef USE_ARMV6_ASM
+# if defined(__arm__) && defined(__ARMEL__) && \
+     ((defined(__ARM_ARCH) && __ARM_ARCH >= 6) \
+     || defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) \
+     || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) \
+     || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6T2__) \
+     || defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) \
+     || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) \
+     || defined(__ARM_ARCH_7EM__))
+#  ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS
+#   define USE_ARMV6_ASM 1
+#  endif
+# endif
 #endif

 #ifdef CAMELLIA_EXT_SYM_PREFIX
 #define CAMELLIA_PREFIX1(x,y) x ## y
@@ -63,6 +77,7 @@ void Camellia_Ekeygen(const int keyBitLength,
 		      const unsigned char *rawKey,
 		      KEY_TABLE_TYPE keyTable);
 
+#ifndef USE_ARMV6_ASM
 void Camellia_EncryptBlock(const int keyBitLength,
 			   const unsigned char *plaintext,
 			   const KEY_TABLE_TYPE keyTable,
@@ -72,6 +87,7 @@ void Camellia_DecryptBlock(const int keyBitLength,
 			   const unsigned char *cipherText,
 			   const KEY_TABLE_TYPE keyTable,
 			   unsigned char *plaintext);
+#endif /*!USE_ARMV6_ASM*/
 
 #ifdef __cplusplus
diff --git a/configure.ac b/configure.ac
index 22dab9d..c33f36b 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1375,6 +1375,13 @@ if test "$found" = "1" ; then
       GCRYPT_CIPHERS="$GCRYPT_CIPHERS camellia.lo camellia-glue.lo"
       AC_DEFINE(USE_CAMELLIA, 1, [Defined if this module should be included])
 
+      case "${host}" in
+      arm*-*-*)
+         # Build with the assembly implementation
+         GCRYPT_CIPHERS="$GCRYPT_CIPHERS camellia-armv6.lo"
+      ;;
+      esac
+
       if test x"$avxsupport" = xyes ; then
         if test x"$aesnisupport" = xyes ; then
           # Build with the AES-NI/AVX implementation

From jussi.kivilinna at iki.fi  Fri Aug 16 15:07:46 2013
From: jussi.kivilinna at iki.fi (Jussi Kivilinna)
Date: Fri, 16 Aug 2013 16:07:46 +0300
Subject: [PATCH 0/2] Add ARMv6 assembly implementations of AES and CAST5
In-Reply-To: <20130815114434.27965.44657.stgit@localhost6.localdomain6>
References: <20130815114434.27965.44657.stgit@localhost6.localdomain6>
Message-ID: <520E2422.1020709@iki.fi>

Do these (and blowfish & camellia) look OK to push?

-Jussi

On 15.08.2013 14:44, Jussi Kivilinna wrote:
> These two patches add ARMv6 implementations of AES and CAST5, tuned for
> Cortex-A8.
> 
> ---
> 
> Jussi Kivilinna (2):
>       rijndael: add ARMv6 assembly implementation
>       cast5: add ARMv6 assembly implementation
> 
> 
>  cipher/Makefile.am      |    4 
>  cipher/cast5-armv6.S    |  708 +++++++++++++++++++++++++++++++++++++++
>  cipher/cast5.c          |  137 +++++++
>  cipher/rijndael-armv6.S |  860 +++++++++++++++++++++++++++++++++++++++++++++++
>  cipher/rijndael.c       |   48 ++-
>  configure.ac            |   36 ++
>  6 files changed, 1781 insertions(+), 12 deletions(-)
>  create mode 100644 cipher/cast5-armv6.S
>  create mode 100644 cipher/rijndael-armv6.S
> 
> _______________________________________________
> Gcrypt-devel mailing list
> Gcrypt-devel at gnupg.org
> http://lists.gnupg.org/mailman/listinfo/gcrypt-devel
> 

From wk at gnupg.org  Fri Aug 16 16:48:49 2013
From: wk at gnupg.org (Werner Koch)
Date: Fri, 16 Aug 2013 16:48:49 +0200
Subject: [PATCH 0/2] Add ARMv6 assembly implementations of AES and CAST5
In-Reply-To: <520E2422.1020709@iki.fi> (Jussi Kivilinna's message of "Fri,
	16 Aug 2013 16:07:46 +0300")
References: <20130815114434.27965.44657.stgit@localhost6.localdomain6>
	<520E2422.1020709@iki.fi>
Message-ID: <8761v57ktq.fsf@vigenere.g10code.de>

On Fri, 16 Aug 2013 15:07, jussi.kivilinna at iki.fi said:

> Do these (and blowfish & camellia) look OK to push?

Please push them.  It is always good to make it easier for people to
test them.
Salam-Shalom,

   Werner

-- 
Thoughts are free.  Exceptions are regulated by a federal law.

From jussi.kivilinna at iki.fi  Mon Aug 19 11:15:56 2013
From: jussi.kivilinna at iki.fi (Jussi Kivilinna)
Date: Mon, 19 Aug 2013 12:15:56 +0300
Subject: [PATCH 1/5] Remove burn_stack optimization
Message-ID: <20130819091556.5607.2880.stgit@localhost6.localdomain6>

* src/misc.c (_gcry_burn_stack): Remove SIZEOF_UNSIGNED_LONG == 4 or 8
optimization.
--

At least GCC 4.6 on Debian Wheezy (armhf) generates wrong code for
burn_stack, causing the recursive structure to be transformed into an
iterative one without updating the stack pointer between iterations.
As a result, only the first 64 bytes of the stack get zeroed. This
appears to be fixed in GCC 4.7, but let's play it safe and remove this
optimization. A better approach would probably be to add
architecture-specific assembly routine(s) to replace this generic
function.

Signed-off-by: Jussi Kivilinna
---
 src/misc.c |   27 +--------------------------
 1 file changed, 1 insertion(+), 26 deletions(-)

diff --git a/src/misc.c b/src/misc.c
index 67c2e80..2d9c73a 100644
--- a/src/misc.c
+++ b/src/misc.c
@@ -290,35 +290,10 @@ _gcry_log_printhex (const char *text, const void *buffer, size_t length)
 void
 _gcry_burn_stack (int bytes)
 {
-#if SIZEOF_UNSIGNED_LONG == 4 || SIZEOF_UNSIGNED_LONG == 8
-  /* Optimized burn_stack for 32-bit and 64-bit architectures.  In addition
-     to loop unrolling, compiler sees that writes are within 'buf' and
-     generation of stack-protection code is avoided.  */
-  volatile unsigned long buf[64 / SIZEOF_UNSIGNED_LONG];
-
-  buf[0] = 0;
-  buf[1] = 0;
-  buf[2] = 0;
-  buf[3] = 0;
-  buf[4] = 0;
-  buf[5] = 0;
-  buf[6] = 0;
-  buf[7] = 0;
-# if SIZEOF_UNSIGNED_LONG == 4
-  buf[8] = 0;
-  buf[9] = 0;
-  buf[10] = 0;
-  buf[11] = 0;
-  buf[12] = 0;
-  buf[13] = 0;
-  buf[14] = 0;
-  buf[15] = 0;
-# endif
-#else
   char buf[64];
 
   wipememory (buf, sizeof buf);
-#endif
+
   bytes -= sizeof buf;
   if (bytes > 0)
     _gcry_burn_stack (bytes);

From jussi.kivilinna at iki.fi  Mon Aug 19 11:16:06 2013
From: jussi.kivilinna at iki.fi (Jussi Kivilinna)
Date: Mon, 19 Aug 2013 12:16:06 +0300
Subject: [PATCH 3/5] Add optimized wipememory for ARM
In-Reply-To: <20130819091556.5607.2880.stgit@localhost6.localdomain6>
References: <20130819091556.5607.2880.stgit@localhost6.localdomain6>
Message-ID: <20130819091606.5607.3970.stgit@localhost6.localdomain6>

* src/g10lib.h [__arm__] (fast_wipememory2_unaligned_head)
(fast_wipememory2): New macros.
--

The previous patch, which removed the _gcry_burn_stack optimization,
causes burn_stack to take over 30% of CPU usage when looping 'benchmark
cipher blowfish' on ARM/Cortex-A8. Optimizing wipememory2 for ARM helps
the situation a lot.
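The fast_wipememory2 inline assembly added in the g10lib.h diff that
follows the benchmark table behaves roughly like this C sketch (for
illustration only; the function name is invented, and unlike the real
macro the head-alignment loop here runs unconditionally -- the macro
skips it when __ARM_FEATURE_UNALIGNED is defined):

  #include <stddef.h>

  /* Replicate the fill byte into a 32-bit word (unsigned long on ARM),
     align the pointer byte-by-byte, then store two words per iteration
     -- matching the 'stm %[ptr]!, {r4, r5}' loop -- and leave the
     sub-8-byte tail to the generic byte-wise loop.  */
  static void
  wipe_sketch (volatile unsigned char *p, unsigned char set, size_t len)
  {
    unsigned long pattern = set * 0x01010101UL;

    while (((size_t)p & 3) && len)      /* unaligned head */
      { *p++ = set; len--; }

    while (len >= 8)                    /* two words per iteration */
      {
        *(volatile unsigned long *)p = pattern;
        *(volatile unsigned long *)(p + 4) = pattern;
        p += 8;
        len -= 8;
      }

    while (len--)                       /* tail bytes */
      *p++ = set;
  }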
Old vs new (Cortex-A8):

             ECB/Stream       CBC              CFB              OFB              CTR
             ---------------  ---------------  ---------------  ---------------  ---------------
IDEA         1.20x 1.18x      1.16x 1.15x      1.16x 1.18x      1.18x 1.16x      1.16x 1.17x
3DES         1.14x 1.14x      1.12x 1.13x      1.12x 1.13x      1.12x 1.13x      1.13x 1.15x
CAST5        1.66x 1.67x      1.43x 1.00x      1.48x 1.00x      1.44x 1.44x      1.04x 0.96x
BLOWFISH     1.56x 1.66x      1.47x 1.00x      1.54x 1.05x      1.44x 1.47x      1.00x 1.00x
AES          1.52x 1.42x      1.04x 1.00x      1.00x 1.00x      1.38x 1.37x      1.00x 1.00x
AES192       1.36x 1.36x      1.00x 1.00x      1.00x 1.04x      1.26x 1.22x      1.00x 1.04x
AES256       1.32x 1.31x      1.03x 1.00x      1.00x 1.00x      1.24x 1.30x      1.03x 0.97x
TWOFISH      1.31x 1.26x      1.23x 1.00x      1.25x 1.00x      1.24x 1.23x      1.00x 1.03x
ARCFOUR      1.05x 0.96x
DES          1.31x 1.33x      1.26x 1.29x      1.28x 1.29x      1.26x 1.29x      1.27x 1.29x
TWOFISH128   1.27x 1.24x      1.23x 1.00x      1.28x 1.00x      1.21x 1.26x      0.97x 1.06x
SERPENT128   1.19x 1.19x      1.15x 1.00x      1.14x 1.00x      1.17x 1.17x      0.98x 1.00x
SERPENT192   1.19x 1.24x      1.17x 1.00x      1.14x 1.00x      1.15x 1.17x      1.00x 1.00x
SERPENT256   1.16x 1.19x      1.17x 1.00x      1.14x 1.00x      1.15x 1.15x      1.00x 1.00x
RFC2268_40   1.00x 0.99x      1.00x 1.01x      1.00x 1.00x      1.03x 1.00x      1.01x 1.00x
SEED         1.20x 1.20x      1.18x 1.17x      1.17x 1.19x      1.18x 1.16x      1.19x 1.19x
CAMELLIA128  1.38x 1.34x      1.31x 1.00x      1.31x 1.00x      1.29x 1.32x      1.00x 1.00x
CAMELLIA192  1.27x 1.27x      1.23x 1.00x      1.25x 1.03x      1.20x 1.23x      1.00x 1.00x
CAMELLIA256  1.27x 1.27x      1.26x 1.00x      1.25x 1.03x      1.20x 1.23x      1.00x 1.00x
SALSA20      1.04x 1.00x

Note: bulk encryption/decryption performs burn_stack after processing the full buffer, instead of after each block.

Signed-off-by: Jussi Kivilinna --- src/g10lib.h | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/src/g10lib.h b/src/g10lib.h index e6d20e9..198ab38 100644 --- a/src/g10lib.h +++ b/src/g10lib.h @@ -248,7 +248,7 @@ void _gcry_burn_stack (int bytes); #define wipememory(_ptr,_len) wipememory2(_ptr,0,_len) -/* Optimized fast_wipememory2 for i386 and x86-64 architechtures. Maybe leave +/* Optimized fast_wipememory2 for i386, x86-64 and arm architectures. May leave tail bytes unhandled, in which case tail bytes are handled by wipememory2.
*/ #if defined(__x86_64__) && __GNUC__ >= 4 @@ -283,6 +283,38 @@ void _gcry_burn_stack (int bytes); _vptr += 4; \ } while (_vlen >= 4); \ } while (0) +#elif defined (__arm__) && (defined (__thumb2__) || !defined (__thumb__)) && \ + __GNUC__ >= 4 + +#ifdef __ARM_FEATURE_UNALIGNED +#define fast_wipememory2_unaligned_head(_ptr,_set,_len) /*do nothing*/ +#else +#define fast_wipememory2_unaligned_head(_vptr,_vset,_vlen) do { \ + while((size_t)(_vptr)&3 && _vlen) \ + { *_vptr=(_vset); _vptr++; _vlen--; } \ + } while(0) +#endif + +#define fast_wipememory2(_vptr,_vset,_vlen) do { \ + unsigned long _vset4 = _vset; \ + fast_wipememory2_unaligned_head(_vptr,_vset,_vlen); \ + if (_vlen < 8) \ + break; \ + _vset4 *= 0x01010101; \ + asm volatile( \ + "mov %%r4, %[set];\n\t" \ + "mov %%r5, %[set];\n\t" \ + "1:;\n\t" \ + "stm %[ptr]!, {%%r4, %%r5};\n\t" \ + "cmp %[end], %[ptr];\n\t" \ + "bne 1b;\n\t" \ + : [ptr] "=r" (_vptr) \ + : [set] "r" (_vset4), \ + [end] "r" (_vptr+(_vlen&(~0x7))), \ + "0" (_vptr) \ + : "memory", "r4", "r5", "cc"); \ + _vlen &= 0x7; \ + } while (0) #else #define fast_wipememory2(_ptr,_set,_len) #endif From jussi.kivilinna at iki.fi Mon Aug 19 11:16:01 2013 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Mon, 19 Aug 2013 12:16:01 +0300 Subject: [PATCH 2/5] cipher: bufhelp: allow unaligned memory accesses on ARM In-Reply-To: <20130819091556.5607.2880.stgit@localhost6.localdomain6> References: <20130819091556.5607.2880.stgit@localhost6.localdomain6> Message-ID: <20130819091601.5607.70684.stgit@localhost6.localdomain6> * cipher/bufhelp.h [__arm__ && __ARM_FEATURE_UNALIGNED]: Enable BUFHELP_FAST_UNALIGNED_ACCESS. -- Newer ARM systems support unaligned memory accesses and on gcc-4.7 and onwards this is identified by __ARM_FEATURE_UNALIGNED macro. Signed-off-by: Jussi Kivilinna --- cipher/bufhelp.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cipher/bufhelp.h b/cipher/bufhelp.h index 1c173e2..d829cf1 100644 --- a/cipher/bufhelp.h +++ b/cipher/bufhelp.h @@ -29,7 +29,8 @@ #endif -#if defined(__i386__) || defined(__x86_64__) +#if defined(__i386__) || defined(__x86_64__) || \ + (defined(__arm__) && defined(__ARM_FEATURE_UNALIGNED)) /* These architectures are able of unaligned memory accesses and can handle those fast. */ From jussi.kivilinna at iki.fi Mon Aug 19 11:16:16 2013 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Mon, 19 Aug 2013 12:16:16 +0300 Subject: [PATCH 5/5] mpi: add ARMv6 assembly In-Reply-To: <20130819091556.5607.2880.stgit@localhost6.localdomain6> References: <20130819091556.5607.2880.stgit@localhost6.localdomain6> Message-ID: <20130819091616.5607.89084.stgit@localhost6.localdomain6> * mpi/armv6/mpi-asm-defs.h: New. * mpi/armv6/mpih-add1.S: New. * mpi/armv6/mpih-mul1.S: New. * mpi/armv6/mpih-mul2.S: New. * mpi/armv6/mpih-mul3.S: New. * mpi/armv6/mpih-sub1.S: New. * mpi/config.links [arm]: Enable ARMv6 assembly. -- Add mpi assembly for ARMv6 (or later). These are partly based on ARM assembly found in GMP 4.2.1. 
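For orientation, the semantics these helpers implement are simple; a plain-C sketch of what _gcry_mpih_add_n computes (an illustration, not libgcrypt's actual generic implementation, which defines the limb types in its own internal headers) looks like this:

typedef unsigned long mpi_limb_t;  /* one machine word ("limb"), per BYTES_PER_MPI_LIMB */

/* Sketch of add_n: rp = s1p + s2p over n limbs; returns the final carry. */
static mpi_limb_t
mpih_add_n_sketch (mpi_limb_t *rp, const mpi_limb_t *s1p,
                   const mpi_limb_t *s2p, long n)
{
  mpi_limb_t cy = 0;
  long i;

  for (i = 0; i < n; i++)
    {
      mpi_limb_t s = s1p[i] + s2p[i];
      mpi_limb_t c1 = (s < s1p[i]);   /* carry out of s1 + s2 */
      rp[i] = s + cy;
      cy = c1 | (rp[i] < s);          /* plus carry out of adding cy */
    }
  return cy;
}

The assembly versions below compute exactly this, but keep the carry in the CPU flags (adcs/sbcs) and process four limbs per iteration with ldm/stm once the leftover count has been handled.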
Old vs new (Cortex-A8, 1 GHz):

Algorithm         generate  100*sign  100*verify
------------------------------------------------
ECDSA 192 bit     1.14x     1.10x     1.13x
ECDSA 224 bit     1.11x     1.12x     1.12x
ECDSA 256 bit     1.20x     1.13x     1.14x
ECDSA 384 bit     1.13x     1.21x     1.21x
ECDSA 521 bit     1.17x     1.20x     1.22x

Algorithm         generate  100*sign  100*verify
------------------------------------------------
RSA 1024 bit      -         1.31x     1.60x
RSA 2048 bit      -         1.41x     1.47x
RSA 3072 bit      -         1.50x     1.63x
RSA 4096 bit      -         1.50x     1.57x

Algorithm         generate  100*sign  100*verify
------------------------------------------------
DSA 1024/160      -         1.39x     1.38x
DSA 2048/224      -         1.50x     1.51x
DSA 3072/256      -         1.59x     1.64x

NEW:

Algorithm         generate  100*sign  100*verify
------------------------------------------------
ECDSA 192 bit     70ms      1750ms    3170ms
ECDSA 224 bit     90ms      2210ms    4250ms
ECDSA 256 bit     100ms     2710ms    5170ms
ECDSA 384 bit     230ms     5670ms    11040ms
ECDSA 521 bit     540ms     13370ms   25870ms

Algorithm         generate  100*sign  100*verify
------------------------------------------------
RSA 1024 bit      360ms     2200ms    50ms
RSA 2048 bit      2770ms    11900ms   150ms
RSA 3072 bit      6680ms    32530ms   270ms
RSA 4096 bit      10320ms   69440ms   460ms

Algorithm         generate  100*sign  100*verify
------------------------------------------------
DSA 1024/160      -         990ms     910ms
DSA 2048/224      -         3830ms    3410ms
DSA 3072/256      -         8270ms    7030ms

OLD:

Algorithm         generate  100*sign  100*verify
------------------------------------------------
ECDSA 192 bit     80ms      1920ms    3580ms
ECDSA 224 bit     100ms     2470ms    4760ms
ECDSA 256 bit     120ms     3050ms    5870ms
ECDSA 384 bit     260ms     6840ms    13330ms
ECDSA 521 bit     630ms     16080ms   31500ms

Algorithm         generate  100*sign  100*verify
------------------------------------------------
RSA 1024 bit      450ms     2890ms    80ms
RSA 2048 bit      2320ms    16760ms   220ms
RSA 3072 bit      26300ms   48650ms   440ms
RSA 4096 bit      15700ms   103910ms  720ms

Algorithm         generate  100*sign  100*verify
------------------------------------------------
DSA 1024/160      -         1380ms    1260ms
DSA 2048/224      -         5740ms    5140ms
DSA 3072/256      -         13130ms   11510ms

Signed-off-by: Jussi Kivilinna --- mpi/armv6/mpi-asm-defs.h | 10 +++++ mpi/armv6/mpih-add1.S | 76 ++++++++++++++++++++++++++++++++++++ mpi/armv6/mpih-mul1.S | 80 ++++++++++++++++++++++++++++++++++++++ mpi/armv6/mpih-mul2.S | 94 +++++++++++++++++++++++++++++++++++++++++++++ mpi/armv6/mpih-mul3.S | 97 ++++++++++++++++++++++++++++++++++++++++++++++ mpi/armv6/mpih-sub1.S | 77 +++++++++++++++++++++++++++++++++++++ mpi/config.links | 15 +++++++ 7 files changed, 449 insertions(+) create mode 100644 mpi/armv6/mpi-asm-defs.h create mode 100644 mpi/armv6/mpih-add1.S create mode 100644 mpi/armv6/mpih-mul1.S create mode 100644 mpi/armv6/mpih-mul2.S create mode 100644 mpi/armv6/mpih-mul3.S create mode 100644 mpi/armv6/mpih-sub1.S diff --git a/mpi/armv6/mpi-asm-defs.h b/mpi/armv6/mpi-asm-defs.h new file mode 100644 index 0000000..13424e2 --- /dev/null +++ b/mpi/armv6/mpi-asm-defs.h @@ -0,0 +1,10 @@ +/* This file defines some basic constants for the MPI machinery. We + * need to define the types on a per-CPU basis, so it is done with + * this file here. */ +#define BYTES_PER_MPI_LIMB (SIZEOF_UNSIGNED_LONG) + + + + + + diff --git a/mpi/armv6/mpih-add1.S b/mpi/armv6/mpih-add1.S new file mode 100644 index 0000000..60ea4c3 --- /dev/null +++ b/mpi/armv6/mpih-add1.S @@ -0,0 +1,76 @@ +/* ARMv6 add_n -- Add two limb vectors of the same length > 0 and store + * sum in a third limb vector. + * + * Copyright © 2013 Jussi Kivilinna + * + * This file is part of Libgcrypt.
+ * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . + * + * Note: This code is heavily based on the GNU MP Library (version 4.2.1). + */ + +#include "sysdep.h" +#include "asm-syntax.h" + +.syntax unified +.arm + +/******************* + * mpi_limb_t + * _gcry_mpih_add_n( mpi_ptr_t res_ptr, %r0 + * mpi_ptr_t s1_ptr, %r1 + * mpi_ptr_t s2_ptr, %r2 + * mpi_size_t size) %r3 + */ + +.text + +.globl _gcry_mpih_add_n +.type _gcry_mpih_add_n,%function +_gcry_mpih_add_n: + push {%r4, %r5, %r6, %r7, %r8, %r9, %r10, %lr}; + cmn %r0, #0; /* clear carry flag */ + + tst %r3, #3; + beq .Large_loop; + +.Loop: + ldr %r4, [%r1], #4; + sub %r3, #1; + ldr %lr, [%r2], #4; + adcs %r4, %lr; + tst %r3, #3; + str %r4, [%r0], #4; + bne .Loop; + + teq %r3, #0; + beq .Lend; + +.Large_loop: + ldm %r1!, {%r4, %r6, %r8, %r10}; + ldm %r2!, {%r5, %r7, %r9, %lr}; + sub %r3, #4; + adcs %r4, %r5; + adcs %r6, %r7; + adcs %r8, %r9; + adcs %r10, %lr; + teq %r3, #0; + stm %r0!, {%r4, %r6, %r8, %r10}; + bne .Large_loop; + +.Lend: + adc %r0, %r3, #0; + pop {%r4, %r5, %r6, %r7, %r8, %r9, %r10, %pc}; +.size _gcry_mpih_add_n,.-_gcry_mpih_add_n; diff --git a/mpi/armv6/mpih-mul1.S b/mpi/armv6/mpih-mul1.S new file mode 100644 index 0000000..ae19a15 --- /dev/null +++ b/mpi/armv6/mpih-mul1.S @@ -0,0 +1,80 @@ +/* ARMv6 mul_1 -- Multiply a limb vector with a limb and store the result in + * a second limb vector. + * + * Copyright ? 2013 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . + * + * Note: This code is heavily based on the GNU MP Library (version 4.2.1). 
+ */ + +#include "sysdep.h" +#include "asm-syntax.h" + +.syntax unified +.arm + +/******************* + * mpi_limb_t + * _gcry_mpih_mul_1( mpi_ptr_t res_ptr, %r0 + * mpi_ptr_t s1_ptr, %r1 + * mpi_size_t s1_size, %r2 + * mpi_limb_t s2_limb) %r3 + */ + +.text + +.globl _gcry_mpih_mul_1 +.type _gcry_mpih_mul_1,%function +_gcry_mpih_mul_1: + push {%r4, %r5, %r6, %r7, %r8, %r9, %r10, %r11, %lr}; + mov %r4, #0; + + tst %r2, #3; + beq .Large_loop; + +.Loop: + ldr %r5, [%r1], #4; + mov %lr, #0; + umlal %r4, %lr, %r5, %r3; + sub %r2, #1; + str %r4, [%r0], #4; + tst %r2, #3; + mov %r4, %lr; + bne .Loop; + + teq %r2, #0; + beq .Lend; + +.Large_loop: + mov %r9, #0; + ldm %r1!, {%r5, %r6, %r7, %r8}; + mov %r10, #0; + umlal %r4, %r9, %r5, %r3; + mov %r11, #0; + umlal %r9, %r10, %r6, %r3; + mov %lr, #0; + umlal %r10, %r11, %r7, %r3; + subs %r2, #4; + umlal %r11, %lr, %r8, %r3; + stm %r0!, {%r4, %r9, %r10, %r11}; + mov %r4, %lr; + bne .Large_loop; + +.Lend: + mov %r0, %r4; + pop {%r4, %r5, %r6, %r7, %r8, %r9, %r10, %r11, %pc}; +.size _gcry_mpih_mul_1,.-_gcry_mpih_mul_1; diff --git a/mpi/armv6/mpih-mul2.S b/mpi/armv6/mpih-mul2.S new file mode 100644 index 0000000..02f7c07 --- /dev/null +++ b/mpi/armv6/mpih-mul2.S @@ -0,0 +1,94 @@ +/* ARMv6 mul_2 -- Multiply a limb vector with a limb and add the result to + * a second limb vector. + * + * Copyright ? 2013 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . + * + * Note: This code is heavily based on the GNU MP Library (version 4.2.1). 
+ */ + +#include "sysdep.h" +#include "asm-syntax.h" + +.syntax unified +.arm + +/******************* + * mpi_limb_t + * _gcry_mpih_addmul_1( mpi_ptr_t res_ptr, %r0 + * mpi_ptr_t s1_ptr, %r1 + * mpi_size_t s1_size, %r2 + * mpi_limb_t s2_limb) %r3 + */ + +.text + +.globl _gcry_mpih_addmul_1 +.type _gcry_mpih_addmul_1,%function +_gcry_mpih_addmul_1: + push {%r4, %r5, %r6, %r8, %r10, %lr}; + mov %lr, #0; + cmn %r0, #0; /* clear carry flag */ + + tst %r2, #3; + beq .Large_loop; +.Loop: + ldr %r5, [%r1], #4; + ldr %r4, [%r0]; + sub %r2, #1; + adcs %r4, %lr; + mov %lr, #0; + umlal %r4, %lr, %r5, %r3; + tst %r2, #3; + str %r4, [%r0], #4; + bne .Loop; + + teq %r2, #0; + beq .Lend; + +.Large_loop: + ldm %r0, {%r4, %r6, %r8, %r10}; + ldr %r5, [%r1], #4; + + sub %r2, #4; + adcs %r4, %lr; + mov %lr, #0; + umlal %r4, %lr, %r5, %r3; + + ldr %r5, [%r1], #4; + adcs %r6, %lr; + mov %lr, #0; + umlal %r6, %lr, %r5, %r3; + + ldr %r5, [%r1], #4; + adcs %r8, %lr; + mov %lr, #0; + umlal %r8, %lr, %r5, %r3; + + ldr %r5, [%r1], #4; + adcs %r10, %lr; + mov %lr, #0; + umlal %r10, %lr, %r5, %r3; + + teq %r2, #0; + stm %r0!, {%r4, %r6, %r8, %r10}; + bne .Large_loop; + +.Lend: + adc %r0, %lr, #0; + pop {%r4, %r5, %r6, %r8, %r10, %pc}; +.size _gcry_mpih_addmul_1,.-_gcry_mpih_addmul_1; diff --git a/mpi/armv6/mpih-mul3.S b/mpi/armv6/mpih-mul3.S new file mode 100644 index 0000000..e42fc30 --- /dev/null +++ b/mpi/armv6/mpih-mul3.S @@ -0,0 +1,97 @@ +/* ARMv6 mul_3 -- Multiply a limb vector with a limb and subtract the result + * from a second limb vector. + * + * Copyright ? 2013 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . + * + * Note: This code is heavily based on the GNU MP Library (version 4.2.1). 
+ */ + +#include "sysdep.h" +#include "asm-syntax.h" + +.syntax unified +.arm + +/******************* + * mpi_limb_t + * _gcry_mpih_submul_1( mpi_ptr_t res_ptr, %r0 + * mpi_ptr_t s1_ptr, %r1 + * mpi_size_t s1_size, %r2 + * mpi_limb_t s2_limb) %r3 + */ + +.text + +.globl _gcry_mpih_submul_1 +.type _gcry_mpih_submul_1,%function +_gcry_mpih_submul_1: + push {%r4, %r5, %r6, %r8, %r9, %r10, %lr}; + mov %lr, #0; + cmp %r0, #0; /* prepare carry flag for sbc */ + + tst %r2, #3; + beq .Large_loop; +.Loop: + ldr %r5, [%r1], #4; + mov %r4, %lr; + mov %lr, #0; + ldr %r6, [%r0]; + umlal %r4, %lr, %r5, %r3; + sub %r2, #1; + sbcs %r4, %r6, %r4; + tst %r2, #3; + str %r4, [%r0], #4; + bne .Loop; + + teq %r2, #0; + beq .Lend; + +.Large_loop: + ldr %r5, [%r1], #4; + ldm %r0, {%r4, %r6, %r8, %r10}; + + mov %r9, #0; + umlal %lr, %r9, %r5, %r3; + ldr %r5, [%r1], #4; + sbcs %r4, %r4, %lr; + + mov %lr, #0; + umlal %r9, %lr, %r5, %r3; + ldr %r5, [%r1], #4; + sbcs %r6, %r6, %r9; + + mov %r9, #0; + umlal %lr, %r9, %r5, %r3; + ldr %r5, [%r1], #4; + sbcs %r8, %r8, %lr; + + mov %lr, #0; + umlal %r9, %lr, %r5, %r3; + sub %r2, #4; + sbcs %r10, %r10, %r9; + + teq %r2, #0; + stm %r0!, {%r4, %r6, %r8, %r10}; + bne .Large_loop; + +.Lend: + it cc + movcc %r2, #1; + add %r0, %lr, %r2; + pop {%r4, %r5, %r6, %r8, %r9, %r10, %pc}; +.size _gcry_mpih_submul_1,.-_gcry_mpih_submul_1; diff --git a/mpi/armv6/mpih-sub1.S b/mpi/armv6/mpih-sub1.S new file mode 100644 index 0000000..77d05eb --- /dev/null +++ b/mpi/armv6/mpih-sub1.S @@ -0,0 +1,77 @@ +/* ARMv6 sub_n -- Subtract two limb vectors of the same length > 0 and store + * sum in a third limb vector. + * + * Copyright ? 2013 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . + * + * Note: This code is heavily based on the GNU MP Library (version 4.2.1). 
+ */ + +#include "sysdep.h" +#include "asm-syntax.h" + +.syntax unified +.arm + +/******************* + * mpi_limb_t + * _gcry_mpih_sub_n( mpi_ptr_t res_ptr, %r0 + * mpi_ptr_t s1_ptr, %r1 + * mpi_ptr_t s2_ptr, %r2 + * mpi_size_t size) %r3 + */ + +.text + +.globl _gcry_mpih_sub_n +.type _gcry_mpih_sub_n,%function +_gcry_mpih_sub_n: + push {%r4, %r5, %r6, %r7, %r8, %r9, %r10, %lr}; + cmp %r0, #0; /* prepare carry flag for sub */ + + tst %r3, #3; + beq .Large_loop; + +.Loop: + ldr %r4, [%r1], #4; + sub %r3, #1; + ldr %lr, [%r2], #4; + sbcs %r4, %lr; + tst %r3, #3; + str %r4, [%r0], #4; + bne .Loop; + + teq %r3, #0; + beq .Lend; + +.Large_loop: + ldm %r1!, {%r4, %r6, %r8, %r10}; + sub %r3, #4; + ldm %r2!, {%r5, %r7, %r9, %lr}; + sbcs %r4, %r5; + sbcs %r6, %r7; + sbcs %r8, %r9; + sbcs %r10, %lr; + teq %r3, #0; + stm %r0!, {%r4, %r6, %r8, %r10}; + bne .Large_loop; + +.Lend: + sbc %r0, %r3, #0; + neg %r0, %r0; + pop {%r4, %r5, %r6, %r7, %r8, %r9, %r10, %pc}; +.size _gcry_mpih_sub_n,.-_gcry_mpih_sub_n; diff --git a/mpi/config.links b/mpi/config.links index bcc6e3e..f300255 100644 --- a/mpi/config.links +++ b/mpi/config.links @@ -136,6 +136,21 @@ case "${host}" in mpi_extra_modules="udiv-qrnnd" mpi_cpu_arch="alpha" ;; + arm*-*-*) + if test "$gcry_cv_gcc_arm_platform_as_ok" = "yes" ; then + if test "$gcry_cv_cc_arm_arch_is_v6" = "yes" ; then + echo '/* configured for armv6 */' >>./mpi/asm-syntax.h + path="armv6" + mpi_cpu_arch="armv6" + else + echo '/* No assembler modules configured */' >>./mpi/asm-syntax.h + path="" + fi + else + echo '/* No assembler modules configured */' >>./mpi/asm-syntax.h + path="" + fi + ;; hppa7000*-*-*) echo '/* configured for HPPA (pa7000) */' >>./mpi/asm-syntax.h path="hppa1.1 hppa" From jussi.kivilinna at iki.fi Mon Aug 19 11:16:11 2013 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Mon, 19 Aug 2013 12:16:11 +0300 Subject: [PATCH 4/5] Move ARMv6 detection to configure.ac In-Reply-To: <20130819091556.5607.2880.stgit@localhost6.localdomain6> References: <20130819091556.5607.2880.stgit@localhost6.localdomain6> Message-ID: <20130819091611.5607.51246.stgit@localhost6.localdomain6> * cipher/blowfish-armv6.S: Replace __ARM_ARCH >= 6 checks with HAVE_ARM_ARCH_V6. * cipher/blowfish.c: Ditto. * cipher/camellia-armv6.S: Ditto. * cipher/camellia.h: Ditto. * cipher/cast5-armv6.S: Ditto. * cipher/cast5.c: Ditto. * cipher/rijndael-armv6.S: Ditto. * cipher/rijndael.c: Ditto. * configure.ac: Add HAVE_ARM_ARCH_V6 check. 
-- Signed-off-by: Jussi Kivilinna --- cipher/blowfish-armv6.S | 9 +-------- cipher/blowfish.c | 9 +-------- cipher/camellia-armv6.S | 9 +-------- cipher/camellia.h | 9 +-------- cipher/cast5-armv6.S | 9 +-------- cipher/cast5.c | 9 +-------- cipher/rijndael-armv6.S | 9 +-------- cipher/rijndael.c | 9 +-------- configure.ac | 23 +++++++++++++++++++++++ 9 files changed, 31 insertions(+), 64 deletions(-) diff --git a/cipher/blowfish-armv6.S b/cipher/blowfish-armv6.S index b11d27f..eea879f 100644 --- a/cipher/blowfish-armv6.S +++ b/cipher/blowfish-armv6.S @@ -20,14 +20,7 @@ #include -#if defined(__arm__) && defined(__ARMEL__) && \ - ((defined(__ARM_ARCH) && __ARM_ARCH >= 6) \ - || defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) \ - || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) \ - || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6T2__) \ - || defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) \ - || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) \ - || defined(__ARM_ARCH_7EM__)) +#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) #ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS .text diff --git a/cipher/blowfish.c b/cipher/blowfish.c index fe4e280..2806433 100644 --- a/cipher/blowfish.c +++ b/cipher/blowfish.c @@ -52,14 +52,7 @@ /* USE_ARMV6_ASM indicates whether to use ARMv6 assembly code. */ #undef USE_ARMV6_ASM -#if defined(__arm__) && defined(__ARMEL__) && \ - ((defined(__ARM_ARCH) && __ARM_ARCH >= 6) \ - || defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) \ - || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) \ - || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6T2__) \ - || defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) \ - || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) \ - || defined(__ARM_ARCH_7EM__)) +#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) # if (BLOWFISH_ROUNDS == 16) && defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) # define USE_ARMV6_ASM 1 # endif diff --git a/cipher/camellia-armv6.S b/cipher/camellia-armv6.S index 769db02..3544754 100644 --- a/cipher/camellia-armv6.S +++ b/cipher/camellia-armv6.S @@ -20,14 +20,7 @@ #include -#if defined(__arm__) && defined(__ARMEL__) && \ - ((defined(__ARM_ARCH) && __ARM_ARCH >= 6) \ - || defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) \ - || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) \ - || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6T2__) \ - || defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) \ - || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) \ - || defined(__ARM_ARCH_7EM__)) +#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) #ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS .text diff --git a/cipher/camellia.h b/cipher/camellia.h index 48f9160..72f2d1f 100644 --- a/cipher/camellia.h +++ b/cipher/camellia.h @@ -32,14 +32,7 @@ #include /* USE_ARMV6_ASM indicates whether to use ARMv6 assembly code. 
*/ # undef USE_ARMV6_ASM -# if defined(__arm__) && defined(__ARMEL__) && \ - ((defined(__ARM_ARCH) && __ARM_ARCH >= 6) \ - || defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) \ - || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) \ - || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6T2__) \ - || defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) \ - || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) \ - || defined(__ARM_ARCH_7EM__)) +# if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) # ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS # define USE_ARMV6_ASM 1 # endif diff --git a/cipher/cast5-armv6.S b/cipher/cast5-armv6.S index e4b2339..038fc4f 100644 --- a/cipher/cast5-armv6.S +++ b/cipher/cast5-armv6.S @@ -20,14 +20,7 @@ #include -#if defined(__arm__) && defined(__ARMEL__) && \ - ((defined(__ARM_ARCH) && __ARM_ARCH >= 6) \ - || defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) \ - || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) \ - || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6T2__) \ - || defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) \ - || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) \ - || defined(__ARM_ARCH_7EM__)) +#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) #ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS .text diff --git a/cipher/cast5.c b/cipher/cast5.c index 9e7b50f..4377c28 100644 --- a/cipher/cast5.c +++ b/cipher/cast5.c @@ -53,14 +53,7 @@ /* USE_ARMV6_ASM indicates whether to use ARMv6 assembly code. */ #undef USE_ARMV6_ASM -#if defined(__arm__) && defined(__ARMEL__) && \ - ((defined(__ARM_ARCH) && __ARM_ARCH >= 6) \ - || defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) \ - || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) \ - || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6T2__) \ - || defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) \ - || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) \ - || defined(__ARM_ARCH_7EM__)) +#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) # ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS # define USE_ARMV6_ASM 1 # endif diff --git a/cipher/rijndael-armv6.S b/cipher/rijndael-armv6.S index e778a94..bbbfb0e 100644 --- a/cipher/rijndael-armv6.S +++ b/cipher/rijndael-armv6.S @@ -20,14 +20,7 @@ #include -#if defined(__arm__) && defined(__ARMEL__) && \ - ((defined(__ARM_ARCH) && __ARM_ARCH >= 6) \ - || defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) \ - || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) \ - || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6T2__) \ - || defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) \ - || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) \ - || defined(__ARM_ARCH_7EM__)) +#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) #ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS .text diff --git a/cipher/rijndael.c b/cipher/rijndael.c index 8855e0c..314f106 100644 --- a/cipher/rijndael.c +++ b/cipher/rijndael.c @@ -69,14 +69,7 @@ /* USE_ARMV6_ASM indicates whether to use ARMv6 assembly code. 
*/ #undef USE_ARMV6_ASM -#if defined(__arm__) && defined(__ARMEL__) && \ - ((defined(__ARM_ARCH) && __ARM_ARCH >= 6) \ - || defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) \ - || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) \ - || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6T2__) \ - || defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) \ - || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) \ - || defined(__ARM_ARCH_7EM__)) +#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) # ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS # define USE_ARMV6_ASM 1 # endif diff --git a/configure.ac b/configure.ac index c33f36b..f20d0a1 100644 --- a/configure.ac +++ b/configure.ac @@ -965,6 +965,29 @@ fi # +# Check whether compiler is configured for ARMv6 or newer architecture +# +AC_CACHE_CHECK([whether compiler is configured for ARMv6 or newer architecture], + [gcry_cv_cc_arm_arch_is_v6], + [AC_EGREP_CPP(yes, + [#if defined(__arm__) && \ + ((defined(__ARM_ARCH) && __ARM_ARCH >= 6) \ + || defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) \ + || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) \ + || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6T2__) \ + || defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) \ + || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) \ + || defined(__ARM_ARCH_7EM__)) + yes + #endif + ], gcry_cv_cc_arm_arch_is_v6=yes, gcry_cv_cc_arm_arch_is_v6=no)]) +if test "$gcry_cv_cc_arm_arch_is_v6" = "yes" ; then + AC_DEFINE(HAVE_ARM_ARCH_V6,1, + [Defined if ARM architecture is v6 or newer]) +fi + + +# # Check whether GCC assembler supports features needed for our ARM # implementations # From wachs at net.in.tum.de Thu Aug 22 12:03:20 2013 From: wachs at net.in.tum.de (Matthias Wachs) Date: Thu, 22 Aug 2013 12:03:20 +0200 Subject: compile errors on freebsd 9.1 Message-ID: <1377165800.31335.76.camel@fulcrum.net.in.tum.de> Hi, on one of our buildsystems libgcrypt git (Libgcrypt v1.6.0-beta194) fails to build due to serpent and twofish: FreeBSD freebsd91 9.1-RELEASE FreeBSD 9.1-RELEASE #0 r243825 gcc (GCC) 4.2.1 20070831 patched [FreeBSD] serpent: libtool: compile: gcc -DHAVE_CONFIG_H -I. -I.. -I../src -I../src -D_THREAD_SAFE -I/usr/local/include -g -O2 -fvisibility=hidden -Wall -MT serpent.lo -MD -MP -MF .deps/serpent.Tpo -c serpent.c -fPIC -DPIC -o .libs/serpent.o serpent.c: In function '_gcry_serpent_ctr_enc': serpent.c:892: error: expected string literal before ')' token serpent.c: In function '_gcry_serpent_cbc_dec': serpent.c:999: error: expected string literal before ')' token serpent.c: In function '_gcry_serpent_cfb_dec': serpent.c:1101: error: expected string literal before ')' token *** [serpent.lo] Error code 1 twofish: /usr/bin/ld: ../cipher/.libs/libcipher.a(twofish-amd64.o): relocation R_X86_64_PC32 against `__twofish_dec_blk3' can not be used when making a shared object; recompile with -fPIC /usr/bin/ld: final link failed: Bad value *** [libgcrypt.la] Error code 1 When disabling both ciphers it compiles ... Cheers, Matthias -- Dipl.-Inf. Matthias Wachs Free Secure Network Systems Group Technische Universitaet Muenchen Chair for Network Architectures and Services Institute for Informatics / I8 Tel: +49 89 289 18037 Boltzmannstr. 3 / Room 03.05.042 Fax: +49 89 289 18033 D-85748 Garching b. Muenchen, Germany Email: wachs at net.in.tum.de -------------- next part -------------- A non-text attachment was scrubbed... 
From jussi.kivilinna at iki.fi Thu Aug 22 14:23:28 2013 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Thu, 22 Aug 2013 15:23:28 +0300 Subject: compile errors on freebsd 9.1 In-Reply-To: <1377165800.31335.76.camel@fulcrum.net.in.tum.de> References: <1377165800.31335.76.camel@fulcrum.net.in.tum.de> Message-ID: <521602C0.8040301@iki.fi> On 22.08.2013 13:03, Matthias Wachs wrote: > Hi, > > on one of our buildsystems libgcrypt git (Libgcrypt v1.6.0-beta194) > fails to build due to serpent and twofish: Thanks for reporting. I've attached two patches that should fix the problems you are experiencing. The first patch fixes an obvious error in the twofish-amd64 assembly (__twofish_dec_blk3 was exported as a global symbol instead of being marked as function type); the second moves the SSE2/XMM register clearing from serpent.c to serpent-sse2-amd64.S. I will also push these patches to the git tree. -Jussi > > FreeBSD freebsd91 9.1-RELEASE FreeBSD 9.1-RELEASE #0 r243825 > gcc (GCC) 4.2.1 20070831 patched [FreeBSD] > > serpent: > > libtool: compile: gcc -DHAVE_CONFIG_H -I. -I.. -I../src -I../src > -D_THREAD_SAFE -I/usr/local/include -g -O2 -fvisibility=hidden -Wall -MT > serpent.lo -MD -MP -MF .deps/serpent.Tpo -c serpent.c -fPIC -DPIC > -o .libs/serpent.o > serpent.c: In function '_gcry_serpent_ctr_enc': > serpent.c:892: error: expected string literal before ')' token > serpent.c: In function '_gcry_serpent_cbc_dec': > serpent.c:999: error: expected string literal before ')' token > serpent.c: In function '_gcry_serpent_cfb_dec': > serpent.c:1101: error: expected string literal before ')' token > *** [serpent.lo] Error code 1 > > twofish: > > /usr/bin/ld: ../cipher/.libs/libcipher.a(twofish-amd64.o): relocation > R_X86_64_PC32 against `__twofish_dec_blk3' can not be used when making a > shared object; recompile with -fPIC > /usr/bin/ld: final link failed: Bad value > *** [libgcrypt.la] Error code 1 > > > When disabling both ciphers it compiles ... > > Cheers, > > Matthias -------------- next part -------------- A non-text attachment was scrubbed... Name: 01-fix-__twofish_dec_blk3.patch Type: text/x-patch Size: 724 bytes -------------- next part -------------- A non-text attachment was scrubbed... Name: 02-serpent-sse2-move-register-clearing-to-assembly.patch Type: text/x-patch Size: 4599 bytes From wachs at net.in.tum.de Thu Aug 22 14:41:56 2013 From: wachs at net.in.tum.de (Matthias Wachs) Date: Thu, 22 Aug 2013 14:41:56 +0200 Subject: compile errors on freebsd 9.1 In-Reply-To: <521602C0.8040301@iki.fi> References: <1377165800.31335.76.camel@fulcrum.net.in.tum.de> <521602C0.8040301@iki.fi> Message-ID: <1377175316.31335.78.camel@fulcrum.net.in.tum.de> Perfect, thx a lot... compiles on our system! -Matthias On Thu, 2013-08-22 at 15:23 +0300, Jussi Kivilinna wrote: > On 22.08.2013 13:03, Matthias Wachs wrote: > > Hi, > > > > on one of our buildsystems libgcrypt git (Libgcrypt v1.6.0-beta194) > > fails to build due to serpent and twofish: > > Thanks for reporting.
> > I've attached two patches that should fix problems you are experiencing. First patch fixes obvious error in twofish-amd64 assembly (__twofish_dec_blk3 exported as global symbol instead being marked as function type); second one moves SSE2/XMM register clearing from serpent.c to serpent-sse2-amd64.S. > > I will also push these patches to git tree. > > -Jussi > > > > > FreeBSD freebsd91 9.1-RELEASE FreeBSD 9.1-RELEASE #0 r243825 > > gcc (GCC) 4.2.1 20070831 patched [FreeBSD] > > > > serpent: > > > > libtool: compile: gcc -DHAVE_CONFIG_H -I. -I.. -I../src -I../src > > -D_THREAD_SAFE -I/usr/local/include -g -O2 -fvisibility=hidden -Wall -MT > > serpent.lo -MD -MP -MF .deps/serpent.Tpo -c serpent.c -fPIC -DPIC > > -o .libs/serpent.o > > serpent.c: In function '_gcry_serpent_ctr_enc': > > serpent.c:892: error: expected string literal before ')' token > > serpent.c: In function '_gcry_serpent_cbc_dec': > > serpent.c:999: error: expected string literal before ')' token > > serpent.c: In function '_gcry_serpent_cfb_dec': > > serpent.c:1101: error: expected string literal before ')' token > > *** [serpent.lo] Error code 1 > > > > twofish: > > > > /usr/bin/ld: ../cipher/.libs/libcipher.a(twofish-amd64.o): relocation > > R_X86_64_PC32 against `__twofish_dec_blk3' can not be used when making a > > shared object; recompile with -fPIC > > /usr/bin/ld: final link failed: Bad value > > *** [libgcrypt.la] Error code 1 > > > > > > When disabling both ciphers it compiles ... > > > > Cheers, > > > > Matthias > > > > > > > > > > _______________________________________________ > > Gcrypt-devel mailing list > > Gcrypt-devel at gnupg.org > > http://lists.gnupg.org/mailman/listinfo/gcrypt-devel > > > -- Dipl.-Inf. Matthias Wachs Free Secure Network Systems Group Technische Universitaet Muenchen Chair for Network Architectures and Services Institute for Informatics / I8 Tel: +49 89 289 18037 Boltzmannstr. 3 / Room 03.05.042 Fax: +49 89 289 18033 D-85748 Garching b. Muenchen, Germany Email: wachs at net.in.tum.de -------------- next part -------------- A non-text attachment was scrubbed... Name: signature.asc Type: application/pgp-signature Size: 490 bytes Desc: This is a digitally signed message part URL: From jussi.kivilinna at iki.fi Sat Aug 31 12:36:08 2013 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 31 Aug 2013 13:36:08 +0300 Subject: [PATCH 1/4] Correct mpi_cpu_arch for ARMv6 Message-ID: <20130831103608.21320.39476.stgit@localhost6.localdomain6> * mpi/config.links [armv6]: Set mpi_cpu_arch to "arm", instead of "armv6". -- Without this change, HAVE_CPU_ARCH_ARM stays undefined. 
Signed-off-by: Jussi Kivilinna --- mpi/config.links | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mpi/config.links b/mpi/config.links index f300255..9fb4f10 100644 --- a/mpi/config.links +++ b/mpi/config.links @@ -141,7 +141,7 @@ case "${host}" in if test "$gcry_cv_cc_arm_arch_is_v6" = "yes" ; then echo '/* configured for armv6 */' >>./mpi/asm-syntax.h path="armv6" - mpi_cpu_arch="armv6" + mpi_cpu_arch="arm" else echo '/* No assembler modules configured */' >>./mpi/asm-syntax.h path="" From jussi.kivilinna at iki.fi Sat Aug 31 12:36:19 2013 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 31 Aug 2013 13:36:19 +0300 Subject: [PATCH 3/4] sha512: reduce stack use in transform function by 512 bytes In-Reply-To: <20130831103608.21320.39476.stgit@localhost6.localdomain6> References: <20130831103608.21320.39476.stgit@localhost6.localdomain6> Message-ID: <20130831103619.21320.32613.stgit@localhost6.localdomain6> * cipher/sha512.c (transform): Change 'u64 w[80]' to 'u64 w[16]' and inline input expansion to first 64 rounds. (sha512_write, sha512_final): Reduce burn_stack depth by 512 bytes. -- The input expansion to w[] array can be inlined with rounds and size of array reduced from u64[80] to u64[16]. On Cortex-A8, this change gives small boost, possibly thanks to reduced burn_stack depth. New vs old (tests/benchmark md sha512 sha384): SHA512 1.09x 1.11x 1.06x 1.09x 1.08x SHA384 1.09x 1.11x 1.06x 1.09x 1.09x Signed-off-by: Jussi Kivilinna --- cipher/sha512.c | 191 ++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 173 insertions(+), 18 deletions(-) diff --git a/cipher/sha512.c b/cipher/sha512.c index 2163e60..1bbcd11 100644 --- a/cipher/sha512.c +++ b/cipher/sha512.c @@ -135,7 +135,7 @@ static void transform (SHA512_CONTEXT *hd, const unsigned char *data) { u64 a, b, c, d, e, f, g, h; - u64 w[80]; + u64 w[16]; int t; static const u64 k[] = { @@ -215,11 +215,8 @@ transform (SHA512_CONTEXT *hd, const unsigned char *data) #define S0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7)) #define S1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6)) - for (t = 16; t < 80; t++) - w[t] = S1 (w[t - 2]) + w[t - 7] + S0 (w[t - 15]) + w[t - 16]; - - for (t = 0; t < 80; ) + for (t = 0; t < 80 - 16; ) { u64 t1, t2; @@ -232,7 +229,125 @@ transform (SHA512_CONTEXT *hd, const unsigned char *data) Unrolled with inline: 330ms */ #if 0 /* Not unrolled. */ - t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t]; + t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t%16]; + w[t%16] += S1 (w[(t - 2)%16]) + w[(t - 7)%16] + S0 (w[(t - 15)%16]); + t2 = Sum0 (a) + Maj (a, b, c); + h = g; + g = f; + f = e; + e = d + t1; + d = c; + c = b; + b = a; + a = t1 + t2; + t++; +#else /* Unrolled to interweave the chain variables. 
*/ + t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[0]; + w[0] += S1 (w[14]) + w[9] + S0 (w[1]); + t2 = Sum0 (a) + Maj (a, b, c); + d += t1; + h = t1 + t2; + + t1 = g + Sum1 (d) + Ch (d, e, f) + k[t+1] + w[1]; + w[1] += S1 (w[15]) + w[10] + S0 (w[2]); + t2 = Sum0 (h) + Maj (h, a, b); + c += t1; + g = t1 + t2; + + t1 = f + Sum1 (c) + Ch (c, d, e) + k[t+2] + w[2]; + w[2] += S1 (w[0]) + w[11] + S0 (w[3]); + t2 = Sum0 (g) + Maj (g, h, a); + b += t1; + f = t1 + t2; + + t1 = e + Sum1 (b) + Ch (b, c, d) + k[t+3] + w[3]; + w[3] += S1 (w[1]) + w[12] + S0 (w[4]); + t2 = Sum0 (f) + Maj (f, g, h); + a += t1; + e = t1 + t2; + + t1 = d + Sum1 (a) + Ch (a, b, c) + k[t+4] + w[4]; + w[4] += S1 (w[2]) + w[13] + S0 (w[5]); + t2 = Sum0 (e) + Maj (e, f, g); + h += t1; + d = t1 + t2; + + t1 = c + Sum1 (h) + Ch (h, a, b) + k[t+5] + w[5]; + w[5] += S1 (w[3]) + w[14] + S0 (w[6]); + t2 = Sum0 (d) + Maj (d, e, f); + g += t1; + c = t1 + t2; + + t1 = b + Sum1 (g) + Ch (g, h, a) + k[t+6] + w[6]; + w[6] += S1 (w[4]) + w[15] + S0 (w[7]); + t2 = Sum0 (c) + Maj (c, d, e); + f += t1; + b = t1 + t2; + + t1 = a + Sum1 (f) + Ch (f, g, h) + k[t+7] + w[7]; + w[7] += S1 (w[5]) + w[0] + S0 (w[8]); + t2 = Sum0 (b) + Maj (b, c, d); + e += t1; + a = t1 + t2; + + t1 = h + Sum1 (e) + Ch (e, f, g) + k[t+8] + w[8]; + w[8] += S1 (w[6]) + w[1] + S0 (w[9]); + t2 = Sum0 (a) + Maj (a, b, c); + d += t1; + h = t1 + t2; + + t1 = g + Sum1 (d) + Ch (d, e, f) + k[t+9] + w[9]; + w[9] += S1 (w[7]) + w[2] + S0 (w[10]); + t2 = Sum0 (h) + Maj (h, a, b); + c += t1; + g = t1 + t2; + + t1 = f + Sum1 (c) + Ch (c, d, e) + k[t+10] + w[10]; + w[10] += S1 (w[8]) + w[3] + S0 (w[11]); + t2 = Sum0 (g) + Maj (g, h, a); + b += t1; + f = t1 + t2; + + t1 = e + Sum1 (b) + Ch (b, c, d) + k[t+11] + w[11]; + w[11] += S1 (w[9]) + w[4] + S0 (w[12]); + t2 = Sum0 (f) + Maj (f, g, h); + a += t1; + e = t1 + t2; + + t1 = d + Sum1 (a) + Ch (a, b, c) + k[t+12] + w[12]; + w[12] += S1 (w[10]) + w[5] + S0 (w[13]); + t2 = Sum0 (e) + Maj (e, f, g); + h += t1; + d = t1 + t2; + + t1 = c + Sum1 (h) + Ch (h, a, b) + k[t+13] + w[13]; + w[13] += S1 (w[11]) + w[6] + S0 (w[14]); + t2 = Sum0 (d) + Maj (d, e, f); + g += t1; + c = t1 + t2; + + t1 = b + Sum1 (g) + Ch (g, h, a) + k[t+14] + w[14]; + w[14] += S1 (w[12]) + w[7] + S0 (w[15]); + t2 = Sum0 (c) + Maj (c, d, e); + f += t1; + b = t1 + t2; + + t1 = a + Sum1 (f) + Ch (f, g, h) + k[t+15] + w[15]; + w[15] += S1 (w[13]) + w[8] + S0 (w[0]); + t2 = Sum0 (b) + Maj (b, c, d); + e += t1; + a = t1 + t2; + + t += 16; +#endif + } + + for (; t < 80; ) + { + u64 t1, t2; + +#if 0 /* Not unrolled. */ + t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t%16]; t2 = Sum0 (a) + Maj (a, b, c); h = g; g = f; @@ -244,47 +359,87 @@ transform (SHA512_CONTEXT *hd, const unsigned char *data) a = t1 + t2; t++; #else /* Unrolled to interweave the chain variables. 
*/ - t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t]; + t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[0]; + t2 = Sum0 (a) + Maj (a, b, c); + d += t1; + h = t1 + t2; + + t1 = g + Sum1 (d) + Ch (d, e, f) + k[t+1] + w[1]; + t2 = Sum0 (h) + Maj (h, a, b); + c += t1; + g = t1 + t2; + + t1 = f + Sum1 (c) + Ch (c, d, e) + k[t+2] + w[2]; + t2 = Sum0 (g) + Maj (g, h, a); + b += t1; + f = t1 + t2; + + t1 = e + Sum1 (b) + Ch (b, c, d) + k[t+3] + w[3]; + t2 = Sum0 (f) + Maj (f, g, h); + a += t1; + e = t1 + t2; + + t1 = d + Sum1 (a) + Ch (a, b, c) + k[t+4] + w[4]; + t2 = Sum0 (e) + Maj (e, f, g); + h += t1; + d = t1 + t2; + + t1 = c + Sum1 (h) + Ch (h, a, b) + k[t+5] + w[5]; + t2 = Sum0 (d) + Maj (d, e, f); + g += t1; + c = t1 + t2; + + t1 = b + Sum1 (g) + Ch (g, h, a) + k[t+6] + w[6]; + t2 = Sum0 (c) + Maj (c, d, e); + f += t1; + b = t1 + t2; + + t1 = a + Sum1 (f) + Ch (f, g, h) + k[t+7] + w[7]; + t2 = Sum0 (b) + Maj (b, c, d); + e += t1; + a = t1 + t2; + + t1 = h + Sum1 (e) + Ch (e, f, g) + k[t+8] + w[8]; t2 = Sum0 (a) + Maj (a, b, c); d += t1; h = t1 + t2; - t1 = g + Sum1 (d) + Ch (d, e, f) + k[t+1] + w[t+1]; + t1 = g + Sum1 (d) + Ch (d, e, f) + k[t+9] + w[9]; t2 = Sum0 (h) + Maj (h, a, b); c += t1; g = t1 + t2; - t1 = f + Sum1 (c) + Ch (c, d, e) + k[t+2] + w[t+2]; + t1 = f + Sum1 (c) + Ch (c, d, e) + k[t+10] + w[10]; t2 = Sum0 (g) + Maj (g, h, a); b += t1; f = t1 + t2; - t1 = e + Sum1 (b) + Ch (b, c, d) + k[t+3] + w[t+3]; + t1 = e + Sum1 (b) + Ch (b, c, d) + k[t+11] + w[11]; t2 = Sum0 (f) + Maj (f, g, h); a += t1; e = t1 + t2; - t1 = d + Sum1 (a) + Ch (a, b, c) + k[t+4] + w[t+4]; + t1 = d + Sum1 (a) + Ch (a, b, c) + k[t+12] + w[12]; t2 = Sum0 (e) + Maj (e, f, g); h += t1; d = t1 + t2; - t1 = c + Sum1 (h) + Ch (h, a, b) + k[t+5] + w[t+5]; + t1 = c + Sum1 (h) + Ch (h, a, b) + k[t+13] + w[13]; t2 = Sum0 (d) + Maj (d, e, f); g += t1; c = t1 + t2; - t1 = b + Sum1 (g) + Ch (g, h, a) + k[t+6] + w[t+6]; + t1 = b + Sum1 (g) + Ch (g, h, a) + k[t+14] + w[14]; t2 = Sum0 (c) + Maj (c, d, e); f += t1; b = t1 + t2; - t1 = a + Sum1 (f) + Ch (f, g, h) + k[t+7] + w[t+7]; + t1 = a + Sum1 (f) + Ch (f, g, h) + k[t+15] + w[15]; t2 = Sum0 (b) + Maj (b, c, d); e += t1; a = t1 + t2; - t += 8; + t += 16; #endif } @@ -312,7 +467,7 @@ sha512_write (void *context, const void *inbuf_arg, size_t inlen) if (hd->count == 128) { /* flush the buffer */ transform (hd, hd->buf); - _gcry_burn_stack (768); + _gcry_burn_stack (256); hd->count = 0; hd->nblocks++; } @@ -335,7 +490,7 @@ sha512_write (void *context, const void *inbuf_arg, size_t inlen) inlen -= 128; inbuf += 128; } - _gcry_burn_stack (768); + _gcry_burn_stack (256); for (; inlen && hd->count < 128; inlen--) hd->buf[hd->count++] = *inbuf++; } @@ -405,7 +560,7 @@ sha512_final (void *context) hd->buf[126] = lsb >> 8; hd->buf[127] = lsb; transform (hd, hd->buf); - _gcry_burn_stack (768); + _gcry_burn_stack (256); p = hd->buf; #ifdef WORDS_BIGENDIAN From jussi.kivilinna at iki.fi Sat Aug 31 12:36:13 2013 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 31 Aug 2013 13:36:13 +0300 Subject: [PATCH 2/4] Add ARM HW feature detection module and add NEON detection In-Reply-To: <20130831103608.21320.39476.stgit@localhost6.localdomain6> References: <20130831103608.21320.39476.stgit@localhost6.localdomain6> Message-ID: <20130831103613.21320.8181.stgit@localhost6.localdomain6> * configure.ac: Add option --disable-neon-support. (HAVE_GCC_INLINE_ASM_NEON): New. (ENABLE_NEON_SUPPORT): New. [arm]: Add 'hwf-arm.lo' as HW feature module. * src/Makefile.am: Add 'hwf-arm.c'. 
* src/g10lib.h (HWF_ARM_NEON): New macro. * src/global.c (hwflist): Add HWF_ARM_NEON entry. * src/hwf-arm.c: New file. * src/hwf-common.h (_gcry_hwf_detect_arm): New prototype. * src/hwfeatures.c (_gcry_detect_hw_features) [HAVE_CPU_ARCH_ARM]: Add call to _gcry_hwf_detect_arm. -- Add HW detection module for detecting ARM NEON instruction set. ARM does not have cpuid instruction so we have to rely on OS to pass feature set information to user-space. For linux, NEON support can be detected by parsing '/proc/self/auxv' for hardware capabilities information. For other OSes, NEON can be detected by checking if platform/compiler only supports NEON capable CPUs (by check if __ARM_NEON__ macro is defined). Signed-off-by: Jussi Kivilinna --- configure.ac | 43 +++++++++++++++++++++ src/Makefile.am | 2 - src/g10lib.h | 2 + src/global.c | 1 src/hwf-arm.c | 113 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/hwf-common.h | 1 src/hwfeatures.c | 5 ++ 7 files changed, 166 insertions(+), 1 deletion(-) create mode 100644 src/hwf-arm.c diff --git a/configure.ac b/configure.ac index b54b4d6..89b9366 100644 --- a/configure.ac +++ b/configure.ac @@ -595,6 +595,14 @@ AC_ARG_ENABLE(avx2-support, avx2support=$enableval,avx2support=yes) AC_MSG_RESULT($avx2support) +# Implementation of the --disable-neon-support switch. +AC_MSG_CHECKING([whether NEON support is requested]) +AC_ARG_ENABLE(neon-support, + AC_HELP_STRING([--disable-neon-support], + [Disable support for the ARM NEON instructions]), + neonsupport=$enableval,neonsupport=yes) +AC_MSG_RESULT($neonsupport) + # Implementation of the --disable-O-flag-munging switch. AC_MSG_CHECKING([whether a -O flag munging is requested]) AC_ARG_ENABLE([O-flag-munging], @@ -988,6 +996,30 @@ fi # +# Check whether GCC inline assembler supports NEON instructions +# +AC_CACHE_CHECK([whether GCC inline assembler supports NEON instructions], + [gcry_cv_gcc_inline_asm_neon], + [gcry_cv_gcc_inline_asm_neon=no + AC_COMPILE_IFELSE([AC_LANG_SOURCE( + [[__asm__( + ".syntax unified\n\t" + ".thumb\n\t" + ".fpu neon\n\t" + "vld1.64 {%q0-%q1}, [%r0]!;\n\t" + "vrev64.8 %q0, %q3;\n\t" + "vadd.u64 %q0, %q1;\n\t" + "vadd.s64 %d3, %d2, %d3;\n\t" + ); + ]])], + [gcry_cv_gcc_inline_asm_neon=yes])]) +if test "$gcry_cv_gcc_inline_asm_neon" = "yes" ; then + AC_DEFINE(HAVE_GCC_INLINE_ASM_NEON,1, + [Defined if inline assembler supports NEON instructions]) +fi + + +# # Check whether GCC assembler supports features needed for our ARM # implementations # @@ -1269,6 +1301,11 @@ if test x"$avx2support" = xyes ; then avx2support="no (unsupported by compiler)" fi fi +if test x"$neonsupport" = xyes ; then + if test "$gcry_cv_gcc_inline_asm_neon" != "yes" ; then + neonsupport="no (unsupported by compiler)" + fi +fi if test x"$aesnisupport" = xyes ; then AC_DEFINE(ENABLE_AESNI_SUPPORT, 1, @@ -1282,6 +1319,10 @@ if test x"$avx2support" = xyes ; then AC_DEFINE(ENABLE_AVX2_SUPPORT,1, [Enable support for Intel AVX2 instructions.]) fi +if test x"$neonsupport" = xyes ; then + AC_DEFINE(ENABLE_NEON_SUPPORT,1, + [Enable support for ARM NEON instructions.]) +fi # Define conditional sources and config.h symbols depending on the @@ -1597,6 +1638,7 @@ case "$mpi_cpu_arch" in ;; arm) AC_DEFINE(HAVE_CPU_ARCH_ARM, 1, [Defined for ARM platforms]) + GCRYPT_HWF_MODULES="hwf-arm.lo" ;; esac AC_SUBST([GCRYPT_HWF_MODULES]) @@ -1660,6 +1702,7 @@ GCRY_MSG_SHOW([Try using AES-NI crypto: ],[$aesnisupport]) GCRY_MSG_SHOW([Try using DRNG (RDRAND): ],[$drngsupport]) GCRY_MSG_SHOW([Try using Intel AVX: ],[$avxsupport]) 
GCRY_MSG_SHOW([Try using Intel AVX2: ],[$avx2support]) +GCRY_MSG_SHOW([Try using ARM NEON: ],[$neonsupport]) GCRY_MSG_SHOW([],[]) if test "$print_egd_notice" = "yes"; then diff --git a/src/Makefile.am b/src/Makefile.am index 8eb46e6..d4329c9 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -64,7 +64,7 @@ libgcrypt_la_SOURCES = \ ec-context.h \ ath.h ath.c -EXTRA_libgcrypt_la_SOURCES = hwf-x86.c +EXTRA_libgcrypt_la_SOURCES = hwf-x86.c hwf-arm.c gcrypt_hwf_modules = @GCRYPT_HWF_MODULES@ diff --git a/src/g10lib.h b/src/g10lib.h index 198ab38..31131a5 100644 --- a/src/g10lib.h +++ b/src/g10lib.h @@ -155,6 +155,8 @@ int _gcry_log_verbosity( int level ); #define HWF_INTEL_AVX 1024 #define HWF_INTEL_AVX2 2048 +#define HWF_ARM_NEON 4096 + unsigned int _gcry_get_hw_features (void); void _gcry_detect_hw_features (unsigned int); diff --git a/src/global.c b/src/global.c index 9c80573..44667cf 100644 --- a/src/global.c +++ b/src/global.c @@ -70,6 +70,7 @@ static struct { HWF_INTEL_RDRAND,"intel-rdrand" }, { HWF_INTEL_AVX, "intel-avx" }, { HWF_INTEL_AVX2, "intel-avx2" }, + { HWF_ARM_NEON, "arm-neon" }, { 0, NULL} }; diff --git a/src/hwf-arm.c b/src/hwf-arm.c new file mode 100644 index 0000000..9ab4cd0 --- /dev/null +++ b/src/hwf-arm.c @@ -0,0 +1,113 @@ +/* hwf-arm.c - Detect hardware features - ARM part + * Copyright ? 2013 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . + */ + +#include +#include +#include +#include +#include +#include + +#include "g10lib.h" +#include "hwf-common.h" + +#if !defined (__arm__) +# error Module build for wrong CPU. 
+#endif + +#undef HAS_SYS_AT_HWCAP +#ifdef __linux__ + +#define HAS_SYS_AT_HWCAP 1 + +#define AT_HWCAP 16 +#define HWCAP_NEON 4096 + +static int get_hwcap(unsigned int *hwcap) +{ + struct { unsigned int a_type; unsigned int a_val; } auxv; + FILE *f; + int err = -1; + static int hwcap_initialized = 0; + static unsigned int stored_hwcap; + + if (hwcap_initialized) + { + *hwcap = stored_hwcap; + return 0; + } + + f = fopen("/proc/self/auxv", "r"); + if (!f) + { + *hwcap = stored_hwcap; + return -1; + } + + while (fread(&auxv, sizeof(auxv), 1, f) > 0) + { + if (auxv.a_type != AT_HWCAP) + continue; + + stored_hwcap = auxv.a_val; + hwcap_initialized = 1; + err = 0; + break; + } + + fclose(f); + *hwcap = stored_hwcap; + return err; +} + +static unsigned int detect_arm_at_hwcap(void) +{ + unsigned int hwcap; + unsigned int features = 0; + + if (get_hwcap(&hwcap) < 0) + return features; + +#ifdef ENABLE_NEON_SUPPORT + if (hwcap & HWCAP_NEON) + features |= HWF_ARM_NEON; +#endif + + return features; +} + +#endif /* __linux__ */ + +unsigned int +_gcry_hwf_detect_arm (void) +{ + unsigned int ret = 0; + +#if defined (HAS_SYS_AT_HWCAP) + ret |= detect_arm_at_hwcap (); +#else + ret |= 0; +#endif + +#if defined(__ARM_NEON__) && defined(ENABLE_NEON_SUPPORT) + ret |= HWF_ARM_NEON; +#endif + + return ret; +} diff --git a/src/hwf-common.h b/src/hwf-common.h index 974f47d..8f156b5 100644 --- a/src/hwf-common.h +++ b/src/hwf-common.h @@ -21,6 +21,7 @@ #define HWF_COMMON_H unsigned int _gcry_hwf_detect_x86 (void); +unsigned int _gcry_hwf_detect_arm (void); #endif /*HWF_COMMON_H*/ diff --git a/src/hwfeatures.c b/src/hwfeatures.c index 87d05d8..1e3c27d 100644 --- a/src/hwfeatures.c +++ b/src/hwfeatures.c @@ -59,6 +59,11 @@ _gcry_detect_hw_features (unsigned int disabled_features) hw_features = _gcry_hwf_detect_x86 (); } #endif /* HAVE_CPU_ARCH_X86 */ +#if defined (HAVE_CPU_ARCH_ARM) + { + hw_features = _gcry_hwf_detect_arm (); + } +#endif /* HAVE_CPU_ARCH_ARM */ hw_features &= ~disabled_features; } From jussi.kivilinna at iki.fi Sat Aug 31 12:36:24 2013 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 31 Aug 2013 13:36:24 +0300 Subject: [PATCH 4/4] sha512: add ARM/NEON assembly version of transform function In-Reply-To: <20130831103608.21320.39476.stgit@localhost6.localdomain6> References: <20130831103608.21320.39476.stgit@localhost6.localdomain6> Message-ID: <20130831103624.21320.20883.stgit@localhost6.localdomain6> * cipher/Makefile.am: Add 'sha512-armv7-neon.S'. * cipher/sha512-armv7-neon.S: New file. * cipher/sha512.c (USE_ARM_NEON_ASM): New macro. (SHA512_CONTEXT)?[USE_ARM_NEON_ASM]: Add 'use_neon'. (sha512_init, sha384_init) [USE_ARM_NEON_ASM]: Enable 'use_neon' if CPU support NEON instructions. (k): Round constant array moved outside of 'transform' function. (__transform): Renamed from 'tranform' function. [USE_ARM_NEON_ASM] (_gcry_sha512_transform_armv7_neon): New prototype. (transform): New wrapper function for different transform versions. (sha512_write, sha512_final): Burn stack by the amount returned by transform function. * configure.ac (sha512) [neonsupport]: Add 'sha512-armv7-neon.lo'. -- Add NEON assembly for transform function for faster SHA512 on ARM. Major speed up thanks to 64-bit integer registers and large register file that can hold full input buffer. 
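For orientation when reading the round macros below: each round_0_63/round_64_79 macro computes one standard SHA-512 round. A plain-C sketch of a single round (using the same Sum0/Sum1/Ch/Maj definitions as cipher/sha512.c; the NEON code additionally folds the message-schedule update into round_0_63) is:

#include <stdint.h>

#define ROTR64(x,n) (((x) >> (n)) | ((x) << (64 - (n))))
#define Ch(e,f,g)   (((e) & (f)) ^ (~(e) & (g)))
#define Maj(a,b,c)  (((a) & (b)) | ((c) & ((a) | (b))))
#define Sum0(a)     (ROTR64 ((a), 28) ^ ROTR64 ((a), 34) ^ ROTR64 ((a), 39))
#define Sum1(e)     (ROTR64 ((e), 14) ^ ROTR64 ((e), 18) ^ ROTR64 ((e), 41))

/* One SHA-512 round; st[0..7] holds the chain variables a..h. */
static void
sha512_round_sketch (uint64_t st[8], uint64_t k, uint64_t w)
{
  uint64_t t1 = st[7] + Sum1 (st[4]) + Ch (st[4], st[5], st[6]) + k + w;
  uint64_t t2 = Sum0 (st[0]) + Maj (st[0], st[1], st[2]);

  /* Rotate the chain: h=g, g=f, f=e, e=d+t1, d=c, c=b, b=a, a=t1+t2. */
  st[7] = st[6]; st[6] = st[5]; st[5] = st[4]; st[4] = st[3] + t1;
  st[3] = st[2]; st[2] = st[1]; st[1] = st[0]; st[0] = t1 + t2;
}

The assembly keeps a..h in NEON registers d0-d7 and all sixteen live w[] words in d16-d31, so the whole working set stays in registers across the 80 rounds.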
Benchmark results on Cortex-A8, 1 GHz:

Old:
$ tests/benchmark --hash-repetitions 100 md sha512 sha384
SHA512   17050ms  18780ms  29120ms  18040ms  17190ms
SHA384   17130ms  18720ms  29160ms  18090ms  17280ms

New:
$ tests/benchmark --hash-repetitions 100 md sha512 sha384
SHA512    3600ms   5070ms  15330ms   4510ms   3480ms
SHA384    3590ms   5060ms  15350ms   4510ms   3520ms

New vs old:
SHA512   4.74x  3.70x  1.90x  4.00x  4.94x
SHA384   4.77x  3.70x  1.90x  4.01x  4.91x

Signed-off-by: Jussi Kivilinna --- cipher/Makefile.am | 2 cipher/sha512-armv7-neon.S | 316 ++++++++++++++++++++++++++++++++++++++++++++ cipher/sha512.c | 150 ++++++++++++++------- configure.ac | 5 + 4 files changed, 422 insertions(+), 51 deletions(-) create mode 100644 cipher/sha512-armv7-neon.S diff --git a/cipher/Makefile.am b/cipher/Makefile.am index e233e79..3dd6f88 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -73,7 +73,7 @@ seed.c \ serpent.c serpent-sse2-amd64.S serpent-avx2-amd64.S \ sha1.c \ sha256.c \ -sha512.c \ +sha512.c sha512-armv7-neon.S \ tiger.c \ whirlpool.c \ twofish.c twofish-amd64.S \ diff --git a/cipher/sha512-armv7-neon.S b/cipher/sha512-armv7-neon.S new file mode 100644 index 0000000..042b15a --- /dev/null +++ b/cipher/sha512-armv7-neon.S @@ -0,0 +1,316 @@ +/* sha512-armv7-neon.S - ARM/NEON assembly implementation of SHA-512 transform + * + * Copyright © 2013 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */ + +#include + +#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) && \ + defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_NEON) + +.text + +.syntax unified +.fpu neon +.arm + +/* structure of SHA512_CONTEXT */ +#define hd_a 0 +#define hd_b ((hd_a) + 8) +#define hd_c ((hd_b) + 8) +#define hd_d ((hd_c) + 8) +#define hd_e ((hd_d) + 8) +#define hd_f ((hd_e) + 8) +#define hd_g ((hd_f) + 8) + +/* register macros */ +#define RK %r2 + +#define RA d0 +#define RB d1 +#define RC d2 +#define RD d3 +#define RE d4 +#define RF d5 +#define RG d6 +#define RH d7 + +#define RT0 d8 +#define RT1 d9 +#define RT2 d10 +#define RT3 d11 +#define RT4 d12 +#define RT5 d13 +#define RT6 d14 +#define RT7 d15 + +#define RW0 d16 +#define RW1 d17 +#define RW2 d18 +#define RW3 d19 +#define RW4 d20 +#define RW5 d21 +#define RW6 d22 +#define RW7 d23 +#define RW8 d24 +#define RW9 d25 +#define RW10 d26 +#define RW11 d27 +#define RW12 d28 +#define RW13 d29 +#define RW14 d30 +#define RW15 d31 + +#define RW01q q8 +#define RW23q q9 +#define RW45q q10 +#define RW67q q11 +#define RW89q q12 +#define RW1011q q13 +#define RW1213q q14 +#define RW1415q q15 + +/*********************************************************************** + * ARM assembly implementation of sha512 transform + ***********************************************************************/ +#define round_0_63(ra, rb, rc, rd, re, rf, rg, rh, rw0, rw14, rw9, rw1) \ + /* t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t]; */ \ + vshr.u64 RT1, re, #14; \ + vshl.u64 RT3, re, #64 - 14; \ + vshr.u64 RT4, re, #18; \ + vshl.u64 RT5, re, #64 - 18; \ + veor.64 RT1, RT1, RT3; \ + vld1.64 {RT0}, [RK]!; \ + veor.64 RT1, RT1, RT4; \ + vshr.u64 RT3, re, #41; \ + vshl.u64 RT4, re, #64 - 41; \ + veor.64 RT1, RT1, RT5; \ + vadd.u64 RT0, RT0, rw0; \ + veor.64 RT1, RT1, RT3; \ + vand.64 RT2, re, rf; \ + veor.64 RT1, RT1, RT4; \ + vbic.64 RT6, rg, re; \ + \ + vadd.u64 RT1, RT1, rh; \ + veor.64 RT2, RT2, RT6; \ + vshr.u64 rh, ra, #28; \ + vshl.u64 RT3, ra, #64 - 28; \ + vadd.u64 RT1, RT1, RT0; \ + vshr.u64 RT4, ra, #34; \ + veor.64 rh, rh, RT3; \ + vshl.u64 RT5, ra, #64 - 34; \ + vadd.u64 RT1, RT1, RT2; \ + \ + /* h = Sum0 (a) + Maj (a, b, c); */ \ + veor.64 rh, rh, RT4; \ + vshr.u64 RT3, ra, #39; \ + vshl.u64 RT4, ra, #64 - 39; \ + vorr.64 RT6, ra, rb; \ + vand.64 RT0, ra, rb; \ + veor.64 rh, rh, RT5; \ + vand.64 RT6, RT6, rc; \ + veor.64 rh, rh, RT3; \ + vorr.64 RT0, RT0, RT6; \ + veor.64 rh, rh, RT4; \ + vshr.u64 RT4, rw14, #19; \ + vadd.u64 rh, rh, RT0; \ + vshl.u64 RT2, rw14, #64 - 19; \ + \ + /* w[0] += S1 (w[14]) + w[9] + S0 (w[1]); */ \ + vshr.u64 RT3, rw14, #61; \ + vshl.u64 RT6, rw14, #64 - 61; \ + veor.64 RT0, RT4, RT2; \ + vshr.u64 RT2, rw14, 6; \ + veor.64 RT0, RT0, RT3; \ + vshr.u64 RT7, rw1, #1; \ + veor.64 RT0, RT0, RT6; \ + vshl.u64 RT4, rw1, #64 - 1; \ + veor.64 RT0, RT0, RT2; \ + vshr.u64 RT5, rw1, #8; \ + vadd.u64 rw0, rw0, RT0; \ + vshl.u64 RT6, rw1, #64 - 8; \ + veor.64 RT7, RT7, RT4; \ + vshr.u64 RT4, rw1, 7; \ + veor.64 RT7, RT7, RT5; \ + vadd.u64 rw0, rw0, rw9; /* w[0]+=w[9]; */\ + veor.64 RT7, RT7, RT6; \ + vadd.u64 rd, rd, RT1; /* d+=t1; */ \ + veor.64 RT7, RT7, RT4; \ + vadd.u64 rh, rh, RT1; /* h+=t1; */ \ + vadd.u64 rw0, rw0, RT7; \ + +#define round_64_79(ra, rb, rc, rd, re, rf, rg, rh, rw0) \ + /* t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t]; */ \ + vld1.64 {RT0}, [RK]!; \ + vshr.u64 RT1, re, #14; \ + vshl.u64 RT3, re, #64 - 14; \ + vshr.u64 RT4, re, #18; \ + vshl.u64 RT5, re, #64 - 18; \ + veor.64 RT1, RT1, RT3; \ + vshr.u64 RT7, 
ra, #28; \ + veor.64 RT1, RT1, RT4; \ + vshr.u64 RT3, re, #41; \ + vshl.u64 RT4, re, #64 - 41; \ + veor.64 RT1, RT1, RT5; \ + vadd.u64 RT0, RT0, rw0; \ + veor.64 RT1, RT1, RT3; \ + vand.64 RT2, re, rf; \ + veor.64 RT1, RT1, RT4; \ + vbic.64 RT6, rg, re; \ + \ + vadd.u64 RT1, RT1, rh; \ + veor.64 RT2, RT2, RT6; \ + vadd.u64 RT1, RT1, RT0; \ + vshr.u64 RT4, ra, #34; \ + vshl.u64 RT5, ra, #64 - 34; \ + \ + /* t7 = Sum0 (a) + Maj (a, b, c); */ \ + vshl.u64 RT6, ra, #64 - 28; \ + veor.64 RT7, RT7, RT4; \ + vshr.u64 RT3, ra, #39; \ + veor.64 RT7, RT7, RT6; \ + vshl.u64 RT4, ra, #64 - 39; \ + vorr.64 RT6, ra, rb; \ + vand.64 RT0, ra, rb; \ + veor.64 RT7, RT7, RT5; \ + vand.64 RT6, RT6, rc; \ + veor.64 RT7, RT7, RT3; \ + vorr.64 RT0, RT0, RT6; \ + veor.64 RT7, RT7, RT4; \ + vadd.u64 RT1, RT1, RT2; \ + vadd.u64 RT7, RT7, RT0; \ + vadd.u64 rd, rd, RT1; /* d+=t1; */ \ + vadd.u64 rh, RT7, RT1; /* h=t7+t1; */ + +.align 3 +.globl _gcry_sha512_transform_armv7_neon +.type _gcry_sha512_transform_armv7_neon,%function; + +_gcry_sha512_transform_armv7_neon: + /* Input: + * %r0: SHA512_CONTEXT + * %r1: data + * %r2: u64 k[] constants + */ + mov %r3, #0; + + /* Load context to d0-d7 */ + vld1.64 {RA-RD}, [%r0]!; + vld1.64 {RE-RH}, [%r0]; + sub %r0, #(4*8); + + /* Load input to w[16], d16-d31 */ + /* NOTE: Assumes that on ARMv7 unaligned accesses are always allowed. */ + vld1.64 {RW0-RW3}, [%r1]!; + vld1.64 {RW4-RW7}, [%r1]!; + vld1.64 {RW8-RW11}, [%r1]!; + vld1.64 {RW12-RW15}, [%r1]; +#ifdef __ARMEL__ + /* byteswap */ + vrev64.8 RW01q, RW01q; + vrev64.8 RW23q, RW23q; + vrev64.8 RW45q, RW45q; + vrev64.8 RW67q, RW67q; + vrev64.8 RW89q, RW89q; + vrev64.8 RW1011q, RW1011q; + vrev64.8 RW1213q, RW1213q; + vrev64.8 RW1415q, RW1415q; +#endif + + /* EABI says that d8-d15 must be preserved by callee. 
*/ + vpush {RT0-RT7}; + +.Loop: + add %r3, #16; + round_0_63(RA, RB, RC, RD, RE, RF, RG, RH, RW0, RW14, RW9, RW1); + cmp %r3, #64; + round_0_63(RH, RA, RB, RC, RD, RE, RF, RG, RW1, RW15, RW10, RW2); + round_0_63(RG, RH, RA, RB, RC, RD, RE, RF, RW2, RW0, RW11, RW3); + round_0_63(RF, RG, RH, RA, RB, RC, RD, RE, RW3, RW1, RW12, RW4); + round_0_63(RE, RF, RG, RH, RA, RB, RC, RD, RW4, RW2, RW13, RW5); + round_0_63(RD, RE, RF, RG, RH, RA, RB, RC, RW5, RW3, RW14, RW6); + round_0_63(RC, RD, RE, RF, RG, RH, RA, RB, RW6, RW4, RW15, RW7); + round_0_63(RB, RC, RD, RE, RF, RG, RH, RA, RW7, RW5, RW0, RW8); + round_0_63(RA, RB, RC, RD, RE, RF, RG, RH, RW8, RW6, RW1, RW9); + round_0_63(RH, RA, RB, RC, RD, RE, RF, RG, RW9, RW7, RW2, RW10); + round_0_63(RG, RH, RA, RB, RC, RD, RE, RF, RW10, RW8, RW3, RW11); + round_0_63(RF, RG, RH, RA, RB, RC, RD, RE, RW11, RW9, RW4, RW12); + round_0_63(RE, RF, RG, RH, RA, RB, RC, RD, RW12, RW10, RW5, RW13); + round_0_63(RD, RE, RF, RG, RH, RA, RB, RC, RW13, RW11, RW6, RW14); + round_0_63(RC, RD, RE, RF, RG, RH, RA, RB, RW14, RW12, RW7, RW15); + round_0_63(RB, RC, RD, RE, RF, RG, RH, RA, RW15, RW13, RW8, RW0); + bne .Loop; + + round_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW0); + round_64_79(RH, RA, RB, RC, RD, RE, RF, RG, RW1); + round_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW2); + round_64_79(RF, RG, RH, RA, RB, RC, RD, RE, RW3); + round_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW4); + round_64_79(RD, RE, RF, RG, RH, RA, RB, RC, RW5); + round_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW6); + round_64_79(RB, RC, RD, RE, RF, RG, RH, RA, RW7); + round_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW8); + round_64_79(RH, RA, RB, RC, RD, RE, RF, RG, RW9); + round_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW10); + round_64_79(RF, RG, RH, RA, RB, RC, RD, RE, RW11); + round_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW12); + round_64_79(RD, RE, RF, RG, RH, RA, RB, RC, RW13); + round_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW14); + round_64_79(RB, RC, RD, RE, RF, RG, RH, RA, RW15); + + /* Load context to d16-d23 */ + vld1.64 {RW0-RW3}, [%r0]!; + vld1.64 {RW4-RW7}, [%r0]; + sub %r0, #(4*8); + + vadd.u64 RA, RW0; + vadd.u64 RB, RW1; + vadd.u64 RC, RW2; + vadd.u64 RD, RW3; + vadd.u64 RE, RW4; + vadd.u64 RF, RW5; + vadd.u64 RG, RW6; + vadd.u64 RH, RW7; + + /* Store the first half of context */ + vst1.64 {RA-RD}, [%r0]!; + + /* Clear used registers */ + /* d16-d31 */ + veor.u64 RW01q, RW01q; + veor.u64 RW23q, RW23q; + veor.u64 RW45q, RW45q; + veor.u64 RW67q, RW67q; + vst1.64 {RE-RH}, [%r0]; /* Store the last half of context */ + veor.u64 RW89q, RW89q; + veor.u64 RW1011q, RW1011q; + veor.u64 RW1213q, RW1213q; + veor.u64 RW1415q, RW1415q; + /* d8-d15 */ + vpop {RT0-RT7}; + /* d0-d7 (q0-q3) */ + veor.u64 %q0, %q0; + veor.u64 %q1, %q1; + veor.u64 %q2, %q2; + veor.u64 %q3, %q3; + + bx %lr; +.size _gcry_sha512_transform_armv7_neon,.-_gcry_sha512_transform_armv7_neon; + +#endif diff --git a/cipher/sha512.c b/cipher/sha512.c index 1bbcd11..fee3e71 100644 --- a/cipher/sha512.c +++ b/cipher/sha512.c @@ -53,12 +53,26 @@ #include "cipher.h" #include "hash-common.h" + +/* USE_ARM_NEON_ASM indicates whether to enable ARM NEON assembly code. 
*/ +#undef USE_ARM_NEON_ASM +#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) +# if defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_NEON) +# define USE_ARM_NEON_ASM 1 +# endif +#endif + + typedef struct { u64 h0, h1, h2, h3, h4, h5, h6, h7; u64 nblocks; byte buf[128]; int count; +#ifdef USE_ARM_NEON_ASM + int use_neon; +#endif } SHA512_CONTEXT; static void @@ -77,6 +91,9 @@ sha512_init (void *context) hd->nblocks = 0; hd->count = 0; +#ifdef USE_ARM_NEON_ASM + hd->use_neon = (_gcry_get_hw_features () & HWF_ARM_NEON) != 0; +#endif } static void @@ -95,6 +112,9 @@ sha384_init (void *context) hd->nblocks = 0; hd->count = 0; +#ifdef USE_ARM_NEON_ASM + hd->use_neon = (_gcry_get_hw_features () & HWF_ARM_NEON) != 0; +#endif } @@ -128,58 +148,59 @@ Sum1 (u64 x) return (ROTR (x, 14) ^ ROTR (x, 18) ^ ROTR (x, 41)); } +static const u64 k[] = + { + U64_C(0x428a2f98d728ae22), U64_C(0x7137449123ef65cd), + U64_C(0xb5c0fbcfec4d3b2f), U64_C(0xe9b5dba58189dbbc), + U64_C(0x3956c25bf348b538), U64_C(0x59f111f1b605d019), + U64_C(0x923f82a4af194f9b), U64_C(0xab1c5ed5da6d8118), + U64_C(0xd807aa98a3030242), U64_C(0x12835b0145706fbe), + U64_C(0x243185be4ee4b28c), U64_C(0x550c7dc3d5ffb4e2), + U64_C(0x72be5d74f27b896f), U64_C(0x80deb1fe3b1696b1), + U64_C(0x9bdc06a725c71235), U64_C(0xc19bf174cf692694), + U64_C(0xe49b69c19ef14ad2), U64_C(0xefbe4786384f25e3), + U64_C(0x0fc19dc68b8cd5b5), U64_C(0x240ca1cc77ac9c65), + U64_C(0x2de92c6f592b0275), U64_C(0x4a7484aa6ea6e483), + U64_C(0x5cb0a9dcbd41fbd4), U64_C(0x76f988da831153b5), + U64_C(0x983e5152ee66dfab), U64_C(0xa831c66d2db43210), + U64_C(0xb00327c898fb213f), U64_C(0xbf597fc7beef0ee4), + U64_C(0xc6e00bf33da88fc2), U64_C(0xd5a79147930aa725), + U64_C(0x06ca6351e003826f), U64_C(0x142929670a0e6e70), + U64_C(0x27b70a8546d22ffc), U64_C(0x2e1b21385c26c926), + U64_C(0x4d2c6dfc5ac42aed), U64_C(0x53380d139d95b3df), + U64_C(0x650a73548baf63de), U64_C(0x766a0abb3c77b2a8), + U64_C(0x81c2c92e47edaee6), U64_C(0x92722c851482353b), + U64_C(0xa2bfe8a14cf10364), U64_C(0xa81a664bbc423001), + U64_C(0xc24b8b70d0f89791), U64_C(0xc76c51a30654be30), + U64_C(0xd192e819d6ef5218), U64_C(0xd69906245565a910), + U64_C(0xf40e35855771202a), U64_C(0x106aa07032bbd1b8), + U64_C(0x19a4c116b8d2d0c8), U64_C(0x1e376c085141ab53), + U64_C(0x2748774cdf8eeb99), U64_C(0x34b0bcb5e19b48a8), + U64_C(0x391c0cb3c5c95a63), U64_C(0x4ed8aa4ae3418acb), + U64_C(0x5b9cca4f7763e373), U64_C(0x682e6ff3d6b2b8a3), + U64_C(0x748f82ee5defb2fc), U64_C(0x78a5636f43172f60), + U64_C(0x84c87814a1f0ab72), U64_C(0x8cc702081a6439ec), + U64_C(0x90befffa23631e28), U64_C(0xa4506cebde82bde9), + U64_C(0xbef9a3f7b2c67915), U64_C(0xc67178f2e372532b), + U64_C(0xca273eceea26619c), U64_C(0xd186b8c721c0c207), + U64_C(0xeada7dd6cde0eb1e), U64_C(0xf57d4f7fee6ed178), + U64_C(0x06f067aa72176fba), U64_C(0x0a637dc5a2c898a6), + U64_C(0x113f9804bef90dae), U64_C(0x1b710b35131c471b), + U64_C(0x28db77f523047d84), U64_C(0x32caab7b40c72493), + U64_C(0x3c9ebe0a15c9bebc), U64_C(0x431d67c49c100d4c), + U64_C(0x4cc5d4becb3e42b6), U64_C(0x597f299cfc657e2a), + U64_C(0x5fcb6fab3ad6faec), U64_C(0x6c44198c4a475817) + }; + /**************** * Transform the message W which consists of 16 64-bit-words */ static void -transform (SHA512_CONTEXT *hd, const unsigned char *data) +__transform (SHA512_CONTEXT *hd, const unsigned char *data) { u64 a, b, c, d, e, f, g, h; u64 w[16]; int t; - static const u64 k[] = - { - U64_C(0x428a2f98d728ae22), U64_C(0x7137449123ef65cd), - U64_C(0xb5c0fbcfec4d3b2f), U64_C(0xe9b5dba58189dbbc), - 
U64_C(0x3956c25bf348b538), U64_C(0x59f111f1b605d019),
-      U64_C(0x923f82a4af194f9b), U64_C(0xab1c5ed5da6d8118),
-      U64_C(0xd807aa98a3030242), U64_C(0x12835b0145706fbe),
-      U64_C(0x243185be4ee4b28c), U64_C(0x550c7dc3d5ffb4e2),
-      U64_C(0x72be5d74f27b896f), U64_C(0x80deb1fe3b1696b1),
-      U64_C(0x9bdc06a725c71235), U64_C(0xc19bf174cf692694),
-      U64_C(0xe49b69c19ef14ad2), U64_C(0xefbe4786384f25e3),
-      U64_C(0x0fc19dc68b8cd5b5), U64_C(0x240ca1cc77ac9c65),
-      U64_C(0x2de92c6f592b0275), U64_C(0x4a7484aa6ea6e483),
-      U64_C(0x5cb0a9dcbd41fbd4), U64_C(0x76f988da831153b5),
-      U64_C(0x983e5152ee66dfab), U64_C(0xa831c66d2db43210),
-      U64_C(0xb00327c898fb213f), U64_C(0xbf597fc7beef0ee4),
-      U64_C(0xc6e00bf33da88fc2), U64_C(0xd5a79147930aa725),
-      U64_C(0x06ca6351e003826f), U64_C(0x142929670a0e6e70),
-      U64_C(0x27b70a8546d22ffc), U64_C(0x2e1b21385c26c926),
-      U64_C(0x4d2c6dfc5ac42aed), U64_C(0x53380d139d95b3df),
-      U64_C(0x650a73548baf63de), U64_C(0x766a0abb3c77b2a8),
-      U64_C(0x81c2c92e47edaee6), U64_C(0x92722c851482353b),
-      U64_C(0xa2bfe8a14cf10364), U64_C(0xa81a664bbc423001),
-      U64_C(0xc24b8b70d0f89791), U64_C(0xc76c51a30654be30),
-      U64_C(0xd192e819d6ef5218), U64_C(0xd69906245565a910),
-      U64_C(0xf40e35855771202a), U64_C(0x106aa07032bbd1b8),
-      U64_C(0x19a4c116b8d2d0c8), U64_C(0x1e376c085141ab53),
-      U64_C(0x2748774cdf8eeb99), U64_C(0x34b0bcb5e19b48a8),
-      U64_C(0x391c0cb3c5c95a63), U64_C(0x4ed8aa4ae3418acb),
-      U64_C(0x5b9cca4f7763e373), U64_C(0x682e6ff3d6b2b8a3),
-      U64_C(0x748f82ee5defb2fc), U64_C(0x78a5636f43172f60),
-      U64_C(0x84c87814a1f0ab72), U64_C(0x8cc702081a6439ec),
-      U64_C(0x90befffa23631e28), U64_C(0xa4506cebde82bde9),
-      U64_C(0xbef9a3f7b2c67915), U64_C(0xc67178f2e372532b),
-      U64_C(0xca273eceea26619c), U64_C(0xd186b8c721c0c207),
-      U64_C(0xeada7dd6cde0eb1e), U64_C(0xf57d4f7fee6ed178),
-      U64_C(0x06f067aa72176fba), U64_C(0x0a637dc5a2c898a6),
-      U64_C(0x113f9804bef90dae), U64_C(0x1b710b35131c471b),
-      U64_C(0x28db77f523047d84), U64_C(0x32caab7b40c72493),
-      U64_C(0x3c9ebe0a15c9bebc), U64_C(0x431d67c49c100d4c),
-      U64_C(0x4cc5d4becb3e42b6), U64_C(0x597f299cfc657e2a),
-      U64_C(0x5fcb6fab3ad6faec), U64_C(0x6c44198c4a475817)
-    };
 
   /* get values from the chaining vars */
   a = hd->h0;
@@ -455,6 +476,33 @@ transform (SHA512_CONTEXT *hd, const unsigned char *data)
 }
 
 
+#ifdef USE_ARM_NEON_ASM
+void _gcry_sha512_transform_armv7_neon (SHA512_CONTEXT *hd,
+                                        const unsigned char *data,
+                                        const u64 k[]);
+#endif
+
+
+static unsigned int
+transform (SHA512_CONTEXT *hd, const unsigned char *data)
+{
+#ifdef USE_ARM_NEON_ASM
+  if (hd->use_neon)
+    {
+      _gcry_sha512_transform_armv7_neon(hd, data, k);
+
+      /* return stack burn depth */
+      return (sizeof(void *) * 3);
+    }
+#endif
+
+  __transform (hd, data);
+
+  /* return stack burn depth */
+  return 256;
+}
+
+
 /* Update the message digest with the contents
  * of INBUF with length INLEN.
  */
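The new transform() wrapper above returns a stack burn depth for the caller
to pass to _gcry_burn_stack(), which is why the hunks below thread a
stack_burn_depth variable through sha512_write() and sha512_final(). For
readers cross-checking the NEON macros against the C code: both paths
compute the standard SHA-512 round. A plain-C sketch of one round follows
-- illustrative reference only, not part of the patch; uint64_t stands in
for libgcrypt's u64, the function name is made up, and the assembly's
register rotation is written here as an explicit state rotation:

#include <stdint.h>

#define ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n))))

/* One SHA-512 round, t = 0..79.  s[] is the working state a..h, w[] the
   16-word message schedule window, kt the round constant k[t]. */
void
sha512_round_ref (uint64_t s[8], uint64_t w[16], uint64_t kt, int t)
{
  uint64_t a = s[0], b = s[1], c = s[2], d = s[3];
  uint64_t e = s[4], f = s[5], g = s[6], h = s[7];

  /* t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t] */
  uint64_t sum1 = ROTR64 (e, 14) ^ ROTR64 (e, 18) ^ ROTR64 (e, 41);
  uint64_t ch   = (e & f) ^ (~e & g);       /* the vand/vbic pair in the asm */
  uint64_t t1   = h + sum1 + ch + kt + w[t % 16];

  /* t2 = Sum0 (a) + Maj (a, b, c) */
  uint64_t sum0 = ROTR64 (a, 28) ^ ROTR64 (a, 34) ^ ROTR64 (a, 39);
  uint64_t maj  = (a & b) | ((a | b) & c);  /* same reduction as the
                                               vorr/vand sequence */
  uint64_t t2   = sum0 + maj;

  if (t < 64)  /* rounds 0..63 also extend the schedule in place:
                  w[t] += S1 (w[t+14]) + w[t+9] + S0 (w[t+1]), mod 16 */
    {
      uint64_t w1  = w[(t + 1) % 16], w14 = w[(t + 14) % 16];
      uint64_t s0  = ROTR64 (w1, 1) ^ ROTR64 (w1, 8) ^ (w1 >> 7);
      uint64_t s1  = ROTR64 (w14, 19) ^ ROTR64 (w14, 61) ^ (w14 >> 6);
      w[t % 16] += s1 + w[(t + 9) % 16] + s0;
    }

  s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;
  s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + t2;
}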
@@ -463,11 +511,12 @@ sha512_write (void *context, const void *inbuf_arg, size_t inlen)
 {
   const unsigned char *inbuf = inbuf_arg;
   SHA512_CONTEXT *hd = context;
+  unsigned int stack_burn_depth = 0;
 
   if (hd->count == 128)
     {                           /* flush the buffer */
-      transform (hd, hd->buf);
-      _gcry_burn_stack (256);
+      stack_burn_depth = transform (hd, hd->buf);
+      _gcry_burn_stack (stack_burn_depth);
       hd->count = 0;
       hd->nblocks++;
     }
@@ -484,13 +533,13 @@ sha512_write (void *context, const void *inbuf_arg, size_t inlen)
 
   while (inlen >= 128)
     {
-      transform (hd, inbuf);
+      stack_burn_depth = transform (hd, inbuf);
       hd->count = 0;
       hd->nblocks++;
       inlen -= 128;
       inbuf += 128;
     }
-  _gcry_burn_stack (256);
+  _gcry_burn_stack (stack_burn_depth);
   for (; inlen && hd->count < 128; inlen--)
     hd->buf[hd->count++] = *inbuf++;
 }
@@ -508,6 +557,7 @@ static void
 sha512_final (void *context)
 {
   SHA512_CONTEXT *hd = context;
+  unsigned int stack_burn_depth;
   u64 t, msb, lsb;
   byte *p;
 
@@ -559,8 +609,8 @@ sha512_final (void *context)
   hd->buf[125] = lsb >> 16;
   hd->buf[126] = lsb >> 8;
   hd->buf[127] = lsb;
-  transform (hd, hd->buf);
-  _gcry_burn_stack (256);
+  stack_burn_depth = transform (hd, hd->buf);
+  _gcry_burn_stack (stack_burn_depth);
 
   p = hd->buf;
 #ifdef WORDS_BIGENDIAN
diff --git a/configure.ac b/configure.ac
index 89b9366..959327a 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1526,6 +1526,11 @@ LIST_MEMBER(sha512, $enabled_digests)
 if test "$found" = "1" ; then
    GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha512.lo"
    AC_DEFINE(USE_SHA512, 1, [Defined if this module should be included])
+
+   if test x"$neonsupport" = xyes ; then
+     # Build with the NEON implementation
+     GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha512-armv7-neon.lo"
+   fi
 fi
 
 LIST_MEMBER(tiger, $enabled_digests)

From jussi.kivilinna at iki.fi  Sat Aug 31 15:40:39 2013
From: jussi.kivilinna at iki.fi (Jussi Kivilinna)
Date: Sat, 31 Aug 2013 16:40:39 +0300
Subject: [PATCH] Fix building for x32 target
Message-ID: <20130831134039.4728.39322.stgit@localhost6.localdomain6>

* mpi/amd64/mpi-asm-defs.h: New file.
* random/rndhw.c (poll_padlock) [__x86_64__]: Also check if __LP64__
is defined.
[USE_DRNG, __x86_64__]: Also check if __LP64__ is defined.
--

In short, x32 is a new x86-64 ABI with 32-bit pointers. Adding support is
straightforward: a small fix for mpi and two fixes for random/rndhw.c.
The AMD64 assembly functions appear to work fine with x32, and
'make check' passes.

Signed-off-by: Jussi Kivilinna
---
 mpi/amd64/mpi-asm-defs.h |    4 ++++
 random/rndhw.c           |    4 ++--
 2 files changed, 6 insertions(+), 2 deletions(-)
 create mode 100644 mpi/amd64/mpi-asm-defs.h

diff --git a/mpi/amd64/mpi-asm-defs.h b/mpi/amd64/mpi-asm-defs.h
new file mode 100644
index 0000000..6519065
--- /dev/null
+++ b/mpi/amd64/mpi-asm-defs.h
@@ -0,0 +1,4 @@
+/* This file defines some basic constants for the MPI machinery.  We
+ * need to define the types on a per-CPU basis, so it is done with
+ * this file here.  */
+#define BYTES_PER_MPI_LIMB  (SIZEOF_UNSIGNED_LONG_LONG)
diff --git a/random/rndhw.c b/random/rndhw.c
index ca99c5f..e625512 100644
--- a/random/rndhw.c
+++ b/random/rndhw.c
@@ -69,7 +69,7 @@ poll_padlock (void (*add)(const void*, size_t, enum random_origins),
   nbytes = 0;
   while (nbytes < 64)
     {
-#ifdef __x86_64__
+#if defined(__x86_64__) && defined(__LP64__)
       asm volatile
        ("movq %1, %%rdi\n\t"         /* Set buffer.  */
         "xorq %%rdx, %%rdx\n\t"      /* Request up to 8 bytes.  */
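The hunk above is the heart of the PadLock fix: "movq %1, %%rdi" moves a
64-bit operand, but on x32 the buffer pointer passed in %1 is only 32 bits
wide, so the XSTORE path is now compiled only for LP64 targets. A
stand-alone illustration of the ABI split -- a hypothetical example, not
part of the patch; GCC defines __ILP32__ alongside __x86_64__ on x32:

#include <stdio.h>

int
main (void)
{
#if defined(__x86_64__) && defined(__LP64__)
  puts ("LP64 x86-64: long and pointers are 64-bit, 64-bit pointer asm is safe");
#elif defined(__x86_64__) && defined(__ILP32__)
  puts ("x32: x86-64 instruction set, but long and pointers are 32-bit");
#else
  puts ("not an x86-64 target");
#endif
  printf ("sizeof (void *) = %u, sizeof (long) = %u\n",
          (unsigned int) sizeof (void *), (unsigned int) sizeof (long));
  return 0;
}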
@@ -123,7 +123,7 @@ poll_padlock (void (*add)(const void*, size_t, enum random_origins),
 #ifdef USE_DRNG
 # define RDRAND_RETRY_LOOPS 10
 # define RDRAND_INT ".byte 0x0f,0xc7,0xf0"
-# ifdef __x86_64__
+# if defined(__x86_64__) && defined(__LP64__)
 # define RDRAND_LONG ".byte 0x48,0x0f,0xc7,0xf0"
 # else
 # define RDRAND_LONG RDRAND_INT
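The distinction above matters for the same reason: ".byte 0x48,0x0f,0xc7,0xf0"
encodes rdrand %rax, which writes a 64-bit result, while unsigned long is
only 32 bits wide on x32, so the 32-bit encoding must be used there. For
context, the usual retry pattern these macros plug into looks roughly like
the following -- an illustrative sketch, not a quote of rndhw.c; RDRAND
sets the carry flag on success and may transiently fail:

#include <stdio.h>

#define RDRAND_RETRY_LOOPS 10
#define RDRAND_LONG ".byte 0x48,0x0f,0xc7,0xf0"  /* rdrand %rax; LP64 only */

/* Returns non-zero and stores a random value in *v on success. */
static int
rdrand_long (unsigned long *v)
{
  int ok;
  asm volatile ("1: " RDRAND_LONG "\n\t"
                "jc 2f\n\t"      /* CF=1: hardware delivered a value */
                "decl %0\n\t"    /* CF=0: transient failure, retry   */
                "jnz 1b\n\t"
                "2:"
                : "=r" (ok), "=a" (*v)
                : "0" (RDRAND_RETRY_LOOPS));
  return ok;
}

int
main (void)
{
  unsigned long r;
  /* Run this only on an LP64 x86-64 CPU that has RDRAND; on older
     CPUs the instruction faults rather than clearing the carry. */
  if (rdrand_long (&r))
    printf ("rdrand: %#lx\n", r);
  else
    puts ("rdrand kept failing");
  return 0;
}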