[RFC PATCH v2] Initial implementation of GCM
Jussi Kivilinna
jussi.kivilinna at iki.fi
Fri Nov 8 14:43:35 CET 2013
On 08.11.2013 12:03, Dmitry Eremin-Solenikov wrote:
> Currently it is still quite slow.
>
> Still no support for generate_iv(). Is it really necessary?
>
> TODO: Merge/reuse cipher-internal state used by CCM.
>
> Changelog entry will be present in final patch submission.
>
> Changes since v1:
> - 6x-7x speedup.
> - added bench-slope support
>
> Signed-off-by: Dmitry Eremin-Solenikov <dbaryshkov at gmail.com>
> ---
> cipher/Makefile.am | 2 +-
> cipher/cipher-ccm.c | 12 +-
> cipher/cipher-gcm.c | 476 +++++++++++++++++++++++++++++++++++++++++++++++
> cipher/cipher-internal.h | 36 +++-
> cipher/cipher.c | 32 ++++
> src/gcrypt.h.in | 6 +-
> tests/basic.c | 337 +++++++++++++++++++++++++++++++++
> tests/bench-slope.c | 128 +++++++++++++
> tests/benchmark.c | 2 +
> 9 files changed, 1023 insertions(+), 8 deletions(-)
> create mode 100644 cipher/cipher-gcm.c
>
[snip]
> +static void ghash(unsigned char *result, const unsigned char *buf, const unsigned char *gcmM)
> +{
> + unsigned char V[16];
> + int i;
> +
> + buf_xor(V, result, buf, 16);
> +
> + memset(result, 0, 16);
> +
> + for (i = 15; i >= 0; i--)
> + {
> + byte A = result[15];
> + byte T[16];
> + int j;
> + const byte *M = &gcmM[(V[i] & 0xf) * 16];
> +
> + memmove(result+1, result, 15);
> + result[0] = gcmR[A][0];
> + result[1] ^= gcmR[A][1];
> +
> + T[0] = M[0] >> 4;
> + for (j = 1; j < 16; j++)
> + T[j] = (M[j] >> 4) | (M[j-1] << 4);
> + T[0] ^= gcmR[(M[15] & 0xf) << 4][0];
> + T[1] ^= gcmR[(M[15] & 0xf) << 4][1];
> + buf_xor(T, T, &gcmM[(V[i] >> 4) * 16], 16);
> + buf_xor(result, result, T, 16);
> + }
> +}
> +#define GHASH(c, result, buf) ghash (result, buf, c->gcm_table);
The following is faster:
static void
bshift (u64 *b)
{
u64 t[2], mask;
t[0] = be_bswap64(b[0]);
t[1] = be_bswap64(b[1]);
mask = t[1] & 1 ? 0xe1: 0;
t[1] = (t[1] >> 1) | (t[0] << 63);
t[0] = (t[0] >> 1) ^ (mask << 56);
b[0] = be_bswap64(t[0]);
b[1] = be_bswap64(t[1]);
}
static void
fillM (unsigned char *h, u64 *M)
{
int i, j;
M[0 * 2 + 0] = 0;
M[0 * 2 + 1] = 0;
buf_cpy (&M[8 * 2], h, 16);
for (i = 4; i > 0; i /= 2)
{
M[i * 2 + 0] = M[2 * i * 2 + 0];
M[i * 2 + 1] = M[2 * i * 2 + 1];
bshift (&M[i * 2]);
}
for (i = 2; i < 16; i *= 2)
for (j = 1; j < i; j++)
buf_xor (&M[(i + j) * 2], &M[i * 2], &M[j * 2], 16);
}
static void
ghash (unsigned char *result, const unsigned char *buf, const u64 *gcmM)
{
unsigned char V[16];
u64 tmp[2] = {0,};
int i;
byte A, m15;
u64 T[2];
u64 y;
const u64 *M;
buf_xor (V, result, buf, 16);
/* First round can be manually tweaked based on fact that 'tmp' is zero. */
i = 15;
M = &gcmM[(V[i] & 0xf) * 2];
m15 = M[1] >> 56;
T[0] = (M[0] >> 4) & 0x0f0f0f0f0f0f0f0fULL;
T[1] = (M[1] >> 4) & 0x0f0f0f0f0f0f0f0fULL;
T[0] |= (M[0] & 0x0f0f0f0f0f0f0f0fULL) << 12;
T[1] |= (M[1] & 0x0f0f0f0f0f0f0f0fULL) << 12;
T[1] |= (M[0] & 0x0f0f0f0f0f0f0f0fULL) >> (64 - 12);
T[0] ^= gcmR[(m15 & 0xf) << 4][0] | (gcmR[(m15 & 0xf) << 4][1] << 8);
tmp[0] = T[0] ^ gcmM[(V[i] >> 4) * 2 + 0];
tmp[1] = T[1] ^ gcmM[(V[i] >> 4) * 2 + 1];
for (--i; i >= 0; i--)
{
M = &gcmM[(V[i] & 0xf) * 2];
A = tmp[1] >> 56;
y = tmp[0];
tmp[0] = (y << 8);
tmp[1] = (y >> 56) | (tmp[1] << 8);
tmp[0] ^= gcmR[A][0] | (gcmR[A][1] << 8);
m15 = M[1] >> 56;
T[0] = (M[0] >> 4) & 0x0f0f0f0f0f0f0f0fULL;
T[1] = (M[1] >> 4) & 0x0f0f0f0f0f0f0f0fULL;
T[0] |= (M[0] & 0x0f0f0f0f0f0f0f0fULL) << 12;
T[1] |= (M[1] & 0x0f0f0f0f0f0f0f0fULL) << 12;
T[1] |= (M[0] & 0x0f0f0f0f0f0f0f0fULL) >> (64 - 12);
T[0] ^= gcmR[(m15 & 0xf) << 4][0] | (gcmR[(m15 & 0xf) << 4][1] << 8);
tmp[0] ^= T[0] ^ gcmM[(V[i] >> 4) * 2 + 0];
tmp[1] ^= T[1] ^ gcmM[(V[i] >> 4) * 2 + 1];
}
buf_cpy (result, tmp, 16);
}
Note, the cipher_handle->gcm_table is also changed to:
#ifdef GCM_USE_TABLES
u64 gcm_table[2 * 16]; /* pre-calculated table for GCM */
#endif
Cortex-A8:
old GCM auth | 228.5 ns/B 4.17 MiB/s 230.3 c/B
new GCM auth | 87.30 ns/B 10.92 MiB/s 88.00 c/B
Core i5-4570:
old GCM auth | 26.22 ns/B 36.37 MiB/s 83.91 c/B
new GCM auth | 5.02 ns/B 190.0 MiB/s 16.06 c/B
Intel can be improved further with use of ctr-bulk encryption
functions and ghash with clmul instruction.
> +
> +#else
> +
[snip]
> +
> +
> +gcry_err_code_t
> +_gcry_cipher_gcm_encrypt (gcry_cipher_hd_t c,
> + byte *outbuf, unsigned int outbuflen,
> + const byte *inbuf, unsigned int inbuflen)
> +{
> + unsigned int n;
> + int i;
> + unsigned int blocksize = c->spec->blocksize;
> + unsigned char tmp[MAX_BLOCKSIZE];
> +
> + if (blocksize >= 0x20)
> + return GPG_ERR_CIPHER_ALGO;
> + if (blocksize != 0x10)
> + return GPG_ERR_CIPHER_ALGO;
The '>= 0x20' check can be removed; the '!= 0x10' check below already rejects those block sizes.
> + if (outbuflen < inbuflen)
> + return GPG_ERR_BUFFER_TOO_SHORT;
> +
[snip]
> +
> +gcry_err_code_t
> +_gcry_cipher_gcm_decrypt (gcry_cipher_hd_t c,
> + byte *outbuf, unsigned int outbuflen,
> + const byte *inbuf, unsigned int inbuflen)
> +{
> + unsigned int n;
> + int i;
> + unsigned int blocksize = c->spec->blocksize;
> + unsigned char tmp[MAX_BLOCKSIZE];
> +
> + if (blocksize >= 0x20)
> + return GPG_ERR_CIPHER_ALGO;
> + if (blocksize != 0x10)
> + return GPG_ERR_CIPHER_ALGO;
Unnecessary '>= 0x20'.
> + if (outbuflen < inbuflen)
> + return GPG_ERR_BUFFER_TOO_SHORT;
> +
[snip]
> +gcry_err_code_t
> +_gcry_cipher_gcm_tag (gcry_cipher_hd_t c,
> + byte *outbuf, unsigned int outbuflen,
> + int check)
> +{
> + if (outbuflen < 16)
> + return GPG_ERR_BUFFER_TOO_SHORT;
> +
> + if (!c->marks.tag)
> + {
> + GHASH (c, c->u_tag.tag, c->length);
> + buf_xor (c->u_tag.tag, c->lastiv, c->u_tag.tag, 16);
> + c->marks.tag = 1;
> + }
> + memcpy (outbuf, c->u_tag.tag, 16);
While doing the above ghash optimizations, tests/basic at first did not
report any errors... because of this memcpy: it overwrites outbuf with
the computed tag even when it should only be doing the 'check'.
> + if (!check)
> + {
> + memcpy (outbuf, c->u_tag.tag, outbuflen);
> + return GPG_ERR_NO_ERROR;
> + }
> + else
> + {
> + int diff, i;
> +
> + /* Constant-time compare. */
> + for (i = 0, diff = 0; i < outbuflen; i++)
> + diff -= !!(outbuf[i] - c->u_tag.tag[i]);
> +
> + return !diff ? GPG_ERR_NO_ERROR : GPG_ERR_CHECKSUM;
> + }
> +
[snip]
>
> + /* The interim tag for GCM mode. */
> + union {
> + cipher_context_alignment_t iv_align;
> + unsigned char tag[MAX_BLOCKSIZE];
> + } u_tag;
> +
Maybe this and gcm_table could be moved within u_mode union.
> /* Space to save an IV or CTR for chaining operations. */
> unsigned char lastiv[MAX_BLOCKSIZE];
> int unused; /* Number of unused bytes in LASTIV. */
> + unsigned char length[MAX_BLOCKSIZE]; /* bit counters for GCM */
> +#ifdef GCM_USE_TABLES
> + unsigned char gcm_table[16 * 16]; /* pre-calculated table for GCM */
> +#endif
>
[snip]
> +gcry_err_code_t _gcry_cipher_gcm_get_tag
> +/* */ (gcry_cipher_hd_t c,
> + unsigned char *outbuf, unsigned int outbuflen);
> +gcry_err_code_t _gcry_cipher_gcm_check_tag
> +/* */ (gcry_cipher_hd_t c,
> + const unsigned char *outbuf, unsigned int outbuflen);
>
Building with './configure --enable-maintainer-mode' fails here. Prototypes do
not match with functions in cipher-gcm.c (unsigned int <=> size_t).
>
> #endif /*G10_CIPHER_INTERNAL_H*/
> diff --git a/cipher/cipher.c b/cipher/cipher.c
[snip]
> @@ -622,6 +630,8 @@ cipher_reset (gcry_cipher_hd_t c)
> memset (c->u_ctr.ctr, 0, c->spec->blocksize);
> memset (&c->u_mode, 0, sizeof c->u_mode);
> c->unused = 0;
> +/* memset (c->u_tag.tag, 0, c->spec->blocksize);
> + memset (c->length, 0, c->spec->blocksize); */
Maybe u_mode could be cleared depending on the currently selected mode: a full
clear for CCM and a smaller one for GCM.
-Jussi
More information about the Gcrypt-devel
mailing list