[RFC PATCH v2] Initial implementation of GCM

Jussi Kivilinna jussi.kivilinna at iki.fi
Fri Nov 8 14:43:35 CET 2013


On 08.11.2013 12:03, Dmitry Eremin-Solenikov wrote:
> Currently it is still quite slow.
> 
> Still no support for generate_iv(). Is it really necessary?
> 
> TODO: Merge/reuse cipher-internal state used by CCM.
> 
> Changelog entry will be present in final patch submission.
> 
> Changes since v1:
> - 6x-7x speedup.
> - added bench-slope support
> 
> Signed-off-by: Dmitry Eremin-Solenikov <dbaryshkov at gmail.com>
> ---
>  cipher/Makefile.am       |   2 +-
>  cipher/cipher-ccm.c      |  12 +-
>  cipher/cipher-gcm.c      | 476 +++++++++++++++++++++++++++++++++++++++++++++++
>  cipher/cipher-internal.h |  36 +++-
>  cipher/cipher.c          |  32 ++++
>  src/gcrypt.h.in          |   6 +-
>  tests/basic.c            | 337 +++++++++++++++++++++++++++++++++
>  tests/bench-slope.c      | 128 +++++++++++++
>  tests/benchmark.c        |   2 +
>  9 files changed, 1023 insertions(+), 8 deletions(-)
>  create mode 100644 cipher/cipher-gcm.c
> 
[snip]
> +static void ghash(unsigned char *result, const unsigned char *buf, const unsigned char *gcmM)
> +{
> +  unsigned char V[16];
> +  int i;
> +
> +  buf_xor(V, result, buf, 16);
> +
> +  memset(result, 0, 16);
> +
> +  for (i = 15; i >= 0; i--)
> +    {
> +      byte A = result[15];
> +      byte T[16];
> +      int j;
> +      const byte *M = &gcmM[(V[i] & 0xf) * 16];
> +
> +      memmove(result+1, result, 15);
> +      result[0] = gcmR[A][0];
> +      result[1] ^= gcmR[A][1];
> +
> +      T[0] = M[0] >> 4;
> +      for (j = 1; j < 16; j++)
> +        T[j] = (M[j] >> 4) | (M[j-1] << 4);
> +      T[0] ^= gcmR[(M[15] & 0xf) << 4][0];
> +      T[1] ^= gcmR[(M[15] & 0xf) << 4][1];
> +      buf_xor(T, T, &gcmM[(V[i] >> 4) * 16], 16);
> +      buf_xor(result, result, T, 16);
> +    }
> +}
> +#define GHASH(c, result, buf) ghash (result, buf, c->gcm_table);

Following is faster:


static void
bshift (u64 *b)
{
  u64 t[2], mask;

  t[0] = be_bswap64(b[0]);
  t[1] = be_bswap64(b[1]);
  mask = t[1] & 1 ? 0xe1: 0;

  t[1] = (t[1] >> 1) | (t[0] << 63);
  t[0] = (t[0] >> 1) ^ (mask << 56);

  b[0] = be_bswap64(t[0]);
  b[1] = be_bswap64(t[1]);
}

static void
fillM (unsigned char *h, u64 *M)
{
  int i, j;

  M[0 * 2 + 0] = 0;
  M[0 * 2 + 1] = 0;
  buf_cpy (&M[8 * 2], h, 16);

  for (i = 4; i > 0; i /= 2)
    {
      M[i * 2 + 0] = M[2 * i * 2 + 0];
      M[i * 2 + 1] = M[2 * i * 2 + 1];

      bshift (&M[i * 2]);
    }

  for (i = 2; i < 16; i *= 2)
    for (j = 1; j < i; j++)
      buf_xor (&M[(i + j) * 2], &M[i * 2], &M[j * 2], 16);
}

static void
ghash (unsigned char *result, const unsigned char *buf, const u64 *gcmM)
{
  unsigned char V[16];
  u64 tmp[2] = {0,};
  int i;
  byte A, m15;
  u64 T[2];
  u64 y;
  const u64 *M;

  buf_xor (V, result, buf, 16);

  /* First round can be manually tweaked based on fact that 'tmp' is zero. */
  i = 15;

  M = &gcmM[(V[i] & 0xf) * 2];
  m15 = M[1] >> 56;
  T[0] = (M[0] >> 4) & 0x0f0f0f0f0f0f0f0fULL;
  T[1] = (M[1] >> 4) & 0x0f0f0f0f0f0f0f0fULL;
  T[0] |= (M[0] & 0x0f0f0f0f0f0f0f0fULL) << 12;
  T[1] |= (M[1] & 0x0f0f0f0f0f0f0f0fULL) << 12;
  T[1] |= (M[0] & 0x0f0f0f0f0f0f0f0fULL) >> (64 - 12);
  T[0] ^= gcmR[(m15 & 0xf) << 4][0] | (gcmR[(m15 & 0xf) << 4][1] << 8);

  tmp[0] = T[0] ^ gcmM[(V[i] >> 4) * 2 + 0];
  tmp[1] = T[1] ^ gcmM[(V[i] >> 4) * 2 + 1];

  for (--i; i >= 0; i--)
    {
      M = &gcmM[(V[i] & 0xf) * 2];
      A = tmp[1] >> 56;

      y = tmp[0];
      tmp[0] = (y << 8);
      tmp[1] = (y >> 56) | (tmp[1] << 8);
      tmp[0] ^= gcmR[A][0] | (gcmR[A][1] << 8);

      m15 = M[1] >> 56;
      T[0] = (M[0] >> 4) & 0x0f0f0f0f0f0f0f0fULL;
      T[1] = (M[1] >> 4) & 0x0f0f0f0f0f0f0f0fULL;
      T[0] |= (M[0] & 0x0f0f0f0f0f0f0f0fULL) << 12;
      T[1] |= (M[1] & 0x0f0f0f0f0f0f0f0fULL) << 12;
      T[1] |= (M[0] & 0x0f0f0f0f0f0f0f0fULL) >> (64 - 12);
      T[0] ^= gcmR[(m15 & 0xf) << 4][0] | (gcmR[(m15 & 0xf) << 4][1] << 8);

      tmp[0] ^= T[0] ^ gcmM[(V[i] >> 4) * 2 + 0];
      tmp[1] ^= T[1] ^ gcmM[(V[i] >> 4) * 2 + 1];
    }

  buf_cpy (result, tmp, 16);
}


Note, the cipher_handle->gcm_table is also changed to:
#ifdef GCM_USE_TABLES
  u64 gcm_table[2 * 16]; /* pre-calculated table for GCM */
#endif


Cortex-A8:
 old GCM auth |     228.5 ns/B      4.17 MiB/s     230.3 c/B
 new GCM auth |     87.30 ns/B     10.92 MiB/s     88.00 c/B

Core i5-4570:
 old GCM auth |     26.22 ns/B     36.37 MiB/s     83.91 c/B
 new GCM auth |      5.02 ns/B     190.0 MiB/s     16.06 c/B


Intel can be improved further with use of ctr-bulk encryption
functions and ghash with clmul instruction.

> +
> +#else
> +
[snip]
> +
> +
> +gcry_err_code_t
> +_gcry_cipher_gcm_encrypt (gcry_cipher_hd_t c,
> +                          byte *outbuf, unsigned int outbuflen,
> +                          const byte *inbuf, unsigned int inbuflen)
> +{
> +  unsigned int n;
> +  int i;
> +  unsigned int blocksize = c->spec->blocksize;
> +  unsigned char tmp[MAX_BLOCKSIZE];
> +
> +  if (blocksize >= 0x20)
> +    return GPG_ERR_CIPHER_ALGO;
> +  if (blocksize != 0x10)
> +    return GPG_ERR_CIPHER_ALGO;

The '>= 0x20' can removed.

> +  if (outbuflen < inbuflen)
> +    return GPG_ERR_BUFFER_TOO_SHORT;
> +
[snip]
> +
> +gcry_err_code_t
> +_gcry_cipher_gcm_decrypt (gcry_cipher_hd_t c,
> +                          byte *outbuf, unsigned int outbuflen,
> +                          const byte *inbuf, unsigned int inbuflen)
> +{
> +  unsigned int n;
> +  int i;
> +  unsigned int blocksize = c->spec->blocksize;
> +  unsigned char tmp[MAX_BLOCKSIZE];
> +
> +  if (blocksize >= 0x20)
> +    return GPG_ERR_CIPHER_ALGO;
> +  if (blocksize != 0x10)
> +    return GPG_ERR_CIPHER_ALGO;

Unnecessary '>= 0x20'.

> +  if (outbuflen < inbuflen)
> +    return GPG_ERR_BUFFER_TOO_SHORT;
> +
[snip]
> +gcry_err_code_t
> +_gcry_cipher_gcm_tag (gcry_cipher_hd_t c,
> +                      byte *outbuf, unsigned int outbuflen,
> +                      int check)
> +{
> +  if (outbuflen < 16)
> +    return GPG_ERR_BUFFER_TOO_SHORT;
> +
> +  if (!c->marks.tag)
> +    {
> +      GHASH (c, c->u_tag.tag, c->length);
> +      buf_xor (c->u_tag.tag, c->lastiv, c->u_tag.tag, 16);
> +      c->marks.tag = 1;
> +    }
> +  memcpy (outbuf, c->u_tag.tag, 16);

When doing the above optimizations of ghash, tests/basic did not
first report any errors.. because of this memcpy.

Overwrites the outbuf when should be doing 'check'.

> +  if (!check)
> +    {
> +      memcpy (outbuf, c->u_tag.tag, outbuflen);
> +      return GPG_ERR_NO_ERROR;
> +    }
> +  else
> +    {
> +      int diff, i;
> +
> +      /* Constant-time compare. */
> +      for (i = 0, diff = 0; i < outbuflen; i++)
> +        diff -= !!(outbuf[i] - c->u_tag.tag[i]);
> +
> +      return !diff ? GPG_ERR_NO_ERROR : GPG_ERR_CHECKSUM;
> +    }
> +
[snip]
>  
> +  /* The interim tag for GCM mode.  */
> +  union {
> +    cipher_context_alignment_t iv_align;
> +    unsigned char tag[MAX_BLOCKSIZE];
> +  } u_tag;
> +

Maybe this and gcm_table could be moved within u_mode union.

>    /* Space to save an IV or CTR for chaining operations.  */
>    unsigned char lastiv[MAX_BLOCKSIZE];
>    int unused;  /* Number of unused bytes in LASTIV. */
> +  unsigned char length[MAX_BLOCKSIZE]; /* bit counters for GCM */
> +#ifdef GCM_USE_TABLES
> +  unsigned char gcm_table[16 * 16]; /* pre-calculated table for GCM */
> +#endif
>  
[snip]
> +gcry_err_code_t _gcry_cipher_gcm_get_tag
> +/*           */   (gcry_cipher_hd_t c,
> +                   unsigned char *outbuf, unsigned int outbuflen);
> +gcry_err_code_t _gcry_cipher_gcm_check_tag
> +/*           */   (gcry_cipher_hd_t c,
> +                   const unsigned char *outbuf, unsigned int outbuflen);
>  

Building with './configure --enable-maintainer-mode' fails here. Prototypes do
not match with functions in cipher-gcm.c (unsigned int <=> size_t).

>  
>  #endif /*G10_CIPHER_INTERNAL_H*/
> diff --git a/cipher/cipher.c b/cipher/cipher.c
[snip]
> @@ -622,6 +630,8 @@ cipher_reset (gcry_cipher_hd_t c)
>    memset (c->u_ctr.ctr, 0, c->spec->blocksize);
>    memset (&c->u_mode, 0, sizeof c->u_mode);
>    c->unused = 0;
> +/*  memset (c->u_tag.tag, 0, c->spec->blocksize);
> +  memset (c->length, 0, c->spec->blocksize); */

Maybe u_mode can be cleared depending on currently selected mode, full clear for CCM
and less for GCM.

-Jussi




More information about the Gcrypt-devel mailing list