From gniibe at fsij.org  Fri Jul  3 11:27:19 2015
From: gniibe at fsij.org (NIIBE Yutaka)
Date: Fri, 03 Jul 2015 18:27:19 +0900
Subject: [PATCH] Curve25519 encryption support (experimental)
Message-ID: <55965577.7020306@fsij.org>

Hello,

This is a pretty immature, experimental patch for Curve25519 encryption
support.

Since Montgomery curve support is available in libgcrypt, it is used.

I assume that key generation is done with:

    (genkey(ecc(curve Curve25519)(flags eddsa)))

Then, '(flags eddsa)' means that the public key is in DJB format with
the prefix 0x40, like EdDSA.

I tested with a modified version of GnuPG 2.1.  I'm going to submit
the patch for GnuPG now.

Please note that this is highly experimental.  The format is not
yet decided.


diff --git a/cipher/ecc-common.h b/cipher/ecc-common.h
index f0d97ea..6b3b063 100644
--- a/cipher/ecc-common.h
+++ b/cipher/ecc-common.h
@@ -132,6 +132,8 @@ gpg_err_code_t _gcry_ecc_eddsa_verify (gcry_mpi_t input,
                                        ECC_public_key *pk,
                                        gcry_mpi_t r, gcry_mpi_t s,
                                        int hashalgo, gcry_mpi_t pkmpi);
+gpg_err_code_t _gcry_ecc_mont_decodepoint (gcry_mpi_t pk, mpi_ec_t ctx,
+                                           mpi_point_t result);

 /*-- ecc-gost.c --*/
 gpg_err_code_t _gcry_ecc_gost_sign (gcry_mpi_t input, ECC_secret_key *skey,
diff --git a/cipher/ecc-curves.c b/cipher/ecc-curves.c
index 9975bb4..5d855bd 100644
--- a/cipher/ecc-curves.c
+++ b/cipher/ecc-curves.c
@@ -40,7 +40,7 @@ static const struct
   const char *other; /* Other name. */
 } curve_aliases[] =
   {
-  /*{ "Curve25519", "1.3.6.1.4.1.3029.1.5.1" },*/
+    { "Curve25519", "1.3.6.1.4.1.3029.1.5.1" },
     { "Ed25519",    "1.3.6.1.4.1.11591.15.1" },

     { "NIST P-192", "1.2.840.10045.3.1.1" }, /* X9.62 OID  */
@@ -129,6 +129,18 @@ static const ecc_domain_parms_t domain_parms[] =
       "0x6666666666666666666666666666666666666666666666666666666666666658",
       "0x08"
     },
+    {
+      /* (y^2 = x^3 + 486662*x^2 + x) */
+      "Curve25519", 256, 0,
+      MPI_EC_MONTGOMERY, ECC_DIALECT_ED25519,
+      "0x7FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFED",
+      "0x01DB41",
+      "0x01",
+      "0x1000000000000000000000000000000014DEF9DEA2F79CD65812631A5CF5D3ED",
+      "0x0000000000000000000000000000000000000000000000000000000000000009",
+      "0x20AE19A1B8A086B4E01EDD2C7748D14C923D4D7E6D7C61B229E9C5A27ECED3D9",
+      "0x08"
+    },
 #if 0 /* No real specs yet found.  */
     {
       /* x^2 + y^2 = 1 + 3617x^2y^2 mod 2^414 - 17 */
diff --git a/cipher/ecc-eddsa.c b/cipher/ecc-eddsa.c
index 4323d8e..72481ba 100644
--- a/cipher/ecc-eddsa.c
+++ b/cipher/ecc-eddsa.c
@@ -400,6 +400,51 @@ _gcry_ecc_eddsa_decodepoint (gcry_mpi_t pk, mpi_ec_t ctx, mpi_point_t result,
 }


+gpg_err_code_t
+_gcry_ecc_mont_decodepoint (gcry_mpi_t pk, mpi_ec_t ctx, mpi_point_t result)
+{
+  unsigned char *rawmpi;
+  unsigned int rawmpilen;
+
+  if (mpi_is_opaque (pk))
+    {
+      const unsigned char *buf;
+
+      buf = mpi_get_opaque (pk, &rawmpilen);
+      if (!buf)
+        return GPG_ERR_INV_OBJ;
+      rawmpilen = (rawmpilen + 7)/8;
+
+      if (rawmpilen > 1 && (rawmpilen%2) && buf[0] == 0x40)
+        {
+          rawmpilen--;
+          buf++;
+        }
+
+      rawmpi = xtrymalloc (rawmpilen? rawmpilen:1);
+      if (!rawmpi)
+        return gpg_err_code_from_syserror ();
+      memcpy (rawmpi, buf, rawmpilen);
+      reverse_buffer (rawmpi, rawmpilen);
+    }
+  else
+    {
+      /* Note: Without using an opaque MPI it is not reliably possible
+         to find out whether the public key has been given in
+         uncompressed format.  Thus we expect native EdDSA format.  */
+      rawmpi = _gcry_mpi_get_buffer (pk, ctx->nbits/8, &rawmpilen, NULL);
+      if (!rawmpi)
+        return gpg_err_code_from_syserror ();
+    }
+
+  _gcry_mpi_set_buffer (result->x, rawmpi, rawmpilen, 0);
+  xfree (rawmpi);
+  mpi_set_ui (result->z, 1);
+
+  return 0;
+}
+
+
 /* Compute the A value as used by EdDSA.  The caller needs to provide
    the context EC and the actual secret D as an MPI.  The function
    returns a newly allocated 64 byte buffer at r_digest; the first 32
diff --git a/cipher/ecc.c b/cipher/ecc.c
index 5ffe84b..e5b3459 100644
--- a/cipher/ecc.c
+++ b/cipher/ecc.c
@@ -174,7 +174,10 @@ nist_generate_key (ECC_secret_key *sk, elliptic_curve_t *E, mpi_ec_t ctx,
   point_init (&sk->Q);

   x = mpi_new (pbits);
-  y = mpi_new (pbits);
+  if (r_y == NULL)
+    y = NULL;
+  else
+    y = mpi_new (pbits);
   if (_gcry_mpi_ec_get_affine (x, y, &Q, ctx))
     log_fatal ("ecgen: Failed to get affine coordinates for %s\n", "Q");

@@ -187,7 +190,7 @@ nist_generate_key (ECC_secret_key *sk, elliptic_curve_t *E, mpi_ec_t ctx,
    * possibilities without any loss of security.  Note that we don't
    * do that for Ed25519 so that we do not violate the special
    * construction of the secret key.  */
-  if (E->dialect == ECC_DIALECT_ED25519)
+  if (E->dialect == ECC_DIALECT_ED25519 || r_y == NULL)
     point_set (&sk->Q, &Q);
   else
     {
@@ -231,7 +234,8 @@ nist_generate_key (ECC_secret_key *sk, elliptic_curve_t *E, mpi_ec_t ctx,
     }

   *r_x = x;
-  *r_y = y;
+  if (r_y)
+    *r_y = y;

   point_free (&Q);
   /* Now we can test our keys (this should never fail!).  */
@@ -307,7 +311,7 @@ test_ecdh_only_keys (ECC_secret_key *sk, unsigned int nbits)
   mpi_ec_t ec;

   if (DBG_CIPHER)
-    log_debug ("Testing key.\n");
+    log_debug ("Testing ECDH only key.\n");

   point_init (&R_);

@@ -572,7 +576,9 @@ ecc_generate (const gcry_sexp_t genparms, gcry_sexp_t *r_skey)

   ctx = _gcry_mpi_ec_p_internal_new (E.model, E.dialect, 0, E.p, E.a, E.b);

-  if ((flags & PUBKEY_FLAG_EDDSA))
+  if (E.model == MPI_EC_MONTGOMERY)
+    rc = nist_generate_key (&sk, &E, ctx, flags, nbits, &Qx, NULL);
+  else if ((flags & PUBKEY_FLAG_EDDSA))
     rc = _gcry_ecc_eddsa_genkey (&sk, &E, ctx, flags);
   else
     rc = nist_generate_key (&sk, &E, ctx, flags, nbits, &Qx, &Qy);
@@ -582,26 +588,41 @@ ecc_generate (const gcry_sexp_t genparms, gcry_sexp_t *r_skey)
   /* Copy data to the result.  */
   Gx = mpi_new (0);
   Gy = mpi_new (0);
-  if (_gcry_mpi_ec_get_affine (Gx, Gy, &sk.E.G, ctx))
-    log_fatal ("ecgen: Failed to get affine coordinates for %s\n", "G");
-  base = _gcry_ecc_ec2os (Gx, Gy, sk.E.p);
   if (sk.E.dialect == ECC_DIALECT_ED25519 && !(flags & PUBKEY_FLAG_NOCOMP))
     {
       unsigned char *encpk;
       unsigned int encpklen;

-      /* (Gx and Gy are used as scratch variables)  */
-      rc = _gcry_ecc_eddsa_encodepoint (&sk.Q, ctx, Gx, Gy,
-                                        !!(flags & PUBKEY_FLAG_COMP),
-                                        &encpk, &encpklen);
+      if (E.model != MPI_EC_MONTGOMERY)
+        /* (Gx and Gy are used as scratch variables)  */
+        rc = _gcry_ecc_eddsa_encodepoint (&sk.Q, ctx, Gx, Gy,
+                                          !!(flags & PUBKEY_FLAG_COMP),
+                                          &encpk, &encpklen);
+      else
+        {
+          int off = !!(flags & PUBKEY_FLAG_COMP);
+
+          encpk = _gcry_mpi_get_buffer_extra (Qx, ctx->nbits/8, off?-1:0,
+                                              &encpklen, NULL);
+          if (encpk == NULL)
+            rc = gpg_err_code_from_syserror ();
+          else
+            {
+              if (off)
+                encpk[0] = 0x40;
+              encpklen += off;
+            }
+        }
       if (rc)
         return rc;
       public = mpi_new (0);
       mpi_set_opaque (public, encpk, encpklen*8);
-      encpk = NULL;
     }
   else
     {
+      if (_gcry_mpi_ec_get_affine (Gx, Gy, &sk.E.G, ctx))
+        log_fatal ("ecgen: Failed to get affine coordinates for %s\n", "G");
+      base = _gcry_ecc_ec2os (Gx, Gy, sk.E.p);
       if (!Qx)
         {
           /* This is the case for a key from _gcry_ecc_eddsa_generate
@@ -1216,6 +1237,18 @@ ecc_encrypt_raw (gcry_sexp_t *r_ciph, gcry_sexp_t s_data, gcry_sexp_t keyparms)
   gcry_mpi_t data = NULL;
   ECC_public_key pk;
   mpi_ec_t ec = NULL;
+  int flags;
+
+  /* Look for flags. */
+  l1 = sexp_find_token (keyparms, "flags", 0);
+  if (l1)
+    {
+      rc = _gcry_pk_util_parse_flaglist (l1, &flags, NULL);
+      if (rc)
+        goto leave;
+    }
+  sexp_release (l1);
+  l1 = NULL;

   memset (&pk, 0, sizeof pk);
   _gcry_pk_util_init_encoding_ctx (&ctx, PUBKEY_OP_ENCRYPT,
@@ -1239,7 +1272,9 @@ ecc_encrypt_raw (gcry_sexp_t *r_ciph, gcry_sexp_t s_data, gcry_sexp_t keyparms)
   /*
    * Extract the key.
    */
-  rc = sexp_extract_param (keyparms, NULL, "-p?a?b?g?n?h?+q",
+  rc = sexp_extract_param (keyparms, NULL,
+                           (flags & PUBKEY_FLAG_EDDSA)?
+                           "-p?a?b?g?n?h?/q" : "-p?a?b?g?n?h?+q",
                            &pk.E.p, &pk.E.a, &pk.E.b, &mpi_g, &pk.E.n, &pk.E.h,
                            &mpi_q, NULL);
   if (rc)
@@ -1252,7 +1287,6 @@ ecc_encrypt_raw (gcry_sexp_t *r_ciph, gcry_sexp_t s_data, gcry_sexp_t keyparms)
         goto leave;
     }
   /* Add missing parameters using the optional curve parameter.  */
-  sexp_release (l1);
   l1 = sexp_find_token (keyparms, "curve", 5);
   if (l1)
     {
@@ -1261,7 +1295,7 @@ ecc_encrypt_raw (gcry_sexp_t *r_ciph, gcry_sexp_t s_data, gcry_sexp_t keyparms)
         {
           rc = _gcry_ecc_fill_in_curve (0, curvename, &pk.E, NULL);
           if (rc)
-            return rc;
+            goto leave;
         }
     }
   /* Guess required fields if a curve parameter has not been given.  */
@@ -1292,42 +1326,73 @@ ecc_encrypt_raw (gcry_sexp_t *r_ciph, gcry_sexp_t s_data, gcry_sexp_t keyparms)
       goto leave;
     }

+  /* Compute the encrypted value.  */
+  ec = _gcry_mpi_ec_p_internal_new (pk.E.model, pk.E.dialect, 0,
+                                    pk.E.p, pk.E.a, pk.E.b);
+
   /* Convert the public key.  */
   if (mpi_q)
     {
       point_init (&pk.Q);
-      rc = _gcry_ecc_os2ec (&pk.Q, mpi_q);
+      if (ec->model == MPI_EC_MONTGOMERY)
+        rc = _gcry_ecc_mont_decodepoint (mpi_q, ec, &pk.Q);
+      else
+        rc = _gcry_ecc_os2ec (&pk.Q, mpi_q);
       if (rc)
         goto leave;
     }

-  /* Compute the encrypted value.  */
-  ec = _gcry_mpi_ec_p_internal_new (pk.E.model, pk.E.dialect, 0,
-                                    pk.E.p, pk.E.a, pk.E.b);
-
   /* The following is false: assert( mpi_cmp_ui( R.x, 1 )==0 );, so */
   {
     mpi_point_struct R;  /* Result that we return.  */
     gcry_mpi_t x, y;
+    unsigned char *rawmpi;
+    unsigned int rawmpilen;

     x = mpi_new (0);
-    y = mpi_new (0);
+    if (ec->model == MPI_EC_MONTGOMERY)
+      y = NULL;
+    else
+      y = mpi_new (0);

     point_init (&R);

     /* R = kQ  <=>  R = kdG  */
     _gcry_mpi_ec_mul_point (&R, data, &pk.Q, ec);
-
     if (_gcry_mpi_ec_get_affine (x, y, &R, ec))
       log_fatal ("ecdh: Failed to get affine coordinates for kdG\n");
-    mpi_s = _gcry_ecc_ec2os (x, y, pk.E.p);
+    if (y)
+      mpi_s = _gcry_ecc_ec2os (x, y, pk.E.p);
+    else
+      {
+        rawmpi = _gcry_mpi_get_buffer (x, ec->nbits/8, &rawmpilen, NULL);
+        if (!rawmpi)
+          rc = gpg_err_code_from_syserror ();
+        else
+          {
+            mpi_s = mpi_new (0);
+            mpi_set_opaque (mpi_s, rawmpi, rawmpilen*8);
+          }
+      }

     /* R = kG */
     _gcry_mpi_ec_mul_point (&R, data, &pk.E.G, ec);

     if (_gcry_mpi_ec_get_affine (x, y, &R, ec))
       log_fatal ("ecdh: Failed to get affine coordinates for kG\n");
-    mpi_e = _gcry_ecc_ec2os (x, y, pk.E.p);
+    if (y)
+      mpi_e = _gcry_ecc_ec2os (x, y, pk.E.p);
+    else
+      {
+        rawmpi = _gcry_mpi_get_buffer (x, ec->nbits/8, &rawmpilen, NULL);
+        if (!rawmpi)
+          rc = gpg_err_code_from_syserror ();
+        else
+          {
+            mpi_e = mpi_new (0);
+            mpi_set_opaque (mpi_e, rawmpi, rawmpilen*8);
+          }
+      }

     mpi_free (x);
     mpi_free (y);
@@ -1335,7 +1400,8 @@ ecc_encrypt_raw (gcry_sexp_t *r_ciph, gcry_sexp_t s_data, gcry_sexp_t keyparms)
     point_free (&R);
   }

-  rc = sexp_build (r_ciph, NULL, "(enc-val(ecdh(s%m)(e%m)))", mpi_s, mpi_e);
+  if (!rc)
+    rc = sexp_build (r_ciph, NULL, "(enc-val(ecdh(s%m)(e%m)))", mpi_s, mpi_e);

  leave:
   _gcry_mpi_release (pk.E.p);
@@ -1351,6 +1417,7 @@ ecc_encrypt_raw (gcry_sexp_t *r_ciph, gcry_sexp_t s_data, gcry_sexp_t keyparms)
   _gcry_mpi_release (mpi_s);
   _gcry_mpi_release (mpi_e);
   xfree (curvename);
+  sexp_release (l1);
   _gcry_mpi_ec_free (ec);
   _gcry_pk_util_free_encoding_ctx (&ctx);
   if (DBG_CIPHER)
@@ -1380,6 +1447,7 @@ ecc_decrypt_raw (gcry_sexp_t *r_plain, gcry_sexp_t s_data, gcry_sexp_t keyparms)
   mpi_point_struct kG;
   mpi_point_struct R;
   gcry_mpi_t r = NULL;
+  int flags = 0;

   memset (&sk, 0, sizeof sk);
   point_init (&kG);
@@ -1388,6 +1456,17 @@ ecc_decrypt_raw (gcry_sexp_t *r_plain, gcry_sexp_t s_data, gcry_sexp_t keyparms)
   _gcry_pk_util_init_encoding_ctx (&ctx, PUBKEY_OP_DECRYPT,
                                    ecc_get_nbits (keyparms));

+  /* Look for flags. */
+  l1 = sexp_find_token (keyparms, "flags", 0);
+  if (l1)
+    {
+      rc = _gcry_pk_util_parse_flaglist (l1, &flags, NULL);
+      if (rc)
+        goto leave;
+    }
+  sexp_release (l1);
+  l1 = NULL;
+
   /*
    * Extract the data.
    */
@@ -1430,7 +1509,7 @@ ecc_decrypt_raw (gcry_sexp_t *r_plain, gcry_sexp_t s_data, gcry_sexp_t keyparms)
         {
           rc = _gcry_ecc_fill_in_curve (0, curvename, &sk.E, NULL);
           if (rc)
-            return rc;
+            goto leave;
         }
     }
   /* Guess required fields if a curve parameter has not been given.  */
@@ -1462,18 +1541,19 @@ ecc_decrypt_raw (gcry_sexp_t *r_plain, gcry_sexp_t s_data, gcry_sexp_t keyparms)
     }


+  ec = _gcry_mpi_ec_p_internal_new (sk.E.model, sk.E.dialect, 0,
+                                    sk.E.p, sk.E.a, sk.E.b);
+
   /*
    * Compute the plaintext.
    */
-  rc = _gcry_ecc_os2ec (&kG, data_e);
+  if (ec->model == MPI_EC_MONTGOMERY)
+    rc = _gcry_ecc_mont_decodepoint (data_e, ec, &kG);
+  else
+    rc = _gcry_ecc_os2ec (&kG, data_e);
   if (rc)
-    {
-      point_free (&kG);
-      return rc;
-    }
+    return rc;

-  ec = _gcry_mpi_ec_p_internal_new (sk.E.model, sk.E.dialect, 0,
-                                    sk.E.p, sk.E.a, sk.E.b);

   /* R = dkG */
   _gcry_mpi_ec_mul_point (&R, sk.d, &kG, ec);
@@ -1483,12 +1563,30 @@ ecc_decrypt_raw (gcry_sexp_t *r_plain, gcry_sexp_t s_data, gcry_sexp_t keyparms)
     gcry_mpi_t x, y;

     x = mpi_new (0);
-    y = mpi_new (0);
+    if (ec->model == MPI_EC_MONTGOMERY)
+      y = NULL;
+    else
+      y = mpi_new (0);

     if (_gcry_mpi_ec_get_affine (x, y, &R, ec))
       log_fatal ("ecdh: Failed to get affine coordinates\n");

-    r = _gcry_ecc_ec2os (x, y, sk.E.p);
+    if (y)
+      r = _gcry_ecc_ec2os (x, y, sk.E.p);
+    else
+      {
+        unsigned char *rawmpi;
+        unsigned int rawmpilen;
+
+        rawmpi = _gcry_mpi_get_buffer (x, ec->nbits/8, &rawmpilen, NULL);
+        if (!rawmpi)
+          rc = gpg_err_code_from_syserror ();
+        else
+          {
+            r = mpi_new (0);
+            mpi_set_opaque (r, rawmpi, rawmpilen*8);
+          }
+      }
     if (!r)
       rc = gpg_err_code_from_syserror ();
     else
--


From gniibe at fsij.org  Mon Jul  6 05:04:41 2015
From: gniibe at fsij.org (NIIBE Yutaka)
Date: Mon, 06 Jul 2015 12:04:41 +0900
Subject: [PATCH] ecc: fix memory leaks.
Message-ID: <5599F049.8060205@fsij.org>

Hello,

While modifying ecc for Curve25519, I've found some errors in ecc.c.

Here are the changes.

    cipher/ecc.c (ecc_generate): Fix memory leak on error of
    _gcry_pk_util_parse_flaglist and _gcry_ecc_eddsa_encodepoint.
    (ecc_check_secret_key): Fix memory leak on error of
    _gcry_ecc_update_curve_param.
    (ecc_sign, ecc_verify, ecc_encrypt_raw, ecc_decrypt_raw): Remove
    unnecessary sexp_release and fix memory leak on error of
    _gcry_ecc_fill_in_curve.
    (ecc_decrypt_raw): Fix double free of the point kG and memory leak
    on error of _gcry_ecc_os2ec.

diff --git a/cipher/ecc.c b/cipher/ecc.c
index 5ffe84b..f5bc50a 100644
--- a/cipher/ecc.c
+++ b/cipher/ecc.c
@@ -551,7 +551,6 @@ ecc_generate (const gcry_sexp_t genparms, gcry_sexp_t *r_skey)
     return GPG_ERR_NO_OBJ; /* No NBITS parameter. */

   rc = _gcry_ecc_fill_in_curve (nbits, curve_name, &E, &nbits);
-  xfree (curve_name); curve_name = NULL;
   if (rc)
     goto leave;

@@ -595,10 +594,9 @@ ecc_generate (const gcry_sexp_t genparms, gcry_sexp_t *r_skey)
                                         !!(flags & PUBKEY_FLAG_COMP),
                                         &encpk, &encpklen);
       if (rc)
-        return rc;
+        goto leave;
       public = mpi_new (0);
       mpi_set_opaque (public, encpk, encpklen*8);
-      encpk = NULL;
     }
   else
     {
@@ -691,6 +689,7 @@ ecc_generate (const gcry_sexp_t genparms, gcry_sexp_t *r_skey)
   mpi_free (Qx);
   mpi_free (Qy);
   _gcry_mpi_ec_free (ctx);
+  xfree (curve_name);
   sexp_release (curve_flags);
   sexp_release (curve_info);
   return rc;
@@ -744,7 +743,7 @@ ecc_check_secret_key (gcry_sexp_t keyparms)
                                              &sk.E.p, &sk.E.a, &sk.E.b,
                                              &mpi_g, &sk.E.n, &sk.E.h);
           if (rc)
-            return rc;
+            goto leave;
         }
     }
   if (mpi_g)
@@ -877,7 +876,6 @@ ecc_sign (gcry_sexp_t *r_sig, gcry_sexp_t s_data, gcry_sexp_t keyparms)
         goto leave;
     }
   /* Add missing parameters using the optional curve parameter.  */
-  sexp_release (l1);
   l1 = sexp_find_token (keyparms, "curve", 5);
   if (l1)
     {
@@ -886,7 +884,7 @@ ecc_sign (gcry_sexp_t *r_sig, gcry_sexp_t s_data, gcry_sexp_t keyparms)
         {
           rc = _gcry_ecc_fill_in_curve (0, curvename, &sk.E, NULL);
           if (rc)
-            return rc;
+            goto leave;
         }
     }
   /* Guess required fields if a curve parameter has not been given.
@@ -1043,7 +1041,6 @@ ecc_verify (gcry_sexp_t s_sig, gcry_sexp_t s_data, gcry_sexp_t s_keyparms)
         goto leave;
     }
   /* Add missing parameters using the optional curve parameter.  */
-  sexp_release (l1);
   l1 = sexp_find_token (s_keyparms, "curve", 5);
   if (l1)
     {
@@ -1052,7 +1049,7 @@ ecc_verify (gcry_sexp_t s_sig, gcry_sexp_t s_data, gcry_sexp_t s_keyparms)
         {
           rc = _gcry_ecc_fill_in_curve (0, curvename, &pk.E, NULL);
           if (rc)
-            return rc;
+            goto leave;
         }
     }
   /* Guess required fields if a curve parameter has not been given.
@@ -1252,7 +1249,6 @@ ecc_encrypt_raw (gcry_sexp_t *r_ciph, gcry_sexp_t s_data, gcry_sexp_t keyparms)
         goto leave;
     }
   /* Add missing parameters using the optional curve parameter.  */
-  sexp_release (l1);
   l1 = sexp_find_token (keyparms, "curve", 5);
   if (l1)
     {
@@ -1261,7 +1257,7 @@ ecc_encrypt_raw (gcry_sexp_t *r_ciph, gcry_sexp_t s_data, gcry_sexp_t keyparms)
         {
           rc = _gcry_ecc_fill_in_curve (0, curvename, &pk.E, NULL);
           if (rc)
-            return rc;
+            goto leave;
         }
     }
   /* Guess required fields if a curve parameter has not been given.  */
@@ -1421,7 +1417,6 @@ ecc_decrypt_raw (gcry_sexp_t *r_plain, gcry_sexp_t s_data, gcry_sexp_t keyparms)
         goto leave;
     }
   /* Add missing parameters using the optional curve parameter.  */
-  sexp_release (l1);
   l1 = sexp_find_token (keyparms, "curve", 5);
   if (l1)
     {
@@ -1430,7 +1425,7 @@ ecc_decrypt_raw (gcry_sexp_t *r_plain, gcry_sexp_t s_data, gcry_sexp_t keyparms)
         {
           rc = _gcry_ecc_fill_in_curve (0, curvename, &sk.E, NULL);
           if (rc)
-            return rc;
+            goto leave;
         }
     }
   /* Guess required fields if a curve parameter has not been given.  */
@@ -1467,10 +1462,7 @@ ecc_decrypt_raw (gcry_sexp_t *r_plain, gcry_sexp_t s_data, gcry_sexp_t keyparms)
    */
   rc = _gcry_ecc_os2ec (&kG, data_e);
   if (rc)
-    {
-      point_free (&kG);
-      return rc;
-    }
+    goto leave;

   ec = _gcry_mpi_ec_p_internal_new (sk.E.model, sk.E.dialect, 0,
                                     sk.E.p, sk.E.a, sk.E.b);
--


From cvs at cvs.gnupg.org  Tue Jul  7 04:09:54 2015
From: cvs at cvs.gnupg.org (by NIIBE Yutaka)
Date: Tue, 07 Jul 2015 04:09:54 +0200
Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.6.0-231-g0a7547e
Message-ID: <E1ZCIEF-0007TV-K8@lists.gnupg.org>

This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "The GNU crypto library".

The branch, master has been updated
       via  0a7547e487a8bc4e7ac9599c55579eb2e4a13f06 (commit)
      from  a36ee7501f68ad7ebcfe31f9659430b9d2c3ddd1 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
commit 0a7547e487a8bc4e7ac9599c55579eb2e4a13f06
Author: NIIBE Yutaka <gniibe at fsij.org>
Date:   Mon Jul 6 12:01:00 2015 +0900

    ecc: fix memory leaks.
    
    cipher/ecc.c (ecc_generate): Fix memory leak on error of
    _gcry_pk_util_parse_flaglist and _gcry_ecc_eddsa_encodepoint.
    (ecc_check_secret_key): Fix memory leak on error of
    _gcry_ecc_update_curve_param.
    (ecc_sign, ecc_verify, ecc_encrypt_raw, ecc_decrypt_raw): Remove
    unnecessary sexp_release and fix memory leak on error of
    _gcry_ecc_fill_in_curve.
    (ecc_decrypt_raw): Fix double free of the point kG and memory leak
    on error of _gcry_ecc_os2ec.

diff --git a/cipher/ecc.c b/cipher/ecc.c
index 5ffe84b..f5bc50a 100644
--- a/cipher/ecc.c
+++ b/cipher/ecc.c
@@ -551,7 +551,6 @@ ecc_generate (const gcry_sexp_t genparms, gcry_sexp_t *r_skey)
     return GPG_ERR_NO_OBJ; /* No NBITS parameter. */
 
   rc = _gcry_ecc_fill_in_curve (nbits, curve_name, &E, &nbits);
-  xfree (curve_name); curve_name = NULL;
   if (rc)
     goto leave;
 
@@ -595,10 +594,9 @@ ecc_generate (const gcry_sexp_t genparms, gcry_sexp_t *r_skey)
                                         !!(flags & PUBKEY_FLAG_COMP),
                                         &encpk, &encpklen);
       if (rc)
-        return rc;
+        goto leave;
       public = mpi_new (0);
       mpi_set_opaque (public, encpk, encpklen*8);
-      encpk = NULL;
     }
   else
     {
@@ -691,6 +689,7 @@ ecc_generate (const gcry_sexp_t genparms, gcry_sexp_t *r_skey)
   mpi_free (Qx);
   mpi_free (Qy);
   _gcry_mpi_ec_free (ctx);
+  xfree (curve_name);
   sexp_release (curve_flags);
   sexp_release (curve_info);
   return rc;
@@ -744,7 +743,7 @@ ecc_check_secret_key (gcry_sexp_t keyparms)
                                              &sk.E.p, &sk.E.a, &sk.E.b,
                                              &mpi_g, &sk.E.n, &sk.E.h);
           if (rc)
-            return rc;
+            goto leave;
         }
     }
   if (mpi_g)
@@ -877,7 +876,6 @@ ecc_sign (gcry_sexp_t *r_sig, gcry_sexp_t s_data, gcry_sexp_t keyparms)
         goto leave;
     }
   /* Add missing parameters using the optional curve parameter.  */
-  sexp_release (l1);
   l1 = sexp_find_token (keyparms, "curve", 5);
   if (l1)
     {
@@ -886,7 +884,7 @@ ecc_sign (gcry_sexp_t *r_sig, gcry_sexp_t s_data, gcry_sexp_t keyparms)
         {
           rc = _gcry_ecc_fill_in_curve (0, curvename, &sk.E, NULL);
           if (rc)
-            return rc;
+            goto leave;
         }
     }
   /* Guess required fields if a curve parameter has not been given.
@@ -1043,7 +1041,6 @@ ecc_verify (gcry_sexp_t s_sig, gcry_sexp_t s_data, gcry_sexp_t s_keyparms)
         goto leave;
     }
   /* Add missing parameters using the optional curve parameter.  */
-  sexp_release (l1);
   l1 = sexp_find_token (s_keyparms, "curve", 5);
   if (l1)
     {
@@ -1052,7 +1049,7 @@ ecc_verify (gcry_sexp_t s_sig, gcry_sexp_t s_data, gcry_sexp_t s_keyparms)
         {
           rc = _gcry_ecc_fill_in_curve (0, curvename, &pk.E, NULL);
           if (rc)
-            return rc;
+            goto leave;
         }
     }
   /* Guess required fields if a curve parameter has not been given.
@@ -1252,7 +1249,6 @@ ecc_encrypt_raw (gcry_sexp_t *r_ciph, gcry_sexp_t s_data, gcry_sexp_t keyparms)
         goto leave;
     }
   /* Add missing parameters using the optional curve parameter.  */
-  sexp_release (l1);
   l1 = sexp_find_token (keyparms, "curve", 5);
   if (l1)
     {
@@ -1261,7 +1257,7 @@ ecc_encrypt_raw (gcry_sexp_t *r_ciph, gcry_sexp_t s_data, gcry_sexp_t keyparms)
         {
           rc = _gcry_ecc_fill_in_curve (0, curvename, &pk.E, NULL);
           if (rc)
-            return rc;
+            goto leave;
         }
     }
   /* Guess required fields if a curve parameter has not been given.  */
@@ -1421,7 +1417,6 @@ ecc_decrypt_raw (gcry_sexp_t *r_plain, gcry_sexp_t s_data, gcry_sexp_t keyparms)
         goto leave;
     }
   /* Add missing parameters using the optional curve parameter.  */
-  sexp_release (l1);
   l1 = sexp_find_token (keyparms, "curve", 5);
   if (l1)
     {
@@ -1430,7 +1425,7 @@ ecc_decrypt_raw (gcry_sexp_t *r_plain, gcry_sexp_t s_data, gcry_sexp_t keyparms)
         {
           rc = _gcry_ecc_fill_in_curve (0, curvename, &sk.E, NULL);
           if (rc)
-            return rc;
+            goto leave;
         }
     }
   /* Guess required fields if a curve parameter has not been given.  */
@@ -1467,10 +1462,7 @@ ecc_decrypt_raw (gcry_sexp_t *r_plain, gcry_sexp_t s_data, gcry_sexp_t keyparms)
    */
   rc = _gcry_ecc_os2ec (&kG, data_e);
   if (rc)
-    {
-      point_free (&kG);
-      return rc;
-    }
+    goto leave;
 
   ec = _gcry_mpi_ec_p_internal_new (sk.E.model, sk.E.dialect, 0,
                                     sk.E.p, sk.E.a, sk.E.b);

-----------------------------------------------------------------------

Summary of changes:
 cipher/ecc.c | 24 ++++++++----------------
 1 file changed, 8 insertions(+), 16 deletions(-)


hooks/post-receive
-- 
The GNU crypto library
http://git.gnupg.org


_______________________________________________
Gnupg-commits mailing list
Gnupg-commits at gnupg.org
http://lists.gnupg.org/mailman/listinfo/gnupg-commits


From ed at prote.us  Tue Jul  7 17:52:59 2015
From: ed at prote.us (Ed Johns)
Date: Tue, 7 Jul 2015 11:52:59 -0400
Subject: [PATCH] Curve25519 encryption support (experimental)
Message-ID: <CAKVrepWnXu8a2PGD2X1cdEC+-s6h4G3KxN9q+aEFy1zMOEH+gA@mail.gmail.com>

Hello,

I was interested in trying your Curve25519 encryption support. I've
downloaded the latest alpha code. I see that the patch exists.

Would you have a test case that I could use to get started with this patch?

Thanks

Ed Johns
-------------- next part --------------
An HTML attachment was scrubbed...
URL: </pipermail/attachments/20150707/f4dc6431/attachment.html>

From gniibe at fsij.org  Wed Jul  8 02:34:24 2015
From: gniibe at fsij.org (NIIBE Yutaka)
Date: Wed, 08 Jul 2015 09:34:24 +0900
Subject: [EXPERIMENTAL-PATCH] Curve25519 encryption support (updated)
In-Reply-To: <55965577.7020306@fsij.org>
References: <55965577.7020306@fsij.org>
Message-ID: <559C7010.6040700@fsij.org>

Hello,

Here is an update of the patch against the current master branch.

This is an experimental patch for Curve25519 encryption support.

Key generation is done with SEXP:

     (genkey(ecc(curve Curve25519)(flags eddsa comp)))

Here, I changed the meaning of '(flags eddsa)' a bit.  Now, it means
that the key is in DJB format and under DJB processing:

	Little endian
	Private key: MSB is 0, (MS-1)B is 1, multiple of cofactor
	ECDH: Assume a private key conforms to this -^

That is, '(flags eddsa)' also makes sense for Curve25519.

'(flags comp)' is also a bit different now.  It means the key comes
with the prefix 0x40, like EdDSA in GnuPG.  (Since the computation on
Curve25519 is done with the x-coordinate only, there is no format
other than the shorter one.  So, it is not really "compression", but
a request for a prefix.)

These changes to the flags may be confusing and need to be improved.


I tested with modified version of GnuPG 2.1 and Gnuk.

I don't think all possible combinations work.  The following are not
implemented/tested, and I am not sure about all of their semantics.

    (genkey(ecc(curve Curve25519)))
    (genkey(ecc(curve Curve25519)(flags comp)))
    (genkey(ecc(curve Curve25519)(flags eddsa)))

Please note that this is highly experimental.  The SEXP format is not
yet decided.


diff --git a/cipher/ecc-common.h b/cipher/ecc-common.h
index f0d97ea..6b3b063 100644
--- a/cipher/ecc-common.h
+++ b/cipher/ecc-common.h
@@ -132,6 +132,8 @@ gpg_err_code_t _gcry_ecc_eddsa_verify (gcry_mpi_t input,
                                        ECC_public_key *pk,
                                        gcry_mpi_t r, gcry_mpi_t s,
                                        int hashalgo, gcry_mpi_t pkmpi);
+gpg_err_code_t _gcry_ecc_mont_decodepoint (gcry_mpi_t pk, mpi_ec_t ctx,
+                                           mpi_point_t result);

 /*-- ecc-gost.c --*/
 gpg_err_code_t _gcry_ecc_gost_sign (gcry_mpi_t input, ECC_secret_key *skey,
diff --git a/cipher/ecc-curves.c b/cipher/ecc-curves.c
index 9975bb4..5d855bd 100644
--- a/cipher/ecc-curves.c
+++ b/cipher/ecc-curves.c
@@ -40,7 +40,7 @@ static const struct
   const char *other; /* Other name. */
 } curve_aliases[] =
   {
-  /*{ "Curve25519", "1.3.6.1.4.1.3029.1.5.1" },*/
+    { "Curve25519", "1.3.6.1.4.1.3029.1.5.1" },
     { "Ed25519",    "1.3.6.1.4.1.11591.15.1" },

     { "NIST P-192", "1.2.840.10045.3.1.1" }, /* X9.62 OID  */
@@ -129,6 +129,18 @@ static const ecc_domain_parms_t domain_parms[] =
       "0x6666666666666666666666666666666666666666666666666666666666666658",
       "0x08"
     },
+    {
+      /* (y^2 = x^3 + 486662*x^2 + x) */
+      "Curve25519", 256, 0,
+      MPI_EC_MONTGOMERY, ECC_DIALECT_ED25519,
+      "0x7FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFED",
+      "0x01DB41",
+      "0x01",
+      "0x1000000000000000000000000000000014DEF9DEA2F79CD65812631A5CF5D3ED",
+      "0x0000000000000000000000000000000000000000000000000000000000000009",
+      "0x20AE19A1B8A086B4E01EDD2C7748D14C923D4D7E6D7C61B229E9C5A27ECED3D9",
+      "0x08"
+    },
 #if 0 /* No real specs yet found.  */
     {
       /* x^2 + y^2 = 1 + 3617x^2y^2 mod 2^414 - 17 */
diff --git a/cipher/ecc-eddsa.c b/cipher/ecc-eddsa.c
index 4323d8e..72481ba 100644
--- a/cipher/ecc-eddsa.c
+++ b/cipher/ecc-eddsa.c
@@ -400,6 +400,51 @@ _gcry_ecc_eddsa_decodepoint (gcry_mpi_t pk, mpi_ec_t ctx, mpi_point_t result,
 }


+gpg_err_code_t
+_gcry_ecc_mont_decodepoint (gcry_mpi_t pk, mpi_ec_t ctx, mpi_point_t result)
+{
+  unsigned char *rawmpi;
+  unsigned int rawmpilen;
+
+  if (mpi_is_opaque (pk))
+    {
+      const unsigned char *buf;
+
+      buf = mpi_get_opaque (pk, &rawmpilen);
+      if (!buf)
+        return GPG_ERR_INV_OBJ;
+      rawmpilen = (rawmpilen + 7)/8;
+
+      if (rawmpilen > 1 && (rawmpilen%2) && buf[0] == 0x40)
+        {
+          rawmpilen--;
+          buf++;
+        }
+
+      rawmpi = xtrymalloc (rawmpilen? rawmpilen:1);
+      if (!rawmpi)
+        return gpg_err_code_from_syserror ();
+      memcpy (rawmpi, buf, rawmpilen);
+      reverse_buffer (rawmpi, rawmpilen);
+    }
+  else
+    {
+      /* Note: Without using an opaque MPI it is not reliably possible
+         to find out whether the public key has been given in
+         uncompressed format.  Thus we expect native EdDSA format.  */
+      rawmpi = _gcry_mpi_get_buffer (pk, ctx->nbits/8, &rawmpilen, NULL);
+      if (!rawmpi)
+        return gpg_err_code_from_syserror ();
+    }
+
+  _gcry_mpi_set_buffer (result->x, rawmpi, rawmpilen, 0);
+  xfree (rawmpi);
+  mpi_set_ui (result->z, 1);
+
+  return 0;
+}
+
+
 /* Compute the A value as used by EdDSA.  The caller needs to provide
    the context EC and the actual secret D as an MPI.  The function
    returns a newly allocated 64 byte buffer at r_digest; the first 32
diff --git a/cipher/ecc.c b/cipher/ecc.c
index f5bc50a..de4fdbd 100644
--- a/cipher/ecc.c
+++ b/cipher/ecc.c
@@ -174,7 +174,10 @@ nist_generate_key (ECC_secret_key *sk, elliptic_curve_t *E, mpi_ec_t ctx,
   point_init (&sk->Q);

   x = mpi_new (pbits);
-  y = mpi_new (pbits);
+  if (r_y == NULL)
+    y = NULL;
+  else
+    y = mpi_new (pbits);
   if (_gcry_mpi_ec_get_affine (x, y, &Q, ctx))
     log_fatal ("ecgen: Failed to get affine coordinates for %s\n", "Q");

@@ -187,7 +190,7 @@ nist_generate_key (ECC_secret_key *sk, elliptic_curve_t *E, mpi_ec_t ctx,
    * possibilities without any loss of security.  Note that we don't
    * do that for Ed25519 so that we do not violate the special
    * construction of the secret key.  */
-  if (E->dialect == ECC_DIALECT_ED25519)
+  if (E->dialect == ECC_DIALECT_ED25519 || r_y == NULL)
     point_set (&sk->Q, &Q);
   else
     {
@@ -231,7 +234,8 @@ nist_generate_key (ECC_secret_key *sk, elliptic_curve_t *E, mpi_ec_t ctx,
     }

   *r_x = x;
-  *r_y = y;
+  if (r_y)
+    *r_y = y;

   point_free (&Q);
   /* Now we can test our keys (this should never fail!).  */
@@ -307,7 +311,7 @@ test_ecdh_only_keys (ECC_secret_key *sk, unsigned int nbits)
   mpi_ec_t ec;

   if (DBG_CIPHER)
-    log_debug ("Testing key.\n");
+    log_debug ("Testing ECDH only key.\n");

   point_init (&R_);

@@ -571,7 +575,9 @@ ecc_generate (const gcry_sexp_t genparms, gcry_sexp_t *r_skey)

   ctx = _gcry_mpi_ec_p_internal_new (E.model, E.dialect, 0, E.p, E.a, E.b);

-  if ((flags & PUBKEY_FLAG_EDDSA))
+  if (E.model == MPI_EC_MONTGOMERY)
+    rc = nist_generate_key (&sk, &E, ctx, flags, nbits, &Qx, NULL);
+  else if ((flags & PUBKEY_FLAG_EDDSA))
     rc = _gcry_ecc_eddsa_genkey (&sk, &E, ctx, flags);
   else
     rc = nist_generate_key (&sk, &E, ctx, flags, nbits, &Qx, &Qy);
@@ -581,18 +587,31 @@ ecc_generate (const gcry_sexp_t genparms, gcry_sexp_t *r_skey)
   /* Copy data to the result.  */
   Gx = mpi_new (0);
   Gy = mpi_new (0);
-  if (_gcry_mpi_ec_get_affine (Gx, Gy, &sk.E.G, ctx))
-    log_fatal ("ecgen: Failed to get affine coordinates for %s\n", "G");
-  base = _gcry_ecc_ec2os (Gx, Gy, sk.E.p);
   if (sk.E.dialect == ECC_DIALECT_ED25519 && !(flags & PUBKEY_FLAG_NOCOMP))
     {
       unsigned char *encpk;
       unsigned int encpklen;

-      /* (Gx and Gy are used as scratch variables)  */
-      rc = _gcry_ecc_eddsa_encodepoint (&sk.Q, ctx, Gx, Gy,
-                                        !!(flags & PUBKEY_FLAG_COMP),
-                                        &encpk, &encpklen);
+      if (E.model != MPI_EC_MONTGOMERY)
+        /* (Gx and Gy are used as scratch variables)  */
+        rc = _gcry_ecc_eddsa_encodepoint (&sk.Q, ctx, Gx, Gy,
+                                          !!(flags & PUBKEY_FLAG_COMP),
+                                          &encpk, &encpklen);
+      else
+        {
+          int off = !!(flags & PUBKEY_FLAG_COMP);
+
+          encpk = _gcry_mpi_get_buffer_extra (Qx, ctx->nbits/8, off?-1:0,
+                                              &encpklen, NULL);
+          if (encpk == NULL)
+            rc = gpg_err_code_from_syserror ();
+          else
+            {
+              if (off)
+                encpk[0] = 0x40;
+              encpklen += off;
+            }
+        }
       if (rc)
         goto leave;
       public = mpi_new (0);
@@ -600,6 +619,9 @@ ecc_generate (const gcry_sexp_t genparms, gcry_sexp_t *r_skey)
     }
   else
     {
+      if (_gcry_mpi_ec_get_affine (Gx, Gy, &sk.E.G, ctx))
+        log_fatal ("ecgen: Failed to get affine coordinates for %s\n", "G");
+      base = _gcry_ecc_ec2os (Gx, Gy, sk.E.p);
       if (!Qx)
         {
           /* This is the case for a key from _gcry_ecc_eddsa_generate
@@ -1213,6 +1235,18 @@ ecc_encrypt_raw (gcry_sexp_t *r_ciph, gcry_sexp_t s_data, gcry_sexp_t keyparms)
   gcry_mpi_t data = NULL;
   ECC_public_key pk;
   mpi_ec_t ec = NULL;
+  int flags;
+
+  /* Look for flags. */
+  l1 = sexp_find_token (keyparms, "flags", 0);
+  if (l1)
+    {
+      rc = _gcry_pk_util_parse_flaglist (l1, &flags, NULL);
+      if (rc)
+        goto leave;
+    }
+  sexp_release (l1);
+  l1 = NULL;

   memset (&pk, 0, sizeof pk);
   _gcry_pk_util_init_encoding_ctx (&ctx, PUBKEY_OP_ENCRYPT,
@@ -1236,7 +1270,9 @@ ecc_encrypt_raw (gcry_sexp_t *r_ciph, gcry_sexp_t s_data, gcry_sexp_t keyparms)
   /*
    * Extract the key.
    */
-  rc = sexp_extract_param (keyparms, NULL, "-p?a?b?g?n?h?+q",
+  rc = sexp_extract_param (keyparms, NULL,
+                           (flags & PUBKEY_FLAG_EDDSA)?
+                           "-p?a?b?g?n?h?/q" : "-p?a?b?g?n?h?+q",
                            &pk.E.p, &pk.E.a, &pk.E.b, &mpi_g, &pk.E.n, &pk.E.h,
                            &mpi_q, NULL);
   if (rc)
@@ -1288,26 +1324,34 @@ ecc_encrypt_raw (gcry_sexp_t *r_ciph, gcry_sexp_t s_data, gcry_sexp_t keyparms)
       goto leave;
     }

+  /* Compute the encrypted value.  */
+  ec = _gcry_mpi_ec_p_internal_new (pk.E.model, pk.E.dialect, 0,
+                                    pk.E.p, pk.E.a, pk.E.b);
+
   /* Convert the public key.  */
   if (mpi_q)
     {
       point_init (&pk.Q);
-      rc = _gcry_ecc_os2ec (&pk.Q, mpi_q);
+      if (ec->model == MPI_EC_MONTGOMERY)
+        rc = _gcry_ecc_mont_decodepoint (mpi_q, ec, &pk.Q);
+      else
+        rc = _gcry_ecc_os2ec (&pk.Q, mpi_q);
       if (rc)
         goto leave;
     }

-  /* Compute the encrypted value.  */
-  ec = _gcry_mpi_ec_p_internal_new (pk.E.model, pk.E.dialect, 0,
-                                    pk.E.p, pk.E.a, pk.E.b);
-
   /* The following is false: assert( mpi_cmp_ui( R.x, 1 )==0 );, so */
   {
     mpi_point_struct R;  /* Result that we return.  */
     gcry_mpi_t x, y;
+    unsigned char *rawmpi;
+    unsigned int rawmpilen;

     x = mpi_new (0);
-    y = mpi_new (0);
+    if (ec->model == MPI_EC_MONTGOMERY)
+      y = NULL;
+    else
+      y = mpi_new (0);

     point_init (&R);

@@ -1316,14 +1360,38 @@ ecc_encrypt_raw (gcry_sexp_t *r_ciph, gcry_sexp_t s_data, gcry_sexp_t keyparms)

     if (_gcry_mpi_ec_get_affine (x, y, &R, ec))
       log_fatal ("ecdh: Failed to get affine coordinates for kdG\n");
-    mpi_s = _gcry_ecc_ec2os (x, y, pk.E.p);
+    if (y)
+      mpi_s = _gcry_ecc_ec2os (x, y, pk.E.p);
+    else
+      {
+        rawmpi = _gcry_mpi_get_buffer (x, ec->nbits/8, &rawmpilen, NULL);
+        if (!rawmpi)
+          rc = gpg_err_code_from_syserror ();
+        else
+          {
+            mpi_s = mpi_new (0);
+            mpi_set_opaque (mpi_s, rawmpi, rawmpilen*8);
+          }
+      }

     /* R = kG */
     _gcry_mpi_ec_mul_point (&R, data, &pk.E.G, ec);

     if (_gcry_mpi_ec_get_affine (x, y, &R, ec))
       log_fatal ("ecdh: Failed to get affine coordinates for kG\n");
-    mpi_e = _gcry_ecc_ec2os (x, y, pk.E.p);
+    if (y)
+      mpi_e = _gcry_ecc_ec2os (x, y, pk.E.p);
+    else
+      {
+        rawmpi = _gcry_mpi_get_buffer (x, ec->nbits/8, &rawmpilen, NULL);
+        if (!rawmpi)
+          rc = gpg_err_code_from_syserror ();
+        else
+          {
+            mpi_e = mpi_new (0);
+            mpi_set_opaque (mpi_e, rawmpi, rawmpilen*8);
+          }
+      }

     mpi_free (x);
     mpi_free (y);
@@ -1331,7 +1399,8 @@ ecc_encrypt_raw (gcry_sexp_t *r_ciph, gcry_sexp_t s_data, gcry_sexp_t keyparms)
     point_free (&R);
   }

-  rc = sexp_build (r_ciph, NULL, "(enc-val(ecdh(s%m)(e%m)))", mpi_s, mpi_e);
+  if (!rc)
+    rc = sexp_build (r_ciph, NULL, "(enc-val(ecdh(s%m)(e%m)))", mpi_s, mpi_e);

  leave:
   _gcry_mpi_release (pk.E.p);
@@ -1347,6 +1416,7 @@ ecc_encrypt_raw (gcry_sexp_t *r_ciph, gcry_sexp_t s_data, gcry_sexp_t keyparms)
   _gcry_mpi_release (mpi_s);
   _gcry_mpi_release (mpi_e);
   xfree (curvename);
+  sexp_release (l1);
   _gcry_mpi_ec_free (ec);
   _gcry_pk_util_free_encoding_ctx (&ctx);
   if (DBG_CIPHER)
@@ -1376,6 +1446,7 @@ ecc_decrypt_raw (gcry_sexp_t *r_plain, gcry_sexp_t s_data, gcry_sexp_t keyparms)
   mpi_point_struct kG;
   mpi_point_struct R;
   gcry_mpi_t r = NULL;
+  int flags = 0;

   memset (&sk, 0, sizeof sk);
   point_init (&kG);
@@ -1384,6 +1455,17 @@ ecc_decrypt_raw (gcry_sexp_t *r_plain, gcry_sexp_t s_data, gcry_sexp_t keyparms)
   _gcry_pk_util_init_encoding_ctx (&ctx, PUBKEY_OP_DECRYPT,
                                    ecc_get_nbits (keyparms));

+  /* Look for flags. */
+  l1 = sexp_find_token (keyparms, "flags", 0);
+  if (l1)
+    {
+      rc = _gcry_pk_util_parse_flaglist (l1, &flags, NULL);
+      if (rc)
+        goto leave;
+    }
+  sexp_release (l1);
+  l1 = NULL;
+
   /*
    * Extract the data.
    */
@@ -1457,15 +1539,19 @@ ecc_decrypt_raw (gcry_sexp_t *r_plain, gcry_sexp_t s_data, gcry_sexp_t keyparms)
     }


+  ec = _gcry_mpi_ec_p_internal_new (sk.E.model, sk.E.dialect, 0,
+                                    sk.E.p, sk.E.a, sk.E.b);
+
   /*
    * Compute the plaintext.
    */
-  rc = _gcry_ecc_os2ec (&kG, data_e);
+  if (ec->model == MPI_EC_MONTGOMERY)
+    rc = _gcry_ecc_mont_decodepoint (data_e, ec, &kG);
+  else
+    rc = _gcry_ecc_os2ec (&kG, data_e);
   if (rc)
     goto leave;

-  ec = _gcry_mpi_ec_p_internal_new (sk.E.model, sk.E.dialect, 0,
-                                    sk.E.p, sk.E.a, sk.E.b);

   /* R = dkG */
   _gcry_mpi_ec_mul_point (&R, sk.d, &kG, ec);
@@ -1475,12 +1561,30 @@ ecc_decrypt_raw (gcry_sexp_t *r_plain, gcry_sexp_t s_data, gcry_sexp_t keyparms)
     gcry_mpi_t x, y;

     x = mpi_new (0);
-    y = mpi_new (0);
+    if (ec->model == MPI_EC_MONTGOMERY)
+      y = NULL;
+    else
+      y = mpi_new (0);

     if (_gcry_mpi_ec_get_affine (x, y, &R, ec))
       log_fatal ("ecdh: Failed to get affine coordinates\n");

-    r = _gcry_ecc_ec2os (x, y, sk.E.p);
+    if (y)
+      r = _gcry_ecc_ec2os (x, y, sk.E.p);
+    else
+      {
+        unsigned char *rawmpi;
+        unsigned int rawmpilen;
+
+        rawmpi = _gcry_mpi_get_buffer (x, ec->nbits/8, &rawmpilen, NULL);
+        if (!rawmpi)
+          rc = gpg_err_code_from_syserror ();
+        else
+          {
+            r = mpi_new (0);
+            mpi_set_opaque (r, rawmpi, rawmpilen*8);
+          }
+      }
     if (!r)
       rc = gpg_err_code_from_syserror ();
     else
-- 


From gniibe at fsij.org  Wed Jul  8 03:32:05 2015
From: gniibe at fsij.org (NIIBE Yutaka)
Date: Wed, 08 Jul 2015 10:32:05 +0900
Subject: [PATCH] Curve25519 encryption support (experimental)
In-Reply-To: <CAKVrepWnXu8a2PGD2X1cdEC+-s6h4G3KxN9q+aEFy1zMOEH+gA@mail.gmail.com>
References: <CAKVrepWnXu8a2PGD2X1cdEC+-s6h4G3KxN9q+aEFy1zMOEH+gA@mail.gmail.com>
Message-ID: <559C7D95.9020409@fsij.org>

Hello,

Thank you for your interest.

On 07/08/2015 12:52 AM, Ed Johns wrote:
> I was interested in trying your Curve25519 encryption support. I've
> downloaded the latest alpha code. I see that the patch exists.

Today, I posted a new, updated patch.

    https://lists.gnupg.org/pipermail/gcrypt-devel/2015-July/003464.html

The patch is intended to apply to the master (development) branch of
libgcrypt.  The code is available at git.gnupg.org, you can browse it;

    http://git.gnupg.org/cgi-bin/gitweb.cgi?p=libgcrypt.git;a=summary

> Would you have a test case that I could use to get started with
> this patch?

Simple tests are not yet available.  Sure, I'll add test cases, so
that features can be tested by 'make check'.

The main example available is GnuPG itself.  My post is:

    https://lists.gnupg.org/pipermail/gnupg-devel/2015-July/030118.html

The ECDH encryption/decryption code is split between GnuPG and
libgcrypt.  It would be good to study both.

ECDH encryption/decryption for GnuPG is described in:

    http://tools.ietf.org/html/rfc6637#section-8

For Curve25519, only the x-coordinate is used in the computation.
-- 


From peter at lekensteyn.nl  Thu Jul  9 17:11:30 2015
From: peter at lekensteyn.nl (Peter Wu)
Date: Thu,  9 Jul 2015 17:11:30 +0200
Subject: [PATCH 0/6] Memory leaks and undefined behavior fixes
Message-ID: <1436454696-20362-1-git-send-email-peter@lekensteyn.nl>

Hi,

Here are some small patches which fix memory leaks and undefined
behavior (copying from/to a NULL buffer). These were caught by running
the test suite with ASAN and ubsan enabled.

There is also a patch to the commit-msg hook so that it does not
complain about the diff in `git commit -v`.

The "Add LSan annotation to ignore a memory leak" patch "works" in the
sense that the memory leak gets ignored.

Finally there is a patch to clarify the RSA secret calculation. This was
the original reason why I dug into the libgcrypt code. (I suspected a
bug in this code, but it turns out that Wireshark had an issue by
passing u = q^-1 mod p in the private key instead of u = p^-1 mod q.)

Kind regards,
Peter
--

Peter Wu (6):
  sexp: fix invalid deallocation in error path
  ecc: fix memory leak
  build: ignore scissor line for the commit-msg hook
  Fix undefined behavior wrt memcpy
  Add LSan annotation to ignore a memory leak
  rsa: clarify the RSA secret parameters

 build-aux/git-hooks/commit-msg |  6 ++++++
 cipher/cipher-gcm.c            |  2 +-
 cipher/ecc.c                   |  1 +
 cipher/mac-poly1305.c          |  3 +++
 cipher/rsa.c                   |  9 +++++++--
 mpi/mpiutil.c                  |  5 ++++-
 src/g10lib.h                   | 17 +++++++++++++++++
 src/sexp.c                     |  2 +-
 8 files changed, 40 insertions(+), 5 deletions(-)

-- 
2.4.4


From peter at lekensteyn.nl  Thu Jul  9 17:11:31 2015
From: peter at lekensteyn.nl (Peter Wu)
Date: Thu,  9 Jul 2015 17:11:31 +0200
Subject: [PATCH 1/6] sexp: fix invalid deallocation in error path
In-Reply-To: <1436454696-20362-1-git-send-email-peter@lekensteyn.nl>
References: <1436454696-20362-1-git-send-email-peter@lekensteyn.nl>
Message-ID: <1436454696-20362-2-git-send-email-peter@lekensteyn.nl>

* src/sexp.c: Fix wrong condition in error path.
--
This appears to be a copy and paste error and could result in wrong
memory being freed in the error path (when arrayisdesc[idx] == 2, the
condition ((!2) == 1) is false).

Signed-off-by: Peter Wu <peter at lekensteyn.nl>
---
 src/sexp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/sexp.c b/src/sexp.c
index 9bc13ca..1c014e0 100644
--- a/src/sexp.c
+++ b/src/sexp.c
@@ -2405,7 +2405,7 @@ _gcry_sexp_vextract_param (gcry_sexp_t sexp, const char *path,
           _gcry_mpi_release (*array[idx]);
           *array[idx] = NULL;
         }
-      else if (!arrayisdesc[idx] == 1)
+      else if (arrayisdesc[idx] == 1)
         {
           /* Caller provided buffer.  */
           gcry_buffer_t *spec = (gcry_buffer_t*)array[idx];
-- 
2.4.4


From peter at lekensteyn.nl  Thu Jul  9 17:11:36 2015
From: peter at lekensteyn.nl (Peter Wu)
Date: Thu,  9 Jul 2015 17:11:36 +0200
Subject: [PATCH 6/6] rsa: clarify the RSA secret parameters
In-Reply-To: <1436454696-20362-1-git-send-email-peter@lekensteyn.nl>
References: <1436454696-20362-1-git-send-email-peter@lekensteyn.nl>
Message-ID: <1436454696-20362-7-git-send-email-peter@lekensteyn.nl>

* cipher/rsa.c: Clarify meaning of the 'u' parameter. Fix error in
  comments.

Signed-off-by: Peter Wu <peter at lekensteyn.nl>
---
 cipher/rsa.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/cipher/rsa.c b/cipher/rsa.c
index 9a8d235..25e9d10 100644
--- a/cipher/rsa.c
+++ b/cipher/rsa.c
@@ -700,7 +700,12 @@ stronger_key_check ( RSA_secret_key *skey )
  *
  *	m = c^d mod n
  *
- * Or faster:
+ * Or faster using Garner's Algorithm. Note that u is *not* the
+ * coefficient from RFC 3447 (PKCS#1), but the multiplicative inverse of
+ * p, mod q, from RFC 4880 (OpenPGP).
+ *
+ *      (precomputed:)
+ *      u  = p ^ -1 mod q
  *
  *      m1 = c ^ (d mod (p-1)) mod p
  *      m2 = c ^ (d mod (q-1)) mod q
@@ -738,7 +743,7 @@ secret (gcry_mpi_t output, gcry_mpi_t input, RSA_secret_key *skey )
       if ( mpi_has_sign ( h ) )
         mpi_add ( h, h, skey->q );
       mpi_mulm( h, skey->u, h, skey->q );
-      /* m = m2 + h * p */
+      /* m = m1 + h * p */
       mpi_mul ( h, h, skey->p );
       mpi_add ( output, m1, h );
 
-- 
2.4.4


From peter at lekensteyn.nl  Thu Jul  9 17:11:33 2015
From: peter at lekensteyn.nl (Peter Wu)
Date: Thu,  9 Jul 2015 17:11:33 +0200
Subject: [PATCH 3/6] build: ignore scissor line for the commit-msg hook
In-Reply-To: <1436454696-20362-1-git-send-email-peter@lekensteyn.nl>
References: <1436454696-20362-1-git-send-email-peter@lekensteyn.nl>
Message-ID: <1436454696-20362-4-git-send-email-peter@lekensteyn.nl>

* build-aux/git-hooks/commit-msg: Stop processing more lines when the
  scissor line is encountered.
--
This allows the command `git commit -v` to work even if the code is
longer than 72 characters. Note that comments are already ignored by the
previous line.

Signed-off-by: Peter Wu <peter at lekensteyn.nl>
---
 build-aux/git-hooks/commit-msg | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/build-aux/git-hooks/commit-msg b/build-aux/git-hooks/commit-msg
index 5a697c7..3ca918b 100755
--- a/build-aux/git-hooks/commit-msg
+++ b/build-aux/git-hooks/commit-msg
@@ -86,11 +86,17 @@ sub check_msg($$)
   2 <= @line && length $line[1]
     and return 'second line must be empty';
 
+  # See git-commit(1), this is the --cleanup=scissors option. Everything
+  # after and including this line gets ignored.
+  my $marker = '# ------------------------ >8 ------------------------';
+
   # Limit line length to allow for the ChangeLog's leading TAB.
   foreach my $line (@line)
     {
       72 < length $line && $line =~ /^[^#]/
         and return 'line longer than 72 characters';
+
+      last if $line eq $marker;
     }
 
   return '';
-- 
2.4.4


From peter at lekensteyn.nl  Thu Jul  9 17:11:32 2015
From: peter at lekensteyn.nl (Peter Wu)
Date: Thu,  9 Jul 2015 17:11:32 +0200
Subject: [PATCH 2/6] ecc: fix memory leak
In-Reply-To: <1436454696-20362-1-git-send-email-peter@lekensteyn.nl>
References: <1436454696-20362-1-git-send-email-peter@lekensteyn.nl>
Message-ID: <1436454696-20362-3-git-send-email-peter@lekensteyn.nl>

* cipher/ecc.c: Release memory which was allocated before by
  _gcry_pk_util_preparse_sigval.
--
Caught by LeakSanitizer (LSan). Now the test suite (make check) passes
with no memleaks.

Signed-off-by: Peter Wu <peter at lekensteyn.nl>
---
 cipher/ecc.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cipher/ecc.c b/cipher/ecc.c
index f5bc50a..c17a553 100644
--- a/cipher/ecc.c
+++ b/cipher/ecc.c
@@ -1040,6 +1040,7 @@ ecc_verify (gcry_sexp_t s_sig, gcry_sexp_t s_data, gcry_sexp_t s_keyparms)
       if (rc)
         goto leave;
     }
+  sexp_release (l1);
   /* Add missing parameters using the optional curve parameter.  */
   l1 = sexp_find_token (s_keyparms, "curve", 5);
   if (l1)
-- 
2.4.4


From peter at lekensteyn.nl  Thu Jul  9 17:11:35 2015
From: peter at lekensteyn.nl (Peter Wu)
Date: Thu,  9 Jul 2015 17:11:35 +0200
Subject: [PATCH 5/6] Add LSan annotation to ignore a memory leak
In-Reply-To: <1436454696-20362-1-git-send-email-peter@lekensteyn.nl>
References: <1436454696-20362-1-git-send-email-peter@lekensteyn.nl>
Message-ID: <1436454696-20362-6-git-send-email-peter@lekensteyn.nl>

* src/g10lib.h: Add annotate_leaked_object macro that ignores leaked
  objects. This prevents LSan from reporting deliberately leaked memory.
* mpi/mpiutil.c: Mark "constant" MPIs as leaked.

Signed-off-by: Peter Wu <peter at lekensteyn.nl>
---
I am not happy with the method used to detect LSan availability, but I
hope to solicit some feedback here. Should the code really be added to
mpiutil? Or let the caller (tests/mpitests.c) handle leaks?
---
 mpi/mpiutil.c |  5 ++++-
 src/g10lib.h  | 17 +++++++++++++++++
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/mpi/mpiutil.c b/mpi/mpiutil.c
index 71b3f1c..9a796c6 100644
--- a/mpi/mpiutil.c
+++ b/mpi/mpiutil.c
@@ -211,7 +211,10 @@ _gcry_mpi_free( gcry_mpi_t a )
   if (!a )
     return;
   if ((a->flags & 32))
-    return; /* Never release a constant. */
+    {
+      annotate_leaked_object(a);
+      return; /* Never release a constant. */
+    }
   if ((a->flags & 4))
     xfree( a->d );
   else
diff --git a/src/g10lib.h b/src/g10lib.h
index 50a08ec..5793f8c 100644
--- a/src/g10lib.h
+++ b/src/g10lib.h
@@ -88,6 +88,16 @@
 #define DIM(v) (sizeof(v)/sizeof((v)[0]))
 #define DIMof(type,member)   DIM(((type *)0)->member)
 
+/* Detect LeakSanitizer (LSan) support for GCC and Clang based on the
+   availability of AddressSanitizer (ASAN).  */
+#ifdef __SANITIZE_ADDRESS__
+# define LEAK_SANITIZER
+#elif defined(__has_feature)
+# if __has_feature(address_sanitizer)
+#  define LEAK_SANITIZER
+# endif
+#endif
+
 
 
 /*-- src/global.c -*/
@@ -126,6 +136,13 @@ int   _gcry_is_secure (const void *a) _GCRY_GCC_ATTR_PURE;
 #define xstrdup(a)       _gcry_xstrdup ((a))
 #define xfree(a)         _gcry_free ((a))
 
+/* Allows "constant" MPIs to be annotated as memory leak.  */
+#ifdef LEAK_SANITIZER
+# include <sanitizer/lsan_interface.h>
+# define annotate_leaked_object(a)  __lsan_ignore_object((a))
+#else
+# define annotate_leaked_object(a)  do { } while (0)
+#endif
 
 /*-- src/misc.c --*/
 
-- 
2.4.4


From peter at lekensteyn.nl  Thu Jul  9 17:11:34 2015
From: peter at lekensteyn.nl (Peter Wu)
Date: Thu,  9 Jul 2015 17:11:34 +0200
Subject: [PATCH 4/6] Fix undefined behavior wrt memcpy
In-Reply-To: <1436454696-20362-1-git-send-email-peter@lekensteyn.nl>
References: <1436454696-20362-1-git-send-email-peter@lekensteyn.nl>
Message-ID: <1436454696-20362-5-git-send-email-peter@lekensteyn.nl>

* cipher/cipher-gcm.c: Do not copy zero bytes from an empty buffer. Let
  the function continue to add padding as needed though.
* cipher/mac-poly1305.c: If the caller requested to finish the hash
  function without a copy of the result, return immediately.
--
Caught by UndefinedBehaviorSanitizer.

Signed-off-by: Peter Wu <peter at lekensteyn.nl>
---
 cipher/cipher-gcm.c   | 2 +-
 cipher/mac-poly1305.c | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/cipher/cipher-gcm.c b/cipher/cipher-gcm.c
index 6b13fc5..3711a1d 100644
--- a/cipher/cipher-gcm.c
+++ b/cipher/cipher-gcm.c
@@ -474,7 +474,7 @@ do_ghash_buf(gcry_cipher_hd_t c, byte *hash, const byte *buf,
 
   do
     {
-      if (buflen + unused < blocksize || unused > 0)
+      if (buflen > 0 && (buflen + unused < blocksize || unused > 0))
         {
           n = blocksize - unused;
           n = n < buflen ? n : buflen;
diff --git a/cipher/mac-poly1305.c b/cipher/mac-poly1305.c
index 76b369a..b80f87d 100644
--- a/cipher/mac-poly1305.c
+++ b/cipher/mac-poly1305.c
@@ -260,6 +260,9 @@ poly1305mac_read (gcry_mac_hd_t h, unsigned char *outbuf, size_t *outlen)
       mac_ctx->marks.tag = 1;
     }
 
+  if (*outlen == 0)
+    return 0;
+
   if (*outlen <= POLY1305_TAGLEN)
     buf_cpy (outbuf, mac_ctx->tag, *outlen);
   else
-- 
2.4.4


From gniibe at fsij.org  Fri Jul 10 03:01:58 2015
From: gniibe at fsij.org (NIIBE Yutaka)
Date: Fri, 10 Jul 2015 10:01:58 +0900
Subject: [PATCH 2/6] ecc: fix memory leak
In-Reply-To: <1436454696-20362-3-git-send-email-peter@lekensteyn.nl>
References: <1436454696-20362-1-git-send-email-peter@lekensteyn.nl>
 <1436454696-20362-3-git-send-email-peter@lekensteyn.nl>
Message-ID: <559F1986.5070902@fsij.org>

Hello,

Thank you for your check and patches.

On 07/10/2015 12:11 AM, Peter Wu wrote:
> * cipher/ecc.c: Release memory which was allocated before by
>   _gcry_pk_util_preparse_sigval.
> --
> Caught by LeakSanitizer (LSan). Now the test suite (make check) passes
> with no memleaks.

This one was introduced by my last commit.  You're right.

I'm going to commit the fix, but the line inserted will be after the
comment as before.
-- 


From cvs at cvs.gnupg.org  Fri Jul 10 03:30:37 2015
From: cvs at cvs.gnupg.org (by Peter Wu)
Date: Fri, 10 Jul 2015 03:30:37 +0200
Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.6.0-232-g2a7aa3e
Message-ID: <E1ZDN2k-0005y6-Be@lists.gnupg.org>

This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "The GNU crypto library".

The branch, master has been updated
       via  2a7aa3ea4d03a9c808d5888f5509c08cd27aa27c (commit)
      from  0a7547e487a8bc4e7ac9599c55579eb2e4a13f06 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
commit 2a7aa3ea4d03a9c808d5888f5509c08cd27aa27c
Author: Peter Wu <peter at lekensteyn.nl>
Date:   Fri Jul 10 10:15:26 2015 +0900

    ecc: fix memory leak.
    
    * cipher/ecc.c (ecc_verify): Release memory which was allocated before
    by _gcry_pk_util_preparse_sigval.
    (ecc_decrypt_raw): Likewise.
    
    --
    
    Caught by LeakSanitizer (LSan). Now the test suite (make check) passes
    with no memleaks.
    
    Signed-off-by: Peter Wu <peter at lekensteyn.nl>
    
    The last commit (0a7547e487a8bc4e7ac9599c55579eb2e4a13f06) includes
    wrong fixes for sexp_release.
    
    ecc_decrypt_raw fix added by gniibe.

diff --git a/cipher/ecc.c b/cipher/ecc.c
index f5bc50a..e33f999 100644
--- a/cipher/ecc.c
+++ b/cipher/ecc.c
@@ -1041,6 +1041,7 @@ ecc_verify (gcry_sexp_t s_sig, gcry_sexp_t s_data, gcry_sexp_t s_keyparms)
         goto leave;
     }
   /* Add missing parameters using the optional curve parameter.  */
+  sexp_release (l1);
   l1 = sexp_find_token (s_keyparms, "curve", 5);
   if (l1)
     {
@@ -1417,6 +1418,7 @@ ecc_decrypt_raw (gcry_sexp_t *r_plain, gcry_sexp_t s_data, gcry_sexp_t keyparms)
         goto leave;
     }
   /* Add missing parameters using the optional curve parameter.  */
+  sexp_release (l1);
   l1 = sexp_find_token (keyparms, "curve", 5);
   if (l1)
     {

-----------------------------------------------------------------------

Summary of changes:
 cipher/ecc.c | 2 ++
 1 file changed, 2 insertions(+)


hooks/post-receive
-- 
The GNU crypto library
http://git.gnupg.org


_______________________________________________
Gnupg-commits mailing list
Gnupg-commits at gnupg.org
http://lists.gnupg.org/mailman/listinfo/gnupg-commits


From gniibe at fsij.org  Fri Jul 10 04:11:08 2015
From: gniibe at fsij.org (NIIBE Yutaka)
Date: Fri, 10 Jul 2015 11:11:08 +0900
Subject: [PATCH 1/6] sexp: fix invalid deallocation in error path
In-Reply-To: <1436454696-20362-2-git-send-email-peter@lekensteyn.nl>
References: <1436454696-20362-1-git-send-email-peter@lekensteyn.nl>
 <1436454696-20362-2-git-send-email-peter@lekensteyn.nl>
Message-ID: <559F29BC.40507@fsij.org>

Hello,

Thank you for the fix.  I'd like to commit your fix without your
comment since it is not accurate.  Is that OK with you?

On 07/10/2015 12:11 AM, Peter Wu wrote:
> --
> This appears to be a copy and paste error and could result in wrong
> memory being freed in the error path (when arrayisdesc[idx] == 2, the
> condition ((!2) == 1) is false).


When arrayisdesc[idx] == 1, it means that the buffer is provided by
caller.

When arrayisdesc[idx] == 2, it means that the buffer is allocated here
in _gcry_sexp_vextract_param.

When arrayisdesc[idx] == 1, the wrong expression '!arrayisdesc[idx] == 1'
is evaluated to !1 == 1 -> 0 == 1 -> 0 (false) and it doesn't go into
the statements to set spec->len = 0 but goes into 'else' clause to
free the buffer.   THIS IS A PROBLEM.

When arrayisdesc[idx] == 2, the wrong expression '!arrayisdesc[idx] == 1'
is evaluated to !2 == 1 -> 0 == 1 -> 0 (false) and it goes into 'else'
clause (which is correct behaviour).
-- 


From peter at lekensteyn.nl  Sat Jul 11 01:02:23 2015
From: peter at lekensteyn.nl (Peter Wu)
Date: Sat, 11 Jul 2015 01:02:23 +0200
Subject: [PATCH 1/6] sexp: fix invalid deallocation in error path
In-Reply-To: <559F29BC.40507@fsij.org>
References: <1436454696-20362-1-git-send-email-peter@lekensteyn.nl>
 <1436454696-20362-2-git-send-email-peter@lekensteyn.nl>
 <559F29BC.40507@fsij.org>
Message-ID: <20150710230223.GA22106@al>

Hi,

On Fri, Jul 10, 2015 at 11:11:08AM +0900, NIIBE Yutaka wrote:
> Thank you for the fix.  I'd like to commit your fix without your
> comment since it is not accurate.  Is it OK for you?

You are right, I swapped the case. Feel free to adjust it to whatever
you deem appropriate.

> On 07/10/2015 12:11 AM, Peter Wu wrote:
> > --
> > This appears to be a copy and paste error and could result in wrong
> > memory being freed in the error path (when arrayisdesc[idx] == 2, the
> > condition ((!2) == 1) is false).
> 
> 
> When arrayisdesc[idx] == 1, it means that the buffer is provided by
> caller.
> 
> When arrayisdesc[idx] == 2, it means that the buffer is allocated here
> in _gcry_sexp_vextract_param.
> 
> When arrayisdesc[idx] == 1, the wrong expression '!arrayisdesc[idx] == 1'
> is evaluated to !1 == 1 -> 0 == 1 -> 0 (false) and it doesn't go into
> the statements to set spec->len = 0 but goes into 'else' clause to
> free the buffer.   THIS IS A PROBLEM.

Right, thanks for laying this down.

> When arrayisdesc[idx] == 2, the wrong expression '!arrayisdesc[idx] == 1'
> is evaluated to !2 == 1 -> 0 == 1 -> 0 (false) and it goes into 'else'
> clause (which is correct behaviour).

By accident, yes. This patch corrects the 1 case.

Kind regards,
Peter


From cvs at cvs.gnupg.org  Tue Jul 14 02:55:43 2015
From: cvs at cvs.gnupg.org (by Peter Wu)
Date: Tue, 14 Jul 2015 02:55:43 +0200
Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.6.0-233-g0f9532b
Message-ID: <E1ZEoP8-0003oa-Fd@lists.gnupg.org>

This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "The GNU crypto library".

The branch, master has been updated
       via  0f9532b186c1e0b54d7e7a6d76bce82b6226122b (commit)
      from  2a7aa3ea4d03a9c808d5888f5509c08cd27aa27c (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
commit 0f9532b186c1e0b54d7e7a6d76bce82b6226122b
Author: Peter Wu <peter at lekensteyn.nl>
Date:   Tue Jul 14 09:53:38 2015 +0900

    sexp: Fix invalid deallocation in error path.
    
    * src/sexp.c: Fix wrong condition.
    
    --
    
    Signed-off-by: Peter Wu <peter at lekensteyn.nl>

diff --git a/src/sexp.c b/src/sexp.c
index 9bc13ca..1c014e0 100644
--- a/src/sexp.c
+++ b/src/sexp.c
@@ -2405,7 +2405,7 @@ _gcry_sexp_vextract_param (gcry_sexp_t sexp, const char *path,
           _gcry_mpi_release (*array[idx]);
           *array[idx] = NULL;
         }
-      else if (!arrayisdesc[idx] == 1)
+      else if (arrayisdesc[idx] == 1)
         {
           /* Caller provided buffer.  */
           gcry_buffer_t *spec = (gcry_buffer_t*)array[idx];

-----------------------------------------------------------------------

Summary of changes:
 src/sexp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)


hooks/post-receive
-- 
The GNU crypto library
http://git.gnupg.org


_______________________________________________
Gnupg-commits mailing list
Gnupg-commits at gnupg.org
http://lists.gnupg.org/mailman/listinfo/gnupg-commits


From gniibe at fsij.org  Thu Jul 16 06:26:33 2015
From: gniibe at fsij.org (NIIBE Yutaka)
Date: Thu, 16 Jul 2015 13:26:33 +0900
Subject: [PATCH 4/6] Fix undefined behavior wrt memcpy
In-Reply-To: <1436454696-20362-5-git-send-email-peter@lekensteyn.nl>
References: <1436454696-20362-1-git-send-email-peter@lekensteyn.nl>
 <1436454696-20362-5-git-send-email-peter@lekensteyn.nl>
Message-ID: <55A73279.1010607@fsij.org>

Hello, Jussi,

Last week, following patch was submitted to gcrypt-devel.  Since it's
the code you wrote, I write to you.

I think that memcpy can be called with 0 length, but the pointers should
still be valid (not NULL), even though most implementations work well.
So, it is worth considering the patch for the correctness of the code.

On 07/10/2015 12:11 AM, Peter Wu wrote:
> * cipher/cipher-gcm.c: Do not copy zero bytes from an empty buffer. Let
>   the function continue to add padding as needed though.
> * cipher/mac-poly1305.c: If the caller requested to finish the hash
>   function without a copy of the result, return immediately.
> --
> Caught by UndefinedBehaviorSanitizer.
> 
> Signed-off-by: Peter Wu <peter at lekensteyn.nl>
> ---
>  cipher/cipher-gcm.c   | 2 +-
>  cipher/mac-poly1305.c | 3 +++
>  2 files changed, 4 insertions(+), 1 deletion(-)
> 
> diff --git a/cipher/cipher-gcm.c b/cipher/cipher-gcm.c
> index 6b13fc5..3711a1d 100644
> --- a/cipher/cipher-gcm.c
> +++ b/cipher/cipher-gcm.c
> @@ -474,7 +474,7 @@ do_ghash_buf(gcry_cipher_hd_t c, byte *hash, const byte *buf,
>  
>    do
>      {
> -      if (buflen + unused < blocksize || unused > 0)
> +      if (buflen > 0 && (buflen + unused < blocksize || unused > 0))
>          {
>            n = blocksize - unused;
>            n = n < buflen ? n : buflen;
> diff --git a/cipher/mac-poly1305.c b/cipher/mac-poly1305.c
> index 76b369a..b80f87d 100644
> --- a/cipher/mac-poly1305.c
> +++ b/cipher/mac-poly1305.c
> @@ -260,6 +260,9 @@ poly1305mac_read (gcry_mac_hd_t h, unsigned char *outbuf, size_t *outlen)
>        mac_ctx->marks.tag = 1;
>      }
>  
> +  if (*outlen == 0)
> +    return 0;
> +
>    if (*outlen <= POLY1305_TAGLEN)
>      buf_cpy (outbuf, mac_ctx->tag, *outlen);
>    else
> 


From gniibe at fsij.org  Thu Jul 16 06:57:17 2015
From: gniibe at fsij.org (NIIBE Yutaka)
Date: Thu, 16 Jul 2015 13:57:17 +0900
Subject: [PATCH 6/6] rsa: clarify the RSA secret parameters
In-Reply-To: <1436454696-20362-7-git-send-email-peter@lekensteyn.nl>
References: <1436454696-20362-1-git-send-email-peter@lekensteyn.nl>
 <1436454696-20362-7-git-send-email-peter@lekensteyn.nl>
Message-ID: <55A739AD.1030306@fsij.org>

Hello,

Thank you for the patch.

On 07/10/2015 12:11 AM, Peter Wu wrote:
> * cipher/rsa.c: Clarify meaning of the 'u' parameter. Fix error in
>   comments.

For the first part, I think that it's correctly described in the
documentation: (gcrypt)RSA key parameters

It even has an example:

       Note that OpenSSL uses slightly different parameters: q < p and u =
    q^{-1} \bmod p.  To use these parameters you will need to swap the
    values and recompute u.  Here is example code to do this:

       if (gcry_mpi_cmp (p, q) > 0)
         {
           gcry_mpi_swap (p, q);
           gcry_mpi_invm (u, p, q);
         }

I'm not sure (and wondering) if we need more.

I know that it's a pitfall of libgcrypt (something common).  For your
reference, I know this one in Fedora:

http://pkgs.fedoraproject.org/cgit/libgcrypt.git/diff/?id=376991d05a1a0e2911242061c41ca5c5a915e339&id2=f56a95f03b711eac70ddc8673b6417a93a45c2bd

That was the same mistake.
-- 


From cvs at cvs.gnupg.org  Thu Jul 16 07:12:00 2015
From: cvs at cvs.gnupg.org (by Peter Wu)
Date: Thu, 16 Jul 2015 07:12:00 +0200
Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.6.0-234-g9cd55e8
Message-ID: <E1ZFbM9-0001cE-Lv@lists.gnupg.org>

This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "The GNU crypto library".

The branch, master has been updated
       via  9cd55e8e948f0049cb23495f536decf797d072f7 (commit)
      from  0f9532b186c1e0b54d7e7a6d76bce82b6226122b (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
commit 9cd55e8e948f0049cb23495f536decf797d072f7
Author: Peter Wu <peter at lekensteyn.nl>
Date:   Thu Jul 16 13:59:44 2015 +0900

    rsa: Fix error in comments.
    
    * cipher/rsa.c: Fix.
    
    --
    
    Signed-off-by: Peter Wu <peter at lekensteyn.nl>

diff --git a/cipher/rsa.c b/cipher/rsa.c
index 9a8d235..e4f73d5 100644
--- a/cipher/rsa.c
+++ b/cipher/rsa.c
@@ -738,7 +738,7 @@ secret (gcry_mpi_t output, gcry_mpi_t input, RSA_secret_key *skey )
       if ( mpi_has_sign ( h ) )
         mpi_add ( h, h, skey->q );
       mpi_mulm( h, skey->u, h, skey->q );
-      /* m = m2 + h * p */
+      /* m = m1 + h * p */
       mpi_mul ( h, h, skey->p );
       mpi_add ( output, m1, h );
 

-----------------------------------------------------------------------

Summary of changes:
 cipher/rsa.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)


hooks/post-receive
-- 
The GNU crypto library
http://git.gnupg.org


_______________________________________________
Gnupg-commits mailing list
Gnupg-commits at gnupg.org
http://lists.gnupg.org/mailman/listinfo/gnupg-commits


From jussi.kivilinna at iki.fi  Thu Jul 16 16:37:57 2015
From: jussi.kivilinna at iki.fi (Jussi Kivilinna)
Date: Thu, 16 Jul 2015 17:37:57 +0300
Subject: [PATCH 4/6] Fix undefined behavior wrt memcpy
In-Reply-To: <55A73279.1010607@fsij.org>
Message-ID: <8577d624-a9d4-4f57-a009-3e0968b738ec@email.android.com>

Hello,

Patch looks good and should definitely be merged. I will push it to repo when I get to my computer in a week or two.

-Jussi

16.7.2015 7.26 NIIBE Yutaka <gniibe at fsij.org> kirjoitti:
>
> Hello, Jussi, 
>
> Last week, following patch was submitted to gcrypt-devel.  Since it's
> the code you wrote, I write to you. 
>
> I think that memcpy can be called with 0 length, but pointers should 
> be valid one (not NULL), even though most implementations works well. 
> So, it is worth to consider the patch for the correctness of the code. 
>
> On 07/10/2015 12:11 AM, Peter Wu wrote: 
> > * cipher/cipher-gcm.c: Do not copy zero bytes from an empty buffer. Let
> >   the function continue to add padding as needed though.
> > * cipher/mac-poly1305.c: If the caller requested to finish the hash
> >   function without a copy of the result, return immediately.
> > -- 
> > Caught by UndefinedBehaviorSanitizer. 
> > 
> > Signed-off-by: Peter Wu <peter at lekensteyn.nl> 
> > --- 
> >  cipher/cipher-gcm.c   | 2 +-
> >  cipher/mac-poly1305.c | 3 +++
> >  2 files changed, 4 insertions(+), 1 deletion(-)
> > 
> > diff --git a/cipher/cipher-gcm.c b/cipher/cipher-gcm.c 
> > index 6b13fc5..3711a1d 100644 
> > --- a/cipher/cipher-gcm.c 
> > +++ b/cipher/cipher-gcm.c 
> > @@ -474,7 +474,7 @@ do_ghash_buf(gcry_cipher_hd_t c, byte *hash, const byte *buf, 
> >  
> >    do
> >      {
> > -      if (buflen + unused < blocksize || unused > 0)
> > +      if (buflen > 0 && (buflen + unused < blocksize || unused > 0))
> >          {
> >            n = blocksize - unused;
> >            n = n < buflen ? n : buflen;
> > diff --git a/cipher/mac-poly1305.c b/cipher/mac-poly1305.c 
> > index 76b369a..b80f87d 100644 
> > --- a/cipher/mac-poly1305.c 
> > +++ b/cipher/mac-poly1305.c 
> > @@ -260,6 +260,9 @@ poly1305mac_read (gcry_mac_hd_t h, unsigned char *outbuf, size_t *outlen) 
> >        mac_ctx->marks.tag = 1;
> >      }
> >  
> > +  if (*outlen == 0)
> > +    return 0;
> > +
> >    if (*outlen <= POLY1305_TAGLEN)
> >      buf_cpy (outbuf, mac_ctx->tag, *outlen);
> >    else
> > 
>
>
> _______________________________________________ 
> Gcrypt-devel mailing list 
> Gcrypt-devel at gnupg.org 
> http://lists.gnupg.org/mailman/listinfo/gcrypt-devel 
>

From peter at lekensteyn.nl  Thu Jul 16 21:40:50 2015
From: peter at lekensteyn.nl (Peter Wu)
Date: Thu, 16 Jul 2015 21:40:50 +0200
Subject: [PATCH 6/6] rsa: clarify the RSA secret parameters
In-Reply-To: <55A739AD.1030306@fsij.org>
References: <1436454696-20362-1-git-send-email-peter@lekensteyn.nl>
 <1436454696-20362-7-git-send-email-peter@lekensteyn.nl>
 <55A739AD.1030306@fsij.org>
Message-ID: <20150716194050.GA31777@al>

On Thu, Jul 16, 2015 at 01:57:17PM +0900, NIIBE Yutaka wrote:
> Hello,
> 
> Thank you for the patch.
> 
> On 07/10/2015 12:11 AM, Peter Wu wrote:
> > * cipher/rsa.c: Clarify meaning of the 'u' parameter. Fix error in
> >   comments.
> 
> For the first part, I think that it's correctly described in the
> documentation: (gcrypt)RSA key parameters
> 
> Even it has an example as:
> 
>        Note that OpenSSL uses slightly different parameters: q < p and u =
>     q^{-1} \bmod p.  To use these parameters you will need to swap the
>     values and recompute u.  Here is example code to do this:
> 
>        if (gcry_mpi_cmp (p, q) > 0)
>          {
>            gcry_mpi_swap (p, q);
>            gcry_mpi_invm (u, p, q);
>          }
> 
> I'm not sure (and wondering) if we need more.

That is documented in a different place. Repeating the same does not
hurt, especially when noting why it differs from other common RSA
implementations (presumably due to the origin from OpenPGP).

> I know that it's a pitfall of libgcrypt (something common).  For your
> reference, I know this one in Fedora:
> 
> http://pkgs.fedoraproject.org/cgit/libgcrypt.git/diff/?id=376991d05a1a0e2911242061c41ca5c5a915e339&id2=f56a95f03b711eac70ddc8673b6417a93a45c2bd
> 
> That's was same mistake.

Given this mistake, why not add the comment to save some hours from
other reviewers?

In my case it was an unexpected user error where $u = q^{-1} mod p$ was
used. Maybe the function that imports the RSA parameters should first
check for u * p == 1 mod q as a sanity check?
-- 
Kind regards,
Peter Wu
https://lekensteyn.nl


From gniibe at fsij.org  Fri Jul 17 02:31:36 2015
From: gniibe at fsij.org (NIIBE Yutaka)
Date: Fri, 17 Jul 2015 09:31:36 +0900
Subject: [PATCH 6/6] rsa: clarify the RSA secret parameters
In-Reply-To: <20150716194050.GA31777@al>
References: <1436454696-20362-1-git-send-email-peter@lekensteyn.nl>
 <1436454696-20362-7-git-send-email-peter@lekensteyn.nl>
 <55A739AD.1030306@fsij.org> <20150716194050.GA31777@al>
Message-ID: <55A84CE8.6010703@fsij.org>

On 07/17/2015 04:40 AM, Peter Wu wrote:
> That is documented in a different place. Repeating the same does not
> hurt, especially when noting why it differs from other common RSA
> implementations (presumably due to the origin from OpenPGP).
> 
>> I know that it's a pitfall of libgcrypt (something common).  For your
>> reference, I know this one in Fedora:
>>
>> http://pkgs.fedoraproject.org/cgit/libgcrypt.git/diff/?id=376991d05a1a0e2911242061c41ca5c5a915e339&id2=f56a95f03b711eac70ddc8673b6417a93a45c2bd
>>
>> That's was same mistake.
> 
> Given this mistake, why not add the comment to save some hours from
> other reviewers?
> 
> In my case it was an unexpected user error where $u = q^{-1} mod p$ was
> used. Maybe the function that imports the RSA parameters should first
> check for u * p == 1 mod q as a sanity check?

For myself, I support your opinion toward better/meaningful comments.
Well, I have such a tendency to look into the code directly (instead
of documentation), too.

Currently, I'm not sure how we can improve the comment.  Every code
has its context.  It would be better to describe its important
context/assumption.

Let us wait to ask Werner's opinion.
-- 


From wk at gnupg.org  Wed Jul 22 13:54:09 2015
From: wk at gnupg.org (Werner Koch)
Date: Wed, 22 Jul 2015 13:54:09 +0200
Subject: [PATCH 3/6] build: ignore scissor line for the commit-msg hook
In-Reply-To: <1436454696-20362-4-git-send-email-peter@lekensteyn.nl> (Peter
 Wu's message of "Thu, 9 Jul 2015 17:11:33 +0200")
References: <1436454696-20362-1-git-send-email-peter@lekensteyn.nl>
 <1436454696-20362-4-git-send-email-peter@lekensteyn.nl>
Message-ID: <87oaj4e7by.fsf@vigenere.g10code.de>

On Thu,  9 Jul 2015 17:11, peter at lekensteyn.nl said:
> * build-aux/git-hooks/commit-msg: Stop processing more lines when the
>   scissor line is encountered.
> --
> This allows the command `git commit -v` to work even if the code is
> longer than 72 characters. Note that comments are already ignored by the
> previous line.

So this is to allow overlong lines in commit messages? 

--cleanup=scissors seems to be a newer option.  I think this is useful in
some cases.  I will propagate your patch to the other packages.

-- 
Die Gedanken sind frei.  Ausnahmen regelt ein Bundesgesetz.


From wk at gnupg.org  Wed Jul 22 15:06:53 2015
From: wk at gnupg.org (Werner Koch)
Date: Wed, 22 Jul 2015 15:06:53 +0200
Subject: [PATCH 5/6] Add LSan annotation to ignore a memory leak
In-Reply-To: <1436454696-20362-6-git-send-email-peter@lekensteyn.nl> (Peter
 Wu's message of "Thu, 9 Jul 2015 17:11:35 +0200")
References: <1436454696-20362-1-git-send-email-peter@lekensteyn.nl>
 <1436454696-20362-6-git-send-email-peter@lekensteyn.nl>
Message-ID: <87fv4ge3yq.fsf@vigenere.g10code.de>

On Thu,  9 Jul 2015 17:11, peter at lekensteyn.nl said:

> I am not happy with the method to detect LSan availability, but here I
> hope to solicit for some feedback. Should the code really be added to

Adding a __GNUC__ condition would make it more portable.  I like this
style of annotations and they are useful for other code as well.
libgpg-error is used by all GnuPG code and thus it might be useful to
add this annotation macros to gpg-error.h.

> mpiutil? Or let the caller (tests/mpitests.c) handle leaks?

In GnuPG we have a register_mem_cleanup_func to run free with atexit but
that is not appropriate for a library.  I think such annotations are a
better way.


Salam-Shalom,

   Werner


-- 
Die Gedanken sind frei.  Ausnahmen regelt ein Bundesgesetz.


From wk at gnupg.org  Wed Jul 22 15:10:28 2015
From: wk at gnupg.org (Werner Koch)
Date: Wed, 22 Jul 2015 15:10:28 +0200
Subject: [PATCH 6/6] rsa: clarify the RSA secret parameters
In-Reply-To: <20150716194050.GA31777@al> (Peter Wu's message of "Thu, 16 Jul
 2015 21:40:50 +0200")
References: <1436454696-20362-1-git-send-email-peter@lekensteyn.nl>
 <1436454696-20362-7-git-send-email-peter@lekensteyn.nl>
 <55A739AD.1030306@fsij.org> <20150716194050.GA31777@al>
Message-ID: <87bnf4e3sr.fsf@vigenere.g10code.de>

On Thu, 16 Jul 2015 21:40, peter at lekensteyn.nl said:

> That is documented in a different place. Repeating the same does not
> hurt, especially when noting why it differs from other common RSA
> implementations (presumably due to the origin from OpenPGP).

Actually, PGP's way of using the CRT is older than SSLeay ;-)

> Given this mistake, why not add the comment to save some hours from
> other reviewers?

I agree; it does not harm.


Shalom-Salam,

   Werner

-- 
Die Gedanken sind frei.  Ausnahmen regelt ein Bundesgesetz.


From wk at gnupg.org  Wed Jul 22 15:21:24 2015
From: wk at gnupg.org (Werner Koch)
Date: Wed, 22 Jul 2015 15:21:24 +0200
Subject: [EXPERIMENTAL-PATCH] Curve25519 encryption support (updated)
In-Reply-To: <559C7010.6040700@fsij.org> (NIIBE Yutaka's message of "Wed, 08
 Jul 2015 09:34:24 +0900")
References: <55965577.7020306@fsij.org> <559C7010.6040700@fsij.org>
Message-ID: <877fpse3aj.fsf@vigenere.g10code.de>

On Wed,  8 Jul 2015 02:34, gniibe at fsij.org said:

> Here, I changed the meaning of '(flags eddsa)' a bit.  Now, it means
> that the key is in DJB format and under DJB processing:

We should not overload that flag with a new meaning:

  @item eddsa
  @cindex EdDSA
  Use the EdDSA scheme signing instead of the default ECDSA algorithm.
  Note that the EdDSA uses a special form of the public key.
  
This flag describes the EdDSA algorithm and not the encoding of the
points.  Right, the default for that algorithm are those from Bernstein
et al's paper but the idea is to use it also for future versions of
EdDSA.  For example:

@misc{cryptoeprint:2015:677,
    author = {Daniel J. Bernstein and Simon Josefsson and Tanja Lange and Peter Schwabe and Bo-Yin Yang},
    title = {EdDSA for more curves},
    howpublished = {Cryptology ePrint Archive, Report 2015/677},
    year = {2015},
    note = {\url{http://eprint.iacr.org/}},
}

We should keep the encoding separate.  What about an "x-only" or "mont"
flag to indicate that we only convey the x-coordinate?


Salam-Shalom,

   Werner


-- 
Die Gedanken sind frei.  Ausnahmen regelt ein Bundesgesetz.


From peter at lekensteyn.nl  Wed Jul 22 21:13:25 2015
From: peter at lekensteyn.nl (Peter Wu)
Date: Wed, 22 Jul 2015 21:13:25 +0200
Subject: DCO signature
Message-ID: <20150722191325.GA8113@al>

Libgcrypt Developer's Certificate of Origin.  Version 1.0
=========================================================

By making a contribution to the Libgcrypt project, I certify that:

(a) The contribution was created in whole or in part by me and I
    have the right to submit it under the free software license
    indicated in the file; or

(b) The contribution is based upon previous work that, to the
    best of my knowledge, is covered under an appropriate free
    software license and I have the right under that license to
    submit that work with modifications, whether created in whole
    or in part by me, under the same free software license
    (unless I am permitted to submit under a different license),
    as indicated in the file; or

(c) The contribution was provided directly to me by some other
    person who certified (a), (b) or (c) and I have not modified
    it.

(d) I understand and agree that this project and the contribution
    are public and that a record of the contribution (including
    all personal information I submit with it, including my
    sign-off) is maintained indefinitely and may be redistributed
    consistent with this project or the free software license(s)
    involved.

Signed-off-by: Peter Wu <peter at lekensteyn.nl>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 473 bytes
Desc: not available
URL: </pipermail/attachments/20150722/afea79a2/attachment.sig>

From peter at lekensteyn.nl  Wed Jul 22 21:17:13 2015
From: peter at lekensteyn.nl (Peter Wu)
Date: Wed, 22 Jul 2015 21:17:13 +0200
Subject: [PATCH 3/6] build: ignore scissor line for the commit-msg hook
In-Reply-To: <87oaj4e7by.fsf@vigenere.g10code.de>
References: <1436454696-20362-1-git-send-email-peter@lekensteyn.nl>
 <1436454696-20362-4-git-send-email-peter@lekensteyn.nl>
 <87oaj4e7by.fsf@vigenere.g10code.de>
Message-ID: <20150722191713.GB8113@al>

On Wed, Jul 22, 2015 at 01:54:09PM +0200, Werner Koch wrote:
> On Thu,  9 Jul 2015 17:11, peter at lekensteyn.nl said:
> > * build-aux/git-hooks/commit-msg: Stop processing more lines when the
> >   scissor line is encountered.
> > --
> > This allows the command `git commit -v` to work even if the code is
> > longer than 72 characters. Note that comments are already ignored by the
> > previous line.
> 
> So this is to allow overlong lines in commit messages? 

Yes, it allows overlong lines in the draft commit message which are
removed on exit.

> --cleanup=scissors seems to be a newer option.  I think this is useful in
> some cases.  I will propagate your patch to the other packages.

This option was introduced with git v1.8.5-rc2-7-g1a72cfd.

-- 
Kind regards,
Peter Wu
https://lekensteyn.nl


From peter at lekensteyn.nl  Wed Jul 22 21:40:41 2015
From: peter at lekensteyn.nl (Peter Wu)
Date: Wed, 22 Jul 2015 21:40:41 +0200
Subject: [PATCH 5/6] Add LSan annotation to ignore a memory leak
In-Reply-To: <87fv4ge3yq.fsf@vigenere.g10code.de>
References: <1436454696-20362-1-git-send-email-peter@lekensteyn.nl>
 <1436454696-20362-6-git-send-email-peter@lekensteyn.nl>
 <87fv4ge3yq.fsf@vigenere.g10code.de>
Message-ID: <20150722194041.GC8113@al>

On Wed, Jul 22, 2015 at 03:06:53PM +0200, Werner Koch wrote:
> On Thu,  9 Jul 2015 17:11, peter at lekensteyn.nl said:
> 
> > I am not happy with the method to detect LSan availability, but here I
> > hope to solicit for some feedback. Should the code really be added to
> 
> Adding a __GNUC__ condition would make it more portable.  I like this
> style of annotations and they are useful for other code as well.

LSan is also supported by the Clang compiler, how would adding __GNUC__
make the definition more portable?

> libgpg-error is used by all GnuPG code and thus it might be useful to
> add this annotation macros to gpg-error.h.

I thought that LSan detection could be improved
(http://stackoverflow.com/q/31273016/427545), but looking at Chromium,
they seem to set cflags, ldflags (-fsanitize=leak) and macros (including
LEAK_SANITIZER) depending on the build config.

Ah well, those who use -fsanitize=thread or -fsanitize=leak can also
set -DLEAK_SANITIZER themselves. I should probably add a comment for
that in the patch.

> > mpiutil? Or let the caller (tests/mpitests.c) handle leaks?
> 
> In GnuPG we have a register_mem_cleanup_func to run free with atexit but
> that is not appropriate for a library.  I think such annotations are a
> better way.

I will look at adding this to libgpg-error. Is src/gpg-error.h.in the
appropriate file for this? Can the patch be submitted to this list?
-- 
Kind regards,
Peter Wu
https://lekensteyn.nl


From gniibe at fsij.org  Thu Jul 23 10:02:58 2015
From: gniibe at fsij.org (NIIBE Yutaka)
Date: Thu, 23 Jul 2015 17:02:58 +0900
Subject: [EXPERIMENTAL-PATCH] Curve25519 encryption support (updated)
In-Reply-To: <877fpse3aj.fsf@vigenere.g10code.de>
References: <55965577.7020306@fsij.org> <559C7010.6040700@fsij.org>
 <877fpse3aj.fsf@vigenere.g10code.de>
Message-ID: <55B09FB2.9060403@fsij.org>

On 07/22/2015 10:21 PM, Werner Koch wrote:
> On Wed,  8 Jul 2015 02:34, gniibe at fsij.org said:
> 
>> Here, I changed the meaning of '(flags eddsa)' a bit.  Now, it means
>> that the key is in DJB format and under DJB processing:
> 
> We should not overload that flag with a new meaning:

OK, I see.

Let me explain my point.

When I said "the key is in DJB format and under DJB processing", I meant:

	Secret key is multiplied by cofactor and its msb is set.

	Encryption/signature processing assumes such a secret key.

Well, let me call this "sec-is-multiplied-by-cofactor-and-msb-set",
for now.

For Curve25519, its model is MPI_EC_MONTGOMERY and its secret key
should come with a flag of sec-is-multiplied-by-cofactor-and-msb-set.

The eddsa flag already implies something like
sec-is-multiplied-by-cofactor-and-msb-set flag, since EdDSA
computation also assumes similar.  I think that when we introduce
another curve for EdDSA, we will fix current hard-wired cofactor
handling and msb handling.


> We should keep the encoding separate.

I see, I will.

It is possible to define co-factor ECDH with Montgomery curve where
secret key is 1 <= d <= n, and ECDH computation is done by hkdG
(multiplied by cofactor).  It's not implemented yet.

So, its meaning is sec-is-multiplied-by-cofactor-and-msb-set (not mont
or x-only, which is defined by curve's model or compression).

I don't have good naming for the flag though.
-- 


From wk at gnupg.org  Thu Jul 23 14:32:55 2015
From: wk at gnupg.org (Werner Koch)
Date: Thu, 23 Jul 2015 14:32:55 +0200
Subject: [EXPERIMENTAL-PATCH] Curve25519 encryption support (updated)
In-Reply-To: <55B09FB2.9060403@fsij.org> (NIIBE Yutaka's message of "Thu, 23
 Jul 2015 17:02:58 +0900")
References: <55965577.7020306@fsij.org> <559C7010.6040700@fsij.org>
 <877fpse3aj.fsf@vigenere.g10code.de> <55B09FB2.9060403@fsij.org>
Message-ID: <87y4i7awaw.fsf@vigenere.g10code.de>

On Thu, 23 Jul 2015 10:02, gniibe at fsij.org said:

> So, its meaning is sec-is-multiplied-by-cofactor-and-msb-set (not mont
> or x-only, which is defined by curve's model or compression).
>
> I don't have good naming for the flag though.

"djb" :-)

Anyone else with a suggestion for the name of such a flag?


Shalom-Salam,

   Werner


-- 
Die Gedanken sind frei.  Ausnahmen regelt ein Bundesgesetz.


From wk at gnupg.org  Thu Jul 23 14:35:47 2015
From: wk at gnupg.org (Werner Koch)
Date: Thu, 23 Jul 2015 14:35:47 +0200
Subject: [PATCH 5/6] Add LSan annotation to ignore a memory leak
In-Reply-To: <20150722194041.GC8113@al> (Peter Wu's message of "Wed, 22 Jul
 2015 21:40:41 +0200")
References: <1436454696-20362-1-git-send-email-peter@lekensteyn.nl>
 <1436454696-20362-6-git-send-email-peter@lekensteyn.nl>
 <87fv4ge3yq.fsf@vigenere.g10code.de> <20150722194041.GC8113@al>
Message-ID: <87twsvaw64.fsf@vigenere.g10code.de>

On Wed, 22 Jul 2015 21:40, peter at lekensteyn.nl said:

> LSan is also supported by the Clang compiler, how would adding __GNUC__
> make the definition more portable?

Although I do not like it, Clang also defines __GNUC__.  Thus
__GNUC__ is sufficient to enable the annotation for those two while it
won't be enabled for other compilers.


Salam-Shalom,

   Werner

-- 
Die Gedanken sind frei.  Ausnahmen regelt ein Bundesgesetz.


From cvs at cvs.gnupg.org  Thu Jul 23 14:47:34 2015
From: cvs at cvs.gnupg.org (by Werner Koch)
Date: Thu, 23 Jul 2015 14:47:34 +0200
Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.6.0-235-gda500a0
Message-ID: <E1ZIFnf-0001LM-3X@lists.gnupg.org>

This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "The GNU crypto library".

The branch, master has been updated
       via  da500a030481a3c534e933b642e6f051aa064b97 (commit)
      from  9cd55e8e948f0049cb23495f536decf797d072f7 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
commit da500a030481a3c534e933b642e6f051aa064b97
Author: Werner Koch <wk at gnupg.org>
Date:   Thu Jul 23 14:38:49 2015 +0200

    Register DCO for Peter Wu.
    
    --

diff --git a/AUTHORS b/AUTHORS
index 1e34aed..42d5ee6 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -165,6 +165,9 @@ Markus Teich <markus dot teich at stusta dot mhn dot de>
 Milan Broz <gmazyland at gmail.com>
 2014-01-13:52D44CC6.4050707 at gmail.com:
 
+Peter Wu <peter at lekensteyn.nl>
+2015-07-22:20150722191325.GA8113 at al:
+
 Rafaël Carré <funman at videolan.org>
 2012-04-20:4F91988B.1080502 at videolan.org:
 

-----------------------------------------------------------------------

Summary of changes:
 AUTHORS | 3 +++
 1 file changed, 3 insertions(+)


hooks/post-receive
-- 
The GNU crypto library
http://git.gnupg.org


_______________________________________________
Gnupg-commits mailing list
Gnupg-commits at gnupg.org
http://lists.gnupg.org/mailman/listinfo/gnupg-commits

From cvs at cvs.gnupg.org  Thu Jul 23 14:51:15 2015
From: cvs at cvs.gnupg.org (by Peter Wu)
Date: Thu, 23 Jul 2015 14:51:15 +0200
Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.6.0-236-gada0a7d
Message-ID: <E1ZIFrE-0001RT-9m@lists.gnupg.org>

This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "The GNU crypto library".

The branch, master has been updated
       via  ada0a7d302cca97b327faaacac7a5d0b8043df88 (commit)
      from  da500a030481a3c534e933b642e6f051aa064b97 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
commit ada0a7d302cca97b327faaacac7a5d0b8043df88
Author: Peter Wu <peter at lekensteyn.nl>
Date:   Thu Jul 9 17:11:33 2015 +0200

    build: ignore scissor line for the commit-msg hook
    
    * build-aux/git-hooks/commit-msg: Stop processing more lines when the
      scissor line is encountered.
    --
    This allows the command `git commit -v` to work even if the code is
    longer than 72 characters. Note that comments are already ignored by the
    previous line.
    
    Signed-off-by: Peter Wu <peter at lekensteyn.nl>

diff --git a/build-aux/git-hooks/commit-msg b/build-aux/git-hooks/commit-msg
index 5a697c7..3ca918b 100755
--- a/build-aux/git-hooks/commit-msg
+++ b/build-aux/git-hooks/commit-msg
@@ -86,11 +86,17 @@ sub check_msg($$)
   2 <= @line && length $line[1]
     and return 'second line must be empty';
 
+  # See git-commit(1), this is the --cleanup=scissors option. Everything
+  # after and including this line gets ignored.
+  my $marker = '# ------------------------ >8 ------------------------';
+
   # Limit line length to allow for the ChangeLog's leading TAB.
   foreach my $line (@line)
     {
       72 < length $line && $line =~ /^[^#]/
         and return 'line longer than 72 characters';
+
+      last if $line eq $marker;
     }
 
   return '';

-----------------------------------------------------------------------

Summary of changes:
 build-aux/git-hooks/commit-msg | 6 ++++++
 1 file changed, 6 insertions(+)


hooks/post-receive
-- 
The GNU crypto library
http://git.gnupg.org


_______________________________________________
Gnupg-commits mailing list
Gnupg-commits at gnupg.org
http://lists.gnupg.org/mailman/listinfo/gnupg-commits


From gniibe at fsij.org  Fri Jul 24 08:32:22 2015
From: gniibe at fsij.org (NIIBE Yutaka)
Date: Fri, 24 Jul 2015 15:32:22 +0900
Subject: [EXPERIMENTAL-PATCH] Curve25519 encryption support (updated)
In-Reply-To: <87y4i7awaw.fsf@vigenere.g10code.de>
References: <55965577.7020306@fsij.org>
 <559C7010.6040700@fsij.org>	<877fpse3aj.fsf@vigenere.g10code.de>
 <55B09FB2.9060403@fsij.org> <87y4i7awaw.fsf@vigenere.g10code.de>
Message-ID: <55B1DBF6.1010109@fsij.org>

On 07/23/2015 09:32 PM, Werner Koch wrote:
> On Thu, 23 Jul 2015 10:02, gniibe at fsij.org said:
> 
>> So, its meaning is sec-is-multiplied-by-cofactor-and-msb-set (not mont
>> or x-only, which is defined by curve's model or compression).
>>
>> I don't have good naming for the flag though.
> 
> "djb" :-)

It is good for us. :-)  It would require some more explanation
for other people.

> Anyone else with a suggestion for the name of such a flag?

With the poor vocabulary of a non-native speaker,

    trim, rational, legitimate, validated, solid,

come up.

I think that the practice makes much sense because it encourages
constant time implementation.  I wonder why it wasn't common for
the standardization of ECC before safe curves.


How about "advance"?  In some sense, a secret key with this flag is
like a ticket sold in advance; For both sides (buy & sell), it
eliminates a possibility of failures (of payment).

When we see the flag, it means that it's advanced ECC with safe curve.

My point is: It would be good if it had a better connotation.
-- 


From grothoff at gnunet.org  Fri Jul 24 08:38:39 2015
From: grothoff at gnunet.org (Christian Grothoff)
Date: Fri, 24 Jul 2015 08:38:39 +0200
Subject: [EXPERIMENTAL-PATCH] Curve25519 encryption support (updated)
In-Reply-To: <55B1DBF6.1010109@fsij.org>
References: <55965577.7020306@fsij.org>
 <559C7010.6040700@fsij.org>	<877fpse3aj.fsf@vigenere.g10code.de>
 <55B09FB2.9060403@fsij.org> <87y4i7awaw.fsf@vigenere.g10code.de>
 <55B1DBF6.1010109@fsij.org>
Message-ID: <55B1DD6F.6000106@gnunet.org>

Why have a flag for the sane/safe behaviour? If we need a flag at all,
shouldn't we have it for the unsafe behaviour? (and then we can just
call it 'unsafe', to be appropriately discouraging).  AFAIK encryption
support is kind-of new anyway, so hopefully this isn't needed to avoid
breaking backwards-compatibility with anything that has been deployed...

On 07/24/2015 08:32 AM, NIIBE Yutaka wrote:
> On 07/23/2015 09:32 PM, Werner Koch wrote:
>> On Thu, 23 Jul 2015 10:02, gniibe at fsij.org said:
>>
>>> So, its meaning is sec-is-multiplied-by-cofactor-and-msb-set (not mont
>>> or x-only, which is defined by curve's model or compression).
>>>
>>> I don't have good naming for the flag though.
>>
>> "djb" :-)
> 
> It is good for us. :-)  It would require some more explanation
> for other people.
> 
>> Anyone else with a suggestion for the name of such a flag?
> 
> From poor vocabulary of non-native speaker,
> 
>     trim, rational, legitimate, validated, solid,
> 
> come up.
> 
> I think that the practice makes much sense because it encourages
> constant time implementation.  I wonder why it wasn't common for
> the standardization of ECC before safe curves.
> 
> 
> How about "advance"?  In some sense, a secret key with this flag is
> like a ticket sold in advance; For both sides (buy & sell), it
> eliminates a possibility of failures (of payment).
> 
> When we see the flag, it means that it's advanced ECC with safe curve.
> 
> My point is: It would be good it has better connotation.
> 


From gniibe at fsij.org  Fri Jul 24 11:15:38 2015
From: gniibe at fsij.org (NIIBE Yutaka)
Date: Fri, 24 Jul 2015 18:15:38 +0900
Subject: [EXPERIMENTAL-PATCH] Curve25519 encryption support (updated)
In-Reply-To: <55B1DD6F.6000106@gnunet.org>
References: <55965577.7020306@fsij.org>
 <559C7010.6040700@fsij.org>	<877fpse3aj.fsf@vigenere.g10code.de>
 <55B09FB2.9060403@fsij.org> <87y4i7awaw.fsf@vigenere.g10code.de>
 <55B1DBF6.1010109@fsij.org> <55B1DD6F.6000106@gnunet.org>
Message-ID: <55B2023A.8010402@fsij.org>

Hello,

Thank you for your comment.  Let me clarify.

On 07/24/2015 03:38 PM, Christian Grothoff wrote:
> Why have a flag for the sane/safe behaviour? If we need a flag at all,
> shouldn't we have it for the unsafe behaviour? (and then we can just
> call it 'unsafe', to be appropriately discouraging).  AFAIK encryption
> support is kind-of new anyway, so hopefully this isn't needed to avoid
> breaking backwards-compatibility with anything that has been deployed...

We already have "classic" ECC (including ECDH encryption) with the
NIST, Brainpool, and GOST curves.  Well, I'd say, it is not-that-safe
if we compare it to modern ECC with safe curves.  Its deployment (libgcrypt
feature of classic ECC) is not that popular now, but it's published
somehow by GnuPG 2.1's ECC support.

With Curve25519, we are introducing new safer practice of
sec-is-multiplied-by-cofactor-and-msb-set.

I think that this practice can be applied to existing ECC code (since
all existing curves have cofactor=1, only "msb-set" part is relevant),
if/when we want to improve existing ECC code to be constant time.
-- 


From wk at gnupg.org  Fri Jul 24 16:46:13 2015
From: wk at gnupg.org (Werner Koch)
Date: Fri, 24 Jul 2015 16:46:13 +0200
Subject: [EXPERIMENTAL-PATCH] Curve25519 encryption support (updated)
In-Reply-To: <55B1DD6F.6000106@gnunet.org> (Christian Grothoff's message of
 "Fri, 24 Jul 2015 08:38:39 +0200")
References: <55965577.7020306@fsij.org> <559C7010.6040700@fsij.org>
 <877fpse3aj.fsf@vigenere.g10code.de> <55B09FB2.9060403@fsij.org>
 <87y4i7awaw.fsf@vigenere.g10code.de> <55B1DBF6.1010109@fsij.org>
 <55B1DD6F.6000106@gnunet.org>
Message-ID: <87k2tpaa16.fsf@vigenere.g10code.de>

On Fri, 24 Jul 2015 08:38, grothoff at gnunet.org said:

>> It is good for us. :-)  It would require some more explanation
>> for other people.

What about

  le-tweak

because it is a tweak in the little endian representation.  Yeah, I know
that it sounds like "let weak" ;-)

Or

  le-twist
  le-highbit
  le-msb
  twistle


Salam-Shalom,

   Werner

-- 
Die Gedanken sind frei.  Ausnahmen regelt ein Bundesgesetz.


From gniibe at fsij.org  Sat Jul 25 05:08:24 2015
From: gniibe at fsij.org (NIIBE Yutaka)
Date: Sat, 25 Jul 2015 12:08:24 +0900
Subject: [EXPERIMENTAL-PATCH] Curve25519 encryption support (updated)
In-Reply-To: <87k2tpaa16.fsf@vigenere.g10code.de>
References: <55965577.7020306@fsij.org> <559C7010.6040700@fsij.org>
 <877fpse3aj.fsf@vigenere.g10code.de> <55B09FB2.9060403@fsij.org>
 <87y4i7awaw.fsf@vigenere.g10code.de> <55B1DBF6.1010109@fsij.org>
 <55B1DD6F.6000106@gnunet.org> <87k2tpaa16.fsf@vigenere.g10code.de>
Message-ID: <55B2FDA8.8090505@fsij.org>

On 07/24/2015 11:46 PM, Werner Koch wrote:
> What about
> 
>   le-tweak
> 
> because it is a tweak in the little endian representation.  Yeah, I know
> that it sounds like "let weak" ;-)
> 
> Or
> 
>   le-twist
>   le-highbit
>   le-msb
>   twistle

I feel that "twist" is a bit confusing, because we also use the term
"twist" for curves.  tweak would be better.

I think that it's not specific to little endian.  Last year, when I
tested Curve25519, the key (secret and public) was in big endian
format.


For the cofactor-multiplied secret key, I refer to this site:

    SafeCurves: choosing safe curves for elliptic-curve cryptography
    Twist security
    http://safecurves.cr.yp.to/twist.html

    In the section: Background: small-subgroup attacks,
    it says:

    A protocol designer can protect against this type of attack for
    any curve by specifying n=hs.

Here, h is the cofactor, n is the secret key, and I think that s is
something secret.


For a secret key with MSB=1, it's obvious that it's a defense against
timing attacks.  Since it's so obvious, I don't have a good reference.
Here is an explanation I found on a Q&A site:

    When using Curve25519, why does the private key always have a fixed bit at 2^254?

    crypto.stackexchange.com/questions/11810/when-using-curve25519-why-does-the-private-key-always-have-a-fixed-bit-at-2254

(As I said yesterday, this can be applied to computation with other
curves.)


Yes, the practice of secret key is a tweak against such attacks.
-- 


From cvs at cvs.gnupg.org  Sun Jul 26 16:00:42 2015
From: cvs at cvs.gnupg.org (by Peter Wu)
Date: Sun, 26 Jul 2015 16:00:42 +0200
Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.6.0-237-g46c0726
Message-ID: <E1ZJMMz-0005ZZ-VH@lists.gnupg.org>

This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "The GNU crypto library".

The branch, master has been updated
       via  46c072669eb81ed610cc5b3c0dc0c75a143afbb4 (commit)
      from  ada0a7d302cca97b327faaacac7a5d0b8043df88 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
commit 46c072669eb81ed610cc5b3c0dc0c75a143afbb4
Author: Peter Wu <peter at lekensteyn.nl>
Date:   Sun Jul 26 16:50:33 2015 +0300

    Fix undefined behavior wrt memcpy
    
    * cipher/cipher-gcm.c: Do not copy zero bytes from an empty buffer. Let
    the function continue to add padding as needed though.
    * cipher/mac-poly1305.c: If the caller requested to finish the hash
    function without a copy of the result, return immediately.
    --
    Caught by UndefinedBehaviorSanitizer.
    
    Signed-off-by: Peter Wu <peter at lekensteyn.nl>

diff --git a/cipher/cipher-gcm.c b/cipher/cipher-gcm.c
index 6b13fc5..3711a1d 100644
--- a/cipher/cipher-gcm.c
+++ b/cipher/cipher-gcm.c
@@ -474,7 +474,7 @@ do_ghash_buf(gcry_cipher_hd_t c, byte *hash, const byte *buf,
 
   do
     {
-      if (buflen + unused < blocksize || unused > 0)
+      if (buflen > 0 && (buflen + unused < blocksize || unused > 0))
         {
           n = blocksize - unused;
           n = n < buflen ? n : buflen;
diff --git a/cipher/mac-poly1305.c b/cipher/mac-poly1305.c
index 76b369a..b80f87d 100644
--- a/cipher/mac-poly1305.c
+++ b/cipher/mac-poly1305.c
@@ -260,6 +260,9 @@ poly1305mac_read (gcry_mac_hd_t h, unsigned char *outbuf, size_t *outlen)
       mac_ctx->marks.tag = 1;
     }
 
+  if (*outlen == 0)
+    return 0;
+
   if (*outlen <= POLY1305_TAGLEN)
     buf_cpy (outbuf, mac_ctx->tag, *outlen);
   else

-----------------------------------------------------------------------

Summary of changes:
 cipher/cipher-gcm.c   | 2 +-
 cipher/mac-poly1305.c | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)


hooks/post-receive
-- 
The GNU crypto library
http://git.gnupg.org


_______________________________________________
Gnupg-commits mailing list
Gnupg-commits at gnupg.org
http://lists.gnupg.org/mailman/listinfo/gnupg-commits


From wk at gnupg.org  Mon Jul 27 07:45:25 2015
From: wk at gnupg.org (Werner Koch)
Date: Mon, 27 Jul 2015 07:45:25 +0200
Subject: [git] GCRYPT - branch, master,
 updated. libgcrypt-1.6.0-237-g46c0726
In-Reply-To: <E1ZJMMz-0005ZZ-VH@lists.gnupg.org> (by Peter Wu's message of
 "Sun, 26 Jul 2015 16:00:42 +0200")
References: <E1ZJMMz-0005ZZ-VH@lists.gnupg.org>
Message-ID: <87zj2i887e.fsf@vigenere.g10code.de>

On Sun, 26 Jul 2015 16:00, cvs at cvs.gnupg.org said:

> commit 46c072669eb81ed610cc5b3c0dc0c75a143afbb4

>     Fix undefined behavior wrt memcpy
>     
>     * cipher/cipher-gcm.c: Do not copy zero bytes from an empty buffer. Let
>     the function continue to add padding as needed though.

I think it is a bit surprising that buf_cpy does not behave similar to
memcpy and requires the caller to check that LEN is greater than zero.
Would it be a noticeable loss of speed if buf_cpy would do the test
instead of the caller?


Shalom-Salam,

   Werner


-- 
Die Gedanken sind frei.  Ausnahmen regelt ein Bundesgesetz.


From jussi.kivilinna at iki.fi  Mon Jul 27 10:15:41 2015
From: jussi.kivilinna at iki.fi (Jussi Kivilinna)
Date: Mon, 27 Jul 2015 11:15:41 +0300
Subject: [git] GCRYPT - branch, master,
 updated. libgcrypt-1.6.0-237-g46c0726
In-Reply-To: <87zj2i887e.fsf@vigenere.g10code.de>
References: <E1ZJMMz-0005ZZ-VH@lists.gnupg.org>
 <87zj2i887e.fsf@vigenere.g10code.de>
Message-ID: <55B5E8AD.3010102@iki.fi>

On 27.07.2015 08:45, Werner Koch wrote:
> On Sun, 26 Jul 2015 16:00, cvs at cvs.gnupg.org said:
> 
>> commit 46c072669eb81ed610cc5b3c0dc0c75a143afbb4
> 
>>     Fix undefined behavior wrt memcpy
>>     
>>     * cipher/cipher-gcm.c: Do not copy zero bytes from an empty buffer. Let
>>     the function continue to add padding as needed though.
> 
> I think it is a bit surprising that buf_cpy does not behave similar to
> memcpy and requires the caller to check that LEN is greater than zero.
> Would it be a noticeable loss of speed if buf_cpy would do the test
> instead of the caller?
> 

Regular buf_cpy can handle the srcbuf==NULL && len==0 case (undefined for
memcpy) without problem.

Issue is that on x86, buf_cpy just uses memcpy directly (for faster
code generation) and Peter caught undefined memcpy usage (srcbuf==NULL
&& len==0) with UndefinedBehaviorSanitizer. I guess a buffer length check
could be added to the x86 version of buf_cpy.

-Jussi

> 
> Shalom-Salam,
> 
>    Werner
> 
> 


From jussi.kivilinna at iki.fi  Mon Jul 27 11:04:15 2015
From: jussi.kivilinna at iki.fi (Jussi Kivilinna)
Date: Mon, 27 Jul 2015 12:04:15 +0300
Subject: [PATCH 1/6] Add OCB bulk mode for AES SSSE3 implementation
Message-ID: <20150727090415.18742.12674.stgit@localhost6.localdomain6>

* cipher/rijndael-ssse3-amd64.c (SSSE3_STATE_SIZE): New.
[HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS] (vpaes_ssse3_prepare): Use
'ssse3_state' for storing current SSSE3 state.
[HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS]
(vpaes_ssse3_cleanup): Restore SSSE3 state from 'ssse3_state'.
(_gcry_aes_ssse3_do_setkey, _gcry_aes_ssse3_prepare_decryption)
(_gcry_aes_ssse3_encrypt, _gcry_aes_ssse3_cfb_enc)
(_gcry_aes_ssse3_cbc_enc, _gcry_aes_ssse3_ctr_enc)
(_gcry_aes_ssse3_decrypt, _gcry_aes_ssse3_cfb_dec)
(_gcry_aes_ssse3_cbc_dec): Add 'ssse3_state'
array.
(get_l, ssse3_ocb_enc, ssse3_ocb_dec, _gcry_aes_ssse3_ocb_crypt)
(_gcry_aes_ssse3_ocb_auth): New.
* cipher/rijndael.c (_gcry_aes_ssse3_ocb_crypt)
(_gcry_aes_ssse3_ocb_auth): New.
(_gcry_aes_ocb_crypt, _gcry_aes_ocb_auth) [USE_SSSE3]: Use SSSE3
implementation for OCB.
--

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 cipher/rijndael-ssse3-amd64.c |  305 ++++++++++++++++++++++++++++++++++++++++-
 cipher/rijndael.c             |   19 +++
 2 files changed, 320 insertions(+), 4 deletions(-)

diff --git a/cipher/rijndael-ssse3-amd64.c b/cipher/rijndael-ssse3-amd64.c
index 21438dc..0cdb532 100644
--- a/cipher/rijndael-ssse3-amd64.c
+++ b/cipher/rijndael-ssse3-amd64.c
@@ -45,6 +45,7 @@
 #include "bufhelp.h"
 #include "cipher-selftest.h"
 #include "rijndael-internal.h"
+#include "./cipher-internal.h"
 
 
 #ifdef USE_SSSE3
@@ -62,9 +63,9 @@
   SSE registers are cleared and won't reveal any information about
   the key or the data.  */
 #ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+# define SSSE3_STATE_SIZE (16 * 10)
 /* XMM6-XMM15 are callee-saved registers on WIN64. */
 # define vpaes_ssse3_prepare() \
-    char win64tmp[16 * 10]; \
     asm volatile ("movdqu %%xmm6,  0*16(%0)\n\t" \
                   "movdqu %%xmm7,  1*16(%0)\n\t" \
                   "movdqu %%xmm8,  2*16(%0)\n\t" \
@@ -76,7 +77,7 @@
                   "movdqu %%xmm14, 8*16(%0)\n\t" \
                   "movdqu %%xmm15, 9*16(%0)\n\t" \
                   : \
-                  : "r" (win64tmp) \
+                  : "r" (ssse3_state) \
                   : "memory" )
 # define vpaes_ssse3_cleanup() \
     asm volatile ("pxor	%%xmm0,  %%xmm0 \n\t" \
@@ -96,10 +97,11 @@
                   "movdqu 8*16(%0), %%xmm14 \n\t" \
                   "movdqu 9*16(%0), %%xmm15 \n\t" \
                   : \
-                  : "r" (win64tmp) \
+                  : "r" (ssse3_state) \
                   : "memory" )
 #else
-# define vpaes_ssse3_prepare() /*_*/
+# define SSSE3_STATE_SIZE 1
+# define vpaes_ssse3_prepare() (void)ssse3_state
 # define vpaes_ssse3_cleanup() \
     asm volatile ("pxor	%%xmm0,  %%xmm0 \n\t" \
                   "pxor	%%xmm1,  %%xmm1 \n\t" \
@@ -148,6 +150,7 @@ void
 _gcry_aes_ssse3_do_setkey (RIJNDAEL_context *ctx, const byte *key)
 {
   unsigned int keybits = (ctx->rounds - 10) * 32 + 128;
+  byte ssse3_state[SSSE3_STATE_SIZE];
 
   vpaes_ssse3_prepare();
 
@@ -178,6 +181,7 @@ void
 _gcry_aes_ssse3_prepare_decryption (RIJNDAEL_context *ctx)
 {
   unsigned int keybits = (ctx->rounds - 10) * 32 + 128;
+  byte ssse3_state[SSSE3_STATE_SIZE];
 
   vpaes_ssse3_prepare();
 
@@ -238,6 +242,7 @@ _gcry_aes_ssse3_encrypt (const RIJNDAEL_context *ctx, unsigned char *dst,
 {
   unsigned int nrounds = ctx->rounds;
   const void *aes_const_ptr;
+  byte ssse3_state[SSSE3_STATE_SIZE];
 
   vpaes_ssse3_prepare_enc (aes_const_ptr);
   asm volatile ("movdqu %[src], %%xmm0\n\t"
@@ -261,6 +266,7 @@ _gcry_aes_ssse3_cfb_enc (RIJNDAEL_context *ctx, unsigned char *outbuf,
 {
   unsigned int nrounds = ctx->rounds;
   const void *aes_const_ptr;
+  byte ssse3_state[SSSE3_STATE_SIZE];
 
   vpaes_ssse3_prepare_enc (aes_const_ptr);
 
@@ -300,6 +306,7 @@ _gcry_aes_ssse3_cbc_enc (RIJNDAEL_context *ctx, unsigned char *outbuf,
 {
   unsigned int nrounds = ctx->rounds;
   const void *aes_const_ptr;
+  byte ssse3_state[SSSE3_STATE_SIZE];
 
   vpaes_ssse3_prepare_enc (aes_const_ptr);
 
@@ -347,6 +354,7 @@ _gcry_aes_ssse3_ctr_enc (RIJNDAEL_context *ctx, unsigned char *outbuf,
     { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
   unsigned int nrounds = ctx->rounds;
   const void *aes_const_ptr;
+  byte ssse3_state[SSSE3_STATE_SIZE];
   u64 ctrlow;
 
   vpaes_ssse3_prepare_enc (aes_const_ptr);
@@ -411,6 +419,7 @@ _gcry_aes_ssse3_decrypt (const RIJNDAEL_context *ctx, unsigned char *dst,
 {
   unsigned int nrounds = ctx->rounds;
   const void *aes_const_ptr;
+  byte ssse3_state[SSSE3_STATE_SIZE];
 
   vpaes_ssse3_prepare_dec (aes_const_ptr);
   asm volatile ("movdqu %[src], %%xmm0\n\t"
@@ -434,6 +443,7 @@ _gcry_aes_ssse3_cfb_dec (RIJNDAEL_context *ctx, unsigned char *outbuf,
 {
   unsigned int nrounds = ctx->rounds;
   const void *aes_const_ptr;
+  byte ssse3_state[SSSE3_STATE_SIZE];
 
   vpaes_ssse3_prepare_enc (aes_const_ptr);
 
@@ -474,6 +484,7 @@ _gcry_aes_ssse3_cbc_dec (RIJNDAEL_context *ctx, unsigned char *outbuf,
 {
   unsigned int nrounds = ctx->rounds;
   const void *aes_const_ptr;
+  byte ssse3_state[SSSE3_STATE_SIZE];
 
   vpaes_ssse3_prepare_dec (aes_const_ptr);
 
@@ -516,6 +527,292 @@ _gcry_aes_ssse3_cbc_dec (RIJNDAEL_context *ctx, unsigned char *outbuf,
 }
 
 
+static inline const unsigned char *
+get_l (gcry_cipher_hd_t c, unsigned char *l_tmp, u64 i, unsigned char *iv,
+       unsigned char *ctr, const void **aes_const_ptr,
+       byte ssse3_state[SSSE3_STATE_SIZE], int encrypt)
+{
+  const unsigned char *l;
+  unsigned int ntz;
+
+  if (i & 0xffffffffU)
+    {
+      asm ("rep;bsf %k[low], %k[ntz]\n\t"
+           : [ntz] "=r" (ntz)
+           : [low] "r" (i & 0xffffffffU)
+           : "cc");
+    }
+  else
+    {
+      if (OCB_L_TABLE_SIZE < 32)
+        {
+          ntz = 32;
+        }
+      else if (i)
+        {
+          asm ("rep;bsf %k[high], %k[ntz]\n\t"
+               : [ntz] "=r" (ntz)
+               : [high] "r" (i >> 32)
+               : "cc");
+          ntz += 32;
+        }
+      else
+        {
+          ntz = 64;
+        }
+    }
+
+  if (ntz < OCB_L_TABLE_SIZE)
+    {
+      l = c->u_mode.ocb.L[ntz];
+    }
+  else
+    {
+      /* Store Offset & Checksum before calling external function */
+      asm volatile ("movdqu %%xmm7, %[iv]\n\t"
+                    "movdqu %%xmm6, %[ctr]\n\t"
+                    : [iv] "=m" (*iv),
+                      [ctr] "=m" (*ctr)
+                    :
+                    : "memory" );
+
+      /* Restore SSSE3 state. */
+      vpaes_ssse3_cleanup();
+
+      l = _gcry_cipher_ocb_get_l (c, l_tmp, i);
+
+      /* Save SSSE3 state. */
+      if (encrypt)
+	{
+	  vpaes_ssse3_prepare_enc (*aes_const_ptr);
+	}
+      else
+	{
+	  vpaes_ssse3_prepare_dec (*aes_const_ptr);
+	}
+
+      /* Restore Offset & Checksum */
+      asm volatile ("movdqu %[iv], %%xmm7\n\t"
+                    "movdqu %[ctr], %%xmm6\n\t"
+                    : /* No output */
+                    : [iv] "m" (*iv),
+                      [ctr] "m" (*ctr)
+                    : "memory" );
+    }
+
+  return l;
+}
+
+
+static void
+ssse3_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
+               const void *inbuf_arg, size_t nblocks)
+{
+  union { unsigned char x1[16] ATTR_ALIGNED_16; u32 x32[4]; } l_tmp;
+  RIJNDAEL_context *ctx = (void *)&c->context.c;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+  u64 n = c->u_mode.ocb.data_nblocks;
+  unsigned int nrounds = ctx->rounds;
+  const void *aes_const_ptr;
+  byte ssse3_state[SSSE3_STATE_SIZE];
+
+  vpaes_ssse3_prepare_enc (aes_const_ptr);
+
+  /* Preload Offset and Checksum */
+  asm volatile ("movdqu %[iv], %%xmm7\n\t"
+                "movdqu %[ctr], %%xmm6\n\t"
+                : /* No output */
+                : [iv] "m" (*c->u_iv.iv),
+                  [ctr] "m" (*c->u_ctr.ctr)
+                : "memory" );
+
+  for ( ;nblocks; nblocks-- )
+    {
+      const unsigned char *l;
+
+      l = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr, &aes_const_ptr,
+		ssse3_state, 1);
+
+      /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+      /* Checksum_i = Checksum_{i-1} xor P_i  */
+      /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
+      asm volatile ("movdqu %[l],     %%xmm1\n\t"
+                    "movdqu %[inbuf], %%xmm0\n\t"
+                    "pxor   %%xmm1,   %%xmm7\n\t"
+                    "pxor   %%xmm0,   %%xmm6\n\t"
+                    "pxor   %%xmm7,   %%xmm0\n\t"
+                    :
+                    : [l] "m" (*l),
+                      [inbuf] "m" (*inbuf)
+                    : "memory" );
+
+      do_vpaes_ssse3_enc (ctx, nrounds, aes_const_ptr);
+
+      asm volatile ("pxor   %%xmm7, %%xmm0\n\t"
+                    "movdqu %%xmm0, %[outbuf]\n\t"
+                    : [outbuf] "=m" (*outbuf)
+                    :
+                    : "memory" );
+
+      inbuf += BLOCKSIZE;
+      outbuf += BLOCKSIZE;
+    }
+
+  c->u_mode.ocb.data_nblocks = n;
+  asm volatile ("movdqu %%xmm7, %[iv]\n\t"
+                "movdqu %%xmm6, %[ctr]\n\t"
+                : [iv] "=m" (*c->u_iv.iv),
+                  [ctr] "=m" (*c->u_ctr.ctr)
+                :
+                : "memory" );
+
+  wipememory(&l_tmp, sizeof(l_tmp));
+  vpaes_ssse3_cleanup ();
+}
+
+static void
+ssse3_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
+               const void *inbuf_arg, size_t nblocks)
+{
+  union { unsigned char x1[16] ATTR_ALIGNED_16; u32 x32[4]; } l_tmp;
+  RIJNDAEL_context *ctx = (void *)&c->context.c;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+  u64 n = c->u_mode.ocb.data_nblocks;
+  unsigned int nrounds = ctx->rounds;
+  const void *aes_const_ptr;
+  byte ssse3_state[SSSE3_STATE_SIZE];
+
+  vpaes_ssse3_prepare_dec (aes_const_ptr);
+
+  /* Preload Offset and Checksum */
+  asm volatile ("movdqu %[iv], %%xmm7\n\t"
+                "movdqu %[ctr], %%xmm6\n\t"
+                : /* No output */
+                : [iv] "m" (*c->u_iv.iv),
+                  [ctr] "m" (*c->u_ctr.ctr)
+                : "memory" );
+
+  for ( ;nblocks; nblocks-- )
+    {
+      const unsigned char *l;
+
+      l = get_l(c, l_tmp.x1, ++n, c->u_iv.iv, c->u_ctr.ctr, &aes_const_ptr,
+		ssse3_state, 0);
+
+      /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+      /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i)  */
+      /* Checksum_i = Checksum_{i-1} xor P_i  */
+      asm volatile ("movdqu %[l],     %%xmm1\n\t"
+                    "movdqu %[inbuf], %%xmm0\n\t"
+                    "pxor   %%xmm1,   %%xmm7\n\t"
+                    "pxor   %%xmm7,   %%xmm0\n\t"
+                    :
+                    : [l] "m" (*l),
+                      [inbuf] "m" (*inbuf)
+                    : "memory" );
+
+      do_vpaes_ssse3_dec (ctx, nrounds, aes_const_ptr);
+
+      asm volatile ("pxor   %%xmm7, %%xmm0\n\t"
+                    "pxor   %%xmm0, %%xmm6\n\t"
+                    "movdqu %%xmm0, %[outbuf]\n\t"
+                    : [outbuf] "=m" (*outbuf)
+                    :
+                    : "memory" );
+
+      inbuf += BLOCKSIZE;
+      outbuf += BLOCKSIZE;
+    }
+
+  c->u_mode.ocb.data_nblocks = n;
+  asm volatile ("movdqu %%xmm7, %[iv]\n\t"
+                "movdqu %%xmm6, %[ctr]\n\t"
+                : [iv] "=m" (*c->u_iv.iv),
+                  [ctr] "=m" (*c->u_ctr.ctr)
+                :
+                : "memory" );
+
+  wipememory(&l_tmp, sizeof(l_tmp));
+  vpaes_ssse3_cleanup ();
+}
+
+
+void
+_gcry_aes_ssse3_ocb_crypt(gcry_cipher_hd_t c, void *outbuf_arg,
+                          const void *inbuf_arg, size_t nblocks, int encrypt)
+{
+  if (encrypt)
+    ssse3_ocb_enc(c, outbuf_arg, inbuf_arg, nblocks);
+  else
+    ssse3_ocb_dec(c, outbuf_arg, inbuf_arg, nblocks);
+}
+
+
+void
+_gcry_aes_ssse3_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
+                          size_t nblocks)
+{
+  union { unsigned char x1[16] ATTR_ALIGNED_16; u32 x32[4]; } l_tmp;
+  RIJNDAEL_context *ctx = (void *)&c->context.c;
+  const unsigned char *abuf = abuf_arg;
+  u64 n = c->u_mode.ocb.aad_nblocks;
+  unsigned int nrounds = ctx->rounds;
+  const void *aes_const_ptr;
+  byte ssse3_state[SSSE3_STATE_SIZE];
+
+  vpaes_ssse3_prepare_enc (aes_const_ptr);
+
+  /* Preload Offset and Sum */
+  asm volatile ("movdqu %[iv], %%xmm7\n\t"
+                "movdqu %[ctr], %%xmm6\n\t"
+                : /* No output */
+                : [iv] "m" (*c->u_mode.ocb.aad_offset),
+                  [ctr] "m" (*c->u_mode.ocb.aad_sum)
+                : "memory" );
+
+  for ( ;nblocks; nblocks-- )
+    {
+      const unsigned char *l;
+
+      l = get_l(c, l_tmp.x1, ++n, c->u_mode.ocb.aad_offset,
+                c->u_mode.ocb.aad_sum, &aes_const_ptr, ssse3_state, 1);
+
+      /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+      /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i)  */
+      asm volatile ("movdqu %[l],     %%xmm1\n\t"
+                    "movdqu %[abuf],  %%xmm0\n\t"
+                    "pxor   %%xmm1,   %%xmm7\n\t"
+                    "pxor   %%xmm7,   %%xmm0\n\t"
+                    :
+                    : [l] "m" (*l),
+                      [abuf] "m" (*abuf)
+                    : "memory" );
+
+      do_vpaes_ssse3_enc (ctx, nrounds, aes_const_ptr);
+
+      asm volatile ("pxor   %%xmm0,   %%xmm6\n\t"
+                    :
+                    :
+                    : "memory" );
+
+      abuf += BLOCKSIZE;
+    }
+
+  c->u_mode.ocb.aad_nblocks = n;
+  asm volatile ("movdqu %%xmm7, %[iv]\n\t"
+                "movdqu %%xmm6, %[ctr]\n\t"
+                : [iv] "=m" (*c->u_mode.ocb.aad_offset),
+                  [ctr] "=m" (*c->u_mode.ocb.aad_sum)
+                :
+                : "memory" );
+
+  wipememory(&l_tmp, sizeof(l_tmp));
+  vpaes_ssse3_cleanup ();
+}
+
+
 #ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
 # define X(...)
 #else
diff --git a/cipher/rijndael.c b/cipher/rijndael.c
index 4f063c4..1fe16d6 100644
--- a/cipher/rijndael.c
+++ b/cipher/rijndael.c
@@ -137,6 +137,11 @@ extern void _gcry_aes_ssse3_cbc_dec (RIJNDAEL_context *ctx,
                                      unsigned char *outbuf,
                                      const unsigned char *inbuf,
                                      unsigned char *iv, size_t nblocks);
+extern void _gcry_aes_ssse3_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
+                                       const void *inbuf_arg, size_t nblocks,
+                                       int encrypt);
+extern void _gcry_aes_ssse3_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
+                                      size_t nblocks);
 #endif
 
 #ifdef USE_PADLOCK
@@ -1226,6 +1231,13 @@ _gcry_aes_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
       burn_depth = 0;
     }
 #endif /*USE_AESNI*/
+#ifdef USE_SSSE3
+  else if (ctx->use_ssse3)
+    {
+      _gcry_aes_ssse3_ocb_crypt (c, outbuf, inbuf, nblocks, encrypt);
+      burn_depth = 0;
+    }
+#endif /*USE_SSSE3*/
   else if (encrypt)
     {
       union { unsigned char x1[16] ATTR_ALIGNED_16; u32 x32[4]; } l_tmp;
@@ -1314,6 +1326,13 @@ _gcry_aes_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks)
       burn_depth = 0;
     }
 #endif /*USE_AESNI*/
+#ifdef USE_SSSE3
+  else if (ctx->use_ssse3)
+    {
+      _gcry_aes_ssse3_ocb_auth (c, abuf, nblocks);
+      burn_depth = 0;
+    }
+#endif /*USE_SSSE3*/
   else
     {
       union { unsigned char x1[16] ATTR_ALIGNED_16; u32 x32[4]; } l_tmp;


From jussi.kivilinna at iki.fi  Mon Jul 27 11:04:25 2015
From: jussi.kivilinna at iki.fi (Jussi Kivilinna)
Date: Mon, 27 Jul 2015 12:04:25 +0300
Subject: [PATCH 3/6] Add bulk OCB for Twofish AMD64 implementation
In-Reply-To: <20150727090415.18742.12674.stgit@localhost6.localdomain6>
References: <20150727090415.18742.12674.stgit@localhost6.localdomain6>
Message-ID: <20150727090425.18742.7458.stgit@localhost6.localdomain6>

* cipher/cipher.c (_gcry_cipher_open_internal): Setup OCB bulk
functions for Twofish.
* cipher/twofish-amd64.S: Add OCB assembly functions.
* cipher/twofish.c (_gcry_twofish_amd64_ocb_enc)
(_gcry_twofish_amd64_ocb_dec, _gcry_twofish_amd64_ocb_auth): New
prototypes.
(call_sysv_fn5, call_sysv_fn6, twofish_amd64_ocb_enc)
(twofish_amd64_ocb_dec, twofish_amd64_ocb_auth, get_l)
(_gcry_twofish_ocb_crypt, _gcry_twofish_ocb_auth): New.
* src/cipher.h (_gcry_twofish_ocb_crypt)
(_gcry_twofish_ocb_auth): New.
* tests/basic.c (check_ocb_cipher): Add test-vector for Twofish.
--

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 cipher/cipher.c        |    2 
 cipher/twofish-amd64.S |  310 ++++++++++++++++++++++++++++++++++++++++++++++++
 cipher/twofish.c       |  259 ++++++++++++++++++++++++++++++++++++++++
 src/cipher.h           |    5 +
 tests/basic.c          |   20 ++-
 5 files changed, 588 insertions(+), 8 deletions(-)

diff --git a/cipher/cipher.c b/cipher/cipher.c
index 2d2b0ad..8483c5f 100644
--- a/cipher/cipher.c
+++ b/cipher/cipher.c
@@ -561,6 +561,8 @@ _gcry_cipher_open_internal (gcry_cipher_hd_t *handle,
               h->bulk.cbc_dec = _gcry_twofish_cbc_dec;
               h->bulk.cfb_dec = _gcry_twofish_cfb_dec;
               h->bulk.ctr_enc = _gcry_twofish_ctr_enc;
+              h->bulk.ocb_crypt = _gcry_twofish_ocb_crypt;
+              h->bulk.ocb_auth  = _gcry_twofish_ocb_auth;
               break;
 #endif /*USE_TWOFISH*/
 
diff --git a/cipher/twofish-amd64.S b/cipher/twofish-amd64.S
index ea88b94..aa964e0 100644
--- a/cipher/twofish-amd64.S
+++ b/cipher/twofish-amd64.S
@@ -1,6 +1,6 @@
 /* twofish-amd64.S  -  AMD64 assembly implementation of Twofish cipher
  *
- * Copyright (C) 2013 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ * Copyright (C) 2013-2015 Jussi Kivilinna <jussi.kivilinna at iki.fi>
  *
  * This file is part of Libgcrypt.
  *
@@ -734,5 +734,313 @@ _gcry_twofish_amd64_cfb_dec:
 	ret;
 ELF(.size _gcry_twofish_amd64_cfb_dec,.-_gcry_twofish_amd64_cfb_dec;)
 
+.align 8
+.globl _gcry_twofish_amd64_ocb_enc
+ELF(.type   _gcry_twofish_amd64_ocb_enc,@function;)
+_gcry_twofish_amd64_ocb_enc:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (3 blocks)
+	 *	%rdx: src (3 blocks)
+	 *	%rcx: offset
+	 *	%r8 : checksum
+	 *	%r9 : L pointers (void *L[3])
+	 */
+	subq $(8 * 8), %rsp;
+	movq %rbp, (0 * 8)(%rsp);
+	movq %rbx, (1 * 8)(%rsp);
+	movq %r12, (2 * 8)(%rsp);
+	movq %r13, (3 * 8)(%rsp);
+	movq %r14, (4 * 8)(%rsp);
+	movq %r15, (5 * 8)(%rsp);
+
+	movq %rsi, (6 * 8)(%rsp);
+	movq %rdx, RX0;
+	movq %rcx, RX1;
+	movq %r8, RX2;
+	movq %r9, RY0;
+	movq %rsi, RY1;
+
+	/* Load offset */
+	movq (0 * 8)(RX1), RT0;
+	movq (1 * 8)(RX1), RT1;
+
+	/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	movq (RY0), RY2;
+	xorq (0 * 8)(RY2), RT0;
+	xorq (1 * 8)(RY2), RT1;
+	movq (0 * 8)(RX0), RAB0;
+	movq (1 * 8)(RX0), RCD0;
+	/* Store Offset_i */
+	movq RT0, (0 * 8)(RY1);
+	movq RT1, (1 * 8)(RY1);
+	/* Checksum_i = Checksum_{i-1} xor P_i  */
+	xor RAB0, (0 * 8)(RX2);
+	xor RCD0, (1 * 8)(RX2);
+	/* PX_i = P_i xor Offset_i */
+	xorq RT0, RAB0;
+	xorq RT1, RCD0;
+
+	/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	movq 8(RY0), RY2;
+	xorq (0 * 8)(RY2), RT0;
+	xorq (1 * 8)(RY2), RT1;
+	movq (2 * 8)(RX0), RAB1;
+	movq (3 * 8)(RX0), RCD1;
+	/* Store Offset_i */
+	movq RT0, (2 * 8)(RY1);
+	movq RT1, (3 * 8)(RY1);
+	/* Checksum_i = Checksum_{i-1} xor P_i  */
+	xor RAB1, (0 * 8)(RX2);
+	xor RCD1, (1 * 8)(RX2);
+	/* PX_i = P_i xor Offset_i */
+	xorq RT0, RAB1;
+	xorq RT1, RCD1;
+
+	/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	movq 16(RY0), RY2;
+	xorq (0 * 8)(RY2), RT0;
+	xorq (1 * 8)(RY2), RT1;
+	movq (4 * 8)(RX0), RAB2;
+	movq (5 * 8)(RX0), RCD2;
+	/* Store Offset_i */
+	movq RT0, (4 * 8)(RY1);
+	movq RT1, (5 * 8)(RY1);
+	/* Checksum_i = Checksum_{i-1} xor P_i  */
+	xor RAB2, (0 * 8)(RX2);
+	xor RCD2, (1 * 8)(RX2);
+	/* PX_i = P_i xor Offset_i */
+	xorq RT0, RAB2;
+	xorq RT1, RCD2;
+
+	/* Store offset */
+	movq RT0, (0 * 8)(RX1);
+	movq RT1, (1 * 8)(RX1);
+
+	/* CX_i = ENCIPHER(K, PX_i)  */
+	call __twofish_enc_blk3;
+
+	movq (6 * 8)(%rsp), RX1; /*dst*/
+
+	/* C_i = CX_i xor Offset_i  */
+	xorq RCD0, (0 * 8)(RX1);
+	xorq RAB0, (1 * 8)(RX1);
+	xorq RCD1, (2 * 8)(RX1);
+	xorq RAB1, (3 * 8)(RX1);
+	xorq RCD2, (4 * 8)(RX1);
+	xorq RAB2, (5 * 8)(RX1);
+
+	movq (0 * 8)(%rsp), %rbp;
+	movq (1 * 8)(%rsp), %rbx;
+	movq (2 * 8)(%rsp), %r12;
+	movq (3 * 8)(%rsp), %r13;
+	movq (4 * 8)(%rsp), %r14;
+	movq (5 * 8)(%rsp), %r15;
+	addq $(8 * 8), %rsp;
+
+	ret;
+ELF(.size _gcry_twofish_amd64_ocb_enc,.-_gcry_twofish_amd64_ocb_enc;)
+
+.align 8
+.globl _gcry_twofish_amd64_ocb_dec
+ELF(.type   _gcry_twofish_amd64_ocb_dec,@function;)
+_gcry_twofish_amd64_ocb_dec:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (3 blocks)
+	 *	%rdx: src (3 blocks)
+	 *	%rcx: offset
+	 *	%r8 : checksum
+	 *	%r9 : L pointers (void *L[3])
+	 */
+	subq $(8 * 8), %rsp;
+	movq %rbp, (0 * 8)(%rsp);
+	movq %rbx, (1 * 8)(%rsp);
+	movq %r12, (2 * 8)(%rsp);
+	movq %r13, (3 * 8)(%rsp);
+	movq %r14, (4 * 8)(%rsp);
+	movq %r15, (5 * 8)(%rsp);
+
+	movq %rsi, (6 * 8)(%rsp);
+	movq %r8,  (7 * 8)(%rsp);
+	movq %rdx, RX0;
+	movq %rcx, RX1;
+	movq %r9, RY0;
+	movq %rsi, RY1;
+
+	/* Load offset */
+	movq (0 * 8)(RX1), RT0;
+	movq (1 * 8)(RX1), RT1;
+
+	/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	movq (RY0), RY2;
+	xorq (0 * 8)(RY2), RT0;
+	xorq (1 * 8)(RY2), RT1;
+	movq (0 * 8)(RX0), RAB0;
+	movq (1 * 8)(RX0), RCD0;
+	/* Store Offset_i */
+	movq RT0, (0 * 8)(RY1);
+	movq RT1, (1 * 8)(RY1);
+	/* CX_i = C_i xor Offset_i */
+	xorq RT0, RAB0;
+	xorq RT1, RCD0;
+
+	/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	movq 8(RY0), RY2;
+	xorq (0 * 8)(RY2), RT0;
+	xorq (1 * 8)(RY2), RT1;
+	movq (2 * 8)(RX0), RAB1;
+	movq (3 * 8)(RX0), RCD1;
+	/* Store Offset_i */
+	movq RT0, (2 * 8)(RY1);
+	movq RT1, (3 * 8)(RY1);
+	/* CX_i = C_i xor Offset_i */
+	xorq RT0, RAB1;
+	xorq RT1, RCD1;
+
+	/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	movq 16(RY0), RY2;
+	xorq (0 * 8)(RY2), RT0;
+	xorq (1 * 8)(RY2), RT1;
+	movq (4 * 8)(RX0), RAB2;
+	movq (5 * 8)(RX0), RCD2;
+	/* Store Offset_i */
+	movq RT0, (4 * 8)(RY1);
+	movq RT1, (5 * 8)(RY1);
+	/* CX_i = C_i xor Offset_i */
+	xorq RT0, RAB2;
+	xorq RT1, RCD2;
+
+	/* Store offset */
+	movq RT0, (0 * 8)(RX1);
+	movq RT1, (1 * 8)(RX1);
+
+	/* PX_i = DECIPHER(K, CX_i)  */
+	call __twofish_dec_blk3;
+
+	movq (7 * 8)(%rsp), RX2; /*checksum*/
+	movq (6 * 8)(%rsp), RX1; /*dst*/
+
+	/* Load checksum */
+	movq (0 * 8)(RX2), RT0;
+	movq (1 * 8)(RX2), RT1;
+
+	/* P_i = PX_i xor Offset_i  */
+	xorq RCD0, (0 * 8)(RX1);
+	xorq RAB0, (1 * 8)(RX1);
+	xorq RCD1, (2 * 8)(RX1);
+	xorq RAB1, (3 * 8)(RX1);
+	xorq RCD2, (4 * 8)(RX1);
+	xorq RAB2, (5 * 8)(RX1);
+
+	/* Checksum_i = Checksum_{i-1} xor P_i  */
+	xorq (0 * 8)(RX1), RT0;
+	xorq (1 * 8)(RX1), RT1;
+	xorq (2 * 8)(RX1), RT0;
+	xorq (3 * 8)(RX1), RT1;
+	xorq (4 * 8)(RX1), RT0;
+	xorq (5 * 8)(RX1), RT1;
+
+	/* Store checksum */
+	movq RT0, (0 * 8)(RX2);
+	movq RT1, (1 * 8)(RX2);
+
+	movq (0 * 8)(%rsp), %rbp;
+	movq (1 * 8)(%rsp), %rbx;
+	movq (2 * 8)(%rsp), %r12;
+	movq (3 * 8)(%rsp), %r13;
+	movq (4 * 8)(%rsp), %r14;
+	movq (5 * 8)(%rsp), %r15;
+	addq $(8 * 8), %rsp;
+
+	ret;
+ELF(.size _gcry_twofish_amd64_ocb_dec,.-_gcry_twofish_amd64_ocb_dec;)
+
+.align 8
+.globl _gcry_twofish_amd64_ocb_auth
+ELF(.type   _gcry_twofish_amd64_ocb_auth,@function;)
+_gcry_twofish_amd64_ocb_auth:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: abuf (3 blocks)
+	 *	%rdx: offset
+	 *	%rcx: checksum
+	 *	%r8 : L pointers (void *L[3])
+	 */
+	subq $(8 * 8), %rsp;
+	movq %rbp, (0 * 8)(%rsp);
+	movq %rbx, (1 * 8)(%rsp);
+	movq %r12, (2 * 8)(%rsp);
+	movq %r13, (3 * 8)(%rsp);
+	movq %r14, (4 * 8)(%rsp);
+	movq %r15, (5 * 8)(%rsp);
+
+	movq %rcx, (6 * 8)(%rsp);
+	movq %rsi, RX0;
+	movq %rdx, RX1;
+	movq %r8, RY0;
+
+	/* Load offset */
+	movq (0 * 8)(RX1), RT0;
+	movq (1 * 8)(RX1), RT1;
+
+	/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	movq (RY0), RY2;
+	xorq (0 * 8)(RY2), RT0;
+	xorq (1 * 8)(RY2), RT1;
+	movq (0 * 8)(RX0), RAB0;
+	movq (1 * 8)(RX0), RCD0;
+	/* PX_i = P_i xor Offset_i */
+	xorq RT0, RAB0;
+	xorq RT1, RCD0;
+
+	/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	movq 8(RY0), RY2;
+	xorq (0 * 8)(RY2), RT0;
+	xorq (1 * 8)(RY2), RT1;
+	movq (2 * 8)(RX0), RAB1;
+	movq (3 * 8)(RX0), RCD1;
+	/* PX_i = P_i xor Offset_i */
+	xorq RT0, RAB1;
+	xorq RT1, RCD1;
+
+	/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	movq 16(RY0), RY2;
+	xorq (0 * 8)(RY2), RT0;
+	xorq (1 * 8)(RY2), RT1;
+	movq (4 * 8)(RX0), RAB2;
+	movq (5 * 8)(RX0), RCD2;
+	/* PX_i = P_i xor Offset_i */
+	xorq RT0, RAB2;
+	xorq RT1, RCD2;
+
+	/* Store offset */
+	movq RT0, (0 * 8)(RX1);
+	movq RT1, (1 * 8)(RX1);
+
+	/* C_i = ENCIPHER(K, PX_i)  */
+	call __twofish_enc_blk3;
+
+	movq (6 * 8)(%rsp), RX1; /*checksum*/
+
+	/* Checksum_i = C_i xor Checksum_i  */
+	xorq RCD0, RCD1;
+	xorq RAB0, RAB1;
+	xorq RCD1, RCD2;
+	xorq RAB1, RAB2;
+	xorq RCD2, (0 * 8)(RX1);
+	xorq RAB2, (1 * 8)(RX1);
+
+	movq (0 * 8)(%rsp), %rbp;
+	movq (1 * 8)(%rsp), %rbx;
+	movq (2 * 8)(%rsp), %r12;
+	movq (3 * 8)(%rsp), %r13;
+	movq (4 * 8)(%rsp), %r14;
+	movq (5 * 8)(%rsp), %r15;
+	addq $(8 * 8), %rsp;
+
+	ret;
+ELF(.size _gcry_twofish_amd64_ocb_auth,.-_gcry_twofish_amd64_ocb_auth;)
+
 #endif /*USE_TWOFISH*/
 #endif /*__x86_64*/
diff --git a/cipher/twofish.c b/cipher/twofish.c
index ce83fad..9b9c35f 100644
--- a/cipher/twofish.c
+++ b/cipher/twofish.c
@@ -45,6 +45,7 @@
 #include "g10lib.h"
 #include "cipher.h"
 #include "bufhelp.h"
+#include "cipher-internal.h"
 #include "cipher-selftest.h"
 
 
@@ -755,6 +756,18 @@ extern void _gcry_twofish_amd64_cbc_dec(const TWOFISH_context *c, byte *out,
 extern void _gcry_twofish_amd64_cfb_dec(const TWOFISH_context *c, byte *out,
 					const byte *in, byte *iv);
 
+extern void _gcry_twofish_amd64_ocb_enc(const TWOFISH_context *ctx, byte *out,
+					const byte *in, byte *offset,
+					byte *checksum, const void *Ls[3]);
+
+extern void _gcry_twofish_amd64_ocb_dec(const TWOFISH_context *ctx, byte *out,
+					const byte *in, byte *offset,
+					byte *checksum, const void *Ls[3]);
+
+extern void _gcry_twofish_amd64_ocb_auth(const TWOFISH_context *ctx,
+					 const byte *abuf, byte *offset,
+					 byte *checksum, const void *Ls[3]);
+
 #ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
 static inline void
 call_sysv_fn (const void *fn, const void *arg1, const void *arg2,
@@ -771,6 +784,43 @@ call_sysv_fn (const void *fn, const void *arg1, const void *arg2,
                 :
                 : "cc", "memory", "r8", "r9", "r10", "r11");
 }
+
+static inline void
+call_sysv_fn5 (const void *fn, const void *arg1, const void *arg2,
+               const void *arg3, const void *arg4, const void *arg5)
+{
+  /* Call SystemV ABI function without storing non-volatile XMM registers,
+   * as target function does not use vector instruction sets. */
+  asm volatile ("movq %[arg5], %%r8\n\t"
+		"callq *%0\n\t"
+		: "+a" (fn),
+		  "+D" (arg1),
+		  "+S" (arg2),
+		  "+d" (arg3),
+		  "+c" (arg4)
+		: [arg5] "g" (arg5)
+		: "cc", "memory", "r8", "r9", "r10", "r11");
+}
+
+static inline void
+call_sysv_fn6 (const void *fn, const void *arg1, const void *arg2,
+               const void *arg3, const void *arg4, const void *arg5,
+	       const void *arg6)
+{
+  /* Call SystemV ABI function without storing non-volatile XMM registers,
+   * as target function does not use vector instruction sets. */
+  asm volatile ("movq %[arg5], %%r8\n\t"
+		"movq %[arg6], %%r9\n\t"
+		"callq *%0\n\t"
+		: "+a" (fn),
+		  "+D" (arg1),
+		  "+S" (arg2),
+		  "+d" (arg3),
+		  "+c" (arg4)
+		: [arg5] "g" (arg5),
+		  [arg6] "g" (arg6)
+		: "cc", "memory", "r8", "r9", "r10", "r11");
+}
 #endif
 
 static inline void
@@ -826,6 +876,39 @@ twofish_amd64_cfb_dec(const TWOFISH_context *c, byte *out, const byte *in,
 #endif
 }
 
+static inline void
+twofish_amd64_ocb_enc(const TWOFISH_context *ctx, byte *out, const byte *in,
+		      byte *offset, byte *checksum, const void *Ls[3])
+{
+#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+  call_sysv_fn6(_gcry_twofish_amd64_ocb_enc, ctx, out, in, offset, checksum, Ls);
+#else
+  _gcry_twofish_amd64_ocb_enc(ctx, out, in, offset, checksum, Ls);
+#endif
+}
+
+static inline void
+twofish_amd64_ocb_dec(const TWOFISH_context *ctx, byte *out, const byte *in,
+		      byte *offset, byte *checksum, const void *Ls[3])
+{
+#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+  call_sysv_fn6(_gcry_twofish_amd64_ocb_dec, ctx, out, in, offset, checksum, Ls);
+#else
+  _gcry_twofish_amd64_ocb_dec(ctx, out, in, offset, checksum, Ls);
+#endif
+}
+
+static inline void
+twofish_amd64_ocb_auth(const TWOFISH_context *ctx, const byte *abuf,
+		       byte *offset, byte *checksum, const void *Ls[3])
+{
+#ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+  call_sysv_fn5(_gcry_twofish_amd64_ocb_auth, ctx, abuf, offset, checksum, Ls);
+#else
+  _gcry_twofish_amd64_ocb_auth(ctx, abuf, offset, checksum, Ls);
+#endif
+}
+
 #elif defined(USE_ARM_ASM)
 
 /* Assembly implementations of Twofish. */
@@ -1188,6 +1271,182 @@ _gcry_twofish_cfb_dec(void *context, unsigned char *iv, void *outbuf_arg,
   _gcry_burn_stack(burn_stack_depth);
 }
 
+static inline const unsigned char *
+get_l (gcry_cipher_hd_t c, unsigned char *l_tmp, u64 i)
+{
+  unsigned int ntz = _gcry_ctz64 (i);
+
+  if (ntz < OCB_L_TABLE_SIZE)
+      return c->u_mode.ocb.L[ntz];
+  else
+      return _gcry_cipher_ocb_get_l (c, l_tmp, i);
+}
+
+/* Bulk encryption/decryption of complete blocks in OCB mode. */
+void
+_gcry_twofish_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
+			const void *inbuf_arg, size_t nblocks, int encrypt)
+{
+  TWOFISH_context *ctx = (void *)&c->context.c;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+  unsigned char l_tmp[TWOFISH_BLOCKSIZE];
+  const unsigned char *l;
+  unsigned int burn, burn_stack_depth = 0;
+  u64 blkn = c->u_mode.ocb.data_nblocks;
+
+#ifdef USE_AMD64_ASM
+  {
+    const void *Ls[3];
+
+    /* Process data in 3 block chunks. */
+    while (nblocks >= 3)
+      {
+	/* l_tmp will be used only every 65536-th block. */
+	Ls[0] = get_l(c, l_tmp, blkn + 1);
+	Ls[1] = get_l(c, l_tmp, blkn + 2);
+	Ls[2] = get_l(c, l_tmp, blkn + 3);
+	blkn += 3;
+
+	if (encrypt)
+	  twofish_amd64_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv, c->u_ctr.ctr,
+				Ls);
+	else
+	  twofish_amd64_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv, c->u_ctr.ctr,
+				Ls);
+
+	nblocks -= 3;
+	outbuf += 3 * TWOFISH_BLOCKSIZE;
+	inbuf  += 3 * TWOFISH_BLOCKSIZE;
+
+	burn = 8 * sizeof(void*);
+	if (burn > burn_stack_depth)
+	  burn_stack_depth = burn;
+      }
+
+    /* Use generic code to handle smaller chunks... */
+  }
+#endif
+
+  if (encrypt)
+    {
+      for (; nblocks; nblocks--)
+	{
+	  l = get_l(c, l_tmp, ++blkn);
+
+	  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	  buf_xor_1 (c->u_iv.iv, l, TWOFISH_BLOCKSIZE);
+	  buf_cpy (l_tmp, inbuf, TWOFISH_BLOCKSIZE);
+	  /* Checksum_i = Checksum_{i-1} xor P_i  */
+	  buf_xor_1 (c->u_ctr.ctr, l_tmp, TWOFISH_BLOCKSIZE);
+	  /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
+	  buf_xor_1 (l_tmp, c->u_iv.iv, TWOFISH_BLOCKSIZE);
+	  burn = twofish_encrypt(ctx, l_tmp, l_tmp);
+	  if (burn > burn_stack_depth)
+	    burn_stack_depth = burn;
+	  buf_xor_1 (l_tmp, c->u_iv.iv, TWOFISH_BLOCKSIZE);
+	  buf_cpy (outbuf, l_tmp, TWOFISH_BLOCKSIZE);
+
+	  inbuf += TWOFISH_BLOCKSIZE;
+	  outbuf += TWOFISH_BLOCKSIZE;
+	}
+    }
+  else
+    {
+      for (; nblocks; nblocks--)
+	{
+	  l = get_l(c, l_tmp, ++blkn);
+
+	  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	  buf_xor_1 (c->u_iv.iv, l, TWOFISH_BLOCKSIZE);
+	  buf_cpy (l_tmp, inbuf, TWOFISH_BLOCKSIZE);
+	  /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
+	  buf_xor_1 (l_tmp, c->u_iv.iv, TWOFISH_BLOCKSIZE);
+	  burn = twofish_decrypt(ctx, l_tmp, l_tmp);
+	  if (burn > burn_stack_depth)
+	    burn_stack_depth = burn;
+	  buf_xor_1 (l_tmp, c->u_iv.iv, TWOFISH_BLOCKSIZE);
+	  /* Checksum_i = Checksum_{i-1} xor P_i  */
+	  buf_xor_1 (c->u_ctr.ctr, l_tmp, TWOFISH_BLOCKSIZE);
+	  buf_cpy (outbuf, l_tmp, TWOFISH_BLOCKSIZE);
+
+	  inbuf += TWOFISH_BLOCKSIZE;
+	  outbuf += TWOFISH_BLOCKSIZE;
+	}
+    }
+
+  c->u_mode.ocb.data_nblocks = blkn;
+
+  wipememory(&l_tmp, sizeof(l_tmp));
+
+  if (burn_stack_depth)
+    _gcry_burn_stack (burn_stack_depth + 4 * sizeof(void *));
+}
+
+/* Bulk authentication of complete blocks in OCB mode. */
+void
+_gcry_twofish_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
+			size_t nblocks)
+{
+  TWOFISH_context *ctx = (void *)&c->context.c;
+  const unsigned char *abuf = abuf_arg;
+  unsigned char l_tmp[TWOFISH_BLOCKSIZE];
+  const unsigned char *l;
+  unsigned int burn, burn_stack_depth = 0;
+  u64 blkn = c->u_mode.ocb.aad_nblocks;
+
+#ifdef USE_AMD64_ASM
+  {
+    const void *Ls[3];
+
+    /* Process data in 3 block chunks. */
+    while (nblocks >= 3)
+      {
+	/* l_tmp will be used only every 65536-th block. */
+	Ls[0] = get_l(c, l_tmp, blkn + 1);
+	Ls[1] = get_l(c, l_tmp, blkn + 2);
+	Ls[2] = get_l(c, l_tmp, blkn + 3);
+	blkn += 3;
+
+	twofish_amd64_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
+			      c->u_mode.ocb.aad_sum, Ls);
+
+	nblocks -= 3;
+	abuf += 3 * TWOFISH_BLOCKSIZE;
+
+	burn = 8 * sizeof(void*);
+	if (burn > burn_stack_depth)
+	  burn_stack_depth = burn;
+      }
+
+    /* Use generic code to handle smaller chunks... */
+  }
+#endif
+
+  for (; nblocks; nblocks--)
+    {
+      l = get_l(c, l_tmp, ++blkn);
+
+      /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+      buf_xor_1 (c->u_mode.ocb.aad_offset, l, TWOFISH_BLOCKSIZE);
+      /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i)  */
+      buf_xor (l_tmp, c->u_mode.ocb.aad_offset, abuf, TWOFISH_BLOCKSIZE);
+      burn = twofish_encrypt(ctx, l_tmp, l_tmp);
+      if (burn > burn_stack_depth)
+	burn_stack_depth = burn;
+      buf_xor_1 (c->u_mode.ocb.aad_sum, l_tmp, TWOFISH_BLOCKSIZE);
+
+      abuf += TWOFISH_BLOCKSIZE;
+    }
+
+  c->u_mode.ocb.aad_nblocks = blkn;
+
+  wipememory(&l_tmp, sizeof(l_tmp));
+
+  if (burn_stack_depth)
+    _gcry_burn_stack (burn_stack_depth + 4 * sizeof(void *));
+}
+
 

 /* Run the self-tests for TWOFISH-CTR, tests IV increment of bulk CTR
diff --git a/src/cipher.h b/src/cipher.h
index a0aac51..1a66f6d 100644
--- a/src/cipher.h
+++ b/src/cipher.h
@@ -217,6 +217,11 @@ void _gcry_twofish_cbc_dec (void *context, unsigned char *iv,
 void _gcry_twofish_cfb_dec (void *context, unsigned char *iv,
                             void *outbuf_arg, const void *inbuf_arg,
                             size_t nblocks);
+void _gcry_twofish_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
+			      const void *inbuf_arg, size_t nblocks,
+			      int encrypt);
+void _gcry_twofish_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
+			     size_t nblocks);
 
 /*-- dsa.c --*/
 void _gcry_register_pk_dsa_progress (gcry_handler_progress_t cbc, void *cb_data);
diff --git a/tests/basic.c b/tests/basic.c
index e3f4bfd..124df55 100644
--- a/tests/basic.c
+++ b/tests/basic.c
@@ -3330,20 +3330,26 @@ check_ocb_cipher (void)
 
   /* Check large buffer encryption/decryption. */
   check_ocb_cipher_largebuf(GCRY_CIPHER_AES, 16,
-                            "\xf5\xf3\x12\x7d\x58\x2d\x96\xe8"
-                            "\x33\xfd\x7a\x4f\x42\x60\x5d\x20");
+			    "\xf5\xf3\x12\x7d\x58\x2d\x96\xe8"
+			    "\x33\xfd\x7a\x4f\x42\x60\x5d\x20");
   check_ocb_cipher_largebuf(GCRY_CIPHER_AES256, 32,
-                            "\xfa\x26\xa5\xbf\xf6\x7d\x3a\x8d"
-                            "\xfe\x96\x67\xc9\xc8\x41\x03\x51");
+			    "\xfa\x26\xa5\xbf\xf6\x7d\x3a\x8d"
+			    "\xfe\x96\x67\xc9\xc8\x41\x03\x51");
   check_ocb_cipher_largebuf(GCRY_CIPHER_CAMELLIA128, 16,
-                            "\x28\x23\x38\x45\x2b\xfd\x42\x45"
+			    "\x28\x23\x38\x45\x2b\xfd\x42\x45"
 			    "\x43\x64\x7e\x67\x7f\xf4\x8b\xcd");
   check_ocb_cipher_largebuf(GCRY_CIPHER_CAMELLIA192, 24,
-                            "\xee\xca\xe5\x39\x27\x2d\x33\xe7"
+			    "\xee\xca\xe5\x39\x27\x2d\x33\xe7"
 			    "\x79\x74\xb0\x1d\x37\x12\xd5\x6c");
   check_ocb_cipher_largebuf(GCRY_CIPHER_CAMELLIA256, 32,
-                            "\x39\x39\xd0\x2d\x05\x68\x74\xee"
+			    "\x39\x39\xd0\x2d\x05\x68\x74\xee"
 			    "\x18\x6b\xea\x3d\x0b\xd3\x58\xae");
+  check_ocb_cipher_largebuf(GCRY_CIPHER_TWOFISH, 16,
+			    "\x63\xe3\x0e\xb9\x11\x6f\x14\xba"
+			    "\x79\xe4\xa7\x9e\xad\x3c\x02\x0c");
+  check_ocb_cipher_largebuf(GCRY_CIPHER_TWOFISH, 32,
+			    "\xf6\xd4\xfe\x4e\x50\x85\x13\x59"
+			    "\x69\x0e\x4c\x67\x3e\xdd\x47\x90");
 }
 
 
From jussi.kivilinna at iki.fi  Mon Jul 27 11:04:35 2015
From: jussi.kivilinna at iki.fi (Jussi Kivilinna)
Date: Mon, 27 Jul 2015 12:04:35 +0300
Subject: [PATCH 5/6] Reduce amount of duplicated code in OCB bulk
 implementations
In-Reply-To: <20150727090415.18742.12674.stgit@localhost6.localdomain6>
References: <20150727090415.18742.12674.stgit@localhost6.localdomain6>
Message-ID: <20150727090435.18742.24848.stgit@localhost6.localdomain6>

* cipher/cipher-ocb.c (_gcry_cipher_ocb_authenticate)
(ocb_crypt): Change bulk function to return number of unprocessed
blocks.
* src/cipher.h (_gcry_aes_ocb_crypt, _gcry_aes_ocb_auth)
(_gcry_camellia_ocb_crypt, _gcry_camellia_ocb_auth)
(_gcry_serpent_ocb_crypt, _gcry_serpent_ocb_auth)
(_gcry_twofish_ocb_crypt, _gcry_twofish_ocb_auth): Change return type
to 'size_t'.
* cipher/camellia-glue.c (get_l): Only if USE_AESNI_AVX or
USE_AESNI_AVX2 defined.
(_gcry_camellia_ocb_crypt, _gcry_camellia_ocb_auth): Change return type
to 'size_t' and return remaining blocks; Remove unaccelerated common
code path. Enable remaining common code only if USE_AESNI_AVX or
USE_AESNI_AVX2 defined; Remove unaccelerated common code.
* cipher/rijndael.c (_gcry_aes_ocb_crypt, _gcry_aes_ocb_auth): Change
return type to 'size_t' and return zero.
* cipher/serpent.c (get_l): Only if USE_SSE2, USE_AVX2 or USE_NEON
defined.
(_gcry_serpent_ocb_crypt, _gcry_serpent_ocb_auth): Change return type
to 'size_t' and return remaining blocks; Remove unaccelerated common
code path. Enable remaining common code only if USE_SSE2, USE_AVX2 or
USE_NEON defined; Remove unaccelerated common code.
* cipher/twofish.c (get_l): Only if USE_AMD64_ASM defined.
(_gcry_twofish_ocb_crypt, _gcry_twofish_ocb_auth): Change return type
to 'size_t' and return remaining blocks; Remove unaccelerated common
code path. Enable remaining common code only if USE_AMD64_ASM defined;
Remove unaccelerated common code.
--

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 cipher/camellia-glue.c   |   87 +++++++++++++-------------------------------
 cipher/cipher-internal.h |    7 ++--
 cipher/cipher-ocb.c      |   32 +++++++++++-----
 cipher/rijndael.c        |    8 +++-
 cipher/serpent.c         |   85 ++++++++++++-------------------------------
 cipher/twofish.c         |   91 ++++++++++------------------------------------
 src/cipher.h             |   38 ++++++++++---------
 7 files changed, 120 insertions(+), 228 deletions(-)

diff --git a/cipher/camellia-glue.c b/cipher/camellia-glue.c
index 197e1b3..99516fc 100644
--- a/cipher/camellia-glue.c
+++ b/cipher/camellia-glue.c
@@ -604,6 +604,7 @@ _gcry_camellia_cfb_dec(void *context, unsigned char *iv,
   _gcry_burn_stack(burn_stack_depth);
 }
 
+#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2)
 static inline const unsigned char *
 get_l (gcry_cipher_hd_t c, unsigned char *l_tmp, u64 i)
 {
@@ -614,22 +615,29 @@ get_l (gcry_cipher_hd_t c, unsigned char *l_tmp, u64 i)
   else
       return _gcry_cipher_ocb_get_l (c, l_tmp, i);
 }
+#endif
 
 /* Bulk encryption/decryption of complete blocks in OCB mode. */
-void
+size_t
 _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
 			  const void *inbuf_arg, size_t nblocks, int encrypt)
 {
+#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2)
   CAMELLIA_context *ctx = (void *)&c->context.c;
   unsigned char *outbuf = outbuf_arg;
   const unsigned char *inbuf = inbuf_arg;
   unsigned char l_tmp[CAMELLIA_BLOCK_SIZE];
-  const unsigned char *l;
   int burn_stack_depth;
   u64 blkn = c->u_mode.ocb.data_nblocks;
 
   burn_stack_depth = encrypt ? CAMELLIA_encrypt_stack_burn_size :
 			      CAMELLIA_decrypt_stack_burn_size;
+#else
+  (void)c;
+  (void)outbuf_arg;
+  (void)inbuf_arg;
+  (void)encrypt;
+#endif
 
 #ifdef USE_AESNI_AVX2
   if (ctx->use_aesni_avx2)
@@ -723,70 +731,35 @@ _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
     }
 #endif
 
-  if (encrypt)
-    {
-      for (; nblocks; nblocks--)
-	{
-	  l = get_l(c, l_tmp, ++blkn);
-
-	  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
-	  buf_xor_1 (c->u_iv.iv, l, CAMELLIA_BLOCK_SIZE);
-	  buf_cpy (l_tmp, inbuf, CAMELLIA_BLOCK_SIZE);
-	  /* Checksum_i = Checksum_{i-1} xor P_i  */
-	  buf_xor_1 (c->u_ctr.ctr, l_tmp, CAMELLIA_BLOCK_SIZE);
-	  /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
-	  buf_xor_1 (l_tmp, c->u_iv.iv, CAMELLIA_BLOCK_SIZE);
-	  Camellia_EncryptBlock(ctx->keybitlength, l_tmp, ctx->keytable, l_tmp);
-	  buf_xor_1 (l_tmp, c->u_iv.iv, CAMELLIA_BLOCK_SIZE);
-	  buf_cpy (outbuf, l_tmp, CAMELLIA_BLOCK_SIZE);
-
-	  inbuf += CAMELLIA_BLOCK_SIZE;
-	  outbuf += CAMELLIA_BLOCK_SIZE;
-	}
-    }
-  else
-    {
-      for (; nblocks; nblocks--)
-	{
-	  l = get_l(c, l_tmp, ++blkn);
-
-	  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
-	  buf_xor_1 (c->u_iv.iv, l, CAMELLIA_BLOCK_SIZE);
-	  buf_cpy (l_tmp, inbuf, CAMELLIA_BLOCK_SIZE);
-	  /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
-	  buf_xor_1 (l_tmp, c->u_iv.iv, CAMELLIA_BLOCK_SIZE);
-	  Camellia_DecryptBlock(ctx->keybitlength, l_tmp, ctx->keytable, l_tmp);
-	  buf_xor_1 (l_tmp, c->u_iv.iv, CAMELLIA_BLOCK_SIZE);
-	  /* Checksum_i = Checksum_{i-1} xor P_i  */
-	  buf_xor_1 (c->u_ctr.ctr, l_tmp, CAMELLIA_BLOCK_SIZE);
-	  buf_cpy (outbuf, l_tmp, CAMELLIA_BLOCK_SIZE);
-
-	  inbuf += CAMELLIA_BLOCK_SIZE;
-	  outbuf += CAMELLIA_BLOCK_SIZE;
-	}
-    }
-
+#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2)
   c->u_mode.ocb.data_nblocks = blkn;
 
   wipememory(&l_tmp, sizeof(l_tmp));
 
   if (burn_stack_depth)
     _gcry_burn_stack (burn_stack_depth + 4 * sizeof(void *));
+#endif
+
+  return nblocks;
 }
 
 /* Bulk authentication of complete blocks in OCB mode. */
-void
+size_t
 _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
-			size_t nblocks)
+			 size_t nblocks)
 {
+#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2)
   CAMELLIA_context *ctx = (void *)&c->context.c;
   const unsigned char *abuf = abuf_arg;
   unsigned char l_tmp[CAMELLIA_BLOCK_SIZE];
-  const unsigned char *l;
   int burn_stack_depth;
   u64 blkn = c->u_mode.ocb.aad_nblocks;
 
   burn_stack_depth = CAMELLIA_encrypt_stack_burn_size;
+#else
+  (void)c;
+  (void)abuf_arg;
+#endif
 
 #ifdef USE_AESNI_AVX2
   if (ctx->use_aesni_avx2)
@@ -870,26 +843,16 @@ _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
     }
 #endif
 
-  for (; nblocks; nblocks--)
-    {
-      l = get_l(c, l_tmp, ++blkn);
-
-      /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
-      buf_xor_1 (c->u_mode.ocb.aad_offset, l, CAMELLIA_BLOCK_SIZE);
-      /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i)  */
-      buf_xor (l_tmp, c->u_mode.ocb.aad_offset, abuf, CAMELLIA_BLOCK_SIZE);
-      Camellia_EncryptBlock(ctx->keybitlength, l_tmp, ctx->keytable, l_tmp);
-      buf_xor_1 (c->u_mode.ocb.aad_sum, l_tmp, CAMELLIA_BLOCK_SIZE);
-
-      abuf += CAMELLIA_BLOCK_SIZE;
-    }
-
+#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2)
   c->u_mode.ocb.aad_nblocks = blkn;
 
   wipememory(&l_tmp, sizeof(l_tmp));
 
   if (burn_stack_depth)
     _gcry_burn_stack (burn_stack_depth + 4 * sizeof(void *));
+#endif
+
+  return nblocks;
 }
 
 /* Run the self-tests for CAMELLIA-CTR-128, tests IV increment of bulk CTR
diff --git a/cipher/cipher-internal.h b/cipher/cipher-internal.h
index e20ea56..bb86d37 100644
--- a/cipher/cipher-internal.h
+++ b/cipher/cipher-internal.h
@@ -128,9 +128,10 @@ struct gcry_cipher_handle
     void (*ctr_enc)(void *context, unsigned char *iv,
                     void *outbuf_arg, const void *inbuf_arg,
                     size_t nblocks);
-    void (*ocb_crypt)(gcry_cipher_hd_t c, void *outbuf_arg,
-                      const void *inbuf_arg, size_t nblocks, int encrypt);
-    void (*ocb_auth)(gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks);
+    size_t (*ocb_crypt)(gcry_cipher_hd_t c, void *outbuf_arg,
+			const void *inbuf_arg, size_t nblocks, int encrypt);
+    size_t (*ocb_auth)(gcry_cipher_hd_t c, const void *abuf_arg,
+		       size_t nblocks);
   } bulk;
 
 
diff --git a/cipher/cipher-ocb.c b/cipher/cipher-ocb.c
index bc6fd87..096975a 100644
--- a/cipher/cipher-ocb.c
+++ b/cipher/cipher-ocb.c
@@ -260,10 +260,17 @@ _gcry_cipher_ocb_authenticate (gcry_cipher_hd_t c, const unsigned char *abuf,
   /* Use a bulk method if available.  */
   if (abuflen >= OCB_BLOCK_LEN && c->bulk.ocb_auth)
     {
-      size_t nblks = abuflen / OCB_BLOCK_LEN;
-      c->bulk.ocb_auth (c, abuf, nblks);
-      abuf += nblks * OCB_BLOCK_LEN;
-      abuflen -= nblks * OCB_BLOCK_LEN;
+      size_t nblks;
+      size_t nleft;
+      size_t ndone;
+
+      nblks = abuflen / OCB_BLOCK_LEN;
+      nleft = c->bulk.ocb_auth (c, abuf, nblks);
+      ndone = nblks - nleft;
+
+      abuf += ndone * OCB_BLOCK_LEN;
+      abuflen -= ndone * OCB_BLOCK_LEN;
+      nblks = nleft;
     }
 
   /* Hash all full blocks.  */
@@ -354,12 +361,17 @@ ocb_crypt (gcry_cipher_hd_t c, int encrypt,
   /* Use a bulk method if available.  */
   if (nblks && c->bulk.ocb_crypt)
     {
-      c->bulk.ocb_crypt (c, outbuf, inbuf, nblks, encrypt);
-      inbuf  += nblks * OCB_BLOCK_LEN;
-      outbuf += nblks * OCB_BLOCK_LEN;
-      inbuflen -= nblks * OCB_BLOCK_LEN;
-      outbuflen -= nblks * OCB_BLOCK_LEN;
-      nblks = 0;
+      size_t nleft;
+      size_t ndone;
+
+      nleft = c->bulk.ocb_crypt (c, outbuf, inbuf, nblks, encrypt);
+      ndone = nblks - nleft;
+
+      inbuf += ndone * OCB_BLOCK_LEN;
+      outbuf += ndone * OCB_BLOCK_LEN;
+      inbuflen -= ndone * OCB_BLOCK_LEN;
+      outbuflen -= ndone * OCB_BLOCK_LEN;
+      nblks = nleft;
     }
 
   if (nblks)
diff --git a/cipher/rijndael.c b/cipher/rijndael.c
index 1fe16d6..4368c6d 100644
--- a/cipher/rijndael.c
+++ b/cipher/rijndael.c
@@ -1200,7 +1200,7 @@ _gcry_aes_cbc_dec (void *context, unsigned char *iv,
 
 
 /* Bulk encryption/decryption of complete blocks in OCB mode. */
-void
+size_t
 _gcry_aes_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
                      const void *inbuf_arg, size_t nblocks, int encrypt)
 {
@@ -1303,11 +1303,13 @@ _gcry_aes_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
 
   if (burn_depth)
     _gcry_burn_stack (burn_depth + 4 * sizeof(void *));
+
+  return 0;
 }
 
 
 /* Bulk authentication of complete blocks in OCB mode. */
-void
+size_t
 _gcry_aes_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks)
 {
   RIJNDAEL_context *ctx = (void *)&c->context.c;
@@ -1364,6 +1366,8 @@ _gcry_aes_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks)
 
   if (burn_depth)
     _gcry_burn_stack (burn_depth + 4 * sizeof(void *));
+
+  return 0;
 }
 
 
diff --git a/cipher/serpent.c b/cipher/serpent.c
index eb491aa..0a54a17 100644
--- a/cipher/serpent.c
+++ b/cipher/serpent.c
@@ -1226,6 +1226,7 @@ _gcry_serpent_cfb_dec(void *context, unsigned char *iv,
   _gcry_burn_stack(burn_stack_depth);
 }
 
+#if defined(USE_AVX2) || defined(USE_SSE2) || defined(USE_NEON)
 static inline const unsigned char *
 get_l (gcry_cipher_hd_t c, unsigned char *l_tmp, u64 i)
 {
@@ -1236,19 +1237,26 @@ get_l (gcry_cipher_hd_t c, unsigned char *l_tmp, u64 i)
   else
       return _gcry_cipher_ocb_get_l (c, l_tmp, i);
 }
+#endif
 
 /* Bulk encryption/decryption of complete blocks in OCB mode. */
-void
+size_t
 _gcry_serpent_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
 			const void *inbuf_arg, size_t nblocks, int encrypt)
 {
+#if defined(USE_AVX2) || defined(USE_SSE2) || defined(USE_NEON)
   serpent_context_t *ctx = (void *)&c->context.c;
   unsigned char *outbuf = outbuf_arg;
   const unsigned char *inbuf = inbuf_arg;
   unsigned char l_tmp[sizeof(serpent_block_t)];
-  const unsigned char *l;
   int burn_stack_depth = 2 * sizeof (serpent_block_t);
   u64 blkn = c->u_mode.ocb.data_nblocks;
+#else
+  (void)c;
+  (void)outbuf_arg;
+  (void)inbuf_arg;
+  (void)encrypt;
+#endif
 
 #ifdef USE_AVX2
   if (ctx->use_avx2)
@@ -1381,68 +1389,33 @@ _gcry_serpent_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
     }
 #endif
 
-  if (encrypt)
-    {
-      for (; nblocks; nblocks--)
-	{
-	  l = get_l(c, l_tmp, ++blkn);
-
-	  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
-	  buf_xor_1 (c->u_iv.iv, l, sizeof(serpent_block_t));
-	  buf_cpy (l_tmp, inbuf, sizeof(serpent_block_t));
-	  /* Checksum_i = Checksum_{i-1} xor P_i  */
-	  buf_xor_1 (c->u_ctr.ctr, l_tmp, sizeof(serpent_block_t));
-	  /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
-	  buf_xor_1 (l_tmp, c->u_iv.iv, sizeof(serpent_block_t));
-	  serpent_encrypt_internal(ctx, l_tmp, l_tmp);
-	  buf_xor_1 (l_tmp, c->u_iv.iv, sizeof(serpent_block_t));
-	  buf_cpy (outbuf, l_tmp, sizeof(serpent_block_t));
-
-	  inbuf += sizeof(serpent_block_t);
-	  outbuf += sizeof(serpent_block_t);
-	}
-    }
-  else
-    {
-      for (; nblocks; nblocks--)
-	{
-	  l = get_l(c, l_tmp, ++blkn);
-
-	  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
-	  buf_xor_1 (c->u_iv.iv, l, sizeof(serpent_block_t));
-	  buf_cpy (l_tmp, inbuf, sizeof(serpent_block_t));
-	  /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
-	  buf_xor_1 (l_tmp, c->u_iv.iv, sizeof(serpent_block_t));
-	  serpent_decrypt_internal(ctx, l_tmp, l_tmp);
-	  buf_xor_1 (l_tmp, c->u_iv.iv, sizeof(serpent_block_t));
-	  /* Checksum_i = Checksum_{i-1} xor P_i  */
-	  buf_xor_1 (c->u_ctr.ctr, l_tmp, sizeof(serpent_block_t));
-	  buf_cpy (outbuf, l_tmp, sizeof(serpent_block_t));
-
-	  inbuf += sizeof(serpent_block_t);
-	  outbuf += sizeof(serpent_block_t);
-	}
-    }
-
+#if defined(USE_AVX2) || defined(USE_SSE2) || defined(USE_NEON)
   c->u_mode.ocb.data_nblocks = blkn;
 
   wipememory(&l_tmp, sizeof(l_tmp));
 
   if (burn_stack_depth)
     _gcry_burn_stack (burn_stack_depth + 4 * sizeof(void *));
+#endif
+
+  return nblocks;
 }
 
 /* Bulk authentication of complete blocks in OCB mode. */
-void
+size_t
 _gcry_serpent_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
 			size_t nblocks)
 {
+#if defined(USE_AVX2) || defined(USE_SSE2) || defined(USE_NEON)
   serpent_context_t *ctx = (void *)&c->context.c;
   const unsigned char *abuf = abuf_arg;
   unsigned char l_tmp[sizeof(serpent_block_t)];
-  const unsigned char *l;
   int burn_stack_depth = 2 * sizeof(serpent_block_t);
   u64 blkn = c->u_mode.ocb.aad_nblocks;
+#else
+  (void)c;
+  (void)abuf_arg;
+#endif
 
 #ifdef USE_AVX2
   if (ctx->use_avx2)
@@ -1560,26 +1533,16 @@ _gcry_serpent_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
     }
 #endif
 
-  for (; nblocks; nblocks--)
-    {
-      l = get_l(c, l_tmp, ++blkn);
-
-      /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
-      buf_xor_1 (c->u_mode.ocb.aad_offset, l, sizeof(serpent_block_t));
-      /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i)  */
-      buf_xor (l_tmp, c->u_mode.ocb.aad_offset, abuf, sizeof(serpent_block_t));
-      serpent_encrypt_internal(ctx, l_tmp, l_tmp);
-      buf_xor_1 (c->u_mode.ocb.aad_sum, l_tmp, sizeof(serpent_block_t));
-
-      abuf += sizeof(serpent_block_t);
-    }
-
+#if defined(USE_AVX2) || defined(USE_SSE2) || defined(USE_NEON)
   c->u_mode.ocb.aad_nblocks = blkn;
 
   wipememory(&l_tmp, sizeof(l_tmp));
 
   if (burn_stack_depth)
     _gcry_burn_stack (burn_stack_depth + 4 * sizeof(void *));
+#endif
+
+  return nblocks;
 }
 
 
diff --git a/cipher/twofish.c b/cipher/twofish.c
index 9b9c35f..3ee2be5 100644
--- a/cipher/twofish.c
+++ b/cipher/twofish.c
@@ -1271,6 +1271,7 @@ _gcry_twofish_cfb_dec(void *context, unsigned char *iv, void *outbuf_arg,
   _gcry_burn_stack(burn_stack_depth);
 }
 
+#ifdef USE_AMD64_ASM
 static inline const unsigned char *
 get_l (gcry_cipher_hd_t c, unsigned char *l_tmp, u64 i)
 {
@@ -1281,21 +1282,21 @@ get_l (gcry_cipher_hd_t c, unsigned char *l_tmp, u64 i)
   else
       return _gcry_cipher_ocb_get_l (c, l_tmp, i);
 }
+#endif
 
 /* Bulk encryption/decryption of complete blocks in OCB mode. */
-void
+size_t
 _gcry_twofish_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
 			const void *inbuf_arg, size_t nblocks, int encrypt)
 {
+#ifdef USE_AMD64_ASM
   TWOFISH_context *ctx = (void *)&c->context.c;
   unsigned char *outbuf = outbuf_arg;
   const unsigned char *inbuf = inbuf_arg;
   unsigned char l_tmp[TWOFISH_BLOCKSIZE];
-  const unsigned char *l;
   unsigned int burn, burn_stack_depth = 0;
   u64 blkn = c->u_mode.ocb.data_nblocks;
 
-#ifdef USE_AMD64_ASM
   {
     const void *Ls[3];
 
@@ -1326,54 +1327,6 @@ _gcry_twofish_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
 
     /* Use generic code to handle smaller chunks... */
   }
-#endif
-
-  if (encrypt)
-    {
-      for (; nblocks; nblocks--)
-	{
-	  l = get_l(c, l_tmp, ++blkn);
-
-	  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
-	  buf_xor_1 (c->u_iv.iv, l, TWOFISH_BLOCKSIZE);
-	  buf_cpy (l_tmp, inbuf, TWOFISH_BLOCKSIZE);
-	  /* Checksum_i = Checksum_{i-1} xor P_i  */
-	  buf_xor_1 (c->u_ctr.ctr, l_tmp, TWOFISH_BLOCKSIZE);
-	  /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
-	  buf_xor_1 (l_tmp, c->u_iv.iv, TWOFISH_BLOCKSIZE);
-	  burn = twofish_encrypt(ctx, l_tmp, l_tmp);
-	  if (burn > burn_stack_depth)
-	    burn_stack_depth = burn;
-	  buf_xor_1 (l_tmp, c->u_iv.iv, TWOFISH_BLOCKSIZE);
-	  buf_cpy (outbuf, l_tmp, TWOFISH_BLOCKSIZE);
-
-	  inbuf += TWOFISH_BLOCKSIZE;
-	  outbuf += TWOFISH_BLOCKSIZE;
-	}
-    }
-  else
-    {
-      for (; nblocks; nblocks--)
-	{
-	  l = get_l(c, l_tmp, ++blkn);
-
-	  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
-	  buf_xor_1 (c->u_iv.iv, l, TWOFISH_BLOCKSIZE);
-	  buf_cpy (l_tmp, inbuf, TWOFISH_BLOCKSIZE);
-	  /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
-	  buf_xor_1 (l_tmp, c->u_iv.iv, TWOFISH_BLOCKSIZE);
-	  burn = twofish_decrypt(ctx, l_tmp, l_tmp);
-	  if (burn > burn_stack_depth)
-	    burn_stack_depth = burn;
-	  buf_xor_1 (l_tmp, c->u_iv.iv, TWOFISH_BLOCKSIZE);
-	  /* Checksum_i = Checksum_{i-1} xor P_i  */
-	  buf_xor_1 (c->u_ctr.ctr, l_tmp, TWOFISH_BLOCKSIZE);
-	  buf_cpy (outbuf, l_tmp, TWOFISH_BLOCKSIZE);
-
-	  inbuf += TWOFISH_BLOCKSIZE;
-	  outbuf += TWOFISH_BLOCKSIZE;
-	}
-    }
 
   c->u_mode.ocb.data_nblocks = blkn;
 
@@ -1381,21 +1334,28 @@ _gcry_twofish_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
 
   if (burn_stack_depth)
     _gcry_burn_stack (burn_stack_depth + 4 * sizeof(void *));
+#else
+  (void)c;
+  (void)outbuf_arg;
+  (void)inbuf_arg;
+  (void)encrypt;
+#endif
+
+  return nblocks;
 }
 
 /* Bulk authentication of complete blocks in OCB mode. */
-void
+size_t
 _gcry_twofish_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
 			size_t nblocks)
 {
+#ifdef USE_AMD64_ASM
   TWOFISH_context *ctx = (void *)&c->context.c;
   const unsigned char *abuf = abuf_arg;
   unsigned char l_tmp[TWOFISH_BLOCKSIZE];
-  const unsigned char *l;
   unsigned int burn, burn_stack_depth = 0;
   u64 blkn = c->u_mode.ocb.aad_nblocks;
 
-#ifdef USE_AMD64_ASM
   {
     const void *Ls[3];
 
@@ -1421,23 +1381,6 @@ _gcry_twofish_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
 
     /* Use generic code to handle smaller chunks... */
   }
-#endif
-
-  for (; nblocks; nblocks--)
-    {
-      l = get_l(c, l_tmp, ++blkn);
-
-      /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
-      buf_xor_1 (c->u_mode.ocb.aad_offset, l, TWOFISH_BLOCKSIZE);
-      /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i)  */
-      buf_xor (l_tmp, c->u_mode.ocb.aad_offset, abuf, TWOFISH_BLOCKSIZE);
-      burn = twofish_encrypt(ctx, l_tmp, l_tmp);
-      if (burn > burn_stack_depth)
-	burn_stack_depth = burn;
-      buf_xor_1 (c->u_mode.ocb.aad_sum, l_tmp, TWOFISH_BLOCKSIZE);
-
-      abuf += TWOFISH_BLOCKSIZE;
-    }
 
   c->u_mode.ocb.aad_nblocks = blkn;
 
@@ -1445,6 +1388,12 @@ _gcry_twofish_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
 
   if (burn_stack_depth)
     _gcry_burn_stack (burn_stack_depth + 4 * sizeof(void *));
+#else
+  (void)c;
+  (void)abuf_arg;
+#endif
+
+  return nblocks;
 }
 
 
diff --git a/src/cipher.h b/src/cipher.h
index d16746a..52f2695 100644
--- a/src/cipher.h
+++ b/src/cipher.h
@@ -136,10 +136,10 @@ void _gcry_aes_cbc_dec (void *context, unsigned char *iv,
 void _gcry_aes_ctr_enc (void *context, unsigned char *ctr,
                         void *outbuf_arg, const void *inbuf_arg,
                         size_t nblocks);
-void _gcry_aes_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
-                         const void *inbuf_arg, size_t nblocks, int encrypt);
-void _gcry_aes_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
-                         size_t nblocks);
+size_t _gcry_aes_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
+			    const void *inbuf_arg, size_t nblocks, int encrypt);
+size_t _gcry_aes_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
+			   size_t nblocks);
 
 /*-- blowfish.c --*/
 void _gcry_blowfish_cfb_dec (void *context, unsigned char *iv,
@@ -177,11 +177,11 @@ void _gcry_camellia_cbc_dec (void *context, unsigned char *iv,
 void _gcry_camellia_cfb_dec (void *context, unsigned char *iv,
                              void *outbuf_arg, const void *inbuf_arg,
                              size_t nblocks);
-void _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
-			       const void *inbuf_arg, size_t nblocks,
-			       int encrypt);
-void _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
-			      size_t nblocks);
+size_t _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
+				 const void *inbuf_arg, size_t nblocks,
+				 int encrypt);
+size_t _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
+				size_t nblocks);
 
 /*-- des.c --*/
 void _gcry_3des_ctr_enc (void *context, unsigned char *ctr,
@@ -206,11 +206,11 @@ void _gcry_serpent_cbc_dec (void *context, unsigned char *iv,
 void _gcry_serpent_cfb_dec (void *context, unsigned char *iv,
                             void *outbuf_arg, const void *inbuf_arg,
                             size_t nblocks);
-void _gcry_serpent_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
-			      const void *inbuf_arg, size_t nblocks,
-			      int encrypt);
-void _gcry_serpent_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
-			     size_t nblocks);
+size_t _gcry_serpent_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
+				const void *inbuf_arg, size_t nblocks,
+				int encrypt);
+size_t _gcry_serpent_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
+			       size_t nblocks);
 
 /*-- twofish.c --*/
 void _gcry_twofish_ctr_enc (void *context, unsigned char *ctr,
@@ -222,11 +222,11 @@ void _gcry_twofish_cbc_dec (void *context, unsigned char *iv,
 void _gcry_twofish_cfb_dec (void *context, unsigned char *iv,
                             void *outbuf_arg, const void *inbuf_arg,
                             size_t nblocks);
-void _gcry_twofish_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
-			      const void *inbuf_arg, size_t nblocks,
-			      int encrypt);
-void _gcry_twofish_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
-			     size_t nblocks);
+size_t _gcry_twofish_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
+				const void *inbuf_arg, size_t nblocks,
+				int encrypt);
+size_t _gcry_twofish_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
+			       size_t nblocks);
 
 /*-- dsa.c --*/
 void _gcry_register_pk_dsa_progress (gcry_handler_progress_t cbc, void *cb_data);


From jussi.kivilinna at iki.fi  Mon Jul 27 11:04:30 2015
From: jussi.kivilinna at iki.fi (Jussi Kivilinna)
Date: Mon, 27 Jul 2015 12:04:30 +0300
Subject: [PATCH 4/6] Add bulk OCB for Serpent SSE2,
 AVX2 and NEON implementations
In-Reply-To: <20150727090415.18742.12674.stgit@localhost6.localdomain6>
References: <20150727090415.18742.12674.stgit@localhost6.localdomain6>
Message-ID: <20150727090430.18742.9807.stgit@localhost6.localdomain6>

* cipher/cipher.c (_gcry_cipher_open_internal): Setup OCB bulk
functions for Serpent.
* cipher/serpent-armv7-neon.S: Add OCB assembly functions.
* cipher/serpent-avx2-amd64.S: Add OCB assembly functions.
* cipher/serpent-sse2-amd64.S: Add OCB assembly functions.
* cipher/serpent.c (_gcry_serpent_sse2_ocb_enc)
(_gcry_serpent_sse2_ocb_dec, _gcry_serpent_sse2_ocb_auth)
(_gcry_serpent_neon_ocb_enc, _gcry_serpent_neon_ocb_dec)
(_gcry_serpent_neon_ocb_auth, _gcry_serpent_avx2_ocb_enc)
(_gcry_serpent_avx2_ocb_dec, _gcry_serpent_avx2_ocb_auth): New
prototypes.
(get_l, _gcry_serpent_ocb_crypt, _gcry_serpent_ocb_auth): New.
* src/cipher.h (_gcry_serpent_ocb_crypt)
(_gcry_serpent_ocb_auth): New.
* tests/basic.c (check_ocb_cipher): Add test-vector for serpent.
--

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 cipher/cipher.c             |    2 
 cipher/serpent-armv7-neon.S |  255 ++++++++++++++++++++++++++
 cipher/serpent-avx2-amd64.S |  307 +++++++++++++++++++++++++++++++-
 cipher/serpent-sse2-amd64.S |  307 +++++++++++++++++++++++++++++++-
 cipher/serpent.c            |  419 +++++++++++++++++++++++++++++++++++++++++++
 src/cipher.h                |    5 +
 tests/basic.c               |    9 +
 7 files changed, 1301 insertions(+), 3 deletions(-)

diff --git a/cipher/cipher.c b/cipher/cipher.c
index 8483c5f..30c2f48 100644
--- a/cipher/cipher.c
+++ b/cipher/cipher.c
@@ -553,6 +553,8 @@ _gcry_cipher_open_internal (gcry_cipher_hd_t *handle,
               h->bulk.cbc_dec = _gcry_serpent_cbc_dec;
               h->bulk.cfb_dec = _gcry_serpent_cfb_dec;
               h->bulk.ctr_enc = _gcry_serpent_ctr_enc;
+              h->bulk.ocb_crypt = _gcry_serpent_ocb_crypt;
+              h->bulk.ocb_auth  = _gcry_serpent_ocb_auth;
               break;
 #endif /*USE_SERPENT*/
 #ifdef USE_TWOFISH
diff --git a/cipher/serpent-armv7-neon.S b/cipher/serpent-armv7-neon.S
index 3559558..adff639 100644
--- a/cipher/serpent-armv7-neon.S
+++ b/cipher/serpent-armv7-neon.S
@@ -866,4 +866,259 @@ _gcry_serpent_neon_cbc_dec:
 	pop {pc};
 .size _gcry_serpent_neon_cbc_dec,.-_gcry_serpent_neon_cbc_dec;
 
+.align 3
+.globl _gcry_serpent_neon_ocb_enc
+.type _gcry_serpent_neon_ocb_enc,%function;
+_gcry_serpent_neon_ocb_enc:
+	/* input:
+	 *	r0  : ctx, CTX
+	 *	r1  : dst (8 blocks)
+	 *	r2  : src (8 blocks)
+	 *	r3  : offset
+	 *	sp+0: checksum
+	 *	sp+4: L pointers (void *L[8])
+	 */
+
+	push {r4-r11, ip, lr};
+	add ip, sp, #(10*4);
+
+	vpush {RA4-RB2};
+
+	ldm ip, {r4, lr};
+
+	vld1.8 {RT0}, [r3];
+	vld1.8 {RT1}, [r4];
+
+	/* Load L pointers */
+	ldm lr!, {r5, r6, r7, r8};
+	ldm lr, {r9, r10, r11, ip};
+
+	/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	/* Checksum_i = Checksum_{i-1} xor P_i  */
+	/* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
+
+	vld1.8 {RA0, RA1}, [r2]!;
+	vld1.8 {RA2, RA3}, [r2]!;
+	vld1.8 {RB0, RB1}, [r2]!;
+	vld1.8 {RB2, RB3}, [r2];
+
+#define OCB_INPUT(lreg, vreg) \
+	  vld1.8 {RT3}, [lreg]; \
+	  veor RT0, RT3; \
+	  veor RT1, vreg; \
+	  veor vreg, RT0; \
+	  vst1.8 {RT0}, [r1]!;
+
+	OCB_INPUT(r5, RA0);
+	OCB_INPUT(r6, RA1);
+	OCB_INPUT(r7, RA2);
+	OCB_INPUT(r8, RA3);
+	OCB_INPUT(r9, RB0);
+	OCB_INPUT(r10, RB1);
+	OCB_INPUT(r11, RB2);
+	OCB_INPUT(ip, RB3);
+#undef OCB_INPUT
+
+	sub r1, r1, #(8*16);
+	vst1.8 {RT0}, [r3];
+	vst1.8 {RT1}, [r4];
+	mov r2, r1;
+
+	bl __serpent_enc_blk8;
+
+	vld1.8 {RT0, RT1}, [r1]!;
+	veor RT0, RA4, RT0;
+	veor RT1, RA1, RT1;
+	vld1.8 {RT2, RT3}, [r1]!;
+	vst1.8 {RT0, RT1}, [r2]!;
+	veor RT2, RA2, RT2;
+	veor RT3, RA0, RT3;
+	vld1.8 {RT0, RT1}, [r1]!;
+	vst1.8 {RT2, RT3}, [r2]!;
+	veor RT0, RB4, RT0;
+	veor RT1, RB1, RT1;
+	vld1.8 {RT2, RT3}, [r1]!;
+	vst1.8 {RT0, RT1}, [r2]!;
+	veor RT2, RB2, RT2;
+	veor RT3, RB0, RT3;
+	vst1.8 {RT2, RT3}, [r2]!;
+
+	vpop {RA4-RB2};
+
+	/* clear the used registers */
+	veor RA3, RA3;
+	veor RB3, RB3;
+
+	pop {r4-r11, ip, pc};
+.size _gcry_serpent_neon_ocb_enc,.-_gcry_serpent_neon_ocb_enc;
+
+.align 3
+.globl _gcry_serpent_neon_ocb_dec
+.type _gcry_serpent_neon_ocb_dec,%function;
+_gcry_serpent_neon_ocb_dec:
+	/* input:
+	 *	r0  : ctx, CTX
+	 *	r1  : dst (8 blocks)
+	 *	r2  : src (8 blocks)
+	 *	r3  : offset
+	 *	sp+0: checksum
+	 *	sp+4: L pointers (void *L[8])
+	 */
+
+	push {r4-r11, ip, lr};
+	add ip, sp, #(10*4);
+
+	vpush {RA4-RB2};
+
+	ldm ip, {r4, lr};
+
+	vld1.8 {RT0}, [r3];
+
+	/* Load L pointers */
+	ldm lr!, {r5, r6, r7, r8};
+	ldm lr, {r9, r10, r11, ip};
+
+	/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	/* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i)  */
+
+	vld1.8 {RA0, RA1}, [r2]!;
+	vld1.8 {RA2, RA3}, [r2]!;
+	vld1.8 {RB0, RB1}, [r2]!;
+	vld1.8 {RB2, RB3}, [r2];
+
+#define OCB_INPUT(lreg, vreg) \
+	  vld1.8 {RT3}, [lreg]; \
+	  veor RT0, RT3; \
+	  veor vreg, RT0; \
+	  vst1.8 {RT0}, [r1]!;
+
+	OCB_INPUT(r5, RA0);
+	OCB_INPUT(r6, RA1);
+	OCB_INPUT(r7, RA2);
+	OCB_INPUT(r8, RA3);
+	OCB_INPUT(r9, RB0);
+	OCB_INPUT(r10, RB1);
+	OCB_INPUT(r11, RB2);
+	OCB_INPUT(ip, RB3);
+#undef OCB_INPUT
+
+	sub r1, r1, #(8*16);
+	vst1.8 {RT0}, [r3];
+	mov r2, r1;
+
+	bl __serpent_dec_blk8;
+
+	/* Checksum_i = Checksum_{i-1} xor P_i  */
+	vld1.8 {RA4}, [r4];
+
+	vld1.8 {RT0, RT1}, [r1]!;
+	veor RA0, RA0, RT0;
+	veor RA1, RA1, RT1;
+	vld1.8 {RT2, RT3}, [r1]!;
+	veor RA4, RA4, RA0;
+	vst1.8 {RA0, RA1}, [r2]!;
+	veor RA4, RA4, RA1;
+	veor RA2, RA2, RT2;
+	veor RA3, RA3, RT3;
+	vld1.8 {RT0, RT1}, [r1]!;
+	veor RA4, RA4, RA2;
+	vst1.8 {RA2, RA3}, [r2]!;
+	veor RA4, RA4, RA3;
+	veor RB0, RB0, RT0;
+	veor RB1, RB1, RT1;
+	vld1.8 {RT2, RT3}, [r1]!;
+	veor RA4, RA4, RB0;
+	vst1.8 {RB0, RB1}, [r2]!;
+	veor RA4, RA4, RB1;
+	veor RB2, RB2, RT2;
+	veor RB3, RB3, RT3;
+	veor RA4, RA4, RB2;
+	vst1.8 {RB2, RB3}, [r2]!;
+
+	veor RA4, RA4, RB3;
+	vst1.8 {RA4}, [r4];
+
+	vpop {RA4-RB2};
+
+	/* clear the used registers */
+	veor RB4, RB4;
+
+	pop {r4-r11, ip, pc};
+.size _gcry_serpent_neon_ocb_dec,.-_gcry_serpent_neon_ocb_dec;
+
+.align 3
+.globl _gcry_serpent_neon_ocb_auth
+.type _gcry_serpent_neon_ocb_auth,%function;
+_gcry_serpent_neon_ocb_auth:
+	/* input:
+	 *	r0  : ctx, CTX
+	 *	r1  : abuf (8 blocks)
+	 *	r2  : offset
+	 *	r3  : checksum
+	 *	sp+0: L pointers (void *L[8])
+	 */
+
+	push {r5-r11, ip, lr};
+	ldr lr, [sp, #(9*4)];
+
+	vpush {RA4-RB2};
+
+	vld1.8 {RT0}, [r2];
+
+	/* Load L pointers */
+	ldm lr!, {r5, r6, r7, r8};
+	ldm lr, {r9, r10, r11, ip};
+
+	/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	/* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i)  */
+
+	vld1.8 {RA0, RA1}, [r1]!;
+	vld1.8 {RA2, RA3}, [r1]!;
+	vld1.8 {RB0, RB1}, [r1]!;
+	vld1.8 {RB2, RB3}, [r1];
+
+#define OCB_INPUT(lreg, vreg) \
+	  vld1.8 {RT3}, [lreg]; \
+	  veor RT0, RT3; \
+	  veor vreg, RT0;
+
+	OCB_INPUT(r5, RA0);
+	OCB_INPUT(r6, RA1);
+	OCB_INPUT(r7, RA2);
+	OCB_INPUT(r8, RA3);
+	OCB_INPUT(r9, RB0);
+	OCB_INPUT(r10, RB1);
+	OCB_INPUT(r11, RB2);
+	OCB_INPUT(ip, RB3);
+#undef OCB_INPUT
+
+	vst1.8 {RT0}, [r2];
+
+	bl __serpent_enc_blk8;
+
+	/* Sum = Sum xor (xor of the eight enciphered blocks)  */
+	vld1.8 {RT0}, [r3];
+
+	veor RA4, RB4;
+	veor RA1, RB1;
+	veor RA2, RB2;
+	veor RA0, RB0;
+
+	veor RA2, RT0;
+	veor RA1, RA4;
+	veor RA0, RA2;
+
+	veor RA0, RA1;
+
+	vst1.8 {RA0}, [r3];
+
+	vpop {RA4-RB2};
+
+	/* clear the used registers */
+	veor RA3, RA3;
+	veor RB3, RB3;
+
+	pop {r5-r11, ip, pc};
+.size _gcry_serpent_neon_ocb_auth,.-_gcry_serpent_neon_ocb_auth;
+
 #endif
diff --git a/cipher/serpent-avx2-amd64.S b/cipher/serpent-avx2-amd64.S
index 3f59f06..2902dab 100644
--- a/cipher/serpent-avx2-amd64.S
+++ b/cipher/serpent-avx2-amd64.S
@@ -1,6 +1,6 @@
 /* serpent-avx2-amd64.S  -  AVX2 implementation of Serpent cipher
  *
- * Copyright (C) 2013 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ * Copyright (C) 2013-2015 Jussi Kivilinna <jussi.kivilinna at iki.fi>
  *
  * This file is part of Libgcrypt.
  *
@@ -808,6 +808,311 @@ _gcry_serpent_avx2_cfb_dec:
 	ret
 ELF(.size _gcry_serpent_avx2_cfb_dec,.-_gcry_serpent_avx2_cfb_dec;)
 
+.align 8
+.globl _gcry_serpent_avx2_ocb_enc
+ELF(.type _gcry_serpent_avx2_ocb_enc,@function;)
+
+_gcry_serpent_avx2_ocb_enc:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (16 blocks)
+	 *	%rdx: src (16 blocks)
+	 *	%rcx: offset
+	 *	%r8 : checksum
+	 *	%r9 : L pointers (void *L[16])
+	 */
+
+	vzeroupper;
+
+	subq $(4 * 8), %rsp;
+
+	movq %r10, (0 * 8)(%rsp);
+	movq %r11, (1 * 8)(%rsp);
+	movq %r12, (2 * 8)(%rsp);
+	movq %r13, (3 * 8)(%rsp);
+
+	vmovdqu (%rcx), RTMP0x;
+	vmovdqu (%r8), RTMP1x;
+
+	/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	/* Checksum_i = Checksum_{i-1} xor P_i  */
+	/* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
+
+#define OCB_INPUT(n, l0reg, l1reg, yreg) \
+	  vmovdqu (n * 32)(%rdx), yreg; \
+	  vpxor (l0reg), RTMP0x, RNOTx; \
+	  vpxor (l1reg), RNOTx, RTMP0x; \
+	  vinserti128 $1, RTMP0x, RNOT, RNOT; \
+	  vpxor yreg, RTMP1, RTMP1; \
+	  vpxor yreg, RNOT, yreg; \
+	  vmovdqu RNOT, (n * 32)(%rsi);
+
+	movq (0 * 8)(%r9), %r10;
+	movq (1 * 8)(%r9), %r11;
+	movq (2 * 8)(%r9), %r12;
+	movq (3 * 8)(%r9), %r13;
+	OCB_INPUT(0, %r10, %r11, RA0);
+	OCB_INPUT(1, %r12, %r13, RA1);
+	movq (4 * 8)(%r9), %r10;
+	movq (5 * 8)(%r9), %r11;
+	movq (6 * 8)(%r9), %r12;
+	movq (7 * 8)(%r9), %r13;
+	OCB_INPUT(2, %r10, %r11, RA2);
+	OCB_INPUT(3, %r12, %r13, RA3);
+	movq (8 * 8)(%r9), %r10;
+	movq (9 * 8)(%r9), %r11;
+	movq (10 * 8)(%r9), %r12;
+	movq (11 * 8)(%r9), %r13;
+	OCB_INPUT(4, %r10, %r11, RB0);
+	OCB_INPUT(5, %r12, %r13, RB1);
+	movq (12 * 8)(%r9), %r10;
+	movq (13 * 8)(%r9), %r11;
+	movq (14 * 8)(%r9), %r12;
+	movq (15 * 8)(%r9), %r13;
+	OCB_INPUT(6, %r10, %r11, RB2);
+	OCB_INPUT(7, %r12, %r13, RB3);
+#undef OCB_INPUT
+
+	vextracti128 $1, RTMP1, RNOTx;
+	vmovdqu RTMP0x, (%rcx);
+	vpxor RNOTx, RTMP1x, RTMP1x;
+	vmovdqu RTMP1x, (%r8);
+
+	movq (0 * 8)(%rsp), %r10;
+	movq (1 * 8)(%rsp), %r11;
+	movq (2 * 8)(%rsp), %r12;
+	movq (3 * 8)(%rsp), %r13;
+
+	call __serpent_enc_blk16;
+
+	addq $(4 * 8), %rsp;
+
+	vpxor (0 * 32)(%rsi), RA4, RA4;
+	vpxor (1 * 32)(%rsi), RA1, RA1;
+	vpxor (2 * 32)(%rsi), RA2, RA2;
+	vpxor (3 * 32)(%rsi), RA0, RA0;
+	vpxor (4 * 32)(%rsi), RB4, RB4;
+	vpxor (5 * 32)(%rsi), RB1, RB1;
+	vpxor (6 * 32)(%rsi), RB2, RB2;
+	vpxor (7 * 32)(%rsi), RB0, RB0;
+
+	vmovdqu RA4, (0 * 32)(%rsi);
+	vmovdqu RA1, (1 * 32)(%rsi);
+	vmovdqu RA2, (2 * 32)(%rsi);
+	vmovdqu RA0, (3 * 32)(%rsi);
+	vmovdqu RB4, (4 * 32)(%rsi);
+	vmovdqu RB1, (5 * 32)(%rsi);
+	vmovdqu RB2, (6 * 32)(%rsi);
+	vmovdqu RB0, (7 * 32)(%rsi);
+
+	vzeroall;
+
+	ret;
+ELF(.size _gcry_serpent_avx2_ocb_enc,.-_gcry_serpent_avx2_ocb_enc;)
+
+.align 8
+.globl _gcry_serpent_avx2_ocb_dec
+ELF(.type _gcry_serpent_avx2_ocb_dec,@function;)
+
+_gcry_serpent_avx2_ocb_dec:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (16 blocks)
+	 *	%rdx: src (16 blocks)
+	 *	%rcx: offset
+	 *	%r8 : checksum
+	 *	%r9 : L pointers (void *L[16])
+	 */
+
+	vzeroupper;
+
+	subq $(4 * 8), %rsp;
+
+	movq %r10, (0 * 8)(%rsp);
+	movq %r11, (1 * 8)(%rsp);
+	movq %r12, (2 * 8)(%rsp);
+	movq %r13, (3 * 8)(%rsp);
+
+	vmovdqu (%rcx), RTMP0x;
+
+	/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	/* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i)  */
+
+#define OCB_INPUT(n, l0reg, l1reg, yreg) \
+	  vmovdqu (n * 32)(%rdx), yreg; \
+	  vpxor (l0reg), RTMP0x, RNOTx; \
+	  vpxor (l1reg), RNOTx, RTMP0x; \
+	  vinserti128 $1, RTMP0x, RNOT, RNOT; \
+	  vpxor yreg, RNOT, yreg; \
+	  vmovdqu RNOT, (n * 32)(%rsi);
+
+	movq (0 * 8)(%r9), %r10;
+	movq (1 * 8)(%r9), %r11;
+	movq (2 * 8)(%r9), %r12;
+	movq (3 * 8)(%r9), %r13;
+	OCB_INPUT(0, %r10, %r11, RA0);
+	OCB_INPUT(1, %r12, %r13, RA1);
+	movq (4 * 8)(%r9), %r10;
+	movq (5 * 8)(%r9), %r11;
+	movq (6 * 8)(%r9), %r12;
+	movq (7 * 8)(%r9), %r13;
+	OCB_INPUT(2, %r10, %r11, RA2);
+	OCB_INPUT(3, %r12, %r13, RA3);
+	movq (8 * 8)(%r9), %r10;
+	movq (9 * 8)(%r9), %r11;
+	movq (10 * 8)(%r9), %r12;
+	movq (11 * 8)(%r9), %r13;
+	OCB_INPUT(4, %r10, %r11, RB0);
+	OCB_INPUT(5, %r12, %r13, RB1);
+	movq (12 * 8)(%r9), %r10;
+	movq (13 * 8)(%r9), %r11;
+	movq (14 * 8)(%r9), %r12;
+	movq (15 * 8)(%r9), %r13;
+	OCB_INPUT(6, %r10, %r11, RB2);
+	OCB_INPUT(7, %r12, %r13, RB3);
+#undef OCB_INPUT
+
+	vmovdqu RTMP0x, (%rcx);
+
+	movq (0 * 8)(%rsp), %r10;
+	movq (1 * 8)(%rsp), %r11;
+	movq (2 * 8)(%rsp), %r12;
+	movq (3 * 8)(%rsp), %r13;
+
+	call __serpent_dec_blk16;
+
+	addq $(4 * 8), %rsp;
+
+	vmovdqu (%r8), RTMP1x;
+
+	vpxor (0 * 32)(%rsi), RA0, RA0;
+	vpxor (1 * 32)(%rsi), RA1, RA1;
+	vpxor (2 * 32)(%rsi), RA2, RA2;
+	vpxor (3 * 32)(%rsi), RA3, RA3;
+	vpxor (4 * 32)(%rsi), RB0, RB0;
+	vpxor (5 * 32)(%rsi), RB1, RB1;
+	vpxor (6 * 32)(%rsi), RB2, RB2;
+	vpxor (7 * 32)(%rsi), RB3, RB3;
+
+	/* Checksum_i = Checksum_{i-1} xor P_i  */
+
+	vmovdqu RA0, (0 * 32)(%rsi);
+	vpxor RA0, RTMP1, RTMP1;
+	vmovdqu RA1, (1 * 32)(%rsi);
+	vpxor RA1, RTMP1, RTMP1;
+	vmovdqu RA2, (2 * 32)(%rsi);
+	vpxor RA2, RTMP1, RTMP1;
+	vmovdqu RA3, (3 * 32)(%rsi);
+	vpxor RA3, RTMP1, RTMP1;
+	vmovdqu RB0, (4 * 32)(%rsi);
+	vpxor RB0, RTMP1, RTMP1;
+	vmovdqu RB1, (5 * 32)(%rsi);
+	vpxor RB1, RTMP1, RTMP1;
+	vmovdqu RB2, (6 * 32)(%rsi);
+	vpxor RB2, RTMP1, RTMP1;
+	vmovdqu RB3, (7 * 32)(%rsi);
+	vpxor RB3, RTMP1, RTMP1;
+
+	vextracti128 $1, RTMP1, RNOTx;
+	vpxor RNOTx, RTMP1x, RTMP1x;
+	vmovdqu RTMP1x, (%r8);
+
+	vzeroall;
+
+	ret;
+ELF(.size _gcry_serpent_avx2_ocb_dec,.-_gcry_serpent_avx2_ocb_dec;)
+
+.align 8
+.globl _gcry_serpent_avx2_ocb_auth
+ELF(.type _gcry_serpent_avx2_ocb_auth,@function;)
+
+_gcry_serpent_avx2_ocb_auth:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: abuf (16 blocks)
+	 *	%rdx: offset
+	 *	%rcx: checksum
+	 *	%r8 : L pointers (void *L[16])
+	 */
+
+	vzeroupper;
+
+	subq $(4 * 8), %rsp;
+
+	movq %r10, (0 * 8)(%rsp);
+	movq %r11, (1 * 8)(%rsp);
+	movq %r12, (2 * 8)(%rsp);
+	movq %r13, (3 * 8)(%rsp);
+
+	vmovdqu (%rdx), RTMP0x;
+
+	/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	/* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i)  */
+
+#define OCB_INPUT(n, l0reg, l1reg, yreg) \
+	  vmovdqu (n * 32)(%rsi), yreg; \
+	  vpxor (l0reg), RTMP0x, RNOTx; \
+	  vpxor (l1reg), RNOTx, RTMP0x; \
+	  vinserti128 $1, RTMP0x, RNOT, RNOT; \
+	  vpxor yreg, RNOT, yreg;
+
+	movq (0 * 8)(%r8), %r10;
+	movq (1 * 8)(%r8), %r11;
+	movq (2 * 8)(%r8), %r12;
+	movq (3 * 8)(%r8), %r13;
+	OCB_INPUT(0, %r10, %r11, RA0);
+	OCB_INPUT(1, %r12, %r13, RA1);
+	movq (4 * 8)(%r8), %r10;
+	movq (5 * 8)(%r8), %r11;
+	movq (6 * 8)(%r8), %r12;
+	movq (7 * 8)(%r8), %r13;
+	OCB_INPUT(2, %r10, %r11, RA2);
+	OCB_INPUT(3, %r12, %r13, RA3);
+	movq (8 * 8)(%r8), %r10;
+	movq (9 * 8)(%r8), %r11;
+	movq (10 * 8)(%r8), %r12;
+	movq (11 * 8)(%r8), %r13;
+	OCB_INPUT(4, %r10, %r11, RB0);
+	OCB_INPUT(5, %r12, %r13, RB1);
+	movq (12 * 8)(%r8), %r10;
+	movq (13 * 8)(%r8), %r11;
+	movq (14 * 8)(%r8), %r12;
+	movq (15 * 8)(%r8), %r13;
+	OCB_INPUT(6, %r10, %r11, RB2);
+	OCB_INPUT(7, %r12, %r13, RB3);
+#undef OCB_INPUT
+
+	vmovdqu RTMP0x, (%rdx);
+
+	movq (0 * 8)(%rsp), %r10;
+	movq (1 * 8)(%rsp), %r11;
+	movq (2 * 8)(%rsp), %r12;
+	movq (3 * 8)(%rsp), %r13;
+
+	call __serpent_enc_blk16;
+
+	addq $(4 * 8), %rsp;
+
+	vpxor RA4, RB4, RA4;
+	vpxor RA1, RB1, RA1;
+	vpxor RA2, RB2, RA2;
+	vpxor RA0, RB0, RA0;
+
+	vpxor RA4, RA1, RA1;
+	vpxor RA2, RA0, RA0;
+
+	vpxor RA1, RA0, RTMP1;
+
+	vextracti128 $1, RTMP1, RNOTx;
+	vpxor (%rcx), RTMP1x, RTMP1x;
+	vpxor RNOTx, RTMP1x, RTMP1x;
+	vmovdqu RTMP1x, (%rcx);
+
+	vzeroall;
+
+	ret;
+ELF(.size _gcry_serpent_avx2_ocb_auth,.-_gcry_serpent_avx2_ocb_auth;)
+
 .data
 .align 16
 
diff --git a/cipher/serpent-sse2-amd64.S b/cipher/serpent-sse2-amd64.S
index adbf4e2..b149af2 100644
--- a/cipher/serpent-sse2-amd64.S
+++ b/cipher/serpent-sse2-amd64.S
@@ -1,6 +1,6 @@
 /* serpent-sse2-amd64.S  -  SSE2 implementation of Serpent cipher
  *
- * Copyright (C) 2013 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ * Copyright (C) 2013-2015 Jussi Kivilinna <jussi.kivilinna at iki.fi>
  *
  * This file is part of Libgcrypt.
  *
@@ -866,5 +866,310 @@ _gcry_serpent_sse2_cfb_dec:
 	ret
 ELF(.size _gcry_serpent_sse2_cfb_dec,.-_gcry_serpent_sse2_cfb_dec;)
 
+.align 8
+.globl _gcry_serpent_sse2_ocb_enc
+ELF(.type _gcry_serpent_sse2_ocb_enc,@function;)
+
+_gcry_serpent_sse2_ocb_enc:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (8 blocks)
+	 *	%rdx: src (8 blocks)
+	 *	%rcx: offset
+	 *	%r8 : checksum
+	 *	%r9 : L pointers (void *L[8])
+	 */
+
+	subq $(4 * 8), %rsp;
+
+	movq %r10, (0 * 8)(%rsp);
+	movq %r11, (1 * 8)(%rsp);
+	movq %r12, (2 * 8)(%rsp);
+	movq %r13, (3 * 8)(%rsp);
+
+	movdqu (%rcx), RTMP0;
+	movdqu (%r8), RTMP1;
+
+	/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	/* Checksum_i = Checksum_{i-1} xor P_i  */
+	/* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
+
+#define OCB_INPUT(n, lreg, xreg) \
+	  movdqu (n * 16)(%rdx), xreg; \
+	  movdqu (lreg), RNOT; \
+	  pxor RNOT, RTMP0; \
+	  pxor xreg, RTMP1; \
+	  pxor RTMP0, xreg; \
+	  movdqu RTMP0, (n * 16)(%rsi);
+	movq (0 * 8)(%r9), %r10;
+	movq (1 * 8)(%r9), %r11;
+	movq (2 * 8)(%r9), %r12;
+	movq (3 * 8)(%r9), %r13;
+	OCB_INPUT(0, %r10, RA0);
+	OCB_INPUT(1, %r11, RA1);
+	OCB_INPUT(2, %r12, RA2);
+	OCB_INPUT(3, %r13, RA3);
+	movq (4 * 8)(%r9), %r10;
+	movq (5 * 8)(%r9), %r11;
+	movq (6 * 8)(%r9), %r12;
+	movq (7 * 8)(%r9), %r13;
+	OCB_INPUT(4, %r10, RB0);
+	OCB_INPUT(5, %r11, RB1);
+	OCB_INPUT(6, %r12, RB2);
+	OCB_INPUT(7, %r13, RB3);
+#undef OCB_INPUT
+
+	movdqu RTMP0, (%rcx);
+	movdqu RTMP1, (%r8);
+
+	movq (0 * 8)(%rsp), %r10;
+	movq (1 * 8)(%rsp), %r11;
+	movq (2 * 8)(%rsp), %r12;
+	movq (3 * 8)(%rsp), %r13;
+
+	call __serpent_enc_blk8;
+
+	addq $(4 * 8), %rsp;
+
+	pxor_u((0 * 16)(%rsi), RA4, RTMP0);
+	pxor_u((1 * 16)(%rsi), RA1, RTMP0);
+	pxor_u((2 * 16)(%rsi), RA2, RTMP0);
+	pxor_u((3 * 16)(%rsi), RA0, RTMP0);
+	pxor_u((4 * 16)(%rsi), RB4, RTMP0);
+	pxor_u((5 * 16)(%rsi), RB1, RTMP0);
+	pxor_u((6 * 16)(%rsi), RB2, RTMP0);
+	pxor_u((7 * 16)(%rsi), RB0, RTMP0);
+
+	movdqu RA4, (0 * 16)(%rsi);
+	movdqu RA1, (1 * 16)(%rsi);
+	movdqu RA2, (2 * 16)(%rsi);
+	movdqu RA0, (3 * 16)(%rsi);
+	movdqu RB4, (4 * 16)(%rsi);
+	movdqu RB1, (5 * 16)(%rsi);
+	movdqu RB2, (6 * 16)(%rsi);
+	movdqu RB0, (7 * 16)(%rsi);
+
+	/* clear the used registers */
+	pxor RA0, RA0;
+	pxor RA1, RA1;
+	pxor RA2, RA2;
+	pxor RA3, RA3;
+	pxor RA4, RA4;
+	pxor RB0, RB0;
+	pxor RB1, RB1;
+	pxor RB2, RB2;
+	pxor RB3, RB3;
+	pxor RB4, RB4;
+	pxor RTMP0, RTMP0;
+	pxor RTMP1, RTMP1;
+	pxor RTMP2, RTMP2;
+	pxor RNOT, RNOT;
+
+	ret;
+ELF(.size _gcry_serpent_sse2_ocb_enc,.-_gcry_serpent_sse2_ocb_enc;)
+
+.align 8
+.globl _gcry_serpent_sse2_ocb_dec
+ELF(.type _gcry_serpent_sse2_ocb_dec,@function;)
+
+_gcry_serpent_sse2_ocb_dec:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (8 blocks)
+	 *	%rdx: src (8 blocks)
+	 *	%rcx: offset
+	 *	%r8 : checksum
+	 *	%r9 : L pointers (void *L[8])
+	 */
+
+	subq $(4 * 8), %rsp;
+
+	movq %r10, (0 * 8)(%rsp);
+	movq %r11, (1 * 8)(%rsp);
+	movq %r12, (2 * 8)(%rsp);
+	movq %r13, (3 * 8)(%rsp);
+
+	movdqu (%rcx), RTMP0;
+
+	/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	/* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i)  */
+
+#define OCB_INPUT(n, lreg, xreg) \
+	  movdqu (n * 16)(%rdx), xreg; \
+	  movdqu (lreg), RNOT; \
+	  pxor RNOT, RTMP0; \
+	  pxor RTMP0, xreg; \
+	  movdqu RTMP0, (n * 16)(%rsi);
+	movq (0 * 8)(%r9), %r10;
+	movq (1 * 8)(%r9), %r11;
+	movq (2 * 8)(%r9), %r12;
+	movq (3 * 8)(%r9), %r13;
+	OCB_INPUT(0, %r10, RA0);
+	OCB_INPUT(1, %r11, RA1);
+	OCB_INPUT(2, %r12, RA2);
+	OCB_INPUT(3, %r13, RA3);
+	movq (4 * 8)(%r9), %r10;
+	movq (5 * 8)(%r9), %r11;
+	movq (6 * 8)(%r9), %r12;
+	movq (7 * 8)(%r9), %r13;
+	OCB_INPUT(4, %r10, RB0);
+	OCB_INPUT(5, %r11, RB1);
+	OCB_INPUT(6, %r12, RB2);
+	OCB_INPUT(7, %r13, RB3);
+#undef OCB_INPUT
+
+	movdqu RTMP0, (%rcx);
+
+	movq (0 * 8)(%rsp), %r10;
+	movq (1 * 8)(%rsp), %r11;
+	movq (2 * 8)(%rsp), %r12;
+	movq (3 * 8)(%rsp), %r13;
+
+	call __serpent_dec_blk8;
+
+	addq $(4 * 8), %rsp;
+
+	movdqu (%r8), RTMP0;
+
+	pxor_u((0 * 16)(%rsi), RA0, RTMP1);
+	pxor_u((1 * 16)(%rsi), RA1, RTMP1);
+	pxor_u((2 * 16)(%rsi), RA2, RTMP1);
+	pxor_u((3 * 16)(%rsi), RA3, RTMP1);
+	pxor_u((4 * 16)(%rsi), RB0, RTMP1);
+	pxor_u((5 * 16)(%rsi), RB1, RTMP1);
+	pxor_u((6 * 16)(%rsi), RB2, RTMP1);
+	pxor_u((7 * 16)(%rsi), RB3, RTMP1);
+
+	/* Checksum_i = Checksum_{i-1} xor P_i  */
+
+	movdqu RA0, (0 * 16)(%rsi);
+	pxor RA0, RTMP0;
+	movdqu RA1, (1 * 16)(%rsi);
+	pxor RA1, RTMP0;
+	movdqu RA2, (2 * 16)(%rsi);
+	pxor RA2, RTMP0;
+	movdqu RA3, (3 * 16)(%rsi);
+	pxor RA3, RTMP0;
+	movdqu RB0, (4 * 16)(%rsi);
+	pxor RB0, RTMP0;
+	movdqu RB1, (5 * 16)(%rsi);
+	pxor RB1, RTMP0;
+	movdqu RB2, (6 * 16)(%rsi);
+	pxor RB2, RTMP0;
+	movdqu RB3, (7 * 16)(%rsi);
+	pxor RB3, RTMP0;
+
+	movdqu RTMP0, (%r8);
+
+	/* clear the used registers */
+	pxor RA0, RA0;
+	pxor RA1, RA1;
+	pxor RA2, RA2;
+	pxor RA3, RA3;
+	pxor RA4, RA4;
+	pxor RB0, RB0;
+	pxor RB1, RB1;
+	pxor RB2, RB2;
+	pxor RB3, RB3;
+	pxor RB4, RB4;
+	pxor RTMP0, RTMP0;
+	pxor RTMP1, RTMP1;
+	pxor RTMP2, RTMP2;
+	pxor RNOT, RNOT;
+
+	ret;
+ELF(.size _gcry_serpent_sse2_ocb_dec,.-_gcry_serpent_sse2_ocb_dec;)
+
+.align 8
+.globl _gcry_serpent_sse2_ocb_auth
+ELF(.type _gcry_serpent_sse2_ocb_auth,@function;)
+
+_gcry_serpent_sse2_ocb_auth:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: abuf (8 blocks)
+	 *	%rdx: offset
+	 *	%rcx: checksum
+	 *	%r8 : L pointers (void *L[8])
+	 */
+
+	subq $(4 * 8), %rsp;
+
+	movq %r10, (0 * 8)(%rsp);
+	movq %r11, (1 * 8)(%rsp);
+	movq %r12, (2 * 8)(%rsp);
+	movq %r13, (3 * 8)(%rsp);
+
+	movdqu (%rdx), RTMP0;
+
+	/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	/* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i)  */
+
+#define OCB_INPUT(n, lreg, xreg) \
+	  movdqu (n * 16)(%rsi), xreg; \
+	  movdqu (lreg), RNOT; \
+	  pxor RNOT, RTMP0; \
+	  pxor RTMP0, xreg;
+	movq (0 * 8)(%r8), %r10;
+	movq (1 * 8)(%r8), %r11;
+	movq (2 * 8)(%r8), %r12;
+	movq (3 * 8)(%r8), %r13;
+	OCB_INPUT(0, %r10, RA0);
+	OCB_INPUT(1, %r11, RA1);
+	OCB_INPUT(2, %r12, RA2);
+	OCB_INPUT(3, %r13, RA3);
+	movq (4 * 8)(%r8), %r10;
+	movq (5 * 8)(%r8), %r11;
+	movq (6 * 8)(%r8), %r12;
+	movq (7 * 8)(%r8), %r13;
+	OCB_INPUT(4, %r10, RB0);
+	OCB_INPUT(5, %r11, RB1);
+	OCB_INPUT(6, %r12, RB2);
+	OCB_INPUT(7, %r13, RB3);
+#undef OCB_INPUT
+
+	movdqu RTMP0, (%rdx);
+
+	movq (0 * 8)(%rsp), %r10;
+	movq (1 * 8)(%rsp), %r11;
+	movq (2 * 8)(%rsp), %r12;
+	movq (3 * 8)(%rsp), %r13;
+
+	call __serpent_enc_blk8;
+
+	addq $(4 * 8), %rsp;
+
+	movdqu (%rcx), RTMP0;
+	pxor RB4, RA4;
+	pxor RB1, RA1;
+	pxor RB2, RA2;
+	pxor RB0, RA0;
+
+	pxor RTMP0, RA2;
+	pxor RA4, RA1;
+	pxor RA2, RA0;
+
+	pxor RA1, RA0;
+	movdqu RA0, (%rcx);
+
+	/* clear the used registers */
+	pxor RA0, RA0;
+	pxor RA1, RA1;
+	pxor RA2, RA2;
+	pxor RA3, RA3;
+	pxor RA4, RA4;
+	pxor RB0, RB0;
+	pxor RB1, RB1;
+	pxor RB2, RB2;
+	pxor RB3, RB3;
+	pxor RB4, RB4;
+	pxor RTMP0, RTMP0;
+	pxor RTMP1, RTMP1;
+	pxor RTMP2, RTMP2;
+	pxor RNOT, RNOT;
+
+	ret;
+ELF(.size _gcry_serpent_sse2_ocb_auth,.-_gcry_serpent_sse2_ocb_auth;)
+
 #endif /*defined(USE_SERPENT)*/
 #endif /*__x86_64*/
diff --git a/cipher/serpent.c b/cipher/serpent.c
index 7d0e112..eb491aa 100644
--- a/cipher/serpent.c
+++ b/cipher/serpent.c
@@ -29,6 +29,7 @@
 #include "cipher.h"
 #include "bithelp.h"
 #include "bufhelp.h"
+#include "cipher-internal.h"
 #include "cipher-selftest.h"
 
 
@@ -118,10 +119,30 @@ extern void _gcry_serpent_sse2_cfb_dec(serpent_context_t *ctx,
 				       unsigned char *out,
 				       const unsigned char *in,
 				       unsigned char *iv) ASM_FUNC_ABI;
+
+extern void _gcry_serpent_sse2_ocb_enc(serpent_context_t *ctx,
+				       unsigned char *out,
+				       const unsigned char *in,
+				       unsigned char *offset,
+				       unsigned char *checksum,
+				       const void *Ls[8]) ASM_FUNC_ABI;
+
+extern void _gcry_serpent_sse2_ocb_dec(serpent_context_t *ctx,
+				       unsigned char *out,
+				       const unsigned char *in,
+				       unsigned char *offset,
+				       unsigned char *checksum,
+				       const void *Ls[8]) ASM_FUNC_ABI;
+
+extern void _gcry_serpent_sse2_ocb_auth(serpent_context_t *ctx,
+					const unsigned char *abuf,
+					unsigned char *offset,
+					unsigned char *checksum,
+					const void *Ls[8]) ASM_FUNC_ABI;
 #endif
 
 #ifdef USE_AVX2
-/* Assembler implementations of Serpent using SSE2.  Process 16 block in
+/* Assembler implementations of Serpent using AVX2.  Process 16 blocks in
    parallel.
  */
 extern void _gcry_serpent_avx2_ctr_enc(serpent_context_t *ctx,
@@ -138,6 +159,26 @@ extern void _gcry_serpent_avx2_cfb_dec(serpent_context_t *ctx,
 				       unsigned char *out,
 				       const unsigned char *in,
 				       unsigned char *iv) ASM_FUNC_ABI;
+
+extern void _gcry_serpent_avx2_ocb_enc(serpent_context_t *ctx,
+				       unsigned char *out,
+				       const unsigned char *in,
+				       unsigned char *offset,
+				       unsigned char *checksum,
+				       const void *Ls[16]) ASM_FUNC_ABI;
+
+extern void _gcry_serpent_avx2_ocb_dec(serpent_context_t *ctx,
+				       unsigned char *out,
+				       const unsigned char *in,
+				       unsigned char *offset,
+				       unsigned char *checksum,
+				       const void *Ls[16]) ASM_FUNC_ABI;
+
+extern void _gcry_serpent_avx2_ocb_auth(serpent_context_t *ctx,
+					const unsigned char *abuf,
+					unsigned char *offset,
+					unsigned char *checksum,
+					const void *Ls[16]) ASM_FUNC_ABI;
 #endif
 
 #ifdef USE_NEON
@@ -158,6 +199,26 @@ extern void _gcry_serpent_neon_cfb_dec(serpent_context_t *ctx,
 				       unsigned char *out,
 				       const unsigned char *in,
 				       unsigned char *iv);
+
+extern void _gcry_serpent_neon_ocb_enc(serpent_context_t *ctx,
+				       unsigned char *out,
+				       const unsigned char *in,
+				       unsigned char *offset,
+				       unsigned char *checksum,
+				       const void *Ls[8]);
+
+extern void _gcry_serpent_neon_ocb_dec(serpent_context_t *ctx,
+				       unsigned char *out,
+				       const unsigned char *in,
+				       unsigned char *offset,
+				       unsigned char *checksum,
+				       const void *Ls[8]);
+
+extern void _gcry_serpent_neon_ocb_auth(serpent_context_t *ctx,
+					const unsigned char *abuf,
+					unsigned char *offset,
+					unsigned char *checksum,
+					const void *Ls[8]);
 #endif
 
 
@@ -1165,6 +1226,362 @@ _gcry_serpent_cfb_dec(void *context, unsigned char *iv,
   _gcry_burn_stack(burn_stack_depth);
 }
 
+static inline const unsigned char *
+get_l (gcry_cipher_hd_t c, unsigned char *l_tmp, u64 i)
+{
+  unsigned int ntz = _gcry_ctz64 (i);
+
+  if (ntz < OCB_L_TABLE_SIZE)
+      return c->u_mode.ocb.L[ntz];
+  else
+      return _gcry_cipher_ocb_get_l (c, l_tmp, i);
+}
+
+/* Bulk encryption/decryption of complete blocks in OCB mode. */
+void
+_gcry_serpent_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
+			const void *inbuf_arg, size_t nblocks, int encrypt)
+{
+  serpent_context_t *ctx = (void *)&c->context.c;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+  unsigned char l_tmp[sizeof(serpent_block_t)];
+  const unsigned char *l;
+  int burn_stack_depth = 2 * sizeof (serpent_block_t);
+  u64 blkn = c->u_mode.ocb.data_nblocks;
+
+#ifdef USE_AVX2
+  if (ctx->use_avx2)
+    {
+      int did_use_avx2 = 0;
+      const void *Ls[16];
+      int i;
+
+      /* Process data in 16 block chunks. */
+      while (nblocks >= 16)
+	{
+	  /* l_tmp will be used only every 65536-th block. */
+	  for (i = 0; i < 16; i += 4)
+	    {
+	      Ls[i + 0] = get_l(c, l_tmp, blkn + 1);
+	      Ls[i + 1] = get_l(c, l_tmp, blkn + 2);
+	      Ls[i + 2] = get_l(c, l_tmp, blkn + 3);
+	      Ls[i + 3] = get_l(c, l_tmp, blkn + 4);
+	      blkn += 4;
+	    }
+
+	  if (encrypt)
+	    _gcry_serpent_avx2_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
+				      c->u_ctr.ctr, Ls);
+	  else
+	    _gcry_serpent_avx2_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
+				      c->u_ctr.ctr, Ls);
+
+	  nblocks -= 16;
+	  outbuf += 16 * sizeof(serpent_block_t);
+	  inbuf  += 16 * sizeof(serpent_block_t);
+	  did_use_avx2 = 1;
+	}
+
+      if (did_use_avx2)
+	{
+	  /* serpent-avx2 assembly code does not use stack */
+	  if (nblocks == 0)
+	    burn_stack_depth = 0;
+	}
+
+      /* Use generic code to handle smaller chunks... */
+    }
+#endif
+
+#ifdef USE_SSE2
+  {
+    int did_use_sse2 = 0;
+    const void *Ls[8];
+    int i;
+
+    /* Process data in 8 block chunks. */
+    while (nblocks >= 8)
+      {
+	/* l_tmp will be used only every 65536-th block. */
+	for (i = 0; i < 8; i += 4)
+	  {
+	    Ls[i + 0] = get_l(c, l_tmp, blkn + 1);
+	    Ls[i + 1] = get_l(c, l_tmp, blkn + 2);
+	    Ls[i + 2] = get_l(c, l_tmp, blkn + 3);
+	    Ls[i + 3] = get_l(c, l_tmp, blkn + 4);
+	    blkn += 4;
+	  }
+
+	if (encrypt)
+	  _gcry_serpent_sse2_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
+				      c->u_ctr.ctr, Ls);
+	else
+	  _gcry_serpent_sse2_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
+				      c->u_ctr.ctr, Ls);
+
+	nblocks -= 8;
+	outbuf += 8 * sizeof(serpent_block_t);
+	inbuf  += 8 * sizeof(serpent_block_t);
+	did_use_sse2 = 1;
+      }
+
+    if (did_use_sse2)
+      {
+	/* serpent-sse2 assembly code does not use stack */
+	if (nblocks == 0)
+	  burn_stack_depth = 0;
+      }
+
+    /* Use generic code to handle smaller chunks... */
+  }
+#endif
+
+#ifdef USE_NEON
+  if (ctx->use_neon)
+    {
+      int did_use_neon = 0;
+      const void *Ls[8];
+      int i;
+
+      /* Process data in 8 block chunks. */
+      while (nblocks >= 8)
+	{
+	  /* l_tmp will be used only every 65536-th block. */
+	  for (i = 0; i < 8; i += 4)
+	    {
+	      Ls[i + 0] = get_l(c, l_tmp, blkn + 1);
+	      Ls[i + 1] = get_l(c, l_tmp, blkn + 2);
+	      Ls[i + 2] = get_l(c, l_tmp, blkn + 3);
+	      Ls[i + 3] = get_l(c, l_tmp, blkn + 4);
+	      blkn += 4;
+	    }
+
+	  if (encrypt)
+	    _gcry_serpent_neon_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
+				       c->u_ctr.ctr, Ls);
+	  else
+	    _gcry_serpent_neon_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
+				       c->u_ctr.ctr, Ls);
+
+	  nblocks -= 8;
+	  outbuf += 8 * sizeof(serpent_block_t);
+	  inbuf  += 8 * sizeof(serpent_block_t);
+	  did_use_neon = 1;
+	}
+
+      if (did_use_neon)
+	{
+	  /* serpent-neon assembly code does not use stack */
+	  if (nblocks == 0)
+	    burn_stack_depth = 0;
+	}
+
+      /* Use generic code to handle smaller chunks... */
+    }
+#endif
+
+  if (encrypt)
+    {
+      for (; nblocks; nblocks--)
+	{
+	  l = get_l(c, l_tmp, ++blkn);
+
+	  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	  buf_xor_1 (c->u_iv.iv, l, sizeof(serpent_block_t));
+	  buf_cpy (l_tmp, inbuf, sizeof(serpent_block_t));
+	  /* Checksum_i = Checksum_{i-1} xor P_i  */
+	  buf_xor_1 (c->u_ctr.ctr, l_tmp, sizeof(serpent_block_t));
+	  /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
+	  buf_xor_1 (l_tmp, c->u_iv.iv, sizeof(serpent_block_t));
+	  serpent_encrypt_internal(ctx, l_tmp, l_tmp);
+	  buf_xor_1 (l_tmp, c->u_iv.iv, sizeof(serpent_block_t));
+	  buf_cpy (outbuf, l_tmp, sizeof(serpent_block_t));
+
+	  inbuf += sizeof(serpent_block_t);
+	  outbuf += sizeof(serpent_block_t);
+	}
+    }
+  else
+    {
+      for (; nblocks; nblocks--)
+	{
+	  l = get_l(c, l_tmp, ++blkn);
+
+	  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	  buf_xor_1 (c->u_iv.iv, l, sizeof(serpent_block_t));
+	  buf_cpy (l_tmp, inbuf, sizeof(serpent_block_t));
+	  /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i)  */
+	  buf_xor_1 (l_tmp, c->u_iv.iv, sizeof(serpent_block_t));
+	  serpent_decrypt_internal(ctx, l_tmp, l_tmp);
+	  buf_xor_1 (l_tmp, c->u_iv.iv, sizeof(serpent_block_t));
+	  /* Checksum_i = Checksum_{i-1} xor P_i  */
+	  buf_xor_1 (c->u_ctr.ctr, l_tmp, sizeof(serpent_block_t));
+	  buf_cpy (outbuf, l_tmp, sizeof(serpent_block_t));
+
+	  inbuf += sizeof(serpent_block_t);
+	  outbuf += sizeof(serpent_block_t);
+	}
+    }
+
+  c->u_mode.ocb.data_nblocks = blkn;
+
+  wipememory(&l_tmp, sizeof(l_tmp));
+
+  if (burn_stack_depth)
+    _gcry_burn_stack (burn_stack_depth + 4 * sizeof(void *));
+}
+
+/* Bulk authentication of complete blocks in OCB mode. */
+void
+_gcry_serpent_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
+			size_t nblocks)
+{
+  serpent_context_t *ctx = (void *)&c->context.c;
+  const unsigned char *abuf = abuf_arg;
+  unsigned char l_tmp[sizeof(serpent_block_t)];
+  const unsigned char *l;
+  int burn_stack_depth = 2 * sizeof(serpent_block_t);
+  u64 blkn = c->u_mode.ocb.aad_nblocks;
+
+#ifdef USE_AVX2
+  if (ctx->use_avx2)
+    {
+      int did_use_avx2 = 0;
+      const void *Ls[16];
+      int i;
+
+      /* Process data in 16 block chunks. */
+      while (nblocks >= 16)
+	{
+	  /* l_tmp will be used only every 65536-th block. */
+	  for (i = 0; i < 16; i += 4)
+	    {
+	      Ls[i + 0] = get_l(c, l_tmp, blkn + 1);
+	      Ls[i + 1] = get_l(c, l_tmp, blkn + 2);
+	      Ls[i + 2] = get_l(c, l_tmp, blkn + 3);
+	      Ls[i + 3] = get_l(c, l_tmp, blkn + 4);
+	      blkn += 4;
+	    }
+
+	  _gcry_serpent_avx2_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
+				      c->u_mode.ocb.aad_sum, Ls);
+
+	  nblocks -= 16;
+	  abuf += 16 * sizeof(serpent_block_t);
+	  did_use_avx2 = 1;
+	}
+
+      if (did_use_avx2)
+	{
+	  /* serpent-avx2 assembly code does not use stack */
+	  if (nblocks == 0)
+	    burn_stack_depth = 0;
+	}
+
+      /* Use generic code to handle smaller chunks... */
+    }
+#endif
+
+#ifdef USE_SSE2
+  {
+    int did_use_sse2 = 0;
+    const void *Ls[8];
+    int i;
+
+    /* Process data in 8 block chunks. */
+    while (nblocks >= 8)
+      {
+	/* l_tmp will be used only every 65536-th block. */
+	for (i = 0; i < 8; i += 4)
+	  {
+	    Ls[i + 0] = get_l(c, l_tmp, blkn + 1);
+	    Ls[i + 1] = get_l(c, l_tmp, blkn + 2);
+	    Ls[i + 2] = get_l(c, l_tmp, blkn + 3);
+	    Ls[i + 3] = get_l(c, l_tmp, blkn + 4);
+	    blkn += 4;
+	  }
+
+	_gcry_serpent_sse2_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
+				    c->u_mode.ocb.aad_sum, Ls);
+
+	nblocks -= 8;
+	abuf += 8 * sizeof(serpent_block_t);
+	did_use_sse2 = 1;
+      }
+
+    if (did_use_sse2)
+      {
+	/* serpent-sse2 assembly code does not use stack */
+	if (nblocks == 0)
+	  burn_stack_depth = 0;
+      }
+
+    /* Use generic code to handle smaller chunks... */
+  }
+#endif
+
+#ifdef USE_NEON
+  if (ctx->use_neon)
+    {
+      int did_use_neon = 0;
+      const void *Ls[8];
+      int i;
+
+      /* Process data in 8 block chunks. */
+      while (nblocks >= 8)
+	{
+	  /* l_tmp will be used only every 65536-th block. */
+	  for (i = 0; i < 8; i += 4)
+	    {
+	      Ls[i + 0] = get_l(c, l_tmp, blkn + 1);
+	      Ls[i + 1] = get_l(c, l_tmp, blkn + 2);
+	      Ls[i + 2] = get_l(c, l_tmp, blkn + 3);
+	      Ls[i + 3] = get_l(c, l_tmp, blkn + 4);
+	      blkn += 4;
+	    }
+
+	  _gcry_serpent_neon_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
+				      c->u_mode.ocb.aad_sum, Ls);
+
+	  nblocks -= 8;
+	  abuf += 8 * sizeof(serpent_block_t);
+	  did_use_neon = 1;
+	}
+
+      if (did_use_neon)
+	{
+	  /* serpent-neon assembly code does not use stack */
+	  if (nblocks == 0)
+	    burn_stack_depth = 0;
+	}
+
+      /* Use generic code to handle smaller chunks... */
+    }
+#endif
+
+  for (; nblocks; nblocks--)
+    {
+      l = get_l(c, l_tmp, ++blkn);
+
+      /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+      buf_xor_1 (c->u_mode.ocb.aad_offset, l, sizeof(serpent_block_t));
+      /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i)  */
+      buf_xor (l_tmp, c->u_mode.ocb.aad_offset, abuf, sizeof(serpent_block_t));
+      serpent_encrypt_internal(ctx, l_tmp, l_tmp);
+      buf_xor_1 (c->u_mode.ocb.aad_sum, l_tmp, sizeof(serpent_block_t));
+
+      abuf += sizeof(serpent_block_t);
+    }
+
+  c->u_mode.ocb.aad_nblocks = blkn;
+
+  wipememory(&l_tmp, sizeof(l_tmp));
+
+  if (burn_stack_depth)
+    _gcry_burn_stack (burn_stack_depth + 4 * sizeof(void *));
+}
+
 

 /* Run the self-tests for SERPENT-CTR-128, tests IV increment of bulk CTR
diff --git a/src/cipher.h b/src/cipher.h
index 1a66f6d..d16746a 100644
--- a/src/cipher.h
+++ b/src/cipher.h
@@ -206,6 +206,11 @@ void _gcry_serpent_cbc_dec (void *context, unsigned char *iv,
 void _gcry_serpent_cfb_dec (void *context, unsigned char *iv,
                             void *outbuf_arg, const void *inbuf_arg,
                             size_t nblocks);
+void _gcry_serpent_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
+			      const void *inbuf_arg, size_t nblocks,
+			      int encrypt);
+void _gcry_serpent_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
+			     size_t nblocks);
 
 /*-- twofish.c --*/
 void _gcry_twofish_ctr_enc (void *context, unsigned char *ctr,
diff --git a/tests/basic.c b/tests/basic.c
index 124df55..3ad05a4 100644
--- a/tests/basic.c
+++ b/tests/basic.c
@@ -3350,6 +3350,15 @@ check_ocb_cipher (void)
   check_ocb_cipher_largebuf(GCRY_CIPHER_TWOFISH, 32,
 			    "\xf6\xd4\xfe\x4e\x50\x85\x13\x59"
 			    "\x69\x0e\x4c\x67\x3e\xdd\x47\x90");
+  check_ocb_cipher_largebuf(GCRY_CIPHER_SERPENT128, 16,
+			    "\x3c\xfb\x66\x14\x3c\xc8\x6c\x67"
+			    "\x26\xb8\x23\xeb\xaf\x43\x98\x69");
+  check_ocb_cipher_largebuf(GCRY_CIPHER_SERPENT192, 24,
+			    "\x5e\x62\x27\xc5\x32\xc3\x1d\xe6"
+			    "\x2e\x65\xe7\xd6\xfb\x05\xd7\xb2");
+  check_ocb_cipher_largebuf(GCRY_CIPHER_SERPENT256, 32,
+			    "\xe7\x8b\xe6\xd4\x2f\x7a\x36\x4c"
+			    "\xba\xee\x20\xe2\x68\xf4\xcb\xcc");
 }
 
 
From jussi.kivilinna at iki.fi  Mon Jul 27 11:04:40 2015
From: jussi.kivilinna at iki.fi (Jussi Kivilinna)
Date: Mon, 27 Jul 2015 12:04:40 +0300
Subject: [PATCH 6/6] Reduce code size for Twofish key-setup and remove key
 dependent branch
In-Reply-To: <20150727090415.18742.12674.stgit@localhost6.localdomain6>
References: <20150727090415.18742.12674.stgit@localhost6.localdomain6>
Message-ID: <20150727090440.18742.74531.stgit@localhost6.localdomain6>

* cipher/twofish.c (poly_to_exp): Increase size by one, change type
from byte to u16 and insert '492' to index 0.
(exp_to_poly): Increase size by 256, let new cells have zero value.
(CALC_S): Execute unconditionally with help of modified tables.
(do_twofish_setkey): Change type for 'tmp' to 'unsigned int'; Un-unroll
CALC_K256 and CALC_K phases to reduce generated object size.
--

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 cipher/twofish.c |   76 ++++++++++++++++++------------------------------------
 1 file changed, 26 insertions(+), 50 deletions(-)

diff --git a/cipher/twofish.c b/cipher/twofish.c
index 3ee2be5..11e60a7 100644
--- a/cipher/twofish.c
+++ b/cipher/twofish.c
@@ -356,7 +356,8 @@ static const u32 mds[4][256] = {
  * see a non-horrible way of avoiding them, and I did manage to group the
  * statements so that each if covers four group multiplications. */
 
-static const byte poly_to_exp[255] = {
+static const u16 poly_to_exp[256] = {
+   492,
    0x00, 0x01, 0x17, 0x02, 0x2E, 0x18, 0x53, 0x03, 0x6A, 0x2F, 0x93, 0x19,
    0x34, 0x54, 0x45, 0x04, 0x5C, 0x6B, 0xB6, 0x30, 0xA6, 0x94, 0x4B, 0x1A,
    0x8C, 0x35, 0x81, 0x55, 0xAA, 0x46, 0x0D, 0x05, 0x24, 0x5D, 0x87, 0x6C,
@@ -381,7 +382,7 @@ static const byte poly_to_exp[255] = {
    0x85, 0xC8, 0xA1
 };
 
-static const byte exp_to_poly[492] = {
+static const byte exp_to_poly[492 + 256] = {
    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x4D, 0x9A, 0x79, 0xF2,
    0xA9, 0x1F, 0x3E, 0x7C, 0xF8, 0xBD, 0x37, 0x6E, 0xDC, 0xF5, 0xA7, 0x03,
    0x06, 0x0C, 0x18, 0x30, 0x60, 0xC0, 0xCD, 0xD7, 0xE3, 0x8B, 0x5B, 0xB6,
@@ -422,7 +423,7 @@ static const byte exp_to_poly[492] = {
    0x3F, 0x7E, 0xFC, 0xB5, 0x27, 0x4E, 0x9C, 0x75, 0xEA, 0x99, 0x7F, 0xFE,
    0xB1, 0x2F, 0x5E, 0xBC, 0x35, 0x6A, 0xD4, 0xE5, 0x87, 0x43, 0x86, 0x41,
    0x82, 0x49, 0x92, 0x69, 0xD2, 0xE9, 0x9F, 0x73, 0xE6, 0x81, 0x4F, 0x9E,
-   0x71, 0xE2, 0x89, 0x5F, 0xBE, 0x31, 0x62, 0xC4, 0xC5, 0xC7, 0xC3, 0xCB
+   0x71, 0xE2, 0x89, 0x5F, 0xBE, 0x31, 0x62, 0xC4, 0xC5, 0xC7, 0xC3, 0xCB,
 };
 

@@ -494,14 +495,15 @@ static byte calc_sb_tbl[512] = {
     0x6F, 0x16, 0x9D, 0x25, 0x36, 0x86, 0x42, 0x56,
     0x4A, 0x55, 0x5E, 0x09, 0xC1, 0xBE, 0xE0, 0x91
 };
+
 /* Macro to perform one column of the RS matrix multiplication.  The
  * parameters a, b, c, and d are the four bytes of output; i is the index
  * of the key bytes, and w, x, y, and z, are the column of constants from
  * the RS matrix, preprocessed through the poly_to_exp table. */
 
 #define CALC_S(a, b, c, d, i, w, x, y, z) \
-   if (key[i]) { \
-      tmp = poly_to_exp[key[i] - 1]; \
+   { \
+      tmp = poly_to_exp[key[i]]; \
       (a) ^= exp_to_poly[tmp + (w)]; \
       (b) ^= exp_to_poly[tmp + (x)]; \
       (c) ^= exp_to_poly[tmp + (y)]; \
@@ -600,7 +602,7 @@ do_twofish_setkey (TWOFISH_context *ctx, const byte *key, const unsigned keylen)
   byte si = 0, sj = 0, sk = 0, sl = 0, sm = 0, sn = 0, so = 0, sp = 0;
 
   /* Temporary for CALC_S. */
-  byte tmp;
+  unsigned int tmp;
 
   /* Flags for self-test. */
   static int initialized = 0;
@@ -668,28 +670,15 @@ do_twofish_setkey (TWOFISH_context *ctx, const byte *key, const unsigned keylen)
           CALC_SB256_2( i, calc_sb_tbl[j], calc_sb_tbl[k] );
 	}
 
-      /* Calculate whitening and round subkeys.  The constants are
-       * indices of subkeys, preprocessed through q0 and q1. */
-      CALC_K256 (w, 0, 0xA9, 0x75, 0x67, 0xF3);
-      CALC_K256 (w, 2, 0xB3, 0xC6, 0xE8, 0xF4);
-      CALC_K256 (w, 4, 0x04, 0xDB, 0xFD, 0x7B);
-      CALC_K256 (w, 6, 0xA3, 0xFB, 0x76, 0xC8);
-      CALC_K256 (k, 0, 0x9A, 0x4A, 0x92, 0xD3);
-      CALC_K256 (k, 2, 0x80, 0xE6, 0x78, 0x6B);
-      CALC_K256 (k, 4, 0xE4, 0x45, 0xDD, 0x7D);
-      CALC_K256 (k, 6, 0xD1, 0xE8, 0x38, 0x4B);
-      CALC_K256 (k, 8, 0x0D, 0xD6, 0xC6, 0x32);
-      CALC_K256 (k, 10, 0x35, 0xD8, 0x98, 0xFD);
-      CALC_K256 (k, 12, 0x18, 0x37, 0xF7, 0x71);
-      CALC_K256 (k, 14, 0xEC, 0xF1, 0x6C, 0xE1);
-      CALC_K256 (k, 16, 0x43, 0x30, 0x75, 0x0F);
-      CALC_K256 (k, 18, 0x37, 0xF8, 0x26, 0x1B);
-      CALC_K256 (k, 20, 0xFA, 0x87, 0x13, 0xFA);
-      CALC_K256 (k, 22, 0x94, 0x06, 0x48, 0x3F);
-      CALC_K256 (k, 24, 0xF2, 0x5E, 0xD0, 0xBA);
-      CALC_K256 (k, 26, 0x8B, 0xAE, 0x30, 0x5B);
-      CALC_K256 (k, 28, 0x84, 0x8A, 0x54, 0x00);
-      CALC_K256 (k, 30, 0xDF, 0xBC, 0x23, 0x9D);
+      /* Calculate whitening and round subkeys. */
+      for (i = 0; i < 8; i += 2)
+	{
+	  CALC_K256 ( w, i, q0[i], q1[i], q0[i + 1], q1[i + 1] );
+	}
+      for (j = 0; j < 32; j += 2, i += 2)
+	{
+	  CALC_K256 ( k, j, q0[i], q1[i], q0[i + 1], q1[i + 1] );
+	}
     }
   else
     {
@@ -699,28 +688,15 @@ do_twofish_setkey (TWOFISH_context *ctx, const byte *key, const unsigned keylen)
           CALC_SB_2( i, calc_sb_tbl[j], calc_sb_tbl[k] );
         }
 
-      /* Calculate whitening and round subkeys.  The constants are
-       * indices of subkeys, preprocessed through q0 and q1. */
-      CALC_K (w, 0, 0xA9, 0x75, 0x67, 0xF3);
-      CALC_K (w, 2, 0xB3, 0xC6, 0xE8, 0xF4);
-      CALC_K (w, 4, 0x04, 0xDB, 0xFD, 0x7B);
-      CALC_K (w, 6, 0xA3, 0xFB, 0x76, 0xC8);
-      CALC_K (k, 0, 0x9A, 0x4A, 0x92, 0xD3);
-      CALC_K (k, 2, 0x80, 0xE6, 0x78, 0x6B);
-      CALC_K (k, 4, 0xE4, 0x45, 0xDD, 0x7D);
-      CALC_K (k, 6, 0xD1, 0xE8, 0x38, 0x4B);
-      CALC_K (k, 8, 0x0D, 0xD6, 0xC6, 0x32);
-      CALC_K (k, 10, 0x35, 0xD8, 0x98, 0xFD);
-      CALC_K (k, 12, 0x18, 0x37, 0xF7, 0x71);
-      CALC_K (k, 14, 0xEC, 0xF1, 0x6C, 0xE1);
-      CALC_K (k, 16, 0x43, 0x30, 0x75, 0x0F);
-      CALC_K (k, 18, 0x37, 0xF8, 0x26, 0x1B);
-      CALC_K (k, 20, 0xFA, 0x87, 0x13, 0xFA);
-      CALC_K (k, 22, 0x94, 0x06, 0x48, 0x3F);
-      CALC_K (k, 24, 0xF2, 0x5E, 0xD0, 0xBA);
-      CALC_K (k, 26, 0x8B, 0xAE, 0x30, 0x5B);
-      CALC_K (k, 28, 0x84, 0x8A, 0x54, 0x00);
-      CALC_K (k, 30, 0xDF, 0xBC, 0x23, 0x9D);
+      /* Calculate whitening and round subkeys. */
+      for (i = 0; i < 8; i += 2)
+	{
+	  CALC_K ( w, i, q0[i], q1[i], q0[i + 1], q1[i + 1] );
+	}
+      for (j = 0; j < 32; j += 2, i += 2)
+	{
+	  CALC_K ( k, j, q0[i], q1[i], q0[i + 1], q1[i + 1] );
+	}
     }
 
   return 0;


From wk at gnupg.org  Mon Jul 27 12:19:54 2015
From: wk at gnupg.org (Werner Koch)
Date: Mon, 27 Jul 2015 12:19:54 +0200
Subject: [git] GCRYPT - branch, master,
 updated. libgcrypt-1.6.0-237-g46c0726
In-Reply-To: <55B5E8AD.3010102@iki.fi> (Jussi Kivilinna's message of "Mon, 27
 Jul 2015 11:15:41 +0300")
References: <E1ZJMMz-0005ZZ-VH@lists.gnupg.org>
 <87zj2i887e.fsf@vigenere.g10code.de> <55B5E8AD.3010102@iki.fi>
Message-ID: <87a8uh9a2d.fsf@vigenere.g10code.de>

On Mon, 27 Jul 2015 10:15, jussi.kivilinna at iki.fi said:

>>>     * cipher/cipher-gcm.c: Do not copy zero bytes from an empty buffer. Let

> Regular buf_cpy can handle the srcbuf==NULL && len==0 case (undefined for
> memcpy) without problem.

Okay, I missed that it was about !SRCBUF.  I falsely assumed "empty"
denotes a zero length string.


Salam-Shalom,

   Werner

-- 
Die Gedanken sind frei.  Ausnahmen regelt ein Bundesgesetz.


From wk at gnupg.org  Mon Jul 27 12:22:23 2015
From: wk at gnupg.org (Werner Koch)
Date: Mon, 27 Jul 2015 12:22:23 +0200
Subject: [PATCH 5/6] Add LSan annotation to ignore a memory leak
In-Reply-To: <20150722194041.GC8113@al> (Peter Wu's message of "Wed, 22 Jul
 2015 21:40:41 +0200")
References: <1436454696-20362-1-git-send-email-peter@lekensteyn.nl>
 <1436454696-20362-6-git-send-email-peter@lekensteyn.nl>
 <87fv4ge3yq.fsf@vigenere.g10code.de> <20150722194041.GC8113@al>
Message-ID: <87615599y8.fsf@vigenere.g10code.de>

On Wed, 22 Jul 2015 21:40, peter at lekensteyn.nl said:

> I will look at adding this to libgpg-error. Is src/gpg-error.h.in the
> appropriate file for this? Can the patch be submitted to this list?

Meanwhile I moved the GCC attribute stuff to gpg-error.h.  Feel free to
send a patch to implement your annotation (e.g. using GPGRT_ANNOTATE_*
or gpgrt_annotate_*).  No extra DCO required.


Shalom-Salam,

   Werner


-- 
Die Gedanken sind frei.  Ausnahmen regelt ein Bundesgesetz.


From jussi.kivilinna at iki.fi  Mon Jul 27 11:04:20 2015
From: jussi.kivilinna at iki.fi (Jussi Kivilinna)
Date: Mon, 27 Jul 2015 12:04:20 +0300
Subject: [PATCH 2/6] Add bulk OCB for Camellia AES-NI/AVX and AES-NI/AVX2
 implementations
In-Reply-To: <20150727090415.18742.12674.stgit@localhost6.localdomain6>
References: <20150727090415.18742.12674.stgit@localhost6.localdomain6>
Message-ID: <20150727090420.18742.23093.stgit@localhost6.localdomain6>

* cipher/camellia-aesni-avx-amd64.S: Add OCB assembly functions.
* cipher/camellia-aesni-avx2-amd64.S: Add OCB assembly functions.
* cipher/camellia-glue.c (_gcry_camellia_aesni_avx_ocb_enc)
(_gcry_camellia_aesni_avx_ocb_dec, _gcry_camellia_aesni_avx_ocb_auth)
(_gcry_camellia_aesni_avx2_ocb_enc, _gcry_camellia_aesni_avx2_ocb_dec)
(_gcry_camellia_aesni_avx2_ocb_auth): New prototypes.
(get_l, _gcry_camellia_ocb_crypt, _gcry_camellia_ocb_auth): New.
* cipher/cipher.c (_gcry_cipher_open_internal): Setup OCB bulk
functions for Camellia.
* src/cipher.h (_gcry_camellia_ocb_crypt)
(_gcry_camellia_ocb_auth): New.
* tests/basic.c (check_ocb_cipher): Add test-vector for Camellia.
--

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 cipher/camellia-aesni-avx-amd64.S  |  424 ++++++++++++++++++++++++++++++
 cipher/camellia-aesni-avx2-amd64.S |  503 ++++++++++++++++++++++++++++++++++++
 cipher/camellia-glue.c             |  329 ++++++++++++++++++++++++
 cipher/cipher.c                    |    2 
 src/cipher.h                       |    5 
 tests/basic.c                      |    9 +
 6 files changed, 1266 insertions(+), 6 deletions(-)

diff --git a/cipher/camellia-aesni-avx-amd64.S b/cipher/camellia-aesni-avx-amd64.S
index c047a21..5a3a3cb 100644
--- a/cipher/camellia-aesni-avx-amd64.S
+++ b/cipher/camellia-aesni-avx-amd64.S
@@ -1,6 +1,6 @@
 /* camellia-avx-aesni-amd64.S  -  AES-NI/AVX implementation of Camellia cipher
  *
- * Copyright (C) 2013 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ * Copyright (C) 2013-2015 Jussi Kivilinna <jussi.kivilinna at iki.fi>
  *
  * This file is part of Libgcrypt.
  *
@@ -1211,6 +1211,428 @@ _gcry_camellia_aesni_avx_cfb_dec:
 	ret;
 ELF(.size _gcry_camellia_aesni_avx_cfb_dec,.-_gcry_camellia_aesni_avx_cfb_dec;)
 
+.align 8
+.globl _gcry_camellia_aesni_avx_ocb_enc
+ELF(.type   _gcry_camellia_aesni_avx_ocb_enc,@function;)
+
+_gcry_camellia_aesni_avx_ocb_enc:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (16 blocks)
+	 *	%rdx: src (16 blocks)
+	 *	%rcx: offset
+	 *	%r8 : checksum
+	 *	%r9 : L pointers (void *L[16])
+	 */
+
+	pushq %rbp;
+	movq %rsp, %rbp;
+
+	vzeroupper;
+
+	subq $(16 * 16 + 4 * 8), %rsp;
+	andq $~31, %rsp;
+	movq %rsp, %rax;
+
+	movq %r10, (16 * 16 + 0 * 8)(%rax);
+	movq %r11, (16 * 16 + 1 * 8)(%rax);
+	movq %r12, (16 * 16 + 2 * 8)(%rax);
+	movq %r13, (16 * 16 + 3 * 8)(%rax);
+
+	vmovdqu (%rcx), %xmm14;
+	vmovdqu (%r8), %xmm15;
+
+	/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	/* Checksum_i = Checksum_{i-1} xor P_i  */
+	/* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
+
+#define OCB_INPUT(n, lreg, xreg) \
+	  vmovdqu (n * 16)(%rdx), xreg; \
+	  vpxor (lreg), %xmm14, %xmm14; \
+	  vpxor xreg, %xmm15, %xmm15; \
+	  vpxor xreg, %xmm14, xreg; \
+	  vmovdqu %xmm14, (n * 16)(%rsi);
+	movq (0 * 8)(%r9), %r10;
+	movq (1 * 8)(%r9), %r11;
+	movq (2 * 8)(%r9), %r12;
+	movq (3 * 8)(%r9), %r13;
+	OCB_INPUT(0, %r10, %xmm0);
+	vmovdqu %xmm0, (15 * 16)(%rax);
+	OCB_INPUT(1, %r11, %xmm0);
+	vmovdqu %xmm0, (14 * 16)(%rax);
+	OCB_INPUT(2, %r12, %xmm13);
+	OCB_INPUT(3, %r13, %xmm12);
+	movq (4 * 8)(%r9), %r10;
+	movq (5 * 8)(%r9), %r11;
+	movq (6 * 8)(%r9), %r12;
+	movq (7 * 8)(%r9), %r13;
+	OCB_INPUT(4, %r10, %xmm11);
+	OCB_INPUT(5, %r11, %xmm10);
+	OCB_INPUT(6, %r12, %xmm9);
+	OCB_INPUT(7, %r13, %xmm8);
+	movq (8 * 8)(%r9), %r10;
+	movq (9 * 8)(%r9), %r11;
+	movq (10 * 8)(%r9), %r12;
+	movq (11 * 8)(%r9), %r13;
+	OCB_INPUT(8, %r10, %xmm7);
+	OCB_INPUT(9, %r11, %xmm6);
+	OCB_INPUT(10, %r12, %xmm5);
+	OCB_INPUT(11, %r13, %xmm4);
+	movq (12 * 8)(%r9), %r10;
+	movq (13 * 8)(%r9), %r11;
+	movq (14 * 8)(%r9), %r12;
+	movq (15 * 8)(%r9), %r13;
+	OCB_INPUT(12, %r10, %xmm3);
+	OCB_INPUT(13, %r11, %xmm2);
+	OCB_INPUT(14, %r12, %xmm1);
+	OCB_INPUT(15, %r13, %xmm0);
+#undef OCB_INPUT
+
+	vmovdqu %xmm14, (%rcx);
+	vmovdqu %xmm15, (%r8);
+
+	/* inpack16_pre: */
+	vmovq (key_table)(CTX), %xmm15;
+	vpshufb .Lpack_bswap RIP, %xmm15, %xmm15;
+	vpxor %xmm0, %xmm15, %xmm0;
+	vpxor %xmm1, %xmm15, %xmm1;
+	vpxor %xmm2, %xmm15, %xmm2;
+	vpxor %xmm3, %xmm15, %xmm3;
+	vpxor %xmm4, %xmm15, %xmm4;
+	vpxor %xmm5, %xmm15, %xmm5;
+	vpxor %xmm6, %xmm15, %xmm6;
+	vpxor %xmm7, %xmm15, %xmm7;
+	vpxor %xmm8, %xmm15, %xmm8;
+	vpxor %xmm9, %xmm15, %xmm9;
+	vpxor %xmm10, %xmm15, %xmm10;
+	vpxor %xmm11, %xmm15, %xmm11;
+	vpxor %xmm12, %xmm15, %xmm12;
+	vpxor %xmm13, %xmm15, %xmm13;
+	vpxor 14 * 16(%rax), %xmm15, %xmm14;
+	vpxor 15 * 16(%rax), %xmm15, %xmm15;
+
+	call __camellia_enc_blk16;
+
+	vpxor 0 * 16(%rsi), %xmm7, %xmm7;
+	vpxor 1 * 16(%rsi), %xmm6, %xmm6;
+	vpxor 2 * 16(%rsi), %xmm5, %xmm5;
+	vpxor 3 * 16(%rsi), %xmm4, %xmm4;
+	vpxor 4 * 16(%rsi), %xmm3, %xmm3;
+	vpxor 5 * 16(%rsi), %xmm2, %xmm2;
+	vpxor 6 * 16(%rsi), %xmm1, %xmm1;
+	vpxor 7 * 16(%rsi), %xmm0, %xmm0;
+	vpxor 8 * 16(%rsi), %xmm15, %xmm15;
+	vpxor 9 * 16(%rsi), %xmm14, %xmm14;
+	vpxor 10 * 16(%rsi), %xmm13, %xmm13;
+	vpxor 11 * 16(%rsi), %xmm12, %xmm12;
+	vpxor 12 * 16(%rsi), %xmm11, %xmm11;
+	vpxor 13 * 16(%rsi), %xmm10, %xmm10;
+	vpxor 14 * 16(%rsi), %xmm9, %xmm9;
+	vpxor 15 * 16(%rsi), %xmm8, %xmm8;
+
+	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
+		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
+		     %xmm8, %rsi);
+
+	vzeroall;
+
+	movq (16 * 16 + 0 * 8)(%rax), %r10;
+	movq (16 * 16 + 1 * 8)(%rax), %r11;
+	movq (16 * 16 + 2 * 8)(%rax), %r12;
+	movq (16 * 16 + 3 * 8)(%rax), %r13;
+
+	leave;
+	ret;
+ELF(.size _gcry_camellia_aesni_avx_ocb_enc,.-_gcry_camellia_aesni_avx_ocb_enc;)
+
+.align 8
+.globl _gcry_camellia_aesni_avx_ocb_dec
+ELF(.type   _gcry_camellia_aesni_avx_ocb_dec,@function;)
+
+_gcry_camellia_aesni_avx_ocb_dec:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (16 blocks)
+	 *	%rdx: src (16 blocks)
+	 *	%rcx: offset
+	 *	%r8 : checksum
+	 *	%r9 : L pointers (void *L[16])
+	 */
+
+	pushq %rbp;
+	movq %rsp, %rbp;
+
+	vzeroupper;
+
+	subq $(16 * 16 + 4 * 8), %rsp;
+	andq $~31, %rsp;
+	movq %rsp, %rax;
+
+	movq %r10, (16 * 16 + 0 * 8)(%rax);
+	movq %r11, (16 * 16 + 1 * 8)(%rax);
+	movq %r12, (16 * 16 + 2 * 8)(%rax);
+	movq %r13, (16 * 16 + 3 * 8)(%rax);
+
+	vmovdqu (%rcx), %xmm15;
+
+	/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	/* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i)  */
+
+#define OCB_INPUT(n, lreg, xreg) \
+	  vmovdqu (n * 16)(%rdx), xreg; \
+	  vpxor (lreg), %xmm15, %xmm15; \
+	  vpxor xreg, %xmm15, xreg; \
+	  vmovdqu %xmm15, (n * 16)(%rsi);
+	movq (0 * 8)(%r9), %r10;
+	movq (1 * 8)(%r9), %r11;
+	movq (2 * 8)(%r9), %r12;
+	movq (3 * 8)(%r9), %r13;
+	OCB_INPUT(0, %r10, %xmm0);
+	vmovdqu %xmm0, (15 * 16)(%rax);
+	OCB_INPUT(1, %r11, %xmm14);
+	OCB_INPUT(2, %r12, %xmm13);
+	OCB_INPUT(3, %r13, %xmm12);
+	movq (4 * 8)(%r9), %r10;
+	movq (5 * 8)(%r9), %r11;
+	movq (6 * 8)(%r9), %r12;
+	movq (7 * 8)(%r9), %r13;
+	OCB_INPUT(4, %r10, %xmm11);
+	OCB_INPUT(5, %r11, %xmm10);
+	OCB_INPUT(6, %r12, %xmm9);
+	OCB_INPUT(7, %r13, %xmm8);
+	movq (8 * 8)(%r9), %r10;
+	movq (9 * 8)(%r9), %r11;
+	movq (10 * 8)(%r9), %r12;
+	movq (11 * 8)(%r9), %r13;
+	OCB_INPUT(8, %r10, %xmm7);
+	OCB_INPUT(9, %r11, %xmm6);
+	OCB_INPUT(10, %r12, %xmm5);
+	OCB_INPUT(11, %r13, %xmm4);
+	movq (12 * 8)(%r9), %r10;
+	movq (13 * 8)(%r9), %r11;
+	movq (14 * 8)(%r9), %r12;
+	movq (15 * 8)(%r9), %r13;
+	OCB_INPUT(12, %r10, %xmm3);
+	OCB_INPUT(13, %r11, %xmm2);
+	OCB_INPUT(14, %r12, %xmm1);
+	OCB_INPUT(15, %r13, %xmm0);
+#undef OCB_INPUT
+
+	vmovdqu %xmm15, (%rcx);
+
+	movq %r8, %r10;
+
+	cmpl $128, key_bitlength(CTX);
+	movl $32, %r8d;
+	movl $24, %r9d;
+	cmovel %r9d, %r8d; /* max */
+
+	/* inpack16_pre: */
+	vmovq (key_table)(CTX, %r8, 8), %xmm15;
+	vpshufb .Lpack_bswap RIP, %xmm15, %xmm15;
+	vpxor %xmm0, %xmm15, %xmm0;
+	vpxor %xmm1, %xmm15, %xmm1;
+	vpxor %xmm2, %xmm15, %xmm2;
+	vpxor %xmm3, %xmm15, %xmm3;
+	vpxor %xmm4, %xmm15, %xmm4;
+	vpxor %xmm5, %xmm15, %xmm5;
+	vpxor %xmm6, %xmm15, %xmm6;
+	vpxor %xmm7, %xmm15, %xmm7;
+	vpxor %xmm8, %xmm15, %xmm8;
+	vpxor %xmm9, %xmm15, %xmm9;
+	vpxor %xmm10, %xmm15, %xmm10;
+	vpxor %xmm11, %xmm15, %xmm11;
+	vpxor %xmm12, %xmm15, %xmm12;
+	vpxor %xmm13, %xmm15, %xmm13;
+	vpxor %xmm14, %xmm15, %xmm14;
+	vpxor 15 * 16(%rax), %xmm15, %xmm15;
+
+	call __camellia_dec_blk16;
+
+	vpxor 0 * 16(%rsi), %xmm7, %xmm7;
+	vpxor 1 * 16(%rsi), %xmm6, %xmm6;
+	vpxor 2 * 16(%rsi), %xmm5, %xmm5;
+	vpxor 3 * 16(%rsi), %xmm4, %xmm4;
+	vpxor 4 * 16(%rsi), %xmm3, %xmm3;
+	vpxor 5 * 16(%rsi), %xmm2, %xmm2;
+	vpxor 6 * 16(%rsi), %xmm1, %xmm1;
+	vpxor 7 * 16(%rsi), %xmm0, %xmm0;
+	vmovdqu %xmm7, (7 * 16)(%rax);
+	vpxor 8 * 16(%rsi), %xmm15, %xmm15;
+	vpxor 9 * 16(%rsi), %xmm14, %xmm14;
+	vpxor 10 * 16(%rsi), %xmm13, %xmm13;
+	vpxor 11 * 16(%rsi), %xmm12, %xmm12;
+	vpxor 12 * 16(%rsi), %xmm11, %xmm11;
+	vpxor 13 * 16(%rsi), %xmm10, %xmm10;
+	vpxor 14 * 16(%rsi), %xmm9, %xmm9;
+	vpxor 15 * 16(%rsi), %xmm8, %xmm8;
+
+	/* Checksum_i = Checksum_{i-1} xor P_i  */
+
+	vpxor (%r10), %xmm7, %xmm7;
+	vpxor %xmm6, %xmm7, %xmm7;
+	vpxor %xmm5, %xmm7, %xmm7;
+	vpxor %xmm4, %xmm7, %xmm7;
+	vpxor %xmm3, %xmm7, %xmm7;
+	vpxor %xmm2, %xmm7, %xmm7;
+	vpxor %xmm1, %xmm7, %xmm7;
+	vpxor %xmm0, %xmm7, %xmm7;
+	vpxor %xmm15, %xmm7, %xmm7;
+	vpxor %xmm14, %xmm7, %xmm7;
+	vpxor %xmm13, %xmm7, %xmm7;
+	vpxor %xmm12, %xmm7, %xmm7;
+	vpxor %xmm11, %xmm7, %xmm7;
+	vpxor %xmm10, %xmm7, %xmm7;
+	vpxor %xmm9, %xmm7, %xmm7;
+	vpxor %xmm8, %xmm7, %xmm7;
+	vmovdqu %xmm7, (%r10);
+	vmovdqu (7 * 16)(%rax), %xmm7;
+
+	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
+		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
+		     %xmm8, %rsi);
+
+	vzeroall;
+
+	movq (16 * 16 + 0 * 8)(%rax), %r10;
+	movq (16 * 16 + 1 * 8)(%rax), %r11;
+	movq (16 * 16 + 2 * 8)(%rax), %r12;
+	movq (16 * 16 + 3 * 8)(%rax), %r13;
+
+	leave;
+	ret;
+ELF(.size _gcry_camellia_aesni_avx_ocb_dec,.-_gcry_camellia_aesni_avx_ocb_dec;)
+
+.align 8
+.globl _gcry_camellia_aesni_avx_ocb_auth
+ELF(.type   _gcry_camellia_aesni_avx_ocb_auth,@function;)
+
+_gcry_camellia_aesni_avx_ocb_auth:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: abuf (16 blocks, only read)
+	 *	%rdx: offset (16 bytes, updated in place)
+	 *	%rcx: checksum (16 bytes, updated in place)
+	 *	%r8 : L pointers (void *L[16])
+	 */
+
+	pushq %rbp;
+	movq %rsp, %rbp;
+
+	vzeroupper;
+
+	subq $(16 * 16 + 4 * 8), %rsp; /* 16 block slots + 4 GPR save slots */
+	andq $~31, %rsp;
+	movq %rsp, %rax; /* %rax: base of the scratch area */
+
+	movq %r10, (16 * 16 + 0 * 8)(%rax); /* %r10-%r13 are used for L pointers below */
+	movq %r11, (16 * 16 + 1 * 8)(%rax);
+	movq %r12, (16 * 16 + 2 * 8)(%rax);
+	movq %r13, (16 * 16 + 3 * 8)(%rax);
+
+	vmovdqu (%rdx), %xmm15; /* %xmm15: running Offset */
+
+	/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	/* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i)  */
+
+#define OCB_INPUT(n, lreg, xreg) \
+	  vmovdqu (n * 16)(%rsi), xreg; \
+	  vpxor (lreg), %xmm15, %xmm15; \
+	  vpxor xreg, %xmm15, xreg;
+
+	movq (0 * 8)(%r8), %r10;
+	movq (1 * 8)(%r8), %r11;
+	movq (2 * 8)(%r8), %r12;
+	movq (3 * 8)(%r8), %r13;
+	OCB_INPUT(0, %r10, %xmm0);
+	vmovdqu %xmm0, (15 * 16)(%rax); /* park block 0; %xmm0 is reused for block 15 */
+	OCB_INPUT(1, %r11, %xmm14);
+	OCB_INPUT(2, %r12, %xmm13);
+	OCB_INPUT(3, %r13, %xmm12);
+	movq (4 * 8)(%r8), %r10;
+	movq (5 * 8)(%r8), %r11;
+	movq (6 * 8)(%r8), %r12;
+	movq (7 * 8)(%r8), %r13;
+	OCB_INPUT(4, %r10, %xmm11);
+	OCB_INPUT(5, %r11, %xmm10);
+	OCB_INPUT(6, %r12, %xmm9);
+	OCB_INPUT(7, %r13, %xmm8);
+	movq (8 * 8)(%r8), %r10;
+	movq (9 * 8)(%r8), %r11;
+	movq (10 * 8)(%r8), %r12;
+	movq (11 * 8)(%r8), %r13;
+	OCB_INPUT(8, %r10, %xmm7);
+	OCB_INPUT(9, %r11, %xmm6);
+	OCB_INPUT(10, %r12, %xmm5);
+	OCB_INPUT(11, %r13, %xmm4);
+	movq (12 * 8)(%r8), %r10;
+	movq (13 * 8)(%r8), %r11;
+	movq (14 * 8)(%r8), %r12;
+	movq (15 * 8)(%r8), %r13;
+	OCB_INPUT(12, %r10, %xmm3);
+	OCB_INPUT(13, %r11, %xmm2);
+	OCB_INPUT(14, %r12, %xmm1);
+	OCB_INPUT(15, %r13, %xmm0);
+#undef OCB_INPUT
+
+	vmovdqu %xmm15, (%rdx); /* store the updated Offset */
+
+	movq %rcx, %r10; /* %r10: checksum pointer, used after the call */
+
+	/* inpack16_pre: */
+	vmovq (key_table)(CTX), %xmm15;
+	vpshufb .Lpack_bswap RIP, %xmm15, %xmm15;
+	vpxor %xmm0, %xmm15, %xmm0;
+	vpxor %xmm1, %xmm15, %xmm1;
+	vpxor %xmm2, %xmm15, %xmm2;
+	vpxor %xmm3, %xmm15, %xmm3;
+	vpxor %xmm4, %xmm15, %xmm4;
+	vpxor %xmm5, %xmm15, %xmm5;
+	vpxor %xmm6, %xmm15, %xmm6;
+	vpxor %xmm7, %xmm15, %xmm7;
+	vpxor %xmm8, %xmm15, %xmm8;
+	vpxor %xmm9, %xmm15, %xmm9;
+	vpxor %xmm10, %xmm15, %xmm10;
+	vpxor %xmm11, %xmm15, %xmm11;
+	vpxor %xmm12, %xmm15, %xmm12;
+	vpxor %xmm13, %xmm15, %xmm13;
+	vpxor %xmm14, %xmm15, %xmm14;
+	vpxor 15 * 16(%rax), %xmm15, %xmm15; /* block 0, parked on the stack above */
+
+	call __camellia_enc_blk16;
+
+	vpxor %xmm7, %xmm6, %xmm6; /* xor-fold all 16 output blocks into %xmm0 */
+	vpxor %xmm5, %xmm4, %xmm4;
+	vpxor %xmm3, %xmm2, %xmm2;
+	vpxor %xmm1, %xmm0, %xmm0;
+	vpxor %xmm15, %xmm14, %xmm14;
+	vpxor %xmm13, %xmm12, %xmm12;
+	vpxor %xmm11, %xmm10, %xmm10;
+	vpxor %xmm9, %xmm8, %xmm8;
+
+	vpxor %xmm6, %xmm4, %xmm4;
+	vpxor %xmm2, %xmm0, %xmm0;
+	vpxor %xmm14, %xmm12, %xmm12;
+	vpxor %xmm10, %xmm8, %xmm8;
+
+	vpxor %xmm4, %xmm0, %xmm0;
+	vpxor %xmm12, %xmm8, %xmm8;
+
+	vpxor %xmm0, %xmm8, %xmm0;
+	vpxor (%r10), %xmm0, %xmm0; /* add the previous Sum */
+	vmovdqu %xmm0, (%r10); /* store the updated Sum */
+
+	vzeroall; /* zero every vector register before returning */
+
+	movq (16 * 16 + 0 * 8)(%rax), %r10;
+	movq (16 * 16 + 1 * 8)(%rax), %r11;
+	movq (16 * 16 + 2 * 8)(%rax), %r12;
+	movq (16 * 16 + 3 * 8)(%rax), %r13;
+
+	leave;
+	ret;
+ELF(.size _gcry_camellia_aesni_avx_ocb_auth,.-_gcry_camellia_aesni_avx_ocb_auth;)
+
 /*
  * IN:
  *  ab: 64-bit AB state
diff --git a/cipher/camellia-aesni-avx2-amd64.S b/cipher/camellia-aesni-avx2-amd64.S
index a3fa229..26381df 100644
--- a/cipher/camellia-aesni-avx2-amd64.S
+++ b/cipher/camellia-aesni-avx2-amd64.S
@@ -1,6 +1,6 @@
 /* camellia-avx2-aesni-amd64.S  -  AES-NI/AVX2 implementation of Camellia cipher
  *
- * Copyright (C) 2013 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ * Copyright (C) 2013-2015 Jussi Kivilinna <jussi.kivilinna at iki.fi>
  *
  * This file is part of Libgcrypt.
  *
@@ -1127,8 +1127,8 @@ ELF(.type   _gcry_camellia_aesni_avx2_cbc_dec,@function;)
 _gcry_camellia_aesni_avx2_cbc_dec:
 	/* input:
 	 *	%rdi: ctx, CTX
-	 *	%rsi: dst (16 blocks)
-	 *	%rdx: src (16 blocks)
+	 *	%rsi: dst (32 blocks)
+	 *	%rdx: src (32 blocks)
 	 *	%rcx: iv
 	 */
 
@@ -1199,8 +1199,8 @@ ELF(.type   _gcry_camellia_aesni_avx2_cfb_dec,@function;)
 _gcry_camellia_aesni_avx2_cfb_dec:
 	/* input:
 	 *	%rdi: ctx, CTX
-	 *	%rsi: dst (16 blocks)
-	 *	%rdx: src (16 blocks)
+	 *	%rsi: dst (32 blocks)
+	 *	%rdx: src (32 blocks)
 	 *	%rcx: iv
 	 */
 
@@ -1266,5 +1266,498 @@ _gcry_camellia_aesni_avx2_cfb_dec:
 	ret;
 ELF(.size _gcry_camellia_aesni_avx2_cfb_dec,.-_gcry_camellia_aesni_avx2_cfb_dec;)
 
+.align 8
+.globl _gcry_camellia_aesni_avx2_ocb_enc
+ELF(.type   _gcry_camellia_aesni_avx2_ocb_enc,@function;)
+
+_gcry_camellia_aesni_avx2_ocb_enc:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (32 blocks)
+	 *	%rdx: src (32 blocks)
+	 *	%rcx: offset (16 bytes, updated in place)
+	 *	%r8 : checksum (16 bytes, updated in place)
+	 *	%r9 : L pointers (void *L[32])
+	 */
+
+	pushq %rbp;
+	movq %rsp, %rbp;
+
+	vzeroupper;
+
+	subq $(16 * 32 + 4 * 8), %rsp; /* 16 ymm slots + 4 GPR save slots */
+	andq $~63, %rsp;
+	movq %rsp, %rax; /* %rax: base of the scratch area */
+
+	movq %r10, (16 * 32 + 0 * 8)(%rax); /* %r10-%r13 are used for L pointers below */
+	movq %r11, (16 * 32 + 1 * 8)(%rax);
+	movq %r12, (16 * 32 + 2 * 8)(%rax);
+	movq %r13, (16 * 32 + 3 * 8)(%rax);
+
+	vmovdqu (%rcx), %xmm14; /* %xmm14: running Offset */
+	vmovdqu (%r8), %xmm13; /* low lane of %ymm13: Checksum accumulator */
+
+	/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	/* Checksum_i = Checksum_{i-1} xor P_i  */
+	/* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
+
+#define OCB_INPUT(n, l0reg, l1reg, yreg) \
+	  vmovdqu (n * 32)(%rdx), yreg; \
+	  vpxor (l0reg), %xmm14, %xmm15; \
+	  vpxor (l1reg), %xmm15, %xmm14; \
+	  vinserti128 $1, %xmm14, %ymm15, %ymm15; \
+	  vpxor yreg, %ymm13, %ymm13; \
+	  vpxor yreg, %ymm15, yreg; \
+	  vmovdqu %ymm15, (n * 32)(%rsi);
+
+	movq (0 * 8)(%r9), %r10;
+	movq (1 * 8)(%r9), %r11;
+	movq (2 * 8)(%r9), %r12;
+	movq (3 * 8)(%r9), %r13;
+	OCB_INPUT(0, %r10, %r11, %ymm0); /* each OCB_INPUT handles two blocks and stashes their offsets in dst */
+	vmovdqu %ymm0, (15 * 32)(%rax); /* park on the stack: %ymm15 is the offset temporary */
+	OCB_INPUT(1, %r12, %r13, %ymm0);
+	vmovdqu %ymm0, (14 * 32)(%rax); /* park on the stack: %xmm14 is the running Offset */
+	movq (4 * 8)(%r9), %r10;
+	movq (5 * 8)(%r9), %r11;
+	movq (6 * 8)(%r9), %r12;
+	movq (7 * 8)(%r9), %r13;
+	OCB_INPUT(2, %r10, %r11, %ymm0);
+	vmovdqu %ymm0, (13 * 32)(%rax); /* park on the stack: %ymm13 is the checksum accumulator */
+	OCB_INPUT(3, %r12, %r13, %ymm12);
+	movq (8 * 8)(%r9), %r10;
+	movq (9 * 8)(%r9), %r11;
+	movq (10 * 8)(%r9), %r12;
+	movq (11 * 8)(%r9), %r13;
+	OCB_INPUT(4, %r10, %r11, %ymm11);
+	OCB_INPUT(5, %r12, %r13, %ymm10);
+	movq (12 * 8)(%r9), %r10;
+	movq (13 * 8)(%r9), %r11;
+	movq (14 * 8)(%r9), %r12;
+	movq (15 * 8)(%r9), %r13;
+	OCB_INPUT(6, %r10, %r11, %ymm9);
+	OCB_INPUT(7, %r12, %r13, %ymm8);
+	movq (16 * 8)(%r9), %r10;
+	movq (17 * 8)(%r9), %r11;
+	movq (18 * 8)(%r9), %r12;
+	movq (19 * 8)(%r9), %r13;
+	OCB_INPUT(8, %r10, %r11, %ymm7);
+	OCB_INPUT(9, %r12, %r13, %ymm6);
+	movq (20 * 8)(%r9), %r10;
+	movq (21 * 8)(%r9), %r11;
+	movq (22 * 8)(%r9), %r12;
+	movq (23 * 8)(%r9), %r13;
+	OCB_INPUT(10, %r10, %r11, %ymm5);
+	OCB_INPUT(11, %r12, %r13, %ymm4);
+	movq (24 * 8)(%r9), %r10;
+	movq (25 * 8)(%r9), %r11;
+	movq (26 * 8)(%r9), %r12;
+	movq (27 * 8)(%r9), %r13;
+	OCB_INPUT(12, %r10, %r11, %ymm3);
+	OCB_INPUT(13, %r12, %r13, %ymm2);
+	movq (28 * 8)(%r9), %r10;
+	movq (29 * 8)(%r9), %r11;
+	movq (30 * 8)(%r9), %r12;
+	movq (31 * 8)(%r9), %r13;
+	OCB_INPUT(14, %r10, %r11, %ymm1);
+	OCB_INPUT(15, %r12, %r13, %ymm0);
+#undef OCB_INPUT
+
+	vextracti128 $1, %ymm13, %xmm15; /* fold the two checksum lanes into one */
+	vmovdqu %xmm14, (%rcx); /* store the updated Offset */
+	vpxor %xmm13, %xmm15, %xmm15;
+	vmovdqu %xmm15, (%r8); /* store the updated Checksum */
+
+	/* inpack16_pre: */
+	vpbroadcastq (key_table)(CTX), %ymm15;
+	vpshufb .Lpack_bswap RIP, %ymm15, %ymm15;
+	vpxor %ymm0, %ymm15, %ymm0;
+	vpxor %ymm1, %ymm15, %ymm1;
+	vpxor %ymm2, %ymm15, %ymm2;
+	vpxor %ymm3, %ymm15, %ymm3;
+	vpxor %ymm4, %ymm15, %ymm4;
+	vpxor %ymm5, %ymm15, %ymm5;
+	vpxor %ymm6, %ymm15, %ymm6;
+	vpxor %ymm7, %ymm15, %ymm7;
+	vpxor %ymm8, %ymm15, %ymm8;
+	vpxor %ymm9, %ymm15, %ymm9;
+	vpxor %ymm10, %ymm15, %ymm10;
+	vpxor %ymm11, %ymm15, %ymm11;
+	vpxor %ymm12, %ymm15, %ymm12;
+	vpxor 13 * 32(%rax), %ymm15, %ymm13; /* the three parked inputs come back from the stack */
+	vpxor 14 * 32(%rax), %ymm15, %ymm14;
+	vpxor 15 * 32(%rax), %ymm15, %ymm15;
+
+	call __camellia_enc_blk32;
+
+	vpxor 0 * 32(%rsi), %ymm7, %ymm7; /* xor with the offsets OCB_INPUT stashed in dst */
+	vpxor 1 * 32(%rsi), %ymm6, %ymm6;
+	vpxor 2 * 32(%rsi), %ymm5, %ymm5;
+	vpxor 3 * 32(%rsi), %ymm4, %ymm4;
+	vpxor 4 * 32(%rsi), %ymm3, %ymm3;
+	vpxor 5 * 32(%rsi), %ymm2, %ymm2;
+	vpxor 6 * 32(%rsi), %ymm1, %ymm1;
+	vpxor 7 * 32(%rsi), %ymm0, %ymm0;
+	vpxor 8 * 32(%rsi), %ymm15, %ymm15;
+	vpxor 9 * 32(%rsi), %ymm14, %ymm14;
+	vpxor 10 * 32(%rsi), %ymm13, %ymm13;
+	vpxor 11 * 32(%rsi), %ymm12, %ymm12;
+	vpxor 12 * 32(%rsi), %ymm11, %ymm11;
+	vpxor 13 * 32(%rsi), %ymm10, %ymm10;
+	vpxor 14 * 32(%rsi), %ymm9, %ymm9;
+	vpxor 15 * 32(%rsi), %ymm8, %ymm8;
+
+	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
+		     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
+		     %ymm8, %rsi);
+
+	vzeroall; /* zero every vector register before returning */
+
+	movq (16 * 32 + 0 * 8)(%rax), %r10;
+	movq (16 * 32 + 1 * 8)(%rax), %r11;
+	movq (16 * 32 + 2 * 8)(%rax), %r12;
+	movq (16 * 32 + 3 * 8)(%rax), %r13;
+
+	leave;
+	ret;
+ELF(.size _gcry_camellia_aesni_avx2_ocb_enc,.-_gcry_camellia_aesni_avx2_ocb_enc;)
+
+.align 8
+.globl _gcry_camellia_aesni_avx2_ocb_dec
+ELF(.type   _gcry_camellia_aesni_avx2_ocb_dec,@function;)
+
+_gcry_camellia_aesni_avx2_ocb_dec:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (32 blocks)
+	 *	%rdx: src (32 blocks)
+	 *	%rcx: offset (16 bytes, updated in place)
+	 *	%r8 : checksum (16 bytes, updated in place)
+	 *	%r9 : L pointers (void *L[32])
+	 */
+
+	pushq %rbp;
+	movq %rsp, %rbp;
+
+	vzeroupper;
+
+	subq $(16 * 32 + 4 * 8), %rsp; /* 16 ymm slots + 4 GPR save slots */
+	andq $~63, %rsp;
+	movq %rsp, %rax; /* %rax: base of the scratch area */
+
+	movq %r10, (16 * 32 + 0 * 8)(%rax); /* %r10-%r13 are used for L pointers below */
+	movq %r11, (16 * 32 + 1 * 8)(%rax);
+	movq %r12, (16 * 32 + 2 * 8)(%rax);
+	movq %r13, (16 * 32 + 3 * 8)(%rax);
+
+	vmovdqu (%rcx), %xmm14; /* %xmm14: running Offset */
+
+	/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	/* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i)  */
+
+#define OCB_INPUT(n, l0reg, l1reg, yreg) \
+	  vmovdqu (n * 32)(%rdx), yreg; \
+	  vpxor (l0reg), %xmm14, %xmm15; \
+	  vpxor (l1reg), %xmm15, %xmm14; \
+	  vinserti128 $1, %xmm14, %ymm15, %ymm15; \
+	  vpxor yreg, %ymm15, yreg; \
+	  vmovdqu %ymm15, (n * 32)(%rsi);
+
+	movq (0 * 8)(%r9), %r10;
+	movq (1 * 8)(%r9), %r11;
+	movq (2 * 8)(%r9), %r12;
+	movq (3 * 8)(%r9), %r13;
+	OCB_INPUT(0, %r10, %r11, %ymm0); /* each OCB_INPUT handles two blocks and stashes their offsets in dst */
+	vmovdqu %ymm0, (15 * 32)(%rax); /* park on the stack: %ymm15 is the offset temporary */
+	OCB_INPUT(1, %r12, %r13, %ymm0);
+	vmovdqu %ymm0, (14 * 32)(%rax); /* park on the stack: %xmm14 is the running Offset */
+	movq (4 * 8)(%r9), %r10;
+	movq (5 * 8)(%r9), %r11;
+	movq (6 * 8)(%r9), %r12;
+	movq (7 * 8)(%r9), %r13;
+	OCB_INPUT(2, %r10, %r11, %ymm13);
+	OCB_INPUT(3, %r12, %r13, %ymm12);
+	movq (8 * 8)(%r9), %r10;
+	movq (9 * 8)(%r9), %r11;
+	movq (10 * 8)(%r9), %r12;
+	movq (11 * 8)(%r9), %r13;
+	OCB_INPUT(4, %r10, %r11, %ymm11);
+	OCB_INPUT(5, %r12, %r13, %ymm10);
+	movq (12 * 8)(%r9), %r10;
+	movq (13 * 8)(%r9), %r11;
+	movq (14 * 8)(%r9), %r12;
+	movq (15 * 8)(%r9), %r13;
+	OCB_INPUT(6, %r10, %r11, %ymm9);
+	OCB_INPUT(7, %r12, %r13, %ymm8);
+	movq (16 * 8)(%r9), %r10;
+	movq (17 * 8)(%r9), %r11;
+	movq (18 * 8)(%r9), %r12;
+	movq (19 * 8)(%r9), %r13;
+	OCB_INPUT(8, %r10, %r11, %ymm7);
+	OCB_INPUT(9, %r12, %r13, %ymm6);
+	movq (20 * 8)(%r9), %r10;
+	movq (21 * 8)(%r9), %r11;
+	movq (22 * 8)(%r9), %r12;
+	movq (23 * 8)(%r9), %r13;
+	OCB_INPUT(10, %r10, %r11, %ymm5);
+	OCB_INPUT(11, %r12, %r13, %ymm4);
+	movq (24 * 8)(%r9), %r10;
+	movq (25 * 8)(%r9), %r11;
+	movq (26 * 8)(%r9), %r12;
+	movq (27 * 8)(%r9), %r13;
+	OCB_INPUT(12, %r10, %r11, %ymm3);
+	OCB_INPUT(13, %r12, %r13, %ymm2);
+	movq (28 * 8)(%r9), %r10;
+	movq (29 * 8)(%r9), %r11;
+	movq (30 * 8)(%r9), %r12;
+	movq (31 * 8)(%r9), %r13;
+	OCB_INPUT(14, %r10, %r11, %ymm1);
+	OCB_INPUT(15, %r12, %r13, %ymm0);
+#undef OCB_INPUT
+
+	vmovdqu %xmm14, (%rcx); /* store the updated Offset */
+
+	movq %r8, %r10; /* %r10: checksum pointer; %r8 is reused as an index below */
+
+	cmpl $128, key_bitlength(CTX); /* key_table index: 24 for 128-bit keys, otherwise 32 */
+	movl $32, %r8d;
+	movl $24, %r9d;
+	cmovel %r9d, %r8d; /* max */
+
+	/* inpack16_pre: */
+	vpbroadcastq (key_table)(CTX, %r8, 8), %ymm15;
+	vpshufb .Lpack_bswap RIP, %ymm15, %ymm15;
+	vpxor %ymm0, %ymm15, %ymm0;
+	vpxor %ymm1, %ymm15, %ymm1;
+	vpxor %ymm2, %ymm15, %ymm2;
+	vpxor %ymm3, %ymm15, %ymm3;
+	vpxor %ymm4, %ymm15, %ymm4;
+	vpxor %ymm5, %ymm15, %ymm5;
+	vpxor %ymm6, %ymm15, %ymm6;
+	vpxor %ymm7, %ymm15, %ymm7;
+	vpxor %ymm8, %ymm15, %ymm8;
+	vpxor %ymm9, %ymm15, %ymm9;
+	vpxor %ymm10, %ymm15, %ymm10;
+	vpxor %ymm11, %ymm15, %ymm11;
+	vpxor %ymm12, %ymm15, %ymm12;
+	vpxor %ymm13, %ymm15, %ymm13;
+	vpxor 14 * 32(%rax), %ymm15, %ymm14; /* the two parked inputs come back from the stack */
+	vpxor 15 * 32(%rax), %ymm15, %ymm15;
+
+	call __camellia_dec_blk32;
+
+	vpxor 0 * 32(%rsi), %ymm7, %ymm7; /* xor with the offsets OCB_INPUT stashed in dst */
+	vpxor 1 * 32(%rsi), %ymm6, %ymm6;
+	vpxor 2 * 32(%rsi), %ymm5, %ymm5;
+	vpxor 3 * 32(%rsi), %ymm4, %ymm4;
+	vpxor 4 * 32(%rsi), %ymm3, %ymm3;
+	vpxor 5 * 32(%rsi), %ymm2, %ymm2;
+	vpxor 6 * 32(%rsi), %ymm1, %ymm1;
+	vpxor 7 * 32(%rsi), %ymm0, %ymm0;
+	vmovdqu %ymm7, (7 * 32)(%rax); /* save these plaintext blocks: %ymm7/%ymm6 become accumulators below */
+	vmovdqu %ymm6, (6 * 32)(%rax);
+	vpxor 8 * 32(%rsi), %ymm15, %ymm15;
+	vpxor 9 * 32(%rsi), %ymm14, %ymm14;
+	vpxor 10 * 32(%rsi), %ymm13, %ymm13;
+	vpxor 11 * 32(%rsi), %ymm12, %ymm12;
+	vpxor 12 * 32(%rsi), %ymm11, %ymm11;
+	vpxor 13 * 32(%rsi), %ymm10, %ymm10;
+	vpxor 14 * 32(%rsi), %ymm9, %ymm9;
+	vpxor 15 * 32(%rsi), %ymm8, %ymm8;
+
+	/* Checksum_i = Checksum_{i-1} xor P_i  */
+
+	vpxor %ymm5, %ymm7, %ymm7; /* two interleaved xor chains, merged at the end */
+	vpxor %ymm4, %ymm6, %ymm6;
+	vpxor %ymm3, %ymm7, %ymm7;
+	vpxor %ymm2, %ymm6, %ymm6;
+	vpxor %ymm1, %ymm7, %ymm7;
+	vpxor %ymm0, %ymm6, %ymm6;
+	vpxor %ymm15, %ymm7, %ymm7;
+	vpxor %ymm14, %ymm6, %ymm6;
+	vpxor %ymm13, %ymm7, %ymm7;
+	vpxor %ymm12, %ymm6, %ymm6;
+	vpxor %ymm11, %ymm7, %ymm7;
+	vpxor %ymm10, %ymm6, %ymm6;
+	vpxor %ymm9, %ymm7, %ymm7;
+	vpxor %ymm8, %ymm6, %ymm6;
+	vpxor %ymm7, %ymm6, %ymm7;
+
+	vextracti128 $1, %ymm7, %xmm6; /* fold the two lanes into one block */
+	vpxor %xmm6, %xmm7, %xmm7;
+	vpxor (%r10), %xmm7, %xmm7; /* add the previous Checksum */
+	vmovdqu %xmm7, (%r10); /* store the updated Checksum */
+
+	vmovdqu 7 * 32(%rax), %ymm7; /* reload the saved plaintext blocks for output */
+	vmovdqu 6 * 32(%rax), %ymm6;
+
+	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
+		     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
+		     %ymm8, %rsi);
+
+	vzeroall; /* zero every vector register before returning */
+
+	movq (16 * 32 + 0 * 8)(%rax), %r10;
+	movq (16 * 32 + 1 * 8)(%rax), %r11;
+	movq (16 * 32 + 2 * 8)(%rax), %r12;
+	movq (16 * 32 + 3 * 8)(%rax), %r13;
+
+	leave;
+	ret;
+ELF(.size _gcry_camellia_aesni_avx2_ocb_dec,.-_gcry_camellia_aesni_avx2_ocb_dec;)
+
+.align 8
+.globl _gcry_camellia_aesni_avx2_ocb_auth
+ELF(.type   _gcry_camellia_aesni_avx2_ocb_auth,@function;)
+
+_gcry_camellia_aesni_avx2_ocb_auth:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: abuf (32 blocks, only read)
+	 *	%rdx: offset (16 bytes, updated in place)
+	 *	%rcx: checksum (16 bytes, updated in place)
+	 *	%r8 : L pointers (void *L[32])
+	 */
+
+	pushq %rbp;
+	movq %rsp, %rbp;
+
+	vzeroupper;
+
+	subq $(16 * 32 + 4 * 8), %rsp; /* 16 ymm slots + 4 GPR save slots */
+	andq $~63, %rsp;
+	movq %rsp, %rax; /* %rax: base of the scratch area */
+
+	movq %r10, (16 * 32 + 0 * 8)(%rax); /* %r10-%r13 are used for L pointers below */
+	movq %r11, (16 * 32 + 1 * 8)(%rax);
+	movq %r12, (16 * 32 + 2 * 8)(%rax);
+	movq %r13, (16 * 32 + 3 * 8)(%rax);
+
+	vmovdqu (%rdx), %xmm14; /* %xmm14: running Offset */
+
+	/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	/* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i)  */
+	/* Only the offset and the checksum are written back to the caller. */
+
+#define OCB_INPUT(n, l0reg, l1reg, yreg) \
+	  vmovdqu (n * 32)(%rsi), yreg; \
+	  vpxor (l0reg), %xmm14, %xmm15; \
+	  vpxor (l1reg), %xmm15, %xmm14; \
+	  vinserti128 $1, %xmm14, %ymm15, %ymm15; \
+	  vpxor yreg, %ymm15, yreg;
+
+	movq (0 * 8)(%r8), %r10;
+	movq (1 * 8)(%r8), %r11;
+	movq (2 * 8)(%r8), %r12;
+	movq (3 * 8)(%r8), %r13;
+	OCB_INPUT(0, %r10, %r11, %ymm0); /* each OCB_INPUT handles two blocks */
+	vmovdqu %ymm0, (15 * 32)(%rax); /* park on the stack: %ymm15 is the offset temporary */
+	OCB_INPUT(1, %r12, %r13, %ymm0);
+	vmovdqu %ymm0, (14 * 32)(%rax); /* park on the stack: %xmm14 is the running Offset */
+	movq (4 * 8)(%r8), %r10;
+	movq (5 * 8)(%r8), %r11;
+	movq (6 * 8)(%r8), %r12;
+	movq (7 * 8)(%r8), %r13;
+	OCB_INPUT(2, %r10, %r11, %ymm13);
+	OCB_INPUT(3, %r12, %r13, %ymm12);
+	movq (8 * 8)(%r8), %r10;
+	movq (9 * 8)(%r8), %r11;
+	movq (10 * 8)(%r8), %r12;
+	movq (11 * 8)(%r8), %r13;
+	OCB_INPUT(4, %r10, %r11, %ymm11);
+	OCB_INPUT(5, %r12, %r13, %ymm10);
+	movq (12 * 8)(%r8), %r10;
+	movq (13 * 8)(%r8), %r11;
+	movq (14 * 8)(%r8), %r12;
+	movq (15 * 8)(%r8), %r13;
+	OCB_INPUT(6, %r10, %r11, %ymm9);
+	OCB_INPUT(7, %r12, %r13, %ymm8);
+	movq (16 * 8)(%r8), %r10;
+	movq (17 * 8)(%r8), %r11;
+	movq (18 * 8)(%r8), %r12;
+	movq (19 * 8)(%r8), %r13;
+	OCB_INPUT(8, %r10, %r11, %ymm7);
+	OCB_INPUT(9, %r12, %r13, %ymm6);
+	movq (20 * 8)(%r8), %r10;
+	movq (21 * 8)(%r8), %r11;
+	movq (22 * 8)(%r8), %r12;
+	movq (23 * 8)(%r8), %r13;
+	OCB_INPUT(10, %r10, %r11, %ymm5);
+	OCB_INPUT(11, %r12, %r13, %ymm4);
+	movq (24 * 8)(%r8), %r10;
+	movq (25 * 8)(%r8), %r11;
+	movq (26 * 8)(%r8), %r12;
+	movq (27 * 8)(%r8), %r13;
+	OCB_INPUT(12, %r10, %r11, %ymm3);
+	OCB_INPUT(13, %r12, %r13, %ymm2);
+	movq (28 * 8)(%r8), %r10;
+	movq (29 * 8)(%r8), %r11;
+	movq (30 * 8)(%r8), %r12;
+	movq (31 * 8)(%r8), %r13;
+	OCB_INPUT(14, %r10, %r11, %ymm1);
+	OCB_INPUT(15, %r12, %r13, %ymm0);
+#undef OCB_INPUT
+
+	vmovdqu %xmm14, (%rdx); /* store the updated Offset */
+
+	movq %rcx, %r10; /* %r10: checksum pointer, used after the call */
+
+	/* inpack16_pre: */
+	vpbroadcastq (key_table)(CTX), %ymm15;
+	vpshufb .Lpack_bswap RIP, %ymm15, %ymm15;
+	vpxor %ymm0, %ymm15, %ymm0;
+	vpxor %ymm1, %ymm15, %ymm1;
+	vpxor %ymm2, %ymm15, %ymm2;
+	vpxor %ymm3, %ymm15, %ymm3;
+	vpxor %ymm4, %ymm15, %ymm4;
+	vpxor %ymm5, %ymm15, %ymm5;
+	vpxor %ymm6, %ymm15, %ymm6;
+	vpxor %ymm7, %ymm15, %ymm7;
+	vpxor %ymm8, %ymm15, %ymm8;
+	vpxor %ymm9, %ymm15, %ymm9;
+	vpxor %ymm10, %ymm15, %ymm10;
+	vpxor %ymm11, %ymm15, %ymm11;
+	vpxor %ymm12, %ymm15, %ymm12;
+	vpxor %ymm13, %ymm15, %ymm13;
+	vpxor 14 * 32(%rax), %ymm15, %ymm14; /* the two parked inputs come back from the stack */
+	vpxor 15 * 32(%rax), %ymm15, %ymm15;
+
+	call __camellia_enc_blk32;
+
+	vpxor %ymm7, %ymm6, %ymm6; /* xor-fold all 16 output registers into %ymm0 */
+	vpxor %ymm5, %ymm4, %ymm4;
+	vpxor %ymm3, %ymm2, %ymm2;
+	vpxor %ymm1, %ymm0, %ymm0;
+	vpxor %ymm15, %ymm14, %ymm14;
+	vpxor %ymm13, %ymm12, %ymm12;
+	vpxor %ymm11, %ymm10, %ymm10;
+	vpxor %ymm9, %ymm8, %ymm8;
+
+	vpxor %ymm6, %ymm4, %ymm4;
+	vpxor %ymm2, %ymm0, %ymm0;
+	vpxor %ymm14, %ymm12, %ymm12;
+	vpxor %ymm10, %ymm8, %ymm8;
+
+	vpxor %ymm4, %ymm0, %ymm0;
+	vpxor %ymm12, %ymm8, %ymm8;
+
+	vpxor %ymm0, %ymm8, %ymm0;
+
+	vextracti128 $1, %ymm0, %xmm1; /* fold the two lanes into one block */
+	vpxor (%r10), %xmm0, %xmm0; /* add the previous Sum */
+	vpxor %xmm0, %xmm1, %xmm0;
+	vmovdqu %xmm0, (%r10); /* store the updated Sum */
+
+	vzeroall; /* zero every vector register before returning */
+
+	movq (16 * 32 + 0 * 8)(%rax), %r10;
+	movq (16 * 32 + 1 * 8)(%rax), %r11;
+	movq (16 * 32 + 2 * 8)(%rax), %r12;
+	movq (16 * 32 + 3 * 8)(%rax), %r13;
+
+	leave;
+	ret;
+ELF(.size _gcry_camellia_aesni_avx2_ocb_auth,.-_gcry_camellia_aesni_avx2_ocb_auth;)
+
 #endif /*defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT)*/
 #endif /*__x86_64*/
diff --git a/cipher/camellia-glue.c b/cipher/camellia-glue.c
index 5032321..197e1b3 100644
--- a/cipher/camellia-glue.c
+++ b/cipher/camellia-glue.c
@@ -63,6 +63,7 @@
 #include "cipher.h"
 #include "camellia.h"
 #include "bufhelp.h"
+#include "cipher-internal.h"
 #include "cipher-selftest.h"
 
 /* Helper macro to force alignment to 16 bytes.  */
@@ -135,6 +136,26 @@ extern void _gcry_camellia_aesni_avx_cfb_dec(CAMELLIA_context *ctx,
 					     const unsigned char *in,
 					     unsigned char *iv) ASM_FUNC_ABI;
 
+extern void _gcry_camellia_aesni_avx_ocb_enc(CAMELLIA_context *ctx,
+					     unsigned char *out,
+					     const unsigned char *in,
+					     unsigned char *offset,
+					     unsigned char *checksum,
+					     const void *Ls[16]) ASM_FUNC_ABI; /* OCB encrypt; the caller in this file passes 16 blocks and one L pointer per block */
+
+extern void _gcry_camellia_aesni_avx_ocb_dec(CAMELLIA_context *ctx,
+					     unsigned char *out,
+					     const unsigned char *in,
+					     unsigned char *offset,
+					     unsigned char *checksum,
+					     const void *Ls[16]) ASM_FUNC_ABI; /* OCB decrypt; same calling pattern as the encrypt routine */
+
+extern void _gcry_camellia_aesni_avx_ocb_auth(CAMELLIA_context *ctx,
+					     const unsigned char *abuf,
+					     unsigned char *offset,
+					     unsigned char *checksum,
+					     const void *Ls[16]) ASM_FUNC_ABI; /* OCB authentication of 16 blocks; updates offset and checksum only */
+
 extern void _gcry_camellia_aesni_avx_keygen(CAMELLIA_context *ctx,
 					    const unsigned char *key,
 					    unsigned int keylen) ASM_FUNC_ABI;
@@ -158,6 +179,26 @@ extern void _gcry_camellia_aesni_avx2_cfb_dec(CAMELLIA_context *ctx,
 					      unsigned char *out,
 					      const unsigned char *in,
 					      unsigned char *iv) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_aesni_avx2_ocb_enc(CAMELLIA_context *ctx,
+					      unsigned char *out,
+					      const unsigned char *in,
+					      unsigned char *offset,
+					      unsigned char *checksum,
+					      const void *Ls[32]) ASM_FUNC_ABI; /* OCB encrypt of 32 blocks; one L pointer per block */
+
+extern void _gcry_camellia_aesni_avx2_ocb_dec(CAMELLIA_context *ctx,
+					      unsigned char *out,
+					      const unsigned char *in,
+					      unsigned char *offset,
+					      unsigned char *checksum,
+					      const void *Ls[32]) ASM_FUNC_ABI; /* OCB decrypt of 32 blocks; one L pointer per block */
+
+extern void _gcry_camellia_aesni_avx2_ocb_auth(CAMELLIA_context *ctx,
+					       const unsigned char *abuf,
+					       unsigned char *offset,
+					       unsigned char *checksum,
+					       const void *Ls[32]) ASM_FUNC_ABI; /* OCB authentication of 32 blocks; updates offset and checksum only */
 #endif
 
 static const char *selftest(void);
@@ -563,6 +604,294 @@ _gcry_camellia_cfb_dec(void *context, unsigned char *iv,
   _gcry_burn_stack(burn_stack_depth);
 }
 
+static inline const unsigned char * /* Return a pointer to the L value for block index I (L_{ntz(i)}). */
+get_l (gcry_cipher_hd_t c, unsigned char *l_tmp, u64 i)
+{
+  unsigned int ntz = _gcry_ctz64 (i); /* presumably the count of trailing zero bits of I */
+
+  if (ntz < OCB_L_TABLE_SIZE)
+      return c->u_mode.ocb.L[ntz]; /* common case: entry of the L table kept in the handle */
+  else
+      return _gcry_cipher_ocb_get_l (c, l_tmp, i); /* fallback; presumably computes the value into L_TMP */
+}
+
+/* Bulk encryption/decryption of complete blocks in OCB mode.  Processes NBLOCKS blocks from INBUF_ARG into OUTBUF_ARG; ENCRYPT selects the direction.  Updates the offset (c->u_iv.iv), the checksum (c->u_ctr.ctr) and the data block counter in C. */
+void
+_gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
+			  const void *inbuf_arg, size_t nblocks, int encrypt)
+{
+  CAMELLIA_context *ctx = (void *)&c->context.c;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+  unsigned char l_tmp[CAMELLIA_BLOCK_SIZE]; /* scratch block; also handed to get_l as its temporary */
+  const unsigned char *l;
+  int burn_stack_depth;
+  u64 blkn = c->u_mode.ocb.data_nblocks; /* running block index; written back before returning */
+
+  burn_stack_depth = encrypt ? CAMELLIA_encrypt_stack_burn_size :
+			      CAMELLIA_decrypt_stack_burn_size;
+
+#ifdef USE_AESNI_AVX2
+  if (ctx->use_aesni_avx2)
+    {
+      int did_use_aesni_avx2 = 0;
+      const void *Ls[32];
+      int i;
+
+      /* Process data in 32 block chunks. */
+      while (nblocks >= 32)
+	{
+	  /* l_tmp will be used only every 65536-th block. */
+	  for (i = 0; i < 32; i += 4)
+	    {
+	      Ls[i + 0] = get_l(c, l_tmp, blkn + 1);
+	      Ls[i + 1] = get_l(c, l_tmp, blkn + 2);
+	      Ls[i + 2] = get_l(c, l_tmp, blkn + 3);
+	      Ls[i + 3] = get_l(c, l_tmp, blkn + 4);
+	      blkn += 4;
+	    }
+
+	  if (encrypt)
+	    _gcry_camellia_aesni_avx2_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
+					      c->u_ctr.ctr, Ls);
+	  else
+	    _gcry_camellia_aesni_avx2_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
+					      c->u_ctr.ctr, Ls);
+
+	  nblocks -= 32;
+	  outbuf += 32 * CAMELLIA_BLOCK_SIZE;
+	  inbuf  += 32 * CAMELLIA_BLOCK_SIZE;
+	  did_use_aesni_avx2 = 1;
+	}
+
+      if (did_use_aesni_avx2)
+	{
+	  int avx2_burn_stack_depth = 32 * CAMELLIA_BLOCK_SIZE +
+				      2 * sizeof(void *) + ASM_EXTRA_STACK;
+
+	  if (burn_stack_depth < avx2_burn_stack_depth)
+	    burn_stack_depth = avx2_burn_stack_depth;
+	}
+
+      /* Use generic code to handle smaller chunks... */
+    }
+#endif
+
+#ifdef USE_AESNI_AVX
+  if (ctx->use_aesni_avx)
+    {
+      int did_use_aesni_avx = 0;
+      const void *Ls[16];
+      int i;
+
+      /* Process data in 16 block chunks. */
+      while (nblocks >= 16)
+	{
+	  /* l_tmp will be used only every 65536-th block. */
+	  for (i = 0; i < 16; i += 4)
+	    {
+	      Ls[i + 0] = get_l(c, l_tmp, blkn + 1);
+	      Ls[i + 1] = get_l(c, l_tmp, blkn + 2);
+	      Ls[i + 2] = get_l(c, l_tmp, blkn + 3);
+	      Ls[i + 3] = get_l(c, l_tmp, blkn + 4);
+	      blkn += 4;
+	    }
+
+	  if (encrypt)
+	    _gcry_camellia_aesni_avx_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
+					    c->u_ctr.ctr, Ls);
+	  else
+	    _gcry_camellia_aesni_avx_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
+					    c->u_ctr.ctr, Ls);
+
+	  nblocks -= 16;
+	  outbuf += 16 * CAMELLIA_BLOCK_SIZE;
+	  inbuf  += 16 * CAMELLIA_BLOCK_SIZE;
+	  did_use_aesni_avx = 1;
+	}
+
+      if (did_use_aesni_avx)
+	{
+	  int avx_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE +
+				      2 * sizeof(void *) + ASM_EXTRA_STACK;
+
+	  if (burn_stack_depth < avx_burn_stack_depth)
+	    burn_stack_depth = avx_burn_stack_depth;
+	}
+
+      /* Use generic code to handle smaller chunks... */
+    }
+#endif
+
+  if (encrypt)
+    {
+      for (; nblocks; nblocks--)
+	{
+	  l = get_l(c, l_tmp, ++blkn); /* L is consumed by the next statement, before l_tmp is overwritten */
+
+	  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	  buf_xor_1 (c->u_iv.iv, l, CAMELLIA_BLOCK_SIZE);
+	  buf_cpy (l_tmp, inbuf, CAMELLIA_BLOCK_SIZE);
+	  /* Checksum_i = Checksum_{i-1} xor P_i  */
+	  buf_xor_1 (c->u_ctr.ctr, l_tmp, CAMELLIA_BLOCK_SIZE);
+	  /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
+	  buf_xor_1 (l_tmp, c->u_iv.iv, CAMELLIA_BLOCK_SIZE);
+	  Camellia_EncryptBlock(ctx->keybitlength, l_tmp, ctx->keytable, l_tmp);
+	  buf_xor_1 (l_tmp, c->u_iv.iv, CAMELLIA_BLOCK_SIZE);
+	  buf_cpy (outbuf, l_tmp, CAMELLIA_BLOCK_SIZE);
+
+	  inbuf += CAMELLIA_BLOCK_SIZE;
+	  outbuf += CAMELLIA_BLOCK_SIZE;
+	}
+    }
+  else
+    {
+      for (; nblocks; nblocks--)
+	{
+	  l = get_l(c, l_tmp, ++blkn); /* L is consumed by the next statement, before l_tmp is overwritten */
+
+	  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	  buf_xor_1 (c->u_iv.iv, l, CAMELLIA_BLOCK_SIZE);
+	  buf_cpy (l_tmp, inbuf, CAMELLIA_BLOCK_SIZE);
+	  /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i)  */
+	  buf_xor_1 (l_tmp, c->u_iv.iv, CAMELLIA_BLOCK_SIZE);
+	  Camellia_DecryptBlock(ctx->keybitlength, l_tmp, ctx->keytable, l_tmp);
+	  buf_xor_1 (l_tmp, c->u_iv.iv, CAMELLIA_BLOCK_SIZE);
+	  /* Checksum_i = Checksum_{i-1} xor P_i  */
+	  buf_xor_1 (c->u_ctr.ctr, l_tmp, CAMELLIA_BLOCK_SIZE);
+	  buf_cpy (outbuf, l_tmp, CAMELLIA_BLOCK_SIZE);
+
+	  inbuf += CAMELLIA_BLOCK_SIZE;
+	  outbuf += CAMELLIA_BLOCK_SIZE;
+	}
+    }
+
+  c->u_mode.ocb.data_nblocks = blkn;
+
+  wipememory(&l_tmp, sizeof(l_tmp)); /* l_tmp held plaintext/ciphertext blocks */
+
+  if (burn_stack_depth)
+    _gcry_burn_stack (burn_stack_depth + 4 * sizeof(void *));
+}
+
+/* Bulk authentication of complete blocks in OCB mode.  Processes NBLOCKS blocks from ABUF_ARG, updating c->u_mode.ocb.aad_offset, c->u_mode.ocb.aad_sum and the AAD block counter in C; the input is only read. */
+void
+_gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
+			size_t nblocks)
+{
+  CAMELLIA_context *ctx = (void *)&c->context.c;
+  const unsigned char *abuf = abuf_arg;
+  unsigned char l_tmp[CAMELLIA_BLOCK_SIZE]; /* scratch block; also handed to get_l as its temporary */
+  const unsigned char *l;
+  int burn_stack_depth;
+  u64 blkn = c->u_mode.ocb.aad_nblocks; /* running block index; written back before returning */
+
+  burn_stack_depth = CAMELLIA_encrypt_stack_burn_size;
+
+#ifdef USE_AESNI_AVX2
+  if (ctx->use_aesni_avx2)
+    {
+      int did_use_aesni_avx2 = 0;
+      const void *Ls[32];
+      int i;
+
+      /* Process data in 32 block chunks. */
+      while (nblocks >= 32)
+	{
+	  /* l_tmp will be used only every 65536-th block. */
+	  for (i = 0; i < 32; i += 4)
+	    {
+	      Ls[i + 0] = get_l(c, l_tmp, blkn + 1);
+	      Ls[i + 1] = get_l(c, l_tmp, blkn + 2);
+	      Ls[i + 2] = get_l(c, l_tmp, blkn + 3);
+	      Ls[i + 3] = get_l(c, l_tmp, blkn + 4);
+	      blkn += 4;
+	    }
+
+	  _gcry_camellia_aesni_avx2_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
+					    c->u_mode.ocb.aad_sum, Ls);
+
+	  nblocks -= 32;
+	  abuf += 32 * CAMELLIA_BLOCK_SIZE;
+	  did_use_aesni_avx2 = 1;
+	}
+
+      if (did_use_aesni_avx2)
+	{
+	  int avx2_burn_stack_depth = 32 * CAMELLIA_BLOCK_SIZE +
+				      2 * sizeof(void *) + ASM_EXTRA_STACK;
+
+	  if (burn_stack_depth < avx2_burn_stack_depth)
+	    burn_stack_depth = avx2_burn_stack_depth;
+	}
+
+      /* Use generic code to handle smaller chunks... */
+    }
+#endif
+
+#ifdef USE_AESNI_AVX
+  if (ctx->use_aesni_avx)
+    {
+      int did_use_aesni_avx = 0;
+      const void *Ls[16];
+      int i;
+
+      /* Process data in 16 block chunks. */
+      while (nblocks >= 16)
+	{
+	  /* l_tmp will be used only every 65536-th block. */
+	  for (i = 0; i < 16; i += 4)
+	    {
+	      Ls[i + 0] = get_l(c, l_tmp, blkn + 1);
+	      Ls[i + 1] = get_l(c, l_tmp, blkn + 2);
+	      Ls[i + 2] = get_l(c, l_tmp, blkn + 3);
+	      Ls[i + 3] = get_l(c, l_tmp, blkn + 4);
+	      blkn += 4;
+	    }
+
+	  _gcry_camellia_aesni_avx_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
+					    c->u_mode.ocb.aad_sum, Ls);
+
+	  nblocks -= 16;
+	  abuf += 16 * CAMELLIA_BLOCK_SIZE;
+	  did_use_aesni_avx = 1;
+	}
+
+      if (did_use_aesni_avx)
+	{
+	  int avx_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE +
+				      2 * sizeof(void *) + ASM_EXTRA_STACK;
+
+	  if (burn_stack_depth < avx_burn_stack_depth)
+	    burn_stack_depth = avx_burn_stack_depth;
+	}
+
+      /* Use generic code to handle smaller chunks... */
+    }
+#endif
+
+  for (; nblocks; nblocks--)
+    {
+      l = get_l(c, l_tmp, ++blkn); /* L is consumed by the next statement, before l_tmp is overwritten */
+
+      /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+      buf_xor_1 (c->u_mode.ocb.aad_offset, l, CAMELLIA_BLOCK_SIZE);
+      /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i)  */
+      buf_xor (l_tmp, c->u_mode.ocb.aad_offset, abuf, CAMELLIA_BLOCK_SIZE);
+      Camellia_EncryptBlock(ctx->keybitlength, l_tmp, ctx->keytable, l_tmp);
+      buf_xor_1 (c->u_mode.ocb.aad_sum, l_tmp, CAMELLIA_BLOCK_SIZE);
+
+      abuf += CAMELLIA_BLOCK_SIZE;
+    }
+
+  c->u_mode.ocb.aad_nblocks = blkn;
+
+  wipememory(&l_tmp, sizeof(l_tmp)); /* l_tmp held intermediate cipher blocks */
+
+  if (burn_stack_depth)
+    _gcry_burn_stack (burn_stack_depth + 4 * sizeof(void *));
+}
+
 /* Run the self-tests for CAMELLIA-CTR-128, tests IV increment of bulk CTR
    encryption.  Returns NULL on success. */
 static const char*
diff --git a/cipher/cipher.c b/cipher/cipher.c
index 7a29824..2d2b0ad 100644
--- a/cipher/cipher.c
+++ b/cipher/cipher.c
@@ -535,6 +535,8 @@ _gcry_cipher_open_internal (gcry_cipher_hd_t *handle,
               h->bulk.cbc_dec = _gcry_camellia_cbc_dec;
               h->bulk.cfb_dec = _gcry_camellia_cfb_dec;
               h->bulk.ctr_enc = _gcry_camellia_ctr_enc;
+              h->bulk.ocb_crypt = _gcry_camellia_ocb_crypt;
+              h->bulk.ocb_auth  = _gcry_camellia_ocb_auth;
               break;
 #endif /*USE_CAMELLIA*/
 #ifdef USE_DES
diff --git a/src/cipher.h b/src/cipher.h
index ef183fd..a0aac51 100644
--- a/src/cipher.h
+++ b/src/cipher.h
@@ -177,6 +177,11 @@ void _gcry_camellia_cbc_dec (void *context, unsigned char *iv,
 void _gcry_camellia_cfb_dec (void *context, unsigned char *iv,
                              void *outbuf_arg, const void *inbuf_arg,
                              size_t nblocks);
+void _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
+			       const void *inbuf_arg, size_t nblocks,
+			       int encrypt); /* bulk OCB encryption (ENCRYPT != 0) or decryption of NBLOCKS complete blocks */
+void _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
+			      size_t nblocks); /* bulk OCB authentication of NBLOCKS complete blocks */
 
 /*-- des.c --*/
 void _gcry_3des_ctr_enc (void *context, unsigned char *ctr,
diff --git a/tests/basic.c b/tests/basic.c
index 2c664c0..e3f4bfd 100644
--- a/tests/basic.c
+++ b/tests/basic.c
@@ -3335,6 +3335,15 @@ check_ocb_cipher (void)
   check_ocb_cipher_largebuf(GCRY_CIPHER_AES256, 32,
                             "\xfa\x26\xa5\xbf\xf6\x7d\x3a\x8d"
                             "\xfe\x96\x67\xc9\xc8\x41\x03\x51");
+  check_ocb_cipher_largebuf(GCRY_CIPHER_CAMELLIA128, 16,
+                            "\x28\x23\x38\x45\x2b\xfd\x42\x45"
+			    "\x43\x64\x7e\x67\x7f\xf4\x8b\xcd");
+  check_ocb_cipher_largebuf(GCRY_CIPHER_CAMELLIA192, 24,
+                            "\xee\xca\xe5\x39\x27\x2d\x33\xe7"
+			    "\x79\x74\xb0\x1d\x37\x12\xd5\x6c");
+  check_ocb_cipher_largebuf(GCRY_CIPHER_CAMELLIA256, 32,
+                            "\x39\x39\xd0\x2d\x05\x68\x74\xee"
+			    "\x18\x6b\xea\x3d\x0b\xd3\x58\xae");
 }