[git] GpgOL - branch, master, updated. gpgol-1.2.0-115-gf8ccdc6

by Andre Heinecke cvs at cvs.gnupg.org
Sun Nov 22 17:16:58 CET 2015


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GnuPG extension for MS Outlook".

The branch, master has been updated
       via  f8ccdc665fa554b70d24b2f606f016986a14652e (commit)
       via  2d5127b9faebe6677fa335cc8d20f52241fc7eb5 (commit)
       via  978b651f156049bfb7e35fc931c644acec83b5f3 (commit)
       via  642fd5b02bd3b0f3545e8bed68e3625663cacab3 (commit)
       via  9b77ef1253a881ab1c75022b2ad887b4cd2b0489 (commit)
       via  f67f41e0775336225e175f8b22e820b191dc4606 (commit)
       via  1ee1143f6c00ab35f48651059bca820967d86755 (commit)
      from  1362563c9370cc9c00463293a7f6eeb91b9424de (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
commit f8ccdc665fa554b70d24b2f606f016986a14652e
Author: Andre Heinecke <aheinecke at intevation.de>
Date:   Sun Nov 22 17:13:46 2015 +0100

    Use mlang charset conversion in RFC2047 parser
    
    * src/rfc2047parse.c (rfc2047_decode_tokens): Use mlang charset
      conversion.
      (_rfc2047_token, rfc2047_token_new_encoded_word): Unconstify charset.
      (rfc2047_token_free): New. Also free charset.
      (rfc2047_token_list_free): Use rfc2047_token_free.
      (g_mime_utils_header_decode_phrase): Cleanup redundant code.
      (rfc2047_parse): Ditto.
    
    --
    At least with a latin1 encoded mail from KMail this worked and
    it's probably the same conversion Outlook uses internally, so
    this should support everything that Outlook supports.

diff --git a/src/rfc2047parse.c b/src/rfc2047parse.c
index 72b195a..d60fc35 100644
--- a/src/rfc2047parse.c
+++ b/src/rfc2047parse.c
@@ -33,6 +33,7 @@
 
 #include <stdbool.h>
 #include "common.h"
+#include "mlang-charset.h"
 
 #include "gmime-table-private.h"
 
@@ -61,7 +62,7 @@ static unsigned char gmime_base64_rank[256] = {
 
 typedef struct _rfc2047_token {
     struct _rfc2047_token *next;
-    const char *charset;
+    char *charset;
     const char *text;
     size_t length;
     char encoding;
@@ -86,8 +87,9 @@ rfc2047_token_new_encoded_word (const char *word, size_t len)
 {
   rfc2047_token *token;
   const char *payload;
-  const char *charset;
+  char *charset;
   const char *inptr;
+  const char *tmpchar;
   char *buf, *lang;
   char encoding;
   size_t n;
@@ -98,9 +100,9 @@ rfc2047_token_new_encoded_word (const char *word, size_t len)
 
   /* skip over '=?' */
   inptr = word + 2;
-  charset = inptr;
+  tmpchar = inptr;
 
-  if (*charset == '?' || *charset == '*') {
+  if (*tmpchar == '?' || *tmpchar == '*') {
       /* this would result in an empty charset */
       return NULL;
   }
@@ -110,9 +112,9 @@ rfc2047_token_new_encoded_word (const char *word, size_t len)
     return NULL;
 
   /* copy the charset into a buffer */
-  n = (size_t) (inptr - charset);
-  buf = _alloca (n + 1);
-  memcpy (buf, charset, n);
+  n = (size_t) (inptr - tmpchar);
+  buf = malloc (n + 1);
+  memcpy (buf, tmpchar, n);
   buf[n] = '\0';
   charset = buf;
 
@@ -156,14 +158,23 @@ rfc2047_token_new_encoded_word (const char *word, size_t len)
     return NULL;
 
   token = rfc2047_token_new (payload, inptr - payload);
-  /* TODO lookup charset.*/
-  log_debug ("%s:%s: Charset name: %p", SRCNAME, __func__, charset);
-  token->charset = "UTF-8";/* g_mime_charset_iconv_name (charset); */
+  token->charset = charset;
   token->encoding = encoding;
 
   return token;
 }
 
+static void
+rfc2047_token_free (rfc2047_token * tok)
+{
+  if (!tok)
+    {
+      return;
+    }
+  xfree (tok->charset);
+  xfree (tok);
+}
+
 static rfc2047_token *
 tokenize_rfc2047_phrase (const char *in, size_t *len)
 {
@@ -248,7 +259,7 @@ non_rfc2047:
                   tail->next = lwsp;
                   tail = lwsp;
               } else if (lwsp != NULL) {
-                  xfree (lwsp);
+                  rfc2047_token_free (lwsp);
               }
 
               tail->next = token;
@@ -305,7 +316,7 @@ rfc2047_token_list_free (rfc2047_token * tokens)
   while (cur)
     {
       rfc2047_token *next = cur->next;
-      xfree (cur);
+      rfc2047_token_free (cur);
       cur = next;
     }
 }
@@ -506,13 +517,7 @@ rfc2047_decode_tokens (rfc2047_token *tokens, size_t buflen)
   char encoding;
   unsigned int save;
   int state;
-#if 0
-  TODO Conversion
-  size_t ninval;
-  iconv_t cd;
   char *str;
-#endif
-
 
   decoded = xmalloc (buflen + 1);
   memset (decoded, 0, buflen + 1);
@@ -565,43 +570,29 @@ rfc2047_decode_tokens (rfc2047_token *tokens, size_t buflen)
           /* convert the raw decoded text into UTF-8 */
           if (!strcasecmp (charset, "UTF-8")) {
               strncat (decoded, (char *) outptr, outlen);
-          }
-#if 0
-  /* TODO handle other charsets */
-          else if ((cd = g_mime_iconv_open ("UTF-8", charset)) == (iconv_t) -1) {
-              w(g_warning ("Cannot convert from %s to UTF-8, header display may "
-                           "be corrupt: %s", charset[0] ? charset : "unspecified charset",
-                           g_strerror (errno)));
-
-              str = g_mime_utils_decode_8bit ((char *) outptr, outlen);
-              g_string_append (decoded, str);
-              g_free (str);
           } else {
-              str = g_malloc (outlen + 1);
-              len = outlen;
-
-              len = charset_convert (cd, (char *) outptr, outlen, &str, &len, &ninval);
-              g_mime_iconv_close (cd);
-
-              g_string_append_len (decoded, str, len);
-              g_free (str);
-
-#if w(!)0
-              if (ninval > 0) {
-                  g_warning ("Failed to completely convert \"%.*s\" to UTF-8, display may be "
-                             "corrupt: %s", outlen, (char *) outptr, g_strerror (errno));
-              }
-#endif
+              str = ansi_charset_to_utf8 (charset, outptr, outlen);
+
+              if (!str)
+                {
+                  log_error ("%s:%s: Failed conversion from: %s for word: %s.",
+                             SRCNAME, __func__, charset, outptr);
+                }
+              else
+                {
+                  strcat (decoded, str);
+                  xfree (str);
+                }
           }
-      } else if (token->is_8bit) {
-          /* *sigh* I hate broken mailers... */
-          str = g_mime_utils_decode_8bit (token->text, token->length);
-          g_string_append (decoded, str);
-          g_free (str);
-#endif
       } else {
           strncat (decoded, token->text, token->length);
       }
+      if (token && token->is_8bit)
+      {
+        /* We don't support this. */
+        log_error ("%s:%s: Unknown 8bit encoding detected.",
+                   SRCNAME, __func__);
+      }
 
       token = next;
   }
@@ -636,16 +627,6 @@ g_mime_utils_header_decode_phrase (const char *phrase)
   decoded = rfc2047_decode_tokens (tokens, len);
   rfc2047_token_list_free (tokens);
 
-  if (decoded && strlen(decoded))
-    {
-      return decoded;
-    }
-  else
-    {
-      xfree (decoded);
-      return strdup (phrase);
-    }
-
   return decoded;
 }
 
@@ -662,10 +643,6 @@ rfc2047_parse (const char *input)
   log_debug ("%s:%s: Input: \"%s\"",
              SRCNAME, __func__, input);
 
-  if (!strncmp (input, "=?", 2))
-    {
-      return strdup (input);
-    }
   decoded = g_mime_utils_header_decode_phrase (input);
 
   log_debug ("%s:%s: Decoded: \"%s\"",

commit 2d5127b9faebe6677fa335cc8d20f52241fc7eb5
Author: Andre Heinecke <aheinecke at intevation.de>
Date:   Sun Nov 22 17:11:07 2015 +0100

    Fallback to content-type name if no cd/filename
    
    * src/mimeparser.c (t2body): Fallback to ct/name if cd/filename
      not found or not parsed.
    
    --
    This is necessary for Enigmail attachments, which use an
    RFC 2231 (I think) Content-Disposition filename but an
    RFC 2047 Content-Type name so that Outlook can understand
    the filename from the Content-Type name. We understand this
    now, too.

diff --git a/src/mimeparser.c b/src/mimeparser.c
index 6899e36..be5f4a1 100644
--- a/src/mimeparser.c
+++ b/src/mimeparser.c
@@ -831,6 +831,15 @@ t2body (mime_context_t ctx, rfc822parse_t msg)
   if (s)
     charset = xstrdup (s);
 
+  if (!filename)
+    {
+      /* Check for Content-Type name if Content-Disposition filename
+         was not found */
+      s = rfc822parse_query_parameter (field, "name", 0);
+      if (s)
+        filename = rfc2047_parse (s);
+    }
+
   /* Update our idea of the entire MIME structure.  */
   {
     mimestruct_item_t ms;

commit 978b651f156049bfb7e35fc931c644acec83b5f3
Author: Andre Heinecke <aheinecke at intevation.de>
Date:   Sun Nov 22 17:09:36 2015 +0100

    Parse RFC2047 attachment names
    
    * src/mimeparser.c (start_attachment): Expect filename as utf8 and
      set it as Unicode property in MAPI.
      (t2body): Parse the attachment filename.

diff --git a/src/mimeparser.c b/src/mimeparser.c
index af21ad5..6899e36 100644
--- a/src/mimeparser.c
+++ b/src/mimeparser.c
@@ -37,6 +37,7 @@
 #include "mymapitags.h"
 
 #include "rfc822parse.h"
+#include "rfc2047parse.h"
 #include "common.h"
 #include "engine.h"
 #include "mapihelp.h"
@@ -419,12 +420,17 @@ start_attachment (mime_context_t ctx, int is_body)
      functions.  */
   if (ctx->mimestruct_cur && ctx->mimestruct_cur->filename)
     {
-      prop.ulPropTag = PR_ATTACH_LONG_FILENAME_A;
+      prop.ulPropTag = PR_ATTACH_LONG_FILENAME_W;
+      wchar_t * utf16_str = NULL;
       if (!strcmp (ctx->mimestruct_cur->filename, "smime.p7m"))
-        prop.Value.lpszA = "x-smime.p7m";
+        prop.Value.lpszW = L"x-smime.p7m";
       else
-        prop.Value.lpszA = ctx->mimestruct_cur->filename;
+        {
+          utf16_str = utf8_to_wchar (ctx->mimestruct_cur->filename);
+          prop.Value.lpszW = utf16_str;
+        }
       hr = HrSetOneProp ((LPMAPIPROP)newatt, &prop);
+      xfree (utf16_str);
       if (hr)
         {
           log_error ("%s:%s: can't set attach long filename: hr=%#lx\n",
@@ -757,8 +763,6 @@ finish_message (LPMESSAGE message, gpg_error_t err, int protect_mode,
   return mapi_save_changes (message, KEEP_OPEN_READWRITE|FORCE_SAVE);
 }
 
-
-
 /* Process the transition to body event. 
 
    This means we have received the empty line indicating the body and
@@ -799,7 +803,7 @@ t2body (mime_context_t ctx, rfc822parse_t msg)
     {
       s = rfc822parse_query_parameter (field, "filename", 0);
       if (s)
-        filename = xstrdup (s);
+        filename = rfc2047_parse (s);
       s = rfc822parse_query_parameter (field, NULL, 1);
       if (s && strcmp (s, "inline"))
         not_inline_text = 1;

commit 642fd5b02bd3b0f3545e8bed68e3625663cacab3
Author: Andre Heinecke <aheinecke at intevation.de>
Date:   Sun Nov 22 17:06:45 2015 +0100

    Use RFC2047 for attachment names
    
    * src/mimemaker.c (utf8_to_rfc2047b): New. Convert utf8 to rfc2047
      base64 encoding.
    
    --
    We always use UTF-8 and Base64 for simplicity. (I wish this was
    everything that's allowed...)

diff --git a/src/mimemaker.c b/src/mimemaker.c
index 0d4ddee..0641dff 100644
--- a/src/mimemaker.c
+++ b/src/mimemaker.c
@@ -784,9 +784,28 @@ infer_content_encoding (const void *data, size_t datalen)
 }
 
 
-
-
-
+/* Convert an utf8 input string to RFC2047 base64 encoding which
+   is the subset of RFC2047 outlook likes.
+   Return value needs to be freed.
+   */
+static char *
+utf8_to_rfc2047b (const char *input)
+{
+  char *ret;
+  if (!input)
+    {
+      return NULL;
+    }
+  char *b64_encoded = b64_encode (input, strlen (input));
+  if (gpgrt_asprintf (&ret, "=?UTF-8?B?%s?=", b64_encoded) == -1)
+    {
+      log_error ("%s:%s: Error: %i", SRCNAME, __func__, __LINE__);
+      xfree (b64_encoded);
+      return NULL;
+    }
+  xfree (b64_encoded);
+  return ret;
+}
 
 /* Write a MIME part to SINK.  First the BOUNDARY is written (unless
    it is NULL) then the DATA is analyzed and appropriate headers are
@@ -800,6 +819,7 @@ write_part (sink_t sink, const char *data, size_t datalen,
   int rc;
   const char *ct;
   int use_b64, use_qp, is_text;
+  char *encoded_filename;
 
   if (filename)
     {
@@ -858,9 +878,10 @@ write_part (sink_t sink, const char *data, size_t datalen,
                                NULL)))
       return rc;
 
-  if (filename)
+  encoded_filename = utf8_to_rfc2047b (filename);
+  if (encoded_filename)
     if ((rc=write_multistring (sink,
-                               "\tname=\"", filename, "\"\r\n",
+                               "\tname=\"", encoded_filename, "\"\r\n",
                                NULL)))
       return rc;
 
@@ -873,13 +894,14 @@ write_part (sink_t sink, const char *data, size_t datalen,
                                NULL)))
     return rc;
 
-  if (filename)
+  if (encoded_filename)
     if ((rc=write_multistring (sink,
                                "Content-Disposition: attachment;\r\n"
-                               "\tfilename=\"", filename, "\"\r\n",
+                               "\tfilename=\"", encoded_filename, "\"\r\n",
                                NULL)))
       return rc;
 
+  xfree(encoded_filename);
 
   /* Write delimiter.  */
   if ((rc = write_string (sink, "\r\n")))

commit 9b77ef1253a881ab1c75022b2ad887b4cd2b0489
Author: Andre Heinecke <aheinecke at intevation.de>
Date:   Sun Nov 22 17:05:32 2015 +0100

    Add base64 encoder
    
    * src/common.c, src/common.h (b64_encode): New.
    
    --
    Will be used to encode RFC2047 strings.

diff --git a/src/common.c b/src/common.c
index 8473fee..fd53592 100644
--- a/src/common.c
+++ b/src/common.c
@@ -895,6 +895,49 @@ b64_decode (b64_state_t *state, char *buffer, size_t length)
 }
 
 
+/* Base 64 encode the input. If input is null returns NULL otherwise
+   a pointer to the malloced encoded string. */
+char *
+b64_encode (const char *input, size_t length)
+{
+  size_t out_len = 4 * ((length + 2) / 3);
+  char *ret;
+  int i, j;
+
+  if (!length || !input)
+    {
+      return NULL;
+    }
+  ret = xmalloc (out_len);
+
+  for (i = 0, j = 0; i < length;)
+    {
+      unsigned int a = i < length ? (unsigned char)input[i++] : 0;
+      unsigned int b = i < length ? (unsigned char)input[i++] : 0;
+      unsigned int c = i < length ? (unsigned char)input[i++] : 0;
+
+      unsigned int triple = (a << 0x10) + (b << 0x08) + c;
+
+      ret[j++] = bintoasc[(triple >> 3 * 6) & 0x3F];
+      ret[j++] = bintoasc[(triple >> 2 * 6) & 0x3F];
+      ret[j++] = bintoasc[(triple >> 1 * 6) & 0x3F];
+      ret[j++] = bintoasc[(triple >> 0 * 6) & 0x3F];
+    }
+
+  if (length % 3)
+    {
+      ret [j - 1] = '=';
+    }
+  if (length % 3 == 1)
+    {
+      ret [j - 2] = '=';
+    }
+
+  ret[++j] = '\0';
+  log_debug("Encoded to: %s ", ret);
+  return ret;
+}
+
 /* Create a boundary.  Note that mimemaker.c knows about the structure
    of the boundary (i.e. that it starts with "=-=") so that it can
    protect against accidently used boundaries within the content.  */
diff --git a/src/common.h b/src/common.h
index c525a17..1d46515 100644
--- a/src/common.h
+++ b/src/common.h
@@ -189,6 +189,7 @@ char *get_data_dir (void);
 size_t qp_decode (char *buffer, size_t length, int *r_slbrk);
 void b64_init (b64_state_t *state);
 size_t b64_decode (b64_state_t *state, char *buffer, size_t length);
+char * b64_encode (const char *input, size_t length);
 
 /* Get a temporary filename with and its name */
 wchar_t *get_tmp_outfile (wchar_t *name, HANDLE *outHandle);

commit f67f41e0775336225e175f8b22e820b191dc4606
Author: Andre Heinecke <aheinecke at intevation.de>
Date:   Sun Nov 22 17:02:05 2015 +0100

    Add MIME language to UTF8 conversion code
    
    * src/mlang-charset.cpp, src/mlang-charset.h: New.
    * src/Makefile.am: Update accordingly.
    
    --
    This is COM based code but as we are a COM addin anyway
    we can utilize it to convert MIME strings to unicode and
    then from unicode to UTF8.
    Our internal "we do everything in UTF-8" has a bit of overhead
    here but I prefer it.

diff --git a/src/Makefile.am b/src/Makefile.am
index 2010722..457c7b3 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -82,7 +82,8 @@ gpgol_SOURCES = \
 	windowmessages.h windowmessages.cpp \
 	gpgolstr.h gpgolstr.cpp \
 	mail.h mail.cpp \
-	rfc2047parse.h rfc2047parse.c
+	rfc2047parse.h rfc2047parse.c \
+	mlang-charset.cpp mlang-charset.h
 
 
 #treeview_SOURCES = treeview.c
diff --git a/src/mlang-charset.cpp b/src/mlang-charset.cpp
new file mode 100644
index 0000000..221f57f
--- /dev/null
+++ b/src/mlang-charset.cpp
@@ -0,0 +1,98 @@
+/* @file mlang-charset.cpp
+ * @brief Convert between charsets using Mlang
+ *
+ *    Copyright (C) 2015 Intevation GmbH
+ *
+ * This file is part of GpgOL.
+ *
+ * GpgOL is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * GpgOL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "util.h"
+#define INITGUID
+#include <initguid.h>
+DEFINE_GUID (IID_IMultiLanguage, 0x275c23e1,0x3747,0x11d0,0x9f,
+                                 0xea,0x00,0xaa,0x00,0x3f,0x86,0x46);
+#include <mlang.h>
+#undef INITGUID
+
+#include "mlang-charset.h"
+
+char *ansi_charset_to_utf8 (const char *charset, char *input,
+                            size_t inlen)
+{
+  LPMULTILANGUAGE multilang = NULL;
+  MIMECSETINFO mime_info;
+  HRESULT err;
+  DWORD enc;
+  DWORD mode = 0;
+  unsigned int wlen = 0;
+  wchar_t *buf;
+  char *ret;
+
+  CoCreateInstance(CLSID_CMultiLanguage, NULL, CLSCTX_INPROC_SERVER,
+                   IID_IMultiLanguage, (void**)&multilang);
+
+  if (!multilang)
+    {
+      log_error ("%s:%s: Failed to get multilang obj.",
+                 SRCNAME, __func__);
+      return NULL;
+    }
+
+
+  mime_info.uiCodePage = 0;
+  mime_info.uiInternetEncoding = 0;
+  BSTR w_charset = utf8_to_wchar (charset);
+  err = multilang->GetCharsetInfo (w_charset, &mime_info);
+  xfree (w_charset);
+  if (err != S_OK)
+    {
+      log_error ("%s:%s: Failed to find charset for: %s",
+                 SRCNAME, __func__, charset);
+      multilang->Release ();
+      return NULL;
+    }
+  enc = (mime_info.uiInternetEncoding == 0) ? mime_info.uiCodePage :
+                                              mime_info.uiInternetEncoding;
+
+  /** Get the size of the result */
+  err = multilang->ConvertStringToUnicode(&mode, enc, input,
+                                          &inlen, NULL, &wlen);
+  if (FAILED (err))
+    {
+      log_error ("%s:%s: Failed conversion.",
+                 SRCNAME, __func__);
+      multilang->Release ();
+      return NULL;
+  }
+  buf = (wchar_t*) xmalloc(sizeof(wchar_t) * (wlen + 1));
+
+  err = multilang->ConvertStringToUnicode(&mode, enc, input, &inlen,
+                                          buf, &wlen);
+  multilang->Release ();
+  if (FAILED (err))
+    {
+      log_error ("%s:%s: Failed conversion 2.",
+                 SRCNAME, __func__);
+      xfree (buf);
+      return NULL;
+    }
+  /* Doc is not clear if this is terminated. */
+  buf[wlen] = L'\0';
+
+  ret = wchar_to_utf8 (buf);
+  xfree (buf);
+  return ret;
+}
diff --git a/src/mlang-charset.h b/src/mlang-charset.h
new file mode 100644
index 0000000..3c55fd3
--- /dev/null
+++ b/src/mlang-charset.h
@@ -0,0 +1,43 @@
+/* @file mlang-charset.h
+ * @brief Convert between charsets using Mlang
+ *
+ *    Copyright (C) 2015 Intevation GmbH
+ *
+ * This file is part of GpgOL.
+ *
+ * GpgOL is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * GpgOL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "util.h"
+#ifdef __cplusplus
+extern "C" {
+#if 0
+}
+#endif
+#endif
+
+/** @brief convert input to utf8.
+  *
+  * @param charset: ANSI name of the charset to decode.
+  * @param input: The input to convert.
+  * @param inlen: The size of the input.
+  *
+  * @returns NULL on error or an UTF-8 encoded NULL terminated string.
+  */
+
+char *ansi_charset_to_utf8 (const char *charset, char *input,
+                            size_t inlen);
+#ifdef __cplusplus
+}
+#endif

commit 1ee1143f6c00ab35f48651059bca820967d86755
Author: Andre Heinecke <aheinecke at intevation.de>
Date:   Sun Nov 22 15:41:35 2015 +0100

    Add rfc2047 parser based on GMime
    
    * src/rfc2047parse.c, src/rfc2047parse.h: New.
    * src/Makefile.am: Update accordingly.
    --
    To properly handle attachment names the same way as outlook
    does we need to be able to parse rfc2047 words at least.
    From the rfc's I'm not sure if we need to parse multiple words
    but let's assume we do for added compatibility. And this was
    an easy part of GMime to extract.
    
    For now it only handles UTF-8 with base64 (B) and quoted-printable (Q) encoding.

diff --git a/src/Makefile.am b/src/Makefile.am
index e5cb2c4..2010722 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -81,7 +81,8 @@ gpgol_SOURCES = \
 	attachment.h attachment.cpp \
 	windowmessages.h windowmessages.cpp \
 	gpgolstr.h gpgolstr.cpp \
-	mail.h mail.cpp
+	mail.h mail.cpp \
+	rfc2047parse.h rfc2047parse.c
 
 
 #treeview_SOURCES = treeview.c
diff --git a/src/rfc2047parse.c b/src/rfc2047parse.c
new file mode 100644
index 0000000..72b195a
--- /dev/null
+++ b/src/rfc2047parse.c
@@ -0,0 +1,680 @@
+/* @file rfc2047parse.c
+ * @brief Parsercode for rfc2047
+ *
+ *    Copyright (C) 2015 Intevation GmbH
+ *
+ * This file is part of GpgOL.
+ *
+ * GpgOL is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * GpgOL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/* This code is heavily based (mostly verbatim copy with glib
+ *  dependencies removed) on GMime rev 496313fb
+ * modified by aheinecke at intevation.de
+ *
+ * Copyright (C) 2000-2014 Jeffrey Stedfast
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public License
+ *  as published by the Free Software Foundation; either version 2.1
+ *  of the License, or (at your option) any later version.
+ */
+
+#include <stdbool.h>
+#include "common.h"
+
+#include "gmime-table-private.h"
+
+/* mabye we need this at some point later? */
+#define G_MIME_RFC2047_WORKAROUNDS false;
+
+
+static unsigned char gmime_base64_rank[256] = {
+    255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+    255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+    255,255,255,255,255,255,255,255,255,255,255, 62,255,255,255, 63,
+    52, 53, 54, 55, 56, 57, 58, 59, 60, 61,255,255,255,  0,255,255,
+    255,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
+    15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,255,255,255,255,255,
+    255, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
+    41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,255,255,255,255,255,
+    255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+    255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+    255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+    255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+    255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+    255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+    255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+    255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+};
+
+typedef struct _rfc2047_token {
+    struct _rfc2047_token *next;
+    const char *charset;
+    const char *text;
+    size_t length;
+    char encoding;
+    char is_8bit;
+} rfc2047_token;
+
+static rfc2047_token *
+rfc2047_token_new (const char *text, size_t len)
+{
+  rfc2047_token *token;
+
+  token = xmalloc (sizeof (rfc2047_token));
+  memset (token, 0, sizeof (rfc2047_token));
+  token->length = len;
+  token->text = text;
+
+  return token;
+}
+
+static rfc2047_token *
+rfc2047_token_new_encoded_word (const char *word, size_t len)
+{
+  rfc2047_token *token;
+  const char *payload;
+  const char *charset;
+  const char *inptr;
+  char *buf, *lang;
+  char encoding;
+  size_t n;
+
+  /* check that this could even be an encoded-word token */
+  if (len < 7 || strncmp (word, "=?", 2) != 0 || strncmp (word + len - 2, "?=", 2) != 0)
+    return NULL;
+
+  /* skip over '=?' */
+  inptr = word + 2;
+  charset = inptr;
+
+  if (*charset == '?' || *charset == '*') {
+      /* this would result in an empty charset */
+      return NULL;
+  }
+
+  /* skip to the end of the charset */
+  if (!(inptr = memchr (inptr, '?', len - 2)) || inptr[2] != '?')
+    return NULL;
+
+  /* copy the charset into a buffer */
+  n = (size_t) (inptr - charset);
+  buf = _alloca (n + 1);
+  memcpy (buf, charset, n);
+  buf[n] = '\0';
+  charset = buf;
+
+  /* rfc2231 updates rfc2047 encoded words...
+   * The ABNF given in RFC 2047 for encoded-words is:
+   *   encoded-word := "=?" charset "?" encoding "?" encoded-text "?="
+   * This specification changes this ABNF to:
+   *   encoded-word := "=?" charset ["*" language] "?" encoding "?" encoded-text "?="
+   */
+
+  /* trim off the 'language' part if it's there... */
+  if ((lang = strchr (charset, '*')))
+    *lang = '\0';
+
+  /* skip over the '?' */
+  inptr++;
+
+  /* make sure the first char after the encoding is another '?' */
+  if (inptr[1] != '?')
+    return NULL;
+
+  switch (*inptr++) {
+    case 'B': case 'b':
+      encoding = 'B';
+      break;
+    case 'Q': case 'q':
+      encoding = 'Q';
+      break;
+    default:
+      return NULL;
+  }
+
+  /* the payload begins right after the '?' */
+  payload = inptr + 1;
+
+  /* find the end of the payload */
+  inptr = word + len - 2;
+
+  /* make sure that we don't have something like: =?iso-8859-1?Q?= */
+  if (payload > inptr)
+    return NULL;
+
+  token = rfc2047_token_new (payload, inptr - payload);
+  /* TODO lookup charset.*/
+  log_debug ("%s:%s: Charset name: %p", SRCNAME, __func__, charset);
+  token->charset = "UTF-8";/* g_mime_charset_iconv_name (charset); */
+  token->encoding = encoding;
+
+  return token;
+}
+
+static rfc2047_token *
+tokenize_rfc2047_phrase (const char *in, size_t *len)
+{
+  bool enable_rfc2047_workarounds = G_MIME_RFC2047_WORKAROUNDS;
+  rfc2047_token list, *lwsp, *token, *tail;
+  register const char *inptr = in;
+  bool encoded = false;
+  const char *text, *word;
+  bool ascii;
+  size_t n;
+
+  tail = (rfc2047_token *) &list;
+  list.next = NULL;
+  lwsp = NULL;
+
+  while (*inptr != '\0') {
+      text = inptr;
+      while (is_lwsp (*inptr))
+        inptr++;
+
+      if (inptr > text)
+        lwsp = rfc2047_token_new (text, inptr - text);
+      else
+        lwsp = NULL;
+
+      word = inptr;
+      ascii = TRUE;
+      if (is_atom (*inptr)) {
+          if (enable_rfc2047_workarounds) {
+              /* Make an extra effort to detect and
+               * separate encoded-word tokens that
+               * have been merged with other
+               * words. */
+
+              if (!strncmp (inptr, "=?", 2)) {
+                  inptr += 2;
+
+                  /* skip past the charset (if one is even declared, sigh) */
+                  while (*inptr && *inptr != '?') {
+                      ascii = ascii && is_ascii (*inptr);
+                      inptr++;
+                  }
+
+                  /* sanity check encoding type */
+                  if (inptr[0] != '?' || !strchr ("BbQq", inptr[1]) || inptr[2] != '?')
+                    goto non_rfc2047;
+
+                  inptr += 3;
+
+                  /* find the end of the rfc2047 encoded word token */
+                  while (*inptr && strncmp (inptr, "?=", 2) != 0) {
+                      ascii = ascii && is_ascii (*inptr);
+                      inptr++;
+                  }
+
+                  if (*inptr == '\0') {
+                      /* didn't find an end marker... */
+                      inptr = word + 2;
+                      ascii = TRUE;
+
+                      goto non_rfc2047;
+                  }
+
+                  inptr += 2;
+              } else {
+non_rfc2047:
+                  /* stop if we encounter a possible rfc2047 encoded
+                   * token even if it's inside another word, sigh. */
+                  while (is_atom (*inptr) && strncmp (inptr, "=?", 2) != 0)
+                    inptr++;
+              }
+          } else {
+              while (is_atom (*inptr))
+                inptr++;
+          }
+
+          n = (size_t) (inptr - word);
+          if ((token = rfc2047_token_new_encoded_word (word, n))) {
+              /* rfc2047 states that you must ignore all
+               * whitespace between encoded words */
+              if (!encoded && lwsp != NULL) {
+                  tail->next = lwsp;
+                  tail = lwsp;
+              } else if (lwsp != NULL) {
+                  xfree (lwsp);
+              }
+
+              tail->next = token;
+              tail = token;
+
+              encoded = TRUE;
+          } else {
+              /* append the lwsp and atom tokens */
+              if (lwsp != NULL) {
+                  tail->next = lwsp;
+                  tail = lwsp;
+              }
+
+              token = rfc2047_token_new (word, n);
+              token->is_8bit = ascii ? 0 : 1;
+              tail->next = token;
+              tail = token;
+
+              encoded = FALSE;
+          }
+      } else {
+          /* append the lwsp token */
+          if (lwsp != NULL) {
+              tail->next = lwsp;
+              tail = lwsp;
+          }
+
+          ascii = TRUE;
+          while (*inptr && !is_lwsp (*inptr) && !is_atom (*inptr)) {
+              ascii = ascii && is_ascii (*inptr);
+              inptr++;
+          }
+
+          n = (size_t) (inptr - word);
+          token = rfc2047_token_new (word, n);
+          token->is_8bit = ascii ? 0 : 1;
+
+          tail->next = token;
+          tail = token;
+
+          encoded = FALSE;
+      }
+  }
+
+  *len = (size_t) (inptr - in);
+
+  return list.next;
+}
+
+/* Release every node of the singly linked token list that starts at
+   TOKENS by handing each node to xfree.  A NULL list does nothing.  */
+static void
+rfc2047_token_list_free (rfc2047_token * tokens)
+{
+  rfc2047_token *node, *following;
+
+  for (node = tokens; node != NULL; node = following)
+    {
+      /* Fetch the successor first: NODE must not be read after xfree.  */
+      following = node->next;
+      xfree (node);
+    }
+}
+
+/* this decodes rfc2047's version of quoted-printable
+ *
+ * Reads LEN bytes starting at IN and writes the decoded bytes to OUT.
+ * "=XX" with two hex digits becomes the byte 0xXX, '_' becomes a
+ * space, a '=' that starts a malformed sequence is emitted literally
+ * and every other byte is copied unchanged.
+ *
+ * STATE and SAVE carry an "=XX" sequence that was cut off at the end
+ * of the previous call: *STATE is the number of hex digits still
+ * missing (0, 1 or 2) and, when exactly one is missing, *SAVE holds
+ * the hex digit that was already seen.  If such a pending sequence is
+ * not continued by a hex digit, the '=' (and the saved digit, if any)
+ * are emitted literally.  Both values are read on entry and written
+ * back on return, except that a call with LEN == 0 returns at once
+ * and leaves them untouched.
+ *
+ * Returns the number of bytes written to OUT. */
+static size_t
+quoted_decode (const unsigned char *in, size_t len, unsigned char *out, int *state, unsigned int *save)
+{
+  register const unsigned char *inptr;
+  register unsigned char *outptr;
+  const unsigned char *inend;
+  unsigned char c, c1;
+  unsigned int saved;
+  int need;
+
+  if (len == 0)
+    return 0;
+
+  inend = in + len;
+  outptr = out;
+  inptr = in;
+
+  need = *state;
+  saved = *save;
+
+  if (need > 0) { /* the previous call stopped inside an "=XX" sequence */
+      if (isxdigit ((int) *inptr)) {
+          if (need == 1) { /* first digit is in saved, second one is here */
+              c = toupper ((int) (saved & 0xff));
+              c1 = toupper ((int) *inptr++);
+              saved = 0;
+              need = 0;
+
+              goto decode; /* jumps into the loop body below */
+          }
+
+          saved = 0;
+          need = 0;
+
+          goto equals; /* need was 2: handle as if '=' had just been read */
+      }
+
+      /* last encoded-word ended in a malformed quoted-printable sequence */
+      *outptr++ = '=';
+
+      if (need == 1)
+        *outptr++ = (char) (saved & 0xff);
+
+      saved = 0;
+      need = 0;
+  }
+
+  while (inptr < inend) {
+      c = *inptr++;
+      if (c == '=') {
+equals:
+          if (inend - inptr >= 2) { /* both hex digits can be in this buffer */
+              if (isxdigit ((int) inptr[0]) && isxdigit ((int) inptr[1])) {
+                  c = toupper (*inptr++);
+                  c1 = toupper (*inptr++);
+decode:
+                  *outptr++ = (((c >= 'A' ? c - 'A' + 10 : c - '0') & 0x0f) << 4)
+                    | ((c1 >= 'A' ? c1 - 'A' + 10 : c1 - '0') & 0x0f);
+              } else {
+                  /* malformed quoted-printable sequence? */
+                  *outptr++ = '=';
+              }
+          } else {
+              /* truncated payload, maybe it was split across encoded-words? */
+              if (inptr < inend) { /* exactly one byte is left */
+                  if (isxdigit ((int) *inptr)) {
+                      saved = *inptr; /* remember the first digit for the next call */
+                      need = 1;
+                      break;
+                  } else {
+                      /* malformed quoted-printable sequence? */
+                      *outptr++ = '=';
+                  }
+              } else { /* '=' was the very last byte */
+                  saved = 0;
+                  need = 2;
+                  break;
+              }
+          }
+      } else if (c == '_') {
+          /* _'s are an rfc2047 shortcut for encoding spaces */
+          *outptr++ = ' ';
+      } else {
+          *outptr++ = c;
+      }
+  }
+
+  *state = need;
+  *save = saved;
+
+  return (size_t) (outptr - out);
+}
+
+/**
+ * g_mime_encoding_base64_decode_step:
+ * @inbuf: input buffer
+ * @inlen: input buffer length
+ * @outbuf: output buffer
+ * @state: decoder state carried between calls; as packed by this
+ * function the low byte is the number of base64 characters collected
+ * for the current group of four and the next byte is the number of
+ * trailing '=' characters recorded so far (at most 2)
+ * @save: leftover bits that have not yet been decoded; set to 0 on
+ * return when no partial group is pending
+ *
+ * Decodes a chunk of base64 encoded data. Input bytes whose entry in
+ * gmime_base64_rank is 0xff are skipped. After the main pass the end
+ * of the chunk is scanned backwards for up to two '=' characters so
+ * that the bytes produced by padding are not reported.
+ *
+ * NOTE(review): the '=' test sits inside the "rank != 0xff" branch, so
+ * padding handling relies on gmime_base64_rank mapping '=' to a value
+ * other than 0xff - the table is not visible here, please confirm.
+ *
+ * Returns: the number of bytes decoded (which have been dumped in
+ * @outbuf).
+ **/
+size_t
+g_mime_encoding_base64_decode_step (const unsigned char *inbuf, size_t inlen, unsigned char *outbuf, int *state, unsigned int *save)
+{
+  register const unsigned char *inptr;
+  register unsigned char *outptr;
+  const unsigned char *inend;
+  register unsigned int saved;
+  unsigned char c;
+  int npad, n, i;
+
+  inend = inbuf + inlen;
+  outptr = outbuf;
+  inptr = inbuf;
+
+  npad = (*state >> 8) & 0xff; /* '=' count recorded by earlier calls */
+  n = *state & 0xff; /* characters already collected for the current group */
+  saved = *save;
+
+  /* convert 4 base64 bytes to 3 normal bytes */
+  while (inptr < inend) {
+      c = gmime_base64_rank[*inptr++];
+      if (c != 0xff) { /* 0xff marks a byte that is skipped */
+          saved = (saved << 6) | c;
+          n++;
+          if (n == 4) {
+              *outptr++ = saved >> 16;
+              *outptr++ = saved >> 8;
+              *outptr++ = saved;
+              n = 0;
+
+              if (npad > 0) { /* take back the bytes that stem from recorded padding */
+                  outptr -= npad;
+                  npad = 0;
+              }
+          }
+      }
+  }
+
+  /* quickly scan back for '=' on the end somewhere */
+  /* fortunately we can drop 1 output char for each trailing '=' (up to 2) */
+  for (i = 2; inptr > inbuf && i; ) { /* i counts down only on non-skipped bytes */
+      inptr--;
+      if (gmime_base64_rank[*inptr] != 0xff) {
+          if (*inptr == '=' && outptr > outbuf) {
+              if (n == 0) {
+                  /* we've got a complete quartet so it's
+                     safe to drop an output character. */
+                  outptr--;
+              } else if (npad < 2) {
+                  /* keep a record of the number of ='s at
+                     the end of the input stream, up to 2 */
+                  npad++;
+              }
+          }
+
+          i--;
+      }
+  }
+
+  *state = (npad << 8) | n; /* pack pad count and group position for the next call */
+  *save = n ? saved : 0;
+
+  return (outptr - outbuf);
+}
+
+/* Decode the payload of one encoded-word TOKEN into OUTBUF and return
+   the number of bytes written.  An encoding of 'B' selects the base64
+   decoder, every other value the quoted-printable one.  STATE and SAVE
+   are passed through to the decoder unchanged, so a caller can carry
+   decoder state from one token to the next.  */
+static size_t
+rfc2047_token_decode (rfc2047_token *token, unsigned char *outbuf, int *state, unsigned int *save)
+{
+  const unsigned char *payload = (const unsigned char *) token->text;
+  size_t payload_len = token->length;
+  size_t written;
+
+  switch (token->encoding)
+    {
+    case 'B':
+      written = g_mime_encoding_base64_decode_step (payload, payload_len, outbuf, state, save);
+      break;
+    default:
+      written = quoted_decode (payload, payload_len, outbuf, state, save);
+      break;
+    }
+
+  return written;
+}
+
+/* Decode the token list TOKENS into one newly allocated string.
+
+   BUFLEN is the size of the result without the terminating NUL; the
+   result buffer is allocated with BUFLEN + 1 bytes and zero filled.
+
+   Runs of adjacent encoded-word tokens that share encoding and charset
+   are base64 / quoted-printable decoded as one continuous block.  Only
+   runs whose charset is UTF-8 (compared case-insensitively) are
+   appended to the result.  Tokens that are not encoded-words are
+   appended verbatim, raw 8bit text included.
+
+   TODO: convert runs in other charsets to UTF-8; for now they are
+   dropped from the result.
+
+   Returns the NUL terminated result; the caller releases it with
+   xfree.  */
+static char *
+rfc2047_decode_tokens (rfc2047_token *tokens, size_t buflen)
+{
+  rfc2047_token *token, *next;
+  size_t outlen, len, tmplen;
+  unsigned char *outptr;
+  const char *charset;
+  char *outbuf;
+  char *decoded;
+  char encoding;
+  unsigned int save;
+  int state;
+
+  decoded = xmalloc (buflen + 1);
+  memset (decoded, 0, buflen + 1);
+  tmplen = 76;
+  outbuf = xmalloc (tmplen);
+
+  token = tokens;
+  while (token != NULL) {
+      next = token->next;
+
+      if (token->encoding) {
+          /* In order to work around broken mailers, we need to combine
+           * the raw decoded content of runs of identically encoded word
+           * tokens before converting into UTF-8. */
+          encoding = token->encoding;
+          charset = token->charset;
+          len = token->length;
+          state = 0;
+          save = 0;
+
+          /* find the end of the run (and measure the buffer length we'll need) */
+          while (next && next->encoding == encoding && !strcmp (next->charset, charset)) {
+              len += next->length;
+              next = next->next;
+          }
+
+          /* make sure our temporary output buffer is large enough.
+           * The block may be moved while growing, so the pointer
+           * returned by xrealloc has to replace outbuf. */
+          if (len > tmplen)
+            {
+              outbuf = xrealloc (outbuf, len + 1);
+              tmplen = len + 1;
+            }
+
+          /* base64 / quoted-printable decode each of the tokens... */
+          outptr = (unsigned char *) outbuf;
+          outlen = 0;
+          do {
+              /* Note: by not resetting state/save each loop, we effectively
+               * treat the payloads as one continuous block, thus allowing
+               * us to handle cases where a hex-encoded triplet of a
+               * quoted-printable encoded payload is split between 2 or more
+               * encoded-word tokens. */
+              len = rfc2047_token_decode (token, outptr, &state, &save);
+              token = token->next;
+              outptr += len;
+              outlen += len;
+          } while (token != next);
+
+          /* UTF-8 payloads need no conversion and are appended as is;
+           * strncat stops at OUTLEN bytes or at the first NUL byte. */
+          if (!strcasecmp (charset, "UTF-8"))
+            strncat (decoded, outbuf, outlen);
+      } else {
+          /* plain (or unencoded 8bit) token: copy the text verbatim */
+          strncat (decoded, token->text, token->length);
+      }
+
+      token = next;
+  }
+
+  xfree (outbuf);
+
+  return decoded;
+}
+
+
+/**
+ * g_mime_utils_header_decode_phrase:
+ * @phrase: header to decode
+ *
+ * Tokenizes @phrase as an rfc2047 encoded 'phrase', decodes the
+ * resulting tokens and releases the token list again.
+ *
+ * Returns: the newly allocated decoded string, or a strdup'ed copy of
+ * @phrase when decoding produced no text at all.
+ **/
+static char *
+g_mime_utils_header_decode_phrase (const char *phrase)
+{
+  rfc2047_token *token_list;
+  char *result;
+  size_t consumed;
+
+  token_list = tokenize_rfc2047_phrase (phrase, &consumed);
+  result = rfc2047_decode_tokens (token_list, consumed);
+  rfc2047_token_list_free (token_list);
+
+  /* Hand back the decoded text only if there is any.  */
+  if (result != NULL && *result != '\0')
+    return result;
+
+  /* Nothing was decoded: fall back to an unmodified copy.  */
+  xfree (result);
+  return strdup (phrase);
+}
+
+/* Try to parse an rfc 2047 filename for attachment handling.
+   Returns the parsed string.  A NULL INPUT yields a copy of the empty
+   string; when decoding fails or yields nothing the input string is
+   just copied with strdup.  The caller owns the returned string.  */
+char *
+rfc2047_parse (const char *input)
+{
+  char *decoded;
+
+  if (!input)
+    return strdup ("");
+
+  log_debug ("%s:%s: Input: \"%s\"",
+             SRCNAME, __func__, input);
+
+  /* Input that starts with "=?" begins with an encoded-word and is
+     exactly what has to be decoded, so it must not be short-circuited
+     into a plain copy before the decoder has seen it.  */
+  decoded = g_mime_utils_header_decode_phrase (input);
+
+  /* DECODED may be NULL if the fallback strdup in the decoder failed;
+     never hand a NULL pointer to a %s conversion.  */
+  log_debug ("%s:%s: Decoded: \"%s\"",
+             SRCNAME, __func__, decoded ? decoded : "(null)");
+
+  if (!decoded || !strlen (decoded))
+    {
+      xfree (decoded);
+      return strdup (input);
+    }
+  return decoded;
+}
diff --git a/src/rfc2047parse.h b/src/rfc2047parse.h
new file mode 100644
index 0000000..593b6fd
--- /dev/null
+++ b/src/rfc2047parse.h
@@ -0,0 +1,31 @@
+/* @file rfc2047parse.h
+ * @brief Parser for filenames encoded according to rfc2047
+ *
+ *    Copyright (C) 2015 Intevation GmbH
+ *
+ * This file is part of GpgOL.
+ *
+ * GpgOL is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * GpgOL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/** @brief Try to parse a string according to rfc2047.
+  *
+  * On error the error is logged and a copy of the original
+  * input string returned. A NULL input yields an empty string.
+  *
+  * @returns a malloced string in UTF-8 encoding or a copy
+  *          of the input string. The caller has to release it.
+  */
+char *
+rfc2047_parse (const char *input);

-----------------------------------------------------------------------

Summary of changes:
 src/Makefile.am                      |   4 +-
 src/common.c                         |  43 +++
 src/common.h                         |   1 +
 src/mimemaker.c                      |  36 +-
 src/mimeparser.c                     |  25 +-
 src/mlang-charset.cpp                |  98 ++++++
 src/{xmalloc.h => mlang-charset.h}   |  35 +-
 src/rfc2047parse.c                   | 657 +++++++++++++++++++++++++++++++++++
 src/{eventsinks.h => rfc2047parse.h} |  23 +-
 9 files changed, 881 insertions(+), 41 deletions(-)
 create mode 100644 src/mlang-charset.cpp
 copy src/{xmalloc.h => mlang-charset.h} (60%)
 create mode 100644 src/rfc2047parse.c
 copy src/{eventsinks.h => rfc2047parse.h} (66%)


hooks/post-receive
-- 
GnuPG extension for MS Outlook
http://git.gnupg.org




More information about the Gnupg-commits mailing list