[svn] GnuPG - r5112 - in branches/STABLE-BRANCH-1-4: g10 include util

Wed Aug 12 07:01:12 CEST 2009

Author: dshaw
Date: 2009-08-12 07:01:08 +0200 (Wed, 12 Aug 2009)
New Revision: 5112

Modified:
   branches/STABLE-BRANCH-1-4/g10/ChangeLog
   branches/STABLE-BRANCH-1-4/g10/keyserver.c
   branches/STABLE-BRANCH-1-4/include/ChangeLog
   branches/STABLE-BRANCH-1-4/include/util.h
   branches/STABLE-BRANCH-1-4/util/ChangeLog
   branches/STABLE-BRANCH-1-4/util/strgutil.c
Log:
Try and detect mis-coded Latin1 and convert it to UTF8.  Whether the
heuristics succeed or not, the resulting string must be valid UTF8 as
LDAP requires that.  This is bug 1055.


Modified: branches/STABLE-BRANCH-1-4/g10/ChangeLog
===================================================================

--- branches/STABLE-BRANCH-1-4/g10/ChangeLog	2009-08-11 18:34:16 UTC (rev 5111)
+++ branches/STABLE-BRANCH-1-4/g10/ChangeLog	2009-08-12 05:01:08 UTC (rev 5112)
@@ -1,3 +1,10 @@
+2009-08-11  David Shaw  <dshaw at jabberwocky.com>
+
+	* keyserver.c (keyserver_spawn): Try and detect mis-coded Latin1
+	and convert it to UTF8.  Whether the heuristics succeed or not,
+	the resulting string must be valid UTF8 as LDAP requires that.
+	This is bug 1055.
+
 2009-08-03  Werner Koch  <wk at g10code.com>
 
 	* card-util.c (generate_card_keys): Ask for off-card keys only if

Modified: branches/STABLE-BRANCH-1-4/include/ChangeLog
===================================================================
--- branches/STABLE-BRANCH-1-4/include/ChangeLog	2009-08-11 18:34:16 UTC (rev 5111)
+++ branches/STABLE-BRANCH-1-4/include/ChangeLog	2009-08-12 05:01:08 UTC (rev 5112)
@@ -1,3 +1,7 @@
+2009-08-11  David Shaw  <dshaw at jabberwocky.com>
+
+	* util.h: Add string_to_utf8() from GPA.
+
 2009-07-21  Werner Koch  <wk at g10code.com>
 
 	* estream-printf.h: New.  Taken from libestream.x

Modified: branches/STABLE-BRANCH-1-4/util/ChangeLog
===================================================================
--- branches/STABLE-BRANCH-1-4/util/ChangeLog	2009-08-11 18:34:16 UTC (rev 5111)
+++ branches/STABLE-BRANCH-1-4/util/ChangeLog	2009-08-12 05:01:08 UTC (rev 5112)
@@ -1,3 +1,8 @@
+2009-08-11  David Shaw  <dshaw at jabberwocky.com>
+
+	* strgutil.c (string_to_utf8): New function to convert a Latin-1
+	string to UTF8.  From GPA.
+
 2009-07-23  David Shaw  <dshaw at jabberwocky.com>
 
 	* srv.c (getsrv): Fix type-punning warning.

Modified: branches/STABLE-BRANCH-1-4/g10/keyserver.c
===================================================================
--- branches/STABLE-BRANCH-1-4/g10/keyserver.c	2009-08-11 18:34:16 UTC (rev 5111)
+++ branches/STABLE-BRANCH-1-4/g10/keyserver.c	2009-08-12 05:01:08 UTC (rev 5112)
@@ -1270,24 +1270,49 @@
 			{
 			  PKT_user_id *uid=node->pkt->pkt.user_id;
 			  int r;
+			  char *uidstr1,*uidstr2,*uidstr3;
+			  size_t uidstrlen;
 
 			  if(uid->attrib_data)
 			    continue;
 
 			  fprintf(spawn->tochild,"uid:");
 
-			  /* Quote ':', '%', and any 8-bit
-			     characters */
-			  for(r=0;r<uid->len;r++)
+			  /* Make sure it's real UTF8.  What happens
+			     here is that first we heuristically try
+			     and convert the string (which may be
+			     mis-coded) into UTF8.  We then bring it
+			     to native and then back to UTF8.  For
+			     true UTF8, this whole process should be
+			     lossless.  For the common Latin-1
+			     mis-encoding, it will become UTF8.  For
+			     other encodings, it will become UTF8 but
+			     with unknown characters quoted.  This
+			     preserves the notion that anything in the
+			     stream to the keyserver handler program
+			     is UTF8. */
+			  uidstr1=string_to_utf8(uid->name);
+			  uidstr2=utf8_to_native(uidstr1,strlen(uidstr1),-1);
+			  uidstr3=native_to_utf8(uidstr2);
+
+			  uidstrlen=strlen(uidstr3);
+
+			  /* Quote ':', '%', and anything not
+			     printable ASCII */
+			  for(r=0;r<uidstrlen;r++)
 			    {
-			      if(uid->name[r]==':' || uid->name[r]=='%'
-				 || uid->name[r]&0x80)
+			      if(uidstr3[r]==':' || uidstr3[r]=='%'
+				 || uidstr3[r]<' ' || uidstr3[r]>'~')
 				fprintf(spawn->tochild,"%%%02X",
-					(byte)uid->name[r]);
+					(byte)uidstr3[r]);
 			      else
-				fprintf(spawn->tochild,"%c",uid->name[r]);
+				fprintf(spawn->tochild,"%c",uidstr3[r]);
 			    }
 
+			  xfree(uidstr1);
+			  xfree(uidstr2);
+			  xfree(uidstr3);
+
 			  fprintf(spawn->tochild,":%u:%u:",
 				  uid->created,uid->expiredate);
 

Modified: branches/STABLE-BRANCH-1-4/include/util.h
===================================================================
--- branches/STABLE-BRANCH-1-4/include/util.h	2009-08-11 18:34:16 UTC (rev 5111)
+++ branches/STABLE-BRANCH-1-4/include/util.h	2009-08-12 05:01:08 UTC (rev 5112)
@@ -190,7 +190,7 @@
 const char* get_native_charset(void);
 char *native_to_utf8( const char *string );
 char *utf8_to_native( const char *string, size_t length, int delim);
-int  check_utf8_string( const char *string );
+char *string_to_utf8 (const char *string);
 
 int ascii_isupper (int c);
 int ascii_islower (int c);

Modified: branches/STABLE-BRANCH-1-4/util/strgutil.c
===================================================================
--- branches/STABLE-BRANCH-1-4/util/strgutil.c	2009-08-11 18:34:16 UTC (rev 5111)
+++ branches/STABLE-BRANCH-1-4/util/strgutil.c	2009-08-12 05:01:08 UTC (rev 5112)
@@ -1048,7 +1048,78 @@
     }
 }
 
+/* Return a newly allocated string holding STRING as (almost)
+   definitely UTF-8: input that looks like Latin-1 is converted, any
+   other input is copied unchanged.  Returns NULL if STRING is NULL;
+   the caller releases the result.  Mostly borrowed from GPA. */
 
+char *
+string_to_utf8 (const char *string)
+{
+  const char *s;
+  
+  if (!string)
+    return NULL;
+  
+  /* Due to a bug in old and not so old PGP versions user IDs have
+     been copied verbatim into the key.  Thus many users with Umlauts
+     et al. in their name will see their names garbled.  Although this
+     is not an issue for me (;-)), I have a couple of friends with
+     Umlauts in their name, so let's try to make their life easier by
+     detecting invalid encodings and converting them from Latin-1 to
+     UTF-8.  We use this even for X.509 because it may make things
+     even better given the invalid encodings often found there.  */
+  for (s = string; *s && !(*s & 0x80); s++)
+    ;  /* Stop at the first non-ASCII byte or at the NUL.  */
+  if (*s && ((s[1] & 0xc0) == 0x80) && ( ((*s & 0xe0) == 0xc0)
+                                         || ((*s & 0xf0) == 0xe0)
+                                         || ((*s & 0xf8) == 0xf0)
+                                         || ((*s & 0xfc) == 0xf8)
+                                         || ((*s & 0xfe) == 0xfc)) )
+    {  
+      /* The first non-ASCII byte has the bit pattern of a UTF-8 lead
+         byte and is followed by a continuation byte.  Although this
+         might still be Latin-1 we better assume it is valid UTF-8. */
+      return xstrdup (string);
+     }
+  else if (*s && !strchr (string, 0xc3))
+    {
+      size_t length=0;
+      char *buffer,*p;
+
+      /* No 0xC3 byte anywhere in the string; assume it is Latin-1.  */
+
+      for(s=string; *s; s++ ) 
+        {
+          length++;
+          if( *s & 0x80 )
+            length++;  /* A non-ASCII byte becomes two bytes.  */
+	}
+      buffer = xmalloc( length + 1 );
+      for(p=buffer, s=string; *s; s++ )
+        {
+          if( *s & 0x80 )
+            {
+              *p++ = 0xc0 | ((*s >> 6) & 3);  /* Lead: top 2 bits.  */
+              *p++ = 0x80 | ( *s & 0x3f );    /* Tail: low 6 bits.  */
+            }
+          else
+            *p++ = *s;
+        }
+      *p = 0;
+
+      return buffer;
+    }
+  else
+    {
+      /* Pure ASCII ends up here too.  Everything else is assumed to be
+         UTF-8 even though we know the first non-ASCII sequence is not
+         valid; as we only test the first non-ASCII character, valid
+         encodings might follow.  */
+      return xstrdup (string);
+    }
+}
+
 /* Same as asprintf but return an allocated buffer suitable to be
    freed using xfree.  This function simply dies on memory failure,
    thus no extra check is required. */