[svn] GnuPG - r5112 - in branches/STABLE-BRANCH-1-4: g10 include util
svn author dshaw
cvs at cvs.gnupg.org
Wed Aug 12 07:01:12 CEST 2009
Author: dshaw
Date: 2009-08-12 07:01:08 +0200 (Wed, 12 Aug 2009)
New Revision: 5112
Modified:
branches/STABLE-BRANCH-1-4/g10/ChangeLog
branches/STABLE-BRANCH-1-4/g10/keyserver.c
branches/STABLE-BRANCH-1-4/include/ChangeLog
branches/STABLE-BRANCH-1-4/include/util.h
branches/STABLE-BRANCH-1-4/util/ChangeLog
branches/STABLE-BRANCH-1-4/util/strgutil.c
Log:
Try and detect mis-coded Latin1 and convert it to UTF8. Whether the
heuristics succeed or not, the resulting string must be valid UTF8 as
LDAP requires that. This is bug 1055.
Modified: branches/STABLE-BRANCH-1-4/g10/ChangeLog
===================================================================
--- branches/STABLE-BRANCH-1-4/g10/ChangeLog 2009-08-11 18:34:16 UTC (rev 5111)
+++ branches/STABLE-BRANCH-1-4/g10/ChangeLog 2009-08-12 05:01:08 UTC (rev 5112)
@@ -1,3 +1,10 @@
+2009-08-11 David Shaw <dshaw at jabberwocky.com>
+
+ * keyserver.c (keyserver_spawn): Try and detect mis-coded Latin1
+ and convert it to UTF8. Whether the heuristics succeed or not,
+ the resulting string must be valid UTF8 as LDAP requires that.
+ This is bug 1055.
+
2009-08-03 Werner Koch <wk at g10code.com>
* card-util.c (generate_card_keys): Ask for off-card keys only if
Modified: branches/STABLE-BRANCH-1-4/include/ChangeLog
===================================================================
--- branches/STABLE-BRANCH-1-4/include/ChangeLog 2009-08-11 18:34:16 UTC (rev 5111)
+++ branches/STABLE-BRANCH-1-4/include/ChangeLog 2009-08-12 05:01:08 UTC (rev 5112)
@@ -1,3 +1,7 @@
+2009-08-11 David Shaw <dshaw at jabberwocky.com>
+
+ * util.h: Add string_to_utf8() from GPA.
+
2009-07-21 Werner Koch <wk at g10code.com>
* estream-printf.h: New. Taken from libestream.x
Modified: branches/STABLE-BRANCH-1-4/util/ChangeLog
===================================================================
--- branches/STABLE-BRANCH-1-4/util/ChangeLog 2009-08-11 18:34:16 UTC (rev 5111)
+++ branches/STABLE-BRANCH-1-4/util/ChangeLog 2009-08-12 05:01:08 UTC (rev 5112)
@@ -1,3 +1,8 @@
+2009-08-11 David Shaw <dshaw at jabberwocky.com>
+
+ * strgutil.c (string_to_utf8): New function to convert a Latin-1
+ string to UTF8. From GPA.
+
2009-07-23 David Shaw <dshaw at jabberwocky.com>
* srv.c (getsrv): Fix type-punning warning.
Modified: branches/STABLE-BRANCH-1-4/g10/keyserver.c
===================================================================
--- branches/STABLE-BRANCH-1-4/g10/keyserver.c 2009-08-11 18:34:16 UTC (rev 5111)
+++ branches/STABLE-BRANCH-1-4/g10/keyserver.c 2009-08-12 05:01:08 UTC (rev 5112)
@@ -1270,24 +1270,49 @@
{
PKT_user_id *uid=node->pkt->pkt.user_id;
int r;
+ char *uidstr1,*uidstr2,*uidstr3;
+ size_t uidstrlen;
if(uid->attrib_data)
continue;
fprintf(spawn->tochild,"uid:");
- /* Quote ':', '%', and any 8-bit
- characters */
- for(r=0;r<uid->len;r++)
+ /* Make sure it's real UTF8. What happens
+ here is that first we heuristically try
+ and convert the string (which may be
+ mis-coded) into UTF8. We then bring it
+ to native and then back to UTF8. For
+ true UTF8, this whole process should be
+ lossless. For the common Latin-1
+ mis-encoding, it will become UTF8. For
+ other encodings, it will become UTF8 but
+ with unknown characters quoted. This
+ preserves the notion that anything in the
+ stream to the keyserver handler program
+ is UTF8. */
+ uidstr1=string_to_utf8(uid->name);
+ uidstr2=utf8_to_native(uidstr1,strlen(uidstr1),-1);
+ uidstr3=native_to_utf8(uidstr2);
+
+ uidstrlen=strlen(uidstr3);
+
+ /* Quote ':', '%', and anything not
+ printable ASCII */
+ for(r=0;r<uidstrlen;r++)
{
- if(uid->name[r]==':' || uid->name[r]=='%'
- || uid->name[r]&0x80)
+ if(uidstr3[r]==':' || uidstr3[r]=='%'
+ || uidstr3[r]<' ' || uidstr3[r]>'~')
fprintf(spawn->tochild,"%%%02X",
- (byte)uid->name[r]);
+ (byte)uidstr3[r]);
else
- fprintf(spawn->tochild,"%c",uid->name[r]);
+ fprintf(spawn->tochild,"%c",uidstr3[r]);
}
+ xfree(uidstr1);
+ xfree(uidstr2);
+ xfree(uidstr3);
+
fprintf(spawn->tochild,":%u:%u:",
uid->created,uid->expiredate);
Modified: branches/STABLE-BRANCH-1-4/include/util.h
===================================================================
--- branches/STABLE-BRANCH-1-4/include/util.h 2009-08-11 18:34:16 UTC (rev 5111)
+++ branches/STABLE-BRANCH-1-4/include/util.h 2009-08-12 05:01:08 UTC (rev 5112)
@@ -190,7 +190,7 @@
const char* get_native_charset(void);
char *native_to_utf8( const char *string );
char *utf8_to_native( const char *string, size_t length, int delim);
-int check_utf8_string( const char *string );
+char *string_to_utf8 (const char *string);
int ascii_isupper (int c);
int ascii_islower (int c);
Modified: branches/STABLE-BRANCH-1-4/util/strgutil.c
===================================================================
--- branches/STABLE-BRANCH-1-4/util/strgutil.c 2009-08-11 18:34:16 UTC (rev 5111)
+++ branches/STABLE-BRANCH-1-4/util/strgutil.c 2009-08-12 05:01:08 UTC (rev 5112)
@@ -1048,7 +1048,78 @@
}
}
+/* This is similar to native_to_utf8, except it can take any input
+ (which may or may not be UTF8 encoded) and return something that is
+ (almost) definitely UTF8. This code is mostly borrowed from
+ GPA. */
+char *
+string_to_utf8 (const char *string)
+{
+ const char *s;
+
+ if (!string)
+ return NULL;
+
+ /* Due to a bug in old and not so old PGP versions user IDs have
+ been copied verbatim into the key. Thus many users with Umlauts
+ et al. in their name will see their names garbled. Although this
+ is not an issue for me (;-)), I have a couple of friends with
+ Umlauts in their name, so let's try to make their life easier by
+ detecting invalid encodings and convert that to Latin-1. We use
+ this even for X.509 because it may make things even better given
+ all the invalid encodings often found in X.509 certificates. */
+ for (s = string; *s && !(*s & 0x80); s++)
+ ;
+ if (*s && ((s[1] & 0xc0) == 0x80) && ( ((*s & 0xe0) == 0xc0)
+ || ((*s & 0xf0) == 0xe0)
+ || ((*s & 0xf8) == 0xf0)
+ || ((*s & 0xfc) == 0xf8)
+ || ((*s & 0xfe) == 0xfc)) )
+ {
+ /* Possible utf-8 character followed by continuation byte.
+ Although this might still be Latin-1 we better assume that it
+ is valid utf-8. */
+ return xstrdup (string);
+ }
+ else if (*s && !strchr (string, 0xc3))
+ {
+ size_t length=0;
+ char *buffer,*p;
+
+ /* No 0xC3 character in the string; assume that it is Latin-1. */
+
+ for(s=string; *s; s++ )
+ {
+ length++;
+ if( *s & 0x80 )
+ length++;
+ }
+ buffer = xmalloc( length + 1 );
+ for(p=buffer, s=string; *s; s++ )
+ {
+ if( *s & 0x80 )
+ {
+ *p++ = 0xc0 | ((*s >> 6) & 3);
+ *p++ = 0x80 | ( *s & 0x3f );
+ }
+ else
+ *p++ = *s;
+ }
+ *p = 0;
+
+ return buffer;
+ }
+ else
+ {
+ /* Everything else is assumed to be UTF-8. We do this even that
+ we know the encoding is not valid. However as we only test
+ the first non-ascii character, valid encodings might
+ follow. */
+ return xstrdup (string);
+ }
+}
+
/* Same as asprintf but return an allocated buffer suitable to be
freed using xfree. This function simply dies on memory failure,
thus no extra check is required. */
More information about the Gnupg-commits mailing list