[evolution-data-server/gnome-3-32] I#105 - Invalid UTF-8 letters in a mail body get lost

From: Milan Crha <mcrha src gnome org>
To: commits-list gnome org
Cc:
Subject: [evolution-data-server/gnome-3-32] I#105 - Invalid UTF-8 letters in a mail body get lost
Date: Tue, 23 Apr 2019 13:14:10 +0000 (UTC)
commit 974fda135e834dd2d35a7864fb236e16ea14ac2a
Author: Milan Crha <mcrha redhat com>
Date:   Tue Apr 23 15:15:46 2019 +0200

    I#105 - Invalid UTF-8 letters in a mail body get lost
    
    Closes https://gitlab.gnome.org/GNOME/evolution-data-server/issues/105

 src/camel/camel-mime-filter-tohtml.c    | 59 ++++++++++++++++++------------
 src/camel/camel-utf8.c                  | 65 +++++++++++++++++++++++++++++----
 src/camel/camel-utf8.h                  |  2 +
 src/libedataserver/e-data-server-util.c |  3 +-
 4 files changed, 97 insertions(+), 32 deletions(-)
---
diff --git a/src/camel/camel-mime-filter-tohtml.c b/src/camel/camel-mime-filter-tohtml.c
index b9fdfd12a..cbdfbb6ad 100644
--- a/src/camel/camel-mime-filter-tohtml.c
+++ b/src/camel/camel-mime-filter-tohtml.c
@@ -164,16 +164,31 @@ exit:
 
 static gchar *
 writeln (CamelMimeFilter *mime_filter,
-         const guchar *in,
-         const guchar *inend,
+         const gchar *in_anycharset,
+         const gchar *inend_char,
          gchar *outptr,
          gchar **outend)
 {
        CamelMimeFilterToHTMLPrivate *priv;
-       const guchar *inptr = in;
+       const guchar *inptr, *inend, *inbegin;
+       gchar *in_utf8 = NULL;
 
        priv = CAMEL_MIME_FILTER_TOHTML_GET_PRIVATE (mime_filter);
 
+       if (!g_utf8_validate (in_anycharset, inend_char - in_anycharset, NULL)) {
+               in_utf8 = camel_utf8_make_valid_len (in_anycharset, inend_char - in_anycharset);
+
+               if (!in_utf8)
+                       return outptr;
+
+               inptr = (const guchar *) in_utf8;
+               inend = inptr + strlen (in_utf8);
+       } else {
+               inptr = (const guchar *) in_anycharset;
+               inend = (const guchar *) inend_char;
+       }
+
+       inbegin = inptr;
        while (inptr < inend) {
                guint32 u;
 
@@ -216,7 +231,7 @@ writeln (CamelMimeFilter *mime_filter,
                        /* falls through */
                case ' ':
                        if (priv->flags & CAMEL_MIME_FILTER_TOHTML_CONVERT_SPACES
-                           && ((inptr == (in + 1) || (inptr < inend && (*inptr == ' ' || *inptr == '\t'))))) {
+                           && ((inptr == (inbegin + 1) || (inptr < inend && (*inptr == ' ' || *inptr == '\t'))))) {
                                outptr = g_stpcpy (outptr, "&nbsp;");
                                priv->column++;
                                break;
@@ -228,7 +243,7 @@ writeln (CamelMimeFilter *mime_filter,
                                   only if not converting the new-line breaks */
                                if (!(priv->flags & CAMEL_MIME_FILTER_TOHTML_CONVERT_NL))
                                        *outptr++ = u;
-                       } else if (u >= 20 && u <0x80) {
+                       } else if (u >= 0x20 && u < 0x80) {
                                *outptr++ = u;
                        } else {
                                if (priv->flags & CAMEL_MIME_FILTER_TOHTML_ESCAPE_8BIT)
@@ -241,6 +256,8 @@ writeln (CamelMimeFilter *mime_filter,
                }
        }
 
+       g_free (in_utf8);
+
        return outptr;
 }
 
@@ -375,9 +392,8 @@ html_convert (CamelMimeFilter *mime_filter,
                                        /* write out anything before the first regex match */
                                        outptr = writeln (
                                                mime_filter,
-                                               (const guchar *) start,
-                                               (const guchar *) start +
-                                               match.um_so,
+                                               start,
+                                               start + match.um_so,
                                                outptr, &outend);
 
                                        start += match.um_so;
@@ -390,15 +406,13 @@ html_convert (CamelMimeFilter *mime_filter,
                                        /* prefix shouldn't need escaping, but let's be safe */
                                        outptr = writeln (
                                                mime_filter,
-                                               (const guchar *) match.prefix,
-                                               (const guchar *) match.prefix +
-                                               strlen (match.prefix),
+                                               match.prefix,
+                                               match.prefix + strlen (match.prefix),
                                                outptr, &outend);
                                        outptr = writeln (
                                                mime_filter,
-                                               (const guchar *) start,
-                                               (const guchar *) start +
-                                               matchlen,
+                                               start,
+                                               start + matchlen,
                                                outptr, &outend);
                                        outptr = append_string_verbatim (
                                                mime_filter, "\">",
@@ -407,9 +421,8 @@ html_convert (CamelMimeFilter *mime_filter,
                                        /* now write the matched string */
                                        outptr = writeln (
                                                mime_filter,
-                                               (const guchar *) start,
-                                               (const guchar *) start +
-                                               matchlen,
+                                               start,
+                                               start + matchlen,
                                                outptr, &outend);
                                        priv->column += matchlen;
                                        start += matchlen;
@@ -423,8 +436,8 @@ html_convert (CamelMimeFilter *mime_filter,
                                        /* nothing matched so write out the remainder of this line buffer */
                                        outptr = writeln (
                                                mime_filter,
-                                               (const guchar *) start,
-                                               (const guchar *) start + len,
+                                               start,
+                                               start + len,
                                                outptr, &outend);
                                        break;
                                }
@@ -432,8 +445,8 @@ html_convert (CamelMimeFilter *mime_filter,
                } else {
                        outptr = writeln (
                                mime_filter,
-                               (const guchar *) start,
-                               (const guchar *) inptr,
+                               start,
+                               inptr,
                                outptr, &outend);
                }
 
@@ -459,8 +472,8 @@ html_convert (CamelMimeFilter *mime_filter,
                if (start < inend)
                        outptr = writeln (
                                mime_filter,
-                               (const guchar *) start,
-                               (const guchar *) inend,
+                               start,
+                               inend,
                                outptr, &outend);
 
                while (priv->blockquote_depth > 0) {
diff --git a/src/camel/camel-utf8.c b/src/camel/camel-utf8.c
index fd1ba91f2..1d17fe96e 100644
--- a/src/camel/camel-utf8.c
+++ b/src/camel/camel-utf8.c
@@ -403,16 +403,65 @@ camel_ucs2_utf8 (const gchar *ptr)
 gchar *
 camel_utf8_make_valid (const gchar *text)
 {
-       gchar *res = g_strdup (text), *p;
+       return camel_utf8_make_valid_len (text, -1);
+}
+
+/**
+ * camel_utf8_make_valid_len:
+ * @text: a text to make valid
+ * @text_len: length of the @text, or -1 if NUL-terminated
+ *
+ * Ensures the returned text will be valid UTF-8 string, with incorrect letters
+ * changed to question marks.
+ *
+ * Returns: (transfer full): Valid UTF-8 string, with replaced incorrect letters.
+ *    Free it with g_free(), when no longer needed.
+ *
+ * Since: 3.32.2
+ **/
+gchar *
+camel_utf8_make_valid_len (const gchar *text,
+                          gssize text_len)
+{
+       /* almost identical copy of glib's _g_utf8_make_valid() */
+       GString *string;
+       const gchar *remainder, *invalid;
+       gint remaining_bytes, valid_bytes;
+
+       if (text && text_len < 0)
+               text_len = strlen (text);
+
+       if (!text || text_len <= 0 || !*text)
+               return g_strdup (text);
+
+       string = NULL;
+       remainder = (gchar *) text,
+       remaining_bytes = text_len;
+
+       while (remaining_bytes != 0) {
+               if (g_utf8_validate (remainder, remaining_bytes, &invalid))
+                       break;
+
+               valid_bytes = invalid - remainder;
+
+               if (!string)
+                       string = g_string_sized_new (remaining_bytes);
 
-       if (!res)
-               return res;
+               g_string_append_len (string, remainder, valid_bytes);
+               /* append U+FFFD REPLACEMENT CHARACTER */
+               g_string_append (string, "\357\277\275");
 
-       p = res;
-       while (!g_utf8_validate (p, -1, (const gchar **) &p)) {
-               /* make all invalid characters appear as question marks */
-               *p = '?';
+               remaining_bytes -= valid_bytes + 1;
+               remainder = invalid + 1;
        }
 
-       return res;
+       if (!string)
+               return g_strndup (text, text_len);
+
+       if (remaining_bytes > 0)
+               g_string_append_len (string, remainder, remaining_bytes);
+
+       g_warn_if_fail (g_utf8_validate (string->str, -1, NULL));
+
+       return g_string_free (string, FALSE);
 }
diff --git a/src/camel/camel-utf8.h b/src/camel/camel-utf8.h
index 48c5e86f5..e600f7292 100644
--- a/src/camel/camel-utf8.h
+++ b/src/camel/camel-utf8.h
@@ -42,6 +42,8 @@ gchar *camel_ucs2_utf8 (const gchar *ptr);
 
 /* make valid utf8 string */
 gchar *camel_utf8_make_valid (const gchar *text);
+gchar *camel_utf8_make_valid_len (const gchar *text,
+                                 gssize text_len);
 
 G_END_DECLS
 
diff --git a/src/libedataserver/e-data-server-util.c b/src/libedataserver/e-data-server-util.c
index 4d85234c6..88aa696d4 100644
--- a/src/libedataserver/e-data-server-util.c
+++ b/src/libedataserver/e-data-server-util.c
@@ -650,7 +650,8 @@ e_util_utf8_data_make_valid (const gchar *data,
        if (string == NULL)
                return g_strndup ((gchar *) data, data_bytes);
 
-       g_string_append (string, remainder);
+       if (remaining_bytes > 0)
+               g_string_append_len (string, remainder, remaining_bytes);
 
        g_warn_if_fail (g_utf8_validate (string->str, -1, NULL));
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]