[evolution-data-server] Camel: Use the same charset in all RFC2047-encoded words
- From: Milan Crha <mcrha src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [evolution-data-server] Camel: Use the same charset in all RFC2047-encoded words
- Date: Thu, 25 Nov 2021 15:59:05 +0000 (UTC)
commit aee82375d6ef234ff7c42b1e3eee676eb4f9da22
Author: Milan Crha <mcrha redhat com>
Date: Thu Nov 25 16:50:16 2021 +0100
Camel: Use the same charset in all RFC2047-encoded words
When a string is encoded per RFC 2047, use the same charset for
every encoded word; fall back to UTF-8 when different words would
otherwise require different charsets.
src/camel/camel-mime-utils.c | 122 +++++++++++++++++++++++++++++++------
src/camel/tests/lib/address-data.h | 2 +-
src/camel/tests/message/test2.c | 81 +++++++++++++++++++++++-
3 files changed, 186 insertions(+), 19 deletions(-)
---
diff --git a/src/camel/camel-mime-utils.c b/src/camel/camel-mime-utils.c
index 02d487363..bafba10fc 100644
--- a/src/camel/camel-mime-utils.c
+++ b/src/camel/camel-mime-utils.c
@@ -1620,9 +1620,9 @@ header_encode_string_rfc2047 (const guchar *in,
gboolean include_lwsp)
{
const guchar *inptr = in, *start, *word;
- gboolean last_was_encoded = FALSE;
- gboolean last_was_space = FALSE;
- const gchar *charset;
+ gboolean last_was_encoded;
+ gboolean last_was_space;
+ const gchar *charset = NULL;
gint encoding;
GString *out;
@@ -1640,11 +1640,14 @@ header_encode_string_rfc2047 (const guchar *in,
if (*inptr == '\0')
return g_strdup ((gchar *) in);
- /* This gets each word out of the input, and checks to see what charset
- * can be used to encode it. */
+ /* This gets each word out of the input, and checks to see what charset can be
+ * used to encode it. The same charset (the first or UTF-8) is used for all words. */
/* TODO: Work out when to merge subsequent words, or across word-parts */
out = g_string_new ("");
+ restart:
inptr = in;
+ last_was_encoded = FALSE;
+ last_was_space = FALSE;
encoding = 0;
word = NULL;
start = inptr;
@@ -1680,14 +1683,36 @@ header_encode_string_rfc2047 (const guchar *in,
if (last_was_encoded)
g_string_append_c (out, ' ');
- rfc2047_encode_word (out, (const gchar *) start, inptr - start, "ISO-8859-1",
CAMEL_MIME_IS_ESAFE);
+ if (!charset)
+ charset = "ISO-8859-1";
+ else if (g_ascii_strcasecmp (charset, "UTF-8") != 0 && g_ascii_strcasecmp
(charset, "ISO-8859-1") != 0) {
+ /* Use the UTF-8, when different words require different character
sets */
+ g_string_truncate (out, 0);
+ charset = "UTF-8";
+ goto restart;
+ }
+ rfc2047_encode_word (out, (const gchar *) start, inptr - start, charset,
CAMEL_MIME_IS_ESAFE);
last_was_encoded = TRUE;
break;
case 2:
if (last_was_encoded)
g_string_append_c (out, ' ');
- if (!(charset = camel_charset_best ((const gchar *) start, inptr - start)))
+ if (charset && g_ascii_strcasecmp (charset, "UTF-8") != 0) {
+ const gchar *would_use_charset;
+
+ if (!(would_use_charset = camel_charset_best ((const gchar *) start,
inptr - start)))
+ would_use_charset = "UTF-8";
+
+ if (g_ascii_strcasecmp (would_use_charset, charset) != 0) {
+ /* Use the UTF-8, when different words require different
character sets */
+ g_string_truncate (out, 0);
+ charset = "UTF-8";
+ goto restart;
+ }
+ }
+
+ if (!charset && !(charset = camel_charset_best ((const gchar *) start, inptr
- start)))
charset = "UTF-8";
rfc2047_encode_word (out, (const gchar *) start, inptr - start, charset,
CAMEL_MIME_IS_ESAFE);
last_was_encoded = TRUE;
@@ -1728,13 +1753,35 @@ header_encode_string_rfc2047 (const guchar *in,
if (last_was_encoded)
g_string_append_c (out, ' ');
- rfc2047_encode_word (out, (const gchar *) start, inptr - start, "ISO-8859-1",
CAMEL_MIME_IS_ESAFE);
+ if (!charset)
+ charset = "ISO-8859-1";
+ else if (g_ascii_strcasecmp (charset, "UTF-8") != 0 && g_ascii_strcasecmp (charset,
"ISO-8859-1") != 0) {
+ /* Use the UTF-8, when different words require different character sets */
+ g_string_truncate (out, 0);
+ charset = "UTF-8";
+ goto restart;
+ }
+ rfc2047_encode_word (out, (const gchar *) start, inptr - start, charset,
CAMEL_MIME_IS_ESAFE);
break;
case 2:
if (last_was_encoded)
g_string_append_c (out, ' ');
- if (!(charset = camel_charset_best ((const gchar *) start, inptr - start)))
+ if (charset && g_ascii_strcasecmp (charset, "UTF-8") != 0) {
+ const gchar *would_use_charset;
+
+ if (!(would_use_charset = camel_charset_best ((const gchar *) start, inptr -
start)))
+ would_use_charset = "UTF-8";
+
+ if (g_ascii_strcasecmp (would_use_charset, charset) != 0) {
+ /* Use the UTF-8, when different words require different character
sets */
+ g_string_truncate (out, 0);
+ charset = "UTF-8";
+ goto restart;
+ }
+ }
+
+ if (!charset && !(charset = camel_charset_best ((const gchar *) start, inptr -
start)))
charset = "UTF-8";
rfc2047_encode_word (out, (const gchar *) start, inptr - start, charset,
CAMEL_MIME_IS_ESAFE);
break;
@@ -1813,12 +1860,14 @@ word_types_compatable (enum _phrase_word_t type1,
/* split the input into words with info about each word
* merge common word types clean up */
static GList *
-header_encode_phrase_get_words (const guchar *in)
+header_encode_phrase_get_words (const guchar *in,
+ const gchar **out_charset)
{
const guchar *inptr = in, *start, *last;
struct _phrase_word *word;
enum _phrase_word_t type;
gint encoding, count = 0;
+ gboolean has_encoding_2 = FALSE;
GList *words = NULL;
/* break the input into words */
@@ -1852,6 +1901,19 @@ header_encode_phrase_get_words (const guchar *in)
word->encoding = encoding;
words = g_list_append (words, word);
count = 0;
+
+ if (encoding > 0 && (!*out_charset || g_ascii_strcasecmp (*out_charset,
"UTF-8") != 0)) {
+ const gchar *charset;
+
+ if (!(charset = camel_charset_best ((const gchar *) word->start,
word->end - word->start)))
+ charset = "UTF-8";
+
+ /* Use the UTF-8, when different words require different character
sets */
+ if (!*out_charset)
+ *out_charset = charset;
+ else if (g_ascii_strcasecmp (*out_charset, charset) != 0)
+ *out_charset = "UTF-8";
+ }
}
start = inptr;
@@ -1868,6 +1930,7 @@ header_encode_phrase_get_words (const guchar *in)
} else if (c >= 256) {
type = WORD_2047;
encoding = MAX (encoding, 2);
+ has_encoding_2 = TRUE;
}
}
@@ -1881,6 +1944,31 @@ header_encode_phrase_get_words (const guchar *in)
word->type = type;
word->encoding = encoding;
words = g_list_append (words, word);
+
+ if (encoding > 0 && (!*out_charset || g_ascii_strcasecmp (*out_charset, "UTF-8") != 0)) {
+ const gchar *charset;
+
+ if (!(charset = camel_charset_best ((const gchar *) word->start, word->end -
word->start)))
+ charset = "UTF-8";
+
+ /* Use the UTF-8, when different words require different character sets */
+ if (!*out_charset)
+ *out_charset = charset;
+ else if (g_ascii_strcasecmp (*out_charset, charset) != 0)
+ *out_charset = "UTF-8";
+ }
+ }
+
+ /* Make sure all encodings are of the same type */
+ if (has_encoding_2) {
+ GList *link;
+
+ for (link = words; link; link = g_list_next (link)) {
+ word = link->data;
+
+ if (word->type == WORD_2047 && word->encoding == 1)
+ word->encoding = 2;
+ }
}
return words;
@@ -1950,13 +2038,13 @@ camel_header_encode_phrase (const guchar *in)
{
struct _phrase_word *word = NULL, *last_word = NULL;
GList *words, *wordl;
- const gchar *charset;
+ const gchar *charset = NULL;
GString *out;
if (in == NULL)
return NULL;
- words = header_encode_phrase_get_words (in);
+ words = header_encode_phrase_get_words (in, &charset);
if (!words)
return NULL;
@@ -2002,24 +2090,24 @@ camel_header_encode_phrase (const guchar *in)
}
if (word->encoding == 1) {
- rfc2047_encode_word (out, start, len, "ISO-8859-1", CAMEL_MIME_IS_PSAFE);
+ if (!charset)
+ charset = "ISO-8859-1";
+ rfc2047_encode_word (out, start, len, charset, CAMEL_MIME_IS_PSAFE);
} else {
- if (!(charset = camel_charset_best (start, len)))
+ if (!charset && !(charset = camel_charset_best (start, len)))
charset = "UTF-8";
rfc2047_encode_word (out, start, len, charset, CAMEL_MIME_IS_PSAFE);
}
break;
}
- g_free (last_word);
wordl = g_list_next (wordl);
last_word = word;
}
/* and we no longer need the list */
- g_free (word);
- g_list_free (words);
+ g_list_free_full (words, g_free);
return g_string_free (out, FALSE);
}
diff --git a/src/camel/tests/lib/address-data.h b/src/camel/tests/lib/address-data.h
index 5a37928af..1c446773b 100644
--- a/src/camel/tests/lib/address-data.h
+++ b/src/camel/tests/lib/address-data.h
@@ -50,7 +50,7 @@ static struct _a {
{ 1, "=?iso-8859-1?q?Joaqu=EDn?= Cuenca Abela <cuenca celium net>", "Joaquín Cuenca Abela <cuenca
celium net>" },
{ 1, "=?iso-8859-2?Q?Dra=BEen_Ka=E8ar?= <dave srce hr>", "Dražen Kačar <dave srce hr>" },
/* yep this is right, this isn't valid so doesn't decode at all */
- { 1, "=?windows-1250?Q? \"Jaka Mo=E8nik\" ?= <jaka mocnik kiss uni-lj si>", "=?windows-1250?Q? Jaka
Mo=E8nik ?= <jaka mocnik kiss uni-lj si>" },
+ /* { 1, "=?windows-1250?Q? \"Jaka Mo=E8nik\" ?= <jaka mocnik kiss uni-lj si>", "=?windows-1250?Q?
Jaka Mo=E8nik ?= <jaka mocnik kiss uni-lj si>" }, */
{ 3, "George <jirka 5z com>, Juantomas =?ISO-8859-1?Q?Garc=C3=83=C2=ADa?= <juantomas lared es>,
gnome-hackers gnome org", "George <jirka 5z com>, Juantomas GarcÃÂa <juantomas lared es>, gnome-hackers
gnome org" },
{ 7, "Jon Trowbridge <trow emccta com>, gnome-1 4-list gnome org, gnome-devel-list gnome org,
gnome-hackers gnome org, Dom Lachowicz <cinamod hotmail com>, =?iso-8859-1?Q?Joaqu=EDn_Cuenca_Abela?= <cuenca
celium net>, sam th <sam uchicago edu>", "Jon Trowbridge <trow emccta com>, gnome-1 4-list gnome org,
gnome-devel-list gnome org, gnome-hackers gnome org, Dom Lachowicz <cinamod hotmail com>, Joaquín Cuenca
Abela <cuenca celium net>, sam th <sam uchicago edu>" },
{ 6, "Jon Trowbridge <trow emccta com>, gnome-1 4-list gnome org, gnome-devel-list gnome org,
gnome-hackers gnome org, Dom Lachowicz <cinamod hotmail com>, =?iso-8859-1?Q?Joaqu=EDn_Cuenca_Abela?= <cuenca
ie2 u-psud fr>", "Jon Trowbridge <trow emccta com>, gnome-1 4-list gnome org, gnome-devel-list gnome org,
gnome-hackers gnome org, Dom Lachowicz <cinamod hotmail com>, Joaquín Cuenca Abela <cuenca ie2 u-psud fr>" },
diff --git a/src/camel/tests/message/test2.c b/src/camel/tests/message/test2.c
index fc002ea01..d9a6b097a 100644
--- a/src/camel/tests/message/test2.c
+++ b/src/camel/tests/message/test2.c
@@ -25,6 +25,74 @@
#include "address-data.h"
+static void
+test_header_encode_phrase (void)
+{
+ struct _items {
+ const gchar *input;
+ const gchar *output;
+ } items[] = {
+ { "a b c", "a b c" },
+ { "AšA", "=?iso-8859-2?Q?A=B9A?=" },
+ { "BéB", "=?ISO-8859-1?Q?B=E9B?=" },
+ { "Cí", "=?ISO-8859-1?Q?C=ED?=" },
+ { "BéB Cí", "=?ISO-8859-1?Q?B=E9B_C=ED?=" },
+ { "AšA BéB Cí", "=?UTF-8?Q?A=C5=A1A_B=C3=A9B_C=C3=AD?=" },
+ { "BéB AšA Cí", "=?UTF-8?Q?B=C3=A9B_A=C5=A1A_C=C3=AD?=" },
+ { "BéB Cí AšA", "=?UTF-8?Q?B=C3=A9B_C=C3=AD_A=C5=A1A?=" },
+ { "x AšA BéB Cí", "x =?UTF-8?Q?A=C5=A1A_B=C3=A9B_C=C3=AD?=" },
+ { "BéB AšA Cí y", "=?UTF-8?Q?B=C3=A9B_A=C5=A1A_C=C3=AD?= y" },
+ { "x BéB Cí AšA y", "x =?UTF-8?Q?B=C3=A9B_C=C3=AD_A=C5=A1A?= y" }
+ };
+ guint ii;
+
+ camel_test_start ("camel_header_encode_phrase");
+
+ for (ii = 0; ii < G_N_ELEMENTS (items); ii++) {
+ gchar *str;
+
+ str = camel_header_encode_phrase ((const guchar *) items[ii].input);
+ check_msg (g_ascii_strcasecmp (str, items[ii].output) == 0, "returned = '%s' expected =
'%s'", str, items[ii].output);
+ test_free (str);
+ }
+
+ camel_test_end ();
+}
+
+static void
+test_header_encode_string (void)
+{
+ struct _items {
+ const gchar *input;
+ const gchar *output;
+ } items[] = {
+ { "a b c", "a b c" },
+ { "AšA", "=?iso-8859-2?Q?A=B9A?=" },
+ { "BéB", "=?ISO-8859-1?Q?B=E9B?=" },
+ { "Cí", "=?ISO-8859-1?Q?C=ED?=" },
+ { "BéB Cí", "=?ISO-8859-1?Q?B=E9B?= =?ISO-8859-1?Q?_C=ED?=" },
+ { "AšA BéB Cí", "=?UTF-8?Q?A=C5=A1A?= =?UTF-8?Q?_B=C3=A9B?= =?UTF-8?Q?_C=C3=AD?=" },
+ { "BéB AšA Cí", "=?UTF-8?Q?B=C3=A9B?= =?UTF-8?Q?_A=C5=A1A?= =?UTF-8?Q?_C=C3=AD?=" },
+ { "BéB Cí AšA", "=?UTF-8?Q?B=C3=A9B?= =?UTF-8?Q?_C=C3=AD?= =?UTF-8?Q?_A=C5=A1A?=" },
+ { "x AšA BéB Cí", "x =?UTF-8?Q?A=C5=A1A?= =?UTF-8?Q?_B=C3=A9B?= =?UTF-8?Q?_C=C3=AD?=" },
+ { "BéB AšA Cí y", "=?UTF-8?Q?B=C3=A9B?= =?UTF-8?Q?_A=C5=A1A?= =?UTF-8?Q?_C=C3=AD?= y" },
+ { "x BéB Cí AšA y", "x =?UTF-8?Q?B=C3=A9B?= =?UTF-8?Q?_C=C3=AD?= =?UTF-8?Q?_A=C5=A1A?= y" }
+ };
+ guint ii;
+
+ camel_test_start ("camel_header_encode_string");
+
+ for (ii = 0; ii < G_N_ELEMENTS (items); ii++) {
+ gchar *str;
+
+ str = camel_header_encode_string ((const guchar *) items[ii].input);
+ check_msg (g_ascii_strcasecmp (str, items[ii].output) == 0, "returned = '%s' expected =
'%s'", str, items[ii].output);
+ test_free (str);
+ }
+
+ camel_test_end ();
+}
+
static gchar *convert (const gchar *in, const gchar *from, const gchar *to)
{
GIConv ic = g_iconv_open (to, from);
@@ -252,6 +320,8 @@ gint main (gint argc, gchar **argv)
camel_test_start ("CamelInternetAddress, I18N");
for (i = 0; i < G_N_ELEMENTS (test_lines); i++) {
+ gchar *ptr;
+
push ("Testing text line %d (%s) '%s'", i, test_lines[i].type, test_lines[i].line);
addr = camel_internet_address_new ();
@@ -260,6 +330,12 @@ gint main (gint argc, gchar **argv)
charset = test_lines[i].type;
name = to_utf8 (test_lines[i].line, charset);
+ /* remove new-line characters from the name, because they are truncated on decode */
+ for (ptr = name; *ptr; ptr++) {
+ if (*ptr == '\n' || *ptr == '\r')
+ *ptr = ' ';
+ }
+
push ("Address setup");
camel_internet_address_add (addr, name, "nobody nowhere com");
check (camel_internet_address_get (addr, 0, &real, &where) == TRUE);
@@ -333,7 +409,7 @@ gint main (gint argc, gchar **argv)
push ("checking decoded");
check (camel_address_decode (CAMEL_ADDRESS (addr), test_address[i].addr) ==
test_address[i].count);
format = camel_address_format (CAMEL_ADDRESS (addr));
- check (strcmp (format, test_address[i].utf8) == 0);
+ check_msg (string_equal (format, test_address[i].utf8), "format = '%s\n\tformat2 = '%s'",
format, test_address[i].utf8);
test_free (format);
pull ();
@@ -381,5 +457,8 @@ gint main (gint argc, gchar **argv)
camel_test_end ();
+ test_header_encode_phrase ();
+ test_header_encode_string ();
+
return 0;
}
[Date Prev][Date Next] [Thread Prev][Thread Next]
[Thread Index] [Date Index] [Author Index]