[glib] gconvert: Tighten, document embedded NUL behavior of UTF-8 conversions
- From: Philip Withnall <pwithnall src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [glib] gconvert: Tighten, document embedded NUL behavior of UTF-8 conversions
- Date: Fri, 19 Jan 2018 12:10:56 +0000 (UTC)
commit 81cd8154061338dfee7a9d3e23752efe190310bb
Author: Mikhail Zabaluev <mikhail zabaluev gmail com>
Date: Sun Jan 14 16:55:03 2018 +0200
gconvert: Tighten, document embedded NUL behavior of UTF-8 conversions
The character encoding conversion utility functions g_locale_to_utf8()
and g_filename_to_utf8() had inconsistent behavior on producing strings
with inner NUL bytes: in the all-UTF-8 strdup path, the input string
validation prohibits embedded NULs, while g_convert(), using iconv(),
can produce UTF-8 output with NUL bytes inside the output buffer.
This, while valid UTF-8 per the Unicode standard, is not valid for
the nul-terminated (type utf8) return value format that the *_to_utf8()
functions are annotated with (as per discussion in bug 756128).
Check the output of g_convert() for embedded NUL bytes, and if any
are found, set the newly introduced error
G_CONVERT_ERROR_EMBEDDED_NUL.
Also document the error set by g_{locale,filename}_{from,to}_utf8()
when the input string contains nul bytes.
https://bugzilla.gnome.org/show_bug.cgi?id=792516
glib/gconvert.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++--------
glib/gconvert.h | 6 +++-
2 files changed, 76 insertions(+), 14 deletions(-)
---
diff --git a/glib/gconvert.c b/glib/gconvert.c
index 083ea17..586b53a 100644
--- a/glib/gconvert.c
+++ b/glib/gconvert.c
@@ -866,6 +866,40 @@ strdup_len (const gchar *string,
return g_strndup (string, real_len);
}
+static gchar * /* the converted string on success, NULL on either failure path below */
+convert_to_utf8 (const gchar *opsysstring,
+ gssize len,
+ const gchar *charset,
+ gsize *bytes_read,
+ gsize *bytes_written,
+ GError **error)
+{
+ gchar *utf8;
+ gsize outbytes;
+ /* Convert from @charset to "UTF-8" with g_convert(), then refuse any result that holds a NUL byte. */
+ utf8 = g_convert (opsysstring, len, "UTF-8", charset,
+ bytes_read, &outbytes, error); /* output length is captured locally so it can be checked below */
+ if (utf8 == NULL) /* g_convert() failed; it was handed @error directly */
+ {
+ if (bytes_written) /* optional out-parameter: may be NULL */
+ *bytes_written = 0;
+ return NULL;
+ }
+ if (memchr (utf8, '\0', outbytes) != NULL) /* scan exactly the outbytes bytes reported by g_convert() */
+ {
+ g_free (utf8); /* the converted buffer is discarded, not returned */
+ if (bytes_written)
+ *bytes_written = 0;
+ g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_EMBEDDED_NUL,
+ _("Embedded NUL byte in conversion output"));
+ return NULL;
+ }
+ /* No NUL byte inside the output: report its length and hand the buffer to the caller. */
+ if (bytes_written)
+ *bytes_written = outbytes;
+ return utf8;
+}
+
/**
* g_locale_to_utf8:
* @opsysstring: a string in the encoding of the current locale. On Windows
@@ -879,7 +913,7 @@ strdup_len (const gchar *string,
* Even if the conversion was successful, this may be
* less than @len if there were partial characters
* at the end of the input. If the error
- * #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
+ * %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
* stored will be the byte offset after the last valid
* input sequence.
* @bytes_written: (out) (optional): the number of bytes stored in the output
@@ -890,6 +924,14 @@ strdup_len (const gchar *string,
* Converts a string which is in the encoding used for strings by
* the C runtime (usually the same as that used by the operating
* system) in the [current locale][setlocale] into a UTF-8 string.
+ *
+ * If the source encoding is not UTF-8 and the conversion output contains a
+ * nul character, the error %G_CONVERT_ERROR_EMBEDDED_NUL is set and the
+ * function returns %NULL.
+ * If the source encoding is UTF-8, an embedded nul character is treated with
+ * the %G_CONVERT_ERROR_ILLEGAL_SEQUENCE error for backward compatibility with
+ * earlier versions of this library. Use g_convert() to produce output that
+ * may contain embedded nul characters.
*
* Returns: A newly-allocated buffer containing the converted string,
* or %NULL on an error, and error will be set.
@@ -906,23 +948,21 @@ g_locale_to_utf8 (const gchar *opsysstring,
if (g_get_charset (&charset))
return strdup_len (opsysstring, len, bytes_read, bytes_written, error);
else
- return g_convert (opsysstring, len,
- "UTF-8", charset, bytes_read, bytes_written, error);
+ return convert_to_utf8 (opsysstring, len, charset,
+ bytes_read, bytes_written, error);
}
/**
* g_locale_from_utf8:
* @utf8string: a UTF-8 encoded string
* @len: the length of the string, or -1 if the string is
- * nul-terminated (Note that some encodings may allow nul
- * bytes to occur inside strings. In that case, using -1
- * for the @len parameter is unsafe)
+ * nul-terminated.
* @bytes_read: (out) (optional): location to store the number of bytes in the
* input string that were successfully converted, or %NULL.
* Even if the conversion was successful, this may be
* less than @len if there were partial characters
* at the end of the input. If the error
- * #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
+ * %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
* stored will be the byte offset after the last valid
* input sequence.
* @bytes_written: (out) (optional): the number of bytes stored in the output
@@ -934,7 +974,12 @@ g_locale_to_utf8 (const gchar *opsysstring,
* the C runtime (usually the same as that used by the operating
* system) in the [current locale][setlocale]. On Windows this means
* the system codepage.
- *
+ *
+ * The input string should not contain nul characters even if the @len
+ * argument is positive. A nul character found inside the string may result
+ * in error %G_CONVERT_ERROR_ILLEGAL_SEQUENCE. Use g_convert() to convert
+ * input that may contain embedded nul characters.
+ *
* Returns: A newly-allocated buffer containing the converted string,
* or %NULL on an error, and error will be set.
**/
@@ -1126,7 +1171,7 @@ get_filename_charset (const gchar **filename_charset)
* Even if the conversion was successful, this may be
* less than @len if there were partial characters
* at the end of the input. If the error
- * #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
+ * %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
* stored will be the byte offset after the last valid
* input sequence.
* @bytes_written: (out) (optional): the number of bytes stored in the output
@@ -1138,6 +1183,14 @@ get_filename_charset (const gchar **filename_charset)
* filenames into a UTF-8 string. Note that on Windows GLib uses UTF-8
* for filenames; on other platforms, this function indirectly depends on
* the [current locale][setlocale].
+ *
+ * If the source encoding is not UTF-8 and the conversion output contains a
+ * nul character, the error %G_CONVERT_ERROR_EMBEDDED_NUL is set and the
+ * function returns %NULL.
+ * If the source encoding is UTF-8, an embedded nul character is treated with
+ * the %G_CONVERT_ERROR_ILLEGAL_SEQUENCE error for backward compatibility with
+ * earlier versions of this library. Use g_convert() to produce output that
+ * may contain embedded nul characters.
*
* Returns: The converted string, or %NULL on an error.
**/
@@ -1155,8 +1208,8 @@ g_filename_to_utf8 (const gchar *opsysstring,
if (get_filename_charset (&charset))
return strdup_len (opsysstring, len, bytes_read, bytes_written, error);
else
- return g_convert (opsysstring, len,
- "UTF-8", charset, bytes_read, bytes_written, error);
+ return convert_to_utf8 (opsysstring, len, charset,
+ bytes_read, bytes_written, error);
}
/**
@@ -1169,7 +1222,7 @@ g_filename_to_utf8 (const gchar *opsysstring,
* Even if the conversion was successful, this may be
* less than @len if there were partial characters
* at the end of the input. If the error
- * #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
+ * %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
* stored will be the byte offset after the last valid
* input sequence.
* @bytes_written: (out): the number of bytes stored in the output buffer (not
@@ -1181,7 +1234,12 @@ g_filename_to_utf8 (const gchar *opsysstring,
* filenames. Note that on Windows GLib uses UTF-8 for filenames;
* on other platforms, this function indirectly depends on the
* [current locale][setlocale].
- *
+ *
+ * The input string should not contain nul characters even if the @len
+ * argument is positive. A nul character found inside the string may result
+ * in error %G_CONVERT_ERROR_ILLEGAL_SEQUENCE. Note that nul bytes are
+ * prohibited in all filename encodings that GLib is known to work with.
+ *
* Returns: (array length=bytes_written) (element-type guint8) (transfer full):
* The converted string, or %NULL on an error.
**/
diff --git a/glib/gconvert.h b/glib/gconvert.h
index f064e41..ea93006 100644
--- a/glib/gconvert.h
+++ b/glib/gconvert.h
@@ -43,6 +43,9 @@ G_BEGIN_DECLS
* @G_CONVERT_ERROR_BAD_URI: URI is invalid.
* @G_CONVERT_ERROR_NOT_ABSOLUTE_PATH: Pathname is not an absolute path.
* @G_CONVERT_ERROR_NO_MEMORY: No memory available. Since: 2.40
+ * @G_CONVERT_ERROR_EMBEDDED_NUL: An embedded NUL character is present in
+ * conversion output where a NUL-terminated string is expected.
+ * Since: 2.56
*
* Error codes returned by character set conversion routines.
*/
@@ -54,7 +57,8 @@ typedef enum
G_CONVERT_ERROR_PARTIAL_INPUT,
G_CONVERT_ERROR_BAD_URI,
G_CONVERT_ERROR_NOT_ABSOLUTE_PATH,
- G_CONVERT_ERROR_NO_MEMORY
+ G_CONVERT_ERROR_NO_MEMORY,
+ G_CONVERT_ERROR_EMBEDDED_NUL
} GConvertError;
/**
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]