[tepl/wip/icu: 4/5] icu: write tepl_utils_markup_escape_text()
- From: Sébastien Wilmet <swilmet src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [tepl/wip/icu: 4/5] icu: write tepl_utils_markup_escape_text()
- Date: Sat, 30 May 2020 11:18:16 +0000 (UTC)
commit 71f4742f6027b2c09fa8731f54880badac8135f0
Author: Sébastien Wilmet <swilmet gnome org>
Date: Fri May 29 22:08:34 2020 +0200
icu: write tepl_utils_markup_escape_text()
docs/reference/tepl-sections.txt | 1 +
tepl/tepl-utils.c | 68 ++++++++++++++++++++++++++++++++++++++++
tepl/tepl-utils.h | 3 ++
testsuite/test-utils.c | 35 +++++++++++++++++++++
4 files changed, 107 insertions(+)
---
diff --git a/docs/reference/tepl-sections.txt b/docs/reference/tepl-sections.txt
index 13fc4e9..96f5441 100644
--- a/docs/reference/tepl-sections.txt
+++ b/docs/reference/tepl-sections.txt
@@ -445,6 +445,7 @@ tepl_notebook_get_type
tepl_utils_str_middle_truncate
tepl_utils_str_end_truncate
tepl_utils_str_replace
+tepl_utils_markup_escape_text
tepl_utils_get_file_extension
tepl_utils_get_file_shortname
tepl_utils_replace_home_dir_with_tilde
diff --git a/tepl/tepl-utils.c b/tepl/tepl-utils.c
index ceb5baa..d49c42f 100644
--- a/tepl/tepl-utils.c
+++ b/tepl/tepl-utils.c
@@ -10,6 +10,7 @@
#include "tepl-utils.h"
#include <string.h>
#include "tepl-application-window.h"
+#include "tepl-icu.h"
/**
* SECTION:utils
@@ -159,6 +160,73 @@ tepl_utils_str_replace (const gchar *string,
return ret;
}
+/**
+ * tepl_utils_markup_escape_text:
+ * @src: a nul-terminated UTF-8 string.
+ *
+ * Escapes @src like g_markup_escape_text() does, but with an implementation
+ * that fully supports round-trip integrity: when #GMarkupParser, or any other
+ * XML parser, decodes/unescapes the result, exactly the same string as @src
+ * is obtained again, provided that @src is a valid UTF-8 string.
+ *
+ * Unlike g_markup_escape_text(), this function has no @length parameter:
+ * @src must be nul-terminated.
+ *
+ * # g_markup_escape_text() doesn't fully support round-trip integrity
+ *
+ * g_markup_escape_text() leaves the tab, newline and carriage return
+ * characters unescaped, while #GMarkupParser correctly processes whitespace
+ * and line endings according to the [XML rules for normalization of line
+ * endings and attribute values](https://www.w3.org/TR/xml/#AVNormalize).
+ *
+ * For example `"\t"` (a tab) becomes a simple space after a round-trip
+ * through g_markup_escape_text() and #GMarkupParser.
+ *
+ * Returns: (transfer full) (nullable): a newly allocated string with the
+ * escaped text, or %NULL if @src is not a valid UTF-8 string or if the
+ * escaping otherwise fails. Free with g_free() when no longer needed.
+ * Since: 5.0
+ */
+gchar *
+tepl_utils_markup_escape_text (const gchar *src)
+{
+	UChar *icu_src;
+	UChar *icu_escaped = NULL;
+	UTransliterator *xml_escaper;
+	gchar *escaped = NULL;
+
+	/* The transliterator works on UChar strings, so @src is converted
+	 * first. If that fails there is nothing to escape.
+	 */
+	icu_src = _tepl_icu_strFromUTF8Simple (src);
+	if (icu_src == NULL)
+	{
+		return NULL;
+	}
+
+	xml_escaper = _tepl_icu_trans_open_xml_escape ();
+	if (xml_escaper != NULL)
+	{
+		icu_escaped = _tepl_icu_trans_transUCharsSimple (xml_escaper, icu_src);
+
+		/* Convert back to UTF-8 only if the transliteration succeeded;
+		 * otherwise %NULL is returned.
+		 */
+		if (icu_escaped != NULL)
+		{
+			escaped = _tepl_icu_strToUTF8Simple (icu_escaped);
+		}
+
+		utrans_close (xml_escaper);
+	}
+
+	/* g_free() accepts %NULL, so icu_escaped needs no check here. */
+	g_free (icu_escaped);
+	g_free (icu_src);
+
+	return escaped;
+}
+
static gint
get_extension_position (const gchar *filename)
{
diff --git a/tepl/tepl-utils.h b/tepl/tepl-utils.h
index e0759ee..88369f8 100644
--- a/tepl/tepl-utils.h
+++ b/tepl/tepl-utils.h
@@ -29,6 +29,9 @@ gchar * tepl_utils_str_replace (const gchar *string,
const gchar *search,
const gchar *replacement);
+_TEPL_EXTERN
+gchar * tepl_utils_markup_escape_text (const gchar *src);
+
/* File utilities */
_TEPL_EXTERN
diff --git a/testsuite/test-utils.c b/testsuite/test-utils.c
index 5f3c77f..9f22855 100644
--- a/testsuite/test-utils.c
+++ b/testsuite/test-utils.c
@@ -51,6 +51,40 @@ test_str_replace (void)
g_free (result);
}
+static void
+check_markup_escape_text (const gchar *src,
+			  const gchar *expected_dest)
+{
+	gchar *actual_dest = tepl_utils_markup_escape_text (src);
+
+	/* Compare the escaped form of @src with @expected_dest. */
+	g_assert_cmpstr (actual_dest, ==, expected_dest);
+	g_free (actual_dest);
+}
+
+static void
+test_markup_escape_text (void)
+{
+	gchar *glib_dest;
+
+	/* Strings that must come out unchanged. */
+	check_markup_escape_text ("", "");
+	check_markup_escape_text ("123ASCIIabc.,;/_-:", "123ASCIIabc.,;/_-:");
+
+	/* Characters escaped as XML hexadecimal character references. */
+	check_markup_escape_text ("é", "&#xE9;");
+	check_markup_escape_text ("\t", "&#x9;");
+	check_markup_escape_text ("ẞ", "&#x1E9E;"); // multi-byte UTF-8 char.
+
+	/* If this changes in the future, maybe g_markup_escape_text() has been
+	 * modified to fully support round-trip integrity, in which case
+	 * tepl_utils_markup_escape_text() is no longer useful.
+	 */
+	glib_dest = g_markup_escape_text ("\t", -1);
+	g_assert_cmpstr (glib_dest, ==, "\t");
+	g_free (glib_dest);
+}
+
static void
test_get_file_extension (void)
{
@@ -191,6 +225,7 @@ main (int argc,
g_test_add_func ("/utils/str-middle-truncate", test_str_middle_truncate);
g_test_add_func ("/utils/str-end-truncate", test_str_end_truncate);
g_test_add_func ("/utils/str-replace", test_str_replace);
+ g_test_add_func ("/utils/markup-escape-text", test_markup_escape_text);
g_test_add_func ("/utils/get-file-extension", test_get_file_extension);
g_test_add_func ("/utils/get-file-shortname", test_get_file_shortname);
g_test_add_func ("/utils/replace-home-dir-with-tilde", test_replace_home_dir_with_tilde);
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]