[glibmm/glibmm-2-62] ustring: Add make_valid()
- From: Kjell Ahlstedt <kjellahl src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [glibmm/glibmm-2-62] ustring: Add make_valid()
- Date: Sun, 15 Sep 2019 19:31:03 +0000 (UTC)
commit e7a86070a9bf3402138f7874d050d5e72f9c76b0
Author: Krzysztof Piecuch <piecuch protonmail com>
Date: Sun Sep 15 21:28:55 2019 +0200
ustring: Add make_valid()
make_valid() replaces all invalid UTF-8 byte sequences with the Unicode
replacement character (U+FFFD). This lets you keep working with a ustring
after ustring::validate() has reported that it is not a valid UTF-8 string
and you need to rescue it somehow. This wraps g_utf8_make_valid().
https://bugzilla.gnome.org/show_bug.cgi?id=780075
See also issue #40
This commit includes part of MR !11 by Martin Ejdestig <marejde gmail com>,
fixing a memory leak in the original make_valid().
glib/glibmm/ustring.cc | 6 ++++
glib/glibmm/ustring.h | 8 ++++-
tests/Makefile.am | 2 ++
tests/glibmm_ustring_make_valid/main.cc | 58 +++++++++++++++++++++++++++++++++
4 files changed, 73 insertions(+), 1 deletion(-)
---
diff --git a/glib/glibmm/ustring.cc b/glib/glibmm/ustring.cc
index a0f9ab80..639041ea 100644
--- a/glib/glibmm/ustring.cc
+++ b/glib/glibmm/ustring.cc
@@ -1218,6 +1218,12 @@ ustring::validate(ustring::const_iterator& first_invalid) const
return (is_valid != 0);
}
+ustring
+ustring::make_valid() const
+{
+ return convert_return_gchar_ptr_to_ustring(g_utf8_make_valid(string_.data(), string_.size()));
+}
+
bool
ustring::is_ascii() const
{
diff --git a/glib/glibmm/ustring.h b/glib/glibmm/ustring.h
index f0c4d869..daa3b141 100644
--- a/glib/glibmm/ustring.h
+++ b/glib/glibmm/ustring.h
@@ -175,7 +175,7 @@ gunichar get_unichar_from_std_iterator(std::string::const_iterator pos) G_GNUC_P
* Many member functions and operators of %Glib::ustring and Glib::ustring_Iterator
* assume that the string contains only valid UTF-8 data. If it does not, memory
* outside the bounds of the string can be accessed. If you're uncertain, use
- * validate().
+ * validate() and/or make_valid().
* @par
* In a perfect world the C++ Standard Library would contain a UTF-8 string
* class. Unfortunately, the C++98 standard doesn't mention UTF-8 at all.
@@ -603,6 +603,12 @@ public:
/*! Check whether the string is valid UTF-8. */
bool validate(const_iterator& first_invalid) const;
+ /*! Return a copy that is a valid UTF-8 string, replacing invalid bytes
+ * in the original with the %Unicode replacement character (U+FFFD).
+ * If the string is already valid, return an unmodified copy of it.
+ */
+ ustring make_valid() const;
+
/*! Check whether the string is plain 7-bit ASCII. @par
* Unlike any other ustring method, is_ascii() is safe to use on invalid
* UTF-8 strings. If the string isn't valid UTF-8, it cannot be valid
diff --git a/tests/Makefile.am b/tests/Makefile.am
index b7336354..895d205c 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -37,6 +37,7 @@ check_PROGRAMS = \
glibmm_objectbase_move/test \
glibmm_ustring_compose/test \
glibmm_ustring_format/test \
+ glibmm_ustring_make_valid/test \
glibmm_ustring_sprintf/test \
glibmm_value/test \
glibmm_valuearray/test \
@@ -103,6 +104,7 @@ glibmm_object_move_test_SOURCES = glibmm_object_move/main.cc
glibmm_objectbase_move_test_SOURCES = glibmm_objectbase_move/main.cc
glibmm_ustring_compose_test_SOURCES = glibmm_ustring_compose/main.cc
glibmm_ustring_format_test_SOURCES = glibmm_ustring_format/main.cc
+glibmm_ustring_make_valid_test_SOURCES = glibmm_ustring_make_valid/main.cc
glibmm_ustring_sprintf_test_SOURCES = glibmm_ustring_sprintf/main.cc
glibmm_value_test_SOURCES = glibmm_value/glibmm_value.cc glibmm_value/main.cc
glibmm_valuearray_test_SOURCES = glibmm_valuearray/main.cc
diff --git a/tests/glibmm_ustring_make_valid/main.cc b/tests/glibmm_ustring_make_valid/main.cc
new file mode 100644
index 00000000..3f941225
--- /dev/null
+++ b/tests/glibmm_ustring_make_valid/main.cc
@@ -0,0 +1,58 @@
+#include <iostream>
+#include <glibmm.h>
+
+int
+main()
+{
+ Glib::init();
+
+ // 0-1: bad character
+ const char not_utf8[] = { '\x80',
+ // 1-4: good three bytes (one character)
+ '\xef', '\x80', '\x80',
+ // 4-5: bad character
+ '\xef',
+ // 5-6: bad character
+ '\x80',
+ // 6-7: good character
+ 'a',
+ // 7-8: bad character
+ '\0',
+ // 8-9: good character
+ 'd',
+ // 9-10: bad character
+ '\x80',
+ // 10-13: good three bytes (one character)
+ '\xef', '\x80', '\x80',
+ // 13-15: two bad characters
+ '\xef', '\x80'
+ };
+
+ const char fixed_utf8[] = { '\xef', '\xbf', '\xbd',
+ '\xef', '\x80', '\x80',
+ '\xef', '\xbf', '\xbd',
+ '\xef', '\xbf', '\xbd',
+ 'a',
+ '\xef', '\xbf', '\xbd',
+ 'd',
+ '\xef', '\xbf', '\xbd',
+ '\xef', '\x80', '\x80',
+ '\xef', '\xbf', '\xbd',
+ '\xef', '\xbf', '\xbd'
+ };
+
+ // const char repl_character[] = {'\xef', '\xbf', '\xbd'};
+ const Glib::ustring s(not_utf8, not_utf8 + sizeof not_utf8);
+ g_assert(s.validate() == false);
+
+ const Glib::ustring good_one = s.make_valid();
+ g_assert(s.validate() == false); // we make a copy
+ g_assert(good_one.validate()); // this one is good!
+
+ const Glib::ustring correct_output(fixed_utf8,
+ fixed_utf8 + sizeof fixed_utf8);
+ g_assert(correct_output.validate());
+ g_assert(correct_output == good_one);
+
+ return EXIT_SUCCESS;
+}
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]