[glibmm] Added ustring::make_valid() which fixes non-UTF8 strings.
- From: Murray Cumming <murrayc src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [glibmm] Added ustring::make_valid() which fixes non-UTF8 strings.
- Date: Wed, 15 Mar 2017 09:00:39 +0000 (UTC)
commit 0797bf2954177f58b7ac6ebecce7264310481c55
Author: Krzysztof Piecuch <piecuch protonmail com>
Date: Wed Mar 15 02:13:15 2017 +0100
Added ustring::make_valid() which fixes non-UTF8 strings.
make_valid() replaces all invalid UTF-8 bytes with the Unicode replacement
character (U+FFFD). This allows you to keep working with a ustring after
ustring::validate() has told you that it is not a valid UTF-8 string and
you need to rescue it somehow.
This wraps g_utf8_make_valid().
Bug #780075
glib/glibmm/ustring.cc | 6 +++
glib/glibmm/ustring.h | 6 +++
tests/Makefile.am | 4 ++-
tests/glibmm_ustring_make_valid/main.cc | 58 +++++++++++++++++++++++++++++++
4 files changed, 73 insertions(+), 1 deletions(-)
---
diff --git a/glib/glibmm/ustring.cc b/glib/glibmm/ustring.cc
index eae9802..d05e986 100644
--- a/glib/glibmm/ustring.cc
+++ b/glib/glibmm/ustring.cc
@@ -1219,6 +1219,12 @@ ustring::validate(ustring::const_iterator& first_invalid) const
return (is_valid != 0);
}
+/* Return a valid UTF-8 copy of this string, as produced by
+ * g_utf8_make_valid() from the underlying bytes and their length.
+ */
+ustring
+ustring::make_valid() const
+{
+  // g_utf8_make_valid() hands back a newly allocated buffer that the caller
+  // owns. The guard releases it with g_free() on every exit path, including
+  // the one where the ustring constructor throws.
+  struct GFreeGuard
+  {
+    gchar* ptr;
+    ~GFreeGuard() { g_free(ptr); }
+  };
+
+  const GFreeGuard fixed = { g_utf8_make_valid(string_.data(), string_.size()) };
+  return ustring(fixed.ptr);
+}
+
bool
ustring::is_ascii() const
{
diff --git a/glib/glibmm/ustring.h b/glib/glibmm/ustring.h
index ba6289d..7cd19f0 100644
--- a/glib/glibmm/ustring.h
+++ b/glib/glibmm/ustring.h
@@ -596,6 +596,12 @@ public:
/*! Check whether the string is valid UTF-8. */
bool validate(const_iterator& first_invalid) const;
+ /*! Return a copy that is a valid UTF-8 string, replacing invalid bytes in
+ * the original with the Unicode replacement character (U+FFFD).
+ * If the string is already valid, a copy of it is returned.
+ */
+ ustring make_valid() const;
+
/*! Check whether the string is plain 7-bit ASCII. @par
* Unlike any other ustring method, is_ascii() is safe to use on invalid
* UTF-8 strings. If the string isn't valid UTF-8, it cannot be valid
diff --git a/tests/Makefile.am b/tests/Makefile.am
index bbe85ae..4a0d2fe 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -48,7 +48,8 @@ check_PROGRAMS = \
glibmm_refptr/test \
glibmm_refptr_sigc_bind/test \
glibmm_weakref/test \
- glibmm_bytearray/test
+ glibmm_bytearray/test \
+ glibmm_ustring_make_valid/test
TESTS = $(check_PROGRAMS)
@@ -123,3 +124,4 @@ glibmm_refptr_sigc_bind_test_SOURCES = glibmm_refptr_sigc_bind/main.cc
glibmm_weakref_test_SOURCES = glibmm_weakref/main.cc
glibmm_weakref_test_LDADD = $(giomm_ldadd)
glibmm_bytearray_test_SOURCES = glibmm_bytearray/main.cc
+glibmm_ustring_make_valid_test_SOURCES = glibmm_ustring_make_valid/main.cc
diff --git a/tests/glibmm_ustring_make_valid/main.cc b/tests/glibmm_ustring_make_valid/main.cc
new file mode 100644
index 0000000..3f94122
--- /dev/null
+++ b/tests/glibmm_ustring_make_valid/main.cc
@@ -0,0 +1,58 @@
+#include <iostream>
+#include <glibmm.h>
+
+int
+main()
+{
+  Glib::init();
+
+  // Input: a byte sequence that is deliberately not valid UTF-8. Each entry
+  // is labelled with its byte offset and whether it is expected to survive
+  // ("good") or to be replaced ("bad").
+  const char raw_input[] = {
+    '\x80',                 // [0]      bad
+    '\xef', '\x80', '\x80', // [1..3]   good: three bytes, one character
+    '\xef',                 // [4]      bad
+    '\x80',                 // [5]      bad
+    'a',                    // [6]      good
+    '\0',                   // [7]      bad
+    'd',                    // [8]      good
+    '\x80',                 // [9]      bad
+    '\xef', '\x80', '\x80', // [10..12] good: three bytes, one character
+    '\xef', '\x80'          // [13..14] bad, bad
+  };
+  const char* const raw_input_end = raw_input + sizeof raw_input;
+
+  // Expected result: every "bad" byte above becomes U+FFFD (EF BF BD), and
+  // every "good" sequence is kept unchanged.
+  const char expected_bytes[] = {
+    '\xef', '\xbf', '\xbd', // replaces [0]
+    '\xef', '\x80', '\x80', // kept     [1..3]
+    '\xef', '\xbf', '\xbd', // replaces [4]
+    '\xef', '\xbf', '\xbd', // replaces [5]
+    'a',                    // kept     [6]
+    '\xef', '\xbf', '\xbd', // replaces [7]
+    'd',                    // kept     [8]
+    '\xef', '\xbf', '\xbd', // replaces [9]
+    '\xef', '\x80', '\x80', // kept     [10..12]
+    '\xef', '\xbf', '\xbd', // replaces [13]
+    '\xef', '\xbf', '\xbd'  // replaces [14]
+  };
+  const char* const expected_bytes_end = expected_bytes + sizeof expected_bytes;
+
+  // The raw input must be rejected by validate().
+  const Glib::ustring broken(raw_input, raw_input_end);
+  g_assert(!broken.validate());
+
+  // make_valid() returns a repaired copy and leaves the source untouched.
+  const Glib::ustring repaired = broken.make_valid();
+  g_assert(!broken.validate());
+  g_assert(repaired.validate());
+
+  // The repaired copy must match the hand-written expectation exactly.
+  const Glib::ustring expected(expected_bytes, expected_bytes_end);
+  g_assert(expected.validate());
+  g_assert(expected == repaired);
+
+  return EXIT_SUCCESS;
+}
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]