[glibmm/glibmm-2-62] ustring: Add make_valid()

From: Kjell Ahlstedt <kjellahl src gnome org>
To: commits-list gnome org
Cc:
Subject: [glibmm/glibmm-2-62] ustring: Add make_valid()
Date: Sun, 15 Sep 2019 19:31:03 +0000 (UTC)
commit e7a86070a9bf3402138f7874d050d5e72f9c76b0
Author: Krzysztof Piecuch <piecuch protonmail com>
Date:   Sun Sep 15 21:28:55 2019 +0200

    ustring: Add make_valid()
    
    make_valid() returns a copy of the string in which all invalid UTF-8
    bytes are replaced with the replacement character (U+FFFD). This lets
    you keep working with a ustring after ustring::validate() has reported
    that it is not valid UTF-8 and you need to rescue it somehow. This
    wraps g_utf8_make_valid().
    
    https://bugzilla.gnome.org/show_bug.cgi?id=780075
    See also issue #40
    
    This commit includes part of MR !11 by Martin Ejdestig <marejde gmail com>,
    fixing a memory leak in the original make_valid().

 glib/glibmm/ustring.cc                  |  6 ++++
 glib/glibmm/ustring.h                   |  8 ++++-
 tests/Makefile.am                       |  2 ++
 tests/glibmm_ustring_make_valid/main.cc | 58 +++++++++++++++++++++++++++++++++
 4 files changed, 73 insertions(+), 1 deletion(-)
---
diff --git a/glib/glibmm/ustring.cc b/glib/glibmm/ustring.cc
index a0f9ab80..639041ea 100644
--- a/glib/glibmm/ustring.cc
+++ b/glib/glibmm/ustring.cc
@@ -1218,6 +1218,12 @@ ustring::validate(ustring::const_iterator& first_invalid) const
   return (is_valid != 0);
 }
 
+ustring
+ustring::make_valid() const
+{
+  return convert_return_gchar_ptr_to_ustring(g_utf8_make_valid(string_.data(), string_.size()));
+}
+
 bool
 ustring::is_ascii() const
 {
diff --git a/glib/glibmm/ustring.h b/glib/glibmm/ustring.h
index f0c4d869..daa3b141 100644
--- a/glib/glibmm/ustring.h
+++ b/glib/glibmm/ustring.h
@@ -175,7 +175,7 @@ gunichar get_unichar_from_std_iterator(std::string::const_iterator pos) G_GNUC_P
  * Many member functions and operators of %Glib::ustring and Glib::ustring_Iterator
  * assume that the string contains only valid UTF-8 data. If it does not, memory
  * outside the bounds of the string can be accessed. If you're uncertain, use
- * validate().
+ * validate() and/or make_valid().
  * @par
  * In a perfect world the C++ Standard Library would contain a UTF-8 string
  * class.  Unfortunately, the C++98 standard doesn't mention UTF-8 at all.
@@ -603,6 +603,12 @@ public:
   /*! Check whether the string is valid UTF-8. */
   bool validate(const_iterator& first_invalid) const;
 
+  /*! Return a copy that is a valid UTF-8 string, replacing invalid bytes
+   * in the original with the %Unicode replacement character (U+FFFD).
+   * If the string is already valid, return an unchanged copy of it.
+   */
+  ustring make_valid() const;
+
   /*! Check whether the string is plain 7-bit ASCII. @par
    * Unlike any other ustring method, is_ascii() is safe to use on invalid
    * UTF-8 strings.  If the string isn't valid UTF-8, it cannot be valid
diff --git a/tests/Makefile.am b/tests/Makefile.am
index b7336354..895d205c 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -37,6 +37,7 @@ check_PROGRAMS =                              \
        glibmm_objectbase_move/test                     \
        glibmm_ustring_compose/test             \
        glibmm_ustring_format/test              \
+       glibmm_ustring_make_valid/test \
        glibmm_ustring_sprintf/test             \
        glibmm_value/test                       \
        glibmm_valuearray/test                  \
@@ -103,6 +104,7 @@ glibmm_object_move_test_SOURCES          = glibmm_object_move/main.cc
 glibmm_objectbase_move_test_SOURCES      = glibmm_objectbase_move/main.cc
 glibmm_ustring_compose_test_SOURCES      = glibmm_ustring_compose/main.cc
 glibmm_ustring_format_test_SOURCES       = glibmm_ustring_format/main.cc
+glibmm_ustring_make_valid_test_SOURCES   = glibmm_ustring_make_valid/main.cc
 glibmm_ustring_sprintf_test_SOURCES      = glibmm_ustring_sprintf/main.cc
 glibmm_value_test_SOURCES                = glibmm_value/glibmm_value.cc glibmm_value/main.cc
 glibmm_valuearray_test_SOURCES           = glibmm_valuearray/main.cc
diff --git a/tests/glibmm_ustring_make_valid/main.cc b/tests/glibmm_ustring_make_valid/main.cc
new file mode 100644
index 00000000..3f941225
--- /dev/null
+++ b/tests/glibmm_ustring_make_valid/main.cc
@@ -0,0 +1,58 @@
+#include <iostream>
+#include <glibmm.h>
+
+int
+main()
+{
+  Glib::init();
+
+    //                        0-1: bad character
+  const char not_utf8[] = { '\x80',
+    //                        1-4: good three bytes (one character)
+    '\xef', '\x80', '\x80',
+    //                        4-5: bad character
+    '\xef',
+    //                        5-6: bad character
+    '\x80',
+    //                        6-7: good character
+    'a',
+    //                        7-8: bad character (embedded NUL, expected to become U+FFFD)
+    '\0',
+    //                        8-9: good character
+    'd',
+    //                        9-10: bad character
+    '\x80',
+    //                        10-13: good three bytes (one character)
+    '\xef', '\x80', '\x80',
+    //                        13-15: two bad characters
+    '\xef', '\x80'
+  };
+
+  const char fixed_utf8[] = { '\xef', '\xbf', '\xbd',
+    '\xef', '\x80', '\x80',
+    '\xef', '\xbf', '\xbd',
+    '\xef', '\xbf', '\xbd',
+    'a',
+    '\xef', '\xbf', '\xbd',
+    'd',
+    '\xef', '\xbf', '\xbd',
+    '\xef', '\x80', '\x80',
+    '\xef', '\xbf', '\xbd',
+    '\xef', '\xbf', '\xbd'
+  };
+
+  // const char repl_character[] = {'\xef', '\xbf', '\xbd'};
+  const Glib::ustring s(not_utf8, not_utf8 + sizeof not_utf8);
+  g_assert(s.validate() == false);
+
+  const Glib::ustring good_one = s.make_valid();
+  g_assert(s.validate() == false); // we make a copy
+  g_assert(good_one.validate());   // this one is good!
+
+  const Glib::ustring correct_output(fixed_utf8,
+      fixed_utf8 + sizeof fixed_utf8);
+  g_assert(correct_output.validate());
+  g_assert(correct_output == good_one);
+
+  return EXIT_SUCCESS;
+}
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]