[gtksourceview/wip/uchardet: 1/2] Use uchardet to improve character encoding auto-detection



commit f3586e75fca4198eaf691af4ea1e4f0ccafcc33c
Author: Jehan <jehan girinstud io>
Date:   Thu Sep 3 21:34:04 2015 +0200

    Use uchardet to improve character encoding auto-detection
    
    uchardet is a C language binding to Mozilla's encoding detection
    library. It performs much better than the current implementation.
    
    The dependency is optional.
    
    https://bugzilla.gnome.org/show_bug.cgi?id=669448

 configure.ac                                |   23 ++++++++++
 gtksourceview/Makefile.am                   |    9 +++-
 gtksourceview/gtksourcebufferoutputstream.c |   63 ++++++++++++++++++++++++---
 tests/Makefile.am                           |    3 +-
 4 files changed, 88 insertions(+), 10 deletions(-)
---
diff --git a/configure.ac b/configure.ac
index ef83b17..6392a08 100644
--- a/configure.ac
+++ b/configure.ac
@@ -48,6 +48,7 @@ glib_req=2.47.0
 gtk_req=3.19.3
 libxml_req=2.6.0
 gladeui_req=3.9
+uchardet_req=0.0.1
 
 AC_CONFIG_SRCDIR([gtksourceview/gtksourcebuffer.h])
 AC_CONFIG_HEADER([config.h])
@@ -140,6 +141,27 @@ AS_IF([ test "$glade_catalog" = "yes" ],
         AC_MSG_RESULT([$GLADE_CATALOG_DIR])
         AC_SUBST(GLADE_CATALOG_DIR)])
 
+# Check for uchardet
+AC_ARG_WITH([uchardet],
+           [AS_HELP_STRING([--without-uchardet],
+                           [Build without uchardet support])],
+           [with_uchardet=${withval}],
+           [with_uchardet=yes])
+
+if test "x$with_uchardet" = xyes; then
+       AX_PKG_CHECK_MODULES([UCHARDET],
+                            [],
+                            [uchardet >= $uchardet_req],
+                            [have_uchardet=yes],
+                            [have_uchardet=no])
+
+       if test "x$have_uchardet" = xno; then
+               AC_MSG_ERROR([uchardet library not found or too old. Use --without-uchardet to build without the uchardet dependency.])
+       fi
+
+       AC_DEFINE([WITH_UCHARDET], [1], [Define to 1 if uchardet is available])
+fi
+
 # i18n
 AM_GNU_GETTEXT([external])
 AM_GNU_GETTEXT_VERSION([0.19.4])
@@ -220,4 +242,5 @@ Configuration:
        Glade Catalog:          ${glade_catalog}
        GObject introspection:  ${found_introspection}
        Vala:                   ${enable_vala}
+       uchardet:               ${with_uchardet}
 "
diff --git a/gtksourceview/Makefile.am b/gtksourceview/Makefile.am
index 768e887..b649b47 100644
--- a/gtksourceview/Makefile.am
+++ b/gtksourceview/Makefile.am
@@ -152,8 +152,9 @@ libgtksourceview_core_la_SOURCES =          \
 nodist_libgtksourceview_core_la_SOURCES = \
        $(BUILT_SOURCES)
 
-libgtksourceview_core_la_CFLAGS =      \
-       $(CODE_COVERAGE_CFLAGS)
+libgtksourceview_core_la_CFLAGS =      \
+       $(CODE_COVERAGE_CFLAGS)         \
+       $(UCHARDET_CFLAGS)
 
 libgtksourceview_core_la_LDFLAGS =     \
        -no-undefined                   \
@@ -174,7 +175,9 @@ libgtksourceview_3_0_la_LIBADD =                                    \
        -lm                                                             \
        libgtksourceview-core.la                                        \
        completion-providers/words/libgtksourcecompletionwords.la       \
-       $(DEP_LIBS) $(GTK_MAC_LIBS)
+       $(DEP_LIBS)                                                     \
+       $(UCHARDET_LIBS)                                                \
+       $(GTK_MAC_LIBS)
 
 libgtksourceview_3_0_la_CFLAGS =       \
        $(CODE_COVERAGE_CFLAGS)
diff --git a/gtksourceview/gtksourcebufferoutputstream.c b/gtksourceview/gtksourcebufferoutputstream.c
index 365ae98..200a16c 100644
--- a/gtksourceview/gtksourcebufferoutputstream.c
+++ b/gtksourceview/gtksourcebufferoutputstream.c
@@ -29,6 +29,10 @@
 #include "gtksourcefileloader.h"
 #include "gtksourceview-i18n.h"
 
+#ifdef WITH_UCHARDET
+#include <uchardet.h>
+#endif
+
 /* NOTE: never use async methods on this stream, the stream is just
  * a wrapper around GtkTextBuffer api so that we can use GIO Stream
  * methods, but the underlying code operates on a GtkTextBuffer, so
@@ -352,19 +356,41 @@ try_convert (GCharsetConverter *converter,
        return ret;
 }
 
+#ifdef WITH_UCHARDET
 static GCharsetConverter *
-guess_encoding (GtkSourceBufferOutputStream *stream,
-               const void                  *inbuf,
-               gsize                        inbuf_size)
+guess_encoding_with_uchardet (const void *inbuf,
+                             gsize       inbuf_size)
 {
        GCharsetConverter *conv = NULL;
+       uchardet_t chardet = uchardet_new ();
+       gint success;
 
-       if (inbuf == NULL || inbuf_size == 0)
+       success = uchardet_handle_data (chardet,
+                                       (const char *) inbuf,
+                                       (size_t) inbuf_size);
+       uchardet_data_end (chardet);
+
+       if (success == 0 &&
+           g_strcmp0 (uchardet_get_charset (chardet), "") != 0)
        {
-               stream->priv->is_utf8 = TRUE;
-               return NULL;
+               conv = g_charset_converter_new ("UTF-8",
+                                               uchardet_get_charset (chardet),
+                                               NULL);
        }
 
+       uchardet_delete (chardet);
+
+       return conv;
+}
+#endif /* WITH_UCHARDET */
+
+static GCharsetConverter *
+guess_encoding_fallback (GtkSourceBufferOutputStream *stream,
+                        const void                  *inbuf,
+                        gsize                        inbuf_size)
+{
+       GCharsetConverter *conv = NULL;
+
        if (stream->priv->encodings != NULL &&
            stream->priv->encodings->next == NULL)
        {
@@ -440,6 +466,31 @@ guess_encoding (GtkSourceBufferOutputStream *stream,
        return conv;
 }
 
+static GCharsetConverter *
+guess_encoding (GtkSourceBufferOutputStream *stream,
+               const void                  *inbuf,
+               gsize                        inbuf_size)
+{
+       GCharsetConverter *conv = NULL;
+
+       if (inbuf == NULL || inbuf_size == 0)
+       {
+               stream->priv->is_utf8 = TRUE;
+               return NULL;
+       }
+
+#ifdef WITH_UCHARDET
+       conv = guess_encoding_with_uchardet (inbuf, inbuf_size);
+#endif
+
+       if (conv == NULL)
+       {
+               conv = guess_encoding_fallback (stream, inbuf, inbuf_size);
+       }
+
+       return conv;
+}
+
 static GtkSourceNewlineType
 get_newline_type (GtkTextIter *end)
 {
diff --git a/tests/Makefile.am b/tests/Makefile.am
index 660a8dc..2b08b56 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -16,7 +16,8 @@ LDADD = $(top_builddir)/gtksourceview/completion-providers/words/libgtksourcecom
        $(top_builddir)/gtksourceview/libgtksourceview-core.la \
        -lm \
        $(DEP_LIBS) \
-       $(TESTS_LIBS)
+       $(TESTS_LIBS) \
+       $(UCHARDET_LIBS)
 
 noinst_PROGRAMS = $(TEST_PROGS) $(UNIT_TEST_PROGS)
 


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]