[gtksourceview/wip/uchardet: 1/2] Use uchardet to improve character encoding auto-detection
- From: Sébastien Wilmet <swilmet src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [gtksourceview/wip/uchardet: 1/2] Use uchardet to improve character encoding auto-detection
- Date: Thu, 26 Nov 2015 16:31:27 +0000 (UTC)
commit f3586e75fca4198eaf691af4ea1e4f0ccafcc33c
Author: Jehan <jehan girinstud io>
Date: Thu Sep 3 21:34:04 2015 +0200
Use uchardet to improve character encoding auto-detection
uchardet is a C language binding of Mozilla's charset detection library. It
performs much better than the current implementation.
The dependency is optional.
https://bugzilla.gnome.org/show_bug.cgi?id=669448
configure.ac | 23 ++++++++++
gtksourceview/Makefile.am | 9 +++-
gtksourceview/gtksourcebufferoutputstream.c | 63 ++++++++++++++++++++++++---
tests/Makefile.am | 3 +-
4 files changed, 88 insertions(+), 10 deletions(-)
---
diff --git a/configure.ac b/configure.ac
index ef83b17..6392a08 100644
--- a/configure.ac
+++ b/configure.ac
@@ -48,6 +48,7 @@ glib_req=2.47.0
gtk_req=3.19.3
libxml_req=2.6.0
gladeui_req=3.9
+uchardet_req=0.0.1
AC_CONFIG_SRCDIR([gtksourceview/gtksourcebuffer.h])
AC_CONFIG_HEADER([config.h])
@@ -140,6 +141,27 @@ AS_IF([ test "$glade_catalog" = "yes" ],
AC_MSG_RESULT([$GLADE_CATALOG_DIR])
AC_SUBST(GLADE_CATALOG_DIR)])
+# Check for uchardet
+AC_ARG_WITH([uchardet],
+ [AS_HELP_STRING([--without-uchardet],
+ [Build without uchardet support])],
+ [with_uchardet=${withval}],
+ [with_uchardet=yes])
+
+if test "x$with_uchardet" = xyes; then
+ AX_PKG_CHECK_MODULES([UCHARDET],
+ [],
+ [uchardet >= $uchardet_req],
+ [have_uchardet=yes],
+ [have_uchardet=no])
+
+ if test "x$have_uchardet" = xno; then
+ AC_MSG_ERROR([uchardet library not found or too old. Use --without-uchardet to build without the uchardet dependency.])
+ fi
+
+ AC_DEFINE([WITH_UCHARDET], [1], [Define to 1 if uchardet is available])
+fi
+
# i18n
AM_GNU_GETTEXT([external])
AM_GNU_GETTEXT_VERSION([0.19.4])
@@ -220,4 +242,5 @@ Configuration:
Glade Catalog: ${glade_catalog}
GObject introspection: ${found_introspection}
Vala: ${enable_vala}
+ uchardet: ${with_uchardet}
"
diff --git a/gtksourceview/Makefile.am b/gtksourceview/Makefile.am
index 768e887..b649b47 100644
--- a/gtksourceview/Makefile.am
+++ b/gtksourceview/Makefile.am
@@ -152,8 +152,9 @@ libgtksourceview_core_la_SOURCES = \
nodist_libgtksourceview_core_la_SOURCES = \
$(BUILT_SOURCES)
-libgtksourceview_core_la_CFLAGS = \
- $(CODE_COVERAGE_CFLAGS)
+libgtksourceview_core_la_CFLAGS = \
+ $(CODE_COVERAGE_CFLAGS) \
+ $(UCHARDET_CFLAGS)
libgtksourceview_core_la_LDFLAGS = \
-no-undefined \
@@ -174,7 +175,9 @@ libgtksourceview_3_0_la_LIBADD = \
-lm \
libgtksourceview-core.la \
completion-providers/words/libgtksourcecompletionwords.la \
- $(DEP_LIBS) $(GTK_MAC_LIBS)
+ $(DEP_LIBS) \
+ $(UCHARDET_LIBS) \
+ $(GTK_MAC_LIBS)
libgtksourceview_3_0_la_CFLAGS = \
$(CODE_COVERAGE_CFLAGS)
diff --git a/gtksourceview/gtksourcebufferoutputstream.c b/gtksourceview/gtksourcebufferoutputstream.c
index 365ae98..200a16c 100644
--- a/gtksourceview/gtksourcebufferoutputstream.c
+++ b/gtksourceview/gtksourcebufferoutputstream.c
@@ -29,6 +29,10 @@
#include "gtksourcefileloader.h"
#include "gtksourceview-i18n.h"
+#ifdef WITH_UCHARDET
+#include <uchardet.h>
+#endif
+
/* NOTE: never use async methods on this stream, the stream is just
* a wrapper around GtkTextBuffer api so that we can use GIO Stream
* methods, but the underlying code operates on a GtkTextBuffer, so
@@ -352,19 +356,41 @@ try_convert (GCharsetConverter *converter,
return ret;
}
+#ifdef WITH_UCHARDET
static GCharsetConverter *
-guess_encoding (GtkSourceBufferOutputStream *stream,
- const void *inbuf,
- gsize inbuf_size)
+guess_encoding_with_uchardet (const void *inbuf,
+ gsize inbuf_size)
{
GCharsetConverter *conv = NULL;
+ uchardet_t chardet = uchardet_new ();
+ gint success;
- if (inbuf == NULL || inbuf_size == 0)
+ success = uchardet_handle_data (chardet,
+ (const char *) inbuf,
+ (size_t) inbuf_size);
+ uchardet_data_end (chardet);
+
+ if (success == 0 &&
+ g_strcmp0 (uchardet_get_charset (chardet), "") != 0)
{
- stream->priv->is_utf8 = TRUE;
- return NULL;
+ conv = g_charset_converter_new ("UTF-8",
+ uchardet_get_charset (chardet),
+ NULL);
}
+ uchardet_delete (chardet);
+
+ return conv;
+}
+#endif /* WITH_UCHARDET */
+
+static GCharsetConverter *
+guess_encoding_fallback (GtkSourceBufferOutputStream *stream,
+ const void *inbuf,
+ gsize inbuf_size)
+{
+ GCharsetConverter *conv = NULL;
+
if (stream->priv->encodings != NULL &&
stream->priv->encodings->next == NULL)
{
@@ -440,6 +466,31 @@ guess_encoding (GtkSourceBufferOutputStream *stream,
return conv;
}
+static GCharsetConverter *
+guess_encoding (GtkSourceBufferOutputStream *stream,
+ const void *inbuf,
+ gsize inbuf_size)
+{
+ GCharsetConverter *conv = NULL;
+
+ if (inbuf == NULL || inbuf_size == 0)
+ {
+ stream->priv->is_utf8 = TRUE;
+ return NULL;
+ }
+
+#ifdef WITH_UCHARDET
+ conv = guess_encoding_with_uchardet (inbuf, inbuf_size);
+#endif
+
+ if (conv == NULL)
+ {
+ conv = guess_encoding_fallback (stream, inbuf, inbuf_size);
+ }
+
+ return conv;
+}
+
static GtkSourceNewlineType
get_newline_type (GtkTextIter *end)
{
diff --git a/tests/Makefile.am b/tests/Makefile.am
index 660a8dc..2b08b56 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -16,7 +16,8 @@ LDADD = $(top_builddir)/gtksourceview/completion-providers/words/libgtksourcecom
$(top_builddir)/gtksourceview/libgtksourceview-core.la \
-lm \
$(DEP_LIBS) \
- $(TESTS_LIBS)
+ $(TESTS_LIBS) \
+ $(UCHARDET_LIBS)
noinst_PROGRAMS = $(TEST_PROGS) $(UNIT_TEST_PROGS)
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]