[tracker] extract-mp3: Bail out on encoding detection if confidence is too low



commit ede17cc22b0c6245c030fbb45d0db60a35316c73
Author: Carlos Garnacho <carlosg gnome org>
Date:   Sun Jul 5 12:21:27 2015 +0200

    extract-mp3: Bail out on encoding detection if confidence is too low
    
    Libicu encoding detection reports how confident it is in the
    result. We should use that value: when the confidence is too low,
    the returned encoding is probably bogus, and we already have a
    default encoding to fall back on.
    
    This fixes detection for the file reported in bug #735515, where
    a couple of 'ï' chars (valid ISO-8859-1) make libicu detect UTF-16BE,
    although with extremely low confidence.
    
    https://bugzilla.gnome.org/show_bug.cgi?id=735515

 src/libtracker-extract/tracker-encoding-libicu.c |   15 +++++++++++++--
 src/libtracker-extract/tracker-encoding-libicu.h |    3 ++-
 src/libtracker-extract/tracker-encoding.c        |   12 +++++++++---
 src/libtracker-extract/tracker-encoding.h        |    3 ++-
 src/tracker-extract/tracker-extract-mp3.c        |   11 ++++++++++-
 5 files changed, 36 insertions(+), 8 deletions(-)
---
diff --git a/src/libtracker-extract/tracker-encoding-libicu.c b/src/libtracker-extract/tracker-encoding-libicu.c
index 8eb0add..3490dac 100644
--- a/src/libtracker-extract/tracker-encoding-libicu.c
+++ b/src/libtracker-extract/tracker-encoding-libicu.c
@@ -29,13 +29,15 @@
 
 gchar *
 tracker_encoding_guess_icu (const gchar *buffer,
-                           gsize        size)
+                            gsize        size,
+                            gdouble     *confidence)
 {
        UCharsetDetector *detector = NULL;
        const UCharsetMatch *match;
        gchar *charset = NULL;
        UErrorCode status = 0;
        const char *p_match = NULL;
+       int32_t conf = 0;
 
        detector = ucsdet_open (&status);
 
@@ -60,12 +62,21 @@ tracker_encoding_guess_icu (const gchar *buffer,
        if (p_match == NULL || U_FAILURE (status))
                goto failure;
 
+       conf = ucsdet_getConfidence (match, &status);
+
+       if (U_FAILURE (status))
+               goto failure;
+
         charset = g_strdup ((const gchar *) p_match);
 
        if (charset)
-               g_debug ("Guessing charset as '%s'", charset);
+               g_debug ("Guessing charset as '%s' (Confidence: %f)",
+                        charset, (gdouble) conf / 100);
 
 failure:
+       if (confidence)
+               *confidence = (gdouble) conf / 100;
+
        if (detector)
                ucsdet_close (detector);
 
diff --git a/src/libtracker-extract/tracker-encoding-libicu.h b/src/libtracker-extract/tracker-encoding-libicu.h
index 0b9b9f4..3b3f942 100644
--- a/src/libtracker-extract/tracker-encoding-libicu.h
+++ b/src/libtracker-extract/tracker-encoding-libicu.h
@@ -26,7 +26,8 @@ G_BEGIN_DECLS
 
 G_GNUC_INTERNAL
 gchar *tracker_encoding_guess_icu (const gchar *buffer,
-                                  gsize        size);
+                                   gsize        size,
+                                   gdouble     *confidence);
 
 G_END_DECLS
 
diff --git a/src/libtracker-extract/tracker-encoding.c b/src/libtracker-extract/tracker-encoding.c
index d8da3c4..ac4f976 100644
--- a/src/libtracker-extract/tracker-encoding.c
+++ b/src/libtracker-extract/tracker-encoding.c
@@ -46,9 +46,11 @@ tracker_encoding_can_guess (void)
 
 gchar *
 tracker_encoding_guess (const gchar *buffer,
-                        gsize        size)
+                        gsize        size,
+                        gdouble     *confidence)
 {
        gchar *encoding = NULL;
+       gdouble conf = 1;
 
 #ifdef HAVE_MEEGOTOUCH
        encoding = tracker_encoding_guess_meegotouch (buffer, size);
@@ -56,14 +58,18 @@ tracker_encoding_guess (const gchar *buffer,
 
 #ifdef HAVE_LIBICU_CHARSET_DETECTION
        if (!encoding)
-               encoding = tracker_encoding_guess_icu (buffer, size);
+               encoding = tracker_encoding_guess_icu (buffer, size, &conf);
 #endif /* HAVE_LIBICU_CHARSET_DETECTION */
 
 #ifdef HAVE_ENCA
-       if (!encoding)
+       if (!encoding || conf < 0.5) {
+               conf = 1;
                encoding = tracker_encoding_guess_enca (buffer, size);
+       }
 #endif /* HAVE_ENCA */
 
+       if (confidence)
+               *confidence = conf;
 
        return encoding;
 }
diff --git a/src/libtracker-extract/tracker-encoding.h b/src/libtracker-extract/tracker-encoding.h
index 3964452..ed7e51e 100644
--- a/src/libtracker-extract/tracker-encoding.h
+++ b/src/libtracker-extract/tracker-encoding.h
@@ -33,7 +33,8 @@ gboolean  tracker_encoding_can_guess (void);
 
 /* Returns NULL if it couldn't guess it */
 gchar    *tracker_encoding_guess     (const gchar *buffer,
-                                      gsize        size);
+                                      gsize        size,
+                                      gdouble     *confidence);
 
 G_END_DECLS
 
diff --git a/src/tracker-extract/tracker-extract-mp3.c b/src/tracker-extract/tracker-extract-mp3.c
index f3d1bcb..04c4c09 100644
--- a/src/tracker-extract/tracker-extract-mp3.c
+++ b/src/tracker-extract/tracker-extract-mp3.c
@@ -675,13 +675,22 @@ get_encoding (const gchar *data,
               gsize        size,
               gboolean    *encoding_found)
 {
+       gdouble confidence = 1;
        gchar *encoding;
 
        /* Try to guess encoding */
        encoding = (data && size ?
-                   tracker_encoding_guess (data, size) :
+                   tracker_encoding_guess (data, size, &confidence) :
                    NULL);
 
+       if (confidence < 0.5) {
+               /* Confidence on the results was too low, bail out and
+                * fallback to the default ISO-8859-1/Windows-1252 encoding.
+                */
+               g_free (encoding);
+               encoding = NULL;
+       }
+
        /* Notify if a proper detection was done */
        if (encoding_found) {
                *encoding_found = (encoding ? TRUE : FALSE);;


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]