[tracker/tracker-0.6] Fixes according to specs and improve handwritten MP3 genres



commit c47d8a6700fd1d621072b427ec1ff3b21d4184a0
Author: Martyn Russell <martyn imendio com>
Date:   Wed Jun 24 12:51:30 2009 +0100

    Fixes according to specs and improve handwritten MP3 genres

 configure.ac                                       |    1 +
 src/tracker-extract/tracker-extract-mp3.c          |  143 +++++++++++++++---
 utils/Makefile.am                                  |    9 +-
 utils/mp3-genre-leading-uppercase/Makefile.am      |   18 ++
 .../mp3-genre-leading-uppercase.c                  |  166 ++++++++++++++++++++
 5 files changed, 314 insertions(+), 23 deletions(-)
---
diff --git a/configure.ac b/configure.ac
index ebda38e..776d353 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1463,6 +1463,7 @@ AC_CONFIG_FILES([
 	tests/tracker-extract/Makefile
 	utils/Makefile
 	utils/albumart/Makefile
+	utils/mp3-genre-leading-uppercase/Makefile
 	utils/qdbm/Makefile
 	utils/sqlite/Makefile
 	utils/tracker-fts/Makefile
diff --git a/src/tracker-extract/tracker-extract-mp3.c b/src/tracker-extract/tracker-extract-mp3.c
index ca3a095..7dcde3b 100644
--- a/src/tracker-extract/tracker-extract-mp3.c
+++ b/src/tracker-extract/tracker-extract-mp3.c
@@ -114,7 +114,23 @@ enum {
 static void extract_mp3 (const gchar *filename,
 			 GHashTable  *metadata);
 
+/* This list is based on the comprehensive list on the French wiki
+ * page here:
+ * 
+ *   http://fr.wikipedia.org/wiki/ID3
+ * 
+ * The actual list as explained by the standard is available here but
+ * has some ~17 or so genres missing which are on the French list:
+ *
+ *   http://www.id3.org/id3v2.3.0#head-129376727ebe5309c1de1888987d070288d7c7e7
+ * 
+ * Since the index is the most important thing here and this list is
+ * not sorted alphabetically, all new IDs are only ever appended to
+ * the list and that's why we can still use the French ID3 list over
+ * the actual list on the standards website.
+ */
 static const char *const genre_names[] = {
+	/* Standard genres */
 	"Blues",
 	"Classic Rock",
 	"Country",
@@ -155,7 +171,7 @@ static const char *const genre_names[] = {
 	"Sound Clip",
 	"Gospel",
 	"Noise",
-	"Alt. Rock",
+	"AlternRock",
 	"Bass",
 	"Soul",
 	"Punk",
@@ -174,7 +190,7 @@ static const char *const genre_names[] = {
 	"Southern Rock",
 	"Comedy",
 	"Cult",
-	"Gangsta Rap",
+	"Gangsta",
 	"Top 40",
 	"Christian Rap",
 	"Pop/Funk",
@@ -182,7 +198,7 @@ static const char *const genre_names[] = {
 	"Native American",
 	"Cabaret",
 	"New Wave",
-	"Psychedelic",
+	"Psychadelic",
 	"Rave",
 	"Showtunes",
 	"Trailer",
@@ -195,11 +211,13 @@ static const char *const genre_names[] = {
 	"Musical",
 	"Rock & Roll",
 	"Hard Rock",
+
+	/* Added on December 12, 1997 in cooperation with Winamp: */
 	"Folk",
-	"Folk/Rock",
+	"Folk-Rock",
 	"National Folk",
 	"Swing",
-	"Fast-Fusion",
+	"Fast Fusion",
 	"Bebob",
 	"Latin",
 	"Revival",
@@ -226,11 +244,15 @@ static const char *const genre_names[] = {
 	"Primus",
 	"Porn Groove",
 	"Satire",
+
+	/* Added on January 26, 1998 to ensure compatibility with Winamp 1.7: */
 	"Slow Jam",
 	"Club",
 	"Tango",
 	"Samba",
 	"Folklore",
+
+	/* Added on April 13, 1998 to ensure compatibility with Winamp 1.90: */
 	"Ballad",
 	"Power Ballad",
 	"Rhythmic Soul",
@@ -238,7 +260,7 @@ static const char *const genre_names[] = {
 	"Duet",
 	"Punk Rock",
 	"Drum Solo",
-	"A Cappella",
+	"A capella",
 	"Euro-House",
 	"Dance Hall",
 	"Goa",
@@ -257,6 +279,8 @@ static const char *const genre_names[] = {
 	"Crossover",
 	"Contemporary Christian",
 	"Christian Rock",
+
+	/* Added on Jun 1, 1998 to ensure compatibility with Winamp 1.91: */
 	"Merengue",
 	"Salsa",
 	"Thrash Metal",
@@ -308,6 +332,80 @@ static TrackerExtractData extract_data[] = {
 	{ NULL, NULL }
 };
 
+/* Upper-cases, in place, the first character of each word in @genre so
+ * that hand-written genres look more like the standard genre names,
+ * for example, "fusion jazz" becomes "Fusion Jazz".
+ *
+ * A word starts at the beginning of the string and after any character
+ * whose Unicode line break type is one of the cases listed in the
+ * switch below (G_UNICODE_BREAK_SPACE, G_UNICODE_BREAK_HYPHEN, ...).
+ *
+ * @genre must be writable; NULL or invalid UTF-8 is left untouched.
+ */
+static void
+improve_handwritten_genre (gchar *genre)
+{
+	gchar *p;
+	gboolean set_next;
+
+	if (!genre || !g_utf8_validate (genre, -1, NULL)) {
+		return;
+	}
+
+	/* The first character of the string always starts a word. */
+	for (p = genre, set_next = TRUE; *p; p = g_utf8_next_char (p)) {
+		gunichar c;
+
+		c = g_utf8_get_char (p);
+
+		if (set_next) {
+			gunichar upper;
+			gint len;
+
+			/* A gunichar must not be stored through a single
+			 * gchar, that corrupts multi-byte characters. Encode
+			 * the upper case form instead and only write it
+			 * when it takes exactly the bytes it replaces.
+			 */
+			upper = g_unichar_toupper (c);
+			len = g_unichar_to_utf8 (upper, NULL);
+
+			if (len == g_utf8_next_char (p) - p) {
+				g_unichar_to_utf8 (upper, p);
+			}
+
+			set_next = FALSE;
+		}
+
+		/* Decide, from the character just visited, whether the
+		 * next character starts a new word.
+		 */
+		switch (g_unichar_break_type (c)) {
+		case G_UNICODE_BREAK_MANDATORY:
+		case G_UNICODE_BREAK_CARRIAGE_RETURN:
+		case G_UNICODE_BREAK_LINE_FEED:
+		case G_UNICODE_BREAK_COMBINING_MARK:
+		case G_UNICODE_BREAK_SURROGATE:
+		case G_UNICODE_BREAK_ZERO_WIDTH_SPACE:
+		case G_UNICODE_BREAK_INSEPARABLE:
+		case G_UNICODE_BREAK_NON_BREAKING_GLUE:
+		case G_UNICODE_BREAK_CONTINGENT:
+		case G_UNICODE_BREAK_SPACE:
+		case G_UNICODE_BREAK_HYPHEN:
+		case G_UNICODE_BREAK_EXCLAMATION:
+		case G_UNICODE_BREAK_WORD_JOINER:
+		case G_UNICODE_BREAK_NEXT_LINE:
+		case G_UNICODE_BREAK_SYMBOL:
+			set_next = TRUE;
+			break;
+
+		default:
+			/* Any other type keeps us inside the current word. */
+			break;
+		}
+	}
+}
+
 static char *
 read_id3v1_buffer (int fd, goffset size)
 {
@@ -1005,10 +1103,13 @@ get_id3v24_tags (const gchar *data,
 						if (get_genre_number (word, &genre)) {
 							g_free (word);
 							word = g_strdup (get_genre_name (genre));
-						}
+						} else {
+							if (g_ascii_strcasecmp (word, "unknown") == 0) {
+								g_free (word);
+								break;
+							} 
 
-						if (!word || strcasecmp (word, "unknown") == 0) {
-							break;
+							improve_handwritten_genre (word);
 						}
 					} else if (strcmp (tmap[i].text, "TLEN") == 0) {
 						guint32 duration;
@@ -1255,6 +1356,7 @@ get_id3v23_tags (const gchar *data,
 
 						parts = g_strsplit (word, "/", 2);
 						g_free (word);
+
 						word = g_strdup (parts[0]);
 						g_strfreev (parts);
 					} else if (strcmp (tmap[i].text, "TCON") == 0) {
@@ -1263,10 +1365,13 @@ get_id3v23_tags (const gchar *data,
 						if (get_genre_number (word, &genre)) {
 							g_free (word);
 							word = g_strdup (get_genre_name (genre));
-						}
+						} else {
+							if (g_ascii_strcasecmp (word, "unknown") == 0) {
+								g_free (word);
+								break;
+							} 
 
-						if (!word || strcasecmp (word, "unknown") == 0) {
-							break;
+							improve_handwritten_genre (word);
 						}
 					} else if (strcmp (tmap[i].text, "TLEN") == 0) {
 						guint32 duration;
@@ -1492,19 +1597,19 @@ get_id3v20_tags (const gchar *data,
 						s = g_strdup (word + strlen (word) + 1);
 						g_free (word);
 						word = s;
-					}
-
-					if (strcmp (tmap[i].text, "TCO") == 0) {
+					} else if (strcmp (tmap[i].text, "TCO") == 0) {
 						gint genre;
 
 						if (get_genre_number (word, &genre)) {
 							g_free (word);
 							word = g_strdup (get_genre_name (genre));
-						}
+						} else {
+							if (g_ascii_strcasecmp (word, "unknown") == 0) {
+								g_free (word);
+								break;
+							} 
 
-						if (!word || strcasecmp (word, "unknown") == 0) {
-							g_free (word);
-							break;
+							improve_handwritten_genre (word);
 						}
 					} else if (strcmp (tmap[i].text, "TLE") == 0) {
 						guint32 duration;
diff --git a/utils/Makefile.am b/utils/Makefile.am
index a7c271e..d2f1baa 100644
--- a/utils/Makefile.am
+++ b/utils/Makefile.am
@@ -4,8 +4,9 @@ if ENABLE_SQLITE_FTS
 build_sqlite_fts = tracker-fts
 endif
 
-SUBDIRS = 			\
-	$(build_sqlite_fts)	\
-	albumart		\
-	qdbm			\
+SUBDIRS = 				\
+	$(build_sqlite_fts)		\
+	albumart			\
+	mp3-genre-leading-uppercase	\
+	qdbm				\
 	sqlite
diff --git a/utils/mp3-genre-leading-uppercase/Makefile.am b/utils/mp3-genre-leading-uppercase/Makefile.am
new file mode 100644
index 0000000..0a0c71d
--- /dev/null
+++ b/utils/mp3-genre-leading-uppercase/Makefile.am
@@ -0,0 +1,18 @@
+include $(top_srcdir)/Makefile.decl
+
+noinst_PROGRAMS = mp3-genre-leading-uppercase
+
+INCLUDES = 								\
+	-DG_LOG_DOMAIN=\"Tracker\"					\
+	-DTRACKER_COMPILATION						\
+	-I$(top_srcdir)/src						\
+	$(WARN_CFLAGS)							\
+	$(GLIB2_CFLAGS)
+
+mp3_genre_leading_uppercase_SOURCES =					\
+	mp3-genre-leading-uppercase.c
+
+mp3_genre_leading_uppercase_LDADD =					\
+	$(top_builddir)/src/libtracker-common/libtracker-common.la 	\
+	$(GLIB2_LIBS)
+
diff --git a/utils/mp3-genre-leading-uppercase/mp3-genre-leading-uppercase.c b/utils/mp3-genre-leading-uppercase/mp3-genre-leading-uppercase.c
new file mode 100644
index 0000000..bb2fe6a
--- /dev/null
+++ b/utils/mp3-genre-leading-uppercase/mp3-genre-leading-uppercase.c
@@ -0,0 +1,166 @@
+/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
+/*
+ * Copyright (C) 2008, Nokia
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA  02110-1301, USA.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <locale.h>
+
+#include <glib/gi18n.h>
+
+#include <libtracker-common/tracker-albumart.h>
+
+static gchar	    **text;	/* non-option arguments, i.e. the genres */
+
+static GOptionEntry   entries[] = {
+	{ G_OPTION_REMAINING, 0, 0,	/* collects all remaining arguments */
+	  G_OPTION_ARG_STRING_ARRAY, &text,
+	  N_("genre"),
+	  N_("EXPRESSION")
+	},
+	{ NULL }			/* terminates the list */
+};
+
+/* Upper-cases, in place, the first character of each word in @genre so
+ * that hand-written genres look more like the standard genre names,
+ * for example, "fusion jazz" becomes "Fusion Jazz".
+ *
+ * A word starts at the beginning of the string and after any character
+ * whose Unicode line break type is one of the cases listed in the
+ * switch below (G_UNICODE_BREAK_SPACE, G_UNICODE_BREAK_HYPHEN, ...).
+ *
+ * @genre must be writable; NULL or invalid UTF-8 is left untouched.
+ */
+static void
+improve_handwritten_genre (gchar *genre)
+{
+	gchar *p;
+	gboolean set_next;
+
+	if (!genre || !g_utf8_validate (genre, -1, NULL)) {
+		return;
+	}
+
+	/* The first character of the string always starts a word. */
+	for (p = genre, set_next = TRUE; *p; p = g_utf8_next_char (p)) {
+		gunichar c;
+
+		c = g_utf8_get_char (p);
+
+		if (set_next) {
+			gunichar upper;
+			gint len;
+
+			/* A gunichar must not be stored through a single
+			 * gchar, that corrupts multi-byte characters. Encode
+			 * the upper case form instead and only write it
+			 * when it takes exactly the bytes it replaces.
+			 */
+			upper = g_unichar_toupper (c);
+			len = g_unichar_to_utf8 (upper, NULL);
+
+			if (len == g_utf8_next_char (p) - p) {
+				g_unichar_to_utf8 (upper, p);
+			}
+
+			set_next = FALSE;
+		}
+
+		/* Decide, from the character just visited, whether the
+		 * next character starts a new word.
+		 */
+		switch (g_unichar_break_type (c)) {
+		case G_UNICODE_BREAK_MANDATORY:
+		case G_UNICODE_BREAK_CARRIAGE_RETURN:
+		case G_UNICODE_BREAK_LINE_FEED:
+		case G_UNICODE_BREAK_COMBINING_MARK:
+		case G_UNICODE_BREAK_SURROGATE:
+		case G_UNICODE_BREAK_ZERO_WIDTH_SPACE:
+		case G_UNICODE_BREAK_INSEPARABLE:
+		case G_UNICODE_BREAK_NON_BREAKING_GLUE:
+		case G_UNICODE_BREAK_CONTINGENT:
+		case G_UNICODE_BREAK_SPACE:
+		case G_UNICODE_BREAK_HYPHEN:
+		case G_UNICODE_BREAK_EXCLAMATION:
+		case G_UNICODE_BREAK_WORD_JOINER:
+		case G_UNICODE_BREAK_NEXT_LINE:
+		case G_UNICODE_BREAK_SYMBOL:
+			set_next = TRUE;
+			break;
+
+		default:
+			/* Any other type keeps us inside the current word. */
+			break;
+		}
+	}
+}
+
+int
+main (int argc, char *argv[])
+{
+	/* Test tool: prints every genre argument with the first letter
+	 * of each word upper-cased; fails on bad options or no genre.
+	 */
+	GOptionContext  *context;
+	GError          *error = NULL;
+	gchar           *summary;
+	gchar          **p;
+
+	setlocale (LC_ALL, "");
+
+	context = g_option_context_new (_("- Test MP3 handwritten genre conversion to leading uppercase"));
+	summary = g_strconcat (_("You can use this to check genre strings get converted correctly, for example:"),
+			       "\n\n  \"fOo-bar bAz ping/pong sliff's & sloffs\"\n\n",
+			       _("Should be converted to"),
+			       "\n\n  \"FOo-Bar BAz Ping/Pong Sliff's & Sloffs\"",
+			       NULL);
+	g_option_context_set_summary (context, summary);
+	g_option_context_add_main_entries (context, entries, NULL);
+	g_free (summary);
+
+	/* Report unparsable options instead of ignoring the failure. */
+	if (!g_option_context_parse (context, &argc, &argv, &error)) {
+		g_printerr ("%s\n", error->message);
+		g_error_free (error);
+		g_option_context_free (context);
+		return EXIT_FAILURE;
+	}
+
+	if (!text || !*text) {
+		gchar *help;
+
+		g_printerr ("%s\n\n", _("No genre text was provided"));
+		help = g_option_context_get_help (context, TRUE, NULL);
+		g_option_context_free (context);
+		g_printerr ("%s", help);
+		g_free (help);
+
+		return EXIT_FAILURE;
+	}
+
+	g_option_context_free (context);
+
+	for (p = text; *p; p++) {
+		g_print ("\n%s:\n", _("Converted to"));
+		improve_handwritten_genre (*p);
+		g_print ("  %s\n", *p);
+	}
+
+	return EXIT_SUCCESS;
+}



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]