Re: glib iconv caching



On Mon, 2005-08-08 at 17:05 -0400, mortenw gnome org wrote:
> I would guess that the reason caching was needed to begin with
> was because of crazy code like (from g_utf8_collate)...
> 
>       gchar *str1_locale = g_convert (str1_norm, -1, charset, "UTF-8", NULL, NULL, NULL);
>       gchar *str2_locale = g_convert (str2_norm, -1, charset, "UTF-8", NULL, NULL, NULL);
> 
> where "charset" is from g_get_charset, i.e., in practice it never
> changes.
> 
> Just keeping a single iconv around here for this would do wonders
> for efficiency.  (Similar code is found in g_utf8_collate_key.)

Here is a patch that tries to address this issue by caching iconv
descriptors for the UTF-8<->locale and UTF-8<->filename conversions and
using them in the appropriate places.

Matthias
? glib/actual-abi
? glib/expected-abi
Index: glib/gconvert.c
===================================================================
RCS file: /cvs/gnome/glib/glib/gconvert.c,v
retrieving revision 1.78
diff -u -p -r1.78 gconvert.c
--- glib/gconvert.c	8 Aug 2005 19:50:38 -0000	1.78
+++ glib/gconvert.c	9 Aug 2005 00:14:39 -0000
@@ -30,6 +30,7 @@
 
 #include "glib.h"
 #include "gprintfint.h"
+#include "gunicodeprivate.h"
 #include "gthreadinit.h"
 
 #ifdef G_PLATFORM_WIN32
@@ -1046,13 +1047,13 @@ g_locale_to_utf8 (const gchar  *opsysstr
 		  gsize        *bytes_written,
 		  GError      **error)
 {
-  const char *charset;
+  GIConv cd = (GIConv)-1;
 
-  if (g_get_charset (&charset))
+  if (_g_get_charset_iconv (NULL, &cd, NULL))
     return strdup_len (opsysstring, len, bytes_read, bytes_written, error);
   else
-    return g_convert (opsysstring, len, 
-		      "UTF-8", charset, bytes_read, bytes_written, error);
+    return g_convert_with_iconv (opsysstring, len, 
+				 cd, bytes_read, bytes_written, error);
 }
 
 /**
@@ -1086,13 +1087,13 @@ g_locale_from_utf8 (const gchar *utf8str
 		    gsize       *bytes_written,
 		    GError     **error)
 {
-  const gchar *charset;
+  GIConv cd = (GIConv)-1;
 
-  if (g_get_charset (&charset))
+  if (_g_get_charset_iconv (NULL, NULL, &cd))
     return strdup_len (utf8string, len, bytes_read, bytes_written, error);
   else
-    return g_convert (utf8string, len,
-		      charset, "UTF-8", bytes_read, bytes_written, error);
+    return g_convert_with_iconv (utf8string, len,
+				 cd, bytes_read, bytes_written, error);
 }
 
 #ifndef G_PLATFORM_WIN32
@@ -1102,6 +1103,8 @@ typedef struct _GFilenameCharsetCache GF
 struct _GFilenameCharsetCache {
   gboolean is_utf8;
   gchar *charset;
+  GIConv charset_to_utf8;
+  GIConv utf8_to_charset;
   gchar **filename_charsets;
 };
 
@@ -1111,44 +1114,17 @@ filename_charset_cache_free (gpointer da
   GFilenameCharsetCache *cache = data;
   g_free (cache->charset);
   g_strfreev (cache->filename_charsets);
+  if (cache->charset_to_utf8 != (GIConv)0)
+    g_iconv_close (cache->charset_to_utf8);
+  if (cache->utf8_to_charset != (GIConv)0)
+    g_iconv_close (cache->utf8_to_charset);
   g_free (cache);
 }
 
-/**
- * g_get_filename_charsets:
- * @charsets: return location for the %NULL-terminated list of encoding names
- *
- * Determines the preferred character sets used for filenames.
- * The first character set from the @charsets is the filename encoding, the
- * subsequent character sets are used when trying to generate a displayable
- * representation of a filename, see g_filename_display_name().
- *
- * On Unix, the character sets are determined by consulting the
- * environment variables <envar>G_FILENAME_ENCODING</envar> and
- * <envar>G_BROKEN_FILENAMES</envar>. On Windows, the character set
- * used in the GLib API is always UTF-8 and said environment variables
- * have no effect.
- *
- * <envar>G_FILENAME_ENCODING</envar> may be set to a comma-separated list 
- * of character set names. The special token "@locale" is taken to mean the 
- * character set for the current locale. If <envar>G_FILENAME_ENCODING</envar> 
- * is not set, but <envar>G_BROKEN_FILENAMES</envar> is, the character set of 
- * the current locale is taken as the filename encoding. If neither environment
- * variable is set, UTF-8 is taken as the filename encoding, but the character
- * set of the current locale is also put in the list of encodings.
- *
- * The returned @charsets belong to GLib and must not be freed.
- *
- * Note that on Unix, regardless of the locale character set or
- * <envar>G_FILENAME_ENCODING</envar> value, the actual file names present on a
- * system might be in any random encoding or just gibberish.
- *
- * Return value: %TRUE if the filename encoding is UTF-8.
- * 
- * Since: 2.6
- */
-gboolean
-g_get_filename_charsets (G_CONST_RETURN gchar ***filename_charsets)
+static gboolean
+get_filename_charsets_iconv (G_CONST_RETURN gchar ***filename_charsets,
+			     GIConv                 *charset_to_utf8,
+			     GIConv                 *utf8_to_charset)
 {
   static GStaticPrivate cache_private = G_STATIC_PRIVATE_INIT;
   GFilenameCharsetCache *cache = g_static_private_get (&cache_private);
@@ -1207,9 +1183,64 @@ g_get_filename_charsets (G_CONST_RETURN 
   if (filename_charsets)
     *filename_charsets = (const gchar **)cache->filename_charsets;
 
+  if (charset_to_utf8)
+    {
+      if (cache->charset_to_utf8 == (GIConv)0)
+	cache->charset_to_utf8 = g_iconv_open ("UTF-8", 
+					       cache->filename_charsets[0]);
+      *charset_to_utf8 = cache->charset_to_utf8;
+    }
+
+  if (utf8_to_charset)
+    {
+      if (cache->utf8_to_charset == (GIConv)0)
+	cache->utf8_to_charset = g_iconv_open (cache->filename_charsets[0],
+					       "UTF-8");
+      *utf8_to_charset = cache->utf8_to_charset;
+    }
+
   return cache->is_utf8;
 }
 
+/**
+ * g_get_filename_charsets:
+ * @charsets: return location for the %NULL-terminated list of encoding names
+ *
+ * Determines the preferred character sets used for filenames.
+ * The first character set from the @charsets is the filename encoding, the
+ * subsequent character sets are used when trying to generate a displayable
+ * representation of a filename, see g_filename_display_name().
+ *
+ * On Unix, the character sets are determined by consulting the
+ * environment variables <envar>G_FILENAME_ENCODING</envar> and
+ * <envar>G_BROKEN_FILENAMES</envar>. On Windows, the character set
+ * used in the GLib API is always UTF-8 and said environment variables
+ * have no effect.
+ *
+ * <envar>G_FILENAME_ENCODING</envar> may be set to a comma-separated list 
+ * of character set names. The special token "@locale" is taken to mean the 
+ * character set for the current locale. If <envar>G_FILENAME_ENCODING</envar> 
+ * is not set, but <envar>G_BROKEN_FILENAMES</envar> is, the character set of 
+ * the current locale is taken as the filename encoding. If neither environment
+ * variable is set, UTF-8 is taken as the filename encoding, but the character
+ * set of the current locale is also put in the list of encodings.
+ *
+ * The returned @charsets belong to GLib and must not be freed.
+ *
+ * Note that on Unix, regardless of the locale character set or
+ * <envar>G_FILENAME_ENCODING</envar> value, the actual file names present on a
+ * system might be in any random encoding or just gibberish.
+ *
+ * Return value: %TRUE if the filename encoding is UTF-8.
+ * 
+ * Since: 2.6
+ */
+gboolean
+g_get_filename_charsets (G_CONST_RETURN gchar ***filename_charsets)
+{
+  return get_filename_charsets_iconv (filename_charsets, NULL, NULL);
+}
+
 #else /* G_PLATFORM_WIN32 */
 
 gboolean
@@ -1241,20 +1272,6 @@ g_get_filename_charsets (G_CONST_RETURN 
 
 #endif /* G_PLATFORM_WIN32 */
 
-static gboolean
-get_filename_charset (const gchar **filename_charset)
-{
-  const gchar **charsets;
-  gboolean is_utf8;
-  
-  is_utf8 = g_get_filename_charsets (&charsets);
-
-  if (filename_charset)
-    *filename_charset = charsets[0];
-  
-  return is_utf8;
-}
-
 /* This is called from g_thread_init(). It's used to
  * initialize some static data in a threadsafe way.
  */
@@ -1296,13 +1313,13 @@ g_filename_to_utf8 (const gchar *opsysst
 		    gsize       *bytes_written,
 		    GError     **error)
 {
-  const gchar *charset;
+  GIConv cd;
 
-  if (get_filename_charset (&charset))
+  if (get_filename_charsets_iconv (NULL, &cd, NULL))
     return strdup_len (opsysstring, len, bytes_read, bytes_written, error);
   else
-    return g_convert (opsysstring, len, 
-		      "UTF-8", charset, bytes_read, bytes_written, error);
+    return g_convert_with_iconv (opsysstring, len, 
+				 cd, bytes_read, bytes_written, error);
 }
 
 #ifdef G_OS_WIN32
@@ -1359,13 +1376,13 @@ g_filename_from_utf8 (const gchar *utf8s
 		      gsize       *bytes_written,
 		      GError     **error)
 {
-  const gchar *charset;
+  GIConv cd;
 
-  if (get_filename_charset (&charset))
+  if (get_filename_charsets_iconv (NULL, NULL, &cd))
     return strdup_len (utf8string, len, bytes_read, bytes_written, error);
   else
-    return g_convert (utf8string, len,
-		      charset, "UTF-8", bytes_read, bytes_written, error);
+    return g_convert_with_iconv (utf8string, len,
+				 cd, bytes_read, bytes_written, error);
 }
 
 #ifdef G_OS_WIN32
Index: glib/gunicode.h
===================================================================
RCS file: /cvs/gnome/glib/glib/gunicode.h,v
retrieving revision 1.30
diff -u -p -r1.30 gunicode.h
--- glib/gunicode.h	8 Jun 2005 05:22:05 -0000	1.30
+++ glib/gunicode.h	9 Aug 2005 00:14:39 -0000
@@ -241,6 +241,7 @@ gchar*     g_ucs4_to_utf8      (const gu
 gint      g_unichar_to_utf8 (gunichar    c,
 			     gchar      *outbuf);
 
+
 /* Validate a UTF8 string, return TRUE if valid, put pointer to
  * first invalid char in **end
  */
Index: glib/gunicodeprivate.h
===================================================================
RCS file: /cvs/gnome/glib/glib/gunicodeprivate.h,v
retrieving revision 1.2
diff -u -p -r1.2 gunicodeprivate.h
--- glib/gunicodeprivate.h	2 Aug 2004 15:34:30 -0000	1.2
+++ glib/gunicodeprivate.h	9 Aug 2005 00:14:39 -0000
@@ -30,6 +30,11 @@ gunichar *_g_utf8_normalize_wc       (co
                                       GNormalizeMode  mode) G_GNUC_INTERNAL;
 gint      _g_unichar_combining_class (gunichar uc) G_GNUC_INTERNAL;
 
+gboolean  _g_get_charset_iconv       (G_CONST_RETURN char **charset,
+				      GIConv               *charset_to_utf8,
+				      GIConv               *utf8_to_charset);
+
+
 G_END_DECLS
 
 #endif /* __G_UNICODE_PRIVATE_H__ */
Index: glib/gunicollate.c
===================================================================
RCS file: /cvs/gnome/glib/glib/gunicollate.c,v
retrieving revision 1.17
diff -u -p -r1.17 gunicollate.c
--- glib/gunicollate.c	8 Jun 2005 05:22:05 -0000	1.17
+++ glib/gunicollate.c	9 Aug 2005 00:14:39 -0000
@@ -69,9 +69,9 @@ g_utf8_collate (const gchar *str1,
 
 #else /* !__STDC_ISO_10646__ */
 
-  const gchar *charset;
   gchar *str1_norm;
   gchar *str2_norm;
+  GIConv cd;
 
   g_return_val_if_fail (str1 != NULL, 0);
   g_return_val_if_fail (str2 != NULL, 0);
@@ -79,17 +79,19 @@ g_utf8_collate (const gchar *str1,
   str1_norm = g_utf8_normalize (str1, -1, G_NORMALIZE_ALL_COMPOSE);
   str2_norm = g_utf8_normalize (str2, -1, G_NORMALIZE_ALL_COMPOSE);
 
-  if (g_get_charset (&charset))
+  if (_g_get_charset_iconv (NULL, NULL, &cd))
     {
       result = strcoll (str1_norm, str2_norm);
     }
   else
     {
-      gchar *str1_locale = g_convert (str1_norm, -1, charset, "UTF-8", NULL, NULL, NULL);
-      gchar *str2_locale = g_convert (str2_norm, -1, charset, "UTF-8", NULL, NULL, NULL);
+      gchar *str1_locale, *str2_locale;
+
+      str1_locale = g_convert_with_iconv (str1_norm, -1, cd, NULL, NULL, NULL);
+      str2_locale = g_convert_with_iconv (str2_norm, -1, cd, NULL, NULL, NULL);
 
       if (str1_locale && str2_locale)
-	result =  strcoll (str1_locale, str2_locale);
+	result = strcoll (str1_locale, str2_locale);
       else if (str1_locale)
 	result = -1;
       else if (str2_locale)
@@ -212,15 +214,14 @@ g_utf8_collate_key (const gchar *str,
 
   return result;
 #else /* !__STDC_ISO_10646__ */
-
-  const gchar *charset;
+  GIConv cd;
   gchar *str_norm;
 
   g_return_val_if_fail (str != NULL, NULL);
 
   str_norm = g_utf8_normalize (str, len, G_NORMALIZE_ALL_COMPOSE);
 
-  if (g_get_charset (&charset))
+  if (_g_get_charset_iconv (NULL, NULL, &cd))
     {
       xfrm_len = strxfrm (NULL, str_norm, 0);
       result = g_malloc (xfrm_len + 1);
@@ -228,7 +229,7 @@ g_utf8_collate_key (const gchar *str,
     }
   else
     {
-      gchar *str_locale = g_convert (str_norm, -1, charset, "UTF-8", NULL, NULL, NULL);
+      gchar *str_locale = g_convert_with_iconv (str_norm, -1, cd, NULL, NULL, NULL);
 
       if (str_locale)
 	{
Index: glib/gutf8.c
===================================================================
RCS file: /cvs/gnome/glib/glib/gutf8.c,v
retrieving revision 1.42
diff -u -p -r1.42 gutf8.c
--- glib/gutf8.c	14 Mar 2005 04:26:57 -0000	1.42
+++ glib/gutf8.c	9 Aug 2005 00:14:39 -0000
@@ -28,6 +28,7 @@
 #include <string.h>
 
 #include "glib.h"
+#include "gunicodeprivate.h"
 #include "galias.h"
 
 #ifdef G_PLATFORM_WIN32
@@ -458,6 +459,8 @@ struct _GCharsetCache {
   gboolean is_utf8;
   gchar *raw;
   gchar *charset;
+  GIConv charset_to_utf8;
+  GIConv utf8_to_charset;
 };
 
 static void
@@ -466,6 +469,10 @@ charset_cache_free (gpointer data)
   GCharsetCache *cache = data;
   g_free (cache->raw);
   g_free (cache->charset);
+  if (cache->charset_to_utf8 != (GIConv)0)
+    g_iconv_close (cache->charset_to_utf8);
+  if (cache->utf8_to_charset != (GIConv)0)
+    g_iconv_close (cache->utf8_to_charset);
   g_free (cache);
 }
 
@@ -490,6 +497,14 @@ charset_cache_free (gpointer data)
 gboolean
 g_get_charset (G_CONST_RETURN char **charset) 
 {
+  return _g_get_charset_iconv (charset, NULL, NULL);
+}
+
+gboolean
+_g_get_charset_iconv (G_CONST_RETURN char **charset,
+		      GIConv               *charset_to_utf8,
+		      GIConv               *utf8_to_charset) 
+{
   static GStaticPrivate cache_private = G_STATIC_PRIVATE_INIT;
   GCharsetCache *cache = g_static_private_get (&cache_private);
   const gchar *raw;
@@ -515,7 +530,21 @@ g_get_charset (G_CONST_RETURN char **cha
 
   if (charset)
     *charset = cache->charset;
-  
+
+  if (charset_to_utf8)
+    {
+      if (cache->charset_to_utf8 == (GIConv)0)
+	cache->charset_to_utf8 = g_iconv_open ("UTF-8", cache->charset);
+      *charset_to_utf8 = cache->charset_to_utf8;
+    }
+
+  if (utf8_to_charset)
+    {
+      if (cache->utf8_to_charset == (GIConv)0)
+	cache->utf8_to_charset = g_iconv_open (cache->charset, "UTF-8");
+      *utf8_to_charset = cache->utf8_to_charset;
+    }
+
   return cache->is_utf8;
 }
 
@@ -1165,7 +1194,7 @@ g_utf16_to_utf8 (const gunichar2  *str,
  * @str: a UTF-16 encoded string
  * @len: the maximum length of @str to use. If @len < 0, then
  *       the string is terminated with a 0 character.
- * @items_read: location to store number of words read, or %NULL.
+ * @items_read: location to store number of words read, or %NULL. 
  *              If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be
  *              returned in case @str contains a trailing partial
  *              character. If an error occurs then the index of the


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]