Re: glib iconv caching
- From: Matthias Clasen <mclasen redhat com>
- To: mortenw gnome org
- Cc: gtk-devel-list gnome org
- Subject: Re: glib iconv caching
- Date: Mon, 08 Aug 2005 20:20:38 -0400
On Mon, 2005-08-08 at 17:05 -0400, mortenw gnome org wrote:
> I would guess that the reason caching was needed to begin with
> was because of crazy code like (from g_utf8_collate)...
>
> gchar *str1_locale = g_convert (str1_norm, -1, charset, "UTF-8", NULL, NULL, NULL);
> gchar *str2_locale = g_convert (str2_norm, -1, charset, "UTF-8", NULL, NULL, NULL);
>
> where "charset" is from g_get_charset, i.e., in practice it never
> changes.
>
> Just keeping a single iconv around here for this would do wonders
> for efficiency. (Similar code is found in g_utf8_collate_key.)
Here is a patch which tries to address this issue by caching iconv
descriptors for utf8<->locale and utf8<->filename conversions, and using
them in appropriate places.
Matthias
? glib/actual-abi
? glib/expected-abi
Index: glib/gconvert.c
===================================================================
RCS file: /cvs/gnome/glib/glib/gconvert.c,v
retrieving revision 1.78
diff -u -p -r1.78 gconvert.c
--- glib/gconvert.c 8 Aug 2005 19:50:38 -0000 1.78
+++ glib/gconvert.c 9 Aug 2005 00:14:39 -0000
@@ -30,6 +30,7 @@
#include "glib.h"
#include "gprintfint.h"
+#include "gunicodeprivate.h"
#include "gthreadinit.h"
#ifdef G_PLATFORM_WIN32
@@ -1046,13 +1047,13 @@ g_locale_to_utf8 (const gchar *opsysstr
gsize *bytes_written,
GError **error)
{
- const char *charset;
+ GIConv cd = (GIConv)-1;
- if (g_get_charset (&charset))
+ if (_g_get_charset_iconv (NULL, &cd, NULL))
return strdup_len (opsysstring, len, bytes_read, bytes_written, error);
else
- return g_convert (opsysstring, len,
- "UTF-8", charset, bytes_read, bytes_written, error);
+ return g_convert_with_iconv (opsysstring, len,
+ cd, bytes_read, bytes_written, error);
}
/**
@@ -1086,13 +1087,13 @@ g_locale_from_utf8 (const gchar *utf8str
gsize *bytes_written,
GError **error)
{
- const gchar *charset;
+ GIConv cd = (GIConv)-1;
- if (g_get_charset (&charset))
+ if (_g_get_charset_iconv (NULL, NULL, &cd))
return strdup_len (utf8string, len, bytes_read, bytes_written, error);
else
- return g_convert (utf8string, len,
- charset, "UTF-8", bytes_read, bytes_written, error);
+ return g_convert_with_iconv (utf8string, len,
+ cd, bytes_read, bytes_written, error);
}
#ifndef G_PLATFORM_WIN32
@@ -1102,6 +1103,8 @@ typedef struct _GFilenameCharsetCache GF
struct _GFilenameCharsetCache {
gboolean is_utf8;
gchar *charset;
+ GIConv charset_to_utf8;
+ GIConv utf8_to_charset;
gchar **filename_charsets;
};
@@ -1111,44 +1114,17 @@ filename_charset_cache_free (gpointer da
GFilenameCharsetCache *cache = data;
g_free (cache->charset);
g_strfreev (cache->filename_charsets);
+ if (cache->charset_to_utf8 != (GIConv)0)
+ g_iconv_close (cache->charset_to_utf8);
+ if (cache->utf8_to_charset != (GIConv)0)
+ g_iconv_close (cache->utf8_to_charset);
g_free (cache);
}
-/**
- * g_get_filename_charsets:
- * @charsets: return location for the %NULL-terminated list of encoding names
- *
- * Determines the preferred character sets used for filenames.
- * The first character set from the @charsets is the filename encoding, the
- * subsequent character sets are used when trying to generate a displayable
- * representation of a filename, see g_filename_display_name().
- *
- * On Unix, the character sets are determined by consulting the
- * environment variables <envar>G_FILENAME_ENCODING</envar> and
- * <envar>G_BROKEN_FILENAMES</envar>. On Windows, the character set
- * used in the GLib API is always UTF-8 and said environment variables
- * have no effect.
- *
- * <envar>G_FILENAME_ENCODING</envar> may be set to a comma-separated list
- * of character set names. The special token "@locale" is taken to mean the
- * character set for the current locale. If <envar>G_FILENAME_ENCODING</envar>
- * is not set, but <envar>G_BROKEN_FILENAMES</envar> is, the character set of
- * the current locale is taken as the filename encoding. If neither environment
- * variable is set, UTF-8 is taken as the filename encoding, but the character
- * set of the current locale is also put in the list of encodings.
- *
- * The returned @charsets belong to GLib and must not be freed.
- *
- * Note that on Unix, regardless of the locale character set or
- * <envar>G_FILENAME_ENCODING</envar> value, the actual file names present on a
- * system might be in any random encoding or just gibberish.
- *
- * Return value: %TRUE if the filename encoding is UTF-8.
- *
- * Since: 2.6
- */
-gboolean
-g_get_filename_charsets (G_CONST_RETURN gchar ***filename_charsets)
+static gboolean
+get_filename_charsets_iconv (G_CONST_RETURN gchar ***filename_charsets,
+ GIConv *charset_to_utf8,
+ GIConv *utf8_to_charset)
{
static GStaticPrivate cache_private = G_STATIC_PRIVATE_INIT;
GFilenameCharsetCache *cache = g_static_private_get (&cache_private);
@@ -1207,9 +1183,64 @@ g_get_filename_charsets (G_CONST_RETURN
if (filename_charsets)
*filename_charsets = (const gchar **)cache->filename_charsets;
+ if (charset_to_utf8)
+ {
+ if (cache->charset_to_utf8 == (GIConv)0)
+ cache->charset_to_utf8 = g_iconv_open ("UTF-8",
+ cache->filename_charsets[0]);
+ *charset_to_utf8 = cache->charset_to_utf8;
+ }
+
+ if (utf8_to_charset)
+ {
+ if (cache->utf8_to_charset == (GIConv)0)
+ cache->utf8_to_charset = g_iconv_open (cache->filename_charsets[0],
+ "UTF-8");
+ *utf8_to_charset = cache->utf8_to_charset;
+ }
+
return cache->is_utf8;
}
+/**
+ * g_get_filename_charsets:
+ * @filename_charsets: return location for the %NULL-terminated list of encoding names
+ *
+ * Determines the preferred character sets used for filenames.
+ * The first character set from @filename_charsets is the filename encoding, the
+ * subsequent character sets are used when trying to generate a displayable
+ * representation of a filename, see g_filename_display_name().
+ *
+ * On Unix, the character sets are determined by consulting the
+ * environment variables <envar>G_FILENAME_ENCODING</envar> and
+ * <envar>G_BROKEN_FILENAMES</envar>. On Windows, the character set
+ * used in the GLib API is always UTF-8 and said environment variables
+ * have no effect.
+ *
+ * <envar>G_FILENAME_ENCODING</envar> may be set to a comma-separated list
+ * of character set names. The special token "@locale" is taken to mean the
+ * character set for the current locale. If <envar>G_FILENAME_ENCODING</envar>
+ * is not set, but <envar>G_BROKEN_FILENAMES</envar> is, the character set of
+ * the current locale is taken as the filename encoding. If neither environment
+ * variable is set, UTF-8 is taken as the filename encoding, but the character
+ * set of the current locale is also put in the list of encodings.
+ *
+ * The returned @filename_charsets belong to GLib and must not be freed.
+ *
+ * Note that on Unix, regardless of the locale character set or
+ * <envar>G_FILENAME_ENCODING</envar> value, the actual file names present on a
+ * system might be in any random encoding or just gibberish.
+ *
+ * Return value: %TRUE if the filename encoding is UTF-8.
+ *
+ * Since: 2.6
+ */
+gboolean
+g_get_filename_charsets (G_CONST_RETURN gchar ***filename_charsets)
+{
+ return get_filename_charsets_iconv (filename_charsets, NULL, NULL);
+}
+
#else /* G_PLATFORM_WIN32 */
gboolean
@@ -1241,20 +1272,6 @@ g_get_filename_charsets (G_CONST_RETURN
#endif /* G_PLATFORM_WIN32 */
-static gboolean
-get_filename_charset (const gchar **filename_charset)
-{
- const gchar **charsets;
- gboolean is_utf8;
-
- is_utf8 = g_get_filename_charsets (&charsets);
-
- if (filename_charset)
- *filename_charset = charsets[0];
-
- return is_utf8;
-}
-
/* This is called from g_thread_init(). It's used to
* initialize some static data in a threadsafe way.
*/
@@ -1296,13 +1313,13 @@ g_filename_to_utf8 (const gchar *opsysst
gsize *bytes_written,
GError **error)
{
- const gchar *charset;
+ GIConv cd;
- if (get_filename_charset (&charset))
+ if (get_filename_charsets_iconv (NULL, &cd, NULL))
return strdup_len (opsysstring, len, bytes_read, bytes_written, error);
else
- return g_convert (opsysstring, len,
- "UTF-8", charset, bytes_read, bytes_written, error);
+ return g_convert_with_iconv (opsysstring, len,
+ cd, bytes_read, bytes_written, error);
}
#ifdef G_OS_WIN32
@@ -1359,13 +1376,13 @@ g_filename_from_utf8 (const gchar *utf8s
gsize *bytes_written,
GError **error)
{
- const gchar *charset;
+ GIConv cd;
- if (get_filename_charset (&charset))
+ if (get_filename_charsets_iconv (NULL, NULL, &cd))
return strdup_len (utf8string, len, bytes_read, bytes_written, error);
else
- return g_convert (utf8string, len,
- charset, "UTF-8", bytes_read, bytes_written, error);
+ return g_convert_with_iconv (utf8string, len,
+ cd, bytes_read, bytes_written, error);
}
#ifdef G_OS_WIN32
Index: glib/gunicode.h
===================================================================
RCS file: /cvs/gnome/glib/glib/gunicode.h,v
retrieving revision 1.30
diff -u -p -r1.30 gunicode.h
--- glib/gunicode.h 8 Jun 2005 05:22:05 -0000 1.30
+++ glib/gunicode.h 9 Aug 2005 00:14:39 -0000
@@ -241,6 +241,7 @@ gchar* g_ucs4_to_utf8 (const gu
gint g_unichar_to_utf8 (gunichar c,
gchar *outbuf);
+
/* Validate a UTF8 string, return TRUE if valid, put pointer to
* first invalid char in **end
*/
Index: glib/gunicodeprivate.h
===================================================================
RCS file: /cvs/gnome/glib/glib/gunicodeprivate.h,v
retrieving revision 1.2
diff -u -p -r1.2 gunicodeprivate.h
--- glib/gunicodeprivate.h 2 Aug 2004 15:34:30 -0000 1.2
+++ glib/gunicodeprivate.h 9 Aug 2005 00:14:39 -0000
@@ -30,6 +30,11 @@ gunichar *_g_utf8_normalize_wc (co
GNormalizeMode mode) G_GNUC_INTERNAL;
gint _g_unichar_combining_class (gunichar uc) G_GNUC_INTERNAL;
+gboolean _g_get_charset_iconv (G_CONST_RETURN char **charset,
+ GIConv *charset_to_utf8,
+ GIConv *utf8_to_charset);
+
+
G_END_DECLS
#endif /* __G_UNICODE_PRIVATE_H__ */
Index: glib/gunicollate.c
===================================================================
RCS file: /cvs/gnome/glib/glib/gunicollate.c,v
retrieving revision 1.17
diff -u -p -r1.17 gunicollate.c
--- glib/gunicollate.c 8 Jun 2005 05:22:05 -0000 1.17
+++ glib/gunicollate.c 9 Aug 2005 00:14:39 -0000
@@ -69,9 +69,9 @@ g_utf8_collate (const gchar *str1,
#else /* !__STDC_ISO_10646__ */
- const gchar *charset;
gchar *str1_norm;
gchar *str2_norm;
+ GIConv cd;
g_return_val_if_fail (str1 != NULL, 0);
g_return_val_if_fail (str2 != NULL, 0);
@@ -79,17 +79,19 @@ g_utf8_collate (const gchar *str1,
str1_norm = g_utf8_normalize (str1, -1, G_NORMALIZE_ALL_COMPOSE);
str2_norm = g_utf8_normalize (str2, -1, G_NORMALIZE_ALL_COMPOSE);
- if (g_get_charset (&charset))
+ if (_g_get_charset_iconv (NULL, NULL, &cd))
{
result = strcoll (str1_norm, str2_norm);
}
else
{
- gchar *str1_locale = g_convert (str1_norm, -1, charset, "UTF-8", NULL, NULL, NULL);
- gchar *str2_locale = g_convert (str2_norm, -1, charset, "UTF-8", NULL, NULL, NULL);
+ gchar *str1_locale, *str2_locale;
+
+ str1_locale = g_convert_with_iconv (str1_norm, -1, cd, NULL, NULL, NULL);
+ str2_locale = g_convert_with_iconv (str2_norm, -1, cd, NULL, NULL, NULL);
if (str1_locale && str2_locale)
- result = strcoll (str1_locale, str2_locale);
+ result = strcoll (str1_locale, str2_locale);
else if (str1_locale)
result = -1;
else if (str2_locale)
@@ -212,15 +214,14 @@ g_utf8_collate_key (const gchar *str,
return result;
#else /* !__STDC_ISO_10646__ */
-
- const gchar *charset;
+ GIConv cd;
gchar *str_norm;
g_return_val_if_fail (str != NULL, NULL);
str_norm = g_utf8_normalize (str, len, G_NORMALIZE_ALL_COMPOSE);
- if (g_get_charset (&charset))
+ if (_g_get_charset_iconv (NULL, NULL, &cd))
{
xfrm_len = strxfrm (NULL, str_norm, 0);
result = g_malloc (xfrm_len + 1);
@@ -228,7 +229,7 @@ g_utf8_collate_key (const gchar *str,
}
else
{
- gchar *str_locale = g_convert (str_norm, -1, charset, "UTF-8", NULL, NULL, NULL);
+ gchar *str_locale = g_convert_with_iconv (str_norm, -1, cd, NULL, NULL, NULL);
if (str_locale)
{
Index: glib/gutf8.c
===================================================================
RCS file: /cvs/gnome/glib/glib/gutf8.c,v
retrieving revision 1.42
diff -u -p -r1.42 gutf8.c
--- glib/gutf8.c 14 Mar 2005 04:26:57 -0000 1.42
+++ glib/gutf8.c 9 Aug 2005 00:14:39 -0000
@@ -28,6 +28,7 @@
#include <string.h>
#include "glib.h"
+#include "gunicodeprivate.h"
#include "galias.h"
#ifdef G_PLATFORM_WIN32
@@ -458,6 +459,8 @@ struct _GCharsetCache {
gboolean is_utf8;
gchar *raw;
gchar *charset;
+ GIConv charset_to_utf8;
+ GIConv utf8_to_charset;
};
static void
@@ -466,6 +469,10 @@ charset_cache_free (gpointer data)
GCharsetCache *cache = data;
g_free (cache->raw);
g_free (cache->charset);
+ if (cache->charset_to_utf8 != (GIConv)0)
+ g_iconv_close (cache->charset_to_utf8);
+ if (cache->utf8_to_charset != (GIConv)0)
+ g_iconv_close (cache->utf8_to_charset);
g_free (cache);
}
@@ -490,6 +497,14 @@ charset_cache_free (gpointer data)
gboolean
g_get_charset (G_CONST_RETURN char **charset)
{
+ return _g_get_charset_iconv (charset, NULL, NULL);
+}
+
+gboolean
+_g_get_charset_iconv (G_CONST_RETURN char **charset,
+ GIConv *charset_to_utf8,
+ GIConv *utf8_to_charset)
+{
static GStaticPrivate cache_private = G_STATIC_PRIVATE_INIT;
GCharsetCache *cache = g_static_private_get (&cache_private);
const gchar *raw;
@@ -515,7 +530,21 @@ g_get_charset (G_CONST_RETURN char **cha
if (charset)
*charset = cache->charset;
-
+
+ if (charset_to_utf8)
+ {
+ if (cache->charset_to_utf8 == (GIConv)0)
+ cache->charset_to_utf8 = g_iconv_open ("UTF-8", cache->charset);
+ *charset_to_utf8 = cache->charset_to_utf8;
+ }
+
+ if (utf8_to_charset)
+ {
+ if (cache->utf8_to_charset == (GIConv)0)
+ cache->utf8_to_charset = g_iconv_open (cache->charset, "UTF-8");
+ *utf8_to_charset = cache->utf8_to_charset;
+ }
+
return cache->is_utf8;
}
@@ -1165,7 +1194,7 @@ g_utf16_to_utf8 (const gunichar2 *str,
* @str: a UTF-16 encoded string
* @len: the maximum length of @str to use. If @len < 0, then
* the string is terminated with a 0 character.
- * @items_read: location to store number of words read, or %NULL.
+ * @items_read: location to store number of words read, or %NULL.
* If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be
* returned in case @str contains a trailing partial
* character. If an error occurs then the index of the
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]