g_utf8_salvage
- From: Robert Brady <robert susu org uk>
- To: gtk-devel-list gnome org
- Subject: g_utf8_salvage
- Date: Wed, 8 Nov 2000 04:16:00 +0000 (GMT)
Here is a patch to add a new function, g_utf8_salvage. It does the following:
/* Salvage a UTF-8 string, return a g_malloced string which is
the same, but with invalid UTF-8 sequences replaced with
U+FFFD */
gchar *g_utf8_salvage (const gchar *str);
This complies with the handling set out in Markus Kuhn's text file
UTF-8-test.txt, which is available from
<http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt>
and which seems as good a reference as any. (It is also what xterm does.)
(It includes my previous patch)
(It also adds _uc variants of g_string_append_c and g_string_insert_c. If
this is thought to be a good idea, shall I add _uc variants for all the others?)
--
Robert
Index: gutf8.c
===================================================================
RCS file: /cvs/gnome/glib/gutf8.c,v
retrieving revision 1.7
diff -u -r1.7 gutf8.c
--- gutf8.c 2000/09/18 14:55:24 1.7
+++ gutf8.c 2000/11/08 04:03:22
@@ -548,6 +548,12 @@
gint max_len,
const gchar **end)
{
+ static int min_ucs_for_len[] =
+ { 0, 0, 0x0080,
+ 0x0800,
+ 0x10000,
+ 0x00200000,
+ 0x04000000 };
const gchar *p;
gboolean retval = TRUE;
@@ -581,12 +587,22 @@
UTF8_GET (result, p, i, mask, len);
- if (result == (gunichar)-1)
+ if (result == (gunichar)-1 ||
+ (result >= 0xd800 &&
+ result <= 0xdfff) ||
+ result == 0xfffe ||
+ result == 0xffff)
{
retval = FALSE;
break;
}
+ if (result < min_ucs_for_len[len])
+ {
+ retval = FALSE;
+ break;
+ }
+
p += len;
}
@@ -596,4 +612,44 @@
return retval;
}
+gchar * /* g_utf8_salvage: build a copy of str in which each invalid UTF-8 sequence is replaced by U+FFFD */
+g_utf8_salvage (const gchar *str) /* str is read up to its terminating 0 byte; the returned buffer is detached from the GString and not freed here */
+{
+ GString *new_str = g_string_new (NULL);
+ gchar *retval = NULL, *error = NULL;
+ while (*str)
+ {
+ gint skip;
+ error = NULL;
+ if (g_utf8_validate (str, strlen(str), (const gchar **)&error)) /* the whole remaining tail validated: append it and finish */
+ {
+ g_string_append (new_str, str);
+ retval = new_str->str;
+ g_string_free (new_str, FALSE); /* NOTE(review): FALSE presumably keeps the character buffer alive - confirm against GString docs */
+ return retval;
+ }
+ if (error > str) /* NOTE(review): assumes g_utf8_validate stored the stop position in error - confirm; copies the bytes before it */
+ new_str = g_string_append_len (new_str, str, error - str);
+
+ g_string_append_uc (new_str, 0xfffd); /* one replacement character per rejected sequence */
+ skip = g_utf8_skip[(guchar)*error]; /* NOTE(review): presumably the sequence length implied by the lead byte - confirm against g_utf8_skip */
+ str = error;
+ if (skip) /* step over the offending first byte */
+ {
+ skip--;
+ str++;
+ }
+ while (skip && ((((guchar)(*str)) & 0xc0)==0x80)) /* then over following 10xxxxxx bytes, at most the remaining skip count */
+ {
+ str++;
+ skip--;
+ }
+ if (str == error) /* nothing was skipped above: advance one byte so the loop makes progress */
+ str++;
+ }
+ new_str = g_string_append (new_str, str); /* reached only when the loop ends with *str == 0 */
+ retval = new_str->str;
+ g_string_free (new_str, FALSE);
+ return retval;
+}
Index: gunicode.h
===================================================================
RCS file: /cvs/gnome/glib/gunicode.h,v
retrieving revision 1.10
diff -u -r1.10 gunicode.h
--- gunicode.h 2000/10/19 15:21:03 1.10
+++ gunicode.h 2000/11/08 04:03:23
@@ -191,6 +191,12 @@
gint max_len,
const gchar **end);
+/* Salvage a UTF-8 string, return a g_malloced string which is
+ the same, but with invalid UTF-8 sequences replaced with
+ U+FFFD */
+
+gchar *g_utf8_salvage (const gchar *str);
+
G_END_DECLS
#endif /* __G_UNICODE_H__ */
Index: gstring.c
===================================================================
RCS file: /cvs/gnome/glib/gstring.c,v
retrieving revision 1.21
diff -u -r1.21 gstring.c
--- gstring.c 2000/10/27 02:46:03 1.21
+++ gstring.c 2000/11/08 04:03:27
@@ -430,6 +430,15 @@
}
GString*
+g_string_append_uc (GString *fstring, /* g_string_append_uc: add character c to fstring by delegating to g_string_insert_uc */
+ gunichar c)
+{
+ g_return_val_if_fail (fstring != NULL, NULL);
+
+ return g_string_insert_uc (fstring, -1, c); /* NOTE(review): position -1 presumably means "at the end" - confirm against g_string_insert_len */
+}
+
+GString*
g_string_prepend (GString *fstring,
const gchar *val)
{
@@ -469,6 +478,20 @@
g_return_val_if_fail (pos <= fstring->len, fstring);
return g_string_insert_len (fstring, pos, val, -1);
+}
+
+GString* /* g_string_insert_uc: encode val with g_unichar_to_utf8 and insert the result into fstring at pos */
+g_string_insert_uc (GString *fstring,
+ gint pos,
+ gunichar val)
+{
+ gchar str[10]; /* scratch buffer for the encoded character plus its terminating 0 */
+ g_return_val_if_fail (fstring != NULL, NULL);
+ g_return_val_if_fail (pos <= fstring->len, fstring);
+
+ str[g_unichar_to_utf8 (val, str)] = 0; /* return value is used as the terminator index; presumably the number of bytes written - confirm */
+
+ return g_string_insert (fstring, pos, str);
}
GString*
Index: gstring.h
===================================================================
RCS file: /cvs/gnome/glib/gstring.h,v
retrieving revision 1.2
diff -u -r1.2 gstring.h
--- gstring.h 2000/10/27 02:46:03 1.2
+++ gstring.h 2000/11/08 04:03:27
@@ -28,6 +28,7 @@
#define __G_STRING_H__
#include <gtypes.h>
+#include <gunicode.h>
G_BEGIN_DECLS
@@ -76,6 +77,8 @@
gint len);
GString* g_string_append_c (GString *string,
gchar c);
+GString* g_string_append_uc (GString *string,
+ gunichar c);
GString* g_string_prepend (GString *string,
const gchar *val);
GString* g_string_prepend_c (GString *string,
@@ -89,6 +92,9 @@
GString* g_string_insert_c (GString *string,
gint pos,
gchar c);
+GString* g_string_insert_uc (GString *string,
+ gint pos,
+ gunichar wc);
GString* g_string_erase (GString *string,
gint pos,
gint len);
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]