Re: URIs vs. half-baked URIs [glib PATCH]
- From: Alex Larsson <alexl redhat com>
- To: Owen Taylor <otaylor redhat com>
- Cc: Darin Adler <darin bentspoon com>, <gtk-devel-list gnome org>, <gnome-hackers gnome org>
- Subject: Re: URIs vs. half-baked URIs [glib PATCH]
- Date: Wed, 8 Aug 2001 16:00:08 -0400 (EDT)
OK, new version. This one includes g_ascii_isspace(), which I needed for the
GTK file selector drag-and-drop work.
/ alex
Index: glib/gconvert.c
===================================================================
RCS file: /cvs/gnome/glib/glib/gconvert.c,v
retrieving revision 1.16
diff -u -p -r1.16 gconvert.c
--- glib/gconvert.c 2001/06/23 13:55:07 1.16
+++ glib/gconvert.c 2001/08/08 20:02:48
@@ -519,11 +519,24 @@ static gchar *
strdup_len (const gchar *string,
gssize len,
gsize *bytes_written,
- gsize *bytes_read)
+ gsize *bytes_read,
+ GError **error)
{
gsize real_len;
+ if (!g_utf8_validate (string, -1, NULL))
+ {
+ if (bytes_read)
+ *bytes_read = 0;
+ if (bytes_written)
+ *bytes_written = 0;
+
+ g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
+ _("Invalid byte sequence in conversion input"));
+ return NULL;
+ }
+
if (len < 0)
real_len = strlen (string);
else
@@ -674,7 +687,7 @@ g_locale_to_utf8 (const gchar *opsysstr
const char *charset;
if (g_get_charset (&charset))
- return strdup_len (opsysstring, len, bytes_read, bytes_written);
+ return strdup_len (opsysstring, len, bytes_read, bytes_written, error);
else
return g_convert (opsysstring, len,
"UTF-8", charset, bytes_read, bytes_written, error);
@@ -820,7 +833,7 @@ g_locale_from_utf8 (const gchar *utf8str
const gchar *charset;
if (g_get_charset (&charset))
- return strdup_len (utf8string, len, bytes_read, bytes_written);
+ return strdup_len (utf8string, len, bytes_read, bytes_written, error);
else
return g_convert (utf8string, len,
charset, "UTF-8", bytes_read, bytes_written, error);
@@ -863,12 +876,13 @@ g_filename_to_utf8 (const gchar *opsysst
bytes_read, bytes_written,
error);
#else /* !G_PLATFORM_WIN32 */
+
if (getenv ("G_BROKEN_FILENAMES"))
return g_locale_to_utf8 (opsysstring, len,
bytes_read, bytes_written,
error);
else
- return strdup_len (opsysstring, len, bytes_read, bytes_written);
+ return strdup_len (opsysstring, len, bytes_read, bytes_written, error);
#endif /* !G_PLATFORM_WIN32 */
}
@@ -911,6 +925,363 @@ g_filename_from_utf8 (const gchar *utf8s
bytes_read, bytes_written,
error);
else
- return strdup_len (utf8string, len, bytes_read, bytes_written);
+ return strdup_len (utf8string, len, bytes_read, bytes_written, error);
#endif /* !G_PLATFORM_WIN32 */
}
+
+/* Tests whether haystack begins with needle, comparing ASCII letters
+ * case-insensitively. haystack may be UTF-8, but needle must contain
+ * only ASCII. A NULL argument is treated as the empty string, so an
+ * empty or NULL needle is a prefix of every haystack. */
+static gboolean
+has_case_prefix (const gchar *haystack, const gchar *needle)
+{
+  const gchar *hp = (haystack != NULL) ? haystack : "";
+  const gchar *np = (needle != NULL) ? needle : "";
+
+  /* Walk both strings in lock step until the needle is used up. */
+  for (; *np != '\0'; hp++, np++)
+    {
+      /* The haystack ended before the needle did. */
+      if (*hp == '\0')
+        return FALSE;
+
+      if (g_ascii_tolower (*hp) != g_ascii_tolower (*np))
+        return FALSE;
+    }
+
+  return TRUE;
+}
+
+/* Escaping modes for g_escape_uri_string(). Each value is a distinct bit
+ * so that one byte per character in acceptable[] can record, for every
+ * mode at once, whether that character may be left unescaped. */
+typedef enum {
+ UNSAFE_ALL = 0x1, /* Escape all unsafe characters */
+ UNSAFE_ALLOW_PLUS = 0x2, /* Allows '+' */
+ UNSAFE_PATH = 0x4, /* Allows '/' and '?' and '&' and '=' */
+ UNSAFE_DOS_PATH = 0x8, /* Allows '/' and '?' and '&' and '=' and ':' */
+ UNSAFE_HOST = 0x10, /* Allows '/' and ':' and '@' */
+ UNSAFE_SLASHES = 0x20 /* Allows all characters except for '/' and '%' */
+} UnsafeCharacterSet;
+
+/* Lookup table indexed by (character code - 32), covering codes 32..127.
+ * Each entry is a bitmask of UnsafeCharacterSet values: a set bit means
+ * the character is passed through unescaped in that mode. An entry of
+ * 0x00 (space, '%') is escaped in every mode; 0x3F is never escaped. */
+static const guchar acceptable[96] = {
+ /* X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
+ 0x00,0x3F,0x20,0x20,0x20,0x00,0x2C,0x3F,0x3F,0x3F,0x3F,0x22,0x20,0x3F,0x3F,0x1C, /* 2X !"#$%&'()*+,-./ */
+ 0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x38,0x20,0x20,0x2C,0x20,0x2C, /* 3X 0123456789:;<=>? */
+ 0x30,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F, /* 4X @ABCDEFGHIJKLMNO */
+ 0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x20,0x20,0x20,0x20,0x3F, /* 5X PQRSTUVWXYZ[\]^_ */
+ 0x20,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F, /* 6X `abcdefghijklmno */
+ 0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x20,0x20,0x20,0x3F,0x20 /* 7X pqrstuvwxyz{|}~DEL */
+};
+
+/* Character that introduces a two-digit hexadecimal escape in a URI. */
+#define HEX_ESCAPE '%'
+
+/* Upper-case hexadecimal digits, indexed by nibble value. The array has
+ * exactly 16 elements, so it holds no terminating NUL and must not be
+ * used as a C string. */
+static const gchar hex[16] = "0123456789ABCDEF";
+
+/* Returns a newly allocated copy of string in which every byte that is
+ * not acceptable under mask is replaced by a "%XX" escape using upper-case
+ * hexadecimal digits. Bytes outside the range 32..127 (including every
+ * byte of a multi-byte UTF-8 sequence) are always escaped.
+ *
+ * mask must be exactly one UnsafeCharacterSet value. Returns NULL if
+ * string is NULL or mask is invalid. Free the result with g_free(). */
+static gchar *
+g_escape_uri_string (const gchar *string,
+                     UnsafeCharacterSet mask)
+{
+#define ACCEPTABLE(a) ((a)>=32 && (a)<128 && (acceptable[(a)-32] & use_mask))
+
+  const gchar *p;
+  gchar *q;
+  gchar *result;
+  int c;
+  gint unacceptable;
+  UnsafeCharacterSet use_mask;
+
+  g_return_val_if_fail (mask == UNSAFE_ALL
+                        || mask == UNSAFE_ALLOW_PLUS
+                        || mask == UNSAFE_PATH
+                        || mask == UNSAFE_DOS_PATH
+                        || mask == UNSAFE_HOST
+                        || mask == UNSAFE_SLASHES, NULL);
+
+  if (string == NULL)
+    return NULL;
+
+  use_mask = mask;
+
+  /* First pass: count the bytes that need escaping so the output buffer
+   * can be sized exactly (each escaped byte grows from 1 to 3 bytes). */
+  unacceptable = 0;
+  for (p = string; *p != '\0'; p++)
+    {
+      /* Read the byte as unsigned: gchar may be signed, and a negative
+       * value would index outside hex[] in the second pass. */
+      c = (guchar) *p;
+      if (!ACCEPTABLE (c))
+        unacceptable++;
+    }
+
+  result = g_malloc (p - string + unacceptable * 2 + 1);
+
+  /* Second pass: copy acceptable bytes and expand the rest to "%XX". */
+  for (q = result, p = string; *p != '\0'; p++)
+    {
+      c = (guchar) *p;
+
+      if (!ACCEPTABLE (c))
+        {
+          *q++ = HEX_ESCAPE; /* means hex coming */
+          *q++ = hex[c >> 4];
+          *q++ = hex[c & 15];
+        }
+      else
+        *q++ = *p;
+    }
+
+  *q = '\0';
+
+  return result;
+}
+
+
+/* Builds a "file://" URI from an optional hostname and a pathname, both
+ * UTF-8. The hostname is escaped with UNSAFE_HOST and the path with
+ * UNSAFE_DOS_PATH; a '/' is inserted before the path if the escaped
+ * path does not already start with one. pathname must not be NULL.
+ * Returns a newly allocated string that the caller frees with g_free(). */
+static gchar *
+g_escape_file_uri (const gchar *hostname,
+                   const gchar *pathname)
+{
+  char *escaped_hostname = NULL;
+  char *escaped_path;
+  char *res;
+
+  if (hostname)
+    {
+      escaped_hostname = g_escape_uri_string (hostname, UNSAFE_HOST);
+    }
+
+  escaped_path = g_escape_uri_string (pathname, UNSAFE_DOS_PATH);
+
+  res = g_strconcat ("file://",
+                     (escaped_hostname)?escaped_hostname:"",
+                     (*escaped_path!='/')?"/":"",
+                     escaped_path,
+                     NULL);
+
+  /* g_strconcat() copied the pieces; release the temporaries so they
+   * are not leaked. g_free() accepts NULL. */
+  g_free (escaped_hostname);
+  g_free (escaped_path);
+
+  return res;
+}
+
+/* Returns the value (0..15) of the hexadecimal digit c, accepting both
+ * upper and lower case letters, or -1 if c is not a hex digit. */
+static int
+hex_to_int (gchar c)
+{
+  if (c >= '0' && c <= '9')
+    return c - '0';
+
+  if (c >= 'A' && c <= 'F')
+    return c - 'A' + 10;
+
+  if (c >= 'a' && c <= 'f')
+    return c - 'a' + 10;
+
+  return -1;
+}
+
+/* Decodes the two hexadecimal digits at scanner[0] and scanner[1] into a
+ * byte value (0..255). Returns -1 if either character is not a hex
+ * digit. The second character is only read when the first one is valid,
+ * so a string that ends right after the first position is safe. */
+static int
+unescape_character (const char *scanner)
+{
+  int high_nibble;
+  int low_nibble;
+
+  high_nibble = hex_to_int (scanner[0]);
+  if (high_nibble < 0)
+    return -1;
+
+  low_nibble = hex_to_int (scanner[1]);
+  if (low_nibble < 0)
+    return -1;
+
+  return high_nibble * 16 + low_nibble;
+}
+
+/* Returns a newly allocated copy of escaped with every "%XX" sequence
+ * replaced by the byte it encodes.
+ *
+ * Returns NULL if escaped is NULL, if a '%' is not followed by two
+ * hexadecimal digits, if an escape decodes to NUL or to a character
+ * listed in illegal_characters (which may be NULL for "none"), or if
+ * the decoded string is not valid UTF-8. Free the result with g_free(). */
+static gchar *
+g_unescape_uri_string (const gchar *escaped,
+                       const gchar *illegal_characters)
+{
+  const gchar *in;
+  gchar *out, *result;
+  int character;
+
+  if (escaped == NULL)
+    return NULL;
+
+  /* Unescaping never makes the string longer. */
+  result = g_malloc (strlen (escaped) + 1);
+
+  out = result;
+  for (in = escaped; *in != '\0'; in++)
+    {
+      character = *in;
+      if (character == HEX_ESCAPE)
+        {
+          character = unescape_character (in + 1);
+
+          /* Reject malformed escapes (-1) and illegal characters; '\0'
+           * is considered illegal here. Rejecting -1 also ensures the
+           * "in += 2" below never steps past the terminating NUL when
+           * the input ends with a truncated escape such as "%" or "%4". */
+          if (character <= 0
+              || (illegal_characters != NULL
+                  && strchr (illegal_characters, (char)character) != NULL))
+            {
+              g_free (result);
+              return NULL;
+            }
+          in += 2;
+        }
+      *out++ = character;
+    }
+
+  *out = '\0';
+
+  g_assert (out - result <= strlen (escaped));
+
+  if (!g_utf8_validate (result, -1, NULL))
+    {
+      g_free (result);
+      return NULL;
+    }
+
+  return result;
+}
+
+/**
+ * g_filename_from_uri:
+ * @uri: a uri describing a filename (escaped, UTF-8 encoded)
+ * @hostname: location to store the hostname of the URI, or %NULL to
+ *            ignore the hostname. It is set to %NULL if the URI has no
+ *            hostname or if an error occurs; otherwise it receives a
+ *            newly allocated string.
+ * @error: location to store the error occurring, or %NULL to ignore
+ *         errors. Any of the errors in #GConvertError may occur.
+ *
+ * Converts an escaped UTF-8 encoded URI to a local filename in the
+ * encoding used for filenames.
+ *
+ * Return value: a newly allocated string holding the resulting
+ *               filename, or %NULL on an error, including the case
+ *               where the URI doesn't specify a local filename.
+ **/
+gchar *
+g_filename_from_uri (const char *uri,
+                     char **hostname,
+                     GError **error)
+{
+  const char *path_part;
+  const char *host_part;
+  char *result;
+  char *filename;
+  int offs;
+
+  if (hostname)
+    *hostname = NULL;
+
+  if (!has_case_prefix (uri, "file:/"))
+    {
+      g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NOT_LOCAL_FILE,
+                   _("The URI `%s' does not specify a local file"),
+                   uri);
+      return NULL;
+    }
+
+  path_part = uri + strlen ("file:");
+
+  if (strchr (path_part, '#') != NULL)
+    {
+      g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_INVALID_URI,
+                   _("The local file URI `%s' may not include a `#'"),
+                   uri);
+      return NULL;
+    }
+
+  /* "file:///path" has an empty host: skip two slashes and keep the
+   * third as the start of the path. "file://host/path" carries a host
+   * between the double slash and the next slash. */
+  if (has_case_prefix (path_part, "///"))
+    path_part += 2;
+  else if (has_case_prefix (path_part, "//"))
+    {
+      path_part += 2;
+      host_part = path_part;
+
+      path_part = strchr (path_part, '/');
+
+      if (path_part == NULL)
+        {
+          g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_INVALID_URI,
+                       _("The URI `%s' is invalid"),
+                       uri);
+          return NULL;
+        }
+
+      if (hostname)
+        {
+          char *t;
+          t = g_strndup (host_part, path_part - host_part);
+          *hostname = g_unescape_uri_string (t, "");
+          g_free (t);
+
+          if (*hostname == NULL)
+            {
+              g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_INVALID_URI,
+                           _("The hostname of the URI `%s' contains invalidly escaped characters"),
+                           uri);
+              return NULL;
+            }
+        }
+    }
+
+  /* An escaped '/' ("%2F") is rejected: it could not be told apart from
+   * a real path separator once unescaped. */
+  filename = g_unescape_uri_string (path_part, "/");
+
+  if (filename == NULL)
+    {
+      g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_INVALID_URI,
+                   _("The URI `%s' contains invalidly escaped characters"),
+                   uri);
+      goto fail;
+    }
+
+  /* DOS URIs are like "file://host/c:\foo", so we need to check if we need to
+   * drop the initial slash */
+  offs = 0;
+  if (g_path_is_absolute (filename+1))
+    offs = 1;
+
+  result = g_filename_from_utf8 (filename + offs, -1, NULL, NULL, error);
+  g_free (filename);
+
+  if (result == NULL)
+    goto fail;
+
+  return result;
+
+ fail:
+  /* Do not hand back (or leak) a hostname when the conversion failed. */
+  if (hostname)
+    {
+      g_free (*hostname);
+      *hostname = NULL;
+    }
+
+  return NULL;
+}
+
+/**
+ * g_filename_to_uri:
+ * @filename: an absolute filename specified in the encoding
+ *            used for filenames.
+ * @hostname: A UTF-8 encoded hostname, or %NULL for none.
+ * @error: location to store the error occurring, or %NULL to ignore
+ *         errors. Any of the errors in #GConvertError may occur.
+ *
+ * Converts an absolute filename to an escaped UTF-8 encoded URI.
+ *
+ * Return value: a newly allocated string holding the resulting
+ *               URI, or %NULL on an error.
+ **/
+gchar *
+g_filename_to_uri (const char *filename,
+                   char *hostname,
+                   GError **error)
+{
+  char *escaped_uri;
+  char *utf8_filename;
+
+  g_return_val_if_fail (filename != NULL, NULL);
+
+  if (!g_path_is_absolute (filename))
+    {
+      g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NOT_ABSOLUTE_PATH,
+                   _("The pathname '%s' is not an absolute path"),
+                   filename);
+      return NULL;
+    }
+
+  utf8_filename = g_filename_to_utf8 (filename, -1, NULL, NULL, error);
+  if (utf8_filename == NULL)
+    return NULL;
+
+  if (hostname &&
+      !g_utf8_validate (hostname, -1, NULL))
+    {
+      g_free (utf8_filename);
+      g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
+                   _("Invalid byte sequence in hostname"));
+      return NULL;
+    }
+
+  escaped_uri = g_escape_file_uri (hostname,
+                                   utf8_filename);
+
+  /* The URI is a fresh allocation; the intermediate UTF-8 copy of the
+   * filename is no longer needed and must not be leaked. */
+  g_free (utf8_filename);
+
+  return escaped_uri;
+}
+
Index: glib/gconvert.h
===================================================================
RCS file: /cvs/gnome/glib/glib/gconvert.h,v
retrieving revision 1.7
diff -u -p -r1.7 gconvert.h
--- glib/gconvert.h 2001/06/26 16:01:14 1.7
+++ glib/gconvert.h 2001/08/08 20:02:48
@@ -37,7 +37,10 @@ typedef enum
G_CONVERT_ERROR_NO_CONVERSION,
G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
G_CONVERT_ERROR_FAILED,
- G_CONVERT_ERROR_PARTIAL_INPUT
+ G_CONVERT_ERROR_PARTIAL_INPUT,
+ G_CONVERT_ERROR_NOT_LOCAL_FILE,
+ G_CONVERT_ERROR_INVALID_URI,
+ G_CONVERT_ERROR_NOT_ABSOLUTE_PATH
} GConvertError;
#define G_CONVERT_ERROR g_convert_error_quark()
@@ -100,6 +103,15 @@ gchar* g_filename_from_utf8 (const gchar
gsize *bytes_read,
gsize *bytes_written,
GError **error);
+
+gchar *g_filename_from_uri (const char *uri,
+ char **hostname,
+ GError **error);
+
+gchar *g_filename_to_uri (const char *filename,
+ char *hostname,
+ GError **error);
+
G_END_DECLS
Index: glib/gstrfuncs.c
===================================================================
RCS file: /cvs/gnome/glib/glib/gstrfuncs.c,v
retrieving revision 1.62
diff -u -p -r1.62 gstrfuncs.c
--- glib/gstrfuncs.c 2001/07/19 20:07:40 1.62
+++ glib/gstrfuncs.c 2001/08/08 20:02:48
@@ -1081,6 +1081,27 @@ g_strreverse (gchar *string)
}
/**
+ * g_ascii_isspace:
+ * @c: any character
+ *
+ * Determines whether a character is white-space.
+ *
+ * Unlike the standard C library isspace function, this only
+ * recognizes the standard ASCII white-space characters (space, form
+ * feed, newline, carriage return, horizontal tab and vertical tab)
+ * and ignores the locale, returning %FALSE for all non-ASCII
+ * characters. Also unlike the standard library function, this takes
+ * a char, not an int, so don't call it on EOF but no need to cast to
+ * guchar before passing a possibly non-ASCII character in.
+ *
+ * Return value: %TRUE if @c is an ASCII white-space character
+ **/
+gboolean
+g_ascii_isspace (gchar c)
+{
+  switch (c)
+    {
+    case ' ':
+    case '\f':
+    case '\n':
+    case '\r':
+    case '\t':
+    case '\v':
+      return TRUE;
+    default:
+      return FALSE;
+    }
+}
+
+/**
* g_ascii_isalpha:
* @c: any character
*
Index: glib/gstrfuncs.h
===================================================================
RCS file: /cvs/gnome/glib/glib/gstrfuncs.h,v
retrieving revision 1.11
diff -u -p -r1.11 gstrfuncs.h
--- glib/gstrfuncs.h 2001/07/19 20:07:40 1.11
+++ glib/gstrfuncs.h 2001/08/08 20:02:48
@@ -37,6 +37,7 @@ gboolean g_ascii_isalpha (
gboolean g_ascii_isalnum (gchar c) G_GNUC_CONST;
gboolean g_ascii_islower (gchar c) G_GNUC_CONST;
gboolean g_ascii_isupper (gchar c) G_GNUC_CONST;
+gboolean g_ascii_isspace (gchar c) G_GNUC_CONST;
gchar g_ascii_tolower (gchar c) G_GNUC_CONST;
gchar g_ascii_toupper (gchar c) G_GNUC_CONST;
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]