Re: URIs vs. half-baked URIs [glib PATCH]

From: Alex Larsson <alexl redhat com>
To: Owen Taylor <otaylor redhat com>
Cc: Darin Adler <darin bentspoon com>, <gtk-devel-list gnome org>, <gnome-hackers gnome org>
Subject: Re: URIs vs. half-baked URIs [glib PATCH]
Date: Wed, 8 Aug 2001 16:00:08 -0400 (EDT)
OK, new version. This one includes g_ascii_isspace(), which I needed for the
gtk fileselector dnd thing.

/ alex

Index: glib/gconvert.c
===================================================================
RCS file: /cvs/gnome/glib/glib/gconvert.c,v
retrieving revision 1.16
diff -u -p -r1.16 gconvert.c
--- glib/gconvert.c	2001/06/23 13:55:07	1.16
+++ glib/gconvert.c	2001/08/08 20:02:48
@@ -519,11 +519,24 @@ static gchar *
 strdup_len (const gchar *string,
 	    gssize       len,
 	    gsize       *bytes_written,
-	    gsize       *bytes_read)
+	    gsize       *bytes_read,
+	    GError      **error)

 {
   gsize real_len;

+  if (!g_utf8_validate (string, -1, NULL))
+    {
+      if (bytes_read)
+	*bytes_read = 0;
+      if (bytes_written)
+	*bytes_written = 0;
+
+      g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
+		   _("Invalid byte sequence in conversion input"));
+      return NULL;
+    }
+
   if (len < 0)
     real_len = strlen (string);
   else
@@ -674,7 +687,7 @@ g_locale_to_utf8 (const gchar  *opsysstr
   const char *charset;

   if (g_get_charset (&charset))
-    return strdup_len (opsysstring, len, bytes_read, bytes_written);
+    return strdup_len (opsysstring, len, bytes_read, bytes_written, error);
   else
     return g_convert (opsysstring, len,
 		      "UTF-8", charset, bytes_read, bytes_written, error);
@@ -820,7 +833,7 @@ g_locale_from_utf8 (const gchar *utf8str
   const gchar *charset;

   if (g_get_charset (&charset))
-    return strdup_len (utf8string, len, bytes_read, bytes_written);
+    return strdup_len (utf8string, len, bytes_read, bytes_written, error);
   else
     return g_convert (utf8string, len,
 		      charset, "UTF-8", bytes_read, bytes_written, error);
@@ -863,12 +876,13 @@ g_filename_to_utf8 (const gchar *opsysst
 			   bytes_read, bytes_written,
 			   error);
 #else  /* !G_PLATFORM_WIN32 */
+
   if (getenv ("G_BROKEN_FILENAMES"))
     return g_locale_to_utf8 (opsysstring, len,
 			     bytes_read, bytes_written,
 			     error);
   else
-    return strdup_len (opsysstring, len, bytes_read, bytes_written);
+    return strdup_len (opsysstring, len, bytes_read, bytes_written, error);
 #endif /* !G_PLATFORM_WIN32 */
 }

@@ -911,6 +925,363 @@ g_filename_from_utf8 (const gchar *utf8s
 			       bytes_read, bytes_written,
 			       error);
   else
-    return strdup_len (utf8string, len, bytes_read, bytes_written);
+    return strdup_len (utf8string, len, bytes_read, bytes_written, error);
 #endif /* !G_PLATFORM_WIN32 */
 }
+
+/* Test if haystack starts with the needle prefix, comparing ASCII
+ * case-insensitively. haystack may be UTF-8, but needle must
+ * contain only ascii. A NULL argument is treated as the empty
+ * string. */
+static gboolean
+has_case_prefix (const gchar *haystack, const gchar *needle)
+{
+  const gchar *hp = haystack != NULL ? haystack : "";
+  const gchar *np = needle != NULL ? needle : "";
+
+  /* Walk both strings in lockstep, one byte at a time; running out
+   * of needle first means every needle byte matched. */
+  for (; *np != '\0'; hp++, np++)
+    {
+      if (*hp == '\0')
+        return FALSE;
+      if (g_ascii_tolower (*hp) != g_ascii_tolower (*np))
+        return FALSE;
+    }
+
+  return TRUE;
+}
+
+typedef enum { /* Escaping modes for g_escape_uri_string(). Each value is
+                * a single bit that is tested against the per-character
+                * bitmasks stored in acceptable[]. */
+  UNSAFE_ALL        = 0x1,  /* Escape all unsafe characters   */
+  UNSAFE_ALLOW_PLUS = 0x2,  /* Allows '+'  */
+  UNSAFE_PATH       = 0x4,  /* Allows '/' and '?' and '&' and '='  */
+  UNSAFE_DOS_PATH   = 0x8,  /* Allows '/' and '?' and '&' and '=' and ':' */
+  UNSAFE_HOST       = 0x10, /* Allows '/' and ':' and '@' */
+  UNSAFE_SLASHES    = 0x20  /* Allows every character in the table below except ' ', '/' and '%' */
+} UnsafeCharacterSet;
+
+static const guchar acceptable[96] = { /* Indexed by (character - 32), covering
+                                        * characters 0x20..0x7F. Each entry is the
+                                        * set of UnsafeCharacterSet bits under which
+                                        * that character is left unescaped. */
+ /* X0   X1   X2   X3   X4   X5   X6   X7   X8   X9   XA   XB   XC   XD   XE   XF */
+  0x00,0x3F,0x20,0x20,0x20,0x00,0x2C,0x3F,0x3F,0x3F,0x3F,0x22,0x20,0x3F,0x3F,0x1C, /* 2X  !"#$%&'()*+,-./   */
+  0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x38,0x20,0x20,0x2C,0x20,0x2C, /* 3X 0123456789:;<=>?   */
+  0x30,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F, /* 4X @ABCDEFGHIJKLMNO   */
+  0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x20,0x20,0x20,0x20,0x3F, /* 5X PQRSTUVWXYZ[\]^_   */
+  0x20,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F, /* 6X `abcdefghijklmno   */
+  0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x20,0x20,0x20,0x3F,0x20  /* 7X pqrstuvwxyz{|}~DEL */
+};
+
+#define HEX_ESCAPE '%' /* character that introduces a two-digit hex escape */
+
+static const gchar hex[16] = "0123456789ABCDEF"; /* upper-case hex digits indexed by nibble value; exactly 16 bytes, so there is no terminating NUL */
+
+/* Returns a newly allocated copy of @string in which every byte that is
+ * not acceptable for the character set selected by @mask is replaced by
+ * a "%XX" escape using upper-case hex digits. Bytes outside the range
+ * 32..127 are always escaped.
+ *
+ * Returns NULL if @string is NULL, or (via g_return_val_if_fail) if
+ * @mask is not exactly one of the UnsafeCharacterSet values. */
+static gchar *
+g_escape_uri_string (const gchar *string,
+                     UnsafeCharacterSet mask)
+{
+#define ACCEPTABLE(a) ((a)>=32 && (a)<128 && (acceptable[(a)-32] & use_mask))
+
+  const gchar *p;
+  gchar *q;
+  gchar *result;
+  int c;
+  gint unacceptable;
+  UnsafeCharacterSet use_mask;
+
+  g_return_val_if_fail (mask == UNSAFE_ALL
+                        || mask == UNSAFE_ALLOW_PLUS
+                        || mask == UNSAFE_PATH
+                        || mask == UNSAFE_DOS_PATH
+                        || mask == UNSAFE_HOST
+                        || mask == UNSAFE_SLASHES, NULL);
+
+  if (string == NULL)
+    return NULL;
+
+  use_mask = mask;
+
+  /* First pass: count the bytes that need escaping so the result can
+   * be allocated in one go (each escape adds two extra bytes). */
+  unacceptable = 0;
+  for (p = string; *p != '\0'; p++)
+    {
+      c = (guchar) *p;
+      if (!ACCEPTABLE (c))
+        unacceptable++;
+    }
+
+  result = g_malloc (p - string + unacceptable * 2 + 1);
+
+  /* Second pass: copy, escaping as we go. */
+  for (q = result, p = string; *p != '\0'; p++)
+    {
+      /* Read the byte as unsigned: where gchar is signed, a byte
+       * >= 0x80 would otherwise be negative and "c >> 4" would index
+       * outside hex[]. */
+      c = (guchar) *p;
+
+      if (!ACCEPTABLE (c))
+        {
+          *q++ = HEX_ESCAPE; /* means hex coming */
+          *q++ = hex[c >> 4];
+          *q++ = hex[c & 15];
+        }
+      else
+        *q++ = *p;
+    }
+
+  *q = '\0';
+
+  return result;
+}
+
+
+/* Builds a newly allocated "file://" URI from an optional @hostname
+ * (may be NULL) and a @pathname. The hostname is escaped with
+ * UNSAFE_HOST and the path with UNSAFE_DOS_PATH; a '/' is inserted
+ * between them when the escaped path does not already start with one.
+ * The intermediate escaped strings are freed before returning. */
+static gchar *
+g_escape_file_uri (const gchar *hostname,
+                   const gchar *pathname)
+{
+  char *escaped_hostname = NULL;
+  char *escaped_path;
+  char *res;
+
+  if (hostname)
+    {
+      escaped_hostname = g_escape_uri_string (hostname, UNSAFE_HOST);
+    }
+
+  escaped_path = g_escape_uri_string (pathname, UNSAFE_DOS_PATH);
+
+  res = g_strconcat ("file://",
+                     (escaped_hostname)?escaped_hostname:"",
+                     (*escaped_path!='/')?"/":"",
+                     escaped_path,
+                     NULL);
+
+  /* g_strconcat copied both pieces, so release the temporaries
+   * (g_free accepts NULL). */
+  g_free (escaped_hostname);
+  g_free (escaped_path);
+
+  return res;
+}
+
+/* Maps a hex digit ('0'-'9', 'A'-'F' or 'a'-'f') to its value 0..15.
+ * Any other character yields -1. */
+static int
+hex_to_int (gchar c)
+{
+  if (c >= '0' && c <= '9')
+    return c - '0';
+
+  if (c >= 'A' && c <= 'F')
+    return c - 'A' + 10;
+
+  if (c >= 'a' && c <= 'f')
+    return c - 'a' + 10;
+
+  return -1;
+}
+
+/* Decodes the two hex digits at scanner[0] and scanner[1] into a byte
+ * value. Returns -1 if either one is not a hex digit; scanner[1] is
+ * not read when scanner[0] is already invalid. */
+static int
+unescape_character (const char *scanner)
+{
+  int high;
+  int low;
+
+  high = hex_to_int (scanner[0]);
+  if (high < 0)
+    return -1;
+
+  low = hex_to_int (scanner[1]);
+  if (low < 0)
+    return -1;
+
+  return (high << 4) | low;
+}
+
+/* Returns a newly allocated copy of @escaped with every "%XX" escape
+ * replaced by the byte it encodes.
+ *
+ * Returns NULL if @escaped is NULL, if an escape is malformed (not
+ * followed by two hex digits), if an escape decodes to '\0' or to a
+ * byte listed in @illegal_characters (which may be NULL), or if the
+ * decoded string is not valid UTF-8. */
+static gchar *
+g_unescape_uri_string (const gchar *escaped,
+                       const gchar *illegal_characters)
+{
+  const gchar *in;
+  gchar *out, *result;
+  int character;
+
+  if (escaped == NULL)
+    return NULL;
+
+  /* Unescaping never grows the string, so the input length suffices. */
+  result = g_malloc (strlen (escaped) + 1);
+
+  out = result;
+  for (in = escaped; *in != '\0'; in++)
+    {
+      character = *in;
+      if (character == HEX_ESCAPE)
+        {
+          character = unescape_character (in + 1);
+
+          /* Check for an illegal character. We consider '\0' illegal
+           * here. A negative value means the escape was malformed; it
+           * must be rejected too, otherwise the "in += 2" below could
+           * step past the terminating NUL (e.g. for a trailing '%'). */
+          if (character <= 0
+              || (illegal_characters != NULL
+                  && strchr (illegal_characters, (char)character) != NULL))
+            {
+              g_free (result);
+              return NULL;
+            }
+          in += 2; /* skip the two hex digits just consumed */
+        }
+      *out++ = character;
+    }
+
+  *out = '\0';
+
+  g_assert (out - result <= strlen (escaped));
+
+  if (!g_utf8_validate (result, -1, NULL))
+    {
+      g_free (result);
+      return NULL;
+    }
+
+  return result;
+}
+
+/**
+ * g_filename_from_uri:
+ * @uri: a uri describing a filename (escaped, UTF-8 encoded)
+ * @hostname: location to store the hostname specified by the URI as a
+ *            newly allocated string, or %NULL to ignore the hostname.
+ *            It is set to %NULL if the URI specifies no hostname or
+ *            if the conversion fails.
+ * @error: location to store the error occurring, or %NULL to ignore
+ *         errors. Any of the errors in #GConvertError may occur.
+ *
+ * Converts an escaped UTF-8 encoded URI to a local filename in the
+ * encoding used for filenames.
+ *
+ * Return value: a newly allocated string holding the resulting
+ *               filename, or %NULL on an error (including when the
+ *               URI doesn't specify a local filename).
+ **/
+gchar *
+g_filename_from_uri (const char *uri,
+                     char      **hostname,
+                     GError    **error)
+{
+  const char *path_part;
+  const char *host_part;
+  char *result;
+  char *filename;
+  int offs;
+
+  if (hostname)
+    *hostname = NULL;
+
+  if (!has_case_prefix (uri, "file:/"))
+    {
+      g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NOT_LOCAL_FILE,
+                   _("The URI `%s' does not specify a local file"),
+                   uri);
+      return NULL;
+    }
+
+  path_part = uri + strlen ("file:");
+
+  if (strchr (path_part, '#') != NULL)
+    {
+      g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_INVALID_URI,
+                   _("The local file URI `%s' may not include a `#'"),
+                   uri);
+      return NULL;
+    }
+
+  /* "file:///path" has an empty host: skip two slashes and keep the
+   * third as the start of the path. "file://host/path" carries a
+   * host part that ends at the next slash. */
+  if (has_case_prefix (path_part, "///"))
+    path_part += 2;
+  else if (has_case_prefix (path_part, "//"))
+    {
+      path_part += 2;
+      host_part = path_part;
+
+      path_part = strchr (path_part, '/');
+
+      if (path_part == NULL)
+        {
+          g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_INVALID_URI,
+                       _("The URI `%s' is invalid"),
+                       uri);
+          return NULL;
+        }
+
+      if (hostname)
+        {
+          char *t;
+          t = g_strndup (host_part, path_part - host_part);
+          *hostname = g_unescape_uri_string (t, "");
+          g_free (t);
+
+          if (*hostname == NULL)
+            {
+              g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_INVALID_URI,
+                           _("The hostname of the URI `%s' contains invalidly escaped characters"),
+                           uri);
+              return NULL;
+            }
+        }
+    }
+
+  /* An escaped '/' is not allowed inside the path. */
+  filename = g_unescape_uri_string (path_part, "/");
+
+  if (filename == NULL)
+    {
+      /* Don't hand back (and leak) a hostname for a failed conversion. */
+      if (hostname)
+        {
+          g_free (*hostname);
+          *hostname = NULL;
+        }
+
+      g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_INVALID_URI,
+                   _("The URI `%s' contains invalidly escaped characters"),
+                   uri);
+      return NULL;
+    }
+
+  /* DOS uri's are like "file://host/c:\foo", so we need to check if we need to
+   * drop the initial slash */
+  offs = 0;
+  if (g_path_is_absolute (filename+1))
+    offs = 1;
+
+  result = g_filename_from_utf8 (filename + offs, -1, NULL, NULL, error);
+  g_free (filename);
+
+  /* Same cleanup if the final encoding conversion failed. */
+  if (result == NULL && hostname)
+    {
+      g_free (*hostname);
+      *hostname = NULL;
+    }
+
+  return result;
+}
+
+/**
+ * g_filename_to_uri:
+ * @filename: an absolute filename specified in the encoding
+ *            used for filenames.
+ * @hostname: A UTF-8 encoded hostname, or %NULL for none.
+ * @error: location to store the error occurring, or %NULL to ignore
+ *         errors. Any of the errors in #GConvertError may occur.
+ *
+ * Converts an absolute filename to an escaped UTF-8 encoded URI.
+ *
+ * Return value: a newly allocated string holding the resulting
+ *               URI, or %NULL on an error.
+ **/
+gchar *
+g_filename_to_uri   (const char *filename,
+                     char       *hostname,
+                     GError    **error)
+{
+  char *escaped_uri;
+  char *utf8_filename;
+
+  g_return_val_if_fail (filename != NULL, NULL);
+
+  if (!g_path_is_absolute (filename))
+    {
+      g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NOT_ABSOLUTE_PATH,
+                   _("The pathname '%s' is not an absolute path"),
+                   filename);
+      return NULL;
+    }
+
+  utf8_filename = g_filename_to_utf8 (filename, -1, NULL, NULL, error);
+  if (utf8_filename == NULL)
+    return NULL;
+
+  if (hostname &&
+      !g_utf8_validate (hostname, -1, NULL))
+    {
+      g_free (utf8_filename);
+      g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
+                   _("Invalid byte sequence in hostname"));
+      return NULL;
+    }
+
+  escaped_uri = g_escape_file_uri (hostname,
+                                   utf8_filename);
+
+  /* The escaped URI is an independent copy; the UTF-8 intermediate is
+   * no longer needed and would otherwise leak. */
+  g_free (utf8_filename);
+
+  return escaped_uri;
+}
+
Index: glib/gconvert.h
===================================================================
RCS file: /cvs/gnome/glib/glib/gconvert.h,v
retrieving revision 1.7
diff -u -p -r1.7 gconvert.h
--- glib/gconvert.h	2001/06/26 16:01:14	1.7
+++ glib/gconvert.h	2001/08/08 20:02:48
@@ -37,7 +37,10 @@ typedef enum
   G_CONVERT_ERROR_NO_CONVERSION,
   G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
   G_CONVERT_ERROR_FAILED,
-  G_CONVERT_ERROR_PARTIAL_INPUT
+  G_CONVERT_ERROR_PARTIAL_INPUT,
+  G_CONVERT_ERROR_NOT_LOCAL_FILE,
+  G_CONVERT_ERROR_INVALID_URI,
+  G_CONVERT_ERROR_NOT_ABSOLUTE_PATH
 } GConvertError;

 #define G_CONVERT_ERROR g_convert_error_quark()
@@ -100,6 +103,15 @@ gchar* g_filename_from_utf8 (const gchar
 			     gsize        *bytes_read,
 			     gsize        *bytes_written,
 			     GError      **error);
+
+gchar *g_filename_from_uri (const char *uri,      /* escaped, UTF-8 encoded URI */
+			    char      **hostname, /* out: hostname from the URI, or NULL to ignore it */
+			    GError    **error);   /* returns a newly allocated filename, or NULL on error */
+
+gchar *g_filename_to_uri   (const char *filename, /* absolute path in the filename encoding */
+			    char       *hostname, /* UTF-8 hostname, or NULL for none */
+			    GError    **error);   /* returns a newly allocated URI, or NULL on error */
+

 G_END_DECLS

Index: glib/gstrfuncs.c
===================================================================
RCS file: /cvs/gnome/glib/glib/gstrfuncs.c,v
retrieving revision 1.62
diff -u -p -r1.62 gstrfuncs.c
--- glib/gstrfuncs.c	2001/07/19 20:07:40	1.62
+++ glib/gstrfuncs.c	2001/08/08 20:02:48
@@ -1081,6 +1081,27 @@ g_strreverse (gchar *string)
 }

 /**
+ * g_ascii_isspace:
+ * @c: any character
+ *
+ * Determines whether a character is white-space.
+ *
+ * Unlike the standard C library isspace function, this only
+ * recognizes the ASCII white-space characters (space, form feed,
+ * newline, carriage return, horizontal tab and vertical tab) and
+ * ignores the locale, returning %FALSE for all non-ASCII characters.
+ * Also unlike the standard library function, this takes a char, not
+ * an int, so don't call it on EOF but no need to cast to guchar
+ * before passing a possibly non-ASCII character in.
+ *
+ * Return value: %TRUE if @c is an ASCII white-space character
+ **/
+gboolean
+g_ascii_isspace (gchar c)
+{
+  switch (c)
+    {
+    case ' ':
+    case '\f':
+    case '\n':
+    case '\r':
+    case '\t':
+    case '\v':
+      return TRUE;
+    default:
+      return FALSE;
+    }
+}
+
+/**
  * g_ascii_isalpha:
  * @c: any character
  *
Index: glib/gstrfuncs.h
===================================================================
RCS file: /cvs/gnome/glib/glib/gstrfuncs.h,v
retrieving revision 1.11
diff -u -p -r1.11 gstrfuncs.h
--- glib/gstrfuncs.h	2001/07/19 20:07:40	1.11
+++ glib/gstrfuncs.h	2001/08/08 20:02:48
@@ -37,6 +37,7 @@ gboolean              g_ascii_isalpha  (
 gboolean              g_ascii_isalnum  (gchar        c) G_GNUC_CONST;
 gboolean              g_ascii_islower  (gchar        c) G_GNUC_CONST;
 gboolean              g_ascii_isupper  (gchar        c) G_GNUC_CONST;
+gboolean              g_ascii_isspace  (gchar        c) G_GNUC_CONST;
 gchar                 g_ascii_tolower  (gchar        c) G_GNUC_CONST;
 gchar                 g_ascii_toupper  (gchar        c) G_GNUC_CONST;
Follow-Ups:
- Re: URIs vs. half-baked URIs [glib PATCH]
  - From: Darin Adler
- Re: URIs vs. half-baked URIs [glib PATCH]
  - From: Owen Taylor
References:
- Re: URIs vs. half-baked URIs [glib PATCH]
  - From: Owen Taylor
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]