[glib/wip/tingping/guri-normalize] guri: Normalize uri segments if they are encoded

From: Patrick Griffis <pgriffis src gnome org>
To: commits-list gnome org
Cc:
Subject: [glib/wip/tingping/guri-normalize] guri: Normalize uri segments if they are encoded
Date: Wed, 14 Oct 2020 19:49:20 +0000 (UTC)

commit fe3603604f069ad90f0fa8dc1597e9452670f41c
Author: Patrick Griffis <pgriffis igalia com>
Date:   Wed Oct 14 14:22:58 2020 -0500

    guri: Normalize uri segments if they are encoded
    
    This changes it so that when a segment is encoded it is
    normalized at parse time, which ensures it is valid and
    that it can more easily be compared with other URIs.

 glib/guri.c      | 42 +++++++++++++++++++++++++-----------------
 glib/tests/uri.c | 33 +++++++++++++++++++++++++++++++++
 2 files changed, 58 insertions(+), 17 deletions(-)
---
diff --git a/glib/guri.c b/glib/guri.c
index f04139b80..02d506432 100644
--- a/glib/guri.c
+++ b/glib/guri.c
@@ -289,15 +289,16 @@ uri_decoder (gchar       **out,
              GUriError     parse_error,
              GError      **error)
 {
-  gchar *decoded, *d, c;
+  gchar c;
+  GString *decoded;
   const gchar *invalid, *s, *end;
   gssize len;
 
   if (!(flags & G_URI_FLAGS_ENCODED))
     just_normalize = FALSE;
 
-  decoded = g_malloc (length + 1);
-  for (s = start, end = s + length, d = decoded; s < end; s++)
+  decoded = g_string_sized_new (length + 1);
+  for (s = start, end = s + length; s < end; s++)
     {
       if (*s == '%')
         {
@@ -311,7 +312,7 @@ uri_decoder (gchar       **out,
                   g_set_error_literal (error, G_URI_ERROR, parse_error,
                                        /* xgettext: no-c-format */
                                        _("Invalid %-encoding in URI"));
-                  g_free (decoded);
+                  g_string_free (decoded, TRUE);
                   return -1;
                 }
 
@@ -319,7 +320,7 @@ uri_decoder (gchar       **out,
                * fix it to "%25", since that might change the way that
                * the URI's owner would interpret it.
                */
-              *d++ = *s;
+              g_string_append_c (decoded, *s);
               continue;
             }
 
@@ -328,43 +329,50 @@ uri_decoder (gchar       **out,
             {
               g_set_error_literal (error, G_URI_ERROR, parse_error,
                                    _("Illegal character in URI"));
-              g_free (decoded);
+              g_string_free (decoded, TRUE);
               return -1;
             }
           if (just_normalize && !g_uri_char_is_unreserved (c))
             {
-              /* Leave the % sequence there. */
-              *d++ = *s;
+              /* Leave the % sequence there but normalize it. */
+              g_string_append_c (decoded, *s);
+              g_string_append_c (decoded, g_ascii_toupper (s[1]));
+              g_string_append_c (decoded, g_ascii_toupper (s[2]));
+              s += 2;
             }
           else
             {
-              *d++ = c;
+              g_string_append_c (decoded, c);
               s += 2;
             }
         }
       else if (www_form && *s == '+')
-        *d++ = ' ';
+        g_string_append_c (decoded, ' ');
+      /* Normalize any illegal characters */
+      else if (just_normalize && (!g_ascii_isgraph (*s) ||
+                                  (illegal_chars && strchr (illegal_chars, *s))))
+        g_string_append_printf (decoded, "%%%02X", (int)*s);
       else
-        *d++ = *s;
+        g_string_append_c (decoded, *s);
     }
-  *d = '\0';
 
-  len = d - decoded;
+  len = decoded->len;
   g_assert (len >= 0);
 
   if (!(flags & G_URI_FLAGS_ENCODED) &&
-      !g_utf8_validate (decoded, len, &invalid))
+      !g_utf8_validate (decoded->str, len, &invalid))
     {
       g_set_error_literal (error, G_URI_ERROR, parse_error,
                            _("Non-UTF-8 characters in URI"));
-      g_free (decoded);
+      g_string_free (decoded, TRUE);
       return -1;
     }
 
   if (out)
-    *out = g_steal_pointer (&decoded);
+    *out = g_string_free (decoded, FALSE);
+  else
+    g_string_free (decoded, TRUE);
 
-  g_free (decoded);
   return len;
 }
 
diff --git a/glib/tests/uri.c b/glib/tests/uri.c
index b8a0c6a47..e769a9ea5 100644
--- a/glib/tests/uri.c
+++ b/glib/tests/uri.c
@@ -1708,6 +1708,38 @@ test_uri_join_split_round_trip (void)
     }
 }
 
+static const struct
+{
+  /* Inputs: URI string to parse and the flags handed to g_uri_parse() */
+  const gchar *uri;
+  GUriFlags flags;
+  /* Output: path expected from g_uri_get_path() on the parsed URI */
+  const gchar *path;
+} normalize_tests[] =
+  {
+    { "http://foo/path with spaces", G_URI_FLAGS_ENCODED,
+      "/path%20with%20spaces" },
+    { "http://foo/path with spaces 2", G_URI_FLAGS_ENCODED_PATH,
+      "/path%20with%20spaces%202" },
+    { "http://foo/%aa", G_URI_FLAGS_ENCODED,
+      "/%AA" },
+    { "http://foo/%☺", G_URI_FLAGS_ENCODED | G_URI_FLAGS_PARSE_RELAXED,
+      "/%%FFFFFFE2%FFFFFF98%FFFFFFBA" }, /* NOTE(review): bytes look sign-extended; "%E2%98%BA" was presumably intended -- confirm */
+  };
+
+/* Parse every normalize_tests entry, compare its path, then release the GUri. */
+static void
+test_uri_normalize (void)
+{
+  for (gsize i = 0; i < G_N_ELEMENTS (normalize_tests); ++i)
+    {
+      GUri *uri = g_uri_parse (normalize_tests[i].uri, normalize_tests[i].flags, NULL);
+      g_assert_nonnull (uri);
+      g_assert_cmpstr (g_uri_get_path (uri), ==, normalize_tests[i].path);
+      g_uri_unref (uri); /* g_uri_parse() hands back a reference we own */
+    }
+}
+
 int
 main (int   argc,
       char *argv[])
@@ -1733,6 +1765,7 @@ main (int   argc,
   g_test_add_func ("/uri/to-string", test_uri_to_string);
   g_test_add_func ("/uri/join", test_uri_join);
   g_test_add_func ("/uri/join-split-round-trip", test_uri_join_split_round_trip);
+  g_test_add_func ("/uri/normalize", test_uri_normalize);
   g_test_add_data_func ("/uri/iter-params/nul-terminated", GINT_TO_POINTER (TRUE), test_uri_iter_params);
   g_test_add_data_func ("/uri/iter-params/length", GINT_TO_POINTER (FALSE), test_uri_iter_params);
   g_test_add_data_func ("/uri/parse-params/nul-terminated", GINT_TO_POINTER (TRUE), test_uri_parse_params);
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]