[libsoup] Implement content sniffing
- From: Gustavo Noronha Silva <gns src gnome org>
- To: svn-commits-list gnome org
- Subject: [libsoup] Implement content sniffing
- Date: Thu, 2 Jul 2009 14:05:14 +0000 (UTC)
commit 3c9f3cdffc32126700f25d8a0c55f68b6f587bde
Author: Gustavo Noronha Silva <gns gnome org>
Date: Wed Jun 17 20:53:17 2009 -0300
Implement content sniffing
The implementation is based on the draft spec on Content-Type
Processing Model (draft-abarth-mime-sniff-01). It is a spinoff from
the HTML5 spec.
Soup now provides a SoupContentSniffer session feature, which hooks
into the message I/O, and delays emissions of the got-chunk signal to
be able to figure out the Content-Type of messages from the actual
content received, in some cases.
GIO is also used to sniff content, whenever the spec allows further
sniffing.
http://bugzilla.gnome.org/show_bug.cgi?id=572589
.gitignore | 1 +
libsoup/Makefile.am | 2 +
libsoup/soup-content-sniffer.c | 570 ++++++++++++++++++++++++++++++++++++++++
libsoup/soup-content-sniffer.h | 57 ++++
libsoup/soup-marshal.list | 1 +
libsoup/soup-message-headers.c | 19 ++-
libsoup/soup-message-io.c | 128 +++++++++-
libsoup/soup-message-private.h | 5 +
libsoup/soup-message.c | 57 ++++
libsoup/soup-message.h | 1 +
libsoup/soup.h | 1 +
tests/Makefile.am | 3 +
tests/resources/atom.xml | 35 +++
tests/resources/home.gif | Bin 0 -> 995 bytes
tests/resources/mbox | 16 ++
tests/resources/rss20.xml | 26 ++
tests/resources/test.html | 10 +
tests/sniffing-test.c | 429 ++++++++++++++++++++++++++++++
18 files changed, 1356 insertions(+), 5 deletions(-)
---
diff --git a/.gitignore b/.gitignore
index b0cd3a4..1bb227d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -69,6 +69,7 @@ tests/redirect-test
tests/server-auth-test
tests/simple-httpd
tests/simple-proxy
+tests/sniffing-test
tests/ssl-test
tests/streaming-test
tests/timeout-test
diff --git a/libsoup/Makefile.am b/libsoup/Makefile.am
index 949f243..2d3a6ea 100644
--- a/libsoup/Makefile.am
+++ b/libsoup/Makefile.am
@@ -55,6 +55,7 @@ soup_headers = \
soup-auth-domain.h \
soup-auth-domain-basic.h \
soup-auth-domain-digest.h \
+ soup-content-sniffer.h \
soup-cookie.h \
soup-cookie-jar.h \
soup-cookie-jar-text.h \
@@ -119,6 +120,7 @@ libsoup_2_4_la_SOURCES = \
soup-auth-manager-ntlm.c \
soup-connection.h \
soup-connection.c \
+ soup-content-sniffer.c \
soup-cookie.c \
soup-cookie-jar.c \
soup-cookie-jar-text.c \
diff --git a/libsoup/soup-content-sniffer.c b/libsoup/soup-content-sniffer.c
new file mode 100644
index 0000000..5fdee5c
--- /dev/null
+++ b/libsoup/soup-content-sniffer.c
@@ -0,0 +1,570 @@
+/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
+/*
+ * soup-content-sniffer.c
+ *
+ * Copyright (C) 2009 Gustavo Noronha Silva.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <string.h>
+#include <gio/gio.h>
+
+#include "soup-content-sniffer.h"
+#include "soup-enum-types.h"
+#include "soup-message.h"
+#include "soup-message-private.h"
+#include "soup-session-feature.h"
+#include "soup-uri.h"
+
+/**
+ * SECTION:soup-content-sniffer
+ * @short_description: Content sniffing for #SoupSession
+ *
+ * A #SoupContentSniffer tries to detect the actual content type of
+ * the files that are being downloaded by looking at some of the data
+ * before the #SoupMessage emits its first #SoupMessage::got-chunk signal.
+ * #SoupContentSniffer implements #SoupSessionFeature, so you can add
+ * content sniffing to a session with soup_session_add_feature() or
+ * soup_session_add_feature_by_type().
+ *
+ * Since: 2.27.3
+ **/
+
+static char *sniff (SoupContentSniffer *sniffer, SoupMessage *msg, SoupBuffer *buffer, GHashTable **params);
+static gsize get_buffer_size (SoupContentSniffer *sniffer);
+
+static void soup_content_sniffer_session_feature_init (SoupSessionFeatureInterface *feature_interface, gpointer interface_data);
+
+static void request_queued (SoupSessionFeature *feature, SoupSession *session, SoupMessage *msg);
+static void request_unqueued (SoupSessionFeature *feature, SoupSession *session, SoupMessage *msg);
+
+G_DEFINE_TYPE_WITH_CODE (SoupContentSniffer, soup_content_sniffer, G_TYPE_OBJECT,
+ G_IMPLEMENT_INTERFACE (SOUP_TYPE_SESSION_FEATURE,
+ soup_content_sniffer_session_feature_init))
+
+static void
+soup_content_sniffer_init (SoupContentSniffer *content_sniffer)
+{
+}
+
+static void
+soup_content_sniffer_class_init (SoupContentSnifferClass *content_sniffer_class)
+{
+ content_sniffer_class->sniff = sniff;
+ content_sniffer_class->get_buffer_size = get_buffer_size;
+}
+
+static void
+soup_content_sniffer_session_feature_init (SoupSessionFeatureInterface *feature_interface,
+ gpointer interface_data)
+{
+ feature_interface->request_queued = request_queued;
+ feature_interface->request_unqueued = request_unqueued;
+}
+
+/**
+ * soup_content_sniffer_new:
+ *
+ * Creates a new #SoupContentSniffer.
+ *
+ * Returns: a new #SoupContentSniffer
+ *
+ * Since: 2.27.3
+ **/
+SoupContentSniffer *
+soup_content_sniffer_new (void)
+{
+	return g_object_new (SOUP_TYPE_CONTENT_SNIFFER, NULL);
+}
+
+char *
+soup_content_sniffer_sniff (SoupContentSniffer *sniffer,
+ SoupMessage *msg, SoupBuffer *buffer,
+ GHashTable **params)
+{
+ g_return_val_if_fail (SOUP_IS_CONTENT_SNIFFER (sniffer), NULL);
+ g_return_val_if_fail (SOUP_IS_MESSAGE (msg), NULL);
+ g_return_val_if_fail (buffer != NULL, NULL);
+
+ return SOUP_CONTENT_SNIFFER_GET_CLASS (sniffer)->sniff (sniffer, msg, buffer, params);
+}
+
+/* This table is based on the HTML5 spec;
+ * See 2.7.4 Content-Type sniffing: unknown type
+ */
+typedef struct {
+ /* @has_ws is TRUE if @pattern contains "generic" whitespace */
+ gboolean has_ws;
+ const char *mask;
+ const char *pattern;
+ guint pattern_length;
+ const char *sniffed_type;
+ gboolean scriptable;
+} SoupContentSnifferPattern;
+
+static SoupContentSnifferPattern types_table[] = {
+ { FALSE,
+ "\xFF\xFF\xDF\xDF\xDF\xDF\xDF\xDF\xDF\xFF\xDF\xDF\xDF\xDF",
+ "\x3C\x21\x44\x4F\x43\x54\x59\x50\x45\x20\x48\x54\x4D\x4C",
+ 14,
+ "text/html",
+ TRUE },
+
+ { TRUE,
+ "\xFF\xFF\xDF\xDF\xDF\xDF",
+ " \x3C\x48\x54\x4D\x4C",
+ 5,
+ "text/html",
+ TRUE },
+
+ { TRUE,
+ "\xFF\xFF\xDF\xDF\xDF\xDF",
+ " \x3C\x48\x45\x41\x44",
+ 5,
+ "text/html",
+ TRUE },
+
+ { TRUE,
+ "\xFF\xFF\xDF\xDF\xDF\xDF\xDF\xDF",
+ " \x3C\x53\x43\x52\x49\x50\x54",
+ 7,
+ "text/html",
+ TRUE },
+
+ { FALSE,
+ "\xFF\xFF\xFF\xFF\xFF",
+ "\x25\x50\x44\x46\x2D",
+ 5,
+ "application/pdf",
+ TRUE },
+
+ { FALSE,
+ "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF",
+ "\x25\x21\x50\x53\x2D\x41\x64\x6F\x62\x65\x2D",
+ 11,
+ "application/postscript",
+ FALSE },
+
+ { FALSE,
+ "\xFF\xFF\x00\x00",
+ "\xFE\xFF\x00\x00",
+ 4,
+ "text/plain",
+ FALSE },
+
+ { FALSE,
+ "\xFF\xFF\x00\x00",
+ "\xFF\xFF\x00\x00",
+ 4,
+ "text/plain",
+ FALSE },
+
+ { FALSE,
+ "\xFF\xFF\xFF\x00",
+ "\xEF\xBB\xBF\x00",
+ 4,
+ "text/plain",
+ FALSE },
+
+ { FALSE,
+ "\xFF\xFF\xFF\xFF\xFF\xFF",
+ "\x47\x49\x46\x38\x37\x61",
+ 6,
+ "image/gif",
+ FALSE },
+
+ { FALSE,
+ "\xFF\xFF\xFF\xFF\xFF\xFF",
+ "\x47\x49\x46\x38\x39\x61",
+ 6,
+ "image/gif",
+ FALSE },
+
+ { FALSE,
+ "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF",
+ "\x89\x50\x4E\x47\x0D\x0A\x1A\x0A",
+ 8,
+ "image/png",
+ FALSE },
+
+ { FALSE,
+ "\xFF\xFF\xFF",
+ "\xFF\xD8\xFF",
+ 3,
+ "image/jpeg",
+ FALSE },
+
+ { FALSE,
+ "\xFF\xFF",
+ "\x42\x4D",
+ 2,
+ "image/bmp",
+ FALSE },
+
+ { FALSE,
+ "\xFF\xFF\xFF\xFF",
+ "\x00\x00\x01\x00",
+ 4,
+ "image/vnd.microsoft.icon",
+ FALSE }
+};
+
+/* Whether a given byte looks like it might be part of binary content.
+ * Source: HTML5 spec; borrowed from the Chromium mime sniffer code,
+ * which is BSD-licensed
+ */
+static char byte_looks_binary[] = {
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, /* 0x00 - 0x0F */
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, /* 0x10 - 0x1F */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x20 - 0x2F */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x30 - 0x3F */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x40 - 0x4F */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x50 - 0x5F */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x60 - 0x6F */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x70 - 0x7F */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x80 - 0x8F */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x90 - 0x9F */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xA0 - 0xAF */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xB0 - 0xBF */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xC0 - 0xCF */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xD0 - 0xDF */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xE0 - 0xEF */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xF0 - 0xFF */
+};
+
+static char *
+sniff_gio (SoupContentSniffer *sniffer, SoupMessage *msg, SoupBuffer *buffer)
+{
+ SoupURI *uri;
+ char *uri_path;
+ char *content_type;
+ char *mime_type;
+ gboolean uncertain;
+
+ uri = soup_message_get_uri (msg);
+ uri_path = soup_uri_to_string (uri, TRUE);
+
+ content_type= g_content_type_guess (uri_path, (const guchar*)buffer->data, buffer->length, &uncertain);
+ mime_type = g_content_type_get_mime_type (content_type);
+
+ g_free (uri_path);
+ g_free (content_type);
+
+ return mime_type;
+}
+
+/* HTML5: 2.7.4 Content-Type sniffing: unknown type. Returns a newly
+ * allocated MIME type sniffed from at most the first 512 bytes of @buffer. */
+static char*
+sniff_unknown (SoupContentSniffer *sniffer, SoupMessage *msg,
+	       SoupBuffer *buffer, gboolean for_text_or_binary)
+{
+	const char *resource = buffer->data;
+	int resource_length = MIN (512, buffer->length);
+	char *gio_guess;
+	int i;
+
+	for (i = 0; i < G_N_ELEMENTS (types_table); i++) {
+		SoupContentSnifferPattern *type_row = &(types_table[i]);
+
+		/* The scriptable types should be skipped for the text
+		 * or binary path, but considered for other paths */
+		if (for_text_or_binary && type_row->scriptable)
+			continue;
+
+		if (type_row->has_ws) {
+			int index_stream = 0, index_pattern = 0;
+			gboolean skip_row = FALSE;
+
+			/* Never index past the end of the pattern and mask */
+			while (index_stream < resource_length && index_pattern <= type_row->pattern_length) {
+				/* Skip insignificant white space ("WS" in the spec) */
+				if (type_row->pattern[index_pattern] == ' ') {
+					if (resource[index_stream] == '\x09' ||
+					    resource[index_stream] == '\x0a' ||
+					    resource[index_stream] == '\x0c' ||
+					    resource[index_stream] == '\x0d' ||
+					    resource[index_stream] == '\x20')
+						index_stream++;
+					else
+						index_pattern++;
+				} else {
+					if ((type_row->mask[index_pattern] & resource[index_stream]) != type_row->pattern[index_pattern]) {
+						skip_row = TRUE;
+						break;
+					}
+					index_pattern++;
+					index_stream++;
+				}
+			}
+
+			if (skip_row)
+				continue;
+
+			if (index_pattern > type_row->pattern_length)
+				return g_strdup (type_row->sniffed_type);
+		} else {
+			int j;
+
+			if (resource_length < type_row->pattern_length)
+				continue;
+
+			for (j = 0; j < type_row->pattern_length; j++) {
+				if ((type_row->mask[j] & resource[j]) != type_row->pattern[j])
+					break;
+			}
+
+			/* This means our comparison above matched completely */
+			if (j == type_row->pattern_length)
+				return g_strdup (type_row->sniffed_type);
+		}
+	}
+
+	/* The spec allows platform sniffing for types not covered above,
+	 * but the text-or-binary path must not escalate privileges to a
+	 * scriptable type. The GIO guess may be NULL, so test it first.
+	 */
+	gio_guess = sniff_gio (sniffer, msg, buffer);
+
+	if (for_text_or_binary && gio_guess != NULL) {
+		for (i = 0; i < G_N_ELEMENTS (types_table); i++) {
+			SoupContentSnifferPattern *type_row = &(types_table[i]);
+
+			if (type_row->scriptable && !g_ascii_strcasecmp (type_row->sniffed_type, gio_guess)) {
+				g_free (gio_guess);
+				gio_guess = NULL;
+				break;
+			}
+		}
+	}
+
+	if (gio_guess)
+		return gio_guess;
+
+	return g_strdup ("application/octet-stream");
+}
+
+/* HTML5: 2.7.3 Content-Type sniffing: text or binary */
+static char*
+sniff_text_or_binary (SoupContentSniffer *sniffer, SoupMessage *msg,
+		      SoupBuffer *buffer)
+{
+	const guchar *resource = (const guchar *)buffer->data;
+	int resource_length = MIN (512, buffer->length);
+	gboolean looks_binary = FALSE;
+	int i;
+
+	/* A UTF-16BE, UTF-16LE or UTF-8 BOM means text/plain; the bytes are unsigned so that 0xFE etc. can match */
+	if (resource_length >= 4) {
+		if ((resource[0] == 0xFE && resource[1] == 0xFF) ||
+		    (resource[0] == 0xFF && resource[1] == 0xFE) ||
+		    (resource[0] == 0xEF && resource[1] == 0xBB && resource[2] == 0xBF))
+			return g_strdup ("text/plain");
+	}
+
+	/* Look to see if any of the first n bytes looks binary */
+	for (i = 0; i < resource_length; i++) {
+		if (byte_looks_binary[resource[i]]) {
+			looks_binary = TRUE;
+			break;
+		}
+	}
+
+	if (!looks_binary)
+		return g_strdup ("text/plain");
+
+	return sniff_unknown (sniffer, msg, buffer, TRUE);
+}
+
+static char*
+sniff_images (SoupContentSniffer *sniffer, SoupMessage *msg,
+ SoupBuffer *buffer, const char *content_type)
+{
+ const char *resource = buffer->data;
+ int resource_length = MIN (512, buffer->length);
+ int i;
+
+ for (i = 0; i < G_N_ELEMENTS (types_table); i++) {
+ SoupContentSnifferPattern *type_row = &(types_table[i]);
+
+ if (resource_length < type_row->pattern_length)
+ continue;
+
+ if (!g_str_has_prefix (type_row->sniffed_type, "image/"))
+ continue;
+
+ /* All of the image types use all-\xFF for the mask,
+ * so we can just memcmp.
+ */
+ if (memcmp (type_row->pattern, resource, type_row->pattern_length) == 0)
+ return g_strdup (type_row->sniffed_type);
+ }
+
+ return g_strdup (content_type);
+}
+
+/* Tells RSS and Atom feeds apart from HTML. Looks at no more than the first
+ * 512 bytes of @buffer; whenever the data runs out the answer is "text/html". */
+static char*
+sniff_feed_or_html (SoupContentSniffer *sniffer, SoupMessage *msg, SoupBuffer *buffer)
+{
+	/* Unsigned bytes, so that the BOM comparison below can succeed */
+	const guchar *resource = (const guchar *)buffer->data;
+	gsize resource_length = MIN (512, buffer->length);
+	gsize pos = 0;
+
+	/* Skip a leading UTF-8 BOM */
+	if (resource_length >= 3 && memcmp (resource, "\xEF\xBB\xBF", 3) == 0)
+		pos = 3;
+
+ look_for_tag:
+	/* Skip insignificant white space */
+	while (pos < resource_length &&
+	       ((resource[pos] == '\x09') ||
+		(resource[pos] == '\x20') ||
+		(resource[pos] == '\x0A') ||
+		(resource[pos] == '\x0D')))
+		pos++;
+
+	/* != <, or nothing left to look at */
+	if (pos >= resource_length || resource[pos] != '\x3C')
+		return g_strdup ("text/html");
+
+	pos++;
+
+	/* Skipping comments: from "!--" up to and including "-->" */
+	if (resource_length - pos >= 3 && memcmp (resource + pos, "!--", 3) == 0) {
+		pos = pos + 3;
+
+		while (resource_length - pos >= 3 &&
+		       memcmp (resource + pos, "-->", 3) != 0)
+			pos++;
+
+		/* An unterminated comment leaves pos past the data, ending the scan */
+		pos = pos + 3;
+
+		goto look_for_tag;
+	}
+
+	/* == ! */
+	if (pos < resource_length && resource[pos] == '\x21') {
+		do {
+			pos++;
+		} while (pos < resource_length && resource[pos] != '\x3E');
+
+		pos++;
+
+		goto look_for_tag;
+	} else if (pos < resource_length && resource[pos] == '\x3F') { /* ? */
+		do {
+			pos++;
+		} while (resource_length - pos >= 2 &&
+			 memcmp (resource + pos, "?>", 2) != 0);
+
+		pos = pos + 2;
+
+		goto look_for_tag;
+	}
+
+	/* The first real tag decides: "rss" or "feed" mean a feed */
+	if (resource_length - pos >= 3 && memcmp (resource + pos, "rss", 3) == 0)
+		return g_strdup ("application/rss+xml");
+
+	if (resource_length - pos >= 4 && memcmp (resource + pos, "feed", 4) == 0)
+		return g_strdup ("application/atom+xml");
+
+	return g_strdup ("text/html");
+}
+
+static char*
+sniff (SoupContentSniffer *sniffer, SoupMessage *msg, SoupBuffer *buffer, GHashTable **params)
+{
+ const char *content_type_with_params;
+ const char *content_type;
+
+ content_type = soup_message_headers_get_content_type (msg->response_headers, params);
+ content_type_with_params = soup_message_headers_get_one (msg->response_headers, "Content-Type");
+
+
+ /* These comparisons are done in an ASCII-case-insensitive
+ * manner because the spec requires it */
+ if ((content_type == NULL) ||
+ !g_ascii_strcasecmp (content_type, "unknown/unknown") ||
+ !g_ascii_strcasecmp (content_type, "application/unknown") ||
+ !g_ascii_strcasecmp (content_type, "*/*"))
+ return sniff_unknown (sniffer, msg, buffer, FALSE);
+
+ if (g_str_has_suffix (content_type, "+xml") ||
+ !g_ascii_strcasecmp (content_type, "text/xml") ||
+ !g_ascii_strcasecmp (content_type, "application/xml"))
+ return g_strdup (content_type);
+
+ /* 2.7.5 Content-Type sniffing: image
+ * The spec says:
+ *
+ * If the resource's official type is "image/svg+xml", then
+ * the sniffed type of the resource is its official type (an
+ * XML type)
+ *
+ * The XML case is handled by the if above; if you refactor
+ * this code, keep this in mind.
+ */
+ if (!g_ascii_strncasecmp (content_type, "image/", 6))
+ return sniff_images (sniffer, msg, buffer, content_type);
+
+ /* If we got text/plain, use text_or_binary */
+ if (g_str_equal (content_type_with_params, "text/plain") ||
+ g_str_equal (content_type_with_params, "text/plain; charset=ISO-8859-1") ||
+ g_str_equal (content_type_with_params, "text/plain; charset=iso-8859-1") ||
+ g_str_equal (content_type_with_params, "text/plain; charset=UTF-8")) {
+ return sniff_text_or_binary (sniffer, msg, buffer);
+ }
+
+ if (!g_ascii_strcasecmp (content_type, "text/html"))
+ return sniff_feed_or_html (sniffer, msg, buffer);
+
+ return g_strdup (content_type);
+}
+
+static gsize
+get_buffer_size (SoupContentSniffer *sniffer)
+{
+ return 512;
+}
+
+static void
+soup_content_sniffer_got_headers_cb (SoupMessage *msg, SoupContentSniffer *sniffer)
+{
+ SoupMessagePrivate *priv = SOUP_MESSAGE_GET_PRIVATE (msg);
+ SoupContentSnifferClass *content_sniffer_class = SOUP_CONTENT_SNIFFER_GET_CLASS (sniffer);
+
+ priv->should_sniff_content = TRUE;
+ priv->bytes_for_sniffing = content_sniffer_class->get_buffer_size (sniffer);
+}
+
+static void
+request_queued (SoupSessionFeature *feature, SoupSession *session,
+ SoupMessage *msg)
+{
+ SoupMessagePrivate *priv = SOUP_MESSAGE_GET_PRIVATE (msg);
+
+ priv->sniffer = g_object_ref (feature);
+ g_signal_connect (msg, "got-headers",
+ G_CALLBACK (soup_content_sniffer_got_headers_cb),
+ feature);
+}
+
+static void
+request_unqueued (SoupSessionFeature *feature, SoupSession *session,
+ SoupMessage *msg)
+{
+ SoupMessagePrivate *priv = SOUP_MESSAGE_GET_PRIVATE (msg);
+
+ g_object_unref (priv->sniffer);
+ priv->sniffer = NULL;
+
+ g_signal_handlers_disconnect_by_func (msg, soup_content_sniffer_got_headers_cb, feature);
+}
diff --git a/libsoup/soup-content-sniffer.h b/libsoup/soup-content-sniffer.h
new file mode 100644
index 0000000..a8aa915
--- /dev/null
+++ b/libsoup/soup-content-sniffer.h
@@ -0,0 +1,57 @@
+/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
+/*
+ * Copyright (C) 2009 Gustavo Noronha Silva.
+ */
+
+#ifndef SOUP_CONTENT_SNIFFER_H
+#define SOUP_CONTENT_SNIFFER_H 1
+
+#include <libsoup/soup-types.h>
+#include <libsoup/soup-message-body.h>
+
+G_BEGIN_DECLS
+
+#define SOUP_TYPE_CONTENT_SNIFFER (soup_content_sniffer_get_type ())
+#define SOUP_CONTENT_SNIFFER(obj) (G_TYPE_CHECK_INSTANCE_CAST ((obj), SOUP_TYPE_CONTENT_SNIFFER, SoupContentSniffer))
+#define SOUP_CONTENT_SNIFFER_CLASS(klass) (G_TYPE_CHECK_CLASS_CAST ((klass), SOUP_TYPE_CONTENT_SNIFFER, SoupContentSnifferClass))
+#define SOUP_IS_CONTENT_SNIFFER(obj) (G_TYPE_CHECK_INSTANCE_TYPE ((obj), SOUP_TYPE_CONTENT_SNIFFER))
+#define SOUP_IS_CONTENT_SNIFFER_CLASS(klass) (G_TYPE_CHECK_CLASS_TYPE ((klass), SOUP_TYPE_CONTENT_SNIFFER))
+#define SOUP_CONTENT_SNIFFER_GET_CLASS(obj) (G_TYPE_INSTANCE_GET_CLASS ((obj), SOUP_TYPE_CONTENT_SNIFFER, SoupContentSnifferClass))
+
+typedef struct _SoupContentSnifferPrivate SoupContentSnifferPrivate;
+
+typedef struct {
+ GObject parent;
+
+ SoupContentSnifferPrivate *priv;
+} SoupContentSniffer;
+
+typedef struct {
+ GObjectClass parent_class;
+
+ char* (*sniff) (SoupContentSniffer *sniffer,
+ SoupMessage *msg,
+ SoupBuffer *buffer,
+ GHashTable **params);
+ gsize (*get_buffer_size) (SoupContentSniffer *sniffer);
+
+ /* Padding for future expansion */
+ void (*_libsoup_reserved1) (void);
+ void (*_libsoup_reserved2) (void);
+ void (*_libsoup_reserved3) (void);
+ void (*_libsoup_reserved4) (void);
+ void (*_libsoup_reserved5) (void);
+} SoupContentSnifferClass;
+
+GType soup_content_sniffer_get_type (void);
+
+SoupContentSniffer *soup_content_sniffer_new (void);
+
+char *soup_content_sniffer_sniff (SoupContentSniffer *sniffer,
+ SoupMessage *msg,
+ SoupBuffer *buffer,
+ GHashTable **params);
+
+G_END_DECLS
+
+#endif /* SOUP_CONTENT_SNIFFER_H */
diff --git a/libsoup/soup-marshal.list b/libsoup/soup-marshal.list
index 1a43570..d0c53ef 100644
--- a/libsoup/soup-marshal.list
+++ b/libsoup/soup-marshal.list
@@ -6,3 +6,4 @@ NONE:OBJECT,OBJECT
NONE:OBJECT,POINTER
NONE:BOXED,BOXED
NONE:OBJECT,OBJECT,BOOLEAN
+NONE:STRING,BOXED
diff --git a/libsoup/soup-message-headers.c b/libsoup/soup-message-headers.c
index f0abb78..185346e 100644
--- a/libsoup/soup-message-headers.c
+++ b/libsoup/soup-message-headers.c
@@ -226,6 +226,20 @@ find_header (SoupHeader *hdr_array, const char *interned_name, int nth)
return -1;
}
+/* Index of the @nth-from-last header named @interned_name, or -1 if none */
+static int
+find_last_header (SoupHeader *hdr_array, guint length, const char *interned_name, int nth)
+{
+	int i;
+
+	/* The last valid slot is length - 1; slot [length] is past the end */
+	for (i = (int) length - 1; i >= 0; i--) {
+		if (hdr_array[i].name == interned_name && nth-- == 0)
+			return i;
+	}
+	return -1;
+}
+
/**
* soup_message_headers_remove:
* @hdrs: a #SoupMessageHeaders
@@ -277,12 +291,15 @@ const char *
soup_message_headers_get_one (SoupMessageHeaders *hdrs, const char *name)
{
SoupHeader *hdr_array = (SoupHeader *)(hdrs->array->data);
+ guint hdr_length = hdrs->array->len;
int index;
g_return_val_if_fail (name != NULL, NULL);
name = intern_header_name (name, NULL);
- index = find_header (hdr_array, name, 0);
+
+ index = find_last_header (hdr_array, hdr_length, name, 0);
+
return (index == -1) ? NULL : hdr_array[index].value;
}
diff --git a/libsoup/soup-message-io.c b/libsoup/soup-message-io.c
index 8e04b66..10657b7 100644
--- a/libsoup/soup-message-io.c
+++ b/libsoup/soup-message-io.c
@@ -18,6 +18,7 @@
#include "soup-misc.h"
#include "soup-socket.h"
#include "soup-ssl.h"
+#include "soup-uri.h"
typedef enum {
SOUP_MESSAGE_IO_CLIENT,
@@ -53,6 +54,11 @@ typedef struct {
SoupMessageBody *read_body;
goffset read_length;
+ gboolean acked_content_sniff_decision;
+ gboolean delay_got_chunks;
+ SoupMessageBody *delayed_chunk_data;
+ gsize delayed_chunk_length;
+
SoupMessageIOState write_state;
SoupEncoding write_encoding;
GString *write_buf;
@@ -105,6 +111,9 @@ soup_message_io_cleanup (SoupMessage *msg)
if (io->write_chunk)
soup_buffer_free (io->write_chunk);
+ if (io->delayed_chunk_data)
+ soup_message_body_free (io->delayed_chunk_data);
+
g_slice_free (SoupMessageIOData, io);
}
@@ -207,6 +216,35 @@ io_disconnected (SoupSocket *sock, SoupMessage *msg)
io_error (sock, msg, NULL);
}
+/* Sniffs the spooled body data, emits content-sniffed and then got-chunk with
+ * that data. Returns FALSE when the I/O macros bail out (cancelled or paused). */
+static gboolean
+io_sniff_content (SoupMessage *msg)
+{
+	SoupMessagePrivate *priv = SOUP_MESSAGE_GET_PRIVATE (msg);
+	SoupMessageIOData *io = priv->io_data;
+	SoupBuffer *sniffed_buffer = soup_message_body_flatten (io->delayed_chunk_data);
+	char *sniffed_mime_type;
+	GHashTable *params = NULL;
+
+	io->delay_got_chunks = FALSE;
+	sniffed_mime_type = soup_content_sniffer_sniff (priv->sniffer, msg, sniffed_buffer, &params);
+	SOUP_MESSAGE_IO_PREPARE_FOR_CALLBACK;
+	soup_message_content_sniffed (msg, sniffed_mime_type, params);
+	g_free (sniffed_mime_type);
+	if (params)
+		g_hash_table_destroy (params);
+	SOUP_MESSAGE_IO_RETURN_VAL_IF_CANCELLED_OR_PAUSED (FALSE);
+
+	SOUP_MESSAGE_IO_PREPARE_FOR_CALLBACK;
+	soup_message_got_chunk (msg, sniffed_buffer);
+	soup_buffer_free (sniffed_buffer);
+	soup_message_body_free (io->delayed_chunk_data);
+	io->delayed_chunk_data = NULL;
+	SOUP_MESSAGE_IO_RETURN_VAL_IF_CANCELLED_OR_PAUSED (FALSE);
+	return TRUE;
+}
+
/* Reads data from io->sock into io->read_meta_buf. If @to_blank is
* %TRUE, it reads up until a blank line ("CRLF CRLF" or "LF LF").
* Otherwise, it reads up until a single CRLF or LF.
@@ -294,6 +332,21 @@ read_body_chunk (SoupMessage *msg)
GError *error = NULL;
SoupBuffer *buffer;
+ if (!io->acked_content_sniff_decision) {
+ /* The content sniffer feature decides whether a
+ * message needs to be sniffed while handling
+ * got-headers, but the message may be paused in a
+ * user handler, so we need to make sure the signal is
+ * emitted, or delay_got_chunks is correctly setup
+ * here.
+ */
+ if (priv->should_sniff_content)
+ io->delay_got_chunks = TRUE;
+ else if (priv->sniffer)
+ soup_message_content_sniffed (msg, NULL, NULL);
+ io->acked_content_sniff_decision = TRUE;
+ }
+
while (read_to_eof || io->read_length > 0) {
if (priv->chunk_allocator) {
buffer = priv->chunk_allocator (msg, io->read_length, priv->chunk_allocator_data);
@@ -324,10 +377,24 @@ read_body_chunk (SoupMessage *msg)
io->read_length -= nread;
- SOUP_MESSAGE_IO_PREPARE_FOR_CALLBACK;
- soup_message_got_chunk (msg, buffer);
- soup_buffer_free (buffer);
- SOUP_MESSAGE_IO_RETURN_VAL_IF_CANCELLED_OR_PAUSED (FALSE);
+ if (io->delay_got_chunks) {
+ if (!io->delayed_chunk_data)
+ io->delayed_chunk_data = soup_message_body_new ();
+
+ soup_message_body_append_buffer (io->delayed_chunk_data, buffer);
+ io->delayed_chunk_length += buffer->length;
+
+ /* We already have enough data to perform sniffing, so do it */
+ if (io->delayed_chunk_length > priv->bytes_for_sniffing) {
+ if (!io_sniff_content (msg))
+ return FALSE;
+ }
+ } else {
+ SOUP_MESSAGE_IO_PREPARE_FOR_CALLBACK;
+ soup_message_got_chunk (msg, buffer);
+ soup_buffer_free (buffer);
+ SOUP_MESSAGE_IO_RETURN_VAL_IF_CANCELLED_OR_PAUSED (FALSE);
+ }
continue;
}
@@ -675,6 +742,23 @@ io_read (SoupSocket *sock, SoupMessage *msg)
guint status;
read_more:
+ /* We have delayed chunks, but are no longer delaying, so this
+ * means we already sniffed but the message got paused while
+ * content-sniffed was being handled, in which case we did not
+ * emit the necessary got-chunk; See also the handling for
+ * state SOUP_MESSAGE_IO_STATE_BODY in the switch below.
+ */
+ if (io->delayed_chunk_data && !io->delay_got_chunks) {
+ SoupBuffer *sniffed_buffer = soup_message_body_flatten (io->delayed_chunk_data);
+
+ SOUP_MESSAGE_IO_PREPARE_FOR_CALLBACK;
+ soup_message_got_chunk (msg, sniffed_buffer);
+ soup_buffer_free (sniffed_buffer);
+ soup_message_body_free (io->delayed_chunk_data);
+ io->delayed_chunk_data = NULL;
+ SOUP_MESSAGE_IO_RETURN_IF_CANCELLED_OR_PAUSED;
+ }
+
switch (io->read_state) {
case SOUP_MESSAGE_IO_STATE_NOT_STARTED:
return;
@@ -782,6 +866,39 @@ io_read (SoupSocket *sock, SoupMessage *msg)
return;
got_body:
+ /* A chunk of data may have been read and the emission
+ * of got_chunk delayed because we wanted to wait for
+ * more chunks to arrive, for doing content sniffing,
+ * but the body was too small, so we need to check if
+ * an emission is in order here, along with the
+ * sniffing, if we haven't done it yet, of course.
+ */
+ if (io->delayed_chunk_data) {
+ if (io->delay_got_chunks) {
+ if (!io_sniff_content (msg))
+ return;
+ } else {
+ SoupBuffer *sniffed_buffer = soup_message_body_flatten (io->delayed_chunk_data);
+
+ SOUP_MESSAGE_IO_PREPARE_FOR_CALLBACK;
+ soup_message_got_chunk (msg, sniffed_buffer);
+ soup_buffer_free (sniffed_buffer);
+ soup_message_body_free (io->delayed_chunk_data);
+ io->delayed_chunk_data = NULL;
+
+ /* If we end up returning, read_state
+ * needs to be set to IO_STATE_BODY,
+ * and read_length must be 0; since we
+ * may be coming from STATE_TRAILERS,
+ * or may be doing a read-to-eof, we
+ * sanitize these here.
+ */
+ io->read_state = SOUP_MESSAGE_IO_STATE_BODY;
+ io->read_length = 0;
+ SOUP_MESSAGE_IO_RETURN_IF_CANCELLED_OR_PAUSED;
+ }
+ }
+
io->read_state = SOUP_MESSAGE_IO_STATE_FINISHING;
SOUP_MESSAGE_IO_PREPARE_FOR_CALLBACK;
@@ -885,6 +1002,9 @@ new_iostate (SoupMessage *msg, SoupSocket *sock, SoupMessageIOMode mode,
io->read_state = SOUP_MESSAGE_IO_STATE_NOT_STARTED;
io->write_state = SOUP_MESSAGE_IO_STATE_NOT_STARTED;
+ if (priv->should_sniff_content)
+ io->delay_got_chunks = TRUE;
+
if (priv->io_data)
soup_message_io_cleanup (msg);
priv->io_data = io;
diff --git a/libsoup/soup-message-private.h b/libsoup/soup-message-private.h
index f47251a..999c335 100644
--- a/libsoup/soup-message-private.h
+++ b/libsoup/soup-message-private.h
@@ -9,6 +9,7 @@
#include "soup-message.h"
#include "soup-auth.h"
#include "soup-connection.h"
+#include "soup-content-sniffer.h"
typedef enum {
SOUP_MESSAGE_IO_STATUS_IDLE,
@@ -29,6 +30,10 @@ typedef struct {
guint msg_flags;
gboolean server_side;
+ SoupContentSniffer *sniffer;
+ gboolean should_sniff_content;
+ gsize bytes_for_sniffing;
+
SoupHTTPVersion http_version, orig_http_version;
SoupURI *uri;
diff --git a/libsoup/soup-message.c b/libsoup/soup-message.c
index 5475bb7..f614946 100644
--- a/libsoup/soup-message.c
+++ b/libsoup/soup-message.c
@@ -99,6 +99,7 @@ enum {
GOT_HEADERS,
GOT_CHUNK,
GOT_BODY,
+ CONTENT_SNIFFED,
RESTARTED,
FINISHED,
@@ -402,6 +403,44 @@ soup_message_class_init (SoupMessageClass *message_class)
G_TYPE_NONE, 0);
/**
+ * SoupMessage::content-sniffed:
+ * @msg: the message
+ * @type: the content type that we got from sniffing
+ * @params: a #GHashTable with the parameters
+ *
+ * This signal is emitted after %got-headers, and before the
+ * first %got-chunk. If content sniffing is disabled, or no
+ * content sniffing will be performed, due to the sniffer
+ * deciding to trust the Content-Type sent by the server, this
+ * signal is emitted immediately after %got_headers, and @type
+ * is %NULL.
+ *
+ * If the #SoupContentSniffer feature is enabled, and the
+ * sniffer decided to perform sniffing, the first %got_chunk
+ * emission may be delayed, so that the sniffer has enough
+ * data to correctly sniff the content. It notifies the
+ * library user that the content has been sniffed, and allows
+ * it to change the header contents in the message, if
+ * desired.
+ *
+ * After this signal is emitted, the data that was spooled so
+ * that sniffing could be done is delivered on the first
+ * emission of %got_chunk.
+ *
+ * Since: 2.27.3
+ **/
+ signals[CONTENT_SNIFFED] =
+ g_signal_new ("content_sniffed",
+ G_OBJECT_CLASS_TYPE (object_class),
+ G_SIGNAL_RUN_FIRST,
+ 0,
+ NULL, NULL,
+ soup_marshal_NONE__STRING_BOXED,
+ G_TYPE_NONE, 2,
+ G_TYPE_STRING,
+ G_TYPE_HASH_TABLE);
+
+ /**
* SoupMessage::restarted:
* @msg: the message
*
@@ -858,6 +897,24 @@ soup_message_got_body (SoupMessage *msg)
g_signal_emit (msg, signals[GOT_BODY], 0);
}
+/**
+ * soup_message_content_sniffed:
+ * @msg: a #SoupMessage
+ * @content_type: a string with the sniffed content type
+ * @params: a #GHashTable with the parameters
+ *
+ * Emits the %content_sniffed signal, indicating that the IO layer
+ * finished sniffing the content type for @msg. If content sniffing
+ * will not be performed, due to the sniffer deciding to trust the
+ * Content-Type sent by the server, this signal is emitted immediately
+ * after %got_headers, with %NULL as @content_type.
+ **/
+void
+soup_message_content_sniffed (SoupMessage *msg, const char *content_type, GHashTable *params)
+{
+ g_signal_emit (msg, signals[CONTENT_SNIFFED], 0, content_type, params);
+}
+
static void
restarted (SoupMessage *req)
{
diff --git a/libsoup/soup-message.h b/libsoup/soup-message.h
index 1b850be..b940ac6 100644
--- a/libsoup/soup-message.h
+++ b/libsoup/soup-message.h
@@ -155,6 +155,7 @@ void soup_message_got_informational (SoupMessage *msg);
void soup_message_got_headers (SoupMessage *msg);
void soup_message_got_chunk (SoupMessage *msg, SoupBuffer *chunk);
void soup_message_got_body (SoupMessage *msg);
+void soup_message_content_sniffed (SoupMessage *msg, const char *content_type, GHashTable *params);
void soup_message_restarted (SoupMessage *msg);
void soup_message_finished (SoupMessage *msg);
diff --git a/libsoup/soup.h b/libsoup/soup.h
index 496a4c1..ddb73f7 100644
--- a/libsoup/soup.h
+++ b/libsoup/soup.h
@@ -15,6 +15,7 @@ extern "C" {
#include <libsoup/soup-auth-domain.h>
#include <libsoup/soup-auth-domain-basic.h>
#include <libsoup/soup-auth-domain-digest.h>
+#include <libsoup/soup-content-sniffer.h>
#include <libsoup/soup-cookie.h>
#include <libsoup/soup-cookie-jar.h>
#include <libsoup/soup-cookie-jar-text.h>
diff --git a/tests/Makefile.am b/tests/Makefile.am
index 0d46df5..ca8158d 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -26,6 +26,7 @@ noinst_PROGRAMS = \
redirect-test \
simple-httpd \
simple-proxy \
+ sniffing-test \
streaming-test \
timeout-test \
uri-parsing \
@@ -58,6 +59,7 @@ redirect_test_SOURCES = redirect-test.c $(TEST_SRCS)
server_auth_test_SOURCES = server-auth-test.c $(TEST_SRCS)
simple_httpd_SOURCES = simple-httpd.c
simple_proxy_SOURCES = simple-proxy.c
+sniffing_test_SOURCES = sniffing-test.c $(TEST_SRCS)
ssl_test_SOURCES = ssl-test.c $(TEST_SRCS)
streaming_test_SOURCES = streaming-test.c $(TEST_SRCS)
timeout_test_SOURCES = timeout-test.c $(TEST_SRCS)
@@ -87,6 +89,7 @@ TESTS = \
misc-test \
ntlm-test \
redirect-test \
+ sniffing-test \
streaming-test \
timeout-test \
uri-parsing \
diff --git a/tests/resources/atom.xml b/tests/resources/atom.xml
new file mode 100644
index 0000000..962ecf4
--- /dev/null
+++ b/tests/resources/atom.xml
@@ -0,0 +1,35 @@
+<?xml version="1.0"?>
+<feed xmlns="http://www.w3.org/2005/Atom" xmlns:planet="http://planet.libsouprocks.net/" xmlns:indexing="urn:atom-extension:indexing" indexing:index="no"><access:restriction xmlns:access="http://www.bloglines.com/about/specs/fac-1.0" relationship="deny"/>
+ <title>A small ATOM feed</title>
+ <updated>2009-07-02T10:27:44Z</updated>
+ <generator>kov</generator>
+ <author>
+ <name>Anonymous Coward</name>
+ </author>
+ <id>http://libsoup.rocks/atom.xml</id>
+ <link href="http://libsoup.rocks/atom.xml" rel="self" type="application/atom+xml"/>
+ <link href="http://libsoup.rocks/" rel="alternate"/>
+
+ <entry xml:lang="en">
+ <id>http://libsoup.rocks/so/much/</id>
+ <link href="http://libsoup.rocks/so/much/" rel="alternate" type="text/html"/>
+ <title>One post too many</title>
+ <summary>woo [...]</summary>
+ <content type="xhtml"><div xmlns="http://www.w3.org/1999/xhtml"><p>woohoo</p></div>
+ </content>
+ <updated>2009-07-02T10:38:28Z</updated>
+ <category term="Category1"/>
+ <category term="Personal"/>
+ <author>
+ <name>kov</name>
+ </author>
+ <source>
+ <id>http://libsoup.rocks/blog</id>
+ <link href="http://libsoup.rocks/blog/feed" rel="self" type="application/atom+xml"/>
+ <link href="http://libsoup.rocks/blog" rel="alternate" type="text/html"/>
+ <subtitle>Just stuff to test libsoup</subtitle>
+ <title>Random stuff to test libsoup</title>
+ <updated>2009-07-02T00:38:29Z</updated>
+ </source>
+ </entry>
+</feed>
diff --git a/tests/resources/home.gif b/tests/resources/home.gif
new file mode 100644
index 0000000..55e1d59
Binary files /dev/null and b/tests/resources/home.gif differ
diff --git a/tests/resources/mbox b/tests/resources/mbox
new file mode 100644
index 0000000..929ad2b
--- /dev/null
+++ b/tests/resources/mbox
@@ -0,0 +1,16 @@
+From email here Wed Jun 17 21:20:48 2009
+Return-path: <email here>
+Envelope-to: email here
+Delivery-date: Wed, 17 Jun 2009 21:20:48 -0300
+Received: from email by here.domain with local (Exim 4.69)
+ (envelope-from <email here>)
+ id 1MH5N2-0008Lq-7c
+ for email here; Wed, 17 Jun 2009 21:20:48 -0300
+To: email here
+Subject: This is just so that I have a mailbox
+Message-Id: <E1MH5N2-0008Lq-7c here domain>
+From: A Nice User <email here>
+Date: Wed, 17 Jun 2009 21:20:48 -0300
+
+This is a dumb email.
+
diff --git a/tests/resources/rss20.xml b/tests/resources/rss20.xml
new file mode 100644
index 0000000..d64bdda
--- /dev/null
+++ b/tests/resources/rss20.xml
@@ -0,0 +1,26 @@
+<?xml version="1.0"?>
+<rss version="2.0">
+
+<channel>
+ <title>A small RSS</title>
+ <link>http://libsoup.rocks/</link>
+ <language>en</language>
+ <description>A small RSS to test libsoup</description>
+
+<item>
+ <title>One post too many</title>
+ <guid isPermaLink="true">http://libsoup.rocks/so/much/</guid>
+ <link>http://libsoup.rocks/so/much/</link>
+ <description><p>woohoo</p></description>
+ <pubDate>Wed, 02 Jul 2009 10:26:28 +0000</pubDate>
+</item>
+<item>
+ <title>GCDS will rock</title>
+ <guid isPermaLink="true">http://libsoup.rocks/so/much/again/</guid>
+ <link>http://libsoup.rocks/so/much/again/</link>
+ <description><p>I mean, really.</p></description>
+ <pubDate>Wed, 02 Jul 2009 10:26:28 +0000</pubDate>
+</item>
+
+</channel>
+</rss>
diff --git a/tests/resources/test.html b/tests/resources/test.html
new file mode 100644
index 0000000..5a6cc0c
--- /dev/null
+++ b/tests/resources/test.html
@@ -0,0 +1,10 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html>
+<head>
+<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
+<title></title>
+</head>
+<body>
+<h1>GNOME!</h1>
+</body>
+</html>
diff --git a/tests/sniffing-test.c b/tests/sniffing-test.c
new file mode 100644
index 0000000..ad2690f
--- /dev/null
+++ b/tests/sniffing-test.c
@@ -0,0 +1,429 @@
+/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
+/*
+ * Copyright (C) 2009 Gustavo Noronha Silva <gns gnome org>.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <libsoup/soup.h>
+
+#include "test-utils.h"
+
+/* Client session used by every request in this test; assigned in main(). */
+SoupSession *session;
+/* Base URI that the per-test paths are resolved against. */
+SoupURI *base_uri;
+/* Chunks collected by got_chunk(); created there on demand and reset to
+ * NULL by do_signals_test() once it has flattened them.
+ */
+SoupMessageBody *chunk_data;
+
+/* Reads the file in resources/ named by the last component of @path into
+ * a newly allocated buffer returned through @contents / @length.
+ * A read failure is reported with g_error(), which terminates the test
+ * program; the exit() call is kept only as a backstop.
+ */
+static void
+load_resource (const char *path, char **contents, gsize *length)
+{
+	GError *error = NULL;
+	char *base_name = g_path_get_basename (path);
+	char *file_name = g_strdup_printf ("resources/%s", base_name);
+
+	g_file_get_contents (file_name, contents, length, &error);
+
+	g_free (base_name);
+	g_free (file_name);
+
+	if (error) {
+		g_error ("%s", error->message);
+		g_error_free (error);
+		exit (1);
+	}
+}
+
+/* Test server handler. Only GET is accepted. The first path component
+ * selects the Content-Type sent with the file named by the last one:
+ *
+ *   /mbox, /text_or_binary/FILE  -> text/plain
+ *   /unknown/FILE                -> UNKNOWN/unknown
+ *   /type/MAJOR_MINOR/FILE       -> MAJOR/MINOR (last '_' becomes '/')
+ *   /multiple_headers/FILE       -> text/xml, followed by a second
+ *                                   Content-Type header of text/plain
+ *
+ * A "chunked=yes" query parameter switches the response to chunked
+ * encoding. A /type/ path whose type component contains no '_' is
+ * answered with 400 Bad Request instead of dereferencing NULL.
+ */
+static void
+server_callback (SoupServer *server, SoupMessage *msg,
+		 const char *path, GHashTable *query,
+		 SoupClientContext *context, gpointer data)
+{
+	const char *chunked;
+	char *contents;
+	gsize length;
+
+	if (msg->method != SOUP_METHOD_GET) {
+		soup_message_set_status (msg, SOUP_STATUS_NOT_IMPLEMENTED);
+		return;
+	}
+
+	soup_message_set_status (msg, SOUP_STATUS_OK);
+
+	if (query) {
+		chunked = g_hash_table_lookup (query, "chunked");
+		if (chunked && g_str_equal (chunked, "yes"))
+			soup_message_headers_set_encoding (msg->response_headers,
+							   SOUP_ENCODING_CHUNKED);
+	}
+
+	if (!strcmp (path, "/mbox") ||
+	    g_str_has_prefix (path, "/text_or_binary/")) {
+		load_resource (path, &contents, &length);
+		soup_message_set_response (msg, "text/plain",
+					   SOUP_MEMORY_TAKE,
+					   contents,
+					   length);
+	} else if (g_str_has_prefix (path, "/unknown/")) {
+		load_resource (path, &contents, &length);
+		soup_message_set_response (msg, "UNKNOWN/unknown",
+					   SOUP_MEMORY_TAKE,
+					   contents,
+					   length);
+	} else if (g_str_has_prefix (path, "/type/")) {
+		char **components = g_strsplit (path, "/", 4);
+		char *separator = NULL;
+
+		/* Hack to allow passing type in the URI */
+		if (components[2])
+			separator = g_strrstr (components[2], "_");
+
+		if (!separator) {
+			/* No '_' to turn into '/': the request is malformed. */
+			soup_message_set_status (msg, SOUP_STATUS_BAD_REQUEST);
+			g_strfreev (components);
+			return;
+		}
+		*separator = '/';
+
+		load_resource (path, &contents, &length);
+		soup_message_set_response (msg, components[2],
+					   SOUP_MEMORY_TAKE,
+					   contents,
+					   length);
+
+		g_strfreev (components);
+	} else if (g_str_has_prefix (path, "/multiple_headers/")) {
+		load_resource (path, &contents, &length);
+		soup_message_set_response (msg, "text/xml",
+					   SOUP_MEMORY_TAKE,
+					   contents,
+					   length);
+
+		/* Second Content-Type header, sent after the text/xml one. */
+		soup_message_headers_append (msg->response_headers,
+					     "Content-Type", "text/plain");
+	}
+}
+
+/* Idle callback: resumes the message passed as @data on the global
+ * session. Returns FALSE so the idle source is not run again.
+ */
+static gboolean
+unpause_msg (gpointer data)
+{
+	SoupMessage *paused = data;
+
+	soup_session_unpause_message (session, paused);
+	return FALSE;
+}
+
+
+static void
+content_sniffed (SoupMessage *msg, char *content_type, GHashTable *params, gpointer data)
+{
+ gboolean should_pause = GPOINTER_TO_INT (data);
+
+ if (g_object_get_data (G_OBJECT (msg), "got-chunk")) {
+ debug_printf (1, " got-chunk got emitted before content-sniffed\n");
+ errors++;
+ }
+
+ g_object_set_data (G_OBJECT (msg), "content-sniffed", GINT_TO_POINTER (TRUE));
+
+ if (should_pause) {
+ soup_session_pause_message (session, msg);
+ g_idle_add (unpause_msg, msg);
+ }
+}
+
+static void
+got_headers (SoupMessage *msg, gpointer data)
+{
+ gboolean should_pause = GPOINTER_TO_INT (data);
+
+ if (g_object_get_data (G_OBJECT (msg), "content-sniffed")) {
+ debug_printf (1, " content-sniffed got emitted before got-headers\n");
+ errors++;
+ }
+
+ g_object_set_data (G_OBJECT (msg), "got-headers", GINT_TO_POINTER (TRUE));
+
+ if (should_pause) {
+ soup_session_pause_message (session, msg);
+ g_idle_add (unpause_msg, msg);
+ }
+}
+
+/* got-chunk handler used by do_signals_test(). Records "got-chunk" on
+ * @msg. @data carries the should_accumulate flag: when it is zero the
+ * chunk is appended to the global chunk_data, which is created on first
+ * use.
+ */
+static void
+got_chunk (SoupMessage *msg, SoupBuffer *chunk, gpointer data)
+{
+	gboolean body_accumulates = GPOINTER_TO_INT (data);
+
+	g_object_set_data (G_OBJECT (msg), "got-chunk", GINT_TO_POINTER (TRUE));
+
+	if (body_accumulates)
+		return;
+
+	if (chunk_data == NULL)
+		chunk_data = soup_message_body_new ();
+	soup_message_body_append_buffer (chunk_data, chunk);
+}
+
+/* Completion callback passed to soup_session_queue_message(): quits the
+ * GMainLoop supplied as @data.
+ */
+static void
+finished (SoupSession *session, SoupMessage *msg, gpointer data)
+{
+	g_main_loop_quit ((GMainLoop *) data);
+}
+
+/* Fetches /mbox with the got-headers, got-chunk and content_sniffed
+ * handlers attached, then checks whether content_sniffed fired as
+ * expected and that the downloaded body equals resources/mbox.
+ *
+ * @should_content_sniff: whether content_sniffed is expected to fire
+ * @should_pause: handlers pause the message and resume it from idle
+ * @should_accumulate: whether msg->response_body accumulates; when FALSE
+ *   the body is rebuilt from the chunks gathered in chunk_data
+ * @chunked_encoding: request chunked encoding via "?chunked=yes"
+ */
+static void
+do_signals_test (gboolean should_content_sniff,
+		 gboolean should_pause,
+		 gboolean should_accumulate,
+		 gboolean chunked_encoding)
+{
+	SoupURI *uri = soup_uri_new_with_base (base_uri, "/mbox");
+	SoupMessage *msg;
+	GMainLoop *loop = g_main_loop_new (NULL, TRUE);
+	char *contents;
+	gsize length;
+	GError *error = NULL;
+	SoupBuffer *body;
+
+	/* The query has to be in place before the message is created:
+	 * the message keeps its own copy of the URI, so changing @uri
+	 * afterwards would never reach the server.
+	 */
+	if (chunked_encoding)
+		soup_uri_set_query (uri, "chunked=yes");
+
+	msg = soup_message_new_from_uri ("GET", uri);
+
+	soup_message_body_set_accumulate (msg->response_body, should_accumulate);
+
+	g_object_connect (msg,
+			  "signal::got-headers", got_headers, GINT_TO_POINTER (should_pause),
+			  "signal::got-chunk", got_chunk, GINT_TO_POINTER (should_accumulate),
+			  "signal::content_sniffed", content_sniffed, GINT_TO_POINTER (should_pause),
+			  NULL);
+
+	/* Keep our own reference so msg can be inspected after the loop. */
+	g_object_ref (msg);
+	soup_session_queue_message (session, msg, finished, loop);
+
+	g_main_loop_run (loop);
+
+	if (!should_content_sniff &&
+	    g_object_get_data (G_OBJECT (msg), "content-sniffed")) {
+		debug_printf (1, " content-sniffed got emitted without a sniffer\n");
+		errors++;
+	} else if (should_content_sniff &&
+		   !g_object_get_data (G_OBJECT (msg), "content-sniffed")) {
+		debug_printf (1, " content-sniffed did not get emitted\n");
+		errors++;
+	}
+
+	g_file_get_contents ("resources/mbox",
+			     &contents, &length,
+			     &error);
+
+	if (error) {
+		g_error ("%s", error->message);
+		g_error_free (error);
+		exit (1);
+	}
+
+	if (!should_accumulate) {
+		/* got-chunk may never have fired; compare against an empty
+		 * body in that case instead of flattening NULL.
+		 */
+		if (!chunk_data)
+			chunk_data = soup_message_body_new ();
+		body = soup_message_body_flatten (chunk_data);
+		soup_message_body_free (chunk_data);
+		chunk_data = NULL;
+	} else
+		body = soup_message_body_flatten (msg->response_body);
+
+	if (body->length != length) {
+		debug_printf (1, " lengths do not match\n");
+		errors++;
+	} else if (memcmp (body->data, contents, length)) {
+		/* Only compared when the lengths agree, so memcmp never
+		 * reads past the end of the shorter buffer.
+		 */
+		debug_printf (1, " downloaded data does not match\n");
+		errors++;
+	}
+
+	g_free (contents);
+	soup_buffer_free (body);
+
+	soup_uri_free (uri);
+	g_object_unref (msg);
+	g_main_loop_unref (loop);
+}
+
+static void
+sniffing_content_sniffed (SoupMessage *msg, char *content_type, GHashTable *params, gpointer data)
+{
+ char *expected_type = (char*)data;
+
+ if (strcmp (content_type, expected_type)) {
+ debug_printf (1, " sniffing failed! expected %s, got %s\n",
+ expected_type, content_type);
+ errors++;
+ }
+}
+
+/* Requests @path from the test server and lets
+ * sniffing_content_sniffed() compare the sniffed type against
+ * @expected_type. Runs a main loop until the request has finished.
+ */
+static void
+test_sniffing (const char *path, const char *expected_type)
+{
+	GMainLoop *loop = g_main_loop_new (NULL, TRUE);
+	SoupURI *uri = soup_uri_new_with_base (base_uri, path);
+	SoupMessage *msg = soup_message_new_from_uri ("GET", uri);
+
+	g_signal_connect (msg, "content_sniffed",
+			  G_CALLBACK (sniffing_content_sniffed),
+			  (gpointer) expected_type);
+
+	/* Hold a reference of our own across the queued request. */
+	g_object_ref (msg);
+	soup_session_queue_message (session, msg, finished, loop);
+	g_main_loop_run (loop);
+
+	soup_uri_free (uri);
+	g_object_unref (msg);
+	g_main_loop_unref (loop);
+}
+
+/* Test driver. Starts the test server, runs the signal-ordering tests
+ * first without and then with a SoupContentSniffer on the session, then
+ * runs the per-type sniffing checks. Returns non-zero if any check
+ * recorded an error.
+ */
+int
+main (int argc, char **argv)
+{
+	SoupServer *server;
+	SoupContentSniffer *sniffer;
+
+	test_init (argc, argv, NULL);
+
+	server = soup_test_server_new (TRUE);
+	soup_server_add_handler (server, NULL, server_callback, NULL, NULL);
+	base_uri = soup_uri_new ("http://127.0.0.1/");
+	soup_uri_set_port (base_uri, soup_server_get_port (server));
+
+	session = soup_session_async_new ();
+
+	/* No sniffer, no content_sniffed should be emitted */
+	do_signals_test (FALSE, FALSE, FALSE, FALSE);
+	do_signals_test (FALSE, FALSE, FALSE, TRUE);
+	do_signals_test (FALSE, FALSE, TRUE, FALSE);
+	do_signals_test (FALSE, FALSE, TRUE, TRUE);
+
+	do_signals_test (FALSE, TRUE, TRUE, FALSE);
+	do_signals_test (FALSE, TRUE, TRUE, TRUE);
+	do_signals_test (FALSE, TRUE, FALSE, FALSE);
+	do_signals_test (FALSE, TRUE, FALSE, TRUE);
+
+	sniffer = soup_content_sniffer_new ();
+	soup_session_add_feature (session, (SoupSessionFeature*)sniffer);
+	/* The session holds its own reference to the feature; drop the
+	 * one returned by soup_content_sniffer_new() so it is not leaked.
+	 */
+	g_object_unref (sniffer);
+
+	/* Now, with a sniffer, content_sniffed must be emitted after
+	 * got-headers, and before got-chunk.
+	 */
+	do_signals_test (TRUE, FALSE, FALSE, FALSE);
+	do_signals_test (TRUE, FALSE, FALSE, TRUE);
+	do_signals_test (TRUE, FALSE, TRUE, FALSE);
+	do_signals_test (TRUE, FALSE, TRUE, TRUE);
+
+	do_signals_test (TRUE, TRUE, TRUE, FALSE);
+	do_signals_test (TRUE, TRUE, TRUE, TRUE);
+	do_signals_test (TRUE, TRUE, FALSE, FALSE);
+	do_signals_test (TRUE, TRUE, FALSE, TRUE);
+
+	/* Test the text_or_binary sniffing path */
+
+	/* GIF is a 'safe' type */
+	test_sniffing ("/text_or_binary/home.gif", "image/gif");
+
+	/* With our current code, no sniffing is done using GIO, so
+	 * the mbox will be identified as text/plain; should we change
+	 * this?
+	 */
+	test_sniffing ("/text_or_binary/mbox", "text/plain");
+
+	/* HTML is considered unsafe for this algorithm, since it is
+	 * scriptable, so going from text/plain to text/html is
+	 * considered 'privilege escalation'
+	 */
+	test_sniffing ("/text_or_binary/test.html", "text/plain");
+
+	/* Test the unknown sniffing path */
+
+	test_sniffing ("/unknown/test.html", "text/html");
+	test_sniffing ("/unknown/home.gif", "image/gif");
+	test_sniffing ("/unknown/mbox", "application/mbox");
+
+	/* Test the XML sniffing path */
+
+	test_sniffing ("/type/text_xml/home.gif", "text/xml");
+	test_sniffing ("/type/anice_type+xml/home.gif", "anice/type+xml");
+	test_sniffing ("/type/application_xml/home.gif", "application/xml");
+
+	/* Test the image sniffing path */
+
+	test_sniffing ("/type/image_png/home.gif", "image/gif");
+
+	/* Test the feed or html path */
+
+	test_sniffing ("/type/text_html/test.html", "text/html");
+	test_sniffing ("/type/text_html/rss20.xml", "application/rss+xml");
+	test_sniffing ("/type/text_html/atom.xml", "application/atom+xml");
+
+	/* The spec tells us to only use the last Content-Type header */
+
+	test_sniffing ("/multiple_headers/home.gif", "image/gif");
+
+	soup_uri_free (base_uri);
+
+	/* Release the session created above before the common cleanup. */
+	soup_session_abort (session);
+	g_object_unref (session);
+
+	test_cleanup ();
+	return errors != 0;
+}
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]