alternative gmarkup parser



Hi,

I reimplemented GMarkup a different way, using a state machine type of
thing. The new implementation allows incremental parsing; I also
swapped in an expat-style vtable API instead of the thing with the
nodes. This API is more general-purpose/flexible and has fewer entry
points. For many applications I think it's also simpler to use, though
it may not seem that way in theory; the test program was easier to
write using it, and I expect it to be easier to write the rich text
parser thing this way also. The nodes result in a lot of tedious list
walking and switching on node type that can be avoided with the
vtable; also the vtable allows you to signal errors as the markup is
parsed, instead of having to walk the nodes after the fact and
validate them.

Owen this is where I'm wanting the is_first_byte_in_char() macro, grep
for 0x80, see what you think about alternative ways to do the code.

Yes, this code contains a 750-line function. ;-) I'll split it up...

If we keep the old API with the nodes, I still want to swap in this
implementation instead of the old implementation.

Havoc

/* gmarkup.h - Simple XML-like string parser/writer
 *
 *  Copyright 2000 Red Hat, Inc.
 *
 * GLib is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * GLib is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GLib; see the file COPYING.LIB.  If not,
 * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 *   Boston, MA 02111-1307, USA.
 */

#ifndef __G_MARKUP_H__
#define __G_MARKUP_H__

#include <gerror.h>

#ifdef __cplusplus
extern "C"
{
#endif


typedef enum
{
  G_MARKUP_ERROR_BAD_UTF8,
  G_MARKUP_ERROR_EMPTY,
  G_MARKUP_ERROR_PARSE,
  /* These three are primarily intended for specific GMarkupParser
   * implementations to set.
   */
  G_MARKUP_ERROR_UNKNOWN_ELEMENT,
  G_MARKUP_ERROR_UNKNOWN_ATTRIBUTE,
  G_MARKUP_ERROR_INVALID_CONTENT
} GMarkupErrorType;

#define G_MARKUP_ERROR g_markup_error_quark ()

GQuark g_markup_error_quark ();

typedef enum
{
  /* Hmm, can't think of any at the moment */
  G_MARKUP_FOO = 1 << 0
  
} GMarkupParseFlags;

typedef struct _GMarkupParseContext GMarkupParseContext;
typedef struct _GMarkupParser GMarkupParser;

struct _GMarkupParser
{
  /* Called for open tags <foo bar="baz"> */
  void (*start_element)  (GMarkupParseContext *context,
                          const gchar         *element_name,
                          const gchar        **attribute_names,
                          const gchar        **attribute_values,
                          gpointer             user_data,
                          GError             **error);

  /* Called for close tags </foo> */
  void (*end_element)    (GMarkupParseContext *context,
                          const gchar         *element_name,
                          gpointer             user_data,
                          GError             **error);

  /* Called for character data */
  /* text is not nul-terminated */
  void (*text)           (GMarkupParseContext *context,
                          const gchar         *text,
                          gint                 text_len,
                          gpointer             user_data,
                          GError             **error);

  /* Called for strings that should be re-saved verbatim in this same
   * position, but are not otherwise interpretable.  At the moment
   * this includes comments and processing instructions.
   */
  /* text is not nul-terminated. */
  void (*passthrough)    (GMarkupParseContext *context,
                          const gchar         *passthrough_text,
                          gint                 text_len,
                          gpointer             user_data,
                          GError             **error);

  /* Called on error, including one set by other
   * methods in the vtable. The GError should not be freed.
   */
  void (*error)          (GMarkupParseContext *context,
                          GError              *error,
                          gpointer             user_data);
};

GMarkupParseContext *g_markup_parse_context_new   (const GMarkupParser *parser,
                                                   GMarkupParseFlags    flags,
                                                   gpointer             user_data,
                                                   GDestroyNotify       user_data_dnotify);
void                 g_markup_parse_context_free  (GMarkupParseContext *context);
gboolean             g_markup_parse_context_parse (GMarkupParseContext *context,
                                                   const gchar         *text,
                                                   gint                 text_len,
                                                   GError             **error);
                                                   
gboolean             g_markup_parse_context_end_parse (GMarkupParseContext *context,
                                                       GError             **error);

/* useful when saving */
gchar* g_markup_escape_text (const gchar *text,
                             gint         length);


#ifdef __cplusplus
}
#endif

#endif /* __G_MARKUP_H__ */

/* gmarkup.c - Simple XML-like parser
 *
 *  Copyright 2000 Red Hat, Inc.
 *
 * GLib is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * GLib is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GLib; see the file COPYING.LIB.  If not,
 * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 *   Boston, MA 02111-1307, USA.
 */

#include "glib.h"

#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>

/* FIXME */
#define _(x) x

GQuark
g_markup_error_quark ()
{
  static GQuark error_quark = 0;

  if (error_quark == 0)
    error_quark = g_quark_from_static_string ("g-markup-error-quark");

  return error_quark;
}

typedef struct _GMarkupAttribute GMarkupAttribute;

struct _GMarkupAttribute
{
  gchar *name;
  gchar *value;
};

static GMarkupAttribute*
attribute_new (const gchar *name, const gchar *value)
{
  GMarkupAttribute *attr;

  attr = g_new (GMarkupAttribute, 1);

  /* name/value are allowed to be NULL */
  attr->name = g_strdup (name);
  attr->value = g_strdup (value);

  return attr;
}

static void
attribute_free (GMarkupAttribute *attr)
{
  g_free (attr->name);
  g_free (attr->value);
  g_free (attr);
}

typedef enum
{
  STATE_START,
  STATE_AFTER_OPEN_ANGLE,
  STATE_AFTER_CLOSE_ANGLE,
  STATE_AFTER_ELISION_SLASH, /* the slash that obviates need for end element */
  STATE_INSIDE_OPEN_TAG_NAME,
  STATE_INSIDE_ATTRIBUTE_NAME,
  STATE_BETWEEN_ATTRIBUTES,
  STATE_AFTER_ATTRIBUTE_EQUALS_SIGN,
  STATE_INSIDE_ATTRIBUTE_VALUE,
  STATE_INSIDE_TEXT,
  STATE_AFTER_CLOSE_TAG_SLASH,
  STATE_INSIDE_CLOSE_TAG_NAME,
  STATE_INSIDE_PASSTHROUGH,
  STATE_ERROR
} GMarkupParseState;

struct _GMarkupParseContext
{
  const GMarkupParser *parser;

  GMarkupParseFlags flags;

  gint line_number;
  gint char_number;

  gpointer user_data;
  GDestroyNotify dnotify;

  /* A piece of character data or an element that
   * hasn't "ended" yet so we haven't yet called
   * the callback for it.
   */
  GString *partial_chunk;

  GMarkupParseState state;
  GSList *tag_stack;
  GSList *attributes;

  const gchar *current_text;
  gint         current_text_len;
  const gchar *current_text_end;

  GString *leftover_char_portion;

  /* used to save the start of the last interesting thingy */
  const gchar *start;

  const gchar *iter;

  guint document_empty : 1;
  guint parsing : 1;
};

GMarkupParseContext *
g_markup_parse_context_new (const GMarkupParser *parser,
                            GMarkupParseFlags    flags,
                            gpointer             user_data,
                            GDestroyNotify       user_data_dnotify)
{
  GMarkupParseContext *context;

  g_return_val_if_fail (parser != NULL, NULL);

  context = g_new (GMarkupParseContext, 1);

  context->parser = parser;
  context->flags = flags;
  context->user_data = user_data;
  context->dnotify = user_data_dnotify;

  context->line_number = 1;
  context->char_number = 1;

  context->partial_chunk = NULL;

  context->state = STATE_START;
  context->tag_stack = NULL;
  context->attributes = NULL;

  context->current_text = NULL;
  context->current_text_len = -1;
  context->current_text_end = NULL;
  context->leftover_char_portion = NULL;

  context->start = NULL;
  context->iter = NULL;

  context->document_empty = TRUE;
  context->parsing = FALSE;

  return context;
}

void
g_markup_parse_context_free (GMarkupParseContext *context)
{
  g_return_if_fail (context != NULL);
  g_return_if_fail (!context->parsing);

  if (context->dnotify)
    (* context->dnotify) (context->user_data);

  g_slist_foreach (context->attributes, (GFunc)attribute_free, NULL);
  g_slist_free (context->attributes);

  g_slist_foreach (context->tag_stack, (GFunc)g_free, NULL);
  g_slist_free (context->tag_stack);

  if (context->partial_chunk)
    g_string_free (context->partial_chunk, TRUE);

  if (context->leftover_char_portion)
    g_string_free (context->leftover_char_portion, TRUE);

  g_free (context);
}

static void
attribute_list_to_arrays (GSList  *attributes,
                          gchar ***namesp,
                          gchar ***valuesp,
                          gint    *n_attributes)
{
  GSList *tmp_list;
  gint len;
  gchar **names;
  gchar **values;
  gint i;

  len = g_slist_length (attributes);

  if (namesp)
    {
      names = g_new (gchar*, len + 1);
      names[len] = NULL;
    }
  else
    names = NULL;

  if (valuesp)
    {
      values = g_new (gchar*, len + 1);
      values[len] = NULL;
    }
  else
    values = NULL;

  i = 0;
  tmp_list = attributes;
  while (tmp_list)
    {
      GMarkupAttribute *attr = tmp_list->data;

      g_assert (i < len + 1);

      if (namesp)
        names[i] = g_strdup (attr->name);

      if (valuesp)
        values[i] = g_strdup (attr->value);

      tmp_list = g_slist_next (tmp_list);
      ++i;
    }

  if (n_attributes)
    *n_attributes = len;

  if (namesp)
    *namesp = names;

  if (valuesp)
    *valuesp = values;
}

static void
mark_error (GMarkupParseContext *context,
            GError              *error)
{
  context->state = STATE_ERROR;

  if (context->parser->error)
    (*context->parser->error) (context, error, context->user_data);
}

static void
set_error (GMarkupParseContext *context,
           GError             **error,
           GMarkupErrorType     code,
           const gchar         *format,
           ...)
{
  GError *tmp_error;
  gchar *s;
  va_list args;

  va_start (args, format);
  s = g_strdup_vprintf (format, args);
  va_end (args);

  tmp_error = g_error_new (G_MARKUP_ERROR,
                           code,
                           _("Error on line %d char %d: %s"),
                           context->line_number,
                           context->char_number,
                           s);

  g_free (s);

  mark_error (context, tmp_error);

  g_propagate_error (error, tmp_error);
}

static gboolean
is_name_start_char (gunichar c)
{
  if (g_unichar_isalpha (c) ||
      c == '_' ||
      c == ':')
    return TRUE;
  else
    return FALSE;
}

static gboolean
is_name_char (gunichar c)
{
  if (g_unichar_isalnum (c) ||
      c == '.' ||
      c == '-' ||
      c == '_' ||
      c == ':')
    return TRUE;
  else
    return FALSE;
}


static gchar*
char_str (gunichar c,
          gchar   *buf)
{
  memset (buf, 0, 7);
  g_unichar_to_utf8 (c, buf);
  return buf;
}

static gchar*
utf8_str (const gchar *utf8,
          gchar       *buf)
{
  char_str (g_utf8_get_char (utf8), buf);
  return buf;
}

static void
set_unescape_error (GMarkupParseContext *context,
                    GError             **error,
                    const gchar         *remaining_text,
                    const gchar         *remaining_text_end,
                    GMarkupErrorType     code,
                    const gchar         *format,
                    ...)
{
  GError *tmp_error;
  gchar *s;
  va_list args;
  gint remaining_newlines;
  const gchar *p;

  remaining_newlines = 0;
  p = remaining_text;
  while (p != remaining_text_end)
    {
      if (*p == '\n')
        ++remaining_newlines;
      ++p;
    }

  va_start (args, format);
  s = g_strdup_vprintf (format, args);
  va_end (args);

  tmp_error = g_error_new (G_MARKUP_ERROR,
                           code,
                           _("Error on line %d: %s"),
                           context->line_number - remaining_newlines,
                           s);

  g_free (s);

  mark_error (context, tmp_error);

  g_propagate_error (error, tmp_error);
}

typedef enum
{
  USTATE_INSIDE_TEXT,
  USTATE_AFTER_AMPERSAND,
  USTATE_INSIDE_ENTITY_NAME,
  USTATE_AFTER_CHARREF_HASH
} UnescapeState;

static gboolean
unescape_text (GMarkupParseContext *context,
               const gchar         *text,
               const gchar         *text_end,
               gchar              **unescaped,
               GError             **error)
{
#define MAX_ENT_LEN 5
  GString *str;
  const gchar *p;
  UnescapeState state;
  const gchar *start;

  str = g_string_new ("");

  state = USTATE_INSIDE_TEXT;
  p = text;
  start = p;
  while (p != text_end && context->state != STATE_ERROR)
    {
      g_assert (p < text_end);
      
      switch (state)
        {
        case USTATE_INSIDE_TEXT:
          {
            while (p != text_end && *p != '&')
              p = g_utf8_next_char (p);

            if (p != start)
              {
                g_string_append_len (str, start, p - start);

                start = NULL;
              }
            
            if (p != text_end && *p == '&')
              {
                p = g_utf8_next_char (p);
                state = USTATE_AFTER_AMPERSAND;
              }
          }
          break;

        case USTATE_AFTER_AMPERSAND:
          {
            if (*p == '#')
              {
                p = g_utf8_next_char (p);

                start = p;
                state = USTATE_AFTER_CHARREF_HASH;
              }
            else if (!is_name_start_char (g_utf8_get_char (p)))
              {
                if (*p == ';')
                  {
                    set_unescape_error (context, error,
                                        p, text_end,
                                        G_MARKUP_ERROR_PARSE,
                                        _("Empty entity '&;' seen; valid "
                                          "entities are: &amp; &quot; &lt; &gt; &apos;"));
                  }
                else
                  {
                    gchar buf[7];

                    set_unescape_error (context, error,
                                        p, text_end,
                                        G_MARKUP_ERROR_PARSE,
                                        _("Character '%s' is not valid at "
                                          "the start of an entity name; "
                                          "the & character begins an entity; "
                                          "if this ampersand isn't supposed "
                                          "to be an entity, escape it as "
                                          "&amp;"),
                                        utf8_str (p, buf));
                  }
              }
            else
              {
                start = p;
                state = USTATE_INSIDE_ENTITY_NAME;
              }
          }
          break;


        case USTATE_INSIDE_ENTITY_NAME:
          {
            gchar buf[MAX_ENT_LEN+1] = {
              '\0', '\0', '\0', '\0', '\0', '\0'
            };
            gchar *dest;

            while (p != text_end)
              {
                if (*p == ';')
                  break;
                else if (!is_name_char (*p))
                  {
                    gchar ubuf[7];

                    set_unescape_error (context, error,
                                        p, text_end,
                                        G_MARKUP_ERROR_PARSE,
                                        _("Character '%s' is not valid "
                                          "inside an entity name"),
                                        utf8_str (p, ubuf));
                    break;
                  }

                p = g_utf8_next_char (p);
              }

            if (context->state != STATE_ERROR &&
                p != text_end)
              {
                const gchar *src;
                
                src = start;
                dest = buf;
                while (src != p)
                  {
                    *dest = *src;
                    ++dest;
                    ++src;
                  }

                /* move to after semicolon */
                p = g_utf8_next_char (p);
                start = p;
                state = USTATE_INSIDE_TEXT;

                if (strcmp (buf, "lt") == 0)
                  g_string_append_c (str, '<');
                else if (strcmp (buf, "gt") == 0)
                  g_string_append_c (str, '>');
                else if (strcmp (buf, "amp") == 0)
                  g_string_append_c (str, '&');
                else if (strcmp (buf, "quot") == 0)
                  g_string_append_c (str, '"');
                else if (strcmp (buf, "apos") == 0)
                  g_string_append_c (str, '\'');
                else
                  {
                    set_unescape_error (context, error,
                                        p, text_end,
                                        G_MARKUP_ERROR_PARSE,
                                        _("Entity name '%s' is not known"),
                                        buf);
                  }
              }
            else
              {
                set_unescape_error (context, error,
                                    /* give line number of the & */
                                    start, text_end,
                                    G_MARKUP_ERROR_PARSE,
                                    _("Entity did not end with a semicolon; "
                                      "most likely you used an ampersand "
                                      "character without intending to start "
                                      "an entity - escape ampersand as &amp;"));
              }
          }
          break;

        case USTATE_AFTER_CHARREF_HASH:
          {
            gboolean is_hex = FALSE;
            if (*p == 'x')
              {
                is_hex = TRUE;
                p = g_utf8_next_char (p);
                start = p;
              }

            while (p != text_end && *p != ';')
              p = g_utf8_next_char (p);

            if (p != text_end)
              {
                g_assert (*p == ';');

                /* digit is between start and p */

                if (start != p)
                  {
                    gchar *digit = g_strndup (start, p - start);
                    gulong l;
                    gchar *end = NULL;

                    errno = 0;
                    if (is_hex)
                      l = strtoul (digit, &end, 16);
                    else
                      l = strtoul (digit, &end, 10);

                    if (errno != 0)
                      {
                        set_unescape_error (context, error,
                                            start, text_end,
                                            G_MARKUP_ERROR_PARSE,
                                            _("Failed to parse '%s', which "
                                              "should have been a digit "
                                              "inside a character reference "
                                              "(&#234; for example) - perhaps "
                                              "the digit is too large"),
                                            digit);
                      }
                    else
                      {
                        /* characters XML permits */
                        if (l == 0x9 ||
                            l == 0xA ||
                            l == 0xD ||
                            (l >= 0x20 && l <= 0xD7FF) ||
                            (l >= 0xE000 && l <= 0xFFFD) ||
                            (l >= 0x10000 && l <= 0x10FFFF))
                          {
                            gchar buf[7];
                            g_string_append (str,
                                             char_str (l, buf));
                          }
                        else
                          {
                            set_unescape_error (context, error,
                                                start, text_end,
                                                G_MARKUP_ERROR_PARSE,
                                                _("Character reference %s does not encode a permitted character"),
                                                digit);
                          }
                      }

                    g_free (digit);

                    /* Move to next state */
                    p = g_utf8_next_char (p); /* past semicolon */
                    start = p;
                    state = USTATE_INSIDE_TEXT;
                  }
                else
                  {
                    set_unescape_error (context, error,
                                        start, text_end,
                                        G_MARKUP_ERROR_PARSE,
                                        _("Empty character reference; "
                                          "should include a digit such as "
                                          "&#454;"));
                  }
              }
            else
              {
                set_unescape_error (context, error,
                                    start, text_end,
                                    G_MARKUP_ERROR_PARSE,
                                    _("Character reference did not end with a "
                                      "semicolon; "
                                      "most likely you used an ampersand "
                                      "character without intending to start "
                                      "an entity - escape ampersand as &amp;"));
              }
          }
          break;

        default:
          g_assert_not_reached ();
          break;
        }
    }

  /* If no errors, we should have returned to USTATE_INSIDE_TEXT */
  g_assert (context->state == STATE_ERROR ||
            state == USTATE_INSIDE_TEXT);

  if (context->state == STATE_ERROR)
    {
      g_string_free (str, TRUE);
      *unescaped = NULL;
      return FALSE;
    }
  else
    {
      *unescaped = g_string_free (str, FALSE);
      return TRUE;
    }

#undef MAX_ENT_LEN
}

static gboolean
advance_char (GMarkupParseContext *context)
{

  context->iter = g_utf8_next_char (context->iter);
  context->char_number += 1;
  if (*context->iter == '\n')
    {
      context->line_number += 1;
      context->char_number = 1;
    }

  return context->iter != context->current_text_end;
}

static void
skip_spaces (GMarkupParseContext *context)
{
  do
    {
      if (!g_unichar_isspace (g_utf8_get_char (context->iter)))
        return;
    }
  while (advance_char (context));
}

static void
advance_to_name_end (GMarkupParseContext *context)
{
  do
    {
      if (!is_name_char (g_utf8_get_char (context->iter)))
        return;
    }
  while (advance_char (context));
}

static void
add_to_partial (GMarkupParseContext *context,
                const gchar         *text_start,
                const gchar         *text_end)
{
  if (text_start == text_end)
    return;

  if (context->partial_chunk == NULL)
    context->partial_chunk = g_string_new ("");

  g_string_append_len (context->partial_chunk, text_start,
                       text_end - text_start);
}

static void
free_partial (GMarkupParseContext *context)
{
  if (context->partial_chunk != NULL)
    {
      g_string_free (context->partial_chunk, TRUE);
      context->partial_chunk = NULL;
    }
}

static const gchar*
current_element (GMarkupParseContext *context)
{
  return context->tag_stack->data;
}

static const gchar*
current_attribute (GMarkupParseContext *context)
{
  return ((GMarkupAttribute*)context->attributes->data)->name;
}

static void
find_current_text_end (GMarkupParseContext *context)
{
  /* This function must be safe (non-segfaulting) on invalid UTF8 */
  const gchar *end = context->current_text + context->current_text_len;
  const gchar *p;
  const gchar *next;

  g_assert (p != end);

  p = context->current_text;
  next = g_utf8_find_next_char (p, end);

  while (next)
    {
      p = next;
      next = g_utf8_find_next_char (p, end);
    }

  /* p is now the start of the last character or character portion. */
  g_assert (p != end);
  next = g_utf8_next_char (p); /* this only touches *p, nothing beyond */

  if (next == end)
    {
      /* whole character */
      context->current_text_end = end;
    }
  else
    {
      /* portion */
      context->leftover_char_portion = g_string_new_len (p, end - p);
      context->current_text_len -= (end - p);
      context->current_text_end = p;
    }
}

gboolean
g_markup_parse_context_parse (GMarkupParseContext *context,
                              const gchar         *text,
                              gint                 text_len,
                              GError             **error)
{
  const gchar *first_invalid;

  g_return_val_if_fail (context != NULL, FALSE);
  g_return_val_if_fail (text != NULL, FALSE);
  g_return_val_if_fail (context->state != STATE_ERROR, FALSE);
  g_return_val_if_fail (!context->parsing, FALSE);

  if (text_len < 0)
    text_len = strlen (text);

  if (text_len == 0)
    return TRUE;

  context->parsing = TRUE;
  
  if (context->leftover_char_portion)
    {
      const gchar *first_char;

      if ((*text & 0xc0) != 0x80)
        first_char = text;
      else
        first_char = g_utf8_find_next_char (text, text + text_len);

      if (first_char)
        {
          /* leftover_char_portion was completed. Parse it. */
          GString *portion = context->leftover_char_portion;
          
          g_string_append_len (context->leftover_char_portion,
                               text, first_char - text);

          /* hacks to allow recursion */
          context->parsing = FALSE;
          context->leftover_char_portion = NULL;
          
          if (!g_markup_parse_context_parse (context,
                                             portion->str, portion->len,
                                             error))
            {
              g_assert (context->state == STATE_ERROR);
            }
          
          g_string_free (portion, TRUE);
          context->parsing = TRUE;

          /* Skip the fraction of char that was in this text */
          text_len -= (first_char - text);
          text = first_char;
        }
      else
        {
          /* another little chunk of the leftover char; geez
           * someone is inefficient.
           */
          g_string_append_len (context->leftover_char_portion,
                               text, text_len);

          goto finished;
        }
    }

  context->current_text = text;
  context->current_text_len = text_len;
  context->iter = context->current_text;
  context->start = context->iter;

  /* Nothing left after finishing the leftover char, or nothing
   * passed in to begin with.
   */
  if (context->current_text_len == 0)
    goto finished;

  /* find_current_text_end () assumes the string starts at
   * a character start, so we need to validate at least
   * that much. It doesn't assume any following bytes
   * are valid.
   */
  if ((*context->current_text & 0xc0) == 0x80) /* not a char start */
    {
      set_error (context,
                 error,
                 G_MARKUP_ERROR_BAD_UTF8,
                 _("Invalid UTF-8 encoded text"));
      goto finished;
    }

  /* Initialize context->current_text_end, possibly adjusting
   * current_text_len, and add any leftover char portion
   */
  find_current_text_end (context);

  /* Validate UTF8 (must be done after we find the end, since
   * we could have a trailing incomplete char)
   */
  if (!g_utf8_validate (context->current_text,
                        context->current_text_len,
                        &first_invalid))
    {
      gint newlines = 0;
      const gchar *p;
      p = context->current_text;
      while (p != context->current_text_end)
        {
          if (*p == '\n')
            ++newlines;
          ++p;
        }

      context->line_number += newlines;

      set_error (context,
                 error,
                 G_MARKUP_ERROR_BAD_UTF8,
                 _("Invalid UTF-8 encoded text"));
      goto finished;
    }

  while (context->iter != context->current_text_end)
    {
      switch (context->state)
        {
        case STATE_START:
          /* Possible next state: AFTER_OPEN_ANGLE */

          g_assert (context->tag_stack == NULL);

          /* whitespace is ignored outside of any elements */
          skip_spaces (context);

          if (context->iter != context->current_text_end)
            {
              if (*context->iter == '<')
                {
                  /* Move after the open angle */
                  advance_char (context);

                  context->state = STATE_AFTER_OPEN_ANGLE;

                  /* this could start a passthrough */
                  context->start = context->iter;

                  /* document is now non-empty */
                  context->document_empty = FALSE;
                }
              else
                {
                  set_error (context,
                             error,
                             G_MARKUP_ERROR_PARSE,
                             _("Document must begin with an element (e.g. <book>)"));
                }
            }
          break;

        case STATE_AFTER_OPEN_ANGLE:
          /* Possible next states: INSIDE_OPEN_TAG_NAME,
           *  AFTER_CLOSE_TAG_SLASH, INSIDE_PASSTHROUGH
           */
          if (*context->iter == '?' ||
              *context->iter == '!')
            {
              /* include < in the passthrough */
              const gchar *openangle = "<";
              add_to_partial (context, openangle, openangle + 1);
              context->start = context->iter;
              context->state = STATE_INSIDE_PASSTHROUGH;
            }
          else if (*context->iter == '/')
            {
              /* move after it */
              advance_char (context);

              context->state = STATE_AFTER_CLOSE_TAG_SLASH;
            }
          else if (is_name_start_char (g_utf8_get_char (context->iter)))
            {
              context->state = STATE_INSIDE_OPEN_TAG_NAME;

              /* start of tag name */
              context->start = context->iter;
            }
          else
            {
              gchar buf[7];
              set_error (context,
                         error,
                         G_MARKUP_ERROR_PARSE,
                         _("'%s' is not a valid character following "
                           "a '<' character; it may not begin an "
                           "element name"),
                         utf8_str (context->iter, buf));
            }
          break;

          /* The AFTER_CLOSE_ANGLE state is actually sort of
           * broken, because it doesn't correspond to a range
           * of characters in the input stream as the others do,
           * and thus makes things harder to conceptualize
           */
        case STATE_AFTER_CLOSE_ANGLE:
          /* Possible next states: INSIDE_TEXT, STATE_START */
          if (context->tag_stack == NULL)
            {
              context->start = NULL;
              context->state = STATE_START;
            }
          else
            {
              context->start = context->iter;
              context->state = STATE_INSIDE_TEXT;
            }
          break;

        case STATE_AFTER_ELISION_SLASH:
          /* Possible next state: AFTER_CLOSE_ANGLE */

          {
            /* We need to pop the tag stack and call the end_element
             * function, since this is the close tag
             */
            GError *tmp_error = NULL;
          
            g_assert (context->tag_stack != NULL);

            tmp_error = NULL;
            if (context->parser->end_element)
              (* context->parser->end_element) (context,
                                                context->tag_stack->data,
                                                context->user_data,
                                                &tmp_error);
          
            g_free (context->tag_stack->data);
            context->tag_stack = g_slist_delete_link (context->tag_stack,
                                                      context->tag_stack);

            if (tmp_error)
              {
                mark_error (context, tmp_error);
                g_propagate_error (error, tmp_error);
              }          
            else
              {
                if (*context->iter == '>')
                  {
                    /* move after the close angle */
                    advance_char (context);
                    context->state = STATE_AFTER_CLOSE_ANGLE;
                  }
                else
                  {
                    gchar buf[7];
                    set_error (context,
                               error,
                               G_MARKUP_ERROR_PARSE,
                               _("Odd character '%s', expected a '>' character "
                                 "to end the start tag of element '%s'"),
                               utf8_str (context->iter, buf),
                               current_element (context));
                  }
              }
          }
          break;

        case STATE_INSIDE_OPEN_TAG_NAME:
          /* Possible next states: BETWEEN_ATTRIBUTES */

          /* if there's a partial chunk then it's the first part of the
           * tag name. If there's a context->start then it's the start
           * of the tag name in current_text, the partial chunk goes
           * before that start though.
           */
          advance_to_name_end (context);

          if (context->iter == context->current_text_end)
            {
              /* The name hasn't necessarily ended. Merge with
               * partial chunk, leave state unchanged.
               */
              add_to_partial (context, context->start, context->iter);
            }
          else
            {
              /* The name has ended. Combine it with the partial chunk
               * if any; push it on the stack; enter next state.
               */
              add_to_partial (context, context->start, context->iter);
              context->tag_stack =
                g_slist_prepend (context->tag_stack,
                                 g_string_free (context->partial_chunk,
                                                FALSE));

              context->partial_chunk = NULL;

              context->state = STATE_BETWEEN_ATTRIBUTES;
              context->start = NULL;
            }
          break;

        case STATE_INSIDE_ATTRIBUTE_NAME:
          /* Possible next states: AFTER_ATTRIBUTE_EQUALS_SIGN */

          /* read the full name, if we enter the equals sign state
           * then add the attribute to the list (without the value),
           * otherwise store a partial chunk to be prepended later.
           */
          advance_to_name_end (context);

          if (context->iter == context->current_text_end)
            {
              /* The name hasn't necessarily ended. Merge with
               * partial chunk, leave state unchanged.
               */
              add_to_partial (context, context->start, context->iter);
            }
          else
            {
              /* The name has ended. Combine it with the partial chunk
               * if any; push it on the stack; enter next state.
               */
              GMarkupAttribute *attr;
              add_to_partial (context, context->start, context->iter);

              attr = attribute_new (NULL, NULL);

              attr->name = g_string_free (context->partial_chunk,
                                          FALSE);

              context->partial_chunk = NULL;
              context->start = NULL;

              context->attributes =
                g_slist_prepend (context->attributes, attr);

              if (*context->iter == '=')
                {
                  advance_char (context);
                  context->state = STATE_AFTER_ATTRIBUTE_EQUALS_SIGN;
                }
              else
                {
                  gchar buf[7];
                  set_error (context,
                             error,
                             G_MARKUP_ERROR_PARSE,
                             _("Odd character '%s', expected a '=' after "
                               "attribute name '%s' of element '%s'"),
                             utf8_str (context->iter, buf),
                             attr->name,
                             current_element (context));

                }
            }
          break;

        case STATE_BETWEEN_ATTRIBUTES:
          /* Possible next states: AFTER_CLOSE_ANGLE,
           * AFTER_ELISION_SLASH, INSIDE_ATTRIBUTE_NAME
           */
          skip_spaces (context);

          if (context->iter != context->current_text_end)
            {
              if (*context->iter == '/')
                {
                  advance_char (context);
                  context->state = STATE_AFTER_ELISION_SLASH;
                }
              else if (*context->iter == '>')
                {

                  advance_char (context);
                  context->state = STATE_AFTER_CLOSE_ANGLE;
                }
              else if (is_name_start_char (g_utf8_get_char (context->iter)))
                {
                  context->state = STATE_INSIDE_ATTRIBUTE_NAME;
                  /* start of attribute name */
                  context->start = context->iter;
                }
              else
                {
                  gchar buf[7];
                  set_error (context,
                             error,
                             G_MARKUP_ERROR_PARSE,
                             _("Odd character '%s', expected a '>' or '/' "
                               "character to end the start tag of "
                               "element '%s', or optionally an attribute; "
                               "perhaps you used an invalid character in "
                               "an attribute name"),
                             utf8_str (context->iter, buf),
                             current_element (context));
                }

              /* If we're done with attributes, invoke
               * the start_element callback
               */
              if (context->state == STATE_AFTER_ELISION_SLASH ||
                  context->state == STATE_AFTER_CLOSE_ANGLE)
                {
                  const gchar *start_name;
                  gchar **attr_names = NULL;
                  gchar **attr_values = NULL;
                  GError *tmp_error;

                  /* Call user callback for element start */
                  start_name = current_element (context);

                  /* this gratuituously copies the attr names/values
                   * I guess
                   */
                  attribute_list_to_arrays (context->attributes,
                                            &attr_names,
                                            &attr_values,
                                            NULL);

                  tmp_error = NULL;
                  if (context->parser->start_element)
                    (* context->parser->start_element) (context,
                                                        start_name,
                                                        attr_names,
                                                        attr_values,
                                                        context->user_data,
                                                        &tmp_error);

                  /* Go ahead and free this. */
                  g_slist_foreach (context->attributes, (GFunc)attribute_free,
                                   NULL);
                  g_slist_free (context->attributes);
                  context->attributes = NULL;

                  if (tmp_error != NULL)
                    {
                      mark_error (context, tmp_error);
                      g_propagate_error (error, tmp_error);
                    }
                }
            }
          break;

        case STATE_AFTER_ATTRIBUTE_EQUALS_SIGN:
          /* Possible next state: INSIDE_ATTRIBUTE_VALUE */
          if (*context->iter == '"')
            {
              advance_char (context);
              context->state = STATE_INSIDE_ATTRIBUTE_VALUE;
              context->start = context->iter;
            }
          else
            {
              gchar buf[7];
              set_error (context,
                         error,
                         G_MARKUP_ERROR_PARSE,
                         _("Odd character '%s', expected an open quote mark "
                           "after the equals sign when giving value for "
                           "attribute '%s' of element '%s'"),
                         utf8_str (context->iter, buf),
                         current_attribute (context),
                         current_element (context));
            }
          break;

        case STATE_INSIDE_ATTRIBUTE_VALUE:
          /* Possible next states: BETWEEN_ATTRIBUTES */
          do
            {
              if (*context->iter == '"')
                break;
            }
          while (advance_char (context));

          if (context->iter == context->current_text_end)
            {
              /* The value hasn't necessarily ended. Merge with
               * partial chunk, leave state unchanged.
               */
              add_to_partial (context, context->start, context->iter);
            }
          else
            {
              /* The value has ended at the quote mark. Combine it
               * with the partial chunk if any; set it for the current
               * attribute.
               */
              GMarkupAttribute *attr;

              add_to_partial (context, context->start, context->iter);

              attr = context->attributes->data;

              if (unescape_text (context,
                                 context->partial_chunk->str,
                                 context->partial_chunk->str +
                                 context->partial_chunk->len,
                                 &attr->value,
                                 error))
                {
                  /* success, advance past quote and set state. */
                  advance_char (context);
                  context->state = STATE_BETWEEN_ATTRIBUTES;
                  context->start = NULL;
                }

              g_string_free (context->partial_chunk, TRUE);

              context->partial_chunk = NULL;
            }
          break;

        case STATE_INSIDE_TEXT:
          /* Possible next states: AFTER_OPEN_ANGLE */
          do
            {
              if (*context->iter == '<')
                break;
            }
          while (advance_char (context));

          /* The text hasn't necessarily ended. Merge with
           * partial chunk, leave state unchanged.
           */

          add_to_partial (context, context->start, context->iter);

          if (context->iter != context->current_text_end)
            {
              gchar *unescaped = NULL;

              /* The text has ended at the open angle. Call the text
               * callback.
               */
              
              if (unescape_text (context,
                                 context->partial_chunk->str,
                                 context->partial_chunk->str +
                                 context->partial_chunk->len,
                                 &unescaped,
                                 error))
                {
                  GError *tmp_error = NULL;

                  if (context->parser->text)
                    (*context->parser->text) (context,
                                              unescaped,
                                              strlen (unescaped),
                                              context->user_data,
                                              &tmp_error);
                  
                  g_free (unescaped);

                  if (tmp_error == NULL)
                    {
                      /* advance past open angle and set state. */
                      advance_char (context);
                      context->state = STATE_AFTER_OPEN_ANGLE;
                      /* could begin a passthrough */
                      context->start = context->iter;
                    }
                  else
                    {
                      mark_error (context, tmp_error);
                      g_propagate_error (error, tmp_error);
                    }
                }

              free_partial (context);
            }
          break;

        case STATE_AFTER_CLOSE_TAG_SLASH:
          /* Possible next state: INSIDE_CLOSE_TAG_NAME */
          if (is_name_start_char (g_utf8_get_char (context->iter)))
            {
              context->state = STATE_INSIDE_CLOSE_TAG_NAME;

              /* start of tag name */
              context->start = context->iter;
            }
          else
            {
              gchar buf[7];
              set_error (context,
                         error,
                         G_MARKUP_ERROR_PARSE,
                         _("'%s' is not a valid character following "
                           "the characters '</'; '%s' may not begin an "
                           "element name"),
                         utf8_str (context->iter, buf),
                         utf8_str (context->iter, buf));
            }
          break;

        case STATE_INSIDE_CLOSE_TAG_NAME:
          /* Possible next state: AFTER_CLOSE_ANGLE */
          advance_to_name_end (context);

          if (context->iter == context->current_text_end)
            {
              /* The name hasn't necessarily ended. Merge with
               * partial chunk, leave state unchanged.
               */
              add_to_partial (context, context->start, context->iter);
            }
          else
            {
              /* The name has ended. Combine it with the partial chunk
               * if any; check that it matches stack top and pop
               * stack; invoke proper callback; enter next state.
               */
              gchar *close_name;

              add_to_partial (context, context->start, context->iter);

              close_name = g_string_free (context->partial_chunk, FALSE);
              context->partial_chunk = NULL;
              
              if (context->tag_stack == NULL)
                {
                  set_error (context,
                             error,
                             G_MARKUP_ERROR_PARSE,
                             _("Element '%s' was closed, but no element "
                               "is currently open"),
                             close_name);
                }
              else if (strcmp (close_name, current_element (context)) != 0)
                {
                  set_error (context,
                             error,
                             G_MARKUP_ERROR_PARSE,
                             _("Element '%s' was closed, but the currently "
                               "open element is '%s'"),
                             close_name,
                             current_element (context));
                }
              else if (*context->iter != '>')
                {
                  gchar buf[7];
                  set_error (context,
                             error,
                             G_MARKUP_ERROR_PARSE,
                             _("'%s' is not a valid character following "
                               "the close element name '%s'; the allowed "
                               "character is '>'"),
                             utf8_str (context->iter, buf),
                             close_name);
                }
              else
                {
                  GError *tmp_error;
                  advance_char (context);
                  context->state = STATE_AFTER_CLOSE_ANGLE;
                  context->start = NULL;

                  /* call the end_element callback */
                  tmp_error = NULL;
                  if (context->parser->end_element)
                    (* context->parser->end_element) (context,
                                                      close_name,
                                                      context->user_data,
                                                      &tmp_error);

                  
                  /* Pop the tag stack */
                  g_free (context->tag_stack->data);
                  context->tag_stack = g_slist_delete_link (context->tag_stack,
                                                            context->tag_stack);
                  
                  if (tmp_error)
                    {
                      mark_error (context, tmp_error);
                      g_propagate_error (error, tmp_error);
                    }
                }

              g_free (close_name);
            }
          break;

        case STATE_INSIDE_PASSTHROUGH:
          /* Possible next state: AFTER_CLOSE_ANGLE */
          do
            {
              if (*context->iter == '>')
                break;
            }
          while (advance_char (context));

          if (context->iter == context->current_text_end)
            {
              /* The passthrough hasn't necessarily ended. Merge with
               * partial chunk, leave state unchanged.
               */
              add_to_partial (context, context->start, context->iter);
            }
          else
            {
              /* The passthrough has ended at the close angle. Combine
               * it with the partial chunk if any. Call the passthrough
               * callback. Note that the open/close angles are
               * included in the text of the passthrough.
               */
              GError *tmp_error = NULL;

              advance_char (context); /* advance past close angle */
              add_to_partial (context, context->start, context->iter);

              if (context->parser->passthrough)
                (*context->parser->passthrough) (context,
                                                 context->partial_chunk->str,
                                                 context->partial_chunk->len,
                                                 context->user_data,
                                                 &tmp_error);

              free_partial (context);

              if (tmp_error == NULL)
                {
                  context->state = STATE_AFTER_CLOSE_ANGLE;
                  context->start = context->iter; /* could begin text */
                }
              else
                {
                  mark_error (context, tmp_error);
                  g_propagate_error (error, tmp_error);
                }
            }
          break;

        case STATE_ERROR:
          goto finished;
          break;

        default:
          g_assert_not_reached ();
          break;
        }
    }

 finished:
  context->parsing = FALSE;

  return context->state != STATE_ERROR;
}

gboolean
g_markup_parse_context_end_parse (GMarkupParseContext *context,
                                  GError             **error)
{
  g_return_val_if_fail (context != NULL, FALSE);
  g_return_val_if_fail (!context->parsing, FALSE);
  g_return_val_if_fail (context->state != STATE_ERROR, FALSE);

  if (context->document_empty)
    {
      set_error (context, error, G_MARKUP_ERROR_EMPTY,
                 _("Document was empty or contained only whitespace"));
      return FALSE;
    }
  
  context->parsing = TRUE;
  
  switch (context->state)
    {
    case STATE_START:
      /* Nothing to do */
      break;

    case STATE_AFTER_OPEN_ANGLE:
      set_error (context, error, G_MARKUP_ERROR_PARSE,
                 _("Document ended unexpectedly just after an open angle bracket '<'"));
      break;

    case STATE_AFTER_CLOSE_ANGLE:
      if (context->tag_stack != NULL)
        {
          /* Error message the same as for INSIDE_TEXT */
          set_error (context, error, G_MARKUP_ERROR_PARSE,
                     _("Document ended unexpectedly with elements still open - "
                       "'%s' was the last element opened"),
                     current_element (context));
        }
      break;
      
    case STATE_AFTER_ELISION_SLASH:
      set_error (context, error, G_MARKUP_ERROR_PARSE,
                 _("Document ended unexpectedly, expected to see a close angle "
                   "bracket closing the tag <%s/>"), current_element (context));
      break;

    case STATE_INSIDE_OPEN_TAG_NAME:
      set_error (context, error, G_MARKUP_ERROR_PARSE,
                 _("Document ended unexpectedly inside an element name"));
      break;

    case STATE_INSIDE_ATTRIBUTE_NAME:
      set_error (context, error, G_MARKUP_ERROR_PARSE,
                 _("Document ended unexpectedly inside an attribute name"));
      break;

    case STATE_BETWEEN_ATTRIBUTES:
      set_error (context, error, G_MARKUP_ERROR_PARSE,
                 _("Document ended unexpectedly inside an element-opening "
                   "tag."));
      break;

    case STATE_AFTER_ATTRIBUTE_EQUALS_SIGN:
      set_error (context, error, G_MARKUP_ERROR_PARSE,
                 _("Document ended unexpectedly after the equals sign "
                   "following an attribute name; no attribute value"));
      break;

    case STATE_INSIDE_ATTRIBUTE_VALUE:
      set_error (context, error, G_MARKUP_ERROR_PARSE,
                 _("Document ended unexpectedly while inside an attribute "
                   "value"));
      break;

    case STATE_INSIDE_TEXT:
      g_assert (context->tag_stack != NULL);
      set_error (context, error, G_MARKUP_ERROR_PARSE,
                 _("Document ended unexpectedly with elements still open - "
                   "'%s' was the last element opened"),
                 current_element (context));
      break;

    case STATE_AFTER_CLOSE_TAG_SLASH:
    case STATE_INSIDE_CLOSE_TAG_NAME:
      set_error (context, error, G_MARKUP_ERROR_PARSE,
                 _("Document ended unexpectedly inside the close tag for"
                   "element '%s'"), current_element);
      break;

    case STATE_INSIDE_PASSTHROUGH:
      set_error (context, error, G_MARKUP_ERROR_PARSE,
                 _("Document ended unexpectedly inside a comment or "
                   "processing instruction"));
      break;

    case STATE_ERROR:
    default:
      g_assert_not_reached ();
      break;
    }

  context->parsing = FALSE;

  return context->state != STATE_ERROR;
}

static void
append_escaped_text (GString     *str,
                     const gchar *text,
                     gint         length)
{
  const gchar *p;
  const gchar *end;

  p = text;
  end = text + length;

  while (p != end)
    {
      const gchar *next;
      next = g_utf8_next_char (p);

      switch (*p)
        {
        case '&':
          g_string_append (str, "&amp;");
          break;

        case '<':
          g_string_append (str, "&lt;");
          break;

        case '>':
          g_string_append (str, "&gt;");
          break;

        case '\'':
          g_string_append (str, "&apos;");
          break;

        case '"':
          g_string_append (str, "&quot;");
          break;

        default:
          g_string_append_len (str, p, next - p);
          break;
        }

      p = next;
    }
}

gchar*
g_markup_escape_text (const gchar *text,
                      gint         length)
{
  GString *str;

  g_return_val_if_fail (text != NULL, NULL);

  if (length < 0)
    length = strlen (text);

  str = g_string_new ("");
  append_escaped_text (str, text, length);

  return g_string_free (str, FALSE);
}





[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]