simple markup format

From: Havoc Pennington <hp redhat com>
To: gtk-devel-list gnome org
Subject: simple markup format
Date: 23 Aug 2000 15:42:37 -0400
Hi,

A nice feature we've discussed adding to GtkLabel in the Labs is to
set it from a "rich text string":

  gtk_label_new_rich ("<b>Bold text <i>Italic bold</i></b>");

This cheesy markup format is also useful in other cases; I have some
ideas for the text widget, and it can be used for simple config files.

So rather than stick a parser in gtklabel.c I wrote a small generic
facility. This parses simple XML-like markup into a parse tree. 
The markup is intended to be something an XML parser could read, but
the parser in GLib/GTK+ won't be able to handle full XML. Applications
can move to an XML parser while still keeping the ability to read
their old config files, since an XML subset is used. Also, generic XML
tools can manipulate these markup files.

The subset supported is:
 - UTF8 only
 - no DTDs
 - comments and processing instructions are passed through on 
   load/save but otherwise ignored
 - no <![CDATA[ ... ]]> sections
 - no custom entities, only the built-in 5
 - probably some other minor details

Basically the following is a supported document using all the
features:
 <foobar>
 <!-- comment -->
 <e1>Hi &amp; this is some text inside an element</e1>
 <e2:foo> Text with some <nested>nested elements</nested> and entities &quot;&amp;  &lt; &gt;&gt; &apos; and whitespace    </e2:foo>
 <tag ab="fo&lt;o" bar="foo" baz="blah">This element has attributes</tag>
 </foobar>

My current implementation doesn't yet handle documents that have no
single root node. To support that, the parser will have to return a
list of nodes instead of a single node, I guess.

The implementation is dead simple, 1500 lines of code counting
comments and blank lines, about 10 API entry points.

Elements of the API that may need explaining:
 G_MARKUP_NODE_PASSTHROUGH is an opaque node used to pass comments
  and processing instructions through from load to save.
 G_MARKUP_PRESERVE_ALL_WHITESPACE keeps the parser from stripping
  text nodes containing only whitespace; these nodes are normally 
  stripped from below a parent node if every text node directly
  below that parent contains only whitespace
 G_MARKUP_NO_FORMATTING keeps the saver from adding whitespace and 
  indentation to parent nodes that have only non-text nodes below
  them

So the whitespace/formatting rule is basically that you can add
or strip whitespace here:

 <parent_node>
  <child> </child>
  <child> </child>
 </parent_node>

But not here, since non-whitespace text is immediately below
parent_node:

 <parent_node>
  This is some text
  <child> </child>
  <child> </child>
 </parent_node>

Typically this is the behavior you probably want. But you can turn it
off if you want, with the provided flags.

I guess that's it.

Havoc

/* gmarkup.h - Simple XML-like string parser/writer
 *
 *  Copyright 2000 Red Hat, Inc.
 *
 * GLib is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * GLib is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GLib; see the file COPYING.LIB.  If not,
 * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 *   Boston, MA 02111-1307, USA.
 */

#ifndef __GMARKUP_H__
#define __GMARKUP_H__

#ifdef __cplusplus
extern "C"
{
#endif

/* Discriminator stored in the leading "type" field of every node
 * struct, telling which concrete node struct is being looked at.
 */
typedef enum
{
  G_MARKUP_NODE_ELEMENT,      /* node is a GMarkupNodeElement */
  G_MARKUP_NODE_TEXT,         /* node is a GMarkupNodeText */
  G_MARKUP_NODE_PASSTHROUGH   /* opaque node private to gmarkup.c */
} GMarkupNodeType;

/* Bit flags accepted by the parser (g_markup_node_from_string). */
typedef enum
{
  /* Keep text nodes even when every text node under a parent is
   * whitespace-only; without this flag the parser drops them.
   */
  G_MARKUP_PRESERVE_ALL_WHITESPACE = 1 << 0
  
} GMarkupParseFlags;

/* Bit flags accepted by the serializer (g_markup_node_to_string). */
typedef enum
{
  /* Per the accompanying description: do not add whitespace or
   * indentation when writing parents that have only non-text children.
   */
  G_MARKUP_NO_FORMATTING = 1 << 0

} GMarkupToStringFlags;

/* Forward declarations of the public node types.  GMarkupNode is a
 * union over the concrete node structs declared below.
 */
typedef union _GMarkupNode GMarkupNode;
typedef struct _GMarkupNodeText GMarkupNodeText;
typedef struct _GMarkupNodeElement GMarkupNodeElement;

/* A run of character data. */
struct _GMarkupNodeText
{
  /* G_MARKUP_NODE_TEXT for nodes made by g_markup_node_new_text() */
  GMarkupNodeType type;
  
  /* Heap-allocated string owned by the node; released by
   * g_markup_node_free().
   */
  gchar *text;
};

/* An element: a name, child nodes and attributes. */
struct _GMarkupNodeElement
{
  /* G_MARKUP_NODE_ELEMENT for nodes made by g_markup_node_new_element() */
  GMarkupNodeType type;

  /* Element name; heap-allocated and owned by the node. */
  gchar *name;
  
  /* List of GMarkupNode* children; g_markup_node_free() frees each one. */
  GList *children;

  /* List members are an opaque datatype (private to gmarkup.c), so
   * ignore this and use the g_markup_node_*_attribute() functions.
   */
  GList *attributes;
};

/* Generic node handle.  Every member begins with a GMarkupNodeType
 * field, so "type" can be read first to decide which member is valid.
 */
union _GMarkupNode
{
  GMarkupNodeType type;

  GMarkupNodeText text;
  GMarkupNodeElement element;  
};

/* Error codes reported in the G_MARKUP_ERROR domain. */
typedef enum
{
  G_MARKUP_ERROR_BAD_UTF8,  /* presumably: input is not valid UTF-8 -- TODO confirm */
  G_MARKUP_ERROR_EMPTY,     /* presumably: input contained nothing to parse -- TODO confirm */
  G_MARKUP_ERROR_PARSE      /* malformed markup (the code the parser helpers report) */
} GMarkupErrorType;

/* Error domain used for every GError produced by the markup code. */
#define G_MARKUP_ERROR g_markup_error_quark ()

/* Returns the quark that identifies the G_MARKUP_ERROR domain.
 * Declared with (void) so that this is a real prototype; in C an empty
 * parameter list leaves the arguments unchecked.
 */
GQuark g_markup_error_quark (void);

/* Create a text node holding a copy of text (must not be NULL). */
GMarkupNodeText* g_markup_node_new_text (const gchar *text);
/* Create an element node with a copy of name (must not be NULL) and
 * no children or attributes.
 */
GMarkupNodeElement* g_markup_node_new_element (const gchar *name);

/* Free node together with its strings and, for elements, all of its
 * attributes and child nodes.
 */
void         g_markup_node_free (GMarkupNode *node);

/* Set attribute_name on node to a copy of attribute_value, replacing
 * any existing value.  A NULL attribute_value removes the attribute.
 */
void g_markup_node_set_attribute (GMarkupNodeElement *node,
                                  const gchar *attribute_name,
                                  const gchar *attribute_value);

/* Return a newly allocated copy of the attribute's value, or NULL if
 * the attribute is not set.  The caller frees the result.
 */
gchar* g_markup_node_get_attribute (GMarkupNodeElement *node,
                                    const gchar *attribute_name);

/* Remove attribute_name from node; nothing happens if it is not set. */
void g_markup_node_unset_attribute (GMarkupNodeElement *node,
                                    const gchar *attribute_name);

/* Get array of attribute names/values, otherwise you couldn't get
 * a list of them.  Each non-NULL output pointer receives a newly
 * allocated, NULL-terminated array of newly allocated strings; any of
 * the three output pointers may be NULL.
 */
void g_markup_node_get_attributes (GMarkupNodeElement *node,
                                   gchar ***names,
                                   gchar ***values,
                                   gint    *n_attributes);

/* Parse markup text into a node tree.
 * NOTE(review): the definition is not visible here; presumably length
 * is the size of text in bytes and a failure is reported through
 * error -- confirm against the implementation.
 */
GMarkupNode *g_markup_node_from_string (const gchar *text,
                                        gint length,
                                        GMarkupParseFlags flags,
                                        GError **error);

/* Serialize node (and whatever is below it) back to markup text.
 * NOTE(review): presumably returns a newly allocated string that the
 * caller frees -- confirm against the implementation.
 */
gchar *g_markup_node_to_string (GMarkupNode *node,
                                GMarkupToStringFlags flags);


#ifdef __cplusplus
}
#endif

#endif /* __GMARKUP_H__ */

/* gmarkup.c - Simple XML-like string parser/writer
 *
 *  Copyright 2000 Red Hat, Inc.
 *
 * GLib is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * GLib is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GLib; see the file COPYING.LIB.  If not,
 * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 *   Boston, MA 02111-1307, USA.
 */

#include "glib.h"

#include <string.h>
#include <stdio.h>

/* FIXME: placeholder for the message-translation macro; for now it
 * expands to its argument unchanged, so messages are not translated.
 */
#define _(x) x

/* Types private to this file: the attribute record stored in an
 * element's attribute list, and the opaque passthrough node.
 */
typedef struct _GMarkupAttribute GMarkupAttribute;
typedef struct _GMarkupNodePassthrough GMarkupNodePassthrough;

/* One name="value" pair attached to an element. */
struct _GMarkupAttribute
{
  gchar *name;   /* attribute name */
  gchar *value;  /* attribute value (already unescaped when it comes from the parser) */
};


/* Node of type G_MARKUP_NODE_PASSTHROUGH.  It starts with the same
 * "type" field as the public node structs, so it can be handled
 * through a GMarkupNode pointer.
 */
struct _GMarkupNodePassthrough
{
  GMarkupNodeType type;
  
  /* Heap-allocated text owned by the node; freed by g_markup_node_free(). */
  gchar *passthrough_text;
};

/* Forward declarations for helpers that are used before they are
 * defined in this file.
 */

/* Attribute record constructor and destructor. */
static GMarkupAttribute *attribute_new (const gchar *name, const gchar *value);
static void attribute_free (GMarkupAttribute *attr);
/* Serializer helper that appends node to str.
 * NOTE(review): depth presumably is the nesting level used for
 * indentation -- confirm against the definition.
 */
static void append_node (GString *str,
                         GMarkupNode *node,
                         int depth,
                         GMarkupToStringFlags flags);

/* Parse one element whose '<' is at byte offset i of text; *new_i is
 * used to hand an offset back to the caller.  Declared here because
 * parse_child_list() and parse_element() call each other.
 */
static GMarkupNode* parse_element (const gchar *text,
                                   gint i,
                                   gint length,
                                   GMarkupParseFlags flags,
                                   gint *new_i,
                                   GError **error);

/* Return the quark identifying the G_MARKUP_ERROR error domain.
 *
 * The quark is looked up on the first call and cached in a function
 * static.  The parameter list is spelled (void) so the definition is
 * a proper prototype rather than an old-style unchecked declaration.
 */
GQuark
g_markup_error_quark (void)
{
  static GQuark error_quark = 0;

  if (error_quark == 0)
    error_quark = g_quark_from_static_string ("g-markup-error-quark");

  return error_quark;
}

/* Allocate a passthrough node that owns a private copy of text.
 * Returns NULL (after the usual precondition warning) if text is NULL.
 */
static GMarkupNodePassthrough*
g_markup_node_new_passthrough (const gchar *text)
{
  GMarkupNodePassthrough *passthrough;

  g_return_val_if_fail (text != NULL, NULL);

  passthrough = g_new (GMarkupNodePassthrough, 1);
  passthrough->passthrough_text = g_strdup (text);
  passthrough->type = G_MARKUP_NODE_PASSTHROUGH;

  return passthrough;
}

/* Allocate a text node that owns a private copy of text.
 * Returns NULL (after the usual precondition warning) if text is NULL.
 */
GMarkupNodeText*
g_markup_node_new_text (const gchar *text)
{
  GMarkupNodeText *text_node;

  g_return_val_if_fail (text != NULL, NULL);

  text_node = g_new (GMarkupNodeText, 1);
  text_node->text = g_strdup (text);
  text_node->type = G_MARKUP_NODE_TEXT;

  return text_node;
}

/* Allocate an element node named name (copied), with empty child and
 * attribute lists.  Returns NULL (after the usual precondition
 * warning) if name is NULL.
 */
GMarkupNodeElement*
g_markup_node_new_element (const gchar *name)
{
  GMarkupNodeElement *element;

  g_return_val_if_fail (name != NULL, NULL);

  element = g_new (GMarkupNodeElement, 1);

  /* Both lists start out empty. */
  element->attributes = NULL;
  element->children = NULL;

  element->name = g_strdup (name);
  element->type = G_MARKUP_NODE_ELEMENT;

  return element;
}

/* Release every GMarkupAttribute stored in list, then the list cells. */
static void
free_attribute_list (GList *list)
{
  GList *link;

  for (link = list; link != NULL; link = link->next)
    attribute_free ((GMarkupAttribute*) link->data);

  g_list_free (list);
}

/* Release every GMarkupNode stored in list (recursively, through
 * g_markup_node_free), then the list cells.
 */
static void
free_node_list (GList *list)
{
  GList *link;

  for (link = list; link != NULL; link = link->next)
    g_markup_node_free ((GMarkupNode*) link->data);

  g_list_free (list);
}

/* Destroy node and everything it owns.
 *
 * Text and passthrough nodes own a single string; element nodes own
 * their name, their attribute records and all child nodes, which are
 * freed recursively.  An unknown node type trips an assertion.
 */
void
g_markup_node_free (GMarkupNode *node)
{
  g_return_if_fail (node != NULL);

  if (node->type == G_MARKUP_NODE_TEXT)
    {
      g_free (node->text.text);
    }
  else if (node->type == G_MARKUP_NODE_ELEMENT)
    {
      g_free (node->element.name);
      free_attribute_list (node->element.attributes);
      free_node_list (node->element.children);
    }
  else if (node->type == G_MARKUP_NODE_PASSTHROUGH)
    {
      /* Passthrough nodes are private to this file, hence the cast. */
      GMarkupNodePassthrough *passthrough = (GMarkupNodePassthrough*) node;

      g_free (passthrough->passthrough_text);
    }
  else
    {
      g_assert_not_reached ();
    }

  g_free (node);
}

/* Set, replace or remove an attribute on an element node.
 *
 * node:            element to modify
 * attribute_name:  name of the attribute (must not be NULL)
 * attribute_value: new value, copied; NULL removes the attribute
 *
 * If the attribute already exists its value is replaced (or the
 * attribute is deleted when attribute_value is NULL).  If it does not
 * exist it is added, unless attribute_value is NULL, in which case
 * nothing happens.
 */
void
g_markup_node_set_attribute (GMarkupNodeElement *node,
                             const gchar *attribute_name,
                             const gchar *attribute_value)
{
  GList *tmp_list;

  g_return_if_fail (node != NULL);
  g_return_if_fail (node->type == G_MARKUP_NODE_ELEMENT);
  g_return_if_fail (attribute_name != NULL);
  /* value is NULL to unset */
  
  /* Search the attribute list.  (This used to walk node->children,
   * which holds child nodes, not GMarkupAttribute records.)
   */
  tmp_list = node->attributes;
  while (tmp_list)
    {
      GMarkupAttribute *attr = tmp_list->data;

      if (strcmp (attr->name, attribute_name) == 0)
        {
          if (attribute_value)
            {
              g_free (attr->value);
              attr->value = g_strdup (attribute_value);
            }
          else
            {
              /* Unlink the list cell first, then free the record it held. */
              node->attributes = g_list_delete_link (node->attributes,
                                                     tmp_list);

              attribute_free (attr);
            }

          return;
        }
      
      tmp_list = g_list_next (tmp_list);
    }

  /* Not found, add it if we have a value */
  if (attribute_value)
    {
      GMarkupAttribute *attr;

      attr = attribute_new (attribute_name, attribute_value);
      
      node->attributes = g_list_prepend (node->attributes, attr);
    }
}

/* Look up an attribute on an element node.
 *
 * node:           element to query
 * attribute_name: name of the attribute (must not be NULL)
 *
 * Returns a newly allocated copy of the attribute's value, which the
 * caller must g_free(), or NULL if the element has no such attribute.
 */
gchar*
g_markup_node_get_attribute (GMarkupNodeElement *node,
                             const gchar *attribute_name)
{
  GList *tmp_list;

  g_return_val_if_fail (node != NULL, NULL);
  g_return_val_if_fail (node->type == G_MARKUP_NODE_ELEMENT, NULL);
  g_return_val_if_fail (attribute_name != NULL, NULL);
  
  /* Search the attribute list.  (This used to walk node->children,
   * which holds child nodes, not GMarkupAttribute records.)
   */
  tmp_list = node->attributes;
  while (tmp_list)
    {
      GMarkupAttribute *attr = tmp_list->data;

      if (strcmp (attr->name, attribute_name) == 0)
        return g_strdup (attr->value);
      
      tmp_list = g_list_next (tmp_list);
    }

  return NULL;
}

/* Remove attribute_name from node.  This simply forwards to the
 * setter with a NULL value, which the setter treats as "delete".
 */
void
g_markup_node_unset_attribute (GMarkupNodeElement *node,
                               const gchar *attribute_name)
{
  const gchar *no_value = NULL;

  g_markup_node_set_attribute (node, attribute_name, no_value);
}

/* Export all attributes of an element as parallel arrays.
 *
 * node:         element to query
 * namesp:       if non-NULL, receives a newly allocated, NULL-terminated
 *               array of newly allocated attribute names
 * valuesp:      if non-NULL, receives a newly allocated, NULL-terminated
 *               array of newly allocated attribute values, in the same
 *               order as the names
 * n_attributes: if non-NULL, receives the number of attributes
 *
 * The caller owns the returned arrays and the strings in them.
 */
void
g_markup_node_get_attributes (GMarkupNodeElement *node,
                              gchar ***namesp,
                              gchar ***valuesp,
                              gint    *n_attributes)
{
  GList *tmp_list;
  gint len;
  gchar **names;
  gchar **values;
  gint i;
  
  g_return_if_fail (node != NULL);
  g_return_if_fail (node->type == G_MARKUP_NODE_ELEMENT);
  
  len = g_list_length (node->attributes);

  /* One extra slot in each array holds the terminating NULL. */
  if (namesp)
    {
      names = g_new (gchar*, len + 1);
      names[len] = NULL;
    }
  else
    names = NULL;
  
  if (valuesp)
    {
      values = g_new (gchar*, len + 1);
      values[len] = NULL;
    }
  else
    values = NULL;
  
  i = 0;
  tmp_list = node->attributes;
  while (tmp_list)
    {
      GMarkupAttribute *attr = tmp_list->data;

      g_assert (i < len);
      
      if (namesp)
        names[i] = g_strdup (attr->name);

      if (valuesp)
        values[i] = g_strdup (attr->value);
      
      /* Advance to the next output slot.  Without this every attribute
       * was written to slot 0 and the remaining slots were left
       * uninitialized.
       */
      ++i;
      tmp_list = g_list_next (tmp_list);
    }

  if (n_attributes)
    *n_attributes = len;

  if (namesp)
    *namesp = names;

  if (valuesp)
    *valuesp = values;
}


/* Parsing a string */

/* Debug tracing.  T(desc, byte) prints a byte offset, a description
 * and the name of the enclosing function to stdout.  The "#if 1" keeps
 * the tracing compiled in; with "#if 0" T() expands to nothing.
 */
#if 1
#include <stdio.h>
#define T(desc, byte) printf("%8d %35s   (%s)\n", byte, desc, __FUNCTION__)
#else
#define T(desc, byte)
#endif

/* Given the byte offset i of a UTF-8 character in text, return the
 * byte offset of the character that follows it.
 */
static inline gint
next_char (const gchar *text, gint i)
{
  const gchar *here = text + i;

  return i + (g_utf8_next_char (here) - here);
}

/* Skip over whitespace starting at byte offset i.
 *
 * text:   UTF-8 text being parsed
 * i:      byte offset to start from
 * length: number of valid bytes in text
 *
 * Returns the offset of the first character at or after i that is not
 * whitespace according to g_unichar_isspace(), or an offset >= length
 * if only whitespace remains.  The bound is tested before every read,
 * so text[i] is never examined when i is already at or past length
 * (the previous version read the first character unconditionally).
 */
static gint
skip_spaces (const gchar *text,
             gint i,
             gint length)
{
  while (i < length &&
         g_unichar_isspace (g_utf8_get_char (&text[i])))
    i = next_char (text, i);

  return i;
}

/* Return a newly allocated copy of up to 30 bytes of text that end
 * just before byte offset i.
 *
 * Only the bytes in [before, i) are copied.  The previous version
 * always asked for 30 bytes from the start of the window, so for
 * i < 30 it also returned text at and after position i.
 *
 * Note the window is measured in bytes, so it may begin in the middle
 * of a multi-byte UTF-8 character.
 */
static gchar*
text_before (const gchar *text,
             gint i)
{
  gint before = i - 30;

  if (before < 0)
    before = 0;

  return g_strndup (&text[before], i - before);
}

/* Build a G_MARKUP_ERROR GError describing a problem at byte offset i.
 *
 * text:   the whole text being parsed
 * i:      byte offset at which the problem was detected
 * length: number of valid bytes in text
 * error:  where to store the new GError; if NULL nothing is created
 * code:   error code to store in the GError
 * format: printf-style format for the specific message, followed by
 *         its arguments
 *
 * The final message contains the line number, the 1-based byte
 * position within that line, the formatted message, and the part of
 * the offending line that lies within 40 bytes of i.
 *
 * Two counting defects of the earlier version are fixed here:
 *  - a "\r\n" pair is counted as one line break, not two;
 *  - the position within the line is 1-based on every line (it used
 *    to be 0-based on the first line and 1-based on later lines).
 */
static void
set_error (const gchar *text,
           gint i,
           gint length,
           GError **error,
           GMarkupErrorType code,
           const gchar   *format,
           ...)
{
  T("error", i);
  
  if (error)
    {
      gchar *s;
      gchar *surrounding;
      gchar *sub;
      gint lines;
      gint char_on_line;
      gint line_start;
      gint j;
      gint point;
      gint start, end;
      
      va_list args;
      
      va_start (args, format);
      s = g_strdup_vprintf (format, args);
      va_end (args);

      /* count lines up to i, remembering where the current line begins */
      lines = 1;
      j = 0;
      line_start = 0;
      while (j < i)
        {
          gunichar c = g_utf8_get_char (&text[j]);
          gint next = next_char (text, j);

          /* A '\r' directly followed by '\n' is not a break by itself;
           * the '\n' that follows will be counted instead.
           */
          if (c == '\n' ||
              (c == '\r' && !(next < length && text[next] == '\n')))
            {
              ++lines;
              line_start = next;
            }
          
          j = next;
        }

      char_on_line = i - line_start + 1;
      if (char_on_line < 1)
        char_on_line = 1;
      
      /* Take up to 40 bytes on either side of the error position. */
      start = i - 40;
      if (start < 0)
        start = 0;
      end = i + 40;
      if (end > length)
        end = length;

      surrounding = g_strndup (&text[start], end - start);
      /* only display stuff on the same line: cut the excerpt at every
       * newline, and start it after the last newline before the error
       */
      point = i - start;
      sub = surrounding;
      j = 0;
      while (surrounding[j] != '\0')
        {
          if (surrounding[j] == '\n')
            {
              if (j < point)
                sub = &surrounding[j+1];

              surrounding[j] = '\0';
            }
          
          ++j;
        }
      
      *error = g_error_new (G_MARKUP_ERROR,
                            code,
                            _("Error on line %d char %d: %s\n(Some surrounding text was '%s')\n"),
                            lines, char_on_line, s, sub);

      g_free (surrounding);
      g_free (s);
    }
}           

/* TRUE if c may begin an element or attribute name: an alphabetic
 * character, '_' or ':'.
 */
static gboolean
is_name_start_char (gunichar c)
{
  return (g_unichar_isalpha (c) || c == '_' || c == ':') ? TRUE : FALSE;
}

/* TRUE if c may appear after the first character of a name: an
 * alphanumeric character or one of '.', '-', '_' and ':'.
 */
static gboolean
is_name_char (gunichar c)
{
  if (g_unichar_isalnum (c))
    return TRUE;

  switch (c)
    {
    case '.':
    case '-':
    case '_':
    case ':':
      return TRUE;

    default:
      return FALSE;
    }
}

/* Encode c as a NUL-terminated UTF-8 string for use in messages.
 *
 * The result lives in a single static buffer: it is overwritten by
 * the next call, and the function must not be used from several
 * threads at once (hence the name).
 */
static const gchar*
unthreadsafe_char_str (gunichar c)
{
  static gchar utf8_buf[7];

  /* Zero the whole buffer first so the encoded bytes end up terminated. */
  memset (utf8_buf, 0, sizeof utf8_buf);
  g_unichar_to_utf8 (c, utf8_buf);

  return utf8_buf;
}

/* Find the end of the name that starts at byte offset name_start.
 *
 * The first character is assumed to have been validated by the caller
 * and is skipped unchecked; after that the scan continues while
 * is_name_char() holds and the offset stays below length.
 *
 * Returns the offset just past the last name character.  flags and
 * error are accepted for symmetry with the other parse helpers but are
 * not used here.
 */
static gint
find_name_end (const gchar *text,
               gint name_start,               
               gint length,
               GMarkupParseFlags flags,
               GError **error)
{
  gint pos;

  T("name start", name_start);

  /* start of name assumed to be already validated */
  pos = next_char (text, name_start);

  while (pos < length && is_name_char (g_utf8_get_char (&text[pos])))
    pos = next_char (text, pos);

  T("name end", pos);

  return pos;
}

/* Convert the raw markup bytes text[i .. stop) into plain text.
 *
 * text:              the whole text being parsed
 * i:                 byte offset of the first character to convert
 * length:            number of valid bytes in text; only passed on to
 *                    set_error() for error reporting
 * stop:              byte offset at which conversion ends (exclusive)
 * has_nonwhitespace: set to TRUE if any character for which
 *                    g_unichar_isspace() is false is seen (the '&' of
 *                    an entity counts), otherwise FALSE; must not be NULL
 * error:             receives a G_MARKUP_ERROR_PARSE error on failure
 *
 * The five entities &amp; &quot; &lt; &gt; &apos; are replaced by the
 * characters they name.  A literal '<' or '>' is an error, as is an
 * unknown, over-long or unterminated entity.
 *
 * Returns a newly allocated string, or NULL after set_error() has
 * been called.
 */
static gchar*
unescape_text (const gchar *text,
               gint i,
               gint length,
               gint stop,
               gboolean *has_nonwhitespace,
               GError **error)
{
  GString *str;
  gchar *ret;

  T("unescaping text start", i);
  
  *has_nonwhitespace = FALSE;
  
  str = g_string_new ("");
  
  while (i < stop)
    {
      gunichar c = g_utf8_get_char (&text[i]);

      /* Latch the flag on the first non-whitespace character. */
      if (!*has_nonwhitespace &&
          !g_unichar_isspace (c))
        *has_nonwhitespace = TRUE;
      
      switch (c)
        {
        case '&':
          {
            /* parse entity: &amp; &quot; &lt; &gt; &apos;
             * note all names shorter than 5 chars
             */
            gint ent_start = i + 1;
            gint semicolon = -1;
            gint ent_char = 0;
            gunichar ent_name[5];
            gboolean bad_entity;

            T("entity name start", ent_start);
            
            i = ent_start;
          
            /* Collect at most 5 characters; the terminating ';' is
             * stored too but not counted in ent_char.
             */
            while (i < stop && ent_char < 5)
              {
                c = g_utf8_get_char (&text[i]);
                ent_name[ent_char] = c;
                
                if (c == ';')
                  {
                    T("semicolon at end of entity", i);
                    semicolon = i;
                    break;
                  }                
                else
                  {
                    ++ent_char;
                    i = next_char (text, i);
                  }
              }

            /* No ';' found before stop or within 5 characters. */
            if (semicolon < 0)
              {
                set_error (text, i, length, 
                           error,
                           G_MARKUP_ERROR_PARSE,
                           _("Text ended in the middle of an entity, or entity name too long to be valid ('&' should begin an entity such as '&quot;')"));

                g_string_free (str, TRUE);
              
                return NULL;          
              }

            bad_entity = FALSE;
          
          /* switch on length of entity name */
            switch (ent_char)
              {
              case 2:
                if (ent_name[0] == 'l' && ent_name[1] == 't')
                  g_string_append_c (str, '<');
                else if (ent_name[0] == 'g' && ent_name[1] == 't')
                  g_string_append_c (str, '>');
                else
                  bad_entity = TRUE;
                break;

              case 3:
                if (ent_name[0] == 'a' && ent_name[1] == 'm' &&
                    ent_name[2] == 'p')
                  g_string_append_c (str, '&');
                else
                  bad_entity = TRUE;
                break;

              case 4:
                if (ent_name[0] == 'q' && ent_name[1] == 'u' &&
                    ent_name[2] == 'o' && ent_name[3] == 't')
                  g_string_append_c (str, '"');
                else if (ent_name[0] == 'a' && ent_name[1] == 'p' &&
                         ent_name[2] == 'o' && ent_name[3] == 's')
                  g_string_append_c (str, '\'');
                else
                  bad_entity = TRUE;
                break;

              default:
                bad_entity = TRUE;
                break;
              }
          
            if (bad_entity)
              {
                /* i is at the ';', so this is the name between '&' and ';' */
                gchar *ent_str = g_strndup (&text[ent_start], i - ent_start);
            
                set_error (text, ent_start, length, 
                           error,
                           G_MARKUP_ERROR_PARSE,
                           _("Unknown entity '%s' ('&' must begin entities &amp; &quot; &lt; &gt; &apos;)"),
                           ent_str);
            
                g_free (ent_str);
            
                g_string_free (str, TRUE);
            
                return NULL;
              }

            T("semicolon after entity", i);
            
            /* i now points at the semicolon, and we'll skip past it */
          }
          break;

        case '<':
        case '>':
          set_error (text, i, length, 
                     error,
                     G_MARKUP_ERROR_PARSE,
                     _("'<' or '>' character not allowed here; only allowed around tags, for example <bold> or <paragraph>. Elsewhere, encode these characters as the entities &lt; and &gt;"));
          
          g_string_free (str, TRUE);
              
          return NULL;                    
          break;
          
        default:
          /* Ordinary character: copy it through as UTF-8. */
          g_string_append (str, unthreadsafe_char_str (c));
          break;
        }
      
      i = next_char (text, i);
    }

  /* Keep the character data and free only the GString wrapper. */
  ret = str->str;
  g_string_free (str, FALSE);

  T("unescaping text stop", stop);
  
  return ret;
}

/* Parse one attribute of the form name="value" starting at offset i.
 *
 * text:   the whole text being parsed
 * i:      byte offset where the attribute name is expected to start
 * length: number of valid bytes in text
 * flags:  parse flags, passed on to find_name_end()
 * new_i:  on success receives the offset just past the closing quote;
 *         on failure it is left at the initial value of i
 * error:  receives a G_MARKUP_ERROR_PARSE error on failure
 *
 * The name must be followed immediately by '=' and then '"'.  The
 * value runs to the next '"', may not contain '<' or '>', and is
 * unescaped with unescape_text().
 *
 * Returns a new GMarkupAttribute, or NULL on error.
 *
 * Compared with the earlier version, text[i] is no longer read without
 * first checking i against length (at entry and just after the '='),
 * and an unreachable "text ends after attribute value" check has been
 * dropped: the closing quote is only ever found at an offset < length.
 */
static GMarkupAttribute*
parse_attribute (const gchar *text,
                 gint i,
                 gint length,
                 GMarkupParseFlags flags,
                 gint *new_i,
                 GError **error)
{
  GMarkupAttribute *attr;
  gunichar c;
  gint name_start;
  gint name_end;
  gint value_start;
  gint value_end;
  gchar *value;
  GError *err;
  gboolean has_nonwhitespace;

  T("attribute name start", i);
  
  *new_i = i;
  
  name_start = i;

  if (i >= length)
    {
      set_error (text, i, length,
                 error,
                 G_MARKUP_ERROR_PARSE,
                 _("Document ended where an attribute name was expected"));
      return NULL;
    }

  c = g_utf8_get_char (&text[i]);

  if (!is_name_start_char (c))
    {
      set_error (text,
                 i, length,
                 error, 
                 G_MARKUP_ERROR_PARSE,
                 _("Character '%s' is not valid at the start of an attribute name"),
                 unthreadsafe_char_str (c));
      return NULL;
    }

  err = NULL;
  name_end = find_name_end (text, name_start, length, flags, &err);

  if (err)
    {
      if (error)
        *error = err;
      else
        g_error_free (err);
      
      return NULL;
    }

  T("attribute name end", name_end);
  
  i = name_end;
  
  if (name_end >= length)
    {
      set_error (text, i, length, 
                 error,
                 G_MARKUP_ERROR_PARSE,
                 _("Document ended just after attribute name"));
      return NULL;
    }
  
  c = g_utf8_get_char (&text[i]);

  if (c != '=')
    {
      set_error (text, i, length,
                 error,
                 G_MARKUP_ERROR_PARSE,
                 _("Attribute name must be immediately followed by an '=' character"));
      return NULL;
    }

  T("equals sign", i);
  
  i = next_char (text, i);

  /* The '=' may have been the last character of the text. */
  if (i >= length)
    {
      set_error (text, i, length,
                 error,
                 G_MARKUP_ERROR_PARSE,
                 _("Document ended just after the '=' character following an attribute name"));
      return NULL;
    }
  
  c = g_utf8_get_char (&text[i]);

  if (c != '"')
    {
      set_error (text, i, length,
                 error,
                 G_MARKUP_ERROR_PARSE,
                 _("'=' character after attribute must be immediately followed by an '\"' character"));
      return NULL;
    }

  T("open quote", i);
  
  /* Scan for the closing quote; value_end stays -1 if none is found. */
  i = next_char (text, i);
  value_start = i;
  value_end = -1;
  while (i < length)
    {
      c = g_utf8_get_char (&text[i]);

      switch (c)
        {
        case '"':
          value_end = i;
          goto out;
          break;

        case '<':
        case '>':
          {
            set_error (text, i, length,
                       error,
                       G_MARKUP_ERROR_PARSE,
                       _("Character '%c' found inside an attribute value; perhaps your attribute value is missing the closing quotation mark '\"'"),
                       (char)c);
            return NULL;
          }
          break;
          
        default:
          break;
        }

      i = next_char (text, i);
    }

 out:
  
  if (value_end < 0)
    {
      set_error (text, value_start, length,
                 error,
                 G_MARKUP_ERROR_PARSE,
                 _("Ran out of text before a quote mark ('\"') was seen at the end of an attribute value"));

      return NULL;
    }

  /* The quote was found inside the scan loop, so it lies below length. */
  g_assert (value_end >= value_start);
  g_assert (i == value_end);
  g_assert (value_end < length);

  T("close quote", value_end);
  
  err = NULL;
  value = unescape_text (text, value_start, length, value_end,
                         &has_nonwhitespace, &err);

  if (err)
    {
      if (error)
        *error = err;
      else
        g_error_free (err);
      
      return NULL;
    }

  /* NOTE(review): relies on attribute_new() accepting NULL for both
   * arguments; its definition is not visible here -- confirm.
   */
  attr = attribute_new (NULL, NULL);

  attr->name = g_strndup (&text[name_start], name_end - name_start);
  attr->value = value;
  
  /* Step past the closing quote. */
  g_assert (i < length);
  i = next_char (text, i);
  
  *new_i = i;

  T("char after quote", i);

#if 0
  printf ("attribute name: %s\n", attr->name);
  printf ("attribute valu: %s\n", attr->value);
#endif
  
  return attr;
}

/* Parse the content of an element (text runs and nested elements)
 * starting at byte offset i.
 *
 * text:   the whole text being parsed
 * i:      byte offset of the first content character
 * length: number of valid bytes in text
 * flags:  parse flags; G_MARKUP_PRESERVE_ALL_WHITESPACE disables the
 *         whitespace filter described below
 * new_i:  receives the offset where parsing stopped: the '<' of a
 *         close tag, or the end of the text; on error it is left at
 *         the initial value of i
 * error:  receives the error if parsing fails
 *
 * Returns the list of child nodes in document order.  NULL is returned
 * on error, but also when there are no children, so callers must look
 * at the error to tell the two apart.
 */
static GList*
parse_child_list (const gchar *text,
                  gint i,
                  gint length,
                  GMarkupParseFlags flags,
                  gint *new_i,
                  GError **error)
{
  GList *list = NULL;     /* built in reverse order, fixed up at the end */
  GError *err;
  gint text_start;        /* start of the text run currently being scanned */
  gboolean has_nonwhitespace = FALSE;  /* any text node with non-whitespace? */
  gboolean tmp;
  gint j;

  T("start of child list", i);
  
  *new_i = i;

  text_start = i;
  
  while (i < length)
    {
      gunichar c = g_utf8_get_char (&text[i]);

      if (c == '<')
        {
          GMarkupNode *node;
          
          /* Flush the text run that precedes this tag, if any. */
          if (text_start != i)
            {
              gchar *str;

              T("start of text node", text_start);
              T("end of text node", i);
              
              err = NULL;
              str = unescape_text (text, text_start,
                                   length, i,
                                   &tmp,
                                   &err);

              if (err)
                {
                  if (error)
                    *error = err;
                  else
                    g_error_free (err);
                  
                  free_node_list (list);
              
                  return NULL;
                }

              if (tmp)
                has_nonwhitespace = tmp;
              
              /* FIXME gratuituous string copy */
              list = g_list_prepend (list,
                                     g_markup_node_new_text (str));
              g_free (str);              
            }

          if ((i+1) < length &&
              text[i+1] == '/')
            {
              /* This is a close tag,
               * so we're finished.
               * the parse_element that called
               * us will check that the close
               * tag matches
               */
              goto finished;
            }
          else
            {
              /* An open tag, so recurse */
              
              T("start of element", i);
          
              err = NULL;
              node = parse_element (text, i, length,
                                    flags, &j, &err);
              i = j;
          
              if (err)
                {
                  if (error)
                    *error = err;
                  else
                    g_error_free (err);
              
                  free_node_list (list);
              
                  return NULL;
                }

              list = g_list_prepend (list, node);

              /* The next text run starts right after the element. */
              text_start = i;
            }
        }
      else
        i = next_char (text, i);
    }

  /* Ran off the end of the text: flush any trailing text run. */
  if (text_start != i)
    {
      gchar *str;

      T("start of text node", text_start);
      T("end of text node", i);
      
      err = NULL;
      str = unescape_text (text, text_start,
                           length, i,
                           &tmp,
                           &err);

      if (err)
        {
          if (error)
            *error = err;
          else
            g_error_free (err);
                  
          free_node_list (list);
              
          return NULL;
        }

      if (tmp)
        has_nonwhitespace = tmp;
      
      /* FIXME gratuituous string copy */
      list = g_list_prepend (list,
                             g_markup_node_new_text (str));
      g_free (str);
    }

 finished:
  
  *new_i = i;

  /* If we have text nodes that contain non-whitespace, we don't filter
   * out the text nodes. If all text nodes are just whitespace, then
   * we nuke them all. If we filter, we reverse the list at the
   * same time. The PRESERVE_ALL_WHITESPACE flag turns off the filter
   * behavior.
   */
  if (!has_nonwhitespace &&
      (flags & G_MARKUP_PRESERVE_ALL_WHITESPACE) == 0)
    {
      GList *new_list = NULL;
      GList *tmp_list;

      /* list is in reverse document order; prepending the survivors
       * to new_list while walking it restores document order.
       */
      tmp_list = list;
      while (tmp_list != NULL)
        {
          GMarkupNode *node = tmp_list->data;

          if (node->type == G_MARKUP_NODE_TEXT)
            g_markup_node_free (node);
          else
            new_list = g_list_prepend (new_list, node);

          tmp_list = g_list_next (tmp_list);
        }

      g_list_free (list);
      list = new_list;
    }
  else
    list = g_list_reverse (list); /* no filter, just reverse */
  
  return list;
}

/* Parse the whitespace-separated attributes of a start tag.
 *
 * text:   the whole text being parsed
 * i:      byte offset just after the element name
 * length: number of valid bytes in text
 * flags:  parse flags, passed on to parse_attribute()
 * new_i:  receives the offset where parsing stopped: the '>' that ends
 *         the tag, or an offset >= length if the text ran out first;
 *         on error it is left at the initial value of i
 * error:  receives the error if an attribute fails to parse
 *
 * Returns a list of GMarkupAttribute.  Attributes are prepended as
 * they are read, so the list is in reverse document order.  NULL is
 * returned on error, but also when the tag has no attributes, so
 * callers must look at the error to tell the two apart.
 *
 * The earlier version read text[i] after skipping whitespace without
 * checking that i was still below length; both reads are now guarded.
 * Running out of text simply ends the loop, exactly as the loop
 * condition already did.
 */
static GList*
parse_attribute_list (const gchar *text,
                      gint i,
                      gint length,
                      GMarkupParseFlags flags,
                      gint *new_i,
                      GError **error)
{
  GList *list = NULL;
  GError *err;
  gint j;

  T("start of attr list", i);
  
  *new_i = i;

  while (i < length)
    {
      GMarkupAttribute *attr;
      gunichar c;
      
      i = skip_spaces (text, i, length);
      
      T("after attr list leading ws", i);

      /* Only whitespace was left: nothing more to read. */
      if (i >= length)
        break;
      
      c = g_utf8_get_char (&text[i]);
      if (c == '>')
        break;
      
      err = NULL;
      attr = parse_attribute (text, i, length,
                              flags, &j, &err);
      i = j;

      if (err)
        {
          if (error)
            *error = err;
          else
            g_error_free (err);
          
          free_attribute_list (list);

          return NULL;
        }

      list = g_list_prepend (list, attr);
      
      i = skip_spaces (text, i, length);

      T("after attr list trailing ws", i);

      /* The text may end right after an attribute. */
      if (i >= length)
        break;
      
      c = g_utf8_get_char (&text[i]);
      if (c == '>')
        break;
    }
  
  *new_i = i;

  T("after attr list", i);
  
  return list;
}

/* Parse one complete element -- open tag, attribute list, children
 * and matching close tag -- starting at byte offset @i of @text,
 * which is expected to hold the '<' of the open tag.
 *
 * @text:   buffer being parsed
 * @i:      byte offset of the opening '<'
 * @length: number of bytes of @text that may be examined; this
 *          function itself never reads at or past that offset
 * @flags:  parse flags, handed unchanged to the helper parsers
 * @new_i:  out: on success, the offset just past the '>' of the close
 *          tag; on failure it is left at the starting offset
 * @error:  out, may be NULL: receives a description of the failure
 *
 * Returns the new element node, which takes ownership of the parsed
 * attribute and child lists, or NULL on error (in which case both
 * lists have been freed).
 *
 * Whitespace between the close tag name and its '>' is accepted
 * ("</foo >"), as XML permits.
 */
static GMarkupNode*
parse_element (const gchar *text,
               gint i,
               gint length,
               GMarkupParseFlags flags,
               gint *new_i,
               GError **error)
{
  gunichar c;
  gint name_start;
  gint name_end;
  GError *err = NULL;
  GList *attr_list = NULL;
  GList *child_list = NULL;
  gint close_name_start;
  gint close_name_end;
  GMarkupNodeElement *node;
  gchar *open_name;
  gchar *close_name;
  gint j;

  T("start of element", i);
  
  *new_i = i;
  
  if (i >= length || g_utf8_get_char (&text[i]) != '<')
    {
      set_error (text,
                 i, length,
                 error, 
                 G_MARKUP_ERROR_PARSE,
                 _("Missing '<' at start of element"));
      goto fail;
    }

  i = next_char (text, i);

  if (i >= length)
    {
      set_error (text, i, length, 
                 error,
                 G_MARKUP_ERROR_PARSE,
                 _("Document ended just after '<' character"));
      goto fail;
    }

  /* FIXME parse PI and comments as passthroughs here. */
  
  name_start = i;
  
  c = g_utf8_get_char (&text[i]);

  if (!is_name_start_char (c))
    {
      set_error (text,
                 i, length,
                 error, 
                 G_MARKUP_ERROR_PARSE,
                 _("Character '%s' is not valid at the start of an element name"),
                 unthreadsafe_char_str (c));
      goto fail;
    }

  err = NULL;
  name_end = find_name_end (text, name_start, length, flags, &err);
  if (err)
    goto propagate;

  i = name_end;
  
  if (name_end >= length)
    {
      set_error (text, i, length, 
                 error,
                 G_MARKUP_ERROR_PARSE,
                 _("Document ended just after element name, no '>' seen"));
      goto fail;
    }

  T("end of elem name", name_end);
  
  i = skip_spaces (text, i, length);

  if (i >= length)
    {
      set_error (text, i, length, 
                 error,
                 G_MARKUP_ERROR_PARSE,
                 _("Document ended just after element name, no '>' seen"));
      goto fail;
    }

  err = NULL;
  attr_list = parse_attribute_list (text, i, length,
                                    flags, &j, &err);
  i = j;
  
  if (err)
    goto propagate;

  /* The attribute list may have run to the end of the buffer, so
   * check the offset before looking at the character.
   */
  if (i >= length || g_utf8_get_char (&text[i]) != '>')
    {
      set_error (text, i, length, 
                 error,
                 G_MARKUP_ERROR_PARSE,
                 _("Document ended just after attribute list, no '>' seen"));
      goto fail;
    }

  i = next_char (text, i);
  
  T("start of child list", i);

  err = NULL;
  child_list = parse_child_list (text, i, length,
                                 flags, &j, &err);
  i = j;
  
  if (err)
    goto propagate;

  T("end of child list", i);
  
  /* Should now be at our close tag, absorb it. The child list may
   * have consumed the rest of the buffer, hence the offset check.
   * The error is reported at the start of the element name.
   */
  if (i >= length || g_utf8_get_char (&text[i]) != '<')
    {
      set_error (text, name_start, length, 
                 error,
                 G_MARKUP_ERROR_PARSE,
                 _("Close tag not found at end of element"));
      goto fail;
    }
  
  i = next_char (text, i);
  if (i >= length)
    {
      set_error (text, i, length,
                 error,
                 G_MARKUP_ERROR_PARSE,
                 _("Close tag ends just after '<' character"));
      goto fail;
    }

  c = g_utf8_get_char (&text[i]);
  if (c != '/')
    {
      set_error (text, i, length,
                 error,
                 G_MARKUP_ERROR_PARSE,
                 _("Close tag should begin with '</', '/' character is missing"));
      goto fail;
    }

  i = next_char (text, i);
  if (i >= length)
    {
      set_error (text, i, length,
                 error,
                 G_MARKUP_ERROR_PARSE,
                 _("Close tag ends just after '/' character"));
      goto fail;
    }
  
  close_name_start = i;

  T("start of close name", close_name_start);
  
  err = NULL;
  close_name_end = find_name_end (text, close_name_start, length, flags, &err);
  if (err)
    goto propagate;
  
  i = close_name_end;
  
  if (close_name_end >= length)
    {
      set_error (text, i, length, 
                 error,
                 G_MARKUP_ERROR_PARSE,
                 _("Document ended just after element name in close tag, no '>' seen"));
      goto fail;
    }

  T("end of close name", close_name_end);

  /* XML allows whitespace between the close tag name and '>' */
  i = skip_spaces (text, i, length);
  
  if (i >= length || g_utf8_get_char (&text[i]) != '>')
    {
      set_error (text, i, length, 
                 error,
                 G_MARKUP_ERROR_PARSE,
                 _("Document ended just after close tag name, no '>' seen"));
      goto fail;
    }

  /* Do a bytewise strcmp of the close tag name against the name of
   * the opening tag.
   */
  open_name = g_strndup (&text[name_start],
                         name_end - name_start);
  close_name = g_strndup (&text[close_name_start],
                          close_name_end - close_name_start);

  if (strcmp (open_name, close_name) != 0)
    {
      set_error (text, i, length,
                 error,
                 G_MARKUP_ERROR_PARSE,
                 _("Close tag '%s' does not match opening tag '%s'"),
                 close_name, open_name);

      g_free (open_name);
      g_free (close_name);

      goto fail;
    }

  /* create node with name while we have the name around,
   * yuck
   */
  node = g_markup_node_new_element (open_name);
    
  g_free (open_name);
  g_free (close_name);

  /* We finally have everything; skip past the final > and
   * assemble the node.
   */
  i = next_char (text, i);
  *new_i = i;
  
  node->children = child_list;
  node->attributes = attr_list;

  /* NOTE(review): called with no output arguments, presumably only
   * for a side effect on the node -- confirm against its definition.
   */
  g_markup_node_get_attributes (node, NULL, NULL, NULL);
  
  return (GMarkupNode*) node;

 propagate:
  /* hand a helper's error to the caller, or drop it if unwanted */
  if (error)
    *error = err;
  else
    g_error_free (err);

 fail:
  /* both lists are NULL until their parser has succeeded */
  free_attribute_list (attr_list);
  free_node_list (child_list);

  return NULL;
}

/* Parse marked-up text into a node tree.
 *
 * @text:   the text to parse; must not be NULL
 * @length: number of bytes of @text to parse, or a negative value to
 *          use strlen (@text)
 * @flags:  parse flags, handed unchanged to parse_element()
 * @error:  out, may be NULL: receives a description of the failure
 *
 * The text is first checked with g_utf8_validate(); invalid input
 * yields G_MARKUP_ERROR_BAD_UTF8. Leading whitespace is skipped, and
 * text that holds nothing else yields G_MARKUP_ERROR_EMPTY. Otherwise
 * a single element is parsed with parse_element().
 *
 * Returns the node produced by parse_element(), or NULL on error.
 *
 * NOTE(review): the offset at which parse_element() stopped is
 * discarded, so anything following the first element is not looked at.
 */
GMarkupNode*
g_markup_node_from_string (const gchar *text,
                           gint length,
                           GMarkupParseFlags flags,
                           GError **error)
{
  gint i;
  const gchar *invalid = NULL;
  
  g_return_val_if_fail (text != NULL, NULL);

  if (length < 0)
    length = strlen (text);

  if (!g_utf8_validate (text, length, &invalid))
    {

      if (error)
        {
          gchar *before;
          /* A pointer difference is not an int; convert it once so
           * that it matches the %d conversion below.
           */
          gint bad_offset = (gint) (invalid - text);
          
          before = text_before (text, bad_offset);

          *error = g_error_new (G_MARKUP_ERROR,
                                G_MARKUP_ERROR_BAD_UTF8,
                                _("Invalid UTF-8 character at byte %d in marked-up text. Some text before the bad character was '%s'"),
                                bad_offset,
                                before);
          
          g_free (before);
        }
      
      return NULL;
    }
  
  /* skip leading whitespace */
  i = 0;
  while (i < length)
    {
      gunichar c = g_utf8_get_char (&text[i]);

      if (g_unichar_isspace (c))
        i = next_char (text, i);
      else
        break;
    }

  if (i < length)
    {
      gint ignored;
      return parse_element (text, i, length, flags, &ignored, error);
    }
  else
    {
      if (error)
        {
          *error = g_error_new (G_MARKUP_ERROR,
                                G_MARKUP_ERROR_EMPTY,
                                _("The marked-up text contained nothing but whitespace."));
        }
      
      return NULL;
    }
}






/* Writing a string */

/* Append the nul-terminated string @text to @str, writing each of
 * the characters & < > ' " as its predefined entity reference and
 * copying every other character through unchanged.
 */
static void
append_escaped_text (GString *str,
                     const gchar *text)
{
  const gchar *cursor = text;

  while (*cursor != '\0')
    {
      const gchar *following = g_utf8_next_char (cursor);
      const gchar *entity = NULL;

      if (*cursor == '&')
        entity = "&amp;";
      else if (*cursor == '<')
        entity = "&lt;";
      else if (*cursor == '>')
        entity = "&gt;";
      else if (*cursor == '\'')
        entity = "&apos;";
      else if (*cursor == '"')
        entity = "&quot;";

      if (entity != NULL)
        g_string_append (str, entity);
      else
        /* copy all bytes of the (possibly multi-byte) character */
        g_string_append_len (str, cursor, following - cursor);

      cursor = following;
    }
}

/* Append every attribute in @list to @str in the form name="value",
 * with the value escaped, separating consecutive attributes by a
 * single space. Nothing is appended for an empty list.
 */
static void
append_attributes (GString *str,
                   GList *list)
{
  GList *link;

  if (list == NULL)
    return;

  for (link = list; link != NULL; link = g_list_next (link))
    {
      GMarkupAttribute *attribute = link->data;

      g_string_append (str, attribute->name);
      g_string_append (str, "=\"");
      /* FIXME not the same as for outside-attribute text */
      append_escaped_text (str, attribute->value);
      g_string_append (str, "\" ");
    }

  /* each attribute was followed by a space; drop the final one */
  g_string_truncate (str, str->len - 1);
}

/* Serialize each node of @children, in list order, onto @str by
 * calling append_node() with the given @depth and @flags.
 */
static void
append_node_list (GString *str,
                  GList *children,
                  int depth,
                  GMarkupToStringFlags flags)
{
  GList *link;

  for (link = children; link != NULL; link = g_list_next (link))
    append_node (str, (GMarkupNode*) link->data, depth, flags);
}

/* Append @depth space characters to @str, unless formatting has been
 * switched off with G_MARKUP_NO_FORMATTING, in which case nothing is
 * appended.
 */
static void
indentation (GString *str,
             int depth,
             GMarkupToStringFlags flags)
{
  int remaining;

  if ((flags & G_MARKUP_NO_FORMATTING) != 0)
    return;

  for (remaining = depth; remaining > 0; --remaining)
    g_string_append_c (str, ' ');
}

/* Report whether any text node directly in @children contains at
 * least one character for which g_unichar_isspace() is false.
 * Nodes of other types are not examined.
 */
static gboolean
nonwhitespace_nodes (GList *children)
{
  GList *link;

  for (link = children; link != NULL; link = g_list_next (link))
    {
      GMarkupNode *child = link->data;
      const gchar *p;

      if (child->type != G_MARKUP_NODE_TEXT)
        continue;

      for (p = child->text.text; *p != '\0'; p = g_utf8_next_char (p))
        {
          if (!g_unichar_isspace (g_utf8_get_char (p)))
            return TRUE;
        }
    }

  return FALSE; /* only whitespace, or no text nodes at all */
}

/* Serialize @node, and recursively its children, onto @str.
 *
 * @str:   string being built
 * @node:  node to write
 * @depth: number of spaces to indent an element's tags when
 *         formatting is enabled
 * @flags: G_MARKUP_NO_FORMATTING suppresses all indentation and
 *         newlines
 *
 * Text nodes are written escaped, passthrough nodes verbatim.
 * An element is written as open tag, children, close tag. When an
 * element has non-whitespace text directly beneath it, its children
 * are written with G_MARKUP_NO_FORMATTING forced on and nothing is
 * inserted between the open tag, the children and the close tag, so
 * that the text content is not altered by pretty-printing.
 */
static void
append_node (GString *str,
             GMarkupNode *node,
             int depth,
             GMarkupToStringFlags flags)
{
  switch (node->type)
    {
    case G_MARKUP_NODE_TEXT:
      append_escaped_text (str, node->text.text);
      break;

    case G_MARKUP_NODE_PASSTHROUGH:
      g_string_append (str, ((GMarkupNodePassthrough*)node)->passthrough_text);
      break;
      
    case G_MARKUP_NODE_ELEMENT:
      {
        GMarkupToStringFlags child_flags = flags;

        /* If we have non-whitespace text immediately under this
         * node, we can't do formatting for the child nodes, we have
         * to dump them literally. So turn on G_MARKUP_NO_FORMATTING
         * for the children if it's off and we find nonwhitespace
         * text nodes.
         */
        if ((flags & G_MARKUP_NO_FORMATTING) == 0 &&
            nonwhitespace_nodes (node->element.children))
          child_flags = flags | G_MARKUP_NO_FORMATTING;

        indentation (str, depth, flags);
        
        g_string_append_c (str, '<');
        g_string_append (str, node->element.name);

        /* only separate name and attributes if there are any */
        if (node->element.attributes != NULL)
          {
            g_string_append_c (str, ' ');
            append_attributes (str, node->element.attributes);
          }

        g_string_append_c (str, '>');

        /* put in a newline after the open-element if the children
         * may be formatted
         */
        if ((child_flags & G_MARKUP_NO_FORMATTING) == 0)
          g_string_append_c (str, '\n');

        append_node_list (str, node->element.children, depth + 1,
                          child_flags);

        /* Indent the close tag only when the children were
         * formatted; after literal text the spaces would become
         * part of the content.
         */
        indentation (str, depth, child_flags);

        g_string_append (str, "</");
        g_string_append (str, node->element.name);
        g_string_append_c (str, '>');

        /* put a newline afterward if formatting is allowed within our
         * parent node
         */
        if ((flags & G_MARKUP_NO_FORMATTING) == 0)
          g_string_append_c (str, '\n');
      }
      break;

    default:
      g_assert_not_reached ();
      break;
    }
}

/* Serialize @node, and everything beneath it, to a newly allocated
 * string. @flags is passed on to append_node(). Returns NULL if
 * @node is NULL.
 */
gchar*
g_markup_node_to_string (GMarkupNode *node, GMarkupToStringFlags flags)
{
  GString *buffer;
  gchar *result;

  g_return_val_if_fail (node != NULL, NULL);

  buffer = g_string_new ("");
  append_node (buffer, node, 0, flags);

  /* keep the character data, release only the GString wrapper */
  result = buffer->str;
  g_string_free (buffer, FALSE);

  return result;
}

/* Allocate a new attribute holding private copies of @name and
 * @value, made with g_strdup(). Either argument may be NULL.
 */
static GMarkupAttribute*
attribute_new (const gchar *name, const gchar *value)
{
  GMarkupAttribute *result = g_new (GMarkupAttribute, 1);

  /* name/value are allowed to be NULL */
  result->name = g_strdup (name);
  result->value = g_strdup (value);

  return result;
}

/* Free @attr together with its name and value strings.
 * Passing NULL is allowed and does nothing, as with g_free().
 */
static void
attribute_free (GMarkupAttribute *attr)
{
  if (attr == NULL)
    return;

  g_free (attr->name);
  g_free (attr->value);
  g_free (attr);
}
Follow-Ups:
- Re: simple markup format
  - From: ERDI Gergo
- Re: simple markup format
  - From: Dominic Ludlam
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]