[libxml2] Switch the HTML parser to be non-recursive

From: Daniel Veillard <veillard src gnome org>
To: commits-list gnome org
Cc:
Subject: [libxml2] Switch the HTML parser to be non-recursive
Date: Mon, 15 Mar 2010 14:19:11 +0000 (UTC)
commit 615904f58210c864dd54aefbf3e55b85e531bc1e
Author: Eugene Pimenov <libc me com>
Date:   Mon Mar 15 15:16:02 2010 +0100

    Switch the HTML parser to be non-recursive
    
    * HTMLparser.c: new htmlParseElementInternal non recursive, with
      htmlParseContentInternal and new function to handle node info
      and element end.
    * include/libxml/parser.h: add new stack for element info in parser
      context
    * parserInternals.c: free element info stack

 HTMLparser.c            |  371 +++++++++++++++++++++++++++++++++++++++++++++--
 include/libxml/parser.h |    6 +
 parserInternals.c       |    1 +
 3 files changed, 364 insertions(+), 14 deletions(-)
---
diff --git a/HTMLparser.c b/HTMLparser.c
index 2b569ca..a3fbb7f 100644
--- a/HTMLparser.c
+++ b/HTMLparser.c
@@ -209,6 +209,59 @@ htmlnamePop(htmlParserCtxtPtr ctxt)
     return (ret);
 }
 
+/**
+ * htmlNodeInfoPush:
+ * @ctxt:  an HTML parser context
+ * @value:  the node info
+ *
+ * Pushes a copy of @value on top of the node info stack. If growing the
+ * stack fails, the existing stack is left intact and nothing is pushed.
+ *
+ * Returns 0 in case of error, the index in the stack otherwise
+ */
+static int
+htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
+{
+    if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
+        /* first allocation holds 10 entries, later ones double the size */
+        int newMax = (ctxt->nodeInfoMax == 0) ? 10 : ctxt->nodeInfoMax * 2;
+        htmlParserNodeInfo *tmp = (htmlParserNodeInfo *)
+            xmlRealloc(ctxt->nodeInfoTab, newMax * sizeof(htmlParserNodeInfo));
+        if (tmp == NULL) {
+            htmlErrMemory(ctxt, NULL);
+            return (0);
+        }
+        ctxt->nodeInfoTab = tmp;
+        ctxt->nodeInfoMax = newMax;
+    }
+    ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
+    ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
+    return (ctxt->nodeInfoNr++);
+}
+
+/**
+ * htmlNodeInfoPop:
+ * @ctxt:  an HTML parser context
+ *
+ * Pops the top entry from the node info stack and makes the entry
+ * below it (if any) the current ctxt->nodeInfo.
+ *
+ * Returns NULL if the stack is empty, otherwise a pointer to the popped
+ * entry; it stays inside nodeInfoTab and is only valid until the next push.
+ */
+static htmlParserNodeInfo *
+htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
+{
+    if (ctxt->nodeInfoNr <= 0)
+        return (NULL);
+    ctxt->nodeInfoNr--;
+    if (ctxt->nodeInfoNr > 0)
+        ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
+    else
+        ctxt->nodeInfo = NULL;
+    return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
+}
+
 /*
  * Macros for accessing the content. Those should be used only by the parser,
  * and not exported.
@@ -3927,6 +3980,7 @@ htmlParseReference(htmlParserCtxtPtr ctxt) {
  * @ctxt:  an HTML parser context
  *
  * Parse a content: comment, sub-element, reference or text.
+ * Kept for compatibility with old code
  */
 
 static void
@@ -4075,23 +4129,11 @@ htmlParseContent(htmlParserCtxtPtr ctxt) {
 }
 
 /**
- * htmlParseContent:
- * @ctxt:  an HTML parser context
- *
- * Parse a content: comment, sub-element, reference or text.
- */
-
-void
-__htmlParseContent(void *ctxt) {
-    if (ctxt != NULL)
-	htmlParseContent((htmlParserCtxtPtr) ctxt);
-}
-
-/**
  * htmlParseElement:
  * @ctxt:  an HTML parser context
  *
  * parse an HTML element, this is highly recursive
+ * this is kept for compatibility with previous code versions
  *
  * [39] element ::= EmptyElemTag | STag content ETag
  *
@@ -4219,6 +4261,303 @@ htmlParseElement(htmlParserCtxtPtr ctxt) {
 	xmlFree(currentNode);
 }
 
+static void
+htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
+    /*
+     * Record the end position of the current element in the top node info
+     * entry and pop it; skipped when no entry is on the stack (NULL).
+     */
+    if (ctxt->node != NULL && ctxt->record_info && ctxt->nodeInfo != NULL) {
+       ctxt->nodeInfo->end_pos = ctxt->input->consumed +
+                                (CUR_PTR - ctxt->input->base);
+       ctxt->nodeInfo->end_line = ctxt->input->line;
+       ctxt->nodeInfo->node = ctxt->node;
+       xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
+       htmlNodeInfoPop(ctxt);
+    }
+    if (!IS_CHAR_CH(CUR))
+       htmlAutoCloseOnEnd(ctxt);
+}
+
+/**
+ * htmlParseElementInternal:
+ * @ctxt:  an HTML parser context
+ *
+ * parse the start tag of an HTML element, new version, non recursive.
+ * Empty elements are closed here; otherwise content and end tag are left
+ * to the caller and, if record_info is set, the node info is pushed.
+ *
+ * [39] element ::= EmptyElemTag | STag content ETag
+ *
+ * [41] Attribute ::= Name Eq AttValue
+ */
+
+static void
+htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
+    const xmlChar *name;
+    const htmlElemDesc * info;
+    htmlParserNodeInfo node_info;
+    int failed;
+
+    if ((ctxt == NULL) || (ctxt->input == NULL)) {
+	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
+		     "htmlParseElementInternal: context error\n", NULL, NULL);
+	return;
+    }
+
+    if (ctxt->instate == XML_PARSER_EOF)
+        return;
+
+    /* Capture start position */
+    if (ctxt->record_info) {
+        node_info.begin_pos = ctxt->input->consumed +
+                          (CUR_PTR - ctxt->input->base);
+	node_info.begin_line = ctxt->input->line;
+    }
+
+    failed = htmlParseStartTag(ctxt);
+    name = ctxt->name;
+    if ((failed == -1) || (name == NULL)) {
+	if (CUR == '>')
+	    NEXT;
+        return;
+    }
+
+    /*
+     * Lookup the info for that element.
+     */
+    info = htmlTagLookup(name);
+    if (info == NULL) {
+	htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
+	             "Tag %s invalid\n", name, NULL);
+    }
+
+    /*
+     * Check for an Empty Element labeled the XML/SGML way
+     */
+    if ((CUR == '/') && (NXT(1) == '>')) {
+        SKIP(2);
+	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
+	    ctxt->sax->endElement(ctxt->userData, name);
+	htmlnamePop(ctxt);
+	return;
+    }
+
+    if (CUR == '>') {
+        NEXT;
+    } else {
+	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
+	             "Couldn't find end of Start Tag %s\n", name, NULL);
+
+	/*
+	 * end of parsing of this node.
+	 */
+	if (xmlStrEqual(name, ctxt->name)) {
+	    nodePop(ctxt);
+	    htmlnamePop(ctxt);
+	}
+
+        if (ctxt->record_info)
+            htmlNodeInfoPush(ctxt, &node_info);
+        htmlParserFinishElementParsing(ctxt);
+	return;
+    }
+
+    /*
+     * Check for an Empty Element from DTD definition
+     */
+    if ((info != NULL) && (info->empty)) {
+	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
+	    ctxt->sax->endElement(ctxt->userData, name);
+	htmlnamePop(ctxt);
+	return;
+    }
+
+    if (ctxt->record_info)
+        htmlNodeInfoPush(ctxt, &node_info);
+}
+
+/**
+ * htmlParseContentInternal:
+ * @ctxt:  an HTML parser context
+ *
+ * Parse a content: comment, sub-element, reference or text.
+ * New version for the non recursive htmlParseElementInternal: a single loop
+ */
+
+static void
+htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
+    xmlChar *currentNode;
+    int depth;
+    const xmlChar *name;
+
+    currentNode = xmlStrdup(ctxt->name); /* private copy, freed on exit */
+    depth = ctxt->nameNr; /* name stack depth when currentNode was taken */
+    while (1) {
+	long cons = ctxt->nbChars; /* compared below to detect no progress */
+
+        GROW;
+
+        if (ctxt->instate == XML_PARSER_EOF)
+            break;
+
+	/*
+	 * Our tag or one of its parents or children is ending.
+	 */
+        if ((CUR == '<') && (NXT(1) == '/')) {
+	    if (htmlParseEndTag(ctxt) &&
+		((currentNode != NULL) || (ctxt->nameNr == 0))) {
+		if (currentNode != NULL)
+		    xmlFree(currentNode);
+
+	        currentNode = xmlStrdup(ctxt->name);
+	        depth = ctxt->nameNr;
+	    }
+	    continue; /* while */
+        }
+
+	else if ((CUR == '<') &&
+	         ((IS_ASCII_LETTER(NXT(1))) ||
+		  (NXT(1) == '_') || (NXT(1) == ':'))) {
+	    name = htmlParseHTMLName_nonInvasive(ctxt);
+	    if (name == NULL) {
+	        htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
+			 "htmlParseStartTag: invalid element name\n",
+			 NULL, NULL);
+	        /* Dump the bogus tag like browsers do */
+	        while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
+	            NEXT;
+
+	        htmlParserFinishElementParsing(ctxt);
+	        if (currentNode != NULL)
+	            xmlFree(currentNode);
+
+	        currentNode = xmlStrdup(ctxt->name);
+	        depth = ctxt->nameNr;
+	        continue;
+	    }
+
+	    if (ctxt->name != NULL) {
+	        if (htmlCheckAutoClose(name, ctxt->name) == 1) {
+	            htmlAutoClose(ctxt, name);
+	            continue;
+	        }
+	    }
+	}
+
+	/*
+	 * Has this node been popped out during parsing of
+	 * the next element? If so finish it and resync currentNode/depth.
+	 */
+        if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
+	    (!xmlStrEqual(currentNode, ctxt->name)))
+	     {
+	    htmlParserFinishElementParsing(ctxt);
+	    if (currentNode != NULL) xmlFree(currentNode);
+
+	    currentNode = xmlStrdup(ctxt->name);
+	    depth = ctxt->nameNr;
+	    continue;
+	}
+
+	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
+	    (xmlStrEqual(currentNode, BAD_CAST"style")))) {
+	    /*
+	     * Handle SCRIPT/STYLE separately
+	     */
+	    htmlParseScript(ctxt);
+	} else {
+	    /*
+	     * Sometimes DOCTYPE arrives in the middle of the document
+	     */
+	    if ((CUR == '<') && (NXT(1) == '!') &&
+		(UPP(2) == 'D') && (UPP(3) == 'O') &&
+		(UPP(4) == 'C') && (UPP(5) == 'T') &&
+		(UPP(6) == 'Y') && (UPP(7) == 'P') &&
+		(UPP(8) == 'E')) {
+		htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
+		             "Misplaced DOCTYPE declaration\n",
+			     BAD_CAST "DOCTYPE" , NULL);
+		htmlParseDocTypeDecl(ctxt);
+	    }
+
+	    /*
+	     * First case :  a comment
+	     */
+	    if ((CUR == '<') && (NXT(1) == '!') &&
+		(NXT(2) == '-') && (NXT(3) == '-')) {
+		htmlParseComment(ctxt);
+	    }
+
+	    /*
+	     * Second case : a Processing Instruction.
+	     */
+	    else if ((CUR == '<') && (NXT(1) == '?')) {
+		htmlParsePI(ctxt);
+	    }
+
+	    /*
+	     * Third case :  a sub-element.
+	     */
+	    else if (CUR == '<') {
+		htmlParseElementInternal(ctxt);
+		if (currentNode != NULL) xmlFree(currentNode);
+
+		currentNode = xmlStrdup(ctxt->name);
+		depth = ctxt->nameNr;
+	    }
+
+	    /*
+	     * Fourth case : a reference. If it has not been resolved,
+	     *    parsing returns its Name, create the node
+	     */
+	    else if (CUR == '&') {
+		htmlParseReference(ctxt);
+	    }
+
+	    /*
+	     * Fifth case : end of the resource
+	     */
+	    else if (CUR == 0) {
+		htmlAutoCloseOnEnd(ctxt);
+		break;
+	    }
+
+	    /*
+	     * Last case, text. Note that References are handled directly.
+	     */
+	    else {
+		htmlParseCharData(ctxt);
+	    }
+
+	    if (cons == ctxt->nbChars) { /* nbChars unchanged since loop top */
+		if (ctxt->node != NULL) {
+		    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
+		                 "detected an error in element content\n",
+				 NULL, NULL);
+		}
+		break;
+	    }
+	}
+        GROW;
+    }
+    if (currentNode != NULL) xmlFree(currentNode);
+}
+
+/**
+ * __htmlParseContent:
+ * @ctxt:  an HTML parser context, passed as an untyped pointer
+ *
+ * Entry point used from parser.c to parse content (comment, sub-element,
+ * reference or text); a NULL @ctxt is silently ignored.
+ */
+void
+__htmlParseContent(void *ctxt) {
+    if (ctxt == NULL)
+        return;
+    htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
+}
+
 /**
  * htmlParseDocument:
  * @ctxt:  an HTML parser context
@@ -4323,7 +4662,7 @@ htmlParseDocument(htmlParserCtxtPtr ctxt) {
     /*
      * Time to start parsing the tree itself
      */
-    htmlParseContent(ctxt);
+    htmlParseContentInternal(ctxt);
 
     /*
      * autoclose
@@ -4440,6 +4779,10 @@ htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
     ctxt->nameMax = 10;
     ctxt->name = NULL;
 
+    ctxt->nodeInfoTab = NULL;
+    ctxt->nodeInfoNr  = 0;
+    ctxt->nodeInfoMax = 0;
+
     if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
     else {
         ctxt->sax = sax;
diff --git a/include/libxml/parser.h b/include/libxml/parser.h
index 148ee03..dd79c42 100644
--- a/include/libxml/parser.h
+++ b/include/libxml/parser.h
@@ -302,6 +302,12 @@ struct _xmlParserCtxt {
     xmlParserMode     parseMode;    /* the parser mode */
     unsigned long    nbentities;    /* number of entities references */
     unsigned long  sizeentities;    /* size of parsed entities */
+
+    /* for use by HTML non-recursive parser */
+    xmlParserNodeInfo *nodeInfo;      /* Current NodeInfo */
+    int                nodeInfoNr;    /* Depth of the parsing stack */
+    int                nodeInfoMax;   /* Max depth of the parsing stack */
+    xmlParserNodeInfo *nodeInfoTab;   /* array of nodeInfos */
 };
 
 /**
diff --git a/parserInternals.c b/parserInternals.c
index ff20435..2404ddf 100644
--- a/parserInternals.c
+++ b/parserInternals.c
@@ -1782,6 +1782,7 @@ xmlFreeParserCtxt(xmlParserCtxtPtr ctxt)
     if (ctxt->spaceTab != NULL) xmlFree(ctxt->spaceTab);
     if (ctxt->nameTab != NULL) xmlFree((xmlChar * *)ctxt->nameTab);
     if (ctxt->nodeTab != NULL) xmlFree(ctxt->nodeTab);
+    if (ctxt->nodeInfoTab != NULL) xmlFree(ctxt->nodeInfoTab);
     if (ctxt->inputTab != NULL) xmlFree(ctxt->inputTab);
     if (ctxt->version != NULL) xmlFree((char *) ctxt->version);
     if (ctxt->encoding != NULL) xmlFree((char *) ctxt->encoding);
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]