[libxml2] Switch the HTML parser to be non-recursive
- From: Daniel Veillard <veillard src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [libxml2] Switch the HTML parser to be non-recursive
- Date: Mon, 15 Mar 2010 14:19:11 +0000 (UTC)
commit 615904f58210c864dd54aefbf3e55b85e531bc1e
Author: Eugene Pimenov <libc me com>
Date: Mon Mar 15 15:16:02 2010 +0100
Switch the HTML parser to be non-recursive
* HTMLparser.c: new htmlParseElementInternal non recursive, with
htmlParseContentInternal and new function to handle node info
and element end.
* include/libxml/parser.h: add new stack for element info in parser
context
* parserInternals.c: fee element info stack
HTMLparser.c | 371 +++++++++++++++++++++++++++++++++++++++++++++--
include/libxml/parser.h | 6 +
parserInternals.c | 1 +
3 files changed, 364 insertions(+), 14 deletions(-)
---
diff --git a/HTMLparser.c b/HTMLparser.c
index 2b569ca..a3fbb7f 100644
--- a/HTMLparser.c
+++ b/HTMLparser.c
@@ -209,6 +209,59 @@ htmlnamePop(htmlParserCtxtPtr ctxt)
return (ret);
}
+/**
+ * htmlNodeInfoPush:
+ * @ctxt: an HTML parser context
+ * @value: the node info
+ *
+ * Pushes a new element name on top of the node info stack
+ *
+ * Returns 0 in case of error, the index in the stack otherwise
+ */
+static int
+htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
+{
+ if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
+ if (ctxt->nodeInfoMax == 0)
+ ctxt->nodeInfoMax = 5;
+ ctxt->nodeInfoMax *= 2;
+ ctxt->nodeInfoTab = (htmlParserNodeInfo *)
+ xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
+ ctxt->nodeInfoMax *
+ sizeof(ctxt->nodeInfoTab[0]));
+ if (ctxt->nodeInfoTab == NULL) {
+ htmlErrMemory(ctxt, NULL);
+ return (0);
+ }
+ }
+ ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
+ ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
+ return (ctxt->nodeInfoNr++);
+}
+
+/**
+ * htmlNodeInfoPop:
+ * @ctxt: an HTML parser context
+ *
+ * Pops the top element name from the node info stack
+ *
+ * Returns 0 in case of error, the pointer to NodeInfo otherwise
+ */
+static htmlParserNodeInfo *
+htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
+{
+ if (ctxt->nodeInfoNr <= 0)
+ return (NULL);
+ ctxt->nodeInfoNr--;
+ if (ctxt->nodeInfoNr < 0)
+ return (NULL);
+ if (ctxt->nodeInfoNr > 0)
+ ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
+ else
+ ctxt->nodeInfo = NULL;
+ return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
+}
+
/*
* Macros for accessing the content. Those should be used only by the parser,
* and not exported.
@@ -3927,6 +3980,7 @@ htmlParseReference(htmlParserCtxtPtr ctxt) {
* @ctxt: an HTML parser context
*
* Parse a content: comment, sub-element, reference or text.
+ * Kept for compatibility with old code
*/
static void
@@ -4075,23 +4129,11 @@ htmlParseContent(htmlParserCtxtPtr ctxt) {
}
/**
- * htmlParseContent:
- * @ctxt: an HTML parser context
- *
- * Parse a content: comment, sub-element, reference or text.
- */
-
-void
-__htmlParseContent(void *ctxt) {
- if (ctxt != NULL)
- htmlParseContent((htmlParserCtxtPtr) ctxt);
-}
-
-/**
* htmlParseElement:
* @ctxt: an HTML parser context
*
* parse an HTML element, this is highly recursive
+ * this is kept for compatibility with previous code versions
*
* [39] element ::= EmptyElemTag | STag content ETag
*
@@ -4219,6 +4261,303 @@ htmlParseElement(htmlParserCtxtPtr ctxt) {
xmlFree(currentNode);
}
+static void
+htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
+ /*
+ * Capture end position and add node
+ */
+ if ( ctxt->node != NULL && ctxt->record_info ) {
+ ctxt->nodeInfo->end_pos = ctxt->input->consumed +
+ (CUR_PTR - ctxt->input->base);
+ ctxt->nodeInfo->end_line = ctxt->input->line;
+ ctxt->nodeInfo->node = ctxt->node;
+ xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
+ htmlNodeInfoPop(ctxt);
+ }
+ if (!IS_CHAR_CH(CUR)) {
+ htmlAutoCloseOnEnd(ctxt);
+ }
+}
+
+/**
+ * htmlParseElementInternal:
+ * @ctxt: an HTML parser context
+ *
+ * parse an HTML element, new version, non recursive
+ *
+ * [39] element ::= EmptyElemTag | STag content ETag
+ *
+ * [41] Attribute ::= Name Eq AttValue
+ */
+
+static void
+htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
+ const xmlChar *name;
+ const htmlElemDesc * info;
+ htmlParserNodeInfo node_info;
+ int failed;
+ int depth;
+ const xmlChar *oldptr;
+
+ if ((ctxt == NULL) || (ctxt->input == NULL)) {
+ htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
+ "htmlParseElementInternal: context error\n", NULL, NULL);
+ return;
+ }
+
+ if (ctxt->instate == XML_PARSER_EOF)
+ return;
+
+ /* Capture start position */
+ if (ctxt->record_info) {
+ node_info.begin_pos = ctxt->input->consumed +
+ (CUR_PTR - ctxt->input->base);
+ node_info.begin_line = ctxt->input->line;
+ }
+
+ failed = htmlParseStartTag(ctxt);
+ name = ctxt->name;
+ if ((failed == -1) || (name == NULL)) {
+ if (CUR == '>')
+ NEXT;
+ return;
+ }
+
+ /*
+ * Lookup the info for that element.
+ */
+ info = htmlTagLookup(name);
+ if (info == NULL) {
+ htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
+ "Tag %s invalid\n", name, NULL);
+ }
+
+ /*
+ * Check for an Empty Element labeled the XML/SGML way
+ */
+ if ((CUR == '/') && (NXT(1) == '>')) {
+ SKIP(2);
+ if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
+ ctxt->sax->endElement(ctxt->userData, name);
+ htmlnamePop(ctxt);
+ return;
+ }
+
+ if (CUR == '>') {
+ NEXT;
+ } else {
+ htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
+ "Couldn't find end of Start Tag %s\n", name, NULL);
+
+ /*
+ * end of parsing of this node.
+ */
+ if (xmlStrEqual(name, ctxt->name)) {
+ nodePop(ctxt);
+ htmlnamePop(ctxt);
+ }
+
+ if (ctxt->record_info)
+ htmlNodeInfoPush(ctxt, &node_info);
+ htmlParserFinishElementParsing(ctxt);
+ return;
+ }
+
+ /*
+ * Check for an Empty Element from DTD definition
+ */
+ if ((info != NULL) && (info->empty)) {
+ if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
+ ctxt->sax->endElement(ctxt->userData, name);
+ htmlnamePop(ctxt);
+ return;
+ }
+
+ if (ctxt->record_info)
+ htmlNodeInfoPush(ctxt, &node_info);
+}
+
+/**
+ * htmlParseContentInternal:
+ * @ctxt: an HTML parser context
+ *
+ * Parse a content: comment, sub-element, reference or text.
+ * New version for non recursive htmlParseElementInternal
+ */
+
+static void
+htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
+ xmlChar *currentNode;
+ int depth;
+ const xmlChar *name;
+
+ currentNode = xmlStrdup(ctxt->name);
+ depth = ctxt->nameNr;
+ while (1) {
+ long cons = ctxt->nbChars;
+
+ GROW;
+
+ if (ctxt->instate == XML_PARSER_EOF)
+ break;
+
+ /*
+ * Our tag or one of it's parent or children is ending.
+ */
+ if ((CUR == '<') && (NXT(1) == '/')) {
+ if (htmlParseEndTag(ctxt) &&
+ ((currentNode != NULL) || (ctxt->nameNr == 0))) {
+ if (currentNode != NULL)
+ xmlFree(currentNode);
+
+ currentNode = xmlStrdup(ctxt->name);
+ depth = ctxt->nameNr;
+ }
+ continue; /* while */
+ }
+
+ else if ((CUR == '<') &&
+ ((IS_ASCII_LETTER(NXT(1))) ||
+ (NXT(1) == '_') || (NXT(1) == ':'))) {
+ name = htmlParseHTMLName_nonInvasive(ctxt);
+ if (name == NULL) {
+ htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
+ "htmlParseStartTag: invalid element name\n",
+ NULL, NULL);
+ /* Dump the bogus tag like browsers do */
+ while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
+ NEXT;
+
+ htmlParserFinishElementParsing(ctxt);
+ if (currentNode != NULL)
+ xmlFree(currentNode);
+
+ currentNode = xmlStrdup(ctxt->name);
+ depth = ctxt->nameNr;
+ continue;
+ }
+
+ if (ctxt->name != NULL) {
+ if (htmlCheckAutoClose(name, ctxt->name) == 1) {
+ htmlAutoClose(ctxt, name);
+ continue;
+ }
+ }
+ }
+
+ /*
+ * Has this node been popped out during parsing of
+ * the next element
+ */
+ if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
+ (!xmlStrEqual(currentNode, ctxt->name)))
+ {
+ htmlParserFinishElementParsing(ctxt);
+ if (currentNode != NULL) xmlFree(currentNode);
+
+ currentNode = xmlStrdup(ctxt->name);
+ depth = ctxt->nameNr;
+ continue;
+ }
+
+ if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
+ (xmlStrEqual(currentNode, BAD_CAST"style")))) {
+ /*
+ * Handle SCRIPT/STYLE separately
+ */
+ htmlParseScript(ctxt);
+ } else {
+ /*
+ * Sometimes DOCTYPE arrives in the middle of the document
+ */
+ if ((CUR == '<') && (NXT(1) == '!') &&
+ (UPP(2) == 'D') && (UPP(3) == 'O') &&
+ (UPP(4) == 'C') && (UPP(5) == 'T') &&
+ (UPP(6) == 'Y') && (UPP(7) == 'P') &&
+ (UPP(8) == 'E')) {
+ htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
+ "Misplaced DOCTYPE declaration\n",
+ BAD_CAST "DOCTYPE" , NULL);
+ htmlParseDocTypeDecl(ctxt);
+ }
+
+ /*
+ * First case : a comment
+ */
+ if ((CUR == '<') && (NXT(1) == '!') &&
+ (NXT(2) == '-') && (NXT(3) == '-')) {
+ htmlParseComment(ctxt);
+ }
+
+ /*
+ * Second case : a Processing Instruction.
+ */
+ else if ((CUR == '<') && (NXT(1) == '?')) {
+ htmlParsePI(ctxt);
+ }
+
+ /*
+ * Third case : a sub-element.
+ */
+ else if (CUR == '<') {
+ htmlParseElementInternal(ctxt);
+ if (currentNode != NULL) xmlFree(currentNode);
+
+ currentNode = xmlStrdup(ctxt->name);
+ depth = ctxt->nameNr;
+ }
+
+ /*
+ * Fourth case : a reference. If if has not been resolved,
+ * parsing returns it's Name, create the node
+ */
+ else if (CUR == '&') {
+ htmlParseReference(ctxt);
+ }
+
+ /*
+ * Fifth case : end of the resource
+ */
+ else if (CUR == 0) {
+ htmlAutoCloseOnEnd(ctxt);
+ break;
+ }
+
+ /*
+ * Last case, text. Note that References are handled directly.
+ */
+ else {
+ htmlParseCharData(ctxt);
+ }
+
+ if (cons == ctxt->nbChars) {
+ if (ctxt->node != NULL) {
+ htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
+ "detected an error in element content\n",
+ NULL, NULL);
+ }
+ break;
+ }
+ }
+ GROW;
+ }
+ if (currentNode != NULL) xmlFree(currentNode);
+}
+
+/**
+ * htmlParseContent:
+ * @ctxt: an HTML parser context
+ *
+ * Parse a content: comment, sub-element, reference or text.
+ * This is the entry point when called from parser.c
+ */
+
+void
+__htmlParseContent(void *ctxt) {
+ if (ctxt != NULL)
+ htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
+}
+
/**
* htmlParseDocument:
* @ctxt: an HTML parser context
@@ -4323,7 +4662,7 @@ htmlParseDocument(htmlParserCtxtPtr ctxt) {
/*
* Time to start parsing the tree itself
*/
- htmlParseContent(ctxt);
+ htmlParseContentInternal(ctxt);
/*
* autoclose
@@ -4440,6 +4779,10 @@ htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
ctxt->nameMax = 10;
ctxt->name = NULL;
+ ctxt->nodeInfoTab = NULL;
+ ctxt->nodeInfoNr = 0;
+ ctxt->nodeInfoMax = 0;
+
if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
else {
ctxt->sax = sax;
diff --git a/include/libxml/parser.h b/include/libxml/parser.h
index 148ee03..dd79c42 100644
--- a/include/libxml/parser.h
+++ b/include/libxml/parser.h
@@ -302,6 +302,12 @@ struct _xmlParserCtxt {
xmlParserMode parseMode; /* the parser mode */
unsigned long nbentities; /* number of entities references */
unsigned long sizeentities; /* size of parsed entities */
+
+ /* for use by HTML non-recursive parser */
+ xmlParserNodeInfo *nodeInfo; /* Current NodeInfo */
+ int nodeInfoNr; /* Depth of the parsing stack */
+ int nodeInfoMax; /* Max depth of the parsing stack */
+ xmlParserNodeInfo *nodeInfoTab; /* array of nodeInfos */
};
/**
diff --git a/parserInternals.c b/parserInternals.c
index ff20435..2404ddf 100644
--- a/parserInternals.c
+++ b/parserInternals.c
@@ -1782,6 +1782,7 @@ xmlFreeParserCtxt(xmlParserCtxtPtr ctxt)
if (ctxt->spaceTab != NULL) xmlFree(ctxt->spaceTab);
if (ctxt->nameTab != NULL) xmlFree((xmlChar * *)ctxt->nameTab);
if (ctxt->nodeTab != NULL) xmlFree(ctxt->nodeTab);
+ if (ctxt->nodeInfoTab != NULL) xmlFree(ctxt->nodeInfoTab);
if (ctxt->inputTab != NULL) xmlFree(ctxt->inputTab);
if (ctxt->version != NULL) xmlFree((char *) ctxt->version);
if (ctxt->encoding != NULL) xmlFree((char *) ctxt->encoding);
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]