[libxml2/2.9] Fix recovery from invalid HTML start tags

From: Nick Wellnhofer <nwellnhof src gnome org>
To: commits-list gnome org
Cc:
Subject: [libxml2/2.9] Fix recovery from invalid HTML start tags
Date: Fri, 29 Apr 2022 16:00:22 +0000 (UTC)
commit 148be64edf88c00dde50cb0ef584a4c7a3d30212
Author: Nick Wellnhofer <wellnhofer aevum de>
Date:   Tue Feb 22 18:15:53 2022 +0100

    Fix recovery from invalid HTML start tags
    
    Only try to parse a start tag if there's a '<' followed by an ASCII
    letter. This is more in line with HTML5 and the old behavior in
    recovery mode. Emit a literal '<' if the following character is
    invalid.
    
    Fixes #101.
    Fixes #339.

 HTMLparser.c | 44 +++++++++++++++++++++-----------------------
 1 file changed, 21 insertions(+), 23 deletions(-)
---
diff --git a/HTMLparser.c b/HTMLparser.c
index e235f57b..e720bb20 100644
--- a/HTMLparser.c
+++ b/HTMLparser.c
@@ -3961,26 +3961,6 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) {
        htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
                     "htmlParseStartTag: invalid element name\n",
                     NULL, NULL);
-        /*
-         * The recovery code is disabled for now as it can result in
-         * quadratic behavior with the push parser. htmlParseStartTag
-         * must consume all content up to the final '>' in order to avoid
-         * rescanning for this terminator.
-         *
-         * For a proper fix in line with HTML5, htmlParseStartTag and
-         * htmlParseElement should only be called when there's an ASCII
-         * alpha character following the initial '<'. Otherwise, the '<'
-         * should be emitted as text (unless followed by '!', '/' or '?').
-         */
-#if 0
-       /* if recover preserve text on classic misconstructs */
-       if ((ctxt->recovery) && ((IS_BLANK_CH(CUR)) || (CUR == '<') ||
-           (CUR == '=') || (CUR == '>') || (((CUR >= '0') && (CUR <= '9'))))) {
-           htmlParseCharDataInternal(ctxt, '<');
-           return(-1);
-       }
-#endif
-
        /* Dump the bogus tag like browsers do */
        while ((CUR != 0) && (CUR != '>') &&
                (ctxt->instate != XML_PARSER_EOF))
@@ -4433,9 +4413,15 @@ htmlParseContent(htmlParserCtxtPtr ctxt) {
            /*
             * Third case :  a sub-element.
             */
-           else if (CUR == '<') {
+           else if ((CUR == '<') && IS_ASCII_LETTER(NXT(1))) {
                htmlParseElement(ctxt);
            }
+           else if (CUR == '<') {
+                if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
+                    (ctxt->sax->characters != NULL))
+                    ctxt->sax->characters(ctxt->userData, BAD_CAST "<", 1);
+                NEXT;
+           }
 
            /*
             * Fourth case : a reference. If if has not been resolved,
@@ -4832,13 +4818,19 @@ htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
            /*
             * Third case :  a sub-element.
             */
-           else if (CUR == '<') {
+           else if ((CUR == '<') && IS_ASCII_LETTER(NXT(1))) {
                htmlParseElementInternal(ctxt);
                if (currentNode != NULL) xmlFree(currentNode);
 
                currentNode = xmlStrdup(ctxt->name);
                depth = ctxt->nameNr;
            }
+           else if (CUR == '<') {
+                if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
+                    (ctxt->sax->characters != NULL))
+                    ctxt->sax->characters(ctxt->userData, BAD_CAST "<", 1);
+                NEXT;
+            }
 
            /*
             * Fourth case : a reference. If if has not been resolved,
@@ -6005,7 +5997,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
                                "HPP: entering END_TAG\n");
 #endif
                        break;
-                   } else if (cur == '<') {
+                   } else if ((cur == '<') && IS_ASCII_LETTER(next)) {
                         if ((!terminate) && (next == 0))
                             goto done;
                         ctxt->instate = XML_PARSER_START_TAG;
@@ -6015,6 +6007,12 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
                                 "HPP: entering START_TAG\n");
 #endif
                        break;
+                   } else if (cur == '<') {
+                        if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
+                            (ctxt->sax->characters != NULL))
+                           ctxt->sax->characters(ctxt->userData,
+                                                 BAD_CAST "<", 1);
+                        NEXT;
                    } else {
                        /*
                         * check that the text sequence is complete
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]