[libxml2] Detect change of encoding when parsing HTML names
- From: Daniel Veillard <veillard src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [libxml2] Detect change of encoding when parsing HTML names
- Date: Mon, 23 May 2016 08:06:25 +0000 (UTC)
commit beca86e8c86984b967a6efa05a9653470253edda
Author: Hugh Davenport <hugh davenport net nz>
Date: Wed May 4 11:23:49 2016 +0800
Detect change of encoding when parsing HTML names
From https://bugzilla.gnome.org/show_bug.cgi?id=758518
This happens when a name is being parsed in a file that has no valid
encoding set, so libxml has to guess what the encoding is. This patch
detects when the input buffer location changes and, if it does, restarts
the parsing of the name.
This slightly changes the output of a couple of regression tests.
HTMLparser.c | 8 ++++++++
result/HTML/758605.html | 2 +-
result/HTML/758605.html.err | 2 +-
result/HTML/758605.html.sax | 3 ++-
4 files changed, 12 insertions(+), 3 deletions(-)
---
diff --git a/HTMLparser.c b/HTMLparser.c
index 1c112cc..c6fcbc9 100644
--- a/HTMLparser.c
+++ b/HTMLparser.c
@@ -2492,6 +2492,7 @@ htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
int len = 0, l;
int c;
int count = 0;
+ const xmlChar *base = ctxt->input->base;
/*
* Handler for more complex cases
@@ -2517,6 +2518,13 @@ htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
len += l;
NEXTL(l);
c = CUR_CHAR(l);
+ if (ctxt->input->base != base) {
+ /*
+ * We changed encoding from an unknown encoding
+ * Input buffer changed location, so we better start again
+ */
+ return(htmlParseNameComplex(ctxt));
+ }
}
if (ctxt->input->base > ctxt->input->cur - len)
diff --git a/result/HTML/758605.html b/result/HTML/758605.html
index a085cce..60b01d3 100644
--- a/result/HTML/758605.html
+++ b/result/HTML/758605.html
@@ -1,3 +1,3 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
-<html><body><p>&
+<html><body><p>&ê
</p></body></html>
diff --git a/result/HTML/758605.html.err b/result/HTML/758605.html.err
index 2b82be6..2086f96 100644
--- a/result/HTML/758605.html.err
+++ b/result/HTML/758605.html.err
@@ -1,3 +1,3 @@
-./test/HTML/758605.html:1: HTML parser error : htmlParseEntityRef: no name
+./test/HTML/758605.html:1: HTML parser error : htmlParseEntityRef: expecting ';'
ê
^
diff --git a/result/HTML/758605.html.sax b/result/HTML/758605.html.sax
index 1f5cd32..c6e0986 100644
--- a/result/HTML/758605.html.sax
+++ b/result/HTML/758605.html.sax
@@ -1,10 +1,11 @@
SAX.setDocumentLocator()
SAX.startDocument()
-SAX.error: htmlParseEntityRef: no name
+SAX.error: htmlParseEntityRef: expecting ';'
SAX.startElement(html)
SAX.startElement(body)
SAX.startElement(p)
SAX.characters(&, 1)
+SAX.characters(ê, 2)
SAX.ignorableWhitespace(
, 1)
SAX.endElement(p)
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]