[libxml2] Use buffers when constructing string node lists.



commit 7d553f834e2573c3352dc61a70ce4a7506db3c8c
Author: Conrad Irwin <conrad irwin gmail com>
Date:   Thu May 10 20:17:25 2012 -0700

    Use buffers when constructing string node lists.
    
    Hi Veillard and all,
    
    Firstly, thanks for libxml: it's awesome!
    
    I noticed recently that libxml was taking a surprisingly long time to perform some
    operations (many minutes instead of milliseconds), and so I did some digging. It turns out
    that the problem was caused by the realloc()ing done in xmlNodeAddContentLen() which can
    be called many (many) times when assigning some content into a node.
    
    For background, I'm dealing with XML that contains emails, these can have large
    attachments (~6MB) which are base-64 encoded, line-wrapped at 78 chars, and each line ends
    with &#13;. This means that xmlNodeAddContentLen() is being called about 200,000 times,
    and so there are 200,000 reallocs of a 6MB string, which takes a while... (I put a synthetic
    example of this at https://gist.github.com/2656940)
    
    The attached patch works around that problem by using the existing buffer API to merge the
    strings together before even creating the text node, this keeps the number of realloc()s
    at a managable level.
    
    I'd love feedback on the patch, and am happy to fix problems with it, or explore other
    solutions if you think that this is barking up the wrong tree :).
    
    Thanks,
    
    Conrad
    
    P.S. Should I create a bug for this too?
    
    ------8<------
    
    Before this change xmlStringGetNodeList would perform a realloc() of the
    entire new content for every XML entity in the assigned text in order to
    merge together adjacent text nodes. This had the effect of making
    xmlSetNodeContent O(n^2), which led to unexpectedly bad performance on
    inputs that contained a large number of XML entities.
    
    After this change the memory management is done by the buffer API,
    avoiding the need to continually re-measure and realloc() the string.
    
    For my test data (6MB of 80 character lines, each ending with &#13;)
    this takes the time to xmlSetNodeContent from about 500 seconds to
    around 50ms. I have not profiled smaller cases, though I tried to
    minimize the performance impact of my change by avoiding unnecessary
    string copying.
    
    Signed-off-by: Conrad Irwin <conrad irwin gmail com>

 include/libxml/tree.h |    2 +
 tree.c                |  196 +++++++++++++++++++++++++++++--------------------
 2 files changed, 117 insertions(+), 81 deletions(-)
---
diff --git a/include/libxml/tree.h b/include/libxml/tree.h
index b733589..b522f61 100644
--- a/include/libxml/tree.h
+++ b/include/libxml/tree.h
@@ -694,6 +694,8 @@ XMLPUBFUN void XMLCALL
 		xmlBufferEmpty		(xmlBufferPtr buf);
 XMLPUBFUN const xmlChar* XMLCALL	
 		xmlBufferContent	(const xmlBufferPtr buf);
+XMLPUBFUN xmlChar* XMLCALL
+		xmlBufferDetach         (const xmlBufferPtr buf);
 XMLPUBFUN void XMLCALL		
 		xmlBufferSetAllocationScheme(xmlBufferPtr buf,
 					 xmlBufferAllocationScheme scheme);
diff --git a/tree.c b/tree.c
index 71fbd1d..d8fafc0 100644
--- a/tree.c
+++ b/tree.c
@@ -1261,9 +1261,13 @@ xmlStringLenGetNodeList(xmlDocPtr doc, const xmlChar *value, int len) {
     const xmlChar *cur = value, *end = cur + len;
     const xmlChar *q;
     xmlEntityPtr ent;
+    xmlBufferPtr buffer;
 
     if (value == NULL) return(NULL);
 
+    buffer = xmlBufferCreateSize(0);
+    if (buffer == NULL) return(NULL);
+
     q = cur;
     while ((cur < end) && (*cur != 0)) {
 	if (cur[0] == '&') {
@@ -1274,19 +1278,8 @@ xmlStringLenGetNodeList(xmlDocPtr doc, const xmlChar *value, int len) {
 	     * Save the current text.
 	     */
             if (cur != q) {
-		if ((last != NULL) && (last->type == XML_TEXT_NODE)) {
-		    xmlNodeAddContentLen(last, q, cur - q);
-		} else {
-		    node = xmlNewDocTextLen(doc, q, cur - q);
-		    if (node == NULL) return(ret);
-		    if (last == NULL)
-			last = ret = node;
-		    else {
-			last->next = node;
-			node->prev = last;
-			last = node;
-		    }
-		}
+		if (xmlBufferAdd(buffer, q, cur - q))
+		    goto out;
 	    }
 	    q = cur;
 	    if ((cur + 2 < end) && (cur[1] == '#') && (cur[2] == 'x')) {
@@ -1351,7 +1344,7 @@ xmlStringLenGetNodeList(xmlDocPtr doc, const xmlChar *value, int len) {
 		if ((cur >= end) || (*cur == 0)) {
 		    xmlTreeErr(XML_TREE_UNTERMINATED_ENTITY, (xmlNodePtr) doc,
 		               (const char *) q);
-		    return(ret);
+		    goto out;
 		}
 		if (cur != q) {
 		    /*
@@ -1361,23 +1354,36 @@ xmlStringLenGetNodeList(xmlDocPtr doc, const xmlChar *value, int len) {
 		    ent = xmlGetDocEntity(doc, val);
 		    if ((ent != NULL) &&
 			(ent->etype == XML_INTERNAL_PREDEFINED_ENTITY)) {
-			if (last == NULL) {
-			    node = xmlNewDocText(doc, ent->content);
-			    last = ret = node;
-			} else if (last->type != XML_TEXT_NODE) {
-			    node = xmlNewDocText(doc, ent->content);
-			    last = xmlAddNextSibling(last, node);
-			} else
-			    xmlNodeAddContent(last, ent->content);
+
+			if (xmlBufferCat(buffer, ent->content))
+			    goto out;
 
 		    } else {
 			/*
+			 * Flush buffer so far
+			 */
+			if (buffer->use) {
+			    node = xmlNewDocText(doc, NULL);
+			    if (node == NULL) {
+				if (val != NULL) xmlFree(val);
+				goto out;
+			    }
+			    node->content = xmlBufferDetach(buffer);
+
+			    if (last == NULL) {
+				last = ret = node;
+			    } else {
+				last = xmlAddNextSibling(last, node);
+			    }
+			}
+
+			/*
 			 * Create a new REFERENCE_REF node
 			 */
 			node = xmlNewReference(doc, val);
 			if (node == NULL) {
 			    if (val != NULL) xmlFree(val);
-			    return(ret);
+			    goto out;
 			}
 			else if ((ent != NULL) && (ent->children == NULL)) {
 			    xmlNodePtr temp;
@@ -1409,35 +1415,39 @@ xmlStringLenGetNodeList(xmlDocPtr doc, const xmlChar *value, int len) {
 
 		l = xmlCopyCharMultiByte(buf, charval);
 		buf[l] = 0;
-		node = xmlNewDocText(doc, buf);
-		if (node != NULL) {
-		    if (last == NULL) {
-			last = ret = node;
-		    } else {
-			last = xmlAddNextSibling(last, node);
-		    }
-		}
+
+		if (xmlBufferCat(buffer, buf))
+		    goto out;
 		charval = 0;
 	    }
 	} else
 	    cur++;
     }
-    if ((cur != q) || (ret == NULL)) {
+
+    if (cur != q) {
         /*
 	 * Handle the last piece of text.
 	 */
-	if ((last != NULL) && (last->type == XML_TEXT_NODE)) {
-	    xmlNodeAddContentLen(last, q, cur - q);
+	if (xmlBufferAdd(buffer, q, cur - q))
+	    goto out;
+    }
+
+    if (buffer->use) {
+	node = xmlNewDocText(doc, NULL);
+	if (node == NULL) goto out;
+	node->content = xmlBufferDetach(buffer);
+
+	if (last == NULL) {
+	    last = ret = node;
 	} else {
-	    node = xmlNewDocTextLen(doc, q, cur - q);
-	    if (node == NULL) return(ret);
-	    if (last == NULL) {
-		ret = node;
-	    } else {
-		xmlAddNextSibling(last, node);
-	    }
+	    last = xmlAddNextSibling(last, node);
 	}
+    } else if (ret == NULL) {
+        ret = xmlNewDocText(doc, BAD_CAST "");
     }
+
+out:
+    xmlBufferFree(buffer);
     return(ret);
 }
 
@@ -1458,9 +1468,13 @@ xmlStringGetNodeList(xmlDocPtr doc, const xmlChar *value) {
     const xmlChar *cur = value;
     const xmlChar *q;
     xmlEntityPtr ent;
+    xmlBufferPtr buffer;
 
     if (value == NULL) return(NULL);
 
+    buffer = xmlBufferCreateSize(0);
+    if (buffer == NULL) return(NULL);
+
     q = cur;
     while (*cur != 0) {
 	if (cur[0] == '&') {
@@ -1471,19 +1485,8 @@ xmlStringGetNodeList(xmlDocPtr doc, const xmlChar *value) {
 	     * Save the current text.
 	     */
             if (cur != q) {
-		if ((last != NULL) && (last->type == XML_TEXT_NODE)) {
-		    xmlNodeAddContentLen(last, q, cur - q);
-		} else {
-		    node = xmlNewDocTextLen(doc, q, cur - q);
-		    if (node == NULL) return(ret);
-		    if (last == NULL)
-			last = ret = node;
-		    else {
-			last->next = node;
-			node->prev = last;
-			last = node;
-		    }
-		}
+		if (xmlBufferAdd(buffer, q, cur - q))
+		    goto out;
 	    }
 	    q = cur;
 	    if ((cur[1] == '#') && (cur[2] == 'x')) {
@@ -1536,7 +1539,7 @@ xmlStringGetNodeList(xmlDocPtr doc, const xmlChar *value) {
 		if (*cur == 0) {
 		    xmlTreeErr(XML_TREE_UNTERMINATED_ENTITY,
 		               (xmlNodePtr) doc, (const char *) q);
-		    return(ret);
+		    goto out;
 		}
 		if (cur != q) {
 		    /*
@@ -1546,23 +1549,32 @@ xmlStringGetNodeList(xmlDocPtr doc, const xmlChar *value) {
 		    ent = xmlGetDocEntity(doc, val);
 		    if ((ent != NULL) &&
 			(ent->etype == XML_INTERNAL_PREDEFINED_ENTITY)) {
-			if (last == NULL) {
-			    node = xmlNewDocText(doc, ent->content);
-			    last = ret = node;
-			} else if (last->type != XML_TEXT_NODE) {
-			    node = xmlNewDocText(doc, ent->content);
-			    last = xmlAddNextSibling(last, node);
-			} else
-			    xmlNodeAddContent(last, ent->content);
+
+			if (xmlBufferCat(buffer, ent->content))
+			    goto out;
 
 		    } else {
 			/*
+			 * Flush buffer so far
+			 */
+			if (buffer->use) {
+			    node = xmlNewDocText(doc, NULL);
+			    node->content = xmlBufferDetach(buffer);
+
+			    if (last == NULL) {
+				last = ret = node;
+			    } else {
+				last = xmlAddNextSibling(last, node);
+			    }
+			}
+
+			/*
 			 * Create a new REFERENCE_REF node
 			 */
 			node = xmlNewReference(doc, val);
 			if (node == NULL) {
 			    if (val != NULL) xmlFree(val);
-			    return(ret);
+			    goto out;
 			}
 			else if ((ent != NULL) && (ent->children == NULL)) {
 			    xmlNodePtr temp;
@@ -1593,14 +1605,10 @@ xmlStringGetNodeList(xmlDocPtr doc, const xmlChar *value) {
 
 		len = xmlCopyCharMultiByte(buf, charval);
 		buf[len] = 0;
-		node = xmlNewDocText(doc, buf);
-		if (node != NULL) {
-		    if (last == NULL) {
-			last = ret = node;
-		    } else {
-			last = xmlAddNextSibling(last, node);
-		    }
-		}
+
+		if (xmlBufferCat(buffer, buf))
+		    goto out;
+		charval = 0;
 	    }
 	} else
 	    cur++;
@@ -1609,18 +1617,22 @@ xmlStringGetNodeList(xmlDocPtr doc, const xmlChar *value) {
         /*
 	 * Handle the last piece of text.
 	 */
-	if ((last != NULL) && (last->type == XML_TEXT_NODE)) {
-	    xmlNodeAddContentLen(last, q, cur - q);
+	xmlBufferAdd(buffer, q, cur - q);
+    }
+
+    if (buffer->use) {
+	node = xmlNewDocText(doc, NULL);
+	node->content = xmlBufferDetach(buffer);
+
+	if (last == NULL) {
+	    last = ret = node;
 	} else {
-	    node = xmlNewDocTextLen(doc, q, cur - q);
-	    if (node == NULL) return(ret);
-	    if (last == NULL) {
-		last = ret = node;
-	    } else {
-		last = xmlAddNextSibling(last, node);
-	    }
+	    last = xmlAddNextSibling(last, node);
 	}
     }
+
+out:
+    xmlBufferFree(buffer);
     return(ret);
 }
 
@@ -6916,6 +6928,28 @@ xmlBufferCreateSize(size_t size) {
 }
 
 /**
+ * xmlBufferDetach
+ * @buf:  the buffer
+ *
+ * Returns the previous string contained by the buffer.
+ *
+ */
+xmlChar *
+xmlBufferDetach(xmlBufferPtr buf) {
+    xmlChar *ret;
+
+    if (buf == NULL) return(NULL);
+
+    ret = buf->content;
+    buf->content = NULL;
+    buf->size = 0;
+    buf->use = 0;
+
+    return ret;
+}
+
+
+/**
  * xmlBufferCreateStatic:
  * @mem: the memory area
  * @size:  the size in byte



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]