[gnumeric] html: improve space handling.

From: Morten Welinder <mortenw src gnome org>
To: commits-list gnome org
Cc:
Subject: [gnumeric] html: improve space handling.
Date: Sun, 25 Sep 2022 22:58:15 +0000 (UTC)
commit 577646d0e3168825b47bd69a92e5a0c50f694455
Author: Morten Welinder <terra gnome org>
Date:   Sun Sep 25 18:58:01 2022 -0400

    html: improve space handling.

 NEWS                     |  4 ++++
 plugins/html/ChangeLog   |  5 +++++
 plugins/html/html_read.c | 53 ++++++++++++++++++++++++++++++++++++++----------
 3 files changed, 51 insertions(+), 11 deletions(-)
---
diff --git a/NEWS b/NEWS
index 33419a9a3..3d211e184 100644
--- a/NEWS
+++ b/NEWS
@@ -1,5 +1,9 @@
 Gnumeric 1.12.54
 
+John Denker:
+       * Improve whitespace handling in html import.  [#671]
+
+
 --------------------------------------------------------------------------
 Gnumeric 1.12.53
 
diff --git a/plugins/html/ChangeLog b/plugins/html/ChangeLog
index 74cfbbb48..17282ed51 100644
--- a/plugins/html/ChangeLog
+++ b/plugins/html/ChangeLog
@@ -1,3 +1,8 @@
+2022-09-25  Morten Welinder  <terra gnome org>
+
+       * html_read.c: Improve whitespace handling.  Based on patch from
+       John Denker.  [#671]
+
 2022-09-17  Morten Welinder <terra gnome org>
 
        * Release 1.12.53
diff --git a/plugins/html/html_read.c b/plugins/html/html_read.c
index 50daf18c7..d78a3eb8a 100644
--- a/plugins/html/html_read.c
+++ b/plugins/html/html_read.c
@@ -81,27 +81,58 @@ html_get_sheet (char const *name, Workbook *wb)
        return sheet;
 }
 
+
+/* Appends text to buf, dropping leading whitespace when buf is empty */
+/* or already ends in whitespace.  Thereafter, including at the end, */
+/* each run of whitespace is collapsed to one whitespace character */
+/* (copied as-is, not converted to ' ').  May not suit <pre>...</pre>. */
+/* It's up to the caller to deal with the possible final trailing space. */
 static void
-html_append_text (GString *buf, const xmlChar *text)
+html_append_trim_text (GString *buf, const xmlChar *text)
 {
        const xmlChar *p;
+       const xmlChar *last_sp;
 
        while (*text) {
-               while (g_unichar_isspace (g_utf8_get_char (text)))
-                       text = g_utf8_next_char (text);
+               // collect a run of spaces, if any
+               for (last_sp = p = text;
+                    *p && g_unichar_isspace (g_utf8_get_char (p));
+                    p = g_utf8_next_char (p)) {
+                       last_sp = p;
+               }
+               if (buf->len == 0 ||
+                   g_unichar_isspace (g_utf8_get_char (g_utf8_prev_char (buf->str + buf->len)))) {
+                       text = p;             /* skip all the spaces */
+               } else {
+                       text = last_sp;       /* keep the last space */
+               }
                if (*text) {
-                       for (p = text;
+                       // collect a run of non-spaces, if any
+                       for (/* keep p */;
                             *p && !g_unichar_isspace (g_utf8_get_char (p));
-                            p =  g_utf8_next_char (p))
-                               ;
-                       if (buf->len > 0)
-                               g_string_append_c (buf, ' ');
+                            p =  g_utf8_next_char (p)) {
+                       }
+                       // here p points to either a space or EoS
+                       if (*p) p = g_utf8_next_char (p);
+                       // copy the non-spaces and one trailing space if any
                        g_string_append_len (buf, text, p - text);
-                       text = p;
                }
+               text = p;
        }
 }
 
+/* Remove one trailing whitespace character (per g_unichar_isspace), if any. */
+static void
+html_rtrim (GString *buf)
+{
+       if (buf->len == 0)
+               return;
+
+       gchar* last = g_utf8_prev_char (buf->str + buf->len);
+       if (g_unichar_isspace (g_utf8_get_char (last)))
+               g_string_truncate(buf, last - buf->str);
+}
+
 static void
 html_read_content (htmlNodePtr cur, GString *buf, GnmStyle *mstyle,
                   xmlBufferPtr a_buf, GSList **hrefs, gboolean first,
@@ -112,7 +143,7 @@ html_read_content (htmlNodePtr cur, GString *buf, GnmStyle *mstyle,
        for (ptr = cur->children; ptr != NULL ; ptr = ptr->next) {
                if (ptr->type == XML_TEXT_NODE) {
                        if (g_utf8_validate (ptr->content, -1, NULL))
-                               html_append_text (buf, ptr->content);
+                               html_append_trim_text (buf, ptr->content);
                        else
                                g_string_append (buf, _("[Warning: Invalid text string has been removed.]"));
                } else if (ptr->type == XML_ELEMENT_NODE) {
@@ -218,7 +249,7 @@ html_read_row (htmlNodePtr cur, htmlDocPtr doc, GnmHtmlTableCtxt *tc)
 
                        html_read_content (ptr, buf, mstyle, a_buf,
                                           &hrefs, TRUE, doc, tc);
-
+                       html_rtrim(buf);
 
                        if (g_slist_length (hrefs) >= 1 &&
                            buf->len > 0) {
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]