[gimp-help/wip/wormnest/python3-migration: 6/15] tools: complete port of xml2po to python3

From: Jacob Boerema <jboerema src gnome org>
To: commits-list gnome org
Cc:
Subject: [gimp-help/wip/wormnest/python3-migration: 6/15] tools: complete port of xml2po to python3
Date: Thu, 3 Jun 2021 15:22:28 +0000 (UTC)

commit 460ea38d5ca9569f60d0b13ab40e9b58c467f0d7
Author: Jacob Boerema <jgboerema gmail com>
Date:   Mon Mar 29 17:51:48 2021 -0400

    tools: complete port of xml2po to python3
    
    Also:
    - Let it work correctly on Windows by setting encoding for
      stdout and stderr to utf-8.
    - Fix setting correct timezone in po files.
    - Better error handling.
    - Detect errors in tags in the translated xml text.

 tools/xml2po.py                |  19 ++++-
 tools/xml2po/__init__.py       | 189 +++++++++++++++++++++++++++++++----------
 tools/xml2po/modes/basic.py    |   2 +-
 tools/xml2po/modes/gimphelp.py |   2 +-
 4 files changed, 164 insertions(+), 48 deletions(-)
---
diff --git a/tools/xml2po.py b/tools/xml2po.py
index b3e110dc3..ef8d725da 100755
--- a/tools/xml2po.py
+++ b/tools/xml2po.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python2
+#!/usr/bin/env python3
 # -*- encoding: utf-8 -*-
 # Copyright (c) 2004, 2005, 2006 Danilo Šegan <danilo gnome org>.
 # Copyright (c) 2009 Claude Paroz <claude 2xlibre net>.
@@ -21,7 +21,7 @@
 #
 
 # xml2po -- translate XML documents
-VERSION = "0.18.0 (patched by GIMP Documentation Team)"
+VERSION = "0.19.0 (patched by GIMP Documentation Team)"
 
 # Versioning system (I use this for a long time, so lets explain it to
 # those Linux-versioning-scheme addicts):
@@ -37,11 +37,13 @@ import os
 import getopt
 import tempfile
 
+DEBUG_VERBOSITY = 0
+
 NULL_STRING = '/dev/null'
 if not os.path.exists('/dev/null'): NULL_STRING = 'NUL'
 
 def usage (with_help = False):
-    print("Usage:  %s [OPTIONS] [XMLFILE]..." % (sys.argv[0]), file=sys.stderr)
+    print(f"Usage: {sys.argv[0]} [OPTIONS] [XMLFILE]...", file=sys.stderr)
     if with_help:
         print("""
 OPTIONS may be some of:
@@ -86,6 +88,11 @@ def main(argv):
 
     from xml2po import Main
 
+
+    # Make sure stdout and stderr output utf-8 even on Windows where it's not the default
+    sys.stdout = open(sys.stdout.fileno(), 'w', encoding='utf-8', closefd=False)
+    sys.stderr = open(sys.stderr.fileno(), 'w', encoding='utf-8', closefd=False)
+
     # Default parameters
     default_mode = 'docbook'
     operation = 'pot' # 'pot', 'merge', 'update'
@@ -136,6 +143,8 @@ def main(argv):
             operation = 'merge'
             if 'translationlanguage' not in options:
                 options['translationlanguage'] = os.path.split(os.path.splitext(pofile)[0])[1]
+            if DEBUG_VERBOSITY > 0:
+                print(f"Converting {pofile} to {mofile_tmppath} using msgfmt")
             os.system("msgfmt -o %s %s >%s" % (mofile_tmppath, pofile, NULL_STRING)) and sys.exit(7)
             mofile = mofile_tmppath
         elif opt in ('-o', '--output'):
@@ -170,6 +179,10 @@ def main(argv):
             print("Error: You must specify MO file when merging translations.", file=sys.stderr)
             sys.exit(3)
 
+        if DEBUG_VERBOSITY > 0:
+            print(f"Merge mo file {mofile} with {filenames[0]}")
+        if pofile:
+            xml2po_main.pofile = pofile
         xml2po_main.merge(mofile, filenames[0])
 
     elif operation == 'update':
diff --git a/tools/xml2po/__init__.py b/tools/xml2po/__init__.py
index 7a07eb226..42fa81f39 100644
--- a/tools/xml2po/__init__.py
+++ b/tools/xml2po/__init__.py
@@ -26,6 +26,7 @@ import tempfile
 import gettext
 import libxml2
 
+DEBUG_VERBOSITY = 0
 NULL_STRING = '/dev/null'
 if not os.path.exists('/dev/null'): NULL_STRING = 'NUL'
 
@@ -86,14 +87,14 @@ class MessageOutput:
                 self.messages.append(t)
                 if spacepreserve:
                     self.nowrap[t] = True
-                if t in list(self.linenos.keys()):
+                if t in self.linenos.keys():
                     self.linenos[t].append((self.filename, tag, lineno))
                 else:
                     self.linenos[t] = [ (self.filename, tag, lineno) ]
                 if (not self.do_translations) and comment and not t in self.comments:
                     self.comments[t] = comment
             else:
-                if t in list(self.linenos.keys()):
+                if t in self.linenos.keys():
                     self.linenos[t].append((self.filename, tag, lineno))
                 else:
                     self.linenos[t] = [ (self.filename, tag, lineno) ]
@@ -101,7 +102,11 @@ class MessageOutput:
                     self.comments[t] = comment
 
     def outputHeader(self, out):
-        import time
+        from datetime import datetime
+        # Using time.strftime was not working correctly for me: instead of a
+        # timezone offset a timezone name was added. This fixes it.
+        dt = datetime.now()
+        tz = dt.astimezone().tzinfo
         out.write("""msgid ""
 msgstr ""
 "Project-Id-Version: PACKAGE VERSION\\n"
@@ -113,7 +118,7 @@ msgstr ""
 "Content-Type: text/plain; charset=UTF-8\\n"
 "Content-Transfer-Encoding: 8bit\\n"
 
-""" % (time.strftime("%Y-%m-%d %H:%M%z")))
+""" % (dt.astimezone(tz).strftime("%Y-%m-%d %H:%M%z")))
 
     def outputAll(self, out):
         self.outputHeader(out)
@@ -138,6 +143,7 @@ msgstr ""
 
 class XMLDocument(object):
     def __init__(self, filename, app):
+        self.filename = filename
         self.app = app
         self.expand_entities = self.app.options.get('expand_entities')
         self.ignored_tags = self.app.current_mode.getIgnoredTags()
@@ -145,7 +151,13 @@ class XMLDocument(object):
         ctxt.lineNumbers(1)
         if self.app.options.get('expand_all_entities'):
             ctxt.replaceEntities(1)
-        ctxt.parseDocument()
+
+        try:
+            ctxt.parseDocument()
+        except Exception as e:
+            print("Error parsing XML file '%s': %s" % (filename, str(e)), file=sys.stderr)
+            sys.exit(1)
+
         self.doc = ctxt.doc()
         if self.doc.name != filename:
             raise Exception("Error: I tried to open '%s' but got '%s' -- how did that happen?" % (filename, self.doc.name))
@@ -166,22 +178,23 @@ class XMLDocument(object):
         elif node.isText():
             if node.isBlankNode():
                 if self.app.options.get('expand_entities') or \
-                  (not (node.prev and not node.prev.isBlankNode() and node.__next__ and not node.next.isBlankNode()) ):
-                    #print >>sys.stderr, "BLANK"
+                  (not (node.prev and not node.prev.isBlankNode() and node.next and not node.next.isBlankNode()) ):
                     node.setContent('')
             else:
-                node.setContent(re.sub('\s+',' ', node.content))
+                node.setContent(re.sub(r'\s+',' ', node.content))
 
         elif node.children and node.type == 'element':
             child = node.children
             while child:
+                nextchild = child.next
                 self.normalizeNode(child)
-                child = child.__next__
+                child = nextchild
 
     def normalizeString(self, text, spacepreserve = False):
         """Normalizes string to be used as key for gettext lookup.
 
         Removes all unnecessary whitespace."""
+        mytext = text
         if spacepreserve:
             return text
         try:
@@ -203,13 +216,20 @@ class XMLDocument(object):
             print("""Error while normalizing string as XML:\n"%s"\n""" % (text), file=sys.stderr)
             return text
 
+        # Not sure if saving the doc here is really necessary. It was one of the
+        # things done in debugging and don't want to spend time now to check if
+        # we can remove it.
+        save_doc = self.doc
+        self.doc = ctxt.doc()
         self.normalizeNode(newnode)
+        self.doc = save_doc
 
         result = ''
         child = newnode.children
         while child:
+            nextchild = child.next
             result += child.serialize('utf-8')
-            child = child.__next__
+            child = nextchild
 
         result = re.sub('^ ','', result)
         result = re.sub(' $','', result)
@@ -235,15 +255,16 @@ class XMLDocument(object):
         ctxt.parseDocument()
         tree = ctxt.doc()
         if next:
-            newnode = tree.children.__next__
+            newnode = tree.children.next
         else:
             newnode = tree.children
 
         result = ''
         child = newnode.children
         while child:
+            nextchild = child.next
             result += child.serialize('utf-8')
-            child = child.__next__
+            child = nextchild
         tree.freeDoc()
         return result
 
@@ -252,6 +273,7 @@ class XMLDocument(object):
         result = ''
         if node.children:
             child = node.children
+            nextchild = child.next
             while child:
                 if child.type=='text':
                     result += self.doc.encodeEntitiesReentrant(child.content)
@@ -262,7 +284,7 @@ class XMLDocument(object):
                         result += child.content.decode('utf-8')
                 else:
                     result += self.myAttributeSerialize(child)
-                child = child.__next__
+                child = nextchild
         else:
             result = node.serialize('utf-8')
         return result
@@ -308,11 +330,81 @@ class XMLDocument(object):
             return None
 
     def replaceAttributeContentsWithText(self, node, text):
-        node.setContent(text)
+        try:
+            node.setContent(text.decode('utf-8'))
+        except TypeError:
+            sys.stderr.write("--> replaceAttributeContentsWithText: Failed to decode text to utf-8.")
+            sys.exit(1)
+
+    def CheckMatchedTags(self, text):
+        stack = []
+        textblock = text
+
+        log=sys.stdout
+
+        # It might be even better to do the below with regex, see e.g.
+        # https://datadependence.com/2016/03/find-unclosed-tags-using-stacks/
+        # However I'm not sure it really matters that much since the text
+        # blocks usually are fairly small and most don't have a lot of tags.
+        start_tag = textblock.find('<')
+        while start_tag > -1:
+            textblock = textblock[start_tag+1:]
+            end_tag = textblock.find('>')
+            if end_tag > -1:
+                # Found left and right brackets: grab tag
+                tag = textblock[: end_tag]
+                # Check that it's not a tag that closes itself and comment tags starting with <!
+                if textblock[end_tag-1] != '/' and textblock[0] != '!':
+                    # Tag can have multiple elements inside, watch for first space
+                    space = tag.find(' ')
+                    if space > -1:
+                        tag = tag[: space]
+
+                    open_tag = (len(tag) > 0 and tag[0] != '/')
+                    if open_tag:
+                        # Add tag to stack
+                        stack.append(tag)
+                    else:
+                        tag = tag[1:]
+                        if len(stack) == 0:
+                            pass
+                        else:
+                            if stack[-1] == tag:
+                                # Close the block
+                                stack.pop()
+                            else:
+                                print(f"\n========================", file=log)
+                                print(f"Source xml: {self.filename}", file=log)
+                                print(f"Source po : {self.app.pofile}", file=log)
+                                print(f"Translated msgstr:\n{text}\n", file=log)
+                                print(f"WARNING: Found closing tag [{tag}], however we expected [{stack[0]}].", file=log)
+                                print(f"Remaining tags: {str(stack)}", file=log)
+                                if tag in stack:
+                                    stack.remove(tag)
+                                    print("  Assuming incorrect tag order, found and removed tag from the stack", file=log)
+                                print(f"========================\n", file=log)
+                textblock = textblock[end_tag+1:]
+                start_tag = textblock.find('<')
+            else:
+                start_tag = -1
+
+
+        if len(stack):
+            print(f"\n========================", file=log)
+            print(f"Source xml: {self.filename}", file=log)
+            print(f"Source po : {self.app.pofile}", file=log)
+            print(f"ERROR: Found unmatched tags in po msgstr:\n{text}\n", file=log)
+            print(f"Tags not matched: {str(stack)}", file=log)
+            print(f"========================\n", file=log)
+            return False
+        return True
 
     def replaceNodeContentsWithText(self, node, text):
         """Replaces all subnodes of a node with contents of text treated as XML."""
 
+        if not self.CheckMatchedTags(text):
+            return
+
         if node.children:
             starttag = self.startTagForNode(node)
             endtag = self.endTagForNode(node)
@@ -326,7 +418,7 @@ class XMLDocument(object):
                 pass
 
             content = '<%s>%s</%s>' % (starttag, text, endtag)
-            tmp = tmp + content.encode('utf-8')
+            tmp = tmp + content
 
             newnode = None
             try:
@@ -338,7 +430,9 @@ class XMLDocument(object):
                 pass
 
             if not newnode:
-                print("""Error while parsing translation as XML:\n"%s"\n""" % (text.encode('utf-8')), file=sys.stderr)
+                print(f"\n--> Error parsing translation as XML:\n{text}")
+                # See: https://gitlab.gnome.org/GNOME/libxml2/-/issues/64
+                print("--> Note: this might be caused by a bug in libxml2.\n")
                 return
 
             newelem = newnode.getRootElement()
@@ -346,15 +440,14 @@ class XMLDocument(object):
             if newelem and newelem.children:
                 free = node.children
                 while free:
-                    next = free.__next__
+                    nextchild = free.next
                     free.unlinkNode()
-                    free = next
+                    free = nextchild
 
                 if node:
-                    copy = newelem.copyNodeList()
-                    next = node.__next__
+                    nextnode = node.next
                     node.replaceNode(newelem.copyNodeList())
-                    node.next = next
+                    node.__next__ = nextnode
 
             else:
                 # In practice, this happens with tags such as "<para>    </para>" (only whitespace in between)
@@ -374,10 +467,11 @@ class XMLDocument(object):
             return True
         child = node.children
         while child:
+            nextchild = child.next
             if child.isText() and child.content.strip() != '':
                 return True
             else:
-                child = child.__next__
+                child = nextchild
         return False
 
 
@@ -432,6 +526,10 @@ class XMLDocument(object):
 
         child = node.children
         while child:
+            # Although I do not know why, child or child.next gets changed inside the if part below.
+            # This makes child.next fail when it shouldn't. That's why we store nextchild here
+            # before going into the if and use that at the end of the loop
+            nextchild = child.next
             if (self.isFinalNode(child)) or (child.type == 'element' and self.worthOutputting(child)):
                 myrepl.append(self.processElementTag(child, myrepl, True))
                 outtxt += '<placeholder-%d/>' % (len(myrepl))
@@ -441,20 +539,20 @@ class XMLDocument(object):
                     outtxt += '<%s>%s</%s>' % (starttag, content, endtag)
                 else:
                     outtxt += self.doSerialize(child)
-            child = child.__next__
+            child = nextchild
 
         if self.app.operation == 'merge':
             norm_outtxt = self.normalizeString(outtxt, self.app.isSpacePreserveNode(node))
             translation = self.app.getTranslation(norm_outtxt)
         else:
-            translation = outtxt.decode('utf-8')
+            translation = outtxt
 
         starttag = self.startTagForNode(node)
         endtag = self.endTagForNode(node)
 
         worth = self.worthOutputting(node)
         if not translation:
-            translation = outtxt.decode('utf-8')
+            translation = outtxt
             if worth and self.app.options.get('mark_untranslated'):
                 node.setLang('C')
 
@@ -463,7 +561,7 @@ class XMLDocument(object):
                 # repl[0] may contain translated attributes with
                 # non-ASCII chars, so implicit conversion to <str> may fail
                 replacement = '<%s>%s</%s>' % \
-                              (repl[0].decode('utf-8'), repl[3], repl[2])
+                              (repl[0], repl[3], repl[2])
                 translation = translation.replace('<placeholder-%d/>' % (i+1), replacement)
 
             if worth:
@@ -479,7 +577,7 @@ class XMLDocument(object):
     def isExternalGeneralParsedEntity(self, node):
         try:
             # it would be nice if debugDumpNode could use StringIO, but it apparently cannot
-            tmp = tempfile.TemporaryFile()
+            tmp = tempfile.TemporaryFile(encoding='utf-8')
             node.debugDumpNode(tmp,0)
             tmp.seek(0)
             tmpstr = tmp.read()
@@ -507,25 +605,31 @@ class XMLDocument(object):
             if self.isExternalGeneralParsedEntity(node):
                 return node.serialize('utf-8')
             else:
-                return self.stringForEntity(node) #content #content #serialize("utf-8")
-        elif node.type == 'entity_decl':
+                return self.stringForEntity(node)
+        elif node.type == 'entity_decl --> serialize':
             return node.serialize('utf-8') #'<%s>%s</%s>' % (startTagForNode(node), node.content, node.name)
         elif node.type == 'text':
-            return node.serialize('utf-8')
+            nodetext = node.serialize('utf-8')
+            return nodetext
         elif node.type == 'element':
             repl = []
             (starttag, content, endtag, translation) = self.processElementTag(node, repl, True)
-            return '<%s>%s</%s>' % (starttag, content, endtag)
+            return '<%s>%s</%s>' % (starttag, content.encode('utf-8'), endtag)
         else:
             child = node.children
             outtxt = ''
             while child:
+                # Not sure if the same problem with using next.child happens here too
+                # but we will use nextchild here too just to be sure
+                nextchild = child.next
                 outtxt += self.doSerialize(child)
-                child = child.__next__
+                child = nextchild
             return outtxt
 
-def xml_error_handler(arg, ctxt):
+def xml_error_handler(ctxt, error):
     #deactivate error messages from the validation
+    if DEBUG_VERBOSITY > 0:
+        print(f"--> xml_error_handler: {error}")
     pass
 
 class Main(object):
@@ -538,11 +642,11 @@ class Main(object):
         self.current_mode = self.load_mode(mode)()
         # Prepare output
         if operation == 'update':
-            self.out = tempfile.TemporaryFile()
+            self.out = tempfile.TemporaryFile(encoding='utf-8')
         elif output == '-':
             self.out = sys.stdout
         else:
-            self.out = file(output, 'w')
+            self.out = open(output, 'w', encoding='utf-8', buffering=1)
 
     def load_mode(self, modename):
         try:
@@ -565,7 +669,7 @@ class Main(object):
             try:
                 doc = XMLDocument(xmlfile, self)
             except Exception as e:
-                print("Unable to parse XML file '%s': %s" % (xmlfile, str(e)), file=sys.stderr)
+                print("Error parsing XML file '%s': %s" % (xmlfile, str(e)), file=sys.stderr)
                 sys.exit(1)
             self.current_mode.preProcessXml(doc.doc, self.msg)
             doc.generate_messages()
@@ -578,13 +682,13 @@ class Main(object):
         try:
             doc = XMLDocument(xmlfile, self)
         except Exception as e:
-            print(str(e), file=sys.stderr)
+            print("Error parsing XML file '%s': %s" % (xmlfile, str(e)), file=sys.stderr)
             sys.exit(1)
-
         try:
             mfile = open(mofile, "rb")
         except:
-            print("Can't open MO file '%s'." % (mofile), file=sys.stderr)
+            print("Error opening MO file '%s': %s." % (mofile, str(e)), file=sys.stderr)
+            sys.exit(1)
         self.gt = gettext.GNUTranslations(mfile)
         self.gt.add_fallback(NoneTranslations())
         # Has preProcessXml use cases for merge?
@@ -607,7 +711,7 @@ class Main(object):
         try:
             doc = XMLDocument(xmlfile, self)
         except Exception as e:
-            print(str(e), file=sys.stderr)
+            print("Error parsing XML file '%s': %s" % (xmlfile, str(e)), file=sys.stderr)
             sys.exit(1)
         doc.generate_messages()
 
@@ -615,7 +719,7 @@ class Main(object):
         try:
             doc = XMLDocument(origxml, self)
         except Exception as e:
-            print(str(e), file=sys.stderr)
+            print("Error parsing XML file '%s': %s" % (origxml, str(e)), file=sys.stderr)
             sys.exit(1)
         doc.generate_messages()
         self.output_po()
@@ -646,11 +750,10 @@ class Main(object):
 
         text should be a string to look for.
         """
-        #print >>sys.stderr,"getTranslation('%s')" % (text.encode('utf-8'))
         if not text or text.strip() == '':
             return text
         if self.gt:
-            res = self.gt.ugettext(text.decode('utf-8'))
+            res = self.gt.gettext(text)
             return res
 
         return text
diff --git a/tools/xml2po/modes/basic.py b/tools/xml2po/modes/basic.py
index d9b318848..e2ef7a473 100644
--- a/tools/xml2po/modes/basic.py
+++ b/tools/xml2po/modes/basic.py
@@ -42,7 +42,7 @@ class basicXmlMode:
             while child and final_children:
                 if not child.isBlankNode() and child.type != 'comment' and not self.isFinalNode(child):
                     final_children = False
-                child = child.__next__
+                child = child.next
             if final_children:
                 return True
         return False
diff --git a/tools/xml2po/modes/gimphelp.py b/tools/xml2po/modes/gimphelp.py
index 17bd5a926..eff767c62 100644
--- a/tools/xml2po/modes/gimphelp.py
+++ b/tools/xml2po/modes/gimphelp.py
@@ -81,7 +81,7 @@ class gimphelpXmlMode(docbookXmlMode):
             child = node.children
             while child:
                 self._output_images(child,msg)
-                child = child.__next__
+                child = child.next
 
     def preProcessXml(self, doc, msg):
         """Add additional messages of interest here."""
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]