gimp-help-2 r2576 - in branches/xml2po-support: . tools
- From: ulfehlert svn gnome org
- To: svn-commits-list gnome org
- Subject: gimp-help-2 r2576 - in branches/xml2po-support: . tools
- Date: Mon, 6 Oct 2008 19:25:17 +0000 (UTC)
Author: ulfehlert
Date: Mon Oct 6 19:25:17 2008
New Revision: 2576
URL: http://svn.gnome.org/viewvc/gimp-help-2?rev=2576&view=rev
Log:
2008-10-06 Ulf-D. Ehlert <ulfehlert svn gnome org>
* tools/split_xml_multi_lang.txt: renamed to...
* tools/split_xml_multi_lang.test: ... and slightly changed
* tools/split_xml_multi_lang.py: changed to handle some
special cases
Added:
branches/xml2po-support/tools/split_xml_multi_lang.test (contents, props changed)
- copied, changed from r2574, /branches/xml2po-support/tools/split_xml_multi_lang.txt
Removed:
branches/xml2po-support/tools/split_xml_multi_lang.txt
Modified:
branches/xml2po-support/ChangeLog
branches/xml2po-support/tools/split_xml_multi_lang.py
Modified: branches/xml2po-support/tools/split_xml_multi_lang.py
==============================================================================
--- branches/xml2po-support/tools/split_xml_multi_lang.py (original)
+++ branches/xml2po-support/tools/split_xml_multi_lang.py Mon Oct 6 19:25:17 2008
@@ -43,6 +43,9 @@
RECURSIVE = True
NONRECURSIVE = False
+# Exceptions
+class BreakOutOfNestedLoops(Exception): pass
+
# these tags are considered NOT FINAL
sections = ('sect1', 'sect2', 'sect3', 'sect4', 'section', 'bibliodiv',
'book', 'part', 'chapter', 'preface', 'legalnotice')
@@ -77,6 +80,7 @@
final_nodes = paras + leafs + fobjects + fgui + keys + misc
+# >>>>>>>>>>>>>>>> not used >>>>>>>>>>>>>>>>
################################################################
# XML node #
################################################################
@@ -121,6 +125,7 @@
"""
return self._node.nodeType in (xml.dom.minidom.Node.COMMENT_NODE,) \
or self._node.nodeName in ('xi:include',)
+# <<<<<<<<<<<<<<<< not used <<<<<<<<<<<<<<<<
################################################################
@@ -134,14 +139,11 @@
file, to split the document into single-language documents, and to
print these documents as single-language XML files.
"""
- def __init__(self, filename, destdir = None):
+ def __init__(self, filename):
"""Multi-language XML document"""
self.filename = filename
- self.destdir = destdir
-
- self.dest = {}
- self.seqnum = 0
+ self.dest = {} # destination documents
self.logger = logging.getLogger("splitxml")
self.logger.info("Parsing %s" % filename)
@@ -163,11 +165,11 @@
destdir = os.path.join(destdir, langdir_template)
else:
destdir = langdir_template
- self.destdir = destdir
+ destdir_template = destdir
filename = os.path.basename(self.filename)
for lang in self.languages:
- destdir = self.destdir.replace(langdir_template, lang)
+ destdir = destdir_template.replace(langdir_template, lang)
if not os.path.isdir(destdir):
os.makedirs(destdir, 0755)
destfile = os.path.join(destdir, filename)
@@ -181,10 +183,10 @@
"""Split a multi-language XML document
This method creates XML document (root) nodes for every language,
- and constructs single language documents while processing the
- document recursively starting with the document element,
+ then constructs single-language documents while processing the
+ document recursively, starting with the document element.
"""
- self.logger.debug("process(%s)" % str(self.doc.documentElement.nodeName))
+ self.logger.debug("process(%s)" % self.doc.documentElement.nodeName)
self.languages = languages
if 'en' not in languages:
self.languages.insert(0, "en")
@@ -207,6 +209,7 @@
clone = child.cloneNode(NONRECURSIVE)
self.dest[lang].appendChild(clone)
else:
+ # This is the document element (aka root element)
try:
self.doc_languages = self.get_langs(child)
if not 'en' in self.doc_languages:
@@ -217,32 +220,43 @@
sys.exit(74)
# Now we know that the document element has a valid "lang"
# attribute, and so has every element (via parent nodes).
- self.logger.debug("languages = %s" % languages)
- source = self.vectorize(child)
- clones = self.append_clones(source, self.dest, NONRECURSIVE)
- return self.split(child, source, clones)
+ self.seqnum = 0
+ child.setAttribute("seqnum", str(self.seqnum))
+ clones = self.append_clones(child, self.dest, NONRECURSIVE)
+ return self.split(child, clones)
# Never reached, since "parse(filename)" catched it...
raise RuntimeError("Oops!? No document element found!?")
- def split(self, elem, source, dest):
+ def split(self, elem, dest):
"""Split a multi-language XML element
- This method does the real work when processing the
- document tree.
- TODO: describe the algorithm(?)
- """
- self.logger.debug("split(%s)" % (elem.nodeName))
- assert elem.isSameNode(source['en'])
+ This is the main routine for traversing the document tree. For
+ every child element (with lang="en") of the specified "elem" node,
+ this function searches the corresponding nodes containing
+ translations of that child. Then it appends the nodes to the
+ respective nodes of the destination vector and, if necessary,
+ traverses the child element calling itself recursively.
+
+ The "elem" argument may also be a set of different element nodes
+ for every language; in that case searching will be applied to
+ different sets of child nodes for every language.
+ """
+ if isinstance(elem, dict):
+ parentnodes = elem
+ elem = parentnodes['en']
+ else:
+ parentnodes = None
- # keep track of the visited nodes
- self.seqnum += 1
+ # dest is a cloned vector of elem:
+ assert elem.nodeName == dest['en'].nodeName
+
+ self.logger.debug("split(%s)" % (elem.nodeName))
for child in elem.childNodes:
# (1) skip this node if we don't need it (e.g. comments)
if self.ignore(child):
- #self.logger.debug("ignoring %s %s" % (child.nodeType, child.nodeName))
pass
# (2) append non-empty text nodes to the destination nodes
@@ -261,7 +275,6 @@
# (3) skip every non-English element
elif self.skip(child):
- #self.logger.debug("skipping %s %s" % (child.nodeType, child.nodeName))
pass
# (4) at last, handle non-trivial cases...
@@ -269,26 +282,29 @@
assert child.nodeType == child.ELEMENT_NODE \
and 'en' in self.get_langs(child)
- # for every language, find the respective node
- copies = self.vectorize(child, source) # no clones
+ # Find the corresponding node for every language. Here
+ # it makes a difference if we use a set of (different)
+ # parent nodes for every language or just one single
+ # element.
+ copies = self.vectorize(child, parentnodes)
- # (4a) append recursively (localized) clones of nodes we don't
+ # (4a) append recursively localized clones of nodes we don't
# need/want to process any further (para, phrase, etc.)
if self.final(child):
- self.logger.debug("split(%s): adding cloned final %s" % \
+ self.logger.debug("split(%s) --> adding cloned final %s" % \
(elem.nodeName, child.nodeName))
clones = self.append_clones(copies, dest, RECURSIVE)
- # (4b) append non-recursively (localized) clones of nodes and
- # process child recursively (sect[1-4], note, etc.)
+ # (4b) append non-recursively localized clones of nodes and
+ # then traverse child nodes recursively (sect[1-4], note, etc.)
else:
- self.logger.debug("split(%s): adding cloned %s" % \
+ self.logger.debug("split(%s) --> cloning %s" % \
(elem.nodeName, child.nodeName))
clones = self.append_clones(copies, dest, NONRECURSIVE)
- self.split(child, copies, clones)
+ self.split(copies, clones)
return dest
- def vectorize(self, elem, source=None):
+ def vectorize(self, elem, parents=None):
"""Make a set of corresponding nodes from an element node
This method gets an element with no 'lang' attribute or a 'lang'
@@ -296,44 +312,100 @@
corresponding nodes for all languages (translations). If there is
no translation for some language, the original input node (i.e.
'en') will be returned.
+
+ If every element of the resulting vector is the same node (i.e.
+ the same as "elem"), the result will be reduced to this element.
"""
- self.logger.debug("vectorize(%s)" % elem.nodeName)
+ if isinstance(parents, dict):
+ self.logger.debug("vectorize(%s in %s)" % \
+ (elem.nodeName, parents['en'].nodeName))
+ else:
+ self.logger.debug("vectorize(%s)" % elem.nodeName)
- # mark element as "seen"
- elem.setAttribute("seqnum", str(self.seqnum))
+ assert not isinstance(parents, dict) or elem.parentNode == parents['en']
- # handle element's "lang" attribute
- nodes = dict([(lang, elem) for lang in self.get_langs(elem)])
- assert nodes.has_key('en')
+ # Mark element as "seen"
+ self.seqnum += 1
+ elem.setAttribute("seqnum", str(self.seqnum))
- if len(nodes) == len(self.languages):
- return nodes
+ # Trivial case: the element contains all languages
+ if len(self.get_langs(elem)) == len(self.languages):
+ return elem
+
+ # Typical (and simple) case: we are working on the original source
+ # tree, the nodes of the resulting vector will have the same
+ # parent node.
+ if not parents:
+
+ # copy element for every element language
+ nodes = dict([(lang, elem) for lang in self.get_langs(elem)])
+ assert nodes.has_key('en')
+ assert len(nodes) != len(self.languages)
+
+ # TODO: describe algorithm
+
+ siblings = self.get_siblings(elem)
+ found = 0
+
+ for sibl in siblings:
+ langs = self.get_langs(sibl)
+ new_langs = [k for k in langs if k not in nodes]
+ if not self.final(elem):
+ if len(langs) > len(new_langs):
+ break
+ elif not new_langs:
+ continue
+ sibl.setAttribute("seqnum", str(self.seqnum))
+ for lang in new_langs:
+ nodes[lang] = sibl
+ found += 1
+ if len(nodes) == len(self.languages):
+ return nodes
+
+ for lang in (k for k in self.languages if not nodes.has_key(k)):
+ nodes[lang] = elem
+ assert len(nodes) == len(self.languages)
+
+ if found:
+ # nodes[x] != elem for one or more x in self.languages
+ return nodes
+ else:
+ # nodes[x] == elem for every x in self.languages
+ return elem
- # Algorithm:
- # (1) create set of *all* sibling elements of the same type/name
- # (1a) filter out elements which has already been used (with
- # "seqnum" attribute) -- done in ".get_siblings()"
- # (2) select the first matching element for every language
- # (3) select input element for every missing language
+ # Special case: for every language (assuming there is a translation
+ # for this language, of course) we are working in a separate subtree
+ # of the original source tree, and the nodes of the resulting
+ # vector may have different parent nodes.
+ else:
+ assert elem.parentNode == parents['en']
+ nodes = dict([(lang, elem) for lang in parents
+ if parents[lang] == parents["en"]])
+ for lang in parents:
+ if lang in nodes: continue
+ try:
+ for child in parents[lang].childNodes:
+ if child.nodeType != child.ELEMENT_NODE or \
+ child.nodeName != elem.nodeName or \
+ child.getAttribute("seqnum"): continue
+ languages = self.get_langs(child)
+ for lang in languages:
+ if lang in nodes:
+ raise BreakOutOfNestedLoops
+ for lang in languages:
+ nodes[lang] = child
+ child.setAttribute("seqnum", str(self.seqnum))
+ break
+ except BreakOutOfNestedLoops:
+ self.logger.warn("possibly incorrect %s in %s" % \
+ (elem.nodeName, parents[lang].nodeName))
+
+ for lang in (k for k in self.languages if not k in nodes):
+ nodes[lang] = elem
+ assert len(nodes) == len(self.languages)
- siblings = self.get_siblings(elem)
- try:
- for sibling in siblings:
- sibling_languages = self.get_langs(sibling)
- for lang in sibling_languages:
- if not nodes.has_key(lang):
- nodes[lang] = sibling
- sibling.setAttribute("seqnum", str(self.seqnum))
- if len(nodes) == len(self.languages):
- raise StopIteration # TODO: user-defined exception(s)
- except StopIteration:
- pass
-
- for lang in (k for k in self.languages if not nodes.has_key(k)):
- nodes[lang] = elem
- assert len(nodes) == len(self.languages)
+ return nodes
- return nodes
def get_siblings(self, element):
"""Get a list of all previous and following siblings
@@ -357,16 +429,34 @@
return siblings
def append_clones(self, element, parent, recursive):
- """Clone elements and append them to parent nodes
+ """Clone element(s) and append them to parent nodes
+
+ Element is either a single element (which means it's the
+ same element for every language) or a language-indexed
+ vector of elements.
+
+ The method returns a language-indexed node vector.
+ """
+ if isinstance(element, dict):
+ assert len(element) == len(self.languages)
+ clones = dict([(key, element[key].cloneNode(recursive))
+ for key in element])
+ else:
+ clones = dict([(key, element.cloneNode(recursive))
+ for key in self.languages])
+
+ if not self.logger.isEnabledFor(logging.DEBUG):
+ if clones['en'].hasAttribute("seqnum"):
+ for lang in clones:
+ clones[lang].removeAttribute("seqnum")
+ else:
+ # Should never happen...
+ self.logger.warn("%s without seqnum" % clones['en'].nodeName)
+ for lang in clones:
+ if clones[lang].hasAttribute("lang"):
+ clones[lang].removeAttribute("lang")
- Returns a dict of (lang,clone) pairs for a specified
- dict of (lang,element) pairs.
- """
- clones = dict([(key, element[key].cloneNode(recursive))
- for key in element])
for lang in clones:
- if not self.logger.isEnabledFor(logging.DEBUG):
- clones[lang].removeAttribute("seqnum")
parent[lang].appendChild(clones[lang])
return clones
@@ -450,7 +540,8 @@
elem = elem.parentNode
langs = elem.getAttribute("lang")
langs = langs.strip(';').split(';')
- return [lang for lang in langs if lang in self.languages]
+ # use "set(langs)" since "langs" may contain identical entries:
+ return [lang for lang in set(langs) if lang in self.languages]
################################################################
@@ -467,7 +558,7 @@
# parse command line
usage = "usage: %prog [options] [FILE [DIR]]"
- version = "%prog 0.4 2008-10-01"
+ version = "%prog 0.5 2008-10-06"
cmdline = optparse.OptionParser(usage=usage, version=version)
cmdline.set_defaults(languages= ",".join(languages))
@@ -522,7 +613,7 @@
testrunner = unittest.TextTestRunner()
suite = unittest.TestSuite()
suite.addTest(
- doctest.DocFileSuite("split_xml_multi_lang.txt",
+ doctest.DocFileSuite("split_xml_multi_lang.test",
optionflags = doctest.NORMALIZE_WHITESPACE |
doctest.ELLIPSIS |
doctest.REPORT_NDIFF)
Copied: branches/xml2po-support/tools/split_xml_multi_lang.test (from r2574, /branches/xml2po-support/tools/split_xml_multi_lang.txt)
==============================================================================
--- /branches/xml2po-support/tools/split_xml_multi_lang.txt (original)
+++ branches/xml2po-support/tools/split_xml_multi_lang.test Mon Oct 6 19:25:17 2008
@@ -34,13 +34,11 @@
Now we can create the multilangdoc object:
>>> from split_xml_multi_lang import MultiLangDoc
->>> mld = MultiLangDoc(testxmlfile, destdir)
+>>> mld = MultiLangDoc(testxmlfile)
>>> mld
<split_xml_multi_lang.MultiLangDoc object at 0x...>
>>> mld.filename == testxmlfile
True
->>> mld.destdir == destdir
-True
Processing
@@ -61,3 +59,10 @@
>>> mld.process([])
{u'en': <DOM Element: sect1 at -0x...>}
+
+The destination directory (actually a template) will be specified when
+printing the resulting files for every language:
+
+>>> mld.printfiles(destdir)
+FIXME
+
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]