gimp-help-2 r2548 - in branches/xml2po-support: . tools
- From: ulfehlert svn gnome org
- To: svn-commits-list gnome org
- Subject: gimp-help-2 r2548 - in branches/xml2po-support: . tools
- Date: Wed, 3 Sep 2008 19:52:49 +0000 (UTC)
Author: ulfehlert
Date: Wed Sep 3 19:52:49 2008
New Revision: 2548
URL: http://svn.gnome.org/viewvc/gimp-help-2?rev=2548&view=rev
Log:
2008-09-03 Ulf-D. Ehlert <ulfehlert svn gnome org>
* tools/split-xml-multi-lang.py: added (first version of) a script to
split multi-language XML files into single-language XML files of the
same structure, which should then be used with xml2po's "reuse" option
Added:
branches/xml2po-support/tools/split-xml-multi-lang.py (contents, props changed)
Modified:
branches/xml2po-support/ChangeLog
Added: branches/xml2po-support/tools/split-xml-multi-lang.py
==============================================================================
--- (empty file)
+++ branches/xml2po-support/tools/split-xml-multi-lang.py Wed Sep 3 19:52:49 2008
@@ -0,0 +1,477 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+ Convert multi-lang XML files
+
+ Copyright (C) 2008 The GIMP Documentation Team
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+"""
+
+
+import sys
+import os
+import os.path
+import codecs
+#import getopt
+import optparse
+import re
+import xml.dom.minidom
+import logging
+
# Configure the logging package (runs once, at import time) and create the
# script-wide logger.
logging.basicConfig(level=logging.INFO, format="%(levelname)-8s %(message)s")
Logger = logging.getLogger("splitxml")

# Tags considered NOT final: elements with one of these names are cloned
# without their children and then processed recursively.
sections = ('sect1', 'sect2', 'sect3', 'sect4', 'section')
sectinfos = ('sect2info', 'sect1info', 'sect3info', 'sect4info')
notes = ('warning', 'caution', 'important', 'tip', 'note')
containers = ('figure', 'caption', 'revhistory', 'formalpara')
objects = ('textobject', 'imageobject', 'mediaobject', 'screenshot')
lists = (
    'itemizedlist', 'orderedlist', 'variablelist',
    'segmentedlist', 'simplelist', 'calloutlist',
)
items = ('varlistentry', 'listitem')

# Tags considered final: elements with one of these names are cloned
# together with their whole subtree.
paras = ('para', 'simpara')
leafs = ('phrase', 'revision', 'indexterm')

# Flat lookup tables built from the groups above.
non_final_nodes = (
    sections + sectinfos + notes + containers + objects + lists + items
)
final_nodes = paras + leafs
+
+
+################################################################
+# XML node #
+################################################################
+
+# TODO: try to implement this class, this should make
+# the code more readable (and logical), e.g.
+# some_node.is_whatever()
+# instead of
+# self.whatever(some_node)
+
class XmlNode(object):
    """Thin wrapper around an ``xml.dom.minidom`` node.

    The wrapper exposes node predicates as methods of the node itself
    (``some_node.is_text()``) instead of helper methods that take the node
    as an argument.
    """

    # Node types and node names that ignore() reports as ignorable.
    _IGNORED_TYPES = (xml.dom.minidom.Node.COMMENT_NODE,)
    _IGNORED_NAMES = ('xi:include',)

    def __init__(self, node):
        """Wrap *node*.

        Raises TypeError if *node* is not a minidom ``Node``.  An explicit
        exception is used instead of ``assert`` so the check is still
        performed when Python runs with ``-O``.
        """
        if not isinstance(node, xml.dom.minidom.Node):
            raise TypeError("expected an xml.dom.minidom.Node, got %r"
                            % (node,))
        self.logger = logging.getLogger("splitxml.node")
        self._node = node

    def get_children(self):
        """Return the child nodes, each wrapped in an XmlNode.

        The result is an empty list when the node has no children.
        """
        return [XmlNode(child) for child in self._node.childNodes]

    def type(self):
        """Get the node type"""
        return self._node.nodeType

    def is_text(self):
        """Whether or not a node is a text node."""
        return self.type() == xml.dom.minidom.Node.TEXT_NODE

    def ignore(self):
        """Whether or not a node is to be ignored.

        This method is used to skip comments and 'xi:include' nodes.
        """
        return (self._node.nodeType in self._IGNORED_TYPES
                or self._node.nodeName in self._IGNORED_NAMES)
+
+
+################################################################
+# Multi-language XML document #
+################################################################
+
class MultiLangDoc(object):
    """Multi-language XML document

    This class provides methods to read/parse a multi-lang XML source
    file, to split the document into single-language documents, and to
    print these documents as single-language XML files.

    Typical use: construct with the source file name, call process() with
    the list of languages, then call printfiles() with the output directory.
    """

    def __init__(self, filename, destdir=None):
        """Parse the multi-language XML file *filename*.

        *destdir* is only stored; printfiles() takes the output directory
        as its own argument and overwrites the stored value.
        """
        self.filename = filename
        self.destdir = destdir

        self.logger = logging.getLogger("splitxml.doc")
        self.logger.info("parsing %s", filename)

        self.doc = xml.dom.minidom.parse(filename)
        self.dest = {}          # language -> destination Document
        self.languages = []     # filled in by process()
        self.seqnum = 0         # counter used to mark nodes as "seen"

    def printfiles(self, destdir):
        """Print resulting documents to the respective output files

        If *destdir* contains a '*', every '*' is replaced by the language
        code; otherwise the files go to ``destdir/<lang>``.  Missing
        directories are created.  The output file gets the base name of
        the input file.  Each destination document is unlinked after it
        has been written.

        Raises ValueError if *destdir* is empty or None.
        """
        if not destdir:
            raise ValueError("no output directory specified")

        langdir_template = '*'
        destdir = destdir.rstrip('/')
        if langdir_template not in destdir:
            if destdir:
                destdir = os.path.join(destdir, langdir_template)
            else:
                # destdir consisted of slashes only
                destdir = langdir_template
        self.destdir = destdir
        filename = os.path.basename(self.filename)

        for lang in self.languages:
            langdir = self.destdir.replace(langdir_template, lang)
            if not os.path.isdir(langdir):
                os.makedirs(langdir, 0o755)
            destfile = os.path.join(langdir, filename)
            output = codecs.open(destfile, 'w', "UTF-8")
            try:
                self.logger.info("writing %s", destfile)
                self.dest[lang].writexml(output, encoding="UTF-8")
            finally:
                # close the file even if serialization fails
                output.close()
            self.dest[lang].unlink()

    def process(self, languages):
        """Split a multi-language XML document

        This method creates XML document (root) nodes for every language,
        and constructs single language documents while processing the
        document recursively starting with the document element.

        *languages* is a sequence of language codes; it is copied (the
        caller's object is not modified) and 'en' is put in front if it
        is missing.  Returns the dict of (lang, root element clone) pairs
        produced by split().
        """
        self.logger.debug("process(%s)", self.doc.documentElement.nodeName)
        self.languages = list(languages)
        if 'en' not in self.languages:
            self.languages.insert(0, "en")

        impl = xml.dom.minidom.getDOMImplementation()
        for lang in self.languages:
            self.dest[lang] = impl.createDocument(None, None, None)
            # one doctype node per document: the node is appended to the
            # document right away, so it cannot be shared
            dtd = impl.createDocumentType(
                self.doc.documentElement.nodeName,
                "-//OASIS//DTD DocBook XML V4.3//EN",
                "http://www.docbook.org/xml/4.3/docbookx.dtd")
            self.dest[lang].encoding = "UTF-8"
            self.dest[lang].appendChild(dtd)

        for child in self.doc.childNodes:
            if child.nodeType == xml.dom.minidom.Node.DOCUMENT_TYPE_NODE:
                # replaced by the doctype created above
                continue
            if child.nodeType != xml.dom.minidom.Node.ELEMENT_NODE:
                # other top-level nodes are copied to every language
                for lang in self.languages:
                    self.dest[lang].appendChild(child.cloneNode(False))
            else:
                # FIXME: what if root element has no 'lang' attribute # "en"?
                source = self.vectorize(child)
                clones = self.append_clones(source, self.dest, False)
                # returning here means nodes after the document element
                # are not copied
                return self.split(child, source, clones)

        # Never reached, since "parse(filename)" caught it...
        raise RuntimeError("Oops!? No document element found!?")

    def split(self, elem, source, dest):
        """Split a multi-language XML element

        This method does the real work when processing the document tree.
        For every child of *elem*:

        (1) ignorable nodes (see ignore()) are dropped;
        (2) non-empty text nodes are copied to every destination node;
        (3) other non-element nodes and elements without 'en' are skipped;
        (4) remaining elements are vectorized (see vectorize()) and their
            per-language nodes are cloned into the destination nodes --
            with the whole subtree for final nodes, otherwise without
            children, followed by a recursive split().

        *source* is the dict of (lang, node) pairs *elem* was vectorized
        to; it is only checked for being non-empty.  *dest* is the dict of
        (lang, destination parent) pairs, which is also the return value.
        """
        self.logger.debug("split(%s)", elem.nodeName)
        assert source and dest
        self.seqnum += 1

        for child in elem.childNodes:

            # (1) skip this node if we don't need it (e.g. comments)
            if self.ignore(child):
                self.logger.debug("ignoring %s %s",
                                  child.nodeType, child.nodeName)

            # (2) append non-empty text nodes to the destination nodes
            elif self.text(child):
                if child.nodeValue.strip():
                    # we should never be here, the (parent) node
                    # should be final then...
                    self.logger.warning("TEXT in %s", elem.nodeName)
                    for lang in self.languages:
                        # Append a clone per language: appending the
                        # source node itself would move it from parent to
                        # parent (only the last language would keep it)
                        # and change elem.childNodes while it is being
                        # iterated.
                        dest[lang].appendChild(child.cloneNode(False))

            # (3) skip every non-English element
            elif self.skip(child):
                pass

            # (4) at last, handle non-trivial cases...
            else:
                assert child.nodeType == child.ELEMENT_NODE \
                    and 'en' in self.get_langs(child)

                # for every language, find the respective node
                copies = self.vectorize(child)      # no clones

                # (4a) append recursively (localized) clones of nodes we
                #      don't need/want to process any further (para,
                #      phrase, etc.)
                if self.final(child):
                    self.logger.debug("adding cloned final %s",
                                      child.nodeName)
                    self.append_clones(copies, dest, True)
                # (4b) append non-recursively (localized) clones of nodes
                #      and process child recursively (sect[1-4], note, etc.)
                else:
                    self.logger.debug("adding cloned %s", child.nodeName)
                    clones = self.append_clones(copies, dest, False)
                    self.split(child, copies, clones)

        return dest

    def vectorize(self, elem):
        """Make a set of corresponding nodes from an element node

        This method gets an element with no 'lang' attribute or a 'lang'
        attribute containing "en" (English language) and returns a dict of
        corresponding nodes for all languages (translations). If there is
        no translation for some language, the original input node (i.e.
        'en') will be returned for that language.

        Every node that gets selected is marked with a "seqnum" attribute
        so that it will not be selected again.
        NOTE(review): the marker is set on the source nodes before they
        are cloned, so it presumably shows up in the output documents as
        well -- confirm whether this is intended.
        """
        self.logger.debug("vectorize(%s)", elem.nodeName)

        # mark element as "seen"
        elem.setAttribute("seqnum", str(self.seqnum))

        # handle element's "lang" attribute
        nodes = dict((lang, elem) for lang in self.get_langs(elem))
        assert 'en' in nodes

        wanted = len(self.languages)
        if len(nodes) == wanted:
            return nodes

        # Algorithm:
        # (1) create set of *all* sibling elements of the same type/name
        # (1a) filter out elements which have already been used (with
        #      "seqnum" attribute) -- done in ".get_siblings()"
        # (2) select the first matching element for every language
        # (3) select input element for every missing language

        for sibling in self.get_siblings(elem):
            for lang in self.get_langs(sibling):
                if lang not in nodes:
                    nodes[lang] = sibling
                    sibling.setAttribute("seqnum", str(self.seqnum))
            if len(nodes) == wanted:
                # every language has a node, no need to look any further
                break

        for lang in self.languages:
            if lang not in nodes:
                nodes[lang] = elem
        assert len(nodes) == wanted

        return nodes

    def get_siblings(self, element):
        """Get a list of all previous and following siblings
        of the same type as element

        "Same type" means same node type and same node name.  The element
        itself and elements with a "seqnum" attribute are not included.
        The list is in document order.
        """
        siblings = []

        # rewind to the first sibling, then walk forward
        this = element
        while this.previousSibling:
            this = this.previousSibling

        while this:
            # TODO: add test for 'lang' attribute here(!?)
            if this.nodeType == element.nodeType \
                    and this.nodeName == element.nodeName \
                    and not this.isSameNode(element) \
                    and not this.hasAttribute("seqnum"):
                siblings.append(this)
            this = this.nextSibling
        return siblings

    def append_clones(self, element, parent, recursive):
        """Clone elements and append them to parent nodes

        *element* and *parent* are dicts keyed by language; every key of
        *element* must also be a key of *parent*.  *recursive* is passed
        to cloneNode(), i.e. it selects whether subtrees are cloned too.

        Returns a dict of (lang,clone) pairs for a specified
        dict of (lang,element) pairs.
        """
        clones = {}
        for lang in element:
            clone = element[lang].cloneNode(recursive)
            parent[lang].appendChild(clone)
            clones[lang] = clone
        return clones

    def final(self, node):
        """Whether or not a node is a final node.

        Final nodes will be cloned recursively and added to the
        destination tree (more precisely: the localized nodes
        will be cloned).

        Non-final (localized) nodes will be cloned non-recursively
        and also added to the destination tree, then they will be
        processed recursively until a final node is reached.

        'title' and 'term' elements are final if and only if they have a
        non-empty text child.  Elements with an unknown name are assumed
        to be final; a warning is logged for them.
        """
        name = node.nodeName

        # Simple(?) cases
        if name in non_final_nodes:
            return False
        if name in final_nodes:
            return True

        # Special cases
        # XXX: Hmm, what would happen if we used this test
        #      for *every* node?
        if name in ('title', 'term'):
            return self.has_nonempty_text(node)

        self.logger.warning(
            "don't know what to do with '%s', assuming final" % name)
        return True

    def ignore(self, node):
        """Whether or not a node is to be ignored.

        This method is used to skip comments and 'xi:include' nodes.
        """
        return node.nodeType in (node.COMMENT_NODE,) \
            or node.nodeName in ('xi:include',)

    def text(self, node):
        """Whether or not a node is a text node."""
        return node.nodeType == xml.dom.Node.TEXT_NODE

    def has_nonempty_text(self, node):
        """Whether or not a node has a non-empty text node.

        Only direct children of the element *node* are examined; text
        consisting of whitespace only counts as empty.
        """
        assert node.nodeType == xml.dom.Node.ELEMENT_NODE

        for child in node.childNodes:
            if child.nodeType == xml.dom.Node.TEXT_NODE \
                    and child.nodeValue.strip():
                return True
        return False

    def skip(self, node):
        """Whether or not a node is to be skipped.

        This method is used to filter out non-English element nodes;
        nodes which are not elements are always skipped.
        """
        if node.nodeType != node.ELEMENT_NODE:
            return True
        return 'en' not in self.get_langs(node)

    def get_langs(self, elem, all=False):
        """Get a list of languages specified by the "lang"
        attribute for a given element

        The attribute value is a semicolon-separated list (e.g.
        "en;de;fr").  Unless *all* is true, languages which are not in
        self.languages are removed from the result.

        Returns all languages (self.languages itself, not a copy) if the
        element does not have a "lang" attribute.
        """
        try:
            lang_attr = elem.attributes.get("lang")
        except AttributeError:
            # nodes without an attribute map (their "attributes" is None)
            lang_attr = None

        if not lang_attr:
            return self.languages

        # this is an Attr(Node) instance,
        # its value is a string (e.g. "en;de;fr")
        langs = lang_attr.value.strip(';').split(';')
        if all:
            return langs
        return [k for k in langs if k in self.languages]
+
+
+################################################################
+# main program #
+################################################################
+
def main():
    """Read command line and then take off

    FILE and DIR may be given either as options (-f/-d) or as the first
    and second positional argument; positional arguments take precedence.
    The script exits with a usage error if more than two positional
    arguments are given or if the input file or the output directory is
    missing.
    """
    logger = logging.getLogger("splitxml")

    languages = ('de', 'es', 'fr', 'it', 'no', 'ru')
    default_languages = ",".join(languages)

    # parse command line

    usage = "usage: %prog [options] [FILE [DIR]]"
    version = "%prog 0.1"
    cmdline = optparse.OptionParser(usage=usage, version=version)

    cmdline.set_defaults(languages=default_languages)
    cmdline.add_option("--debug", dest="debug",
        action="store_true", default=False,
        help="produce some more or less useful debugging messages")
    cmdline.add_option("-l", "--lang", dest="languages", metavar="LANG",
        help="comma-separated list of languages, "
             "'en' will be added automatically; "
             "defaults to '" + default_languages + "'")
    cmdline.add_option("-f", "--file", dest="filename", metavar="FILE",
        help="input file (required), a multi-lang XML file (FILE may also "
             "be specified as the first command-line argument)")
    cmdline.add_option("-d", "--dest", dest="destdir", metavar="DIR",
        help="output directory (required); if DIR contains a single '*', "
             "this will be replaced by the respective language, otherwise "
             "subdirs DIR/xx, DIR/yy, etc. will be used for output (DIR "
             "may also be specified as the second command-line argument)")

    (options, args) = cmdline.parse_args()

    # Zero, one (FILE) or two (FILE DIR) positional arguments are valid;
    # only a third one is an error.
    if len(args) > 2:
        cmdline.error("too many arguments")
    if args:
        options.filename = args[0]
    if len(args) == 2:
        options.destdir = args[1]

    if not options.filename:
        cmdline.error("no input file specified")
    if not options.destdir:
        cmdline.error("no output directory specified")

    if options.debug:
        logger.setLevel(logging.DEBUG)

    # Build the language list: 'en' always comes first; empty entries
    # (e.g. from a trailing comma) and duplicates are dropped.
    selected = ['en']
    for lang in re.split('[, ]+', options.languages):
        if lang and lang not in selected:
            selected.append(lang)
    options.languages = selected

    doc = MultiLangDoc(options.filename)
    doc.process(options.languages)
    doc.printfiles(options.destdir)


# Main program start
if __name__ == '__main__':
    main()
# pydoc doesn't like the following "raise" statement
#else:
#    raise NotImplementedError
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]