[gcompris/gcomprixogoo] Improved parsing of wiktio2xml
- From: Bruno Coudoin <bcoudoin src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [gcompris/gcomprixogoo] Improved parsing of wiktio2xml
- Date: Thu, 23 Sep 2010 23:46:37 +0000 (UTC)
commit e8c104c06b9bdb3d8c43cabd7b38e637cf43653e
Author: Bruno Coudoin <bruno coudoin free fr>
Date: Fri Sep 24 01:46:12 2010 +0200
Improved parsing of wiktio2xml
tools/wiktio2xml/wiktio.py | 27 +++++++----
tools/wiktio2xml/wiktio2xml.py | 100 +++++++++++++++++++++++++++-------------
2 files changed, 85 insertions(+), 42 deletions(-)
---
diff --git a/tools/wiktio2xml/wiktio.py b/tools/wiktio2xml/wiktio.py
index 374850c..15cdb4d 100644
--- a/tools/wiktio2xml/wiktio.py
+++ b/tools/wiktio2xml/wiktio.py
@@ -10,6 +10,8 @@ class Definition:
self.text = ""
self.type = ""
self.subType = ""
+ self.filtered = False
+ self.gender = ""
def addText(self, text):
self.text += text
@@ -20,15 +22,21 @@ class Definition:
def setSubType(self, subType):
self.subType = subType
+ def setGender(self, gender):
+ self.gender = gender
+
def dump2html(self):
- print "<definition type='" + self.type \
- + "' subType='" + self.subType + "'>" \
- + self.text + "</definition>"
+ if self.filtered:
+ return
+ print "<h3>" + self.type + \
+ " " + self.subType + \
+ " " + self.gender + "</h3>"
+ print self.text
class Word:
- def __init__ (self):
- self.name = None
+ def __init__ (self, name = None):
+ self.name = name
self.definition = []
self.synonym = []
self.antonym = []
@@ -65,10 +73,11 @@ class Word:
def dump2htmlItem(self, title, liste):
if len(liste):
print "<h2>" + title + "</h2>"
- print "<ul>"
for s in liste:
- print "<li>" + s + "</li>"
- print "</ul>"
+ if s.find(":") >= 0:
+ print "<br></br>" + s
+ else:
+ print s
def dump2htmlPrononciation(self, title, liste):
prefix = "http://commons.wikimedia.org/wiki/File:"
@@ -104,7 +113,7 @@ class Wiktio:
return self.words
def sort(self):
- self.words.sort(key=lambda word: word.name)
+ self.words.sort(key=lambda word: word.name.lower())
def dumpHtmlHeader(self):
print """
diff --git a/tools/wiktio2xml/wiktio2xml.py b/tools/wiktio2xml/wiktio2xml.py
index 5dca4c6..cc76270 100755
--- a/tools/wiktio2xml/wiktio2xml.py
+++ b/tools/wiktio2xml/wiktio2xml.py
@@ -37,18 +37,23 @@ class WikiHandler(ContentHandler):
self.isTextElement = True
self.textContent = ""
+ self.genders = {
+ "{{m}}": u"masculin",
+ "{{f}}": u"féminin",
+ "{{mf}}": u"masculin et féminin"
+ }
+
self.wordTypes = {
- "{{-nom-.*}}": "noun",
- "{{-nom-pr.*}}": "proper noun",
- "{{-verb.*}}": "proper noun",
- "{-pronom-.*}": "pronoun",
- "{-verb-.*}}": "verb",
- "{-adj-.*}}": "adjective",
- "{-adv-.*}}": "adverb",
- "{-art-.*}}": "article",
- "{-conj-.*}}": "conjunction",
- "{-prèp-.*}}": "preposition",
- "{-post-.*}}": "postposition"
+ u"{{-nom-.*}}": u"nom",
+ u"{{-nom-pr.*}}": u"nom propre",
+ u"{{-verb.*}}": u"verbe",
+ u"{-pronom-.*}": u"pronom",
+ u"{-adj-.*}}": u"adjectif",
+ u"{-adv-.*}}": u"adverbe",
+ u"{-art-.*}}": u"article",
+ u"{-conj-.*}}": u"conjunction",
+ u"{-prèp-.*}}": u"préposition",
+ u"{-post-.*}}": u"postposition"
}
self.wordSubTypes = {
@@ -59,13 +64,19 @@ class WikiHandler(ContentHandler):
# Some content in wiktionary is obvioulsy not appropriate
# for children. This contains a list of regexp.
- self.filterContent = [ r"de sade",
- r"pénis",
- r"homosexuel",
- r"vagin"]
-
- self.filterDefinitionType = [ r"{{vulg[^}]+}}",
- r"{{injur[^}]+}}" ]
+ self.filterContent = [ ur"de sade",
+ ur"pénis",
+ ur"homosexuel",
+ ur"vagin"]
+
+ self.filterDefinitionType = [ ur"{{vulg[^}]*}}",
+ ur"{{injur[^}]*}}",
+ ur"{{dés[^}]*}}",
+ ur"{{vx[^}]*}}",
+ ur"{{métonymie[^}]*}}",
+ ur"{{familier[^}]*}}",
+ ur"coït",
+ ur"argot"]
def endElement(self, name):
@@ -146,6 +157,8 @@ class WikiHandler(ContentHandler):
else:
return result + "<li>" + text + "</li>"
+ # Replaces '''xx''' and ''xx'' from the given text
+ # with openXml xx closeXml
def quote2xml(self, quote, openXml, closeXml, text):
index = 0
while index >= 0:
@@ -168,18 +181,16 @@ class WikiHandler(ContentHandler):
text = re.sub(r"{{[-\)\(]}}", "", text)
text = re.sub(r"\[\[\w+:\w+\]\]", "", text)
- text = re.sub(r"{{\(\|(.*)}}", r"\1", text)
+ text = re.sub(r"{{\(\|(.*)}}", r"", text)
if text == "":
return self.indents2xml(text, asText)
text = self.indents2xml(text, asText)
- text = re.sub(r"{{par ext[^}]+}}", r"(Par extension)", text)
- text = re.sub(r"{{litt[^}]+}}", r"(Littéraire)", text)
- text = re.sub(r"{{figuré[^}]+}}", r"(Figuré)", text)
- text = re.sub(r"{{dés[^}]+}}", r"(Désuet)", text)
- text = re.sub(r"{{vx[^}]+}}", r"(Vieilli)", text)
- text = re.sub(r"{{w\|([^}]+)}}", r"<i>\1</i>", text)
- text = re.sub(r"{{source\|([^}]+)}}", r"- (\1)", text)
+ text = re.sub(ur"{{par ext[^}]*}}", ur"(Par extension)", text)
+ text = re.sub(ur"{{litt[^}]*}}", ur"(Littéraire)", text)
+ text = re.sub(ur"{{figuré[^}]*}}", ur"(Figuré)", text)
+ text = re.sub(ur"{{w\|([^}]+)}}", ur"<i>\1</i>", text)
+ text = re.sub(ur"{{source\|([^}]+)}}", ur"- (\1)", text)
# Remove all unrecognized wiki tags
text = re.sub(r"{{[^}]+}}", "", text)
@@ -213,6 +224,8 @@ class WikiHandler(ContentHandler):
inPron = False
wordType = ""
wordSubType = ""
+ filterIndent = ""
+ gender = ""
# Append an end of text marker, it makes my life easier
self.textContent += "\n{{-EndOfTest-}}"
@@ -227,10 +240,28 @@ class WikiHandler(ContentHandler):
for filter in self.filterContent:
if re.search(filter, l, re.I):
- return None
+ if inDefinition:
+ inDefinition.filtered = True
+
+ if filterIndent != "":
+ # We are filtering, check this line is
+ # at a lower indentation level
+ result = re.search(r"^[ ]*[*#:;]+[ ]*", l)
+ if result:
+ if len(result.group(0).rstrip()) > len(filterIndent):
+ next = True
+ else:
+ filterIndent = ""
+ else:
+ filterIndent = ""
+
for filter in self.filterDefinitionType:
if re.search(filter, l, re.I):
+ result = re.search(r"^[ ]*[*#:;]+[ ]*", l)
+ if result:
+ # Keep the indent level for which we filter
+ filterIndent = result.group(0).rstrip()
next = True
if next:
@@ -252,6 +283,10 @@ class WikiHandler(ContentHandler):
if re.search(wt, l):
wordType = self.wordTypes[wt]
+ for wt in self.genders.keys():
+ if re.search(wt, l):
+ gender = self.genders[wt]
+
for wt in self.wordSubTypes.keys():
if re.search(wt, l):
wordSubType = self.wordSubTypes[wt]
@@ -285,12 +320,9 @@ class WikiHandler(ContentHandler):
if inPron:
if not re.search(r"{{-.*-.*}}", l):
- if l.find(".ogg") != -1:
- # Search the .ogg file
- file = l.split("=")
- if len(file) >= 2:
- file = file[1].replace("}}", "")
- inWord.addPrononciation(file)
+ file = re.subn(r".*audio=([^|}]+).*", r"\1", l)
+ if file[1] == 1:
+ inWord.addPrononciation(file[0])
else:
inPron = False
@@ -299,8 +331,10 @@ class WikiHandler(ContentHandler):
inDefinition = wiktio.Definition()
inDefinition.setType(wordType)
inDefinition.setSubType(wordSubType)
+ inDefinition.setGender(gender)
wordType = ""
wordSubType = ""
+ gender = ""
elif l == "{{-anagr-}}":
inAnagram = True
elif l == "{{-syn-}}":
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]