bigboard r7311 - trunk/bigboard/stocks/mail
- From: walters svn gnome org
- To: svn-commits-list gnome org
- Subject: bigboard r7311 - trunk/bigboard/stocks/mail
- Date: Thu, 24 Apr 2008 00:12:49 +0100 (BST)
Author: walters
Date: Wed Apr 23 23:12:49 2008
New Revision: 7311
URL: http://svn.gnome.org/viewvc/bigboard?rev=7311&view=rev
Log:
Use BeautifulSoup to parse GMail HTML
Modified:
trunk/bigboard/stocks/mail/MailStock.py
Modified: trunk/bigboard/stocks/mail/MailStock.py
==============================================================================
--- trunk/bigboard/stocks/mail/MailStock.py (original)
+++ trunk/bigboard/stocks/mail/MailStock.py Wed Apr 23 23:12:49 2008
@@ -1,7 +1,6 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
import logging, re, time, urllib2
-import htmlentitydefs
import gobject, gtk
import hippo
@@ -25,65 +24,31 @@
def replace_chr(m):
return unichr(int(m.group(1), 16))
UNICHR_REPLACE = re.compile(r"\\u([A-F-a-f0-9]{4})")
-
-# http://effbot.org/zone/re-sub.htm#unescape-html
-##
-# Removes HTML or XML character references and entities from a text string.
-#
-# @param text The HTML (or XML) source text.
-# @return The plain text, as a Unicode string, if necessary.
-def unescape_html_entities(text):
- xml_entities = ["quot", "amp", "apos", "lt", "gt"]
- def fixup(m):
- text = m.group(0)
- if text[:2] == "&#":
- # character reference
- try:
- if text[:3] == "&#x":
- return unichr(int(text[3:-1], 16))
- else:
- return unichr(int(text[2:-1]))
- except ValueError:
- pass
- else:
- # named entity
- entityname = text[1:-1]
- # Don't unescape valid XML entities
- if entityname in xml_entities:
- return text
- try:
- text = unichr(htmlentitydefs.name2codepoint[entityname])
- except KeyError:
- pass
- return text # leave as is
- return re.sub("&#?\w+;", fixup, text)
-
def gmail_jshtml_str_parse(s, markup=False):
# Replace \uxxxx escapes
parsed_str = UNICHR_REPLACE.sub(replace_chr, s)
parsed_str = unescape_html_entities(parsed_str)
-
# At this point, we have a Python unicode string which *should* hold
- # an XML fragment. Convert that fragment into a document string.
+ # an HTML fragment. Convert that fragment into a document string.
pystr = "<html>" + parsed_str + "</html>"
- # Parse that document string into a DOM.
- dom = xml.dom.minidom.parseString(pystr)
- textContent = StringIO()
- # Now we parse the XML, only allowing the bold tag through, and eating everything else
- def DomToText(node):
- if node.nodeType == Node.TEXT_NODE:
- textContent.write(gobject.markup_escape_text(node.data))
- if markup and node.nodeType == Node.ELEMENT_NODE and node.nodeName == 'b':
+ # Now use BeautifulSoup to parse it
+ from BeautifulSoup import BeautifulSoup
+ soup = BeautifulSoup(pystr, convertEntities=BeautifulSoup.HTML_ENTITIES)
+ textContent = StringIO()
+ def filterBoldOnly(node):
+ if isinstance(node, unicode):
+ textContent.write(gobject.markup_escape_text(node))
+ return
+ if markup and node.name == 'b':
in_bold = True
textContent.write('<b>')
else:
- in_bold = False
- if node.hasChildNodes():
- for child in node.childNodes:
- DomToText(child)
+ in_bold = False
+ for child in node.childGenerator():
+ filterBoldOnly(child)
if in_bold:
textContent.write('</b>')
- DomToText(dom.documentElement)
+ filterBoldOnly(soup)
# Return the sanely filtered content
return textContent.getvalue()
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]