From 6a32ccab72ee5328e196ae7df46b6c3a4cd7638d Mon Sep 17 00:00:00 2001
From: Anders Kaseorg
Date: Wed, 18 Feb 2009 16:05:14 -0500
Subject: [PATCH] Make HTMLParser handle Unicode.

---
 jirabot.py | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/jirabot.py b/jirabot.py
index 8c23721..2fee41d 100755
--- a/jirabot.py
+++ b/jirabot.py
@@ -1,13 +1,14 @@
 #!/usr/bin/python
-import cStringIO
 import calendar
 import feedparser
 import formatter
+import htmlentitydefs
 import htmllib
 import mechanize
 import os
 import random
 import string
+import StringIO
 import time
 import traceback
 import urlparse
@@ -17,6 +18,19 @@ zephyr_sender = 'jira'
 zephyr_class = 'andersk-test'
 time_file = 'jirabot.time'
 
+class UnicodeHTMLParser(htmllib.HTMLParser):
+    entitydefs = dict((k, unichr(v)) for (k, v) in htmlentitydefs.name2codepoint.items())
+
+    def convert_charref(self, name):
+        try:
+            n = int(name)
+        except ValueError:
+            return
+        return self.convert_codepoint(n)
+
+    def convert_codepoint(self, codepoint):
+        return unichr(codepoint)
+
 def jira_init():
     b = mechanize.Browser()
     b.set_handle_robots(False)
@@ -78,12 +92,11 @@ def parse_comment(e):
     url = urlparse.urlunsplit(urlparse.urlparse(e.id)[0:3] + (None,None))
     issue = url.rsplit('/', 1)[1]
 
-    s = cStringIO.StringIO()
-    parser = htmllib.HTMLParser(formatter.AbstractFormatter(formatter.DumbWriter(s)))
+    s = StringIO.StringIO()
+    parser = UnicodeHTMLParser(formatter.AbstractFormatter(formatter.DumbWriter(s)))
     parser.feed(e.summary.rsplit('', 1)[0])
     parser.close()
-    s.seek(0)
-    comment = s.read()
+    comment = s.getvalue()
 
     msg = e.author + " added a comment:\n" + comment.rstrip()
 
-- 
2.45.1