From 0415e4c489786fc60fa9cbd180378e7202c94dc5 Mon Sep 17 00:00:00 2001
From: Waylan Limberg
Date: Thu, 14 Jul 2011 07:26:45 -0400
Subject: Fixed #34. Better support for unicode text with the html_tidy
 extension.

Force input and output of tidy to use UTF-8 and encode before and decode
after passing in the text.
---
 markdown/extensions/html_tidy.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/markdown/extensions/html_tidy.py b/markdown/extensions/html_tidy.py
index e1736eb..6aee083 100644
--- a/markdown/extensions/html_tidy.py
+++ b/markdown/extensions/html_tidy.py
@@ -40,6 +40,7 @@ class TidyExtension(markdown.Extension):
         # Set defaults to match typical markdown behavior.
         self.config = dict(output_xhtml=1,
                            show_body_only=1,
+                           char_encoding='utf8'
                           )
         # Merge in user defined configs overriding any present if nessecary.
         for c in configs:
@@ -58,8 +59,10 @@ class TidyProcessor(markdown.postprocessors.Postprocessor):
     def run(self, text):
         # Pass text to Tidy. As Tidy does not accept unicode we need to encode
         # it and decode its return value.
-        return unicode(tidy.parseString(text.encode('utf-8'),
-                                        **self.markdown.tidy_options))
+        enc = self.markdown.tidy_options.get('char_encoding', 'utf8')
+        return unicode(tidy.parseString(text.encode(enc),
+                                        **self.markdown.tidy_options),
+                       encoding=enc)
 
 
 def makeExtension(configs=None):
-- 
cgit v1.2.3