From 80f6ac599f79546512b522566cb421acea1aca19 Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Thu, 28 Jul 2011 10:02:38 -0400 Subject: All internal encoding of output now uses the 'xmlcharrefreplace' error handler. Also added a note to the docs. Anyone doing their own encoding of output should be as well. --- docs/using_as_module.txt | 8 ++++++-- markdown/__init__.py | 9 ++++++--- tests/__init__.py | 2 +- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/docs/using_as_module.txt b/docs/using_as_module.txt index 343fee0..f50a0ec 100644 --- a/docs/using_as_module.txt +++ b/docs/using_as_module.txt @@ -39,7 +39,10 @@ The following options are available on the `markdown.markdown` function: If you want to write the output to disk, you must encode it yourself: - output_file = codecs.open("some_file.html", "w", encoding="utf-8") + output_file = codecs.open("some_file.html", "w", + encoding="utf-8", + errors="xmlcharrefreplace" + ) output_file.write(html) * `extensions`: A list of extensions. @@ -178,7 +181,8 @@ the following required options: * or `None` (default) which will write to `stdout`. * `encoding`: The encoding of the source text file. Defaults to - "utf-8". The same encoding will always be used for the output file. + "utf-8". The same encoding will always be used for the output file. + The 'xmlcharrefreplace' error handler is used when encoding the output. **Note:** This is the only place that decoding and encoding of unicode takes place in Python-Markdown. If this rather naive solution does not diff --git a/markdown/__init__.py b/markdown/__init__.py index 562ee5f..630198e 100644 --- a/markdown/__init__.py +++ b/markdown/__init__.py @@ -311,7 +311,8 @@ class Markdown: Decodes the file using the provided encoding (defaults to utf-8), passes the file content to markdown, and outputs the html to either the provided stream or the file with provided name, using the same - encoding as the source file. + encoding as the source file. The 'xmlcharrefreplace' error handler is + used when encoding the output. **Note:** This is the only place that decoding and encoding of unicode takes place in Python-Markdown. (All other code is unicode-in / @@ -341,11 +342,13 @@ class Markdown: # Write to file or stdout if isinstance(output, (str, unicode)): - output_file = codecs.open(output, "w", encoding=encoding) + output_file = codecs.open(output, "w", + encoding=encoding, + errors="xmlcharrefreplace") output_file.write(html) output_file.close() else: - output.write(html.encode(encoding)) + output.write(html.encode(encoding, errors="xmlcharrefreplace")) return self diff --git a/tests/__init__.py b/tests/__init__.py index 8fbc5d9..b274b1e 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -51,7 +51,7 @@ def get_args(file, config): def normalize(text): """ Normalize whitespace for a string of html using tidy. """ - return str(tidy.parseString(text.encode('utf-8'), + return str(tidy.parseString(text.encode('utf-8', errors='xmlcharrefreplace'), drop_empty_paras=0, fix_backslash=0, fix_bad_comments=0, -- cgit v1.2.3