From 1fbd6ebdcc913e4dae5030d35009d4d3bb803916 Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Wed, 27 Jul 2011 10:54:12 -0400 Subject: Stripped out encoding/decoding in the searializers. Those extra steps always bothered me as being unnecessary. Additionally, this should make conversion to Python 3 easier. The 2to3 tool wasn't converting the searializers properly and we were getting byte strings in the output. Previously, this wasn't a major problem because the default searializer was the xml searializer provided in the ElementTree standard lib. However, now that we are using our own xhtml searializer, it must work smoothly in all supported versions. As a side note, I believe the thought was that we needed to do the encoding to take advantage of the "xmlcharrefreplace" error handling. However, using the example in the python [docs](http://docs.python.org/howto/unicode.html#the-unicode-type): >>> u = unichr(40960) + u'abcd' + unichr(1972) >>> u.encode('utf-8', 'xmlcharrefreplace').decode('utf-8') == u True There's no point of using the "xmlcharrefreplace" error handling if we just convert back to the original Unicode anyway. Interestingly, the Python 3 standard lib is doing essentially what we are doing here, so I'm convinced this is the right way to go. --- markdown/__init__.py | 2 +- markdown/searializers.py | 84 +++++++++++++++++++++--------------------------- 2 files changed, 38 insertions(+), 48 deletions(-) diff --git a/markdown/__init__.py b/markdown/__init__.py index 7ca6cd1..562ee5f 100644 --- a/markdown/__init__.py +++ b/markdown/__init__.py @@ -285,7 +285,7 @@ class Markdown: root = newRoot # Serialize _properly_. Strip top-level tags. - output, length = codecs.utf_8_decode(self.serializer(root, encoding="utf-8")) + output = self.serializer(root) if self.stripTopLevelTags: try: start = output.index('<%s>'%self.doc_tag)+len(self.doc_tag)+2 diff --git a/markdown/searializers.py b/markdown/searializers.py index 2de7b0f..22a83d4 100644 --- a/markdown/searializers.py +++ b/markdown/searializers.py @@ -1,6 +1,6 @@ -# markdown/html4.py +# markdown/searializers.py # -# Add html4 serialization to older versions of Elementree +# Add x/html serialization to Elementree # Taken from ElementTree 1.3 preview with slight modifications # # Copyright (c) 1999-2007 by Fredrik Lundh. All rights reserved. @@ -82,7 +82,7 @@ def _encode(text, encoding): except (TypeError, AttributeError): _raise_serialization_error(text) -def _escape_cdata(text, encoding): +def _escape_cdata(text): # escape character data try: # it's worth avoiding do-nothing calls for strings that are @@ -94,12 +94,12 @@ def _escape_cdata(text, encoding): text = text.replace("<", "<") if ">" in text: text = text.replace(">", ">") - return text.encode(encoding, "xmlcharrefreplace") + return text except (TypeError, AttributeError): _raise_serialization_error(text) -def _escape_attrib(text, encoding): +def _escape_attrib(text): # escape attribute value try: if "&" in text: @@ -112,11 +112,11 @@ def _escape_attrib(text, encoding): text = text.replace("\"", """) if "\n" in text: text = text.replace("\n", " ") - return text.encode(encoding, "xmlcharrefreplace") + return text except (TypeError, AttributeError): _raise_serialization_error(text) -def _escape_attrib_html(text, encoding): +def _escape_attrib_html(text): # escape attribute value try: if "&" in text: @@ -127,25 +127,25 @@ def _escape_attrib_html(text, encoding): text = text.replace(">", ">") if "\"" in text: text = text.replace("\"", """) - return text.encode(encoding, "xmlcharrefreplace") + return text except (TypeError, AttributeError): _raise_serialization_error(text) -def _serialize_html(write, elem, encoding, qnames, namespaces, format): +def _serialize_html(write, elem, qnames, namespaces, format): tag = elem.tag text = elem.text if tag is Comment: - write("" % _escape_cdata(text, encoding)) + write("" % _escape_cdata(text)) elif tag is ProcessingInstruction: - write("" % _escape_cdata(text, encoding)) + write("" % _escape_cdata(text)) else: tag = qnames[tag] if tag is None: if text: - write(_escape_cdata(text, encoding)) + write(_escape_cdata(text)) for e in elem: - _serialize_html(write, e, encoding, qnames, None, format) + _serialize_html(write, e, qnames, None, format) else: write("<" + tag) items = elem.items() @@ -157,7 +157,7 @@ def _serialize_html(write, elem, encoding, qnames, namespaces, format): if isinstance(v, QName): v = qnames[v.text] else: - v = _escape_attrib_html(v, encoding) + v = _escape_attrib_html(v) if qnames[k] == v and format == 'html': # handle boolean attributes write(" %s" % v) @@ -169,10 +169,7 @@ def _serialize_html(write, elem, encoding, qnames, namespaces, format): for v, k in items: if k: k = ":" + k - write(" xmlns%s=\"%s\"" % ( - k.encode(encoding), - _escape_attrib(v, encoding) - )) + write(" xmlns%s=\"%s\"" % (k, _escape_attrib(v))) if format == "xhtml" and tag in HTML_EMPTY: write(" />") else: @@ -180,38 +177,35 @@ def _serialize_html(write, elem, encoding, qnames, namespaces, format): tag = tag.lower() if text: if tag == "script" or tag == "style": - write(_encode(text, encoding)) + write(text) else: - write(_escape_cdata(text, encoding)) + write(_escape_cdata(text)) for e in elem: - _serialize_html(write, e, encoding, qnames, None, format) + _serialize_html(write, e, qnames, None, format) if tag not in HTML_EMPTY: write("") if elem.tail: - write(_escape_cdata(elem.tail, encoding)) + write(_escape_cdata(elem.tail)) def _write_html(root, - # keyword arguments - encoding="utf-8", - default_namespace=None, - format="html"): + encoding=None, + default_namespace=None, + format="html"): assert root is not None data = [] write = data.append - if not encoding: - encoding = "utf-8" - qnames, namespaces = _namespaces( - root, encoding, default_namespace - ) - _serialize_html( - write, root, encoding, qnames, namespaces, format - ) - return "".join(data) + qnames, namespaces = _namespaces(root, default_namespace) + _serialize_html(write, root, qnames, namespaces, format) + if encoding is None: + return "".join(data) + else: + return _encode("".join(data)) + # -------------------------------------------------------------------- # serialization support -def _namespaces(elem, encoding, default_namespace=None): +def _namespaces(elem, default_namespace=None): # identify namespaces used in this tree # maps qnames to *encoded* prefix:local names @@ -222,9 +216,6 @@ def _namespaces(elem, encoding, default_namespace=None): if default_namespace: namespaces[default_namespace] = "" - def encode(text): - return text.encode(encoding) - def add_qname(qname): # calculate serialized qname representation try: @@ -238,17 +229,16 @@ def _namespaces(elem, encoding, default_namespace=None): if prefix != "xml": namespaces[uri] = prefix if prefix: - qnames[qname] = encode("%s:%s" % (prefix, tag)) + qnames[qname] = "%s:%s" % (prefix, tag) else: - qnames[qname] = encode(tag) # default element + qnames[qname] = tag # default element else: if default_namespace: - # FIXME: can this be handled in XML 1.0? raise ValueError( "cannot use non-qualified names with " "default_namespace option" ) - qnames[qname] = encode(qname) + qnames[qname] = qname except TypeError: _raise_serialization_error(qname) @@ -278,8 +268,8 @@ def _namespaces(elem, encoding, default_namespace=None): add_qname(text.text) return qnames, namespaces -def to_html_string(element, encoding=None): - return _write_html(ElementTree(element).getroot(), encoding, format="html") +def to_html_string(element): + return _write_html(ElementTree(element).getroot(), format="html") -def to_xhtml_string(element, encoding=None): - return _write_html(ElementTree(element).getroot(), encoding, format="xhtml") +def to_xhtml_string(element): + return _write_html(ElementTree(element).getroot(), format="xhtml") -- cgit v1.2.3