diff options
author | Waylan Limberg <waylan@gmail.com> | 2011-07-27 10:54:12 -0400 |
---|---|---|
committer | Waylan Limberg <waylan@gmail.com> | 2011-07-27 10:54:12 -0400 |
commit | 1fbd6ebdcc913e4dae5030d35009d4d3bb803916 (patch) | |
tree | d6bbe4d2bea8ce03e1920de219b83b51b236696e | |
parent | 872f49b4a8e71d3e0fbaea972d964ae466eaeafe (diff) | |
download | markdown-1fbd6ebdcc913e4dae5030d35009d4d3bb803916.tar.gz markdown-1fbd6ebdcc913e4dae5030d35009d4d3bb803916.tar.bz2 markdown-1fbd6ebdcc913e4dae5030d35009d4d3bb803916.zip |
Stripped out encoding/decoding in the serializers.
Those extra steps always bothered me as being unnecessary. Additionally, this
should make conversion to Python 3 easier. The 2to3 tool wasn't converting
the serializers properly and we were getting byte strings in the output.
Previously, this wasn't a major problem because the default serializer was
the xml serializer provided in the ElementTree standard lib. However, now
that we are using our own xhtml serializer, it must work smoothly in all
supported versions.
As a side note, I believe the thought was that we needed to do the encoding to
take advantage of the "xmlcharrefreplace" error handling. However, using the
example in the Python [docs](http://docs.python.org/howto/unicode.html#the-unicode-type):
>>> u = unichr(40960) + u'abcd' + unichr(1972)
>>> u.encode('utf-8', 'xmlcharrefreplace').decode('utf-8') == u
True
There's no point in using the "xmlcharrefreplace" error handling if we just
convert back to the original Unicode anyway. Interestingly, the Python 3
standard lib is doing essentially what we are doing here, so I'm convinced this
is the right way to go.
-rw-r--r-- | markdown/__init__.py | 2 | ||||
-rw-r--r-- | markdown/searializers.py | 84 |
2 files changed, 38 insertions, 48 deletions
diff --git a/markdown/__init__.py b/markdown/__init__.py index 7ca6cd1..562ee5f 100644 --- a/markdown/__init__.py +++ b/markdown/__init__.py @@ -285,7 +285,7 @@ class Markdown: root = newRoot # Serialize _properly_. Strip top-level tags. - output, length = codecs.utf_8_decode(self.serializer(root, encoding="utf-8")) + output = self.serializer(root) if self.stripTopLevelTags: try: start = output.index('<%s>'%self.doc_tag)+len(self.doc_tag)+2 diff --git a/markdown/searializers.py b/markdown/searializers.py index 2de7b0f..22a83d4 100644 --- a/markdown/searializers.py +++ b/markdown/searializers.py @@ -1,6 +1,6 @@ -# markdown/html4.py +# markdown/searializers.py # -# Add html4 serialization to older versions of Elementree +# Add x/html serialization to Elementree # Taken from ElementTree 1.3 preview with slight modifications # # Copyright (c) 1999-2007 by Fredrik Lundh. All rights reserved. @@ -82,7 +82,7 @@ def _encode(text, encoding): except (TypeError, AttributeError): _raise_serialization_error(text) -def _escape_cdata(text, encoding): +def _escape_cdata(text): # escape character data try: # it's worth avoiding do-nothing calls for strings that are @@ -94,12 +94,12 @@ def _escape_cdata(text, encoding): text = text.replace("<", "<") if ">" in text: text = text.replace(">", ">") - return text.encode(encoding, "xmlcharrefreplace") + return text except (TypeError, AttributeError): _raise_serialization_error(text) -def _escape_attrib(text, encoding): +def _escape_attrib(text): # escape attribute value try: if "&" in text: @@ -112,11 +112,11 @@ def _escape_attrib(text, encoding): text = text.replace("\"", """) if "\n" in text: text = text.replace("\n", " ") - return text.encode(encoding, "xmlcharrefreplace") + return text except (TypeError, AttributeError): _raise_serialization_error(text) -def _escape_attrib_html(text, encoding): +def _escape_attrib_html(text): # escape attribute value try: if "&" in text: @@ -127,25 +127,25 @@ def _escape_attrib_html(text, encoding): 
text = text.replace(">", ">") if "\"" in text: text = text.replace("\"", """) - return text.encode(encoding, "xmlcharrefreplace") + return text except (TypeError, AttributeError): _raise_serialization_error(text) -def _serialize_html(write, elem, encoding, qnames, namespaces, format): +def _serialize_html(write, elem, qnames, namespaces, format): tag = elem.tag text = elem.text if tag is Comment: - write("<!--%s-->" % _escape_cdata(text, encoding)) + write("<!--%s-->" % _escape_cdata(text)) elif tag is ProcessingInstruction: - write("<?%s?>" % _escape_cdata(text, encoding)) + write("<?%s?>" % _escape_cdata(text)) else: tag = qnames[tag] if tag is None: if text: - write(_escape_cdata(text, encoding)) + write(_escape_cdata(text)) for e in elem: - _serialize_html(write, e, encoding, qnames, None, format) + _serialize_html(write, e, qnames, None, format) else: write("<" + tag) items = elem.items() @@ -157,7 +157,7 @@ def _serialize_html(write, elem, encoding, qnames, namespaces, format): if isinstance(v, QName): v = qnames[v.text] else: - v = _escape_attrib_html(v, encoding) + v = _escape_attrib_html(v) if qnames[k] == v and format == 'html': # handle boolean attributes write(" %s" % v) @@ -169,10 +169,7 @@ def _serialize_html(write, elem, encoding, qnames, namespaces, format): for v, k in items: if k: k = ":" + k - write(" xmlns%s=\"%s\"" % ( - k.encode(encoding), - _escape_attrib(v, encoding) - )) + write(" xmlns%s=\"%s\"" % (k, _escape_attrib(v))) if format == "xhtml" and tag in HTML_EMPTY: write(" />") else: @@ -180,38 +177,35 @@ def _serialize_html(write, elem, encoding, qnames, namespaces, format): tag = tag.lower() if text: if tag == "script" or tag == "style": - write(_encode(text, encoding)) + write(text) else: - write(_escape_cdata(text, encoding)) + write(_escape_cdata(text)) for e in elem: - _serialize_html(write, e, encoding, qnames, None, format) + _serialize_html(write, e, qnames, None, format) if tag not in HTML_EMPTY: write("</" + tag + ">") if 
elem.tail: - write(_escape_cdata(elem.tail, encoding)) + write(_escape_cdata(elem.tail)) def _write_html(root, - # keyword arguments - encoding="utf-8", - default_namespace=None, - format="html"): + encoding=None, + default_namespace=None, + format="html"): assert root is not None data = [] write = data.append - if not encoding: - encoding = "utf-8" - qnames, namespaces = _namespaces( - root, encoding, default_namespace - ) - _serialize_html( - write, root, encoding, qnames, namespaces, format - ) - return "".join(data) + qnames, namespaces = _namespaces(root, default_namespace) + _serialize_html(write, root, qnames, namespaces, format) + if encoding is None: + return "".join(data) + else: + return _encode("".join(data)) + # -------------------------------------------------------------------- # serialization support -def _namespaces(elem, encoding, default_namespace=None): +def _namespaces(elem, default_namespace=None): # identify namespaces used in this tree # maps qnames to *encoded* prefix:local names @@ -222,9 +216,6 @@ def _namespaces(elem, encoding, default_namespace=None): if default_namespace: namespaces[default_namespace] = "" - def encode(text): - return text.encode(encoding) - def add_qname(qname): # calculate serialized qname representation try: @@ -238,17 +229,16 @@ def _namespaces(elem, encoding, default_namespace=None): if prefix != "xml": namespaces[uri] = prefix if prefix: - qnames[qname] = encode("%s:%s" % (prefix, tag)) + qnames[qname] = "%s:%s" % (prefix, tag) else: - qnames[qname] = encode(tag) # default element + qnames[qname] = tag # default element else: if default_namespace: - # FIXME: can this be handled in XML 1.0? 
raise ValueError( "cannot use non-qualified names with " "default_namespace option" ) - qnames[qname] = encode(qname) + qnames[qname] = qname except TypeError: _raise_serialization_error(qname) @@ -278,8 +268,8 @@ def _namespaces(elem, encoding, default_namespace=None): add_qname(text.text) return qnames, namespaces -def to_html_string(element, encoding=None): - return _write_html(ElementTree(element).getroot(), encoding, format="html") +def to_html_string(element): + return _write_html(ElementTree(element).getroot(), format="html") -def to_xhtml_string(element, encoding=None): - return _write_html(ElementTree(element).getroot(), encoding, format="xhtml") +def to_xhtml_string(element): + return _write_html(ElementTree(element).getroot(), format="xhtml") |