aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorWaylan Limberg <waylan@gmail.com>2011-07-27 10:54:12 -0400
committerWaylan Limberg <waylan@gmail.com>2011-07-27 10:54:12 -0400
commit1fbd6ebdcc913e4dae5030d35009d4d3bb803916 (patch)
treed6bbe4d2bea8ce03e1920de219b83b51b236696e
parent872f49b4a8e71d3e0fbaea972d964ae466eaeafe (diff)
downloadmarkdown-1fbd6ebdcc913e4dae5030d35009d4d3bb803916.tar.gz
markdown-1fbd6ebdcc913e4dae5030d35009d4d3bb803916.tar.bz2
markdown-1fbd6ebdcc913e4dae5030d35009d4d3bb803916.zip
Stripped out encoding/decoding in the serializers.
Those extra steps always bothered me as being unnecessary. Additionally, this should make conversion to Python 3 easier. The 2to3 tool wasn't converting the serializers properly and we were getting byte strings in the output. Previously, this wasn't a major problem because the default serializer was the xml serializer provided in the ElementTree standard lib. However, now that we are using our own xhtml serializer, it must work smoothly in all supported versions.

As a side note, I believe the thought was that we needed to do the encoding to take advantage of the "xmlcharrefreplace" error handling. However, using the example in the Python [docs](http://docs.python.org/howto/unicode.html#the-unicode-type):

    >>> u = unichr(40960) + u'abcd' + unichr(1972)
    >>> u.encode('utf-8', 'xmlcharrefreplace').decode('utf-8') == u
    True

There's no point in using the "xmlcharrefreplace" error handling if we just convert back to the original Unicode anyway. Interestingly, the Python 3 standard lib is doing essentially what we are doing here, so I'm convinced this is the right way to go.
-rw-r--r--markdown/__init__.py2
-rw-r--r--markdown/searializers.py84
2 files changed, 38 insertions, 48 deletions
diff --git a/markdown/__init__.py b/markdown/__init__.py
index 7ca6cd1..562ee5f 100644
--- a/markdown/__init__.py
+++ b/markdown/__init__.py
@@ -285,7 +285,7 @@ class Markdown:
root = newRoot
# Serialize _properly_. Strip top-level tags.
- output, length = codecs.utf_8_decode(self.serializer(root, encoding="utf-8"))
+ output = self.serializer(root)
if self.stripTopLevelTags:
try:
start = output.index('<%s>'%self.doc_tag)+len(self.doc_tag)+2
diff --git a/markdown/searializers.py b/markdown/searializers.py
index 2de7b0f..22a83d4 100644
--- a/markdown/searializers.py
+++ b/markdown/searializers.py
@@ -1,6 +1,6 @@
-# markdown/html4.py
+# markdown/searializers.py
#
-# Add html4 serialization to older versions of Elementree
+# Add x/html serialization to Elementree
# Taken from ElementTree 1.3 preview with slight modifications
#
# Copyright (c) 1999-2007 by Fredrik Lundh. All rights reserved.
@@ -82,7 +82,7 @@ def _encode(text, encoding):
except (TypeError, AttributeError):
_raise_serialization_error(text)
-def _escape_cdata(text, encoding):
+def _escape_cdata(text):
# escape character data
try:
# it's worth avoiding do-nothing calls for strings that are
@@ -94,12 +94,12 @@ def _escape_cdata(text, encoding):
text = text.replace("<", "&lt;")
if ">" in text:
text = text.replace(">", "&gt;")
- return text.encode(encoding, "xmlcharrefreplace")
+ return text
except (TypeError, AttributeError):
_raise_serialization_error(text)
-def _escape_attrib(text, encoding):
+def _escape_attrib(text):
# escape attribute value
try:
if "&" in text:
@@ -112,11 +112,11 @@ def _escape_attrib(text, encoding):
text = text.replace("\"", "&quot;")
if "\n" in text:
text = text.replace("\n", "&#10;")
- return text.encode(encoding, "xmlcharrefreplace")
+ return text
except (TypeError, AttributeError):
_raise_serialization_error(text)
-def _escape_attrib_html(text, encoding):
+def _escape_attrib_html(text):
# escape attribute value
try:
if "&" in text:
@@ -127,25 +127,25 @@ def _escape_attrib_html(text, encoding):
text = text.replace(">", "&gt;")
if "\"" in text:
text = text.replace("\"", "&quot;")
- return text.encode(encoding, "xmlcharrefreplace")
+ return text
except (TypeError, AttributeError):
_raise_serialization_error(text)
-def _serialize_html(write, elem, encoding, qnames, namespaces, format):
+def _serialize_html(write, elem, qnames, namespaces, format):
tag = elem.tag
text = elem.text
if tag is Comment:
- write("<!--%s-->" % _escape_cdata(text, encoding))
+ write("<!--%s-->" % _escape_cdata(text))
elif tag is ProcessingInstruction:
- write("<?%s?>" % _escape_cdata(text, encoding))
+ write("<?%s?>" % _escape_cdata(text))
else:
tag = qnames[tag]
if tag is None:
if text:
- write(_escape_cdata(text, encoding))
+ write(_escape_cdata(text))
for e in elem:
- _serialize_html(write, e, encoding, qnames, None, format)
+ _serialize_html(write, e, qnames, None, format)
else:
write("<" + tag)
items = elem.items()
@@ -157,7 +157,7 @@ def _serialize_html(write, elem, encoding, qnames, namespaces, format):
if isinstance(v, QName):
v = qnames[v.text]
else:
- v = _escape_attrib_html(v, encoding)
+ v = _escape_attrib_html(v)
if qnames[k] == v and format == 'html':
# handle boolean attributes
write(" %s" % v)
@@ -169,10 +169,7 @@ def _serialize_html(write, elem, encoding, qnames, namespaces, format):
for v, k in items:
if k:
k = ":" + k
- write(" xmlns%s=\"%s\"" % (
- k.encode(encoding),
- _escape_attrib(v, encoding)
- ))
+ write(" xmlns%s=\"%s\"" % (k, _escape_attrib(v)))
if format == "xhtml" and tag in HTML_EMPTY:
write(" />")
else:
@@ -180,38 +177,35 @@ def _serialize_html(write, elem, encoding, qnames, namespaces, format):
tag = tag.lower()
if text:
if tag == "script" or tag == "style":
- write(_encode(text, encoding))
+ write(text)
else:
- write(_escape_cdata(text, encoding))
+ write(_escape_cdata(text))
for e in elem:
- _serialize_html(write, e, encoding, qnames, None, format)
+ _serialize_html(write, e, qnames, None, format)
if tag not in HTML_EMPTY:
write("</" + tag + ">")
if elem.tail:
- write(_escape_cdata(elem.tail, encoding))
+ write(_escape_cdata(elem.tail))
def _write_html(root,
- # keyword arguments
- encoding="utf-8",
- default_namespace=None,
- format="html"):
+ encoding=None,
+ default_namespace=None,
+ format="html"):
assert root is not None
data = []
write = data.append
- if not encoding:
- encoding = "utf-8"
- qnames, namespaces = _namespaces(
- root, encoding, default_namespace
- )
- _serialize_html(
- write, root, encoding, qnames, namespaces, format
- )
- return "".join(data)
+ qnames, namespaces = _namespaces(root, default_namespace)
+ _serialize_html(write, root, qnames, namespaces, format)
+ if encoding is None:
+ return "".join(data)
+ else:
+ return _encode("".join(data))
+
# --------------------------------------------------------------------
# serialization support
-def _namespaces(elem, encoding, default_namespace=None):
+def _namespaces(elem, default_namespace=None):
# identify namespaces used in this tree
# maps qnames to *encoded* prefix:local names
@@ -222,9 +216,6 @@ def _namespaces(elem, encoding, default_namespace=None):
if default_namespace:
namespaces[default_namespace] = ""
- def encode(text):
- return text.encode(encoding)
-
def add_qname(qname):
# calculate serialized qname representation
try:
@@ -238,17 +229,16 @@ def _namespaces(elem, encoding, default_namespace=None):
if prefix != "xml":
namespaces[uri] = prefix
if prefix:
- qnames[qname] = encode("%s:%s" % (prefix, tag))
+ qnames[qname] = "%s:%s" % (prefix, tag)
else:
- qnames[qname] = encode(tag) # default element
+ qnames[qname] = tag # default element
else:
if default_namespace:
- # FIXME: can this be handled in XML 1.0?
raise ValueError(
"cannot use non-qualified names with "
"default_namespace option"
)
- qnames[qname] = encode(qname)
+ qnames[qname] = qname
except TypeError:
_raise_serialization_error(qname)
@@ -278,8 +268,8 @@ def _namespaces(elem, encoding, default_namespace=None):
add_qname(text.text)
return qnames, namespaces
-def to_html_string(element, encoding=None):
- return _write_html(ElementTree(element).getroot(), encoding, format="html")
+def to_html_string(element):
+ return _write_html(ElementTree(element).getroot(), format="html")
-def to_xhtml_string(element, encoding=None):
- return _write_html(ElementTree(element).getroot(), encoding, format="xhtml")
+def to_xhtml_string(element):
+ return _write_html(ElementTree(element).getroot(), format="xhtml")