Stripped out encoding/decoding in the serializers.

Those extra steps always bothered me as being unnecessary. Additionally, this should make conversion to Python 3 easier. The 2to3 tool wasn't converting the serializers properly and we were getting byte strings in the output. Previously, this wasn't a major problem because the default serializer was the xml serializer provided in the ElementTree standard lib. However, now that we are using our own xhtml serializer, it must work smoothly in all supported versions.

As a side note, I believe the thought was that we needed to do the encoding to take advantage of the "xmlcharrefreplace" error handling. However, using the example in the Python [docs](http://docs.python.org/howto/unicode.html#the-unicode-type):

    >>> u = unichr(40960) + u'abcd' + unichr(1972)
    >>> u.encode('utf-8', 'xmlcharrefreplace').decode('utf-8') == u
    True

There's no point in using the "xmlcharrefreplace" error handling if we just convert back to the original Unicode anyway. Interestingly, the Python 3 standard lib is doing essentially what we are doing here, so I'm convinced this is the right way to go.
author: Waylan Limberg <waylan@gmail.com> 2011-07-27 10:54:12 -0400
committer: Waylan Limberg <waylan@gmail.com> 2011-07-27 10:54:12 -0400
commit: 1fbd6ebdcc913e4dae5030d35009d4d3bb803916 (patch)
tree: d6bbe4d2bea8ce03e1920de219b83b51b236696e
parent: 872f49b4a8e71d3e0fbaea972d964ae466eaeafe (diff)
download: markdown-1fbd6ebdcc913e4dae5030d35009d4d3bb803916.tar.gz
markdown-1fbd6ebdcc913e4dae5030d35009d4d3bb803916.tar.bz2
markdown-1fbd6ebdcc913e4dae5030d35009d4d3bb803916.zip
2 files changed, 38 insertions, 48 deletions
diff --git a/markdown/__init__.py b/markdown/__init__.py
index 7ca6cd1..562ee5f 100644
--- a/markdown/__init__.py
+++ b/markdown/__init__.py
@@ -285,7 +285,7 @@ class Markdown:
                 root = newRoot
 
         # Serialize _properly_.  Strip top-level tags.
-        output, length = codecs.utf_8_decode(self.serializer(root, encoding="utf-8"))
+        output = self.serializer(root)
         if self.stripTopLevelTags:
             try:
                 start = output.index('<%s>'%self.doc_tag)+len(self.doc_tag)+2
diff --git a/markdown/searializers.py b/markdown/searializers.py
index 2de7b0f..22a83d4 100644
--- a/markdown/searializers.py
+++ b/markdown/searializers.py
@@ -1,6 +1,6 @@
-# markdown/html4.py
+# markdown/searializers.py
 #
-# Add html4 serialization to older versions of Elementree
+# Add x/html serialization to Elementree
 # Taken from ElementTree 1.3 preview with slight modifications
 #
 # Copyright (c) 1999-2007 by Fredrik Lundh.  All rights reserved.
@@ -82,7 +82,7 @@ def _encode(text, encoding):
     except (TypeError, AttributeError):
         _raise_serialization_error(text)
 
-def _escape_cdata(text, encoding):
+def _escape_cdata(text):
     # escape character data
     try:
         # it's worth avoiding do-nothing calls for strings that are
@@ -94,12 +94,12 @@ def _escape_cdata(text, encoding):
             text = text.replace("<", "&lt;")
         if ">" in text:
             text = text.replace(">", "&gt;")
-        return text.encode(encoding, "xmlcharrefreplace")
+        return text
     except (TypeError, AttributeError):
         _raise_serialization_error(text)
 
 
-def _escape_attrib(text, encoding):
+def _escape_attrib(text):
     # escape attribute value
     try:
         if "&" in text:
@@ -112,11 +112,11 @@ def _escape_attrib(text, encoding):
             text = text.replace("\"", "&quot;")
         if "\n" in text:
             text = text.replace("\n", "&#10;")
-        return text.encode(encoding, "xmlcharrefreplace")
+        return text
     except (TypeError, AttributeError):
         _raise_serialization_error(text)
 
-def _escape_attrib_html(text, encoding):
+def _escape_attrib_html(text):
     # escape attribute value
     try:
         if "&" in text:
@@ -127,25 +127,25 @@ def _escape_attrib_html(text, encoding):
             text = text.replace(">", "&gt;")
         if "\"" in text:
             text = text.replace("\"", "&quot;")
-        return text.encode(encoding, "xmlcharrefreplace")
+        return text
     except (TypeError, AttributeError):
         _raise_serialization_error(text)
 
 
-def _serialize_html(write, elem, encoding, qnames, namespaces, format):
+def _serialize_html(write, elem, qnames, namespaces, format):
     tag = elem.tag
     text = elem.text
     if tag is Comment:
-        write("<!--%s-->" % _escape_cdata(text, encoding))
+        write("<!--%s-->" % _escape_cdata(text))
     elif tag is ProcessingInstruction:
-        write("<?%s?>" % _escape_cdata(text, encoding))
+        write("<?%s?>" % _escape_cdata(text))
     else:
         tag = qnames[tag]
         if tag is None:
             if text:
-                write(_escape_cdata(text, encoding))
+                write(_escape_cdata(text))
             for e in elem:
-                _serialize_html(write, e, encoding, qnames, None, format)
+                _serialize_html(write, e, qnames, None, format)
         else:
             write("<" + tag)
             items = elem.items()
@@ -157,7 +157,7 @@ def _serialize_html(write, elem, encoding, qnames, namespaces, format):
                     if isinstance(v, QName):
                         v = qnames[v.text]
                     else:
-                        v = _escape_attrib_html(v, encoding)
+                        v = _escape_attrib_html(v)
                     if qnames[k] == v and format == 'html':
                         # handle boolean attributes
                         write(" %s" % v)
@@ -169,10 +169,7 @@ def _serialize_html(write, elem, encoding, qnames, namespaces, format):
                     for v, k in items:
                         if k:
                             k = ":" + k
-                        write(" xmlns%s=\"%s\"" % (
-                            k.encode(encoding),
-                            _escape_attrib(v, encoding)
-                            ))
+                        write(" xmlns%s=\"%s\"" % (k, _escape_attrib(v)))
             if format == "xhtml" and tag in HTML_EMPTY:
                 write(" />")
             else:
@@ -180,38 +177,35 @@ def _serialize_html(write, elem, encoding, qnames, namespaces, format):
                 tag = tag.lower()
                 if text:
                     if tag == "script" or tag == "style":
-                        write(_encode(text, encoding))
+                        write(text)
                     else:
-                        write(_escape_cdata(text, encoding))
+                        write(_escape_cdata(text))
                 for e in elem:
-                    _serialize_html(write, e, encoding, qnames, None, format)
+                    _serialize_html(write, e, qnames, None, format)
                 if tag not in HTML_EMPTY:
                     write("</" + tag + ">")
     if elem.tail:
-        write(_escape_cdata(elem.tail, encoding))
+        write(_escape_cdata(elem.tail))
 
 def _write_html(root,
-          # keyword arguments
-          encoding="utf-8",
-          default_namespace=None,
-          format="html"):
+                encoding=None,
+                default_namespace=None,
+                format="html"):
     assert root is not None
     data = []
     write = data.append
-    if not encoding:
-        encoding = "utf-8"
-    qnames, namespaces = _namespaces(
-            root, encoding, default_namespace
-            )
-    _serialize_html(
-                write, root, encoding, qnames, namespaces, format
-                )
-    return "".join(data)
+    qnames, namespaces = _namespaces(root, default_namespace)
+    _serialize_html(write, root, qnames, namespaces, format)
+    if encoding is None:
+        return "".join(data)
+    else:
+        return _encode("".join(data))
+
 
 # --------------------------------------------------------------------
 # serialization support
 
-def _namespaces(elem, encoding, default_namespace=None):
+def _namespaces(elem, default_namespace=None):
     # identify namespaces used in this tree
 
     # maps qnames to *encoded* prefix:local names
@@ -222,9 +216,6 @@ def _namespaces(elem, encoding, default_namespace=None):
     if default_namespace:
         namespaces[default_namespace] = ""
 
-    def encode(text):
-        return text.encode(encoding)
-
     def add_qname(qname):
         # calculate serialized qname representation
         try:
@@ -238,17 +229,16 @@ def _namespaces(elem, encoding, default_namespace=None):
                     if prefix != "xml":
                         namespaces[uri] = prefix
                 if prefix:
-                    qnames[qname] = encode("%s:%s" % (prefix, tag))
+                    qnames[qname] = "%s:%s" % (prefix, tag)
                 else:
-                    qnames[qname] = encode(tag) # default element
+                    qnames[qname] = tag # default element
             else:
                 if default_namespace:
-                    # FIXME: can this be handled in XML 1.0?
                     raise ValueError(
                         "cannot use non-qualified names with "
                         "default_namespace option"
                         )
-                qnames[qname] = encode(qname)
+                qnames[qname] = qname
         except TypeError:
             _raise_serialization_error(qname)
 
@@ -278,8 +268,8 @@ def _namespaces(elem, encoding, default_namespace=None):
             add_qname(text.text)
     return qnames, namespaces
 
-def to_html_string(element, encoding=None):
-    return _write_html(ElementTree(element).getroot(), encoding, format="html")
+def to_html_string(element):
+    return _write_html(ElementTree(element).getroot(), format="html")
 
-def to_xhtml_string(element, encoding=None):
-    return _write_html(ElementTree(element).getroot(), encoding, format="xhtml")
+def to_xhtml_string(element):
+    return _write_html(ElementTree(element).getroot(), format="xhtml")
author	Waylan Limberg <waylan@gmail.com>	2011-07-27 10:54:12 -0400
committer	Waylan Limberg <waylan@gmail.com>	2011-07-27 10:54:12 -0400
commit	1fbd6ebdcc913e4dae5030d35009d4d3bb803916 (patch)
tree	d6bbe4d2bea8ce03e1920de219b83b51b236696e
parent	872f49b4a8e71d3e0fbaea972d964ae466eaeafe (diff)
download	markdown-1fbd6ebdcc913e4dae5030d35009d4d3bb803916.tar.gz markdown-1fbd6ebdcc913e4dae5030d35009d4d3bb803916.tar.bz2 markdown-1fbd6ebdcc913e4dae5030d35009d4d3bb803916.zip