From 1fbd6ebdcc913e4dae5030d35009d4d3bb803916 Mon Sep 17 00:00:00 2001
From: Waylan Limberg <waylan@gmail.com>
Date: Wed, 27 Jul 2011 10:54:12 -0400
Subject: Stripped out encoding/decoding in the searializers.

Those extra steps always bothered me as being unnecessary. Additionally, this
should make conversion to Python 3 easier. The 2to3 tool wasn't converting
the serializers properly and we were getting byte strings in the output.
Previously, this wasn't a major problem because the default serializer was
the xml serializer provided in the ElementTree standard lib. However, now
that we are using our own xhtml serializer, it must work smoothly in all
supported versions.

As a side note, I believe the thought was that we needed to do the encoding to
take advantage of the "xmlcharrefreplace" error handling. However, using the
example in the python [docs](http://docs.python.org/howto/unicode.html#the-unicode-type):

    >>> u = unichr(40960) + u'abcd' + unichr(1972)
    >>> u.encode('utf-8', 'xmlcharrefreplace').decode('utf-8') == u
    True

There's no point in using the "xmlcharrefreplace" error handling if we just
convert back to the original Unicode anyway. Interestingly, the Python 3
standard lib is doing essentially what we are doing here, so I'm convinced this
is the right way to go.
---
 markdown/__init__.py     |  2 +-
 markdown/searializers.py | 84 +++++++++++++++++++++---------------------------
 2 files changed, 38 insertions(+), 48 deletions(-)

diff --git a/markdown/__init__.py b/markdown/__init__.py
index 7ca6cd1..562ee5f 100644
--- a/markdown/__init__.py
+++ b/markdown/__init__.py
@@ -285,7 +285,7 @@ class Markdown:
                 root = newRoot
 
         # Serialize _properly_.  Strip top-level tags.
-        output, length = codecs.utf_8_decode(self.serializer(root, encoding="utf-8"))
+        output = self.serializer(root)
         if self.stripTopLevelTags:
             try:
                 start = output.index('<%s>'%self.doc_tag)+len(self.doc_tag)+2
diff --git a/markdown/searializers.py b/markdown/searializers.py
index 2de7b0f..22a83d4 100644
--- a/markdown/searializers.py
+++ b/markdown/searializers.py
@@ -1,6 +1,6 @@
-# markdown/html4.py
+# markdown/searializers.py
 #
-# Add html4 serialization to older versions of Elementree
+# Add x/html serialization to Elementree
 # Taken from ElementTree 1.3 preview with slight modifications
 #
 # Copyright (c) 1999-2007 by Fredrik Lundh.  All rights reserved.
@@ -82,7 +82,7 @@ def _encode(text, encoding):
     except (TypeError, AttributeError):
         _raise_serialization_error(text)
 
-def _escape_cdata(text, encoding):
+def _escape_cdata(text):
     # escape character data
     try:
         # it's worth avoiding do-nothing calls for strings that are
@@ -94,12 +94,12 @@ def _escape_cdata(text, encoding):
             text = text.replace("<", "&lt;")
         if ">" in text:
             text = text.replace(">", "&gt;")
-        return text.encode(encoding, "xmlcharrefreplace")
+        return text
     except (TypeError, AttributeError):
         _raise_serialization_error(text)
 
 
-def _escape_attrib(text, encoding):
+def _escape_attrib(text):
     # escape attribute value
     try:
         if "&" in text:
@@ -112,11 +112,11 @@ def _escape_attrib(text, encoding):
             text = text.replace("\"", "&quot;")
         if "\n" in text:
             text = text.replace("\n", "&#10;")
-        return text.encode(encoding, "xmlcharrefreplace")
+        return text
     except (TypeError, AttributeError):
         _raise_serialization_error(text)
 
-def _escape_attrib_html(text, encoding):
+def _escape_attrib_html(text):
     # escape attribute value
     try:
         if "&" in text:
@@ -127,25 +127,25 @@ def _escape_attrib_html(text, encoding):
             text = text.replace(">", "&gt;")
         if "\"" in text:
             text = text.replace("\"", "&quot;")
-        return text.encode(encoding, "xmlcharrefreplace")
+        return text
     except (TypeError, AttributeError):
         _raise_serialization_error(text)
 
 
-def _serialize_html(write, elem, encoding, qnames, namespaces, format):
+def _serialize_html(write, elem, qnames, namespaces, format):
     tag = elem.tag
     text = elem.text
     if tag is Comment:
-        write("<!--%s-->" % _escape_cdata(text, encoding))
+        write("<!--%s-->" % _escape_cdata(text))
     elif tag is ProcessingInstruction:
-        write("<?%s?>" % _escape_cdata(text, encoding))
+        write("<?%s?>" % _escape_cdata(text))
     else:
         tag = qnames[tag]
         if tag is None:
             if text:
-                write(_escape_cdata(text, encoding))
+                write(_escape_cdata(text))
             for e in elem:
-                _serialize_html(write, e, encoding, qnames, None, format)
+                _serialize_html(write, e, qnames, None, format)
         else:
             write("<" + tag)
             items = elem.items()
@@ -157,7 +157,7 @@ def _serialize_html(write, elem, encoding, qnames, namespaces, format):
                     if isinstance(v, QName):
                         v = qnames[v.text]
                     else:
-                        v = _escape_attrib_html(v, encoding)
+                        v = _escape_attrib_html(v)
                     if qnames[k] == v and format == 'html':
                         # handle boolean attributes
                         write(" %s" % v)
@@ -169,10 +169,7 @@ def _serialize_html(write, elem, encoding, qnames, namespaces, format):
                     for v, k in items:
                         if k:
                             k = ":" + k
-                        write(" xmlns%s=\"%s\"" % (
-                            k.encode(encoding),
-                            _escape_attrib(v, encoding)
-                            ))
+                        write(" xmlns%s=\"%s\"" % (k, _escape_attrib(v)))
             if format == "xhtml" and tag in HTML_EMPTY:
                 write(" />")
             else:
@@ -180,38 +177,35 @@ def _serialize_html(write, elem, encoding, qnames, namespaces, format):
                 tag = tag.lower()
                 if text:
                     if tag == "script" or tag == "style":
-                        write(_encode(text, encoding))
+                        write(text)
                     else:
-                        write(_escape_cdata(text, encoding))
+                        write(_escape_cdata(text))
                 for e in elem:
-                    _serialize_html(write, e, encoding, qnames, None, format)
+                    _serialize_html(write, e, qnames, None, format)
                 if tag not in HTML_EMPTY:
                     write("</" + tag + ">")
     if elem.tail:
-        write(_escape_cdata(elem.tail, encoding))
+        write(_escape_cdata(elem.tail))
 
 def _write_html(root,
-          # keyword arguments
-          encoding="utf-8",
-          default_namespace=None,
-          format="html"):
+                encoding=None,
+                default_namespace=None,
+                format="html"):
     assert root is not None
     data = []
     write = data.append
-    if not encoding:
-        encoding = "utf-8"
-    qnames, namespaces = _namespaces(
-            root, encoding, default_namespace
-            )
-    _serialize_html(
-                write, root, encoding, qnames, namespaces, format
-                )
-    return "".join(data)
+    qnames, namespaces = _namespaces(root, default_namespace)
+    _serialize_html(write, root, qnames, namespaces, format)
+    if encoding is None:
+        return "".join(data)
+    else:
+        return _encode("".join(data))
+
 
 # --------------------------------------------------------------------
 # serialization support
 
-def _namespaces(elem, encoding, default_namespace=None):
+def _namespaces(elem, default_namespace=None):
     # identify namespaces used in this tree
 
     # maps qnames to *encoded* prefix:local names
@@ -222,9 +216,6 @@ def _namespaces(elem, encoding, default_namespace=None):
     if default_namespace:
         namespaces[default_namespace] = ""
 
-    def encode(text):
-        return text.encode(encoding)
-
     def add_qname(qname):
         # calculate serialized qname representation
         try:
@@ -238,17 +229,16 @@ def _namespaces(elem, encoding, default_namespace=None):
                     if prefix != "xml":
                         namespaces[uri] = prefix
                 if prefix:
-                    qnames[qname] = encode("%s:%s" % (prefix, tag))
+                    qnames[qname] = "%s:%s" % (prefix, tag)
                 else:
-                    qnames[qname] = encode(tag) # default element
+                    qnames[qname] = tag # default element
             else:
                 if default_namespace:
-                    # FIXME: can this be handled in XML 1.0?
                     raise ValueError(
                         "cannot use non-qualified names with "
                         "default_namespace option"
                         )
-                qnames[qname] = encode(qname)
+                qnames[qname] = qname
         except TypeError:
             _raise_serialization_error(qname)
 
@@ -278,8 +268,8 @@ def _namespaces(elem, encoding, default_namespace=None):
             add_qname(text.text)
     return qnames, namespaces
 
-def to_html_string(element, encoding=None):
-    return _write_html(ElementTree(element).getroot(), encoding, format="html")
+def to_html_string(element):
+    return _write_html(ElementTree(element).getroot(), format="html")
 
-def to_xhtml_string(element, encoding=None):
-    return _write_html(ElementTree(element).getroot(), encoding, format="xhtml")
+def to_xhtml_string(element):
+    return _write_html(ElementTree(element).getroot(), format="xhtml")
-- 
cgit v1.2.3