From 1a42fa8602a99e0a8807f517ac44ecda77d86c22 Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Thu, 16 Jun 2011 08:31:02 -0400 Subject: Added our own xhtml searializer. We no longer use a xml searializer to output xhtml. This fixes #9 among other bugs. The test suite even had bad tests that should have been failing. They also have been corrected. --- markdown/__init__.py | 10 +- markdown/html4.py | 277 ----------------------------------- markdown/searializers.py | 292 +++++++++++++++++++++++++++++++++++++ tests/extensions/extra/tables.html | 12 +- tests/misc/attributes2.html | 6 +- tests/misc/blank-block-quote.html | 2 +- tests/test_apis.py | 12 +- 7 files changed, 313 insertions(+), 298 deletions(-) delete mode 100644 markdown/html4.py create mode 100644 markdown/searializers.py diff --git a/markdown/__init__.py b/markdown/__init__.py index 9b964b7..762da31 100644 --- a/markdown/__init__.py +++ b/markdown/__init__.py @@ -43,7 +43,7 @@ from treeprocessors import build_treeprocessors from inlinepatterns import build_inlinepatterns from postprocessors import build_postprocessors from extensions import Extension -import html4 +from searializers import to_html_string, to_xhtml_string # For backwards compatibility in the 2.0.x series # The things defined in these modules started off in __init__.py so third @@ -67,10 +67,10 @@ class Markdown: } output_formats = { - 'html' : html4.to_html_string, - 'html4' : html4.to_html_string, - 'xhtml' : util.etree.tostring, - 'xhtml1': util.etree.tostring, + 'html' : to_html_string, + 'html4' : to_html_string, + 'xhtml' : to_xhtml_string, + 'xhtml1': to_xhtml_string, } def __init__(self, extensions=[], **kwargs): diff --git a/markdown/html4.py b/markdown/html4.py deleted file mode 100644 index 611e628..0000000 --- a/markdown/html4.py +++ /dev/null @@ -1,277 +0,0 @@ -# markdown/html4.py -# -# Add html4 serialization to older versions of Elementree -# Taken from ElementTree 1.3 preview with slight modifications -# -# Copyright (c) 1999-2007 by Fredrik Lundh. All rights reserved. -# -# fredrik@pythonware.com -# http://www.pythonware.com -# -# -------------------------------------------------------------------- -# The ElementTree toolkit is -# -# Copyright (c) 1999-2007 by Fredrik Lundh -# -# By obtaining, using, and/or copying this software and/or its -# associated documentation, you agree that you have read, understood, -# and will comply with the following terms and conditions: -# -# Permission to use, copy, modify, and distribute this software and -# its associated documentation for any purpose and without fee is -# hereby granted, provided that the above copyright notice appears in -# all copies, and that both that copyright notice and this permission -# notice appear in supporting documentation, and that the name of -# Secret Labs AB or the author not be used in advertising or publicity -# pertaining to distribution of the software without specific, written -# prior permission. -# -# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD -# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT- -# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR -# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY -# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, -# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS -# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE -# OF THIS SOFTWARE. -# -------------------------------------------------------------------- - - -import util -ElementTree = util.etree.ElementTree -QName = util.etree.QName -if hasattr(util.etree, 'test_comment'): - Comment = util.etree.test_comment -else: - Comment = util.etree.Comment -PI = util.etree.PI -ProcessingInstruction = util.etree.ProcessingInstruction - -HTML_EMPTY = ("area", "base", "basefont", "br", "col", "frame", "hr", - "img", "input", "isindex", "link", "meta" "param") - -try: - HTML_EMPTY = set(HTML_EMPTY) -except NameError: - pass - -_namespace_map = { - # "well-known" namespace prefixes - "http://www.w3.org/XML/1998/namespace": "xml", - "http://www.w3.org/1999/xhtml": "html", - "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf", - "http://schemas.xmlsoap.org/wsdl/": "wsdl", - # xml schema - "http://www.w3.org/2001/XMLSchema": "xs", - "http://www.w3.org/2001/XMLSchema-instance": "xsi", - # dublic core - "http://purl.org/dc/elements/1.1/": "dc", -} - - -def _raise_serialization_error(text): - raise TypeError( - "cannot serialize %r (type %s)" % (text, type(text).__name__) - ) - -def _encode(text, encoding): - try: - return text.encode(encoding, "xmlcharrefreplace") - except (TypeError, AttributeError): - _raise_serialization_error(text) - -def _escape_cdata(text, encoding): - # escape character data - try: - # it's worth avoiding do-nothing calls for strings that are - # shorter than 500 character, or so. assume that's, by far, - # the most common case in most applications. - if "&" in text: - text = text.replace("&", "&") - if "<" in text: - text = text.replace("<", "<") - if ">" in text: - text = text.replace(">", ">") - return text.encode(encoding, "xmlcharrefreplace") - except (TypeError, AttributeError): - _raise_serialization_error(text) - - -def _escape_attrib(text, encoding): - # escape attribute value - try: - if "&" in text: - text = text.replace("&", "&") - if "<" in text: - text = text.replace("<", "<") - if ">" in text: - text = text.replace(">", ">") - if "\"" in text: - text = text.replace("\"", """) - if "\n" in text: - text = text.replace("\n", " ") - return text.encode(encoding, "xmlcharrefreplace") - except (TypeError, AttributeError): - _raise_serialization_error(text) - -def _escape_attrib_html(text, encoding): - # escape attribute value - try: - if "&" in text: - text = text.replace("&", "&") - if ">" in text: - text = text.replace(">", ">") - if "\"" in text: - text = text.replace("\"", """) - return text.encode(encoding, "xmlcharrefreplace") - except (TypeError, AttributeError): - _raise_serialization_error(text) - - -def _serialize_html(write, elem, encoding, qnames, namespaces): - tag = elem.tag - text = elem.text - if tag is Comment: - write("" % _escape_cdata(text, encoding)) - elif tag is ProcessingInstruction: - write("" % _escape_cdata(text, encoding)) - else: - tag = qnames[tag] - if tag is None: - if text: - write(_escape_cdata(text, encoding)) - for e in elem: - _serialize_html(write, e, encoding, qnames, None) - else: - write("<" + tag) - items = elem.items() - if items or namespaces: - items.sort() # lexical order - for k, v in items: - if isinstance(k, QName): - k = k.text - if isinstance(v, QName): - v = qnames[v.text] - else: - v = _escape_attrib_html(v, encoding) - # FIXME: handle boolean attributes - write(" %s=\"%s\"" % (qnames[k], v)) - if namespaces: - items = namespaces.items() - items.sort(key=lambda x: x[1]) # sort on prefix - for v, k in items: - if k: - k = ":" + k - write(" xmlns%s=\"%s\"" % ( - k.encode(encoding), - _escape_attrib(v, encoding) - )) - write(">") - tag = tag.lower() - if text: - if tag == "script" or tag == "style": - write(_encode(text, encoding)) - else: - write(_escape_cdata(text, encoding)) - for e in elem: - _serialize_html(write, e, encoding, qnames, None) - if tag not in HTML_EMPTY: - write("") - if elem.tail: - write(_escape_cdata(elem.tail, encoding)) - -def write_html(root, f, - # keyword arguments - encoding="us-ascii", - default_namespace=None): - assert root is not None - if not hasattr(f, "write"): - f = open(f, "wb") - write = f.write - if not encoding: - encoding = "us-ascii" - qnames, namespaces = _namespaces( - root, encoding, default_namespace - ) - _serialize_html( - write, root, encoding, qnames, namespaces - ) - -# -------------------------------------------------------------------- -# serialization support - -def _namespaces(elem, encoding, default_namespace=None): - # identify namespaces used in this tree - - # maps qnames to *encoded* prefix:local names - qnames = {None: None} - - # maps uri:s to prefixes - namespaces = {} - if default_namespace: - namespaces[default_namespace] = "" - - def encode(text): - return text.encode(encoding) - - def add_qname(qname): - # calculate serialized qname representation - try: - if qname[:1] == "{": - uri, tag = qname[1:].split("}", 1) - prefix = namespaces.get(uri) - if prefix is None: - prefix = _namespace_map.get(uri) - if prefix is None: - prefix = "ns%d" % len(namespaces) - if prefix != "xml": - namespaces[uri] = prefix - if prefix: - qnames[qname] = encode("%s:%s" % (prefix, tag)) - else: - qnames[qname] = encode(tag) # default element - else: - if default_namespace: - # FIXME: can this be handled in XML 1.0? - raise ValueError( - "cannot use non-qualified names with " - "default_namespace option" - ) - qnames[qname] = encode(qname) - except TypeError: - _raise_serialization_error(qname) - - # populate qname and namespaces table - try: - iterate = elem.iter - except AttributeError: - iterate = elem.getiterator # cET compatibility - for elem in iterate(): - tag = elem.tag - if isinstance(tag, QName) and tag.text not in qnames: - add_qname(tag.text) - elif isinstance(tag, basestring): - if tag not in qnames: - add_qname(tag) - elif tag is not None and tag is not Comment and tag is not PI: - _raise_serialization_error(tag) - for key, value in elem.items(): - if isinstance(key, QName): - key = key.text - if key not in qnames: - add_qname(key) - if isinstance(value, QName) and value.text not in qnames: - add_qname(value.text) - text = elem.text - if isinstance(text, QName) and text.text not in qnames: - add_qname(text.text) - return qnames, namespaces - -def to_html_string(element, encoding=None): - class dummy: - pass - data = [] - file = dummy() - file.write = data.append - write_html(ElementTree(element).getroot(),file,encoding) - return "".join(data) diff --git a/markdown/searializers.py b/markdown/searializers.py new file mode 100644 index 0000000..39dcb56 --- /dev/null +++ b/markdown/searializers.py @@ -0,0 +1,292 @@ +# markdown/html4.py +# +# Add html4 serialization to older versions of Elementree +# Taken from ElementTree 1.3 preview with slight modifications +# +# Copyright (c) 1999-2007 by Fredrik Lundh. All rights reserved. +# +# fredrik@pythonware.com +# http://www.pythonware.com +# +# -------------------------------------------------------------------- +# The ElementTree toolkit is +# +# Copyright (c) 1999-2007 by Fredrik Lundh +# +# By obtaining, using, and/or copying this software and/or its +# associated documentation, you agree that you have read, understood, +# and will comply with the following terms and conditions: +# +# Permission to use, copy, modify, and distribute this software and +# its associated documentation for any purpose and without fee is +# hereby granted, provided that the above copyright notice appears in +# all copies, and that both that copyright notice and this permission +# notice appear in supporting documentation, and that the name of +# Secret Labs AB or the author not be used in advertising or publicity +# pertaining to distribution of the software without specific, written +# prior permission. +# +# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD +# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT- +# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR +# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY +# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS +# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE +# OF THIS SOFTWARE. +# -------------------------------------------------------------------- + + +import util +ElementTree = util.etree.ElementTree +QName = util.etree.QName +if hasattr(util.etree, 'test_comment'): + Comment = util.etree.test_comment +else: + Comment = util.etree.Comment +PI = util.etree.PI +ProcessingInstruction = util.etree.ProcessingInstruction + +HTML_EMPTY = ("area", "base", "basefont", "br", "col", "frame", "hr", + "img", "input", "isindex", "link", "meta" "param") + +try: + HTML_EMPTY = set(HTML_EMPTY) +except NameError: + pass + +_namespace_map = { + # "well-known" namespace prefixes + "http://www.w3.org/XML/1998/namespace": "xml", + "http://www.w3.org/1999/xhtml": "html", + "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf", + "http://schemas.xmlsoap.org/wsdl/": "wsdl", + # xml schema + "http://www.w3.org/2001/XMLSchema": "xs", + "http://www.w3.org/2001/XMLSchema-instance": "xsi", + # dublic core + "http://purl.org/dc/elements/1.1/": "dc", +} + + +def _raise_serialization_error(text): + raise TypeError( + "cannot serialize %r (type %s)" % (text, type(text).__name__) + ) + +def _encode(text, encoding): + try: + return text.encode(encoding, "xmlcharrefreplace") + except (TypeError, AttributeError): + _raise_serialization_error(text) + +def _escape_cdata(text, encoding): + # escape character data + try: + # it's worth avoiding do-nothing calls for strings that are + # shorter than 500 character, or so. assume that's, by far, + # the most common case in most applications. + if "&" in text: + text = text.replace("&", "&") + if "<" in text: + text = text.replace("<", "<") + if ">" in text: + text = text.replace(">", ">") + return text.encode(encoding, "xmlcharrefreplace") + except (TypeError, AttributeError): + _raise_serialization_error(text) + + +def _escape_attrib(text, encoding): + # escape attribute value + try: + if "&" in text: + text = text.replace("&", "&") + if "<" in text: + text = text.replace("<", "<") + if ">" in text: + text = text.replace(">", ">") + if "\"" in text: + text = text.replace("\"", """) + if "\n" in text: + text = text.replace("\n", " ") + return text.encode(encoding, "xmlcharrefreplace") + except (TypeError, AttributeError): + _raise_serialization_error(text) + +def _escape_attrib_html(text, encoding): + # escape attribute value + try: + if "&" in text: + text = text.replace("&", "&") + if "<" in text: + text = text.replace("<", "<") + if ">" in text: + text = text.replace(">", ">") + if "\"" in text: + text = text.replace("\"", """) + return text.encode(encoding, "xmlcharrefreplace") + except (TypeError, AttributeError): + _raise_serialization_error(text) + + +def _serialize_html(write, elem, encoding, qnames, namespaces, format): + tag = elem.tag + text = elem.text + if tag is Comment: + write("" % _escape_cdata(text, encoding)) + elif tag is ProcessingInstruction: + write("" % _escape_cdata(text, encoding)) + else: + tag = qnames[tag] + if tag is None: + if text: + write(_escape_cdata(text, encoding)) + for e in elem: + _serialize_html(write, e, encoding, qnames, None, format) + else: + write("<" + tag) + items = elem.items() + if items or namespaces: + items.sort() # lexical order + for k, v in items: + if isinstance(k, QName): + k = k.text + if isinstance(v, QName): + v = qnames[v.text] + else: + v = _escape_attrib_html(v, encoding) + # FIXME: handle boolean attributes + write(" %s=\"%s\"" % (qnames[k], v)) + if namespaces: + items = namespaces.items() + items.sort(key=lambda x: x[1]) # sort on prefix + for v, k in items: + if k: + k = ":" + k + write(" xmlns%s=\"%s\"" % ( + k.encode(encoding), + _escape_attrib(v, encoding) + )) + if format == "xhtml" and tag in HTML_EMPTY: + write(" />") + else: + write(">") + tag = tag.lower() + if text: + if tag == "script" or tag == "style": + write(_encode(text, encoding)) + else: + write(_escape_cdata(text, encoding)) + for e in elem: + _serialize_html(write, e, encoding, qnames, None, format) + if tag not in HTML_EMPTY: + write("") + if elem.tail: + write(_escape_cdata(elem.tail, encoding)) + +def write_html(root, f, + # keyword arguments + encoding="us-ascii", + default_namespace=None, + format="html"): + assert root is not None + if not hasattr(f, "write"): + f = open(f, "wb") + write = f.write + if not encoding: + encoding = "us-ascii" + qnames, namespaces = _namespaces( + root, encoding, default_namespace + ) + _serialize_html( + write, root, encoding, qnames, namespaces, format + ) + +# -------------------------------------------------------------------- +# serialization support + +def _namespaces(elem, encoding, default_namespace=None): + # identify namespaces used in this tree + + # maps qnames to *encoded* prefix:local names + qnames = {None: None} + + # maps uri:s to prefixes + namespaces = {} + if default_namespace: + namespaces[default_namespace] = "" + + def encode(text): + return text.encode(encoding) + + def add_qname(qname): + # calculate serialized qname representation + try: + if qname[:1] == "{": + uri, tag = qname[1:].split("}", 1) + prefix = namespaces.get(uri) + if prefix is None: + prefix = _namespace_map.get(uri) + if prefix is None: + prefix = "ns%d" % len(namespaces) + if prefix != "xml": + namespaces[uri] = prefix + if prefix: + qnames[qname] = encode("%s:%s" % (prefix, tag)) + else: + qnames[qname] = encode(tag) # default element + else: + if default_namespace: + # FIXME: can this be handled in XML 1.0? + raise ValueError( + "cannot use non-qualified names with " + "default_namespace option" + ) + qnames[qname] = encode(qname) + except TypeError: + _raise_serialization_error(qname) + + # populate qname and namespaces table + try: + iterate = elem.iter + except AttributeError: + iterate = elem.getiterator # cET compatibility + for elem in iterate(): + tag = elem.tag + if isinstance(tag, QName) and tag.text not in qnames: + add_qname(tag.text) + elif isinstance(tag, basestring): + if tag not in qnames: + add_qname(tag) + elif tag is not None and tag is not Comment and tag is not PI: + _raise_serialization_error(tag) + for key, value in elem.items(): + if isinstance(key, QName): + key = key.text + if key not in qnames: + add_qname(key) + if isinstance(value, QName) and value.text not in qnames: + add_qname(value.text) + text = elem.text + if isinstance(text, QName) and text.text not in qnames: + add_qname(text.text) + return qnames, namespaces + +def to_html_string(element, encoding=None): + class dummy: + pass + data = [] + file = dummy() + file.write = data.append + write_html(ElementTree(element).getroot(), file, encoding, format="html") + return "".join(data) + +def to_xhtml_string(element, encoding=None): + class dummy: + pass + data = [] + file = dummy() + file.write = data.append + write_html(ElementTree(element).getroot(), file, encoding, format="xhtml") + return "".join(data) diff --git a/tests/extensions/extra/tables.html b/tests/extensions/extra/tables.html index c931e6a..1d626da 100644 --- a/tests/extensions/extra/tables.html +++ b/tests/extensions/extra/tables.html @@ -85,13 +85,13 @@ - + Q - + W - + W @@ -106,13 +106,13 @@ - + Q - + W - + W diff --git a/tests/misc/attributes2.html b/tests/misc/attributes2.html index 5971cc8..b78fee0 100644 --- a/tests/misc/attributes2.html +++ b/tests/misc/attributes2.html @@ -1,6 +1,6 @@ -

+

Or in the middle of the text

-

\ No newline at end of file +

\ No newline at end of file diff --git a/tests/misc/blank-block-quote.html b/tests/misc/blank-block-quote.html index 23df17a..966078c 100644 --- a/tests/misc/blank-block-quote.html +++ b/tests/misc/blank-block-quote.html @@ -1,3 +1,3 @@

aaaaaaaaaaa

-
+

bbbbbbbbbbb

\ No newline at end of file diff --git a/tests/test_apis.py b/tests/test_apis.py index eecfbf6..0de897a 100644 --- a/tests/test_apis.py +++ b/tests/test_apis.py @@ -277,7 +277,7 @@ class testETreeComments(unittest.TestCase): Test that ElementTree Comments work. These tests should only be a concern when using cElementTree with third - party serializers (including markdown's html4 serializer). While markdown + party serializers (including markdown's (x)html serializer). While markdown doesn't use ElementTree.Comment itself, we should certainly support any third party extensions which may. Therefore, these tests are included to ensure such support is maintained. @@ -301,14 +301,14 @@ class testETreeComments(unittest.TestCase): def testCommentSerialization(self): """ Test that an ElementTree Comment serializes properly. """ - self.assertEqual(markdown.html4.to_html_string(self.comment), + self.assertEqual(markdown.searializers.to_html_string(self.comment), '') def testCommentPrettify(self): """ Test that an ElementTree Comment is prettified properly. """ pretty = markdown.treeprocessors.PrettifyTreeprocessor() pretty.run(self.comment) - self.assertEqual(markdown.html4.to_html_string(self.comment), + self.assertEqual(markdown.searializers.to_html_string(self.comment), '\n') @@ -325,7 +325,7 @@ class testAtomicString(unittest.TestCase): p = markdown.util.etree.SubElement(tree, 'p') p.text = u'some *text*' new = self.inlineprocessor.run(tree) - self.assertEqual(markdown.html4.to_html_string(new), + self.assertEqual(markdown.searializers.to_html_string(new), '

some text

') def testSimpleAtomicString(self): @@ -334,7 +334,7 @@ class testAtomicString(unittest.TestCase): p = markdown.util.etree.SubElement(tree, 'p') p.text = markdown.util.AtomicString(u'some *text*') new = self.inlineprocessor.run(tree) - self.assertEqual(markdown.html4.to_html_string(new), + self.assertEqual(markdown.searializers.to_html_string(new), '

some *text*

') def testNestedAtomicString(self): @@ -352,6 +352,6 @@ class testAtomicString(unittest.TestCase): span2.tail = markdown.util.AtomicString(u' *test*') span1.tail = markdown.util.AtomicString(u' *with*') new = self.inlineprocessor.run(tree) - self.assertEqual(markdown.html4.to_html_string(new), + self.assertEqual(markdown.searializers.to_html_string(new), '

*some* *more* *text* *here* *to* *test* *with*

') -- cgit v1.2.3