diff options
-rw-r--r-- | markdown/__init__.py | 74 | ||||
-rw-r--r-- | markdown/commandline.py | 8 | ||||
-rw-r--r-- | markdown/html4.py | 274 | ||||
-rwxr-xr-x | test-markdown.py | 7 | ||||
-rw-r--r-- | tests/html4/html4.html | 2 | ||||
-rw-r--r-- | tests/html4/html4.txt | 2 |
6 files changed, 346 insertions, 21 deletions
diff --git a/markdown/__init__.py b/markdown/__init__.py index 390329a..f1ddcde 100644 --- a/markdown/__init__.py +++ b/markdown/__init__.py @@ -44,6 +44,7 @@ version_info = (2,0,0, "beta-2") import re import codecs +import sys import logging from logging import DEBUG, INFO, WARN, ERROR, CRITICAL @@ -66,6 +67,7 @@ COMMAND_LINE_LOGGING_LEVEL = CRITICAL TAB_LENGTH = 4 # expand tabs to this many spaces ENABLE_ATTRIBUTES = True # @id = xyz -> <... id="xyz"> SMART_EMPHASIS = True # this_or_that does not become this<i>or</i>that +DEFAULT_OUTPUT_FORMAT = 'xhtml1' # xhtml or html4 output HTML_REMOVED_TEXT = "[HTML_REMOVED]" # text used instead of HTML in safe mode BLOCK_LEVEL_ELEMENTS = re.compile("p|div|h[1-6]|blockquote|pre|table|dl|ol|ul" "|script|noscript|form|fieldset|iframe|math" @@ -93,6 +95,8 @@ import odict etree = etree_loader.importETree() +# Adds the ability to output html4 +import html4 """ Constants you probably do not need to change @@ -157,7 +161,8 @@ class Markdown: def __init__(self, extensions=[], extension_configs={}, - safe_mode = False): + safe_mode = False, + output_format=DEFAULT_OUTPUT_FORMAT): """ Creates a new Markdown instance. @@ -169,6 +174,14 @@ class Markdown: as-is. * extension-configs: Configuration setting for extensions. * safe_mode: Disallow raw html. One of "remove", "replace" or "escape". + * output_format: Format of output. Supported formats are: + * "xhtml1": Outputs XHTML 1.x. Default. + * "xhtml": Outputs latest supported version of XHTML (currently XHTML 1.1). + * "html4": Outputs HTML 4 + * "html": Outputs latest supported version of HTML (currently HTML 4). + Note that it is suggested that the more specific formats ("xhtml1" + and "html4") be used as "xhtml" or "html" may change in the future + if it makes sense at that time. 
""" @@ -268,6 +281,7 @@ class Markdown: self.htmlStash = preprocessors.HtmlStash() self.registerExtensions(extensions = extensions, configs = extension_configs) + self.set_output_format(output_format) self.reset() def registerExtensions(self, extensions, configs): @@ -305,8 +319,25 @@ class Markdown: for extension in self.registeredExtensions: extension.reset() - def convert (self, source): - """Convert markdown to serialized XHTML.""" + def set_output_format(self, format): + """ Set the output format for the class instance. """ + if format.lower() in ['html', 'html4']: + self.serializer = html4.to_html_string + elif format.lower() in ['xhtml', 'xhtml1']: + self.serializer = etree.tostring + else: + message(CRITICAL, 'Invalid Output Format: "%s". Use one of "xhtml1" or "html4".' % format) + sys.exit() + + def convert(self, source): + """ + Convert markdown to serialized XHTML or HTML. + + Keyword arguments: + + * source: Source text as a Unicode string. + + """ # Fixup the source text if not source: @@ -337,19 +368,19 @@ class Markdown: root = newRoot # Serialize _properly_. Strip top-level tags. - xml, length = codecs.utf_8_decode(etree.tostring(root, encoding="utf8")) + output, length = codecs.utf_8_decode(self.serializer(root, encoding="utf8")) if self.stripTopLevelTags: - start = xml.index('<%s>'%DOC_TAG)+len(DOC_TAG)+2 - end = xml.rindex('</%s>'%DOC_TAG) - xml = xml[start:end].strip() + start = output.index('<%s>'%DOC_TAG)+len(DOC_TAG)+2 + end = output.rindex('</%s>'%DOC_TAG) + output = output[start:end].strip() # Run the text post-processors for pp in self.postprocessors.values(): - xml = pp.run(xml) + output = pp.run(output) - return xml.strip() + return output.strip() - def convertFile(self, input = None, output = None, encoding = None): + def convertFile(self, input=None, output=None, encoding=None): """Converts a markdown file and returns the HTML as a unicode string. 
Decodes the file using the provided encoding (defaults to utf-8), @@ -365,9 +396,7 @@ class Markdown: * input: Name of source text file. * output: Name of output file. Writes to stdout if `None`. - * extensions: A list of extension names (may contain config args). * encoding: Encoding of input and output files. Defaults to utf-8. - * safe_mode: Disallow raw html. One of "remove", "replace" or "escape". """ @@ -499,7 +528,8 @@ markdownFromFile(). def markdown(text, extensions = [], - safe_mode = False): + safe_mode = False, + output_format = DEFAULT_OUTPUT_FORMAT): """Convert a markdown string to HTML and return HTML as a unicode string. This is a shortcut function for `Markdown` class to cover the most @@ -511,12 +541,21 @@ def markdown(text, * text: Markdown formatted text as Unicode or ASCII string. * extensions: A list of extensions or extension names (may contain config args). * safe_mode: Disallow raw html. One of "remove", "replace" or "escape". + * output_format: Format of output. Supported formats are: + * "xhtml1": Outputs XHTML 1.x. Default. + * "xhtml": Outputs latest supported version of XHTML (currently XHTML 1.1). + * "html4": Outputs HTML 4 + * "html": Outputs latest supported version of HTML (currently HTML 4). + Note that it is suggested that the more specific formats ("xhtml1" + and "html4") be used as "xhtml" or "html" may change in the future + if it makes sense at that time. Returns: An HTML document as a string. 
""" md = Markdown(extensions=load_extensions(extensions), - safe_mode = safe_mode) + safe_mode=safe_mode, + output_format=output_format) return md.convert(text) @@ -524,9 +563,12 @@ def markdownFromFile(input = None, output = None, extensions = [], encoding = None, - safe = False): + safe_mode = False, + output_format = DEFAULT_OUTPUT_FORMAT): """Read markdown code from a file and write it to a file or a stream.""" - md = Markdown(extensions=load_extensions(extensions), safe_mode = safe) + md = Markdown(extensions=load_extensions(extensions), + safe_mode=safe_mode, + output_format=output_format) md.convertFile(input, output, encoding) diff --git a/markdown/commandline.py b/markdown/commandline.py index 68efcdb..1eedc6d 100644 --- a/markdown/commandline.py +++ b/markdown/commandline.py @@ -57,6 +57,9 @@ def parse_options(): parser.add_option("-s", "--safe", dest="safe", default=False, metavar="SAFE_MODE", help="safe mode ('replace', 'remove' or 'escape' user's HTML tag)") + parser.add_option("-o", "--output_format", dest="output_format", + default='xhtml1', metavar="OUTPUT_FORMAT", + help="Format of output. One of 'xhtml1' (default) or 'html4'.") parser.add_option("--noisy", action="store_const", const=DEBUG, dest="verbose", help="print debug messages") @@ -76,9 +79,10 @@ def parse_options(): return {'input': input_file, 'output': options.filename, - 'safe': options.safe, + 'safe_mode': options.safe, 'extensions': options.extensions, - 'encoding': options.encoding }, options.verbose + 'encoding': options.encoding, + 'output_format': options.output_format}, options.verbose def run(): """Run Markdown from the command line.""" diff --git a/markdown/html4.py b/markdown/html4.py new file mode 100644 index 0000000..08f241d --- /dev/null +++ b/markdown/html4.py @@ -0,0 +1,274 @@ +# markdown/html4.py +# +# Add html4 serialization to older versions of Elementree +# Taken from ElementTree 1.3 preview with slight modifications +# +# Copyright (c) 1999-2007 by Fredrik Lundh. 
All rights reserved. +# +# fredrik@pythonware.com +# http://www.pythonware.com +# +# -------------------------------------------------------------------- +# The ElementTree toolkit is +# +# Copyright (c) 1999-2007 by Fredrik Lundh +# +# By obtaining, using, and/or copying this software and/or its +# associated documentation, you agree that you have read, understood, +# and will comply with the following terms and conditions: +# +# Permission to use, copy, modify, and distribute this software and +# its associated documentation for any purpose and without fee is +# hereby granted, provided that the above copyright notice appears in +# all copies, and that both that copyright notice and this permission +# notice appear in supporting documentation, and that the name of +# Secret Labs AB or the author not be used in advertising or publicity +# pertaining to distribution of the software without specific, written +# prior permission. +# +# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD +# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT- +# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR +# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY +# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS +# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE +# OF THIS SOFTWARE. 
+# -------------------------------------------------------------------- + + +import markdown +ElementTree = markdown.etree.ElementTree +QName = markdown.etree.QName +Comment = markdown.etree.Comment +PI = markdown.etree.PI +ProcessingInstruction = markdown.etree.ProcessingInstruction + +HTML_EMPTY = ("area", "base", "basefont", "br", "col", "frame", "hr", + "img", "input", "isindex", "link", "meta", "param") + +try: + HTML_EMPTY = set(HTML_EMPTY) +except NameError: + pass + +_namespace_map = { + # "well-known" namespace prefixes + "http://www.w3.org/XML/1998/namespace": "xml", + "http://www.w3.org/1999/xhtml": "html", + "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf", + "http://schemas.xmlsoap.org/wsdl/": "wsdl", + # xml schema + "http://www.w3.org/2001/XMLSchema": "xs", + "http://www.w3.org/2001/XMLSchema-instance": "xsi", + # dublin core + "http://purl.org/dc/elements/1.1/": "dc", +} + + +def _raise_serialization_error(text): + raise TypeError( + "cannot serialize %r (type %s)" % (text, type(text).__name__) + ) + +def _encode(text, encoding): + try: + return text.encode(encoding, "xmlcharrefreplace") + except (TypeError, AttributeError): + _raise_serialization_error(text) + +def _escape_cdata(text, encoding): + # escape character data + try: + # it's worth avoiding do-nothing calls for strings that are + # shorter than 500 characters, or so. assume that's, by far, + # the most common case in most applications. 
+ if "&" in text: + text = text.replace("&", "&amp;") + if "<" in text: + text = text.replace("<", "&lt;") + if ">" in text: + text = text.replace(">", "&gt;") + return text.encode(encoding, "xmlcharrefreplace") + except (TypeError, AttributeError): + _raise_serialization_error(text) + + +def _escape_attrib(text, encoding): + # escape attribute value + try: + if "&" in text: + text = text.replace("&", "&amp;") + if "<" in text: + text = text.replace("<", "&lt;") + if ">" in text: + text = text.replace(">", "&gt;") + if "\"" in text: + text = text.replace("\"", "&quot;") + if "\n" in text: + text = text.replace("\n", "&#10;") + return text.encode(encoding, "xmlcharrefreplace") + except (TypeError, AttributeError): + _raise_serialization_error(text) + +def _escape_attrib_html(text, encoding): + # escape attribute value + try: + if "&" in text: + text = text.replace("&", "&amp;") + if ">" in text: + text = text.replace(">", "&gt;") + if "\"" in text: + text = text.replace("\"", "&quot;") + return text.encode(encoding, "xmlcharrefreplace") + except (TypeError, AttributeError): + _raise_serialization_error(text) + + +def _serialize_html(write, elem, encoding, qnames, namespaces): + tag = elem.tag + text = elem.text + if tag is Comment: + write("<!--%s-->" % _escape_cdata(text, encoding)) + elif tag is ProcessingInstruction: + write("<?%s?>" % _escape_cdata(text, encoding)) + else: + tag = qnames[tag] + if tag is None: + if text: + write(_escape_cdata(text, encoding)) + for e in elem: + _serialize_html(write, e, encoding, qnames, None) + else: + write("<" + tag) + items = elem.items() + if items or namespaces: + items.sort() # lexical order + for k, v in items: + if isinstance(k, QName): + k = k.text + if isinstance(v, QName): + v = qnames[v.text] + else: + v = _escape_attrib_html(v, encoding) + # FIXME: handle boolean attributes + write(" %s=\"%s\"" % (qnames[k], v)) + if namespaces: + items = namespaces.items() + items.sort(key=lambda x: x[1]) # sort on prefix + for v, k in items: + if k: + k = ":" + k + 
write(" xmlns%s=\"%s\"" % ( + k.encode(encoding), + _escape_attrib(v, encoding) + )) + write(">") + tag = tag.lower() + if text: + if tag == "script" or tag == "style": + write(_encode(text, encoding)) + else: + write(_escape_cdata(text, encoding)) + for e in elem: + _serialize_html(write, e, encoding, qnames, None) + if tag not in HTML_EMPTY: + write("</" + tag + ">") + if elem.tail: + write(_escape_cdata(elem.tail, encoding)) + +def write_html(root, f, + # keyword arguments + encoding="us-ascii", + default_namespace=None): + assert root is not None + if not hasattr(f, "write"): + f = open(f, "wb") + write = f.write + if not encoding: + encoding = "us-ascii" + qnames, namespaces = _namespaces( + root, encoding, default_namespace + ) + _serialize_html( + write, root, encoding, qnames, namespaces + ) + +# -------------------------------------------------------------------- +# serialization support + +def _namespaces(elem, encoding, default_namespace=None): + # identify namespaces used in this tree + + # maps qnames to *encoded* prefix:local names + qnames = {None: None} + + # maps uri:s to prefixes + namespaces = {} + if default_namespace: + namespaces[default_namespace] = "" + + def encode(text): + return text.encode(encoding) + + def add_qname(qname): + # calculate serialized qname representation + try: + if qname[:1] == "{": + uri, tag = qname[1:].split("}", 1) + prefix = namespaces.get(uri) + if prefix is None: + prefix = _namespace_map.get(uri) + if prefix is None: + prefix = "ns%d" % len(namespaces) + if prefix != "xml": + namespaces[uri] = prefix + if prefix: + qnames[qname] = encode("%s:%s" % (prefix, tag)) + else: + qnames[qname] = encode(tag) # default element + else: + if default_namespace: + # FIXME: can this be handled in XML 1.0? 
+ raise ValueError( + "cannot use non-qualified names with " + "default_namespace option" + ) + qnames[qname] = encode(qname) + except TypeError: + _raise_serialization_error(qname) + + # populate qname and namespaces table + try: + iterate = elem.iter + except AttributeError: + iterate = elem.getiterator # cET compatibility + for elem in iterate(): + tag = elem.tag + if isinstance(tag, QName) and tag.text not in qnames: + add_qname(tag.text) + elif isinstance(tag, basestring): + if tag not in qnames: + add_qname(tag) + elif tag is not None and tag is not Comment and tag is not PI: + _raise_serialization_error(tag) + for key, value in elem.items(): + if isinstance(key, QName): + key = key.text + if key not in qnames: + add_qname(key) + if isinstance(value, QName) and value.text not in qnames: + add_qname(value.text) + text = elem.text + if isinstance(text, QName) and text.text not in qnames: + add_qname(text.text) + return qnames, namespaces + +def to_html_string(element, encoding=None): + class dummy: + pass + data = [] + file = dummy() + file.write = data.append + write_html(ElementTree(element).getroot(),file,encoding) + return "".join(data) diff --git a/test-markdown.py b/test-markdown.py index f3be36e..95914c4 100755 --- a/test-markdown.py +++ b/test-markdown.py @@ -160,7 +160,7 @@ class TestRunner : if not os.path.exists(TMP_DIR): os.mkdir(TMP_DIR) - def test_directory(self, dir, measure_time=False, safe_mode=False, encoding = "utf8") : + def test_directory(self, dir, measure_time=False, safe_mode=False, encoding="utf8", output_format='xhtml1') : self.encoding = encoding benchmark_file_name = os.path.join(dir, "benchmark.dat") self.saved_benchmarks = {} @@ -187,7 +187,7 @@ class TestRunner : mem = memory() start = time.clock() - self.md = markdown.Markdown(extensions=extensions, safe_mode = safe_mode) + self.md = markdown.Markdown(extensions=extensions, safe_mode = safe_mode, output_format=output_format) construction_time = time.clock() - start 
construction_mem = memory(mem) @@ -228,7 +228,7 @@ class TestRunner : #################### - def run_test(self, dir, test, repeat) : + def run_test(self, dir, test, repeat): print "--- %s ---" % test self.html_diff_file.write("<tr><td>%s</td>" % test) @@ -324,6 +324,7 @@ def run_tests() : tester.test_directory("tests/extensions-x-toc") tester.test_directory("tests/extensions-x-def_list") tester.test_directory("tests/extensions-x-abbr") + tester.test_directory("tests/html4", output_format='html4') try: import pygments diff --git a/tests/html4/html4.html b/tests/html4/html4.html new file mode 100644 index 0000000..7c88ad7 --- /dev/null +++ b/tests/html4/html4.html @@ -0,0 +1,2 @@ +<p>A test of the most<br> +basic of html/xhtml differences.</p>
\ No newline at end of file diff --git a/tests/html4/html4.txt b/tests/html4/html4.txt new file mode 100644 index 0000000..fddaf8e --- /dev/null +++ b/tests/html4/html4.txt @@ -0,0 +1,2 @@ +A test of the most +basic of html/xhtml differences.
\ No newline at end of file |