Added optional HTML 4 output. Available formats currently include XHTML 1 and HTML 4.

Thanks to Eric Abrahamsen for doing the legwork and providing an initial working patch. And thanks to Fredrik Lundh for allowing us to include his html4 serializer from the ElementTree 1.3 preview.
author: Eric Abrahamsen <girzel@gmail.com> 2009-01-28 13:44:39 -0500
committer: Waylan Limberg <waylan@gmail.com> 2009-01-28 13:52:44 -0500
commit: c89c1263798eaedffa09077819e769b019801556 (patch)
tree: ed9023a9338a04a115c13aab41add006a09fd36b
parent: 94c7c29f4e766032520ffac8a080c3bdba6c4da4 (diff)
download: markdown-c89c1263798eaedffa09077819e769b019801556.tar.gz
markdown-c89c1263798eaedffa09077819e769b019801556.tar.bz2
markdown-c89c1263798eaedffa09077819e769b019801556.zip
6 files changed, 346 insertions, 21 deletions
diff --git a/markdown/__init__.py b/markdown/__init__.py
index 390329a..f1ddcde 100644
--- a/markdown/__init__.py
+++ b/markdown/__init__.py
@@ -44,6 +44,7 @@ version_info = (2,0,0, "beta-2")
 
 import re
 import codecs
+import sys
 import logging
 from logging import DEBUG, INFO, WARN, ERROR, CRITICAL
 
@@ -66,6 +67,7 @@ COMMAND_LINE_LOGGING_LEVEL = CRITICAL
 TAB_LENGTH = 4               # expand tabs to this many spaces
 ENABLE_ATTRIBUTES = True     # @id = xyz -> <... id="xyz">
 SMART_EMPHASIS = True        # this_or_that does not become this<i>or</i>that
+DEFAULT_OUTPUT_FORMAT = 'xhtml1'     # xhtml or html4 output
 HTML_REMOVED_TEXT = "[HTML_REMOVED]" # text used instead of HTML in safe mode
 BLOCK_LEVEL_ELEMENTS = re.compile("p|div|h[1-6]|blockquote|pre|table|dl|ol|ul"
                                   "|script|noscript|form|fieldset|iframe|math"
@@ -93,6 +95,8 @@ import odict
 
 etree = etree_loader.importETree()
 
+# Adds the ability to output html4
+import html4
 
 """
 Constants you probably do not need to change
@@ -157,7 +161,8 @@ class Markdown:
     def __init__(self,
                  extensions=[],
                  extension_configs={},
-                 safe_mode = False):
+                 safe_mode = False, 
+                 output_format=DEFAULT_OUTPUT_FORMAT):
         """
         Creates a new Markdown instance.
 
@@ -169,6 +174,14 @@ class Markdown:
            as-is.
         * extension-configs: Configuration setting for extensions.
         * safe_mode: Disallow raw html. One of "remove", "replace" or "escape".
+        * output_format: Format of output. Supported formats are:
+            * "xhtml1": Outputs XHTML 1.x. Default.
+            * "xhtml": Outputs latest supported version of XHTML (currently XHTML 1.1).
+            * "html4": Outputs HTML 4
+            * "html": Outputs latest supported version of HTML (currently HTML 4).
+            Note that it is suggested that the more specific formats ("xhtml1" 
+            and "html4") be used as "xhtml" or "html" may change in the future
+            if it makes sense at that time. 
 
         """
         
@@ -268,6 +281,7 @@ class Markdown:
         self.htmlStash = preprocessors.HtmlStash()
         self.registerExtensions(extensions = extensions,
                                 configs = extension_configs)
+        self.set_output_format(output_format)
         self.reset()
 
     def registerExtensions(self, extensions, configs):
@@ -305,8 +319,25 @@ class Markdown:
         for extension in self.registeredExtensions:
             extension.reset()
 
-    def convert (self, source):
-        """Convert markdown to serialized XHTML."""
+    def set_output_format(self, format):
+        """ Set the output format for the class instance. """
+        if format.lower() in ['html', 'html4']:
+            self.serializer = html4.to_html_string
+        elif format.lower() in ['xhtml', 'xhtml1']:
+            self.serializer = etree.tostring
+        else:
+            message(CRITICAL, 'Invalid Output Format: "%s". Use one of "xhtml1" or "html4".' % format)
+            sys.exit()
+
+    def convert(self, source):
+        """
+        Convert markdown to serialized XHTML or HTML.
+
+        Keyword arguments:
+
+        * source: Source text as a Unicode string.
+
+        """
 
         # Fixup the source text
         if not source:
@@ -337,19 +368,19 @@ class Markdown:
                 root = newRoot
 
         # Serialize _properly_.  Strip top-level tags.
-        xml, length = codecs.utf_8_decode(etree.tostring(root, encoding="utf8"))
+        output, length = codecs.utf_8_decode(self.serializer(root, encoding="utf8"))
         if self.stripTopLevelTags:
-            start = xml.index('<%s>'%DOC_TAG)+len(DOC_TAG)+2
-            end = xml.rindex('</%s>'%DOC_TAG)
-            xml = xml[start:end].strip()
+            start = output.index('<%s>'%DOC_TAG)+len(DOC_TAG)+2
+            end = output.rindex('</%s>'%DOC_TAG)
+            output = output[start:end].strip()
 
         # Run the text post-processors
         for pp in self.postprocessors.values():
-            xml = pp.run(xml)
+            output = pp.run(output)
 
-        return xml.strip()
+        return output.strip()
 
-    def convertFile(self, input = None, output = None, encoding = None):
+    def convertFile(self, input=None, output=None, encoding=None):
         """Converts a markdown file and returns the HTML as a unicode string.
 
         Decodes the file using the provided encoding (defaults to utf-8),
@@ -365,9 +396,7 @@ class Markdown:
 
         * input: Name of source text file.
         * output: Name of output file. Writes to stdout if `None`.
-        * extensions: A list of extension names (may contain config args).
         * encoding: Encoding of input and output files. Defaults to utf-8.
-        * safe_mode: Disallow raw html. One of "remove", "replace" or "escape".
 
         """
 
@@ -499,7 +528,8 @@ markdownFromFile().
 
 def markdown(text,
              extensions = [],
-             safe_mode = False):
+             safe_mode = False,
+             output_format = DEFAULT_OUTPUT_FORMAT):
     """Convert a markdown string to HTML and return HTML as a unicode string.
 
     This is a shortcut function for `Markdown` class to cover the most
@@ -511,12 +541,21 @@ def markdown(text,
     * text: Markdown formatted text as Unicode or ASCII string.
     * extensions: A list of extensions or extension names (may contain config args).
     * safe_mode: Disallow raw html.  One of "remove", "replace" or "escape".
+    * output_format: Format of output. Supported formats are:
+        * "xhtml1": Outputs XHTML 1.x. Default.
+        * "xhtml": Outputs latest supported version of XHTML (currently XHTML 1.1).
+        * "html4": Outputs HTML 4
+        * "html": Outputs latest supported version of HTML (currently HTML 4).
+        Note that it is suggested that the more specific formats ("xhtml1" 
+        and "html4") be used as "xhtml" or "html" may change in the future
+        if it makes sense at that time. 
 
     Returns: An HTML document as a string.
 
     """
     md = Markdown(extensions=load_extensions(extensions),
-                  safe_mode = safe_mode)
+                  safe_mode=safe_mode, 
+                  output_format=output_format)
     return md.convert(text)
 
 
@@ -524,9 +563,12 @@ def markdownFromFile(input = None,
                      output = None,
                      extensions = [],
                      encoding = None,
-                     safe = False):
+                     safe_mode = False,
+                     output_format = DEFAULT_OUTPUT_FORMAT):
     """Read markdown code from a file and write it to a file or a stream."""
-    md = Markdown(extensions=load_extensions(extensions), safe_mode = safe)
+    md = Markdown(extensions=load_extensions(extensions), 
+                  safe_mode=safe_mode,
+                  output_format=output_format)
     md.convertFile(input, output, encoding)
 
 
diff --git a/markdown/commandline.py b/markdown/commandline.py
index 68efcdb..1eedc6d 100644
--- a/markdown/commandline.py
+++ b/markdown/commandline.py
@@ -57,6 +57,9 @@ def parse_options():
     parser.add_option("-s", "--safe", dest="safe", default=False,
                       metavar="SAFE_MODE",
                       help="safe mode ('replace', 'remove' or 'escape'  user's HTML tag)")
+    parser.add_option("-o", "--output_format", dest="output_format", 
+                      default='xhtml1', metavar="OUTPUT_FORMAT",
+                      help="Format of output. One of 'xhtml1' (default) or 'html4'.")
     parser.add_option("--noisy",
                       action="store_const", const=DEBUG, dest="verbose",
                       help="print debug messages")
@@ -76,9 +79,10 @@ def parse_options():
 
     return {'input': input_file,
             'output': options.filename,
-            'safe': options.safe,
+            'safe_mode': options.safe,
             'extensions': options.extensions,
-            'encoding': options.encoding }, options.verbose
+            'encoding': options.encoding,
+            'output_format': options.output_format}, options.verbose
 
 def run():
     """Run Markdown from the command line."""
diff --git a/markdown/html4.py b/markdown/html4.py
new file mode 100644
index 0000000..08f241d
--- /dev/null
+++ b/markdown/html4.py
@@ -0,0 +1,274 @@
+# markdown/html4.py
+#
+# Add html4 serialization to older versions of ElementTree
+# Taken from ElementTree 1.3 preview with slight modifications
+#
+# Copyright (c) 1999-2007 by Fredrik Lundh.  All rights reserved.
+#
+# fredrik@pythonware.com
+# http://www.pythonware.com
+#
+# --------------------------------------------------------------------
+# The ElementTree toolkit is
+#
+# Copyright (c) 1999-2007 by Fredrik Lundh
+#
+# By obtaining, using, and/or copying this software and/or its
+# associated documentation, you agree that you have read, understood,
+# and will comply with the following terms and conditions:
+#
+# Permission to use, copy, modify, and distribute this software and
+# its associated documentation for any purpose and without fee is
+# hereby granted, provided that the above copyright notice appears in
+# all copies, and that both that copyright notice and this permission
+# notice appear in supporting documentation, and that the name of
+# Secret Labs AB or the author not be used in advertising or publicity
+# pertaining to distribution of the software without specific, written
+# prior permission.
+#
+# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
+# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
+# ABILITY AND FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
+# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
+# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+# OF THIS SOFTWARE.
+# --------------------------------------------------------------------
+
+
+import markdown
+ElementTree = markdown.etree.ElementTree
+QName = markdown.etree.QName
+Comment = markdown.etree.Comment
+PI = markdown.etree.PI
+ProcessingInstruction = markdown.etree.ProcessingInstruction
+
+HTML_EMPTY = ("area", "base", "basefont", "br", "col", "frame", "hr",
+              "img", "input", "isindex", "link", "meta" "param")
+
+try:
+    HTML_EMPTY = set(HTML_EMPTY)
+except NameError:
+    pass
+
+_namespace_map = {
+    # "well-known" namespace prefixes
+    "http://www.w3.org/XML/1998/namespace": "xml",
+    "http://www.w3.org/1999/xhtml": "html",
+    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
+    "http://schemas.xmlsoap.org/wsdl/": "wsdl",
+    # xml schema
+    "http://www.w3.org/2001/XMLSchema": "xs",
+    "http://www.w3.org/2001/XMLSchema-instance": "xsi",
+    # dublin core
+    "http://purl.org/dc/elements/1.1/": "dc",
+}
+
+
+def _raise_serialization_error(text):
+    raise TypeError(
+        "cannot serialize %r (type %s)" % (text, type(text).__name__)
+        )
+
+def _encode(text, encoding):
+    try:
+        return text.encode(encoding, "xmlcharrefreplace")
+    except (TypeError, AttributeError):
+        _raise_serialization_error(text)
+
+def _escape_cdata(text, encoding):
+    # escape character data
+    try:
+        # it's worth avoiding do-nothing calls for strings that are
+        # shorter than 500 characters, or so.  assume that's, by far,
+        # the most common case in most applications.
+        if "&" in text:
+            text = text.replace("&", "&amp;")
+        if "<" in text:
+            text = text.replace("<", "&lt;")
+        if ">" in text:
+            text = text.replace(">", "&gt;")
+        return text.encode(encoding, "xmlcharrefreplace")
+    except (TypeError, AttributeError):
+        _raise_serialization_error(text)
+
+
+def _escape_attrib(text, encoding):
+    # escape attribute value
+    try:
+        if "&" in text:
+            text = text.replace("&", "&amp;")
+        if "<" in text:
+            text = text.replace("<", "&lt;")
+        if ">" in text:
+            text = text.replace(">", "&gt;")
+        if "\"" in text:
+            text = text.replace("\"", "&quot;")
+        if "\n" in text:
+            text = text.replace("\n", "&#10;")
+        return text.encode(encoding, "xmlcharrefreplace")
+    except (TypeError, AttributeError):
+        _raise_serialization_error(text)
+
+def _escape_attrib_html(text, encoding):
+    # escape attribute value for HTML output ("<" and newlines are left as-is)
+    try:
+        if "&" in text:
+            text = text.replace("&", "&amp;")
+        if ">" in text:
+            text = text.replace(">", "&gt;")
+        if "\"" in text:
+            text = text.replace("\"", "&quot;")
+        return text.encode(encoding, "xmlcharrefreplace")
+    except (TypeError, AttributeError):
+        _raise_serialization_error(text)
+
+
+def _serialize_html(write, elem, encoding, qnames, namespaces):
+    tag = elem.tag
+    text = elem.text
+    if tag is Comment:
+        write("<!--%s-->" % _escape_cdata(text, encoding))
+    elif tag is ProcessingInstruction:
+        write("<?%s?>" % _escape_cdata(text, encoding))
+    else:
+        tag = qnames[tag]
+        if tag is None:
+            if text:
+                write(_escape_cdata(text, encoding))
+            for e in elem:
+                _serialize_html(write, e, encoding, qnames, None)
+        else:
+            write("<" + tag)
+            items = elem.items()
+            if items or namespaces:
+                items.sort() # lexical order
+                for k, v in items:
+                    if isinstance(k, QName):
+                        k = k.text
+                    if isinstance(v, QName):
+                        v = qnames[v.text]
+                    else:
+                        v = _escape_attrib_html(v, encoding)
+                    # FIXME: handle boolean attributes
+                    write(" %s=\"%s\"" % (qnames[k], v))
+                if namespaces:
+                    items = namespaces.items()
+                    items.sort(key=lambda x: x[1]) # sort on prefix
+                    for v, k in items:
+                        if k:
+                            k = ":" + k
+                        write(" xmlns%s=\"%s\"" % (
+                            k.encode(encoding),
+                            _escape_attrib(v, encoding)
+                            ))
+            write(">")
+            tag = tag.lower()
+            if text:
+                if tag == "script" or tag == "style":
+                    write(_encode(text, encoding))
+                else:
+                    write(_escape_cdata(text, encoding))
+            for e in elem:
+                _serialize_html(write, e, encoding, qnames, None)
+            if tag not in HTML_EMPTY:
+                write("</" + tag + ">")
+    if elem.tail:
+        write(_escape_cdata(elem.tail, encoding))
+
+def write_html(root, f,
+          # keyword arguments
+          encoding="us-ascii",
+          default_namespace=None):
+    assert root is not None
+    if not hasattr(f, "write"):
+        f = open(f, "wb")
+    write = f.write
+    if not encoding:
+        encoding = "us-ascii"
+    qnames, namespaces = _namespaces(
+            root, encoding, default_namespace
+            )
+    _serialize_html(
+                write, root, encoding, qnames, namespaces
+                )
+
+# --------------------------------------------------------------------
+# serialization support
+
+def _namespaces(elem, encoding, default_namespace=None):
+    # identify namespaces used in this tree
+
+    # maps qnames to *encoded* prefix:local names
+    qnames = {None: None}
+
+    # maps uri:s to prefixes
+    namespaces = {}
+    if default_namespace:
+        namespaces[default_namespace] = ""
+
+    def encode(text):
+        return text.encode(encoding)
+
+    def add_qname(qname):
+        # calculate serialized qname representation
+        try:
+            if qname[:1] == "{":
+                uri, tag = qname[1:].split("}", 1)
+                prefix = namespaces.get(uri)
+                if prefix is None:
+                    prefix = _namespace_map.get(uri)
+                    if prefix is None:
+                        prefix = "ns%d" % len(namespaces)
+                    if prefix != "xml":
+                        namespaces[uri] = prefix
+                if prefix:
+                    qnames[qname] = encode("%s:%s" % (prefix, tag))
+                else:
+                    qnames[qname] = encode(tag) # default element
+            else:
+                if default_namespace:
+                    # FIXME: can this be handled in XML 1.0?
+                    raise ValueError(
+                        "cannot use non-qualified names with "
+                        "default_namespace option"
+                        )
+                qnames[qname] = encode(qname)
+        except TypeError:
+            _raise_serialization_error(qname)
+
+    # populate qname and namespaces table
+    try:
+        iterate = elem.iter
+    except AttributeError:
+        iterate = elem.getiterator # cET compatibility
+    for elem in iterate():
+        tag = elem.tag
+        if isinstance(tag, QName) and tag.text not in qnames:
+            add_qname(tag.text)
+        elif isinstance(tag, basestring):
+            if tag not in qnames:
+                add_qname(tag)
+        elif tag is not None and tag is not Comment and tag is not PI:
+            _raise_serialization_error(tag)
+        for key, value in elem.items():
+            if isinstance(key, QName):
+                key = key.text
+            if key not in qnames:
+                add_qname(key)
+            if isinstance(value, QName) and value.text not in qnames:
+                add_qname(value.text)
+        text = elem.text
+        if isinstance(text, QName) and text.text not in qnames:
+            add_qname(text.text)
+    return qnames, namespaces
+
+def to_html_string(element, encoding=None):
+    class dummy:
+        pass
+    data = []
+    file = dummy()
+    file.write = data.append
+    write_html(ElementTree(element).getroot(),file,encoding)
+    return "".join(data)
diff --git a/test-markdown.py b/test-markdown.py
index f3be36e..95914c4 100755
--- a/test-markdown.py
+++ b/test-markdown.py
@@ -160,7 +160,7 @@ class TestRunner :
         if not os.path.exists(TMP_DIR):
             os.mkdir(TMP_DIR)
 
-    def test_directory(self, dir, measure_time=False, safe_mode=False, encoding = "utf8") :
+    def test_directory(self, dir, measure_time=False, safe_mode=False, encoding="utf8", output_format='xhtml1') :
         self.encoding = encoding
         benchmark_file_name = os.path.join(dir, "benchmark.dat")
         self.saved_benchmarks = {}
@@ -187,7 +187,7 @@ class TestRunner :
 
         mem = memory()
         start = time.clock()
-        self.md = markdown.Markdown(extensions=extensions, safe_mode = safe_mode)
+        self.md = markdown.Markdown(extensions=extensions, safe_mode = safe_mode, output_format=output_format)
         construction_time = time.clock() - start
         construction_mem = memory(mem)
 
@@ -228,7 +228,7 @@ class TestRunner :
 ####################
 
 
-    def run_test(self, dir, test, repeat) :
+    def run_test(self, dir, test, repeat):
 
         print "--- %s ---" % test
         self.html_diff_file.write("<tr><td>%s</td>" % test)
@@ -324,6 +324,7 @@ def run_tests() :
     tester.test_directory("tests/extensions-x-toc")
     tester.test_directory("tests/extensions-x-def_list")
     tester.test_directory("tests/extensions-x-abbr")
+    tester.test_directory("tests/html4", output_format='html4')
 
     try:
         import pygments
diff --git a/tests/html4/html4.html b/tests/html4/html4.html
new file mode 100644
index 0000000..7c88ad7
--- /dev/null
+++ b/tests/html4/html4.html
@@ -0,0 +1,2 @@
+<p>A test of the most<br>
+basic of html/xhtml differences.</p>
+\ No newline at end of file
diff --git a/tests/html4/html4.txt b/tests/html4/html4.txt
new file mode 100644
index 0000000..fddaf8e
--- /dev/null
+++ b/tests/html4/html4.txt
@@ -0,0 +1,2 @@
+A test of the most  
+basic of html/xhtml differences.
+\ No newline at end of file
author	Eric Abrahamsen <girzel@gmail.com>	2009-01-28 13:44:39 -0500
committer	Waylan Limberg <waylan@gmail.com>	2009-01-28 13:52:44 -0500
commit	c89c1263798eaedffa09077819e769b019801556 (patch)
tree	ed9023a9338a04a115c13aab41add006a09fd36b
parent	94c7c29f4e766032520ffac8a080c3bdba6c4da4 (diff)
download	markdown-c89c1263798eaedffa09077819e769b019801556.tar.gz markdown-c89c1263798eaedffa09077819e769b019801556.tar.bz2 markdown-c89c1263798eaedffa09077819e769b019801556.zip