6 files changed, 346 insertions, 21 deletions
diff --git a/markdown/__init__.py b/markdown/__init__.py
index 390329a..f1ddcde 100644
--- a/markdown/__init__.py
+++ b/markdown/__init__.py
@@ -44,6 +44,7 @@ version_info = (2,0,0, "beta-2")
 
 import re
 import codecs
+import sys
 import logging
 from logging import DEBUG, INFO, WARN, ERROR, CRITICAL
 
@@ -66,6 +67,7 @@ COMMAND_LINE_LOGGING_LEVEL = CRITICAL
 TAB_LENGTH = 4               # expand tabs to this many spaces
 ENABLE_ATTRIBUTES = True     # @id = xyz -> <... id="xyz">
 SMART_EMPHASIS = True        # this_or_that does not become this<i>or</i>that
+DEFAULT_OUTPUT_FORMAT = 'xhtml1'     # xhtml or html4 output
 HTML_REMOVED_TEXT = "[HTML_REMOVED]" # text used instead of HTML in safe mode
 BLOCK_LEVEL_ELEMENTS = re.compile("p|div|h[1-6]|blockquote|pre|table|dl|ol|ul"
                                   "|script|noscript|form|fieldset|iframe|math"
@@ -93,6 +95,8 @@ import odict
 
 etree = etree_loader.importETree()
 
+# Adds the ability to output html4
+import html4
 
 """
 Constants you probably do not need to change
@@ -157,7 +161,8 @@ class Markdown:
     def __init__(self,
                  extensions=[],
                  extension_configs={},
-                 safe_mode = False):
+                 safe_mode = False, 
+                 output_format=DEFAULT_OUTPUT_FORMAT):
         """
         Creates a new Markdown instance.
 
@@ -169,6 +174,14 @@ class Markdown:
            as-is.
         * extension-configs: Configuration setting for extensions.
         * safe_mode: Disallow raw html. One of "remove", "replace" or "escape".
+        * output_format: Format of output. Supported formats are:
+            * "xhtml1": Outputs XHTML 1.x. Default.
+            * "xhtml": Outputs latest supported version of XHTML (currently XHTML 1.1).
+            * "html4": Outputs HTML 4
+            * "html": Outputs latest supported version of HTML (currently HTML 4).
+            Note that it is suggested that the more specific formats ("xhtml1" 
+            and "html4") be used as "xhtml" or "html" may change in the future
+            if it makes sense at that time. 
 
         """
         
@@ -268,6 +281,7 @@ class Markdown:
         self.htmlStash = preprocessors.HtmlStash()
         self.registerExtensions(extensions = extensions,
                                 configs = extension_configs)
+        self.set_output_format(output_format)
         self.reset()
 
     def registerExtensions(self, extensions, configs):
@@ -305,8 +319,25 @@ class Markdown:
         for extension in self.registeredExtensions:
             extension.reset()
 
-    def convert (self, source):
-        """Convert markdown to serialized XHTML."""
+    def set_output_format(self, format):
+        """ Select the serializer for `format` (matched case-insensitively). """
+        serializers = {'html': html4.to_html_string, 'html4': html4.to_html_string,
+                       'xhtml': etree.tostring, 'xhtml1': etree.tostring}
+        if format.lower() in serializers:
+            self.serializer = serializers[format.lower()]
+        else:
+            message(CRITICAL, 'Invalid Output Format: "%s". Use one of "xhtml1" or "html4".' % format)
+            sys.exit()
+
+    def convert(self, source):
+        """
+        Convert markdown to serialized XHTML or HTML.
+
+        Keyword arguments:
+
+        * source: Source text as a Unicode string.
+
+        """
 
         # Fixup the source text
         if not source:
@@ -337,19 +368,19 @@ class Markdown:
                 root = newRoot
 
         # Serialize _properly_.  Strip top-level tags.
-        xml, length = codecs.utf_8_decode(etree.tostring(root, encoding="utf8"))
+        output, length = codecs.utf_8_decode(self.serializer(root, encoding="utf8"))
         if self.stripTopLevelTags:
-            start = xml.index('<%s>'%DOC_TAG)+len(DOC_TAG)+2
-            end = xml.rindex('</%s>'%DOC_TAG)
-            xml = xml[start:end].strip()
+            start = output.index('<%s>'%DOC_TAG)+len(DOC_TAG)+2
+            end = output.rindex('</%s>'%DOC_TAG)
+            output = output[start:end].strip()
 
         # Run the text post-processors
         for pp in self.postprocessors.values():
-            xml = pp.run(xml)
+            output = pp.run(output)
 
-        return xml.strip()
+        return output.strip()
 
-    def convertFile(self, input = None, output = None, encoding = None):
+    def convertFile(self, input=None, output=None, encoding=None):
         """Converts a markdown file and returns the HTML as a unicode string.
 
         Decodes the file using the provided encoding (defaults to utf-8),
@@ -365,9 +396,7 @@ class Markdown:
 
         * input: Name of source text file.
         * output: Name of output file. Writes to stdout if `None`.
-        * extensions: A list of extension names (may contain config args).
         * encoding: Encoding of input and output files. Defaults to utf-8.
-        * safe_mode: Disallow raw html. One of "remove", "replace" or "escape".
 
         """
 
@@ -499,7 +528,8 @@ markdownFromFile().
 
 def markdown(text,
              extensions = [],
-             safe_mode = False):
+             safe_mode = False,
+             output_format = DEFAULT_OUTPUT_FORMAT):
     """Convert a markdown string to HTML and return HTML as a unicode string.
 
     This is a shortcut function for `Markdown` class to cover the most
@@ -511,12 +541,21 @@ def markdown(text,
     * text: Markdown formatted text as Unicode or ASCII string.
     * extensions: A list of extensions or extension names (may contain config args).
     * safe_mode: Disallow raw html.  One of "remove", "replace" or "escape".
+    * output_format: Format of output. Supported formats are:
+        * "xhtml1": Outputs XHTML 1.x. Default.
+        * "xhtml": Outputs latest supported version of XHTML (currently XHTML 1.1).
+        * "html4": Outputs HTML 4
+        * "html": Outputs latest supported version of HTML (currently HTML 4).
+        Note that it is suggested that the more specific formats ("xhtml1" 
+        and "html4") be used as "xhtml" or "html" may change in the future
+        if it makes sense at that time. 
 
     Returns: An HTML document as a string.
 
     """
     md = Markdown(extensions=load_extensions(extensions),
-                  safe_mode = safe_mode)
+                  safe_mode=safe_mode, 
+                  output_format=output_format)
     return md.convert(text)
 
 
@@ -524,9 +563,12 @@ def markdownFromFile(input = None,
                      output = None,
                      extensions = [],
                      encoding = None,
-                     safe = False):
+                     safe_mode = False,
+                     output_format = DEFAULT_OUTPUT_FORMAT):
     """Read markdown code from a file and write it to a file or a stream."""
-    md = Markdown(extensions=load_extensions(extensions), safe_mode = safe)
+    md = Markdown(extensions=load_extensions(extensions), 
+                  safe_mode=safe_mode,
+                  output_format=output_format)
     md.convertFile(input, output, encoding)
 
 
diff --git a/markdown/commandline.py b/markdown/commandline.py
index 68efcdb..1eedc6d 100644
--- a/markdown/commandline.py
+++ b/markdown/commandline.py
@@ -57,6 +57,9 @@ def parse_options():
     parser.add_option("-s", "--safe", dest="safe", default=False,
                       metavar="SAFE_MODE",
                       help="safe mode ('replace', 'remove' or 'escape'  user's HTML tag)")
+    parser.add_option("-o", "--output_format", dest="output_format", 
+                      default='xhtml1', metavar="OUTPUT_FORMAT",
+                      help="Format of output. One of 'xhtml1' (default) or 'html4'.")
     parser.add_option("--noisy",
                       action="store_const", const=DEBUG, dest="verbose",
                       help="print debug messages")
@@ -76,9 +79,10 @@ def parse_options():
 
     return {'input': input_file,
             'output': options.filename,
-            'safe': options.safe,
+            'safe_mode': options.safe,
             'extensions': options.extensions,
-            'encoding': options.encoding }, options.verbose
+            'encoding': options.encoding,
+            'output_format': options.output_format}, options.verbose
 
 def run():
     """Run Markdown from the command line."""
diff --git a/markdown/html4.py b/markdown/html4.py
new file mode 100644
index 0000000..08f241d
--- /dev/null
+++ b/markdown/html4.py
@@ -0,0 +1,274 @@
+# markdown/html4.py
+#
+# Add html4 serialization to older versions of ElementTree
+# Taken from ElementTree 1.3 preview with slight modifications
+#
+# Copyright (c) 1999-2007 by Fredrik Lundh.  All rights reserved.
+#
+# fredrik@pythonware.com
+# http://www.pythonware.com
+#
+# --------------------------------------------------------------------
+# The ElementTree toolkit is
+#
+# Copyright (c) 1999-2007 by Fredrik Lundh
+#
+# By obtaining, using, and/or copying this software and/or its
+# associated documentation, you agree that you have read, understood,
+# and will comply with the following terms and conditions:
+#
+# Permission to use, copy, modify, and distribute this software and
+# its associated documentation for any purpose and without fee is
+# hereby granted, provided that the above copyright notice appears in
+# all copies, and that both that copyright notice and this permission
+# notice appear in supporting documentation, and that the name of
+# Secret Labs AB or the author not be used in advertising or publicity
+# pertaining to distribution of the software without specific, written
+# prior permission.
+#
+# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
+# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
+# ABILITY AND FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
+# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
+# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+# OF THIS SOFTWARE.
+# --------------------------------------------------------------------
+
+
+import markdown
+ElementTree = markdown.etree.ElementTree
+QName = markdown.etree.QName
+Comment = markdown.etree.Comment
+PI = markdown.etree.PI
+ProcessingInstruction = markdown.etree.ProcessingInstruction
+
+# HTML 4 empty elements: _serialize_html writes no end tag for these.
+HTML_EMPTY = ("area", "base", "basefont", "br", "col", "frame", "hr",
+              "img", "input", "isindex", "link", "meta", "param")
+try:
+    HTML_EMPTY = set(HTML_EMPTY)  # set membership test where `set` exists
+except NameError:
+    pass  # no builtin `set` on this interpreter: keep the tuple
+
+_namespace_map = {  # fallback prefixes, looked up by namespace URI
+    # "well-known" namespace URIs and their conventional prefixes
+    "http://www.w3.org/XML/1998/namespace": "xml",
+    "http://www.w3.org/1999/xhtml": "html",
+    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
+    "http://schemas.xmlsoap.org/wsdl/": "wsdl",
+    # XML Schema
+    "http://www.w3.org/2001/XMLSchema": "xs",
+    "http://www.w3.org/2001/XMLSchema-instance": "xsi",
+    # Dublin Core
+    "http://purl.org/dc/elements/1.1/": "dc",
+}
+
+
+def _raise_serialization_error(text):
+    # Shared failure path: report a value that cannot be serialized.
+    type_name = type(text).__name__
+    raise TypeError("cannot serialize %r (type %s)" % (text, type_name))
+
+def _encode(text, encoding):  # encode text as-is, without escaping markup characters
+    try:
+        return text.encode(encoding, "xmlcharrefreplace")  # unencodable chars become &#N; references
+    except (TypeError, AttributeError):  # e.g. text has no usable .encode()
+        _raise_serialization_error(text)
+
+def _escape_cdata(text, encoding):
+    # Escape markup-significant characters in character data and encode it.
+    # "&" is handled first so that the entities produced for "<" and ">"
+    # are not escaped a second time.
+    replacements = (("&", "&amp;"), ("<", "&lt;"), (">", "&gt;"))
+    try:
+        for char, entity in replacements:
+            # Skip do-nothing replace() calls: most strings are short and
+            # contain none of these characters.
+            if char in text:
+                text = text.replace(char, entity)
+        return text.encode(encoding, "xmlcharrefreplace")
+    except (TypeError, AttributeError):
+        # text does not behave like a string
+        _raise_serialization_error(text)
+
+
+def _escape_attrib(text, encoding):
+    # Escape an attribute value and encode it.  "&" is handled first so the
+    # entities produced afterwards are not escaped twice; a literal newline
+    # becomes the character reference "&#10;".
+    replacements = (
+        ("&", "&amp;"), ("<", "&lt;"), (">", "&gt;"),
+        ("\"", "&quot;"), ("\n", "&#10;"),
+        )
+    try:
+        for char, entity in replacements:
+            if char in text:
+                text = text.replace(char, entity)
+        return text.encode(encoding, "xmlcharrefreplace")
+    except (TypeError, AttributeError):
+        # text does not behave like a string
+        _raise_serialization_error(text)
+
+def _escape_attrib_html(text, encoding):
+    # Escape an attribute value for HTML output and encode it.  Unlike
+    # _escape_attrib, "<" and newlines are left untouched.
+    replacements = (("&", "&amp;"), (">", "&gt;"), ("\"", "&quot;"))
+    try:
+        for char, entity in replacements:
+            if char in text:
+                text = text.replace(char, entity)
+        return text.encode(encoding, "xmlcharrefreplace")
+    except (TypeError, AttributeError):
+        # text does not behave like a string
+        _raise_serialization_error(text)
+
+
+def _serialize_html(write, elem, encoding, qnames, namespaces):  # writes elem, its children and its tail
+    tag = elem.tag
+    text = elem.text
+    if tag is Comment:
+        write("<!--%s-->" % _escape_cdata(text, encoding))
+    elif tag is ProcessingInstruction:
+        write("<?%s?>" % _escape_cdata(text, encoding))
+    else:
+        tag = qnames[tag]  # serialized name looked up in the qnames table
+        if tag is None:  # no name: emit only the text and the children
+            if text:
+                write(_escape_cdata(text, encoding))
+            for e in elem:
+                _serialize_html(write, e, encoding, qnames, None)  # nested calls get no namespace table
+        else:
+            write("<" + tag)
+            items = elem.items()
+            if items or namespaces:
+                items.sort() # lexical order
+                for k, v in items:
+                    if isinstance(k, QName):
+                        k = k.text
+                    if isinstance(v, QName):
+                        v = qnames[v.text]  # QName values are written by their serialized name
+                    else:
+                        v = _escape_attrib_html(v, encoding)
+                    # FIXME: handle boolean attributes
+                    write(" %s=\"%s\"" % (qnames[k], v))
+                if namespaces:
+                    items = namespaces.items()
+                    items.sort(key=lambda x: x[1]) # sort on prefix
+                    for v, k in items:  # v is the namespace URI, k its prefix
+                        if k:
+                            k = ":" + k  # an empty prefix yields a plain xmlns="..."
+                        write(" xmlns%s=\"%s\"" % (
+                            k.encode(encoding),
+                            _escape_attrib(v, encoding)
+                            ))
+            write(">")
+            tag = tag.lower()  # the comparisons below use the lower-cased name
+            if text:
+                if tag == "script" or tag == "style":
+                    write(_encode(text, encoding))  # encoded but not escaped
+                else:
+                    write(_escape_cdata(text, encoding))
+            for e in elem:
+                _serialize_html(write, e, encoding, qnames, None)  # nested calls get no namespace table
+            if tag not in HTML_EMPTY:  # elements listed in HTML_EMPTY get no end tag
+                write("</" + tag + ">")
+    if elem.tail:  # text that follows the element
+        write(_escape_cdata(elem.tail, encoding))
+
+def write_html(root, f, encoding="us-ascii", default_namespace=None):
+    # Serialize root as HTML to f, a file name or an object with write().
+    # A false encoding (None, "") falls back to "us-ascii".
+    assert root is not None
+    opened = not hasattr(f, "write")
+    if opened:
+        f = open(f, "wb")
+    try:
+        if not encoding:
+            encoding = "us-ascii"
+        qnames, namespaces = _namespaces(root, encoding, default_namespace)
+        _serialize_html(f.write, root, encoding, qnames, namespaces)
+    finally:
+        # Close only a file opened here; caller-supplied objects stay open.
+        if opened:
+            f.close()
+
+# --------------------------------------------------------------------
+# serialization support
+
+def _namespaces(elem, encoding, default_namespace=None):
+    # Scan the tree under elem; return the (qnames, namespaces) lookup tables.
+
+    # maps qnames to *encoded* prefix:local names; None stays None
+    qnames = {None: None}
+
+    # maps namespace URIs to prefixes ("" marks the default namespace)
+    namespaces = {}
+    if default_namespace:
+        namespaces[default_namespace] = ""
+
+    def encode(text):
+        return text.encode(encoding)
+
+    def add_qname(qname):
+        # calculate the serialized representation of qname and store it in qnames
+        try:
+            if qname[:1] == "{":  # "{uri}local" notation
+                uri, tag = qname[1:].split("}", 1)
+                prefix = namespaces.get(uri)
+                if prefix is None:
+                    prefix = _namespace_map.get(uri)
+                    if prefix is None:
+                        prefix = "ns%d" % len(namespaces)  # generate a new prefix
+                    if prefix != "xml":  # the "xml" prefix is not recorded in namespaces
+                        namespaces[uri] = prefix
+                if prefix:
+                    qnames[qname] = encode("%s:%s" % (prefix, tag))
+                else:
+                    qnames[qname] = encode(tag) # default element
+            else:
+                if default_namespace:
+                    # FIXME: can this be handled in XML 1.0?
+                    raise ValueError(
+                        "cannot use non-qualified names with "
+                        "default_namespace option"
+                        )
+                qnames[qname] = encode(qname)
+        except TypeError:
+            _raise_serialization_error(qname)
+
+    # populate the qnames and namespaces tables from every element in the tree
+    try:
+        iterate = elem.iter
+    except AttributeError:
+        iterate = elem.getiterator # cET compatibility
+    for elem in iterate():  # note: rebinds elem to each visited element
+        tag = elem.tag
+        if isinstance(tag, QName) and tag.text not in qnames:
+            add_qname(tag.text)
+        elif isinstance(tag, basestring):
+            if tag not in qnames:
+                add_qname(tag)
+        elif tag is not None and tag is not Comment and tag is not PI:
+            _raise_serialization_error(tag)  # any other kind of tag is rejected
+        for key, value in elem.items():
+            if isinstance(key, QName):
+                key = key.text
+            if key not in qnames:
+                add_qname(key)
+            if isinstance(value, QName) and value.text not in qnames:
+                add_qname(value.text)
+        text = elem.text
+        if isinstance(text, QName) and text.text not in qnames:
+            add_qname(text.text)
+    return qnames, namespaces
+
+def to_html_string(element, encoding=None):
+    # Serialize element to an HTML string by collecting write_html()'s output.
+    class _Sink:
+        pass
+    sink, chunks = _Sink(), []
+    sink.write = chunks.append  # write() is all write_html() needs from the sink
+    write_html(ElementTree(element).getroot(), sink, encoding)
+    return "".join(chunks)
diff --git a/test-markdown.py b/test-markdown.py
index f3be36e..95914c4 100755
--- a/test-markdown.py
+++ b/test-markdown.py
@@ -160,7 +160,7 @@ class TestRunner :
         if not os.path.exists(TMP_DIR):
             os.mkdir(TMP_DIR)
 
-    def test_directory(self, dir, measure_time=False, safe_mode=False, encoding = "utf8") :
+    def test_directory(self, dir, measure_time=False, safe_mode=False, encoding="utf8", output_format='xhtml1') :
         self.encoding = encoding
         benchmark_file_name = os.path.join(dir, "benchmark.dat")
         self.saved_benchmarks = {}
@@ -187,7 +187,7 @@ class TestRunner :
 
         mem = memory()
         start = time.clock()
-        self.md = markdown.Markdown(extensions=extensions, safe_mode = safe_mode)
+        self.md = markdown.Markdown(extensions=extensions, safe_mode = safe_mode, output_format=output_format)
         construction_time = time.clock() - start
         construction_mem = memory(mem)
 
@@ -228,7 +228,7 @@ class TestRunner :
 ####################
 
 
-    def run_test(self, dir, test, repeat) :
+    def run_test(self, dir, test, repeat):
 
         print "--- %s ---" % test
         self.html_diff_file.write("<tr><td>%s</td>" % test)
@@ -324,6 +324,7 @@ def run_tests() :
     tester.test_directory("tests/extensions-x-toc")
     tester.test_directory("tests/extensions-x-def_list")
     tester.test_directory("tests/extensions-x-abbr")
+    tester.test_directory("tests/html4", output_format='html4')
 
     try:
         import pygments
diff --git a/tests/html4/html4.html b/tests/html4/html4.html
new file mode 100644
index 0000000..7c88ad7
--- /dev/null
+++ b/tests/html4/html4.html
@@ -0,0 +1,2 @@
+<p>A test of the most<br>
+basic of html/xhtml differences.</p>
+\ No newline at end of file
diff --git a/tests/html4/html4.txt b/tests/html4/html4.txt
new file mode 100644
index 0000000..fddaf8e
--- /dev/null
+++ b/tests/html4/html4.txt
@@ -0,0 +1,2 @@
+A test of the most  
+basic of html/xhtml differences.
+\ No newline at end of file