aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--markdown/__init__.py74
-rw-r--r--markdown/commandline.py8
-rw-r--r--markdown/html4.py274
-rwxr-xr-xtest-markdown.py7
-rw-r--r--tests/html4/html4.html2
-rw-r--r--tests/html4/html4.txt2
6 files changed, 346 insertions, 21 deletions
diff --git a/markdown/__init__.py b/markdown/__init__.py
index 390329a..f1ddcde 100644
--- a/markdown/__init__.py
+++ b/markdown/__init__.py
@@ -44,6 +44,7 @@ version_info = (2,0,0, "beta-2")
import re
import codecs
+import sys
import logging
from logging import DEBUG, INFO, WARN, ERROR, CRITICAL
@@ -66,6 +67,7 @@ COMMAND_LINE_LOGGING_LEVEL = CRITICAL
TAB_LENGTH = 4 # expand tabs to this many spaces
ENABLE_ATTRIBUTES = True # @id = xyz -> <... id="xyz">
SMART_EMPHASIS = True # this_or_that does not become this<i>or</i>that
+DEFAULT_OUTPUT_FORMAT = 'xhtml1' # xhtml or html4 output
HTML_REMOVED_TEXT = "[HTML_REMOVED]" # text used instead of HTML in safe mode
BLOCK_LEVEL_ELEMENTS = re.compile("p|div|h[1-6]|blockquote|pre|table|dl|ol|ul"
"|script|noscript|form|fieldset|iframe|math"
@@ -93,6 +95,8 @@ import odict
etree = etree_loader.importETree()
+# Adds the ability to output html4
+import html4
"""
Constants you probably do not need to change
@@ -157,7 +161,8 @@ class Markdown:
def __init__(self,
extensions=[],
extension_configs={},
- safe_mode = False):
+ safe_mode = False,
+ output_format=DEFAULT_OUTPUT_FORMAT):
"""
Creates a new Markdown instance.
@@ -169,6 +174,14 @@ class Markdown:
as-is.
* extension-configs: Configuration setting for extensions.
* safe_mode: Disallow raw html. One of "remove", "replace" or "escape".
+ * output_format: Format of output. Supported formats are:
+ * "xhtml1": Outputs XHTML 1.x. Default.
+ * "xhtml": Outputs latest supported version of XHTML (currently XHTML 1.1).
+ * "html4": Outputs HTML 4
+ * "html": Outputs latest supported version of HTML (currently HTML 4).
+ Note that it is suggested that the more specific formats ("xhtml1"
+ and "html4") be used as "xhtml" or "html" may change in the future
+ if it makes sense at that time.
"""
@@ -268,6 +281,7 @@ class Markdown:
self.htmlStash = preprocessors.HtmlStash()
self.registerExtensions(extensions = extensions,
configs = extension_configs)
+ self.set_output_format(output_format)
self.reset()
def registerExtensions(self, extensions, configs):
@@ -305,8 +319,25 @@ class Markdown:
for extension in self.registeredExtensions:
extension.reset()
- def convert (self, source):
- """Convert markdown to serialized XHTML."""
def set_output_format(self, format):
    """
    Set the output format for the class instance.

    Keyword arguments:

    * format: Name of the output format, matched case-insensitively.
      "html" and "html4" select the HTML 4 serializer
      (`html4.to_html_string`); "xhtml" and "xhtml1" select
      `etree.tostring`.

    Any other name is reported through `message()` at CRITICAL level and
    the process is then terminated via `sys.exit` (raises `SystemExit`).

    """
    # Normalise once instead of lower-casing for every comparison.
    normalized = format.lower()
    if normalized in ('html', 'html4'):
        self.serializer = html4.to_html_string
    elif normalized in ('xhtml', 'xhtml1'):
        self.serializer = etree.tostring
    else:
        message(CRITICAL, 'Invalid Output Format: "%s". Use one of "xhtml1" or "html4".' % format)
        # A bare sys.exit() reports success (status 0) to the calling
        # shell; an invalid option is a failure, so exit non-zero.
        sys.exit(1)
+
+ def convert(self, source):
+ """
+ Convert markdown to serialized XHTML or HTML.
+
+ Keyword arguments:
+
+ * source: Source text as a Unicode string.
+
+ """
# Fixup the source text
if not source:
@@ -337,19 +368,19 @@ class Markdown:
root = newRoot
# Serialize _properly_. Strip top-level tags.
- xml, length = codecs.utf_8_decode(etree.tostring(root, encoding="utf8"))
+ output, length = codecs.utf_8_decode(self.serializer(root, encoding="utf8"))
if self.stripTopLevelTags:
- start = xml.index('<%s>'%DOC_TAG)+len(DOC_TAG)+2
- end = xml.rindex('</%s>'%DOC_TAG)
- xml = xml[start:end].strip()
+ start = output.index('<%s>'%DOC_TAG)+len(DOC_TAG)+2
+ end = output.rindex('</%s>'%DOC_TAG)
+ output = output[start:end].strip()
# Run the text post-processors
for pp in self.postprocessors.values():
- xml = pp.run(xml)
+ output = pp.run(output)
- return xml.strip()
+ return output.strip()
- def convertFile(self, input = None, output = None, encoding = None):
+ def convertFile(self, input=None, output=None, encoding=None):
"""Converts a markdown file and returns the HTML as a unicode string.
Decodes the file using the provided encoding (defaults to utf-8),
@@ -365,9 +396,7 @@ class Markdown:
* input: Name of source text file.
* output: Name of output file. Writes to stdout if `None`.
- * extensions: A list of extension names (may contain config args).
* encoding: Encoding of input and output files. Defaults to utf-8.
- * safe_mode: Disallow raw html. One of "remove", "replace" or "escape".
"""
@@ -499,7 +528,8 @@ markdownFromFile().
def markdown(text,
extensions = [],
- safe_mode = False):
+ safe_mode = False,
+ output_format = DEFAULT_OUTPUT_FORMAT):
"""Convert a markdown string to HTML and return HTML as a unicode string.
This is a shortcut function for `Markdown` class to cover the most
@@ -511,12 +541,21 @@ def markdown(text,
* text: Markdown formatted text as Unicode or ASCII string.
* extensions: A list of extensions or extension names (may contain config args).
* safe_mode: Disallow raw html. One of "remove", "replace" or "escape".
+ * output_format: Format of output. Supported formats are:
+ * "xhtml1": Outputs XHTML 1.x. Default.
+ * "xhtml": Outputs latest supported version of XHTML (currently XHTML 1.1).
+ * "html4": Outputs HTML 4
+ * "html": Outputs latest supported version of HTML (currently HTML 4).
+ Note that it is suggested that the more specific formats ("xhtml1"
+ and "html4") be used as "xhtml" or "html" may change in the future
+ if it makes sense at that time.
Returns: An HTML document as a string.
"""
md = Markdown(extensions=load_extensions(extensions),
- safe_mode = safe_mode)
+ safe_mode=safe_mode,
+ output_format=output_format)
return md.convert(text)
@@ -524,9 +563,12 @@ def markdownFromFile(input = None,
output = None,
extensions = [],
encoding = None,
- safe = False):
+ safe_mode = False,
+ output_format = DEFAULT_OUTPUT_FORMAT):
"""Read markdown code from a file and write it to a file or a stream."""
- md = Markdown(extensions=load_extensions(extensions), safe_mode = safe)
+ md = Markdown(extensions=load_extensions(extensions),
+ safe_mode=safe_mode,
+ output_format=output_format)
md.convertFile(input, output, encoding)
diff --git a/markdown/commandline.py b/markdown/commandline.py
index 68efcdb..1eedc6d 100644
--- a/markdown/commandline.py
+++ b/markdown/commandline.py
@@ -57,6 +57,9 @@ def parse_options():
parser.add_option("-s", "--safe", dest="safe", default=False,
metavar="SAFE_MODE",
help="safe mode ('replace', 'remove' or 'escape' user's HTML tag)")
+ parser.add_option("-o", "--output_format", dest="output_format",
+ default='xhtml1', metavar="OUTPUT_FORMAT",
+ help="Format of output. One of 'xhtml1' (default) or 'html4'.")
parser.add_option("--noisy",
action="store_const", const=DEBUG, dest="verbose",
help="print debug messages")
@@ -76,9 +79,10 @@ def parse_options():
return {'input': input_file,
'output': options.filename,
- 'safe': options.safe,
+ 'safe_mode': options.safe,
'extensions': options.extensions,
- 'encoding': options.encoding }, options.verbose
+ 'encoding': options.encoding,
+ 'output_format': options.output_format}, options.verbose
def run():
"""Run Markdown from the command line."""
diff --git a/markdown/html4.py b/markdown/html4.py
new file mode 100644
index 0000000..08f241d
--- /dev/null
+++ b/markdown/html4.py
@@ -0,0 +1,274 @@
+# markdown/html4.py
+#
+# Add html4 serialization to older versions of ElementTree
+# Taken from ElementTree 1.3 preview with slight modifications
+#
+# Copyright (c) 1999-2007 by Fredrik Lundh. All rights reserved.
+#
+# fredrik@pythonware.com
+# http://www.pythonware.com
+#
+# --------------------------------------------------------------------
+# The ElementTree toolkit is
+#
+# Copyright (c) 1999-2007 by Fredrik Lundh
+#
+# By obtaining, using, and/or copying this software and/or its
+# associated documentation, you agree that you have read, understood,
+# and will comply with the following terms and conditions:
+#
+# Permission to use, copy, modify, and distribute this software and
+# its associated documentation for any purpose and without fee is
+# hereby granted, provided that the above copyright notice appears in
+# all copies, and that both that copyright notice and this permission
+# notice appear in supporting documentation, and that the name of
+# Secret Labs AB or the author not be used in advertising or publicity
+# pertaining to distribution of the software without specific, written
+# prior permission.
+#
+# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
+# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
+# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
+# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
+# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+# OF THIS SOFTWARE.
+# --------------------------------------------------------------------
+
+
+import markdown
+ElementTree = markdown.etree.ElementTree
+QName = markdown.etree.QName
+Comment = markdown.etree.Comment
+PI = markdown.etree.PI
+ProcessingInstruction = markdown.etree.ProcessingInstruction
+
# Tags that the HTML serializer writes without a closing tag.
# Every name needs its own comma: adjacent string literals are silently
# concatenated, which previously merged "meta" and "param" into the single
# bogus entry "metaparam".
HTML_EMPTY = ("area", "base", "basefont", "br", "col", "frame", "hr",
              "img", "input", "isindex", "link", "meta", "param")

# Use a set for membership tests where the builtin exists; interpreters
# without ``set`` keep the tuple, which supports the same ``in`` test.
try:
    HTML_EMPTY = set(HTML_EMPTY)
except NameError:
    pass
+
+_namespace_map = {
+ # "well-known" namespace prefixes
+ "http://www.w3.org/XML/1998/namespace": "xml",
+ "http://www.w3.org/1999/xhtml": "html",
+ "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
+ "http://schemas.xmlsoap.org/wsdl/": "wsdl",
+ # xml schema
+ "http://www.w3.org/2001/XMLSchema": "xs",
+ "http://www.w3.org/2001/XMLSchema-instance": "xsi",
+ # dublic core
+ "http://purl.org/dc/elements/1.1/": "dc",
+}
+
+
+def _raise_serialization_error(text):
+ raise TypeError(
+ "cannot serialize %r (type %s)" % (text, type(text).__name__)
+ )
+
+def _encode(text, encoding):
+ try:
+ return text.encode(encoding, "xmlcharrefreplace")
+ except (TypeError, AttributeError):
+ _raise_serialization_error(text)
+
+def _escape_cdata(text, encoding):
+ # escape character data
+ try:
+ # it's worth avoiding do-nothing calls for strings that are
+ # shorter than 500 character, or so. assume that's, by far,
+ # the most common case in most applications.
+ if "&" in text:
+ text = text.replace("&", "&amp;")
+ if "<" in text:
+ text = text.replace("<", "&lt;")
+ if ">" in text:
+ text = text.replace(">", "&gt;")
+ return text.encode(encoding, "xmlcharrefreplace")
+ except (TypeError, AttributeError):
+ _raise_serialization_error(text)
+
+
+def _escape_attrib(text, encoding):
+ # escape attribute value
+ try:
+ if "&" in text:
+ text = text.replace("&", "&amp;")
+ if "<" in text:
+ text = text.replace("<", "&lt;")
+ if ">" in text:
+ text = text.replace(">", "&gt;")
+ if "\"" in text:
+ text = text.replace("\"", "&quot;")
+ if "\n" in text:
+ text = text.replace("\n", "&#10;")
+ return text.encode(encoding, "xmlcharrefreplace")
+ except (TypeError, AttributeError):
+ _raise_serialization_error(text)
+
+def _escape_attrib_html(text, encoding):
+ # escape attribute value
+ try:
+ if "&" in text:
+ text = text.replace("&", "&amp;")
+ if ">" in text:
+ text = text.replace(">", "&gt;")
+ if "\"" in text:
+ text = text.replace("\"", "&quot;")
+ return text.encode(encoding, "xmlcharrefreplace")
+ except (TypeError, AttributeError):
+ _raise_serialization_error(text)
+
+
def _serialize_html(write, elem, encoding, qnames, namespaces):
    """
    Recursively write *elem* and its subtree as HTML.

    * write: Callable that receives each output fragment.
    * elem: Element to serialize.
    * encoding: Encoding applied to text and attribute values.
    * qnames: Mapping from tag/attribute names to their serialized form
      (as built by `_namespaces`).
    * namespaces: Mapping of namespace URI to prefix whose declarations
      are written on this element; children are always passed None.
    """
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        write("<!--%s-->" % _escape_cdata(text, encoding))
    elif tag is ProcessingInstruction:
        write("<?%s?>" % _escape_cdata(text, encoding))
    else:
        tag = qnames[tag]
        if tag is None:
            # No serialized name: emit only the content, no markup.
            if text:
                write(_escape_cdata(text, encoding))
            for child in elem:
                _serialize_html(write, child, encoding, qnames, None)
        else:
            write("<" + tag)
            attributes = elem.items()
            if attributes or namespaces:
                # Attributes go out in lexical order.
                for name, value in sorted(attributes):
                    if isinstance(name, QName):
                        name = name.text
                    if isinstance(value, QName):
                        value = qnames[value.text]
                    else:
                        value = _escape_attrib_html(value, encoding)
                    # FIXME: handle boolean attributes
                    write(" %s=\"%s\"" % (qnames[name], value))
                if namespaces:
                    # Namespace declarations are ordered by prefix.
                    declarations = sorted(namespaces.items(),
                                          key=lambda x: x[1])
                    for uri, prefix in declarations:
                        if prefix:
                            prefix = ":" + prefix
                        write(" xmlns%s=\"%s\"" % (
                            prefix.encode(encoding),
                            _escape_attrib(uri, encoding)
                        ))
            write(">")
            # From here on the tag is compared (and closed) in lower case.
            tag = tag.lower()
            if text:
                if tag in ("script", "style"):
                    # Script and style content is written unescaped.
                    write(_encode(text, encoding))
                else:
                    write(_escape_cdata(text, encoding))
            for child in elem:
                _serialize_html(write, child, encoding, qnames, None)
            if tag not in HTML_EMPTY:
                write("</" + tag + ">")
    if elem.tail:
        write(_escape_cdata(elem.tail, encoding))
+
def write_html(root, f,
               # keyword arguments
               encoding="us-ascii",
               default_namespace=None):
    """
    Serialize the tree rooted at *root* as HTML and write it to *f*.

    * root: Root element of the tree; must not be None.
    * f: Either an object with a `write` method, or a file name that is
      opened for binary writing.
    * encoding: Output encoding; a false value falls back to "us-ascii".
    * default_namespace: Namespace URI to use as the default namespace
      (passed through to `_namespaces`).

    A file opened here from a file name is closed again before returning,
    even when serialization fails.  A caller-supplied file object is left
    open for the caller to manage.
    """
    assert root is not None
    opened_here = not hasattr(f, "write")
    if opened_here:
        f = open(f, "wb")
    try:
        if not encoding:
            encoding = "us-ascii"
        qnames, namespaces = _namespaces(
            root, encoding, default_namespace
        )
        _serialize_html(
            f.write, root, encoding, qnames, namespaces
        )
    finally:
        # Only close what this function opened itself.
        if opened_here:
            f.close()
+
+# --------------------------------------------------------------------
+# serialization support
+
+def _namespaces(elem, encoding, default_namespace=None):
+ # identify namespaces used in this tree
+
+ # maps qnames to *encoded* prefix:local names
+ qnames = {None: None}
+
+ # maps uri:s to prefixes
+ namespaces = {}
+ if default_namespace:
+ namespaces[default_namespace] = ""
+
+ def encode(text):
+ return text.encode(encoding)
+
+ def add_qname(qname):
+ # calculate serialized qname representation
+ try:
+ if qname[:1] == "{":
+ uri, tag = qname[1:].split("}", 1)
+ prefix = namespaces.get(uri)
+ if prefix is None:
+ prefix = _namespace_map.get(uri)
+ if prefix is None:
+ prefix = "ns%d" % len(namespaces)
+ if prefix != "xml":
+ namespaces[uri] = prefix
+ if prefix:
+ qnames[qname] = encode("%s:%s" % (prefix, tag))
+ else:
+ qnames[qname] = encode(tag) # default element
+ else:
+ if default_namespace:
+ # FIXME: can this be handled in XML 1.0?
+ raise ValueError(
+ "cannot use non-qualified names with "
+ "default_namespace option"
+ )
+ qnames[qname] = encode(qname)
+ except TypeError:
+ _raise_serialization_error(qname)
+
+ # populate qname and namespaces table
+ try:
+ iterate = elem.iter
+ except AttributeError:
+ iterate = elem.getiterator # cET compatibility
+ for elem in iterate():
+ tag = elem.tag
+ if isinstance(tag, QName) and tag.text not in qnames:
+ add_qname(tag.text)
+ elif isinstance(tag, basestring):
+ if tag not in qnames:
+ add_qname(tag)
+ elif tag is not None and tag is not Comment and tag is not PI:
+ _raise_serialization_error(tag)
+ for key, value in elem.items():
+ if isinstance(key, QName):
+ key = key.text
+ if key not in qnames:
+ add_qname(key)
+ if isinstance(value, QName) and value.text not in qnames:
+ add_qname(value.text)
+ text = elem.text
+ if isinstance(text, QName) and text.text not in qnames:
+ add_qname(text.text)
+ return qnames, namespaces
+
def to_html_string(element, encoding=None):
    """
    Return the HTML serialization of *element* as a single string.

    The fragments produced by `write_html` are collected in a list through
    a minimal file-like object and joined at the end.  *encoding* is
    passed on to `write_html` unchanged.
    """
    class _ChunkSink:
        """Bare object that only carries a per-instance `write` attribute."""

    chunks = []
    sink = _ChunkSink()
    sink.write = chunks.append
    root = ElementTree(element).getroot()
    write_html(root, sink, encoding)
    return "".join(chunks)
diff --git a/test-markdown.py b/test-markdown.py
index f3be36e..95914c4 100755
--- a/test-markdown.py
+++ b/test-markdown.py
@@ -160,7 +160,7 @@ class TestRunner :
if not os.path.exists(TMP_DIR):
os.mkdir(TMP_DIR)
- def test_directory(self, dir, measure_time=False, safe_mode=False, encoding = "utf8") :
+ def test_directory(self, dir, measure_time=False, safe_mode=False, encoding="utf8", output_format='xhtml1') :
self.encoding = encoding
benchmark_file_name = os.path.join(dir, "benchmark.dat")
self.saved_benchmarks = {}
@@ -187,7 +187,7 @@ class TestRunner :
mem = memory()
start = time.clock()
- self.md = markdown.Markdown(extensions=extensions, safe_mode = safe_mode)
+ self.md = markdown.Markdown(extensions=extensions, safe_mode = safe_mode, output_format=output_format)
construction_time = time.clock() - start
construction_mem = memory(mem)
@@ -228,7 +228,7 @@ class TestRunner :
####################
- def run_test(self, dir, test, repeat) :
+ def run_test(self, dir, test, repeat):
print "--- %s ---" % test
self.html_diff_file.write("<tr><td>%s</td>" % test)
@@ -324,6 +324,7 @@ def run_tests() :
tester.test_directory("tests/extensions-x-toc")
tester.test_directory("tests/extensions-x-def_list")
tester.test_directory("tests/extensions-x-abbr")
+ tester.test_directory("tests/html4", output_format='html4')
try:
import pygments
diff --git a/tests/html4/html4.html b/tests/html4/html4.html
new file mode 100644
index 0000000..7c88ad7
--- /dev/null
+++ b/tests/html4/html4.html
@@ -0,0 +1,2 @@
+<p>A test of the most<br>
+basic of html/xhtml differences.</p> \ No newline at end of file
diff --git a/tests/html4/html4.txt b/tests/html4/html4.txt
new file mode 100644
index 0000000..fddaf8e
--- /dev/null
+++ b/tests/html4/html4.txt
@@ -0,0 +1,2 @@
+A test of the most
+basic of html/xhtml differences. \ No newline at end of file