"""
Python Markdown
===============
Python Markdown converts Markdown to HTML and can be used as a library or
called from the command line.
## Basic usage as a module:
import markdown
md = markdown.Markdown()
html = md.convert(your_text_string)
## Basic use from the command line:
python markdown.py source.txt > destination.html
Run "python markdown.py --help" to see more options.
## Extensions
See the Python Markdown project documentation for more
information and instructions on how to extend the functionality of
Python Markdown. Read that before you try modifying this file.
## Authors and License
Started by [Manfred Stienstra](http://www.dwerg.net/). Continued and
maintained by [Yuri Takhteyev](http://www.freewisdom.org), [Waylan
Limberg](http://achinghead.com/) and [Artem Yunusov](http://blog.splyer.com).
Contact: markdown@freewisdom.org
Copyright 2007, 2008 The Python Markdown Project (v. 1.7 and later)
Copyright 200? Django Software Foundation (OrderedDict implementation)
Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
Copyright 2004 Manfred Stienstra (the original version)
License: BSD (see docs/LICENSE for details).
"""
# Release identifier as a single display string.
version = "2.0-beta-2"
# Tuple form of the same release: three integers followed by the
# pre-release tag string.
version_info = (2, 0, 0, "beta-2")
import re
import codecs
import logging
from logging import DEBUG, INFO, WARN, ERROR, CRITICAL
"""
CONSTANTS
=============================================================================
"""
"""
Constants you might want to modify
-----------------------------------------------------------------------------
"""
# default logging level for command-line use
COMMAND_LINE_LOGGING_LEVEL = CRITICAL

TAB_LENGTH = 4               # expand tabs to this many spaces
ENABLE_ATTRIBUTES = True     # @id = xyz -> <... id="xyz">
SMART_EMPHASIS = True        # this_or_that does not become this<i>or</i>that
HTML_REMOVED_TEXT = "[HTML_REMOVED]" # text used instead of HTML in safe mode

# Names of the HTML tags treated as block level. The alternation is grouped
# and anchored with \Z so that `.match(tag)` only succeeds when the WHOLE tag
# name is one of the listed names; without the anchor any tag that merely
# starts with a listed name (e.g. "param" -> "p", "link" -> "li") would be
# misclassified as block level.
BLOCK_LEVEL_ELEMENTS = re.compile(r"(?:p|div|h[1-6]|blockquote|pre|table|dl|ol|ul"
                                  r"|script|noscript|form|fieldset|iframe|math"
                                  r"|ins|del|hr|hr/|style|li|dt|dd|tr)\Z")

# Placeholders
STX = u'\u0002'  # Use STX ("Start of text") for start-of-placeholder
ETX = u'\u0003'  # Use ETX ("End of text") for end-of-placeholder
INLINE_PLACEHOLDER_PREFIX = STX+"klzzwxh:"
# Template for an inline placeholder; "%s" is filled in by the user of the
# template.
INLINE_PLACEHOLDER = INLINE_PLACEHOLDER_PREFIX + "%s" + ETX
AMP_SUBSTITUTE = STX+"amp"+ETX
import preprocessors, blockprocessors, treeprocessors, inlinepatterns
import postprocessors
import blockparser
import etree_loader
import odict
# Extensions should use "markdown.etree" instead of "etree" (or do `from
# markdown import etree`). Do not import it by yourself.
# Whatever etree_loader.importETree() returns becomes the module-wide
# `etree`; Markdown.convert() calls etree.tostring() on it when serializing.
etree = etree_loader.importETree()
"""
Constants you probably do not need to change
-----------------------------------------------------------------------------
"""
# Pairs of (first, last) characters delimiting the script blocks named
# beside each entry.
RTL_BIDI_RANGES = (
    # Hebrew (0590-05FF), Arabic (0600-06FF),
    # Syriac (0700-074F), Arabic supplement (0750-077F),
    # Thaana (0780-07BF), Nko (07C0-07FF).
    (u'\u0590', u'\u07FF'),
    # Tifinagh
    (u'\u2D30', u'\u2D7F'),
)
"""
AUXILIARY GLOBAL FUNCTIONS
=============================================================================
"""
def message(level, text):
    """Log `text` at severity `level` on the logger named 'MARKDOWN'."""
    markdown_logger = logging.getLogger('MARKDOWN')
    markdown_logger.log(level, text)
def isBlockLevel(tag):
    """Report whether `tag` names a block level HTML element.

    Returns the result of matching BLOCK_LEVEL_ELEMENTS at the start of
    `tag`: a match object (truthy) on success, otherwise None.
    """
    outcome = BLOCK_LEVEL_ELEMENTS.match(tag)
    return outcome
"""
MISC AUXILIARY CLASSES
=============================================================================
"""
class AtomicString(unicode):
    """Marker subclass of unicode: text that should not be processed further.

    Adds no behaviour of its own; only the type distinguishes it from a
    plain unicode string.
    """
"""
OVERALL DESIGN
=============================================================================
Markdown processing takes place in five steps:
1. A bunch of "preprocessors" munge the input text.
2. BlockParser() parses the high-level structural elements of the
pre-processed text into an ElementTree.
3. A bunch of "treeprocessors" are run against the ElementTree. One such
treeprocessor runs InlinePatterns against the ElementTree, detecting inline
markup.
4. Some post-processors are run against the text after the ElementTree has
been serialized into text.
5. The output is written to a string.
Those steps are put together by the Markdown() class.
"""
class Markdown:
    """Convert Markdown to HTML.

    An instance holds the preprocessors, the reference table and the HTML
    stash, plus any extensions registered on it. Use `convert()` for
    unicode-in / unicode-out conversion and `convertFile()` for files.
    """

    def __init__(self,
                 extensions=None,
                 extension_configs=None,
                 safe_mode = False):
        """
        Creates a new Markdown instance.

        Keyword arguments:

        * extensions: A list of extensions.
           If they are of type string, the module mdx_name.py will be loaded.
           If they are a subclass of markdown.Extension, they will be used
           as-is. Defaults to no extensions.
        * extension_configs: Configuration setting for extensions (a
           dictionary keyed by the extension string). Defaults to empty.
        * safe_mode: Disallow raw html. One of "remove", "replace" or "escape".

        """
        # None stands in for "nothing given" so that no mutable default
        # object is shared between calls.
        if extensions is None:
            extensions = []
        if extension_configs is None:
            extension_configs = {}

        self.safeMode = safe_mode
        self.registeredExtensions = []
        self.docType = ""
        self.stripTopLevelTags = True

        # Preprocessors
        self.preprocessors = odict.OrderedDict()
        self.preprocessors["html_block"] = \
                preprocessors.HtmlBlockPreprocessor(self)
        self.preprocessors["reference"] = \
                preprocessors.ReferencePreprocessor(self)
        # footnote preprocessor will be inserted with "amp_substitute"

        self.references = {}
        self.htmlStash = preprocessors.HtmlStash()
        self.registerExtensions(extensions = extensions,
                                configs = extension_configs)

        self.reset()

    def registerExtensions(self, extensions, configs):
        """
        Register extensions with this instance of Markdown.

        Keyword arguments:

        * extensions: A list of extensions, which can either
           be strings or objects.  See the docstring on Markdown.
        * configs: A dictionary mapping module names to config options.

        Entries that cannot be loaded or are of the wrong type are logged
        and skipped; they do not abort registration of the others.
        """
        for ext in extensions:
            if isinstance(ext, basestring):
                ext = load_extension(ext, configs.get(ext, []))
                if ext is None:
                    # load_extension() has already logged why loading
                    # failed; skip instead of calling a method on None.
                    continue
            elif not hasattr(ext, 'extendMarkdown'):
                message(ERROR, "Incorrect type! Extension '%s' is "
                               "neither a string or an Extension." %(repr(ext)))
                continue
            # Anything that reaches this point looks like an Extension.
            ext.extendMarkdown(self, globals())

    def registerExtension(self, extension):
        """ This gets called by the extension """
        self.registeredExtensions.append(extension)

    def reset(self):
        """
        Resets all state variables so that we can start with a new text.

        Clears the HTML stash and the reference table, then calls reset()
        on every registered extension.
        """
        self.htmlStash.reset()
        self.references.clear()

        for extension in self.registeredExtensions:
            extension.reset()

    def convert(self, source):
        """Convert markdown to serialized XHTML.

        * source: Markdown text as unicode (or an ASCII byte string).

        Returns the converted text as a unicode string. Returns an empty
        unicode string for empty input, or when `source` cannot be turned
        into unicode (a CRITICAL message is logged in that case).
        """
        # NOTE(review): self.parser, self.treeprocessors and
        # self.postprocessors are read below but are not assigned in the
        # visible __init__; presumably they are set up elsewhere - confirm.

        # Fixup the source text
        if not source:
            return u""  # a blank unicode string
        try:
            source = unicode(source)
        except UnicodeDecodeError:
            message(CRITICAL, 'UnicodeDecodeError: Markdown only accepts unicode or ascii input.')
            return u""

        # The placeholder delimiters must never occur in user input.
        source = source.replace(STX, "").replace(ETX, "")
        # Normalize line endings and make sure the text ends with a blank line.
        source = source.replace("\r\n", "\n").replace("\r", "\n") + "\n\n"
        # Collapse whitespace-only lines into truly empty ones.
        source = re.sub(r'\n\s+\n', '\n\n', source)
        source = source.expandtabs(TAB_LENGTH)

        # Split into lines and run the line preprocessors.
        self.lines = source.split("\n")
        for prep in self.preprocessors.values():
            self.lines = prep.run(self.lines)

        # Parse the high-level elements.
        root = self.parser.parseDocument(self.lines).getroot()

        # Run the tree-processors
        for treeprocessor in self.treeprocessors.values():
            newRoot = treeprocessor.run(root)
            # Compare against None explicitly: an ElementTree element with
            # no children is falsy, so a plain truth test would discard a
            # perfectly valid (but childless) replacement root.
            if newRoot is not None:
                root = newRoot

        # Serialize _properly_.  Strip top-level tags.
        xml, length = codecs.utf_8_decode(etree.tostring(root, encoding="utf8"))
        if self.stripTopLevelTags:
            # Drops a fixed 44-character prefix and 7-character suffix -
            # presumably the XML declaration plus the opening root tag, and
            # the closing root tag. NOTE(review): confirm against the exact
            # serializer output.
            xml = xml.strip()[44:-7] + "\n"

        # Run the text post-processors
        for pp in self.postprocessors.values():
            xml = pp.run(xml)

        return xml.strip()

    def convertFile(self, input = None, output = None, encoding = None):
        """Converts a markdown file and writes the resulting HTML.

        Decodes the file using the provided encoding (defaults to utf-8),
        passes the file content to markdown, and outputs the html to either
        the provided stream or the file with provided name, using the same
        encoding as the source file.

        **Note:** This is the only place that decoding and encoding of unicode
        takes place in Python-Markdown.  (All other code is unicode-in /
        unicode-out.)

        Keyword arguments:

        * input: Name of source text file.
        * output: Name of output file, or a stream object with a write()
          method. Writes to stdout if `None`.
        * encoding: Encoding of input and output files. Defaults to utf-8.

        Returns None.
        """
        encoding = encoding or "utf-8"

        # Read the source; close the handle even if decoding fails.
        input_file = codecs.open(input, mode="r", encoding=encoding)
        try:
            text = input_file.read()
        finally:
            input_file.close()
        text = text.lstrip(u'\ufeff')  # remove the byte-order mark

        # Convert
        html = self.convert(text)

        # Write to file or stream
        if isinstance(output, basestring):
            # Both byte-string and unicode file names are accepted.
            output_file = codecs.open(output, "w", encoding=encoding)
            try:
                output_file.write(html)
            finally:
                output_file.close()
        else:
            if output is None:
                # Honour the documented default of writing to stdout.
                import sys
                output = sys.stdout
            output.write(html.encode(encoding))
"""
Extensions
-----------------------------------------------------------------------------
"""
class Extension:
    """ Base class for extensions to subclass.

    Settings live in `self.config`, a dict whose values are indexable:
    item 0 is the setting's current value (read by getConfig, written by
    setConfig) and item 1 is what getConfigInfo reports (presumably a
    human-readable description - confirm against concrete extensions).
    """

    def __init__(self, configs=None):
        """Create an instance of an Extension.

        Keyword arguments:

        * configs: A dict of configuration setting used by an Extension.
          When omitted, each instance gets its own fresh empty dict, so
          setConfig() on one instance cannot leak into another.
        """
        if configs is None:
            configs = {}
        self.config = configs

    def getConfig(self, key):
        """ Return a setting for the given key or an empty string. """
        if key in self.config:
            return self.config[key][0]
        else:
            return ""

    def getConfigInfo(self):
        """ Return all config settings as a list of tuples. """
        return [(key, self.config[key][1]) for key in self.config]

    def setConfig(self, key, value):
        """ Set a config setting for `key` with the given `value`.

        Raises KeyError if `key` is not an existing setting.
        """
        self.config[key][0] = value

    def extendMarkdown(self, md, md_globals):
        """
        Add the various processors and patterns to the Markdown Instance.

        This method must be overridden by every extension.

        Keyword arguments:

        * md: The Markdown instance.

        * md_globals: Global variables in the markdown module namespace.

        """
        pass
def load_extension(ext_name, configs=()):
    """Load extension by name, then return the extension it creates.

    The extension name may contain arguments as part of the string in the
    following format: "extname(key1=value1,key2=value2)"

    Keyword arguments:

    * ext_name: Extension name, optionally followed by "(key=value,...)".
    * configs: Config settings as anything dict() accepts (e.g. a list of
      (key, value) pairs). Settings given in `ext_name` override these.

    The module is looked up first as "markdown.extensions.<name>" and then
    as "mdx_<name>".

    Returns whatever the module's makeExtension() returns, or None (after
    logging a CRITICAL message) if the module cannot be imported or the
    extension cannot be instantiated.
    """

    # Parse extensions config params (ignore the order)
    configs = dict(configs)
    pos = ext_name.find("(") # find the first "("
    if pos > 0:
        ext_args = ext_name[pos+1:-1]
        ext_name = ext_name[:pos]
        for pair in ext_args.split(","):
            if "=" not in pair:
                # Empty argument lists ("ext()") are fine; anything else
                # without a "=" is reported and ignored rather than
                # crashing the whole load.
                if pair.strip():
                    message(ERROR, "Ignoring malformed argument '%s' for "
                                   "extension '%s'" % (pair.strip(), ext_name))
                continue
            # Split on the first "=" only so values may contain "=".
            key, value = pair.split("=", 1)
            configs[key.strip()] = value.strip()

    # Setup the module names
    ext_module = 'markdown.extensions'
    module_name_new_style = '.'.join([ext_module, ext_name])
    module_name_old_style = '_'.join(['mdx', ext_name])

    # Try loading the extension first from one place, then another
    try: # New style (markdown.extensions.<name>)
        module = __import__(module_name_new_style, {}, {}, [ext_module])
    except ImportError:
        try: # Old style (mdx_<name>)
            module = __import__(module_name_old_style)
        except ImportError:
            message(CRITICAL, "Failed loading extension '%s' from '%s' or '%s'"
                % (ext_name, module_name_new_style, module_name_old_style))
            # Nothing was imported, so there is nothing to instantiate.
            return None

    # If the module is loaded successfully, we expect it to define a
    # function called makeExtension()
    try:
        return module.makeExtension(configs.items())
    except Exception:
        # Deliberately broad (a missing makeExtension or any failure inside
        # it), but no longer swallows KeyboardInterrupt / SystemExit.
        message(CRITICAL, "Failed to instantiate extension '%s'" % ext_name)
        return None
def load_extensions(ext_names):
    """Loads multiple extensions.

    * ext_names: An iterable of extension names (each may carry config
      args, see load_extension) and/or ready-made extension objects.

    Objects that already provide `extendMarkdown` are passed through
    unchanged instead of being treated as names. Names that fail to load
    are left out of the result.

    Returns a list of extensions.
    """
    extensions = []
    for ext_name in ext_names:
        if hasattr(ext_name, 'extendMarkdown'):
            # Already an extension object; there is nothing to load.
            extensions.append(ext_name)
            continue
        extension = load_extension(ext_name)
        if extension:
            extensions.append(extension)

    return extensions
"""
EXPORTED FUNCTIONS
=============================================================================
Those are the two functions we really mean to export: markdown() and
markdownFromFile().
"""
def markdown(text,
             extensions = [],
             safe_mode = False):
    """Convert a markdown string to HTML and return HTML as a unicode string.

    Convenience wrapper around the `Markdown` class for the simplest use
    case: the extensions are loaded with load_extensions(), a `Markdown`
    instance is built from them, and `text` is run through its convert().

    Keyword arguments:

    * text: Markdown formatted text as Unicode or ASCII string.
    * extensions: A list of extensions or extension names (may contain config args).
    * safe_mode: Disallow raw html.  One of "remove", "replace" or "escape".

    Returns: An HTML document as a string.

    """
    loaded = load_extensions(extensions)
    converter = Markdown(extensions=loaded, safe_mode=safe_mode)
    html = converter.convert(text)
    return html
def markdownFromFile(input = None,
                     output = None,
                     extensions = [],
                     encoding = None,
                     safe = False):
    """Read markdown code from a file and write it to a file or a stream.

    Loads `extensions` with load_extensions(), builds a `Markdown` instance
    with `safe` as its safe_mode, and hands `input`, `output` and
    `encoding` to its convertFile(). Returns None.
    """
    loaded = load_extensions(extensions)
    converter = Markdown(extensions=loaded, safe_mode=safe)
    converter.convertFile(input, output, encoding)