"""
Python Markdown
===============

Python Markdown converts Markdown to HTML and can be used as a library or
called from the command line.

## Basic usage as a module:

    import markdown
    md = markdown.Markdown()
    html = md.convert(your_text_string)

## Basic use from the command line:

    python markdown.py source.txt > destination.html

Run "python markdown.py --help" to see more options.

## Extensions

See the project documentation for more information and instructions on how
to extend the functionality of Python Markdown.  Read that before you try
modifying this file.

## Authors and License

Started by [Manfred Stienstra](http://www.dwerg.net/).  Continued and
maintained by [Yuri Takhteyev](http://www.freewisdom.org), [Waylan
Limberg](http://achinghead.com/) and [Artem Yunusov](http://blog.splyer.com).

Contact: markdown@freewisdom.org

Copyright 2007, 2008 The Python Markdown Project (v. 1.7 and later)
Copyright 200? Django Software Foundation (OrderedDict implementation)
Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
Copyright 2004 Manfred Stienstra (the original version)

License: BSD (see docs/LICENSE for details).
"""

version = "2.0-beta-2"
version_info = (2,0,0, "beta-2")

import re
import sys
import codecs
import logging
from logging import DEBUG, INFO, WARN, ERROR, CRITICAL


# CONSTANTS
# =============================================================================

# Constants you might want to modify
# -----------------------------------------------------------------------------

# default logging level for command-line use
COMMAND_LINE_LOGGING_LEVEL = CRITICAL
TAB_LENGTH = 4               # expand tabs to this many spaces
ENABLE_ATTRIBUTES = True     # @id = xyz -> <... id="xyz">
SMART_EMPHASIS = True        # this_or_that does not become this<i>or</i>that
HTML_REMOVED_TEXT = "[HTML_REMOVED]" # text used instead of HTML in safe mode
# NOTE(review): isBlockLevel() applies this pattern with re.match(), which is
# anchored only at the start of the tag, so any tag that merely *begins* with
# one of these alternatives (e.g. "link" via "li") is reported as block level.
# Left unchanged because other modules may rely on it -- confirm before
# anchoring the pattern.
BLOCK_LEVEL_ELEMENTS = re.compile("p|div|h[1-6]|blockquote|pre|table|dl|ol|ul"
                                  +"|script|noscript|form|fieldset|iframe|math"
                                  +"|ins|del|hr|hr/|style|li|dt|dd|tr")

# Placeholders
STX = u'\u0002'  # Use STX ("Start of text") for start-of-placeholder
ETX = u'\u0003'  # Use ETX ("End of text") for end-of-placeholder
INLINE_PLACEHOLDER_PREFIX = STX+"klzzwxh:"
INLINE_PLACEHOLDER = INLINE_PLACEHOLDER_PREFIX + "%s" + ETX
AMP_SUBSTITUTE = STX+"amp"+ETX

# NOTE(review): these project modules are imported only after the constants
# above have been defined; presumably they import names from this module
# (circular import) -- confirm before moving them to the top of the file.
import linepreprocessors, blockprocessors, treeprocessors, inlinepatterns
import postprocessors
import blockparser
import etree_loader
import odict

# Constants you probably do not need to change
# -----------------------------------------------------------------------------

RTL_BIDI_RANGES = ( (u'\u0590', u'\u07FF'),
                     # Hebrew (0590-05FF), Arabic (0600-06FF),
                     # Syriac (0700-074F), Arabic supplement (0750-077F),
                     # Thaana (0780-07BF), Nko (07C0-07FF).
                    (u'\u2D30', u'\u2D7F'), # Tifinagh
                    )


# AUXILIARY GLOBAL FUNCTIONS
# =============================================================================

def message(level, text):
    """ A wrapper method for logging debug messages.

    Keyword arguments:

    * level: One of the `logging` level constants (DEBUG ... CRITICAL).
    * text: The message to send to the 'MARKDOWN' logger.

    """
    logging.getLogger('MARKDOWN').log(level, text)


def isBlockLevel(tag):
    """Check if the tag is a block level HTML tag.

    Returns the regular expression match object (truthy) when `tag` starts
    with one of the names in BLOCK_LEVEL_ELEMENTS, otherwise `None`.

    """
    return BLOCK_LEVEL_ELEMENTS.match(tag)


# OVERALL DESIGN
# =============================================================================
#
# Markdown processing takes place in five steps:
#
# 1. A bunch of "preprocessors" munge the input text.
# 2. BlockParser() parses the high-level structural elements of the
#    pre-processed text into an ElementTree.
# 3. A bunch of "treeprocessors" are run against the ElementTree. One such
#    treeprocessor runs InlinePatterns against the ElementTree, detecting
#    inline markup.
# 4. Some post-processors are run against the text after the ElementTree has
#    been serialized into text.
# 5. The output is written to a string.
#
# Those steps are put together by the Markdown() class.
#
# The code below is organized as follows:
#
# 1. BlockParser and its BlockProcessors - does core block parsing.
# 2. All the preprocessors, patterns, treeprocessors, and postprocessors.
# 3. Markdown class - does the high-level wrapping.


# POST-PROCESSORS
# =============================================================================
#
# Markdown also allows post-processors, which are similar to preprocessors in
# that they need to implement a "run" method. However, they are run after core
# processing.
#
# There are two types of post-processors: Treeprocessor and Postprocessor


# MISC AUXILIARY CLASSES
# =============================================================================

class AtomicString(unicode):
    """A string which should not be further processed."""
    pass


# Markdown
# =============================================================================

class Markdown:
    """Convert Markdown to HTML."""

    def __init__(self, extensions=None, extension_configs=None,
                 safe_mode = False):
        """
        Creates a new Markdown instance.

        Keyword arguments:

        * extensions: A list of extensions.
           If they are of type string, the module mdx_name.py will be loaded.
           If they are a subclass of markdown.Extension, they will be used
           as-is.  Defaults to no extensions.
        * extension-configs: Configuration setting for extensions.
           Defaults to no configuration.
        * safe_mode: Disallow raw html. One of "remove", "replace" or "escape".

        """
        # None stands in for "empty" so that no mutable default object is
        # shared between calls.
        if extensions is None:
            extensions = []
        if extension_configs is None:
            extension_configs = {}

        self.safeMode = safe_mode
        self.registeredExtensions = []
        self.docType = ""
        self.stripTopLevelTags = True

        # Preprocessors
        self.preprocessors = odict.OrderedDict()
        self.preprocessors["html_block"] = linepreprocessors.HtmlBlockPreprocessor(self)
        self.preprocessors["reference"] = linepreprocessors.ReferencePreprocessor(self)
        # footnote preprocessor will be inserted with "amp_substitute"

        # NOTE(review): convert() reads self.parser, self.treeprocessors and
        # self.postprocessors, none of which is assigned in this method.
        # Presumably they are installed elsewhere (e.g. by extensions) --
        # confirm, otherwise convert() raises AttributeError.

        self.references = {}
        self.htmlStash = linepreprocessors.HtmlStash()
        self.registerExtensions(extensions = extensions,
                                configs = extension_configs)
        self.reset()

    def registerExtensions(self, extensions, configs):
        """
        Register extensions with this instance of Markdown.

        Keyword arguments:

        * extensions: A list of extensions, which can either
           be strings or objects.  See the docstring on Markdown.
        * configs: A dictionary mapping module names to config options.

        Entries that cannot be loaded or that are of the wrong type are
        reported through message() and skipped.

        """
        for ext in extensions:
            if isinstance(ext, basestring):
                ext = load_extension(ext, configs.get(ext, []))
                if ext is None:
                    # load_extension() has already logged the failure; there
                    # is nothing to call extendMarkdown() on.
                    continue
            elif hasattr(ext, 'extendMarkdown'):
                # Looks like an Extension.
                # Nothing to do here.
                pass
            else:
                message(ERROR, "Incorrect type! Extension '%s' is "
                               "neither a string or an Extension." %(repr(ext)))
                continue
            ext.extendMarkdown(self, globals())

    def registerExtension(self, extension):
        """ This gets called by the extension """
        self.registeredExtensions.append(extension)

    def reset(self):
        """
        Resets all state variables so that we can start with a new text.

        Clears the HTML stash and the reference table, then calls reset() on
        every extension recorded through registerExtension().

        """
        self.htmlStash.reset()
        self.references.clear()

        for extension in self.registeredExtensions:
            extension.reset()

    def convert(self, source):
        """
        Convert markdown to serialized XHTML.

        Keyword arguments:

        * source: Markdown text as a unicode or ASCII string.

        Returns the serialized result as a unicode string with surrounding
        whitespace stripped.  Returns an empty unicode string for empty input
        or for input that cannot be decoded as ASCII.

        """
        # Fixup the source text
        if not source:
            return u""  # a blank unicode string
        try:
            source = unicode(source)
        except UnicodeDecodeError:
            message(CRITICAL, 'UnicodeDecodeError: Markdown only accepts unicode or ascii input.')
            return u""

        # STX/ETX delimit internal placeholders, so they must not survive in
        # user input.
        source = source.replace(STX, "").replace(ETX, "")
        source = source.replace("\r\n", "\n").replace("\r", "\n") + "\n\n"
        source = re.sub(r'\n\s+\n', '\n\n', source)
        source = source.expandtabs(TAB_LENGTH)

        # Split into lines and run the line preprocessors.
        self.lines = source.split("\n")
        for prep in self.preprocessors.values():
            self.lines = prep.run(self.lines)

        # Parse the high-level elements.
        root = self.parser.parseDocument(self.lines).getroot()

        # Run the tree-processors
        for treeprocessor in self.treeprocessors.values():
            newRoot = treeprocessor.run(root)
            # Compare against None explicitly: an Element without children is
            # falsy, so a truthiness test would discard a valid new root.
            if newRoot is not None:
                root = newRoot

        # Serialize _properly_.  Strip top-level tags.
        xml, length = codecs.utf_8_decode(etree.tostring(root, encoding="utf8"))
        if self.stripTopLevelTags:
            # NOTE(review): fixed offsets; presumably they drop the XML
            # declaration plus the opening root tag (44 chars) and the closing
            # root tag (7 chars) -- confirm against the serializer output.
            xml = xml.strip()[44:-7] + "\n"

        # Run the text post-processors
        for pp in self.postprocessors.values():
            xml = pp.run(xml)

        return xml.strip()

    def convertFile(self, input = None, output = None, encoding = None):
        """Converts a markdown file and writes the resulting HTML.

        Decodes the file using the provided encoding (defaults to utf-8),
        passes the file content to markdown, and outputs the html to either
        the provided stream or the file with provided name, using the same
        encoding as the source file.

        **Note:** This is the only place that decoding and encoding of unicode
        takes place in Python-Markdown.  (All other code is unicode-in /
        unicode-out.)

        Keyword arguments:

        * input: Name of source text file.
        * output: Name of output file, or a stream with a write() method.
          Writes to stdout if `None`.
        * encoding: Encoding of input and output files. Defaults to utf-8.

        """
        encoding = encoding or "utf-8"

        # Read the source
        input_file = codecs.open(input, mode="r", encoding=encoding)
        try:
            text = input_file.read()
        finally:
            # Close the handle even when reading/decoding fails.
            input_file.close()
        text = text.lstrip(u'\ufeff') # remove the byte-order mark

        # Convert
        html = self.convert(text)

        # Write to file or stdout
        if isinstance(output, basestring):
            output_file = codecs.open(output, "w", encoding=encoding)
            try:
                output_file.write(html)
            finally:
                output_file.close()
        else:
            if output is None:
                output = sys.stdout
            output.write(html.encode(encoding))


# Extensions
# -----------------------------------------------------------------------------

class Extension:
    """ Base class for extensions to subclass. """

    def __init__(self, configs=None):
        """Create an instance of an Extension.

        Keyword arguments:

        * configs: A dict of configuration setting used by an Extension.
          Each value is indexed as `[0]` (the setting) and `[1]` (its
          description) by the accessors below.  Defaults to a new empty dict.

        """
        # A fresh dict per instance: setConfig() mutates self.config, so a
        # shared default object would leak settings between instances.
        if configs is None:
            configs = {}
        self.config = configs

    def getConfig(self, key):
        """ Return a setting for the given key or an empty string. """
        if key in self.config:
            return self.config[key][0]
        return ""

    def getConfigInfo(self):
        """ Return all config settings as a list of tuples. """
        return [(key, self.config[key][1]) for key in self.config]

    def setConfig(self, key, value):
        """ Set a config setting for `key` with the given `value`. """
        self.config[key][0] = value

    def extendMarkdown(self, md, md_globals):
        """
        Add the various processors and patterns to the Markdown Instance.

        This method must be overridden by every extension.

        Keyword arguments:

        * md: The Markdown instance.

        * md_globals: Global variables in the markdown module namespace.

        """
        pass


def load_extension(ext_name, configs=None):
    """Load extension by name, then return the extension instance.

    The extension name may contain arguments as part of the string in the
    following format: "extname(key1=value1,key2=value2)"

    Keyword arguments:

    * ext_name: Name of the extension, optionally followed by arguments.
    * configs: A sequence of (key, value) pairs (or a dict) of settings.
      Arguments embedded in `ext_name` override entries with the same key.

    Returns whatever the module's makeExtension() returns, or `None` if the
    module could not be imported or the extension could not be instantiated
    (both cases are reported through message()).

    """
    # Parse extensions config params (ignore the order)
    configs = dict(configs or [])
    pos = ext_name.find("(") # find the first "("
    if pos > 0:
        ext_args = ext_name[pos+1:-1]
        ext_name = ext_name[:pos]
        for arg in ext_args.split(","):
            if not arg.strip():
                # Tolerate "ext()" and stray commas.
                continue
            if "=" not in arg:
                message(ERROR, "Ignoring malformed argument '%s' for "
                               "extension '%s'" % (arg.strip(), ext_name))
                continue
            # Split once only, so values may themselves contain "=".
            key, value = arg.split("=", 1)
            configs[key.strip()] = value.strip()

    # Setup the module names
    ext_module = 'markdown_extensions'
    module_name_new_style = '.'.join([ext_module, ext_name])
    module_name_old_style = '_'.join(['mdx', ext_name])

    # Try loading the extension first from one place, then another
    try: # New style (markdown_extensions.<extension>)
        module = __import__(module_name_new_style, {}, {}, [ext_module])
    except ImportError:
        try: # Old style (mdx_<extension>)
            module = __import__(module_name_old_style)
        except ImportError:
            message(CRITICAL,
                    "Failed loading extension '%s' from '%s' or '%s'"
                    % (ext_name, module_name_new_style, module_name_old_style))
            # Nothing was imported, so there is nothing to instantiate.
            return None

    # If the module is loaded successfully, we expect it to define a
    # function called makeExtension()
    try:
        return module.makeExtension(configs.items())
    except Exception:
        message(CRITICAL, "Failed to instantiate extension '%s'" % ext_name)
        return None


def load_extensions(ext_names):
    """Loads multiple extensions.

    Keyword arguments:

    * ext_names: A list of extension names (may contain config args).  Items
      that are not strings are passed through unchanged so that ready-made
      extension objects can be mixed with names.

    Returns a list of extensions; names that failed to load are left out.

    """
    extensions = []
    for ext_name in ext_names:
        if isinstance(ext_name, basestring):
            extension = load_extension(ext_name)
        else:
            extension = ext_name
        if extension:
            extensions.append(extension)
    return extensions


# Extensions should use "markdown.etree" instead of "etree" (or do `from
# markdown import etree`).  Do not import it by yourself.

etree = etree_loader.importETree()


# EXPORTED FUNCTIONS
# =============================================================================
#
# Those are the two functions we really mean to export: markdown() and
# markdownFromFile().

def markdown(text,
             extensions=None,
             safe_mode = False):
    """Convert a markdown string to HTML and return HTML as a unicode string.

    This is a shortcut function for `Markdown` class to cover the most
    basic use case.  It initializes an instance of Markdown, loads the
    necessary extensions and runs the parser on the given text.

    Keyword arguments:

    * text: Markdown formatted text as Unicode or ASCII string.
    * extensions: A list of extensions or extension names (may contain config
      args).  Defaults to no extensions.
    * safe_mode: Disallow raw html.  One of "remove", "replace" or "escape".

    Returns: An HTML document as a string.

    """
    md = Markdown(extensions=load_extensions(extensions or []),
                  safe_mode = safe_mode)
    return md.convert(text)


def markdownFromFile(input = None,
                     output = None,
                     extensions=None,
                     encoding = None,
                     safe = False):
    """Read markdown code from a file and write it to a file or a stream.

    Keyword arguments:

    * input: Name of source text file.
    * output: Name of output file or a writable stream; stdout if `None`.
    * extensions: A list of extensions or extension names (may contain config
      args).  Defaults to no extensions.
    * encoding: Encoding of input and output files.  Defaults to utf-8.
    * safe: Passed to Markdown as `safe_mode`.

    """
    md = Markdown(extensions=load_extensions(extensions or []),
                  safe_mode = safe)
    md.convertFile(input, output, encoding)