diff options
author | Yuri Takhteyev <yuri@freewisdom.org> | 2008-11-17 00:17:15 -0800 |
---|---|---|
committer | Yuri Takhteyev <yuri@freewisdom.org> | 2008-11-17 00:17:15 -0800 |
commit | 159a274a977c496434dbc484a1b253663cde4eed (patch) | |
tree | 53c9a6d3c69cbb8be3e6b47ea0d35cab075f03c9 /markdown | |
parent | 3dfcbc8d7900aa0f07124c9d7598cb7ecc2ff41b (diff) | |
download | markdown-159a274a977c496434dbc484a1b253663cde4eed.tar.gz markdown-159a274a977c496434dbc484a1b253663cde4eed.tar.bz2 markdown-159a274a977c496434dbc484a1b253663cde4eed.zip |
Attempting a refactoring, breaking markdown into multiple files.
Diffstat (limited to 'markdown')
-rwxr-xr-x | markdown/__init__.py | 793 | ||||
-rw-r--r-- | markdown/blockparser.py | 105 | ||||
-rw-r--r-- | markdown/blockprocessors.py | 418 | ||||
-rw-r--r-- | markdown/commandline.py | 97 | ||||
-rw-r--r-- | markdown/inlinepatterns.py | 367 | ||||
-rw-r--r-- | markdown/linepreprocessors.py | 177 | ||||
-rw-r--r-- | markdown/treeprocessors.py | 323 |
7 files changed, 2280 insertions, 0 deletions
diff --git a/markdown/__init__.py b/markdown/__init__.py new file mode 100755 index 0000000..742460e --- /dev/null +++ b/markdown/__init__.py @@ -0,0 +1,793 @@ +""" +Python Markdown +=============== + +Python Markdown converts Markdown to HTML and can be used as a library or +called from the command line. + +## Basic usage as a module: + + import markdown + md = Markdown() + html = md.convert(your_text_string) + +## Basic use from the command line: + + python markdown.py source.txt > destination.html + +Run "python markdown.py --help" to see more options. + +## Extensions + +See <http://www.freewisdom.org/projects/python-markdown/> for more +information and instructions on how to extend the functionality of +Python Markdown. Read that before you try modifying this file. + +## Authors and License + +Started by [Manfred Stienstra](http://www.dwerg.net/). Continued and +maintained by [Yuri Takhteyev](http://www.freewisdom.org), [Waylan +Limberg](http://achinghead.com/) and [Artem Yunusov](http://blog.splyer.com). + +Contact: markdown@freewisdom.org + +Copyright 2007, 2008 The Python Markdown Project (v. 1.7 and later) +Copyright 200? Django Software Foundation (OrderedDict implementation) +Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b) +Copyright 2004 Manfred Stienstra (the original version) + +License: BSD (see docs/LICENSE for details). +""" + +version = "2.0-alpha" +version_info = (2,0,0, "beta") + +import re +#import sys +import codecs +import logging +from logging import DEBUG, INFO, WARN, ERROR, CRITICAL + + +""" +CONSTANTS +============================================================================= +""" + +""" +Constants you might want to modify +----------------------------------------------------------------------------- +""" + +# default logging level for command-line use +COMMAND_LINE_LOGGING_LEVEL = CRITICAL +TAB_LENGTH = 4 # expand tabs to this many spaces +ENABLE_ATTRIBUTES = True # @id = xyz -> <... id="xyz"> +SMART_EMPHASIS = True # this_or_that does not become this<i>or</i>that +HTML_REMOVED_TEXT = "[HTML_REMOVED]" # text used instead of HTML in safe mode +BLOCK_LEVEL_ELEMENTS = re.compile("p|div|h[1-6]|blockquote|pre|table|dl|ol|ul" + +"|script|noscript|form|fieldset|iframe|math" + +"|ins|del|hr|hr/|style|li|dt|dd|tr") + +import linepreprocessors, blockprocessors, treeprocessors, inlinepatterns, blockparser + +""" +Constants you probably do not need to change +----------------------------------------------------------------------------- +""" + +RTL_BIDI_RANGES = ( (u'\u0590', u'\u07FF'), + # Hebrew (0590-05FF), Arabic (0600-06FF), + # Syriac (0700-074F), Arabic supplement (0750-077F), + # Thaana (0780-07BF), Nko (07C0-07FF). + (u'\u2D30', u'\u2D7F'), # Tifinagh + ) + +# Placeholders +STX = u'\u0002' # Use STX ("Start of text") for start-of-placeholder +ETX = u'\u0003' # Use ETX ("End of text") for end-of-placeholder +HTML_PLACEHOLDER_PREFIX = STX+"wzxhzdk:" +HTML_PLACEHOLDER = HTML_PLACEHOLDER_PREFIX + "%d" + ETX +INLINE_PLACEHOLDER_PREFIX = STX+"klzzwxh:" +INLINE_PLACEHOLDER = INLINE_PLACEHOLDER_PREFIX + "%s" + ETX +AMP_SUBSTITUTE = STX+"amp"+ETX + + +""" +AUXILIARY GLOBAL FUNCTIONS +============================================================================= +""" + +def message(level, text): + """ A wrapper method for logging debug messages. """ + logging.getLogger('MARKDOWN').log(level, text) + +## Import +def importETree(): + """Import the best implementation of ElementTree, return a module object.""" + etree_in_c = None + try: # Is it Python 2.5+ with C implemenation of ElementTree installed? + import xml.etree.cElementTree as etree_in_c + except ImportError: + try: # Is it Python 2.5+ with Python implementation of ElementTree? + import xml.etree.ElementTree as etree + except ImportError: + try: # An earlier version of Python with cElementTree installed? + import cElementTree as etree_in_c + except ImportError: + try: # An earlier version of Python with Python ElementTree? + import elementtree.ElementTree as etree + except ImportError: + message(CRITICAL, "Failed to import ElementTree") + sys.exit(1) + if etree_in_c and etree_in_c.VERSION < "1.0": + message(CRITICAL, "For cElementTree version 1.0 or higher is required.") + sys.exit(1) + elif etree_in_c : + return etree_in_c + elif etree.VERSION < "1.1": + message(CRITICAL, "For ElementTree version 1.1 or higher is required") + sys.exit(1) + else : + return etree + +def isBlockLevel(tag): + """Check if the tag is a block level HTML tag.""" + return BLOCK_LEVEL_ELEMENTS.match(tag) + + +""" +OVERALL DESIGN +============================================================================= + +Markdown processing takes place in four steps: + +1. A bunch of "preprocessors" munge the input text. +2. BlockParser() parses the high-level structural elements of the + pre-processed text into an ElementTree. +3. A bunch of "treeprocessors" are run against the ElementTree. One such + treeprocessor runs InlinePatterns against the ElementTree, detecting inline + markup. +4. Some post-processors are run against the text after the ElementTree has + been serialized into text. +5. The output is written to a string. + +Those steps are put together by the Markdown() class. + +The code below is organized as follows: + +1. BlockParser and it's BlockProcessors - does core block parsing. +2. All the preprocessors, patterns, treeprocessors, and postprocessors. +3. Markdown class - does the high-level wrapping. +""" + + + + + +""" +POST-PROCESSORS +============================================================================= + +Markdown also allows post-processors, which are similar to preprocessors in +that they need to implement a "run" method. However, they are run after core +processing. + +There are two types of post-processors: Treeprocessor and Postprocessor +""" + +class Processor: + def __init__(self, markdown_instance=None): + if markdown_instance: + self.markdown = markdown_instance + + +class Postprocessor(Processor): + """ + Postprocessors are run after the ElementTree it converted back into text. + + Each Postprocessor implements a "run" method that takes a pointer to a + text string, modifies it as necessary and returns a text string. + + Postprocessors must extend markdown.Postprocessor. + + """ + + def run(self, text): + """ + Subclasses of Postprocessor should implement a `run` method, which + takes the html document as a single text string and returns a + (possibly modified) string. + + """ + pass + + + +class RawHtmlPostprocessor(Postprocessor): + """ Restore raw html to the document. """ + + def run(self, text): + """ Iterate over html stash and restore "safe" html. """ + for i in range(self.markdown.htmlStash.html_counter): + html, safe = self.markdown.htmlStash.rawHtmlBlocks[i] + if self.markdown.safeMode and not safe: + if str(self.markdown.safeMode).lower() == 'escape': + html = self.escape(html) + elif str(self.markdown.safeMode).lower() == 'remove': + html = '' + else: + html = HTML_REMOVED_TEXT + if safe or not self.markdown.safeMode: + text = text.replace("<p>%s</p>" % (HTML_PLACEHOLDER % i), + html + "\n") + text = text.replace(HTML_PLACEHOLDER % i, html) + return text + + def escape(self, html): + """ Basic html escaping """ + html = html.replace('&', '&') + html = html.replace('<', '<') + html = html.replace('>', '>') + return html.replace('"', '"') + + +class AndSubstitutePostprocessor(Postprocessor): + """ Restore valid entities """ + def __init__(self): + pass + + def run(self, text): + text = text.replace(AMP_SUBSTITUTE, "&") + return text + + +""" +MISC AUXILIARY CLASSES +============================================================================= +""" + +class AtomicString(unicode): + """A string which should not be further processed.""" + pass + + +class HtmlStash: + """ + This class is used for stashing HTML objects that we extract + in the beginning and replace with place-holders. + """ + + def __init__ (self): + """ Create a HtmlStash. """ + self.html_counter = 0 # for counting inline html segments + self.rawHtmlBlocks=[] + + def store(self, html, safe=False): + """ + Saves an HTML segment for later reinsertion. Returns a + placeholder string that needs to be inserted into the + document. + + Keyword arguments: + + * html: an html segment + * safe: label an html segment as safe for safemode + + Returns : a placeholder string + + """ + self.rawHtmlBlocks.append((html, safe)) + placeholder = HTML_PLACEHOLDER % self.html_counter + self.html_counter += 1 + return placeholder + + def reset(self): + self.html_counter = 0 + self.rawHtmlBlocks = [] + +class OrderedDict(dict): + """ + A dictionary that keeps its keys in the order in which they're inserted. + + Copied from Django's SortedDict with some modifications. + + """ + def __new__(cls, *args, **kwargs): + instance = super(OrderedDict, cls).__new__(cls, *args, **kwargs) + instance.keyOrder = [] + return instance + + def __init__(self, data=None): + if data is None: + data = {} + super(OrderedDict, self).__init__(data) + if isinstance(data, dict): + self.keyOrder = data.keys() + else: + self.keyOrder = [] + for key, value in data: + if key not in self.keyOrder: + self.keyOrder.append(key) + + def __deepcopy__(self, memo): + from copy import deepcopy + return self.__class__([(key, deepcopy(value, memo)) + for key, value in self.iteritems()]) + + def __setitem__(self, key, value): + super(OrderedDict, self).__setitem__(key, value) + if key not in self.keyOrder: + self.keyOrder.append(key) + + def __delitem__(self, key): + super(OrderedDict, self).__delitem__(key) + self.keyOrder.remove(key) + + def __iter__(self): + for k in self.keyOrder: + yield k + + def pop(self, k, *args): + result = super(OrderedDict, self).pop(k, *args) + try: + self.keyOrder.remove(k) + except ValueError: + # Key wasn't in the dictionary in the first place. No problem. + pass + return result + + def popitem(self): + result = super(OrderedDict, self).popitem() + self.keyOrder.remove(result[0]) + return result + + def items(self): + return zip(self.keyOrder, self.values()) + + def iteritems(self): + for key in self.keyOrder: + yield key, super(OrderedDict, self).__getitem__(key) + + def keys(self): + return self.keyOrder[:] + + def iterkeys(self): + return iter(self.keyOrder) + + def values(self): + return [super(OrderedDict, self).__getitem__(k) for k in self.keyOrder] + + def itervalues(self): + for key in self.keyOrder: + yield super(OrderedDict, self).__getitem__(key) + + def update(self, dict_): + for k, v in dict_.items(): + self.__setitem__(k, v) + + def setdefault(self, key, default): + if key not in self.keyOrder: + self.keyOrder.append(key) + return super(OrderedDict, self).setdefault(key, default) + + def value_for_index(self, index): + """Return the value of the item at the given zero-based index.""" + return self[self.keyOrder[index]] + + def insert(self, index, key, value): + """Insert the key, value pair before the item with the given index.""" + if key in self.keyOrder: + n = self.keyOrder.index(key) + del self.keyOrder[n] + if n < index: + index -= 1 + self.keyOrder.insert(index, key) + super(OrderedDict, self).__setitem__(key, value) + + def copy(self): + """Return a copy of this object.""" + # This way of initializing the copy means it works for subclasses, too. + obj = self.__class__(self) + obj.keyOrder = self.keyOrder[:] + return obj + + def __repr__(self): + """ + Replace the normal dict.__repr__ with a version that returns the keys + in their sorted order. + """ + return '{%s}' % ', '.join(['%r: %r' % (k, v) for k, v in self.items()]) + + def clear(self): + super(OrderedDict, self).clear() + self.keyOrder = [] + + def index(self, key): + """ Return the index of a given key. """ + return self.keyOrder.index(key) + + def index_for_location(self, location): + """ Return index or None for a given location. """ + if location == '_begin': + i = 0 + elif location == '_end': + i = None + elif location.startswith('<') or location.startswith('>'): + i = self.index(location[1:]) + if location.startswith('>'): + if i >= len(self): + # last item + i = None + else: + i += 1 + else: + raise ValueError('Not a valid location: "%s". Location key ' + 'must start with a ">" or "<".' % location) + return i + + def add(self, key, value, location): + """ Insert by key location. """ + i = self.index_for_location(location) + if i is not None: + self.insert(i, key, value) + else: + self.__setitem__(key, value) + + def link(self, key, location): + """ Change location of an existing item. """ + n = self.keyOrder.index(key) + del self.keyOrder[n] + i = self.index_for_location(location) + try: + if i is not None: + self.keyOrder.insert(i, key) + else: + self.keyOrder.append(key) + except Error: + # restore to prevent data loss and reraise + self.keyOrder.insert(n, key) + raise Error + + +""" +Markdown +============================================================================= +""" + +class Markdown: + """Convert Markdown to HTML.""" + + def __init__(self, + extensions=[], + extension_configs={}, + safe_mode = False): + """ + Creates a new Markdown instance. + + Keyword arguments: + + * extensions: A list of extensions. + If they are of type string, the module mdx_name.py will be loaded. + If they are a subclass of markdown.Extension, they will be used + as-is. + * extension-configs: Configuration setting for extensions. + * safe_mode: Disallow raw html. One of "remove", "replace" or "escape". + + """ + self.parser = blockparser.BlockParser() + self.safeMode = safe_mode + self.registeredExtensions = [] + self.docType = "" + self.stripTopLevelTags = True + + self.preprocessors = OrderedDict() + self.preprocessors["html_block"] = linepreprocessors.HtmlBlockPreprocessor(self) + self.preprocessors["reference"] = linepreprocessors.ReferencePreprocessor(self) + # footnote preprocessor will be inserted with "<reference" + + self.treeprocessors = OrderedDict() + self.treeprocessors["inline"] = treeprocessors.InlineProcessor(self) + self.treeprocessors["prettify"] = treeprocessors.PrettifyTreeprocessor(self) + + self.postprocessors = OrderedDict() + self.postprocessors["raw_html"] = RawHtmlPostprocessor(self) + self.postprocessors["amp_substitute"] = AndSubstitutePostprocessor() + # footnote postprocessor will be inserted with ">amp_substitute" + + self.prePatterns = [] + + self.inlinePatterns = OrderedDict() + self.inlinePatterns["backtick"] = inlinepatterns.BacktickPattern(inlinepatterns.BACKTICK_RE) + self.inlinePatterns["escape"] = inlinepatterns.SimpleTextPattern(inlinepatterns.ESCAPE_RE) + self.inlinePatterns["reference"] = inlinepatterns.ReferencePattern(inlinepatterns.REFERENCE_RE, self) + self.inlinePatterns["link"] = inlinepatterns.LinkPattern(inlinepatterns.LINK_RE, self) + self.inlinePatterns["image_link"] = inlinepatterns.ImagePattern(inlinepatterns.IMAGE_LINK_RE, self) + self.inlinePatterns["image_reference"] = \ + inlinepatterns.ImageReferencePattern(inlinepatterns.IMAGE_REFERENCE_RE, self) + self.inlinePatterns["autolink"] = inlinepatterns.AutolinkPattern(inlinepatterns.AUTOLINK_RE, self) + self.inlinePatterns["automail"] = inlinepatterns.AutomailPattern(inlinepatterns.AUTOMAIL_RE, self) + self.inlinePatterns["linebreak2"] = \ + inlinepatterns.SubstituteTagPattern(inlinepatterns.LINE_BREAK_2_RE, 'br') + self.inlinePatterns["linebreak"] = \ + inlinepatterns.SubstituteTagPattern(inlinepatterns.LINE_BREAK_RE, 'br') + self.inlinePatterns["html"] = inlinepatterns.HtmlPattern(inlinepatterns.HTML_RE, self) + self.inlinePatterns["entity"] = inlinepatterns.HtmlPattern(inlinepatterns.ENTITY_RE, self) + self.inlinePatterns["not_strong"] = inlinepatterns.SimpleTextPattern(inlinepatterns.NOT_STRONG_RE) + self.inlinePatterns["strong_em"] = \ + inlinepatterns.DoubleTagPattern(inlinepatterns.STRONG_EM_RE, 'strong,em') + self.inlinePatterns["strong"] = inlinepatterns.SimpleTagPattern(inlinepatterns.STRONG_RE, 'strong') + self.inlinePatterns["emphasis"] = inlinepatterns.SimpleTagPattern(inlinepatterns.EMPHASIS_RE, 'em') + self.inlinePatterns["emphasis2"] = \ + inlinepatterns.SimpleTagPattern(inlinepatterns.EMPHASIS_2_RE, 'em') + # The order of the handlers matters!!! + + self.references = {} + self.htmlStash = HtmlStash() + self.registerExtensions(extensions = extensions, + configs = extension_configs) + self.reset() + + def registerExtensions(self, extensions, configs): + """ + Register extensions with this instance of Markdown. + + Keyword aurguments: + + * extensions: A list of extensions, which can either + be strings or objects. See the docstring on Markdown. + * configs: A dictionary mapping module names to config options. + + """ + for ext in extensions: + if isinstance(ext, basestring): + ext = load_extension(ext, configs.get(ext, [])) + elif hasattr(ext, 'extendMarkdown'): + # Looks like an Extension. + # Nothing to do here. + pass + else: + message(ERROR, "Incorrect type! Extension '%s' is " + "neither a string or an Extension." %(repr(ext))) + continue + ext.extendMarkdown(self, globals()) + + def registerExtension(self, extension): + """ This gets called by the extension """ + self.registeredExtensions.append(extension) + + def reset(self): + """ + Resets all state variables so that we can start with a new text. + """ + self.htmlStash.reset() + self.references.clear() + + for extension in self.registeredExtensions: + extension.reset() + + def convert (self, source): + """Convert markdown to serialized XHTML.""" + + # Fixup the source text + if not source: + return u"" # a blank unicode string + try: + source = unicode(source) + except UnicodeDecodeError: + message(CRITICAL, 'UnicodeDecodeError: Markdown only accepts unicode or ascii input.') + return u"" + + source = source.replace(STX, "").replace(ETX, "") + source = source.replace("\r\n", "\n").replace("\r", "\n") + "\n\n" + source = re.sub(r'\n\s+\n', '\n\n', source) + source = source.expandtabs(TAB_LENGTH) + + # Split into lines and run the line preprocessors. + self.lines = source.split("\n") + for prep in self.preprocessors.values(): + self.lines = prep.run(self.lines) + + # Parse the high-level elements. + root = self.parser.parseDocument(self.lines).getroot() + + # Run the tree-processors + for treeprocessor in self.treeprocessors.values(): + newRoot = treeprocessor.run(root) + if newRoot: + root = newRoot + + # Serialize _properly_. Strip top-level tags. + xml, length = codecs.utf_8_decode(etree.tostring(root, encoding="utf8")) + if self.stripTopLevelTags: + xml = xml.strip()[44:-7] + "\n" + + # Run the text post-processors + for pp in self.postprocessors.values(): + xml = pp.run(xml) + + return xml.strip() + + def convertFile(self, input = None, output = None, encoding = None): + """Converts a markdown file and returns the HTML as a unicode string. + + Decodes the file using the provided encoding (defaults to utf-8), + passes the file content to markdown, and outputs the html to either + the provided stream or the file with provided name, using the same + encoding as the source file. + + **Note:** This is the only place that decoding and encoding of unicode + takes place in Python-Markdown. (All other code is unicode-in / + unicode-out.) + + Keyword arguments: + + * input: Name of source text file. + * output: Name of output file. Writes to stdout if `None`. + * extensions: A list of extension names (may contain config args). + * encoding: Encoding of input and output files. Defaults to utf-8. + * safe_mode: Disallow raw html. One of "remove", "replace" or "escape". + + """ + + encoding = encoding or "utf-8" + + # Read the source + input_file = codecs.open(input, mode="r", encoding=encoding) + text = input_file.read() + input_file.close() + text = text.lstrip(u'\ufeff') # remove the byte-order mark + + # Convert + html = self.convert(text) + + # Write to file or stdout + if type(output) == type("string"): + output_file = codecs.open(output, "w", encoding=encoding) + output_file.write(html) + output_file.close() + else: + output.write(html.encode(encoding)) + + +""" +Extensions +----------------------------------------------------------------------------- +""" + +class Extension: + """ Base class for extensions to subclass. """ + def __init__(self, configs = {}): + """Create an instance of an Extention. + + Keyword arguments: + + * configs: A dict of configuration setting used by an Extension. + """ + self.config = configs + + def getConfig(self, key): + """ Return a setting for the given key or an empty string. """ + if self.config.has_key(key): + return self.config[key][0] + else: + return "" + + def getConfigInfo(self): + """ Return all config settings as a list of tuples. """ + return [(key, self.config[key][1]) for key in self.config.keys()] + + def setConfig(self, key, value): + """ Set a config setting for `key` with the given `value`. """ + self.config[key][0] = value + + def extendMarkdown(self, md, md_globals): + """ + Add the various proccesors and patterns to the Markdown Instance. + + This method must be overriden by every extension. + + Keyword arguments: + + * md: The Markdown instance. + + * md_globals: Global variables in the markdown module namespace. + + """ + pass + +def load_extension(ext_name, configs = []): + """Load extension by name, then return the module. + + The extension name may contain arguments as part of the string in the + following format: "extname(key1=value1,key2=value2)" + + """ + + # Parse extensions config params (ignore the order) + configs = dict(configs) + pos = ext_name.find("(") # find the first "(" + if pos > 0: + ext_args = ext_name[pos+1:-1] + ext_name = ext_name[:pos] + pairs = [x.split("=") for x in ext_args.split(",")] + configs.update([(x.strip(), y.strip()) for (x, y) in pairs]) + + # Setup the module names + ext_module = 'markdown_extensions' + module_name_new_style = '.'.join([ext_module, ext_name]) + module_name_old_style = '_'.join(['mdx', ext_name]) + + # Try loading the extention first from one place, then another + try: # New style (markdown_extensons.<extension>) + module = __import__(module_name_new_style, {}, {}, [ext_module]) + except ImportError: + try: # Old style (mdx.<extension>) + module = __import__(module_name_old_style) + except ImportError: + message(CRITICAL, "Failed loading extension '%s' from '%s' or '%s'" + % (ext_name, module_name_new_style, module_name_old_style)) + + # If the module is loaded successfully, we expect it to define a + # function called makeExtension() + try: + return module.makeExtension(configs.items()) + except: + message(CRITICAL, "Failed to instantiate extension '%s'" % ext_name) + +def load_extensions(ext_names): + """Loads multiple extensions""" + extensions = [] + for ext_name in ext_names: + extension = load_extension(ext_name) + if extension: + extensions.append(extension) + return extensions + +# Extensions should use "markdown.etree" instead of "etree" (or do `from +# markdown import etree`). Do not import it by yourself. + +etree = importETree() + +""" +EXPORTED FUNCTIONS +============================================================================= + +Those are the two functions we really mean to export: markdown() and +markdownFromFile(). +""" + +def markdown(text, + extensions = [], + safe_mode = False): + """Convert a markdown string to HTML and return HTML as a unicode string. + + This is a shortcut function for `Markdown` class to cover the most + basic use case. It initializes an instance of Markdown, loads the + necessary extensions and runs the parser on the given text. + + Keyword arguments: + + * text: Markdown formatted text as Unicode or ASCII string. + * extensions: A list of extensions or extension names (may contain config args). + * safe_mode: Disallow raw html. One of "remove", "replace" or "escape". + + Returns: An HTML document as a string. + + """ + md = Markdown(extensions=load_extensions(extensions), + safe_mode = safe_mode) + return md.convert(text) + + +def markdownFromFile(input = None, + output = None, + extensions = [], + encoding = None, + safe = False): + """Read markdown code from a file and write it to a file or a stream.""" + md = Markdown(extensions=load_extensions(extensions), safe_mode = safe) + md.convertFile(input, output, encoding) + + + diff --git a/markdown/blockparser.py b/markdown/blockparser.py new file mode 100644 index 0000000..9e8c18f --- /dev/null +++ b/markdown/blockparser.py @@ -0,0 +1,105 @@ + +import markdown + +class State(list): + """ Track the current and nested state of the parser. + + This utility class is used to track the state of the BlockParser and + support multiple levels if nesting. It's just a simple API wrapped around + a list. Each time a state is set, that state is appended to the end of the + list. Each time a state is reset, that state is removed from the end of + the list. + + Therefore, each time a state is set for a nested block, that state must be + reset when we back out of that level of nesting or the state could be + corrupted. + + While all the methods of a list object are available, only the three + defined below need be used. + + """ + + def set(self, state): + """ Set a new state. """ + self.append(state) + + def reset(self): + """ Step back one step in nested state. """ + self.pop() + + def isstate(self, state): + """ Test that top (current) level is of given state. """ + if len(self): + return self[-1] == state + else: + return False + +class BlockParser: + """ Parse Markdown blocks into an ElementTree object. + + A wrapper class that stitches the various BlockProcessors together, + looping through them and creating an ElementTree object. + """ + + def __init__(self): + self.blockprocessors = markdown.OrderedDict() + self.blockprocessors['empty'] = markdown.blockprocessors.EmptyBlockProcessor(self) + self.blockprocessors['indent'] = markdown.blockprocessors.ListIndentProcessor(self) + self.blockprocessors['code'] = markdown.blockprocessors.CodeBlockProcessor(self) + self.blockprocessors['hashheader'] = markdown.blockprocessors.HashHeaderProcessor(self) + self.blockprocessors['setextheader'] = markdown.blockprocessors.SetextHeaderProcessor(self) + self.blockprocessors['hr'] = markdown.blockprocessors.HRProcessor(self) + self.blockprocessors['olist'] = markdown.blockprocessors.OListProcessor(self) + self.blockprocessors['ulist'] = markdown.blockprocessors.UListProcessor(self) + self.blockprocessors['quote'] = markdown.blockprocessors.BlockQuoteProcessor(self) + self.blockprocessors['paragraph'] = markdown.blockprocessors.ParagraphProcessor(self) + self.state = State() + + def parseDocument(self, lines): + """ Parse a markdown document into an ElementTree. + + Given a list of lines, an ElementTree object (not just a parent Element) + is created and the root element is passed to the parser as the parent. + The ElementTree object is returned. + + This should only be called on an entire document, not pieces. + + """ + # Create a ElementTree from the lines + root = markdown.etree.Element("div") + self.parseChunk(root, '\n'.join(lines)) + return markdown.etree.ElementTree(root) + + def parseChunk(self, parent, text): + """ Parse a chunk of markdown text and attach to given etree node. + + While the ``text`` argument is generally assumed to contain multiple + blocks which will be split on blank lines, it could contain only one + block. Generally, this method would be called by extensions when + block parsing is required. + + The ``parent`` etree Element passed in is altered in place. + Nothing is returned. + + """ + self.parseBlocks(parent, text.split('\n\n')) + + def parseBlocks(self, parent, blocks): + """ Process blocks of markdown text and attach to given etree node. + + Given a list of ``blocks``, each blockprocessor is stepped through + until there are no blocks left. While an extension could potentially + call this method directly, it's generally expected to be used internally. + + This is a public method as an extension may need to add/alter additional + BlockProcessors which call this method to recursively parse a nested + block. + + """ + while blocks: + for processor in self.blockprocessors.values(): + if processor.test(parent, blocks[0]): + processor.run(parent, blocks) + break + + diff --git a/markdown/blockprocessors.py b/markdown/blockprocessors.py new file mode 100644 index 0000000..b120d73 --- /dev/null +++ b/markdown/blockprocessors.py @@ -0,0 +1,418 @@ +""" +CORE MARKDOWN BLOCKPARSER +============================================================================= + +This parser handles basic parsing of Markdown blocks. It doesn't concern itself +with inline elements such as **bold** or *italics*, but rather just catches +blocks, lists, quotes, etc. + +The BlockParser is made up of a bunch of BlockProssors, each handling a +different type of block. Extensions may add/replace/remove BlockProcessors +as they need to alter how markdown blocks are parsed. + +""" + +import re +import markdown + +class BlockProcessor: + """ Base class for block processors. + + Each subclass will provide the methods below to work with the source and + tree. Each processor will need to define it's own ``test`` and ``run`` + methods. The ``test`` method should return True or False, to indicate + whether the current block should be processed by this processor. If the + test passes, the parser will call the processors ``run`` method. + + """ + + def __init__(self, parser=None): + self.parser = parser + + def lastChild(self, parent): + """ Return the last child of an etree element. """ + if len(parent): + return parent[-1] + else: + return None + + def detab(self, text): + """ Remove a tab from the front of each line of the given text. """ + newtext = [] + lines = text.split('\n') + for line in lines: + if line.startswith(' '*markdown.TAB_LENGTH): + newtext.append(line[markdown.TAB_LENGTH:]) + elif not line.strip(): + newtext.append('') + else: + break + return '\n'.join(newtext), '\n'.join(lines[len(newtext):]) + + def looseDetab(self, text): + """ Remove a tab from front of lines but allowing dedented lines. """ + lines = text.split('\n') + for i in range(len(lines)): + if lines[i].startswith(' '*markdown.TAB_LENGTH): + lines[i] = lines[i][markdown.TAB_LENGTH:] + return '\n'.join(lines) + + def test(self, parent, block): + """ Test for block type. Must be overridden by subclasses. + + As the parser loops through processors, it will call the ``test`` method + on each to determine if the given block of text is of that type. This + method must return a boolean ``True`` or ``False``. The actual method of + testing is left to the needs of that particular block type. It could + be as simple as ``block.startswith(some_string)`` or a complex regular + expression. As the block type may be different depending on the parent + of the block (i.e. inside a list), the parent etree element is also + provided and may be used as part of the test. + + Keywords: + + * ``parent``: A etree element which will be the parent of the block. + * ``block``: A block of text from the source which has been split at + blank lines. + """ + pass + + def run(self, parent, blocks): + """ Run processor. Must be overridden by subclasses. + + When the parser determines the appropriate type of a block, the parser + will call the corresponding processor's ``run`` method. This method + should parse the individual lines of the block and append them to + the etree. + + Note that both the ``parent`` and ``etree`` keywords are pointers + to instances of the objects which should be edited in place. Each + processor must make changes to the existing objects as there is no + mechanism to return new/different objects to replace them. + + This means that this method should be adding SubElements or adding text + to the parent, and should remove (``pop``) or add (``insert``) items to + the list of blocks. + + Keywords: + + * ``parent``: A etree element which is the parent of the current block. + * ``blocks``: A list of all remaining blocks of the document. + """ + pass + + +class ListIndentProcessor(BlockProcessor): + """ Process children of list items. + + Example: + * a list item + process this part + + or this part + + """ + + def test(self, parent, block): + return block.startswith(' '*markdown.TAB_LENGTH) and \ + (parent.tag == "li" or \ + (len(parent) and parent[-1] and \ + (parent[-1].tag == "ul" or parent[-1].tag == "ol") + ) + ) + + def run(self, parent, blocks): + block = self.looseDetab(blocks.pop(0)) + sibling = self.lastChild(parent) + if parent.tag == 'li': + # The parent is already a li. Just parse the child block. + self.parser.parseBlocks(parent, [block]) + elif len(sibling) and sibling[-1].tag == 'li': + # The parent is a list (``ol`` or ``ul``) which has children. + # Assume the last child li is the parent of this block. + if sibling[-1].text: + # If the parent li has text, that text needs to be moved to a p + block = '%s\n\n%s' % (sibling[-1].text, block) + sibling[-1].text = '' + self.parser.parseChunk(sibling[-1], block) + else: + # Create a new li and parse the block with it as the parent. + li = markdown.etree.SubElement(sibling, 'li') + self.parser.parseBlocks(li, [block]) + + +class CodeBlockProcessor(BlockProcessor): + """ Process code blocks. """ + + def test(self, parent, block): + return block.startswith(' '*markdown.TAB_LENGTH) + + def run(self, parent, blocks): + sibling = self.lastChild(parent) + block = blocks.pop(0) + theRest = '' + if sibling and sibling.tag == "pre" and len(sibling) \ + and sibling[0].tag == "code": + # The previous block was a code block. As blank lines do not start + # new code blocks, append this block to the previous, adding back + # linebreaks removed from the split into a list. + code = sibling[0] + block, theRest = self.detab(block) + code.text = markdown.AtomicString('%s\n%s\n' % (code.text, block.rstrip())) + else: + # This is a new codeblock. Create the elements and insert text. + pre = markdown.etree.SubElement(parent, 'pre') + code = markdown.etree.SubElement(pre, 'code') + block, theRest = self.detab(block) + code.text = markdown.AtomicString('%s\n' % block.rstrip()) + if theRest: + # This block contained unindented line(s) after the first indented + # line. Insert these lines as the first block of the master blocks + # list for future processing. + blocks.insert(0, theRest) + + +class BlockQuoteProcessor(BlockProcessor): + + RE = re.compile(r'(^|\n)[ ]{0,3}>[ ](.*)') + + def test(self, parent, block): + return bool(self.RE.search(block)) + + def run(self, parent, blocks): + block = blocks.pop(0) + m = self.RE.search(block) + if m: + before = block[:m.start()] # Lines before blockquote + # Pass lines before blockquote in recursively for parsing forst. + self.parser.parseBlocks(parent, [before]) + # Remove ``> `` from begining of each line. + block = '\n'.join([self.clean(line) for line in + block[m.start():].split('\n')]) + sibling = self.lastChild(parent) + if sibling and sibling.tag == "blockquote": + # Previous block was a blockquote so set that as this blocks parent + quote = sibling + else: + # This is a new blockquote. Create a new parent element. + quote = markdown.etree.SubElement(parent, 'blockquote') + # Recursively parse block with blockquote as parent. + self.parser.parseChunk(quote, block) + + def clean(self, line): + """ Remove ``>`` from beginning of a line. """ + m = self.RE.match(line) + if line.strip() == ">": + return "" + elif m: + return m.group(2) + else: + return line + +class OListProcessor(BlockProcessor): + """ Process ordered list blocks. """ + + TAG = 'ol' + # Detect an item (``1. item``). ``group(1)`` contains contents of item. + RE = re.compile(r'^[ ]{0,3}\d+\.[ ](.*)') + # Detect items on secondary lines. they can be of either list type. + CHILD_RE = re.compile(r'^[ ]{0,3}((\d+\.)|[*+-])[ ](.*)') + # Detect indented (nested) items of either type + INDENT_RE = re.compile(r'^[ ]{4,7}((\d+\.)|[*+-])[ ].*') + + def test(self, parent, block): + return bool(self.RE.match(block)) + + def run(self, parent, blocks): + # Check fr multiple items in one block. + items = self.get_items(blocks.pop(0)) + sibling = self.lastChild(parent) + if sibling and (sibling.tag == 'ol' or sibling.tag == 'ul'): + # Previous block was a list item, so set that as parent + lst = sibling + # make sure previous item is in a p. + if len(lst) and lst[-1].text and not len(lst[-1]): + p = markdown.etree.SubElement(lst[-1], 'p') + p.text = lst[-1].text + lst[-1].text = '' + # parse first block differently as it gets wrapped in a p. + li = markdown.etree.SubElement(lst, 'li') + self.parser.state.set('looselist') + firstitem = items.pop(0) + self.parser.parseBlocks(li, [firstitem]) + self.parser.state.reset() + else: + # This is a new list so create parent with appropriate tag. + lst = markdown.etree.SubElement(parent, self.TAG) + self.parser.state.set('list') + # Loop through items in block, recursively parsing each with the + # appropriate parent. + for item in items: + if item.startswith(' '*markdown.TAB_LENGTH): + # Item is indented. Parse with last item as parent + self.parser.parseBlocks(lst[-1], [item]) + else: + # New item. Create li and parse with it as parent + li = markdown.etree.SubElement(lst, 'li') + self.parser.parseBlocks(li, [item]) + self.parser.state.reset() + + def get_items(self, block): + """ Break a block into list items. """ + items = [] + for line in block.split('\n'): + m = self.CHILD_RE.match(line) + if m: + # This is a new item. Append + items.append(m.group(3)) + elif self.INDENT_RE.match(line): + # This is an indented (possibly nested) item. + if items[-1].startswith(' '*markdown.TAB_LENGTH): + # Previous item was indented. Append to that item. + items[-1] = '%s\n%s' % (items[-1], line) + else: + items.append(line) + else: + # This is another line of previous item. Append to that item. + items[-1] = '%s\n%s' % (items[-1], line) + return items + + +class UListProcessor(OListProcessor): + """ Process unordered list blocks. """ + + TAG = 'ul' + RE = re.compile(r'^[ ]{0,3}[*+-][ ](.*)') + + +class HashHeaderProcessor(BlockProcessor): + """ Process Hash Headers. """ + + # Detect a header at start of any line in block + RE = re.compile(r'(^|\n)(?P<level>#{1,6})(?P<header>.*?)#*(\n|$)') + + def test(self, parent, block): + return bool(self.RE.search(block)) + + def run(self, parent, blocks): + block = blocks.pop(0) + m = self.RE.search(block) + if m: + before = block[:m.start()] # All lines before header + after = block[m.end():] # All lines after header + if before: + # As the header was not the first line of the block and the + # lines before the header must be parsed first, + # recursively parse this lines as a block. + self.parser.parseBlocks(parent, [before]) + # Create header using named groups from RE + h = markdown.etree.SubElement(parent, 'h%d' % len(m.group('level'))) + h.text = m.group('header').strip() + if after: + # Insert remaining lines as first block for future parsing. + blocks.insert(0, after) + else: + # This should never happen, but just in case... + message(CRITICAL, "We've got a problem header!") + + +class SetextHeaderProcessor(BlockProcessor): + """ Process Setext-style Headers. """ + + # Detect Setext-style header. Must be first 2 lines of block. + RE = re.compile(r'^.*?\n[=-]{3,}', re.MULTILINE) + + def test(self, parent, block): + return bool(self.RE.match(block)) + + def run(self, parent, blocks): + lines = blocks.pop(0).split('\n') + # Determine level. ``=`` is 1 and ``-`` is 2. + if lines[1].startswith('='): + level = 1 + else: + level = 2 + h = markdown.etree.SubElement(parent, 'h%d' % level) + h.text = lines[0].strip() + if len(lines) > 2: + # Block contains additional lines. Add to master blocks for later. + blocks.insert(0, '\n'.join(lines[2:])) + + +class HRProcessor(BlockProcessor): + """ Process Horizontal Rules. """ + + RE = r'[ ]{0,3}(?P<ch>[*_-])[ ]?((?P=ch)[ ]?){2,}[ ]*' + # Detect hr on any line of a block. + SEARCH_RE = re.compile(r'(^|\n)%s(\n|$)' % RE) + # Match a hr on a single line of text. + MATCH_RE = re.compile(r'^%s$' % RE) + + def test(self, parent, block): + return bool(self.SEARCH_RE.search(block)) + + def run(self, parent, blocks): + lines = blocks.pop(0).split('\n') + prelines = [] + # Check for lines in block before hr. + for line in lines: + m = self.MATCH_RE.match(line) + if m: + break + else: + prelines.append(line) + if len(prelines): + # Recursively parse lines before hr so they get parsed first. + self.parser.parseBlocks(parent, ['\n'.join(prelines)]) + # create hr + hr = markdown.etree.SubElement(parent, 'hr') + # check for lines in block after hr. + lines = lines[len(prelines)+1:] + if len(lines): + # Add lines after hr to master blocks for later parsing. + blocks.insert(0, '\n'.join(lines)) + + +class EmptyBlockProcessor(BlockProcessor): + """ Process blocks and start with an empty line. """ + + # Detect a block that only contains whitespace + # or only whitespace on the first line. + RE = re.compile(r'^\s*\n') + + def test(self, parent, block): + return bool(self.RE.match(block)) + + def run(self, parent, blocks): + block = blocks.pop(0) + m = self.RE.match(block) + if m: + # Add remaining line to master blocks for later. + blocks.insert(0, block[m.end():]) + sibling = self.lastChild(parent) + if sibling and sibling.tag == 'pre' and sibling[0] and \ + sibling[0].tag == 'code': + # Last block is a codeblock. Append to preserve whitespace. + sibling[0].text = markdown.AtomicString('%s/n/n/n' % sibling[0].text ) + + +class ParagraphProcessor(BlockProcessor): + """ Process Paragraph blocks. """ + + def test(self, parent, block): + return True + + def run(self, parent, blocks): + block = blocks.pop(0) + if block.strip(): + # Not a blank block. Add to parent, otherwise throw it away. + if self.parser.state.isstate('list'): + # The parent is a tight-list. Append to parent.text + if parent.text: + parent.text = '%s\n%s' % (parent.text, block) + else: + parent.text = block.lstrip() + else: + # Create a regular paragraph + p = markdown.etree.SubElement(parent, 'p') + p.text = block.lstrip() diff --git a/markdown/commandline.py b/markdown/commandline.py new file mode 100644 index 0000000..9b1a1bf --- /dev/null +++ b/markdown/commandline.py @@ -0,0 +1,97 @@ +""" +COMMAND-LINE SPECIFIC STUFF +============================================================================= + +The rest of the code is specifically for handling the case where Python +Markdown is called from the command line. +""" + +import markdown +import sys +import logging +from logging import DEBUG, INFO, WARN, ERROR, CRITICAL + +EXECUTABLE_NAME_FOR_USAGE = "python markdown.py" +""" The name used in the usage statement displayed for python versions < 2.3. +(With python 2.3 and higher the usage statement is generated by optparse +and uses the actual name of the executable called.) """ + +OPTPARSE_WARNING = """ +Python 2.3 or higher required for advanced command line options. +For lower versions of Python use: + + %s INPUT_FILE > OUTPUT_FILE + +""" % EXECUTABLE_NAME_FOR_USAGE + +def parse_options(): + """ + Define and parse `optparse` options for command-line usage. + """ + + try: + optparse = __import__("optparse") + except: + if len(sys.argv) == 2: + return {'input': sys.argv[1], + 'output': None, + 'safe': False, + 'extensions': [], + 'encoding': None }, CRITICAL + else: + print OPTPARSE_WARNING + return None, None + + parser = optparse.OptionParser(usage="%prog INPUTFILE [options]") + parser.add_option("-f", "--file", dest="filename", default=sys.stdout, + help="write output to OUTPUT_FILE", + metavar="OUTPUT_FILE") + parser.add_option("-e", "--encoding", dest="encoding", + help="encoding for input and output files",) + parser.add_option("-q", "--quiet", default = CRITICAL, + action="store_const", const=CRITICAL+10, dest="verbose", + help="suppress all messages") + parser.add_option("-v", "--verbose", + action="store_const", const=INFO, dest="verbose", + help="print info messages") + parser.add_option("-s", "--safe", dest="safe", default=False, + metavar="SAFE_MODE", + help="safe mode ('replace', 'remove' or 'escape' user's HTML tag)") + parser.add_option("--noisy", + action="store_const", const=DEBUG, dest="verbose", + help="print debug messages") + parser.add_option("-x", "--extension", action="append", dest="extensions", + help = "load extension EXTENSION", metavar="EXTENSION") + + (options, args) = parser.parse_args() + + if not len(args) == 1: + parser.print_help() + return None, None + else: + input_file = args[0] + + if not options.extensions: + options.extensions = [] + + return {'input': input_file, + 'output': options.filename, + 'safe': options.safe, + 'extensions': options.extensions, + 'encoding': options.encoding }, options.verbose + +def run(): + """Run Markdown from the command line.""" + + # Setup a logger manually for compatibility with Python 2.3 + logger = logging.getLogger('MARKDOWN') + logger.setLevel(markdown.COMMAND_LINE_LOGGING_LEVEL) + logger.addHandler(logging.StreamHandler()) + + # Parse options and adjust logging level if necessary + options, logging_level = parse_options() + if not options: sys.exit(0) + if logging_level: logging.getLogger('MARKDOWN').setLevel(logging_level) + + # Run + markdown.markdownFromFile(**options) diff --git a/markdown/inlinepatterns.py b/markdown/inlinepatterns.py new file mode 100644 index 0000000..66d3063 --- /dev/null +++ b/markdown/inlinepatterns.py @@ -0,0 +1,367 @@ +""" +INLINE PATTERNS +============================================================================= + +Inline patterns such as *emphasis* are handled by means of auxiliary +objects, one per pattern. Pattern objects must be instances of classes +that extend markdown.Pattern. Each pattern object uses a single regular +expression and needs support the following methods: + + pattern.getCompiledRegExp() # returns a regular expression + + pattern.handleMatch(m) # takes a match object and returns + # an ElementTree element or just plain text + +All of python markdown's built-in patterns subclass from Pattern, +but you can add additional patterns that don't. + +Also note that all the regular expressions used by inline must +capture the whole block. For this reason, they all start with +'^(.*)' and end with '(.*)!'. In case with built-in expression +Pattern takes care of adding the "^(.*)" and "(.*)!". + +Finally, the order in which regular expressions are applied is very +important - e.g. if we first replace http://.../ links with <a> tags +and _then_ try to replace inline html, we would end up with a mess. +So, we apply the expressions in the following order: + +* escape and backticks have to go before everything else, so + that we can preempt any markdown patterns by escaping them. + +* then we handle auto-links (must be done before inline html) + +* then we handle inline HTML. At this point we will simply + replace all inline HTML strings with a placeholder and add + the actual HTML to a hash. + +* then inline images (must be done before links) + +* then bracketed links, first regular then reference-style + +* finally we apply strong and emphasis +""" + +import markdown +import re +from urlparse import urlparse, urlunparse +import htmlentitydefs + +""" +The actual regular expressions for patterns +----------------------------------------------------------------------------- +""" + +NOBRACKET = r'[^\]\[]*' +BRK = ( r'\[(' + + (NOBRACKET + r'(\[')*6 + + (NOBRACKET+ r'\])*')*6 + + NOBRACKET + r')\]' ) +NOIMG = r'(?<!\!)' + +BACKTICK_RE = r'(?<!\\)(`+)(.+?)(?<!`)\2(?!`)' # `e=f()` or ``e=f("`")`` +ESCAPE_RE = r'\\(.)' # \< +EMPHASIS_RE = r'(\*)([^\*]*)\2' # *emphasis* +STRONG_RE = r'(\*{2}|_{2})(.*?)\2' # **strong** +STRONG_EM_RE = r'(\*{3}|_{3})(.*?)\2' # ***strong*** + +if markdown.SMART_EMPHASIS: + EMPHASIS_2_RE = r'(?<!\S)(_)(\S.*?)\2' # _emphasis_ +else: + EMPHASIS_2_RE = r'(_)(.*?)\2' # _emphasis_ + +LINK_RE = NOIMG + BRK + \ +r'''\(\s*(<.*?>|((?:(?:\(.*?\))|[^\(\)]))*?)\s*((['"])(.*)\12)?\)''' +# [text](url) or [text](<url>) + +IMAGE_LINK_RE = r'\!' + BRK + r'\s*\((<.*?>|([^\)]*))\)' +# ![alttxt](http://x.com/) or ![alttxt](<http://x.com/>) +REFERENCE_RE = NOIMG + BRK+ r'\s*\[([^\]]*)\]' # [Google][3] +IMAGE_REFERENCE_RE = r'\!' + BRK + '\s*\[([^\]]*)\]' # ![alt text][2] +NOT_STRONG_RE = r'( \* )' # stand-alone * or _ +AUTOLINK_RE = r'<((?:f|ht)tps?://[^>]*)>' # <http://www.123.com> +AUTOMAIL_RE = r'<([^> \!]*@[^> ]*)>' # <me@example.com> + +HTML_RE = r'(\<([a-zA-Z/][^\>]*?|\!--.*?--)\>)' # <...> +ENTITY_RE = r'(&[\#a-zA-Z0-9]*;)' # & +LINE_BREAK_RE = r' \n' # two spaces at end of line +LINE_BREAK_2_RE = r' $' # two spaces at end of text + + +def dequote(string): + """Remove quotes from around a string.""" + if ( ( string.startswith('"') and string.endswith('"')) + or (string.startswith("'") and string.endswith("'")) ): + return string[1:-1] + else: + return string + +ATTR_RE = re.compile("\{@([^\}]*)=([^\}]*)}") # {@id=123} + +def handleAttributes(text, parent): + """Set values of an element based on attribute definitions ({@id=123}).""" + def attributeCallback(match): + parent.set(match.group(1), match.group(2)) + return ATTR_RE.sub(attributeCallback, text) + + +""" +The pattern classes +----------------------------------------------------------------------------- +""" + +class Pattern: + """Base class that inline patterns subclass. """ + + def __init__ (self, pattern, markdown_instance=None): + """ + Create an instant of an inline pattern. + + Keyword arguments: + + * pattern: A regular expression that matches a pattern + + """ + self.pattern = pattern + self.compiled_re = re.compile("^(.*?)%s(.*?)$" % pattern, re.DOTALL) + + # Api for Markdown to pass safe_mode into instance + self.safe_mode = False + if markdown_instance: + self.markdown = markdown_instance + + def getCompiledRegExp (self): + """ Return a compiled regular expression. """ + return self.compiled_re + + def handleMatch(self, m): + """Return a ElementTree element from the given match. + + Subclasses should override this method. + + Keyword arguments: + + * m: A re match object containing a match of the pattern. + + """ + pass + + def type(self): + """ Return class name, to define pattern type """ + return self.__class__.__name__ + +BasePattern = Pattern # for backward compatibility + +class SimpleTextPattern (Pattern): + """ Return a simple text of group(2) of a Pattern. """ + def handleMatch(self, m): + text = m.group(2) + if text == markdown.INLINE_PLACEHOLDER_PREFIX: + return None + return text + +class SimpleTagPattern (Pattern): + """ + Return element of type `tag` with a text attribute of group(3) + of a Pattern. + + """ + def __init__ (self, pattern, tag): + Pattern.__init__(self, pattern) + self.tag = tag + + def handleMatch(self, m): + el = markdown.etree.Element(self.tag) + el.text = m.group(3) + return el + + +class SubstituteTagPattern (SimpleTagPattern): + """ Return a eLement of type `tag` with no children. """ + def handleMatch (self, m): + return markdown.etree.Element(self.tag) + + +class BacktickPattern (Pattern): + """ Return a `<code>` element containing the matching text. """ + def __init__ (self, pattern): + Pattern.__init__(self, pattern) + self.tag = "code" + + def handleMatch(self, m): + el = markdown.etree.Element(self.tag) + el.text = markdown.AtomicString(m.group(3).strip()) + return el + + +class DoubleTagPattern (SimpleTagPattern): + """Return a ElementTree element nested in tag2 nested in tag1. + + Useful for strong emphasis etc. + + """ + def handleMatch(self, m): + tag1, tag2 = self.tag.split(",") + el1 = markdown.etree.Element(tag1) + el2 = markdown.etree.SubElement(el1, tag2) + el2.text = m.group(3) + return el1 + + +class HtmlPattern (Pattern): + """ Store raw inline html and return a placeholder. """ + def handleMatch (self, m): + rawhtml = m.group(2) + inline = True + place_holder = self.markdown.htmlStash.store(rawhtml) + return place_holder + + +class LinkPattern (Pattern): + """ Return a link element from the given match. """ + def handleMatch(self, m): + el = markdown.etree.Element("a") + el.text = m.group(2) + title = m.group(11) + href = m.group(9) + + if href: + if href[0] == "<": + href = href[1:-1] + el.set("href", self.sanitize_url(href.strip())) + else: + el.set("href", "") + + if title: + title = dequote(title) #.replace('"', """) + el.set("title", title) + return el + + def sanitize_url(self, url): + """ + Sanitize a url against xss attacks in "safe_mode". + + Rather than specifically blacklisting `javascript:alert("XSS")` and all + its aliases (see <http://ha.ckers.org/xss.html>), we whitelist known + safe url formats. Most urls contain a network location, however some + are known not to (i.e.: mailto links). Script urls do not contain a + location. Additionally, for `javascript:...`, the scheme would be + "javascript" but some aliases will appear to `urlparse()` to have no + scheme. On top of that relative links (i.e.: "foo/bar.html") have no + scheme. Therefore we must check "path", "parameters", "query" and + "fragment" for any literal colons. We don't check "scheme" for colons + because it *should* never have any and "netloc" must allow the form: + `username:password@host:port`. + + """ + locless_schemes = ['', 'mailto', 'news'] + scheme, netloc, path, params, query, fragment = url = urlparse(url) + safe_url = False + if netloc != '' or scheme in locless_schemes: + safe_url = True + + for part in url[2:]: + if ":" in part: + safe_url = False + + if self.markdown.safeMode and not safe_url: + return '' + else: + return urlunparse(url) + +class ImagePattern(LinkPattern): + """ Return a img element from the given match. """ + def handleMatch(self, m): + el = markdown.etree.Element("img") + src_parts = m.group(9).split() + if src_parts: + src = src_parts[0] + if src[0] == "<" and src[-1] == ">": + src = src[1:-1] + el.set('src', self.sanitize_url(src)) + else: + el.set('src', "") + if len(src_parts) > 1: + el.set('title', dequote(" ".join(src_parts[1:]))) + + if markdown.ENABLE_ATTRIBUTES: + truealt = handleAttributes(m.group(2), el) + else: + truealt = m.group(2) + + el.set('alt', truealt) + return el + +class ReferencePattern(LinkPattern): + """ Match to a stored reference and return link element. """ + def handleMatch(self, m): + if m.group(9): + id = m.group(9).lower() + else: + # if we got something like "[Google][]" + # we'll use "google" as the id + id = m.group(2).lower() + + if not self.markdown.references.has_key(id): # ignore undefined refs + return None + href, title = self.markdown.references[id] + + text = m.group(2) + return self.makeTag(href, title, text) + + def makeTag(self, href, title, text): + el = markdown.etree.Element('a') + + el.set('href', self.sanitize_url(href)) + if title: + el.set('title', title) + + el.text = text + return el + + +class ImageReferencePattern (ReferencePattern): + """ Match to a stored reference and return img element. """ + def makeTag(self, href, title, text): + el = markdown.etree.Element("img") + el.set("src", self.sanitize_url(href)) + if title: + el.set("title", title) + el.set("alt", text) + return el + + +class AutolinkPattern (Pattern): + """ Return a link Element given an autolink (`<http://example/com>`). """ + def handleMatch(self, m): + el = markdown.etree.Element("a") + el.set('href', m.group(2)) + el.text = markdown.AtomicString(m.group(2)) + return el + +class AutomailPattern (Pattern): + """ + Return a mailto link Element given an automail link (`<foo@example.com>`). + """ + def handleMatch(self, m): + el = markdown.etree.Element('a') + email = m.group(2) + if email.startswith("mailto:"): + email = email[len("mailto:"):] + + def codepoint2name(code): + """Return entity definition by code, or the code if not defined.""" + entity = htmlentitydefs.codepoint2name.get(code) + if entity: + return "%s%s;" % (markdown.AMP_SUBSTITUTE, entity) + else: + return "%s#%d;" % (markdown.AMP_SUBSTITUTE, code) + + letters = [codepoint2name(ord(letter)) for letter in email] + el.text = markdown.AtomicString(''.join(letters)) + + mailto = "mailto:" + email + mailto = "".join([markdown.AMP_SUBSTITUTE + '#%d;' % + ord(letter) for letter in mailto]) + el.set('href', mailto) + return el + diff --git a/markdown/linepreprocessors.py b/markdown/linepreprocessors.py new file mode 100644 index 0000000..998bdf8 --- /dev/null +++ b/markdown/linepreprocessors.py @@ -0,0 +1,177 @@ + +""" +PRE-PROCESSORS +============================================================================= + +Preprocessors work on source text before we start doing anything too +complicated. +""" + +import re +import markdown + +class Processor: + def __init__(self, markdown_instance=None): + if markdown_instance: + self.markdown = markdown_instance + +class Preprocessor (Processor): + """ + Preprocessors are run after the text is broken into lines. + + Each preprocessor implements a "run" method that takes a pointer to a + list of lines of the document, modifies it as necessary and returns + either the same pointer or a pointer to a new list. + + Preprocessors must extend markdown.Preprocessor. + + """ + def run(self, lines): + """ + Each subclass of Preprocessor should override the `run` method, which + takes the document as a list of strings split by newlines and returns + the (possibly modified) list of lines. + + """ + pass + + +class HtmlBlockPreprocessor(Preprocessor): + """Remove html blocks from the text and store them for later retrieval.""" + + right_tag_patterns = ["</%s>", "%s>"] + + def _get_left_tag(self, block): + return block[1:].replace(">", " ", 1).split()[0].lower() + + def _get_right_tag(self, left_tag, block): + for p in self.right_tag_patterns: + tag = p % left_tag + i = block.rfind(tag) + if i > 2: + return tag.lstrip("<").rstrip(">"), i + len(p)-2 + len(left_tag) + return block.rstrip()[-len(left_tag)-2:-1].lower(), len(block) + + def _equal_tags(self, left_tag, right_tag): + if left_tag == 'div' or left_tag[0] in ['?', '@', '%']: # handle PHP, etc. + return True + if ("/" + left_tag) == right_tag: + return True + if (right_tag == "--" and left_tag == "--"): + return True + elif left_tag == right_tag[1:] \ + and right_tag[0] != "<": + return True + else: + return False + + def _is_oneliner(self, tag): + return (tag in ['hr', 'hr/']) + + def run(self, lines): + text = "\n".join(lines) + new_blocks = [] + text = text.split("\n\n") + items = [] + left_tag = '' + right_tag = '' + in_tag = False # flag + + while text: + block = text[0] + if block.startswith("\n"): + block = block[1:] + text = text[1:] + + if block.startswith("\n"): + block = block[1:] + + if not in_tag: + if block.startswith("<"): + left_tag = self._get_left_tag(block) + right_tag, data_index = self._get_right_tag(left_tag, block) + + if data_index < len(block): + text.insert(0, block[data_index:]) + block = block[:data_index] + + if not (markdown.isBlockLevel(left_tag) \ + or block[1] in ["!", "?", "@", "%"]): + new_blocks.append(block) + continue + + if self._is_oneliner(left_tag): + new_blocks.append(block.strip()) + continue + + if block[1] == "!": + # is a comment block + left_tag = "--" + right_tag, data_index = self._get_right_tag(left_tag, block) + # keep checking conditions below and maybe just append + + if block.rstrip().endswith(">") \ + and self._equal_tags(left_tag, right_tag): + new_blocks.append( + self.markdown.htmlStash.store(block.strip())) + continue + else: #if not block[1] == "!": + # if is block level tag and is not complete + + if markdown.isBlockLevel(left_tag) or left_tag == "--" \ + and not block.rstrip().endswith(">"): + items.append(block.strip()) + in_tag = True + else: + new_blocks.append( + self.markdown.htmlStash.store(block.strip())) + + continue + + new_blocks.append(block) + + else: + items.append(block.strip()) + + right_tag, data_index = self._get_right_tag(left_tag, block) + + if self._equal_tags(left_tag, right_tag): + # if find closing tag + in_tag = False + new_blocks.append( + self.markdown.htmlStash.store('\n\n'.join(items))) + items = [] + + if items: + new_blocks.append(self.markdown.htmlStash.store('\n\n'.join(items))) + new_blocks.append('\n') + + new_text = "\n\n".join(new_blocks) + return new_text.split("\n") + + +class ReferencePreprocessor(Preprocessor): + """ Remove reference definitions from text and store for later use. """ + + RE = re.compile(r'^(\ ?\ ?\ ?)\[([^\]]*)\]:\s*([^ ]*)(.*)$', re.DOTALL) + + def run (self, lines): + new_text = []; + for line in lines: + m = self.RE.match(line) + if m: + id = m.group(2).strip().lower() + t = m.group(4).strip() # potential title + if not t: + self.markdown.references[id] = (m.group(3), t) + elif (len(t) >= 2 + and (t[0] == t[-1] == "\"" + or t[0] == t[-1] == "\'" + or (t[0] == "(" and t[-1] == ")") ) ): + self.markdown.references[id] = (m.group(3), t[1:-1]) + else: + new_text.append(line) + else: + new_text.append(line) + + return new_text #+ "\n" diff --git a/markdown/treeprocessors.py b/markdown/treeprocessors.py new file mode 100644 index 0000000..e8d7cd0 --- /dev/null +++ b/markdown/treeprocessors.py @@ -0,0 +1,323 @@ +import markdown +import re + +def isString(s): + """ Check if it's string """ + return isinstance(s, unicode) or isinstance(s, str) + +class Processor: + def __init__(self, markdown_instance=None): + if markdown_instance: + self.markdown = markdown_instance + +class Treeprocessor(Processor): + """ + Treeprocessors are run on the ElementTree object before serialization. + + Each Treeprocessor implements a "run" method that takes a pointer to an + ElementTree, modifies it as necessary and returns an ElementTree + object. + + Treeprocessors must extend markdown.Treeprocessor. + + """ + def run(self, root): + """ + Subclasses of Treeprocessor should implement a `run` method, which + takes a root ElementTree. This method can return another ElementTree + object, and the existing root ElementTree will be replaced, or it can + modify the current tree and return None. + """ + pass + + +class InlineProcessor(Treeprocessor): + """ + A Treeprocessor that traverses a tree, applying inline patterns. + """ + + def __init__ (self, md): + self.__placeholder_prefix = markdown.INLINE_PLACEHOLDER_PREFIX + self.__placeholder_suffix = markdown.ETX + self.__placeholder_length = 4 + len(self.__placeholder_prefix) \ + + len(self.__placeholder_suffix) + self.__placeholder_re = re.compile(markdown.INLINE_PLACEHOLDER % r'([0-9]{4})') + self.markdown = md + + def __makePlaceholder(self, type): + """ Generate a placeholder """ + id = "%04d" % len(self.stashed_nodes) + hash = markdown.INLINE_PLACEHOLDER % id + return hash, id + + def __findPlaceholder(self, data, index): + """ + Extract id from data string, start from index + + Keyword arguments: + + * data: string + * index: index, from which we start search + + Returns: placeholder id and string index, after the found placeholder. + """ + + m = self.__placeholder_re.search(data, index) + if m: + return m.group(1), m.end() + else: + return None, index + 1 + + def __stashNode(self, node, type): + """ Add node to stash """ + placeholder, id = self.__makePlaceholder(type) + self.stashed_nodes[id] = node + return placeholder + + def __handleInline(self, data, patternIndex=0): + """ + Process string with inline patterns and replace it + with placeholders + + Keyword arguments: + + * data: A line of Markdown text + * patternIndex: The index of the inlinePattern to start with + + Returns: String with placeholders. + + """ + if not isinstance(data, markdown.AtomicString): + startIndex = 0 + while patternIndex < len(self.markdown.inlinePatterns): + data, matched, startIndex = self.__applyPattern( + self.markdown.inlinePatterns.value_for_index(patternIndex), + data, patternIndex, startIndex) + if not matched: + patternIndex += 1 + return data + + def __processElementText(self, node, subnode, isText=True): + """ + Process placeholders in Element.text or Element.tail + of Elements popped from self.stashed_nodes. + + Keywords arguments: + + * node: parent node + * subnode: processing node + * isText: bool variable, True - it's text, False - it's tail + + Returns: None + + """ + if isText: + text = subnode.text + subnode.text = None + else: + text = subnode.tail + subnode.tail = None + + childResult = self.__processPlaceholders(text, subnode) + + if not isText and node is not subnode: + pos = node.getchildren().index(subnode) + node.remove(subnode) + else: + pos = 0 + + childResult.reverse() + for newChild in childResult: + node.insert(pos, newChild) + + def __processPlaceholders(self, data, parent): + """ + Process string with placeholders and generate ElementTree tree. + + Keyword arguments: + + * data: string with placeholders instead of ElementTree elements. + * parent: Element, which contains processing inline data + + Returns: list with ElementTree elements with applied inline patterns. + """ + def linkText(text): + if text: + if result: + if result[-1].tail: + result[-1].tail += text + else: + result[-1].tail = text + else: + if parent.text: + parent.text += text + else: + parent.text = text + + result = [] + strartIndex = 0 + while data: + index = data.find(self.__placeholder_prefix, strartIndex) + if index != -1: + id, phEndIndex = self.__findPlaceholder(data, index) + + if self.stashed_nodes.has_key(id): + node = self.stashed_nodes.get(id) + + if index > 0: + text = data[strartIndex:index] + linkText(text) + + if not isString(node): # it's Element + for child in [node] + node.getchildren(): + if child.tail: + if child.tail.strip(): + self.__processElementText(node, child, False) + if child.text: + if child.text.strip(): + self.__processElementText(child, child) + else: # it's just a string + linkText(node) + strartIndex = phEndIndex + continue + + strartIndex = phEndIndex + result.append(node) + + else: # wrong placeholder + end = index + len(prefix) + linkText(data[strartIndex:end]) + strartIndex = end + else: + text = data[strartIndex:] + linkText(text) + data = "" + + return result + + def __applyPattern(self, pattern, data, patternIndex, startIndex=0): + """ + Check if the line fits the pattern, create the necessary + elements, add it to stashed_nodes. + + Keyword arguments: + + * data: the text to be processed + * pattern: the pattern to be checked + * patternIndex: index of current pattern + * startIndex: string index, from which we starting search + + Returns: String with placeholders instead of ElementTree elements. + + """ + match = pattern.getCompiledRegExp().match(data[startIndex:]) + leftData = data[:startIndex] + + if not match: + return data, False, 0 + + node = pattern.handleMatch(match) + + if node is None: + return data, True, len(leftData) + match.span(len(match.groups()))[0] + + if not isString(node): + if not isinstance(node.text, markdown.AtomicString): + # We need to process current node too + for child in [node] + node.getchildren(): + if not isString(node): + if child.text: + child.text = self.__handleInline(child.text, + patternIndex + 1) + if child.tail: + child.tail = self.__handleInline(child.tail, + patternIndex) + + placeholder = self.__stashNode(node, pattern.type()) + + return "%s%s%s%s" % (leftData, + match.group(1), + placeholder, match.groups()[-1]), True, 0 + + def run(self, tree): + """Apply inline patterns to a parsed Markdown tree. + + Iterate over ElementTree, find elements with inline tag, apply inline + patterns and append newly created Elements to tree. If you don't + want process your data with inline paterns, instead of normal string, + use subclass AtomicString: + + node.text = markdown.AtomicString("data won't be processed with inline patterns") + + Arguments: + + * markdownTree: ElementTree object, representing Markdown tree. + + Returns: ElementTree object with applied inline patterns. + + """ + self.stashed_nodes = {} + + stack = [tree] + + while stack: + currElement = stack.pop() + insertQueue = [] + for child in currElement.getchildren(): + if child.text and not isinstance(child.text, markdown.AtomicString): + text = child.text + child.text = None + lst = self.__processPlaceholders(self.__handleInline( + text), child) + stack += lst + insertQueue.append((child, lst)) + + if child.getchildren(): + stack.append(child) + + for element, lst in insertQueue: + if element.text: + element.text = markdown.inlinepatterns.handleAttributes(element.text, element) + i = 0 + for newChild in lst: + # Processing attributes + if newChild.tail: + newChild.tail = markdown.inlinepatterns.handleAttributes(newChild.tail, + element) + if newChild.text: + newChild.text = markdown.inlinepatterns.handleAttributes(newChild.text, + newChild) + element.insert(i, newChild) + i += 1 + + return tree + + +class PrettifyTreeprocessor(Treeprocessor): + """Add linebreaks to the html document.""" + def _prettifyETree(self, elem): + """Recursively add linebreaks to ElementTree children.""" + i = "\n" + if markdown.isBlockLevel(elem.tag) and elem.tag not in ['code', 'pre']: + if (not elem.text or not elem.text.strip()) \ + and len(elem) and markdown.isBlockLevel(elem[0].tag): + elem.text = i + for e in elem: + if markdown.isBlockLevel(e.tag): + self._prettifyETree(e) + if not elem.tail or not elem.tail.strip(): + elem.tail = i + if not elem.tail or not elem.tail.strip(): + elem.tail = i + + def run(self, root): + """.Add linebreaks to ElementTree root object.""" + self._prettifyETree(root) + # Do <br />'s seperately as they are often in the middle of + # inline content and missed by _prettifyETree. + brs = root.getiterator('br') + for br in brs: + if not br.tail or not br.tail.strip(): + br.tail = '\n' + else: + br.tail = '\n%s' % br.tail |